Example #1
def make_kfold_datasets(k, fastafile, firefile, outpre):
    logging.warning("Making %s fold datasets" % k)
    # shuffle all the fasta entries in place
    np.random.shuffle(fastafile.names)
    # create k folds out of the names as a list of lists
    size = len(fastafile.names)
    folds = [fastafile.names[i::k] for i in range(k)]
    # loop through each fold
    for test_idx in range(len(folds)):
        # make a separate fasta file and fire file for the test fold
        this_test_fasta = fa.FastaFile()
        this_test_fire = FIREfile()
        for name in folds[test_idx]:
            this_test_fasta.add_entry(fastafile.pull_entry(name))
            this_test_fire.add_entry(name, firefile.pull_value(name))
        with open(outpre + "_test_%i.fa" % test_idx, mode="w") as outf:
            this_test_fasta.write(outf)
        this_test_fire.write(outpre + "_test_%i.txt" % test_idx)

        # make a separate fasta and fire file for the train folds
        this_train_fasta = fa.FastaFile()
        this_train_fire = FIREfile()

        for train_idx in range(len(folds)):
            if train_idx != test_idx:
                for name in folds[train_idx]:
                    this_train_fasta.add_entry(fastafile.pull_entry(name))
                    this_train_fire.add_entry(name, firefile.pull_value(name))

        logging.warning("Writing fold %i" % test_idx)
        with open(outpre + "_train_%i.fa" % test_idx, mode="w") as outf:
            this_train_fasta.write(outf)
        this_train_fire.write(outpre + "_train_%i.txt" % test_idx)
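
A minimal driver sketch for make_kfold_datasets, assuming the same local fasta/FIREfile modules used above; the FIREfile loading step and the file paths are hypothetical:

import numpy as np
import fasta as fa

np.random.seed(42)                      # make the in-place shuffle reproducible
fastafile = fa.FastaFile()
with open("peaks.fa") as inf:           # hypothetical input FASTA
    fastafile.read_whole_file(inf)
firefile = FIREfile()                   # assumed to be populated elsewhere
make_kfold_datasets(5, fastafile, firefile, "peaks_cv")
# writes peaks_cv_test_<i>.fa/.txt and peaks_cv_train_<i>.fa/.txt for each fold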
Example #2
def out_fasta(gb, features=["CDS"], qual_names=["locus_tag"]):
    import sys
    import fasta as fa
    out_fasta = fa.FastaFile()
    fa_name = set()
    for feature in gb.features:
        if feature.type in features:
            # skip features with partial ends
            if check_partial_start(feature.location.start) or check_partial_end(feature.location.end):
                continue

            # downstream tools often need a gene name, so use the locus tag,
            # falling back to NA_<number> when no locus tag exists
            unique_name = check_qualifiers(feature, qual_names)
            num = 1
            id_name = unique_name
            while id_name in fa_name:
                id_name = unique_name + "_%d"%(num)
                num += 1
            fa_name.add(id_name)
            header = ">" + id_name + \
            " " +  \
            str(feature.qualifiers.get("product", ["NA"])[0])
            seq = str(feature.extract(gb.seq))
            entry = fa.FastaEntry(header = header, seq = seq)
            out_fasta.add_entry(entry)
    out_fasta.write(sys.stdout)
Example #3
def read_genome(infile):
    #    sys.path.append("/home/mbwolfe/src/circ_mapper")
    import fasta
    genome = fasta.FastaFile()
    with open(infile, mode="r") as f:
        genome.read_whole_file(f)
    chrm = genome.pull_entry(genome.chrm_names()[0])
    return chrm
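
Usage is a one-liner; the path is hypothetical, and note that the function returns only the first chromosome entry in the file:

chrm = read_genome("genome.fa")   # FastaEntry for the first chromosome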
Example #4
def out_fna(gb, chrm=None):
    import sys
    import fasta as fa
    out_fasta = fa.FastaFile()
    if chrm:
        header = ">" + str(chrm)
    else:
        header = ">" + str(gb.name)
    seq = str(gb.seq)
    entry = fa.FastaEntry(header=header, seq=seq)
    out_fasta.add_entry(entry)
    out_fasta.write(sys.stdout)
Example #5
def seq_file (file, format=None, revcomp=False, name="", gap=None, contig=None):
    if (format == None): format = infer_format(file)
    if (contig != None) and (format not in ["fasta",None]):
        raise ValueError("Contigs are not supported for format %s" % format)
    if   (format == "fasta"): return fasta.FastaFile (file, revcomp=revcomp, name=name, gap=gap, contig=contig)
    elif (format == "nib"):   return nib.NibFile     (file, revcomp=revcomp, name=name, gap=gap)
    elif (format == "qdna"):  return qdna.QdnaFile   (file, revcomp=revcomp, name=name, gap=gap)
    else:
        if (format == None): format = ""
        else:                format = " " + format
        raise ValueError("Unknown sequence format%s in %s" % (format,file.name))
Example #6
def out_fasta(gb, features=["CDS"], qual_name="locus_tag"):
    import sys
    import fasta as fa
    out_fasta = fa.FastaFile()
    fa_name = set()
    for NA_num, feature in enumerate(gb.features):
        if feature.type in features:
            # downstream tools often need a gene name, so use the locus tag,
            # falling back to NA_<number> when no locus tag exists
            unique_name = str(
                feature.qualifiers.get(qual_name,
                                       ["%s_%d" % ("NA", NA_num)])[0])
            num = 1
            id_name = unique_name
            while id_name in fa_name:
                id_name = unique_name + "_%d" % (num)
                num += 1
            fa_name.add(id_name)
            header = ">" + id_name + \
            " " +  \
            str(feature.qualifiers.get("product", ["NA"])[0])
            seq = str(feature.extract(gb.seq))
            entry = fa.FastaEntry(header=header, seq=seq)
            out_fasta.add_entry(entry)
    out_fasta.write(sys.stdout)
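
The gb argument looks like a Biopython SeqRecord parsed from a GenBank file, which is consistent with the .features, .qualifiers, and .extract() calls above; a hedged usage sketch under that assumption:

from Bio import SeqIO

# Assumes gb is a Biopython SeqRecord; the GenBank path is hypothetical.
record = SeqIO.read("genome.gb", "genbank")
out_fasta(record, features=["CDS"], qual_name="locus_tag")
# prints one FASTA entry per CDS to stdout, named by locus_tag (with _<n>
# suffixes added as needed to keep names unique)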
Example #7
def dense_sampling_main(args):

    # args is the already-parsed argparse namespace passed in by the caller
    # figure out random seed
    np.random.seed(args.seed)
    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # discretize the genome by size of window and step of window
    outbed = discretize_genome(args.wsize, args.stepsize, genome)

    # convert input bed to an interval file by chromosome
    intervals = {chrm.chrm_name(): it.Intervals() for chrm in genome}

    for feature in inbed:
        this_interval = intervals[feature["chrm"]]
        this_interval.add_interval(
            it.Interval(feature["start"], feature["end"]))
        intervals[feature["chrm"]] = this_interval

    # figure out which intervals overlap and which don't

    logging.warning("determining which intervals overlap")
    positive_bed = bed.BedFile()
    negative_bed = bed.BedFile()
    for i, window in enumerate(outbed):
        if i % 10000 == 0:
            logging.warning("Checking interval %s" % i)
        this_chrm = window["chrm"]
        this_intervals = intervals[this_chrm]
        window_interval = it.Interval(window["start"], window["end"])
        perc_overlap = this_intervals.check_percent_overlap(window_interval)
        if perc_overlap >= args.perc_overlap:
            positive_bed.add_entry(window)
        else:
            negative_bed.add_entry(window)
    # make fire file
    fire = FIREfile()
    out_fasta = fa.FastaFile()
    for feature in positive_bed:
        this_chrm = feature["chrm"]
        this_name = feature["name"]
        this_start = feature["start"]
        this_end = feature["end"]
        fire.add_entry(this_name, args.default_score)
        out_fasta.add_entry(
            fa.FastaEntry(
                ">" + this_name,
                genome.pull_entry(this_chrm).pull_seq(this_start, this_end)))

    for feature in negative_bed:
        this_chrm = feature["chrm"]
        this_name = feature["name"]
        this_start = feature["start"]
        this_end = feature["end"]
        fire.add_entry(this_name, args.rand_score)
        out_fasta.add_entry(
            fa.FastaEntry(
                ">" + this_name,
                genome.pull_entry(this_chrm).pull_seq(this_start, this_end)))

    # write files

    if args.true_bed:
        positive_bed.write_bed_file(args.outpre + "_true.bed")
    if args.rand_bed:
        negative_bed.write_bed_file(args.outpre + "_rand.bed")

    if not args.no_fasta:
        with open(args.outpre + ".fa", mode="w") as outf:
            out_fasta.write(outf)

    fire.write(args.outpre + "_fire.txt")
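
dense_sampling_main expects an already-parsed argparse namespace; a sketch of the attributes it reads, with illustrative values (the real CLI definition lives elsewhere in the script, and the paths here are hypothetical):

import argparse

args = argparse.Namespace(
    seed=42, fasta="genome.fa", bedfile="peaks.bed",
    wsize=100, stepsize=25, perc_overlap=0.5,
    default_score=1.0, rand_score=0.0,
    true_bed=True, rand_bed=True, no_fasta=False,
    outpre="dense")
dense_sampling_main(args)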
Example #8
                        default=25,
                        help="Step size for dense mode, default = 25")
    parser.add_argument('--perc_overlap',
                        type=float,
                        default=0.5,
                        help="Overlap cutoff for dense mode, default = 0.5")

    # parse arguments
    args = parser.parse_args()
    if args.dense:
        dense_sampling_main(args)
        sys.exit()
    # figure out random seed
    np.random.seed(args.seed)
    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # check how much of the genome the regions cover
    genome_length = {}
    total_length = 0
    for chrm in genome:
        genome_length[chrm] = len(chrm)
        total_length += len(chrm)
Example #9
    import argparse
    parser = argparse.ArgumentParser("Combine fasta files into one fasta file")
    parser.add_argument('outfilepre', type=str, help="prefix for output files")
    parser.add_argument('infiles',
                        type=str,
                        nargs='+',
                        help="files to combine")
    parser.add_argument(
        '--masked_regions',
        type=str,
        help="A .bed file containing a list of regions to replace with Ns")

    args = parser.parse_args()
    genome_name = args.outfilepre
    fasta_files = args.infiles
    all_fastas = [fa.FastaFile() for fafile in fasta_files]
    # read in all fastas
    for fafile, fafilename in zip(all_fastas, fasta_files):
        with open(fafilename, mode="r") as inf:
            fafile.read_whole_file(inf)

    final_fasta = fa.FastaFile()
    for fafile in all_fastas:
        for entry in fafile:
            final_fasta.add_entry(entry)
    lengths = {}
    for entry in final_fasta:
        chrm_name = entry.chrm_name()
        lengths[chrm_name] = len(entry)

    # Replace masked regions with Ns
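
The snippet ends before the masking step; a minimal sketch of what replacing masked regions with Ns could look like, reusing the bed.BedFile pattern from the examples above (treating each entry's sequence as a plain string attribute is an assumption about the local FastaEntry class, not its documented API):

    if args.masked_regions:
        mask_bed = bed.BedFile()
        mask_bed.from_bed_file(args.masked_regions)
        for region in mask_bed:
            entry = final_fasta.pull_entry(region["chrm"])
            start, end = region["start"], region["end"]
            # slicing entry.seq as a string is an assumption, not the
            # documented FastaEntry API
            entry.seq = entry.seq[:start] + "N" * (end - start) + entry.seq[end:]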