def _write_fold_files(names, fastafile, firefile, fasta_path, fire_path):
    """Write the entries called ``names`` to one FASTA file and one FIRE file.

    Sequences are looked up with ``fastafile.pull_entry(name)`` and scores
    with ``firefile.pull_value(name)``.
    """
    fold_fasta = fa.FastaFile()
    fold_fire = FIREfile()
    for name in names:
        fold_fasta.add_entry(fastafile.pull_entry(name))
        fold_fire.add_entry(name, firefile.pull_value(name))
    with open(fasta_path, mode="w") as outf:
        fold_fasta.write(outf)
    fold_fire.write(fire_path)


def make_kfold_datasets(k, fastafile, firefile, outpre):
    """Split paired FASTA/FIRE data into k folds and write train/test files.

    For every fold index ``i`` four files are written:

    * ``<outpre>_test_<i>.fa`` and ``<outpre>_test_<i>.txt`` with the
      entries of fold ``i``;
    * ``<outpre>_train_<i>.fa`` and ``<outpre>_train_<i>.txt`` with the
      entries of every other fold.

    Args:
        k: number of folds.
        fastafile: sequence container. Its ``names`` list is shuffled IN
            PLACE, and ``pull_entry(name)`` is called for every name.
        firefile: score container; ``pull_value(name)`` is called for every
            name taken from ``fastafile.names``.
        outpre: prefix prepended to every output file name.
    """
    logging.warning("Making %s fold datasets", k)

    # Shuffle the names in place, then deal them round-robin into k folds.
    # The folds must be cut from the list that was just shuffled; slicing
    # ``firefile.names`` here (as the code used to) left the shuffle without
    # any effect on fold membership.
    np.random.shuffle(fastafile.names)
    folds = [fastafile.names[i::k] for i in range(k)]

    for test_idx, test_names in enumerate(folds):
        # The held-out fold becomes the test set ...
        _write_fold_files(
            test_names, fastafile, firefile,
            outpre + "_test_%i.fa" % test_idx,
            outpre + "_test_%i.txt" % test_idx)

        # ... and every remaining fold, in fold order, the training set.
        train_names = []
        for train_idx, fold in enumerate(folds):
            if train_idx != test_idx:
                train_names.extend(fold)

        logging.warning("Writing fold %i", test_idx)
        _write_fold_files(
            train_names, fastafile, firefile,
            outpre + "_train_%i.fa" % test_idx,
            outpre + "_train_%i.txt" % test_idx)
def out_fasta(gb, features=["CDS"], qual_names=["locus_tag"]):
    """Print the selected features of a record to stdout as FASTA.

    Args:
        gb: record object; ``gb.features`` is iterated and ``gb.seq`` is
            handed to ``feature.extract``.
        features: feature types to export (matched against
            ``feature.type``).
        qual_names: passed unchanged to ``check_qualifiers`` to obtain the
            base name of each exported feature.

    Features for which ``check_partial_start`` or ``check_partial_end``
    returns a true value are skipped. Repeated names get ``_1``, ``_2``, ...
    appended so every header id is unique.
    """
    import fasta as fa

    collection = fa.FastaFile()
    seen_ids = set()

    for feat in gb.features:
        if feat.type not in features:
            continue

        # skip features with partial ends
        loc = feat.location
        if check_partial_start(loc.start) or check_partial_end(loc.end):
            continue

        base_name = check_qualifiers(feat, qual_names)

        # Append an increasing counter until the id has not been used yet.
        record_id = base_name
        suffix = 0
        while record_id in seen_ids:
            suffix += 1
            record_id = base_name + "_%d" % suffix
        seen_ids.add(record_id)

        product = feat.qualifiers.get("product", ["NA"])[0]
        collection.add_entry(
            fa.FastaEntry(
                header=">" + record_id + " " + str(product),
                seq=str(feat.extract(gb.seq))))

    collection.write(sys.stdout)
def read_genome(infile):
    """Load a FASTA file and return its first entry.

    Args:
        infile: path of the FASTA file, opened in text read mode.

    Returns:
        The entry pulled for the first name reported by
        ``FastaFile.chrm_names()``.
    """
    import fasta

    parsed = fasta.FastaFile()
    with open(infile, mode="r") as handle:
        parsed.read_whole_file(handle)

    first_name = parsed.chrm_names()[0]
    return parsed.pull_entry(first_name)
def out_fna(gb, chrm=None):
    """Print the whole sequence of a record to stdout as one FASTA entry.

    Args:
        gb: record object providing ``seq`` and ``name``.
        chrm: optional header name; when it is falsy ``gb.name`` is used
            instead.
    """
    import fasta as fa

    label = chrm if chrm else gb.name
    record = fa.FastaEntry(header=">" + str(label), seq=str(gb.seq))

    collection = fa.FastaFile()
    collection.add_entry(record)
    collection.write(sys.stdout)
def seq_file(file, format=None, revcomp=False, name="", gap=None, contig=None):
    """Open ``file`` with the sequence reader matching ``format``.

    Args:
        file: file object holding the sequence data.
        format: ``"fasta"``, ``"nib"`` or ``"qdna"``; when ``None`` the
            format is obtained from ``infer_format(file)``.
        revcomp, name, gap: forwarded unchanged to the reader constructor.
        contig: forwarded to the fasta reader only.

    Returns:
        A ``fasta.FastaFile``, ``nib.NibFile`` or ``qdna.QdnaFile``.

    Raises:
        ValueError: if ``contig`` is given for a non-fasta format, or if the
            format is unknown.
    """
    if format is None:
        format = infer_format(file)

    if contig is not None and format not in ("fasta", None):
        raise ValueError("Contigs are not supported for format %s" % format)

    if format == "fasta":
        return fasta.FastaFile(file, revcomp=revcomp, name=name, gap=gap,
                               contig=contig)
    if format == "nib":
        return nib.NibFile(file, revcomp=revcomp, name=name, gap=gap)
    if format == "qdna":
        return qdna.QdnaFile(file, revcomp=revcomp, name=name, gap=gap)

    shown_format = "" if format is None else " " + format
    # Not every file object has a ``name`` (e.g. io.StringIO); fall back to
    # its repr so the caller still gets the intended ValueError rather than
    # an unrelated AttributeError.
    source = getattr(file, "name", repr(file))
    raise ValueError("Unknown sequence format%s in %s" % (shown_format, source))
def out_fasta(gb, features=["CDS"], qual_name="locus_tag"):
    """Print the selected features of a record to stdout as FASTA.

    Args:
        gb: record object; ``gb.features`` is iterated and ``gb.seq`` is
            handed to ``feature.extract``.
        features: feature types to export (matched against
            ``feature.type``).
        qual_name: qualifier whose first value names the entry. A feature
            without it is named ``NA_<i>``, where ``i`` is the feature's
            position in ``gb.features``.

    Repeated names get ``_1``, ``_2``, ... appended so every header id is
    unique.
    """
    import fasta as fa

    collection = fa.FastaFile()
    seen_ids = set()

    for position, feat in enumerate(gb.features):
        if feat.type not in features:
            continue

        # Fall back to a position-based placeholder when the qualifier is
        # missing.
        placeholder = ["NA_%d" % position]
        base_name = str(feat.qualifiers.get(qual_name, placeholder)[0])

        # Append an increasing counter until the id has not been used yet.
        record_id = base_name
        suffix = 0
        while record_id in seen_ids:
            suffix += 1
            record_id = base_name + "_%d" % suffix
        seen_ids.add(record_id)

        product = feat.qualifiers.get("product", ["NA"])[0]
        collection.add_entry(
            fa.FastaEntry(
                header=">" + record_id + " " + str(product),
                seq=str(feat.extract(gb.seq))))

    collection.write(sys.stdout)
def dense_sampling_main(args):
    """Tile the genome with windows and label each by overlap with a BED file.

    The genome is cut into windows by
    ``discretize_genome(args.wsize, args.stepsize, genome)``. A window whose
    ``check_percent_overlap`` value against the BED intervals of its
    chromosome is at least ``args.perc_overlap`` is scored
    ``args.default_score``; every other window is scored ``args.rand_score``.

    Files written (all prefixed with ``args.outpre``):
        ``_true.bed``  - windows at or above the cutoff, if ``args.true_bed``
        ``_rand.bed``  - windows below the cutoff, if ``args.rand_bed``
        ``.fa``        - window sequences, unless ``args.no_fasta``
        ``_fire.txt``  - window name / score table

    Args:
        args: parsed command-line namespace. Attributes read: ``seed``,
            ``fasta``, ``bedfile``, ``wsize``, ``stepsize``,
            ``perc_overlap``, ``default_score``, ``rand_score``,
            ``true_bed``, ``rand_bed``, ``no_fasta`` and ``outpre``.
    """
    # NOTE: the namespace passed by the caller is used as given. Earlier
    # code re-ran ``parser.parse_args()`` here, which silently discarded the
    # argument and tied the function to a module-level ``parser``.

    # figure out random seed
    np.random.seed(args.seed)

    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # discretize the genome by size of window and step of window
    outbed = discretize_genome(args.wsize, args.stepsize, genome)

    # collect the input bed intervals per chromosome
    intervals = {chrm.chrm_name(): it.Intervals() for chrm in genome}
    for feature in inbed:
        intervals[feature["chrm"]].add_interval(
            it.Interval(feature["start"], feature["end"]))

    # figure out which windows overlap enough and which don't
    logging.warning("determining which intervals overlap")
    positive_bed = bed.BedFile()
    negative_bed = bed.BedFile()
    for i, window in enumerate(outbed):
        if i % 10000 == 0:
            logging.warning("Checking interval %s", i)
        window_interval = it.Interval(window["start"], window["end"])
        perc_overlap = intervals[window["chrm"]].check_percent_overlap(
            window_interval)
        if perc_overlap >= args.perc_overlap:
            positive_bed.add_entry(window)
        else:
            negative_bed.add_entry(window)

    # make fire file and fasta: positive windows first, then negative ones
    fire = FIREfile()
    out_fasta = fa.FastaFile()
    scored_sets = (
        (positive_bed, args.default_score),
        (negative_bed, args.rand_score),
    )
    for windows, score in scored_sets:
        for feature in windows:
            this_name = feature["name"]
            fire.add_entry(this_name, score)
            window_seq = genome.pull_entry(feature["chrm"]).pull_seq(
                feature["start"], feature["end"])
            out_fasta.add_entry(fa.FastaEntry(">" + this_name, window_seq))

    # write files
    if args.true_bed:
        positive_bed.write_bed_file(args.outpre + "_true.bed")
    if args.rand_bed:
        negative_bed.write_bed_file(args.outpre + "_rand.bed")
    if not args.no_fasta:
        # Must be opened for writing: the default mode is read-only, which
        # fails when the file does not exist and rejects writes when it does.
        with open(args.outpre + ".fa", mode="w") as outf:
            out_fasta.write(outf)
    fire.write(args.outpre + "_fire.txt")
default=25, help="Step size for dense mode, default = 25") parser.add_argument('--perc_overlap', type=float, default=0.5, help="Overlap cutoff for dense mode, default = 0.5") # parse arguments args = parser.parse_args() if args.dense: dense_sampling_main(args) sys.exit() # figure out random seed np.random.seed(args.seed) # read in genome genome = fa.FastaFile() logging.warning("reading in full genome") with open(args.fasta) as inf: genome.read_whole_file(inf) # read in bed file inbed = bed.BedFile() logging.warning("reading in bed") inbed.from_bed_file(args.bedfile) # check how much of the genome the regions cover genome_length = {} total_length = 0 for chrm in genome: genome_length[chrm] = len(chrm) total_length += len(chrm)
import argparse

# Command-line interface. The text is passed as ``description``: the first
# positional parameter of ArgumentParser is ``prog``, so passing the sentence
# positionally made it show up as the program name in the usage line.
parser = argparse.ArgumentParser(
    description="Combine fasta files into one fasta file")
parser.add_argument('outfilepre', type=str, help="prefix for output files")
parser.add_argument('infiles', type=str, nargs='+', help="files to combine")
parser.add_argument(
    '--masked_regions', type=str,
    help="A .bed file containing a list of regions to replace with Ns")
args = parser.parse_args()

genome_name = args.outfilepre
fasta_files = args.infiles

# NOTE(review): ``fa`` is not imported in the visible part of this script;
# presumably the project's fasta module is imported elsewhere - confirm.

# read in all fastas, one FastaFile per input path
all_fastas = [fa.FastaFile() for _ in fasta_files]
for fafile, fafilename in zip(all_fastas, fasta_files):
    with open(fafilename, mode="r") as inf:
        fafile.read_whole_file(inf)

# merge every entry, in input order, into a single FastaFile
final_fasta = fa.FastaFile()
for fafile in all_fastas:
    for entry in fafile:
        final_fasta.add_entry(entry)

# length of each combined entry, keyed by its chromosome name
lengths = {}
for entry in final_fasta:
    chrm_name = entry.chrm_name()
    lengths[chrm_name] = len(entry)

# Replace masked regions with Ns