def write_sequence(args):
    """Write every requested region of ``args.fasta`` to stdout or to files.

    Regions come from ``split_regions(args)``; when it yields none, every
    sequence key in the FASTA file is written instead. With
    ``args.split_files`` set, each region goes to its own file named from
    the region name, coordinates and the input file's extension.

    Raises:
        FetchError: re-raised with a hint to try ``--lazy`` appended.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    fasta = Fasta(args.fasta,
                  default_seq=args.default_seq,
                  strict_bounds=not args.lazy,
                  split_char=args.delimiter)
    try:
        regions_to_fetch, split_function = split_regions(args)
        if not regions_to_fetch:
            regions_to_fetch = tuple(fasta.keys())
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.split_files:
                # open output file based on sequence name, keeping only
                # alphanumerics and the characters listed in keepcharacters
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename
                                   if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            else:
                outfile = sys.stdout
            try:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
            except FetchError as e:
                raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
            finally:
                # Close per-region files even when the fetch failed;
                # stdout is never closed here.
                if args.split_files:
                    outfile.close()
    finally:
        # Release the FASTA handle on success and on error alike.
        fasta.__exit__()
def test_split_seq(self):
    """Fetch a spliced sequence block by block and compare with the expected gene."""
    genome = Fasta('data/chr17.hg19.part.fa')
    gene = Fasta("data/gene.bed12.fasta")
    first_key = list(gene.keys())[0]
    expect = gene[first_key][:].seq

    bed = "data/gene.bed12"
    with open(bed) as handle:
        record = handle.readline().strip().split("\t")

    chrom = record[0]
    start = int(record[1])
    strand = record[5]

    # parse bed12 format: fields 11 and 10 are comma-terminated lists of
    # block offsets and block sizes, so the trailing empty item is dropped
    offsets = [int(field) for field in record[11].split(",")[:-1]]
    sizes = [int(field) for field in record[10].split(",")[:-1]]
    starts = [start + offset for offset in offsets]
    ends = [block_start + size for block_start, size in zip(starts, sizes)]

    # bed half-open
    if strand == "-":
        starts = [block_start + 1 for block_start in starts]
    else:
        ends = [block_end - 1 for block_end in ends]

    intervals = zip(starts, ends)
    result = genome.get_spliced_seq(chrom, intervals, rc=True)
    print(result.seq)
    print("====")
    print(expect)
    assert result.seq == expect
def fetch(args):
    """Write the regions named in ``args.regions`` / ``args.list`` to stdout.

    Each region is ``name`` or ``name:start-end`` (start is converted to a
    0-based slice start). A region whose interval cannot be parsed falls
    back to the whole sequence. ``args.complement`` and ``args.reverse``
    transform the sequence, and ``args.name`` adds a ``>name`` header line.
    Output lines are wrapped to the line length recorded in the index.
    """
    fasta = Fasta(args.fasta)
    try:
        regions = args.regions
        if args.list:
            with args.list as listfile:
                regions.extend(region.rstrip() for region in listfile)
        for region in regions:
            region = region.split()[0]
            try:
                rname, interval = region.split(':')
            except ValueError:
                rname = region
                interval = None
            try:
                start, end = interval.split('-')
                sequence = fasta[rname][int(start) - 1:int(end)]
            except (AttributeError, ValueError):
                # No interval (None has no .split) or a malformed one:
                # fetch the complete record.
                sequence = fasta[rname][:]
            if args.complement:
                sequence = sequence.complement
            if args.reverse:
                sequence = sequence.reverse
            line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
            if args.name:
                sys.stdout.write('>' + sequence.name + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                sys.stdout.write(line)
    finally:
        # Close the FASTA handle even when a lookup fails part-way through.
        fasta.close()
def main(options):
    """Print, per reference sequence, a per-base transcript-strand string.

    For every key of the reference FASTA a ``>key`` header is printed,
    followed by one line in which each position is ``MAP[code]`` with
    ``code = 1 * on_plus + 2 * on_minus``. Transcripts are looked up under
    ``"chr" + key`` in the result of ``read_strand_file(options.strand)``;
    each transcript is indexed as ``(strand, start, end)``. Progress
    messages go to stderr.
    """
    transcripts = read_strand_file(options.strand)
    ref = Fasta(options.ref)
    for chrom in ref.keys():
        print(chrom, file=sys.stderr)
        print(">" + chrom)
        # Allocate boolean masks directly instead of via a Python list of
        # the same length; this also keeps the dtype boolean for an empty
        # record so the MAP lookup below still works.
        chrom_len = len(ref[chrom])
        plus = np.zeros(chrom_len, dtype=bool)
        minus = np.zeros(chrom_len, dtype=bool)
        for ti, transcript in enumerate(transcripts["chr" + chrom]):
            if not ti % 1000:
                print("\r" + chrom + ":trans" + str(ti), file=sys.stderr)
            if transcript[0] == "+":
                plus[transcript[1]:transcript[2]] = True
            elif transcript[0] == "-":
                minus[transcript[1]:transcript[2]] = True
        print(chrom + ":writing", file=sys.stderr)
        chrom_tx_strand = "".join(MAP[1 * plus + 2 * minus])
        print(chrom_tx_strand)
        print(chrom + ":done", file=sys.stderr)
def calc_bkgd_counts(fasta_filename, region_size_min, region_size_max,
                     ignore_chroms, only_chroms, verbose):
    '''
    Count every substring of each allowed length in a FASTA file.

    For each sequence not excluded by ``ignore_chroms`` / ``only_chroms``,
    every start offset is sliced once for each size in
    ``region_size_min..region_size_max`` (inclusive) and the slice is
    counted. Slices shorter than the requested size are dropped afterwards.
    ``verbose`` is accepted but not used here.

    Returns:
        ``defaultdict`` mapping region size -> ``Counter`` of substring -> count.
    '''
    nuc_counts = defaultdict(Counter)
    fasta = Fasta(fasta_filename, as_raw=True)
    for chrom in fasta.keys():
        # skip data based on specified chromosomes
        if chrom in ignore_chroms:
            continue
        if only_chroms and chrom not in only_chroms:
            continue

        record = fasta[chrom]
        seq_len = len(record)
        for idx in range(seq_len + 1):
            for region_size in range(region_size_min, region_size_max + 1):
                nucs = record[idx:idx + region_size]
                nuc_counts[region_size][nucs] += 1

    # remove entries that are not equal to region_size; iterate over a
    # snapshot of the keys because popping while iterating the live dict
    # raises RuntimeError on Python 3
    for region_size, nuc_dict in nuc_counts.items():
        for nuc in list(nuc_dict):
            if len(nuc) != region_size:
                nuc_dict.pop(nuc)
    return nuc_counts
def test_renamed(self):
    """
    Check if sequences in a FASTA file are properly renamed.

    Renames the input, applies the reverse renaming to the result and
    checks that the sequence names match the original ones, then checks
    that a missing name raises MissingSeqNameError.
    """
    renamer = bioformats.seqname.FastaSeqRenamer()
    renamer.read_renaming_dict(self.__renaming_dict)
    try:
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(self.__fasta):
                output_fasta.write(line)

        # perform the reverse renaming with the dedicated renamer (it was
        # previously created but never used)
        rev_renamer = bioformats.seqname.FastaSeqRenamer()
        rev_renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__rev_output, "w") as rev_output_fasta:
            for line in rev_renamer.renamed(self.__output, reverse=True):
                rev_output_fasta.write(line)

        # compare the original and reverse-renamed FASTA files; comparing
        # whole lists also catches a differing number of sequences
        original_fasta = Fasta(self.__fasta)
        rev_renamed_fasta = Fasta(self.__rev_output)
        self.assertEqual(list(original_fasta.keys()),
                         list(rev_renamed_fasta.keys()))

        # check if the missing sequence exception is raised
        del renamer.renaming_dict["seq2"]
        with self.assertRaises(MissingSeqNameError):
            for _ in renamer.renamed(self.__fasta):
                pass
    finally:
        # remove the temporary outputs even when an assertion failed
        for path in (self.__output, self.__rev_output):
            if os.path.exists(path):
                os.unlink(path)
def generate_sizes(name, genome_dir):
    """Write ``<genome_dir>/<name>/<name>.fa.sizes`` with one
    ``name<TAB>length`` line per sequence of the matching ``.fa`` file."""
    fasta_path = os.path.join(genome_dir, name, "{}.fa".format(name))
    sizes_path = fasta_path + ".sizes"
    genome = Fasta(fasta_path)
    with open(sizes_path, "w") as handle:
        handle.writelines(
            "{}\t{}\n".format(seqname, len(genome[seqname]))
            for seqname in genome.keys()
        )
def main(options):
    """
    Print the reference in FASTA form with every match of the
    module-level ``motif`` replaced by ``motif_length`` N characters.
    Sequences are upper-cased before matching.
    """
    reference = Fasta(options.ref)
    pattern = re.compile(motif)
    for chrom in reference.keys():
        print(">" + chrom)
        upper_seq = reference[chrom][:].seq.upper()
        masked = pattern.sub("N" * motif_length, upper_seq)
        print(masked)
def get_prot_lens(faa_file, phage):
    """Return a dict mapping each record's locus tag to its sequence length.

    The locus tag is derived from the record key with ``get_locus_tag``,
    using the digit count reported by ``get_digits(faa_file)``.
    """
    digits = get_digits(faa_file)
    proteins = Fasta(faa_file)
    lengths = {}
    for key in proteins.keys():
        tag = get_locus_tag(key, digits=digits, phage=phage)
        lengths[tag] = len(str(proteins[key]))
    return lengths
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Filter fasta file based on regex.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.
    outfa : str
        Filename of output fasta file. Cannot be the same as infa.
    regex : str, optional
        Regular expression used for selecting sequences.
    v : bool, optional
        If set to True, select all sequence *not* matching regex.
    force : bool, optional
        If set to True, overwrite outfa if it already exists.

    Returns
    -------
    fasta : Fasta instance
        pyfaidx Fasta instance of newly created file

    Raises
    ------
    ValueError
        If infa and outfa are the same, if outfa exists and force is not
        set, or if no sequence is left after filtering.
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    if os.path.exists(outfa):
        if force:
            os.unlink(outfa)
            if os.path.exists(outfa + ".fai"):
                os.unlink(outfa + ".fai")
        else:
            raise ValueError(
                "{} already exists, set force to True to overwrite".format(outfa))

    filt_function = re.compile(regex).search
    fa = Fasta(infa, filt_function=filt_function)
    seqs = fa.keys()
    if v:
        # Invert the selection. A set makes each membership test O(1)
        # instead of a scan over all matching names.
        matched = set(seqs)
        original_fa = Fasta(infa)
        seqs = [s for s in original_fa.keys() if s not in matched]
        fa = original_fa

    if len(seqs) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as out:
        for chrom in seqs:
            record = fa[chrom]
            out.write(">{}\n".format(record.name))
            out.write("{}\n".format(record[:].seq))

    return Fasta(outfa)
def write_read_lengths_to_file(read_fasta_files, output_file):
    """Write a TSV listing the length of every record in the given FASTA files.

    The output has the header ``fasta_file  seq_id  read_len`` and one row
    per record; ``fasta_file`` is the part of the path after the last ``/``.
    """
    # The context manager closes (and flushes) the output even when one
    # of the FASTA files cannot be read.
    with open(output_file, "w") as out:
        out.write("fasta_file\tseq_id\tread_len\n")
        for path in list(read_fasta_files):
            records = Fasta(path)
            fasta = path.split("/")[-1]
            for sequence in records.keys():
                length = len(str(records[sequence]))
                out.write("%s\t%s\t%s\n" % (fasta, sequence, length))
def binding_sites(kmer, genome_fp):
    """Return ``{record: indices}`` for ``kmer`` and its reverse complement.

    For each record of the genome, the result of
    ``substr_indices(kmer, seq)`` is extended with the result for
    ``revcomp(kmer)``.

    Raises:
        ValueError: if the resulting dict is empty.
    """
    genome = Fasta(genome_fp)
    kmer = str(kmer)
    locations = {}
    for record in genome.keys():
        seq = str(genome[record])
        hits = substr_indices(kmer, seq)
        # append reversed primer locations as well
        hits += substr_indices(revcomp(kmer), seq)
        locations[record] = hits
    # NOTE(review): every record gets an entry, so this only triggers for
    # a genome without records, not when no site is found - confirm intent.
    if not locations:
        raise ValueError(
            "No locations for {} found in fg genome!".format(kmer))
    return locations
def fillgaps(consensusdict, fasta):
    """Overwrite slices of a mutable FASTA file with consensus sequences.

    Keys of ``consensusdict`` are ``"start:end"`` strings. For every
    sequence in the file and every key, the slice ``[start:end]`` is
    replaced by the mapped value, after asserting that the slice length
    equals ``end - start``. Returns None.
    """
    print("filling consensus...")
    scaffolds = Fasta(fasta, mutable=True)
    for chrom in scaffolds.keys():
        for span in consensusdict.keys():
            fields = span.split(":")
            begin = int(fields[0])
            stop = int(fields[1])
            assert (stop - begin) == len(scaffolds[chrom][begin:stop].seq)
            scaffolds[chrom][begin:stop] = consensusdict[span]
    return None
def chromosome_ends(genome_fp):
    '''
    Return ``{record: [start, end]}`` giving each record's 0-based,
    inclusive position in the concatenation of all records in file order
    (the second record starts at the length of the first, and so on).
    '''
    genome = Fasta(genome_fp)
    chr_ends = {}
    offset = 0
    for record in genome.keys():
        length = len(genome[record])
        chr_ends[record] = [offset, offset + length - 1]
        offset += length
    return chr_ends
class TestFeatureKeyFunction:
    """Tests for indexing data/genes.fasta through a ``key_function``."""

    def __init__(self):
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys are the names produced by get_gene_name."""
        expect = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3',
            'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
            'NM_001282549.1', 'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        assert sorted(self.genes.keys()) == expect

    def test_key_function_by_dictionary_get_key(self):
        """Dictionary-style access works with the derived key."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fetched = self.genes['MDM4'][100 - 1:150]
        assert str(fetched) == expect

    def test_key_function_by_fetch(self):
        """Faidx.fetch works with the derived key."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fetched = self.faidx.fetch('MDM4', 100, 150)
        assert str(fetched) == expect

    @raises(ValueError)
    def test_duplicated_keys(self):
        """A key function that yields duplicate keys raises ValueError."""
        Fasta(self.fasta, key_function=get_duplicated_gene_name)
def generate_gap_bed(fname, outname):
    """ Generate a BED file with gap locations.

    A gap is a maximal run of ``N`` characters; each run is written as
    ``chrom<TAB>start<TAB>end`` using the regex match offsets.

    Parameters
    ----------
    fname : str
        Filename of input FASTA file.
    outname : str
        Filename of output BED file.
    """
    genome = Fasta(fname)
    with open(outname, "w") as bed:
        for chrom in genome.keys():
            sequence = genome[chrom][:].seq
            for gap in re.finditer(r'N+', sequence):
                row = "{}\t{}\t{}\n".format(chrom, gap.start(0), gap.end(0))
                bed.write(row)
def __init__(self, remote=False):
    """Set ``self.ref`` to a local Fasta reference or a remote one.

    With ``remote`` false the local file at ``settings.REFERENCE_PATH`` is
    tried first and an IOError falls back to
    ``_RemoteReference(settings.BUILD)``. A ``get`` callable that tolerates
    a present or absent "chr" prefix is then attached to ``self.ref``.
    """
    if remote:
        self.ref = _RemoteReference(settings.BUILD)
    else:
        try:
            self.ref = Fasta(settings.REFERENCE_PATH)
        except IOError:
            self.ref = _RemoteReference(settings.BUILD)

    # Add a get method. This will not be sensitive to "chr" prefixes.
    def get(fasta, chrom):
        has_prefix = chrom.startswith("chr")
        try:
            return fasta[chrom]
        except KeyError:
            pass

        try:
            if has_prefix:
                # There was a prefix, so retry without it.
                return fasta[chrom[3:]]
            # There was no prefix, so retry with it.
            return fasta["chr{}".format(chrom)]
        except KeyError:
            # If it is a true mismatch, we return None.
            return None

    self.ref.get = functools.partial(get, self.ref)
def test_get_seq_rc(self):
    """Check get_seq with and without the rc (reverse complement) argument."""
    genome = Fasta('data/chr17.hg19.part.fa')

    forward = genome.get_seq("chr17", 11, 20, rc=False)
    expect_forward = "CCCTGTTCCT"
    print("normal")
    print(forward.seq)
    print(expect_forward)
    assert forward.seq == expect_forward

    reverse = genome.get_seq("chr17", 11, 20, rc=True)
    expect_reverse = "AGGAACAGGG"
    assert reverse.seq == expect_reverse
    print("rc")
    print(reverse.seq)
    print(expect_reverse)
def readFASTA(x, splitKey = None):
    """
    Load sequences from a FASTA file, or pass a literal sequence through.

    If ``x`` names an existing file it is opened with ``Fasta``, keyed by
    the first whitespace-delimited word of each header. With ``splitKey``
    set, a plain dict is returned instead whose keys are the part of each
    key before the first ``splitKey``. If ``x`` is not a file, ``x`` itself
    is returned unchanged.

    Raises:
        TypeError: if ``x`` is not a string.
    """
    # isinstance also accepts str subclasses, which the exact type check
    # rejected.
    if not isinstance(x, str):
        raise TypeError("input must be type str. filename or sequence")

    if not os.path.isfile(x):
        return x

    tmp_o = Fasta(x, key_function=lambda key: key.split()[0])
    if splitKey is None:
        return tmp_o
    return {i.split(splitKey)[0]: tmp_o[i] for i in tmp_o.keys()}
def fasta_stats(fasta_fp):
    """
    Retrieves the number of bases and number of records in a FASTA file.
    Also creates a FASTA index (.fai) for later searching. May be slow for
    very large files.

    Returns:
        Tuple ``(length, nrecords)``.

    Raises:
        Whatever the underlying readers raise, after an error message has
        been printed.
    """
    # pyfaidx can't handle blank lines within records, so we have to check :(
    check_empty_lines(fasta_fp)
    try:
        fasta = Fasta(fasta_fp)
        length = fasta_len_quick(fasta_fp)
        nrecords = len(fasta.keys())
        return length, nrecords
    except Exception:
        # Only genuine errors are reported as a format problem; Ctrl-C and
        # SystemExit pass through without the misleading message.
        click.secho(
            "\nError reading %s: invalid FASTA format?" % fasta_fp, fg="red")
        raise
def test_get_sequence_bed(self):
    """get_sequence returns the expected bases for a BED-style record."""
    bed_record = ["chr2", 13, 27, "ENSMUST7", ".", "+"]
    expected = "GGATGGTGTGGTAG"
    with Fasta("tests/test_genome.fa") as pf_genome:
        observed = get_sequence(
            bed_record, pf_genome, bed_input=True, strip_chr=True)
        self.assertEqual(observed, expected)
def test_keys(self):
    """Splitting headers on '|' and dropping duplicates yields these keys."""
    expect = [
        '530364724', '530364725', '530364726', '530373235', '530373237',
        '530384534', '530384536', '530384538', '530384540', '543583738',
        '543583740', '543583785', '543583786', '543583788', '543583794',
        '543583795', '543583796', '557361097', '557361099', '563317589',
        'AB821309.1', 'KF435149.1', 'KF435150.1', 'NM_000465.3',
        'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
        'NM_001282549.1', 'NR_104212.1', 'NR_104215.1', 'NR_104216.1',
        'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
        'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
        'XR_241079.1', 'XR_241080.1', 'XR_241081.1', 'dbj',
    ]
    genes = Fasta('data/genes.fasta', split_char='|', duplicate_action="drop")
    assert sorted(genes.keys()) == expect
def Find_Random_panhandles(path_to_intervals, energy_threshold, handle_length_threshold, panhandle_length_threshold, k, genome_file, threads, need_suboptimal, kmers_stacking_matrix, N_seeds, strandness, what):
    """Run the panhandle search on shuffled intervals, once per seed.

    Reads a tab-separated interval table, attaches sequences from
    ``genome_file`` when the table has no ``sequences`` column, and then,
    for each seed in ``1..N_seeds``, shuffles the table with
    ``Shuffle(df, what, seed)``, writes the per-seed files under
    ``../out/`` and maps ``Find_panhandles_one_gene`` over the unique
    genes with a pool of ``threads`` processes.

    Returns 0 after printing the elapsed time.
    """
    start_time = time.time()
    df = pd.read_csv(path_to_intervals, sep='\t')
    # Composite identifiers used to group rows by gene and by interval.
    df["gene_chr_start_end_strand"] = df.chr + "_" + df.start_gene.map(str) + "_" + df.end_gene.map(str) + "_" + df.strand
    df["interval_chr_start_end_strand"] = df["chr"] + "_" + df["start_interval"].map(str) + "_" + df["end_interval"].map(str) + "_" + df["strand"]
    df['start_interval'] = df['start_interval'].astype(int)
    df['end_interval'] = df['end_interval'].astype(int)
    if not ('sequences' in list(df.columns.values)):
        print('Attaching sequences..')
        genome = Fasta(genome_file)
        GetSequencesForDF2 = partial(GetSequencesForDF, genome)
        df.loc[:, 'sequences'] = df.apply(GetSequencesForDF2, axis=1)
        if strandness:
            print("Making complement of minus strand..")
            df.loc[:, 'sequences'] = df.apply(MakeComplement, axis=1)
        df.to_csv("../out/intervals_with_seqs.tsv", sep='\t', index=False)
    # NOTE(review): on Python 3 ``map`` returns an iterator without a
    # length; confirm that assigning it to the column works with the
    # pandas version in use.
    df.sequences = map(lambda x: x.upper(), df['sequences'])
    for seed in range(1, N_seeds + 1):
        print(seed)
        ##shuffle
        df_new = df.copy()
        df_new = Shuffle(df_new, what, seed)
        df_new["sequences_indxd"] = df_new['sequences'].apply(lambda x: Index_seq(x, k))
        # Rows whose indexing result equals False are discarded.
        df_new = df_new.loc[df_new.sequences_indxd != False]
        print("Creating files..")
        with open('../out/genes_done2.txt', 'w') as done_f:
            done_f.write('Started alignment: \n')
        # Empty table: only its header row is written to the per-seed
        # results file here.
        results_one_gene_table = pd.DataFrame(
            {'gene': [], 'energy': [], 'start_al1': [], 'end_al1': [], 'start_al2': [], 'end_al2': [], 'alignment1': [], 'alignment2': [], 'structure': [], 'interval1': [], 'interval2': []})
        with open('../out/random_panhandles' + str(seed) + '.tsv', 'w') as f:
            results_one_gene_table.to_csv(f, sep='\t', index=False, header=True)
        # Create (or truncate) the per-seed counts file.
        with open('../out/counts_close_' + str(seed) + '.txt', 'w') as f:
            f.write('')
        print('Start to align..')
        p = mp.Pool(processes=threads)
        print('Created pool')
        m = mp.Manager()
        print('Created manager')
        # The managed lock is passed to every worker call.
        lock = m.Lock()
        print('Created lock')
        f_shuffled = '../out/intervals_shuffled_' + str(seed) + '.tsv'
        df_new.to_csv(f_shuffled, sep='\t', index=False, header=True)
        Find_panhandles_one_gene2 = partial(Find_panhandles_one_gene, lock, df_new, energy_threshold, handle_length_threshold, panhandle_length_threshold, k, need_suboptimal, kmers_stacking_matrix, seed)
        print('Created partial')
        genes = df_new["gene_chr_start_end_strand"].unique()
        print('Created genes')
        p.map(Find_panhandles_one_gene2, genes)
        p.close()
        p.join()
    print("all done!")
    print(time.time() - start_time)
    return (0)
def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type):
    """Write the gene sequences of every reference chromosome to its file.

    For each entry of ``ref_chroms`` the handle returned by
    ``get_fasta_out`` receives, via ``write_gene_sequences_to_file``, the
    parents from ``parent_dict`` sorted by ``seqid``. For the "unplaced"
    liftover type, ``<args.dir>/unplaced_genes.fa`` is created or emptied
    first.
    """
    fai = Fasta(args.r)
    if liftover_type == "unplaced":
        # Create/truncate the file and release the handle right away.
        open(args.dir + "/unplaced_genes.fa", 'w').close()
    # The ordering does not depend on the chromosome, so sort only once.
    sorted_parents = sorted(parent_dict.values(), key=lambda x: x.seqid)
    for chrom in ref_chroms:
        fasta_out = get_fasta_out(chrom, args.r, liftover_type, args.dir)
        try:
            write_gene_sequences_to_file(chrom, args.r, fai, sorted_parents, fasta_out)
        finally:
            # Close the output even when writing fails.
            fasta_out.close()
def from_linear_reference(cls, fasta_file_name, reference_name="ref", k=15, only_store_kmers=False):
    """Build an instance from one record of a FASTA file.

    The record named ``reference_name`` is read as a string and handed to
    ``cls.from_sequence`` together with ``k`` and ``only_store_kmers``.
    """
    logging.info("Only store kmers? %s" % only_store_kmers)
    logging.info("k=%d" % k)
    reference = Fasta(fasta_file_name)
    genome_sequence = str(reference[reference_name])
    return cls.from_sequence(genome_sequence, k, only_store_kmers)
def __init__(self, filename):
    """
    Load the peptide FASTA, the signalp lines and the manual annotation
    table for ``filename``.

    filename example: pacbio_new_gene_model.all_phase_peptide
    """
    self.filename = filename
    fasta_name = filename + ".fasta"
    self.peptidefasta = Fasta("../data/pacbio/" + fasta_name)
    print(self.peptidefasta)
    self.signalplines = get_lines("../data/pacbio", fasta_name + ".signalp")
    manual_table = "../data/pacbio/pacbio_new_gene_model.tab"
    self.position2manualinfo = get_position2manualinfo(manual_table)
def process_txt(self):
    """
    Count mutation subtypes per sample from a tab-delimited text file
    with the columns: CHR POS REF ALT SAMPLE_ID

    Only rows whose REF and ALT are single characters are counted, and
    only when the derived subtype is a key of ``self.subtypes_dict``.

    Returns:
        namedtuple ``Out(M, samples)``: the sample-by-subtype count matrix
        and the sample names in the matrix's row order.
    """
    fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
    nbp = (self.args.length - 1) // 2
    samples_dict = {}
    current_chrom = '0'

    with open(self.args.input, 'r') as txt_file:
        for row in csv.reader(txt_file, delimiter='\t'):
            chrom = row[0]
            pos = int(row[1])
            ref = row[2]
            alt = row[3]
            sample = row[4]

            # Every sample starts with a zero count for each known subtype.
            if sample not in samples_dict:
                samples_dict[sample] = self.subtypes_dict.fromkeys(
                    self.subtypes_dict, 0)

            # Look the record up again only when the chromosome changes.
            if chrom != current_chrom:
                sequence = fasta_reader[chrom]
                current_chrom = chrom

            if len(alt) != 1 or len(ref) != 1:
                continue

            category = getCategory(ref + alt)
            if nbp > 0:
                lseq = sequence[pos - (nbp + 1):pos + nbp].seq
            else:
                lseq = sequence[pos - 1].seq
            subtype = str(category + "." + getMotif(lseq))
            if subtype in self.subtypes_dict:
                samples_dict[sample][subtype] += 1

    mdf = pd.DataFrame(samples_dict).T.fillna(0)
    # Take the names from the dataframe index so they match the rows of M
    # (sorting samples_dict separately leads to mismatching).
    samples = mdf.index.tolist()
    M = mdf.values
    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
    return out
def test_fetch_whole_fasta(self):
    """Iterating the gzipped FASTA yields the same sequence lines as the
    non-header lines of the plain-text file."""
    # Use a context manager so the reference file handle is closed.
    with open('data/genes.fasta') as handle:
        expect = [line.rstrip('\n') for line in handle if line[0] != '>']
    result = list(
        chain(*([line for line in record]
                for record in Fasta('data/genes.fasta.gz', as_raw=True))))
    assert expect == result
def aligned_long_reads(input_sam_file, reference, output):
    """Merge paired-end SAM records into one sequence per pair.

    Get transcript sequences from 'paired-ended' reads, reference genome
    required. For each non-header line whose FLAG is "99" or "163", the
    following line is consumed as its mate (expected FLAG "147" or "83"
    respectively, with the same ID). The merged sequence is written to
    ``output`` as ``ID<TAB>sequence<TAB>length<TAB>forward|reverse``.

    NOTE(review): relies on Python 2 behaviour (``f1.next()`` and string
    operations on a file opened in 'rb' mode).
    """
    ref = Fasta(reference)
    with open(output,'w') as f2:
        with open(input_sam_file,'rb') as f1:
            for line in f1:
                # Lines starting with "@" are skipped.
                if not line.startswith("@"):
                    alignment = line.strip().split('\t')
                    ID = alignment[0]
                    FLAG = alignment[1]
                    chromesome = alignment[2]
                    left = int(alignment[3])
                    Read_length = len(alignment[9])
                    PE_size = int(alignment[8])
                    Read = Seq(alignment[9])
                    if FLAG == "99":
                        first_sequence = Read
                        # Consume the next line as the mate of this read.
                        next_alignment = f1.next().strip().split('\t')
                        next_ID = next_alignment[0]
                        next_FLAG = next_alignment[1]
                        next_sequence = Seq(next_alignment[9])
                        next_MDZ = next_alignment[12].split(':')[-1]
                        if next_ID == ID and next_FLAG == "147":
                            # The constants 80 and 160 presumably assume
                            # 80 bp reads - TODO confirm.
                            if PE_size <= 160:
                                head_length = tail_length = PE_size - 80
                                shared_length = 160 - PE_size
                                # Merge only when the overlapping bases agree.
                                if first_sequence[80 - shared_length:80] == next_sequence[0:shared_length]:
                                    sequence = first_sequence[0:head_length] + next_sequence
                                else:
                                    continue
                            else:
                                # Fill the gap between the mates from the
                                # reference sequence.
                                gap = PE_size - 160
                                sequence = first_sequence + Seq(ref[chromesome][left-1 + 80: left-1 + 80 + gap].seq) + next_sequence
                            size = len(sequence)
                            f2.write('%s\t%s\t%d\t%s\n' %(ID, sequence, size, "forward"))
                        else:
                            continue
                    elif FLAG == "163":
                        first_sequence = Read
                        # Consume the next line as the mate of this read.
                        next_alignment = f1.next().strip().split('\t')
                        next_ID = next_alignment[0]
                        next_FLAG = next_alignment[1]
                        next_sequence = Seq(next_alignment[9])
                        if next_ID == ID and next_FLAG == "83":
                            if PE_size <= 160:
                                head_length = tail_length = PE_size - 80
                                shared_length = 160 - PE_size
                                if first_sequence[80 - shared_length:80] == next_sequence[0:shared_length]:
                                    # Same merge as above, then
                                    # reverse-complemented.
                                    sequence = (first_sequence[0:head_length] + next_sequence).reverse_complement()
                                else:
                                    continue
                            else:
                                gap = PE_size - 160
                                sequence = (first_sequence + Seq(ref[chromesome][left-1 + 80: left-1 + 80 + gap].seq) + next_sequence).reverse_complement()
                            size = len(sequence)
                            f2.write('%s\t%s\t%d\t%s\n' %(ID, sequence, size, "reverse"))
                        else:
                            continue
def test_slice_whole_entry(self):
    """Stepped slicing of a whole record matches Biopython's result.

    Skipped when ``test_bio`` is false.
    """
    fasta = Fasta('data/genes.fasta')
    if not test_bio:
        raise SkipTest
    # Plain "r" instead of "rU": the "U" flag is rejected by Python 3.11+,
    # and text mode already applies universal newlines on Python 3.
    with open('data/genes.fasta', "r") as fh:
        seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
    assert str(fasta['gi|557361099|gb|KF435150.1|'][::3]) == str(
        seqio['gi|557361099|gb|KF435150.1|'].seq[::3])
def test_reverse_missing_ref(self):
    ''' check that reverse_var leaves this variant's ref, alts and pos unchanged
    '''
    genome = Fasta(self.fa)
    variant = self.Var(chrom='N', pos=1, ref='N', alts=['.'])
    reversed_variant = reverse_var(variant, genome)

    self.assertEqual(reversed_variant.ref, 'N')
    self.assertEqual(reversed_variant.alts, ['.'])
    self.assertEqual(reversed_variant.pos, 1)
def splitGenome(self):
    """Write every record of ``self.fastaFileName`` to its own
    ``<STANDARD_GENOME_PATH><name>.fa`` file, wrapped at 70 columns."""
    genome = Fasta(self.fastaFileName)
    for record in genome:
        target = '{}{}.fa'.format(STANDARD_GENOME_PATH, record.name)
        with open(target, 'w') as out:
            out.write('>{}\n'.format(record.name))
            for chunk in wrap_sequence(70, str(record)):
                out.write(chunk)
    print("<<<<<<<Splitted>>>>>>")
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write each target chromosome to ``<inter_files>/<chrom>.fa``.

    Records are keyed by the first whitespace-delimited word of their
    header. An entry of ``target_chroms`` equal to ``target_fasta_name``
    is skipped.

    Returns:
        The ``Fasta`` object opened on ``target_fasta_name``.
    """
    Faidx(target_fasta_name)
    target_fasta_dict = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # The context manager flushes and closes each output file;
            # the handles were previously left open.
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta_dict[chrm]))
    return target_fasta_dict
class FastaWrapper(GenomeWrapper):
    """Genome wrapper backed by a FASTA file.

    After construction ``self.fasta`` is one of:
      * an open ``Fasta`` object (default),
      * a dict of per-record character arrays (``in_mem=True``),
      * the file name (``thread_safe=True``), in which case every read
        opens the file itself.
    """

    def __init__(self, fasta_file, alpha='dna', one_hot=True, channel_last=True,
                 in_mem=False, thread_safe=False, read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
        self.fasta = Fasta(fasta_file, as_raw=True, sequence_always_upper=True,
                           read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead
        if in_mem:
            fasta_onehot_dict = self._encode_seqs(self.fasta)
            self.fasta.close()
            self.fasta = fasta_onehot_dict
            # No file handle is kept once everything is in memory.
            self.thread_safe = True
        else:
            if thread_safe:
                # Keep only the path; _get_seq opens the file per call.
                self.fasta.close()
                self.fasta = fasta_file

    def close(self):
        """Close the FASTA handle when this wrapper still holds one."""
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        """Load every record into a dict of character arrays keyed by name."""
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            seq = record[:]
            seq = np.array(list(seq))
            fasta_dict[record.name] = seq
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        """Return ``chrom[start:stop]`` as an array of characters."""
        if self.in_mem:
            seq = self.fasta[chrom][start:stop]
        else:
            if self.thread_safe:
                fasta = Fasta(self.fasta, as_raw=True, sequence_always_upper=True,
                              read_ahead=self.read_ahead)
                try:
                    seq = np.array(list(fasta[chrom][start:stop]))
                finally:
                    # Always release the per-call handle, even on a failed
                    # lookup.
                    fasta.close()
            else:
                seq = np.array(list(self.fasta[chrom][start:stop]))
        return seq
def get_genomic_context(df, genome, n=85):
    """get the genomic context of variants

    necessary input columns: ['CHROM', 'POS'];
    added output columns: ['SEQ', 'SEQ_LONG']

    ``SEQ_LONG`` is the slice ``[POS - (n + 101) : POS + (n + 100)]`` of
    the chromosome and ``SEQ`` is ``SEQ_LONG`` without its first and last
    100 characters. ``df`` is modified in place and returned.
    """
    if len(df.index) == 0:
        # Nothing to annotate; do not open the genome at all.
        return df
    # Open the genome once instead of once per row.
    fasta = Fasta(genome)
    for i in df.index:
        chrom = str(df.at[i, 'CHROM'])
        pos = df.at[i, 'POS']
        seq_long = fasta[chrom][(pos - (n + 101)):(pos + (n + 100))].seq
        df.at[i, 'SEQ_LONG'] = seq_long
        df.at[i, 'SEQ'] = seq_long[100:-100]
    return df
def gc_correct(input, output, reference, frac_n, frac_r, iter, frac_lowess):
    """Run ``correct`` on the BED lines of ``input`` and write the result.

    Each tab-separated input line becomes a ``BedLine`` whose fields have
    been passed through ``attempt_numeric``. Every corrected line is
    written to ``output`` as UTF-8 text followed by a newline.
    """
    fasta = Fasta(reference)
    # Read the input inside a context manager so the handle is closed.
    with open(input) as ihandle:
        bed_lines = [
            BedLine(*map(attempt_numeric, x.split("\t"))) for x in ihandle
        ]
    corrected = correct(bed_lines, fasta, frac_n, frac_r, iter, frac_lowess)
    with open(output, "wb") as ohandle:
        for line in corrected:
            ohandle.write(bytes(str(line) + "\n", 'utf-8'))
def extract_chromosome_data(chromosome_key, start, end):
    """Return the sequence slice ``[start:end]`` of a chromosome.

    Raises:
        Exception: if the chromosome's data file does not exist.
    """
    data_path = path_for_chromosome_data(chromosome_key)
    if data_path.exists():
        genome = Fasta(str(data_path))
        return genome[chromosome_key][start:end].seq
    raise Exception("Chromosome data not downloaded")
def __init__(self, strain, feature):
    """Store strain and feature, run the lookup builders in order and
    open the strain's scaffold FASTA in raw mode."""
    self.strain = strain
    self.feature = feature

    # The builders run in this fixed order.
    self.get_feature_ids()
    self.get_scaffold2len()
    self.get_id2left_right()
    self.get_id2distance()

    scaffold_path = "data/" + self.strain + "_CLC_de_novo_rmhost_mod.fa"
    self.scaffoldfasta = Fasta(scaffold_path, as_raw=True)
def read_names_to_csv(file_pathway, sample_name):
    """
    Write the sorted read names of a FASTA file to a one-column CSV file.

    :param file_pathway: Pathway to Fasta files
    :param sample_name: Name of current sample being loaded (i.e. bio_mom)
    :return: The output filename
    """
    output_filename = f"./read_names/{cross_used}/{sample_name}.csv"
    output_df = pd.DataFrame()

    print(f"-- Loading {sample_name} Fasta file --")
    file_read = Fasta(file_pathway)
    print(f"-- Loaded {sample_name} Fasta file --")

    output_df['names'] = sorted(file_read.keys())

    print(f"-- Output {sample_name} read names to {sample_name}.csv --")
    output_df.to_csv(output_filename, sep=',', index=False)
    print(f'-- {sample_name} finished --')
    return output_filename
def parse_hgvs(hgvs_name, fasta, genes):
    """Parse an HGVS name against a genome and a transcript table.

    The genome's record keys are prefixed with "chr"; transcripts are
    read from the ``genes`` file and looked up by name.
    """
    def add_chr_prefix(key):
        return 'chr{}'.format(key)

    genome = Fasta(fasta, key_function=add_chr_prefix)
    with open(genes) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    return hgvs.parse_hgvs_name(
        hgvs_name, genome,
        get_transcript=lambda name: transcripts.get(name))
def INDEX_GENOME(OUTDIR, GENOME_FILE):
    """Copy the genome's ``.fai`` table to ``OUTDIR`` and return its path.

    The target name is ``GENOME_FILE`` with its extension replaced by
    ``.fai``, joined onto ``OUTDIR``.
    """
    LOGGER.info('Indexing the genome')
    # Opening the genome presumably creates GENOME_FILE.fai when missing;
    # the table is read from that path right below.
    Fasta(GENOME_FILE)
    columns = ['SCAFFOLD', 'SCAFF_LENGTH', 'three', 'four', 'five']
    FAIDX = pd.read_csv(GENOME_FILE + '.fai', sep='\t', names=columns)
    # NOTE(review): the prefix keeps any directory part of GENOME_FILE, so
    # an absolute path makes os.path.join ignore OUTDIR - confirm intent.
    index_name = os.path.splitext(GENOME_FILE)[0] + '.fai'
    INDEX = os.path.join(OUTDIR, index_name)
    FAIDX.to_csv(INDEX, sep='\t', header=False, index=False)
    return INDEX
def post(self):
    """Return the guide documents for the requested genes.

    Reads ``gene_ids``, ``edit`` and ``genome`` from the JSON body. When
    ``edit`` is truthy, the first result additionally gets a ``sequence``
    entry spanning from the smallest exon start to the largest exon end of
    the single requested gene.

    Raises:
        BadRequest: if ``gene_ids`` is empty, the genome is not supported,
            or editing is requested for other than exactly one gene.
    """
    payload = request.get_json(force=True)
    gene_ids = payload['gene_ids']
    edit = payload['edit']
    genome = payload['genome']

    if not gene_ids:  # TODO improve
        raise BadRequest('gene_ids not set')
    if genome not in ['hg19', 'mm10']:
        raise BadRequest(f'{genome} not supported')
    if edit and len(gene_ids) != 1:
        raise BadRequest('gene_ids needs to have length 1 if editing..')

    # TODO here goes all the computation for checking wether SNP and CNSD
    # influence the guides. For now return the 6 best guides
    # Single stage: filter our genes for the requested genome.
    match_stage = {
        '$match': {
            '$and': [
                {'gene_id': {'$in': gene_ids}},
                {'genome': genome},
            ]
        }
    }
    result = list(guide_collection.aggregate([match_stage]))
    if not edit:
        return result

    df = gencode_exons(genome)
    exons = df[(df.gene_id == gene_ids[0])]
    chromosome = exons.seqname.iloc[0]
    # TODO here i have to change things..
    fasta = Fasta(GENOME_FILE.format(GENOME), as_raw=True)
    result[0]['sequence'] = fasta[chromosome][min(exons.start):max(exons.end)]
    return result
class FastaOnehot:
    """One-hot encode the records of a FASTA file over A, C, G, N, T."""

    def __init__(self, file=None, seqlen=1000):
        self.fa = Fasta(file, sequence_always_upper=True, as_raw=True)
        self.l = len(self.fa.keys())  # number of records
        self.seqlen = seqlen  # length every sequence is padded to
        self.dna_encoder = LabelEncoder().fit(array(['A', 'C', 'G', 'N', 'T']))
        self.onehot_encoder = OneHotEncoder(sparse=False).fit(
            array(list(range(0, 5))).reshape(-1, 1))

    def toOnehot(self, chunksize=10000):
        """Yield ``(seqnames, onehot_encoded)`` for chunks of records.

        Sequences are right-padded with 'N' to ``self.seqlen`` before
        encoding.
        """
        k = list(self.fa.keys())
        r = 0
        i = 0
        while r < self.l:
            seqnames = k[i * chunksize:min((i + 1) * chunksize, self.l)]
            # Pad to the configured length; a fixed 1000 disagrees with
            # the dataset shape in toh5 whenever seqlen is not 1000.
            seq = [
                array(list(self.fa[x][:].ljust(self.seqlen, 'N')))
                for x in seqnames
            ]
            int_encoded = [self.dna_encoder.transform(s) for s in seq]
            int_encoded = [s.reshape(len(s), 1) for s in int_encoded]
            onehot_encoded = array(
                [self.onehot_encoder.transform(s) for s in int_encoded])
            r = min((i + 1) * chunksize, self.l)
            i += 1
            print('last record : ' + seqnames[-1])
            yield (seqnames, onehot_encoded)

    def toh5(self, file="onehot.h5", chunksize=10000):
        """Write the record names and the one-hot tensor to an HDF5 file."""
        with h5py.File(file, 'w') as f:
            cs = min(chunksize, self.l)
            f.create_dataset(
                "seqnames",
                data=[np.string_(s) for s in list(self.fa.keys())],
                chunks=(cs, ))
            f.create_dataset("onehot",
                             shape=(self.l, self.seqlen, 5),
                             maxshape=(None, self.seqlen, 5),
                             chunks=(cs, self.seqlen, 5))
            i = 0
            for n, o in self.toOnehot(chunksize=chunksize):
                f["onehot"][(i * cs):min(((i + 1) * cs), self.l)] = o
                i += 1
def raw_error_rate(fig_fn):
    """Write per-read raw and consensus error rates to a text file.

    The output is ``raw_cons_error.out`` in the directory of ``fig_fn``,
    with the header Sample / CopyNum / RawError / ConsError. Finally the
    Rscript command line for plotting is printed (not executed).

    NOTE(review): this function cannot run as written - see the notes
    below on ``cons_ep_fn``, ``iline``, the error-rate division and
    ``ep_fn``.
    """
    n = 0
    tmp_out = os.path.dirname(os.path.abspath(fig_fn)) + '/raw_cons_error.out'
    # NOTE(review): ``cons_ep_fn`` is both a loop target and an argument
    # to zip, so it is unbound when zip is evaluated; a list of file names
    # was presumably intended.
    for sample, read_fn, ref_fn, info_fn, cons_ep_fn in zip(samples, read_fas, ref_fns, cons_info_fns, cons_ep_fn):
        read_fa = Fasta(read_fn)
        ref_fa = Fasta(ref_fn)
        # tmp_out is opened with 'w' for every sample, so each sample
        # overwrites the output of the previous one.
        with open(ref_fn) as ref_fp, open(cons_ep_fn) as cons_ep_fp, open(info_fn) as info_fp, open(tmp_out, 'w') as out_fp:
            out_fp.write('Sample\tCopyNum\tRawError\tConsError\n')
            last_name = ''
            for cons_name in ref_fa.keys():
                # The read name is the part of the key before the first "_".
                read_name = cons_name.rsplit('_')[0]
                # Consecutive keys of the same read are handled only once.
                if read_name == last_name:
                    continue
                copy_num, raw_error, cons_error = 0, 0, 0
                ref_seq = ref_fa[cons_name][:].seq.upper()
                read_seq = read_fa[read_name][:].seq.upper()
                raw_error = get_mp_error_rate(ref_seq, read_seq)
                if raw_error < 0:
                    continue
                # The two scans below consume the shared file iterators,
                # so later cons_name values see no remaining lines.
                for eline in cons_ep_fp:
                    if eline.startswith('#'):
                        continue
                    # NOTE(review): ``iline`` is not defined in this
                    # function; ``eline`` was presumably intended.
                    ele = iline.rsplit()
                    # NOTE(review): the sliced field is a string, so
                    # dividing by 100.0 raises TypeError without float().
                    name, error = ele[ep_idx['#READ_NAME']], ele[ep_idx['ERR_RATE']][:-1]/100.0
                    if name == cons_name:
                        cons_error = error
                    else:
                        continue
                for sline in info_fp:
                    ele = sline.rsplit()
                    name, num = ele[info_idx['CONS_NAME']], ele[info_idx['COPY_NUM']]
                    if name == cons_name:
                        copy_num = int(num)
                    else:
                        continue
                out_fp.write('{}\t{}\t{}\t{}\n'.format(sample, copy_num, raw_error, cons_error))
                last_name = read_name
                n+=1
                # NOTE(review): exits the whole process after ten records;
                # looks like leftover debugging.
                if n== 10:
                    sys.exit(1)
    # NOTE(review): ``ep_fn`` is not defined in this function.
    cmd = 'Rscript /home/gaoy1/program/circ_plot/error_rate.R {} {}'.format(ep_fn, fig_fn)
    print(cmd)
def get_transcripts(reference_file, transcript_file, vcf_file):
    """Copy the reference FASTA to ``transcript_file`` and apply the
    variations from ``vcf_file`` to the copy.

    Each variation replaces the single position ``pos`` (1-based) of its
    accession with ``alt``.

    Raises:
        ValueError: if a variation names an accession missing from the
            reference.
    """
    shutil.copyfile(reference_file, transcript_file)
    transcripts = Fasta(transcript_file, mutable=True)
    with open(vcf_file) as vcf:
        for accession, pos, ref, alt in get_variations(vcf):
            if accession not in transcripts:
                message = 'VCF accession {0} not found in reference'.format(
                    accession)
                raise ValueError(message)
            zero_based = pos - 1
            transcripts[accession][zero_based:pos] = alt
def test_ncbiseqrename_fasta(self):
    """
    Check if NCBI sequence names in a FASTA file are properly changed.

    Runs the ``ncbirenameseq`` command-line entry point (genbank -> ucsc)
    and compares the resulting sequence names with the UCSC reference file.
    """
    saved_argv = sys.argv
    sys.argv = ['', self.__fasta, 'genbank', self.__output, 'ucsc',
                '--chr', self.__chr, '--unloc', self.__unloc,
                '--unpl', self.__unpl, '--fasta']
    try:
        bioformats.cli.ncbirenameseq()
    finally:
        # Restore the real command line so later tests are not affected.
        sys.argv = saved_argv

    # check if the obtained and original files are the same
    original_fasta = Fasta(self.__ucsc_fasta)
    renamed_fasta = Fasta(self.__output)
    for x, y in zip(original_fasta.keys(), renamed_fasta.keys()):
        self.assertEqual(x, y)

    os.unlink(self.__ucsc_fasta + '.fai')
    os.unlink(self.__output)
    os.unlink(self.__output + '.fai')
def read_pep_fa(protein_file):
    """Return a DataFrame with one row per record of a peptide FASTA file.

    The header is split on spaces (at most 8 splits): the first two fields
    become ``protein_id`` and ``protein_type``, the rest are ``key:value``
    pairs. The sequence itself is stored under ``seq``.
    """
    import pandas as pd

    proteins = Fasta(str(protein_file))
    rows = []
    for record in proteins:
        fields = record.long_name.split(" ", 8)
        row = {"protein_id": fields[0], 'protein_type': fields[1]}
        extras = dict([field.split(":", 1) for field in fields[2:]])
        row.update(extras)
        row['seq'] = str(proteins[record.name])
        rows.append(row)
    return pd.DataFrame(rows)
def fasta_extract_regions(fa_fname, intervals):
    """Lazily yield the sequence of each interval from an indexed FASTA file.

    Input: FASTA file name; iterable of (seq_id, start, end)
    Output: iterable of string sequences, each ``record[start:end]``.

    Consecutive intervals sharing a seq_id are logged as one group.
    """
    def seq_id_of(interval):
        return interval[0]

    with Fasta(fa_fname, as_raw=True) as fa_file:
        for chrom, group in groupby(intervals, seq_id_of):
            logging.info("Extracting sequences from chromosome %s", chrom)
            for seq_id, start, end in group:
                yield fa_file[seq_id][start:end]
def processMAF(args, subtypes_dict):
    """Count mutation subtypes per sample from a MAF file.

    Lines starting with ``#`` are skipped and only rows whose
    ``Variant_Type`` is ``SNP`` are counted.  For each such row the sequence
    context around ``Start_position`` is read from the reference FASTA and
    combined with the category of the ref/alt pair into a subtype string,
    which is tallied under the row's ``args.groupvar`` value.

    Args:
        args: Namespace providing ``fastafile``, ``length`` (motif length),
            ``input`` (MAF path) and ``groupvar`` (column used as sample id).
        subtypes_dict: Mapping of the known subtype strings; it is only used
            to reject unknown subtypes.

    Returns:
        A namedtuple ``Out(M, samples)``: ``samples`` is the sorted list of
        sample ids and ``M`` the count matrix with one row per entry of
        ``samples`` (missing subtype counts filled with 0).

    Raises:
        KeyError: If a computed subtype is not a key of ``subtypes_dict``.
    """
    fasta_reader = Fasta(args.fastafile, read_ahead=1000000)
    nbp = (args.length - 1) // 2
    samples_dict = {}

    chrseq = '0'
    counter = 0
    # The context manager closes the MAF file even if a row fails to parse.
    with open(args.input, 'r', encoding="ISO-8859-1") as f:
        reader = csv.DictReader(
            filter(lambda row: row[0] != '#', f), delimiter='\t')
        for row in reader:
            if row['Variant_Type'] != "SNP":
                continue

            pos = int(row['Start_position'])
            ref = row['Reference_Allele']
            alt = row['Tumor_Seq_Allele2']
            sample = row[args.groupvar]

            # Only fetch a new record when the chromosome changes.
            if row['Chromosome'] != chrseq:
                sequence = fasta_reader[row['Chromosome']]
                chrseq = row['Chromosome']

            counter += 1
            mu_type = ref + alt
            category = getCategory(mu_type)
            lseq = sequence[pos - (nbp + 1):pos + nbp].seq

            motif_a = getMotif(pos, lseq)
            subtype = str(category + "." + motif_a)
            # Lookup kept for its side effect: unknown subtypes raise KeyError.
            _ = subtypes_dict[subtype]

            sample_counts = samples_dict.setdefault(sample, {})
            sample_counts[subtype] = sample_counts.get(subtype, 0) + 1

            if counter % 1000 == 0:
                util_log.debug(
                    args.input + ": " + str(counter) + " sites counted")

    samples = sorted(samples_dict)
    # DataFrame(dict) keeps dict insertion order, so the rows are re-ordered
    # explicitly to line up with the sorted ``samples`` labels.
    M = DataFrame(samples_dict).T.reindex(samples).fillna(0).values
    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
    return out
def set_peak_sequences_using_fasta(self, fasta_file_location="grch38.fasta"):
    """Set the sequence of every peak in ``self.peaks`` from a FASTA file.

    The FASTA file is opened once and handed to each peak's
    ``set_sequence_using_fasta_index``; progress is logged every 10000 peaks.
    """
    logging.info("Setting peak sequences using fasta index")
    genome = Fasta(fasta_file_location)
    for done, peak in enumerate(self.peaks):
        if done % 10000 == 0:
            logging.info("%d/%d peaks processed" % (done, len(self.peaks)))
        peak.set_sequence_using_fasta_index(genome)
def generate_fasta(intersection_bedtool, fasta_filename, revcomp, verbose):
    """Extract the sequences of the given intervals and open them as FASTA.

    Args:
        intersection_bedtool: Object whose ``sequence(fi=..., s=True)`` method
            is called to extract the sequences.
        fasta_filename: Reference FASTA passed to ``sequence`` as ``fi``.
        revcomp: Accepted for interface compatibility; not used here.
        verbose: When true, a progress message is written to stderr.

    Returns:
        A ``Fasta`` object opened on the ``seqfn`` file of the extraction.
    """
    if verbose:
        # Plain write instead of the ``print >>`` chevron: same output on
        # Python 2, and no TypeError on Python 3.
        sys.stderr.write(">> generating fasta of positions ...\n")
    # -s: force strandedness
    fasta_seqs = intersection_bedtool.sequence(fi=fasta_filename, s=True)
    return Fasta(fasta_seqs.seqfn)
def construct_hg38_map(n2nl_aln, hg38_bam):
    """Build a map from hg38 position to MSA column.

    The chain is hg38 position -> position within an aligned sequence ->
    column of the multiple sequence alignment in ``n2nl_aln``; the
    hg38 <-> sequence pairs are taken from the records of ``hg38_bam``.
    """
    # step 1: for every MSA row, map ungapped sequence offsets to MSA columns
    msa = Fasta(n2nl_aln)
    seq_to_col = defaultdict(dict)
    for name, seq in msa.iteritems():
        ungapped = 0
        for col, base in enumerate(str(seq)):
            seq_to_col[name][ungapped] = col
            if base != '-':
                ungapped += 1

    # largest offset per sequence, needed to flip negative-strand genes
    last_offset = {}
    for name, offsets in seq_to_col.iteritems():
        last_offset[name] = max(offsets)

    # step 2: hg38 position -> sequence offset, read from the alignments
    negative_strand = ('NOTCH2', 'NOTCH2NL-A', 'NOTCH2NL-B')
    hg38_to_seq = {}
    for rec in pysam.Samfile(hg38_bam):
        pairs = {}
        for query_pos, ref_pos in rec.aligned_pairs:
            pairs[ref_pos] = query_pos
        if rec.qname in negative_strand:
            # invert positions for negative strand genes
            top = last_offset[rec.qname]
            pairs = {ref_pos: top - query_pos
                     for ref_pos, query_pos in pairs.iteritems()}
        hg38_to_seq[rec.qname] = pairs

    # step 3: per sequence, MSA column -> hg38 position
    col_to_hg38 = defaultdict(dict)
    for name, pos_map in hg38_to_seq.iteritems():
        for hg38_pos, seq_pos in pos_map.iteritems():
            col = seq_to_col[name][seq_pos]
            col_to_hg38[name][col] = hg38_pos

    # step 4: flatten and invert into a single hg38 -> MSA column map;
    # an hg38 position must not be claimed twice
    final_map = {}
    for name in col_to_hg38:
        for col, hg38_pos in col_to_hg38[name].iteritems():
            assert hg38_pos not in final_map
            final_map[hg38_pos] = col
    return final_map
def size(args):
    """Print the length of every sequence in a FASTA file.

    With ``args.header`` a ``seqid<TAB>size`` header line comes first.  Each
    sequence is printed as ``id<TAB>length``, or as ``id<TAB>0<TAB>length``
    when ``args.bed`` is set.  Input named ``stdin``/``-`` or ending in
    ``.gz``/``.bz2`` is parsed as a stream; files with an extension listed in
    ``FastaExt`` are read through pyfaidx; anything else is logged as an
    unsupported format.
    """
    def report(sid, length):
        if args.bed:
            print("%s\t%d\t%d" % (sid, 0, length))
        else:
            print("%s\t%d" % (sid, length))

    if args.header:
        print("seqid\tsize")

    fname, fext = op.splitext(args.fi)
    streamed = args.fi in ['stdin', '-'] or fext in ['.gz', '.bz2']
    if streamed:
        fh = must_open(args.fi)
        for rcd in SeqIO.parse(fh, "fasta"):
            report(rcd.id, len(rcd))
        return

    if fext in [".%s" % x for x in FastaExt]:
        from pyfaidx import Fasta
        fas = Fasta(args.fi)
        for sid in fas.keys():
            report(sid, len(fas[sid]))
        return

    logging.error("%s is not a supported format" % fext)
def split_fasta(number_files, fasta_file):
    """Split a FASTA file into ``number_files`` consecutive pieces.

    Piece ``i`` (1-based) is written to the input path with everything from
    the first ``.fa`` onwards replaced by ``.<i>.fasta``.  Every piece but
    the last holds ``number_seqs // number_files`` sequences; the last piece
    takes whatever remains, so no sequence is dropped.

    Args:
        number_files: Number of output files; must be at least 1.
        fasta_file: Path of the FASTA file to split.

    Raises:
        ValueError: If ``number_files`` is smaller than 1.
    """
    if number_files < 1:
        raise ValueError("number_files must be at least 1")
    try:
        fasta = Fasta(fasta_file)
    except Exception:
        print("could not open fasta")
        exit()

    names = list(fasta.keys())
    number_seqs = len(names)
    # Explicit boundaries: exactly number_files + 1 of them, ending at
    # number_seqs, whatever the remainder of the division is.
    chunk = number_seqs // number_files
    bounds = [i * chunk for i in range(number_files)] + [number_seqs]
    print(bounds)

    for i in range(number_files):
        # NOTE(review): if fasta_file contains no ".fa" the substitution
        # leaves the name unchanged and the input would be overwritten.
        label = re.sub(r"\.fa.*", "." + str(i + 1) + ".fasta", fasta_file)
        with open(label, "w") as out:
            for name in names[bounds[i]:bounds[i + 1]]:
                out.write(">" + name + "\n" + str(fasta[name]) + "\n")
def write_sequence(args):
    """Write the requested regions of ``args.fasta`` to the chosen output.

    Regions come from ``split_regions(args)``; when none are given, every
    sequence whose name matches ``args.regex`` is used.  With
    ``args.invert_match`` the selection is inverted: the file is reopened
    without the name filter and every sequence *not* selected is written.
    ``args.size_range`` drops regions whose length lies outside the range.
    Output goes to one file per region (``args.split_files``), to
    ``args.out``, or to stdout.  With ``args.transform`` the result of
    ``transform_sequence`` is written instead of the sequence lines.

    Raises:
        FetchError: Re-raised from the fetch with a hint to try ``--lazy``.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta, default_seq=args.default_seq,
                  strict_bounds=not args.lazy, split_char=args.delimiter,
                  filt_function=filt_function, rebuild=not args.no_rebuild)
    # try/finally so the FASTA handle is released on every exit path.
    try:
        regions_to_fetch, split_function = split_regions(args)
        if not regions_to_fetch:
            regions_to_fetch = fasta.keys()
        if args.invert_match:
            sequences_to_exclude = set(
                split_function(region)[0] for region in regions_to_fetch)
            # Release the filtered handle before replacing it; the names to
            # exclude are already materialised in the set above.
            fasta.close()
            fasta = Fasta(args.fasta, default_seq=args.default_seq,
                          strict_bounds=not args.lazy,
                          split_char=args.delimiter,
                          rebuild=not args.no_rebuild)
            regions_to_fetch = (key for key in fasta.keys()
                                if key not in sequences_to_exclude)
            split_function = ucsc_split

        header = False
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.size_range:
                if start is not None and end is not None:
                    sequence_len = end - start
                else:
                    sequence_len = len(fasta[name])
                if (args.size_range[0] > sequence_len
                        or args.size_range[1] < sequence_len):
                    continue

            if args.split_files:
                # open output file based on sequence name
                filename = '.'.join(
                    str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(
                    c for c in filename
                    if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            elif args.out:
                outfile = args.out
            else:
                outfile = sys.stdout

            try:
                if args.transform:
                    # the column header is written once, before the first row
                    if not header and args.transform == 'nucleotide':
                        outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                        header = True
                    outfile.write(
                        transform_sequence(args, fasta, name, start, end))
                else:
                    for line in fetch_sequence(args, fasta, name, start, end):
                        outfile.write(line)
            except FetchError as e:
                raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
            finally:
                # Only files opened here are closed; args.out and stdout
                # belong to the caller.
                if args.split_files:
                    outfile.close()
    finally:
        fasta.close()
def count_crickMAX(args):
    """Return how many FASTA header lines (starting with '>') args.crick has"""
    with open(args.crick, 'r') as crick_in:
        return sum(1 for line in crick_in if line.startswith('>'))


if __name__ == '__main__':
    args = parse_options()
    CRICK_MAX = count_crickMAX(args)
    print("now starting Fasta import")
    seq_in = Fasta(args.seqin)
    print("done with Fasta import")
    clusters = open(args.clusters)
    outsam = args.samout
    # path = '/Volumes/data/epiGBS/Baseclear/Athal/'
    # path = '/Volumes/data/epiGBS/DNAVISION/Project_DNA11032___140919_SN170_0407_AC52R6ACXX/Sample_DNA11032-001-L1/output/seqykJJfz/scabiosa/'
    # path = '/tmp/'
    # path = '/Volumes/data/epiGBS/FINAL/Scabiosa/BASECLEAR/'
    # seq_in = Fasta(path+'Scabiosa_combined.fa')
    #fasta_in = SeqIO.parse(open('/tmp/test.fa', 'r'), 'fasta')
    # map the part of each sequence name before the first ';' to the full name
    seq_in_keymap = {}
    for key in seq_in.keys():
        short_name = key.split(';')[0]
        seq_in_keymap[short_name] = key
        faidx_rec = seq_in[key]
def rename(args):
    """Rename the sequences of a FASTA file and write two mapping tables.

    Sequences are classified as chromosomes when their id matches ``ptn1``
    or their long name matches ``ptn2``; all others are treated as
    scaffolds/contigs and numbered in file order.  Chromosomes are written
    to ``args.fo`` as ``prefix_chr`` plus the zero-padded chromosome number.
    Contigs are either written one by one as ``prefix_ctg`` plus their
    zero-padded counter, or (with ``args.merge_short``) concatenated into a
    single record separated by ``args.gap`` ``N`` characters.  ``args.fmf``
    receives tab-separated rows with the original id first and the new id
    second; ``args.fmb`` receives the same rows with the two sides swapped.

    The process exits with status 1 when no chromosome is detected or when
    the extension of ``args.fi`` is not listed in ``FastaExt``.
    """
    import re
    from pyfaidx import Fasta
    fi, fo, fmf, fmb = args.fi, args.fo, args.fmf, args.fmb
    merge_short, gap = args.merge_short, args.gap
    prefix_chr, prefix_ctg = args.prefix_chr, args.prefix_ctg

    db = Fasta(fi)

    # ptn1 is matched against the sequence id, ptn2 against the long name
    ptn1 = "^(chr)?([0-9]{1,2})"
    ptn2 = "chromosome *([0-9]{1,2})"

    # sdic: chromosome id -> [chromosome number, size]
    # cdic: contig id -> [running counter, size]
    sdic, cdic = dict(), dict()
    ccnt = 1
    for sid in db.keys():
        size = len(db[sid])
        res1 = re.search(ptn1, sid, re.IGNORECASE)
        if res1:
            sdic[sid] = [int(res1.group(2)), size]
        else:
            sid_long = db[sid].long_name
            res2 = re.search(ptn2, sid_long, re.IGNORECASE)
            if res2:
                sdic[sid] = [int(res2.group(1)), size]
            else:
                cdic[sid] = [ccnt, size]
                ccnt += 1

    if len(sdic.keys()) == 0:
        print("Error: no chromosomes detected")
        sys.exit(1)

    # both lists are sorted by number (chromosome number / contig counter)
    slst = sorted(sdic.items(), key = lambda t: t[1][0])
    clst = sorted(cdic.items(), key = lambda t: t[1][0])

    # the largest number of each list determines the zero-padding width
    # (ndigit presumably returns the number of decimal digits -- TODO confirm)
    nchrom = slst[-1][1][0]
    sdigits = ndigit(slst[-1][1][0])
    cdigits = ndigit(clst[-1][1][0]) if len(clst) > 0 else 1
    sfmt = "%s%%0%dd" % (prefix_chr, sdigits)
    cfmt = "%s%%0%dd" % (prefix_ctg, cdigits)
    logging.debug("%d chromosomes, %d scaffolds/contigs" % (len(sdic), len(cdic)))

    # NOTE: the extension is only checked here, after the file was indexed
    fname, fext = op.splitext(fi)
    if fext not in [".%s" % x for x in FastaExt]:
        logging.error("%s is not a supported format" % fext)
        sys.exit(1)

    fho = open(fo, "w")
    fhf = open(fmf, "w")
    fhb = open(fmb, "w")
    # chromosomes: one output record and one row per mapping table each
    for sid, sval in slst:
        scnt, size = sval
        nsid = sfmt % scnt
        fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (sid, 0, size, nsid, 0, size, scnt))
        fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (nsid, 0, size, sid, 0, size, scnt))
        nrcd = SeqRecord(Seq(str(db[sid])), id = nsid, description = '')
        SeqIO.write(nrcd, fho, "fasta")

    # the last column of the contig rows continues after the largest
    # chromosome number
    i = nchrom + 1
    if len(clst) > 0 and merge_short:
        # id of the merged record: "<prefix_chr>x" for single-digit
        # chromosome numbers, "<prefix_chr>99" for two-digit ones
        zid = "%sx" % prefix_chr
        if sdigits == 2:
            zid = "%s99" % prefix_chr
        else:
            assert sdigits == 1, "wrong number of chroms: %d" % sdigits
        pos = 0
        seq = ''
        for cid, sval in clst:
            ccnt, size = sval
            start, end = pos, pos + size
            # every contig after the first is preceded by a gap of N's,
            # which shifts its coordinates in the merged record
            if pos > 0:
                start += gap
                end += gap
                seq += "N" * gap
            seq += str(db[cid])
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, zid, start, end, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (zid, start, end, cid, 0, size, i))
            pos = end
            i += 1
        nrcd = SeqRecord(Seq(seq), id = zid, description = '')
        SeqIO.write(nrcd, fho, "fasta")
    else:
        # contigs are kept as separate records under their new ids
        for cid, sval in clst:
            ccnt, size = sval
            ncid = cfmt % ccnt
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, ncid, 0, size, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (ncid, 0, size, cid, 0, size, i))
            nrcd = SeqRecord(Seq(str(db[cid])), id = ncid, description = '')
            SeqIO.write(nrcd, fho, "fasta")
            i += 1
    fhf.close()
    fhb.close()
    fho.close()
def test_renamed(self):
    """
    Check NcbiFastaSeqRenamer.renamed for every pair of sequence name
    formats, for incorrect format names, for an incorrect accession
    number dictionary and for the removal of sequence versions.
    """
    formats = (
        "refseq_full",
        "genbank_full",
        "refseq_gi",
        "genbank_gi",
        "refseq",
        "genbank",
        "chr_refseq",
        "chr_genbank",
        "chr",
    )
    # every format except the last one ("chr") is used as a source format
    for i, j in itertools.product(formats[:-1], formats):
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for k in self.__acc_num_files:
            renamer.read_ncbi_acc_num(k, i, j)
        # convert sequence IDs
        input_file = os.path.join(self.__test_dir, "ncbi_" + i + ".fa")
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(input_file):
                output_fasta.write(line)
        example_file = os.path.join(self.__test_dir, "ncbi_" + j + ".fa")
        # drop existing FASTA indices before the files are opened
        for k in (self.__output + ".fai", example_file + ".fai"):
            if os.path.isfile(k):
                os.unlink(k)
        output_fasta = Fasta(self.__output)
        example_fasta = Fasta(example_file)
        # compare the obtained file to the example
        # NOTE(review): ``keys`` is compared without being called, so what
        # is compared depends on what the ``keys`` attribute of Fasta is
        # -- confirm this checks the sequence names as intended.
        self.assertEqual(output_fasta.keys, example_fasta.keys)

    # test for an incorrect format
    with self.assertRaises(SeqRenameError):
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        renamer.read_ncbi_acc_num("unknown", "chr_refseq", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))

    with self.assertRaises(SeqRenameError):
        renamer.read_ncbi_acc_num("chr_refseq", "unknown", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))

    # test for an incorrect NCBI accession number dictionary
    with self.assertRaises(IncorrectDictError):
        renamer.read_ncbi_acc_num(self.__chr_incorrect, "refseq_full", "chr_refseq")

    # check if sequence versions are removed
    renamer = bioformats.seqname.NcbiFastaSeqRenamer()
    for k in self.__acc_num_files:
        renamer.read_ncbi_acc_num(k, "chr", "genbank", remove_seq_version=True)
    input_file = os.path.join(self.__test_dir, "ncbi_chr.fa")
    example_file = os.path.join(self.__test_dir, "ncbi_genbank_nover.fa")
    with open(self.__output, "w") as output_fasta:
        for line in renamer.renamed(input_file):
            output_fasta.write(line)
    # drop existing FASTA indices before the files are opened
    for k in (self.__output + ".fai", example_file + ".fai"):
        if os.path.isfile(k):
            os.unlink(k)
    output_fasta = Fasta(self.__output)
    example_fasta = Fasta(example_file)
    self.assertEqual(output_fasta.keys, example_fasta.keys)
    os.unlink(example_file + ".fai")

    # remove temporary files and FASTA indices
    os.unlink(self.__output)
    os.unlink(self.__output + ".fai")
    for i in formats:
        os.unlink(os.path.join(self.__test_dir, "ncbi_" + i + ".fa.fai"))
z = defaultdict(list) # amount of sequence passed out of Cactus for species in inputList: print species x[species] = 0 y[species] = 0 z[species] = [] for file in os.listdir('.'): if file.endswith('.fai') and protId[species] in file: with open(file,'r') as f: lines = f.readlines() for line in lines: if line: x[species] += int(line.split('\t')[1])#abs(int(line.split('\t')[3]) - int(line.split('\t')[2]))#max(map(int,line.split('\t')[2:4]))-min(map(int,line.split('\t')[2:4])) for folder in [folder2 for folder2 in fastaFolders if species+'.fa' in os.listdir(folder2)]: try: fa = Fasta(folder+species+'.fa') #bedText = '\n'.join('\t'.join(['_'.join(line.split('_')[:-2])] + line.split('_')[-2:]) for line in fa.keys()) y[species] += sum([len(fa[key][:].seq) for key in fa.keys()])#findlen(BedTool(bedText, from_string=True)) except: print 'Error for ' + folder+species+'.fa' print y[species] """ with open('finalSyntenyMultipleSpecies.bed','r') as f: print 'Bed Open...' lineOut = [] for line in f.readlines(): lineOut.append('-'.join(line.split('\t')[0:4])+'|'+line[line.rfind('\t')+1:]) for line in lineOut: for seq in line.split('|'): y[specId[seq.split('-')[0]]] += abs(int(seq.split('-')[3]) - int(seq.split('-')[2]))
from pyfaidx import Fasta

# Open the FASTA file at the path 'name' through pyfaidx.
maysFasta = Fasta('name')
# Ask for the sequence names; the result is not stored, so this only has a
# visible effect in an interactive session (presumably a notebook cell).
maysFasta.keys()