def add_gc_content(df, df_sub, args):
    """Fill the ``gc_content`` column of ``df_sub`` and return ``df_sub``.

    The behaviour depends on ``args.feature``:

    * ``'gene'``: ``args.fasta`` is parsed with ``SimpleFastaParser`` into a
      title -> sequence mapping.  For every gene id in ``df.gene_id`` the
      sequences of its ``'exon'`` rows are looked up under the key
      ``'<seqname>:<start - 1>-<end>'`` and concatenated; the GC content of
      that concatenation is stored for each gene id of ``df_sub.index``.
    * ``'transcript'``: ``args.fasta`` is parsed with ``FastaIterator`` and
      the GC content of every record whose id occurs in
      ``df['transcript_id']`` is stored under that id.

    Args:
        df: Table providing ``gene_id``, ``feature``, ``seqname``, ``start``
            and ``end`` columns (gene mode) or ``transcript_id``
            (transcript mode).
        df_sub: Table indexed by gene/transcript id; modified in place.
        args: Object exposing ``feature`` and ``fasta`` attributes.

    Returns:
        ``df_sub`` with ``gc_content`` filled in where a sequence was found.

    Raises:
        ValueError: If ``args.feature`` is neither ``'gene'`` nor
            ``'transcript'``.
    """
    if args.feature == 'gene':
        fasta = dict(SimpleFastaParser(args.fasta))
        # ``groups`` maps (gene_id, feature) -> index *labels* of the rows,
        # so the rows have to be selected with ``.loc`` rather than ``.iloc``.
        groups = df.groupby(['gene_id', 'feature']).groups
        exons = {}
        for gene in set(df.gene_id):
            labels = groups.get((gene, 'exon'))
            if labels is None:
                # Gene without exon rows: reported as missing further down.
                continue
            parts = []
            for _, row in df.loc[labels, :].iterrows():
                exon_key = '{}:{}-{}'.format(
                    row['seqname'], row['start'] - 1, row['end'])
                seq = fasta.get(exon_key)
                if seq is None:
                    # Previously this crashed with ``str += None``.
                    print("missing exon sequence in fasta:")
                    print(exon_key)
                    continue
                parts.append(seq)
            if parts:
                exons[gene] = ''.join(parts)

        for gene_id in df_sub.index:
            if gene_id in exons:
                df_sub.at[gene_id, 'gc_content'] = GC(exons[gene_id])
            else:
                print("missing gene_id in exons dict:")
                print(gene_id)
    elif args.feature == 'transcript':
        tx_ids = set(df['transcript_id'].values)
        for rec in FastaIterator(args.fasta):
            if rec.id in tx_ids:
                df_sub.loc[rec.id, 'gc_content'] = GC(rec.seq)
            else:
                # Record present in the fasta but not in the annotation.
                print(rec.id)
    else:
        raise ValueError('check feature type!')
    return df_sub
def parse(fasta_file):
    """Load ``fasta_file`` into a ``RefProtFastaFile`` container.

    A ``RefProtFastaFile`` is created for ``fasta_file``; the file named by
    its ``filename`` attribute is then read record by record, each record is
    converted with ``RefProtFastaEntry.parse_fasta_record`` (together with
    the container's ``taxon_id``) and added to the container.

    Returns:
        The populated ``RefProtFastaFile`` instance.
    """
    container = RefProtFastaFile(fasta_file)
    with open(container.filename) as handle:
        # Lazy generator: each record is parsed right before it is added.
        entries = (
            RefProtFastaEntry.parse_fasta_record(rec, container.taxon_id)
            for rec in FastaIterator(handle)
        )
        for entry in entries:
            container.add_entry(entry)
    return container
def align(fh, transl=True):
    """Align the records of a pangenome cluster fasta file with MUSCLE.

    Args:
        fh: Handle of the fasta file to read the cluster sequences from.
        transl: When true (the default) every record is translated with the
            "Bacterial" table before alignment; when false the sequences
            are aligned as they are.

    Returns:
        The text MUSCLE wrote to stdout (strict Clustal format, as requested
        by ``clwstrict=True``).
    """
    def _residues(record):
        # Honour the ``transl`` flag, which used to be silently ignored.
        if transl:
            return str(record.translate(table="Bacterial").seq)
        return str(record.seq)

    # Build the complete input first so that no child process is left
    # waiting on stdin if reading or translating the records fails.
    fasta_text = "".join(
        ">" + record.id + "\n" + _residues(record) + "\n"
        for record in FastaIterator(fh)
    )

    align_exe = MuscleCommandline(
        r'C:\Users\matthewwhiteside\workspace\b_ecoli\muscle\muscle3.8.31_i86win32.exe',
        clwstrict=True)

    # Align on stdin/stdout
    proc = subprocess.Popen(str(align_exe),
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True,
                            shell=False)
    # stderr is collected only to drain the pipe; it is not returned.
    alignment, _err = proc.communicate(input=fasta_text)
    return alignment
def fasta_reader(filename):
    """Read a FASTA file into a list of ``[id, sequence]`` pairs.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A list with one ``[str(record.id), str(record.seq)]`` entry per
        record, in file order.
    """
    from Bio.SeqIO.FastaIO import FastaIterator

    with open(filename) as handle:
        return [[str(record.id), str(record.seq)]
                for record in FastaIterator(handle)]
def writeClassifiedFastas(classType, Dirr, resultsDir, df):
    """Split fasta records into one output fasta file per class.

    The input files come from ``Get_Dirr_All_Fasta(classType, Dirr)``, which
    is used as a mapping of fasta file name -> class name.  A record whose id
    is in ``df.index`` takes its class from ``df[record.id]``; any other
    record takes the class of the file it was read from.  Class names are
    reduced to letters, digits and spaces (trailing spaces stripped) and the
    record is written to ``<resultsDir>/<class>.fasta``.

    Args:
        classType: Passed through to ``Get_Dirr_All_Fasta``.
        Dirr: Passed through to ``Get_Dirr_All_Fasta``.
        resultsDir: Directory the per-class fasta files are written to.
        df: Label-indexed object mapping record id -> class name.
    """
    import os

    fasta_files_dict = Get_Dirr_All_Fasta(classType, Dirr)
    classDict = {}   # sanitised class name -> open output handle
    writerDict = {}  # sanitised class name -> FastaWriter on that handle
    try:
        for filename, default_class in fasta_files_dict.items():
            with open(filename) as fasta:
                for record in FastaIterator(fasta):
                    # Start from the file's class for every record so that
                    # the class of the previous record never leaks over.
                    if record.id in df.index:
                        classname = df[record.id]
                    else:
                        classname = default_class
                    # Sanitise BEFORE the lookup: looking up the raw name
                    # re-opened (and truncated) the output file whenever
                    # sanitising changed the name.
                    classname = "".join(
                        c for c in classname
                        if c.isalpha() or c.isdigit() or c == ' ').rstrip()
                    if classname not in writerDict:
                        path = os.path.join(resultsDir, classname + '.fasta')
                        classDict[classname] = open(path, "w")
                        writer = FastaWriter(classDict[classname])
                        writer.write_header()
                        writerDict[classname] = writer
                    writerDict[classname].write_record(record)
        for writer in writerDict.values():
            writer.write_footer()
    finally:
        # Close every output file even if reading or writing failed.
        for handle in classDict.values():
            handle.close()
def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    """Write the records of ``fa_file`` that are at least ``len_cutoff`` long.

    The output path is ``<outdir>/<prefix>.ge<len_cutoff><suffix>``.  If that
    file already exists and is non-empty it is treated as a cached result
    and returned without reading ``fa_file``.

    Args:
        fa_file: Input fasta; opened through gzip when it ends in ``.gz``.
        len_cutoff: Minimum record length (``len(rec)``) to keep.
        outdir: Output directory; created if it does not exist.
        prefix: First part of the output file name.
        suffix: Last part of the output file name.

    Returns:
        The path of the filtered fasta file.
    """
    # exist_ok avoids the race where another worker creates the directory
    # between an existence check and the mkdir call.
    os.makedirs(outdir, exist_ok=True)

    cut_fa_file = os.path.join(outdir,
                               prefix + ".ge" + str(len_cutoff) + suffix)
    if os.path.exists(cut_fa_file) and os.path.getsize(cut_fa_file) > 0:
        return cut_fa_file

    opener = gzip.open if fa_file.endswith(".gz") else open
    # Both handles are context-managed so the input is closed even when
    # parsing or writing raises.
    with opener(fa_file, 'rt') as in_h, open(cut_fa_file, 'w') as out_h:
        writer = FastaWriter(out_h)
        writer.write_header()
        for rec in FastaIterator(in_h):
            if len(rec) >= len_cutoff:
                writer.write_record(rec)
        writer.write_footer()

    return cut_fa_file
def filter_influenza_fa(in_fasta, out_fasta, pattern, accession_set):
    '''
    Append the records of in_fasta whose accession is in accession_set to
    out_fasta and return how many were written.

    in_fasta .. path of the fasta file to filter
    out_fasta .. path of the output fasta (opened in append mode)
    pattern .. regex with at least three groups, searched in each record
        description; group 1 is used as the accession
    accession_set .. a set of accession IDs that we query the fasta header
        against

    count = filter_influenza_fa(path_fa, path_out, pattern, include_accession)
    '''
    cache_previous = ()
    count = 0

    with open(in_fasta) as handle, open(out_fasta, 'a+') as out:
        for record in FastaIterator(handle):
            # Headers containing a parenthesis are not considered.
            if '(' in record.description:
                continue
            match = re.search(pattern, record.description)
            if match is None:
                # Header does not follow the pattern: there is no accession
                # to test, so skip it instead of failing on ``None.group``.
                continue
            cache_current = match.group(1, 2, 3)
            acc = cache_current[0]
            # Skip a record whose accession appears among the groups
            # captured for the previously accepted header.
            if acc in cache_previous:
                continue
            cache_previous = cache_current

            if acc in accession_set:
                count += 1
                out.write('>' + acc + '\n')
                out.write(str(record.seq) + '\n')
    return count
def generate_fake_genome(sample: str,
                         reference: Path,
                         vcf_path: Path,
                         ploidy_dict: Dict[str, int]
                         ) -> Generator[SeqRecord, None, None]:
    """
    Yield mutated copies of every reference contig for one VCF sample.

    For each record of the reference fasta, one record per allele is
    produced (ploidy taken from ploidy_dict, 2 when the contig is not
    listed). Each yielded record is named "<contig id>_<allele number>".

    :param sample: The name in the sample of the VCF to use
    :param reference: The reference fasta file to use
    :param vcf_path: The path to the VCF
    :param ploidy_dict: A dictionary containing the ploidies for each contig.
    :return: A Generator that creates the chromosomes one by one.
    """
    mutations_dict = vcf_to_mutations(str(vcf_path), sample)
    with reference.open("rt") as reference_h:
        for seqrecord in FastaIterator(reference_h):
            contig = seqrecord.id
            for allele_no in range(ploidy_dict.get(contig, 2)):
                # Contigs or alleles without listed mutations get an empty
                # mutation list.
                per_contig = mutations_dict.get(contig, {})
                allele_mutations = per_contig.get(allele_no, [])
                mutated = sequence_with_mutations(
                    sequence=str(seqrecord.seq),
                    mutations=allele_mutations)
                allele_id = contig + "_" + str(allele_no)
                yield SeqRecord(Seq(mutated, seqrecord.seq.alphabet),
                                id=allele_id,
                                name=allele_id,
                                description=allele_id)
def readFasta(fastaFile):
    """
    Reads a FASTA file and parses contigs for GC content.

    Args:
        fastaFile: The path to the FASTA file. Paths ending in ".gz" are
            read through gzip.
    Returns:
        contigs: A dictionary mapping contigIDs to sidr.common.Contig
            objects with GC content as a variable.
    Raises:
        ValueError: If two records share the same contigID.
    """
    contigs = []
    if fastaFile.endswith(".gz"):
        # Text mode is required: gzip.open defaults to binary, which the
        # FASTA parser cannot read as text.
        data = gzip.open(fastaFile, "rt")
    else:
        data = open(fastaFile)
    with data:
        click.echo("Reading %s" % fastaFile)
        with click.progressbar(FastaIterator(data)) as fi:
            for record in fi:  # TODO: conditional formatting
                contigs.append(
                    common.Contig(record.id.split(' ')[0],
                                  variables={"GC": GC(record.seq)}))
    by_id = {contig.contigid: contig for contig in contigs}
    # Fewer keys than contigs means at least one contigID occurred twice.
    if len(by_id) != len(contigs):
        raise ValueError("Input FASTA contains duplicate contigIDs, exiting")
    return by_id
def fastarename(input, relabel, output):
    """Copy a FASTA file, renaming the records ``<relabel>1``, ``<relabel>2``...

    Args:
        input: Path of the FASTA file to read.
        relabel: Prefix of the new record names.
        output: Path of the FASTA file to write; each record is written as a
            header line followed by the sequence on a single line.
    """
    from Bio.SeqIO.FastaIO import FastaIterator

    # The input is context-managed too, so its handle is no longer leaked.
    with open(input) as infile, open(output, 'w') as outfile:
        for counter, record in enumerate(FastaIterator(infile), 1):
            newName = relabel + str(counter)
            outfile.write(">%s\n%s\n" % (newName, record.seq))
def _fasta_reader(filename: str) -> Iterator:
    """
    Lazily yield every record of a FASTA file (single or multi-record).

    The file stays open while the generator is being consumed.
    """
    with open(filename) as fasta_handle:
        yield from FastaIterator(fasta_handle)
def load_files():
    '''Load the records of every file in FILES into one unshuffled list.

    Each entry is a ``(record, file_index)`` tuple, where ``file_index`` is
    the position of the source file within FILES. Files are read from the
    "data/" directory.
    '''
    data = []
    for label, filename in enumerate(FILES):
        with open("data/" + filename) as handle:
            for record in FastaIterator(handle):
                data.append((record, label))
    return data
def _fasta_reader(filename: str) -> SeqRecord:
    """
    Generator over the records of a FASTA file.

    Records are produced one at a time; the file is closed once the
    generator is exhausted or discarded.
    """
    with open(filename) as fasta_handle:
        yield from FastaIterator(fasta_handle)
def main():
    """Command line entry point.

    Parses the arguments, converts the ``source`` and ``target`` options
    with ``Position.from_string``, hands the records of the ``fasta`` file
    to ``mutate`` and prints the result in fasta format without an extra
    trailing newline.
    """
    cli_args = argument_parser().parse_args()
    source_pos = Position.from_string(cli_args.source)
    target_pos = Position.from_string(cli_args.target)
    with open(cli_args.fasta, "rt") as fasta_h:
        mutated = mutate(FastaIterator(fasta_h), source_pos, target_pos)
        print(mutated.format("fasta"), end='')
def read_single_with_titles(filename, alphabet):
    """Parse a FASTA file that must contain exactly one record.

    The file is parsed with ``FastaIterator`` using ``alphabet`` and the
    module-level ``title_to_ids`` title function.

    Returns:
        The single record of the file.

    Raises:
        StopIteration: If the file contains no record.
        AssertionError: If the file contains more than one record.
    """
    global title_to_ids
    # ``next(iterator)`` works on Python 2.6+ and 3, unlike ``.next()``, and
    # the ``with`` block closes the handle that used to be leaked.
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            second = None
    assert record is not None and second is None
    return record
def get_base(fasta: str, chromosome: str, start: int, end: Optional[int]):
    """Return the sequence ``[start:end]`` of one record of a fasta file.

    :param fasta: Path of the fasta file.
    :param chromosome: Id of the record to slice.
    :param start: Start of the slice.
    :param end: End of the slice; ``start + 1`` is used when it is None.
    :return: The ``seq`` of the sliced record.
    :raises ValueError: If no record has the id ``chromosome``.
    """
    stop = start + 1 if end is None else end
    with open(fasta, "rt") as fasta_handle:
        for record in FastaIterator(fasta_handle):
            if record.id != chromosome:
                continue
            return record[start:stop].seq
    # Reaching this point means no record matched the requested id.
    raise ValueError(f"{chromosome} not found in {fasta}")
def multi_check(self, filename):
    """Check multi-record parsing with and without a title function.

    The file is parsed once with ``FastaIterator`` using ``title_to_ids``
    and once with ``SeqIO.parse``; both must give the same number of
    records, and each re-titled record must carry the id, name and
    description that ``title_to_ids`` derives from the default description,
    plus an equal sequence.
    """
    msg = f"Test failure parsing file {filename}"
    retitled_records = list(FastaIterator(filename, title2ids=title_to_ids))
    default_records = list(SeqIO.parse(filename, "fasta"))
    self.assertEqual(len(retitled_records), len(default_records), msg=msg)
    for plain, retitled in zip(default_records, retitled_records):
        want_id, want_name, want_descr = title_to_ids(plain.description)
        self.assertEqual(retitled.id, want_id, msg=msg)
        self.assertEqual(retitled.name, want_name, msg=msg)
        self.assertEqual(retitled.description, want_descr, msg=msg)
        self.assertEqual(retitled.seq, plain.seq, msg=msg)
def read_single_with_titles(filename, alphabet):
    """Parse a FASTA file that must contain exactly one record.

    The file is parsed with ``FastaIterator`` using ``alphabet`` and the
    module-level ``title_to_ids`` title function.

    Returns:
        The single record of the file.

    Raises:
        StopIteration: If the file contains no record.
        AssertionError: If the file contains more than one record.
    """
    global title_to_ids
    # ``with`` closes the handle even when parsing raises (for example on
    # an empty file), which the explicit ``handle.close()`` did not.
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            second = None
    assert record is not None and second is None
    return record
def read_single_with_titles(filename, alphabet):
    """Return the only record of a FASTA file, asserting there is just one.

    The file is parsed with ``FastaIterator`` using ``alphabet`` and the
    module-level ``title_to_ids`` title function.
    """
    global title_to_ids
    with open(filename) as fasta_handle:
        records = FastaIterator(fasta_handle, alphabet, title_to_ids)
        first = next(records)
        # The default makes an exhausted iterator report None.
        extra = next(records, None)
    assert first is not None and extra is None
    return first
def fasta_reader(filename):
    """
    Iterate over the records of a single- or multi-record fasta file.

    Inputs:
        filename - string with the name of, or path to, the file. Names
                   ending in '.gz' are read through gzip in text mode.

    Outputs:
        A generator of (id, sequence) tuples, both converted to str.
    """
    if filename.endswith('.gz'):
        handle = gzip.open(filename, 'rt')
    else:
        handle = open(filename)
    with handle:
        for record in FastaIterator(handle):
            yield str(record.id), str(record.seq)
def multi_check(self, filename, alphabet):
    """Basic test for parsing multi-record FASTA files.

    The file is parsed once with ``FastaIterator`` using ``title_to_ids``
    and once with ``SeqIO.parse``; both must give the same number of
    records, and each re-titled record must match the id, name and
    description that ``title_to_ids`` derives from the default description,
    as well as the default sequence and alphabet.
    """
    # Both handles are closed explicitly; they used to be left to the
    # garbage collector.
    with open(filename) as handle:
        re_titled = list(FastaIterator(handle, alphabet, title_to_ids))
    with open(filename) as handle:
        default = list(SeqIO.parse(handle, "fasta", alphabet))
    self.assertEqual(len(re_titled), len(default))
    for old, new in zip(default, re_titled):
        idn, name, descr = title_to_ids(old.description)
        self.assertEqual(new.id, idn)
        self.assertEqual(new.name, name)
        self.assertEqual(new.description, descr)
        self.assertEqual(str(new.seq), str(old.seq))
        self.assertEqual(new.seq.alphabet, old.seq.alphabet)
def read_fasta(inputfile):
    """Load the sequences and headers of a FASTA formatted file.

    :param inputfile: .fasta file with sequences and headers to read
    :return: two lists in file order: the sequences (as strings) and the
        names (the full record descriptions).
    """
    with open(inputfile) as handle:
        # One pass over the file, keeping description and sequence together.
        entries = [(record.description, str(record.seq))
                   for record in FastaIterator(handle)]
    names = [name for name, _ in entries]
    sequences = [sequence for _, sequence in entries]
    return sequences, names
def multi_check(self, filename, alphabet):
    """Check multi-record parsing with and without a title function.

    The file is parsed once with ``FastaIterator`` using ``title_to_ids``
    and once with ``SeqIO.parse``; both must give the same number of
    records, and each re-titled record must match the id, name and
    description that ``title_to_ids`` derives from the default description,
    as well as the default sequence and alphabet.
    """
    msg = "Test failure parsing file %s" % filename
    retitled_records = list(FastaIterator(filename, alphabet, title_to_ids))
    default_records = list(SeqIO.parse(filename, "fasta", alphabet))
    self.assertEqual(len(retitled_records), len(default_records), msg=msg)
    for plain, retitled in zip(default_records, retitled_records):
        want_id, want_name, want_descr = title_to_ids(plain.description)
        self.assertEqual(retitled.id, want_id, msg=msg)
        self.assertEqual(retitled.name, want_name, msg=msg)
        self.assertEqual(retitled.description, want_descr, msg=msg)
        self.assertEqual(str(retitled.seq), str(plain.seq), msg=msg)
        self.assertEqual(retitled.seq.alphabet, plain.seq.alphabet, msg=msg)
def parse_file(file_path):
    """Count the letters of every record of a fasta file.

    Returns a dict mapping record id -> dict of counters: one per upper- and
    lower-case nucleotide/IUPAC code, plus ``'all'`` (every letter),
    ``'all_small'`` (letters for which ``islower()`` is true) and
    ``'all_big'`` (everything else). A letter without a counter raises
    ``KeyError``.
    """
    upper_codes = ('A', 'C', 'G', 'T', 'Y', 'M', 'S', 'R',
                   'W', 'K', 'N', 'D', 'B', 'H', 'V')
    lower_codes = tuple(code.lower() for code in upper_codes)
    # Same key order as the counters have always been created in.
    counter_keys = (upper_codes + ('all',) + lower_codes
                    + ('all_small', 'all_big'))

    records_letters = {}
    with open(file_path) as in_handle:
        for record in FastaIterator(in_handle):
            counts = dict.fromkeys(counter_keys, 0)
            records_letters[record.id] = counts
            for letter in record.seq:
                case_key = 'all_small' if letter.islower() else 'all_big'
                counts[case_key] += 1
                counts[letter] += 1
                counts['all'] += 1
    return records_letters
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    """Rewrite a fasta file with headers produced by ``header_function``.

    Args:
        fa_in: Path of the input fasta.
        fa_out: Path of the output fasta.
        header_function: Passed to ``FastaIterator`` as ``title2ids``.
        in_gz: When true the input is opened through gzip in text mode.
        gz: When true the output is written through ``bgzf.BgzfWriter``,
            otherwise as a plain text file.
    """
    in_h = gzip.open(fa_in, 'rt') if in_gz else open(fa_in, 'r')
    try:
        out_h = bgzf.BgzfWriter(fa_out, 'wb') if gz else open(fa_out, 'w')
        try:
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h, title2ids=header_function):
                writer.write_record(rec)
            writer.write_footer()
        finally:
            # Close (and flush) the output even if parsing or writing fails.
            out_h.close()
    finally:
        in_h.close()
def create_rs(self, file):
    """Write a randomised counterpart of every record of a fasta file.

    The output file is ``<name>_random<ext>`` inside the folder returned by
    ``Cf().create_file_folder(file=file)``. Each record's sequence is
    replaced by ``self.generate_rs(str(record.seq))`` and ``'_random_'`` is
    appended to its id, name and description.

    Args:
        file: Path of the fasta file to read.

    Returns:
        The path of the generated fasta file.
    """
    newpath = Cf().create_file_folder(file=file)
    filename, extension = os.path.splitext(os.path.basename(file))
    random_genome_file = os.path.join(
        newpath, os.path.normpath(filename + '_random' + extension))

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # universal newlines are the default for text mode on Python 3.
    with open(file, 'r') as GenomeFile, \
            open(random_genome_file, 'w') as RgFile:
        for record in FastaIterator(handle=GenomeFile):
            print('Creating random record for: ' + record.id)
            created_random_seq = self.generate_rs(str(record.seq))
            random_record = SeqRecord(
                BioPythonSeq(created_random_seq),
                id=record.id + '_random_',
                name=record.name + '_random_',
                description=record.description + '_random_')
            SeqIO.write(random_record, RgFile, 'fasta')
    # Both files are closed by the ``with`` block.
    return random_genome_file
def parse_file(self, file_path):
    """Analyse every record of a fasta file in consecutive windows.

    Each record is cut into windows of ``self.window_size`` letters and
    every window is passed to ``self.parse_sequence``; the final window
    holds whatever remains and may be shorter.

    Args:
        file_path: Path of the fasta file to analyse.

    Returns:
        A dict mapping record id -> dict mapping window start index -> the
        value returned by ``self.parse_sequence`` for that window.
    """
    data = {}
    print("Analysing: " + file_path)
    with open(file_path) as handle:
        for record in FastaIterator(handle):
            sequence = record.seq
            # Exclusive slice end. Using ``len - 1`` here dropped the last
            # letter of every record from the final window.
            end_index = len(sequence)
            windows = {}
            start_index = 0
            while start_index + self.window_size < end_index:
                windows[start_index] = self.parse_sequence(
                    sequence[start_index:start_index + self.window_size])
                start_index += self.window_size
            windows[start_index] = self.parse_sequence(
                sequence[start_index:end_index])
            data[record.id] = windows
    return data
def simple_check(self, filename):
    """Check parsing of a single-record FASTA file.

    The expected title and sequence come from ``read_title_and_seq``. The
    file is parsed first with ``FastaIterator`` and the ``title_to_ids``
    title function, then with ``SeqIO.read`` and its default settings.
    """
    msg = f"Test failure parsing file {filename}"
    title, seq = read_title_and_seq(filename)  # crude parser
    want_id, want_name, want_descr = title_to_ids(title)

    # Bio.SeqIO.FastaIO used directly, with the title function.
    parser = FastaIterator(filename, title2ids=title_to_ids)
    parsed = next(parser)
    with self.assertRaises(StopIteration):
        next(parser)
    self.assertEqual(parsed.id, want_id, msg=msg)
    self.assertEqual(parsed.name, want_name, msg=msg)
    self.assertEqual(parsed.description, want_descr, msg=msg)
    self.assertEqual(parsed.seq, seq, msg=msg)

    # Bio.SeqIO with default settings: id and name are the first word of
    # the title, the description is the whole title.
    parsed = SeqIO.read(filename, "fasta")
    first_word = title.split()[0]
    self.assertEqual(parsed.id, first_word, msg=msg)
    self.assertEqual(parsed.name, title.split()[0], msg=msg)
    self.assertEqual(parsed.description, title, msg=msg)
    self.assertEqual(parsed.seq, seq, msg=msg)
def FindGene(PATRICID, Header):
    """Collect the annotation and protein sequence of one PATRIC feature.

    Three files named after ``PATRICID`` are read: the special-gene table,
    the features table and the protein fasta. From each table the row whose
    ``patric_id`` equals ``Header`` is used, but only when exactly one row
    matches. Values from the special-gene table override those from the
    features table.

    Returns:
        A dict with the selected table columns and, when a fasta record
        with id ``Header`` exists, its sequence under ``'translation'``.
    """
    def _unique_row(table, columns):
        # The selected columns of the single matching row, or {} when zero
        # or several rows match.
        hits = table.index[table['patric_id'] == Header].tolist()
        if len(hits) != 1:
            return dict()
        return (table.loc[hits, columns]).to_dict('records')[0]

    spgene_table = pd.read_csv('/pylon5/br5phhp/tv349/AMR/PATRIC/SPGENE/' +
                               PATRICID + '.PATRIC.spgene.tab',
                               sep='\t')
    spgene_info = _unique_row(
        spgene_table, ['gene', 'product', 'property', 'function'])

    features_table = pd.read_csv(
        '/pylon5/br5phhp/tv349/AMR/PATRIC/FEATURES/' + PATRICID +
        '.PATRIC.features.tab',
        sep='\t')
    features_info = _unique_row(features_table, ['gene', 'product'])

    result = {**features_info, **spgene_info}

    # Get sequence
    with open("/pylon5/br5phhp/tv349/AMR/PATRIC/PROTEIN/" + PATRICID +
              ".PATRIC.faa") as handle:
        for record in FastaIterator(handle):
            if record.id == Header:
                result['translation'] = str(record.seq)
    return result
FNULL = open(os.devnull, 'w')
pid = os.getpid()

# Reverse complement the reverse primer.
ForPrimer = args.fwdprimer
RevPrimer = revcomp_lib.RevComp(args.revprimer)

# Single-argument print(...) calls produce the same output on Python 2
# and Python 3.
print('Loading ' + '{0:,}'.format(amptklib.countfasta(args.input)) +
      ' sequence records')
print('Searching for forward primer: %s, and reverse primer: %s' %
      (ForPrimer, RevPrimer))
print('Requiring reverse primer match with at least %i mismatches' %
      args.primer_mismatch)

# Loop through the sequences, remove the primers where found, and keep the
# trimmed sequences that are at least args.minlen long.
truncated = 'bold2amptk_' + str(pid) + '.truncate.tmp'
# The input handle is context-managed so it is closed after the loop.
with open(truncated, 'w') as output, open(args.input) as infile:
    for record in FastaIterator(infile):
        Seq = str(record.seq)
        StripSeq = ''
        ForCutPos = amptklib.findFwdPrimer(ForPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
        RevCutPos = amptklib.findRevPrimer(RevPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
        if ForCutPos and RevCutPos:
            # Both primers found: keep what lies between the cut positions.
            StripSeq = Seq[ForCutPos:RevCutPos]
        elif not ForCutPos and RevCutPos:
            # Only the reverse primer found: keep everything before it.
            StripSeq = Seq[:RevCutPos]
        if len(StripSeq) >= args.minlen:
            output.write('>%s\n%s\n' % (record.description, StripSeq))