def main():
    for afa_rec, fn_rec in zip(parse(sys.argv[1], 'fasta'),
                               parse(sys.argv[2], 'fasta')):
        assert afa_rec.id == fn_rec.id
        afn_str = backalign(afa_rec.seq, fn_rec.seq.ungap('-'))
        write(SeqRecord(Seq(afn_str), id=afa_rec.id, description=""),
              sys.stdout, 'fasta')
def retriving(a, b, c):
    pdbid = a
    chainid = b
    uniid = c
    my_record = []
    log = open('pdb.fasta', 'w')
    seqpy = Popen(["python", "pdb_seq.py", pdbid], stdout=PIPE, stderr=PIPE)
    stdout = seqpy.communicate()[0]
    log.write(stdout)
    wait = seqpy.wait()
    log.close()
    seqfile = open("pdb.fasta")
    for seq_record in parse(seqfile, "fasta"):
        r = seq_record.id.split('_')
        if r[0][-1] == chainid:
            my_record.append(seq_record)
    seqfile.close()
    url = 'https://www.uniprot.org/uniprot/' + uniid + '.fasta'
    seqfile2 = urlopen(url)
    for seq_record in parse(seqfile2, "fasta"):
        r = seq_record.id.split('|')
        uniprot = r[1]
        my_record.append(seq_record)
    seqfile2.close()
    write(my_record, "test.fasta", "fasta")
def trimByConstant(inPath1, outPath1, inPath2=None, outPath2=None, cEnd=20,
                   minLen=75, cBegin=None, outFormat="fasta",
                   inFormat="fastq", stats=None):
    """Trim a constant number of nucleotides from the end of each read in a
    fastq file.

    If cBegin is given, it is used as the number of nucleotides to cut from
    the beginning of each read.
    """
    files = [parse(open(inPath1), inFormat)]
    if outPath1 is None:
        outFiles = [sys.stdout]
    else:
        outFiles = [open(outPath1, "w")]
    if inPath2:
        files.append(parse(open(inPath2), inFormat))  # was the undefined name "path2"
        outFiles.append(open(outPath2, "w"))
    return _trimNStreams(_trimReadByConstant, files, outFiles, outFormat,
                         stats, cEnd, cBegin, minLen)
def trimByQuality(path1, outPath1, path2=None, outPath2=None, minQual=20,
                  minLen=75, bothEnds=True, outFormat="fasta", stats=None):
    """Trim fastq reads according to a quality threshold.

    If path2 is given, path1 and path2 should contain reads from read1 and
    read2 of a paired-end library. Reads shorter than minLen after trimming
    will be discarded (together with their mate if path2 is given). If
    bothEnds is true, nucleotides with quality below minQual will also be
    removed from the beginning of the read.
    """
    files = [parse(open(path1), "fastq")]
    if outPath1 is None:
        outFiles = [sys.stdout]
    else:
        outFiles = [open(outPath1, "w")]
    if path2:
        files.append(parse(open(path2), "fastq"))
        outFiles.append(open(outPath2, "w"))
    return _trimNStreams(_trimReadByQuality, files, outFiles, outFormat,
                         stats, minQual, minLen, bothEnds)
def build_kmer_df_learn(lp_fasta, l_label=None):
    from Bio.SeqIO import parse
    from itertools import chain
    from pandas import DataFrame, Series, concat
    l_kmer_size = [1, 2, 3]
    l_letter = ['M', 'F', 'L', 'I', 'V', 'P', 'T', 'A', 'Y', 'H',
                'Q', 'N', 'K', 'D', 'E', 'C', 'R', 'S', 'W', 'G']
    l_kmer = list(chain(*[generate_kmer(kmer_size, l_letter, l_letter)
                          for kmer_size in l_kmer_size]))
    l_l_kmer_freq = []
    l_seq_id = []
    for p_fasta in lp_fasta:
        for record in parse(p_fasta, 'fasta'):
            l_seq_id.append(record.id)
            d_record = {}
            seq = str(record.seq)
            len_seq = len(seq)
            for i in range(len_seq):
                for kmer in [seq[i:i + kmer_size] for kmer_size in l_kmer_size
                             if i <= len_seq - kmer_size]:
                    d_record[kmer] = 1 if kmer not in d_record else d_record[kmer] + 1
            l_kmer_freq = []
            for kmer in l_kmer:
                l_kmer_freq.append(d_record[kmer] if kmer in d_record else 0)
            l_l_kmer_freq.append(l_kmer_freq)
    df_data = DataFrame(l_l_kmer_freq, columns=l_kmer, index=l_seq_id)
    if l_label:
        s_label = Series(name='label')
        for p_fasta, label in zip(lp_fasta, l_label):
            l_seq_id = [record.id for record in parse(p_fasta, 'fasta')]
            s_label = concat([s_label, Series(label, name='label', index=l_seq_id)])
        return df_data, s_label
    else:
        return df_data
def main(argv: Optional[List[str]] = None) -> int:
    parser = argument_parser()
    args = parser.parse_args(argv)
    try:
        query = parse(args.query, "fasta")
        reference = parse(args.reference, "fasta")
        print(orthoani(query, reference, threads=args.jobs))
        return 0
    except KeyboardInterrupt:
        print("Interrupted.", file=sys.stderr)
        return -signal.SIGINT
    except Exception as e:
        if args.traceback:
            print(
                "".join(better_exceptions.format_exception(type(e), e, e.__traceback__)),
                file=sys.stderr,
            )
        else:
            print(e, file=sys.stderr)
        return typing.cast(int, getattr(e, "errno", 1))
def removSeqsWithN(threshold, inStream1, outStream1, inStream2=None,
                   outStream2=None, fileForm="fastq"):
    """Write all sequences with at most threshold Ns to the out stream.

    If a second in/out stream pair is given, reads are treated as pairs and a
    pair is dropped if either mate exceeds the threshold. With threshold=-1,
    only reads consisting entirely of Ns are dropped.
    """
    t = 0
    n = 0
    strIt1 = parse(inStream1, fileForm)
    if inStream2 is not None:
        strIt2 = parse(inStream2, fileForm)
    while True:
        try:
            r1 = next(strIt1)  # Python 3: next() instead of .next()
        except StopIteration:
            break
        t += 1  # count the read only after it has been read successfully
        if threshold == -1:
            remove = r1.seq.count("N") == len(r1.seq)
        else:
            remove = r1.seq.count("N") > threshold
        if inStream2 is not None:
            r2 = next(strIt2)
            if threshold == -1:
                remove |= r2.seq.count("N") == len(r2.seq)
            else:
                remove |= r2.seq.count("N") > threshold
        if remove:
            n += 1
        else:
            outStream1.write(r1.format(fileForm))
            if inStream2 is not None:
                outStream2.write(r2.format(fileForm))
    return t, n
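# Usage sketch, assuming removSeqsWithN above and Bio.SeqIO's parse are in scope:
# filter a tiny in-memory FASTQ, dropping any read with more than one N.
from io import StringIO

demo_fq = "@r1\nACGTN\n+\nIIIII\n@r2\nNNNNN\n+\nIIIII\n"
demo_out = StringIO()
demo_total, demo_removed = removSeqsWithN(1, StringIO(demo_fq), demo_out)
print(demo_total, demo_removed)  # 2 reads seen, 1 removed (r2 has more than one N)
print(demo_out.getvalue())       # only r1 is kept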
def retriving(a, b, c):
    pdbid = a
    chainid = b
    uniid = c
    my_record = []
    log = open('pdb.fasta', 'w')
    seqpy = Popen(["python", "pdb_seq.py", pdbid], stdout=PIPE, stderr=PIPE)
    stdout = seqpy.communicate()[0]
    log.write(stdout)
    wait = seqpy.wait()
    log.close()
    seqfile = open("pdb.fasta")
    for seq_record in parse(seqfile, "fasta"):
        r = seq_record.id.split('_')
        if r[0][-1] == chainid:
            my_record.append(seq_record)
    seqfile.close()
    url = 'http://www.uniprot.org/uniprot/' + uniid + '.fasta'
    seqfile2 = urlopen(url)
    for seq_record in parse(seqfile2, "fasta"):
        r = seq_record.id.split('|')
        uniprot = r[1]
        my_record.append(seq_record)
    seqfile2.close()
    write(my_record, "test.fasta", "fasta")
def get_mgedb(mgepath):
    print("[-] Preparing mobile genetic element database")
    Path(mgepath).mkdir(parents=True, exist_ok=True)
    to_write = []
    for i in range(1, 15):
        file = fetch_url(
            f'https://raw.githubusercontent.com/katholt/'
            f'Kleborate/master/kleborate/data/ICEKp_references'
            f'/ICEKp{i}.embl', None, f'{mgepath}/icekp{i}')
        flist = open(file).readlines()
        parsing = False
        fasta = ''
        for line in flist:
            if line.startswith("//"):
                parsing = False
            if parsing:
                fasta += line.replace(" ", "").strip()
            if line.startswith("SQ"):
                parsing = True
        icekp = f'>ICEKp{i}\n' + ''.join([c for c in fasta if not c.isdigit()])
        for r in parse(StringIO(icekp), 'fasta'):
            to_write.append(r)
        remove(file)
    ice = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/ICE_seq_all.fas',
        None, mgepath + '/ice.fna')
    t4ss = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/T4SS-type_ICE_seq_all.fas',
        None, mgepath + '/t4ss.fna')
    aice = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/AICE_seq_all.fas',
        None, mgepath + '/aice.fna')
    ime = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/IME_seq_all.fas',
        None, mgepath + '/ime.fna')
    cime = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/CIME_seq_all.fas',
        None, mgepath + '/cime.fna')
    filenames = [ice, t4ss, aice, ime, cime]
    accessions = ['ICEKp1']
    for f in filenames:
        for r in parse(f, 'fasta'):
            r.id = r.id.split('|')[2]
            r.id = r.id.replace('[', '_')
            r.id = r.id.replace(']', '')
            if r.id not in accessions:
                accessions.append(r.id)
                to_write.append(r)
        remove(f)
    write(to_write, mgepath + '/mgedb', "fasta")
    return run_makeblastdb(mgepath + '/mgedb', 'nucl', f'{mgepath}/mgedb')
def test_multiple_contig(self):
    record = parse(fspath(self.data / "NZ_AAEN01000029.fna"), "fasta")
    with (self.data / "NZ_AAEN01000029.fna.chopped.fasta").open() as f:
        expected = list(parse(f, "fasta"))
    with tempfile.NamedTemporaryFile(mode="rt", suffix=".fna") as tmp:
        orthoani._chop(record, tmp.name, 1020)
        actual = list(parse(tmp, "fasta"))
    for actual_record, expected_record in zip(actual, expected):
        self.assertEqual(actual_record.seq, expected_record.seq)
def test_gibson_offtarget_primer2(self):
    """Create Gibson primers when there's offtarget in one's end (2)."""
    insert = next(parse(os.path.join(TEST_DIR, "BBa_K1649003.fa"), "fasta"))
    backbone = next(parse(os.path.join(TEST_DIR, "pDusk.fa"), "fasta"))
    plasmid, primer_pairs = gibson([insert, backbone])
    self.assertTrue(plasmid and primer_pairs)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    align_index = {rec.id: rec for rec in parse(args.align_handle, args.fmt_align)}
    for rec in backalign_recs(parse(args.in_handle, args.fmt_infile), align_index):
        write(rec, args.out_handle, args.fmt_outfile)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    if args.match_order:
        for rec in get_recs(parse(args.in_handle, args.fmt_infile),
                            get_list(args.list_handle)):
            write(rec, args.out_handle, args.fmt_outfile)
    else:
        recs = get_rec_list(parse(args.in_handle, args.fmt_infile),
                            get_list(args.list_handle))
        write(recs, args.out_handle, args.fmt_outfile)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    for rc_rec in revcompl_recs(parse(args.in_handle, args.fmt_infile)):
        write(rc_rec, args.out_handle, args.fmt_outfile)
def main():
    start_time = time()
    parser = cmd_parse()  # Parse command line arguments
    with open(parser['file']) as genome:
        fasta_genome = to_dict(parse(genome, 'fasta'))  # Read genome file
    jobs = parser['jobs']  # Number of processes to parallelize over
    fragments = parser['fragments_num']  # Number of fragments to get
    frags_per_core = [fragments // jobs] * (jobs - 1)
    frags_per_core.append(fragments - sum(frags_per_core))  # Number of fragments per process
    dis_file = parser['dis_file']  # Path of the empirical distribution file
    emp_dis = rfd(dis_file) if dis_file is not None else None  # Empirical distribution reading
    my_seed = parser['seed']  # Numpy seeding argument
    processes = []  # List of processes to parallelize
    for job, fragments_num in enumerate(frags_per_core):
        # Per-process processing of the seeding argument
        seeding = ((my_seed + job) % MAX_SEED if my_seed != -1 else my_seed) if my_seed is not None else None
        processes.append(Process(target=disassembler,
                                 args=(fasta_genome, parser['seq_type'], fragments_num,
                                       parser['out_file'], parser['depth'], parser['read_len'],
                                       job, seeding, emp_dis, parser['mean_len'])))
        processes[-1].start()
    for process in processes:
        process.join()
    print_verbose(
        'The program completed disassembling without any errors. Elapsed time={:f}'.format(time() - start_time),
        parser['session_id'], parser['logfile'], parser['verbose'], parser['params'])  # Parameters logging
def readFasta(inStream):
    """Read a fasta file and return a table of (sequence ID, length, GC content, N count)."""
    out = []
    for record in parse(inStream, "fasta"):
        out.append((record.id, len(record), GC(record.seq), record.seq.count("N")))
    return out
def prep_database(locus_type, gene_type):
    from sys import exc_info
    # attempt to build local dicts
    try:
        # screen input arguments
        locus_type = str(locus_type).upper()
        gene = str(gene_type).upper()
        # pull location of corresponding ref_db
        ref_db_file = all_ref_dbs[locus_type][gene_type]
        # prep return
        seq_dict = {}
        type_dict = {}
        # parse fasta file to build local ref_dbs as seq_dict and type_dict
        for nt in parse(ref_db_file, "fasta"):
            nts = str(nt.seq).lower()
            p = nt.description.split('|')
            allele = p[1]
            gene_type = p[3]
            seq_dict[allele] = nts
            type_dict[allele] = gene_type
        return seq_dict, type_dict
    # handle incorrect number of arguments passed
    except TypeError:
        print("Invalid input. prep_database() takes exactly 2 string inputs as arguments: locus_type and gene_type.")
        raise
    # handle invalid arguments passed
    except ValueError:
        print("Invalid input. prep_database() takes 2 string inputs as arguments: locus_type and gene_type.")
        raise
    # handle missing file location for locus+gene lookup
    except KeyError:
        print("Mismatch between locus_type and gene_type.")
        print("Specified gene_type may not be in the given locus_type, or locus_type is not a known option.")
        raise
    # handle missing ref_db file for locus+gene lookup
    except FileNotFoundError:
        # was missing the f-string prefix, so the placeholders were printed literally
        print(f"Reference file for locus_type {locus_type} and gene_type {gene_type} was not found.")
        raise
    # handle unknown error
    except:
        print("Unexpected error:", exc_info()[0])
        raise
def getReadStats(inStream):
    readStats = {
        "rId": [],
        "lane": [],
        "tile": [],
        "x": [],
        "y": [],
        "qual": [],
        "n_count": [],
        "length": []
    }
    for rec in parse(inStream, "fastq"):
        # position (Illumina id fields: instrument:run:flowcell:lane:tile:x:y)
        idArr = rec.id.split(" ")[0].split(":")
        lane = int(idArr[3])
        tile = int(idArr[4])
        x = int(idArr[5])
        y = int(idArr[6])
        readStats["rId"].append(rec.id)
        readStats["lane"].append(lane)
        readStats["tile"].append(tile)
        readStats["x"].append(x)
        readStats["y"].append(y)
        # mean quality (public letter_annotations instead of the private attribute)
        qual = float(sum(rec.letter_annotations["phred_quality"])) / len(rec)
        readStats["qual"].append(qual)
        # number of Ns
        nCount = rec.seq.count("N")
        readStats["n_count"].append(nCount)
        # length
        length = len(rec)
        readStats["length"].append(length)
    return readStats
def write_reads(readfile, reads, outfile, verbose):
    if not outfile:
        fh_out = sys.stdout
    else:
        fh_out = open(outfile, 'w')
    if ".gz" in readfile:
        fh_in = gz.open(readfile, 'rt')
    else:
        fh_in = open(readfile, 'r')
    written = 0
    if verbose:
        logging.info("Parsing reads in {}".format(readfile))
    for i, record in enumerate(parse(fh_in, "fastq"), start=1):
        if i % 100000 == 0 and verbose:
            logging.info("{} reads parsed, {} reads written...".format(i, written))
        read_id = record.id
        id_split = read_id.rsplit("/")[0]  # strip any /1 or /2 pair suffix
        if read_id in reads or id_split in reads:
            fh_out.write("{}".format(record.format("fastq")))
            written += 1
    if verbose:
        logging.info("{} reads parsed, {} reads written...Done\n".format(i, written))
def filter_seqs_by_len(infile, outfile, minlen):
    """Filters sequences by length

    Parameters
    ----------
    infile: str
        Sequence file in fasta format
    outfile: str
        File in fasta format containing sequences longer than minlen
    minlen: int
        Minimum size of sequences to keep
    """
    from Bio.SeqIO import parse, write
    i = 0
    with open(outfile, 'w') as fh:
        for record in tqdm.tqdm(parse(infile, 'fasta'), unit=" sequences",
                                ncols=100, desc="Filtering sequences"):
            if len(record) >= minlen:
                write(record, fh, "fasta")
                i += 1
    sys.stderr.write("{} sequences longer than {} written to {}\n".format(i, minlen, outfile))
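# Usage sketch, assuming filter_seqs_by_len above is in scope and that tqdm and sys
# are imported in the surrounding module; the file names are throwaway examples.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    demo_in = os.path.join(tmpdir, "in.fasta")
    demo_filtered = os.path.join(tmpdir, "out.fasta")
    with open(demo_in, "w") as fh:
        fh.write(">short\nACG\n>long\nACGTACGT\n")
    filter_seqs_by_len(demo_in, demo_filtered, 5)
    print(open(demo_filtered).read())  # only ">long" passes the 5 bp cutoff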
def percentages_from_proteins(path):
    file = open(path)
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []
    for record in parse(file, "genbank"):
        cdsnum = 0
        for feat in record.features:
            prot = record.seq
            analysed_seq = ProteinAnalysis(str(prot))  # ProteinAnalysis instance for this sequence
            # get_amino_acids_percent() returns a dictionary; store it in the list
            sequence_list.append(analysed_seq.get_amino_acids_percent())
            names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
            sources_list.append(record.annotations['source'])
            keyw_list.append(record.annotations['keywords'])
            taxo_list.append(record.annotations['taxonomy'])
            desc_list.append(record.description)
            taxid_list.append(record.annotations["organism"])
            cdsnum += 1
    # list of dictionaries to a numpy array
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
           'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    nseqs = len(sequence_list)
    percents = np.zeros((nseqs, 20))
    for i in range(nseqs):
        percdict = sequence_list[i]
        for an in range(20):
            percents[i, an] = percdict[aas[an]]
    return percents, names_list, sources_list, desc_list, taxo_list, keyw_list, taxid_list, sequence_list
def write_regions(f, fh, saf=False):
    for record in parse(f, "fasta"):
        if saf:
            fh.write("{}\t{}\t{}\t{}\t{}\n".format(record.id, record.id, 0, len(record), "+"))
        else:
            fh.write("{}\t{}\t{}\n".format(record.id, 0, len(record)))
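# Usage sketch, assuming write_regions above and Bio.SeqIO's parse are in scope:
# emit one BED-style line per record from an in-memory FASTA.
from io import StringIO

demo_regions = StringIO()
write_regions(StringIO(">contig1\nACGTACGT\n"), demo_regions)
print(demo_regions.getvalue(), end="")  # contig1<TAB>0<TAB>8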
def test_lincoln(self):
    """Create a set of plasmids from a Combinatorial assembly."""
    records = []
    test_dir = os.path.join(TEST_DIR, "lincoln")
    for (_, _, filenames) in os.walk(test_dir):
        for file in filenames:
            if not file.endswith(".fa"):
                continue
            test_file = os.path.join(test_dir, file)
            for record in parse(test_file, "fasta"):
                records.append(record)
    design = Combinatorial(records)
    protocol = GoldenGate(design, enzymes=[BsaI], separate_reagents=True)
    protocol.run()
    csv_output = protocol.to_csv(os.path.join(OUT_DIR, "lincoln.layout.csv"))
    protocol.to_picklists(os.path.join(OUT_DIR, "lincoln.hamilton.csv"), platform="hamilton")
    self.assertTrue(protocol.output)
    self.assertEqual(1, csv_output.count("Plate:1"))
    self.assertIn(",,,,,,,,,,,,,A", csv_output)  # empty wells in plate 1 before plate 2
def translateSequence(self, file_handle, stop):
    # Note: IUPAC alphabets and Alphabet._verify_alphabet require Biopython < 1.78;
    # they were removed from later releases.
    records = parse(file_handle, "fasta")
    for record in records:
        self.sequence = Seq(str(record.seq), IUPAC.unambiguous_rna)
        self.name = record.name
        print("Name: {}".format(self.name))
        if Alphabet._verify_alphabet(self.sequence) == True and stop == 'y':
            self.translated_seq = self.sequence.translate(to_stop=True)
            print("Sequence: {}".format(self.sequence))
            print("Translated sequence: {}".format(self.translated_seq))
            print("------------------------------------------------------------")
        elif Alphabet._verify_alphabet(self.sequence) == True and stop == 'n':
            self.translated_seq = self.sequence.translate()
            print("Sequence: {}".format(self.sequence))
            print("Translated sequence: {}".format(self.translated_seq))
            print("------------------------------------------------------------")
        else:
            print("This sequence is not RNA, so it cannot be translated. Load a correct sequence.")
def combineFastaFiles(inputDirectory, outputDirectory, pattern):
    seqObjects = []
    print('I am inside the combineFastaFiles method.')
    for root, subdirs, files in walk(inputDirectory):
        # print('walking in this directory:' + root)
        for fileName in files:
            # print('checking this file:' + fileName)
            fullPath = join(root, fileName)
            relativePath = relpath(fullPath, inputDirectory)
            # print('full path:' + fullPath)
            # print('the relative path:' + relativePath)
            # If the filename matches the pattern
            if pattern in relativePath:
                print('This is the file we are looking for: ' + relativePath)
                for record in parse(fullPath, "fasta"):
                    record.id = record.id + ' ' + relativePath.replace('/', '_').replace('\\', '_')
                    record.description = ''
                    seqObjects.append(record)
    # Print sequences to file.
    outputFileName = join(outputDirectory, 'CombinedSequences.fasta')
    outputFile = createOutputFile(outputFileName)
    write(seqObjects, outputFile, 'fasta')
    outputFile.close()
def get_genomic_ref(fasta: str, outfile: str, gene_dict: dict) -> None:
    """Parses an ungapped FASTA file and writes genes+alleles to a FASTA file.

    This function uses only the alleles which were selected from the gapped
    FASTA. This ensures that the anchor file and genomic reference file
    correspond to one another.

    Parameters
    ----------
    fasta : str
        Ungapped FASTA file.
    outfile : str
        Path for the FASTA file to be written.
    gene_dict : dict
        Dictionary keyed by gene+allele combination, holding only two alleles
        per gene. For each gene+allele it stores the number of gaps and the
        type of functionality.

    Returns
    -------
    None
    """
    seqs = []
    headers = []
    for seq in parse(fasta, 'fasta'):
        headers.append(seq.description)
        seqs.append(str(seq.seq))
    with open(outfile, 'w') as f:
        for idx, header in enumerate(headers):
            if header in gene_dict:
                f.write(">" + header + "\n")
                f.write(seqs[idx] + "\n")
def auto_detect_read_length(seqfile, file_type):
    """ Find median read length from first 10K reads in seqfile """
    valid_lengths = [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
                     175, 200, 225, 250, 300, 350, 400, 450, 500]
    read_lengths = []
    try:
        seq_iterator = parse(open_file(seqfile), file_type)
        for index, record in enumerate(seq_iterator):
            if index == 10000:
                break
            read_lengths.append(len(record.seq))
    except Exception:
        sys.exit("Could not detect read length of: %s\nThis may be due to an invalid format\nTry specifying it with -l" % seqfile)
    median_read_length = int(median(read_lengths))
    if median_read_length < valid_lengths[0]:
        sys.exit("Median read length is %s. Cannot compute AGS using reads shorter than 50 bp." % median_read_length)
    for index, read_length in enumerate(valid_lengths):
        if read_length > median_read_length:
            return valid_lengths[index - 1]
    return valid_lengths[-1]
def count(foo):
    '''takes a file named foo and returns the number of FASTA records in it'''
    f = parse(open(foo), 'fasta')  # 'rU' mode is deprecated; default text mode is fine
    n = 0
    for dummyX in f:
        n += 1
    return n
def get_flankdb(flankpath):
    Path(flankpath).mkdir(parents=True, exist_ok=True)
    print("[-] Preparing flanking virulence gene database")
    patric = fetch_url(
        'ftp://ftp.patricbrc.org/specialty_genes/referenceDBs/PATRIC_VF.faa',
        None, flankpath + '/patric.faa')
    victors = fetch_url(
        'http://www.phidias.us/victors/downloads/gen_downloads_protein.php',
        None, flankpath + '/victors.faa')
    vfdb = fetch_url(
        'http://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz',
        None, flankpath + '/vfdb.faa.gz')
    params = {'query': 'siderophore AND '
                       'taxonomy:"Bacteria [2]" AND '
                       'NOT receptor NOT partial NOT fragment',
              'format': 'fasta'}
    bgcs = fetch_url('http://www.uniprot.org/uniprot/', params, flankpath + '/bgcs.faa')
    filenames = [patric, victors, vfdb, bgcs]
    db = ''
    for fname in filenames:
        if fname.endswith('.gz'):
            with gopen(fname, 'rt') as infile:
                for line in infile:
                    db += line
        else:
            with open(fname, 'rt') as infile:
                for line in infile:
                    db += line
        remove(fname)
    d1 = db.count('>')
    print(f"[-] {d1} total proteins downloaded")
    accessions = set()
    db2 = ''
    for r in parse(StringIO(db), 'fasta'):
        if r.id not in accessions:
            accessions.add(r.id)
            db2 += r.format('fasta')
    d2 = db2.count('>')
    print(f"[-] Removed {d1 - d2} duplicate accessions")
    fasta_lines = db2.split('>')  # splits each sequence by its header

    def remove_complete_duplicates(fasta_lines):
        print("[>] Removing redundancy... ", end="", flush=True)
        outputlist, setofuniqsequence = [], set()
        for sequence in fasta_lines:
            if sequence not in setofuniqsequence:
                outputlist.append(sequence)
                setofuniqsequence.add(sequence)
        print(f"{len(outputlist)} proteins remaining")
        return outputlist

    with open(flankpath + '/flankdb', 'w') as flank_file:
        flank_file.write('>'.join(remove_complete_duplicates(fasta_lines)))
    return run_makeblastdb(flankpath + '/flankdb', 'prot', flankpath + '/flankdb')
def main():
    signal(SIGPIPE, SIG_DFL)
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    for trans_rec in translate_recs(parse(args.in_handle, args.fmt_infile), code=args.code):
        write(trans_rec, args.out_handle, args.fmt_outfile)
def import_primer_rev(index):
    # Import primers
    local_path = pathlib.Path(__file__).parent.absolute()
    primer_list = list(parse(str(local_path) + '/reverse_finalprimers.fasta', 'fasta'))
    # Extract the primer at the requested index
    primer = str(primer_list[index].seq)
    return primer
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    for rec in rename_recs(parse(args.in_handle, args.fmt_infile), get_map(args.map_handle)):
        logger.debug("Writing {}".format(rec.id))
        write(rec, args.out_handle, args.fmt_outfile)
def convertWithCtable(file_, ctable, out):
    """ Convert the names of a fasta using a conversion table (one "<old id> <new id>" pair per line)"""
    from Bio.SeqIO import parse, write
    # the original dict comprehension tried to unpack single tokens; build the mapping per line instead
    d = dict(line.strip().split() for line in open(ctable) if line.strip())
    sequences = [f for f in parse(file_, 'fasta')]
    for s in sequences:
        s.id = d.get(s.id, s.id)  # keep the original id if it is not in the table
    write(sequences, out, 'fasta')
    return
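# Usage sketch, assuming convertWithCtable above is in scope; the conversion table
# holds two whitespace-separated columns per line (old id, new id), and the file
# names here are throwaway examples.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    demo_fa = os.path.join(tmpdir, "in.fa")
    demo_tab = os.path.join(tmpdir, "ids.tsv")
    demo_renamed = os.path.join(tmpdir, "out.fa")
    open(demo_fa, "w").write(">seq1\nACGT\n")
    open(demo_tab, "w").write("seq1 contig_1\n")
    convertWithCtable(demo_fa, demo_tab, demo_renamed)
    print(open(demo_renamed).read())  # header now starts with >contig_1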
def truncate_seqs(f, n):
    '''
    truncate FASTA seqs

    truncates sequences in a FASTA file named f to at most n bases,
    writing a new FASTA file named <f><n>.fa
    '''
    seq_recs = parse(open(f), 'fasta')  # 'rU' mode is deprecated
    # USE FILENAME CORRECTION SCHEME
    foo = f + str(n) + '.fa'
    writer(foo, (rec[0:n] for rec in seq_recs))
def mature_accession_to_name(mature_accessions, mature_fa=None):
    """Convert mature miRNA accessions to their respective names"""
    if mature_fa is None:
        mature_fa = _retrieve_mature()
    mton = {
        record.description.split(' ')[1]: record.id
        for record in parse(mature_fa, 'fasta')
    }
    return mature_accessions, mton
def remove_alignment_gap(a):
    if a.ofile:
        out = a.ofile
    else:
        out = a.ifile.split('.')[0] + '_nogap.fa'
    with open(out, 'w') as f:
        for rec in parse(a.ifile, 'fasta'):
            f.write('>' + rec.id + '\n')
            f.write(str(rec.seq).replace('-', '') + '\n')
def initialize_graph(genome):
    import networkx as nx
    from Bio.SeqIO import parse
    G = nx.Graph()
    contigs = [r for r in parse(genome, 'fasta')]
    for c in contigs:
        id_, length = c.id, len(c.seq)
        G.add_node(id_, length=length)
    return G
def get_seq_from_files(filename):
    ext = filename.split('.')[-1]
    table = {'fasta': 'fasta', 'gbk': 'genbank'}
    fmt = table.get(ext, 'fasta')
    handle = open(filename)
    for r in parse(handle, fmt):
        print(r.id)   # Python 3 print function (was a Python 2 print statement)
        print(r.seq)
    handle.close()
def initialize_graph(genome):
    import networkx as nx
    from Bio.SeqIO import parse
    G = nx.Graph()
    contigs = parse(genome, 'fasta')
    for c in contigs:
        id_, length = c.id, len(c.seq)
        G.add_node(id_, length=length)
    return G
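# Usage sketch, assuming either initialize_graph above is in scope (networkx required):
# one node per contig, annotated with its length.
from io import StringIO

demo_graph = initialize_graph(StringIO(">c1\nACGT\n>c2\nACGTACGT\n"))
print(demo_graph.nodes(data=True))  # [('c1', {'length': 4}), ('c2', {'length': 8})]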
def mature_name_to_accession(mature_names, mature_fa=None):
    """Convert miRNA names to their respective accessions"""
    if mature_fa is None:
        mature_fa = _retrieve_mature()
    ntom = {
        record.id: record.description.split(' ')[1]
        for record in parse(mature_fa, 'fasta')
    }
    return mature_names, ntom
def store_lengths(f, minlen=False):
    r = {}
    for record in parse(f, "fasta"):
        if minlen:
            if len(record.seq) < minlen:
                continue
        r[record.id] = len(record.seq)
    df = pd.DataFrame(r, index=["length"]).T
    return df
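# Usage sketch, assuming store_lengths above is in scope and pandas is imported as pd:
# build a length table from an in-memory FASTA, dropping records shorter than 3 bp.
from io import StringIO

demo_lengths = store_lengths(StringIO(">a\nACGT\n>b\nAC\n"), minlen=3)
print(demo_lengths)  # a single row for "a" with length 4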
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    recs = list(parse(args.in_handle, args.fmt_infile))
    assert len(recs) > 0
    for rec in rm_recs(recs, get_list(args.list_handle)):
        write(rec, args.out_handle, args.fmt_outfile)
def splitfasta(infile, wrap=False):
    for seq in parse(infile, 'fasta'):
        outfile = str(seq.id)
        for p in string.punctuation:
            outfile = outfile.replace(p, '_')
        outfile = outfile + '.fasta'
        with open(outfile, 'w') as fh:
            fasta_out = FastaIO.FastaWriter(fh, wrap=wrap)
            fasta_out.write_header()  # Does nothing, but required
            fasta_out.write_record(seq)
            fasta_out.write_footer()  # Does nothing, but required
def random_seq(foo, n):
    '''takes a file foo and returns n random sequences from it'''
    max_n = count(foo)
    # pick n distinct record numbers (the original repeated a single randint n times,
    # which could only ever select one record)
    record_numbers = set(random.sample(range(1, max_n + 1), n))
    seq_recs = parse(open(foo), 'fasta')  # 'rU' mode is deprecated
    i = 0
    seqs = []
    for rec in seq_recs:
        i += 1
        if i in record_numbers:
            seqs.append(rec)
    return seqs
def main():
    with open(sys.argv[1]) as names_handle:
        remove_names = set(line.strip() for line in names_handle)
    out_recs = []
    for rec in parse(sys.stdin, 'fasta'):
        if rec.name in remove_names:
            continue
        else:
            out_recs.append(rec)
    write(out_recs, sys.stdout, 'fasta')
def permute_fasta(f):
    '''
    takes a FASTA file and returns a new FASTA file with each sequence
    randomly permuted (separately, such that its % A,T,G,C doesn't change)
    '''
    with open(f + '_permuted.fa', 'w') as output:
        with open(f) as fobj:  # 'rU' mode is deprecated
            for seq_rec in parse(fobj, 'fasta'):
                # shuffle a plain list of letters; Seq.tomutable() is no longer
                # available in newer Biopython releases
                letters = list(str(seq_rec.seq))
                random.shuffle(letters)
                seq_rec.seq = Bio.Seq.Seq(''.join(letters))
                write(seq_rec, output, 'fasta')
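# Usage sketch, assuming permute_fasta above is in scope (with Bio.Seq, parse, write
# and random imported in the surrounding module); the file name is a throwaway example.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    demo_path = os.path.join(tmpdir, "toy.fa")
    open(demo_path, "w").write(">s1\nAAACCCGGGTTT\n")
    permute_fasta(demo_path)
    print(open(demo_path + "_permuted.fa").read())  # same base composition, shuffled order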
def auto_detect_fastq_format(seqfile):
    """ Use first 50,000 reads to detect quality encoding """
    max_reads = 50000
    formats = ['fastq-illumina', 'fastq-solexa', 'fastq-sanger']
    for format in formats:
        try:
            index = 0
            seq_iterator = parse(open_file(seqfile), format)
            for rec in seq_iterator:
                if index == max_reads:
                    break
                index += 1
            return format
        except Exception:
            pass
    sys.exit("Could not detect quality score encoding of: %s\nThis may be due to an invalid format\nTry specifying it with -c" % seqfile)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    all_recs = OrderedDict()
    for rec in parse(args.in_handle, args.fmt_infile):
        all_recs[rec] = len(rec)
    mode = Counter(all_recs.values()).most_common()[0][0]
    for rec in all_recs:
        if all_recs[rec] == mode:
            write(rec, args.out_handle, args.fmt_outfile)
        else:
            warn(cli.DropSequenceWarning(
                "{} had length {}, not {}".format(rec.id, len(rec), mode)))
def FastaToFDB(self, fastafile):
    fdb_registers = []
    content = open(fastafile)
    sequences = parse(content, 'fasta')
    for sequence in sequences:
        fdb_register = FDBRegister()
        fdb_register.filename = fastafile
        fdb_register.description = sequence.id
        fdb_register.gene = str(sequence.seq)
        fdb_registers.append(fdb_register)
    content.close()
    return self.mount_fdb_file(fdb_registers)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    for rec_in in parse(args.in_handle, 'fastq'):
        logger.debug(rec_in)
        rec_out = quality_trim(rec_in, args.quality_threshold, keep_columns=args.keep_columns)
        length = len(rec_out.seq)
        if length < args.min_length:
            warn(("Length of sequence {} less than threshold. "
                  "{} < {}. Dropping.").format(rec_out.id, length, args.min_length),
                 cli.DropSequenceWarning)
        else:
            write(rec_out, args.out_handle, args.fmt_outfile)
def fetch_names(id_list):
    organism_names = {}
    # Fetch 100 at a time to keep requests to NCBI reasonably small
    for i in range(0, len(id_list), 100):
        j = i + 100
        if j >= len(id_list):
            j = len(id_list)
        sys.stderr.write("Fetching entries from %s to %s from GenBank\n" % (i, j))
        sys.stderr.flush()
        result_handle = Entrez.efetch(db=db, rettype="gb", id=id_list[i:j])
        # Populate result per organism name
        for record in parse(result_handle, 'genbank'):
            # Using the NCBI name, which should match the accession number passed
            organism_names[record.name] = record.annotations['organism']
    return organism_names
def FastaToFDB(self, fastafile, username):
    fdb_registers = []
    content = open(fastafile, "r")
    sequences = parse(content, 'fasta')
    for sequence in sequences:
        fdb_register = FDBRegister()
        fdb_register.description = sequence.description
        fdb_register.gene = str(sequence.seq)
        fdb_register.geneinfo = sequence.annotations
        fdb_register.filename = fastafile
        fdb_register.date = date.today()
        fdb_register.user = username
        fdb_registers.append(fdb_register)
    content.close()
    return self.mount_fdb_file(fdb_registers)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    if args.match_order and args.excluding:
        raise ValueError("--match-order and --excluding cannot both be set.")
    to_fetch = [line.strip() for line in args.list_handle]
    fetch_set = set(to_fetch)
    rec_iter = parse(args.in_handle, args.fmt_infile)
    if args.excluding:
        out_iter = exclude_iter(rec_iter, fetch_set)
    elif args.match_order:
        out_iter = order_iter(fetch_iter(rec_iter, fetch_set), to_fetch)
    else:
        out_iter = fetch_iter(rec_iter, fetch_set)
    write(out_iter, sys.stdout, args.fmt_outfile)
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)
    hits = read_table(args.table_handle)
    hits['mis_sum'] = hits.mis_start + hits.mis_stop
    if args.max_mismatch:
        hits = hits[hits.mis_sum <= args.max_mismatch]
    if args.primer_set:
        hits = hits[hits.primer_set == args.primer_set]
    recs = parse(args.in_handle, args.fmt_infile)
    for rec in recs:
        amplicon, hit_info = get_amplicon(rec, hits, trim_primers=args.trim_primers)
        logger.debug(hit_info)
        if hit_info is None and args.drop:
            warn(cli.DropSequenceWarning("No hit found for {rec.id}".format(rec=rec)))
        else:
            write(amplicon, args.out_handle, args.fmt_outfile)
def process_seqfile(args, paths):
    """ Sample high quality reads from seqfile """
    if args['verbose']:
        print("====Estimating Average Genome Size====")
        print("Sampling & trimming reads...")
    outfile = open(paths['tempfile'], 'w')
    # loop over sequences
    read_id, dups, too_short, low_qual = 0, 0, 0, 0
    seqs = set([])
    for seqfile in args['seqfiles']:
        i = 0
        try:
            seq_iterator = parse(open_file(seqfile),
                                 args['fastq_format'] if args['file_type'] == 'fastq' else 'fasta')
            for rec in seq_iterator:
                i += 1
                # record sequence if enough high quality bases remain
                if len(rec.seq) < args['read_length']:
                    too_short += 1
                    continue
                # check if sequence is a duplicate
                elif args['filter_dups'] and (str(rec.seq) in seqs or str(rec.seq.reverse_complement()) in seqs):
                    dups += 1
                    continue
                # check if sequence is low quality
                elif quality_filter(rec, args):
                    low_qual += 1
                    continue
                # keep seq
                else:
                    outfile.write('>' + str(read_id) + '\n' + str(rec.seq[0:args['read_length']]) + '\n')
                    read_id += 1
                    if args['filter_dups']:
                        seqs.add(str(rec.seq))
                    if read_id == args['nreads']:
                        break
            if read_id == args['nreads']:
                break
        except Exception as e:  # Python 3 syntax; was "except Exception, e"
            error = "\nAn error was encountered when parsing sequence #%s in the input file: %s\n" % (i + 1, seqfile)
            error += "Make sure that the sequence and quality headers match for each sequence (except the 1st character)\n"
            error += "See: https://en.wikipedia.org/wiki/FASTQ_format"
            clean_up(paths)
            sys.exit(error)
from scipy.spatial.distance import squareform
import numpy as np
from functions import *
from scoring import *
from Bio.SeqIO import parse

handle = open("unknown-proteobacteriae-pubmed-8422969.fasta", "r")
# materialize the records as a list of strings (a bare map() iterator would be
# exhausted after the first pass in Python 3)
sequences = [str(x.seq) for x in parse(handle, "fasta")]

# copy the sequences array
seqs_tmp = [seq for seq in sequences]
similarity_vector = []

# get all pairs of sequences and get the distances
while len(seqs_tmp) > 0:
    v1 = seqs_tmp.pop(0)
    for v2 in seqs_tmp:
        a1, a2 = pairwise_alignment(v1, v2, -2, exact_match)
        similarity_vector.append(pairwise_distance(a1, a2))

distance_matrix = 1 - np.array(similarity_vector)
distance_matrix = squareform(distance_matrix)
# print(np.round(distance_matrix, decimals=2))

t = build_guiding_tree(distance_matrix, sequences)
# print(t)
msa_wrapper(t, exact_match)
msa_wrapper(t, average_match)
from Bio.SeqIO import parse
from Bio import pairwise2
from Bio.Align.Applications import MuscleCommandline

# all CDS of mercurialis
# /scratch/cluster/monthly/gcossard/Hydrexpr_Kallisto/Hydrexpr_reads/CDS_listofbams2.txt.fas
# /scratch/cluster/monthly/gcossard/SNPcall_EXPR/AllCDSMannua_v_AllCDSRicinus.txt
# BLAST output: /scratch/cluster/monthly/gcossard/SNPcall_EXPR/ALLCDS_v14.fasta against /scratch/cluster/monthly/gcossard/Ricinus_data/TIGR_castorWGS_release_0.1.cds.fsa
blastFile = "/scratch/cluster/monthly/gcossard/SNPcall_EXPR/AllCDSMannua_v_AllCDSRicinus.txt"
ricinusFile = "/scratch/cluster/monthly/gcossard/Ricinus_data/TIGR_castorWGS_release_0.1.cds.fsa"
mercuFile = "/scratch/cluster/monthly/croux/guillaume/ALLCDS_v14.fasta"

ricinus = {}
infile = parse(ricinusFile, "fasta")
for i in infile:
    ricinus[i.id] = i.seq
infile.close()

mercu = {}
infile = parse(mercuFile, "fasta")
for i in infile:
    gene = i.id
    if gene not in mercu:
        mercu[gene] = {}
    mercu[gene] = i.seq
infile.close()

blast = {}
infile = open(blastFile, "r")
#!/software/bin/python2.7
from Bio.SeqIO import parse

infile = "/scratch/cluster/monthly/croux/mercurialis/Bams/reads2snps/diploids/orf_fastas/consensus_annua_orf_geneCapture.fas"
input = parse(infile, "fasta")

cnt1 = 0
cnt2 = 0
res = ""
for i in input:
    cnt1 += 1
    res += ">{0}\n{1}\n".format(i.id, i.seq)
    if cnt1 % 100 == 0:
        cnt2 += 1
        output = open("input_blast_{0}.fas".format(str(cnt2)), "w")
        output.write(res)
        output.close()
        res = ""
        cnt1 = 0

cnt2 += 1
output = open("input_blast_{0}.fas".format(str(cnt2)), "w")
output.write(res)
output.close()
def reader(foo):
    '''
    generator yielding Bio.Seq.Seq objects from a FASTA file
    '''
    for record in parse(open(foo), 'fasta'):  # 'rU' mode is deprecated
        yield record.seq