def main():
    """Pair up two FASTA files record by record and write back-aligned output.

    The records of the file named by ``sys.argv[1]`` are walked in lockstep
    with those of the file named by ``sys.argv[2]``.  Each pair must share
    the same id (checked with ``assert``).  ``backalign`` receives the first
    record's sequence and the second record's sequence with ``'-'`` removed;
    its result is written to stdout as FASTA under the shared id with an
    empty description.
    """
    first_recs = parse(sys.argv[1], 'fasta')
    second_recs = parse(sys.argv[2], 'fasta')
    for rec_a, rec_b in zip(first_recs, second_recs):
        assert rec_a.id == rec_b.id
        gapless = rec_b.seq.ungap('-')
        aligned_str = backalign(rec_a.seq, gapless)
        out_rec = SeqRecord(Seq(aligned_str), id=rec_a.id, description="")
        write(out_rec, sys.stdout, 'fasta')
def retriving(a, b, c):
    """Gather one PDB chain and one UniProt entry into ``test.fasta``.

    Steps:
      1. Run ``python pdb_seq.py <pdbid>`` and save its stdout to
         ``pdb.fasta``.
      2. Re-read ``pdb.fasta`` and keep the records whose id, cut at the
         first ``_``, ends with the requested chain id.
      3. Download ``http://www.uniprot.org/uniprot/<uniid>.fasta`` and keep
         every record found there.
      4. Write all kept records to ``test.fasta``.

    Parameters:
        a: PDB id handed to ``pdb_seq.py``.
        b: chain id (compared against a single character).
        c: UniProt id used to build the download URL.
    """
    pdbid, chainid, uniid = a, b, c
    my_record = []

    seqpy = Popen(["python", "pdb_seq.py", pdbid], stdout=PIPE, stderr=PIPE)
    # communicate() already waits for the child to exit.
    stdout = seqpy.communicate()[0]
    # The pipe delivers raw bytes, so store them through a binary handle.
    with open('pdb.fasta', 'wb') as log:
        log.write(stdout)

    with open("pdb.fasta") as seqfile:
        for seq_record in parse(seqfile, "fasta"):
            if seq_record.id.split('_')[0][-1] == chainid:
                my_record.append(seq_record)

    url = 'http://www.uniprot.org/uniprot/' + uniid + '.fasta'
    seqfile2 = urlopen(url)
    try:
        my_record.extend(parse(seqfile2, "fasta"))
    finally:
        # Release the connection even if parsing fails.
        seqfile2.close()

    write(my_record, "test.fasta", "fasta")
def main():
    """Command-line entry point: write every input record through ``revcompl_recs``.

    Options come from ``parse_args(sys.argv)``; logging is configured from
    ``log_level``.  Records parsed from ``in_handle`` are passed through
    ``revcompl_recs`` and each result is written to ``out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    in_recs = parse(opts.in_handle, opts.fmt_infile)
    for out_rec in revcompl_recs(in_recs):
        write(out_rec, opts.out_handle, opts.fmt_outfile)
def main():
    """Print FASTA records in the leaf order of a ladderized tree.

    ``sys.argv[1]`` is read as a Newick tree and ``sys.argv[2]`` is indexed
    as FASTA.  An unrooted tree is rooted at its midpoint, the tree is
    ladderized with ``reverse=True``, and the record indexed under each
    terminal's name is written to stdout.
    """
    phylo = read(sys.argv[1], 'newick')
    rec_index = index(sys.argv[2], 'fasta')

    if not phylo.rooted:
        phylo.root_at_midpoint()
    phylo.ladderize(reverse=True)

    leaf_names = (terminal.name for terminal in phylo.get_terminals())
    for leaf_name in leaf_names:
        write(rec_index[leaf_name], sys.stdout, 'fasta')
def main():
    """Command-line entry point: write every input record through ``translate_recs``.

    SIGPIPE is reset to its default disposition before anything else runs.
    Options come from ``parse_args(sys.argv)``; records parsed from
    ``in_handle`` are passed to ``translate_recs`` together with
    ``code=args.code`` and each result is written to ``out_handle``.
    """
    signal(SIGPIPE, SIG_DFL)

    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    in_recs = parse(opts.in_handle, opts.fmt_infile)
    translated = translate_recs(in_recs, code=opts.code)
    for out_rec in translated:
        write(out_rec, opts.out_handle, opts.fmt_outfile)
def main():
    """Command-line entry point: write every input record through ``rename_recs``.

    ``rename_recs`` receives the parsed input records and whatever
    ``get_map`` builds from ``map_handle``.  The id of each resulting record
    is logged at debug level before the record is written to ``out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    in_recs = parse(opts.in_handle, opts.fmt_infile)
    name_map = get_map(opts.map_handle)
    for renamed in rename_recs(in_recs, name_map):
        logger.debug("Writing {}".format(renamed.id))
        write(renamed, opts.out_handle, opts.fmt_outfile)
def main():
    """Command-line entry point: write the records that ``rm_recs`` yields.

    All input records are loaded into memory first and the input is
    asserted to be non-empty.  ``rm_recs`` then receives that list and the
    result of ``get_list(list_handle)``; everything it yields is written to
    ``out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    loaded = list(parse(opts.in_handle, opts.fmt_infile))
    assert loaded
    listed = get_list(opts.list_handle)
    for kept in rm_recs(loaded, listed):
        write(kept, opts.out_handle, opts.fmt_outfile)
def main():
    """Command-line entry point: write every input record through ``backalign_recs``.

    The records of ``align_handle`` are first collected into a dict keyed
    by record id (a later duplicate id replaces an earlier one).  That dict
    and the records parsed from ``in_handle`` are handed to
    ``backalign_recs``; each record it yields is written to ``out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    align_index = {}
    for aligned in parse(opts.align_handle, opts.fmt_align):
        align_index[aligned.id] = aligned

    in_recs = parse(opts.in_handle, opts.fmt_infile)
    for out_rec in backalign_recs(in_recs, align_index):
        write(out_rec, opts.out_handle, opts.fmt_outfile)
def main():
    """Copy FASTA records from stdin to stdout, dropping the listed names.

    ``sys.argv[1]`` names a text file with one record name per line
    (surrounding whitespace is stripped).  Records whose ``name`` appears
    in that file are left out; the rest are written in one call, in their
    original order.
    """
    with open(sys.argv[1]) as handle:
        unwanted = {line.strip() for line in handle}

    kept = [
        rec for rec in parse(sys.stdin, 'fasta')
        if rec.name not in unwanted
    ]
    write(kept, sys.stdout, 'fasta')
def main():
    """Command-line entry point: write records chosen via a list.

    With ``match_order`` set, ``get_recs`` is iterated and each record is
    written individually.  Otherwise ``get_rec_list`` is called and its
    whole result is written in a single call.  Both helpers receive the
    parsed input records and the result of ``get_list(list_handle)``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    if opts.match_order:
        ordered = get_recs(parse(opts.in_handle, opts.fmt_infile),
                           get_list(opts.list_handle))
        for rec in ordered:
            write(rec, opts.out_handle, opts.fmt_outfile)
        return

    selected = get_rec_list(parse(opts.in_handle, opts.fmt_infile),
                            get_list(opts.list_handle))
    write(selected, opts.out_handle, opts.fmt_outfile)
def permute_fasta(f):
    '''
    Write a shuffled copy of the FASTA file ``f`` to ``f + '_permuted.fa'``.

    Every sequence is shuffled on its own, so its composition (% A, T, G, C)
    does not change.  The output file is overwritten if it already exists.
    Nothing is returned.

    f -- path of the input FASTA file
    '''
    mute = Bio.Seq.Seq.tomutable
    shuffle = random.shuffle
    with open(f + '_permuted.fa', 'w') as output:
        # Plain 'r' instead of 'rU': the 'U' flag is rejected by
        # Python 3.11+, and text mode already handles newlines there.
        with open(f, 'r') as fobj:
            for seq_rec in parse(fobj, 'fasta'):
                # random.shuffle works in place, so it needs a mutable
                # sequence.
                seq_rec.seq = mute(seq_rec.seq)
                shuffle(seq_rec.seq)
                write(seq_rec, output, 'fasta')
def main():
    """Command-line entry point: keep only records of the most common length.

    All input records are loaded, the modal record length is determined,
    and records of exactly that length are written to ``out_handle`` in
    input order.  Every other record triggers a
    ``cli.DropSequenceWarning`` naming its id, its length and the modal
    length.  Empty input produces no output.
    """
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    # Records are kept in a list rather than used as dict keys: SeqRecord
    # does not support equality comparison, so it is not a safe dict key.
    recs = list(parse(args.in_handle, args.fmt_infile))
    if not recs:
        # No records means there is no modal length to compute.
        return

    mode = Counter(len(rec) for rec in recs).most_common(1)[0][0]
    for rec in recs:
        if len(rec) == mode:
            write(rec, args.out_handle, args.fmt_outfile)
        else:
            warn(cli.DropSequenceWarning(
                "{} had length {}, not {}".format(rec.id, len(rec), mode)))
def main():
    """Command-line entry point: quality-trim FASTQ records and drop short ones.

    Each FASTQ record from ``in_handle`` is logged, passed to
    ``quality_trim`` with ``quality_threshold`` and ``keep_columns``, and
    then either written to ``out_handle`` or, when the resulting sequence
    is shorter than ``min_length``, reported through a
    ``cli.DropSequenceWarning`` and skipped.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    template = ("Length of sequence {} less than threshold. "
                "{} < {}. Dropping.")
    for raw in parse(opts.in_handle, 'fastq'):
        logger.debug(raw)
        trimmed = quality_trim(raw, opts.quality_threshold,
                               keep_columns=opts.keep_columns)
        n_bases = len(trimmed.seq)
        if n_bases < opts.min_length:
            message = template.format(trimmed.id, n_bases, opts.min_length)
            warn(message, cli.DropSequenceWarning)
            continue
        write(trimmed, opts.out_handle, opts.fmt_outfile)
def main():
    """Command-line entry point: fetch, exclude or reorder records by a list.

    The list file is read line by line (whitespace stripped).  Depending
    on the options the parsed input is filtered with ``exclude_iter``,
    with ``fetch_iter``, or with ``fetch_iter`` followed by ``order_iter``
    (which also receives the list in file order).  The result is written
    to stdout.

    Raises:
        ValueError: if ``match_order`` and ``excluding`` are both set.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    if opts.match_order and opts.excluding:
        raise ValueError("--match-order and --excluding cannot both be set.")

    wanted = []
    for line in opts.list_handle:
        wanted.append(line.strip())
    wanted_set = set(wanted)

    records = parse(opts.in_handle, opts.fmt_infile)
    if opts.excluding:
        selected = exclude_iter(records, wanted_set)
    else:
        selected = fetch_iter(records, wanted_set)
        if opts.match_order:
            selected = order_iter(selected, wanted)

    write(selected, sys.stdout, opts.fmt_outfile)
def main():
    """Command-line entry point: extract an amplicon from every input record.

    The hit table is read from ``table_handle`` and given a ``mis_sum``
    column (``mis_start + mis_stop``).  It is optionally filtered by
    ``max_mismatch`` and ``primer_set``.  ``get_amplicon`` is then called
    for each input record; the amplicon is written to ``out_handle``
    unless no hit information came back and ``drop`` is set, in which case
    a ``cli.DropSequenceWarning`` is issued instead.
    """
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    hits = read_table(args.table_handle)
    hits['mis_sum'] = hits.mis_start + hits.mis_stop
    # NOTE(review): a max_mismatch of 0 is falsy and therefore skips this
    # filter -- confirm whether 0 should mean "exact matches only".
    if args.max_mismatch:
        hits = hits[hits.mis_sum <= args.max_mismatch]
    if args.primer_set:
        hits = hits[hits.primer_set == args.primer_set]

    recs = parse(args.in_handle, args.fmt_infile)
    for rec in recs:
        amplicon, hit_info = get_amplicon(rec, hits,
                                          trim_primers=args.trim_primers)
        logger.debug(hit_info)
        if hit_info is None and args.drop:
            warn(cli.DropSequenceWarning(
                "No hit found for {rec.id}".format(rec=rec)))
        else:
            write(amplicon, args.out_handle, args.fmt_outfile)
def main():
    """Combine one FASTA record and one QUAL record into FASTQ on stdout.

    A single record is read from ``sys.argv[1]`` (FASTA) and another from
    ``sys.argv[2]`` (QUAL).  The FASTA record takes over the QUAL record's
    ``letter_annotations`` and is then written as FASTQ.
    """
    fasta_rec = read(sys.argv[1], 'fasta')
    qual_rec = read(sys.argv[2], 'qual')

    # FASTQ output needs the per-letter annotations on the record.
    fasta_rec.letter_annotations = qual_rec.letter_annotations
    write(fasta_rec, sys.stdout, 'fastq')
# Keep the hits whose e-value is below the cutoff, then write the matching
# ORF sub-sequence of each read to stdout as FASTA.
hits.columns = ['orf', 'model', 'e_value', 'score', 'bias']
hits = hits[hits.e_value < opts.e_value_cutoff]
if len(hits) == 0:
    # Nothing passed the cutoff: emit empty output and stop.
    sys.stdout.write("")
    sys.exit()
# Split each ORF name of the form "<read>(<start>-<stop>)" into its parts.
# NOTE(review): this relies on str.match returning the captured groups,
# which appears to be older pandas behaviour -- confirm the pandas version.
hits['read'], hits['start'], hits['stop'] = \
    np.array(hits.orf.str.\
             match('^(.*)\(([0-9]*)-([0-9]*)\)$').values.tolist()).T
read_set = set(hits.read)
out_recs = []  # not used in the visible part of the script
for rec in parse(args[1], opts.format):
    if rec.name in read_set:
        read_hits = hits[hits.read == rec.name]
        # Handle each read name once, even if the input file repeats it.
        read_set.remove(rec.name)
        orf_seq = Seq('')
        if len(read_hits) > 1:
            # If multiple orfs in one read then it's probably a frame
            # shift error or pseudo gene
            continue
        # NOTE(review): DataFrame.sort(column) appears to be an older
        # pandas API -- confirm against the installed version.
        for index, hit in read_hits.sort('start').iterrows():
            # This for-loop is completely unnecessary thanks to the if
            # statement above.
            start = int(hit['start'])
            stop = int(hit['stop'])
            if start < stop:
                # Coordinates are 1-based and inclusive.
                orf_seq += rec.seq[(start - 1):stop]
            elif start > stop:
                # start > stop: take the reverse complement of the span.
                orf_seq = rec.seq[(stop - 1):start].reverse_complement() + orf_seq
        orf_rec = SeqRecord(id=hit['orf'], seq=orf_seq, description='')
        write(orf_rec, sys.stdout, 'fasta')
def writer(foo, iterable):
    '''
    Write the SeqRecord objects from ``iterable`` to the FASTA file ``foo``.

    Warning: overwrites ``foo``, does not append.

    foo      -- path of the output file
    iterable -- SeqRecord objects to write
    '''
    # The context manager flushes and closes the file even if writing fails.
    with open(foo, 'w') as handle:
        write(iterable, handle, 'fasta')
def __enter__(self):
    """Create the temporary file, fill it with the records, return its path.

    The ``NamedTemporaryFile`` is built from ``self.kwargs`` and kept on
    ``self.temp_file``.  ``self.records`` are written to it by name in
    ``self.format``.
    """
    self.temp_file = NamedTemporaryFile(**self.kwargs)
    path = self.temp_file.name
    with open(path, "w") as out:
        write(self.records, out, self.format)
    return path
# Export, one FASTA file per ORF id, every assembly sequence whose id is in
# goodDomIDS, skipping nucleotide sequences that have already been seen.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source; confirm where the progress counter lines belong.
cas9Assemblies = listdir(assemblyDir)
goodDomIDS = load(open("pickles/%s_GoodDomainIDS.p" % (gene), "rb"))
goodDomMap = load(open("pickles/%s_GoodDomMap.p" % (gene), "rb"))
hmm_parser = load(open("pickles/%s_HMM_Parsing_Results.p" % (gene), "rb"))
print("All loaded")
#Copy unique nucleotide sequence
from Bio.SeqIO import index
# nukSeqHash holds the upper-cased sequences seen so far; protSeqHash is
# not used in the visible part of the script.
nukSeqHash, protSeqHash = set(), set()
alreadyGotIt, count = 0, 0
for assembly in cas9Assemblies:
    # Drop the last six characters of the file name (presumably ".fasta").
    baseID = assembly[:-6]
    allAssemblySeqs = index(assemblyDir + assembly, "fasta")
    overlap = goodDomIDS.intersection(allAssemblySeqs.keys())
    for recID in overlap:
        seq = str(allAssemblySeqs[recID].seq).upper()
        # A repeated sequence is skipped only when it maps to a single ORF.
        if seq in nukSeqHash and len(goodDomMap[recID]) == 1:
            alreadyGotIt += 1
            continue
        nukSeqHash.add(seq)
        #There may be more than 1 protein on the pseudochromosome, save both as separate files
        if len(goodDomMap[recID]) > 1:
            print("%i Cas9s on %s %s" % (len(goodDomMap[recID]), recID, baseID))
        for orfID in goodDomMap[recID]:
            # protSeq = str(hmm_parser.results[baseID].proteins[orfID].seq).upper()
            # The same record is written under each ORF id mapped to it.
            with open("assemblies/pseudoChromos/%s.fasta" % (orfID), "w") as fh:
                write(allAssemblySeqs[recID], fh, "fasta")
            count += 1
            # Progress indicator every 1000 files.
            if count % 1000 == 0:
                print(count, end=" ")
from sklearn.metrics import euclidean_distances
from random import randint
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from Bio.SeqIO import parse, write
from Bio.SeqRecord import SeqRecord

# Collect every GenBank record that carries at least one feature with a
# "product" qualifier.  Each such record is written once (upper-cased) to
# AnnotatedContigs.fa; each such feature gets one line of
# "<record id> <start> <end> <product>" in AnnotationDescripts.bed.
annotatedRegions = open("assemblies/AnnotatedContigs.fa", "w")
geneDescripts = open("annotations/AnnotationDescripts.bed", "w")
annotations = glob("mags/final.contigs.*/*.gbk")
seqs = set()  # ids of the records already written
hypoCounter = 0  # number of features whose product is "hypothetical protein"
for fname in annotations:
    print(fname)
    for rec in parse(fname, 'genbank'):
        for feature in rec.features:
            try:
                product = feature.qualifiers['product'][0]
            except (KeyError, IndexError):
                # Features without a product qualifier are not annotated
                # genes; skip them instead of swallowing every error.
                continue
            hypoCounter += int(product == "hypothetical protein")
            geneDescripts.write("%s\t%i\t%i\t%s\n" % (rec.id,
                                                      feature.location.start,
                                                      feature.location.end,
                                                      product))
            if rec.id in seqs:
                continue
            rec.seq = rec.seq.upper()
            write(rec, annotatedRegions, "fasta")
            seqs.add(rec.id)
annotatedRegions.close()
geneDescripts.close()
# hypoCounter is an int, so it is printed directly (len() would raise).
print(len(seqs), "Seqs with hypo prots:", hypoCounter)
# Command-line handling: input/output formats, optional gap removal, and at
# most one positional argument (the input file; stdin when absent).
parser = optparse.OptionParser(usage=usage)
parser.add_option(
    "-f", "--informat",
    dest="informat",
    default=_DEFAULT_FORMAT,
    help="the format of the input file. DEFAULT: {}".format(_DEFAULT_FORMAT),
)
parser.add_option(
    "-F", "--outformat",
    dest="outformat",
    default=_DEFAULT_FORMAT,
    help="the format of the output. DEFAULT: {}".format(_DEFAULT_FORMAT),
)
parser.add_option(
    "-u", "--ungap",
    dest="ungap",
    action="store_true",
    default=_DEFAULT_UNGAP,
    help=("should the input sequences have gap characters ('-' and '.') "
          "removed before translation? DEFAULT: {}").format(_DEFAULT_UNGAP),
)
opts, args = parser.parse_args()

# parser.error() prints the usage message and exits.
if len(args) > 1:
    parser.error("Too many arguments.")
rec_iter = parse(args[0] if args else sys.stdin, opts.informat)

# Replace each record's sequence with the output of transl() and emit it.
for rec in rec_iter:
    seq = rec.seq.ungap('-').ungap('.') if opts.ungap else rec.seq
    rec.seq = Seq(transl(seq))
    write(rec, sys.stdout, opts.outformat)
# Remember the description text found between ": " and " [" for this part.
DESCS[name] = re.search(r": (.*) \[", gb_archive.description).group(1)
rbs.type = "RBS"
rbs.qualifiers["note"] = [DESCS[name], "color: #f58a5e"]

# sort features by start location, source always first
# NOTE(review): the offset uses len(gb.seq), not gb_archive -- confirm that
# gb is the intended record.
gb_archive.features.sort(
    key=lambda f: (-len(gb.seq)) * (f.type == "source") + f.location.start)

# translate color from notes to ApEinfo
for feature in gb_archive.features:
    translate_color(feature)

# Fix the direct submission reference
if gb_archive.annotations["references"][-1].title == "Direct Submission":
    ref = gb_archive.annotations["references"][-1]
else:
    ref = Reference()
    ref.title = "Direct Submission"
    # annotations is a mapping; the new reference belongs in its
    # "references" list (calling .append on the mapping itself would fail).
    gb_archive.annotations["references"].append(ref)
ref.authors = "Larralde M"
ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"

# write the final record
dst_dir = os.path.abspath(
    os.path.join(__file__, "..", "..", "moclo-cidar", "registry", "cidar"))
dst_file = os.path.join(dst_dir, "{}.gb").format(info["id"])
write(gb_archive, dst_file, "gb")
def index_and_save(self, out_handle, format='fasta'):
    """Writes input sequences to out_handle in the specified format.
    Updates self.indexes simultaneously, to match the new file
    co-ordinates: each record id is mapped to the position of out_handle
    just before that record was written, and the index is saved at the end.

    out_handle -- writable handle that supports tell()
    format     -- output format name handed to Bio.SeqIO.write

    Raises IOError if out_handle cannot report its position.
    """
    try:
        pos = out_handle.tell()
    except IOError as e:
        # not seekable. out_handle has no context of position.
        msg = "{0}\n {1} has no knowledge of position."\
            .format(e, out_handle)
        raise IOError(msg)
    from Bio.SeqIO import write
    idx = self.indexes
    for seq_record in self.parse(self.in_handle):
        write(seq_record, out_handle, format)
        # pos still holds the offset at which this record started.
        idx[seq_record.id] = pos
        pos = out_handle.tell()
    idx.save()

@classmethod
def parse(cls, handle):
    """An iterator function that yield's SeqRecord objects from a readable
    file-like object full of fasta-formatted sequences. This serves the
    same purpose as BioPython's SeqIO.FastaIO.FastaIterator function. It
    doesn't actually do any indexing, as is intended to be used when
    indexing an output file, as in `self.index_and_save(...)`
    """
    sequence = ''
translate_color(feature) # merge annotations annotations = copy.deepcopy(gba.annotations) annotations.update(gbd.annotations) # Fix the direct submission annotation ref = annotations["references"][-1] ref.authors = "Larralde M" ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo" # Add the YTK type to the record comments annotations["comment"] = ["YTK:{}".format(type_)] # create the final record final = CircularRecord( seq=gba.seq, id=info["id"], name=info["id"], description=info["name"], dbxrefs=gba.dbxrefs + gbd.dbxrefs, features=features, annotations=annotations, ) # write the final record dst_dir = os.path.abspath( os.path.join(__file__, "..", "..", "moclo-ytk", "registry", "ptk")) dst_file = os.path.join(dst_dir, "{}.gb").format(info["id"]) write(final, dst_file, "gb")
def main():
    """Replace each record's sequence with ``inframe(seq)`` and print it.

    Records are read from the FASTA file named by ``sys.argv[1]`` and
    written to stdout as FASTA, one at a time.
    """
    records = parse(sys.argv[1], 'fasta')
    for record in records:
        new_seq = inframe(record.seq)
        record.seq = new_seq
        write(record, sys.stdout, 'fasta')