def main(fasta, prefix):
    # load data
    prots = {}
    with open(fasta, 'r') as infile:
        for rec in fasta_iter(infile):
            prot = Protein(rec)
            prots[prot.name] = prot

    # look for subsumed sequences
    subsumed = set()
    cnt = 0
    for key, value in list(prots.items()):
        cnt += 1
        if cnt % 100 == 0:
            print(f'{cnt} proteins checked')
        qry = value.sqn
        for rec in prots:
            # skip finding itself
            if key == rec:
                continue
            subj = prots[rec].sqn
            if qry in subj:
                subsumed.add(key)
                break

    # write out non-subsumed sequences
    print(f'Number of input sequences is {cnt}')
    print(f'Number of sequences subsumed within another is {len(subsumed)}')
    outname = '-'.join([prefix, 'nonSubsumed.fa'])
    with open(outname, 'w') as outfile:
        for key, value in prots.items():
            if key not in subsumed:
                print(value.print_prot(), file=outfile)
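# `Protein` and `fasta_iter` are assumed to come from helper modules elsewhere
# in this repo; below is a minimal sketch consistent with how main() uses them
# (.name, .sqn, and .print_prot()), assuming fasta_iter yields
# (header, sequence) tuples. Not the original implementation.
class Protein:
    def __init__(self, rec):
        # rec is a (header, sequence) tuple from fasta_iter
        self.name = rec[0]
        self.sqn = rec[1]

    def print_prot(self):
        # return a FASTA-formatted record ready for writing
        return '>' + self.name + '\n' + self.sqn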
def main():
    """distribute-gff.py

    Extract annotations on the given scaffolds

    usage: distribute-gff.py annotations.gff sequences.fa
    output: sequences.gff
    """
    gff = sys.argv[1]
    fasta = sys.argv[2]

    seqs = set()
    with open(fasta, "r") as fastafile:
        for rec in fasta_iter(fastafile):
            seqs.add(rec[0])

    # clumsy way of getting base name of fasta file
    fastaname = re.sub(".fa", "", re.sub(".fasta", "", fasta))

    outfile = open(fastaname + ".gff", "w")
    with open(gff, "r") as file_object:
        for line in file_object:
            if line[0] == "#":
                continue
            feature = GFF(line)
            if feature.seqid in seqs:
                outfile.write(line)
    outfile.close()
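# `GFF` is assumed to be a small record class from this repo's gfftools-style
# helpers; a minimal sketch exposing only the .seqid attribute used above,
# assuming standard tab-separated GFF3 columns. Not the original class.
class GFF:
    def __init__(self, line):
        fields = line.rstrip('\n').split('\t')
        self.seqid = fields[0]  # column 1: scaffold/sequence ID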
def main():
    # read in gene locus_tags
    tags = 'version3-gene-locus-tags.tsv'
    genes = {}
    with open(tags, 'r') as infile:
        for line in infile:
            gid, loc = line.strip().split('\t')
            genes[gid] = Gene(gid)
            genes[gid].add_tag(loc)

    # read in sequence IDs
    transcripts = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/shared-with-uvic/compare-proteins/v3-proteins.fa'
    with open(transcripts, 'r') as infile:
        for sid, sqn in fasta_iter(infile):
            gid = sid.split('-mRNA')[0]
            try:
                genes[gid].add_mrna(sid)
            except KeyError:
                print(f'{sid} not in gene table!')

    # write out all tags
    outname = 'version3-transcript-locus-tags.tsv'
    with open(outname, 'w') as outfile:
        for entry in genes:
            this_gene = genes[entry]
            for tag in this_gene.get_tags():
                this_mrna, this_tag = tag
                outbuff = ''.join([this_mrna, '\t', this_tag, '\n'])
                outfile.write(outbuff)
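# `Gene` is assumed to be a small container class defined elsewhere in this
# repo; a minimal sketch consistent with the add_tag/add_mrna/get_tags calls
# above, assuming get_tags() pairs each mRNA ID with the gene's locus tag.
# Not the original implementation.
class Gene:
    def __init__(self, gid):
        self.gid = gid
        self.tag = None
        self.mrnas = []

    def add_tag(self, tag):
        self.tag = tag

    def add_mrna(self, mrna_id):
        self.mrnas.append(mrna_id)

    def get_tags(self):
        # return (mRNA ID, locus tag) pairs for writing
        return [(mrna, self.tag) for mrna in self.mrnas]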
#!/usr/bin/env python
import sys

from fastatools import fasta_iter

infile = sys.argv[1]
bueno = ["A", "C", "G", "T", "N"]

with open(infile, "r") as fasta:
    for rec in fasta_iter(fasta):
        seqn = rec[1].upper()
        nam = ">" + rec[0]
        print(nam)
        for base in seqn:
            # replace any non-standard base with N
            if base not in bueno:
                sys.stdout.write("N")
                continue
            sys.stdout.write(base)
        sys.stdout.write("\n")
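# `fasta_iter` from fastatools is used throughout these scripts; a minimal
# sketch of the assumed behaviour: yield (header, sequence) tuples with the
# '>' stripped, the header truncated at the first whitespace, and multi-line
# sequences joined. The real helper may differ in detail.
def fasta_iter(handle):
    header, chunks = None, []
    for line in handle:
        line = line.rstrip('\n')
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            header, chunks = line[1:].split()[0], []
        else:
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)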
def main():
    """Extract CDS coordinates for each mRNA in a gff file, then print the
    corresponding sequence. The subfeatures MUST be sorted by their start
    coordinate.

    Usage: extract-cds.py genes.gff genome.fa > genes-cds.fa
    """
    if len(sys.argv) != 3:
        print(main.__doc__)
        sys.exit(1)

    gff = sys.argv[1]
    fasta = sys.argv[2]

    cds_coords = {}
    with open(gff, "r") as file_object:
        for line in file_object:
            if line[0] == "#":
                sys.stdout.write(line)
                continue
            feature = gfftools.GFF(line)
            if feature.type == "CDS":
                for parent in feature.parent:
                    if parent in cds_coords:
                        if feature.strand == "+":
                            cds_coords[parent]['coords'].append(
                                [feature.start, feature.end])
                        else:
                            cds_coords[parent]['coords'].insert(
                                0, [feature.start, feature.end])
                    else:
                        cds_coords[parent] = {
                            'coords': [[feature.start, feature.end]],
                            'scaf': feature.seqid,
                            'strand': feature.strand
                        }

    sequences = {}
    with open(fasta, "r") as file_object:
        for rec in fastatools.fasta_iter(file_object):
            sequences[rec[0]] = rec[1]

    for record in cds_coords:
        whole = ""
        for coords in cds_coords[record]['coords']:
            seqid = cds_coords[record]['scaf']
            if cds_coords[record]['strand'] == "-":
                piece = fastatools.revcomp(
                    sequences[seqid][int(coords[0]) - 1:int(coords[1])])  ### watch for OBO
            else:
                piece = sequences[seqid][int(coords[0]) - 1:int(coords[1])]  ### watch for OBO
            whole += piece
        sys.stdout.write(">" + record + "_CDS\n" + whole + "\n")
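# `fastatools.revcomp` is assumed to reverse-complement a nucleotide string;
# a minimal sketch of that helper, assuming only A/C/G/T/N input (any other
# character falls back to N). The real function may handle more IUPAC codes.
def revcomp(seq):
    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(comp.get(base, 'N') for base in reversed(seq.upper()))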
args = parser.parse_args()
oldsqn = args.OldFASTA
if args.NewFASTA:
    newsqn = args.NewFASTA

# read in old scaffs with sqn as key and id as value,
# then read in new scaffs, compare sqn to keys, and write out old and new ids upon match,
# or, if only one FASTA given, output unique sequences
seqdict = {}

collisions = open("collisions.txt", "w")
with open(oldsqn, "r") as infile:
    for rec in fasta_iter(infile):
        seqid = rec[0]
        sqn = rec[1]
        if sqn in seqdict:
            print("Collision between " + seqid + " AND previously added " + seqdict[sqn], file=collisions)
        else:
            seqdict[sqn] = seqid
collisions.close()

if args.NewFASTA:
    print("OldID\tNewID")
    with open(newsqn, "r") as infile:
        for rec in fasta_iter(infile):
            seqid = rec[0]
            sqn = rec[1]
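# The `parser` object is defined above this excerpt and is not shown; below is
# a hypothetical sketch of an argparse setup consistent with the
# args.OldFASTA / args.NewFASTA attributes used in the fragment. The help
# strings and description are assumptions, not the original code.
import argparse

parser = argparse.ArgumentParser(
    description='Report duplicate sequences within a FASTA file, or map old '
                'IDs to new IDs when a second FASTA is given.')
parser.add_argument('OldFASTA', help='original FASTA file')
parser.add_argument('NewFASTA', nargs='?', default=None,
                    help='optional second FASTA file to compare against')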