def splitFastaFile(infile, informat, outdir):
    """Split a multi-record sequence file into one FASTA file per record.

    Every record parsed from ``infile`` (read as ``informat``) is written to
    ``<outdir>/<record.id>.fasta``.

    :param infile: path of the input sequence file.
    :param informat: Bio.SeqIO format name of the input.
    :param outdir: output directory; created (with parents) when missing.
    """
    # Create the directory once up front rather than re-checking per record.
    # NOTE: the directory is now created even when the input has no records.
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # Context managers close both handles (and flush the output) even when
    # parsing or writing fails part-way through.
    with open(infile) as in_handle:
        for record in SeqIO.parse(in_handle, informat):
            f_out = os.path.join(outdir, record.id + '.fasta')
            with open(f_out, 'w') as out_handle:
                SeqIO.write([record], out_handle, "fasta")
def test_generated(self):
    """Write and read back odd SeqRecord objects"""
    record1 = SeqRecord(Seq("ACGT"*500, generic_dna), id="Test",
                        description="Long "*500,
                        letter_annotations={"phred_quality": [40, 30, 20, 10]*500})
    record2 = SeqRecord(MutableSeq("NGGC"*1000), id="Mut",
                        description="very "*1000+"long",
                        letter_annotations={"phred_quality": [0, 5, 5, 10]*1000})
    record3 = SeqRecord(UnknownSeq(2000, character="N"), id="Unk",
                        description="l"+("o"*1000)+"ng",
                        letter_annotations={"phred_quality": [0, 1]*1000})
    record4 = SeqRecord(Seq("ACGT"*500), id="no_descr",
                        description="", name="",
                        letter_annotations={"phred_quality": [40, 50, 60, 62]*500})
    record5 = SeqRecord(Seq("", generic_dna), id="empty_p",
                        description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality": []})
    record6 = SeqRecord(Seq(""), id="empty_s",
                        description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality": []})
    record7 = SeqRecord(Seq("ACNN"*500), id="Test_Sol",
                        description="Long "*500,
                        letter_annotations={"solexa_quality": [40, 30, 0, -5]*500})
    record8 = SeqRecord(Seq("ACGT"), id="HighQual",
                        description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
    # TODO - Record with no identifier?
    records = [record1, record2, record3, record4,
               record5, record6, record7, record8]
    # TODO - Have a Biopython defined "DataLossWarning?"
    # TODO - Include phd output?
    # catch_warnings() restores the filter list on exit, even on failure.
    # The previous simplefilter()/filters.pop() pair inserted the filter at
    # the front of the list but popped from the end, so it removed an
    # unrelated filter and left the 'ignore' rule installed.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        for fmt in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
            handle = StringIO()
            SeqIO.write(records, handle, fmt)
            handle.seek(0)
            compare_records(records,
                            list(SeqIO.parse(handle, fmt)),
                            truncation_expected(fmt))
def loop(self, filename, format):
    """Round-trip the records of ``filename`` through BioSQL and GenBank output.

    The records are loaded into a fresh BioSQL namespace, read back and
    compared, then written as GenBank to an in-memory handle, re-parsed and
    compared once more.

    :param filename: path of the sequence file to load.
    :param format: Bio.SeqIO format name of ``filename``.
    """
    # Close the input handle deterministically instead of leaking it.
    with open(filename, "rU") as in_handle:
        original_records = list(SeqIO.parse(in_handle, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    try:
        db_name = "test_loop_%s" % filename  # new namespace!
        db = server.new_database(db_name)
        count = db.load(original_records)
        self.assertEqual(count, len(original_records))
        server.commit()
        # Now read them back...
        biosql_records = [db.lookup(name=rec.name)
                          for rec in original_records]
        # And check they agree
        self.assertTrue(compare_records(original_records, biosql_records))
        # Now write to a handle...
        handle = StringIO()
        SeqIO.write(biosql_records, handle, "gb")
        # Now read them back...
        handle.seek(0)
        new_records = list(SeqIO.parse(handle, "gb"))
        # And check they still agree
        self.assertEqual(len(new_records), len(original_records))
        for old, new in zip(original_records, new_records):
            # TODO - remove this hack because we don't yet write these (yet):
            for key in ["comment", "references", "db_source"]:
                if key in old.annotations and key not in new.annotations:
                    del old.annotations[key]
            self.assertTrue(compare_record(old, new))
    finally:
        # Release the connection even when one of the assertions fails.
        server.close()
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """Generate a codon alignment via PAL2NAL.

    @param fname_aln: MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc: Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot: Protein sequences in FASTA format (.fasta)
    @return: Name of the file the PAL2NAL output (``-output paml``) is
             written to ("homologs.codon.aln" in the working directory).
    """
    import subprocess

    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln,
    # which was reordered due to CLUSTALW2.  Proteins and nucleotides are
    # paired by position, so both FASTA files must list them in the same order.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))

    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")

    fname_codon = "homologs.codon.aln"
    # Pass an argument list and redirect stdout ourselves rather than going
    # through a shell: file names with spaces or metacharacters no longer
    # break (or inject into) the command line.  As before, a non-zero exit
    # status from pal2nal.pl is not treated as an error.
    cmd = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2,
           "-output", "paml"]
    with open(fname_codon, "w") as out_handle:
        subprocess.call(cmd, stdout=out_handle)
    return fname_codon
def test_fastq_2000(self):
    """Read and write back simple example with upper case 2000bp read"""
    # The quality string must be exactly as long as the sequence
    # (4 characters x 500 == 2000).  The previous literal was an
    # e-mail-obfuscation artefact 18 characters long, which cannot pair with
    # a 2000 bp read.
    # NOTE(review): "!@a~" is the presumed original unit - confirm against upstream.
    data = "@%s\n%s\n+\n%s\n" \
           % ("id descr goes here", "ACGT"*500, "!@a~"*500)
    handle = StringIO()
    self.assertEqual(1, SeqIO.write(SeqIO.parse(StringIO(data), "fastq"), handle, "fastq"))
    self.assertEqual(data, handle.getvalue())
def setUp(self):
    """Build one codon alignment per fixture and store the list in ``self.alns``.

    Each fixture is read as ``entry[0]`` (a sequence of file paths) and
    ``entry[1]`` (the mode: 'parse', 'index' or 'id'), which selects how the
    nucleotide records are loaded and how ``codonalign.build`` is called.
    """
    self.aln_file = [TEST_ALIGN_FILE1,
                     TEST_ALIGN_FILE2,
                     TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4,
                     TEST_ALIGN_FILE5,
                     TEST_ALIGN_FILE6]
    built = []
    for entry in self.aln_file:
        if entry[1] == 'parse':
            # Nucleotides streamed from the FASTA file.
            nucl = SeqIO.parse(entry[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(entry[0][1], 'clustal',
                                alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet)
        elif entry[1] == 'index':
            # Nucleotides accessed through an on-disk index.
            nucl = SeqIO.index(entry[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(entry[0][1], 'clustal',
                                alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif entry[1] == 'id':
            # Pairing comes from a two-column file: first field -> second field.
            nucl = SeqIO.parse(entry[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(entry[0][1], 'clustal',
                                alphabet=IUPAC.protein)
            correspondence = {}
            with open(entry[0][2]) as handle:
                for line in handle:
                    fields = line.split()
                    correspondence[fields[0]] = fields[1]
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=correspondence,
                    alphabet=codonalign.default_codon_alphabet)
        built.append(caln)
        nucl.close()  # Close the indexed FASTA file
    self.alns = built
def needle_score(seq1, seq2, verbose=False, keep=False):
    """Return the Needleman-Wunsch score for aligning two sequences.

    Both records are written to temporary FASTA files, the external
    ``needle`` program is run on them with gap-open and gap-extend set to 0,
    and the value on the ``# Score:`` line of its output is returned.

    :param seq1: first record, written with ``SeqIO.write(..., 'fasta')``.
    :param seq2: second record, written the same way.
    :param verbose: when true, print the command line before running it.
    :param keep: when true, the two input temp files are not deleted.
    :return: the score as a float, or 0 when no score line is found.
    :raises subprocess.CalledProcessError: if ``needle`` exits non-zero.
    """
    ntf = tempfile.NamedTemporaryFile
    # Text mode ('w+') is required: NamedTemporaryFile defaults to binary,
    # which breaks both the text write and the str regex on Python 3.
    # os.devnull is opened for writing because it is used as a stderr sink.
    with ntf(mode='w+', prefix='seq1', delete=not keep) as fh1, \
            ntf(mode='w+', prefix='seq2', delete=not keep) as fh2, \
            ntf(mode='w+', prefix='align_out') as outfile, \
            open(os.devnull, 'w') as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()  # make the data visible to the child process
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()

        cmd = ['needle', '-gapopen', '0', '-gapextend', '0',
               '-outfile', outfile.name, fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        subprocess.check_call(cmd, stderr=dn)
        result = outfile.read()

    score = re.search(r'# Score: (.*)', result)
    if score is not None:
        return float(score.group(1))
    return 0
def frameshift_writer(contigs, file):
    """Write every contig flagged as a majority frameshift to ``file`` as FASTA.

    :param contigs: mapping whose values expose ``seq``, ``id``,
        ``description`` and an ``annotation`` dict with the key
        ``'majority_frameshift'``.
    :param file: writable handle; it is closed by this function.
    """
    sys.stderr.write("[predict] writing frameshifts...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description)
            for c in contigs.values()
            if c.annotation['majority_frameshift']]
    try:
        SeqIO.write(seqs, file, "fasta")
    finally:
        # The function owns the handle, so close it even if the write fails.
        file.close()
    sys.stderr.write("\tdone.\n")
def no_relatives_writer(contigs, file):
    """Write every contig that has no relatives to ``file`` as FASTA.

    :param contigs: mapping whose values expose ``seq``, ``id``,
        ``description`` and an ``annotation`` dict with the key
        ``'num_relatives'``.
    :param file: writable handle; it is closed by this function.
    """
    sys.stderr.write("[predict] writing contigs with no relatives...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description)
            for c in contigs.values()
            if c.annotation['num_relatives'] == 0]
    try:
        SeqIO.write(seqs, file, "fasta")
    finally:
        # The function owns the handle, so close it even if the write fails.
        file.close()
    sys.stderr.write("\tdone.\n")
def __format__(self, format_spec):
    """Returns the record as a string in the specified file format.

    This method supports the python format() function added in
    Python 2.6/3.0.  The format_spec should be a lower case string
    supported by Bio.SeqIO as an output file format.  An empty
    format_spec falls back to ``str(self)``.  See also the SeqRecord's
    format() method.
    """
    if not format_spec:
        # Follow python convention and default to using __str__
        return str(self)
    from Bio import SeqIO
    if format_spec in SeqIO._BinaryFormats:
        # Return bytes on Python 3
        try:
            # This is in Python 2.6+, but we need it on Python 3
            from io import BytesIO
            handle = BytesIO()
        except ImportError:
            # Must be on Python 2.5 or older
            from StringIO import StringIO
            handle = StringIO()
    else:
        # Prefer the Python 2 module (it accepts byte strings) and fall back
        # to io.StringIO on Python 3, where the StringIO module is gone.
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        handle = StringIO()
    SeqIO.write(self, handle, format_spec)
    return handle.getvalue()
def main():
    """Convert an alignment file between formats from the command line.

    Usage: ``script <file> <input format> <output format>``.  The result is
    written next to the input as ``<stem>.<output format>``.  'nexus' output
    is converted with an ambiguous-DNA alphabet, 'mega' output is written by
    hand, and any other format is delegated to ``AlignIO.convert``.
    """
    import os

    if len(sys.argv) != 4:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    # splitext keeps inner dots ("a.b.fasta" -> "a.b"); the previous
    # "".join(split('.')[:-1]) silently dropped them ("ab").
    fout = os.path.splitext(f)[0]
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    target = fout + '.' + formatout

    if formatout == 'nexus':
        AlignIO.convert(f, formatin, target, formatout,
                        alphabet=IUPAC.ambiguous_dna)
    # 'elif' (was 'if'): otherwise a nexus request fell through to the
    # final 'else' and was converted a second time without an alphabet.
    elif formatout == 'mega':
        # NOTE(review): the input is always parsed as "phylip-relaxed" here,
        # regardless of the format given on the command line - confirm intent.
        with open(f, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))
        with open(target, 'w') as outfile:
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")
            for n in record_dict:
                outfile.write('#' + n + "\n")
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s + "\n")
    else:
        AlignIO.convert(f, formatin, target, formatout)
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """Drop read pairs in which either mate is shorter than ``min_length``.

    Pairs where both mates are at least ``min_length`` long are written, in
    their original order, to the ``.fixed`` files.  When only one mate is
    long enough it is written to the matching ``.singles`` file; the short
    mate is discarded.

    :param fq1: path of the first-mate FASTQ file.
    :param fq2: path of the second-mate FASTQ file.
    :param quality_format: Bio.SeqIO FASTQ variant used for reading and writing.
    :param min_length: minimum number of bases for a read to be kept.
    :return: ``[fq1_out, fq2_out]``, the paths of the two ``.fixed`` files.
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    # Skip the work only when ALL four outputs exist (the check previously
    # listed fq2_single twice and never looked at fq1_single).
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, \
            open(fq2_out, 'w') as fq2_out_handle, \
            open(fq1_single, 'w') as fq1_single_handle, \
            open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            # Same '>=' threshold for pairs and singles: a read of exactly
            # min_length used to be dropped when its mate was too short.
            fq1_ok = len(fq1_record.seq) >= min_length
            fq2_ok = len(fq2_record.seq) >= min_length
            if fq1_ok and fq2_ok:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            elif fq1_ok:
                fq1_single_handle.write(fq1_record.format(quality_format))
            elif fq2_ok:
                fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
    """Write the est2genome-supported regions of one contig as FASTA records.

    The GFF path is built from ``refseq_obj.id`` and the second tab-separated
    field of ``log_line``.  For each GFF line that mentions "est2genome" and
    has the feature type ``expressed_sequence_match``, the matching region of
    the contig in ``velvet_file`` is extracted (reverse-complemented on the
    '-' strand) and written to ``est2genome_handle``.

    :param refseq_obj: object whose ``id`` prefixes the MAKER output directory.
    :param est2genome_handle: writable handle receiving the FASTA records.
    :param log_line: tab-separated log line; field 1 is the sequence directory.
    :param velvet_file: FASTA file holding the contigs.
    """
    seq_dir = log_line.split("\t")[1]
    contig_id = seq_dir.split("/")[3]  # hardcoded in this position
    tmp_refseq = contig_id.replace(".", "%2E")
    gff_file = (refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir +
                "/" + tmp_refseq + ".gff")
    velvet_records = None  # loaded lazily, once, on the first matching line
    with open(gff_file, 'r') as gff_handle:
        for gff_line in gff_handle:
            # "est2genome" was misspelled "est2gneome", so no line matched.
            if not (re.search("est2genome", gff_line) and
                    re.search("\texpressed_sequence_match\t", gff_line)):
                continue
            fields = gff_line.split("\t")
            curr_start = int(fields[3])
            curr_stop = int(fields[4])
            curr_strand = fields[6]
            if velvet_records is None:
                # Parse the contig file once instead of per matching line.
                with open(velvet_file, 'r') as tmp_handle:
                    velvet_records = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))
            if contig_id not in velvet_records:
                continue
            # GFF coordinates are used as 1-based inclusive.
            new_seq = velvet_records[contig_id].seq[curr_start - 1:curr_stop]
            if curr_strand == "-":
                new_seq = new_seq.reverse_complement()
            # NOTE(review): `seqname` is not defined in this function; it is
            # presumably a module-level name - confirm, or derive it here.
            new_record = SeqRecord(new_seq, id=seqname, name=seqname, description="")
            # SeqIO.write needs the record first; the call omitted it before.
            SeqIO.write(new_record, est2genome_handle, "fasta")
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that parse+write and convert fail with the same ValueError.

    First the file is parsed and written with SeqIO.parse/SeqIO.write, which
    must raise ValueError; then SeqIO.convert is run on the same input and
    must raise a ValueError with an identical message.
    """
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        # catch_warnings() restores the filters even when write() raises,
        # which is the expected path here.  The old filters.pop() was skipped
        # by the exception and popped from the wrong end of the list.
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
            "Different failures, parse/write:\n%s\nconvert:\n%s" \
            % (err1, err2)
def main_build_markov(promotor_filename="promotor.fa",
                      genome_filename="genom.fa",
                      symbol_length=2,
                      load_cached=False,
                      save_cache=True):
    """Build a Markov model from promoter sequences and a genome.

    :param promotor_filename: FASTA file with the promoter sequences.
    :param genome_filename: FASTA file with the genome.
    :param symbol_length: key used to select the frequency and count tables.
    :param load_cached: load symbol frequencies from the dump files instead
        of recomputing them.
    :param save_cache: when frequencies are recomputed, also dump them.
    :return: the ``(markov, states)`` pair produced by ``build_markov``.
    """
    # Use the filename arguments; the paths were hard-coded to the default
    # values before, so the parameters had no effect.
    promotor_sequences = list(SeqIO.parse(promotor_filename, "fasta"))
    genome = list(SeqIO.parse(genome_filename, "fasta"))

    if not load_cached:
        promotor_freqs = calc_symbol_freq(promotor_sequences)
        genome_freqs = calc_symbol_freq(genome)
        if save_cache:
            dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
            dump_obj(genome_freqs, Dumpfiles.genome_freq)
    else:
        promotor_freqs = load_obj(Dumpfiles.promotor_freq)
        genome_freqs = load_obj(Dumpfiles.genome_freq)

    promotor_counts = calc_counts(promotor_sequences)
    genome_counts = calc_counts(genome)
    print(promotor_counts)

    promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length],
                                        symbol_length,
                                        promotor_counts[symbol_length])
    genome_freqs = fold_and_normalize(genome_freqs[symbol_length],
                                      symbol_length,
                                      genome_counts[symbol_length])

    # Both tables must cover the same symbols.  The second loop used to
    # compare genome_freqs with itself, which could never fail.
    for k in promotor_freqs:
        assert(k in genome_freqs)
    for k in genome_freqs:
        assert(k in promotor_freqs)
    print(promotor_freqs)

    (markov, states) = build_markov(genome_freqs, promotor_freqs)
    return (markov, states)
def illumina2sangerFq(inputfile):
    """Convert an Illumina-encoded FASTQ file to Sanger-encoded FASTQ.

    The output path is the input path with its last three characters
    replaced by ``.fastq``.

    :param inputfile: path of the "fastq-illumina" input file.
    :return: path of the "fastq" file that was written.
    """
    # The leftover debugging call that printed help(SeqIO.convert) on every
    # invocation has been removed.
    # NOTE(review): [:-3] presumably strips a ".fq" suffix - other
    # extensions produce odd names; confirm what callers pass in.
    filename = inputfile[:-3] + '.fastq'
    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
    return filename
def cluster_pid(folder):
    """Summarise pairwise identity within each human gene cluster of a folder.

    Reads ``<folder>/report/<name>_genes.csv`` (``<name>`` being the last
    path component of ``folder``), keeps 'Homo sapiens' rows whose cluster
    is not 'na', '0' or 0, and aligns every ordered pair of accessions in
    each cluster (self-pairs included) with ``pairwise2.align.globalxx``.

    :param folder: directory containing the ``report`` sub-directory.
    :return: list of ``(cluster, n_pairs, mean_identity, std_identity)``
        tuples, an empty list when no row qualifies, or ``None`` when a
        required file cannot be read (OSError).
    """
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) &
                          (genes['species'] == 'Homo sapiens')]
        if genes.shape[0] > 0:
            # Parse the FASTA once and index it by accession (third
            # '|'-separated field of the record name).  It used to be
            # re-parsed twice for every pair of sequences.  setdefault
            # keeps the first record for a repeated accession.
            seq_by_acc = {}
            for rec in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta'):
                seq_by_acc.setdefault(rec.name.split("|")[2], rec.seq)
            for cluster in set(genes['cluster']):
                pids = []
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                for seq1 in accs:
                    for seq2 in accs:
                        aln = pairwise2.align.globalxx(seq_by_acc[seq1],
                                                       seq_by_acc[seq2])[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        pids.append(aln[2] / mean_len)
                # NOTE(review): this counts alignments (pairs), not genes.
                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                # list.append takes one argument; passing four raised
                # TypeError, so the row is stored as a tuple.
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
        return result
    except OSError:
        return None
def CutOutDomain(coords, filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut a coordinate-defined region out of each listed FASTA record.

    ``coords`` is a comma-separated file; for every line the value in
    ``column_id`` names a sequence and ``column_start``/``column_stop`` give
    1-based inclusive coordinates (only the first line per ID is used).
    Records of ``filename`` whose ID is listed are sliced, renamed to
    ``<id>_<start>_<stop>`` and written as FASTA to
    ``CutOutdomain_<filename>``.

    :param coords: path of the coordinate table.
    :param filename: path of the FASTA file to cut from.
    :param header: if ``True`` the first line of ``coords`` is skipped.
    :param column_id: column index of the sequence ID.
    :param column_start: column index of the start coordinate.
    :param column_stop: column index of the stop coordinate.
    """
    from Bio import SeqIO

    Towrite = []
    CoordIDDic = {}
    # 'with' closes all handles; they were never closed before, so the
    # output relied on garbage collection to be flushed.
    with open(coords) as fh, open(filename) as seqfile:
        if header == True:  # noqa: E712 - keep the original comparison
            print('header set to True, first line of %s will be ignored' % coords)
            fh.readline()
        else:
            print('header not set to True, first line of %s will be processed' % coords)
        for unformatedLine in fh:
            # Strip non-breaking spaces before splitting the CSV line.
            l = unformatedLine.replace('\xa0', '').strip().split(',')
            if l[column_id] not in CoordIDDic:
                CoordIDDic[l[column_id]] = l[column_start], l[column_stop]
        # The misleading for/else chain is now plain sequential code; the
        # loops had no 'break', so the 'else' branches always ran.
        for s in SeqIO.parse(seqfile, 'fasta'):
            if s.id in CoordIDDic:
                raw_start, raw_stop = CoordIDDic[s.id]
                start = int(raw_start) - 1  # to 0-based for slicing
                stop = int(raw_stop)
                s.id = s.id + '_%s_%s' % ((start + 1), stop)
                Towrite.append(s[start:stop])
    with open('CutOutdomain_%s' % filename, 'w') as Output:
        SeqIO.write(Towrite, Output, 'fasta')
def main(gbdir, outdir):
    """BLAST every organism's spacers against its own genome.

    For each ``Organism`` the GenBank file ``<gbdir>/<accession>.gb`` is
    fetched when missing and converted to FASTA, the organism's spacers are
    written to a query FASTA, and ``blastn`` output (``-outfmt 15``) is saved
    as ``<outdir>/<accession>.json``.

    :param gbdir: directory holding (or receiving) the GenBank files.
    :param outdir: directory receiving the BLAST output files.
    """
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    try:
        for org in tqdm(Organism.objects.all()):
            # get genbank and convert to fasta
            fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
            if not os.path.isfile(fpath):
                print('\nFetching {} with accession {}'.format(
                    org.name,
                    org.accession
                ))
                fetch(fpath)
            SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')
            # get spacers of organism and convert to fasta
            spacers = Spacer.objects.filter(loci__organism=org)
            fastatext = ''.join('>{}\n{}\n'.format(spacer.id, spacer.sequence)
                                for spacer in spacers)
            with open(tempq, 'w') as f:
                f.write(fastatext)
            # run blast and save output
            outpath = os.path.join(outdir, '{}.json'.format(org.accession))
            commandargs = ['blastn', '-query', tempq,
                           '-subject', tempdb, '-out', outpath, '-outfmt', '15']
            subprocess.run(commandargs, stdout=subprocess.DEVNULL)
    finally:
        # Clean up even on failure, and tolerate files that were never
        # created (no organisms): os.remove raised FileNotFoundError then.
        for temp_path in (tempq, tempdb):
            if os.path.exists(temp_path):
                os.remove(temp_path)
def _get_seq_dict(self):
    """Internal reusable function to get the sequence dictionary.

    Parses ``self._test_seq_file`` as FASTA and returns the result of
    ``SeqIO.to_dict`` for its records.
    """
    # 'with' closes the handle even if parsing raises.
    with open(self._test_seq_file) as seq_handle:
        return SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.

    Each line of ``infname`` lists the sequence IDs of one cluster; all IDs
    must be present in ``seqfname``.  Every cluster is written to
    ``<outdir>/blastclust_OTUs/blastclust_OTU_NNNNNN.fasta``, numbered from 1
    in file order.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    try:
        with open(infname, 'r') as fh:
            for otu_id, line in enumerate(fh, 1):
                outfname = os.path.join(outdirname,
                                        "blastclust_OTU_%06d.fasta" % otu_id)
                SeqIO.write((seqdict[key] for key in line.split()),
                            outfname, 'fasta')
                outfnames.append(outfname)
    finally:
        # Close the index; it was previously left open.
        seqdict.close()
    return (outdirname, outfnames)
def not_t_full_celegans(self):
    """Test the full C elegans chromosome and GFF files.

    This is used to test GFF on large files and is not run as a standard
    test. You will need to download the files and adjust the paths
    to run this.
    """
    # read the sequence information
    seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
    gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
    # 'with' closes the handle even if parsing raises.
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    # Only exhaust the parser; the test passes if nothing raises.
    for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
        pass
def test_fasta_out(self):
    """Check FASTQ to FASTA output"""
    # Convert the example FASTQ into FASTA text held in memory.
    buffer = StringIO()
    SeqIO.write(SeqIO.parse("Quality/example.fastq", "fastq"), buffer, "fasta")
    # Compare against the reference FASTA shipped with the tests.
    with open("Quality/example.fasta") as expected:
        expected_text = expected.read()
    self.assertEqual(buffer.getvalue(), expected_text)
def main(args):
    """Set (or clear) the taxon of a list of BioSQL entries.

    Entry names come from the first of ``args.fasta``, ``args.genbank`` or
    ``args.input`` (one name per line) that is not None.  With
    ``args.remove`` the taxon is set to NULL; otherwise it is the value
    returned by ``add_new_taxonomy``.  The sub-database
    ``args.database_name`` is created when absent.
    """
    server = BioSeqDatabase.open_database(
        driver=args.driver,
        db=args.database,
        user=args.user,
        host=args.host,
        passwd=args.password,
    )
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    # Collect the display names of the entries to update.
    if args.fasta is not None:
        names = [rec.name for rec in SeqIO.parse(args.fasta, 'fasta')]
    elif args.genbank is not None:
        names = [rec.name for rec in SeqIO.parse(args.genbank, 'genbank')]
    elif args.input is not None:
        with open(args.input) as fp:
            names = [line.rstrip() for line in fp]
    else:
        names = []

    if args.remove:
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

    update_sql = 'update bioentry set taxon_id = %s where bioentry_id = %s'
    for name in names:
        bioentry_id = db.adaptor.fetch_seqid_by_display_id(db.dbid, name)
        server.adaptor.execute(update_sql, (taxon_id, bioentry_id))
    server.commit()
def test_fastq_1000(self):
    """Read and write back simple example with mixed case 1000bp read"""
    title = "id descr goes here"
    bases = "ACGTNncgta"*100
    quals = "abcd!!efgh"*100
    data = "@%s\n%s\n+\n%s\n" % (title, bases, quals)
    out_handle = StringIO()
    written = SeqIO.write(SeqIO.parse(StringIO(data), "fastq"), out_handle, "fastq")
    # Exactly one record, and the output must equal the input text.
    self.assertEqual(1, written)
    self.assertEqual(data, out_handle.getvalue())
def count_overlap(filename):
    """Print every ordered pair of records joined by a 3-base overlap.

    For each pair of FASTA records with different sequences, prints
    ``"<id1> <id2>"`` when the last three bases of the first sequence equal
    the first three bases of the second.

    :param filename: path of the FASTA file.
    """
    # Parse the file once; it was previously re-parsed from disk for every
    # record in the outer loop.
    records = list(SeqIO.parse(filename, "fasta"))
    for seq_record in records:
        s1 = seq_record.seq
        for seq_record_1 in records:
            s2 = seq_record_1.seq
            if s1 != s2 and s1[-3:] == s2[0:3]:
                print(seq_record.id + " " + seq_record_1.id)
def load_examples_from_fasta(signal, org, data_path):
    """Load positive and negative example sequences for ``signal`` from FASTA.

    Reads ``<data_path>/<signal>_sig_pos_example.fa`` and
    ``<data_path>/<signal>_sig_neg_example.fa``, labels the sequences +1 and
    -1 respectively, and shuffles examples and labels together with
    ``helper.coshuffle``.

    :param signal: signal name used to build the two file names.
    :param org: organism name; only used in the progress message.
    :param data_path: directory containing the example files.
    :return: dict with numpy arrays under ``"examples"`` and ``"labels"``.
    """
    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    # Single-argument print() works on Python 2 and 3; the bare print
    # statement was a syntax error on Python 3.
    print("loading: \n %s \n %s" % (fn_pos, fn_neg))

    # parse file
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print("organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
          % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg)))

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled),
           "labels": numpy.array(labels_shuffled)}
    return ret
def _validate_fasta(self, text):
    """Return ``text`` if a FASTA record can be parsed from it.

    :param text: path (or handle) passed to ``SeqIO.parse``.
    :return: ``text`` unchanged when at least one record is found.
    :raises argparse.ArgumentTypeError: when no record is found.
    """
    try:
        # The next() builtin works on Python 2.6+ and 3; the .next() method
        # exists only on Python 2 iterators.
        next(SeqIO.parse(text, 'fasta'))
    except StopIteration:
        raise argparse.ArgumentTypeError(
            "{0} is not fasta file".format(text))
    return text
def standard_test_procedure(self, cline):
    """Standard testing procedure used by all tests."""
    # Overwrite existing files.
    cline.force = True

    # Mark output files for later cleanup.
    self.add_file_to_clean(cline.outfile)
    if cline.guidetree_out:
        self.add_file_to_clean(cline.guidetree_out)

    input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
    # The repr must evaluate back to an equivalent command line; eval() is
    # applied only to the repr of the command-line object built by the test.
    self.assertEqual(str(eval(repr(cline))), str(cline))
    stdout_text, stderr_text = cline()
    self.assertTrue(not stdout_text or stdout_text.strip().startswith("CLUSTAL"))

    # Test if ClustalOmega executed successfully.
    tolerated_warnings = ("WARNING: Sequence type is DNA.",
                          "WARNING: DNA alignment is still experimental.")
    self.assertTrue(stderr_text.strip() == "" or
                    stderr_text.startswith(tolerated_warnings))

    # Check the output...
    align = AlignIO.read(cline.outfile, "clustal")
    output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
    self.assertEqual(len(set(input_records)), len(set(output_records)))
    for record in align:
        self.assertEqual(str(record.seq), str(output_records[record.id].seq))

    # TODO - Try and parse this with Bio.Nexus?
    if cline.guidetree_out:
        self.assertTrue(os.path.isfile(cline.guidetree_out))
def test_acba_annot(self):
    """Run integron_finder with functional annotation and compare its outputs.

    Checks the GenBank file, the ``.integrons`` report and the Resfams HMM
    table against the expected files for replicon ACBA.007.P01_13.
    """
    replicon_filename = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    template = ("integron_finder --outdir {out_dir} --func-annot --path-func-annot {annot_bank} --promoter-attI "
                "--gbk --keep-tmp "
                "{replicon}")
    command = template.format(
        out_dir=self.out_dir,
        annot_bank=self.resfams_dir,
        replicon=self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename))),
    )
    with self.catch_io(out=True, err=False):
        # Drop the program name; main() receives only the arguments.
        main(command.split()[1:], loglevel='WARNING')

    result_dir = os.path.join(self.out_dir, 'Results_Integron_Finder_{}'.format(replicon_filename))
    expected_dir = 'Results_Integron_Finder_{}.annot'.format(replicon_filename)

    # GenBank output
    gbk = '{}.gbk'.format(replicon_id)
    expected_gbk_path = self.find_data(os.path.join(expected_dir, gbk))
    test_gbk_path = os.path.join(result_dir, gbk)
    expected_record = SeqIO.read(expected_gbk_path, 'gb')
    test_record = SeqIO.read(test_gbk_path, 'gb')
    self.assertSeqRecordEqual(expected_record, test_record)

    # Integron report
    integrons_name = '{}.integrons'.format(replicon_filename)
    expected_integrons = self.find_data(os.path.join(expected_dir, integrons_name))
    self.assertIntegronResultEqual(expected_integrons,
                                   os.path.join(result_dir, integrons_name))

    # Resfams HMM table kept in the temporary directory
    hmm_name = os.path.join('tmp_{}'.format(replicon_id), replicon_id + '_Resfams_fa_table.res')
    expected_hmm = self.find_data(os.path.join(expected_dir, hmm_name))
    self.assertHmmEqual(expected_hmm, os.path.join(result_dir, hmm_name))