def test_incompletegenenameend(self):
    # Look up the gene ID 'NP_0010352' against the fixture databases and
    # require every yielded record to be the all-None triple.
    records = internal.get_gene_fastas(
        genes=['NP_0010352'],
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_nogenelist(self):
    # The gene is passed as a bare string rather than a list; each yielded
    # record must still carry the expected defline and peptide sequence.
    expected_defline = '>Amel|NP_001035293.1'
    expected_seq = 'MPILIPHRNPASANYYENKDGARIVKASHFELDYMLGRKITFFCMATGFPRPEITWLKDGIELYHHKFFQVHEWPVGNDTLKSKMEIDPATQKDAGYYECQADNQYAVDRRGFRTDYVMISY'
    records = internal.get_gene_fastas(
        genes='NP_001035293.1',
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_incompletegenenameend(self):
    # Look up the gene ID 'NP_0010352' against the fixture databases and
    # require every yielded record to be the all-None triple.
    records = internal.get_gene_fastas(
        genes=['NP_0010352'],
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_duplicate_matches(self):
    # Two queries are supplied ('XP' and 'XP_006570708.1'); every record
    # yielded must be the single XP_006570708.1 entry checked below.
    expected_defline = '>Amel|XP_006570708.1'
    expected_seq = 'MEIAASAMLDGLKNNRISKLALSRFLSQSLVSCILLGLLLEFRAQLETTGSPANKPASSASGSGTGGTSGTSVGANNLTTSGIATGSSGSGSGATVSGIGTVNAGSSGINIGANVGTGNVASGTVESRTSTIGVQNKQLQNVKGEHPSKAFLQNRSMSLVDMYIDNSEPSENVGQIHFSLEYDFQNTTLILRIIQGKDLPAKDLSGTSDPYVRVTLLPDKKHRLETKIKRRTLNPRWNETFYFEGFPIQKLQSRVLHLHVFDYDRFSRDDSIGEMFLPLCQVDFSDKPSFWKALKPPAKDKCGELLCSLCYHPSNSVLTLTLLKARNLKAKDINGKSDPYVKVWLQFGDKRIEKRKTPIFKCTLNPVFNEAFSFNVPWEKIRECSLDVMVMDFDNIGRNELIGRIQLAGKNGSGASETKHWQDMITKPRQTIVQWHRLKPE'
    records = internal.get_gene_fastas(
        genes=['XP', 'XP_006570708.1'],
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_onlyfastafilegiven(self):
    # Only a fasta file is supplied (no genes, dbpaths or specieslist);
    # each yielded record must match the fixture's defline and sequence.
    expected_defline = '>Cbir|LOC12345 testgene'
    expected_seq = 'ABCDEFGABCDEFGABCDEFGABCDEFGABCDEFGhijklmnopHIJKLMhijklmnopHIJKLMNOPQRSTUV'
    records = internal.get_gene_fastas(fastafile=self.fastafile)
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_duplicate_matches(self):
    # Two queries are supplied ('XP' and 'XP_006570708.1'); every record
    # yielded must be the single XP_006570708.1 entry checked below.
    expected_defline = '>Amel|XP_006570708.1'
    expected_seq = 'MEIAASAMLDGLKNNRISKLALSRFLSQSLVSCILLGLLLEFRAQLETTGSPANKPASSASGSGTGGTSGTSVGANNLTTSGIATGSSGSGSGATVSGIGTVNAGSSGINIGANVGTGNVASGTVESRTSTIGVQNKQLQNVKGEHPSKAFLQNRSMSLVDMYIDNSEPSENVGQIHFSLEYDFQNTTLILRIIQGKDLPAKDLSGTSDPYVRVTLLPDKKHRLETKIKRRTLNPRWNETFYFEGFPIQKLQSRVLHLHVFDYDRFSRDDSIGEMFLPLCQVDFSDKPSFWKALKPPAKDKCGELLCSLCYHPSNSVLTLTLLKARNLKAKDINGKSDPYVKVWLQFGDKRIEKRKTPIFKCTLNPVFNEAFSFNVPWEKIRECSLDVMVMDFDNIGRNELIGRIQLAGKNGSGASETKHWQDMITKPRQTIVQWHRLKPE'
    records = internal.get_gene_fastas(
        genes=['XP', 'XP_006570708.1'],
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_nogenelist(self):
    # The gene is passed as a bare string rather than a list; each yielded
    # record must still carry the expected defline and peptide sequence.
    expected_defline = '>Amel|NP_001035293.1'
    expected_seq = 'MPILIPHRNPASANYYENKDGARIVKASHFELDYMLGRKITFFCMATGFPRPEITWLKDGIELYHHKFFQVHEWPVGNDTLKSKMEIDPATQKDAGYYECQADNQYAVDRRGFRTDYVMISY'
    records = internal.get_gene_fastas(
        genes='NP_001035293.1',
        dbpaths=self.dbpaths,
        specieslist=self.specieslist,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_dbpath_no_specieslist(self):
    # Database paths are supplied without a species list; every yielded
    # record must be the all-None triple.
    records = internal.get_gene_fastas(
        genes=['NP_001035293.1'],
        dbpaths=self.dbpaths,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_onlygenegiven(self):
    # Only a gene list is supplied (no dbpaths, specieslist or fasta file);
    # every yielded record must be the all-None triple.
    records = internal.get_gene_fastas(genes=['NP_001035293.1'])
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_onlyfastafilegiven(self):
    # Only a fasta file is supplied (no genes, dbpaths or specieslist);
    # each yielded record must match the fixture's defline and sequence.
    expected_defline = '>Cbir|LOC12345 testgene'
    expected_seq = 'ABCDEFGABCDEFGABCDEFGABCDEFGABCDEFGhijklmnopHIJKLMhijklmnopHIJKLMNOPQRSTUV'
    records = internal.get_gene_fastas(fastafile=self.fastafile)
    for header, peptide, organism in records:
        self.assertEqual(header, expected_defline)
        self.assertEqual(peptide, expected_seq)
        self.assertEqual(organism, None)
def test_no_argsgiven(self):
    # Called with no arguments at all; every yielded record must be the
    # all-None triple.
    records = internal.get_gene_fastas()
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_badfastaseq(self):
    # Reads the fixture referenced by self.fastafileillegal; every yielded
    # record must be the all-None triple.
    records = internal.get_gene_fastas(fastafile=self.fastafileillegal)
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_dbpath_no_specieslist(self):
    # Database paths are supplied without a species list; every yielded
    # record must be the all-None triple.
    records = internal.get_gene_fastas(
        genes=['NP_001035293.1'],
        dbpaths=self.dbpaths,
    )
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_onlygenegiven(self):
    # Only a gene list is supplied (no dbpaths, specieslist or fasta file);
    # every yielded record must be the all-None triple.
    records = internal.get_gene_fastas(genes=['NP_001035293.1'])
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
def test_badfastaseq(self):
    # Reads the fixture referenced by self.fastafileillegal; every yielded
    # record must be the all-None triple.
    records = internal.get_gene_fastas(fastafile=self.fastafileillegal)
    for header, peptide, organism in records:
        self.assertEqual(header, None)
        self.assertEqual(peptide, None)
        self.assertEqual(organism, None)
# Walk the homologs in sorted order and fill the sequence and name-conversion
# tables. A homolog is skipped before any sequence lookup when its cleaned name
# (internal.fix_leaky_pipes) is in excluded_genes, or when its first
# homologlist entry is in excluded_species. For the remaining homologs
# itercount is incremented and sequences are fetched with
# internal.get_gene_fastas; sequences for which sequence_filter(seq,
# args.maxlength, args.minlength) is true are skipped, the others are stored in
# seqdic keyed by sequence. A "<shortname> <homologlist[homolog][1]> <homolog>"
# line is written to conv_handle, the same pair is recorded in conv_dic, and
# conv_handle is closed at the end.
# NOTE(review): this fragment's line breaks and indentation have been lost, so
# the exact nesting (in particular whether the shortname/conv_handle/conv_dic
# statements sit inside the `else` branch, and that conv_handle.close() runs
# after the outer loop) cannot be confirmed from here -- TODO verify against
# the original formatting before editing.
for homolog in sorted(homologlist): # remove excluded genes before bothering to look up their sequence: searchname = internal.fix_leaky_pipes(homolog) if searchname in excluded_genes: continue if homologlist[homolog][0] in excluded_species: continue # extract sequences of remaining genes and add to conversion dictionary itercount += 1 for defline, seq, spec in internal.get_gene_fastas( genes=[searchname], species=homologlist[homolog][0], fastafile=None, dbpaths=dbpaths, specieslist=specieslist, comment=str(homologlist[homolog][1]) + str(itercount), short=False): if sequence_filter(seq, args.maxlength, args.minlength): continue else: seqdic[seq] = internal.remove_illegal_characters(defline) shortname = internal.phylipise(homologlist[homolog][0], itercount) conv_handle.write("%s %-5d %s\n" % (shortname, homologlist[homolog][1], homolog)) conv_dic[shortname] = (homolog, homologlist[homolog][1]) conv_handle.close()
def get_similar_sequences(temp_dir, buildhmmer=False, fastafile=None,
                          specieslist={}, species=None, genes=[], dbpaths={},
                          mincollect=2, globalthresh=0.2, localthresh=0.8,
                          verbalise=lambda *a: None):
    """Search for homologs of the supplied genes and/or fasta sequences.

    With more than one query sequence (or when ``buildhmmer`` is true) the
    queries are written to a temporary fasta file, aligned with
    ``mafft_align``, turned into a profile with the external ``hmmbuild``
    program, and that profile is handed to ``hmmer_search``. Otherwise the
    single query sequence is passed to ``hmmer_search`` directly.

    Parameters:
        temp_dir: directory in which all intermediate files are created.
        buildhmmer: force the alignment + ``hmmbuild`` route; forced on
            automatically when more than one query is counted.
        fastafile: optional fasta file of query sequences. When given, its
            records are also copied to ``<temp_dir>/query_fasta`` and passed
            to ``hmmer_search`` as ``extra_file_search``.
        specieslist, dbpaths: forwarded unchanged to
            ``internal.get_gene_fastas`` and ``hmmer_search``.
        species: query species. NOTE: it is rebound by the sequence-lookup
            loops below, so the value given to ``hmmer_search`` is the species
            of the last record yielded whenever the lookup yields anything.
        genes: a gene name or list of gene names; empty strings are dropped.
        mincollect, globalthresh, localthresh: forwarded to ``hmmer_search``
            (``localthresh`` as ``minthresh``).
        verbalise: callable ``(colour_code, message)`` used for progress
            output; the default discards everything.

    Returns:
        Whatever ``hmmer_search`` returns, or an empty dict when no query
        sequence could be retrieved.

    Raises:
        OSError: if the ``hmmbuild`` executable cannot be started.
    """
    # Imported locally so the block does not depend on the file's import header.
    import subprocess

    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y", "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against
    # with hmmer:
    if fastafile:
        extra_file_search = os.path.join(temp_dir, "query_fasta")
        with open(extra_file_search, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))
    else:
        extra_file_search = None

    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        with open(hmminput, 'w') as handle:
            verbalise("B", "Extracting sequence data from %d peptides" % len(genes))
            # NOTE: this loop deliberately keeps rebinding `species`, exactly
            # as the original code did (see docstring).
            for defline, seq, species in internal.get_gene_fastas(
                    genes=genes, species=None, fastafile=fastafile,
                    specieslist=specieslist, dbpaths=dbpaths):
                if seq:
                    seqcount += 1
                    fasta_seq = "%s\n%s\n" % (defline, seq)
                    handle.write(fasta_seq)

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}

        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        verbalise("B", "Creating hidden markov model from %d sequences" % seqcount)

        # create hmmbuild model of alignment:
        hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
        with open(hmmmodel, 'a'):
            pass  # make sure the model file exists before hmmbuild runs

        # Run hmmbuild with an argument list (no shell), so paths containing
        # spaces or shell metacharacters are passed through intact. Its report
        # on stdout is discarded and its exit status ignored, as before.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(
                ['hmmbuild', '--informat', 'afa', hmmmodel, mafft_align1],
                stdout=devnull)

        homologlist = hmmer_search(None,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   temp_dir=temp_dir,
                                   dbpaths=dbpaths,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=hmmmodel,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

        os.remove(mafft_align1)
        os.remove(hmminput)

    else:
        # `genes` is already guaranteed to be a list at this point.
        verbalise("B", "Extracting sequence from %s" % genes)

        # run phmmer on a single input gene/sequence:
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(
                genes=genes, species=species, fastafile=fastafile,
                specieslist=specieslist, dbpaths=dbpaths):
            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        # `seq` stays empty when nothing was yielded, and is falsy when the
        # lookup yielded an empty record.
        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}

        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
def get_similar_sequences(temp_dir, buildhmmer=False, fastafile=None,
                          specieslist={}, species=None, genes=[], dbpaths={},
                          mincollect=2, globalthresh=0.2, localthresh=0.8,
                          verbalise=lambda *a: None):
    """Search for homologs of the supplied genes and/or fasta sequences.

    With more than one query sequence (or when ``buildhmmer`` is true) the
    queries are written to a temporary fasta file, aligned with
    ``mafft_align``, turned into a profile with the external ``hmmbuild``
    program, and that profile is handed to ``hmmer_search``. Otherwise the
    single query sequence is passed to ``hmmer_search`` directly.

    Parameters:
        temp_dir: directory in which all intermediate files are created.
        buildhmmer: force the alignment + ``hmmbuild`` route; forced on
            automatically when more than one query is counted.
        fastafile: optional fasta file of query sequences. When given, its
            records are also copied to ``<temp_dir>/query_fasta`` and passed
            to ``hmmer_search`` as ``extra_file_search``.
        specieslist, dbpaths: forwarded unchanged to
            ``internal.get_gene_fastas`` and ``hmmer_search``.
        species: query species. NOTE: it is rebound by the sequence-lookup
            loops below, so the value given to ``hmmer_search`` is the species
            of the last record yielded whenever the lookup yields anything.
        genes: a gene name or list of gene names; empty strings are dropped.
        mincollect, globalthresh, localthresh: forwarded to ``hmmer_search``
            (``localthresh`` as ``minthresh``).
        verbalise: callable ``(colour_code, message)`` used for progress
            output; the default discards everything.

    Returns:
        Whatever ``hmmer_search`` returns, or an empty dict when no query
        sequence could be retrieved.

    Raises:
        OSError: if the ``hmmbuild`` executable cannot be started.
    """
    # Imported locally so the block does not depend on the file's import header.
    import subprocess

    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y", "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against
    # with hmmer:
    if fastafile:
        extra_file_search = os.path.join(temp_dir, "query_fasta")
        with open(extra_file_search, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))
    else:
        extra_file_search = None

    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        with open(hmminput, 'w') as handle:
            verbalise("B", "Extracting sequence data from %d peptides" % len(genes))
            # NOTE: this loop deliberately keeps rebinding `species`, exactly
            # as the original code did (see docstring).
            for defline, seq, species in internal.get_gene_fastas(
                    genes=genes, species=None, fastafile=fastafile,
                    specieslist=specieslist, dbpaths=dbpaths):
                if seq:
                    seqcount += 1
                    fasta_seq = "%s\n%s\n" % (defline, seq)
                    handle.write(fasta_seq)

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}

        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        verbalise("B", "Creating hidden markov model from %d sequences" % seqcount)

        # create hmmbuild model of alignment:
        hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
        with open(hmmmodel, 'a'):
            pass  # make sure the model file exists before hmmbuild runs

        # Run hmmbuild with an argument list (no shell), so paths containing
        # spaces or shell metacharacters are passed through intact. Its report
        # on stdout is discarded and its exit status ignored, as before.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(
                ['hmmbuild', '--informat', 'afa', hmmmodel, mafft_align1],
                stdout=devnull)

        homologlist = hmmer_search(None,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   temp_dir=temp_dir,
                                   dbpaths=dbpaths,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=hmmmodel,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

        os.remove(mafft_align1)
        os.remove(hmminput)

    else:
        # `genes` is already guaranteed to be a list at this point.
        verbalise("B", "Extracting sequence from %s" % genes)

        # run phmmer on a single input gene/sequence:
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(
                genes=genes, species=species, fastafile=fastafile,
                specieslist=specieslist, dbpaths=dbpaths):
            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        # `seq` stays empty when nothing was yielded, and is falsy when the
        # lookup yielded an empty record.
        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}

        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
# Walk the homologs in sorted order and fill the sequence and name-conversion
# tables. A homolog is skipped before any sequence lookup when its cleaned name
# (internal.fix_leaky_pipes) is in excluded_genes, or when its first
# homologlist entry is in excluded_species. For the remaining homologs
# itercount is incremented and sequences are fetched with
# internal.get_gene_fastas; sequences for which sequence_filter(seq,
# args.maxlength, args.minlength) is true are skipped, the others are stored in
# seqdic keyed by sequence. A "<shortname> <homologlist[homolog][1]> <homolog>"
# line is written to conv_handle, the same pair is recorded in conv_dic, and
# conv_handle is closed at the end.
# NOTE(review): this fragment's line breaks and indentation have been lost, so
# the exact nesting (in particular whether the shortname/conv_handle/conv_dic
# statements sit inside the `else` branch, and that conv_handle.close() runs
# after the outer loop) cannot be confirmed from here -- TODO verify against
# the original formatting before editing.
for homolog in sorted(homologlist): # remove excluded genes before bothering to look up their sequence: searchname = internal.fix_leaky_pipes(homolog) if searchname in excluded_genes: continue if homologlist[homolog][0] in excluded_species: continue # extract sequences of remaining genes and add to conversion dictionary itercount += 1 for defline, seq, spec in internal.get_gene_fastas( genes=[searchname], species=homologlist[homolog][0], fastafile=None, dbpaths=dbpaths, specieslist=specieslist, comment=str(homologlist[homolog][1]) + str(itercount), short=False, ): if sequence_filter(seq, args.maxlength, args.minlength): continue else: seqdic[seq] = internal.remove_illegal_characters(defline) shortname = internal.phylipise(homologlist[homolog][0], itercount) conv_handle.write("%s %-5d %s\n" % (shortname, homologlist[homolog][1], homolog)) conv_dic[shortname] = (homolog, homologlist[homolog][1]) conv_handle.close()