def test_arguments(self):
    # getArgs(max=True, E='something') is expected to come back as the
    # argument list ['--max', '-E', 'something'].
    query = SeqIO.read(
        'tests/data/jackhmmer_seq.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    target = SeqIO.read(
        'tests/data/matchtarget.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    search = HMMER.jackhmmer(query, target)

    produced = search.getArgs(max=True, E='something')
    self.assertEqual(produced, ['--max', '-E', 'something'])
def update_models():
    """Rebuild the E, E+ and DYW HMM files.

    Extracts the C-termini, runs jackhmmer with each tail consensus against
    them, names the last entry of the resulting ``hmms`` list and writes it
    to ``<utils.HMMDir>/<name>.hmm``.
    """
    print("Extracting C-Termini...")
    tails = extract.get_c_terminus(extract.extract(localization=None))
    print("Done. Got {} tails".format(len(tails)))

    (E, Ep, DYW) = utils.get_tail_consensus()

    # (model name, consensus query, output file name)
    jobs = (
        ('E', E, 'E.hmm'),
        ('E+', Ep, 'E+.hmm'),
        ('DYW', DYW, 'DYW.hmm'),
    )
    for label, consensus, filename in jobs:
        print(label)
        search = HMMER.jackhmmer(consensus, tails)
        search.hmms[-1].name = label
        hmmfile.write(search.hmms[-1], os.path.join(utils.HMMDir, filename))
def test_jackhmmer(self):
    # Searching the query against the protein target must give the same
    # matches as the ones stored in tests/data/jack_out.
    query = SeqIO.read(
        'tests/data/jackhmmer_seq.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    target = SeqIO.read(
        'tests/data/matchtarget.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    search = HMMER.jackhmmer(query, target)

    # load the expected output
    expected = matchfile.load(
        'tests/data/jack_out',
        HMMER.wrap_seqrecords([query]),
        HMMER.wrap_seqrecords([target]),
    )

    self.assertEqual(expected, search.matches)
def test_jackhmmer_dna(self):
    # Search a protein query against a DNA target and compare the result
    # with the stored protein-vs-protein matches.
    query = SeqIO.read(
        'tests/data/jackhmmer_seq.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    dna_target = SeqIO.read(
        'tests/data/dna_target.fasta',
        'fasta',
        alphabet=Alphabet.generic_dna,
    )
    search = HMMER.jackhmmer(query, dna_target)

    # load the expected output (recorded against the protein target)
    protein_target = SeqIO.read(
        'tests/data/matchtarget.fasta',
        'fasta',
        alphabet=Alphabet.generic_protein,
    )
    expected = matchfile.load(
        'tests/data/jack_out',
        HMMER.wrap_seqrecords([query]),
        HMMER.wrap_seqrecords([protein_target]),
    )

    # The stored matches come from the protein search; scale them by 3 so
    # they can be compared with the DNA search (presumably three bases per
    # residue -- see the match class's scale()).
    for match in expected:
        match.scale(3)

    # Compare string forms rather than the match objects themselves.
    self.assertEqual(
        [str(m) for m in expected],
        [str(m) for m in search.matches],
    )
def find_homologs(only_plastid="Alsophila spinulosa", stats_path=None):
    """Predict homologs of PPRs in other genomes based on footprints.

    For every PPR, the genes closest to its ``<name>_exact`` features in
    ``output/ARA_annotated.gb`` are translated and searched against each
    selected plastid with jackhmmer.  The result for each plastid is stored
    in ``ppr.potentialHomologs[plastid.name]`` as a list of
    ``(similarity, sequence)`` tuples, highest similarity first, and a
    summary is printed at the end.

    Args:
        only_plastid: Name of the single plastid to search.  The default is
            the value that used to be hard-coded in the loop; pass ``None``
            to search every loaded plastid.
        stats_path: When given, per-plastid statistics for the searched
            plastids are written to this path.  With the default ``None``
            the function returns right after printing, exactly as the
            previous unconditional ``return`` did.

    Returns:
        None.
    """
    pprs = load_pprs()
    plastids = load_plastids(exclude=["Arabidopsis thaliana", ])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [
        f for f in known_binding.features if "exact" in f.type.lower()
    ]
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

    # Keep each plastid's original index so the progress message still
    # counts against the full list.
    searched = [
        (i, plastid) for i, plastid in enumerate(plastids)
        if only_plastid is None or plastid.name == only_plastid
    ]

    for k, ppr in enumerate(pprs):
        print("Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs)))

        wanted_type = "{}_exact".format(ppr.name.lower())
        footprints = [
            f for f in exact_features if f.type.lower() == wanted_type
        ]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]
        print("\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]))

        ppr.potentialHomologs = {}
        for i, plastid in searched:
            print("\t\tSearch {}/{}".format(i + 1, len(plastids)))
            ppr.potentialHomologs[plastid.name] = _homolog_footprints(
                ppr, plastid, known_binding)
            # try and avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print("\'{}\' footprints = {}".format(ppr.name, ppr.footprints))
        print("potential homologs")
        for key, value in ppr.potentialHomologs.items():
            print("{}: {}".format(key, value))

    if stats_path is not None:
        _write_homolog_stats(
            pprs, [plastid for _, plastid in searched], stats_path)


def _homolog_footprints(ppr, plastid, known_binding):
    """Return (similarity, sequence) tuples for one PPR in one plastid."""
    # search for homologs of each gene
    hits = []
    for gene in ppr.genes:
        protein = SeqRecord(gene.extract(known_binding.seq).translate())
        search = HMMER.jackhmmer(protein, plastid)
        print("{} -> {} homologs".format(gene.qualifiers['gene'],
                                         len(search.matches)))
        hits += search.getFeatures(
            type="{}_hl".format(gene.qualifiers['gene']))

    # extract the sequence surrounding each homolog (500 positions either
    # side, clamped to the plastid)
    for hit in hits:
        hit.location = FeatureLocation(
            max(0, hit.location.start - 500),
            min(len(plastid), hit.location.end + 500))
    # Keep the feature type next to the extracted record: the record itself
    # has no ``type`` attribute, so reading it later raised AttributeError.
    regions = [(hit.type, SeqRecord(hit.extract(plastid.seq))) for hit in hits]

    # find exact or close to exact binding domains for each region
    found = []
    for hit_type, region in regions:
        domains = []
        for exact in ppr.exact:
            try:
                domains += binding.get_domains(exact, region,
                                               percentile=100.0, gaps=0)
            except KeyError:
                continue
        if not domains:
            continue
        # First domain with the highest 'odds' qualifier.
        best = max(domains, key=lambda d: d.qualifiers['odds'])
        seq = str(best.extract(region).seq)
        similarity = max([
            sequence_similarity(original, seq) for original in ppr.footprints
        ])
        print(" {} -> \'{}\'".format(hit_type, seq))
        found.append((similarity, seq))

    found.sort(key=lambda p: -p[0])
    return found


def _write_homolog_stats(pprs, plastids, path):
    """Write name, average similarity and homolog count for the top 50."""
    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            found = ppr.potentialHomologs[plastid.name]
            length += len(found)
            similarity += sum(p[0] for p in found)
        if length:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                # floor division: what ``/`` on two ints did under Python 2
                'avg_homologs': length // len(pprs),
            })
        else:
            # nothing found (or no PPRs at all): avoid dividing by zero
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])
    with open(path, "w") as f:
        for s in stats[0:50]:
            f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
def find_homologs(only_plastid="Alsophila spinulosa", stats_path=None):
    """Predict homologs of PPRs in other genomes based on footprints.

    Each PPR's genes (those closest to its ``<name>_exact`` features in
    ``output/ARA_annotated.gb``) are translated and searched with jackhmmer
    against the selected plastids.  Hits are widened by 500 positions on
    each side and scanned for binding domains.  The sequence of the domain
    with the highest ``odds`` qualifier is recorded with its best similarity
    to ``ppr.footprints`` in ``ppr.potentialHomologs[plastid.name]``.

    Args:
        only_plastid: Search only the plastid with this name.  Defaults to
            the previously hard-coded name; ``None`` searches all plastids.
        stats_path: Optional output path for the per-plastid statistics.
            ``None`` (the default) skips them, matching the old behaviour
            where an unconditional ``return`` made that code unreachable.

    Returns:
        None.
    """
    pprs = load_pprs()
    plastids = load_plastids(exclude=["Arabidopsis thaliana",])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [f for f in known_binding.features
                      if "exact" in f.type.lower()]
    ara_genes = [f for f in known_binding.features
                 if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

    def selected(plastid):
        # True when this plastid takes part in the search.
        return only_plastid is None or plastid.name == only_plastid

    for k, ppr in enumerate(pprs):
        print("Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs)))
        footprints = [f for f in exact_features
                      if f.type.lower() == "{}_exact".format(ppr.name.lower())]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]
        print("\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]))

        ppr.potentialHomologs = {}
        for i, plastid in enumerate(plastids):
            if not selected(plastid):
                continue
            print("\t\tSearch {}/{}".format(i + 1, len(plastids)))

            # search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                print("{} -> {} homologs".format(gene.qualifiers['gene'],
                                                 len(search.matches)))
                homologs += search.getFeatures(
                    type="{}_hl".format(gene.qualifiers['gene']))

            # extract the sequence surrounding each homolog
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            # Keep the features: their ``type`` is printed below, and the
            # extracted records do not carry it (reading ``type`` from the
            # record raised AttributeError).
            regions = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

            # find exact or close to exact binding domains for each and add
            # to the list of potential homologs for the PPR
            ph = []
            for feature, region in zip(homologs, regions):
                domains = []
                for exact in ppr.exact:
                    try:
                        domains += binding.get_domains(
                            exact, region, percentile=100.0, gaps=0)
                    except KeyError:
                        continue
                if domains:
                    domains.sort(key=lambda d: -d.qualifiers['odds'])
                    seq = str(domains[0].extract(region).seq)
                    similarity = max([sequence_similarity(original, seq)
                                      for original in ppr.footprints])
                    print(" {} -> \'{}\'".format(feature.type, seq))
                    ph.append((similarity, seq))

            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            # try and avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print("\'{}\' footprints = {}".format(ppr.name, ppr.footprints))
        print("potential homologs")
        for key, value in ppr.potentialHomologs.items():
            print("{}: {}".format(key, value))

    if stats_path is None:
        return

    # Only the searched plastids have an entry in potentialHomologs.
    stats = []
    for plastid in plastids:
        if not selected(plastid):
            continue
        length = 0
        similarity = 0.0
        for ppr in pprs:
            length += len(ppr.potentialHomologs[plastid.name])
            similarity += sum(
                [p[0] for p in ppr.potentialHomologs[plastid.name]])
        try:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                # floor division: what ``/`` on two ints did under Python 2
                'avg_homologs': length // len(pprs),
            })
        except ZeroDivisionError:
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])
    with open(stats_path, "w") as f:
        for s in stats[0:50]:
            f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))