def test_jackhmmer(self):
    seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
                     alphabet=Alphabet.generic_protein)
    seqdb = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
                       alphabet=Alphabet.generic_protein)
    j = HMMER.jackhmmer(seq, seqdb)

    #load the expected output
    seq_ = HMMER.wrap_seqrecords([seq,])
    seqdb_ = HMMER.wrap_seqrecords([seqdb,])
    m = matchfile.load('tests/data/jack_out', seq_, seqdb_)

    self.assertEqual(m, j.matches)
def fill_gaps(ppr):
    """Look for reluctant motifs in suspiciously sized gaps"""
    gaps = find_gaps(ppr, mingap=3 * 30, maxgap=3 * 40)

    #check if there could be a motif at the start
    if ppr.features[0].location.start > 3 * 30:
        gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

    for g in gaps:
        record = ppr[g.start - 15:g.end + 15]
        record.seq = record.seq.translate()
        #relax the filter thresholds so that weak motifs are still reported
        search = HMMER.hmmsearch(hmm=models[3], targets=record,
                                 F1=0.5, F2=0.5, F3=0.5)
        if search.matches:
            motif = search.getFeatures(record)[0]
            offset = g.start - 15
            motif.location = FeatureLocation(offset + 3 * motif.location.start,
                                             offset + 3 * motif.location.end,
                                             strand=1)
            ppr.features.append(motif)

    ppr.features.sort(key=lambda p: p.location.start)
    return ppr
def test_search_results(domE=10.0, verbose=False):
    known = get_known()[0]
    for k in known:
        k.hit = 0

    ara = SeqIO.parse(ARA_FILE, 'fasta').next()
    search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
    found = search.getFeatures(ara)

    for f in found:
        p = int((f.location.start + f.location.end) / 2)
        for k in known:
            if p in k:
                k.hit += 1
                break

    hits = [j.hit for j in known]
    ret = {
        'matches': len(found),
        'correct_matches': sum(hits),
        'hit_pprs': sum([1 for i in hits if i > 0]),
        'total_pprs': len(known),
        'hits': hits,
    }

    if verbose:
        print "Found {} HMM domains, {} within known PPRs".format(
            ret['matches'], ret['correct_matches'])
        print "Hits in {} of {} PPR proteins".format(
            ret['hit_pprs'], ret['total_pprs'])
        print "Hits per PPR: min: {} / max: {} / avg: {}".format(
            min(hits), max(hits), float(sum(hits)) / float(len(hits)))

    return ret
def classify(pprs, family_annot="ppr_family", tail_annot='ppr_tail'):
    """Annotate each ppr with its family type (P, PLS, E, E+ or DYW)"""
    ct = get_c_terminus(pprs)
    (E, Ep, DYW) = utils.get_tail_models()
    h = HMMER.hmmsearch([E, Ep, DYW], ct)
    #annotate each tail
    h.annotate(ct)

    for ppr, tail in zip(pprs, ct):
        fmt = ''
        if tail.features:
            f = sorted(tail.features, key=lambda ft: int(ft.location.start))
            fmt = ("-{.type}" * len(f)).format(*f)
            if fmt[-3:] == 'DYW':
                ppr.annotations[family_annot] = 'DYW'
            elif fmt.find('E+') >= 0:
                ppr.annotations[family_annot] = 'E+'
            elif fmt.find('E') >= 0:
                ppr.annotations[family_annot] = 'E'
            else:
                print "Unknown tail format \'{}\'".format(fmt)
                ppr.annotations[family_annot] = '??'
        else:
            #no tail motifs: P family if all motifs share the same length,
            #otherwise PLS
            l = len(ppr.features[0])
            ppr.annotations[family_annot] = 'P'
            for f in ppr.features:
                if len(f) != l:
                    ppr.annotations[family_annot] = 'PLS'
                    break
        ppr.annotations[tail_annot] = fmt
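#Example usage (a sketch): `pprs` is assumed to be a list of PPR SeqRecords
#whose motif features have already been extracted, e.g. by simple_extract.
#classify() then fills in the 'ppr_family' and 'ppr_tail' annotations:
#
#   classify(pprs)
#   for ppr in pprs:
#       print ppr.annotations['ppr_family'], ppr.annotations['ppr_tail']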
def locate_ppr(envelope):
    """Find and annotate the protein within"""
    #find all the PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
    motifs = search.getFeatures(envelope)

    #A ppr must contain 2 or more PPR motifs
    if len(motifs) < 2:
        return None

    #order the motifs
    motifs.sort(key=lambda m: m.location.start)

    known_start = True
    known_stop = True

    #find start codon
    start = motifs[0].location.start
    while start > 0 and str(envelope.seq[start:start + 3]).lower() != "atg":
        start -= 3
    if start < 0:
        known_start = False
        start = 0

    #find stop codon
    stop = motifs[-1].location.end
    while stop < len(envelope) and (
            str(envelope.seq[stop:stop + 3]).lower() not in ["tag", "tga", "taa"]):
        stop += 3
    if stop > len(envelope):
        known_stop = False
        stop = len(envelope)

    #move the motifs
    for m in motifs:
        m.location = FeatureLocation(m.location.start - start,
                                     m.location.end - start)

    #get absolute start and end
    if envelope.annotations['src_strand'] > 0:
        src_from = envelope.annotations['src_from'] + start
        src_to = envelope.annotations['src_from'] + stop
    else:
        src_from = envelope.annotations['src_to'] - stop
        src_to = envelope.annotations['src_to'] - start

    annotations = {
        "src_from": src_from,
        "src_to": src_to,
        "src_strand": envelope.annotations['src_strand'],
    }
    if not known_stop:
        annotations['no_stop'] = True
    if not known_start:
        annotations['no_start'] = True

    #return a record
    return SeqRecord(envelope.seq[start:stop],
                     features=motifs,
                     annotations=annotations)
def test_annotation(self):
    #load the hmm
    hmm = hmmfile.read('tests/data/valid.hmm')

    s1 = SeqIO.read('tests/data/PPR10.gb', 'genbank')
    s1.features = []
    s2 = SeqIO.read('tests/data/PPR10.gb', 'genbank')
    s2.features = []
    s2.seq = s2.seq.reverse_complement()

    h1 = HMMER.hmmsearch(hmm, s1)
    h2 = HMMER.hmmsearch(hmm, s2)

    h1.annotate(s1)
    h2.annotate(s2)

    for (f1, f2) in zip(s1.features, s2.features):
        self.assertEqual(str(f1.extract(s1.seq)), str(f2.extract(s2.seq)))
def test_arguments(self):
    seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
                     alphabet=Alphabet.generic_protein)
    seqdb = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
                       alphabet=Alphabet.generic_protein)
    j = HMMER.jackhmmer(seq, seqdb)
    args = j.getArgs(max=True, E='something')
    self.assertEqual(args, ['--max', '-E', 'something'])
def test_jackhmmer_dna(self):
    seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
                     alphabet=Alphabet.generic_protein)
    seqdb = SeqIO.read('tests/data/dna_target.fasta', 'fasta',
                       alphabet=Alphabet.generic_dna)
    j = HMMER.jackhmmer(seq, seqdb)

    #load the expected output
    seqdb_prot = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
                            alphabet=Alphabet.generic_protein)
    seq_ = HMMER.wrap_seqrecords([seq,])
    seqdb_ = HMMER.wrap_seqrecords([seqdb_prot,])
    matches = matchfile.load('tests/data/jack_out', seq_, seqdb_)

    #scale the expected matches' locations from protein to DNA coordinates
    for m in matches:
        m.scale(3)

    self.assertEqual([str(m) for m in matches], [str(m) for m in j.matches])
def update_models():
    """Recalculate the HMM models"""
    print "Extracting C-Termini..."
    ct = extract.get_c_terminus(extract.extract(localization=None))
    print "Done. Got {} tails".format(len(ct))

    (E, Ep, DYW) = utils.get_tail_consensus()

    print "E"
    j = HMMER.jackhmmer(E, ct)
    j.hmms[-1].name = 'E'
    hmmfile.write(j.hmms[-1], os.path.join(utils.HMMDir, 'E.hmm'))

    print "E+"
    j = HMMER.jackhmmer(Ep, ct)
    j.hmms[-1].name = 'E+'
    hmmfile.write(j.hmms[-1], os.path.join(utils.HMMDir, 'E+.hmm'))

    print "DYW"
    j = HMMER.jackhmmer(DYW, ct)
    j.hmms[-1].name = 'DYW'
    hmmfile.write(j.hmms[-1], os.path.join(utils.HMMDir, 'DYW.hmm'))
def test_translation_search(self):
    t = SeqIO.parse('tests/data/dna_target.fasta', 'fasta',
                    alphabet=Alphabet.generic_dna)
    h = HMMER.hmmsearch('tests/data/valid.hmm', t)
    self.assertEqual(len(h.matches), 17)
    check_valid(self, h.matches)

    #check that matches all have frame 1
    for m in h.matches:
        self.assertEqual(m.getFrame(), 1)

    #test feature extraction
    for m in h.matches:
        f = m.asSeqFeature()
        self.assertEqual(f.qualifiers['frame'], 1)
def test_search(self):
    t = SeqIO.parse('tests/data/matchtarget.fasta', 'fasta',
                    alphabet=Alphabet.generic_protein)
    h = HMMER.hmmsearch('tests/data/valid.hmm', t)
    self.assertEqual(len(h.matches), 17)
    check_valid(self, h.matches)
def setUp(self):
    #build a fake search
    self.hs = HMMER.hmmsearch()
    self.hs.hmms = [self.hmm,]
    self.hs.targets = [self.target,]
def find_homologs():
    """Predict homologs of PPRs in other genomes based on footprints"""
    pprs = load_pprs()
    plastids = load_plastids(exclude=["Arabidopsis thaliana",])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [f for f in known_binding.features
                      if "exact" in f.type.lower()]
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    print "Loaded {} pprs and {} plastids".format(len(pprs), len(plastids))

    for k, ppr in enumerate(pprs):
        print "Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs))
        footprints = [f for f in exact_features
                      if f.type.lower() == "{}_exact".format(ppr.name.lower())]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]
        print "\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes])

        ppr.potentialHomologs = {}
        for i, plastid in enumerate(plastids):
            #NOTE: debugging filter - only this plastid is searched
            if plastid.name != "Alsophila spinulosa":
                continue
            print "\t\tSearch {}/{}".format(i + 1, len(plastids))

            #search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                print "{} -> {} homologs".format(gene.qualifiers['gene'],
                                                 len(search.matches))
                homologs += search.getFeatures(
                    type="{}_hl".format(gene.qualifiers['gene']))

            #extract the sequence surrounding each homolog
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

            #find exact or close to exact binding domains for each and add to the
            #list of potential homologs for the PPR
            ph = []
            for h in homologs:
                domains = []
                for exact in ppr.exact:
                    try:
                        domains += binding.get_domains(exact, h,
                                                       percentile=100.0, gaps=0)
                    except KeyError:
                        continue
                if domains:
                    domains.sort(key=lambda d: -d.qualifiers['odds'])
                    seq = str(domains[0].extract(h).seq)
                    similarity = max([sequence_similarity(original, seq)
                                      for original in ppr.footprints])
                    print " {} -> \'{}\'".format(h.type, seq)
                    ph.append((similarity, seq))

            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            #try and avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print "\'{}\' footprints = {}".format(ppr.name, ppr.footprints)
        print "potential homologs"
        for key, value in ppr.potentialHomologs.iteritems():
            print "{}: {}".format(key, value)

    #NOTE: this early return makes the stats summary below unreachable
    return

    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            length += len(ppr.potentialHomologs[plastid.name])
            similarity += sum([p[0] for p in ppr.potentialHomologs[plastid.name]])
        try:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                'avg_homologs': length / len(pprs),
            })
        except ZeroDivisionError:
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])
    f = open("tmp", "w")
    for s in stats[0:50]:
        f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
    f.close()
def simple_extract(target, localization=None, verbose=False):
    """Extract all the PPRs found in target"""
    if not isinstance(target, SeqRecord):
        raise TypeError("simple_extract requires a Bio.SeqRecord, not {}".format(
            type(target)))

    if verbose:
        print "Searching..."

    #find all easy-to-locate PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=target)
    #get features for each motif
    motifs = search.getFeatures(target)

    if verbose:
        print "Got {} motifs, grouping...".format(len(motifs))

    #group features by frame and location
    groups = group_motifs(motifs, max_gap=1500)

    if verbose:
        print "Got {} groups, extracting envelopes...".format(len(groups))

    pprs = []
    dbg_env = []
    while groups:
        if verbose:
            print "Got {} groups, extracting envelopes...".format(len(groups))

        #extract the sequence envelope around each group
        envelopes = [get_envelope(group, target, margin=1000) for group in groups]
        dbg_env += envelopes

        if verbose:
            print "Got {} envelopes, locating PPRs...".format(len(envelopes))

        #locate the PPR within each envelope
        for envelope in envelopes:
            ppr = locate_ppr(envelope)
            if ppr:
                pprs.append(ppr)

        #look for overlapping pprs
        groups = remove_overlaps(pprs)
        ol = len(groups)
        if verbose:
            print "{} conflicts".format(ol)
        groups += remove_overgrown(pprs, 500)
        if verbose:
            print "{} overgrown PPRs".format(len(groups) - ol)

    pprs = [add_source(p, target) for p in pprs]

    if verbose:
        print "Got {} PPRs, cleaning...".format(len(pprs))

    #clean the gaps between features
    pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

    #annotate the tail region and classify each PPR
    classify.classify(pprs)

    #predict each PPR's target
    targetp.targetp(pprs, annotation='localization')

    #filter the desired location
    if localization:
        pprs = [p for p in pprs if p.annotations['localization'] == localization]

    #return a list of nicely presented PPRs
    return pprs
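#Example usage (a sketch; the FASTA path is hypothetical and the 'C'
#localization code assumes the wrapped TargetP tool labels chloroplast
#predictions as 'C'):
#
#   genome = SeqIO.read('genome.fasta', 'fasta', alphabet=Alphabet.generic_dna)
#   chloroplast_pprs = simple_extract(genome, localization='C', verbose=True)
#   print "Found {} chloroplast PPRs".format(len(chloroplast_pprs))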