def annotate(pprs, plastid): for p in pprs: print "Annotate {}".format(p.name) print " - yagi" feats = binding.get_domains(p.yagi, plastid, type="{}_yagi".format(p.name), gaps=2) print " - barkan" feats += binding.get_domains(p.barkan, plastid, type="{}_barkan".format(p.name), gaps=2) print " - exact" feats += find_exact(p, plastid) plastid.features += feats
def find_exact(ppr, plastid): feats = [] for e in ppr.exact: pfeats = binding.get_domains(e, plastid, type="{}_exact".format(ppr.name), percentile=100.0, gaps=0) for feat in pfeats: if feat.qualifiers['odds'] >= 10.0 * len(e): feats.append(feat) return feats
def find_homologs(): """Predict homologs of PPRs in other genomes based on footprints""" pprs = load_pprs() plastids = load_plastids(exclude=[ "Arabidopsis thaliana", ]) known_binding = SeqIO.read("output/ARA_annotated.gb", "gb") exact_features = [ f for f in known_binding.features if "exact" in f.type.lower() ] ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"] ara_genes.sort(key=lambda g: g.location.start) print "Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)) for k, ppr in enumerate(pprs): print "Searching for homologs of \'{}\' ({}/{})".format( ppr.name, k + 1, len(pprs)) footprints = [ f for f in exact_features if f.type.lower() == "{}_exact".format(ppr.name.lower()) ] ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints] print "\tFound {} original genes, {}".format(len( ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]) ppr.potentialHomologs = {} for i, plastid in enumerate(plastids): if plastid.name != "Alsophila spinulosa": continue print "\t\tSearch {}/{}".format(i + 1, len(plastids)) #search for homologs of each gene homologs = [] for gene in ppr.genes: g = SeqRecord(gene.extract(known_binding.seq).translate()) search = HMMER.jackhmmer(g, plastid) print "{} -> {} homologs".format(gene.qualifiers['gene'], len(search.matches)) homologs += search.getFeatures( type="{}_hl".format(gene.qualifiers['gene'])) #extract the sequence surrounding each homolog for h in homologs: h.location = FeatureLocation( max(0, h.location.start - 500), min(len(plastid), h.location.end + 500)) homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs] #find exact or close to exact binding domains for each and add to the #list of potential homologs for the PPR ph = [] for h in homologs: domains = [] for exact in ppr.exact: try: domains += binding.get_domains(exact, h, percentile=100.0, gaps=0) except KeyError: continue if domains: domains.sort(key=lambda d: -d.qualifiers['odds']) seq = str(domains[0].extract(h).seq) similarity = max([ sequence_similarity(original, seq) for original in ppr.footprints ]) print " {} -> \'{}\'".format(h.type, seq) ph.append((similarity, seq)) ph.sort(key=lambda p: -p[0]) ppr.potentialHomologs[plastid.name] = ph #try and avoid running out of RAM gc.collect() for ppr in pprs: print "\'{}\' footprints = {}".format(ppr.name, ppr.footprints) print "potential homologs" for key, value in ppr.potentialHomologs.iteritems(): print "{}: {}".format(key, value) return stats = [] for plastid in plastids: length = 0 similarity = 0.0 for ppr in pprs: length += len(ppr.potentialHomologs[plastid.name]) similarity += sum( [p[0] for p in ppr.potentialHomologs[plastid.name]]) try: stats.append({ 'name': plastid.name, 'avg_similarity': similarity / float(length), 'avg_homologs': length / len(pprs), }) except ZeroDivisionError: stats.append({ 'name': plastid.name, 'avg_similarity': 0.0, 'avg_homologs': 0, }) stats.sort(key=lambda s: -s['avg_similarity']) f = open("tmp", "w") for s in stats[0:50]: f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s)) f.close()
def find_homologs(): """Predict homologs of PPRs in other genomes based on footprints""" pprs = load_pprs() plastids = load_plastids(exclude=["Arabidopsis thaliana",]) known_binding = SeqIO.read("output/ARA_annotated.gb", "gb") exact_features = [f for f in known_binding.features if "exact" in f.type.lower()] ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"] ara_genes.sort(key=lambda g: g.location.start) print "Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)) for k,ppr in enumerate(pprs): print "Searching for homologs of \'{}\' ({}/{})".format( ppr.name,k+1,len(pprs)) footprints = [f for f in exact_features if f.type.lower() == "{}_exact".format(ppr.name.lower())] ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints] print "\tFound {} original genes, {}".format(len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]) ppr.potentialHomologs = {} for i,plastid in enumerate(plastids): if plastid.name != "Alsophila spinulosa": continue print "\t\tSearch {}/{}".format(i+1, len(plastids)) #search for homologs of each gene homologs = [] for gene in ppr.genes: g = SeqRecord(gene.extract(known_binding.seq).translate()) search = HMMER.jackhmmer(g, plastid) print "{} -> {} homologs".format(gene.qualifiers['gene'], len(search.matches)) homologs += search.getFeatures(type="{}_hl".format(gene.qualifiers['gene'])) #extract the sequence surrounding each homolog for h in homologs: h.location = FeatureLocation( max(0,h.location.start - 500), min(len(plastid), h.location.end+500)) homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs] #find exact or close to exact binding domains for each and add to the #list of potential homologs for the PPR ph = [] for h in homologs: domains = [] for exact in ppr.exact: try: domains += binding.get_domains(exact, h, percentile=100.0, gaps=0) except KeyError: continue if domains: domains.sort(key=lambda d: -d.qualifiers['odds']) seq = str(domains[0].extract(h).seq) similarity = max([sequence_similarity(original, seq) for original in ppr.footprints]) print " {} -> \'{}\'".format(h.type, seq) ph.append((similarity, seq)) ph.sort(key=lambda p: -p[0]) ppr.potentialHomologs[plastid.name] = ph #try and avoid running out of RAM gc.collect() for ppr in pprs: print "\'{}\' footprints = {}".format(ppr.name, ppr.footprints) print "potential homologs" for key,value in ppr.potentialHomologs.iteritems(): print "{}: {}".format(key, value) return stats = [] for plastid in plastids: length = 0 similarity = 0.0 for ppr in pprs: length += len(ppr.potentialHomologs[plastid.name]) similarity += sum([p[0] for p in ppr.potentialHomologs[plastid.name]]) try: stats.append({'name': plastid.name, 'avg_similarity': similarity / float(length), 'avg_homologs': length / len(pprs),}) except ZeroDivisionError: stats.append({'name': plastid.name, 'avg_similarity': 0.0, 'avg_homologs': 0,}) stats.sort(key=lambda s: -s['avg_similarity']) f = open("tmp", "w") for s in stats[0:50]: f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s)) f.close()