def hits_that_share_interactors(self, interactors):
    """
    Select the hits of this analysis whose binder shares an interactor with the given ones.

    A hit is kept when its binder is a key of self.hits_to_interactors and the
    set stored for that binder has a non-empty intersection with interactors.

    @arg interactors: The interactors to look for (anything accepted by the
    intersection() method of the stored sets)
    @return: A biopsy.HitVec holding the matching hits, in the order of self.analysis
    """
    shared = biopsy.HitVec()
    for candidate in self.analysis:
        # Binders without a recorded interactor set can never match.
        if candidate.binder not in self.hits_to_interactors:
            continue
        common = self.hits_to_interactors[candidate.binder].intersection(interactors)
        if len(common) > 0:
            shared.append(candidate)
    return shared
def consolidate_hits(hits):
    """
    Takes a sequence of BiFa hits and consolidates all of those for the same
    binder that overlap.

    The hits are grouped with hits_per_binder(); every group is handed to
    consolidate_overlapping_hits() and the results are concatenated in the
    order in which hits_per_binder(hits).values() yields the groups.

    @arg hits: A sequence of BiFa hits
    @return: A consolidated sequence of BiFa hits (a biopsy.HitVec)
    """
    result = biopsy.HitVec()
    # The loop variable has its own name so the ``hits`` argument is not
    # rebound while iterating over the groups derived from it.
    for binder_hits in hits_per_binder(hits).values():
        result.extend(consolidate_overlapping_hits(binder_hits))
    return result
def map_hits(hits, binder_map):
    """
    Re-express BiFa hits in terms of the binders that binder_map associates
    with each hit's binder.

    Every input hit produces one new hit per entry of binder_map[hit.binder].
    The new hit carries the mapped binder together with the location and
    p_binding of the hit it was derived from.

    @arg hits: BiFa hits
    @arg binder_map: A dict like object that maps binder names to an iterable
    of mapped binders. E.g. mapping BiFa pssm names to transcription factors
    @return: A sequence of BiFa hits (a biopsy.HitVec)
    """
    mapped_hits = biopsy.HitVec()
    for original in hits:
        targets = binder_map[original.binder]
        for target in targets:
            mapped_hits.append(
                biopsy.Hit(target, original.location, original.p_binding)
            )
    return mapped_hits
def test_pssm_score(): # 'V$AP1_Q2' pssm_acc = biopsy.get_transfac_pssm_accession('V$DEAF1_01') pssm_info = biopsy.get_pssm(pssm_acc) # print pssm_info.pssm seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' for i in range(len(seq) - len(pssm_info.pssm) + 1): s = biopsy.score_pssm(pssm_info.pssm, seq[i:]) p_binding = biopsy.get_p_binding( biopsy.get_odds_ratio(s, pssm_info.get_dist(True, False), pssm_info.get_dist(False, False))) if p_binding > 0.05: print i, s, p_binding result = biopsy.HitVec() p_binding = biopsy.score_pssm_on_sequence(pssm_acc, seq, 0.05, result) print 'Got', len(result), 'hits from', len(seq), 'bases' print p_binding
def write_svg(self, filename, hits, max_threshold=0.0, notes="", max_chain=None):
    """
    Render the given hits as an SVG via biopsy.build_analysis_svg.

    The drawing arguments are assembled from self.threshold (minimum
    threshold) and self.name (title); the hits are drawn against
    self.converted_seqs[0]. biopsy.build_analysis_svg is only invoked when
    hits is non-empty.

    @arg filename: Passed to biopsy.BuildSvgArgs as the output file
    @arg hits: The hits to draw
    @arg max_threshold: Passed to biopsy.BuildSvgArgs as max_threshold
    @arg notes: Passed to biopsy.BuildSvgArgs as notes
    @arg max_chain: Hits passed to biopsy.build_analysis_svg as its second
    argument. When omitted (None) a new, empty biopsy.HitVec is used.
    """
    # A default of biopsy.HitVec() in the signature would be created once at
    # definition time and shared by every call; build a fresh one per call.
    if max_chain is None:
        max_chain = biopsy.HitVec()
    build_svg_args = biopsy.BuildSvgArgs(
        min_threshold=self.threshold,
        max_threshold=max_threshold,
        file=filename,
        title=self.name,
        notes=notes,
        open_file=False)
    if len(hits):
        biopsy.build_analysis_svg(
            hits, max_chain, self.converted_seqs[0], args=build_svg_args)
def test_pssm_pseudo_counts():
    """
    Print and plot the likelihoods of PSSM 'M00975' on a short sequence for
    several pseudo-count settings, then score the sequence with cumulative
    distributions switched on.

    The function changes global state: it sets use_p_value, pseudo_counts and
    use_cumulative_dists on biopsy.PssmParameters.singleton() and clears the
    PSSM cache on every loop iteration. None of these settings is restored.
    """
    sascha_pssms = biopsy.SequenceVec()
    sascha_acc = 'M00975'
    # sascha_seq = 'gtaaaccaggctgcctGAgaacttgttgcgaatcc'
    # NOTE(review): the first assignment below is overwritten immediately;
    # only 'ttgttgcaa' is ever scored.
    sascha_seq = 'ttgttgcga'
    sascha_seq = 'ttgttgcaa'
    # plot_likelihoods( biopsy.get_pssm( 'M00975' ), 'M00975' )
    # plot_likelihoods( biopsy.get_pssm( 'R02146' ), 'R02146' )
    # NOTE(review): this header names six columns, but each row below prints
    # the pseudo-count followed by seven values - confirm the intended layout.
    print 'Binding,Background,odds,p(binding),cumulative p(binding),Sequence'
    biopsy.PssmParameters.singleton().use_p_value = True
    # biopsy.PssmParameters.singleton().binding_background_odds_prior = 1;
    for pc in [0.0, 0.25, 0.5, 1.0, 2.0]:
        # force cache load
        biopsy.get_pssm(sascha_acc)
        biopsy.clear_pssm_cache()
        # The pseudo-count is changed after the cache is cleared and before
        # the PSSM is fetched again - presumably so that the matrix is rebuilt
        # with the new setting; confirm against biopsy's caching behaviour.
        biopsy.PssmParameters.singleton().pseudo_counts = pc
        p = biopsy.get_pssm(sascha_acc)
        score = biopsy.score_pssm(p.pssm, sascha_seq)
        # odds_ratio and cum_odds_ratio are unpacked but not printed.
        (bind, back, cum_bind, cum_back, odds_ratio, cum_odds_ratio,
         p_bind, cum_p_bind, p_value_p_bind) = biopsy.get_pssm_likelihoods_for_score(p, score)
        # The trailing comma keeps the pseudo-count on the same output line as
        # the comma-separated values printed next.
        print pc,
        print \
            '%f,%f,%f,%f,%f,%f,%f' \
            % \
            ( bind, back, cum_bind, cum_back, p_bind, cum_p_bind, p_value_p_bind )
        biopsy.plot_likelihoods(p, sascha_acc + ': ' + str(pc), score)
    # print 'Trying with standard distributions'
    # biopsy.PssmParameters.singleton().use_cumulative_dists = False;
    # hits = biopsy.HitVec()
    # biopsy.score_pssm_on_sequence( sascha_acc, sascha_seq, 0.001, hits )
    # print hits
    print 'Trying with cumulative distributions'
    biopsy.PssmParameters.singleton().use_cumulative_dists = True
    # Runs with whatever pseudo_counts value the loop above set last (2.0).
    hits = biopsy.HitVec()
    biopsy.score_pssm_on_sequence(sascha_acc, sascha_seq, 0.001, hits)
    print hits
    print
def __missing__(self, k):
    """
    Store a new, empty biopsy.HitVec under k and return the stored entry.

    Presumably invoked by dict-style lookup when k is absent (the enclosing
    class header is not visible here - confirm it derives from dict).

    @arg k: The key that had no entry
    @return: The value now stored under k
    """
    empty_hits = biopsy.HitVec()
    self[k] = empty_hits
    return self[k]
for species in remo.get_sequence_ids(): yield remo.get_sequence_for(species, True) def histogram(acc, score_counts): import pylab, numpy pylab.clf() pylab.bar(xrange(num_buckets), numpy.power(score_counts, 0.25)) pylab.savefig('graphs/%s.png' % acc) try: remome except NameError: remome_file = 'c:/data/remos/100/100.filtered' print 'Loading remome: %s' % remome_file remome = biopsy.Remome.load(remome_file) score_counts = {} for acc in itertools.islice(pssm_accs(), num_pssms): score_counts[acc] = numpy.zeros(num_buckets, numpy.uint8) for seq in itertools.islice(sequences_from_remome(remome), num_sequences): hits = biopsy.HitVec() p_bind = biopsy.score_pssm_on_sequence(pssm_name=acc, threshold=0.0, sequence=seq, result=hits) for h in hits: score_counts[acc][int(h.p_binding * num_buckets)] += 1 histogram(acc, score_counts[acc])
def map_binders(hits, map):
    """
    Apply the callable returned by hit_mapper(map) to every hit and collect
    the results.

    @arg hits: The hits to convert
    @arg map: Passed unchanged to hit_mapper() to build the per-hit conversion
    @return: A biopsy.HitVec with one converted entry per input hit, in input order
    """
    converted = biopsy.HitVec()
    convert = hit_mapper(map)
    converted.extend(convert(hit) for hit in hits)
    return converted