def align_small_seqs(candidates, small_seqs, small_seqs_copies): print "\n\tSmall seq analysis" print "\tcandidates:", len(candidates), candidates[0].hairpin print "\tsmall seqs:", len(small_seqs), len(small_seqs_copies) max_frequent_seq = 0 c_used = set() s_used = set() # adding all canididate seqs to a substring dict # reversing the hairpin if it is on negative strand find_candidates = SubstringDict() for i, candidate in enumerate(candidates): hairpin_part = candidate.hairpin_padded_40[20:-20] if candidate.chromosome_direction == "-": hairpin_part = reverse_compliment(hairpin_part) find_candidates[hairpin_part] = i # for each small seq. find the set of matching candidates for j, seq in enumerate(small_seqs): candidate_set = find_candidates[seq] if len(candidate_set) > max_frequent_seq: max_frequent_seq = len(candidate_set) # add the seq. to that candidate for cnr in candidate_set: copies = small_seqs_copies[j] if not candidates[cnr].small_subs: candidates[cnr].small_subs = {} candidates[cnr].small_subs[seq] = copies c_used.add(cnr) s_used.add(j) print "\tmost frequent small hit", max_frequent_seq, "times" print "\tsmall seq hits:", len(s_used) print "\tcandidates with small seqs:",len(c_used)
def align_miRNAs(mirna_hits, hairpinID_to_mature, hpID_to_mseqs, candidate_tree, candidate_list, sequence_tree, seq_to_candidates, miRNA_species, miRNA_high_conf): candidate_to_miRNAid = {} noseq_set = set() unique_mirnas = set() candidate_already = 0 candidate_count = 0 miRNA_with_candidates = set() has_seqs = [] noseqs = 0 for loki in mirna_hits: miRNAid = ">" + loki[0] if miRNAid in unique_mirnas: continue # use first entry only unique_mirnas.add(miRNAid) strand_dir = loki[1] chromosome = loki[2].split("|")[3] genome_offset = int(loki[3]) hairpin = loki[4] mature_seqs = hpID_to_mseqs[miRNAid] if miRNAid in hpID_to_mseqs else [] mature_seqs_fixed = [] mature_pos = [] for seq in mature_seqs: if strand_dir == "-": seq = reverse_compliment(seq) pos = hairpin.find(seq) if pos > -1: mature_pos.append((pos,pos+len(seq))) else: print "mature seq not mapping:", seq, mature_seqs assert seq in hairpin mature_seqs_fixed.append(seq) if mature_pos: mature_pos = sorted(mature_pos) # sometimes the mature seqs overlap --> remove the last of them if len(mature_pos) > 1: if mature_pos[-2][1] > mature_pos[-1][0]: mature_pos.pop(-1) elif mature_pos[0][1] > mature_pos[1][0]: mature_pos.pop(1) # print _OLDmature_pos, mature_pos assert len(mature_pos) <= 2 begin_5p = -1 end_5p = -1 begin_3p = -1 end_3p = -1 # begin_5p = genome_offset + 0 # end_5p = genome_offset + mature_len # begin_3p = genome_offset + len(hairpin) - mature_len # end_3p = genome_offset + len(hairpin) if len(mature_pos) == 2: begin_5p = genome_offset + mature_pos[0][0] end_5p = genome_offset + mature_pos[0][1] begin_3p = genome_offset + mature_pos[1][0] end_3p = genome_offset + mature_pos[1][1] elif len(mature_pos) == 1: avg_val = (mature_pos[0][0] + mature_pos[0][1] ) / 2.0 if avg_val < len(hairpin) / 2.0: begin_5p = genome_offset + mature_pos[0][0] end_5p = genome_offset + mature_pos[0][1] else: begin_3p = genome_offset + mature_pos[0][0] end_3p = genome_offset + mature_pos[0][1] # print begin_5p, end_5p, begin_3p, end_3p assert begin_5p >= genome_offset or begin_5p == -1 assert begin_5p < end_5p or begin_5p == -1 assert end_5p <= begin_3p or end_5p == -1 or begin_3p == -1 assert begin_3p < end_3p or begin_3p == -1 is_candidate = False tree = candidate_tree[chromosome] if not tree: continue candidates = tree[genome_offset:genome_offset+len(hairpin)] if candidates: miRNA_with_candidates.add(miRNAid) candidate_already += 1 for candidate in candidates: if candidate.data.chromosome_direction != strand_dir: continue hashval = candidate.data.chromosome + strand_dir + str(candidate.data.pos_5p_begin) print candidate.data.hairpin_start print candidate.data.pos_5p_begin assert candidate.data.pos_5p_begin == candidate.data.hairpin_start candidate_count += 1 shift_start = abs(genome_offset - candidate.data.pos_5p_begin) shift_end = abs( (genome_offset+len(hairpin)) - candidate.data.pos_3p_end) # use candidate as miRNA if shift_start + shift_end < len(hairpin) / 2: candidate_to_miRNAid[hashval] = miRNAid if candidate.data.miRNAid: # don't want 2 equal continue if candidate.data.candidate_type >= 1: print candidate.data.candidate_type, "candidate type" print candidate.data.miRNAid, "miRNAid ???", miRNAid assert candidate.data.candidate_type < 1 # undecided or candidate c_type = structure.TYPE_HIGH_CONF if miRNAid in miRNA_high_conf else structure.TYPE_LOW_CONF candidate.data.candidate_type = c_type candidate.data.miRNAid = miRNAid candidate.data.mirBase_matures = mature_seqs_fixed is_candidate = True break if is_candidate: continue # no candidates aligns the "miRNA" tree = sequence_tree[chromosome] sequences = tree[genome_offset:genome_offset+len(hairpin)] sequences = [s for s in sequences if s.data[0] == strand_dir] # sequences = [s for s in sequences if s.begin >= genome_offset and s.end <= genome_offset+len(hairpin)] sequences = set(sequences) is_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1 if sequences and not is_both_matures: best_start_pos, _, best_end_pos, _ = interval_tree_misc.best_interval(sequences, genome_offset) # TODO skip if have both mature seqs # print # print sequences # print "seqs in hp: ", strand_dir, genome_offset, genome_offset+len(hairpin), hairpin # for s in sorted(sequences): # print s.begin, s.end, s.data[2], s.data[2] in hairpin, s.data[0] avgpos = (best_start_pos + best_end_pos) / 2.0 if avgpos < 0: pass elif avgpos < len(hairpin) / 2.0 : # peak is 5p old1 = begin_5p old2 = end_5p begin_5p = genome_offset + best_start_pos end_5p = genome_offset + best_end_pos if end_5p > begin_3p and begin_3p != -1: print "manually changing overlapping (3p)", begin_5p, end_5p, begin_3p, end_3p print "old seq:", old1, old2, genome_offset print begin_3p = end_5p else: # peak is 3p begin_3p = genome_offset + best_start_pos end_3p = genome_offset + best_end_pos if begin_3p < end_5p and end_5p != -1: print "manually changing overlapping (5p)", begin_5p, end_5p, begin_3p, end_3p end_5p = begin_3p has_seqs.append(miRNAid) else: # no sequences at all noseqs += 1 noseq_set.add(miRNAid) pass if not ( begin_5p + 10 >= genome_offset or begin_5p == -1): print begin_5p, genome_offset assert begin_5p + 10 >= genome_offset or begin_5p == -1 assert begin_5p < end_5p or begin_5p == -1 if not (end_5p <= begin_3p or end_5p == -1 or begin_3p == -1): print end_5p, begin_3p assert end_5p <= begin_3p or end_5p == -1 or begin_3p == -1 assert begin_3p < end_3p or begin_3p == -1 candidate = structure.Candidate(chromosome, strand_dir, genome_offset, genome_offset+len(hairpin), begin_5p, end_5p, begin_3p, end_3p, sequences) candidate.mirBase_matures = mature_seqs_fixed candidate.hairpin = hairpin c_type = structure.TYPE_HIGH_CONF if miRNAid in miRNA_high_conf else structure.TYPE_LOW_CONF candidate.set_candidate_type = c_type intervals_before = sequence_tree[tree][begin_5p-30:begin_5p] if sequence_tree[tree] else [] intervals_after = sequence_tree[tree][end_3p:end_3p+30] if sequence_tree[tree] else [] intervals_before = [s for s in intervals_before if s.data[0] == strand_dir] intervals_after = [s for s in intervals_after if s.data[0] == strand_dir] candidate.set_seq_outside(intervals_before, intervals_after) for candidate_interval in sequences: name = candidate_interval.data[1] if name not in sequences: number_id = int(name.split("-")[0]) duplicates = float(name.split("-")[1]) s = structure.Sequence(number_id, duplicates, candidate_interval.data[2]) s.add_candidate(candidate) seq_to_candidates[name] = s else: seq_to_candidates[name].add_candidate(candidate) # if end_3p - begin_5p > 200: # print "\t200+ length", begin_5p, end_5p, begin_3p, end_3p # assert False # assert candidate.pos_5p_begin < candidate.pos_5p_end <= candidate.pos_3p_begin < candidate.pos_3p_end candidate_list.append(candidate) hashval = chromosome + strand_dir + str(genome_offset) candidate_to_miRNAid[hashval] = miRNAid has_seqs = set(has_seqs) hc_len = len( miRNA_high_conf) sec_cand = has_seqs | miRNA_with_candidates print print "miRNA aligning sequences:\t\t", len(has_seqs) * 1.0 / len(unique_mirnas) print "miRNA aligning candidates:\t\t", len(miRNA_with_candidates) * 1.0 / len(unique_mirnas) print "miRNA aligning seq/candidate:\t\t", len(sec_cand) * 1.0 / len(unique_mirnas) print print "HIGH CONFIDENCE -- sequences \t\t", len(miRNA_high_conf & has_seqs) * 1.0 / hc_len print "HIGH CONFIDENCE -- candidates \t\t", len(miRNA_high_conf & miRNA_with_candidates) * 1.0 / hc_len print "HIGH CONFIDENCE -- seq/candidates \t", len(miRNA_high_conf & sec_cand) * 1.0 / hc_len # print "no sequences aligning at all:\t", len(noseq_set) # print "no seqs vs high confidence:\t", len(noseq_set.intersection(miRNA_high_conf) ), len(miRNA_high_conf) return candidate_to_miRNAid
def align_dead_miRNAs(mirna_hits, _, id_to_mature, candidate_tree, candidate_list, sequence_tree, seq_to_candidates): print "\naligning dead miRNA" candidate_to_dead = {} unique_mirnas = set() candidated = set() seqd = set() both_matures = 0 for dead_loki in mirna_hits: miRNAid = dead_loki[0] if miRNAid in unique_mirnas: continue # use first entry only # print "\n\t new dead...", unique_mirnas.add(miRNAid) strand_dir = dead_loki[1] chromosome = dead_loki[2].split("|")[3] genome_offset = int(dead_loki[3]) hairpin = dead_loki[4] is_candidate = False begin_5p = -1 end_5p = -1 begin_3p = -1 end_3p = -1 # print # print miRNAid, begin_5p, begin_5p + len(hairpin) # put mature seq into 5p or 3p if miRNAid in id_to_mature: mature_seq = id_to_mature[miRNAid] if strand_dir == "-": mature_seq = reverse_compliment(mature_seq) # print "\t", strand_dir # print "\t", mature_seq in hairpin # print "\t", mature_seq # print "\t", hairpin begin_mature = hairpin.find(mature_seq) end_mature = begin_mature + len(mature_seq) avg_val = (begin_mature + end_mature ) / 2.0 if avg_val < len(hairpin) / 2.0: begin_5p = genome_offset + begin_mature end_5p = genome_offset + end_mature else: begin_3p = genome_offset + begin_mature end_3p =genome_offset + end_mature # print "\t", begin_mature, end_mature, len(hairpin) # print "\t", begin_5p, end_5p, begin_3p, end_3p # print begin_5p, end_5p, begin_3p, end_3p, "\t", len(hairpin) tree = candidate_tree[chromosome] if tree: candidates = tree[genome_offset:genome_offset+len(hairpin)] # print len(candidates), candidates for candidate in candidates: if candidate.data.chromosome_direction != strand_dir: continue hashval = candidate.data.chromosome + strand_dir + str(candidate.data.pos_5p_begin) _fail_message = (candidate.data.hairpin_start, candidate.data.pos_5p_begin) assert candidate.data.pos_5p_begin == candidate.data.hairpin_start, _fail_message shift_start = abs(genome_offset - candidate.data.pos_5p_begin) shift_end = abs( (genome_offset+len(hairpin)) - candidate.data.pos_3p_end) # print shift_start, shift_end, len(hairpin) if shift_start + shift_end < len(hairpin) / 2: candidate_to_dead[hashval] = miRNAid is_candidate = True # assert candidate.candidate_type != structure.TYPE_HIGH_CONF # assert candidate.candidate_type != structure.TYPE_LOW_CONF # candidate.candidate_type = structure.TYPE_DEAD candidated.add(miRNAid) break else: print "no", tree has_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1 assert not has_both_matures has_3p = begin_3p != -1 and end_3p != -1 has_5p = begin_5p != -1 and end_5p != -1 if not is_candidate: # and (has_5p or has_3p) ???? # assert begin_5p != -1 or begin_3p != -1 # assert end_5p != -1 or end_3p != -1 tree = sequence_tree[chromosome] if not tree: continue sequences = tree[genome_offset:genome_offset+len(hairpin)] sequences = [s for s in sequences if s.data[0] == strand_dir] # sequences = [s for s in sequences if s.begin >= genome_offset and s.end <= genome_offset+len(hairpin)] sequences = set(sequences) if sequences: best_start_pos, _, best_end_pos, _ = interval_tree_misc.best_interval(sequences, genome_offset) avgpos = (best_start_pos + best_end_pos) / 2.0 halfsize = len(hairpin) / 2.0 # offset = begin_5p # print "--" # print avgpos, halfsize, halfsize_derp # print begin_5p, end_5p, begin_3p, end_3p if avgpos <= halfsize: # peak is 5p # print "<-" begin_5p = genome_offset + best_start_pos end_5p = genome_offset + best_end_pos # if end_5p > begin_3p: # begin_3p = end_5p else: # peak is 3p begin_3p = genome_offset + best_start_pos end_3p = genome_offset + best_end_pos # if begin_3p < end_5p: # end_5p = begin_3p candidate = structure.Candidate(chromosome, strand_dir, genome_offset, genome_offset+len(hairpin), begin_5p, end_5p, begin_3p, end_3p, sequences) candidate.hairpin = hairpin intervals_before = sequence_tree[tree][begin_5p-30:begin_5p] if sequence_tree[tree] else [] intervals_after = sequence_tree[tree][end_3p:end_3p+30] if sequence_tree[tree] else [] intervals_before = [s for s in intervals_before if s.data[0] == strand_dir] intervals_after = [s for s in intervals_after if s.data[0] == strand_dir] candidate.set_seq_outside(intervals_before, intervals_after) assert candidate.candidate_type != structure.TYPE_HIGH_CONF assert candidate.candidate_type != structure.TYPE_LOW_CONF candidate.candidate_type = structure.TYPE_DEAD if sequences: seqd.add(miRNAid) for candidate_interval in sequences: name = candidate_interval.data[1] if name not in sequences: number_id = int(name.split("-")[0]) duplicates = float(name.split("-")[1]) s = structure.Sequence(number_id, duplicates, candidate_interval.data[2]) s.add_candidate(candidate) seq_to_candidates[name] = s else: seq_to_candidates[name].add_candidate(candidate) assert candidate.pos_5p_begin < candidate.pos_5p_end or candidate.pos_5p_begin == -1 assert candidate.pos_3p_begin < candidate.pos_3p_end or candidate.pos_3p_begin == -1 if begin_5p != -1 and end_3p != -1: both_matures += 1 # if candidate.pos_5p_end >= candidate.pos_3p_begin + 10: # print candidate.pos_5p_end, candidate.pos_3p_begin # assert candidate.pos_5p_end < candidate.pos_3p_begin + 10 # assert candidate.pos_5p_begin < candidate.pos_5p_end <= candidate.pos_3p_begin < candidate.pos_3p_end candidate_list.append(candidate) hashval = chromosome + strand_dir + str(genome_offset) candidate_to_dead[hashval] = miRNAid # print "123" print len(unique_mirnas) print "\ndead stats:" print "has both seqs:", both_matures, (both_matures + len(candidated))*1.0 / len(unique_mirnas) print "aligning with candidates:\t", len(candidated), len(candidated)*1.0 / len(unique_mirnas) print "aligning with seqs:\t", len(seqd), len(seqd)*1.0 / len(unique_mirnas) print "aligning with either:\t", len(candidated | seqd), len(candidated | seqd)*1.0 / len(unique_mirnas) # assert False return candidate_to_dead
def small_seq_stats(candidates, sc=0, na=0, c_to_m=0): ''' compares the amount of small sequences vs long sequences in 3p and 5p positions uses log values of values larger than 1 RPM in each sum adds 1.0 to each sum to avoid division against 0-scores stores the log score, as some sums are still very large ''' # na = [x for li in na for x in li] # unpack folds # sc = zip(*sc) # print len(na), na[0], len(set(na)) # print len(sc), sc[0] # # names_to_scores = {n:s for n,s in zip(na,sc)} # # print na[0] # print sc[0] # print names_to_scores[na[0]] # # # print len(names_to_scores) # def get_name_scores(c): # hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) # # mi_name = c_to_m[hashval] # scores = names_to_scores[mi_name] if mi_name in names_to_scores else "" # return mi_name, scores # def _is_miRNA(c): # hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) # return hashval in candidate_to_miRNA def log_over_one(val): return math.log(val) if val > 1.0 else val def log_sum(list_to_sum): log_list = [log_over_one(x) for x in list_to_sum] list_sum = sum(log_list) return list_sum print "candidates:", len(candidates) has_small_seqs = [c for c in candidates if c.small_subs] print "small seqs:", len(has_small_seqs) padding = 20 for c in has_small_seqs: hairpin = c.hairpin_padded_40[padding:-padding] folding = c.hairpin_fold_40[padding:-padding] if c.chromosome_direction == "-": hairpin = reverse_compliment(hairpin) begin_5 = c.pos_5p_begin - c.hairpin_start + padding if c.has_5p else c.estimate_5pb end_5 = c.pos_5p_end - c.hairpin_start + padding if c.has_5p else c.estimate_5pe begin_3 = c.pos_3p_begin - c.hairpin_start + padding if c.has_3p else c.estimate_3pb end_3 = c.pos_3p_end - c.hairpin_start + padding if c.has_3p else c.estimate_3pe # swap positions if hairpin is reversed has_5p = c.has_5p has_3p = c.has_3p if c.chromosome_direction == "-": b5 = len(hairpin) - end_3 e5 = len(hairpin) - begin_3 b3 = len(hairpin) - end_5 e3 = len(hairpin) - begin_5 begin_5 = b5 end_5 = e5 begin_3 = b3 end_3 = e3 #swap 3p 5p positions has_5p, has_3p = has_3p, has_5p # filter out bad positions if not (begin_5 < end_5 < begin_3 < end_3): if has_5p: if begin_3 < end_5 or end_3 < end_5: begin_3 = -1 end_3 = -1 elif has_3p: if end_5 > begin_3 or begin_5 > begin_3: begin_5 = -1 end_5 = -1 if begin_5 > end_5: begin_5 = -1 end_5 = -1 if begin_3 > end_3: begin_3 = -1 end_3 = -1 # assert begin_5 < end_5 < begin_3 < end_3, (begin_5, end_5, begin_3, end_3) mature_pos = [pos for pos in [begin_5, end_5, begin_3, end_3] if 0 <= pos <= len(hairpin)] medium_short = [] print "\n---------------" print "---------------" print [begin_5, end_5, begin_3, end_3] print mature_pos print c.has_5p print c.has_3p print "direction:", c.chromosome_direction print c.pos_5p_begin - c.hairpin_start, c.pos_3p_end - c.hairpin_start # print get_name_scores(c) print folding print hairpin print " "*begin_5 + "5"*(end_5 - begin_5) + " "*(begin_3 - end_5) + "3"*(end_3 - begin_3) if c.mirBase_matures: for seq in c.mirBase_matures: if c.chromosome_direction == "-": seq = reverse_compliment(seq) start_seq = hairpin.find(seq) print " " * (start_seq) + seq, "\t", (start_seq) for seq, copies in c.small_subs.iteritems(): start_seq = hairpin.find(seq) end_seq = start_seq + len(seq) print " " * (start_seq) + seq, "\t", (start_seq, copies) medium_short.append( (len(seq), start_seq, end_seq, copies) ) # print "+" * len(hairpin) # print hairpin unscaled_long = 0 scaled_long = 0 if len(c.mapped_sequences): for i in c.mapped_sequences: copies = float(i.data[1].split("-")[1]) seq = i.data[2] if c.chromosome_direction == "-": seq = reverse_compliment(seq) seq_start = hairpin.find(seq) print " " * seq_start + seq, "\t", ( seq_start, copies) unscaled_long += copies scaled_long += log_over_one(copies) #=============================================================================== # # if seq_start in area_5p_long: # # area_5p_long_sum += log_over_one(copies) # # elif seq_start in area_3p_long: # # area_3p_long_sum += log_over_one(copies) #=============================================================================== unscaled_short = sum(c.small_subs.values()) + 1.0 unscaled_long += 1.0 unscaled_short_fraction = unscaled_short / unscaled_long scaled_short = log_sum(c.small_subs.values()) + 1.0 scaled_long += 1 scaled_short_fraction = scaled_short / scaled_long c.ratio_short_long = unscaled_short_fraction c.ratio_short_long_logval = scaled_short_fraction def nondecreasing(l): for i, el in enumerate(l[1:]): if l[i] > el: return False return True assert nondecreasing(mature_pos), (mature_pos, len(hairpin)) distances = [] # finding distances to start / end of mature sequences for (length, start, end, copies) in medium_short: start_dist = min_dist(mature_pos, start) end_dist = min_dist(mature_pos, end) smallest_dist = min(start_dist, end_dist) weight = math.log(copies+1.0) dist_weight_log = weight / (smallest_dist + 1.0) dist_weigth = copies / (smallest_dist + 1.0) distances.append( (length, dist_weigth, dist_weight_log, copies) ) def score_distance(minlen, maxlen): distance_weighted = sum( w for l, w, _lw, c in distances if minlen <= l <= maxlen ) all_weights = sum( c for l, w, _lw, c in distances if minlen <= l <= maxlen ) score = distance_weighted / all_weights if all_weights else 0 return score c.short_seq_align_10_13 = score_distance(10,13) c.short_seq_align_10_14 = score_distance(10,14) c.short_seq_align_10_15 = score_distance(10,15) c.short_seq_align_10_16 = score_distance(10,16) c.short_seq_align_10_17 = score_distance(10,17) c.short_seq_align_10_18 = score_distance(10,18) # c.short_seq_align_8_17 = score_distance(8,8) # c.short_seq_align_9_17 = score_distance(9,9) # c.short_seq_align_10_17 = score_distance(10,10) # c.short_seq_align_11_17 = score_distance(11,11) # c.short_seq_align_12_17 = score_distance(12,12) # c.short_seq_align_13_17 = score_distance(13,13) # c.short_seq_align_14_17 = score_distance(14,14) # c.short_seq_align_15_17 = score_distance(15,15) # c.short_seq_align_16_17 = score_distance(16,16) # c.short_seq_align_17_17 = score_distance(17,17) c.short_seq_align_8_17 = score_distance(8,17) c.short_seq_align_9_17 = score_distance(9,17) c.short_seq_align_10_17 = score_distance(10,17) c.short_seq_align_11_17 = score_distance(11,17) c.short_seq_align_12_17 = score_distance(12,17) c.short_seq_align_13_17 = score_distance(13,17) c.short_seq_align_14_17 = score_distance(14,17) c.short_seq_align_15_17 = score_distance(15,17) c.short_seq_align_16_17 = score_distance(16,17) c.short_seq_align_17_17 = score_distance(17,17) c.short_seq_align = score_distance(13,17)
def tailing_au_simple(candidates=None, sequences=None): # pickle.dump(candidates, open("candidates_test_only.p", "wb")) # pickle.dump(sequences, open("sequences_test_only.p", "wb")) # # assert False # # candidates = candidates[:50] # pickle.dump(candidates, open("candidates_test_only2.p", "wb")) # assert 0 # print "loading data ..." # candidates = pickle.load( open("candidates_test_only2.p", "rb")) # sequences = pickle.load( open("sequences_test_only.p", "rb")) # print "... loaded" tail_au = [] pre_au = [] tail_a_list = [] # pre_a = [] min_tailfree = 15 end_chars = {"A":0.0, "C":0.0, "G":0.0, "T":0.0} for s in sequences: n = s.nucleotides[-1] end_chars[n] += 1 print end_chars # assert 0 for sequence in sequences: seq = sequence.nucleotides count = sequence.duplicates if sequence.duplicates <= 1 else math.log(sequence.duplicates) tailfree, is_tail_end = _remove_trailing(seq) au_start, is_au_start = _remove_trailing_start(seq) if is_tail_end and len(tailfree) > min_tailfree: assert len(tailfree) != len(seq) tail_au.append((tailfree, seq, count)) if is_au_start and len(au_start) > min_tailfree: assert len(au_start) != len(seq) pre_au.append((au_start, seq, count)) tail_a, is_tail_a = _remove_trailing(seq, set("Aa")) if is_tail_a and tail_a > min_tailfree: tail_a_list.append((tail_a, seq, count)) print "long tails", len([1 for (au, seq, count) in tail_au if len(seq) - len(au) > 1]) print "long pre", len([1 for (au, seq, count) in pre_au if len(seq) - len(au) > 1]) print "found all tails and starts", len(tail_au), len(pre_au), "only a ends:", len(tail_a_list) # assert 0 for c in candidates: hairpin = c.hairpin_padded_40[20:-20] if c.chromosome_direction == "-": hairpin = reverse_compliment(hairpin) full_hits_tail = 0.0 full_hits_pre = 0.0 full_hits_bowtie = 0.0 au_tails = 0.0 au_starts = 0.0 hit_tailing = [] hit_pre = [] for notail, seq, count in tail_au: if len(notail) < 15: continue if notail in hairpin: if seq not in hairpin: pos = hairpin.find(notail) assert pos >= 0, (notail in hairpin, pos, len(seq), notail, seq[len(notail):], hairpin[pos:len(seq)+5]) # print "\t", notail in hairpin, pos, len(seq), notail, seq[len(notail):], hairpin[pos:len(seq)+5] au_tails += count hit_tailing.append((pos, pos+len(notail))) else: full_hits_tail += count for no_au_start, seq, count in pre_au: if no_au_start in hairpin: if seq not in hairpin: pos = hairpin.find(no_au_start) assert pos >= 0, (no_au_start in hairpin, pos, len(seq), no_au_start, seq[len(no_au_start):], hairpin[pos:len(seq)+5]) au_starts += count hit_pre.append( (pos, pos+len(no_au_start)) ) else: full_hits_pre += count # full_hits_bowtie = sum([float(s.data[1].split("-")[1])] for s in c.mapped_sequences) full_hits_missed = 0 for sequence in sequences: seq = sequence.nucleotides count = sequence.duplicates if sequence.duplicates <= 1 else math.log(sequence.duplicates) if seq in hairpin: full_hits_missed += count full_hits_bowtie = 0 for sequence in c.mapped_sequences: count = float(sequence.data[1].split("-")[1]) full_hits_bowtie += math.log(count) if count > 1 else count print c.chromosome_direction, len(hairpin) print au_tails, full_hits_tail, sorted(hit_tailing) print au_starts, full_hits_pre, sorted(hit_pre) print full_hits_bowtie, full_hits_missed print total_hits_tail = full_hits_bowtie + full_hits_missed + au_tails total_hits_tail = math.log(total_hits_tail) if total_hits_tail > 1 else total_hits_tail au_tails_score = math.log(au_tails) if au_tails > 1 else au_tails score_tailing = (au_tails_score + 1) / (total_hits_tail + 1) total_hits_tail = full_hits_bowtie + full_hits_missed + au_starts total_hits_tail = math.log(total_hits_tail) if total_hits_tail > 1 else total_hits_tail au_leading_score = math.log(au_starts) if au_starts > 1 else au_starts score_leading = (au_leading_score + 1) / (total_hits_tail + 1) c.tailing_au = score_tailing c.leading_au = score_leading