def align_miRNAs(mirna_hits, hairpinID_to_mature, hpID_to_mseqs, candidate_tree, candidate_list,
                 sequence_tree, seq_to_candidates, miRNA_species, miRNA_high_conf):
    '''
    aligns known miRNA hairpins against existing candidates and mapped sequences

    Each hairpin is handled once (first hit wins). If an existing candidate on the
    same strand covers roughly the same region, that candidate is tagged with the
    miRNA id. Otherwise a new structure.Candidate is built from the hairpin, using
    the known mature sequences and/or the best peak of overlapping reads.

    mirna_hits -- an iterable of records indexed as
        [0] miRNA id (">" is prepended before any lookup)
        [1] strand direction, "+" or "-"
        [2] "|"-separated reference name; field 3 is used as the chromosome
        [3] genome offset of the hairpin (converted with int())
        [4] hairpin sequence
    hairpinID_to_mature -- not used by this function
    hpID_to_mseqs -- maps ">"+id to a list of mature sequences
    candidate_tree -- maps chromosome to an interval tree of candidates
    candidate_list -- list; new candidates are appended to it
    sequence_tree -- maps chromosome to an interval tree of reads whose data is
        [strand, name, dna_sequence], name having the form "<number>-<count>"
    seq_to_candidates -- dict from read name to structure.Sequence, updated in place
    miRNA_species -- not used by this function
    miRNA_high_conf -- set of high confidence ids (looked up with the ">" prefix)

    returns a dict mapping chromosome + strand + str(start position) to the miRNA id

    All print calls take a single pre-formatted string so that the output is the
    same as the Python 2 print statements they replace.
    '''
    candidate_to_miRNAid = {}
    unique_mirnas = set()
    miRNA_with_candidates = set()
    has_seqs = set()

    for loki in mirna_hits:
        miRNAid = ">" + loki[0]
        if miRNAid in unique_mirnas:
            continue  # use first entry only
        unique_mirnas.add(miRNAid)

        strand_dir = loki[1]
        chromosome = loki[2].split("|")[3]
        genome_offset = int(loki[3])
        hairpin = loki[4]
        hairpin_stop = genome_offset + len(hairpin)

        # locate the known mature sequences inside the hairpin
        mature_seqs = hpID_to_mseqs.get(miRNAid, [])
        mature_seqs_fixed = []
        mature_pos = []
        for seq in mature_seqs:
            if strand_dir == "-":
                seq = reverse_compliment(seq)
            pos = hairpin.find(seq)
            if pos > -1:
                mature_pos.append((pos, pos + len(seq)))
            else:
                print("mature seq not mapping: %s %s" % (seq, mature_seqs))
                assert seq in hairpin
            mature_seqs_fixed.append(seq)

        mature_pos.sort()

        # sometimes the mature seqs overlap --> remove the last of them
        if len(mature_pos) > 1:
            if mature_pos[-2][1] > mature_pos[-1][0]:
                mature_pos.pop(-1)
            elif mature_pos[0][1] > mature_pos[1][0]:
                mature_pos.pop(1)
        assert len(mature_pos) <= 2

        # -1 means "position unknown" for every arm coordinate below
        begin_5p = -1
        end_5p = -1
        begin_3p = -1
        end_3p = -1

        if len(mature_pos) == 2:
            begin_5p = genome_offset + mature_pos[0][0]
            end_5p = genome_offset + mature_pos[0][1]
            begin_3p = genome_offset + mature_pos[1][0]
            end_3p = genome_offset + mature_pos[1][1]
        elif len(mature_pos) == 1:
            # a single mature seq is 5p if its midpoint is in the first half of the hairpin
            avg_val = (mature_pos[0][0] + mature_pos[0][1]) / 2.0
            if avg_val < len(hairpin) / 2.0:
                begin_5p = genome_offset + mature_pos[0][0]
                end_5p = genome_offset + mature_pos[0][1]
            else:
                begin_3p = genome_offset + mature_pos[0][0]
                end_3p = genome_offset + mature_pos[0][1]

        assert begin_5p >= genome_offset or begin_5p == -1
        assert begin_5p < end_5p or begin_5p == -1
        assert end_5p <= begin_3p or end_5p == -1 or begin_3p == -1
        assert begin_3p < end_3p or begin_3p == -1

        # NOTE(review): a hairpin on a chromosome without any candidate is skipped
        # completely, including the sequence stage below -- confirm this is intended.
        tree = candidate_tree[chromosome]
        if not tree:
            continue

        candidates = tree[genome_offset:hairpin_stop]
        if candidates:
            miRNA_with_candidates.add(miRNAid)

        is_candidate = False
        for candidate in candidates:
            data = candidate.data
            if data.chromosome_direction != strand_dir:
                continue

            hashval = data.chromosome + strand_dir + str(data.pos_5p_begin)
            # values are reported in the assertion message (as in align_dead_miRNAs)
            # instead of being printed for every overlapping candidate
            _fail_message = (data.hairpin_start, data.pos_5p_begin)
            assert data.pos_5p_begin == data.hairpin_start, _fail_message

            shift_start = abs(genome_offset - data.pos_5p_begin)
            shift_end = abs(hairpin_stop - data.pos_3p_end)

            # use candidate as miRNA (// keeps the original integer division)
            if shift_start + shift_end < len(hairpin) // 2:
                candidate_to_miRNAid[hashval] = miRNAid
                if data.miRNAid:  # don't want 2 equal
                    continue
                if data.candidate_type >= 1:
                    print("%s candidate type" % (data.candidate_type,))
                    print("%s miRNAid ??? %s" % (data.miRNAid, miRNAid))
                assert data.candidate_type < 1  # undecided or candidate

                if miRNAid in miRNA_high_conf:
                    data.candidate_type = structure.TYPE_HIGH_CONF
                else:
                    data.candidate_type = structure.TYPE_LOW_CONF
                data.miRNAid = miRNAid
                data.mirBase_matures = mature_seqs_fixed
                is_candidate = True
                break

        if is_candidate:
            continue

        # no candidates aligns the "miRNA": fall back to the reads overlapping the hairpin
        tree = sequence_tree[chromosome]
        sequences = set(s for s in tree[genome_offset:hairpin_stop] if s.data[0] == strand_dir)

        is_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1

        # the read peak is only consulted when at least one arm is still unknown
        if sequences and not is_both_matures:
            best_start_pos, _, best_end_pos, _ = interval_tree_misc.best_interval(sequences, genome_offset)
            avgpos = (best_start_pos + best_end_pos) / 2.0

            if avgpos < 0:
                pass
            elif avgpos < len(hairpin) / 2.0:  # peak is 5p
                old1 = begin_5p
                old2 = end_5p
                begin_5p = genome_offset + best_start_pos
                end_5p = genome_offset + best_end_pos
                if end_5p > begin_3p and begin_3p != -1:
                    print("manually changing overlapping (3p) %s %s %s %s"
                          % (begin_5p, end_5p, begin_3p, end_3p))
                    print("old seq: %s %s %s" % (old1, old2, genome_offset))
                    print("")
                    begin_3p = end_5p
            else:  # peak is 3p
                begin_3p = genome_offset + best_start_pos
                end_3p = genome_offset + best_end_pos
                if begin_3p < end_5p and end_5p != -1:
                    print("manually changing overlapping (5p) %s %s %s %s"
                          % (begin_5p, end_5p, begin_3p, end_3p))
                    end_5p = begin_3p

            has_seqs.add(miRNAid)

        if not (begin_5p + 10 >= genome_offset or begin_5p == -1):
            print("%s %s" % (begin_5p, genome_offset))
        assert begin_5p + 10 >= genome_offset or begin_5p == -1
        assert begin_5p < end_5p or begin_5p == -1
        if not (end_5p <= begin_3p or end_5p == -1 or begin_3p == -1):
            print("%s %s" % (end_5p, begin_3p))
        assert end_5p <= begin_3p or end_5p == -1 or begin_3p == -1
        assert begin_3p < end_3p or begin_3p == -1

        candidate = structure.Candidate(chromosome,
                                        strand_dir,
                                        genome_offset,
                                        hairpin_stop,
                                        begin_5p,
                                        end_5p,
                                        begin_3p,
                                        end_3p,
                                        sequences)
        candidate.mirBase_matures = mature_seqs_fixed
        candidate.hairpin = hairpin

        # was "candidate.set_candidate_type = c_type", which left the type unset;
        # every other site in this file assigns the candidate_type attribute
        if miRNAid in miRNA_high_conf:
            candidate.candidate_type = structure.TYPE_HIGH_CONF
        else:
            candidate.candidate_type = structure.TYPE_LOW_CONF

        # reads in the 30 positions before the 5p arm and after the 3p arm.
        # "tree" already is this chromosome's sequence tree, so it is sliced directly
        # (it was previously used as a key into sequence_tree).
        # NOTE(review): an unknown arm (-1) now gives no flanking reads -- confirm
        # that the hairpin bounds should not be used instead.
        intervals_before = tree[begin_5p - 30:begin_5p] if tree and begin_5p != -1 else []
        intervals_after = tree[end_3p:end_3p + 30] if tree and end_3p != -1 else []
        intervals_before = [s for s in intervals_before if s.data[0] == strand_dir]
        intervals_after = [s for s in intervals_after if s.data[0] == strand_dir]
        candidate.set_seq_outside(intervals_before, intervals_after)

        for candidate_interval in sequences:
            name = candidate_interval.data[1]
            # membership must be tested against seq_to_candidates (as in
            # find_candidates_2); testing against "sequences" was always true and
            # replaced Sequence objects that already had candidates attached
            if name not in seq_to_candidates:
                number_id = int(name.split("-")[0])
                duplicates = float(name.split("-")[1])
                s = structure.Sequence(number_id, duplicates, candidate_interval.data[2])
                s.add_candidate(candidate)
                seq_to_candidates[name] = s
            else:
                seq_to_candidates[name].add_candidate(candidate)

        candidate_list.append(candidate)
        hashval = chromosome + strand_dir + str(genome_offset)
        candidate_to_miRNAid[hashval] = miRNAid

    sec_cand = has_seqs | miRNA_with_candidates

    # every numerator counts a subset of its denominator, so falling back to 1.0
    # for an empty denominator reports 0.0 instead of raising ZeroDivisionError
    total = float(len(unique_mirnas)) or 1.0
    hc_len = float(len(miRNA_high_conf)) or 1.0

    # no space after the labels: the print statement adds none after a trailing tab
    print("")
    print("miRNA aligning sequences:\t\t%s" % (len(has_seqs) / total,))
    print("miRNA aligning candidates:\t\t%s" % (len(miRNA_with_candidates) / total,))
    print("miRNA aligning seq/candidate:\t\t%s" % (len(sec_cand) / total,))
    print("")
    print("HIGH CONFIDENCE -- sequences \t\t%s" % (len(miRNA_high_conf & has_seqs) / hc_len,))
    print("HIGH CONFIDENCE -- candidates \t\t%s" % (len(miRNA_high_conf & miRNA_with_candidates) / hc_len,))
    print("HIGH CONFIDENCE -- seq/candidates \t%s" % (len(miRNA_high_conf & sec_cand) / hc_len,))

    return candidate_to_miRNAid
def find_candidates_2(sequence_hits):
    '''
    finds microRNA candidates from bowtie data (using interval trees)

    sequence_hits -- a sized iterable of lists on bowtie output format:
        0          1    2                                  3           4
        ['1-15830', '-', 'gi|224589818|ref|NC_000006.11|', '72113295', 'AGCTTCCAGTCGAGGATGTTTACA',
         'IIIIIIIIIIIIIIIIIIIIIIII', '0']
        [0] is the read name "<number>-<duplicate count>", [1] the strand, field 3 of
        the "|"-separated [2] the chromosome, [3] the 0-indexed genome offset and
        [4] the read sequence. Fields 5 and 6 are not read here.

    returns (candidate_tree, sequence_tree, candidate_list, seq_to_candidates):
        candidate_tree -- GenomeIntervalTree holding only the candidates
        sequence_tree -- GenomeIntervalTree holding every read, with data
            [strand, name, dna_sequence]
        candidate_list -- list of the structure.Candidate objects created
        seq_to_candidates -- dict from read name to structure.Sequence

    All print calls take a single pre-formatted string so that the output is the
    same as the Python 2 print statements they replace.
    '''
    print("nr of sequences from bowtie:  %s" % (len(sequence_hits),))

    sequence_tree = GenomeIntervalTree()
    candidate_tree = GenomeIntervalTree()  # only candidates here
    candidate_list = []
    all_mapped_sequences = set()
    seq_to_candidates = {}

    f = 0  # printing stats: regions without a second peak
    a = 0  # printing stats: regions tested

    print("adding all intervals to the tree")
    for prop in sequence_hits:
        seq_name = prop[0]
        strand_dir = prop[1]  # forward: + backward: -
        genome_nr = prop[2].split("|")[3]  # which genome (and version)
        genome_offset = int(prop[3])  # offset into the genome, 0-indexed
        dna_sequence = prop[4]  # the dna_sequence matching this position.

        sequence_info = [strand_dir, seq_name, dna_sequence]
        sequence_tree.addi(genome_nr, genome_offset, genome_offset + len(dna_sequence), sequence_info)
    print("\tall intervals added to the tree")

    interval_sum = 0.0

    # test all intervals to find candidates
    for tree in sequence_tree:
        print(tree)
        chrom_tree = sequence_tree[tree]

        for interval in sorted(chrom_tree):
            if interval in all_mapped_sequences:
                continue

            strand_dir = interval.data[0]
            start_interval = interval.begin
            end_interval = start_interval + SEARCH_LEN

            # find a peak in this interval
            candidate_intervals = chrom_tree[start_interval:end_interval]
            if not candidate_intervals:
                continue

            # filter by direction
            candidate_intervals = [s for s in candidate_intervals if s.data[0] == strand_dir]
            if len(candidate_intervals) <= 1:
                continue

            # search for more sequences close to this one
            max_end = max(candidate_intervals, key=lambda x: x.end).end
            candidate_intervals = set(candidate_intervals)

            while max_end + MAX_HAIRPIN_LOOP + MAX_MATURE_SEQ > end_interval:
                # extend search area
                new_seqs = chrom_tree[end_interval:end_interval + SEARCH_LEN]
                new_seqs = [s for s in new_seqs if s.data[0] == strand_dir]
                if len(new_seqs) == 0:
                    break
                candidate_intervals.update(new_seqs)
                end_interval += SEARCH_LEN
                max_end = max(new_seqs, key=lambda x: x.end).end

            all_mapped_sequences.update(candidate_intervals)  # do not use these next iteration
            candidate_intervals = sorted(candidate_intervals)  # first interval is picked if several equal.

            # the counter limits this loop to a single pass per region
            i = 0
            while candidate_intervals:
                i += 1
                if i > 1:
                    break

                # finding the best interval (highest peaks); positions are relative
                # to start_interval and -1 marks a missing value
                start_peak, start_peak_val, end_peak, end_peak_val = best_interval(candidate_intervals,
                                                                                   start_interval)

                # no intervals at all
                if start_peak_val == -1 or end_peak_val == -1 or start_peak == -1 or end_peak == -1:
                    break

                # finding interval close before best
                start_before_limit = max(-3, start_peak - MAX_HAIRPIN_LOOP - MAX_MATURE_SEQ)
                stop_before_limit = max(-3, start_peak - MIN_HAIRPIN_LOOP)
                five_intervals = filter_intervals(candidate_intervals, start_interval,
                                                  start_before_limit, stop_before_limit)
                start_before, start_before_val, stop_before, stop_before_val = best_interval(five_intervals,
                                                                                             start_interval)

                # interval close after best
                start_after_limit = end_peak + MIN_HAIRPIN_LOOP
                stop_after_limit = end_peak + MAX_HAIRPIN_LOOP + MAX_MATURE_SEQ
                three_intervals = filter_intervals(candidate_intervals, start_interval,
                                                   start_after_limit, stop_after_limit)
                start_after, start_after_val, stop_after, stop_after_val = best_interval(three_intervals,
                                                                                         start_interval)

                no_peak_after = start_after == -1 or stop_after == -1
                no_peak_after = no_peak_after or start_after_val == -1 or stop_after_val == -1
                no_peak_before = start_before == -1 or stop_before == -1
                no_peak_before = no_peak_before or start_before_val == -1 or stop_before_val == -1

                p1 = p2 = p3 = p4 = -1
                a += 1

                if no_peak_after and no_peak_before:
                    f += 1
                    break

                # best peak is 5p
                elif no_peak_before or start_after_val + stop_after_val > start_before_val + stop_before_val:
                    begin_5p = start_peak + start_interval  # best peak
                    end_5p = end_peak + start_interval
                    begin_3p = start_after + start_interval  # peak after is second best
                    end_3p = stop_after + start_interval

                    p1 = start_peak_val  # peak value, used for testing
                    p2 = end_peak_val
                    p3 = start_after_val
                    p4 = stop_after_val

                    assert begin_5p < end_5p < begin_3p < end_3p
                    assert start_after_val != -1
                    assert stop_after_val != -1
                    assert start_after_val + stop_after_val > start_before_val + stop_before_val

                # best peak is 3p
                else:
                    begin_5p = start_before + start_interval  # peak before
                    end_5p = stop_before + start_interval
                    begin_3p = start_peak + start_interval  # best peak
                    end_3p = end_peak + start_interval

                    p1 = start_before_val
                    p2 = stop_before_val
                    p3 = start_peak_val
                    p4 = end_peak_val

                    assert begin_5p < end_5p < begin_3p < end_3p

                chromosome = tree
                assert end_3p - begin_5p <= MAX_CANDIDATE_LEN

                # close intervals are the intervals overlapping the candidate
                close_intervals = set()
                for c in candidate_intervals:
                    if begin_5p < c.begin < end_3p or begin_5p < c.end < end_3p:
                        close_intervals.add(c)

                candidate_intervals = list(set(candidate_intervals) - close_intervals)

                hairpin_start = begin_5p
                hairpin_end = end_3p

                candidate = structure.Candidate(chromosome,
                                                strand_dir,
                                                hairpin_start,  # used as gene offset. sometimes 5p mature seq. is missing...
                                                hairpin_end,
                                                begin_5p,
                                                end_5p,
                                                begin_3p,
                                                end_3p,
                                                close_intervals)
                candidate.candidate_type = structure.TYPE_CANDIDATE

                # reads in the 30 positions on either side of the candidate
                intervals_before = chrom_tree[begin_5p - 30:begin_5p] if chrom_tree else []
                intervals_after = chrom_tree[end_3p:end_3p + 30] if chrom_tree else []
                intervals_before = [s for s in intervals_before if s.data[0] == strand_dir]
                intervals_after = [s for s in intervals_after if s.data[0] == strand_dir]
                candidate.set_seq_outside(intervals_before, intervals_after)

                candidate.peak_5b = p1
                candidate.peak_5e = p2
                candidate.peak_3b = p3
                candidate.peak_3e = p4

                assert candidate.pos_5p_begin < candidate.pos_5p_end < candidate.pos_3p_begin < candidate.pos_3p_end

                for candidate_interval in close_intervals:
                    name = candidate_interval.data[1]
                    if name not in seq_to_candidates:
                        number_id = int(name.split("-")[0])
                        duplicates = float(name.split("-")[1])
                        interval_sum += duplicates
                        s = structure.Sequence(number_id, duplicates, candidate_interval.data[2])
                        s.add_candidate(candidate)
                        seq_to_candidates[name] = s
                    else:
                        seq_to_candidates[name].add_candidate(candidate)

                candidate_tree[tree][begin_5p:end_3p] = candidate
                candidate_list.append(candidate)

    # The former "if len(all_mapped_sequences) == 0: ... = candidate.all_mapped_sequences"
    # fallback was dropped: the set is never read after this point, and it can only
    # be empty when no candidate was created, in which case "candidate" is unbound.

    # f <= a, so a denominator of 1 for an empty run reports 0.0 instead of
    # raising ZeroDivisionError
    tests = a or 1
    print("find candidates 2.0")
    print("candidates: %s" % (a - f,))
    print("tests: %s %s" % (a, (a - f) * 1.0 / tests))
    print("fail: %s %s" % (f, f * 1.0 / tests))
    print("sum interval in candidates: %s" % (interval_sum,))

    return candidate_tree, sequence_tree, candidate_list, seq_to_candidates
def align_dead_miRNAs(mirna_hits, _, id_to_mature, candidate_tree, candidate_list, sequence_tree, seq_to_candidates):
    '''
    aligns dead miRNA hairpins against existing candidates and mapped sequences

    Each hairpin is handled once (first hit wins). If an existing candidate on the
    same strand covers roughly the same region, its key is recorded for the dead
    miRNA. Otherwise, when the chromosome has mapped reads, a new
    structure.Candidate of type structure.TYPE_DEAD is built from the hairpin.

    mirna_hits -- an iterable of records indexed as
        [0] miRNA id (used as is)
        [1] strand direction, "+" or "-"
        [2] "|"-separated reference name; field 3 is used as the chromosome
        [3] genome offset of the hairpin (converted with int())
        [4] hairpin sequence
    _ -- not used by this function
    id_to_mature -- maps miRNA id to a single mature sequence
    candidate_tree -- maps chromosome to an interval tree of candidates
    candidate_list -- list; new candidates are appended to it
    sequence_tree -- maps chromosome to an interval tree of reads whose data is
        [strand, name, dna_sequence], name having the form "<number>-<count>"
    seq_to_candidates -- dict from read name to structure.Sequence, updated in place

    returns a dict mapping chromosome + strand + str(start position) to the miRNA id

    All print calls take a single pre-formatted string so that the output is the
    same as the Python 2 print statements they replace.
    '''
    print("\naligning dead miRNA")

    candidate_to_dead = {}
    unique_mirnas = set()
    candidated = set()
    seqd = set()
    both_matures = 0

    for dead_loki in mirna_hits:
        miRNAid = dead_loki[0]
        if miRNAid in unique_mirnas:
            continue  # use first entry only
        unique_mirnas.add(miRNAid)

        strand_dir = dead_loki[1]
        chromosome = dead_loki[2].split("|")[3]
        genome_offset = int(dead_loki[3])
        hairpin = dead_loki[4]
        hairpin_stop = genome_offset + len(hairpin)

        is_candidate = False

        # -1 means "position unknown" for every arm coordinate below
        begin_5p = -1
        end_5p = -1
        begin_3p = -1
        end_3p = -1

        # put mature seq into 5p or 3p
        if miRNAid in id_to_mature:
            mature_seq = id_to_mature[miRNAid]
            if strand_dir == "-":
                mature_seq = reverse_compliment(mature_seq)

            begin_mature = hairpin.find(mature_seq)
            # str.find gives -1 when the mature seq is not part of the hairpin; the
            # arms are then left unknown instead of being derived from that -1
            if begin_mature != -1:
                end_mature = begin_mature + len(mature_seq)
                avg_val = (begin_mature + end_mature) / 2.0
                if avg_val < len(hairpin) / 2.0:
                    begin_5p = genome_offset + begin_mature
                    end_5p = genome_offset + end_mature
                else:
                    begin_3p = genome_offset + begin_mature
                    end_3p = genome_offset + end_mature

        tree = candidate_tree[chromosome]
        if tree:
            candidates = tree[genome_offset:hairpin_stop]
            for candidate in candidates:
                data = candidate.data
                if data.chromosome_direction != strand_dir:
                    continue

                hashval = data.chromosome + strand_dir + str(data.pos_5p_begin)
                _fail_message = (data.hairpin_start, data.pos_5p_begin)
                assert data.pos_5p_begin == data.hairpin_start, _fail_message

                shift_start = abs(genome_offset - data.pos_5p_begin)
                shift_end = abs(hairpin_stop - data.pos_3p_end)

                # // keeps the original integer division
                if shift_start + shift_end < len(hairpin) // 2:
                    candidate_to_dead[hashval] = miRNAid
                    is_candidate = True
                    candidated.add(miRNAid)
                    break
        else:
            print("no %s" % (tree,))

        has_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1
        assert not has_both_matures

        if is_candidate:
            continue

        tree = sequence_tree[chromosome]
        if not tree:
            continue

        sequences = set(s for s in tree[genome_offset:hairpin_stop] if s.data[0] == strand_dir)

        if sequences:
            best_start_pos, _start_val, best_end_pos, _end_val = interval_tree_misc.best_interval(sequences,
                                                                                                   genome_offset)
            avgpos = (best_start_pos + best_end_pos) / 2.0
            halfsize = len(hairpin) / 2.0

            if avgpos <= halfsize:  # peak is 5p
                begin_5p = genome_offset + best_start_pos
                end_5p = genome_offset + best_end_pos
            else:  # peak is 3p
                begin_3p = genome_offset + best_start_pos
                end_3p = genome_offset + best_end_pos

        candidate = structure.Candidate(chromosome,
                                        strand_dir,
                                        genome_offset,
                                        hairpin_stop,
                                        begin_5p,
                                        end_5p,
                                        begin_3p,
                                        end_3p,
                                        sequences)
        candidate.hairpin = hairpin

        # reads in the 30 positions before the 5p arm and after the 3p arm.
        # "tree" already is this chromosome's sequence tree, so it is sliced directly
        # (it was previously used as a key into sequence_tree).
        # NOTE(review): an unknown arm (-1) now gives no flanking reads -- confirm
        # that the hairpin bounds should not be used instead.
        intervals_before = tree[begin_5p - 30:begin_5p] if begin_5p != -1 else []
        intervals_after = tree[end_3p:end_3p + 30] if end_3p != -1 else []
        intervals_before = [s for s in intervals_before if s.data[0] == strand_dir]
        intervals_after = [s for s in intervals_after if s.data[0] == strand_dir]
        candidate.set_seq_outside(intervals_before, intervals_after)

        assert candidate.candidate_type != structure.TYPE_HIGH_CONF
        assert candidate.candidate_type != structure.TYPE_LOW_CONF
        candidate.candidate_type = structure.TYPE_DEAD

        if sequences:
            seqd.add(miRNAid)

        for candidate_interval in sequences:
            name = candidate_interval.data[1]
            # membership must be tested against seq_to_candidates (as in
            # find_candidates_2); testing against "sequences" was always true and
            # replaced Sequence objects that already had candidates attached
            if name not in seq_to_candidates:
                number_id = int(name.split("-")[0])
                duplicates = float(name.split("-")[1])
                s = structure.Sequence(number_id, duplicates, candidate_interval.data[2])
                s.add_candidate(candidate)
                seq_to_candidates[name] = s
            else:
                seq_to_candidates[name].add_candidate(candidate)

        assert candidate.pos_5p_begin < candidate.pos_5p_end or candidate.pos_5p_begin == -1
        assert candidate.pos_3p_begin < candidate.pos_3p_end or candidate.pos_3p_begin == -1

        if begin_5p != -1 and end_3p != -1:
            both_matures += 1

        candidate_list.append(candidate)
        hashval = chromosome + strand_dir + str(genome_offset)
        candidate_to_dead[hashval] = miRNAid

    # every count below is zero when no hairpin was seen, so a denominator of 1.0
    # reports 0.0 instead of raising ZeroDivisionError
    total = float(len(unique_mirnas)) or 1.0
    either = candidated | seqd

    # no space after the tab-terminated labels: the print statement adds none there
    print(len(unique_mirnas))
    print("\ndead stats:")
    print("has both seqs: %s %s" % (both_matures, (both_matures + len(candidated)) / total))
    print("aligning with candidates:\t%s %s" % (len(candidated), len(candidated) / total))
    print("aligning with seqs:\t%s %s" % (len(seqd), len(seqd) / total))
    print("aligning with either:\t%s %s" % (len(either), len(either) / total))

    return candidate_to_dead