def align_small_seqs(candidates, small_seqs, small_seqs_copies):
    
    print "\n\tSmall seq analysis"
    print "\tcandidates:", len(candidates), candidates[0].hairpin
    print "\tsmall seqs:", len(small_seqs), len(small_seqs_copies)


    max_frequent_seq = 0
    c_used = set()
    s_used = set()

    # adding all canididate seqs to a substring dict
    # reversing the hairpin if it is on negative strand
    
    find_candidates = SubstringDict()

    for i, candidate in enumerate(candidates):
        
        hairpin_part = candidate.hairpin_padded_40[20:-20]
        if candidate.chromosome_direction == "-":
            hairpin_part = reverse_compliment(hairpin_part)
        find_candidates[hairpin_part] = i

    
    # for each small seq. find the set of matching candidates
    for j, seq in enumerate(small_seqs):
        
        
        candidate_set = find_candidates[seq]
        
        if len(candidate_set) > max_frequent_seq:
            max_frequent_seq = len(candidate_set)
            
        # add the seq. to that candidate
        for cnr in candidate_set:
            copies = small_seqs_copies[j]
            
            if not candidates[cnr].small_subs:
                candidates[cnr].small_subs = {}
                
            candidates[cnr].small_subs[seq] = copies
            
            c_used.add(cnr)
            s_used.add(j)
        
    print "\tmost frequent small hit", max_frequent_seq, "times"
    print "\tsmall seq hits:", len(s_used)
    print "\tcandidates with small seqs:",len(c_used)
def align_miRNAs(mirna_hits, hairpinID_to_mature, hpID_to_mseqs, candidate_tree, candidate_list, sequence_tree,
                 seq_to_candidates, miRNA_species, miRNA_high_conf):

    candidate_to_miRNAid = {}
    noseq_set = set()
    unique_mirnas = set()
    candidate_already = 0

    candidate_count = 0
    miRNA_with_candidates = set()
    has_seqs = []
    noseqs = 0
    
    
    for loki in mirna_hits:
        
        miRNAid = ">" + loki[0]
        
        if miRNAid in unique_mirnas:
            continue # use first entry only
        unique_mirnas.add(miRNAid)

        strand_dir = loki[1]
        chromosome = loki[2].split("|")[3]
        genome_offset = int(loki[3])
        hairpin = loki[4]

        mature_seqs = hpID_to_mseqs[miRNAid] if miRNAid in hpID_to_mseqs else []
        
        mature_seqs_fixed = []
        mature_pos = []
        for seq in mature_seqs:
            if strand_dir == "-":
                seq = reverse_compliment(seq)
            pos = hairpin.find(seq)
            if pos > -1:
                mature_pos.append((pos,pos+len(seq)))
            else:
                print "mature seq not mapping:", seq, mature_seqs
            assert seq in hairpin
            mature_seqs_fixed.append(seq)
                    
        
        if mature_pos:
            mature_pos = sorted(mature_pos)
            
        # sometimes the mature seqs overlap --> remove the last of them
        if len(mature_pos) > 1:
            
            if mature_pos[-2][1] > mature_pos[-1][0]:
                mature_pos.pop(-1)
                
            elif mature_pos[0][1] > mature_pos[1][0]:                
                mature_pos.pop(1)


#         print _OLDmature_pos, mature_pos
        assert len(mature_pos) <= 2
            
        begin_5p = -1
        end_5p = -1
        begin_3p = -1
        end_3p = -1
        
#         begin_5p = genome_offset + 0
#         end_5p =  genome_offset + mature_len
#         begin_3p =  genome_offset + len(hairpin) - mature_len
#         end_3p = genome_offset + len(hairpin)
        
        if len(mature_pos) == 2:
            begin_5p = genome_offset + mature_pos[0][0]
            end_5p =  genome_offset + mature_pos[0][1]
            begin_3p =  genome_offset + mature_pos[1][0]
            end_3p = genome_offset + mature_pos[1][1]
            
        elif len(mature_pos) == 1:
            
            avg_val = (mature_pos[0][0] + mature_pos[0][1] ) / 2.0
            
            if avg_val < len(hairpin) / 2.0:
                begin_5p = genome_offset + mature_pos[0][0]
                end_5p = genome_offset + mature_pos[0][1]
            else:
                begin_3p = genome_offset + mature_pos[0][0]
                end_3p = genome_offset + mature_pos[0][1]
            
        
#         print begin_5p, end_5p, begin_3p, end_3p
        assert begin_5p >= genome_offset or begin_5p == -1
        assert begin_5p < end_5p or begin_5p == -1
        assert end_5p  <= begin_3p or end_5p == -1 or begin_3p == -1
        assert begin_3p < end_3p or begin_3p == -1

        
        is_candidate = False
        
        tree = candidate_tree[chromosome]
        if not tree:
            continue
        
        candidates = tree[genome_offset:genome_offset+len(hairpin)]

        if candidates:
            miRNA_with_candidates.add(miRNAid)
            candidate_already += 1
            
            for candidate in candidates:
                
                if candidate.data.chromosome_direction != strand_dir:
                    continue

                hashval = candidate.data.chromosome + strand_dir + str(candidate.data.pos_5p_begin)
                
                print candidate.data.hairpin_start
                print candidate.data.pos_5p_begin
                assert candidate.data.pos_5p_begin == candidate.data.hairpin_start
                
                candidate_count += 1

                shift_start = abs(genome_offset - candidate.data.pos_5p_begin)
                shift_end = abs( (genome_offset+len(hairpin)) - candidate.data.pos_3p_end)
                
                
                # use candidate as miRNA
                if shift_start + shift_end < len(hairpin) / 2:
                    
                    candidate_to_miRNAid[hashval] = miRNAid
                    
                    if candidate.data.miRNAid: # don't want 2 equal 
                        continue
                    
                    if candidate.data.candidate_type >= 1:
                        print candidate.data.candidate_type, "candidate type"
                        print candidate.data.miRNAid, "miRNAid ???", miRNAid
                    assert candidate.data.candidate_type < 1 # undecided or candidate
                    

                    
                    c_type = structure.TYPE_HIGH_CONF if miRNAid in miRNA_high_conf else structure.TYPE_LOW_CONF
                    candidate.data.candidate_type = c_type
                    candidate.data.miRNAid = miRNAid
                    candidate.data.mirBase_matures = mature_seqs_fixed
                    is_candidate = True
                    
                    break


        if is_candidate:
            continue

        
#         no candidates aligns the "miRNA"
        tree = sequence_tree[chromosome]
        sequences = tree[genome_offset:genome_offset+len(hairpin)]
        sequences = [s for s in sequences if s.data[0] == strand_dir]
#         sequences = [s for s in sequences if s.begin >= genome_offset and s.end <= genome_offset+len(hairpin)]
        sequences = set(sequences)
        
        is_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1
        
        if sequences and not is_both_matures:
            best_start_pos, _, best_end_pos, _ = interval_tree_misc.best_interval(sequences, genome_offset)
            # TODO skip if have both mature seqs
#             print
#             print sequences
#             print "seqs in hp: ", strand_dir, genome_offset, genome_offset+len(hairpin), hairpin
#             for s in sorted(sequences):
#                 print s.begin, s.end, s.data[2], s.data[2] in hairpin, s.data[0]
            
            
            avgpos = (best_start_pos + best_end_pos) / 2.0
            
            if avgpos < 0:
                pass
            elif avgpos < len(hairpin) / 2.0 :
                # peak is 5p
                old1 = begin_5p
                old2 = end_5p
                begin_5p = genome_offset + best_start_pos
                end_5p = genome_offset + best_end_pos
                if end_5p > begin_3p and begin_3p != -1:
                    print "manually changing overlapping (3p)", begin_5p, end_5p, begin_3p, end_3p
                    print "old seq:", old1, old2, genome_offset
                    print 
                    begin_3p = end_5p
            else:
                # peak is 3p
                begin_3p = genome_offset + best_start_pos
                end_3p = genome_offset + best_end_pos
                if begin_3p < end_5p and end_5p != -1:
                    print "manually changing overlapping (5p)", begin_5p, end_5p, begin_3p, end_3p
                    end_5p = begin_3p

            
            has_seqs.append(miRNAid)

            
        else:
#             no sequences at all
            noseqs += 1
            noseq_set.add(miRNAid)
            pass
        if not ( begin_5p + 10 >= genome_offset or begin_5p == -1):
            print begin_5p, genome_offset
            assert begin_5p + 10 >= genome_offset or begin_5p == -1 
        assert begin_5p < end_5p or begin_5p == -1
        
        if not (end_5p  <= begin_3p or end_5p == -1 or begin_3p == -1):
            print end_5p, begin_3p
            assert end_5p  <= begin_3p or end_5p == -1 or begin_3p == -1
        assert begin_3p < end_3p or begin_3p == -1
        
        candidate = structure.Candidate(chromosome,
                         strand_dir,
                         genome_offset,
                         genome_offset+len(hairpin),                         
                         begin_5p,
                         end_5p,
                         begin_3p,
                         end_3p,
                         sequences)
        
        candidate.mirBase_matures = mature_seqs_fixed
        
        candidate.hairpin = hairpin
        c_type = structure.TYPE_HIGH_CONF if miRNAid in miRNA_high_conf else structure.TYPE_LOW_CONF
        candidate.set_candidate_type = c_type
        
        intervals_before = sequence_tree[tree][begin_5p-30:begin_5p] if sequence_tree[tree] else []
        intervals_after = sequence_tree[tree][end_3p:end_3p+30] if sequence_tree[tree] else []

        intervals_before = [s for s in intervals_before if s.data[0] == strand_dir]
        intervals_after = [s for s in intervals_after if s.data[0] == strand_dir]
        
        candidate.set_seq_outside(intervals_before, intervals_after)
        

        for candidate_interval in sequences:
            name = candidate_interval.data[1]
            if name not in sequences:
                number_id = int(name.split("-")[0])
                duplicates = float(name.split("-")[1])

                s = structure.Sequence(number_id, duplicates, candidate_interval.data[2])
                s.add_candidate(candidate)
                seq_to_candidates[name] = s
            else:
                seq_to_candidates[name].add_candidate(candidate)
        
#         if end_3p - begin_5p > 200:
#             print "\t200+ length", begin_5p, end_5p, begin_3p, end_3p
#             assert False
#         assert candidate.pos_5p_begin < candidate.pos_5p_end <= candidate.pos_3p_begin < candidate.pos_3p_end
        
        candidate_list.append(candidate)
        hashval = chromosome + strand_dir + str(genome_offset)
        
        candidate_to_miRNAid[hashval] = miRNAid
    
    has_seqs = set(has_seqs)
    
    hc_len = len( miRNA_high_conf)
    sec_cand = has_seqs | miRNA_with_candidates
    
    print
    print "miRNA aligning sequences:\t\t", len(has_seqs) * 1.0 / len(unique_mirnas)
    print "miRNA aligning candidates:\t\t", len(miRNA_with_candidates) * 1.0 / len(unique_mirnas)    
    print "miRNA aligning seq/candidate:\t\t", len(sec_cand) * 1.0 / len(unique_mirnas)
    print
    print "HIGH CONFIDENCE -- sequences \t\t", len(miRNA_high_conf & has_seqs) * 1.0 / hc_len
    print "HIGH CONFIDENCE -- candidates \t\t", len(miRNA_high_conf & miRNA_with_candidates) * 1.0 / hc_len
    print "HIGH CONFIDENCE -- seq/candidates \t", len(miRNA_high_conf & sec_cand) * 1.0 / hc_len
    
#     print "no sequences aligning at all:\t",  len(noseq_set)
#     print "no seqs vs high confidence:\t", len(noseq_set.intersection(miRNA_high_conf) ), len(miRNA_high_conf)
    
    return candidate_to_miRNAid
def align_dead_miRNAs(mirna_hits, _,  id_to_mature, candidate_tree, candidate_list, sequence_tree,
                 seq_to_candidates):
    
    print "\naligning dead miRNA"
    candidate_to_dead = {}
    
    unique_mirnas = set()
    candidated = set()
    seqd = set()
    
    both_matures = 0
    
    for dead_loki in mirna_hits:
        
        miRNAid = dead_loki[0]
        if miRNAid in unique_mirnas:
            continue # use first entry only
        
#         print "\n\t new dead...",
        unique_mirnas.add(miRNAid)

        strand_dir = dead_loki[1]
        chromosome = dead_loki[2].split("|")[3]
        genome_offset = int(dead_loki[3])
        
        hairpin = dead_loki[4]
        
        is_candidate = False
        
        begin_5p =  -1
        end_5p = -1
        begin_3p = -1
        end_3p = -1
        
#         print
#         print miRNAid, begin_5p, begin_5p + len(hairpin)
        
        # put mature seq into 5p or 3p
        if miRNAid in id_to_mature:
            mature_seq = id_to_mature[miRNAid]
            if strand_dir == "-":
                mature_seq = reverse_compliment(mature_seq)
            
#             print "\t", strand_dir
#             print "\t", mature_seq in hairpin
#             print "\t", mature_seq
#             print "\t", hairpin

            begin_mature = hairpin.find(mature_seq)
            end_mature = begin_mature + len(mature_seq)
            
            avg_val = (begin_mature + end_mature ) / 2.0
            
            if avg_val < len(hairpin) / 2.0:
                begin_5p = genome_offset + begin_mature
                end_5p = genome_offset + end_mature
            else:
                begin_3p = genome_offset + begin_mature
                end_3p =genome_offset + end_mature
            
#             print "\t", begin_mature, end_mature, len(hairpin)
#             print "\t", begin_5p, end_5p, begin_3p, end_3p
        
#         print begin_5p, end_5p, begin_3p, end_3p, "\t", len(hairpin)
        
        
            
        tree = candidate_tree[chromosome]
        if tree:
            
            candidates = tree[genome_offset:genome_offset+len(hairpin)]
            
#             print len(candidates), candidates
            
            for candidate in candidates:
                
                if candidate.data.chromosome_direction != strand_dir:
                    continue
                
                hashval = candidate.data.chromosome + strand_dir + str(candidate.data.pos_5p_begin)
                
                _fail_message = (candidate.data.hairpin_start, candidate.data.pos_5p_begin)
                assert candidate.data.pos_5p_begin == candidate.data.hairpin_start, _fail_message
                
                shift_start = abs(genome_offset - candidate.data.pos_5p_begin)
                shift_end = abs( (genome_offset+len(hairpin)) - candidate.data.pos_3p_end)
                
#                 print shift_start, shift_end, len(hairpin)
                
                if shift_start + shift_end < len(hairpin) / 2:
                    candidate_to_dead[hashval] = miRNAid
                    is_candidate = True
                    
#                     assert candidate.candidate_type != structure.TYPE_HIGH_CONF
#                     assert candidate.candidate_type != structure.TYPE_LOW_CONF
#                     candidate.candidate_type = structure.TYPE_DEAD
                    
                    candidated.add(miRNAid)
                    break
    
        else:
            print "no", tree
        
        has_both_matures = begin_5p != -1 and end_5p != -1 and begin_3p != -1 and end_3p != -1
        assert not has_both_matures
        
        has_3p = begin_3p != -1 and end_3p != -1
        has_5p = begin_5p != -1 and end_5p != -1
        
        if not is_candidate: # and (has_5p or has_3p) ????  
            
            
#             assert begin_5p != -1 or begin_3p != -1
#             assert end_5p != -1 or end_3p != -1
            
            tree = sequence_tree[chromosome]
            
            if not tree:
                continue
            
            sequences = tree[genome_offset:genome_offset+len(hairpin)]
            sequences = [s for s in sequences if s.data[0] == strand_dir]
#             sequences = [s for s in sequences if s.begin >= genome_offset and s.end <= genome_offset+len(hairpin)]
            sequences = set(sequences)
            if sequences:


                best_start_pos, _, best_end_pos, _ = interval_tree_misc.best_interval(sequences, genome_offset)
                
                avgpos = (best_start_pos + best_end_pos) / 2.0
                halfsize = len(hairpin) / 2.0 
                
#                 offset = begin_5p
#                 print "--"
#                 print avgpos, halfsize, halfsize_derp
#                 print begin_5p, end_5p, begin_3p, end_3p
                
                if avgpos <= halfsize:
                    # peak is 5p
#                     print "<-"
                    begin_5p = genome_offset + best_start_pos
                    end_5p = genome_offset + best_end_pos
#                     if end_5p > begin_3p:
#                         begin_3p = end_5p
                else:
                    # peak is 3p
                    begin_3p = genome_offset + best_start_pos
                    end_3p = genome_offset + best_end_pos
#                     if begin_3p < end_5p:
#                         end_5p = begin_3p
            
        
            candidate = structure.Candidate(chromosome,
                     strand_dir,
                     genome_offset,
                     genome_offset+len(hairpin),
                     begin_5p,
                     end_5p,
                     begin_3p,
                     end_3p,
                     sequences)
            
            candidate.hairpin = hairpin
            
            intervals_before = sequence_tree[tree][begin_5p-30:begin_5p] if sequence_tree[tree] else []
            intervals_after = sequence_tree[tree][end_3p:end_3p+30] if sequence_tree[tree] else []

            intervals_before = [s for s in intervals_before if s.data[0] == strand_dir]
            intervals_after = [s for s in intervals_after if s.data[0] == strand_dir]
            
            candidate.set_seq_outside(intervals_before, intervals_after)
            
            assert candidate.candidate_type != structure.TYPE_HIGH_CONF
            assert candidate.candidate_type != structure.TYPE_LOW_CONF
            candidate.candidate_type = structure.TYPE_DEAD
            
            if sequences:
                seqd.add(miRNAid)
            
            
            for candidate_interval in sequences:
                name = candidate_interval.data[1]
                if name not in sequences:
                    number_id = int(name.split("-")[0])
                    duplicates = float(name.split("-")[1])
    
                    s = structure.Sequence(number_id, duplicates, candidate_interval.data[2])
                    s.add_candidate(candidate)
                    seq_to_candidates[name] = s
                else:
                    seq_to_candidates[name].add_candidate(candidate)

                    
            assert candidate.pos_5p_begin < candidate.pos_5p_end or candidate.pos_5p_begin == -1
            assert candidate.pos_3p_begin < candidate.pos_3p_end or candidate.pos_3p_begin == -1
            
            if begin_5p != -1 and end_3p != -1:
                both_matures += 1
#             if candidate.pos_5p_end >= candidate.pos_3p_begin + 10:
#                 print candidate.pos_5p_end, candidate.pos_3p_begin
#                 assert candidate.pos_5p_end < candidate.pos_3p_begin + 10

            
    #         assert candidate.pos_5p_begin < candidate.pos_5p_end <= candidate.pos_3p_begin < candidate.pos_3p_end
            
            candidate_list.append(candidate)
            hashval = chromosome + strand_dir + str(genome_offset)
            
            candidate_to_dead[hashval] = miRNAid
#             print "123"
        

    print len(unique_mirnas)
    
    print "\ndead stats:"
    print "has both seqs:", both_matures, (both_matures + len(candidated))*1.0 / len(unique_mirnas) 
    print "aligning with candidates:\t", len(candidated), len(candidated)*1.0 / len(unique_mirnas) 
    print "aligning with seqs:\t", len(seqd), len(seqd)*1.0 / len(unique_mirnas) 
    print "aligning with either:\t", len(candidated | seqd), len(candidated | seqd)*1.0 / len(unique_mirnas) 
    
#     assert False
    return candidate_to_dead
def small_seq_stats(candidates, sc=0, na=0, c_to_m=0):
    '''
    compares the amount of small sequences vs long sequences in 3p and 5p positions
    uses log values of values larger than 1 RPM in each sum
    adds 1.0 to each sum to avoid division against 0-scores
    stores the log score, as some sums are still very large
    '''
    
#     na = [x for li in na for x in li] # unpack folds
#     sc = zip(*sc)
#     print len(na), na[0], len(set(na))
#     print len(sc), sc[0]
#     
#     names_to_scores = {n:s for n,s in zip(na,sc)}
#     
#     print na[0]
#     print sc[0]
#     print names_to_scores[na[0]]
#     
#     
#     print len(names_to_scores)
    
    
#     def get_name_scores(c):
#         hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
#         
#         mi_name = c_to_m[hashval]
#         scores = names_to_scores[mi_name] if mi_name in names_to_scores else ""
#         return mi_name, scores
    
#     def _is_miRNA(c):
#         hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
#         return hashval in candidate_to_miRNA
    
    def log_over_one(val):
        return math.log(val) if val > 1.0 else val
     
    def log_sum(list_to_sum):
        log_list = [log_over_one(x) for x in list_to_sum]
        list_sum = sum(log_list)
        return list_sum
    
    print "candidates:", len(candidates)
    has_small_seqs = [c for c in candidates if c.small_subs]
    print "small seqs:", len(has_small_seqs)
    
    padding = 20
    
    for c in has_small_seqs:    
        
        hairpin = c.hairpin_padded_40[padding:-padding]
        folding = c.hairpin_fold_40[padding:-padding]
        if c.chromosome_direction == "-":
            hairpin = reverse_compliment(hairpin)
        
        begin_5 = c.pos_5p_begin - c.hairpin_start + padding if c.has_5p else c.estimate_5pb
        end_5 = c.pos_5p_end - c.hairpin_start + padding if c.has_5p else c.estimate_5pe
        
        begin_3 = c.pos_3p_begin - c.hairpin_start + padding if c.has_3p else c.estimate_3pb
        end_3 = c.pos_3p_end - c.hairpin_start + padding if c.has_3p else c.estimate_3pe
        
        
        # swap positions if hairpin is reversed
        has_5p = c.has_5p
        has_3p = c.has_3p
        
        if c.chromosome_direction == "-":
            b5 = len(hairpin) - end_3
            e5 = len(hairpin) - begin_3
            b3 = len(hairpin) - end_5
            e3 = len(hairpin) - begin_5
            
            begin_5 = b5 
            end_5 = e5 
            begin_3 = b3
            end_3 = e3
            
            #swap 3p 5p positions
            has_5p, has_3p = has_3p, has_5p
        
        # filter out bad positions
        if not (begin_5 < end_5 < begin_3 < end_3):
            if has_5p:
                if begin_3 < end_5 or end_3 < end_5:
                    begin_3 = -1
                    end_3 = -1
            elif has_3p:
                if end_5 > begin_3 or begin_5 > begin_3:
                    begin_5 = -1
                    end_5 = -1
            
            if begin_5 > end_5:
                begin_5 = -1
                end_5 = -1
            if begin_3 > end_3:
                begin_3 = -1
                end_3 = -1

#         assert begin_5 < end_5 < begin_3 < end_3, (begin_5, end_5, begin_3, end_3)
        
        mature_pos = [pos for pos in [begin_5, end_5, begin_3, end_3] if 0 <= pos <= len(hairpin)]     
        
        medium_short = []
        

         
        print "\n---------------"
        print "---------------"
        print [begin_5, end_5, begin_3, end_3]
        print mature_pos
        print c.has_5p
        print c.has_3p
        print "direction:", c.chromosome_direction
        print c.pos_5p_begin - c.hairpin_start, c.pos_3p_end - c.hairpin_start
#         print get_name_scores(c)
        print folding
        print hairpin
        print " "*begin_5 + "5"*(end_5 - begin_5) + " "*(begin_3 - end_5) + "3"*(end_3 - begin_3)
 
         
        if c.mirBase_matures:
            for seq in c.mirBase_matures:
                
                if c.chromosome_direction == "-":
                    seq = reverse_compliment(seq)
                
                start_seq = hairpin.find(seq)
                print " " * (start_seq) + seq, "\t", (start_seq)

        for seq, copies in c.small_subs.iteritems():
            start_seq = hairpin.find(seq)
            end_seq = start_seq + len(seq)
            
            print " " * (start_seq) + seq, "\t", (start_seq, copies)
            medium_short.append( (len(seq), start_seq, end_seq, copies) )
        
        
#         print "+" * len(hairpin)
#         print hairpin
        
        unscaled_long = 0
        scaled_long = 0
        
        if len(c.mapped_sequences):
            for i in c.mapped_sequences:
                
                
                copies = float(i.data[1].split("-")[1])

                seq = i.data[2]
                if c.chromosome_direction == "-":
                    seq = reverse_compliment(seq)

                seq_start = hairpin.find(seq)
                
                print " " * seq_start + seq, "\t", ( seq_start, copies)
                 
                unscaled_long += copies
                scaled_long += log_over_one(copies)
                
#===============================================================================
# #                 if seq_start in area_5p_long:
# #                     area_5p_long_sum += log_over_one(copies)
# #                 elif seq_start in area_3p_long:
# #                     area_3p_long_sum += log_over_one(copies)
#===============================================================================



            
        unscaled_short = sum(c.small_subs.values()) + 1.0
        unscaled_long += 1.0
        unscaled_short_fraction = unscaled_short / unscaled_long
        
        
        scaled_short = log_sum(c.small_subs.values()) + 1.0
        scaled_long += 1
        scaled_short_fraction = scaled_short / scaled_long
        
        
        c.ratio_short_long = unscaled_short_fraction
        c.ratio_short_long_logval = scaled_short_fraction
        
        
        def nondecreasing(l):
            for i, el in enumerate(l[1:]):
        
                if l[i] > el:
                    return False
            return True
        
        assert nondecreasing(mature_pos), (mature_pos, len(hairpin))


        

        

        distances = []
        # finding distances to start / end of mature sequences
        for (length, start, end, copies) in medium_short:
        
            start_dist = min_dist(mature_pos, start)
            end_dist = min_dist(mature_pos, end)
            smallest_dist = min(start_dist, end_dist)
            
            weight = math.log(copies+1.0)
            
            dist_weight_log = weight / (smallest_dist + 1.0)            
            dist_weigth = copies / (smallest_dist + 1.0)
            
            distances.append( (length, dist_weigth, dist_weight_log, copies) )
                    
        
        
        def score_distance(minlen, maxlen):
            
            distance_weighted = sum( w for l, w, _lw, c in distances if minlen <= l <= maxlen )
            all_weights = sum( c for l, w, _lw, c in distances if minlen <= l <= maxlen )
            score = distance_weighted / all_weights if all_weights else 0
            
            return score
            
        
        
        c.short_seq_align_10_13 = score_distance(10,13)
        c.short_seq_align_10_14 = score_distance(10,14)
        c.short_seq_align_10_15 = score_distance(10,15)
        c.short_seq_align_10_16 = score_distance(10,16)
        c.short_seq_align_10_17 = score_distance(10,17)
        c.short_seq_align_10_18 = score_distance(10,18)
        
        

#         c.short_seq_align_8_17 = score_distance(8,8)
#         c.short_seq_align_9_17 = score_distance(9,9)
#         c.short_seq_align_10_17 = score_distance(10,10)
#         c.short_seq_align_11_17 = score_distance(11,11)
#         c.short_seq_align_12_17 = score_distance(12,12)
#         c.short_seq_align_13_17 = score_distance(13,13)
#         c.short_seq_align_14_17 = score_distance(14,14)
#         c.short_seq_align_15_17 = score_distance(15,15)
#         c.short_seq_align_16_17 = score_distance(16,16)
#         c.short_seq_align_17_17 = score_distance(17,17)
        
        c.short_seq_align_8_17 = score_distance(8,17)
        c.short_seq_align_9_17 = score_distance(9,17)
        c.short_seq_align_10_17 = score_distance(10,17)
        c.short_seq_align_11_17 = score_distance(11,17)
        c.short_seq_align_12_17 = score_distance(12,17)
        c.short_seq_align_13_17 = score_distance(13,17)
        c.short_seq_align_14_17 = score_distance(14,17)
        c.short_seq_align_15_17 = score_distance(15,17)
        c.short_seq_align_16_17 = score_distance(16,17)
        c.short_seq_align_17_17 = score_distance(17,17)
        
        c.short_seq_align = score_distance(13,17)
# Example #5
# 0
def tailing_au_simple(candidates=None, sequences=None):
    
#     pickle.dump(candidates, open("candidates_test_only.p", "wb"))
#     pickle.dump(sequences, open("sequences_test_only.p", "wb"))
#     
#     assert False
# 
#     candidates = candidates[:50]
#     pickle.dump(candidates, open("candidates_test_only2.p", "wb"))
#     assert 0
    
#     print "loading data ..."
#     candidates = pickle.load( open("candidates_test_only2.p", "rb"))
#     sequences = pickle.load( open("sequences_test_only.p", "rb"))
#     print "... loaded"

    
    tail_au = []
    pre_au = []
    tail_a_list = []
#     pre_a = []
    
    min_tailfree = 15
    
    end_chars = {"A":0.0, "C":0.0, "G":0.0, "T":0.0}
    
    for s in sequences:
        n = s.nucleotides[-1]
        end_chars[n] += 1
        
    print end_chars
#     assert 0
    
    
    for sequence in sequences:
        
        seq = sequence.nucleotides
        count = sequence.duplicates if sequence.duplicates <= 1 else math.log(sequence.duplicates)
        
        tailfree, is_tail_end =  _remove_trailing(seq)
        au_start, is_au_start = _remove_trailing_start(seq)
        
        if is_tail_end and len(tailfree) > min_tailfree:
            assert len(tailfree) != len(seq)
            tail_au.append((tailfree, seq, count))
            
        if is_au_start and len(au_start) > min_tailfree:
            assert len(au_start) != len(seq)
            pre_au.append((au_start, seq, count))
            
        tail_a, is_tail_a = _remove_trailing(seq, set("Aa"))
        
        if is_tail_a and tail_a > min_tailfree:
            tail_a_list.append((tail_a, seq, count))
            
            

            
    
    print "long tails", len([1 for (au, seq, count) in tail_au if len(seq) - len(au) > 1])
    print "long pre", len([1 for (au, seq, count) in pre_au if len(seq) - len(au) > 1])
    print "found all tails and starts", len(tail_au), len(pre_au), "only a ends:", len(tail_a_list)
    
    
#     assert 0
            
    for c in candidates:
        
        hairpin = c.hairpin_padded_40[20:-20]
        
        if c.chromosome_direction == "-":
            hairpin = reverse_compliment(hairpin)
        
        
        full_hits_tail = 0.0
        full_hits_pre = 0.0
        full_hits_bowtie = 0.0
        
        au_tails = 0.0
        au_starts = 0.0
        hit_tailing = []
        hit_pre = []
        
        for notail, seq, count in tail_au:
            if len(notail) < 15:
                continue
            
            if notail in hairpin:
                
                
                if seq not in hairpin:
                    pos = hairpin.find(notail)
                    assert pos >= 0, (notail in hairpin, pos, len(seq), notail, seq[len(notail):], hairpin[pos:len(seq)+5])
#                     print "\t", notail in hairpin, pos, len(seq), notail, seq[len(notail):], hairpin[pos:len(seq)+5]
                    au_tails += count
                    hit_tailing.append((pos, pos+len(notail)))
                else:
                    full_hits_tail += count
                        
        
        for no_au_start, seq, count in pre_au:
             
            if no_au_start in hairpin:
                if seq not in hairpin:
                    pos = hairpin.find(no_au_start)
                    assert pos >= 0, (no_au_start in hairpin, pos, len(seq), no_au_start, seq[len(no_au_start):], hairpin[pos:len(seq)+5])
                    
                    au_starts += count
                    hit_pre.append( (pos, pos+len(no_au_start)) )
                else:
                    full_hits_pre += count
                    
        
#         full_hits_bowtie = sum([float(s.data[1].split("-")[1])] for s in c.mapped_sequences)
        
        full_hits_missed = 0
        for sequence in sequences:
            
            seq = sequence.nucleotides
            count = sequence.duplicates if sequence.duplicates <= 1 else math.log(sequence.duplicates)
            
            
            
            if seq in hairpin:
                full_hits_missed += count
            
            
        
        
        
        full_hits_bowtie = 0
        for sequence in c.mapped_sequences:
            count = float(sequence.data[1].split("-")[1])
            full_hits_bowtie += math.log(count) if count > 1 else count
            
                    
        print c.chromosome_direction, len(hairpin)
        print au_tails, full_hits_tail, sorted(hit_tailing)
        print au_starts, full_hits_pre, sorted(hit_pre)
        print full_hits_bowtie, full_hits_missed
        print
        
        
        total_hits_tail = full_hits_bowtie + full_hits_missed + au_tails
        total_hits_tail = math.log(total_hits_tail) if total_hits_tail > 1 else total_hits_tail
        
        au_tails_score = math.log(au_tails) if au_tails > 1 else au_tails
        
        score_tailing = (au_tails_score + 1) / (total_hits_tail + 1)
        
        
        total_hits_tail = full_hits_bowtie + full_hits_missed + au_starts
        total_hits_tail = math.log(total_hits_tail) if total_hits_tail > 1 else total_hits_tail
        
        
        au_leading_score = math.log(au_starts) if au_starts > 1 else au_starts
        
        score_leading = (au_leading_score + 1) / (total_hits_tail + 1)
        
        
        c.tailing_au = score_tailing
        c.leading_au = score_leading