from matplotlib import pyplot
import mirbase
import numpy
from inputs import miRNA, special_types

# ---------------------------------------------------------------------------
# Module-level reference data, loaded once at import time.
# ---------------------------------------------------------------------------
# Input files: hairpin and mature sequences, the high-confidence hairpin set,
# and the list consumed by special_types.remove_mirTrons().
hairpin_file = "hairpin.fa"
mature_seq_file = "mature.fa"
high_conf_file = "high_conf_hairpin.fa"
other_types = "mirTrons_other.txt"

# read_miRNA_fasta() returns a pair; the first element is bound to the
# hsa_* name and the second to the other_* name.
hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file)
hsa_to_mature, other_to_mature = mirbase.read_miRNA_fasta(mature_seq_file)
miRNA_high_conf = miRNA.read_high_confidence(high_conf_file)

# Remember the sizes so we can verify that the filter below removed entries.
before = len(hsa_to_hairpin)
before2 = len(hsa_to_mature)

# The size checks below expect these calls to shrink the collections in place.
special_types.remove_mirTrons(hsa_to_hairpin, other_types)
special_types.remove_mirTrons(hsa_to_mature, other_types)

# Explicit raises instead of bare `assert` so the sanity check still runs
# under `python -O` and reports which collection was left untouched.
# AssertionError is kept so the failure type is unchanged.
if before == len(hsa_to_hairpin):
    raise AssertionError(
        "remove_mirTrons() removed nothing from the hairpin set "
        "(%d entries before and after)" % before)
if before2 == len(hsa_to_mature):
    raise AssertionError(
        "remove_mirTrons() removed nothing from the mature set "
        "(%d entries before and after)" % before2)
def main(): start_time = time.clock() print "starting miRNA analysis" # fasta2 = ["Demux.SRhi10002.Adipocyte", "Demux.SRhi10002.Alveolar", "Demux.SRhi10002.Amniotic", # "Demux.SRhi10002.Dendritic1", "Demux.SRhi10002.Dendritic2", "Demux.SRhi10002.Endothelial", # "Demux.SRhi10002.Fibroblast1", "Demux.SRhi10002.Fibroblast2", "Demux.SRhi10002.Fibroblast3", # "Demux.SRhi10002.Intestinal", "Demux.SRhi10002.Meningeal", "Demux.SRhi10002.Mesenchymal", # "Demux.SRhi10002.Osteoblast", "Demux.SRhi10002.Pericytes", "Demux.SRhi10002.Renal", # "Demux.SRhi10002.Sebocyte1", "Demux.SRhi10002.Sebocyte2", "Demux.SRhi10002.SmoothBrachiocephalic", # "Demux.SRhi10002.SmoothProstate", "Demux.SRhi10002.SmoothSubclavian", "Demux.SRhi10002.SmoothUterine"] # # fasta3 = ["Demux.SRhi10003.Adipocyte", "Demux.SRhi10003.Amniotic%20Epithelial", "Demux.SRhi10003.amniotic%20membrane", # "Demux.SRhi10003.Endothelial0", "Demux.SRhi10003.Endothelial1", "Demux.SRhi10003.Endothelial2", # "Demux.SRhi10003.Fibroblast1", "Demux.SRhi10003.Fibroblast2", "Demux.SRhi10003.Fibroblast3", # "Demux.SRhi10003.Keratinocyte", "Demux.SRhi10003.Mesenchymaladipose", "Demux.SRhi10003.Mesenchymalbone", # "Demux.SRhi10003.Osteoblast", "Demux.SRhi10003.Pancreatic", "Demux.SRhi10003.Peripheral", # "Demux.SRhi10003.Prostate", "Demux.SRhi10003.Renal", "Demux.SRhi10003.Sertoli", # "Demux.SRhi10003.Skeletal", "Demux.SRhi10003.SmoothBrain", "Demux.SRhi10003.SmoothPulmonary", # "Demux.SRhi10003.SmoothUmbilical"] # # fasta2 = ["hg19/"+n for n in fasta2] # fasta3 = ["hg19/"+n for n in fasta3] # # fasta4 = ["hg19/Demux.SRhi10004."+str(i) for i in range(1,23)] # fasta5 = ["hg19/Demux.SRhi10005."+str(i) for i in range(1,24)] # fasta_files.extend(fasta2) # # fasta2.extend(fasta3) # fasta2.extend(fasta4) # fasta2.extend(fasta5) # # fasta_files = ["SRR797059.collapsed", "SRR797060.collapsed", "SRR797061.collapsed", # "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed", # "SRR207110.collapsed", "SRR207111.collapsed", 
"SRR207112.collapsed"] # # fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed", # "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed"] # fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed", "SRR207111.collapsed"] # fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed"] # fasta_files = ["SRR207110.collapsed", "SRR207111.collapsed", "SRR207112.collapsed"] # fasta_file = "SRR797062.fa" fasta_files = ["SRR797062.collapsed"] # small file for fast testing fasta_files_large_folder = ["fastas/Demux.SRhi." + str(i) + ".collapsed" for i in range(296)] fasta_files_small = ["SRR797059.collapsed", "SRR797060.collapsed", "SRR797061.collapsed", "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed", "SRR207110.collapsed", "SRR207111.collapsed", "SRR207112.collapsed", "SRR207113.collapsed", "SRR207114.collapsed", "SRR207115.collapsed", "SRR207116.collapsed", "SRR207117.collapsed", "SRR207118.collapsed", "SRR207119.collapsed"] fasta_files = fasta_files_large_folder # fasta_files = fasta_files_large_folder[:40] hairpin_file = "hairpin.fa" mature_seq_file = "mature.fa" miRNA_file_name = "mirnas.fa" high_conf_file = "high_conf_hairpin.fa" miRNA_family_file = "miFam.dat" other_types = "mirTrons_other.txt" # dead_mirnas = "miRNA.dead" dead_mirnas = "dead_list" dead_mirna_hairpins = "dead_hairpins.txt" dead_mirna_bowtie_file = "dead_hairpin_bowtie.fa" dead_mirna_bowtie_out = "dead_hairpin_locations.map" all_reads_file = "all.collapsed" bowtie_output = "bowtie_out.map" miRNA_bowtie_output = "miRNA.map" ml_folds = 10 is_new_run = True is_new_run = False #=========================================================================== # # making data for mirdeep2 #=========================================================================== # not_human_file(mature_seq_file, "other_matures.fa") # assert 0 # human_only_file(mature_seq_file, "human_matures.fa") # # # hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file) # # 
write_human_hairpins(hsa_to_hairpin, "human_hairpins.fa") # assert 0 # # # assert 0 # # one_large_fasta(fasta_files, "all_fasta.fa") # assert 0 # # #=========================================================================== if is_new_run: id_to_dead_hp, id_to_dead_mature = dead_mirna.get_hairpin(dead_mirnas) # assert False print "merging",len(fasta_files), "collapsed files" if len(fasta_files)>1 else "" dict_collapsed = merge.collapse_collapsed(fasta_files, min_len=8, min_count=2) # split small and larger sequences # write reads to file reads, reads_count, small_reads, small_reads_count = merge.filter_seqeunces(dict_collapsed, 18) merge.write_collapsed(all_reads_file, reads, reads_count) print "long reads:", len(reads), "small:", len(small_reads), len(small_reads_count) print "fraction small / all:", len(small_reads)*1.0 / (len(small_reads) + len(reads)) # aligning to genome using bowtie _align_bowtie(bowtie_output, all_reads_file) print "finished bowtie in ", time.clock() - start_time, " seconds" # read genome alignment from bowtie fixed_lines = [line.strip().split("\t") for line in open(bowtie_output)] print "read positions in ", time.clock() - start_time, " seconds" print "loading miRNA hairpins:" hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file) hsa_to_mature, other_to_mature = mirbase.read_miRNA_fasta(mature_seq_file) special_types.remove_mirTrons(hsa_to_hairpin, other_types) special_types.remove_mirTrons(hsa_to_mature, other_types) miRNA_species = mirbase.similar_hairpins(hsa_to_hairpin, other_to_hairpin) hairpinID_to_mature, harpinID_to_matseqs = mirbase.combine_hairpin_mature(hsa_to_hairpin, hsa_to_mature) miRNA_high_conf = miRNA.read_high_confidence(high_conf_file) # pickle.dump(harpinID_to_matseqs, open("harpinID_to_matseqs.p", "wb")) # pickle.dump(hsa_to_hairpin, open("hsa_to_hairpin.p", "wb")) print "\nhigh confidence set:", len(miRNA_high_conf), print miRNA_high_conf.issubset(miRNA_species.keys()) print "\nreading miRNA family 
info (mifam)" miRNA_fam = miRNA.read_family(miRNA_family_file) # print len(set(miRNA_species.keys()) & set(miRNA_fam.keys())) # run write micro rnas to file mirbase.write_miRNA(hsa_to_hairpin, miRNA_file_name) print "\nwrote human miRNAs to file", time.clock() - start_time, " seconds" # run bowtie to find miRNA positions _align_bowtie(miRNA_bowtie_output, miRNA_file_name) print "aligned miRNAs in", time.clock() - start_time, " seconds" miRNA_bowtie_hits = [line.strip().split("\t") for line in open(miRNA_bowtie_output)] unique_mirna_hits = set([x[0] for x in miRNA_bowtie_hits]) print "miRNA bowtie hits:", len(miRNA_bowtie_hits) print "unique miRNA hits:", len(unique_mirna_hits) print "\nDead mirnas" mirbase.write_dead_mirna(id_to_dead_hp, dead_mirna_bowtie_file) _align_bowtie(dead_mirna_bowtie_out, dead_mirna_bowtie_file) dead_miRNA_hits = [line.strip().split("\t") for line in open(dead_mirna_bowtie_out)] print "TOTAL dead mirnas:", len(dead_miRNA_hits) # assert 0 # using sequence tree to find possible candidates # candidate_tree, sequence_tree, candidates, seq_to_candidates = interval_tree_search.find_candidates(fixed_lines) candidate_tree, sequence_tree, candidates, seq_to_candidates = interval_tree_search.find_candidates_2(fixed_lines) print "\n\tfound candidates in ", time.clock() - start_time, " seconds" print "\tbowtie hits", len(fixed_lines) print "\tcandidate tree", len(candidate_tree) print "\tcandidates", len(candidates) print "\tsequence tree", len(sequence_tree) print "\tmapped seqs", len(candidates[0].all_mapped_sequences) # 0 1 2[0] [1] [2] [3] # ['1-15830', '-', 'gi|224589818|ref|NC_000006.11|', # NC_000006.11 print "\naligning miRNAs to sequences" candidate_to_miRNA = interval_tree_miRNA.align_miRNAs(miRNA_bowtie_hits, hairpinID_to_mature, harpinID_to_matseqs, candidate_tree, candidates, sequence_tree, seq_to_candidates, miRNA_species, miRNA_high_conf) mirdeep_new = mirdeep_make_roc_data(candidate_tree, candidate_to_miRNA, miRNA_high_conf) 
candidate_to_dead = interval_tree_dead.align_dead_miRNAs(dead_miRNA_hits, id_to_dead_hp, id_to_dead_mature, candidate_tree, candidates, sequence_tree, seq_to_candidates) print "\npadding all miRNA and Candidates" gene.include_padding(candidates) print "padded all candidates in ", time.clock() - start_time, " seconds" print "\nrunning vienna rnafold" vienna.energy_fold2(candidates) print "finished vienna folding" # align_small_seqs(candidates, small_reads, small_reads_count) # small_seq_stats(candidates) # # plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, "ratio_short_long_5p", False ) # assert 0 print "saving 123" pickle.dump(mirdeep_new, open("mirdeep_new.p", "wb")) pickle.dump(candidate_tree, open("candidate_tree.p", "wb")) pickle.dump(candidates, open("candidates_pre.p", "wb")) pickle.dump(candidate_to_miRNA, open("candidate_to_miRNA.p", "wb")) pickle.dump(miRNA_high_conf, open("miRNA_high_conf.p", "wb")) print "saving 234" pickle.dump(candidate_to_dead, open("candidate_to_dead.p", "wb")) pickle.dump(miRNA_fam, open("miRNA_fam.p", "wb")) pickle.dump(small_reads, open("small_reads.p", "wb")) pickle.dump(small_reads_count, open("small_reads_count.p", "wb")) pickle.dump(seq_to_candidates, open("seq_to_candidates.p", "wb")) pickle.dump(reads, open("reads.p", "wb")) pickle.dump(reads_count, open("reads_count.p", "wb")) print "saved 456" candidate_to_dead = pickle.load( open("candidate_to_dead.p", "rb")) # print len(candidate_to_dead) # # print len(candidate_to_dead.values()) # print len(set(candidate_to_dead.values())) # assert 0 print "loading miRNAs" mirdeep_new = pickle.load( open("mirdeep_new.p", "rb")) print len(mirdeep_new) # assert 0 harpinID_to_matseqs = pickle.load( open("harpinID_to_matseqs.p", "rb")) hsa_to_hairpin = pickle.load( open("hsa_to_hairpin.p", "rb")) # print "loading tree..." 
# candidate_tree = pickle.load( open("candidate_tree.p", "rb")) print "loading picled stuff ...", time.clock() - start_time candidate_to_miRNA = pickle.load( open("candidate_to_miRNA.p", "rb")) miRNA_high_conf = pickle.load( open("miRNA_high_conf.p", "rb")) # mirdeep_make_roc_data(candidate_tree, candidate_to_miRNA, miRNA_high_conf) candidates = pickle.load( open("candidates_pre.p", "rb")) hp_50 = pickle.load( open("candidate_classified_miRNA.p", "rb")) hp_99 = pickle.load( open("candidate_classified_99.p", "rb")) candidate_to_dead = pickle.load( open("candidate_to_dead.p", "rb")) miRNA_fam = pickle.load( open("miRNA_fam.p", "rb")) small_reads = pickle.load( open("small_reads.p", "rb")) small_reads_count = pickle.load( open("small_reads_count.p", "rb")) seq_to_candidates = pickle.load( open("seq_to_candidates.p", "rb")) reads = pickle.load( open("reads.p", "rb")) reads_count = pickle.load( open("reads_count.p", "rb")) print "loaded back", time.clock() - start_time def get_miRNAid(c): hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) return candidate_to_miRNA[hashval] if hashval in candidate_to_miRNA else None annotated_data, annotations, low_confidence_data = create_folds(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds) names = [map(get_miRNAid, data_fold) for data_fold in annotated_data] # names = map(get_miRNAid, annotated_data[0]) pickle.dump(names, open("names_data.p", "wb")) print len(names[0]), len(annotated_data[0]) # assert 0 fix_miRNA_training_test(annotated_data, annotations, low_confidence_data, hsa_to_hairpin, harpinID_to_matseqs, candidate_to_miRNA) # length_distribution(small_reads, small_reads_count) # # assert 0 # overhang calculated using fold seq. overhang.get_alignment(candidates) # calculating hairpin stats (length + overhang using pairing prob.) 
hairpin_stats(candidates, candidate_to_miRNA, miRNA_high_conf) def _is_miRNA(c): hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) return hashval in candidate_to_miRNA def _is_hc(c): hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) if hashval in candidate_to_miRNA: if candidate_to_miRNA[hashval] in miRNA_high_conf: return True return False def _is_dead(c): hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start) return hashval in candidate_to_dead def is_good_candidate(c): return c.has_hairpin_struct and not _is_dead(c) and not _is_miRNA(c) hp_candidates = [c for c in candidates if is_good_candidate(c)] print len(hp_candidates), 572, len(hp_50), len(hp_99) assert len(hp_candidates) == len(hp_50) == len(hp_99) hp_99_candidates = [h for h, is_99 in zip(hp_candidates, hp_99) if is_99] hp_50_candidates = [h for h, is_50 in zip(hp_candidates, hp_50) if is_50] def hash_val(c): return c.chromosome+c.chromosome_direction+str(c.hairpin_start) hp99_to_candidate = {hash_val(c):c for c in hp_99_candidates} hp50_to_candidate = {hash_val(c):c for c in hp_50_candidates} # print hp50_to_candidate.issubset(candidate_to_miRNA) # print hp99_to_candidate.issubset(candidate_to_miRNA) # # assert hp50_to_candidate.issubset(candidate_to_miRNA) # assert hp99_to_candidate.issubset(candidate_to_miRNA) # print "all candidates+miRNA+other:", len(candidates) print "\twith hairpin struct:\t", len([c for c in candidates if c.has_hairpin_struct]) _mirnas = [c for c in candidates if _is_miRNA(c)] # _mirnas2 = [c for c in candidates if c.miRNAid != None] _mirna_hc = [c for c in candidates if _is_hc(c)] _mirna_lc = [c for c in _mirnas if not _is_hc(c)] _mirna_dead = [c for c in candidates if _is_dead(c)] _not_mirnas = [c for c in candidates if not _is_miRNA(c) and not _is_dead(c)] # _not_mirnas2 = [c for c in candidates if c.miRNAid == None] print "mirnas:", len(_mirnas), len(candidate_to_miRNA) print "\twith hairpin struct:\t", len([m for m in _mirnas if 
m.has_hairpin_struct]) print "candidates:", len(_not_mirnas) print "\twith hairpin struct:\t", len([m for m in _not_mirnas if m.has_hairpin_struct]) print "HC mirnas:", len(_mirna_hc), len(miRNA_high_conf) print "\twith hairpin struct:\t", len([m for m in _mirna_hc if m.has_hairpin_struct]) print "LC mirnas:", len(_mirna_lc) print "\twith hairpin struct:\t", len([m for m in _mirna_lc if m.has_hairpin_struct]) # fail_hc = [c for c in _mirna_hc if not c.has_hairpin_struct] # fail_lc = [c for c in _mirnas if not c.has_hairpin_struct and not _is_hc(c)] # # hairpin.hairpin_stats(fail_hc, candidate_to_miRNA, miRNA_high_conf) # hairpin.hairpin_stats(fail_lc, candidate_to_miRNA, miRNA_high_conf) # hairpin.hairpin_stats(_mirnas, candidate_to_miRNA, miRNA_high_conf) not_mapped_reads = [structure.Sequence(i,n,read) for i,(read,n) in enumerate(zip(reads, reads_count)) if read not in seq_to_candidates] # aligning small sequences against hairpins align_small_seqs(candidates, small_reads, small_reads_count) # lc_10 = pickle.load( open("lc_scores_10.p", "rb")) # lc_10_all = pickle.load( open("lc_scores_10_w_cand.p", "rb")) # lc_names = pickle.load( open("save_low_confidence_names.p", "rb")) # # # # # # # small_seq_stats(_mirna_lc, lc_10, lc_names, candidate_to_miRNA) # print lc_names[0] # assert 0 # small_seq_stats(candidates) small_seq_stats(_mirna_hc) # for testing only # A/U ends for all remaining candidates tailing.tailing_au_fast(candidates, not_mapped_reads) # degree of entropy in structure and nucleotides entropy.entropy(candidates) # # heterogenity (position counting) heterogenity.heterogenity(candidates) # # candidate quality: nr of sequence hits / all candidate hits for given sequences quality.candidate_quality(candidates, seq_to_candidates) # save candidates again here ? 
# pickle.dump(candidates, open("candidates_with_features.p", "wb")) # plotting all features # FEATURES = ["hairpin_energy", "hairpin_energy_10", "hairpin_energy_40", # "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin", # "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end", # "quality", "bindings_max_10", "overhang_level_outer_10", # "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10", # "bulge_factor"] # # # log_scaled = [False]*16 + [True]*3 # print log_scaled # for feat_name, logs in zip(FEATURES, log_scaled): # # plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, feat_name, logs ) # removed_nonvalues = [c for c in candidates if c.ratio_short_long_5p != 1.0] # # print "nonvalues left", len( [c for c in candidates if c.ratio_short_long_5p == 1.0]) # # plot_any.plot(removed_nonvalues, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, "ratio_short_long_5p", isLog=True ) # short_correlate_13_17_max = [] # maxlen_range = range(13, 18) # # for i in maxlen_range: # res = plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, "short_seq_align_10_"+str(i), isLog=False ) # # short_correlate_13_17_max.append(res) # # for l in short_correlate_13_17_max: # print l # # print # for l in zip(*short_correlate_13_17_max): # print l # # (ks_val, p_2s_ks, t_student, p_student, t_welch, p_welch) = zip(*short_correlate_13_17_max) # # # plot_kstest(ks_val, maxlen_range, True) # plot_ttest(t_student, t_welch, maxlen_range, True) # # # short_correlate_8_17_min = [] # minlen_range = range(8, 18) # for i in minlen_range: # res = plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, "short_seq_align_" + str(i) + "_17", isLog=False ) # # short_correlate_8_17_min.append(res) # # for l in short_correlate_8_17_min: # print l # # print # for l in zip(*short_correlate_8_17_min): # print l # # (ks_val, p_2s_ks, t_student, p_student, t_welch, p_welch) = 
zip(*short_correlate_8_17_min) # # plot_kstest(ks_val, minlen_range, False) # plot_ttest(t_student, t_welch, minlen_range, False) d_hp = 0 d_tot = 0 c_hp = 0 c_tot = 0 h_hp = 0 h_tot = 0 l_hp = 0 l_tot = 0 m_hp = 0 m_tot = 0 for c in candidates: hashval = c.chromosome + c.chromosome_direction + str(c.hairpin_start) hairpin_one = 1 if c.has_hairpin_struct else 0 if hashval in candidate_to_dead: d_hp += hairpin_one d_tot += 1 elif hashval in candidate_to_miRNA: mi = candidate_to_miRNA[hashval] if mi in miRNA_high_conf: h_hp += hairpin_one h_tot += 1 else: l_hp += hairpin_one l_tot += 1 elif hashval in mirdeep_new: m_hp += hairpin_one m_tot += 1 else: c_hp += hairpin_one c_tot += 1 print "===============================================================================" print "dead: \t", d_hp, d_tot, (d_hp*1.0 / d_tot) print "candidates:\t", c_hp, c_tot, (c_hp*1.0 / c_tot) print "high conf:\t", h_hp, h_tot, (h_hp*1.0 / h_tot) print "low conf:\t", l_hp, l_tot, (l_hp*1.0 / l_tot) print "mirDeep2:\t", m_hp, m_tot, (m_hp*1.0 / m_tot) print "===============================================================================" FEATURES_old = [ "hairpin_energy_10", "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin", "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end", "quality", "overhang_level_outer_10", "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10", ] FEATURES_old = [ "hairpin_energy_10", "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin", "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end", "overhang_level_outer_10", "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10", "quality" ] FEATURES = [ # "ratio_short_long", "short_seq_align", # 0 until best val is found # "ratio_short_long_logval", # "leading_au", # "tailing_au", "overhang_inner", "overhang_outer", "loop_size", "folds_5p", "folds_3p", "folds_before", "folds_after", ] all_features = FEATURES + FEATURES_old logvals = [True] 
+ [False]*10 + [False]*len(FEATURES_old) # plot_pearson_correlation(candidates, FEATURES) # plot_spearman_correlation(candidates, FEATURES) plot_spearman_correlation(candidates, all_features) LC_100_best_all = pickle.load( open("LC_100_best_all.p", "rb")) LC_100_best_nonhp = pickle.load( open("LC_100_best_nonhp.p", "rb")) LC_100_worst_all = pickle.load( open("LC_100_worst_all.p", "rb")) LC_100_worst_nonhp = pickle.load( open("LC_100_worst_nonhp.p", "rb")) c_scores = [] mir_scores = [] for feat_name, lv in zip(all_features, logvals): # (ks_mirdeep, ks_classify) = plot_candidate_results(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, # mirdeep_new, hp50_to_candidate, hp99_to_candidate, feat_name, lv) plot_LC_results(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, LC_100_worst_all, LC_100_worst_nonhp, LC_100_best_all, LC_100_best_nonhp, feat_name, lv) # c_scores.append(ks_classify) # mir_scores.append(ks_mirdeep) pyplot.plot(c_scores) pyplot.plot(mir_scores) pyplot.title("mirDeep") pyplot.show() #======================================================================= # plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, feat_name ) #======================================================================= #=========================================================================== # for feat_name, lv in zip(all_features, logvals): # # plot_mirdeep(candidates, candidate_to_miRNA, candidate_to_dead, # miRNA_high_conf, mirdeep_new, feat_name, lv) # #=========================================================================== print print " features finished:", time.clock() - start_time, " seconds" #=============================================================================== # saving data for use in classification #=============================================================================== def is_good_or_miRNA(c): return _is_miRNA(c) or c.has_hairpin_struct or _is_dead(c) def is_bad_or_miRNA(c): 
return _is_miRNA(c) or _is_dead(c) or not c.has_hairpin_struct # def is_good_candidate(c): # du # return c.has_hairpin_struct and not _is_dead(c) and not _is_miRNA(c) # # # removed bad candidates # removed_bad_candidates = [c for c in candidates if is_good_or_miRNA(c)] # classifying LC, not using low quality miRNA # annotated_data, annotations, low_confidence_data = create_folds(removed_bad_candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds) # all candidates annotated_data, annotations, low_confidence_data = create_folds(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds) # removed the good candidates (which are classified) print "candidates?" not_good_candidates = [c for c in candidates if is_bad_or_miRNA(c)] # classifying bad candidates annotated_data_new, annotations_new, _low_confidence_data = create_folds(not_good_candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds) low_confidence_names = [map(get_miRNAid, l) for l in low_confidence_data] def sum_reads(c): reads = 0.0 for el in c.mapped_sequences: name = el.data[1] reads += float(name.split("-")[1]) return reads low_confidence_reads = [sum_reads(c) for fold in low_confidence_data for c in fold ] print low_confidence_reads pickle.dump(low_confidence_reads, open("low_confidence_reads.p", "wb")) # assert 0 def scale_data(data): return preprocessing.scale(data, with_mean=False, with_std=False) # data for LC classification: vector_data = map(vectorize.candidates_to_array, annotated_data) scaled_data = map(scale_data, vector_data) # low confidence miRNA low_confidence_data = map(vectorize.candidates_to_array, low_confidence_data) low_confidence_data = map(scale_data, low_confidence_data) # data for new miRNA classification: vector_data_new = map(vectorize.candidates_to_array, annotated_data_new) scaled_data_new = map(scale_data, vector_data_new) # good candidates (not miRNA, has hairpin struct) hp_candidates 
= [c for c in candidates if is_good_candidate(c)] print "hp_candidates", len(hp_candidates) hp_candidates = vectorize.candidates_to_array(hp_candidates) hp_candidates = map(scale_data, hp_candidates) print print "saving data ...", #=========================================================================== # data used for classifying LOW CONFIDENCE: #=========================================================================== pickle.dump(scaled_data, open("save_scaled_data.p", "wb")) # candidates, HC, DEAD pickle.dump(annotations, open("save_an.p", "wb")) # annotations for data over [00001111000] pickle.dump(low_confidence_data, open("save_low_confidence_data.p", "wb")) # low confidence miRNA pickle.dump(low_confidence_names, open("save_low_confidence_names.p", "wb")) # mirbase names, like >hsa-mir-516a-1 # pickle.dump(annotated_data, open("save_da.p", "wb")) # extra stuff not needed #=========================================================================== # data used for classifying new miRNA: #=========================================================================== pickle.dump(scaled_data_new, open("save_scaled_data_new.p", "wb")) # non-hp candidates, HC, DEAD pickle.dump(annotations_new, open("save_an_new.p", "wb")) # annotations for data over [001111000] pickle.dump(hp_candidates, open("save_hp_candidates_new.p", "wb")) # annotations for data over [001111000] #TODO: position in genome for all candidates probably... print "...saved." 
print "Now loading back for testing ...", annotations = pickle.load( open("save_an.p", "rb")) scaled123 = pickle.load( open("save_scaled_data.p", "rb")) low_confidence_data = pickle.load( open("save_low_confidence_data.p", "rb")) low_confidence_names = pickle.load( open("save_low_confidence_names.p", "rb")) # annotated_data = pickle.load( open("save_da.p", "rb")) scaled_data_new = pickle.load( open("save_scaled_data_new.p", "rb")) annotations_new = pickle.load( open("save_an_new.p", "rb")) hp_candidates = pickle.load( open("save_hp_candidates_new.p", "rb")) print "... done", (len(scaled123)) print "finished:", time.clock() - start_time, "seconds"