def process_varscan_samples():
    avail_samples = [x for x in os.listdir(XPDIR + "results/varscan/") if x.endswith(".vcf")]
    for _, samp in time_iterator(avail_samples, logger, msg_prefix="VARSCAN results"):
        name = samp.split("_on_")[0]
        try:
            if mode == "UNSUPERVISED":
                res = process_varscan_sample_unsupervised(sample_name=name)
            else:
                res = process_varscan_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            # continue
            raise e
        yield res

def process_micado_samples():
    # avail_samples = [x for x in os.listdir("../micado_synthetic_results/synthetic/") if x.endswith(".significant_alterations.json")]
    avail_samples = [x for x in os.listdir(XPDIR + "results/micado/") if x.endswith(".significant_alterations.json")]
    for _, samp in time_iterator(avail_samples, logger, msg_prefix="MICADo results"):
        name = samp.split(".")[0]
        try:
            if mode == "UNSUPERVISED":
                res = process_micado_sample_unsupervised(sample_name=name)
            else:
                res = process_micado_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            raise e
            # continue
        yield res

def process_varscan_samples(): avail_samples = [ x for x in os.listdir("data/synthetic/results/varscan/") if x.endswith(".vcf") ] for _, samp in time_iterator(avail_samples, logger, msg_prefix="VARSCAN results"): name = samp.split("_on_")[0] try: res = process_varscan_sample(sample_name=name) except Exception as e: print "failed on sample", samp continue # raise e yield res
def build_read_library(FASTQFILE_PATH):
    read_library = collections.defaultdict(list)
    FASTQFILE_ALL = os.listdir(FASTQFILE_PATH)
    logger.info("Found %d fastq files to process", len(FASTQFILE_ALL))
    for j, a_fastq_file in time_iterator(FASTQFILE_ALL, logger, msg_prefix="Building read library"):
        if a_fastq_file == ".DS_Store":
            continue
        fastq = open(FASTQFILE_PATH + "/" + a_fastq_file, 'r')
        lines = fastq.readlines()
        fastq.close()
        lines = map(str.strip, lines)
        # the sequence is line 2 of every 4-line FASTQ record
        for i_line in range(1, len(lines), 4):
            read_library[a_fastq_file].append(lines[i_line])
    # mutate everything back to lists
    # read_library[a_fastq_file] = {k: list(v) for k, v in read_library.items()}
    return read_library

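# Hedged usage sketch (directory and file names are illustrative, not from the original source):
#
#   read_library = build_read_library("data/fastq")
#   # read_library maps each fastq filename to the list of its read sequences:
#   print read_library["sample_01.fastq"][:3]
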
def process_micado_samples():
    # avail_samples = [x for x in os.listdir("../micado_synthetic_results/synthetic/") if x.endswith(".significant_alterations.json")]
    # avail_samples = [x for x in os.listdir("data/synthetic/results/micado/") if x.endswith(".significant_alterations.json")]
    avail_samples = [x for x in os.listdir("data/synthetic/results/micado/") if x.endswith(".combined_alterations.json")]
    for _, samp in time_iterator(avail_samples, logger, msg_prefix="MICADo results"):
        name = samp.split(".")[0]
        try:
            res = process_micado_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            # raise e
            continue
        yield res

def build_read_library(FASTQFILE_PATH):
    pattern = re.compile(r'([NC])_(\d+)_(\d+)')
    read_library = {'N': collections.defaultdict(set), 'C': collections.defaultdict(set)}
    FASTQFILE_ALL = os.listdir(FASTQFILE_PATH)
    logger.info("Found %d fastq files to process", len(FASTQFILE_ALL))
    for j, a_fastq_file in time_iterator(FASTQFILE_ALL, logger, msg_prefix="Building read library"):
        if a_fastq_file == ".DS_Store":
            continue
        fragment = pattern.search(a_fastq_file).group(1)
        individu = pattern.search(a_fastq_file).group(2)
        fastq = open(FASTQFILE_PATH + "/" + a_fastq_file, 'r')
        lines = fastq.readlines()
        fastq.close()
        lines = map(str.strip, lines)
        # if individu not in read_library[fragment]:
        #     read_library[fragment][individu] = []
        # the sequence is line 2 of every 4-line FASTQ record
        for i_line in range(1, len(lines), 4):
            read_library[fragment][individu].add(lines[i_line])
    # mutate everything back to lists
    read_library['N'] = {k: list(v) for k, v in read_library['N'].items()}
    read_library['C'] = {k: list(v) for k, v in read_library['C'].items()}
    return read_library

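# Hedged usage note (assumption about file naming): the regex ([NC])_(\d+)_(\d+) above
# implies fastq filenames such as "N_12_1.fastq" or "C_12_2.fastq"; group 1 is the
# fragment (N or C) and group 2 the individual id, so reads end up in
# read_library['N']['12'] / read_library['C']['12']. A file whose name does not match
# will raise an AttributeError on pattern.search(...).group(1).
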
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, max_len,
                   sample_key=None, fastq_files=None, fasta_file=None, snp_file=None,
                   experiment_name=None, output_results=None, disable_cycle_breaking=False):
    import seq_lib as seq_lib_module
    seq_lib_module.library_itit(experiment_name)
    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)
    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # check that k does not exceed its upper limit
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations, p_value_threshold=p_value_threshold,
                              max_len=max_len, sample_key=sample_key, fastq_files=fastq_files,
                              fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name,
                              output_results=output_results, disable_cycle_breaking=disable_cycle_breaking)
    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct",
                fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))
    # Are there cycles in the patient graph?
    if not disable_cycle_breaking and list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # check that k does not exceed its upper limit
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations, p_value_threshold=p_value_threshold,
                              max_len=max_len, sample_key=sample_key, fastq_files=",".join(fastq_files),
                              fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name,
                              output_results=output_results)
    # copy cleaned g_patient and remove reference edges from it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage, max_len)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers,
                       seq_lib_module=seq_lib_module)
        for i in range(0, len(g_patient.alteration_list)):
            i_alteration = g_patient.alteration_list[i]
            ref_path = i_alteration.reference_path
            alt_path = i_alteration.alternative_path
            g_random_data = g_random.check_path(ref_path, alt_path, i_alteration.min_coverage)
            i_alteration.random_ratio_list.append(g_random_data[0])
            i_alteration.random_reference_count_list.append(g_random_data[1])
            i_alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for i in range(0, len(g_patient.alteration_list)):
        g_patient.alteration_list[i].pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)
    # Annotation
    annotate_and_output_results(g_patient, g_reference, output_results)
    # SNP
    dir_stat = get_or_create_dir("output/snp")
    graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
    for snp_id in g_reference.snp.keys():
        if g_reference.snp[snp_id][1] in g_patient.dbgclean:
            if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                graph_snp.write("%s\t%s\t%d\t%d\n" % (
                    sample_key, snp_id,
                    len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                    len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
            else:
                graph_snp.write("%s\t%s\t0\t%d\n" % (
                    sample_key, snp_id,
                    len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))

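# Hedged usage sketch (argument values are illustrative, not from the original source):
#
#   process_sample(kmer_length=20, min_support_percentage=5, n_permutations=1000,
#                  p_value_threshold=0.001, max_len=10, sample_key="S01",
#                  fastq_files="S01_1.fastq,S01_2.fastq",
#                  fasta_file="reference.fasta", snp_file="snps.tsv",
#                  experiment_name="TP53", output_results="output/S01")
#
# k is increased recursively until both the reference and patient de Bruijn graphs
# are acyclic (or k reaches 70, in which case the run exits).
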
def build_a_sample(n_reads, fraction_altered, n_alterations, output_file_prefix,
                   alterations_weight=None, multi_mismatch=False):
    if not alterations_weight:
        alterations_weight = [1.0, 1.0, 1.0]
    global all_ranges
    # sample some reads
    sub_reads = aligned_reads.sample(n=n_reads, random_state=args.seed, replace=False)
    # compute reference coordinates using the CIGAR
    all_ranges = []
    for i, an_alignment in sub_reads.iterrows():
        all_ranges.extend(coordinate_map(an_alignment))
    logger.info("Mapped coordinates to reference")
    all_ranges = pd.DataFrame.from_records(all_ranges)
    all_ranges.set_index("label", inplace=True, drop=False)
    # sample altered reads
    altered_reads_labels = sub_reads.QNAME.sample(int(len(sub_reads) * fraction_altered),
                                                  random_state=args.seed, replace=False)
    altered_reads_row = all_ranges.ix[altered_reads_labels]
    non_altered_reads_labels = set(sub_reads.QNAME).difference(altered_reads_labels)
    assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))
    # identify start and stop positions of reads that should be altered (with 10nt slack...)
    ref_start = min([min(x) for x in altered_reads_row.ref_coord]) + 10
    ref_end = max([max(x) for x in altered_reads_row.ref_coord]) - 10
    # sample random alterations; keep drawing until they actually modify the reads
    alterations_modify_content = False
    max_try = 100
    i = 0
    while (not alterations_modify_content) and (i < max_try):
        some_alterations = dict([random_alteration(ref_start, ref_end, weights=alterations_weight,
                                                   multi_mismatch=multi_mismatch)
                                 for _ in range(n_alterations)])
        # check that artificial alterations actually modify reads
        # (a generated substitution can coincide with the actual content of the read)
        a_label = random.choice(altered_reads_labels)
        altered_sequence = "".join(mutating_sequence_iterator(read_label=a_label, alterations=some_alterations))
        non_altered_sequence = sub_reads.ix[a_label].SEQ
        if min_dist([x[0] for x in some_alterations]) <= 20:
            logger.info("Alterations %s are too close, iterating", some_alterations)
        elif altered_sequence != non_altered_sequence:
            alterations_modify_content = True
        else:
            logger.info("Alterations %s correspond to the real read content, iterating", some_alterations)
        i += 1
    logger.info("Generated alterations %s after %d trials", some_alterations, i)
    # generate original reads
    with open(output_file_prefix + "_non_alt.fastq", "w") as f:
        for i, read_label in time_iterator(sub_reads.QNAME, logger,
                                           msg_prefix="Generating non altered fastq, non altered reads",
                                           delta_percent=0.1):
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
            print >> f, "\n"  # note: writes an extra blank line after each record (dropped in the later variant)
    # generate altered reads fastq files
    output_reads = set()
    with open(output_file_prefix + ".fastq", "w") as f:
        for i, read_label in time_iterator(altered_reads_labels, logger,
                                           msg_prefix="Generating altered fastq, altered reads",
                                           delta_percent=0.1):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
            print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations))
            print >> f, "+"
            print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations, output="qual"))
            print >> f, "\n"
        for i, read_label in time_iterator(non_altered_reads_labels, logger,
                                           msg_prefix="Generating altered fastq, non altered reads",
                                           delta_percent=0.1):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
            print >> f, "\n"
    serialize_results(output_file_prefix, some_alterations)
    logger.info("finished generation for %d reads, %d alterations, output files are", n_reads, n_alterations)
    logger.info("%s: Original sampled reads", output_file_prefix + "_non_alt.fastq")
    logger.info("%s: Altered sampled reads", output_file_prefix + ".fastq")
    logger.info("%s: Alterations description", output_file_prefix + ".alterations.txt")
    logger.info("Alterations are %s", some_alterations)

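# Hedged usage sketch (argument values are illustrative, not from the original source):
#
#   build_a_sample(n_reads=2000, fraction_altered=0.45, n_alterations=3,
#                  output_file_prefix="data/synthetic/sample_01")
#
# This writes sample_01_non_alt.fastq (the original sampled reads), sample_01.fastq
# (altered plus untouched reads) and, via serialize_results,
# sample_01.alterations.txt describing the injected alterations.
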
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold,
                   sample_key=None, fastq_files=None, fasta_file=None, snp_file=None,
                   experiment_name=None, destination_directory=".", export_gml=False,
                   output_results=None):
    if experiment_name == "TP53":
        from randomreadsgraph_TP53 import RandomReadsGraph as RRG
    else:
        from randomreadsgraph import RandomReadsGraph as RRG
    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)
    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length > 70:
            logger.info("There are still cycle(s) with k==%d...exiting", kmer_length)
            sys.exit(0)  # check that k does not exceed its upper limit
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=fastq_files,
                              fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name,
                              min_support_percentage=min_support_percentage, n_permutations=n_permutations,
                              destination_directory=destination_directory, export_gml=export_gml,
                              p_value_threshold=p_value_threshold, output_results=output_results)
    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct",
                fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))
    # Are there cycles in the patient graph?
    if list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length > 70:
            logger.info("There are still cycle(s) with k==%d...exiting", kmer_length)
            sys.exit(0)  # check that k does not exceed its upper limit
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key,
                              fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file,
                              experiment_name=experiment_name, min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations, destination_directory=destination_directory,
                              export_gml=export_gml, p_value_threshold=p_value_threshold,
                              output_results=output_results)
    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat
    graph_stat_file = open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w')
    graph_stat_file.write("%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
        kmer_length, g_reference.dbg.size(), sample_key, g_patient.coverage['total'],
        g_patient.dbg.size(), g_patient.dbgclean.size(),
        g_patient.dbg.in_degree().values().count(0),
        g_patient.dbg.out_degree().values().count(0),
        g_patient.dbgclean.in_degree().values().count(0),
        g_patient.dbgclean.out_degree().values().count(0)))
    # kmer stat
    kmer_stat_file = open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w')
    for node_print in g_patient.dbg.nodes():
        fragment_print = ",".join(g_patient.dbg.node[node_print]['fastq_id'])
        reads_print = len(g_patient.dbg.node[node_print]['read_list_n'])
        kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (sample_key, node_print, fragment_print, reads_print))
    # copy cleaned g_patient and remove reference edges from it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers)
        for i_alteration in range(0, len(g_patient.alteration_list)):
            g_random_data = g_random.check_path(g_patient.alteration_list[i_alteration].reference_path,
                                                g_patient.alteration_list[i_alteration].alternative_path,
                                                g_patient.alteration_list[i_alteration].min_coverage)
            g_patient.alteration_list[i_alteration].random_ratio_list.append(g_random_data[0])
            g_patient.alteration_list[i_alteration].random_reference_count_list.append(g_random_data[1])
            g_patient.alteration_list[i_alteration].random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for i_alteration in range(0, len(g_patient.alteration_list)):
        g_patient.alteration_list[i_alteration].pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)
    # If there is more than one significant alteration, check that they do not form a "spike"
    if len(g_patient.significant_alteration_list) > 1:
        g_patient.multiple_alternative_path_filter()
    ## Stat
    # alteration stat
    alt_stat_file = open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w')
    for i_alteration in range(0, len(g_patient.alteration_list)):
        if g_patient.alteration_list[i_alteration].pvalue_ratio <= 1:
            alt_stat_file.write("%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" % (
                i_alteration + 1, sample_key, g_patient.coverage['total'],
                g_patient.alteration_list[i_alteration].reference_sequence,
                g_patient.alteration_list[i_alteration].alternative_sequence,
                g_patient.alteration_list[i_alteration].reference_read_count,
                g_patient.alteration_list[i_alteration].alternative_read_count,
                g_patient.alteration_list[i_alteration].ratio_read_count,
                g_patient.alteration_list[i_alteration].pvalue_ratio,
                str(g_patient.alteration_list[i_alteration].zscore),
                "\t".join(map(str, g_patient.alteration_list[i_alteration].random_ratio_list))))
    # For visualisation
    graph_name = "G_%s_" % sample_key
    merged_graph_name = "G_%s_merged_" % sample_key
    cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
    merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", sample_key, kmer_length)
        get_or_create_dir(destination_directory)
        # for the reference graph
        g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
        g_reference_visu = VISU.reference_graph_visualization_formatting(g_reference.dbg.copy())
        g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(g_reference_merge.copy())
        nx.write_gml(g_reference_visu, destination_directory + "/g_reference_visu" + str(kmer_length) + ".gml")
        nx.write_gml(g_reference_merge_visu, destination_directory + "/g_reference_merge_visu" + str(kmer_length) + ".gml")
        # for the patient graph
        g_patient_visu = VISU.individu_graph_visualization_formating(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_clean_visu = VISU.individu_graph_visualization_formating(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_merged.copy(), g_reference.dbg.copy())
        g_patient_clean_merged = VISU.merge_individu_graph(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_clean_merged.copy(), g_reference.dbg.copy())
        nx.write_gml(g_patient_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")
    # Annotation
    if experiment_name == "TP53":
        annotate_and_output_results(g_patient, g_reference, output_results)
    # SNP
    dir_stat = get_or_create_dir("output/snp")
    graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
    for snp_id in g_reference.snp.keys():
        if g_reference.snp[snp_id][1] in g_patient.dbgclean:
            if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                graph_snp.write("%s\t%s\t%d\t%d\n" % (
                    sample_key, snp_id,
                    len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                    len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
            else:
                graph_snp.write("%s\t%s\t0\t%d\n" % (
                    sample_key, snp_id,
                    len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))

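# Hedged usage sketch for this variant (argument values are illustrative, not from the
# original source): with export_gml=True it additionally writes GML snapshots of the
# reference and patient graphs (raw, cleaned, merged, cleaned+merged) under
# destination_directory.
#
#   process_sample(kmer_length=20, min_support_percentage=5, n_permutations=1000,
#                  p_value_threshold=0.001, sample_key="S01", fastq_files="S01.fastq",
#                  fasta_file="reference.fasta", snp_file="snps.tsv",
#                  experiment_name="TP53", destination_directory="output/gml",
#                  export_gml=True)
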
def build_a_sample(n_reads, fraction_altered, n_alterations, output_reads_prefix, output_result_prefix,
                   alterations_weight=None, multi_mismatch=False):
    if not alterations_weight:
        alterations_weight = [1.0, 1.0, 1.0]
    global all_ranges
    # sample some reads
    sub_reads = aligned_reads.sample(n=n_reads, random_state=args.seed, replace=False)
    # compute reference coordinates using the CIGAR
    all_ranges = []
    for i, an_alignment in sub_reads.iterrows():
        all_ranges.extend(coordinate_map(an_alignment))
    logger.info("Mapped coordinates to reference")
    all_ranges = pd.DataFrame.from_records(all_ranges)
    all_ranges.set_index("label", inplace=True, drop=False)
    # sample altered reads
    altered_reads_labels = sub_reads.QNAME.sample(int(len(sub_reads) * fraction_altered),
                                                  random_state=args.seed, replace=False)
    altered_read_rows = all_ranges.ix[altered_reads_labels]
    non_altered_reads_labels = set(sub_reads.QNAME).difference(altered_reads_labels)
    assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))
    # pick a random label to test alterations
    a_label = random.choice(altered_reads_labels)
    if n_alterations > 0:
        some_alterations = generate_alterations(a_label, alterations_weight, altered_read_rows,
                                                multi_mismatch, n_alterations, sub_reads)
    else:
        some_alterations = {}
    if args.do_not_output_reads:
        return some_alterations
    # generate original reads
    with open(output_reads_prefix + "_non_alt.fastq", "w") as f:
        for i, read_label in time_iterator(sub_reads.QNAME, logger,
                                           msg_prefix="Generating non altered fastq, non altered reads",
                                           delta_percent=0.3):
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
            # print >> f, "\n"
    # generate altered reads fastq files
    output_reads = set()
    with open(output_reads_prefix + ".fastq", "w") as f:
        for i, read_label in time_iterator(altered_reads_labels, logger,
                                           msg_prefix="Generating altered fastq, altered reads",
                                           delta_percent=0.3):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
            print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations))
            print >> f, "+"
            print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations, output="qual"))
            # print >> f, "\n"
        for i, read_label in time_iterator(non_altered_reads_labels, logger,
                                           msg_prefix="Generating altered fastq, non altered reads",
                                           delta_percent=0.3):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
            # print >> f, "\n"
    serialize_results(output_result_prefix, some_alterations)
    logger.info("finished generation for %d reads, %d alterations, output files are", n_reads, n_alterations)
    logger.info("%s: Original sampled reads", output_reads_prefix + "_non_alt.fastq")
    logger.info("%s: Altered sampled reads", output_reads_prefix + ".fastq")
    logger.info("%s: Alterations description", output_result_prefix + ".alterations.txt")
    logger.info("Alterations are %s", some_alterations)
    pp.pprint(sorted(some_alterations.items(), key=lambda (pos, alt): pos[0]))

            continue
        record = build_alteration_pair_description(pair)
        record.update(xp_metadata)
        accounted_alt.add(record['micado_hash'])
        accounted_alt.add(record['injected_hash'])
        result_table.append(record)
    return result_table


results_dir = "../micado_synthetic_results/synthetic/"
avail_results = [results_dir + x for x in os.listdir(results_dir) if x.endswith(".json") and "combined" in x]
len(avail_results)
result_table = []
for i, input_json in time_iterator(avail_results, logger=logger):
    # input_json = random.choice(avail_results)
    with open(input_json, "r") as f:
        try:
            result_dict = simplejson.load(f)
        except simplejson.JSONDecodeError:
            logger.critical("Malformed json file %s", input_json)
            continue
    # result_dict.keys()
    # result_dict['sampler']['injected_alterations']
    # result_dict['significant_alterations']
    this_result_table = tabulate_result(result_dict)
    result_table.extend(this_result_table)
all_results = pd.DataFrame.from_records(result_table)

        result_table.append(record)
    return result_table


# results_dir = "../micado_synthetic_results/synthetic/"
results_dir = "data/synthetic/results/micado/"
avail_results = [results_dir + x for x in os.listdir(results_dir) if x.endswith(".json") and "combined" in x]
len(avail_results)
# avail_results = [results_dir + 'C_FOOFOO_2897_150_045_3_1-1-1.combined_alterations.json']
result_table = []
for i, input_json in time_iterator(avail_results, logger=logger):
    # input_json = random.choice(avail_results)
    with open(input_json, "r") as f:
        try:
            result_dict = simplejson.load(f)
        except simplejson.JSONDecodeError:
            logger.critical("Malformed json file %s", input_json)
            continue
    # result_dict.keys()
    # result_dict['sampler']['injected_alterations']
    # result_dict['significant_alterations']
    this_result_table = tabulate_result(result_dict)
    result_table.extend(this_result_table)
all_results = pd.DataFrame.from_records(result_table)

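# Hedged follow-up sketch (not in the original source): the aggregated records can be
# persisted with a standard pandas call for downstream analysis, e.g.
#
#   all_results.to_csv("data/synthetic/results/micado_results.tsv", sep="\t", index=False)
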
def process_sample(kmer_length, min_support_percentage, n_permutations, sample_key=None,
                   c_fastq_file=None, n_fastq_file=None, destination_directory=".", export_gml=False):
    # g_ref construction
    logger.info("Will build reference graph with k==%d", kmer_length)
    g_ref = RG.ref_constructor(kmer_length)
    # g_ind construction
    fastq = [c_fastq_file, n_fastq_file]
    fastq = [f for f in fastq if f]
    logger.info("Will build sample graph for %s with k==%d and minimum support (percentage) = %d",
                fastq, kmer_length, min_support_percentage)
    g_test = IG(fastq, kmer_length)
    g_test.graph_cleaned_init(min_support_percentage)  # .dbgclean creation
    # Are there cycles?
    if list(nx.simple_cycles(g_test.dbgclean)):
        if kmer_length > 50:
            logger.info("There are still cycle(s) with k==%d...exiting", kmer_length)
            sys.exit(0)  # check that k does not exceed its upper limit
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key,
                              c_fastq_file=c_fastq_file, n_fastq_file=n_fastq_file,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              destination_directory=destination_directory, export_gml=export_gml)
    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat
    graph_stat_file = open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w')
    graph_stat_file.write("%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
        kmer_length, g_ref.size(), sample_key,
        g_test.coverage['C'], g_test.coverage['N'],
        g_test.dbg.size(), g_test.dbgclean.size(),
        g_test.dbg.in_degree().values().count(0),
        g_test.dbg.out_degree().values().count(0),
        g_test.dbgclean.in_degree().values().count(0),
        g_test.dbgclean.out_degree().values().count(0)))
    # kmer stat
    kmer_stat_file = open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w')
    for node_print in g_test.dbg.nodes():
        fragment_print = "".join(g_test.dbg.node[node_print]['fragment'])
        reads_print = len(g_test.dbg.node[node_print]['read_list_n'])
        kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (sample_key, node_print, fragment_print, reads_print))
    g_test.graph_rmRefEdges_init(g_test.dbgclean, g_ref)  # .dbg_refrm creation
    # For visualisation
    graph_name = "G_%s_" % sample_key
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", fastq, kmer_length)
        get_or_create_dir(destination_directory)
        G_ref_merge = VISU.merge_reference_graph(g_ref.copy())
        G_ref_visu = VISU.reference_graph_visualization_formatting(g_ref.copy())
        G_ref_merge_visu = VISU.reference_graph_merged_visualization_formatting(G_ref_merge.copy())
        nx.write_gml(G_ref_visu, destination_directory + "/G_ref_visu" + str(kmer_length) + ".gml")
        nx.write_gml(G_ref_merge_visu, destination_directory + "/G_ref_merge_visu" + str(kmer_length) + ".gml")
        g_test_visu = VISU.individu_graph_visualization_formating(g_test.dbg.copy(), g_ref.copy())
        g_test_clean_visu = VISU.individu_graph_visualization_formating(g_test.dbgclean.copy(), g_ref.copy())
        cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
        nx.write_gml(g_test_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_test_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
        # Graph merged
        logger.info("Will merge graph for %s with k==%d", fastq, kmer_length)
        g_test_merged = VISU.merge_individu_graph(g_test.dbg.copy(), g_ref.copy())
        g_test_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_merged.copy(), g_ref.copy())
        merged_graph_name = "G_%s_merged_" % sample_key
        nx.write_gml(g_test_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
        g_test_clean_merged = VISU.merge_individu_graph(g_test.dbgclean.copy(), g_ref.copy())
        g_test_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_clean_merged.copy(), g_ref.copy())
        merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
        nx.write_gml(g_test_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")
    # .alteration_list creation
    g_test.alteration_list_init(g_ref, kmer_length, min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_test.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_test.coverage, kmer_length, restrict_to=all_possible_kmers)
        for i_alteration in range(0, len(g_test.alteration_list)):
            g_random_data = g_random.check_path(g_test.alteration_list[i_alteration].reference_path,
                                                g_test.alteration_list[i_alteration].alternative_path,
                                                g_test.alteration_list[i_alteration].min_coverage)
            g_test.alteration_list[i_alteration].random_ratio_list.append(g_random_data[0])
            g_test.alteration_list[i_alteration].random_reference_count_list.append(g_random_data[1])
            g_test.alteration_list[i_alteration].random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values")
    for i_alteration in range(0, len(g_test.alteration_list)):
        g_test.alteration_list[i_alteration].pvalue_init()
    g_test.significant_alteration_list_init()
    # If there is more than one significant alteration, check that they do not form a "spike"
    if len(g_test.significant_alteration_list) > 1:
        g_test.multiple_alternative_path_filter()
    ## Stat
    # graph stat
    alt_stat_file = open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w')
    for i_alteration in range(0, len(g_test.significant_alteration_list)):
        if g_test.significant_alteration_list[i_alteration].pvalue_ratio <= 1:
            # print "%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f" % (
            # alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%s" % (
            alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s" % (
                i_alteration + 1, sample_key,
                g_test.coverage['C'], g_test.coverage['N'],
                g_test.significant_alteration_list[i_alteration].reference_sequence,
                g_test.significant_alteration_list[i_alteration].alternative_sequence,
                g_test.significant_alteration_list[i_alteration].reference_read_count,
                g_test.significant_alteration_list[i_alteration].alternative_read_count,
                g_test.significant_alteration_list[i_alteration].ratio_read_count,
                g_test.significant_alteration_list[i_alteration].pvalue_ratio,
                # g_test.significant_alteration_list[i_alteration].zscore,
                "\t".join(map(str, g_test.significant_alteration_list[i_alteration].random_ratio_list))))

    ### MICADo + ###
    ANNO.alteration_list_to_transcrit_mutation(g_test, g_ref)