def process_varscan_samples():
	avail_samples = [x for x in os.listdir(XPDIR + "results/varscan/") if x.endswith(".vcf")]
	for _, samp in time_iterator(avail_samples, logger, msg_prefix="VARSCAN results"):
		name = samp.split("_on_")[0]
		try:
			if mode == "UNSUPERVISED":
				res = process_varscan_sample_unsupervised(sample_name=name)
			else:
				res = process_varscan_sample(sample_name=name)
		except Exception as e:
			print "failed on sample", samp
			# continue
			raise e
		yield res
def process_micado_samples():
	# avail_samples = [x for x in os.listdir("../micado_synthetic_results/synthetic/") if x.endswith(".significant_alterations.json")]
	avail_samples = [x for x in os.listdir(XPDIR + "results/micado/") if x.endswith(".significant_alterations.json")]
	for _, samp in time_iterator(avail_samples, logger, msg_prefix="MICADo results"):
		name = samp.split(".")[0]
		try:
			if mode == "UNSUPERVISED":
				res = process_micado_sample_unsupervised(sample_name=name)
			else:
				res = process_micado_sample(sample_name=name)
		except Exception as e:
			print "failed on sample", samp
			raise e
		# continue
		yield res
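A minimal way to consume these two generators, assuming each yielded `res` is a flat, dict-like record of per-sample metrics (the actual shape depends on process_varscan_sample and process_micado_sample, which are not shown):

import pandas as pd

# Hypothetical usage sketch: materialize all per-sample results into tables.
varscan_results = pd.DataFrame.from_records(list(process_varscan_samples()))
micado_results = pd.DataFrame.from_records(list(process_micado_samples()))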
Example #3
def process_varscan_samples():
    avail_samples = [
        x for x in os.listdir("data/synthetic/results/varscan/")
        if x.endswith(".vcf")
    ]
    for _, samp in time_iterator(avail_samples,
                                 logger,
                                 msg_prefix="VARSCAN results"):
        name = samp.split("_on_")[0]
        try:
            res = process_varscan_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            continue
        # raise e
        yield res
Example #4
def build_read_library(FASTQFILE_PATH):
	read_library = collections.defaultdict(list)
	FASTQFILE_ALL = os.listdir(FASTQFILE_PATH)
	logger.info("Found %d fastq files to process", len(FASTQFILE_ALL))
	for j, a_fastq_file in time_iterator(FASTQFILE_ALL, logger, msg_prefix="Building read library"):
		if a_fastq_file == ".DS_Store":
			continue
		fastq = open(FASTQFILE_PATH + "/" + a_fastq_file, 'r')
		lines = fastq.readlines()
		fastq.close()
		lines = map(str.strip, lines)
		for i_line in range(1, len(lines), 4):
			read_library[a_fastq_file].append(lines[i_line])
	# mutate everything back to lists
	# read_library[a_fastq_file]={k:list(v) for k,v in read_library.items()}
	return read_library
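The `range(1, len(lines), 4)` stride relies on the standard 4-line FASTQ layout (header, sequence, separator, qualities), so only the sequence lines are collected. A self-contained illustration of that assumption:

# A single hand-written FASTQ record (illustrative data).
fastq_lines = [
    "@read_1",    # line 1: header
    "ACGTACGT",   # line 2: sequence  <- the line the stride keeps
    "+",          # line 3: separator
    "IIIIIIII",   # line 4: qualities
]
sequences = [fastq_lines[i] for i in range(1, len(fastq_lines), 4)]
assert sequences == ["ACGTACGT"]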
Example #5
def process_varscan_samples():
    avail_samples = [
        x for x in os.listdir(XPDIR + "results/varscan/") if x.endswith(".vcf")
    ]
    for _, samp in time_iterator(avail_samples,
                                 logger,
                                 msg_prefix="VARSCAN results"):
        name = samp.split("_on_")[0]
        try:
            if mode == "UNSUPERVISED":
                res = process_varscan_sample_unsupervised(sample_name=name)
            else:
                res = process_varscan_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            # continue
            raise e
        yield res
Example #6
def process_micado_samples():
    # avail_samples = [x for x in os.listdir("../micado_synthetic_results/synthetic/") if x.endswith(".significant_alterations.json")]
    # avail_samples = [x for x in os.listdir("data/synthetic/results/micado/") if x.endswith(".significant_alterations.json")]
    avail_samples = [
        x for x in os.listdir("data/synthetic/results/micado/")
        if x.endswith(".combined_alterations.json")
    ]
    for _, samp in time_iterator(avail_samples,
                                 logger,
                                 msg_prefix="MICADo results"):
        name = samp.split(".")[0]
        try:
            res = process_micado_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            # raise e
            continue
        yield res
Example #7
def process_micado_samples():
    # avail_samples = [x for x in os.listdir("../micado_synthetic_results/synthetic/") if x.endswith(".significant_alterations.json")]
    avail_samples = [
        x for x in os.listdir(XPDIR + "results/micado/")
        if x.endswith(".significant_alterations.json")
    ]
    for _, samp in time_iterator(avail_samples,
                                 logger,
                                 msg_prefix="MICADo results"):
        name = samp.split(".")[0]
        try:
            if mode == "UNSUPERVISED":
                res = process_micado_sample_unsupervised(sample_name=name)
            else:
                res = process_micado_sample(sample_name=name)
        except Exception as e:
            print "failed on sample", samp
            raise e
        # continue
        yield res
Example #8
def build_read_library(FASTQFILE_PATH):
	pattern = re.compile(r'([NC])_(\d+)_(\d+)')
	read_library = {'N': collections.defaultdict(set), 'C': collections.defaultdict(set)}
	FASTQFILE_ALL = os.listdir(FASTQFILE_PATH)
	logger.info("Found %d fastq files to process", len(FASTQFILE_ALL))
	for j, a_fastq_file in time_iterator(FASTQFILE_ALL, logger, msg_prefix="Building read library"):
		if a_fastq_file == ".DS_Store":
			continue
		match = pattern.search(a_fastq_file)
		fragment = match.group(1)
		individu = match.group(2)
		fastq = open(FASTQFILE_PATH + "/" + a_fastq_file, 'r')
		lines = fastq.readlines()
		fastq.close()
		lines = map(str.strip, lines)
		# if individu not in read_library[fragment]:
		# read_library[fragment][individu] = []
		for i_line in range(1, len(lines), 4):
			read_library[fragment][individu].add(lines[i_line])
	# mutate everything back to lists
	read_library['N'] = {k: list(v) for k, v in read_library['N'].items()}
	read_library['C'] = {k: list(v) for k, v in read_library['C'].items()}
	return read_library
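The compiled pattern pulls the fragment type (N or C) and an individual id out of the file name; the third numeric group is captured but unused. A quick check against an assumed file name of the form `N_123_001.fastq` (the naming scheme is inferred from the regex, not documented here):

import re

pattern = re.compile(r'([NC])_(\d+)_(\d+)')
m = pattern.search("N_123_001.fastq")  # hypothetical file name
assert m.groups() == ('N', '123', '001')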
Example #10
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, max_len, sample_key=None, fastq_files=None,
				   fasta_file=None, snp_file=None, experiment_name=None, output_results=None, disable_cycle_breaking=False):
	import seq_lib as seq_lib_module
	seq_lib_module.library_itit(experiment_name)


	# g_reference construction
	logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
	g_reference = RG(kmer_length, fasta_file, snp_file)

	# Are there cycles in the reference graph?
	if list(nx.simple_cycles(g_reference.dbg)):
		if kmer_length >= 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		# Check that k does not exceed its limit value
		logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length+1)
		return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=fastq_files, 
							fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results, 
							disable_cycle_breaking=disable_cycle_breaking)

	# g_patient construction
	logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
	fastq_files = fastq_files.split(",")
	g_patient = PG(fastq_files, kmer_length)
	logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
	g_patient.graph_cleaned_init(min_support_percentage)
	logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

	# Are there cycles in the patient graph?
	if not disable_cycle_breaking and list(nx.simple_cycles(g_patient.dbgclean)):
		if kmer_length >= 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		# Check that k does not exceed its limit value
		logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length+1)
		return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations, 
							 p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=",".join(fastq_files), 
							 fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results)

	# copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
	g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

	# search for alternative paths in dbg_refrm (.alteration_list creation)
	g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage, max_len)

	### Permutation test ###
	logger.info("Will create random graphs")
	all_possible_kmers = set()
	for an_alt in g_patient.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers, seq_lib_module=seq_lib_module)
		for i in range(0, len(g_patient.alteration_list)):
			i_alteration = g_patient.alteration_list[i]
			ref_path = i_alteration.reference_path
			alt_path = i_alteration.alternative_path
			g_random_data = g_random.check_path(ref_path,
												alt_path,
												i_alteration.min_coverage)
			i_alteration.random_ratio_list.append(g_random_data[0])
			i_alteration.random_reference_count_list.append(g_random_data[1])
			i_alteration.random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
	for i in range(0, len(g_patient.alteration_list)):
		g_patient.alteration_list[i].pvalue_init()

	g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

	# Annotation
	annotate_and_output_results(g_patient, g_reference, output_results)
	# SNP
	dir_stat = get_or_create_dir("output/snp")
	graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
	for snp_id in g_reference.snp.keys():
		if g_reference.snp[snp_id][1] in g_patient.dbgclean:
			if g_reference.snp[snp_id][0] in g_patient.dbgclean:
				graph_snp.write("%s\t%s\t%d\t%d\n" % (
					sample_key, snp_id, len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
					len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
			else:
				graph_snp.write(
					"%s\t%s\t0\t%d\n" % (sample_key, snp_id, len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
Example #11
def build_a_sample(n_reads, fraction_altered, n_alterations, output_file_prefix, alterations_weight=None, multi_mismatch=False):
	if not alterations_weight:
		alterations_weight = [1.0, 1.0, 1.0]
	global all_ranges

	# sample some reads
	sub_reads = aligned_reads.sample(n=n_reads, random_state=args.seed, replace=False)

	# compute reference coordinates using the CIGAR
	all_ranges = []
	for i, an_alignment in sub_reads.iterrows():
		all_ranges.extend(coordinate_map(an_alignment))
	logger.info("Mapped coordinates to reference")
	all_ranges = pd.DataFrame.from_records(all_ranges)
	all_ranges.set_index("label", inplace=True, drop=False)

	# sample altered reads
	altered_reads_labels = sub_reads.QNAME.sample(int(len(sub_reads) * fraction_altered), random_state=args.seed, replace=False)
	altered_reads_row = all_ranges.ix[altered_reads_labels]
	non_altered_reads_labels = set(sub_reads.QNAME).difference(altered_reads_labels)
	assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))

	# identify start and stop positions of reads that should be altered (with 10nt slack...)
	ref_start = min([min(x) for x in altered_reads_row.ref_coord]) + 10
	ref_end = max([max(x) for x in altered_reads_row.ref_coord]) - 10

	# sample random alterations, reads that should be altered / kept as is
	alterations_modify_content = False
	max_try = 100
	i = 0
	while (not alterations_modify_content) and (i < max_try):
		some_alterations = dict([random_alteration(ref_start, ref_end, weights=alterations_weight, multi_mismatch=multi_mismatch) for _ in range(n_alterations)])
		# check that artificial alterations actually modify reads (case of generating a substitution corresponding to the actual content of the read)
		a_label = random.choice(altered_reads_labels)
		altered_sequence = "".join(mutating_sequence_iterator(read_label=a_label, alterations=some_alterations))
		non_altered_sequence = sub_reads.ix[a_label].SEQ
		if min_dist([x[0] for x in some_alterations])<=20:
			logger.info("Alterations %s are too close, iterating", some_alterations)
		elif altered_sequence != non_altered_sequence:
			alterations_modify_content = True
		else:
			logger.info("Alterations %s correspond to the real read content, iterating", some_alterations)
		i += 1

	logger.info("Generated alterations %s after %d trial(s)", some_alterations, i)

	# generate original reads
	with open(output_file_prefix + "_non_alt.fastq", "w") as f:
		for i, read_label in time_iterator(sub_reads.QNAME, logger, msg_prefix="Generating non altered fastq, non altered reads", delta_percent=0.1):
			print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
			print >> f, sub_reads.ix[read_label].SEQ
			print >> f, "+"
			print >> f, sub_reads.ix[read_label].QUAL

	# generate altered reads fastq files
	output_reads = set()
	with open(output_file_prefix + ".fastq", "w") as f:

		for i, read_label in time_iterator(altered_reads_labels, logger, msg_prefix="Generating altered fastq, altered reads", delta_percent=0.1):
			assert read_label not in output_reads
			output_reads.add(read_label)
			print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
			print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations))
			print >> f, "+"
			print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations, output="qual"))

		for i, read_label in time_iterator(non_altered_reads_labels, logger, msg_prefix="Generating altered fastq, non altered reads", delta_percent=0.1):
			assert read_label not in output_reads
			output_reads.add(read_label)

			print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
			print >> f, sub_reads.ix[read_label].SEQ
			print >> f, "+"
			print >> f, sub_reads.ix[read_label].QUAL
	serialize_results(output_file_prefix, some_alterations)

	logger.info("finished generation for %d reads, %d alterations, output files are", n_reads, n_alterations)
	logger.info("%s: Original sampled reads", output_file_prefix + "_non_alt.fastq")
	logger.info("%s: Altered sampled reads", output_file_prefix + ".fastq")
	logger.info("%s: Alterations description", output_file_prefix + ".alterations.txt")
	logger.info("Alterations are %s", some_alterations)
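Each record written above is a standard 4-line FASTQ block (header, sequence, '+', qualities); strict 4-line parsers such as build_read_library above reject blank lines between records. A small helper sketch that keeps the record layout in one place (not part of the original script):

def write_fastq_record(f, label, seq, qual):
    # One standard FASTQ record: header, sequence, separator, qualities.
    f.write("@%s\n%s\n+\n%s\n" % (label, seq, qual))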
Example #12
def process_sample(kmer_length,
                   min_support_percentage,
                   n_permutations,
                   p_value_threshold,
                   sample_key=None,
                   fastq_files=None,
                   fasta_file=None,
                   snp_file=None,
                   experiment_name=None,
                   destination_directory=".",
                   export_gml=False,
                   output_results=None):
    if experiment_name == "TP53":
        from randomreadsgraph_TP53 import RandomReadsGraph as RRG
    else:
        from randomreadsgraph import RandomReadsGraph as RRG

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s",
                kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        # Check that k does not exceed its limit value
        logger.info("[Reference graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1,
                              sample_key=sample_key,
                              fastq_files=fastq_files,
                              fasta_file=fasta_file,
                              snp_file=snp_file,
                              experiment_name=experiment_name,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              destination_directory=destination_directory,
                              export_gml=export_gml,
                              p_value_threshold=p_value_threshold,
                              output_results=output_results)

    # g_patient construction
    logger.info(
        "Will build patient graph for %s with k==%d and minimum support = %dpct",
        fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Are there cycles in the patient graph?
    if list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        # Check that k does not exceed its limit value
        logger.info("[Sample graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1,
                              sample_key=sample_key,
                              fastq_files=",".join(fastq_files),
                              fasta_file=fasta_file,
                              snp_file=snp_file,
                              experiment_name=experiment_name,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              destination_directory=destination_directory,
                              export_gml=export_gml,
                              p_value_threshold=p_value_threshold,
                              output_results=output_results)

    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat
    graph_stat_file = open(dir_stat + "/graph_stat_file" + sample_key + ".tsv",
                           'w')
    graph_stat_file.write("%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" %
                          (kmer_length, g_reference.dbg.size(),
                           sample_key, g_patient.coverage['total'],
                           g_patient.dbg.size(), g_patient.dbgclean.size(),
                           g_patient.dbg.in_degree().values().count(0),
                           g_patient.dbg.out_degree().values().count(0),
                           g_patient.dbgclean.in_degree().values().count(0),
                           g_patient.dbgclean.out_degree().values().count(0)))
    # kmer stat
    kmer_stat_file = open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv",
                          'w')
    for node_print in g_patient.dbg.nodes():
        fragment_print = ",".join(g_patient.dbg.node[node_print]['fastq_id'])
        reads_print = len(g_patient.dbg.node[node_print]['read_list_n'])
        kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (
            sample_key,
            node_print,
            fragment_print,
            reads_print,
        ))

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length,
                                   min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    for i, j in time_iterator(range(0, n_permutations),
                              logger,
                              msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage,
                       kmer_length,
                       restrict_to=all_possible_kmers)
        for i_alteration in range(0, len(g_patient.alteration_list)):
            g_random_data = g_random.check_path(
                g_patient.alteration_list[i_alteration].reference_path,
                g_patient.alteration_list[i_alteration].alternative_path,
                g_patient.alteration_list[i_alteration].min_coverage)
            g_patient.alteration_list[i_alteration].random_ratio_list.append(
                g_random_data[0])
            g_patient.alteration_list[
                i_alteration].random_reference_count_list.append(
                    g_random_data[1])
            g_patient.alteration_list[
                i_alteration].random_alternative_count_list.append(
                    g_random_data[2])

    logger.info("Will generate p-values for %d possible alterations",
                len(g_patient.alteration_list))
    for i_alteration in range(0, len(g_patient.alteration_list)):
        g_patient.alteration_list[i_alteration].pvalue_init()

    g_patient.significant_alteration_list_init(
        p_value_threshold=p_value_threshold)

    # If there is more than one significant alteration, check that they do not pile up in a "spike"
    if len(g_patient.significant_alteration_list) > 1:
        g_patient.multiple_alternative_path_filter()

    ## Stat
    # alteration stat
    alt_stat_file = open(dir_stat + "/alt_stat_file" + sample_key + ".tsv",
                         'w')
    for i_alteration in range(0, len(g_patient.alteration_list)):
        if g_patient.alteration_list[i_alteration].pvalue_ratio <= 1:
            alt_stat_file.write(
                "%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" %
                (i_alteration + 1, sample_key, g_patient.coverage['total'],
                 g_patient.alteration_list[i_alteration].reference_sequence,
                 g_patient.alteration_list[i_alteration].alternative_sequence,
                 g_patient.alteration_list[i_alteration].reference_read_count,
                 g_patient.alteration_list[i_alteration].
                 alternative_read_count,
                 g_patient.alteration_list[i_alteration].ratio_read_count,
                 g_patient.alteration_list[i_alteration].pvalue_ratio,
                 str(g_patient.alteration_list[i_alteration].zscore),
                 "\t".join(
                     map(
                         str, g_patient.alteration_list[i_alteration].
                         random_ratio_list))))

    # For visualisation
    graph_name = "G_%s_" % sample_key
    merged_graph_name = "G_%s_merged_" % sample_key
    cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
    merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", sample_key,
                    kmer_length)
        get_or_create_dir(destination_directory)
        # for the reference graph
        g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
        g_reference_visu = VISU.reference_graph_visualization_formatting(
            g_reference.dbg.copy())
        g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(
            g_reference_merge.copy())
        nx.write_gml(
            g_reference_visu, destination_directory + "/g_reference_visu" +
            str(kmer_length) + ".gml")
        nx.write_gml(
            g_reference_merge_visu, destination_directory +
            "/g_reference_merge_visu" + str(kmer_length) + ".gml")
        # for the patient graph
        g_patient_visu = VISU.individu_graph_visualization_formating(
            g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_clean_visu = VISU.individu_graph_visualization_formating(
            g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(),
                                                     g_reference.dbg.copy())
        g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(
            g_patient_merged.copy(), g_reference.dbg.copy())
        g_patient_clean_merged = VISU.merge_individu_graph(
            g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(
            g_patient_clean_merged.copy(), g_reference.dbg.copy())
        nx.write_gml(
            g_patient_visu, destination_directory + "/" + graph_name +
            str(kmer_length) + ".gml")
        nx.write_gml(
            g_patient_clean_visu, destination_directory + "/" +
            cleaned_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(
            g_patient_merged_visu, destination_directory + "/" +
            merged_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(
            g_patient_clean_merged_visu, destination_directory + "/" +
            merged_cleaned_graph_name + str(kmer_length) + ".gml")

    # Annotation
    if experiment_name == "TP53":
        annotate_and_output_results(g_patient, g_reference, output_results)
    # SNP
    dir_stat = get_or_create_dir("output/snp")
    # snp stat
    graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
    for snp_id in g_reference.snp.keys():
        if g_reference.snp[snp_id][1] in g_patient.dbgclean:
            if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                graph_snp.write("%s\t%s\t%d\t%d\n" %
                                (sample_key, snp_id,
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [0]]['read_list_n']),
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [1]]['read_list_n'])))
            else:
                graph_snp.write("%s\t%s\t0\t%d\n" %
                                (sample_key, snp_id,
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [1]]['read_list_n'])))
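pvalue_init() itself is not shown; a common way to turn the permutation ratios collected above into a one-sided p-value is the add-one empirical estimator below. This is an assumption about the statistic, not necessarily MICADo's exact formula:

def empirical_pvalue(observed_ratio, random_ratios):
    # Fraction of permutations at least as extreme as the observation,
    # with add-one smoothing so the estimate is never exactly zero.
    hits = sum(1 for r in random_ratios if r >= observed_ratio)
    return (hits + 1.0) / (len(random_ratios) + 1.0)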
Example #13
def build_a_sample(n_reads, fraction_altered, n_alterations, output_reads_prefix, output_result_prefix, alterations_weight=None,
				   multi_mismatch=False):
	if not alterations_weight:
		alterations_weight = [1.0, 1.0, 1.0]
	global all_ranges

	# sample some reads
	sub_reads = aligned_reads.sample(n=n_reads, random_state=args.seed, replace=False)

	# compute reference coordinates using the CIGAR
	all_ranges = []
	for i, an_alignment in sub_reads.iterrows():
		all_ranges.extend(coordinate_map(an_alignment))
	logger.info("Mapped coordinates to reference")
	all_ranges = pd.DataFrame.from_records(all_ranges)
	all_ranges.set_index("label", inplace=True, drop=False)

	# sample altered reads
	altered_reads_labels = sub_reads.QNAME.sample(int(len(sub_reads) * fraction_altered), random_state=args.seed, replace=False)
	altered_read_rows = all_ranges.ix[altered_reads_labels]
	non_altered_reads_labels = set(sub_reads.QNAME).difference(altered_reads_labels)
	assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))

	# pick a random label to test alterations
	a_label = random.choice(altered_reads_labels)
	if n_alterations > 0:
		some_alterations = generate_alterations(a_label, alterations_weight, altered_read_rows, multi_mismatch, n_alterations, sub_reads)
	else:
		some_alterations = {}

	if args.do_not_output_reads:
		return some_alterations

	# generate original reads
	with open(output_reads_prefix + "_non_alt.fastq", "w") as f:
		for i, read_label in time_iterator(sub_reads.QNAME, logger, msg_prefix="Generating non altered fastq, non altered reads",
										   delta_percent=0.3):
			print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
			print >> f, sub_reads.ix[read_label].SEQ
			print >> f, "+"
			print >> f, sub_reads.ix[read_label].QUAL
		# print >> f, "\n"

	# generate altered reads fastq files
	output_reads = set()
	with open(output_reads_prefix + ".fastq", "w") as f:

		for i, read_label in time_iterator(altered_reads_labels, logger, msg_prefix="Generating altered fastq, altered reads",
										   delta_percent=0.3):
			assert read_label not in output_reads
			output_reads.add(read_label)
			print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
			print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations))
			print >> f, "+"
			print >> f, "".join(mutating_sequence_iterator(read_label=read_label, alterations=some_alterations, output="qual"))
		# print >> f, "\n"

		for i, read_label in time_iterator(non_altered_reads_labels, logger, msg_prefix="Generating altered fastq, non altered reads",
										   delta_percent=0.3):
			assert read_label not in output_reads
			output_reads.add(read_label)

			print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
			print >> f, sub_reads.ix[read_label].SEQ
			print >> f, "+"
			print >> f, sub_reads.ix[read_label].QUAL
		# print >> f, "\n"
	serialize_results(output_result_prefix, some_alterations)

	logger.info("finished generation for %d reads, %d alterations, output files are", n_reads, n_alterations)
	logger.info("%s: Original sampled reads", output_reads_prefix + "_non_alt.fastq")
	logger.info("%s: Altered sampled reads", output_reads_prefix + ".fastq")
	logger.info("%s: Alterations description", output_result_prefix + ".alterations.txt")
	logger.info("Alterations are %s", some_alterations)
	pp.pprint(sorted(some_alterations.items(), key=lambda (pos, alt): pos[0]))
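The final pprint uses a tuple-unpacking lambda, which is Python 2-only syntax. A self-contained equivalent that also runs on Python 3 indexes the (key, value) pair instead:

import pprint

alterations = {(10, 'A'): 'X', (3, 'C'): 'D'}  # illustrative data
# Equivalent of `key=lambda (pos, alt): pos[0]` without tuple unpacking:
pprint.pprint(sorted(alterations.items(), key=lambda item: item[0][0]))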
Example #14
def build_a_sample(n_reads,
                   fraction_altered,
                   n_alterations,
                   output_file_prefix,
                   alterations_weight=None,
                   multi_mismatch=False):
    if not alterations_weight:
        alterations_weight = [1.0, 1.0, 1.0]
    global all_ranges

    # sample some reads
    sub_reads = aligned_reads.sample(n=n_reads,
                                     random_state=args.seed,
                                     replace=False)

    # compute reference coordinates using the CIGAR
    all_ranges = []
    for i, an_alignment in sub_reads.iterrows():
        all_ranges.extend(coordinate_map(an_alignment))
    logger.info("Mapped coordinates to reference")
    all_ranges = pd.DataFrame.from_records(all_ranges)
    all_ranges.set_index("label", inplace=True, drop=False)

    # sample altered reads
    altered_reads_labels = sub_reads.QNAME.sample(int(
        len(sub_reads) * fraction_altered),
                                                  random_state=args.seed,
                                                  replace=False)
    altered_reads_row = all_ranges.ix[altered_reads_labels]
    non_altered_reads_labels = set(
        sub_reads.QNAME).difference(altered_reads_labels)
    assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))

    # identify start and stop positions of reads that should be altered (with 10nt slack...)
    ref_start = min([min(x) for x in altered_reads_row.ref_coord]) + 10
    ref_end = max([max(x) for x in altered_reads_row.ref_coord]) - 10

    # sample random alterations, reads that should be altered / kept as is
    alterations_modify_content = False
    max_try = 100
    i = 0
    while (not alterations_modify_content) and (i < max_try):
        some_alterations = dict([
            random_alteration(ref_start,
                              ref_end,
                              weights=alterations_weight,
                              multi_mismatch=multi_mismatch)
            for _ in range(n_alterations)
        ])
        # check that artificial alterations actually modify reads (case of generating a substitution corresponding to the actual content of the read)
        a_label = random.choice(altered_reads_labels)
        altered_sequence = "".join(
            mutating_sequence_iterator(read_label=a_label,
                                       alterations=some_alterations))
        non_altered_sequence = sub_reads.ix[a_label].SEQ
        if min_dist([x[0] for x in some_alterations]) <= 20:
            logger.info("Alterations %s are too close, iterating",
                        some_alterations)
        elif altered_sequence != non_altered_sequence:
            alterations_modify_content = True
        else:
            logger.info(
                "Alterations %s correspond to the real read content, iterating",
                some_alterations)
        i += 1

    logger.info("Generated alterations %s after %d trial(s)", some_alterations, i)

    # generate original reads
    with open(output_file_prefix + "_non_alt.fastq", "w") as f:
        for i, read_label in time_iterator(
                sub_reads.QNAME,
                logger,
                msg_prefix="Generating non altered fastq, non altered reads",
                delta_percent=0.1):
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL

    # generate altered reads fastq files
    output_reads = set()
    with open(output_file_prefix + ".fastq", "w") as f:

        for i, read_label in time_iterator(
                altered_reads_labels,
                logger,
                msg_prefix="Generating altered fastq, altered reads",
                delta_percent=0.1):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
            print >> f, "".join(
                mutating_sequence_iterator(read_label=read_label,
                                           alterations=some_alterations))
            print >> f, "+"
            print >> f, "".join(
                mutating_sequence_iterator(read_label=read_label,
                                           alterations=some_alterations,
                                           output="qual"))

        for i, read_label in time_iterator(
                non_altered_reads_labels,
                logger,
                msg_prefix="Generating altered fastq, non altered reads",
                delta_percent=0.1):
            assert read_label not in output_reads
            output_reads.add(read_label)

            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
    serialize_results(output_file_prefix, some_alterations)

    logger.info(
        "finished generation for %d reads, %d alterations, output files are",
        n_reads, n_alterations)
    logger.info("%s: Original sampled reads",
                output_file_prefix + "_non_alt.fastq")
    logger.info("%s: Altered sampled reads", output_file_prefix + ".fastq")
    logger.info("%s: Alterations description",
                output_file_prefix + ".alterations.txt")
    logger.info("Alterations are %s", some_alterations)
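One subtlety of the rejection loop above: if all max_try draws are rejected, the last some_alterations is still used. A stricter sketch (an assumption about the desired behavior, written against the loop's variables) would fail loudly instead:

# Hypothetical guard to place right after the while loop.
if not alterations_modify_content:
    raise RuntimeError("no usable alteration set after %d tries" % max_try)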
Example #15
def process_sample(kmer_length,
                   min_support_percentage,
                   n_permutations,
                   p_value_threshold,
                   max_len,
                   sample_key=None,
                   fastq_files=None,
                   fasta_file=None,
                   snp_file=None,
                   experiment_name=None,
                   output_results=None,
                   disable_cycle_breaking=False):
    import seq_lib as seq_lib_module
    seq_lib_module.library_itit(experiment_name)

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s",
                kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        # Check that k does not exceed its limit value
        logger.info("[Reference graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              p_value_threshold=p_value_threshold,
                              max_len=max_len,
                              sample_key=sample_key,
                              fastq_files=fastq_files,
                              fasta_file=fasta_file,
                              snp_file=snp_file,
                              experiment_name=experiment_name,
                              output_results=output_results,
                              disable_cycle_breaking=disable_cycle_breaking)

    # g_patient construction
    logger.info(
        "Will build patient graph for %s with k==%d and minimum support = %dpct",
        fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Are there cycles in the patient graph?
    if not disable_cycle_breaking and list(nx.simple_cycles(
            g_patient.dbgclean)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        # Check that k does not exceed its limit value
        logger.info("[Sample graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              p_value_threshold=p_value_threshold,
                              max_len=max_len,
                              sample_key=sample_key,
                              fastq_files=",".join(fastq_files),
                              fasta_file=fasta_file,
                              snp_file=snp_file,
                              experiment_name=experiment_name,
                              output_results=output_results)

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length,
                                   min_support_percentage, max_len)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    for _, _ in time_iterator(range(0, n_permutations),
                              logger,
                              msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage,
                       kmer_length,
                       restrict_to=all_possible_kmers,
                       seq_lib_module=seq_lib_module)
        for i in range(0, len(g_patient.alteration_list)):
            i_alteration = g_patient.alteration_list[i]
            ref_path = i_alteration.reference_path
            alt_path = i_alteration.alternative_path
            g_random_data = g_random.check_path(ref_path, alt_path,
                                                i_alteration.min_coverage)
            i_alteration.random_ratio_list.append(g_random_data[0])
            i_alteration.random_reference_count_list.append(g_random_data[1])
            i_alteration.random_alternative_count_list.append(g_random_data[2])

    logger.info("Will generate p-values for %d possible alterations",
                len(g_patient.alteration_list))
    for i in range(0, len(g_patient.alteration_list)):
        g_patient.alteration_list[i].pvalue_init()

    g_patient.significant_alteration_list_init(
        p_value_threshold=p_value_threshold)

    # Annotation
    annotate_and_output_results(g_patient, g_reference, output_results)
    # SNP
    dir_stat = get_or_create_dir("output/snp")
    graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
    for snp_id in g_reference.snp.keys():
        if g_reference.snp[snp_id][1] in g_patient.dbgclean:
            if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                graph_snp.write("%s\t%s\t%d\t%d\n" %
                                (sample_key, snp_id,
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [0]]['read_list_n']),
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [1]]['read_list_n'])))
            else:
                graph_snp.write("%s\t%s\t0\t%d\n" %
                                (sample_key, snp_id,
                                 len(g_patient.dbg.node[g_reference.snp[snp_id]
                                                        [1]]['read_list_n'])))
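The recursion above implements "grow k until the De Bruijn graph is acyclic, give up at k == 70". The same control flow as an iterative sketch; build_graph is a stand-in for the RG/PG constructors and is an assumption, not part of MICADo:

import networkx as nx

def smallest_acyclic_k(build_graph, k, k_max=70):
    # Increase k until the directed graph has no simple cycles.
    while True:
        g = build_graph(k)
        if not list(nx.simple_cycles(g)):
            return k, g
        if k >= k_max:
            raise RuntimeError("still cyclic at k == %d" % k_max)
        k += 1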
Example #16
			continue

		record = build_alteration_pair_description(pair)
		record.update(xp_metadata)
		accounted_alt.add(record['micado_hash'])
		accounted_alt.add(record['injected_hash'])
		result_table.append(record)
	return result_table


results_dir = "../micado_synthetic_results/synthetic/"
avail_results = [results_dir + x for x in os.listdir(results_dir) if x.endswith(".json") and "combined" in x]
len(avail_results)

result_table = []
for i, input_json in time_iterator(avail_results, logger=logger):
	# input_json = random.choice(avail_results)
	with open(input_json, "r") as f:
		try:
			result_dict = simplejson.load(f)
		except simplejson.JSONDecodeError:
			logger.critical("Malformed JSON file %s", input_json)
			continue

	# result_dict.keys()
	# result_dict['sampler']['injected_alterations']
	# result_dict['significant_alterations']
	this_result_table = tabulate_result(result_dict)
	result_table.extend(this_result_table)

all_results = pd.DataFrame.from_records(result_table)
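simplejson is used as a faster drop-in for the standard library here; if it is not installed, the same loop works with the stdlib json module (json.JSONDecodeError exists since Python 3.5; on Python 2, catch ValueError instead):

try:
    import simplejson
    json_mod, json_error = simplejson, simplejson.JSONDecodeError
except ImportError:
    import json
    json_mod, json_error = json, ValueError  # ValueError covers decode errors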
Example #17
def build_a_sample(n_reads,
                   fraction_altered,
                   n_alterations,
                   output_reads_prefix,
                   output_result_prefix,
                   alterations_weight=None,
                   multi_mismatch=False):
    if not alterations_weight:
        alterations_weight = [1.0, 1.0, 1.0]
    global all_ranges

    # sample some reads
    sub_reads = aligned_reads.sample(n=n_reads,
                                     random_state=args.seed,
                                     replace=False)

    # compute reference coordinates using the CIGAR
    all_ranges = []
    for i, an_alignment in sub_reads.iterrows():
        all_ranges.extend(coordinate_map(an_alignment))
    logger.info("Mapped coordinates to reference")
    all_ranges = pd.DataFrame.from_records(all_ranges)
    all_ranges.set_index("label", inplace=True, drop=False)

    # sample altered reads
    altered_reads_labels = sub_reads.QNAME.sample(int(
        len(sub_reads) * fraction_altered),
                                                  random_state=args.seed,
                                                  replace=False)
    altered_read_rows = all_ranges.ix[altered_reads_labels]
    non_altered_reads_labels = set(
        sub_reads.QNAME).difference(altered_reads_labels)
    assert set(altered_reads_labels).isdisjoint(set(non_altered_reads_labels))

    # pick a random label to test alterations
    a_label = random.choice(altered_reads_labels)
    if n_alterations > 0:
        some_alterations = generate_alterations(a_label, alterations_weight,
                                                altered_read_rows,
                                                multi_mismatch, n_alterations,
                                                sub_reads)
    else:
        some_alterations = {}

    if args.do_not_output_reads:
        return some_alterations

    # generate original reads
    with open(output_reads_prefix + "_non_alt.fastq", "w") as f:
        for i, read_label in time_iterator(
                sub_reads.QNAME,
                logger,
                msg_prefix="Generating non altered fastq, non altered reads",
                delta_percent=0.3):
            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
        # print >> f, "\n"

    # generate altered reads fastq files
    output_reads = set()
    with open(output_reads_prefix + ".fastq", "w") as f:

        for i, read_label in time_iterator(
                altered_reads_labels,
                logger,
                msg_prefix="Generating altered fastq, altered reads",
                delta_percent=0.3):
            assert read_label not in output_reads
            output_reads.add(read_label)
            print >> f, "@%s" % (clean_label(read_label)) + "_ALT"
            print >> f, "".join(
                mutating_sequence_iterator(read_label=read_label,
                                           alterations=some_alterations))
            print >> f, "+"
            print >> f, "".join(
                mutating_sequence_iterator(read_label=read_label,
                                           alterations=some_alterations,
                                           output="qual"))
        # print >> f, "\n"

        for i, read_label in time_iterator(
                non_altered_reads_labels,
                logger,
                msg_prefix="Generating altered fastq, non altered reads",
                delta_percent=0.3):
            assert read_label not in output_reads
            output_reads.add(read_label)

            print >> f, "@%s" % (clean_label(read_label)) + "_ORIG"
            print >> f, sub_reads.ix[read_label].SEQ
            print >> f, "+"
            print >> f, sub_reads.ix[read_label].QUAL
        # print >> f, "\n"
    serialize_results(output_result_prefix, some_alterations)

    logger.info(
        "finished generation for %d reads, %d alterations, output files are",
        n_reads, n_alterations)
    logger.info("%s: Original sampled reads",
                output_reads_prefix + "_non_alt.fastq")
    logger.info("%s: Altered sampled reads", output_reads_prefix + ".fastq")
    logger.info("%s: Alterations description",
                output_result_prefix + ".alterations.txt")
    logger.info("Alterations are %s", some_alterations)
    pp.pprint(sorted(some_alterations.items(), key=lambda (pos, alt): pos[0]))
Example #18
        result_table.append(record)
    return result_table


# results_dir = "../micado_synthetic_results/synthetic/"
results_dir = "data/synthetic/results/micado/"
avail_results = [
    results_dir + x for x in os.listdir(results_dir)
    if x.endswith(".json") and "combined" in x
]
len(avail_results)

# avail_results=[results_dir+ 'C_FOOFOO_2897_150_045_3_1-1-1.combined_alterations.json']

result_table = []
for i, input_json in time_iterator(avail_results, logger=logger):
    # input_json = random.choice(avail_results)
    with open(input_json, "r") as f:
        try:
            result_dict = simplejson.load(f)
        except simplejson.JSONDecodeError:
            logger.critical("Malformed json file %s", input_json)
            continue

    # result_dict.keys()
    # result_dict['sampler']['injected_alterations']
    # result_dict['significant_alterations']
    this_result_table = tabulate_result(result_dict)
    result_table.extend(this_result_table)

all_results = pd.DataFrame.from_records(result_table)
Example #19
def process_sample(kmer_length, min_support_percentage,  n_permutations, sample_key=None, c_fastq_file=None, n_fastq_file=None, destination_directory=".", export_gml=False):

	# g_ref construction
	logger.info("Will build reference graph with k==%d", kmer_length)
	g_ref = RG.ref_constructor(kmer_length)

	# g_ind construction
	fastq = [c_fastq_file, n_fastq_file]
	fastq = [f for f in fastq if f]

	logger.info("Will build sample graph for %s with k==%d and minimum support (percentage) = %d", fastq, kmer_length, min_support_percentage)
	g_test = IG(fastq, kmer_length)
	g_test.graph_cleaned_init(min_support_percentage)  # .dbgclean creation


	# Are there cycles?
	if list(nx.simple_cycles(g_test.dbgclean)):
		if kmer_length >= 50:
			logger.info("There are still cycle(s) with k==50...exiting")
			sys.exit(0)
		# Check that k does not exceed its limit value
		return process_sample(kmer_length=kmer_length+1,sample_key=sample_key,c_fastq_file=c_fastq_file,n_fastq_file=n_fastq_file, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml)

	# Some prints for stats 
	dir_stat = get_or_create_dir("output/statistics") 
	# graph stat
	graph_stat_file = open(dir_stat+"/graph_stat_file"+sample_key+".tsv", 'w')
	graph_stat_file.write(
		"%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
		kmer_length,
		g_ref.size(),
		sample_key,
		g_test.coverage['C'],
		g_test.coverage['N'],
		g_test.dbg.size(),
		g_test.dbgclean.size(),
		g_test.dbg.in_degree().values().count(0),
		g_test.dbg.out_degree().values().count(0),
		g_test.dbgclean.in_degree().values().count(0),
		g_test.dbgclean.out_degree().values().count(0)
		))

	# kmer stat
	kmer_stat_file = open(dir_stat+"/kmer_stat_file"+sample_key+".tsv", 'w')
	for node_print in g_test.dbg.nodes():
		fragment_print = "".join(g_test.dbg.node[node_print]['fragment'])
		reads_print = len(g_test.dbg.node[node_print]['read_list_n'])
		kmer_stat_file.write(
			"%s\t%s\t%s\t%d\n"%(
			sample_key,
			node_print,
			fragment_print,
			reads_print,
			))

	g_test.graph_rmRefEdges_init(g_test.dbgclean, g_ref)  # .dbg_refrm creation

	# For visualisation
	graph_name = "G_%s_" % sample_key
	if export_gml:
		logger.info("Will save viz graph for %s with k==%d", fastq, kmer_length)
		get_or_create_dir(destination_directory)
		G_ref_merge = VISU.merge_reference_graph(g_ref.copy())
		G_ref_visu = VISU.reference_graph_visualization_formatting(g_ref.copy())
		G_ref_merge_visu = VISU.reference_graph_merged_visualization_formatting(G_ref_merge.copy())
		nx.write_gml(G_ref_visu,destination_directory+"/G_ref_visu"+str(kmer_length)+".gml")
		nx.write_gml(G_ref_merge_visu,destination_directory+"/G_ref_merge_visu"+str(kmer_length)+".gml")
		g_test_visu = VISU.individu_graph_visualization_formating(g_test.dbg.copy(), g_ref.copy())
		g_test_clean_visu = VISU.individu_graph_visualization_formating(g_test.dbgclean.copy(), g_ref.copy())
		cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
		nx.write_gml(g_test_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_test_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
		# Graph merged
		logger.info("Will merge graph for %s with k==%d", fastq, kmer_length)
		g_test_merged = VISU.merge_individu_graph(g_test.dbg.copy(), g_ref.copy())
		g_test_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_merged.copy(), g_ref.copy())
		merged_graph_name = "G_%s_merged_" % sample_key
		nx.write_gml(g_test_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
		g_test_clean_merged = VISU.merge_individu_graph(g_test.dbgclean.copy(), g_ref.copy())
		g_test_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_clean_merged.copy(), g_ref.copy())
		merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
		nx.write_gml(g_test_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

	# .alteration_list creation
	g_test.alteration_list_init(g_ref, kmer_length, min_support_percentage)

	### Permutation test ###
	logger.info("Will create random graphs")
	all_possible_kmers = set()
	for an_alt in g_test.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_test.coverage, kmer_length, restrict_to=all_possible_kmers)
		for i_alteration in range(0, len(g_test.alteration_list)):
			g_random_data = g_random.check_path(g_test.alteration_list[i_alteration].reference_path, g_test.alteration_list[i_alteration].alternative_path, g_test.alteration_list[i_alteration].min_coverage)
			g_test.alteration_list[i_alteration].random_ratio_list.append(g_random_data[0])
			g_test.alteration_list[i_alteration].random_reference_count_list.append(g_random_data[1])
			g_test.alteration_list[i_alteration].random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values")
	for i_alteration in range(0, len(g_test.alteration_list)):
		g_test.alteration_list[i_alteration].pvalue_init()

	g_test.significant_alteration_list_init()
	
	# If there is more than one significant alteration, check that they do not pile up in a "spike"
	if len(g_test.significant_alteration_list) > 1:
		g_test.multiple_alternative_path_filter()

	## Stat
	# alteration stat
	alt_stat_file = open(dir_stat+"/alt_stat_file"+sample_key+".tsv", 'w')
	for i_alteration in range(0, len(g_test.significant_alteration_list)):
		if g_test.significant_alteration_list[i_alteration].pvalue_ratio <= 1:
			# print "%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f" % (
			# alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%s" % (				
			alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
			i_alteration+1,
			sample_key,
			g_test.coverage['C'],
			g_test.coverage['N'],
			g_test.significant_alteration_list[i_alteration].reference_sequence,
			g_test.significant_alteration_list[i_alteration].alternative_sequence,
			g_test.significant_alteration_list[i_alteration].reference_read_count,
			g_test.significant_alteration_list[i_alteration].alternative_read_count,
			g_test.significant_alteration_list[i_alteration].ratio_read_count,
			g_test.significant_alteration_list[i_alteration].pvalue_ratio,
			# g_test.significant_alteration_list[i_alteration].zscore,
			"\t".join(map(str,g_test.significant_alteration_list[i_alteration].random_ratio_list))
			))

	### MICADo + ###
	ANNO.alteration_list_to_transcrit_mutation(g_test, g_ref)
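The degree bookkeeping in the stats block uses the networkx 1.x API, where in_degree() returns a plain dict; under networkx 2.x it returns a view of (node, degree) pairs, so the zero-degree counts would be written as below (a 2.x-only sketch on a toy graph):

import networkx as nx

g = nx.DiGraph([("a", "b"), ("b", "c")])  # toy graph
# networkx 2.x replacement for g.in_degree().values().count(0):
n_sources = sum(1 for _, d in g.in_degree() if d == 0)
n_sinks = sum(1 for _, d in g.out_degree() if d == 0)
assert (n_sources, n_sinks) == (1, 1)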
Example #20
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None,
				   destination_directory=".", export_gml=False, output_results=None):
	if experiment_name == "TP53":
		from randomreadsgraph_TP53 import RandomReadsGraph as RRG
	else:
		from randomreadsgraph import RandomReadsGraph as RRG

	# g_reference construction
	logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
	g_reference = RG(kmer_length, fasta_file, snp_file)

	# Are there cycles in the reference graph?
	if list(nx.simple_cycles(g_reference.dbg)):
		if kmer_length > 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		# Check that k does not exceed its upper limit
		logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
		return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file,
							  experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							  destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)
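	# Cycles in a De Bruijn graph come from repeats of length >= k in the input
	# sequences; raising k makes each kmer more specific, which is why the block
	# above retries with k+1 until the graph is acyclic (or k passes 70).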

	# g_patient construction
	logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
	fastq_files = fastq_files.split(",")
	g_patient = PG(fastq_files, kmer_length)
	logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
	g_patient.graph_cleaned_init(min_support_percentage)
	logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

	# Are there cycles in the patient graph?
	if list(nx.simple_cycles(g_patient.dbgclean)):
		if kmer_length > 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		# Check that k does not exceed its upper limit
		logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
		return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file,
							  experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							  destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

	# Write some statistics files
	dir_stat = get_or_create_dir("output/statistics")
	# graph stat
	graph_stat_file = open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w')
	graph_stat_file.write(
		"%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
			kmer_length,
			g_reference.dbg.size(),
			sample_key,
			g_patient.coverage['total'],
			g_patient.dbg.size(),
			g_patient.dbgclean.size(),
			g_patient.dbg.in_degree().values().count(0),
			g_patient.dbg.out_degree().values().count(0),
			g_patient.dbgclean.in_degree().values().count(0),
			g_patient.dbgclean.out_degree().values().count(0)
		))
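	# Counting zeros in the degree dicts above tallies source (in-degree 0) and
	# sink (out-degree 0) nodes; note this relies on the networkx 1.x API and on
	# Python 2, where dict.values() returns a list that supports .count().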
	# kmer stat
	kmer_stat_file = open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w')
	for node_print in g_patient.dbg.nodes():
		fragment_print = ",".join(g_patient.dbg.node[node_print]['fastq_id'])
		reads_print = len(g_patient.dbg.node[node_print]['read_list_n'])
		kmer_stat_file.write(
			"%s\t%s\t%s\t%d\n" % (
				sample_key,
				node_print,
				fragment_print,
				reads_print,
			))

	# copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
	g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

	# search for alternative paths in dbg_refrm (.alteration_list creation)
	g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage)

	### Permutation test ###
	logger.info("Will create random graphs")
	all_possible_kmers = set()
	for an_alt in g_patient.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers)
		for i_alteration in range(0, len(g_patient.alteration_list)):
			g_random_data = g_random.check_path(g_patient.alteration_list[i_alteration].reference_path, g_patient.alteration_list[i_alteration].alternative_path,
												g_patient.alteration_list[i_alteration].min_coverage)
			g_patient.alteration_list[i_alteration].random_ratio_list.append(g_random_data[0])
			g_patient.alteration_list[i_alteration].random_reference_count_list.append(g_random_data[1])
			g_patient.alteration_list[i_alteration].random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
	for i_alteration in range(0, len(g_patient.alteration_list)):
		g_patient.alteration_list[i_alteration].pvalue_init()

	g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

	# If more than one significant alteration, check that they are not in a "spike" (en épis)
	if len(g_patient.significant_alteration_list) > 1:
		g_patient.multiple_alternative_path_filter()

	## Stat 
	# alteration stat
	alt_stat_file = open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w')
	for i_alteration in range(0, len(g_patient.alteration_list)):
		if g_patient.alteration_list[i_alteration].pvalue_ratio <= 1:
			alt_stat_file.write("%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" % (
				i_alteration + 1,
				sample_key,
				g_patient.coverage['total'],
				g_patient.alteration_list[i_alteration].reference_sequence,
				g_patient.alteration_list[i_alteration].alternative_sequence,
				g_patient.alteration_list[i_alteration].reference_read_count,
				g_patient.alteration_list[i_alteration].alternative_read_count,
				g_patient.alteration_list[i_alteration].ratio_read_count,
				g_patient.alteration_list[i_alteration].pvalue_ratio,
				str(g_patient.alteration_list[i_alteration].zscore),
				"\t".join(map(str, g_patient.alteration_list[i_alteration].random_ratio_list))
			))
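	# Columns written above: alteration index, sample key, total coverage,
	# reference and alternative sequences, their read counts, the read-count
	# ratio, its p-value, the z-score, then one column per permuted ratio.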

	# For visualisation
	graph_name = "G_%s_" % sample_key
	merged_graph_name = "G_%s_merged_" % sample_key
	cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
	merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
	if export_gml:
		logger.info("Will save viz graph for %s with k==%d", sample_key, kmer_length)
		get_or_create_dir(destination_directory)
		# for the reference graph
		g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
		g_reference_visu = VISU.reference_graph_visualization_formatting(g_reference.dbg.copy())
		g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(g_reference_merge.copy())
		nx.write_gml(g_reference_visu, destination_directory + "/g_reference_visu" + str(kmer_length) + ".gml")
		nx.write_gml(g_reference_merge_visu, destination_directory + "/g_reference_merge_visu" + str(kmer_length) + ".gml")
		# for the patient graph
		g_patient_visu = VISU.individu_graph_visualization_formating(g_patient.dbg.copy(), g_reference.dbg.copy())
		g_patient_clean_visu = VISU.individu_graph_visualization_formating(g_patient.dbgclean.copy(), g_reference.dbg.copy())
		g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(), g_reference.dbg.copy())
		g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_merged.copy(), g_reference.dbg.copy())
		g_patient_clean_merged = VISU.merge_individu_graph(g_patient.dbgclean.copy(), g_reference.dbg.copy())
		g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_clean_merged.copy(), g_reference.dbg.copy())
		nx.write_gml(g_patient_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

	# Annotation
	if experiment_name == "TP53":
		annotate_and_output_results(g_patient, g_reference, output_results)
	# SNP
	dir_stat = get_or_create_dir("output/snp")
	# snp stat
	graph_snp = open(dir_stat + "/snp_" + sample_key + ".tsv", 'w')
	for snp_id in g_reference.snp.keys():
		if g_reference.snp[snp_id][1] in g_patient.dbgclean:
			if g_reference.snp[snp_id][0] in g_patient.dbgclean:
				graph_snp.write("%s\t%s\t%d\t%d\n" % (
					sample_key, snp_id, len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']), len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
			else:
				graph_snp.write("%s\t%s\t0\t%d\n" % (sample_key, snp_id, len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))