Beispiel #1
0
def build_serialize_library(FASTQFILE_PATH):
	"""Build the read library for a FASTQ path and dump it to a msgpack file.

	The library returned by ``build_read_library(FASTQFILE_PATH)`` is packed
	with msgpack (objects msgpack cannot handle natively are serialized
	through their ``__dict__``) and written under ``data/seq/``. The file
	name is made of the module-level ``experiment_name``, the current Unix
	timestamp and ``len(read_library)``.

	:param FASTQFILE_PATH: forwarded unchanged to ``build_read_library``.
	"""
	logger.info("will rebuild library")
	read_library = build_read_library(FASTQFILE_PATH)
	logger.info("Will save %d items", len(read_library['N'])+len(read_library['C']))
	packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
	logger.info("Packed to %d chars", len(packed_docs))
	get_or_create_dir("data")
	get_or_create_dir("data/seq")
	# NOTE: ``%`` binds tighter than ``+``, so only the trailing literal is formatted.
	tgt_file = "data/seq/"+experiment_name+"_%s_%d.packb" % ((int(time.time())), len(read_library))
	# msgpack output is binary data: text mode would either fail (Python 3)
	# or risk newline translation corrupting the payload (Python 2 on Windows).
	with open(tgt_file, "wb") as f:
		f.write(packed_docs)

	logger.info("Serialized to file %s", tgt_file)
Beispiel #2
0
def build_serialize_library():
	"""Build the read library and dump it to a msgpack file under ``data/seq/``.

	The library returned by ``build_read_library()`` is packed with msgpack
	(objects msgpack cannot handle natively are serialized through their
	``__dict__``). The output file name starts with ``all_pool_trimmed0.1``
	followed by the current Unix timestamp and ``len(read_library)``.
	"""
	logger.info("will rebuild library")
	read_library = build_read_library()
	logger.info("Will save %d items", len(read_library['N'])+len(read_library['C']))
	packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
	logger.info("Packed to %d chars", len(packed_docs))
	get_or_create_dir("data")
	get_or_create_dir("data/seq")
	tgt_file = "data/seq/all_pool_trimmed0.1_%s_%d.packb" % ((int(time.time())), len(read_library))
	# msgpack output is binary data: text mode would either fail (Python 3)
	# or risk newline translation corrupting the payload (Python 2 on Windows).
	with open(tgt_file, "wb") as f:
		f.write(packed_docs)

	logger.info("Serialized to file %s", tgt_file)
Beispiel #3
0
def build_serialize_library(FASTQFILE_PATH):
	"""Build the read library for a FASTQ path and dump it to a msgpack file.

	The library returned by ``build_read_library(FASTQFILE_PATH)`` is packed
	with msgpack (objects msgpack cannot handle natively are serialized
	through their ``__dict__``) and written under ``data/seq/``. The file
	name is made of the module-level ``experiment_name``, the current Unix
	timestamp and ``len(read_library)``.

	:param FASTQFILE_PATH: forwarded unchanged to ``build_read_library``.
	"""
	logger.info("will rebuild library")
	read_library = build_read_library(FASTQFILE_PATH)
	logger.info("Will save %d items", len(read_library))
	packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
	logger.info("Packed to %d chars", len(packed_docs))
	get_or_create_dir("data")
	get_or_create_dir("data/seq")
	# NOTE: ``%`` binds tighter than ``+``, so only the trailing literal is formatted.
	tgt_file = "data/seq/"+experiment_name+"_%s_%d.packb" % ((int(time.time())), len(read_library))
	# msgpack output is binary data: text mode would either fail (Python 3)
	# or risk newline translation corrupting the payload (Python 2 on Windows).
	with open(tgt_file, "wb") as f:
		f.write(packed_docs)

	logger.info("Serialized to file %s", tgt_file)
Beispiel #4
0
def process_sample(kmer_length, min_support_percentage,  n_permutations, sample_key=None, c_fastq_file=None, n_fastq_file=None, destination_directory=".", export_gml=False):
	"""Run the full analysis of one sample and write its statistics files.

	Steps, in order: build the reference graph and the sample graph, retry
	with ``kmer_length + 1`` while the cleaned sample graph contains cycles,
	write graph/k-mer statistics, optionally export GML files, build the
	alteration list, run the permutation test, compute p-values, write the
	alteration statistics and finally call
	``ANNO.alteration_list_to_transcrit_mutation``.

	:param kmer_length: k used for both graphs; incremented on each retry.
	:param min_support_percentage: forwarded to ``graph_cleaned_init`` and
		``alteration_list_init``; also embedded in exported GML file names.
	:param n_permutations: number of random graphs generated for the test.
	:param sample_key: string used in output file names and in every row.
	:param c_fastq_file: optional FASTQ path; falsy values are ignored.
	:param n_fastq_file: optional FASTQ path; falsy values are ignored.
	:param destination_directory: directory receiving the GML exports.
	:param export_gml: when true, write the visualisation graphs as GML.
	:return: ``None`` (the recursive retry also returns ``None``).

	The process is terminated with ``sys.exit(0)`` when cycles are found
	while ``kmer_length`` is already greater than 50.
	"""
	# g_ref construction
	logger.info("Will build reference graph with k==%d", kmer_length)
	g_ref = RG.ref_constructor(kmer_length)

	# g_ind construction: keep only the FASTQ files that were actually given
	fastq = [f for f in (c_fastq_file, n_fastq_file) if f]

	logger.info("Will build sample graph for %s with k==%d and minimum support (percentage) = %d", fastq, kmer_length, min_support_percentage)
	g_test = IG(fastq, kmer_length)
	g_test.graph_cleaned_init(min_support_percentage)  # .dbgclean creation

	# Are there cycles? If so, restart the whole analysis with a larger k.
	if list(nx.simple_cycles(g_test.dbgclean)):
		# Guard against k growing past the allowed limit
		if kmer_length > 50:
			logger.info("There are always cycle(s) with k==50...exiting")
			sys.exit(0)
		return process_sample(
			kmer_length=kmer_length+1,
			sample_key=sample_key,
			c_fastq_file=c_fastq_file,
			n_fastq_file=n_fastq_file,
			min_support_percentage=min_support_percentage,
			n_permutations=n_permutations,
			destination_directory=destination_directory,
			export_gml=export_gml)

	# Some prints for stats
	dir_stat = get_or_create_dir("output/statistics")
	# graph stat (single row; terminated with a newline so that it is a
	# well-formed TSV line, and the file is closed as soon as it is written)
	with open(dir_stat+"/graph_stat_file"+sample_key+".tsv", 'w') as graph_stat_file:
		graph_stat_file.write(
			"%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
				kmer_length,
				g_ref.size(),
				sample_key,
				g_test.coverage['C'],
				g_test.coverage['N'],
				g_test.dbg.size(),
				g_test.dbgclean.size(),
				g_test.dbg.in_degree().values().count(0),
				g_test.dbg.out_degree().values().count(0),
				g_test.dbgclean.in_degree().values().count(0),
				g_test.dbgclean.out_degree().values().count(0)
			))

	# kmer stat: one row per node of the uncleaned graph
	with open(dir_stat+"/kmer_stat_file"+sample_key+".tsv", 'w') as kmer_stat_file:
		for node_print in g_test.dbg.nodes():
			node_data = g_test.dbg.node[node_print]
			kmer_stat_file.write(
				"%s\t%s\t%s\t%d\n" % (
					sample_key,
					node_print,
					"".join(node_data['fragment']),
					len(node_data['read_list_n']),
				))

	g_test.graph_rmRefEdges_init(g_test.dbgclean, g_ref)  # .dbg_refrm creation

	# For visualisation
	graph_name = "G_%s_" % sample_key
	if export_gml:
		logger.info("Will save viz graph for %s with k==%d", fastq, kmer_length)
		get_or_create_dir(destination_directory)
		G_ref_merge = VISU.merge_reference_graph(g_ref.copy())
		G_ref_visu = VISU.reference_graph_visualization_formatting(g_ref.copy())
		G_ref_merge_visu = VISU.reference_graph_merged_visualization_formatting(G_ref_merge.copy())
		nx.write_gml(G_ref_visu, destination_directory+"/G_ref_visu"+str(kmer_length)+".gml")
		nx.write_gml(G_ref_merge_visu, destination_directory+"/G_ref_merge_visu"+str(kmer_length)+".gml")
		g_test_visu = VISU.individu_graph_visualization_formating(g_test.dbg.copy(), g_ref.copy())
		g_test_clean_visu = VISU.individu_graph_visualization_formating(g_test.dbgclean.copy(), g_ref.copy())
		cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
		nx.write_gml(g_test_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_test_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
		# Graph merged
		logger.info("Will merge graph for %s with k==%d", fastq, kmer_length)
		g_test_merged = VISU.merge_individu_graph(g_test.dbg.copy(), g_ref.copy())
		g_test_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_merged.copy(), g_ref.copy())
		merged_graph_name = "G_%s_merged_" % sample_key
		nx.write_gml(g_test_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
		g_test_clean_merged = VISU.merge_individu_graph(g_test.dbgclean.copy(), g_ref.copy())
		g_test_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_clean_merged.copy(), g_ref.copy())
		merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
		nx.write_gml(g_test_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

	# .alteration_list creation
	g_test.alteration_list_init(g_ref, kmer_length, min_support_percentage)

	### Permutation test ###
	logger.info("Will create random graphs")
	# The random graphs are restricted to the k-mers of the candidate paths
	all_possible_kmers = set()
	for an_alt in g_test.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_test.coverage, kmer_length, restrict_to=all_possible_kmers)
		for alteration in g_test.alteration_list:
			g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
			alteration.random_ratio_list.append(g_random_data[0])
			alteration.random_reference_count_list.append(g_random_data[1])
			alteration.random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values")
	for alteration in g_test.alteration_list:
		alteration.pvalue_init()

	g_test.significant_alteration_list_init()

	# If more than one significant alteration, check that they are not arranged as "spikes"
	if len(g_test.significant_alteration_list) > 1:
		g_test.multiple_alternative_path_filter()

	## Stat
	# alteration stat: one newline-terminated row per significant alteration
	# (without the terminator all rows would be glued onto a single line)
	with open(dir_stat+"/alt_stat_file"+sample_key+".tsv", 'w') as alt_stat_file:
		for rank, alteration in enumerate(g_test.significant_alteration_list, 1):
			if alteration.pvalue_ratio <= 1:
				alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
					rank,
					sample_key,
					g_test.coverage['C'],
					g_test.coverage['N'],
					alteration.reference_sequence,
					alteration.alternative_sequence,
					alteration.reference_read_count,
					alteration.alternative_read_count,
					alteration.ratio_read_count,
					alteration.pvalue_ratio,
					"\t".join(map(str, alteration.random_ratio_list))
				))

	### MICADo + ###
	ANNO.alteration_list_to_transcrit_mutation(g_test, g_ref)
Beispiel #5
0
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None,
				   destination_directory=".", export_gml=False, output_results=None):
	"""Run the full analysis of one sample and write its output files.

	Steps, in order: build the reference graph, build and clean the patient
	graph, retry with ``kmer_length + 1`` while either graph contains cycles,
	write graph/k-mer statistics, build the alteration list, run the
	permutation test, compute p-values, write the alteration statistics,
	optionally export GML files, annotate (only when ``experiment_name`` is
	``"TP53"``) and write the SNP read counts.

	:param kmer_length: k used for both graphs; incremented on each retry.
	:param min_support_percentage: forwarded to ``graph_cleaned_init`` and
		``alteration_list_init``; also embedded in exported GML file names.
	:param n_permutations: number of random graphs generated for the test.
	:param p_value_threshold: forwarded to ``significant_alteration_list_init``.
	:param sample_key: string used in output file names and in every row.
	:param fastq_files: comma-separated string of FASTQ paths.
	:param fasta_file: forwarded to the reference graph constructor.
	:param snp_file: forwarded to the reference graph constructor.
	:param experiment_name: ``"TP53"`` selects ``randomreadsgraph_TP53`` and
		enables annotation; any other value uses ``randomreadsgraph``.
	:param destination_directory: directory receiving the GML exports.
	:param export_gml: when true, write the visualisation graphs as GML.
	:param output_results: forwarded to ``annotate_and_output_results``.
	:return: ``None`` (the recursive retry also returns ``None``).

	The process is terminated with ``sys.exit(0)`` when cycles are found
	while ``kmer_length`` is already greater than 70.
	"""
	if experiment_name == "TP53":
		from randomreadsgraph_TP53 import RandomReadsGraph as RRG
	else:
		from randomreadsgraph import RandomReadsGraph as RRG

	# g_reference construction
	logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
	g_reference = RG(kmer_length, fasta_file, snp_file)

	# Are there cycles in the reference graph?
	if list(nx.simple_cycles(g_reference.dbg)):
		# Guard against k growing past the allowed limit
		if kmer_length > 70:
			logger.info("There are always cycle(s) with k==70...exiting")
			sys.exit(0)
		# Log the k that the retry will actually use
		logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
		return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file,
							  experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							  destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

	# g_patient construction
	logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
	fastq_files = fastq_files.split(",")
	g_patient = PG(fastq_files, kmer_length)
	logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
	g_patient.graph_cleaned_init(min_support_percentage)
	logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

	# Are there cycles in the patient graph?
	if list(nx.simple_cycles(g_patient.dbgclean)):
		# Guard against k growing past the allowed limit
		if kmer_length > 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		# Log the k that the retry will actually use
		logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
		# fastq_files was split above: re-join it, the callee expects a string
		return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file,
							  experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							  destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

	# Some prints for stats
	dir_stat = get_or_create_dir("output/statistics")
	# graph stat (single row); the file is closed as soon as it is written
	with open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w') as graph_stat_file:
		graph_stat_file.write(
			"%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
				kmer_length,
				g_reference.dbg.size(),
				sample_key,
				g_patient.coverage['total'],
				g_patient.dbg.size(),
				g_patient.dbgclean.size(),
				g_patient.dbg.in_degree().values().count(0),
				g_patient.dbg.out_degree().values().count(0),
				g_patient.dbgclean.in_degree().values().count(0),
				g_patient.dbgclean.out_degree().values().count(0)
			))
	# kmer stat: one row per node of the uncleaned graph
	with open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w') as kmer_stat_file:
		for node_print in g_patient.dbg.nodes():
			node_data = g_patient.dbg.node[node_print]
			kmer_stat_file.write(
				"%s\t%s\t%s\t%d\n" % (
					sample_key,
					node_print,
					",".join(node_data['fastq_id']),
					len(node_data['read_list_n']),
				))

	# copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
	g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

	# search for alternative paths in dbg_refrm (.alteration_list creation)
	g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage)

	### Permutation test ###
	logger.info("Will create random graphs")
	# The random graphs are restricted to the k-mers of the candidate paths
	all_possible_kmers = set()
	for an_alt in g_patient.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers)
		for alteration in g_patient.alteration_list:
			g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
			alteration.random_ratio_list.append(g_random_data[0])
			alteration.random_reference_count_list.append(g_random_data[1])
			alteration.random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
	for alteration in g_patient.alteration_list:
		alteration.pvalue_init()

	g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

	# If more than one significant alteration, check that they are not arranged as "spikes"
	if len(g_patient.significant_alteration_list) > 1:
		g_patient.multiple_alternative_path_filter()

	## Stat
	# alteration stat: rows come from the full alteration list
	with open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w') as alt_stat_file:
		for rank, alteration in enumerate(g_patient.alteration_list, 1):
			if alteration.pvalue_ratio <= 1:
				alt_stat_file.write("%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" % (
					rank,
					sample_key,
					g_patient.coverage['total'],
					alteration.reference_sequence,
					alteration.alternative_sequence,
					alteration.reference_read_count,
					alteration.alternative_read_count,
					alteration.ratio_read_count,
					alteration.pvalue_ratio,
					str(alteration.zscore),
					"\t".join(map(str, alteration.random_ratio_list))
				))

	# For visualisation
	graph_name = "G_%s_" % sample_key
	merged_graph_name = "G_%s_merged_" % sample_key
	cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
	merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
	if export_gml:
		logger.info("Will save viz graph for %s with k==%d", sample_key, kmer_length)
		get_or_create_dir(destination_directory)
		# for the reference graph
		g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
		g_reference_visu = VISU.reference_graph_visualization_formatting(g_reference.dbg.copy())
		g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(g_reference_merge.copy())
		nx.write_gml(g_reference_visu, destination_directory + "/g_reference_visu" + str(kmer_length) + ".gml")
		nx.write_gml(g_reference_merge_visu, destination_directory + "/g_reference_merge_visu" + str(kmer_length) + ".gml")
		# for the patient graph
		g_patient_visu = VISU.individu_graph_visualization_formating(g_patient.dbg.copy(), g_reference.dbg.copy())
		g_patient_clean_visu = VISU.individu_graph_visualization_formating(g_patient.dbgclean.copy(), g_reference.dbg.copy())
		g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(), g_reference.dbg.copy())
		g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_merged.copy(), g_reference.dbg.copy())
		g_patient_clean_merged = VISU.merge_individu_graph(g_patient.dbgclean.copy(), g_reference.dbg.copy())
		g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_clean_merged.copy(), g_reference.dbg.copy())
		nx.write_gml(g_patient_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
		nx.write_gml(g_patient_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

	# Annotation
	if experiment_name == "TP53":
		annotate_and_output_results(g_patient, g_reference, output_results)

	# SNP: one row per SNP whose second node is present in the cleaned graph;
	# the first count is 0 when the first node is absent from the cleaned graph
	dir_snp = get_or_create_dir("output/snp")
	with open(dir_snp + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
		for snp_id in g_reference.snp.keys():
			snp_nodes = g_reference.snp[snp_id]
			if snp_nodes[1] not in g_patient.dbgclean:
				continue
			if snp_nodes[0] in g_patient.dbgclean:
				first_count = len(g_patient.dbg.node[snp_nodes[0]]['read_list_n'])
			else:
				first_count = 0
			second_count = len(g_patient.dbg.node[snp_nodes[1]]['read_list_n'])
			graph_snp.write("%s\t%s\t%d\t%d\n" % (sample_key, snp_id, first_count, second_count))
Beispiel #6
0
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, max_len, sample_key=None, fastq_files=None,
				   fasta_file=None, snp_file=None, experiment_name=None, output_results=None, disable_cycle_breaking=False):
	"""Run the full analysis of one sample, annotate it and write SNP counts.

	Steps, in order: initialise ``seq_lib`` for the experiment, build the
	reference graph, build and clean the patient graph, retry with
	``kmer_length + 1`` while a checked graph contains cycles, build the
	alteration list, run the permutation test, compute p-values, call
	``annotate_and_output_results`` and write the SNP read counts.

	:param kmer_length: k used for both graphs; incremented on each retry.
	:param min_support_percentage: forwarded to ``graph_cleaned_init`` and
		``alteration_list_init``.
	:param n_permutations: number of random graphs generated for the test.
	:param p_value_threshold: forwarded to ``significant_alteration_list_init``.
	:param max_len: forwarded to ``alteration_list_init``.
	:param sample_key: string used in the SNP file name and in every row.
	:param fastq_files: comma-separated string of FASTQ paths.
	:param fasta_file: forwarded to the reference graph constructor.
	:param snp_file: forwarded to the reference graph constructor.
	:param experiment_name: forwarded to ``seq_lib.library_itit``.
	:param output_results: forwarded to ``annotate_and_output_results``.
	:param disable_cycle_breaking: when true, the cycle check on the patient
		graph is skipped (the reference graph is still checked).
	:return: ``None`` (the recursive retry also returns ``None``).

	The process is terminated with ``sys.exit(0)`` when cycles are found
	while ``kmer_length`` has reached 70.
	"""
	import seq_lib as seq_lib_module
	seq_lib_module.library_itit(experiment_name)

	# g_reference construction
	logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
	g_reference = RG(kmer_length, fasta_file, snp_file)

	# Are there cycles in the reference graph?
	if list(nx.simple_cycles(g_reference.dbg)):
		# Guard against k growing past the allowed limit
		if kmer_length >= 70:
			logger.info("There are always cycle(s) with k==70...exiting")
			sys.exit(0)
		logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length+1)
		return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=fastq_files,
							fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results,
							disable_cycle_breaking=disable_cycle_breaking)

	# g_patient construction
	logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
	fastq_files = fastq_files.split(",")
	g_patient = PG(fastq_files, kmer_length)
	logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
	g_patient.graph_cleaned_init(min_support_percentage)
	logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

	# Are there cycles in the patient graph?
	if not disable_cycle_breaking and list(nx.simple_cycles(g_patient.dbgclean)):
		# Guard against k growing past the allowed limit
		if kmer_length >= 70:
			logger.info("There are still cycle(s) with k==70...exiting")
			sys.exit(0)
		logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length+1)
		# fastq_files was split above: re-join it, the callee expects a string.
		# disable_cycle_breaking is necessarily false on this path; it is
		# passed explicitly to mirror the reference-graph retry.
		return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations,
							p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=",".join(fastq_files),
							fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results,
							disable_cycle_breaking=disable_cycle_breaking)

	# copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
	g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

	# search for alternative paths in dbg_refrm (.alteration_list creation)
	g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage, max_len)

	### Permutation test ###
	logger.info("Will create random graphs")
	# The random graphs are restricted to the k-mers of the candidate paths
	all_possible_kmers = set()
	for an_alt in g_patient.alteration_list:
		all_possible_kmers.update(an_alt.reference_path)
		all_possible_kmers.update(an_alt.alternative_path)

	for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
		g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers, seq_lib_module=seq_lib_module)
		for alteration in g_patient.alteration_list:
			g_random_data = g_random.check_path(alteration.reference_path,
												alteration.alternative_path,
												alteration.min_coverage)
			alteration.random_ratio_list.append(g_random_data[0])
			alteration.random_reference_count_list.append(g_random_data[1])
			alteration.random_alternative_count_list.append(g_random_data[2])

	logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
	for alteration in g_patient.alteration_list:
		alteration.pvalue_init()

	g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

	# Annotation
	annotate_and_output_results(g_patient, g_reference, output_results)

	# SNP: one row per SNP whose second node is present in the cleaned graph;
	# the first count is 0 when the first node is absent from the cleaned graph.
	# The file is closed (and flushed) when the block exits.
	dir_stat = get_or_create_dir("output/snp")
	with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
		for snp_id in g_reference.snp.keys():
			snp_nodes = g_reference.snp[snp_id]
			if snp_nodes[1] not in g_patient.dbgclean:
				continue
			if snp_nodes[0] in g_patient.dbgclean:
				first_count = len(g_patient.dbg.node[snp_nodes[0]]['read_list_n'])
			else:
				first_count = 0
			second_count = len(g_patient.dbg.node[snp_nodes[1]]['read_list_n'])
			graph_snp.write("%s\t%s\t%d\t%d\n" % (sample_key, snp_id, first_count, second_count))
Beispiel #7
0
def process_sample(kmer_length,
                   min_support_percentage,
                   n_permutations,
                   p_value_threshold,
                   max_len,
                   sample_key=None,
                   fastq_files=None,
                   fasta_file=None,
                   snp_file=None,
                   experiment_name=None,
                   output_results=None,
                   disable_cycle_breaking=False):
    """Run the full analysis of one sample, annotate it and write SNP counts.

    Steps, in order: initialise ``seq_lib`` for the experiment, build the
    reference graph, build and clean the patient graph, retry with
    ``kmer_length + 1`` while a checked graph contains cycles, build the
    alteration list, run the permutation test, compute p-values, call
    ``annotate_and_output_results`` and write the SNP read counts.

    :param kmer_length: k used for both graphs; incremented on each retry.
    :param min_support_percentage: forwarded to ``graph_cleaned_init`` and
        ``alteration_list_init``.
    :param n_permutations: number of random graphs generated for the test.
    :param p_value_threshold: forwarded to
        ``significant_alteration_list_init``.
    :param max_len: forwarded to ``alteration_list_init``.
    :param sample_key: string used in the SNP file name and in every row.
    :param fastq_files: comma-separated string of FASTQ paths.
    :param fasta_file: forwarded to the reference graph constructor.
    :param snp_file: forwarded to the reference graph constructor.
    :param experiment_name: forwarded to ``seq_lib.library_itit``.
    :param output_results: forwarded to ``annotate_and_output_results``.
    :param disable_cycle_breaking: when true, the cycle check on the patient
        graph is skipped (the reference graph is still checked).
    :return: ``None`` (the recursive retry also returns ``None``).

    The process is terminated with ``sys.exit(0)`` when cycles are found
    while ``kmer_length`` has reached 70.
    """
    import seq_lib as seq_lib_module
    seq_lib_module.library_itit(experiment_name)

    # Arguments shared by both retry calls; kmer_length and fastq_files are
    # supplied at each call site because they differ between the two.
    retry_kwargs = dict(min_support_percentage=min_support_percentage,
                        n_permutations=n_permutations,
                        p_value_threshold=p_value_threshold,
                        max_len=max_len,
                        sample_key=sample_key,
                        fasta_file=fasta_file,
                        snp_file=snp_file,
                        experiment_name=experiment_name,
                        output_results=output_results)

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s",
                kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        # Guard against k growing past the allowed limit
        if kmer_length >= 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)
        logger.info("[Reference graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1,
                              fastq_files=fastq_files,
                              disable_cycle_breaking=disable_cycle_breaking,
                              **retry_kwargs)

    # g_patient construction
    logger.info(
        "Will build patient graph for %s with k==%d and minimum support = %dpct",
        fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Are there cycles in the patient graph?
    if not disable_cycle_breaking and list(
            nx.simple_cycles(g_patient.dbgclean)):
        # Guard against k growing past the allowed limit
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        logger.info("[Sample graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        # fastq_files was split above: re-join it, the callee expects a
        # string. disable_cycle_breaking is necessarily false on this path,
        # so the callee's default applies, as before.
        return process_sample(kmer_length=kmer_length + 1,
                              fastq_files=",".join(fastq_files),
                              **retry_kwargs)

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length,
                                   min_support_percentage, max_len)

    ### Permutation test ###
    logger.info("Will create random graphs")
    # The random graphs are restricted to the k-mers of the candidate paths
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    for _, _ in time_iterator(range(0, n_permutations),
                              logger,
                              msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage,
                       kmer_length,
                       restrict_to=all_possible_kmers,
                       seq_lib_module=seq_lib_module)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path,
                                                alteration.alternative_path,
                                                alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])

    logger.info("Will generate p-values for %d possible alterations",
                len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()

    g_patient.significant_alteration_list_init(
        p_value_threshold=p_value_threshold)

    # Annotation
    annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: one row per SNP whose second node is present in the cleaned graph;
    # the first count is 0 when the first node is absent from the cleaned
    # graph. The file is closed (and flushed) when the block exits.
    dir_stat = get_or_create_dir("output/snp")
    with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            snp_nodes = g_reference.snp[snp_id]
            if snp_nodes[1] not in g_patient.dbgclean:
                continue
            if snp_nodes[0] in g_patient.dbgclean:
                first_count = len(
                    g_patient.dbg.node[snp_nodes[0]]['read_list_n'])
            else:
                first_count = 0
            second_count = len(
                g_patient.dbg.node[snp_nodes[1]]['read_list_n'])
            graph_snp.write("%s\t%s\t%d\t%d\n" %
                            (sample_key, snp_id, first_count, second_count))
Beispiel #8
0
def process_sample(kmer_length,
                   min_support_percentage,
                   n_permutations,
                   p_value_threshold,
                   sample_key=None,
                   fastq_files=None,
                   fasta_file=None,
                   snp_file=None,
                   experiment_name=None,
                   destination_directory=".",
                   export_gml=False,
                   output_results=None):
    """Build reference and patient graphs for one sample and report alterations.

    The function builds a reference graph and a patient graph for the given
    k, retries itself with ``kmer_length + 1`` whenever one of the two graphs
    contains a cycle, runs ``n_permutations`` random graphs to fill the
    per-alteration random distributions, computes p-values, and writes
    several TSV reports (plus optional GML files).

    Args:
        kmer_length: value of k used to build every graph.
        min_support_percentage: forwarded to ``graph_cleaned_init`` and
            ``alteration_list_init`` of the patient graph; also embedded in
            the names of the "clean" GML files.
        n_permutations: number of random graphs to generate.
        p_value_threshold: forwarded to ``significant_alteration_list_init``.
        sample_key: string used in every output file name and as a column in
            the reports (it is concatenated into paths, so it must be a
            string in practice).
        fastq_files: comma-separated string of FASTQ paths; it is split on
            "," before being handed to the patient graph.
        fasta_file: forwarded to the reference graph constructor.
        snp_file: forwarded to the reference graph constructor.
        experiment_name: when equal to "TP53", the TP53-specific random graph
            implementation is imported and the annotation step is run.
        destination_directory: directory receiving the GML files when
            ``export_gml`` is true.
        export_gml: when true, visualisation graphs are written as GML.
        output_results: forwarded to ``annotate_and_output_results``.

    Returns:
        None on normal completion; when a retry with a larger k happens, the
        value returned by that nested call.

    Side effects:
        Writes TSV files under "output/statistics" and "output/snp", and GML
        files under ``destination_directory`` when requested. Calls
        ``sys.exit(0)`` if cycles remain once ``kmer_length`` exceeds 70.
    """
    if experiment_name == "TP53":
        from randomreadsgraph_TP53 import RandomReadsGraph as RRG
    else:
        from randomreadsgraph import RandomReadsGraph as RRG

    def retry_with_larger_k():
        # Restart the whole analysis with k + 1; every other argument is
        # passed through unchanged (fastq_files is still the original
        # comma-separated string here).
        return process_sample(kmer_length=kmer_length + 1,
                              sample_key=sample_key,
                              fastq_files=fastq_files,
                              fasta_file=fasta_file,
                              snp_file=snp_file,
                              experiment_name=experiment_name,
                              min_support_percentage=min_support_percentage,
                              n_permutations=n_permutations,
                              destination_directory=destination_directory,
                              export_gml=export_gml,
                              p_value_threshold=p_value_threshold,
                              output_results=output_results)

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s",
                kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Are there cycles in the reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        # Make sure k does not grow past its upper limit.
        if kmer_length > 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)
        # Log the k that the retry will actually use.
        logger.info("[Reference graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return retry_with_larger_k()

    # g_patient construction
    logger.info(
        "Will build patient graph for %s with k==%d and minimum support = %dpct",
        fastq_files, kmer_length, min_support_percentage)
    fastq_file_list = fastq_files.split(",")
    g_patient = PG(fastq_file_list, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Are there cycles in the cleaned patient graph?
    if list(nx.simple_cycles(g_patient.dbgclean)):
        # Make sure k does not grow past its upper limit.
        if kmer_length > 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)
        # Log the k that the retry will actually use.
        logger.info("[Sample graph] Increasing k to %d to remove cycles",
                    kmer_length + 1)
        return retry_with_larger_k()

    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")

    # graph stat: one line with sizes and zero in/out-degree node counts for
    # the raw and cleaned patient graphs.
    with open(dir_stat + "/graph_stat_file" + sample_key + ".tsv",
              'w') as graph_stat_file:
        graph_stat_file.write(
            "%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" %
            (kmer_length, g_reference.dbg.size(),
             sample_key, g_patient.coverage['total'],
             g_patient.dbg.size(), g_patient.dbgclean.size(),
             g_patient.dbg.in_degree().values().count(0),
             g_patient.dbg.out_degree().values().count(0),
             g_patient.dbgclean.in_degree().values().count(0),
             g_patient.dbgclean.out_degree().values().count(0)))

    # kmer stat: one line per node of the raw patient graph.
    with open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv",
              'w') as kmer_stat_file:
        for node_print in g_patient.dbg.nodes():
            node_data = g_patient.dbg.node[node_print]
            fragment_print = ",".join(node_data['fastq_id'])
            reads_print = len(node_data['read_list_n'])
            kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (
                sample_key,
                node_print,
                fragment_print,
                reads_print,
            ))

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)

    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length,
                                   min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    # Random graphs are restricted to the k-mers that appear on any
    # reference or alternative path of a candidate alteration.
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    # The values yielded by time_iterator are not used; only the number of
    # iterations matters here.
    for _i, _j in time_iterator(range(0, n_permutations),
                                logger,
                                msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage,
                       kmer_length,
                       restrict_to=all_possible_kmers)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path,
                                                alteration.alternative_path,
                                                alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])

    logger.info("Will generate p-values for %d possible alterations",
                len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()

    g_patient.significant_alteration_list_init(
        p_value_threshold=p_value_threshold)

    # If more than one significant alteration, check that they are not in "spike"
    if len(g_patient.significant_alteration_list) > 1:
        g_patient.multiple_alternative_path_filter()

    ## Stat
    # alteration stat: one line per alteration whose pvalue_ratio is <= 1,
    # numbered from 1 by position in alteration_list.
    with open(dir_stat + "/alt_stat_file" + sample_key + ".tsv",
              'w') as alt_stat_file:
        for alteration_number, alteration in enumerate(
                g_patient.alteration_list, 1):
            if alteration.pvalue_ratio <= 1:
                alt_stat_file.write(
                    "%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" %
                    (alteration_number, sample_key,
                     g_patient.coverage['total'],
                     alteration.reference_sequence,
                     alteration.alternative_sequence,
                     alteration.reference_read_count,
                     alteration.alternative_read_count,
                     alteration.ratio_read_count,
                     alteration.pvalue_ratio,
                     str(alteration.zscore),
                     "\t".join(map(str, alteration.random_ratio_list))))

    # For visualisation
    if export_gml:
        graph_name = "G_%s_" % sample_key
        merged_graph_name = "G_%s_merged_" % sample_key
        cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
        merged_cleaned_graph_name = (
            graph_name + "clean%d_merged_" % min_support_percentage)

        def gml_path(stem):
            # Every GML file name ends with the k used to build the graph.
            return (destination_directory + "/" + stem + str(kmer_length) +
                    ".gml")

        logger.info("Will save viz graph for %s with k==%d", sample_key,
                    kmer_length)
        get_or_create_dir(destination_directory)
        # for the reference graph
        g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
        g_reference_visu = VISU.reference_graph_visualization_formatting(
            g_reference.dbg.copy())
        g_reference_merge_visu = (
            VISU.reference_graph_merged_visualization_formatting(
                g_reference_merge.copy()))
        nx.write_gml(g_reference_visu, gml_path("g_reference_visu"))
        nx.write_gml(g_reference_merge_visu,
                     gml_path("g_reference_merge_visu"))
        # for the patient graph
        g_patient_visu = VISU.individu_graph_visualization_formating(
            g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_clean_visu = VISU.individu_graph_visualization_formating(
            g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(),
                                                     g_reference.dbg.copy())
        g_patient_merged_visu = (
            VISU.individu_graph_merged_visualization_formating(
                g_patient_merged.copy(), g_reference.dbg.copy()))
        g_patient_clean_merged = VISU.merge_individu_graph(
            g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_clean_merged_visu = (
            VISU.individu_graph_merged_visualization_formating(
                g_patient_clean_merged.copy(), g_reference.dbg.copy()))
        nx.write_gml(g_patient_visu, gml_path(graph_name))
        nx.write_gml(g_patient_clean_visu, gml_path(cleaned_graph_name))
        nx.write_gml(g_patient_merged_visu, gml_path(merged_graph_name))
        nx.write_gml(g_patient_clean_merged_visu,
                     gml_path(merged_cleaned_graph_name))

    # Annotation
    if experiment_name == "TP53":
        annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: for each SNP whose second node survives cleaning, report the read
    # counts of both of its nodes (0 for the first node when it was removed
    # by cleaning). Membership is tested on the cleaned graph, counts are
    # read from the raw graph.
    # NOTE(review): entries [0] and [1] presumably are the two alleles of the
    # SNP; confirm against the reference graph class.
    snp_dir = get_or_create_dir("output/snp")
    with open(snp_dir + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            snp_nodes = g_reference.snp[snp_id]
            if snp_nodes[1] not in g_patient.dbgclean:
                continue
            if snp_nodes[0] in g_patient.dbgclean:
                first_count = len(
                    g_patient.dbg.node[snp_nodes[0]]['read_list_n'])
            else:
                first_count = 0
            second_count = len(
                g_patient.dbg.node[snp_nodes[1]]['read_list_n'])
            graph_snp.write("%s\t%s\t%d\t%d\n" %
                            (sample_key, snp_id, first_count, second_count))