Python SigProfilerMatrixGeneratorFunc Exemples, SigProfilerMatrixGenerator.scripts.SigProfilerMatrixGeneratorFunc Python Exemples

Exemple #1

0

Afficher le fichier

def main():
    """Run SigProfilerMatrixGeneratorFunc (with plotting) on a folder of VCFs.

    Command-line options:
        -v / --vcf_file_folder  folder containing small variant VCF files
        -n / --name             project label (default "mut_spec")
        -r / --reference        reference genome name (default "GRCh37")

    When the VCF folder is not given or does not exist, the usage text is
    printed and the process exits with status 1 so that calling pipelines
    can detect the failure.
    """
    # Initiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-v",
        "--vcf_file_folder",
        help="path to folder containing small variant VCF files")
    parser.add_argument("-n",
                        "--name",
                        help="string to associate with the files/figures",
                        default="mut_spec")
    parser.add_argument("-r",
                        "--reference",
                        help="Must be one of GRCh38,GRCh37,mm10,mm9,etc.",
                        default="GRCh37")

    # Get at the arguments
    args = parser.parse_args()
    folder_is_usable = (args.vcf_file_folder is not None
                        and os.path.exists(args.vcf_file_folder))
    if not folder_is_usable:
        parser.print_help()
        # A missing/invalid input folder is an error, not a success.
        sys.exit(1)

    # The reference genome is expected to be installed already (e.g. when the
    # container is built); it is not installed here.
    matGen.SigProfilerMatrixGeneratorFunc(args.name,
                                          args.reference,
                                          args.vcf_file_folder,
                                          plot=True)

Exemple #2

0

Afficher le fichier

def load(mut_class):
    """Run SigProfilerMatrixGeneratorFunc for a mutation-class object.

    Reads ``project_name``, ``reference_genome``, ``vcf`` and ``exome`` from
    ``mut_class`` and forwards them as the ``project``, ``genome``,
    ``vcfFiles`` and ``exome`` keyword arguments. The value produced by the
    generator is discarded; this function always returns ``None``.
    """
    generator_kwargs = {
        "project": mut_class.project_name,
        "genome": mut_class.reference_genome,
        "vcfFiles": mut_class.vcf,
        "exome": mut_class.exome,
    }
    matGen.SigProfilerMatrixGeneratorFunc(**generator_kwargs)
    return None

Exemple #3

0

Afficher le fichier

def main():
    """Generate matrices from the directory that holds the given MAF file.

    The directory containing ``args.maf`` is used as the input folder (the
    generator's output is produced there as well, per the original note).
    Returns an empty tuple.
    """
    cli_args = parse_arguments()

    # The generator is pointed at the MAF's own directory.
    maf_directory = os.path.dirname(cli_args.maf)

    mg.SigProfilerMatrixGeneratorFunc(cli_args.project,
                                      cli_args.ref,
                                      maf_directory,
                                      tsb_stat=True)
    return tuple()

Exemple #4

0

Afficher le fichier

Fichier : install.py Projet : xtmgah/SigProfilerMatrixGenerator

def _count_flagged_columns(orig_path, new_path, is_flagged):
    """Count columns whose cosine similarity between two matrices is flagged.

    Both tab-separated files are read with their first row as header. Column
    0 is skipped and only the column positions present in both files are
    compared. ``is_flagged`` receives the cosine similarity of one column
    pair and returns True when that column should be counted.
    """
    data_orig = pd.read_csv(orig_path, sep='\t', header=0)
    data_new = pd.read_csv(new_path, sep='\t', header=0)
    shared_columns = min(data_orig.shape[1], data_new.shape[1])

    flagged = 0
    for i in range(1, shared_columns):
        orig_values = list(data_orig.iloc[:, i])
        new_values = list(data_new.iloc[:, i])
        similarity = 1 - spatial.distance.cosine(orig_values, new_values)
        if is_flagged(similarity):
            flagged += 1
    return flagged


def benchmark(genome, ref_dir):
    """Generate the benchmark matrices for ``genome`` and verify them.

    Runs SigProfilerMatrixGeneratorFunc on the bundled ``<genome>_bench`` VCF
    folder, then compares the new SBS96 and SBS6144 matrices column by column
    (cosine similarity) with the stored reference matrices.

    Parameters:
        genome: reference genome name used to build the benchmark paths.
        ref_dir: ignored; the directory of this file is always used instead
            (kept only so existing callers keep working).

    A warning is printed for each matrix that fails its check. The success
    message is printed only when both checks pass; the elapsed time is always
    reported.
    """
    ref_dir = os.path.dirname(os.path.abspath(__file__))
    vcf_path = ref_dir + "/references/vcf_files/" + genome + "_bench/"
    failure_message = (
        "There seems to be some errors in the newly generated matrix. The installation may not have been successful."
    )

    start_time = time.time()
    matGen.SigProfilerMatrixGeneratorFunc(genome + "_bench", genome, vcf_path)

    original_matrix_96 = ref_dir + "/scripts/Benchmark/" + genome + "_bench_orig_96.txt"
    original_matrix_3072 = ref_dir + "/scripts/Benchmark/" + genome + "_bench_orig_3072.txt"
    new_matrix_96 = vcf_path + "output/SBS/" + genome + "_bench.SBS96.all"
    new_matrix_3072 = vcf_path + "output/SBS/" + genome + "_bench.SBS6144.all"

    ############# Cosine Test ###################################################
    # SBS96 columns must match the reference. A tiny tolerance replaces the
    # exact float comparison so rounding noise is not reported as an error;
    # NaN similarities are still flagged, as before.
    mismatches_96 = _count_flagged_columns(
        original_matrix_96, new_matrix_96,
        lambda similarity: not (abs(similarity - 1) <= 1e-9))
    if mismatches_96 != 0:
        print(failure_message)

    # The second comparison only requires a similarity above 0.85.
    mismatches_3072 = _count_flagged_columns(
        original_matrix_3072, new_matrix_3072,
        lambda similarity: similarity <= 0.85)
    if mismatches_3072 != 0:
        print(failure_message)

    # The reported time covers matrix generation plus the verification above.
    end_time = time.time()
    if mismatches_96 == 0 and mismatches_3072 == 0:
        # Do not claim success right after reporting errors.
        print("Installation was succesful.")
    print("SigProfilerMatrixGenerator took " +
          str(end_time - start_time) + " seconds to complete.")

Exemple #5

0

Afficher le fichier

    def generate():
        """Run the signature pipeline on ``uploads`` and yield progress lines.

        Each yielded string has the form ``data:<n>`` followed by a blank
        line, where ``n`` goes 1, 1, 34, 67, 100 as the stages complete.
        When ``uploads`` contains no ``.vcf`` file, only the two initial
        lines are yielded and nothing is run.

        Stages: matrix generation, the R script, the plotting script, then
        cleanup of intermediate folders and uploaded VCFs, and finally
        zipping what remains in ``uploads``.
        """
        print(genome_ref, mutationalPattern, sigflow, sigfit, deconstructSigs)
        x = 1
        yield "data:" + str(x) + "\n\n"
        yield "data:" + str(x) + "\n\n"

        if not glob.glob("uploads/*.vcf"):
            return

        # NOTE(review): the genome is hard-coded to 'GRCh37' here although
        # genome_ref is passed to the R script below - confirm this is intended.
        matGen.SigProfilerMatrixGeneratorFunc("MetaMutationalSigs",
                                              'GRCh37', "uploads")
        x = x + 33
        yield "data:" + str(x) + "\n\n"
        subprocess.call([
            'Rscript', "../meta_sig_main_flask.r", "uploads", genome_ref,
            mutationalPattern, sigflow, sigfit, deconstructSigs
        ])
        x = x + 33
        yield "data:" + str(x) + "\n\n"
        subprocess.call([
            'python3.8', "../plot_graphs.py", "uploads", mutationalPattern,
            sigflow, sigfit, deconstructSigs
        ])

        # Drop the generator's working folders so they are not zipped.
        shutil.rmtree("uploads" + "/input")
        shutil.rmtree("uploads" + "/logs")
        shutil.rmtree("uploads" + "/output")

        # Remove the uploaded VCFs themselves before archiving the results.
        for entry in os.listdir("uploads"):
            if entry.endswith(".vcf"):
                os.remove(os.path.join("uploads", entry))

        # The context manager closes the archive even if zipdir raises.
        with zipfile.ZipFile("metaMutationalSignatures_results.zip", 'w',
                             zipfile.ZIP_DEFLATED) as zipf:
            zipdir("./uploads/", zipf)

        if not os.path.isdir(app.config['UPLOAD_FOLDER']):
            os.mkdir(app.config['UPLOAD_FOLDER'])
        x = x + 33
        yield "data:" + str(x) + "\n\n"

Exemple #6

0

Afficher le fichier

Fichier : SigProfilerSimulator.py Projet : mdbarnesUCSD/SigProfilerSimulator

def SigProfilerSimulator (project, project_path, genome, contexts, exome=None, simulations=1, updating=False, bed_file=None, overlap=False, gender='female', seqInfo=False, chrom_based=False, seed_file=None, spacing=1, noisePoisson=False, noiseAWGN=0, cushion=100, region=None, vcf=False, mask=None):
	'''
	Simulates mutations for a project, creating any missing matrices and
	context-distribution files first.

	Parameters (as used by this function):
		project			-> name used in matrix, log and output file names
		project_path	-> project folder; a trailing "/" is added when missing
		genome			-> reference genome name; GRCh37/GRCh38 and mm9/mm10 are
						   recognised, anything else is treated as "custom"
		contexts		-> [] must be a list; it is sorted IN PLACE (reverse order)
		exome			-> restrict to the exome (uses the exome interval list)
		simulations		-> number of simulation iterations
		bed_file		-> optional BED file restricting the simulated ranges
		gender			-> 'female' (any case) removes chromosome Y
		chrom_based		-> use per-chromosome matrices
		seed_file		-> file with one integer seed per line; defaults to the
						   packaged seeds.txt
		region			-> when given, replaces the chromosome list by [region]
		vcf				-> when true, a per-sample output folder is created
		updating, overlap, seqInfo, spacing, noisePoisson, noiseAWGN, cushion,
		mask			-> forwarded unchanged to the simulator workers

	Side effects: writes .err/.out logs under project_path/logs/ (sys.stderr
	is redirected to the .err file while running and restored on normal
	completion), deletes and recreates the simulation output folder, and may
	call sys.exit() when the chromosome strings are not installed.
	'''
	print("\n======================================\n        SigProfilerSimulator        \n======================================\n\nChecking for all reference files and relevant matrices...")
	start_run = time.time()

	# Ensures proper string for the project's path
	if project_path[-1] != "/":
		project_path += "/"

	# Sorts the user-provided contexts
	contexts.sort(reverse=True)


	bed = False
	if bed_file:
		bed = True
	exome_file = None

	# Asigns a species based on the genome parameter
	species = None
	if genome.upper() == 'GRCH37' or genome.upper() == 'GRCH38': 
		species = "homo_sapiens"
	elif genome.upper() == 'MM10' or genome.upper() == 'MM9': 
		species = "mus_musculus"
	else:
		species = "custom"

	############################## References ###########################################################################################################
	chromosomes = ['X', 'Y', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 
				   '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']
	
	tsb_ref = {0:['N','A'], 1:['N','C'], 2:['N','G'], 3:['N','T'],
			   4:['T','A'], 5:['T','C'], 6:['T','G'], 7:['T','T'],
			   8:['U','A'], 9:['U','C'], 10:['U','G'], 11:['U','T'],
			   12:['B','A'], 13:['B','C'], 14:['B','G'], 15:['B','T'],
			   16:['N','N'], 17:['T','N'], 18:['U','N'], 19:['B','N']}

	tsb_ref_rev = {'N':{'A':0, 'C':1, 'G':2, 'T':3, 'N':16},
				   'T':{'A':4, 'C':5, 'G':6, 'T':7, 'N':17},
				   'U':{'A':8, 'C':9, 'G':10, 'T':11, 'N':18},
				   'B':{'A':12, 'C':13, 'G':14, 'T':15, 'N':19}}
				   
	if species == 'mus_musculus':
		# Keeps X, Y and autosomes 1-19
		chromosomes = chromosomes[:21]

	chromosome_string_path, ref_dir = matRef.reference_paths(genome)
	if species == 'custom':
		# For custom genomes the chromosome list is read from the installed files
		chromosome_string_path, ref_dir = matRef.reference_paths(genome)
		chromosomes = os.listdir(chromosome_string_path)
		if ".DS_Store" in chromosomes:
			chromosomes.remove(".DS_Store")

		chromosomes = [x.split(".")[0] for x in chromosomes if len(x.split(".")[0]) < 8]
		if genome == 'yeast':
			chromosomes = sorted(chromosomes, key = lambda x: (['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI'].index(x)))
	if gender == 'female' or gender.upper() == 'FEMALE':
		if "Y" in chromosomes:
			chromosomes.remove('Y')

	if region:
		chromosomes = [region]
	############################## Log and Error Files ##################################################################################################
	time_stamp = datetime.date.today()
	error_file = project_path + 'logs/SigProfilerSimulator_' + project + "_" + genome + "_" + str(time_stamp) + ".err"
	log_file = project_path + 'logs/SigProfilerSimulator_' + project + "_" + genome + "_" + str(time_stamp) + ".out"

	if not os.path.exists(project_path + "logs/"):
		os.makedirs(project_path + "logs/")

	if os.path.exists(error_file):
		# os.system("rm " + error_file)
		os.remove(error_file)
	if os.path.exists(log_file):
		# os.system("rm " + log_file)
		os.remove(log_file)


	# Remember the caller's stderr so it can be restored once the run is done
	original_stderr = sys.stderr
	sys.stderr = open(error_file, 'w')
	log_out = open(log_file, 'w')
	log_out.write("THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
	log_out.write("-------System Info-------\n")
	log_out.write("Operating System Name: "+ platform.uname()[0]+"\n"+"Nodename: "+ platform.uname()[1]+"\n"+"Release: "+ platform.uname()[2]+"\n"+"Version: "+ platform.uname()[3]+"\n")
	log_out.write("\n-------Python and Package Versions------- \n")
	log_out.write("Python Version: "+str(platform.sys.version_info.major)+"."+str(platform.sys.version_info.minor)+"."+str(platform.sys.version_info.micro)+"\n")
	log_out.write("SigProfilerSimulator Version: "+sigSim.__version__+"\n")
	log_out.write("SigProfilerMatrixGenerator Version: "+sig.__version__+"\n")
	log_out.write("numpy version: "+np.__version__+"\n")
	
	log_out.write("\n-------Vital Parameters Used for the execution -------\n")
	# Argument order matches the labels: Project, Genome, Input File Path, ...
	log_out.write("Project: {}\nGenome: {}\nInput File Path: {}\ncontexts: {}\nexome: {}\nsimulations: {}\nupdating: {}\nbed_file: {}\noverlap: {}\ngender: {}\nseqInfo: {}\nchrom_based: {}\nseed_file: {}\n".format(project, genome, project_path, contexts, str(exome), str(simulations),  str(updating), str(bed_file), str(overlap), gender, str(seqInfo), str(chrom_based), str(seed_file)))
	log_out.write("\n-------Date and Time Data------- \n")
	tic = datetime.datetime.now()
	log_out.write("Date and Clock time when the execution started: "+str(tic)+"\n\n\n")
	


	############################## Pre-simulation Checks ##################################################################################################
	# Ensures that the chromosome strings are saves properly:
	chromosome_string_path, ref_dir = matRef.reference_paths(genome)
	if os.path.exists(chromosome_string_path) == False or len(os.listdir(chromosome_string_path)) < len(chromosomes):
		print("     The chromosome strings were not saved properly or have not been created yet. Please refer to the SigProfilerMatrixGenerator README for installation instructions:\n\thttps://github.com/AlexandrovLab/SigProfilerMatrixGenerator")
		sys.exit()
	# Ensures that the chromosome proportions are saved: 
	if os.path.exists(chromosome_string_path + genome + "_proportions.txt") == False:
		print("     Chromosome proportion file does not exist. Creating now...", end='')
		chromosomeProbs = simScript.chrom_proportions(chromosome_string_path, genome, chromosomes)
		print("Completed!")

	if bed_file:
		print("     Creating a chromosome proportion file for the given BED file ranges...", end='')
		chromosomeProbs = simScript.chrom_proportions_BED(bed_file, chromosome_string_path, genome, chromosomes)
		print("Completed!")

	# Ensures that the mutational matrices exist:
	catalogue_files = {}	
	for context in contexts:
		matrix_path = project_path + "output/"
		if context == 'DINUC' or 'DBS' in context:
			context_folder = 'DBS'
			matrix_path = matrix_path + context_folder + "/"
			if context == 'DBS' or context == 'DINUC' or context == '78':
				file_name = ".DBS78"
			else:
				file_name = '.' + context 
		elif context == 'INDEL' or 'ID' in context or '415' in context:
			context_folder = 'ID'
			matrix_path = matrix_path + context_folder + "/"
			if context == 'INDEL' or context == 'ID' or context == '83':
				file_name = '.ID83'
			else:
				file_name = "." + context
		else:
			context_folder = 'SBS'
			matrix_path = matrix_path + context_folder + "/"
			file_name = '.SBS' + context

		if exome:
			catalogue_file = matrix_path + project + file_name + '.exome'
		else:
			if bed_file:
				catalogue_file = matrix_path + project + file_name + '.region'
			else:
				catalogue_file = matrix_path + project + file_name + '.all'

	
		catalogue_files[context] = catalogue_file

		vcf_files_1 = project_path
		vcf_files_2 = project_path + "input/"
		if chrom_based:
			if os.path.exists (catalogue_file + '.chr1') == False:
				if os.path.exists (vcf_files_2) == False and len(os.listdir(vcf_files_1)) == 0:
					print ("     Please place your vcf files for each sample into the 'references/vcf_files/[project]/' directory. Once you have done that, rerun this script.")
				else:
					print("     Matrices per chromosomes do not exist. Creating the matrix files now.")
					matGen.SigProfilerMatrixGeneratorFunc(project, genome, project_path ,plot=False, exome=exome, bed_file=bed_file, chrom_based=True, cushion=cushion)
					# print("The matrix file has been created. Continuing with simulations...")
			if os.path.exists (catalogue_file) == False:
				if os.path.exists (vcf_files_2) == False and len(os.listdir(vcf_files_1)) == 0:
					print ("     Please place your vcf files for each sample into the 'references/vcf_files/[project]/' directory. Once you have done that, rerun this script.")
				else:
					print("     " + catalogue_file + " does not exist. Creating the matrix file now.")
					matGen.SigProfilerMatrixGeneratorFunc(project, genome, project_path ,plot=False, exome=exome, bed_file=bed_file, cushion=cushion)
					# print("The matrix file has been created. Continuing with simulations...")


		else:	
			if os.path.exists (catalogue_file) == False:# or bed_file:
				if os.path.exists (vcf_files_2) == False and len(os.listdir(vcf_files_1)) == 0:
					print ("     Please place your vcf files for each sample into the 'references/vcf_files/[project]/' directory. Once you have done that, rerun this script.")
				else:
					print("     " + catalogue_file + " does not exist. Creating the matrix file now.")
					matGen.SigProfilerMatrixGeneratorFunc(project, genome, project_path ,plot=False, exome=exome, bed_file=bed_file, cushion=cushion)
					# print("The matrix file has been created. Continuing with simulations...")

	if exome:
		# NOTE(review): this path is built from ref_dir + "/references/...", while the exome
		# interval list used further below is built from ref_dir + "/SigProfilerMatrixGenerator/references/..."
		# (and ref_dir may be rebound in between) - confirm both resolve to the same file.
		exome_file = ref_dir + "/references/chromosomes/exome/" + genome + "/" + genome + "_exome.interval_list"

	# Esnures that the nucleotide context files are saved properly
	nucleotide_context_files = {}
	for context in contexts:
		nucleotide_context_file = chromosome_string_path.split("/")
		ref_path = nucleotide_context_file[:-3]
		ref_path = '/'.join([x for x in ref_path])
		nucleotide_context_file = ref_path + '/context_distributions/'
		
		if bed_file:
			if region:
				nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + ".csv"
			else:
				nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + "_BED.csv"
		else:
			if exome:
				nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + "_exome.csv"
			else:
				nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + ".csv"

		nucleotide_context_files[context] = nucleotide_context_file
		# A BED-specific distribution is always rebuilt for the current BED file
		if os.path.exists(nucleotide_context_file) == True and bed and not region:
			os.remove(nucleotide_context_file)

		if os.path.exists(nucleotide_context_file) == False and (context != 'INDEL' and context != 'ID' and context != 'ID415'):
			print("     The context distribution file does not exist. This file needs to be created before simulating. This may take several hours...")
			if bed:
				output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '_BED.csv'
				context_dist.context_distribution_BED(context, output_file, chromosome_string_path, chromosomes, bed, bed_file, exome, exome_file, genome, ref_path, tsb_ref, gender)
			elif exome:
				output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '_exome.csv'
				context_dist.context_distribution_BED(context, output_file, chromosome_string_path, chromosomes, bed, bed_file, exome, exome_file, genome, ref_dir, tsb_ref, gender)
			else:
				output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '.csv'
				context_dist.context_distribution(context, output_file, chromosome_string_path, chromosomes, tsb_ref, genome)
			print("     The context distribution file has been created!")
			if gender == 'female' or gender.upper() == 'FEMALE':
				if "Y" in chromosomes:
					chromosomes.remove('Y')


	############################## Set-up output files ##################################################################################################
	context_string = "_".join(contexts)
	if bed_file:
		output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '_BED/'
	elif exome:
		output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '_exome/'
	else:
		output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '/'


	# Any previous simulation output for the same settings is discarded
	if os.path.exists(output_path):
		shutil.rmtree(output_path)
		os.makedirs(output_path)
	else:
		os.makedirs(output_path)

	if "M" in chromosomes:
		chromosomes.remove("M")
	if "MT" in chromosomes:
		chromosomes.remove("MT")
	############################## Begin the simulation process ##################################################################################################
	print()
	if chrom_based:
		sample_names, mut_prep, mut_dict = simScript.mutation_preparation_chromosomes(catalogue_files, matrix_path, chromosomes, project, log_file)
		reference_sample = sample_names[0]
	elif region:
		sample_names, mut_prep, mut_dict = simScript.mutation_preparation_region(catalogue_files, matrix_path, project, log_file, region)
		reference_sample = sample_names[0]		
	else:
		sample_names, mut_prep = simScript.mutation_preparation(catalogue_files, log_file)
		reference_sample = sample_names[0]
		mut_dict = simScript.mut_tracker(sample_names,  mut_prep, reference_sample, nucleotide_context_files, chromosome_string_path, genome, chromosomes, bed_file, log_file)
	

	if vcf:
		if "" in sample_names:
			sample_names.remove("")
		for sample in sample_names:
			if not os.path.exists(output_path + sample + "/"):
				os.makedirs(output_path + sample + "/")

	# Add desired noise if applicable:
	# if noisePoisson or noiseAWGN:
	# 	mut_dict = simScript.noise(mut_dict, noisePoisson, noiseAWGN)

	# Set-up parallelization: one worker per CPU, but never more than there are chromosomes.
	# The worker pools themselves are created right before each parallel stage below.
	processors = mp.cpu_count()
	max_seed = processors
	if processors > len(chromosomes):
		max_seed = len(chromosomes)

	# Chromosomes are dealt round-robin into max_seed bins
	chromosomes_parallel = [[] for i in range(max_seed)]

	chrom_bin = 0
	for chrom in chromosomes:
		if chrom_bin == max_seed:
			chrom_bin = 0
		chromosomes_parallel[chrom_bin].append(chrom)
		chrom_bin += 1

	# Simulation iterations (1..simulations) are dealt round-robin the same way
	iterations_parallel = [[] for i in range(max_seed)]
	iter_bin = 0
	for i in range(1, simulations + 1, 1):
		if iter_bin == max_seed:
			iter_bin = 0
		iterations_parallel[iter_bin].append(i)
		iter_bin += 1


	# Generate unique seeds for each process
	log_out.write("\n-------Seeds for random number generation per process------- \n")
	seeds = []
	if seed_file == None:
		ref_dir, tail = os.path.split(os.path.dirname(os.path.abspath(__file__)))
		seed_file = ref_dir + "/SigProfilerSimulator/seeds.txt"
	with open(seed_file) as f:
		for i in range (0, max_seed, 1):
			# The current time is added, so seeds differ between runs
			new_seed = int(f.readline().strip()) + time.time()
			seeds.append(new_seed)
			log_out.write("Process " + str(i) + ": " + str(new_seed) + "\n")

	log_out.write("\n\n\n-------Runtime Checkpoints------- \n")
	log_out.close()

	if exome:
		# From here on the exome interval list is treated like a BED file
		bed = True
		bed_file = ref_dir + "/SigProfilerMatrixGenerator/references/chromosomes/exome/" + genome + "/" + genome + "_exome.interval_list"

	if seqInfo:
		seqOut_path = project_path + "output/vcf_files/simulations/"
		if not os.path.exists(seqOut_path):
			os.makedirs(seqOut_path)

		for context in contexts:
			if not os.path.exists(seqOut_path + context + "/"):
				os.makedirs(seqOut_path + context + "/")
			else:
				print(seqOut_path+ context + "/")
				shutil.rmtree(seqOut_path+ context + "/")
				os.makedirs(seqOut_path+ context + "/")

	# Stage 1: simulate each chromosome bin in its own worker
	pool = mp.Pool(max_seed)
	results = []
	for i in range (0, len(chromosomes_parallel), 1):
		mut_dict_parallel = {k1:{k2:{k3:{k4:v4 for k4, v4 in v3.items() if k4 in chromosomes_parallel[i]} for k3, v3 in v2.items()} for k2, v2 in v1.items()} for k1, v1 in mut_dict.items()}
		r = pool.apply_async(simScript.simulator, args=(sample_names, mut_dict_parallel, chromosome_string_path, tsb_ref, tsb_ref_rev, simulations, seeds[i], cushion, output_path, updating, chromosomes_parallel[i], project, genome, bed, bed_file, contexts, overlap, project_path, seqInfo, log_file, spacing, noisePoisson, noiseAWGN, vcf, mask))
		results.append(r)
	pool.close()
	pool.join()
	# simScript.simulator(sample_names, mut_dict, chromosome_string_path, tsb_ref, tsb_ref_rev, simulations, seeds[0], output_path, updating, chromosomes, project, genome, bed, bed_file, contexts, overlap, project_path, seqInfo, log_file, spacing, noisePoisson, noiseAWGN)
	for r in results:
		r.wait()
		if not r.successful():
			# Raises an error when not successful
			r.get()

	# Stage 2: combine the per-chromosome files, one worker per iteration bin
	pool = mp.Pool(max_seed)

	#if region:
	bed=False

	combine_results = []
	for i in range (0, len(iterations_parallel), 1):
		r = pool.apply_async(simScript.combine_simulation_files, args=(iterations_parallel[i], output_path, chromosomes, sample_names, bed, exome, vcf))
		combine_results.append(r)
	pool.close()
	pool.join()

	# Check the combine stage's own results so worker failures are not lost
	for r in combine_results:
		r.wait()
		if not r.successful():
			# Raises an error when not successful
			r.get()

	end_run = time.time()
	run_time = end_run - start_run
	log_out = open(log_file, 'a')
	print("Simulation completed\nJob took " , run_time, " seconds", file=log_out)
	print("Simulation completed\nJob took " , run_time, " seconds")
	log_out.close()
	# Close the error log and give the caller its original stderr back
	sys.stderr.close()
	sys.stderr = original_stderr

Exemple #7

0

Afficher le fichier

def single_sample(data,
                  output,
                  ref="GRCh37",
                  sig_database="default",
                  check_rules=True,
                  exome=False):
    """
    Decompose the query samples into the global signatures.

    Parameters
    ----------
    data: string or dataframe. If a string, the path of the folder containing the vcf files; the mutational
    catalogue is then generated with SigProfilerMatrixGenerator. If a dataframe, it is used directly as the mutational
    catalogue, where the row index holds the mutation types and the column names are the sample names.
    output: string. The output folder. It is created when it does not exist.
    ref: string. The name of the reference genome. It should be installed previously through "SigProfilerMatrixGenerator".
    Please see the "INSTALLATION" part of the README.md file. The default reference genome is "GRCh37".
    sig_database: string or dataframe. Any string (default "default") selects the packaged signature set. A dataframe is
    used as a custom signature catalogue where the row index holds the mutation types and the column names are the
    signature names. The sum of each column should be one, and the rows should match the mutational catalogue in number
    and order.
    check_rules: boolean. If True, check the signature rules. Always switched off when `data` is a dataframe.
    Not functional for the custom signature database.
    exome: boolean. Passed to the matrix generator: if True, the mutational profile is generated only for the exome;
    if False, for the whole genome.

    Returns
    -------
    None. The following files are written into the output folder:
        -decomposition_profile.csv
        -signatures.txt
        -Sig_activities.txt
        -Mutation_Probabilities.txt
        -Signature_plot (skipped with a message if plotting fails)
        -dendrogram output produced by the dendrogram helper (skipped with a message if it fails)

    Raises
    ------
    ValueError: if the mutational catalogue contains no samples.

    Example
    -------
    >>> from sigproSS import spss
    >>> data = spss.importdata()
    >>> spss.single_sample(data, "results", ref="GRCh37", exome=False)
    """

    if not os.path.exists(output):
        os.makedirs(output)

    # Folder of the packaged signature data
    paths = cosmic.__path__[0]

    # Load the signature database once
    if isinstance(sig_database, str):
        signatures_names = paths + '/input/signaturesSet.txt'
        wholegenome_singnatures = paths + '/input/genomeSignatures.txt'
        exome_signatures = paths + '/input/exomeSignatures.txt'

        # Extract data from the packaged signature database
        with open(signatures_names, 'r') as names_file:
            signaturesNames = names_file.read().split('\n')
        allGenomeSignatures = np.loadtxt(wholegenome_singnatures)
        allExomeSignatures = np.loadtxt(exome_signatures)
    else:
        # A custom catalogue serves as both the genome and the exome set
        signaturesNames = sig_database.columns
        allGenomeSignatures = np.array(sig_database)
        allExomeSignatures = np.array(sig_database)

    # Check if the input is a vcf folder or a ready-made catalogue
    if isinstance(data, str):

        vcf = data
        # The project name is the last path component, ignoring one trailing "/"
        if vcf.endswith("/"):
            vcf_name = vcf.split("/")[-2]
        else:
            vcf_name = vcf.split("/")[-1]

        data = matGen.SigProfilerMatrixGeneratorFunc(vcf_name,
                                                     ref,
                                                     vcf,
                                                     exome=exome,
                                                     tsb_stat=True)

        p_value = data["7_pvalue"]
        data = data["96"]

    else:
        # No strand-bias p-values are available for a user-supplied catalogue
        p_value = "none"
        check_rules = False

    if data.shape[1] == 0:
        raise ValueError("The mutational catalogue contains no samples.")

    # totalExposures has one row per signature and one column per sample
    number_of_signatures = len(signaturesNames)
    totalExposures = np.zeros([number_of_signatures, data.shape[1]])
    listOfSamples = list(data.columns)

    # Start the decomposition profile file with its header
    profile_path = output + "/decomposition_profile.csv"
    with open(profile_path, "w") as fh:
        fh.write("Sample_Names,Global_NMF_Signatures,Similarity\n")

    for i in range(data.shape[1]):
        print("##########################################################")
        print("Exacting Profile for " + "Sample " + str(i + 1))
        # Keep the sample two-dimensional (one column)
        samples = np.array(data.iloc[:, i:i + 1])
        sampleNames = listOfSamples[i:i + 1]
        # NOTE(review): cancer type and sequencing type are fixed placeholders
        # here - confirm that the analysis does not depend on them.
        cancerType = ['Breast Cancer'] * samples.shape[1]
        seqType = ['WGS'] * samples.shape[1]
        totalMutations = np.sum(samples, axis=0)

        # results contains [indices, exposures, signatureNames, allSignatures, similarity]
        results = analysis_individual_samples(
            samples, check_rules, signaturesNames, allGenomeSignatures,
            allExomeSignatures, sampleNames, cancerType, cancerType, seqType,
            totalMutations, p_value,
            paths + '/input/20181108_Signature_Rules.xml')
        totalExposures[results[0], i] = results[1]
        listOfSignatures = results[2]
        profile = decomposition_profile(totalExposures[:, i], results[4],
                                        results[2], sampleNames[0])

        # Append this sample's profile to the file
        with open(profile_path, "a") as fh:
            fh.write(profile)

    # Prepare the exposures dataframe
    totalExposures = pd.DataFrame(totalExposures)
    totalExposures = totalExposures.set_index(listOfSignatures)
    totalExposures.columns = listOfSamples
    totalExposures = totalExposures.rename_axis("Samples", axis="columns")
    # Convert the floats to integers
    totalExposures[listOfSamples] = totalExposures[listOfSamples].astype(
        np.int64)

    # Remove the rows with all zeros to create the final exposure dataframe
    exposures = totalExposures.loc[~(totalExposures == 0).all(axis=1)]

    # Prepare the signatures dataframe (taken from the last sample's results)
    signatures = pd.DataFrame(results[3])
    signatures.columns = listOfSignatures
    signatures = signatures.set_index(data.index)
    signatures = signatures.rename_axis("Signatures", axis="columns")

    # Filter the signatures by the exposures rows to get the final signature dataframe
    signatures = signatures.loc[:, list(exposures.index)]

    # Create the probabilities
    probability = sub.probabilities(signatures, exposures, data.index,
                                    signatures.columns, totalExposures.columns)
    probability = probability.set_index("Sample Names")
    probability = probability.rename_axis("", axis="columns")

    # The dendrogram is best effort: report the failure but keep going
    try:
        sub.dendrogram(exposures, 0.05, output)
    except Exception as error:
        print("The dendrogram could not be created: " + str(error))

    # Export results
    signatures.to_csv(output + "/signatures.txt",
                      sep="\t",
                      index_label=[signatures.columns.name])
    exposures.to_csv(output + "/Sig_activities.txt",
                     sep="\t",
                     index_label=[exposures.columns.name])
    probability.to_csv(output + "/Mutation_Probabilities.txt", sep="\t")
    try:
        plot.plotSBS(output + "/signatures.txt",
                     output + "/Signature_plot",
                     "",
                     "96",
                     True,
                     custom_text_upper=" ")
    except Exception:
        print(
            "SORRY! THE MUTATION CONTEXT YOU PROVIDED COULD NOT BE PLOTTED\n\n"
        )

    print(
        "CONGRATULATIONS! THE SIGPROFILER SINGLE SAMPLE ANALYSIS ENDED SUCCESSFULLY"
    )
Exemple #8

0

Afficher le fichier

if __name__ == "__main__":
    # Copies the given MAF file into a working directory and runs the matrix
    # generator (GRCh37) on that directory.

    args = parse_args()

    # Default working directory: ./sigprof_input under the current directory
    if args.directory is None:
        args.directory = getcwd() + "/" + "sigprof_input"
    try:
        if not isdir(args.directory):
            mkdir(args.directory)
    except Exception:
        # Best effort, as before: report the problem and carry on.
        print("ERROR: creation of directory", args.directory,
              "failed. Please use the -d option to create a valid directory.")

    # Build the destination once so the failure message shows the exact path
    # that was used for the copy.
    maf_destination = args.directory + "/" + basename(args.maf)
    try:
        shutil.copyfile(args.maf, maf_destination)
    except Exception:
        print("File copy failed.", args.maf, maf_destination)

    matrices = matGen.SigProfilerMatrixGeneratorFunc(args.project,
                                                     "GRCh37",
                                                     args.directory,
                                                     plot=args.plot,
                                                     exome=False,
                                                     bed_file=None,
                                                     chrom_based=False,
                                                     tsb_stat=False,
                                                     seqInfo=False,
                                                     cushion=args.cushion)

Exemple #9

0

Afficher le fichier

from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# Keyword options forwarded to the matrix generator, gathered in one place so
# the call itself only shows the three positional arguments.
generator_options = {
    "plot": True,
    "exome": False,
    "bed_file": None,
    "chrom_based": False,
    "tsb_stat": False,
    "seqInfo": False,
    "cushion": 100,
}

# Positional arguments: "./output/", the "GRCh37" reference genome name, and
# the "./data/" folder.
matrices = matGen.SigProfilerMatrixGeneratorFunc(
    "./output/", "GRCh37", "./data/", **generator_options)

Exemple #10

0

Afficher le fichier

# Flat script: installs a reference genome, generates mutational matrices for
# one sample, then requests a sample portrait plot.
#
# The two commented-out lines below read the output directory and BED file
# from a `snakemake` object; the hard-coded values that follow are used instead.
#out_dir = snakemake.params["out_dir"]
#bed = snakemake.input["bed"]

# Hard-coded run settings.
sample = "SJCBF"
genome_version = "GRCh37"
out_dir = ".tests/"
bed = None  # passed as bed_file below

from SigProfilerMatrixGenerator import install as genInstall

# Install the reference genome named by genome_version before generating
# matrices.
genInstall.install(genome_version, rsync=False, bash=True)

from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# Generate the matrices for `sample` from `out_dir`, with exome mode and
# plotting switched on.
matrices = matGen.SigProfilerMatrixGeneratorFunc(sample,
                                                 genome_version,
                                                 out_dir,
                                                 plot=True,
                                                 exome=True,
                                                 bed_file=bed,
                                                 chrom_based=False,
                                                 tsb_stat=False,
                                                 seqInfo=False,
                                                 cushion=100)

# NOTE(review): matrix_path is built here but never used afterwards.
matrix_path = out_dir + "output/SBS/{sample}.SBS96.exome".format(sample=sample)

from sigProfilerPlotting import sample_portrait as sP

# NOTE(review): sample_matrices_path, output_path and project are not defined
# anywhere in this snippet, so unless they come from elsewhere this call fails
# with NameError. Presumably values derived from out_dir / sample (and perhaps
# matrix_path) were intended -- confirm against the samplePortrait
# documentation before wiring them in.
sP.samplePortrait(sample_matrices_path, output_path, project, percentage=False)

Exemple #11

0

Afficher le fichier

def _remove_output_dir(path):
    """Delete every entry directly inside ``path`` and then ``path`` itself.

    Does nothing when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return
    for entry in os.listdir(path):
        os.remove(os.path.join(path, entry))
    os.rmdir(path)


def main():
    """Generate mutational matrices, then extract signatures per context.

    The matrices are generated from ``args.vcfpath``. For each of the SBS96,
    DBS78 and ID83 contexts, signature extraction is scheduled only when the
    project's column in the corresponding matrix has a positive sum. When the
    sum is not positive, or the matrix cannot be read (for example the key is
    missing), the matching ``<vcfpath>/output/<class>`` folder is removed
    instead.

    The available threads are divided evenly between the scheduled contexts
    (at least one each) and the extractions run concurrently. A failing
    extraction is reported on stdout rather than being silently dropped.
    """
    # Parse and validate arguments
    args = parse_arguments()

    matrices = matGen.SigProfilerMatrixGeneratorFunc(args.project,
                                                     args.genome,
                                                     args.vcfpath,
                                                     plot=True,
                                                     exome=args.exome,
                                                     bed_file=None,
                                                     chrom_based=False,
                                                     tsb_stat=False,
                                                     seqInfo=False,
                                                     cushion=100)

    # (matrix key, signature class / output subfolder, context size)
    contexts = (
        ('96', 'SBS', '96'),
        ('DINUC', 'DBS', '78'),
        ('ID', 'ID', '83'),
    )

    sig_list = []
    for matrix_key, sig_class, sig_context in contexts:
        # Only the matrix lookup is guarded: a missing or malformed matrix is
        # treated the same as one with no mutations.
        try:
            has_mutations = bool(matrices[matrix_key][args.project].sum() > 0)
        except Exception:
            has_mutations = False

        if has_mutations:
            sig_list.append((sig_class, sig_context))
        else:
            _remove_output_dir(args.vcfpath + "/output/" + sig_class)

    num_tasks = len(sig_list)
    if num_tasks > 0:
        cpus_per_task = max(int(args.threads / num_tasks), 1)
        with ThreadPoolExecutor(max_workers=3) as e:
            submitted = [
                (sigClass,
                 e.submit(extractSignatures, args.output, args.vcfpath,
                          args.genome, args.project, sigClass, sigContext,
                          args.exome, cpus_per_task))
                for sigClass, sigContext in sig_list
            ]

        # Leaving the ``with`` block waits for every task; surface failures
        # that would otherwise vanish inside the futures.
        for sigClass, future in submitted:
            error = future.exception()
            if error is not None:
                print("ERROR: signature extraction for", sigClass, "failed:",
                      error)

Exemple #12

0

Afficher le fichier

Fichier : sigprofiler_matrixgenerator.py Projet : edawson/presig

import argparse
from os.path import dirname
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

def parse_args():
    """Parse the command line and return the resulting namespace.

    Options: -i/--input (required), -e/--exome (takes a value, default False),
    -p/--project (default "PROJECT"), -d/--directory (default "input") and the
    -P/--plot flag.
    """
    # One (flags, settings) pair per command-line option.
    option_table = (
        (("-i", "--input"),
         {"dest": "input",
          "help": "MAF file from which to extract matrix.",
          "required": True}),
        (("-e", "--exome"),
         {"dest": "exome",
          "help": "Exome data - restrict genome to exome regions",
          "default": False}),
        (("-p", "--project"),
         {"dest": "project",
          "default": "PROJECT",
          "help": "Project name for output."}),
        (("-d", "--directory"),
         {"dest": "directory",
          "default": "input",
          "help": "Input/Output directory"}),
        (("-P", "--plot"),
         {"dest": "plot",
          "action": "store_true",
          "help": "Output plots of input data."}),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: build the mutational matrices from the folder that
    # holds the input MAF file.

    args = parse_args()

    # Folder containing the input file; this is what the generator receives.
    maf_folder = dirname(args.input)

    # Only fills in args.directory when it is None (parse_args gives
    # --directory the default "input").
    if args.directory is None:
        args.directory = maf_folder

    matrices = matGen.SigProfilerMatrixGeneratorFunc(
        args.project,
        "GRCh37",
        maf_folder,
        plot=args.plot,
        exome=False,
        bed_file=None,
        chrom_based=False,
        tsb_stat=False,
        seqInfo=True,
        cushion=100,
    )

Exemple #13

0

Afficher le fichier

def sigProfilerExtractor(input_type,
                         out_put,
                         input_data,
                         refgen="GRCh37",
                         genome_build='GRCh37',
                         startProcess=1,
                         endProcess=10,
                         totalIterations=8,
                         cpu=-1,
                         hierarchy=False,
                         mtype=["default"],
                         exome=False,
                         par_h=0.90,
                         penalty=0.05,
                         resample=True):
    memory_usage()
    """
    Extracts mutational signatures from an array of samples.
    
    
    Parameters
    ----------
    
    input_type: A string. Type of input. The type of input should be one of the following:
            - "vcf": used for vcf format inputs.
            - "table": used for table format inputs using a tab seperated file.
             
        
    out_put: A string. The name of the output folder. The output folder will be generated in the current working directory. 
            
    input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "table"  type input). The project file or folder should be inside the current working directory. For the "vcf" type input,the project has to be a folder which will contain the vcf files in vcf format or text formats. The "text"type projects have to be a file.   
            
    refgen: A string, optional. The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf".
            
    startProcess: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1 
    
    endProcess: A positive integer, optional. The maximum number of signatures to be extracted. The default value is 10
    
    totalIterations: A positive integer, optional. The number of iteration to be performed to extract each number signature. The default value is 8
            
    cpu: An integer, optional. The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. 
    
    hierarchy: Boolean, optional. Defines if the signature will be extracted in a hierarchical fashion. The default value is "False".
    
    par_h: Float, optional. Ranges from 0 to 1. Default is 0.90. Active only if "hierarchy" is True. Sets the cutoff to select the unexplained samples in a hierarchical layer based on the cosine similarity 
    between the original and reconstructed samples.  
    
    mtype: A list of strings, optional. The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is ["96", "DINUC" , "ID"], where "96" is the SBS96 context, "DINUC"
    is the DINULEOTIDE context and ID is INDEL context. 
            
    exome: Boolean, optional. Defines if the exomes will be extracted. The default value is "False".
    
    penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the threshold cutoff used to assign signatures to a sample.
    
    resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling.  
    
    
    Returns
    -------
    
    After sigProfilerExtractor is successfully executed, an output directory will be generated in the current working directory 
    according to the name of the  parameter of the "out_put" argument. In the "output" directory there will be subfolder 
    for each type of mutational contexts. 
    
    If the "hierarchy" parameter is false, inside of each mutational context subdirectory, there will be subdirectories named 
    "All solutions" and "Final solution". Besides the subdirectories, there will be a file named "results_stat.csv" which 
    will contain the record of the relative reconstruction error and process stability for each number of signatures. 
    Another file named stibility.pdf will contain the plot of recontruction error vs process stability. The "All solution"
    directory will contain the subdirectories for each number of signatures which will further contain the solution files 
    ("signature.txt", "exposure.txt", "probabilities.txt" and a pdf file that depicts the  proportion of the mututaions 
    for each number signatures. On the other hand, the "Final solution" directory contains two subdirectories: "De Novo Solution"
    and "Decomposed Solution". The "De Novo Solution" subdirectory will contain the solution files for the optimum number of 
    "De Novo Signatures" signatures with a dendrogram file where the samples are clustered by the de novo signatures. The "Decomposed 
    Solution" subfolder contains the records  where "De Novo Signatures" are further decomposed into the global signatures. 
    
    If the "hierarchy" parameter is true, inside of each mutational context subdirectory, there will be a subdirectory named
    "All_Solution_by_Layer" which will further contain the solutions  in the layer (L) subdirectories. Everything else will be similar to
    the previously deccribed directory structures. The structure of the result folder is synopsized below:
        
        If Hierarchy is False:
            
        -Mutational Context folder
            -All solution folder
                -Signature folder
                    -exposure.txt file
                    -signature.txt file
                    -probabilities.txt file
                    -signature plot pdf file
            -Selected_Solution folder
                -De_Novo_Solution folder
                    -exposure.txt file
                    -signature.txt file
                    -probabilities.txt file
                    -signature plot pdf file
                    -dendrogram plot file
                -Decomposed_Solution folder
                    -comparison with global signature.csv file
                    -exposure.txt file
                    -signature.txt file
                    -probabilities.txt file
                    -signature plot pdf file
                    -dendrogram plot file
            -results_stat.csv file
            -stability plot pdf
            
                    
        If Hierarchy is True:
            
        -Mutational Context folder
            -All Solution by Layer folder
                -Layer folder (L)
                    -All solution folder
                        -Signature folder
                            -exposure.txt file
                            -signature.txt file
                            -probabilities.txt file
                            -signature plot pdf file
                    -L1_solution folder
                        -exposure.txt file
                        -signature.txt file
                        -probabilities.txt file
                        -signature plot pdf file
                    -results_stat.csv file
                    -stability plot pdf
            -Selected_Solution folder
                -De_Novo_Solution folder
                    -exposure.txt file
                    -signature.txt file
                    -probabilities.txt file
                    -signature plot pdf file
                    -dendrogram plot file
                -Decomposed_Solution folder
                    -comparison with global signature.csv file
                    -exposure.txt file
                    -signature.txt file
                    -probabilities.txt file
                    -signature plot pdf file
                    -dendrogram plot file
            -results_stat.csv file
            -stability plot pdf
    
    Examples
    --------
    
    >>> from sigproextractor import sigpro as sig
    >>> data = sig.importdata("vcf")
    >>> sig.sigProfilerExtractor("vcf", "example_output", data, startProcess=1, endProcess=3)
    
    Wait until the execution is finished. The process may take a couple of hours depending on the size of the data.
    Check the current working directory for the "example_output" folder.
    
    
    """

    #################################### At first create the system data file ####################################
    if not os.path.exists(out_put):
        os.makedirs(out_put)
    sysdata = open(out_put + "/JOB_METADATA.txt", "w")
    sysdata.write(
        "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
    sysdata.write("-------System Info-------\n")
    sysdata.write("Operating System Name: " + platform.uname()[0] + "\n" +
                  "Nodename: " + platform.uname()[1] + "\n" + "Release: " +
                  platform.uname()[2] + "\n" + "Version: " +
                  platform.uname()[3] + "\n")
    sysdata.write("\n-------Python and Package Versions------- \n")
    sysdata.write("Python Version: " + str(platform.sys.version_info.major) +
                  "." + str(platform.sys.version_info.minor) + "." +
                  str(platform.sys.version_info.micro) + "\n")
    sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n")
    sysdata.write("SigprofilerPlotting Version: " +
                  sigProfilerPlotting.__version__ + "\n")
    sysdata.write("SigprofilerMatrixGenerator Version: " +
                  SigProfilerMatrixGenerator.__version__ + "\n")
    sysdata.write("Pandas version: " + pd.__version__ + "\n")
    sysdata.write("Numpy version: " + np.__version__ + "\n")
    sysdata.write("Scipy version: " + scipy.__version__ + "\n")
    sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n")
    sysdata.write("Nimfa version: " + nimfa.__version__ + "\n")

    sysdata.write("\n-------Vital Parameters Used for the execution -------\n")
    #format the project_name first:
    project = input_data  #will use this variable as the parameter for project argument in SigprofilerMatrixGenerator
    if project[-1] != "/":
        project_name = project.split(
            "/"
        )[-1]  #will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator
    else:
        project_name = project.split("/")[-2]
    sysdata.write(
        "input_type: {}\ninputdata: {}\nstartProcess: {}\nendProcess: {}\ntotalIterations: {}\ncpu: {}\nhierarchy: {}\nrefgen: {}\ngenome_build: {}\nmtype: {}\n"
        .format(input_type, project_name, startProcess, endProcess,
                totalIterations, cpu, hierarchy, refgen, genome_build, mtype))

    sysdata.write("\n-------Date and Time Data------- \n")
    tic = datetime.datetime.now()
    sysdata.write("Date and Clock time when the execution started: " +
                  str(tic) + "\n")
    sysdata.close()

    ################################ take the inputs from the mandatory arguments ####################################
    input_type = input_type
    out_put = out_put
    #project = input_data   #the variable was already set above

    ################################ take the inputs from the general optional arguments ####################################
    startProcess = startProcess
    endProcess = endProcess
    totalIterations = totalIterations
    cpu = cpu
    hierarchi = hierarchy

    if input_type == "text" or input_type == "table":

        ################################### For text input files ######################################################

        text_file = project
        title = ""  # set the title for plotting

        data = pd.read_csv(text_file, sep="\t").iloc[:, :]
        data = data.dropna(axis=1, inplace=False)
        data = data.loc[:, (data != 0).any(axis=0)]
        genomes = data.iloc[:, 1:]
        genomes = np.array(genomes)
        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index = data.iloc[:, 0]
        colnames = data.columns[1:]
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    ###############################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "csv":
        ################################# For matlab input files #######################################################

        filename = project
        title = ""  # set the title for plotting

        genomes, index, colnames, mtypes = sub.read_csv(filename)
        allgenomes = genomes.copy()
        allcolnames = colnames.copy()

        # Define the mtypes

        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    #################################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "matobj":
        ################################# For matlab input files #######################################################

        mat_file = project
        title = ""  # set the title for plotting

        mat = scipy.io.loadmat(mat_file)
        mat = sub.extract_input(mat)
        genomes = mat[1]
        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index1 = mat[3]
        index2 = mat[4]
        index = []
        for i, j in zip(index1, index2):
            index.append(i[0] + "[" + j + "]" + i[2])
        colnames = np.array(pd.Series(mat[2]))
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results
        index = np.array(pd.Series(index))

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

        #################################################################################################################

    elif input_type == "vcf":
        ################################# For vcf input files #######################################################

        project = project
        title = project  # set the title for plotting

        refgen = refgen

        exome = exome

        #project_name = project.split("/")[-1]
        data = datadump.SigProfilerMatrixGeneratorFunc(project_name,
                                                       refgen,
                                                       project,
                                                       exome=exome,
                                                       bed_file=None,
                                                       chrom_based=False,
                                                       plot=False,
                                                       gs=False)

        # Selecting the mutation types
        if mtype != ["default"]:
            mkeys = data.keys()
            mtypes = mtype
            if any(x not in mkeys for x in mtypes):
                raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
                                "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")

        else:
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["96", "DINUC", "ID"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["96", "DINUC"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID"]
        #print (mtypes)
        #change working directory

        #set the genome_build
        genome_build = refgen

    else:
        raise ValueError(
            "Please provide a correct input_type. Check help for more details")

    ###########################################################################################################################################################################################
    for m in mtypes:

        # Determine the types of mutation which will be needed for exporting and copying the files
        if not (m == "DINUC" or m == "ID"):
            mutation_type = "SBS" + m

        else:
            if m == "DINUC":
                mutation_type = "DBS78"
            elif m == "ID":
                mutation_type = "ID83"

        if input_type == "vcf":
            genomes = pd.DataFrame(data[m])

            #check if the genome is a nonzero matrix
            shape = genomes.shape
            if shape == (0, 0):
                sysdata = open(out_put + "/JOB_METADATA.txt", "a")
                sysdata.write(
                    "Sample is not a nonzero matrix for the mutation context "
                    + m + "\n")
                print(
                    "Sample is not a nozero matrix for the mutation context " +
                    m)
                sysdata.close()
                continue

            genomes = genomes.loc[:, (genomes != 0).any(axis=0)]
            allgenomes = genomes.copy(
            )  # save the allgenomes for the final results
            index = genomes.index.values
            colnames = genomes.columns
            allcolnames = colnames.copy(
            )  # save the allcolnames for the final results

        #in the plotting funciton "ID" is used as "INDEL"
        if m == "ID":
            m = "INDEL"  #for plotting
        #create output directories to store all the results
        output = out_put + "/" + mutation_type

        est_genomes = np.zeros([1, 1])
        listofsignatures = []
        listofsignaturesSTE = []
        list_of_signature_stabilities = []
        list_of_signature_total_mutations = []

        H_iteration = 1
        flag = True  # We need to enter into the first while loop regardless any condition
        # While loop starts here
        while flag:
            genomes = np.array(genomes)

            information = []
            if hierarchi is True:
                layer_directory = output + "/All_Solution_Layer/L" + str(
                    H_iteration)
            elif hierarchi is False:
                layer_directory = output

            try:
                if not os.path.exists(layer_directory):
                    os.makedirs(layer_directory)
                    #os.makedirs(output+"/pickle_objects")
                    #os.makedirs(output+"/All solutions")

            except:
                print("The {} folder could not be created".format("output"))

            fh = open(layer_directory + "/All_solutions_stat.csv", "w")
            fh.write("Total Signatures,Stability,Matrix Frobenius%\n")
            fh.close()
            # The following for loop operates to extract data from each number of signature

            all_similirities_list = [
            ]  #this list is going to store the dataframes of different similirieties as items
            minimum_stabilities = []
            #similarity_dataframe = pd.DataFrame({"Sample Name": list(colnames)})

            #normatlize the genomes before running nmf
            genomes = sub.normalize_samples(genomes,
                                            normalize=False,
                                            all_samples=False,
                                            number=30000)
            for i in range(startProcess, endProcess + 1):
                current_time_start = datetime.datetime.now()

                #memory_usage()
                processAvg, \
                exposureAvg, \
                processStd, \
                exposureStd, \
                avgSilhouetteCoefficients, \
                clusterSilhouetteCoefficients, \
                finalgenomeErrors, \
                finalgenomesReconstructed, \
                finalWall, \
                finalHall, \
                reconstruction_error, \
                processes = sub.decipher_signatures(genomes= genomes, \
                                                    i = i, \
                                                    totalIterations=totalIterations, \
                                                    cpu=cpu, \
                                                    mut_context=m, \
                                                    resample = resample)

                ####################################################################### add sparsity in the exposureAvg #################################################################

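                # refit the exposures whenever the average silhouette coefficient exceeds -1.0
                # (an earlier version of this comment cited a 0.85 stability threshold, which the condition below does not apply)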
                if avgSilhouetteCoefficients > -1.0:
                    stic = time.time()

                    #removing signatures:
                    # =============================================================================
                    #                     pool = mp.Pool()
                    #                     results = [pool.apply_async(sub.remove_all_single_signatures_pool, args=(x,processAvg,exposureAvg,genomes,)) for x in range(genomes.shape[1])]
                    #                     pooloutput = [p.get() for p in results]
                    #
                    #                     #print(results)
                    #                     pool.close()
                    #
                    #                     for i in range(len(pooloutput)):
                    #                         #print(results[i])
                    #                         exposureAvg[:,i]=pooloutput[i]
                    # =============================================================================

                    #refitting signatures:
                    #removing signatures:
                    pool = mp.Pool()
                    results = [
                        pool.apply_async(ss.fit_signatures_pool,
                                         args=(
                                             genomes,
                                             processAvg,
                                             x,
                                         )) for x in range(genomes.shape[1])
                    ]
                    pooloutput = [p.get() for p in results]
                    pool.close()

                    for i in range(len(pooloutput)):

                        exposureAvg[:, i] = pooloutput[i][0]

                    stoc = time.time()
                    print("Optimization time is {} seconds".format(stoc -
                                                                   stic))

                #report progress to the system file:
                current_time_end = datetime.datetime.now()
                sysdata = open(out_put + "/JOB_METADATA.txt", "a")
                if hierarchi is True:
                    sysdata.write(
                        "\nSignature extraction for {} completed for layer {} {} signatures for {}! TimeStamp: {}\n"
                        .format(mutation_type, H_iteration, processes,
                                current_time_end - current_time_start,
                                current_time_end))
                else:
                    sysdata.write(
                        "\nSignature extraction for {} completed for {} signatures for {}! TimeStamp: {}\n"
                        .format(mutation_type, processes,
                                current_time_end - current_time_start,
                                current_time_end))

                #Get total mutationation for each signature
                signature_total_mutations = np.sum(exposureAvg,
                                                   axis=1).astype(int)

                signature_stats = pd.DataFrame({
                    "Stability":
                    clusterSilhouetteCoefficients,
                    "Total Mutations":
                    signature_total_mutations
                })
                minimum_stabilities.append(
                    round(np.mean(clusterSilhouetteCoefficients), 2)
                )  #here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                # Compute the estimated genome from the processAvg and exposureAvg
                est_genomes = np.dot(processAvg, exposureAvg)

                #check the similarities between the original and estimated genome for each number of signatures

                all_similarities, cosine_similarities = sub.calculate_similarities(
                    genomes, est_genomes, colnames)
                #print(totalMutations)
                ##########################################################################################################################################################################
                # store the resutls of the loop.  Here,  processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS.
                loopResults = [
                    genomes, processAvg, exposureAvg, processStd, exposureStd,
                    avgSilhouetteCoefficients, clusterSilhouetteCoefficients,
                    signature_total_mutations, all_similarities,
                    signature_stats, reconstruction_error, finalgenomeErrors,
                    finalgenomesReconstructed, finalWall, finalHall, processes
                ]
                information.append([
                    processAvg, exposureAvg, processStd, exposureStd,
                    clusterSilhouetteCoefficients, signature_total_mutations,
                    signature_stats, all_similarities
                ])  #Will be used during hierarchical approach

                ################################# Export the results ###########################################################
                sub.export_information(loopResults, m, layer_directory, index,
                                       colnames)

                all_similirities_list.append(all_similarities)
                #
                #similarity_dataframe["Total Signatures "+str(processes)] = cosine_similarities

            ################################################################################################################
            ########################################## Plot Stabiltity vs Reconstruction Error #############################
            ################################################################################################################
            # Print the Stabiltity vs Reconstruction Error as get the solution as well
            solution, all_stats = sub.stabVsRError(
                layer_directory + "/All_solutions_stat.csv", layer_directory,
                title, all_similirities_list, mutation_type)
            all_stats.insert(
                0, 'Stability (Avg Silhouette)', minimum_stabilities
            )  #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability
            all_stats.to_csv(layer_directory + "/All_solutions_stat.csv",
                             sep=",")

            # add more information to results_stat.csv

            #Set index for the  the Similarity Dataframe
            #similarity_dataframe = similarity_dataframe.set_index("Sample Name")

            #Add the total mutations of each sample
            #sample_total_mutations = list(np.sum(genomes, axis =0))

            #similarity_dataframe.insert(loc=0, column = "Total Mutations", value = sample_total_mutations)

            # write the name of Samples and Matrix participating in each Layer.
            layer_genome = pd.DataFrame(genomes)
            layer_genome = layer_genome.set_index(index)
            layer_genome.columns = colnames
            layer_genome = layer_genome.rename_axis("Mutation Types",
                                                    axis="columns")

            ################################### Hierarchical Extraction  #########################
            if hierarchi is True:
                #data_stat_folder = layer_directory+"/Data_Stats"
                # =============================================================================
                #                 try:
                #                     if not os.path.exists(data_stat_folder):
                #                         os.makedirs(data_stat_folder)
                #                 except:
                #                         print ("The {} folder could not be created".format("Data_Stats"))
                # =============================================================================

                layer_genome.to_csv(layer_directory + "/Samples_in_Layer_" +
                                    str(H_iteration) + ".text",
                                    sep="\t",
                                    index_label=[layer_genome.columns.name])
                #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t")
                del layer_genome

                # =============================================================================
                #                 for i in range(startProcess,endProcess+1):
                #                     all_similirities_list[i-startProcess].to_csv(data_stat_folder+"/Similatiry_Data_Sig"+str(i)+".text", sep="\t")
                # =============================================================================

                # =============================================================================
                #                 sample_record = open(output+"/Samples_Selected_by_Layers.text", "a")
                #                 sample_record.write("\nSamples participating in Layer"+str(H_iteration)+"\n"+"Total number of samples in this layer is: "+str(len(colnames))+"\n\n" )
                #
                #
                #                 for sn in colnames:
                #                     # sn is the abbreviation of "Sample Name", used as a iterator variable
                #                     sample_record.write(sn+" ,\n" )
                #                 sample_record.write("######################################################################################\n")
                #                 sample_record.write("######################################################################################\n")
                #                 sample_record.write("######################################################################################\n")
                #                 sample_record.write("######################################################################################\n\n\n\n\n")
                #                 sample_record.close()
                # =============================================================================

                if os.path.exists(layer_directory + "/L" + str(H_iteration) +
                                  "_solution"):
                    shutil.rmtree(layer_directory + "/L" + str(H_iteration) +
                                  "_solution")
                # Copy the best solution the "selected solution" folder
                solutionFolderFrom = layer_directory + "/All_solutions/" + mutation_type + "_" + str(
                    solution) + "_Signatures"
                solutionFolderTo = layer_directory + "/L" + str(
                    H_iteration) + "_Solution/" + mutation_type + "_" + str(
                        solution) + "_Signatures"
                shutil.copytree(solutionFolderFrom, solutionFolderTo)

                # load the best processAvg, exposureAvg and processSTE based on the solution
                processAvg = information[solution - startProcess][0]
                exposureAvg = information[solution - startProcess][1]
                processSTE = information[solution - startProcess][2]
                list_of_signature_stabilities = list_of_signature_stabilities + list(
                    information[solution - startProcess][4])
                list_of_signature_total_mutations = list_of_signature_total_mutations + list(
                    information[solution - startProcess][5])
                all_similarities = information[solution - startProcess][7]

                #del information

                # Compute the estimated genome from the processAvg and exposureAvg
                est_genomes = np.dot(processAvg, exposureAvg)

                # make the list of the samples which have similarity lower than the thresh-hold with the estimated ones
                low_similarity_idx = []

                for i in range(genomes.shape[1]):
                    similarity = sub.cos_sim(genomes[:, i], est_genomes[:, i])

                    # The tresh-hold for hierarchy is 0.95 for now
                    if similarity < par_h:
                        low_similarity_idx.append(i)

                if len(low_similarity_idx) == 0:
                    low_similarity_idx = []
                #print(low_similarity_idx)

                # Accumulated the signatures and signaturesSTE for the final results
                listofsignatures.append(processAvg)
                listofsignaturesSTE.append(processSTE)

                genomes = genomes[:, low_similarity_idx]
                colnames = colnames[low_similarity_idx]
                H_iteration = H_iteration + 1

                #########################################################################################################
                # do the necessary operations and put the outputs in the "Final Solution" folder when the while loop ends
                if genomes.shape[1] < 10 or est_genomes.shape[
                        1] == genomes.shape[1]:
                    flag = False  #update the flag for the whileloop

                    # create the folder for the final solution/ De Novo Solution
                    layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
                    try:
                        if not os.path.exists(layer_directory1):
                            os.makedirs(layer_directory1)
                    except:
                        print("The {} folder could not be created".format(
                            "output"))

                    count = 0
                    for p, q in zip(listofsignatures, listofsignaturesSTE):
                        if count == 0:
                            processAvg = p
                            processSTE = q
                        else:
                            processAvg = np.hstack([processAvg, p])
                            processSTE = np.hstack([processSTE, q])
                        count += 1

                    # make the texts for signature plotting

                    signature_stabilities = sub.signature_plotting_text(
                        list_of_signature_stabilities, "Stability", "float")
                    signature_total_mutations = sub.signature_plotting_text(
                        list_of_signature_total_mutations, "Total Mutations",
                        "integer")
                    signature_stats = pd.DataFrame({
                        "Stability":
                        signature_stabilities,
                        "Total Mutations":
                        signature_total_mutations
                    })
                    # make de novo solution(processAvg, allgenomes, layer_directory1)
                    listOfSignatures = sub.make_letter_ids(
                        idlenth=processAvg.shape[1])
                    exposureAvg = sub.make_final_solution(
                        processAvg,
                        allgenomes,
                        listOfSignatures,
                        layer_directory1,
                        m,
                        index,
                        allcolnames,
                        process_std_error=processSTE,
                        signature_stabilities=signature_stabilities,
                        signature_total_mutations=signature_total_mutations,
                        signature_stats=signature_stats,
                        penalty=penalty)

                    try:
                        # create the folder for the final solution/ Decomposed Solution
                        layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
                        try:
                            if not os.path.exists(layer_directory2):
                                os.makedirs(layer_directory2)
                        except:
                            print("The {} folder could not be created".format(
                                "output"))

                        final_signatures = sub.signature_decomposition(
                            processAvg,
                            m,
                            layer_directory2,
                            genome_build=genome_build)
                        # extract the global signatures and new signatures from the final_signatures dictionary
                        globalsigs = final_signatures["globalsigs"]
                        globalsigs = np.array(globalsigs)
                        newsigs = final_signatures["newsigs"]
                        processAvg = np.hstack([globalsigs, newsigs])
                        allsigids = final_signatures[
                            "globalsigids"] + final_signatures["newsigids"]
                        attribution = final_signatures["dictionary"]
                        background_sigs = final_signatures["background_sigs"]

                        exposureAvg = sub.make_final_solution(processAvg, allgenomes, allsigids, layer_directory2, m, index, allcolnames, \
                                                remove_sigs=True, attribution = attribution, denovo_exposureAvg  = exposureAvg ,  background_sigs=background_sigs, penalty=penalty, genome_build=genome_build)
                    except:
                        print(
                            "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
                        )
                        shutil.rmtree(layer_directory2)

                #######################################################################################################
            elif hierarchi is False:
                # =============================================================================
                #                 data_stat_folder = output+"/Data_Stats"
                #                 try:
                #                     if not os.path.exists(data_stat_folder):
                #                         os.makedirs(data_stat_folder)
                #                 except:
                #                         print ("The {} folder could not be created".format("Data_Stats"))
                #
                #                 layer_genome.to_csv(data_stat_folder+"/Samples.text", sep = "\t", index_label=[layer_genome.columns.name])
                #                 similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs.text", sep = "\t")
                #                 del layer_genome
                #                 for i in range(startProcess,endProcess+1):
                #                     all_similirities_list[i-startProcess].to_csv(data_stat_folder+"/Similatiry_Data_Sig_"+str(i)+".text", sep="\t")
                # =============================================================================
                # record the samples
                layer_genome.to_csv(output + "/Samples.txt",
                                    sep="\t",
                                    index_label=[layer_genome.columns.name])
                #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t")
                del layer_genome
                ################################### Decompose the new signatures into global signatures   #########################
                processAvg = information[solution - startProcess][0]
                processSTE = information[solution - startProcess][2]
                signature_stabilities = information[solution - startProcess][4]
                signature_total_mutations = information[solution -
                                                        startProcess][5]
                signature_stats = information[solution - startProcess][6]
                all_similarities = information[solution - startProcess][7]

                # create the folder for the final solution/ De Novo Solution
                layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
                try:
                    if not os.path.exists(layer_directory1):
                        os.makedirs(layer_directory1)
                except:
                    print(
                        "The {} folder could not be created".format("output"))

                # make the texts for signature plotting
                signature_stabilities = sub.signature_plotting_text(
                    signature_stabilities, "Stability", "float")
                signature_total_mutations = sub.signature_plotting_text(
                    signature_total_mutations, "Total Mutations", "integer")
                # make de novo solution(processAvg, allgenomes, layer_directory1)
                listOfSignatures = sub.make_letter_ids(
                    idlenth=processAvg.shape[1])
                exposureAvg = sub.make_final_solution(processAvg, allgenomes, listOfSignatures, layer_directory1, m, index, \
                               allcolnames, process_std_error = processSTE, signature_stabilities = signature_stabilities, \
                               signature_total_mutations = signature_total_mutations, signature_stats = signature_stats, penalty=penalty)

                try:
                    # create the folder for the final solution/ Decomposed Solution
                    layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
                    try:
                        if not os.path.exists(layer_directory2):
                            os.makedirs(layer_directory2)
                    except:
                        print("The {} folder could not be created".format(
                            "output"))

                    if processAvg.shape[
                            0] == 1536:  #collapse the 1596 context into 96 only for the deocmposition
                        processAvg = pd.DataFrame(processAvg, index=index)
                        processAvg = processAvg.groupby(
                            processAvg.index.str[1:8]).sum()
                        genomes = pd.DataFrame(genomes, index=index)
                        genomes = genomes.groupby(genomes.index.str[1:8]).sum()
                        index = genomes.index
                        processAvg = np.array(processAvg)
                        genomes = np.array(genomes)

                    final_signatures = sub.signature_decomposition(
                        processAvg,
                        m,
                        layer_directory2,
                        genome_build=genome_build)
                    # extract the global signatures and new signatures from the final_signatures dictionary
                    globalsigs = final_signatures["globalsigs"]
                    globalsigs = np.array(globalsigs)
                    newsigs = final_signatures["newsigs"]
                    processAvg = np.hstack([globalsigs, newsigs])
                    allsigids = final_signatures[
                        "globalsigids"] + final_signatures["newsigids"]
                    attribution = final_signatures["dictionary"]
                    background_sigs = final_signatures["background_sigs"]




                    exposureAvg = sub.make_final_solution(processAvg, genomes, allsigids, layer_directory2, m, index, colnames, \
                                            remove_sigs=True, attribution = attribution, denovo_exposureAvg  = exposureAvg , background_sigs=background_sigs, penalty=penalty, genome_build=genome_build)

                except:
                    print(
                        "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
                    )
                    shutil.rmtree(layer_directory2)

                break
    sysdata = open(out_put + "/JOB_METADATA.txt", "a")
    toc = datetime.datetime.now()
    sysdata.write("\nDate and Clock time when the execution ended: " +
                  str(toc) + "\n")

    sysdata.write("-------Job Status------- \n")
    sysdata.write(
        "Analysis of mutational signatures completed successfully! Total execution time: "
        + str(toc - tic) + ". Results can be found in: [" + out_put +
        "] folder")
    sysdata.close()

    print(
        "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n "
    )

Exemple #14

0

Afficher le fichier

Fichier : metaMutatationalSignatures.py Projet : EESI/MetaMutationalSigs

        # Convert the three tool flags into the strings "TRUE"/"FALSE" that
        # are later passed as command-line arguments. Note the mapping is
        # inverted: a False flag becomes "TRUE" and a True flag becomes
        # "FALSE". Once the first `if` rebinds the name to a string, the second
        # comparison against True no longer matches, so a value is never
        # flipped twice.
        # NOTE(review): the inversion only makes sense if these flags mean
        # "skip this tool" -- confirm against where they are parsed.
        if runsigflow == False:
            runsigflow = "TRUE"
        if runsigflow == True:
            runsigflow = "FALSE"

        if runsigfit == False:
            runsigfit = "TRUE"
        if runsigfit == True:
            runsigfit = "FALSE"

        if runDeconstructSigs == False:
            runDeconstructSigs = "TRUE"
        if runDeconstructSigs == True:
            runDeconstructSigs = "FALSE"

        # Run the matrix generator on input_dir under the project name
        # "MetaMutationalSigs" for the given reference genome.
        # Presumably this creates the input/, logs/ and output/ sub-folders
        # that are deleted further down -- TODO confirm.
        matGen.SigProfilerMatrixGeneratorFunc("MetaMutationalSigs", genome_ref,
                                              input_dir)
        # Launch the R analysis script, then the Python plotting script, each
        # with the flag strings as positional arguments. The return codes of
        # both calls are discarded, so a failing script is not detected here.
        # runMutationalPatterns is passed through as-is; its conversion is not
        # visible in this excerpt.
        subprocess.call([
            'Rscript', "meta_sig_main_flask.r", input_dir, genome_ref,
            runMutationalPatterns, runsigflow, runsigfit, runDeconstructSigs
        ])
        subprocess.call([
            'python3.8', "errors_pie_heatmap.py", input_dir,
            runMutationalPatterns, runsigflow, runsigfit, runDeconstructSigs
        ])

        # Remove three sub-folders of input_dir. shutil.rmtree is called with
        # its default error handling, so a missing folder raises.
        shutil.rmtree(input_dir + "/input")
        shutil.rmtree(input_dir + "/logs")
        shutil.rmtree(input_dir + "/output")

        # List what remains in input_dir after the cleanup.
        files_in_directory = os.listdir(input_dir)

Exemple #15

0

Afficher le fichier

Fichier : sigpro.py Projet : edawson/SigProfilerExtractor

def sigProfilerExtractor(input_type,
                         out_put,
                         input_data,
                         refgen="GRCh37",
                         genome_build='GRCh37',
                         startProcess=1,
                         endProcess=10,
                         totalIterations=100,
                         init="alexandrov-lab-custom",
                         cpu=-1,
                         mtype="default",
                         exome=False,
                         penalty=0.05,
                         resample=True,
                         wall=False,
                         gpu=False):
    memory_usage()
    """
    Extracts mutational signatures from an array of samples.
    
    
    Parameters
    ----------
    
    input_type: A string. Type of input. The type of input should be one of the following:
            - "vcf": used for vcf format inputs.
            - "matrix": used for table format inputs using a tab-separated file.
             
        
    out_put: A string. The name of the output folder. The output folder will be generated in the current working directory. 
            
    input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "table"  type input). The project file or folder should be inside the current working directory. For the "vcf" type input,the project has to be a folder which will contain the vcf files in vcf format or text formats. The "text"type projects have to be a file.   
            
    refgen: A string, optional. The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf".
            
    startProcess: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1 
    
    endProcess: A positive integer, optional. The maximum number of signatures to be extracted. The default value is 10
    
    totalIterations: A positive integer, optional. The number of iteration to be performed to extract each number signature. The default value is 100
    
    init: A String. The initialization algorithm for W and H matrix of NMF
    
    wall: A Boolean. If true, the Ws and Hs from all the NMF iterations are generated in the output. 
            
    cpu: An integer, optional. The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. 
    
    mtype: A list of strings, optional. The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is ["96", "DINUC" , "ID"], where "96" is the SBS96 context, "DINUC"
    is the DINULEOTIDE context and ID is INDEL context. 
            
    exome: Boolean, optional. Defines if the exomes will be extracted. The default value is "False".
    
    penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the threshold cutoff used to assign signatures to a sample.
    
    resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling.  
    
    
    Returns
    -------
    To learn about the output, please visit https://osf.io/t6j7u/wiki/home/
    
    
    Examples
    --------
    
    >>> from SigProfilerExtractor import sigpro as sig
    >>> data = sig.importdata("vcf")
    >>> sig.sigProfilerExtractor("vcf", "example_output", data, startProcess=1, endProcess=3)
    
    Wait until the execution is finished. The process may take a couple of hours, depending on the size of the data.
    Check the results in the "example_output" folder.
    """

    if gpu == True:
        import torch

        if gpu and (torch.cuda.device_count() == 0):
            raise RuntimeError("GPU not available!")

    #################################### At first create the system data file ####################################
    if not os.path.exists(out_put):
        os.makedirs(out_put)
    sysdata = open(out_put + "/JOB_METADATA.txt", "w")
    sysdata.write(
        "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
    sysdata.write("-------System Info-------\n")
    sysdata.write("Operating System Name: " + platform.uname()[0] + "\n" +
                  "Nodename: " + platform.uname()[1] + "\n" + "Release: " +
                  platform.uname()[2] + "\n" + "Version: " +
                  platform.uname()[3] + "\n")
    sysdata.write("\n-------Python and Package Versions------- \n")
    sysdata.write("Python Version: " + str(platform.sys.version_info.major) +
                  "." + str(platform.sys.version_info.minor) + "." +
                  str(platform.sys.version_info.micro) + "\n")
    sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n")
    sysdata.write("SigprofilerPlotting Version: " +
                  sigProfilerPlotting.__version__ + "\n")
    sysdata.write("SigprofilerMatrixGenerator Version: " +
                  SigProfilerMatrixGenerator.__version__ + "\n")
    sysdata.write("Pandas version: " + pd.__version__ + "\n")
    sysdata.write("Numpy version: " + np.__version__ + "\n")
    sysdata.write("Scipy version: " + scipy.__version__ + "\n")
    sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n")
    #sysdata.write("Nimfa version: "+nimfa.__version__+"\n")

    sysdata.write("\n-------Vital Parameters Used for the execution -------\n")
    #format the project_name first:
    project = input_data  #will use this variable as the parameter for project argument in SigprofilerMatrixGenerator
    try:
        if project[-1] != "/":
            project_name = project.split(
                "/"
            )[-1]  #will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator
        else:
            project_name = project.split("/")[-2]
    except:
        project_name = "Input from DataFrame"
    sysdata.write(
        "input_type: {}\ninputdata: {}\nstartProcess: {}\nendProcess: {}\ntotalIterations: {}\ncpu: {}\nrefgen: {}\ngenome_build: {}\nmtype: {} \ninit: {}\n"
        .format(input_type, project_name, startProcess, endProcess,
                totalIterations, cpu, refgen, genome_build, mtype, init))

    sysdata.write("\n-------Date and Time Data------- \n")
    tic = datetime.datetime.now()
    sysdata.write("Date and Clock time when the execution started: " +
                  str(tic) + "\n")
    sysdata.close()

    ################################ take the inputs from the mandatory arguments ####################################
    input_type = input_type
    out_put = out_put
    #project = input_data   #the variable was already set above

    ################################ take the inputs from the general optional arguments ####################################
    startProcess = startProcess
    endProcess = endProcess

    totalIterations = totalIterations
    cpu = cpu
    hierarchi = False  #No use

    if input_type == "text" or input_type == "table" or input_type == "matrix":

        ################################### For text input files ######################################################

        text_file = project
        title = ""  # set the title for plotting

        if type(text_file) != str:
            data = text_file
        else:
            data = pd.read_csv(text_file, sep="\t").iloc[:, :]

        data = data.dropna(axis=1, inplace=False)
        data = data.loc[:, (data != 0).any(axis=0)]
        genomes = data.iloc[:, 1:]
        genomes = np.array(genomes)

        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index = data.iloc[:, 0]
        colnames = data.columns[1:]
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DBS78"]
        elif mtypes[0] == "83":
            mtypes = ["ID83"]
        else:
            mtypes = ["SBS" + mtypes[0]]

    ###############################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "csv":
        ################################# For matlab input files #######################################################

        filename = project
        title = ""  # set the title for plotting

        genomes, index, colnames, mtypes = sub.read_csv(filename)
        allgenomes = genomes.copy()
        allcolnames = colnames.copy()

        # Define the mtypes
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    #################################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "matobj":
        ################################# For matlab input files #######################################################

        mat_file = project
        title = ""  # set the title for plotting

        mat = scipy.io.loadmat(mat_file)
        mat = sub.extract_input(mat)
        genomes = mat[1]
        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index1 = mat[3]
        index2 = mat[4]
        index = []
        for i, j in zip(index1, index2):
            index.append(i[0] + "[" + j + "]" + i[2])
        colnames = np.array(pd.Series(mat[2]))
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results
        index = np.array(pd.Series(index))

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

        #################################################################################################################

    elif input_type == "vcf":
        ################################# For vcf input files #######################################################

        project = project
        title = project  # set the title for plotting

        refgen = refgen

        exome = exome

        #project_name = project.split("/")[-1]
        data = datadump.SigProfilerMatrixGeneratorFunc(project_name,
                                                       refgen,
                                                       project,
                                                       exome=exome,
                                                       bed_file=None,
                                                       chrom_based=False,
                                                       plot=False,
                                                       gs=False)

        # Selecting the mutation types
        if mtype == ["default"]:
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["SBS96", "DBS78", "ID83"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["SBS96", "DBS78"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID83"]

        elif mtype == "default":
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["SBS96", "DBS78", "ID83"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["SBS96", "DBS78"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID83"]

        else:
            #mkeys = data.keys()
            mtype = mtype.upper()
            mtype = mtype.replace(" ", "")
            mtypes = mtype.split(",")
# =============================================================================
#             if any(x not in mkeys for x in mtypes):
#                  raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
#                                  "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")
# =============================================================================

#change working directory

#set the genome_build
        genome_build = refgen

    else:
        raise ValueError(
            "Please provide a correct input_type. Check help for more details")

    ###########################################################################################################################################################################################

    for m in mtypes:

        mutation_context = m

        # we need to rename the m because users input could be SBS96, SBS1536, DBS78, ID83 etc
        if m.startswith("SBS"):
            m = m[3:]  #removing "SBS"
        elif m.startswith("DBS"):
            m = "DINUC"
        elif m.startswith("ID"):
            m = "ID"

        # Determine the types of mutation which will be needed for exporting and copying the files
        if not (m == "DINUC" or m.startswith("DBS") or m.startswith("ID")):

            if m.startswith("SBS"):
                mutation_type = m
            else:
                mutation_type = "SBS" + m

        else:
            if m == "DINUC" or m.startswith("DBS"):
                mutation_type = "DBS78"
            elif m == "ID" or m.stratswith("ID"):
                mutation_type = "ID83"

        if input_type == "vcf":

            try:

                genomes = pd.DataFrame(data[m])
            except:
                raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
                                 "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")

            #check if the genome is a nonzero matrix
            shape = genomes.shape
            if shape == (0, 0):
                sysdata = open(out_put + "/JOB_METADATA.txt", "a")
                sysdata.write(
                    "Sample is not a nonzero matrix for the mutation context "
                    + m + "\n")
                print(
                    "Sample is not a nozero matrix for the mutation context " +
                    m)
                sysdata.close()
                continue

            genomes = genomes.loc[:, (genomes != 0).any(axis=0)]

            allgenomes = genomes.copy(
            )  # save the allgenomes for the final results
            index = genomes.index.values
            colnames = genomes.columns
            allcolnames = colnames.copy(
            )  # save the allcolnames for the final results

        #check if start and end processes are bigger than the number of samples
        startProcess = min(startProcess, genomes.shape[1])
        endProcess = min(endProcess, genomes.shape[1])

        #in the plotting funciton "ID" is used as "INDEL"
        if m == "ID":
            m = "INDEL"  #for plotting

        #create output directories to store all the results
        output = out_put + "/" + mutation_type

        est_genomes = np.zeros([1, 1])
        H_iteration = 1
        genomes = np.array(genomes)
        information = []
        layer_directory = output
        try:
            if not os.path.exists(layer_directory):
                os.makedirs(layer_directory)
                #os.makedirs(output+"/pickle_objects")
                #os.makedirs(output+"/All solutions")
        except:
            print("The {} folder could not be created".format("output"))

        fh = open(layer_directory + "/All_solutions_stat.csv", "w")
        fh.write("Total Signatures,Stability,Matrix Frobenius%,avgStability\n")
        fh.close()
        # The following for loop operates to extract data from each number of signature

        all_similirities_list = [
        ]  #this list is going to store the dataframes of different similirieties as items
        minimum_stabilities = []
        #similarity_dataframe = pd.DataFrame({"Sample Name": list(colnames)})

        # set up the seeds generation same matrices for different number of signatures
        seeds = np.random.randint(
            0, 10000000, size=totalIterations
        )  # set the seeds ranging from 0 to 10000000 for resampling and same seeds are used in different number of signatures

        # get the cutoff for normatization to handle the hypermutators

        normalization_cutoff = sub.get_normalization_cutoff(genomes)
        #print("Normalization Cutoff is :", normalization_cutoff)

        #genomes = sub.normalize_samples(genomes, normalize=False, all_samples=False, number=30000)

        for i in range(startProcess, endProcess + 1):
            current_time_start = datetime.datetime.now()

            #memory_usage()
            processAvg, \
            exposureAvg, \
            processStd, \
            exposureStd, \
            avgSilhouetteCoefficients, \
            clusterSilhouetteCoefficients, \
            finalgenomeErrors, \
            finalgenomesReconstructed, \
            finalWall, \
            finalHall, \
            converge_information, \
            reconstruction_error, \
            processes = sub.decipher_signatures(genomes= genomes, \
                                                i = i, \
                                                totalIterations=totalIterations, \
                                                cpu=cpu, \
                                                mut_context=m, \
                                                resample = resample,
                                                seeds=seeds,
                                                init = init,
                                                normalization_cutoff=normalization_cutoff,
                                                gpu=gpu,)

            #denormalize the genomes and exposures
            #genomes = sub.denormalize_samples(genomes, totalMutations, normalization_value=100000)
            #exposureStd = sub.denormalize_samples(exposureStd, totalMutations, normalization_value=100000)
            ####################################################################### add sparsity in the exposureAvg #################################################################

            # remove signatures only if the process stability is above a thresh-hold of 0.85
            if avgSilhouetteCoefficients > -1.0:
                stic = time.time()

                #removing signatures:
                # =============================================================================
                #                     pool = mp.Pool()
                #                     results = [pool.apply_async(sub.remove_all_single_signatures_pool, args=(x,processAvg,exposureAvg,genomes,)) for x in range(genomes.shape[1])]
                #                     pooloutput = [p.get() for p in results]
                #
                #                     #print(results)
                #                     pool.close()
                #
                #                     for i in range(len(pooloutput)):
                #                         #print(results[i])
                #                         exposureAvg[:,i]=pooloutput[i]
                # =============================================================================

                #refitting signatures:
                #removing signatures:
                pool = mp.Pool()
                results = [
                    pool.apply_async(ss.fit_signatures_pool,
                                     args=(
                                         genomes,
                                         processAvg,
                                         x,
                                     )) for x in range(genomes.shape[1])
                ]
                pooloutput = [p.get() for p in results]
                pool.close()

                for i in range(len(pooloutput)):

                    exposureAvg[:, i] = pooloutput[i][0]

                stoc = time.time()
                print("Optimization time is {} seconds".format(stoc - stic))

            #report progress to the system file:
            current_time_end = datetime.datetime.now()
            sysdata = open(out_put + "/JOB_METADATA.txt", "a")
            if hierarchi is True:
                sysdata.write(
                    "\nSignature extraction for {} completed for layer {} {} signatures for {}! TimeStamp: {}\n"
                    .format(mutation_type, H_iteration, processes,
                            current_time_end - current_time_start,
                            current_time_end))
            else:
                sysdata.write(
                    "\nSignature extraction for {} completed for {} signatures for {}! TimeStamp: {}\n"
                    .format(mutation_type, processes,
                            current_time_end - current_time_start,
                            current_time_end))

            #Get total mutationation for each signature in reverse order and order the signatures from high to low mutation barden
            signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int)
            sorted_idx = np.argsort(-signature_total_mutations)
            processAvg = np.take(processAvg, sorted_idx, axis=1)
            exposureAvg = np.take(exposureAvg, sorted_idx, axis=0)
            signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int)

            signature_stats = pd.DataFrame({
                "Stability":
                clusterSilhouetteCoefficients,
                "Total Mutations":
                signature_total_mutations
            })
            minimum_stabilities.append(
                round(np.mean(clusterSilhouetteCoefficients), 2)
            )  #here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            # Compute the estimated genome from the processAvg and exposureAvg
            est_genomes = np.dot(processAvg, exposureAvg)

            #check the similarities between the original and estimated genome for each number of signatures

            all_similarities, cosine_similarities = sub.calculate_similarities(
                genomes, est_genomes, colnames)
            #print(totalMutations)
            ##########################################################################################################################################################################
            # store the resutls of the loop.  Here,  processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS.
            loopResults = [
                genomes, processAvg, exposureAvg, processStd, exposureStd,
                avgSilhouetteCoefficients, clusterSilhouetteCoefficients,
                signature_total_mutations, all_similarities, signature_stats,
                reconstruction_error, finalgenomeErrors,
                finalgenomesReconstructed, converge_information, finalWall,
                finalHall, processes
            ]
            information.append([
                processAvg, exposureAvg, processStd, exposureStd,
                clusterSilhouetteCoefficients, signature_total_mutations,
                signature_stats, all_similarities
            ])  #Will be used during hierarchical approach

            ################################# Export the results ###########################################################
            sub.export_information(loopResults,
                                   m,
                                   layer_directory,
                                   index,
                                   colnames,
                                   wall=wall)

            all_similirities_list.append(all_similarities)
            #
            #similarity_dataframe["Total Signatures "+str(processes)] = cosine_similarities

        ################################################################################################################
        ########################################## Plot Stabiltity vs Reconstruction Error #############################
        ################################################################################################################
        # Print the Stabiltity vs Reconstruction Error as get the solution as well
        solution, all_stats = sub.stabVsRError(
            layer_directory + "/All_solutions_stat.csv", layer_directory,
            title, all_similirities_list, mutation_type)
        all_stats.insert(
            0, 'Stability (Avg Silhouette)', minimum_stabilities
        )  #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability
        all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",")

        # add more information to results_stat.csv

        #Set index for the  the Similarity Dataframe
        #similarity_dataframe = similarity_dataframe.set_index("Sample Name")

        #Add the total mutations of each sample
        #sample_total_mutations = list(np.sum(genomes, axis =0))

        #similarity_dataframe.insert(loc=0, column = "Total Mutations", value = sample_total_mutations)

        # write the name of Samples and Matrix participating in each Layer.
        layer_genome = pd.DataFrame(genomes)
        layer_genome = layer_genome.set_index(index)
        layer_genome.columns = colnames
        layer_genome = layer_genome.rename_axis("Mutation Types",
                                                axis="columns")

        # =============================================================================
        #                 data_stat_folder = output+"/Data_Stats"
        #                 try:
        #                     if not os.path.exists(data_stat_folder):
        #                         os.makedirs(data_stat_folder)
        #                 except:
        #                         print ("The {} folder could not be created".format("Data_Stats"))
        #
        #                 layer_genome.to_csv(data_stat_folder+"/Samples.text", sep = "\t", index_label=[layer_genome.columns.name])
        #                 similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs.text", sep = "\t")
        #                 del layer_genome
        #                 for i in range(startProcess,endProcess+1):
        #                     all_similirities_list[i-startProcess].to_csv(data_stat_folder+"/Similatiry_Data_Sig_"+str(i)+".text", sep="\t")
        # =============================================================================
        # record the samples
        layer_genome.to_csv(output + "/Samples.txt",
                            sep="\t",
                            index_label=[layer_genome.columns.name])
        #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t")
        del layer_genome
        ################################### Decompose the new signatures into global signatures   #########################
        processAvg = information[solution - startProcess][0]
        processSTE = information[solution - startProcess][2]
        signature_stabilities = information[solution - startProcess][4]
        signature_total_mutations = information[solution - startProcess][5]
        signature_stats = information[solution - startProcess][6]
        all_similarities = information[solution - startProcess][7]

        # create the folder for the final solution/ De Novo Solution
        layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
        try:
            if not os.path.exists(layer_directory1):
                os.makedirs(layer_directory1)
        except:
            print("The {} folder could not be created".format("output"))

        # make the texts for signature plotting
        signature_stabilities = sub.signature_plotting_text(
            signature_stabilities, "Stability", "float")
        signature_total_mutations = sub.signature_plotting_text(
            signature_total_mutations, "Total Mutations", "integer")
        # make de novo solution(processAvg, allgenomes, layer_directory1)

        listOfSignatures = sub.make_letter_ids(idlenth=processAvg.shape[1],
                                               mtype=mutation_context)
        allgenomes = pd.DataFrame(allgenomes)





        exposureAvg = sub.make_final_solution(processAvg, allgenomes, listOfSignatures, layer_directory1, m, index, \
                       allcolnames, process_std_error = processSTE, signature_stabilities = signature_stabilities, \
                       signature_total_mutations = signature_total_mutations, signature_stats = signature_stats, penalty=penalty)

        try:
            # create the folder for the final solution/ Decomposed Solution

            layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
            try:
                if not os.path.exists(layer_directory2):
                    os.makedirs(layer_directory2)
            except:
                print("The {} folder could not be created".format("output"))

            if processAvg.shape[
                    0] == 1536:  #collapse the 1596 context into 96 only for the deocmposition
                processAvg = pd.DataFrame(processAvg, index=index)
                processAvg = processAvg.groupby(
                    processAvg.index.str[1:8]).sum()
                genomes = pd.DataFrame(genomes, index=index)
                genomes = genomes.groupby(genomes.index.str[1:8]).sum()
                index = genomes.index
                processAvg = np.array(processAvg)
                genomes = np.array(genomes)

            final_signatures = sub.signature_decomposition(
                processAvg,
                m,
                layer_directory2,
                genome_build=genome_build,
                mutation_context=mutation_context)

            # extract the global signatures and new signatures from the final_signatures dictionary
            globalsigs = final_signatures["globalsigs"]
            globalsigs = np.array(globalsigs)
            newsigs = final_signatures["newsigs"]
            processAvg = np.hstack([globalsigs, newsigs])
            allsigids = final_signatures["globalsigids"] + final_signatures[
                "newsigids"]
            attribution = final_signatures["dictionary"]
            background_sigs = final_signatures["background_sigs"]
            genomes = pd.DataFrame(genomes)

            #print(exposureAvg)

            exposureAvg = sub.make_final_solution(processAvg, genomes, allsigids, layer_directory2, m, index, colnames, \
                                    remove_sigs=True, attribution = attribution, denovo_exposureAvg  = exposureAvg , background_sigs=background_sigs, penalty=penalty, genome_build=genome_build)

        except:
            print(
                "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
            )
            shutil.rmtree(layer_directory2)

    sysdata = open(out_put + "/JOB_METADATA.txt", "a")
    toc = datetime.datetime.now()
    sysdata.write("\nDate and Clock time when the execution ended: " +
                  str(toc) + "\n")

    sysdata.write("-------Job Status------- \n")
    sysdata.write(
        "Analysis of mutational signatures completed successfully! Total execution time: "
        + str(toc - tic) + ". Results can be found in: [" + out_put +
        "] folder")
    sysdata.close()

    print(
        "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n "
    )

Exemple #16

0

Afficher le fichier

Fichier : sigpro.py Projet : AFEvangelista/SigProfilerExtractor

def sigProfilerExtractor(input_type,
                         output,
                         input_data,
                         reference_genome="GRCh37",
                         opportunity_genome="GRCh37",
                         context_type="default",
                         exome=False,
                         minimum_signatures=1,
                         maximum_signatures=25,
                         nmf_replicates=100,
                         resample=True,
                         batch_size=1,
                         cpu=-1,
                         gpu=False,
                         nmf_init="nndsvd_min",
                         precision="single",
                         matrix_normalization="gmm",
                         seeds="random",
                         min_nmf_iterations=10000,
                         max_nmf_iterations=1000000,
                         nmf_test_conv=10000,
                         nmf_tolerance=1e-15,
                         nnls_add_penalty=0.05,
                         nnls_remove_penalty=0.01,
                         de_novo_fit_penalty=0.02,
                         initial_remove_penalty=0.05,
                         refit_denovo_signatures=True,
                         clustering_distance="cosine",
                         export_probabilities=True,
                         make_decomposition_plots=True,
                         stability=0.8,
                         min_stability=0.2,
                         combined_stability=1.0,
                         get_all_signature_matrices=False):
    memory_usage()
    """
    Extracts mutational signatures from an array of samples.
    
    
    Parameters
    ----------
    
    INPUT DATA:-
    
    input_type: A string. Type of input. The type of input should be one of the following:
            - "vcf": used for vcf format inputs.
            - "matrix": used for table format inputs using a tab seperated file.
             
        
    output: A string. The name of the output folder. The output folder will be generated in the current working directory. 
            
    input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "matrix" type input). The project file or folder should be inside the current working directory. For the "vcf" type input, the project has to be a folder which contains the vcf files in vcf format or text formats. For the "matrix" type input, the project has to be a file.
            
    reference_genome: A string, optional. The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf".
       
    opportunity_genome: The build or version of the reference signatures for the reference genome. The default opportunity genome is GRCh37. If the input_type is "vcf", the genome_build automatically matches the input reference genome value.    
     
    context_type: A list of strings, optional. The items in the list define the mutational contexts to be considered to extract the signatures. The default value is "default", which stands for "SBS96,DBS78,ID83".
    
    exome: Boolean, optional. Defines if the exomes will be extracted. The default value is "False".
    
    
    NMF RUNS:-
    
    minimum_signatures: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1.
    
    maximum_signatures: A positive integer, optional. The maximum number of signatures to be extracted. The default value is 25.
    
    nmf_replicates: A positive integer, optional. The number of iteration to be performed to extract each number signature. The default value is 100
    
    resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling.  
    
    seeds: A string. Default is "random". If "random", the seeds for resampling are generated randomly for each analysis.
                  Otherwise, the seeds are read from the given path of a tab-separated .txt file that contains a list of seeds.
    
    NMF ENGINES:-
    
    matrix_normalization: A string. Method of normalizing the genome matrix before it is analyzed by NMF. Default is "gmm". Other options are "log2", "100X" or "no_normalization".
    
    nmf_init: A String. The initialization algorithm for W and H matrix of NMF. Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min'
              Default is 'nndsvd_min'.
    
    precision: A string. Values should be single or double. Default is single.
    
    min_nmf_iterations: An integer. Value defines the minimum number of iterations to be completed before NMF converges. Default is 10000.
    
    max_nmf_iterations: An integer. Value defines the maximum number of iterations to be completed before NMF converges. Default is 1000000.
    
    nmf_test_conv: An integer. Value defines the number of iterations to be done between successive convergence checks. Default is 10000.
            
    nmf_tolerance: A float. Value defines the tolerance to achieve to converge. Default is 1e-15.
    
    
    EXECUTION:-
    
    cpu: An integer, optional. The number of processors to be used to extract the signatures. The default value is -1 which will use all available        processors. 
    
    gpu:Boolean, optional. Defines if the GPU resource will used if available. Default is False. If True, the GPU resource 
        will be used in the computation.

    batch_size: An integer. Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed
              by each CPU during the parallel processing. Default is 1.
              
    
    SOLUTION ESTIMATION THRESH-HOLDS:-

    stability: A float. Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. 

    min_stability: A float. Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. 

    combined_stability: A float. Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered.            
    
    
    DECOMPOSITION:-
    
    de_novo_fit_penalty: Float, optional. Takes any positive float. Default is 0.02. Defines the weak (remove) thresh-hold cutoff to be assigned denovo signatures to a sample. 
    
    nnls_add_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the strong (add) thresh-hold cutoff to be assigned COSMIC signatures to a sample. 
    
    nnls_remove_penalty: Float, optional. Takes any positive float. Default is 0.01. Defines the weak (remove) thresh-hold cutoff to be assigned COSMIC signatures to a sample.
     
    initial_remove_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the initial weak (remove) thresh-hold cutoff to be COSMIC assigned signatures to a sample.
    
    refit_denovo_signatures: Boolean, optional. Default is True. If True, then refit the denovo signatures with nnls.
    
    make_decomposition_plots: Boolean, optional. Default is True. If True, Denovo to Cosmic signatures decomposition plots will be created as a part of the results.

    
    OTHERS:-
    
    get_all_signature_matrices: A Boolean. If true, the Ws and Hs from all the NMF iterations are generated in the output.
    
    export_probabilities: A Boolean. Default is True. If False, then doesn't create the probability matrix.
    

    
    Returns
    -------
    To learn about the output, please visit https://osf.io/t6j7u/wiki/home/
    
    
    Examples
    --------

    >>> from SigProfilerExtractor import sigpro as sig
    
    # to get input from vcf files
    >>> path_to_example_folder_containing_vcf_files = sig.importdata("vcf")
    >>> data = path_to_example_folder_containing_vcf_files # you can put the path to your folder containing the vcf samples
    >>> sig.sigProfilerExtractor("vcf", "example_output", data, minimum_signatures=1, maximum_signatures=3)
    
    Wait until the execution is finished. The process may take a couple of hours based on the size of the data.
    Check the current working directory for the "example_output" folder.
    
    # to get input from table format (mutation catalog matrix)
    >>> path_to_example_table = sig.importdata("matrix")
    >>> data = path_to_example_table # you can put the path to your tab delimited file containing the mutational catalog         matrix/table
    >>> sig.sigProfilerExtractor("matrix", "example_output", data, opportunity_genome="GRCh38", minimum_signatures=1, maximum_signatures=3)
    
    Wait until the execution is finished. The process may take a couple of hours based on the size of the data.
    Check the results in the "example_output" folder.
    """
    #record the start time
    start_time = datetime.datetime.now()

    #set the output variable
    out_put = output

    if gpu == True:
        import torch

        if gpu and (torch.cuda.device_count() == 0):
            raise RuntimeError("GPU not available!")

    #################################### At first create the system data file ####################################
    if not os.path.exists(out_put):
        os.makedirs(out_put)
    sysdata = open(out_put + "/JOB_METADATA.txt", "w")
    sysdata.write(
        "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
    sysdata.write("-------System Info-------\n")
    sysdata.write("Operating System Name: " + platform.uname()[0] + "\n" +
                  "Nodename: " + platform.uname()[1] + "\n" + "Release: " +
                  platform.uname()[2] + "\n" + "Version: " +
                  platform.uname()[3] + "\n")
    sysdata.write("\n-------Python and Package Versions------- \n")
    sysdata.write("Python Version: " + str(platform.sys.version_info.major) +
                  "." + str(platform.sys.version_info.minor) + "." +
                  str(platform.sys.version_info.micro) + "\n")
    sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n")
    sysdata.write("SigprofilerPlotting Version: " +
                  sigProfilerPlotting.__version__ + "\n")
    sysdata.write("SigprofilerMatrixGenerator Version: " +
                  SigProfilerMatrixGenerator.__version__ + "\n")
    sysdata.write("Pandas version: " + pd.__version__ + "\n")
    sysdata.write("Numpy version: " + np.__version__ + "\n")
    sysdata.write("Scipy version: " + scipy.__version__ + "\n")
    sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n")
    #sysdata.write("Nimfa version: "+nimfa.__version__+"\n")

    #format the project_name first:
    project = input_data  #will use this variable as the parameter for project argument in SigprofilerMatrixGenerator
    try:
        if project[-1] != "/":
            project_name = project.split(
                "/"
            )[-1]  #will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator
        else:
            project_name = project.split("/")[-2]
    except:
        project_name = "Input from DataFrame"

    excecution_parameters = {
        "input_type": input_type,
        "output": output,
        "input_data": input_data,
        "reference_genome": reference_genome,
        "opportunity_genome": opportunity_genome,
        "context_type": context_type,
        "exome": exome,
        "minimum_signatures": minimum_signatures,
        "maximum_signatures": maximum_signatures,
        "NMF_replicates": nmf_replicates,
        "cpu": cpu,
        "gpu": gpu,
        "batch_size": batch_size,
        "NMF_init": nmf_init,
        "precision": precision,
        "matrix_normalization": matrix_normalization,
        "resample": resample,
        "seeds": seeds,
        "min_NMF_iterations": min_nmf_iterations,
        "max_NMF_iterations": max_nmf_iterations,
        "NMF_test_conv": nmf_test_conv,
        "NMF_tolerance": nmf_tolerance,
        "nnls_add_penalty": nnls_add_penalty,
        "nnls_remove_penalty": nnls_remove_penalty,
        "initial_remove_penalty": initial_remove_penalty,
        "de_novo_fit_penalty": de_novo_fit_penalty,
        "refit_denovo_signatures": refit_denovo_signatures,
        "dist": clustering_distance,
        "export_probabilities": export_probabilities,
        "make_decompostion_plots": make_decomposition_plots,
        "stability": stability,
        "min_stability": min_stability,
        "combined_stability": combined_stability,
        "get_all_signature_matrices": get_all_signature_matrices
    }

    ################################ take the inputs from the mandatory arguments ####################################
    input_type = input_type

    #project = input_data   #the variable was already set above

    ################################ take the inputs from the general optional arguments ####################################
    startProcess = minimum_signatures
    endProcess = maximum_signatures

    #totalIterations=nmf_replicates
    cpu = cpu
    hierarchy = False  #No use
    mtype = context_type
    #init=nmf_init
    wall = get_all_signature_matrices
    add_penalty = nnls_add_penalty
    remove_penalty = nnls_remove_penalty
    genome_build = opportunity_genome
    refgen = reference_genome
    refit_denovo_signatures
    #set the squence type ("genome" or "exome") for the tmb plot inside the make_final_solution function
    if exome == False:
        sequence = "genome"
    if exome == True:
        sequence = "exome"

    #setting seeds
    if seeds == "random":
        excecution_parameters["seeds"] = seeds
        replicates = list(range(1, nmf_replicates + 1))
        seed = np.random.randint(0, 10000000, size=nmf_replicates)
        seeds = pd.DataFrame(list(zip(replicates, seed)),
                             columns=["Replicates", "Seeds"])
        seeds = seeds.set_index("Replicates")
        seeds.to_csv(out_put + "/Seeds.txt", sep="\t")
    else:
        try:
            excecution_parameters["seeds"] = seeds
            seeds = pd.read_csv(seeds, sep="\t", index_col=0)
            seeds.to_csv(out_put + "/Seeds.txt", sep="\t")
            seed = np.array(seeds["Seeds"])

        except:
            "Please set valid seeds"

    if input_type == "text" or input_type == "table" or input_type == "matrix":

        ################################### For text input files ######################################################

        text_file = project
        title = ""  # set the title for plotting

        if type(text_file) != str:
            data = text_file
            excecution_parameters["input_data"] = "Matrix[" + str(
                data.shape[0]) + " rows X " + str(data.shape[1]) + " columns]"
        else:
            data = pd.read_csv(text_file, sep="\t").iloc[:, :]

        data = data.dropna(axis=1, inplace=False)
        data = data.loc[:, (data != 0).any(axis=0)]
        genomes = data.iloc[:, 1:]
        genomes = np.array(genomes)

        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index = data.iloc[:, 0]
        colnames = data.columns[1:]
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DBS78"]
        elif mtypes[0] == "83":
            mtypes = ["ID83"]
        else:
            mtypes = ["SBS" + mtypes[0]]

    ###############################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "csv":
        ################################# For matlab input files #######################################################

        filename = project
        title = ""  # set the title for plotting

        genomes, index, colnames, mtypes = sub.read_csv(filename)
        allgenomes = genomes.copy()
        allcolnames = colnames.copy()

        # Define the mtypes
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    #################################################################################################################

    ###########################################################################################################################################################################################
    elif input_type == "matobj":
        ################################# For matlab input files #######################################################

        mat_file = project
        title = ""  # set the title for plotting

        mat = scipy.io.loadmat(mat_file)
        mat = sub.extract_input(mat)
        genomes = mat[1]
        allgenomes = genomes.copy(
        )  # save the allgenomes for the final results

        #Contruct the indeces of the matrix
        #setting index and columns names of processAvg and exposureAvg
        index1 = mat[3]
        index2 = mat[4]
        index = []
        for i, j in zip(index1, index2):
            index.append(i[0] + "[" + j + "]" + i[2])
        colnames = np.array(pd.Series(mat[2]))
        allcolnames = colnames.copy(
        )  # save the allcolnames for the final results
        index = np.array(pd.Series(index))

        #creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

        #################################################################################################################

    elif input_type == "vcf":
        ################################# For vcf input files #######################################################

        project = project
        title = project  # set the title for plotting

        refgen = refgen

        exome = exome

        #project_name = project.split("/")[-1]
        data = datadump.SigProfilerMatrixGeneratorFunc(project_name,
                                                       refgen,
                                                       project,
                                                       exome=exome,
                                                       bed_file=None,
                                                       chrom_based=False,
                                                       plot=False,
                                                       gs=False)

        # Selecting the mutation types
        if mtype == ["default"]:
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["SBS96", "DBS78", "ID83"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["SBS96", "DBS78"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID83"]

        elif mtype == "default":
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["SBS96", "DBS78", "ID83"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["SBS96", "DBS78"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID83"]

        else:
            #mkeys = data.keys()
            mtype = mtype.upper()
            mtype = mtype.replace(" ", "")
            mtypes = mtype.split(",")
# =============================================================================
#             if any(x not in mkeys for x in mtypes):
#                  raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
#                                  "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")
# =============================================================================

#change working directory

#set the genome_build
        genome_build = refgen

    else:
        raise ValueError(
            "Please provide a correct input_type. Check help for more details")

    #recording context types
    excecution_parameters["context_type"] = ",".join(mtypes)

    record_parameters(sysdata, excecution_parameters, start_time)
    sysdata.close()

    ###########################################################################################################################################################################################

    for m in mtypes:

        mutation_context = m

        # we need to rename the m because users input could be SBS96, SBS1536, DBS78, ID83 etc
        if m.startswith("SBS"):
            m = m[3:]  #removing "SBS"
        elif m.startswith("DBS"):
            m = "DINUC"
        elif m.startswith("ID"):
            m = "ID"

        # Determine the types of mutation which will be needed for exporting and copying the files
        if not (m == "DINUC" or m.startswith("DBS") or m.startswith("ID")):

            if m.startswith("SBS"):
                mutation_type = m
            else:
                mutation_type = "SBS" + m

        else:
            if m == "DINUC" or m.startswith("DBS"):
                mutation_type = "DBS78"
            elif m == "ID" or m.stratswith("ID"):
                mutation_type = "ID83"

        if input_type == "vcf":

            try:

                genomes = pd.DataFrame(data[m])
            except:
                raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
                                 "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")

            #check if the genome is a nonzero matrix
            shape = genomes.shape
            if shape == (0, 0):
                sysdata = open(out_put + "/JOB_METADATA.txt", "a")
                sysdata.write(
                    "Sample is not a nonzero matrix for the mutation context "
                    + m + "\n")
                print(
                    "Sample is not a nozero matrix for the mutation context " +
                    m)
                sysdata.close()
                continue

            genomes = genomes.loc[:, (genomes != 0).any(axis=0)]

            allgenomes = genomes.copy(
            )  # save the allgenomes for the final results
            index = genomes.index.values
            colnames = genomes.columns
            allcolnames = colnames.copy(
            )  # save the allcolnames for the final results

        #check if start and end processes are bigger than the number of samples
        startProcess = min(startProcess, genomes.shape[1])
        endProcess = min(endProcess, genomes.shape[1])

        #in the plotting funciton "ID" is used as "INDEL"
        if m == "ID":
            m = "INDEL"  #for plotting

        #create output directories to store all the results
        output = out_put + "/" + mutation_type

        est_genomes = np.zeros([1, 1])
        H_iteration = 1
        genomes = np.array(genomes)
        information = []
        layer_directory = output
        try:
            if not os.path.exists(layer_directory):
                os.makedirs(layer_directory)
                #os.makedirs(output+"/pickle_objects")
                #os.makedirs(output+"/All solutions")
        except:
            print("The {} folder could not be created".format("output"))

        fh = open(layer_directory + "/All_solutions_stat.csv", "w")
        fh.write("Total Signatures,Stability,Matrix Frobenius%,avgStability\n")
        fh.close()
        # The following for loop operates to extract data from each number of signature

        all_similirities_list = [
        ]  #this list is going to store the dataframes of different similirieties as items
        minimum_stabilities = []
        #similarity_dataframe = pd.DataFrame({"Sample Name": list(colnames)})

        # get the cutoff for normatization to handle the hypermutators

        normalization_cutoff = sub.get_normalization_cutoff(genomes,
                                                            manual_cutoff=100 *
                                                            genomes.shape[0])
        #print("Normalization Cutoff is :", normalization_cutoff)
        excecution_parameters["normalization_cutoff"] = normalization_cutoff

        #pass the seed values to inner funtions:
        excecution_parameters["seeds"] = seed

        if genomes.shape[1] < endProcess:
            endProcess = genomes.shape[1]

        #report the notmatlization criteria
        sysdata = open(out_put + "/JOB_METADATA.txt", "a")
        context_start_time = datetime.datetime.now()
        sysdata.write("\n##################################\n")
        sysdata.write(
            "\n[{}] Analysis started for {}. Matrix size [{} rows x {} columns]\n"
            .format(
                str(context_start_time).split(".")[0], mutation_type,
                genomes.shape[0], genomes.shape[1]))
        if excecution_parameters["matrix_normalization"] == "gmm":
            sysdata.write("\n[{}] Normalization GMM with cutoff value set at {}\n". \
                          format(str(datetime.datetime.now()).split(".")[0], normalization_cutoff))
        elif excecution_parameters["matrix_normalization"] == "100X":
            sysdata.write("\n[{}] Normalization 100X with cutoff value set at {}\n". \
                          format(str(datetime.datetime.now()).split(".")[0],(genomes.shape[0]*100)))
        elif excecution_parameters["matrix_normalization"] == "log2":
            sysdata.write("\n[{}] Normalization Log2\n". \
                              format(str(datetime.datetime.now()).split(".")[0]))
        elif excecution_parameters["matrix_normalization"] == "none":
            sysdata.write("\n[{}] Analysis is proceeding without normalization\n". \
                          format(str(datetime.datetime.now()).split(".")[0]))
        else:
            sysdata.write("\n[{}] Normalization Custom with cutoff value set at {}\n". \
                              format(str(datetime.datetime.now()).split(".")[0],excecution_parameters["matrix_normalization"]))

        sysdata.close()

        for i in range(startProcess, endProcess + 1):
            current_time_start = datetime.datetime.now()

            #memory_usage()
            processAvg, \
            exposureAvg, \
            processStd, \
            exposureStd, \
            avgSilhouetteCoefficients, \
            clusterSilhouetteCoefficients, \
            finalgenomeErrors, \
            finalgenomesReconstructed, \
            finalWall, \
            finalHall, \
            converge_information, \
            reconstruction_error, \
            processes = sub.decipher_signatures(excecution_parameters,
                                                genomes= genomes,
                                                mut_context=m,
                                                i = i)

            #denormalize the genomes and exposures
            #genomes = sub.denormalize_samples(genomes, totalMutations, normalization_value=100000)
            #exposureStd = sub.denormalize_samples(exposureStd, totalMutations, normalization_value=100000)
            ####################################################################### add sparsity in the exposureAvg #################################################################

            # remove signatures only if the process stability is above a thresh-hold of 0.85
            if avgSilhouetteCoefficients > -1.0:
                stic = time.time()

                #removing signatures:
                # =============================================================================
                #                     pool = mp.Pool()
                #                     results = [pool.apply_async(sub.remove_all_single_signatures_pool, args=(x,processAvg,exposureAvg,genomes,)) for x in range(genomes.shape[1])]
                #                     pooloutput = [p.get() for p in results]
                #
                #                     #print(results)
                #                     pool.close()
                #
                #                     for i in range(len(pooloutput)):
                #                         #print(results[i])
                #                         exposureAvg[:,i]=pooloutput[i]
                # =============================================================================

                #refitting signatures:
                #removing signatures:
                pool = mp.Pool()
                results = [
                    pool.apply_async(ss.fit_signatures_pool,
                                     args=(
                                         genomes,
                                         processAvg,
                                         x,
                                     )) for x in range(genomes.shape[1])
                ]
                pooloutput = [p.get() for p in results]
                pool.close()

                for i in range(len(pooloutput)):

                    exposureAvg[:, i] = pooloutput[i][0]

                stoc = time.time()
                print("Optimization time is {} seconds".format(stoc - stic))
                #sysdata.write("\nAnalysis of context type {} is ended successfully\n".format(m))
            #report progress to the system file:

            #Get total mutationation for each signature in reverse order and order the signatures from high to low mutation barden
            signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int)

            sorted_idx = np.argsort(-signature_total_mutations)
            processAvg = np.take(processAvg, sorted_idx, axis=1)
            exposureAvg = np.take(exposureAvg, sorted_idx, axis=0)
            signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int)
            processStd = np.take(processStd, sorted_idx, axis=1)
            exposureStd = np.take(exposureStd, sorted_idx, axis=0)
            clusterSilhouetteCoefficients = np.take(
                clusterSilhouetteCoefficients, sorted_idx, axis=0)

            signature_stats = pd.DataFrame({
                "Stability":
                clusterSilhouetteCoefficients,
                "Total Mutations":
                signature_total_mutations
            })
            minimum_stabilities.append(
                round(np.mean(clusterSilhouetteCoefficients), 2)
            )  #here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            # Compute the estimated genome from the processAvg and exposureAvg
            est_genomes = np.dot(processAvg, exposureAvg)

            #check the similarities between the original and estimated genome for each number of signatures

            all_similarities, cosine_similarities = sub.calculate_similarities(
                genomes, est_genomes, colnames)
            #print(totalMutations)
            ##########################################################################################################################################################################
            # store the resutls of the loop.  Here,  processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS.
            loopResults = [
                genomes, processAvg, exposureAvg, processStd, exposureStd,
                avgSilhouetteCoefficients, clusterSilhouetteCoefficients,
                signature_total_mutations, all_similarities, signature_stats,
                reconstruction_error, finalgenomeErrors,
                finalgenomesReconstructed, converge_information, finalWall,
                finalHall, processes
            ]
            information.append([
                processAvg, exposureAvg, processStd, exposureStd,
                clusterSilhouetteCoefficients, signature_total_mutations,
                signature_stats, all_similarities
            ])  #Will be used during hierarchycal approach

            ################################# Export the results ###########################################################
            sub.export_information(loopResults,
                                   m,
                                   layer_directory,
                                   index,
                                   colnames,
                                   wall=wall,
                                   sequence=sequence)

            all_similirities_list.append(all_similarities)
            #
            #similarity_dataframe["Total Signatures "+str(processes)] = cosine_similarities

            current_time_end = datetime.datetime.now()

            sysdata = open(out_put + "/JOB_METADATA.txt", "a")
            sysdata.write("\n[{}] {} de novo extraction completed for a total of {} signatures! \nExecution time:{}\n". \
                          format(str(datetime.datetime.now()).split(".")[0],mutation_type,processes,str(current_time_end-current_time_start).split(".")[0], current_time_end))
            sysdata.close()

        ################################################################################################################
        ########################################## Plot Stabiltity vs Reconstruction Error #############################
        ################################################################################################################
        # Print the Stabiltity vs Reconstruction Error as get the solution as well
        solution, all_stats = sub.stabVsRError(
            layer_directory + "/All_solutions_stat.csv",
            layer_directory,
            title,
            all_similirities_list,
            mtype=mutation_type,
            stability=stability,
            min_stability=min_stability,
            combined_stability=combined_stability)
        all_stats.insert(
            1, 'Stability (Avg Silhouette)', minimum_stabilities
        )  #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability
        all_stats = all_stats.set_index(["Signatures"])
        all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",")

        # add more information to results_stat.csv

        #Set index for the  the Similarity Dataframe
        #similarity_dataframe = similarity_dataframe.set_index("Sample Name")

        #Add the total mutations of each sample
        #sample_total_mutations = list(np.sum(genomes, axis =0))

        #similarity_dataframe.insert(loc=0, column = "Total Mutations", value = sample_total_mutations)

        # write the name of Samples and Matrix participating in each Layer.
        layer_genome = pd.DataFrame(genomes)
        layer_genome = layer_genome.set_index(index)
        layer_genome.columns = colnames
        layer_genome = layer_genome.rename_axis("Mutation Types",
                                                axis="columns")

        # =============================================================================
        #                 data_stat_folder = output+"/Data_Stats"
        #                 try:
        #                     if not os.path.exists(data_stat_folder):
        #                         os.makedirs(data_stat_folder)
        #                 except:
        #                         print ("The {} folder could not be created".format("Data_Stats"))
        #
        #                 layer_genome.to_csv(data_stat_folder+"/Samples.text", sep = "\t", index_label=[layer_genome.columns.name])
        #                 similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs.text", sep = "\t")
        #                 del layer_genome
        #                 for i in range(startProcess,endProcess+1):
        #                     all_similirities_list[i-startProcess].to_csv(data_stat_folder+"/Similatiry_Data_Sig_"+str(i)+".text", sep="\t")
        # =============================================================================
        # record the samples
        layer_genome.to_csv(output + "/Samples.txt",
                            sep="\t",
                            index_label=[layer_genome.columns.name])
        #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t")
        del layer_genome
        ################################### Decompose the new signatures into global signatures   #########################
        processAvg = information[solution - startProcess][0]
        exposureAvg = information[solution - startProcess][1]
        processSTE = information[solution - startProcess][2]
        signature_stabilities = information[solution - startProcess][4]
        signature_total_mutations = information[solution - startProcess][5]
        signature_stats = information[solution - startProcess][6]
        all_similarities = information[solution - startProcess][7]

        # create the folder for the final solution/ De Novo Solution
        layer_directory1 = output + "/Suggested_Solution/" + mutation_type + "_De_Novo_Solution"
        try:
            if not os.path.exists(layer_directory1):
                os.makedirs(layer_directory1)
        except:
            print("The {} folder could not be created".format("output"))

        # make the texts for signature plotting

        signature_stabilities = sub.signature_plotting_text(
            signature_stabilities, "Stability", "float")
        signature_total_mutations = sub.signature_plotting_text(
            signature_total_mutations, "Total Mutations", "integer")
        # make de novo solution(processAvg, allgenomes, layer_directory1)

        listOfSignatures = sub.make_letter_ids(idlenth=processAvg.shape[1],
                                               mtype=mutation_context)
        allgenomes = pd.DataFrame(allgenomes)


        exposureAvg = sub.make_final_solution(processAvg, allgenomes, listOfSignatures, layer_directory1, m, index, \
                       allcolnames, process_std_error = processSTE, signature_stabilities = signature_stabilities, \
                       signature_total_mutations = signature_total_mutations,denovo_exposureAvg  = exposureAvg, \
                       signature_stats = signature_stats, add_penalty=add_penalty, remove_penalty=remove_penalty, \
                       initial_remove_penalty=initial_remove_penalty, refit_denovo_signatures=refit_denovo_signatures, de_novo_fit_penalty=de_novo_fit_penalty, sequence=sequence)

        #try:
        # create the folder for the final solution/ Decomposed Solution

        layer_directory2 = output + "/Suggested_Solution/COSMIC_" + mutation_type + "_Decomposed_Solution"
        try:
            if not os.path.exists(layer_directory2):
                os.makedirs(layer_directory2)
        except:
            print("The {} folder could not be created".format("output"))

        originalProcessAvg = pd.DataFrame(processAvg, index=index)

        if processAvg.shape[
                0] == 1536:  #collapse the 1596 context into 96 only for the deocmposition
            processAvg = pd.DataFrame(processAvg, index=index)
            processAvg = processAvg.groupby(processAvg.index.str[1:8]).sum()
            genomes = pd.DataFrame(genomes, index=index)
            genomes = genomes.groupby(genomes.index.str[1:8]).sum()
            index = genomes.index
            processAvg = np.array(processAvg)
            genomes = np.array(genomes)

        if processAvg.shape[
                0] == 288:  #collapse the 288 context into 96 only for the deocmposition
            processAvg = pd.DataFrame(processAvg, index=index)
            processAvg = processAvg.groupby(processAvg.index.str[2:9]).sum()
            genomes = pd.DataFrame(genomes, index=index)
            genomes = genomes.groupby(genomes.index.str[2:9]).sum()
            index = genomes.index
            processAvg = np.array(processAvg)
            genomes = np.array(genomes)

        originalProcessAvg.columns = listOfSignatures
        final_signatures = sub.signature_decomposition(
            processAvg,
            m,
            layer_directory2,
            genome_build=genome_build,
            add_penalty=add_penalty,
            remove_penalty=remove_penalty,
            mutation_context=mutation_context,
            make_decomposition_plots=make_decomposition_plots,
            originalProcessAvg=originalProcessAvg)

        # extract the global signatures and new signatures from the final_signatures dictionary
        globalsigs = final_signatures["globalsigs"]
        globalsigs = np.array(globalsigs)
        newsigs = final_signatures["newsigs"]
        try:
            processAvg = np.hstack([globalsigs, newsigs])
            allsigids = final_signatures["globalsigids"] + final_signatures[
                "newsigids"]
        except:
            processAvg = newsigs
            allsigids = final_signatures["newsigids"]

        attribution = final_signatures["dictionary"]
        background_sigs = final_signatures["background_sigs"]
        genomes = pd.DataFrame(genomes)



        exposureAvg = sub.make_final_solution(processAvg, genomes, allsigids, layer_directory2, m, index, colnames, \
                                cosmic_sigs=True, attribution = attribution, denovo_exposureAvg  = exposureAvg , background_sigs=background_sigs, add_penalty=add_penalty, remove_penalty=remove_penalty, initial_remove_penalty=initial_remove_penalty, genome_build=genome_build, sequence=sequence,export_probabilities=export_probabilities)

    sysdata = open(out_put + "/JOB_METADATA.txt", "a")
    end_time = datetime.datetime.now()
    sysdata.write("\n[{}] Analysis ended: \n".format(
        str(end_time).split(".")[0]))

    sysdata.write("\n-------Job Status------- \n")
    sysdata.write(
        "Analysis of mutational signatures completed successfully! \nTotal execution time: "
        + str(end_time - start_time).split(".")[0] +
        " \nResults can be found in: " + " " + out_put + " " + " folder")
    sysdata.close()

    print(
        "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n "
    )

Exemple #17

0

Afficher le fichier

Fichier : create_input_matrix.py Projet : EESI/MetaMutationalSigs

# Script: build mutational matrices for the uploaded files by calling
# SigProfilerMatrixGenerator once with fixed project/genome settings.
#
# Reference genome installation, left disabled in this script:
# from SigProfilerMatrixGenerator import install as genInstall
# genInstall.install('GRCh37')

from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# The three positional arguments handed to the generator, in call order
# (project name, genome build, input folder -- the same order the keyword
# form project=/genome=/vcfFiles= uses at other call sites).
PROJECT_LABEL = "MetaMutationalSigs"
GENOME_BUILD = 'GRCh37'
UPLOAD_ROOT = "C:\\Users\\pande\\OneDriveDrexelUniversity\\Documents\\Fall-2021\\Coop\\CGC\\SanjeeVCFFiles\\PLOS_review_paper\\metaSignatures\\flaskmultiplefileupload\\uploads"

# NOTE(review): `user_file` is not defined anywhere in this snippet; it has to
# be supplied by surrounding code, otherwise this line raises NameError.
# NOTE(review): no path separator is inserted between the upload folder and
# `user_file` -- presumably `user_file` already starts with one; verify.
matGen.SigProfilerMatrixGeneratorFunc(PROJECT_LABEL, GENOME_BUILD,
                                      UPLOAD_ROOT + user_file)

# Alternative invocation against a different local folder, kept for reference:
# matrices = matGen.SigProfilerMatrixGeneratorFunc("Sigprofiler",'GRCh37' , "C:\\Users\\pande\\OneDrive - Drexel University\\Documents\\Fall-2021\\Coop\\CGC\\SanjeeVCFFiles\kidney_vcf\\indels\\plink")