def main():
    """Build SigProfiler mutational matrices for a folder of VCF files.

    Command-line options:
        -v/--vcf_file_folder: folder containing small variant VCF files.
        -n/--name: string to associate with the files/figures
            (default ``"mut_spec"``).
        -r/--reference: reference genome name (default ``"GRCh37"``).

    When the VCF folder is not given or does not exist, the usage text is
    printed and the process exits with status 1. Otherwise
    ``matGen.SigProfilerMatrixGeneratorFunc`` is called with plotting enabled.
    """
    # Initiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-v", "--vcf_file_folder",
        help="path to folder containing small variant VCF files")
    parser.add_argument(
        "-n", "--name",
        help="string to associate with the files/figures",
        default="mut_spec")
    parser.add_argument(
        "-r", "--reference",
        help="Must be one of GRCh38,GRCh37,mm10,mm9,etc.",
        default="GRCh37")

    # Get at the arguments
    args = parser.parse_args()

    vcf_folder = args.vcf_file_folder
    if vcf_folder is None or not os.path.exists(vcf_folder):
        parser.print_help()
        # A missing or invalid input folder is a usage error: report failure
        # to the calling shell instead of exiting with status 0.
        sys.exit(1)

    # The reference genome is not installed here; it is expected to be set up
    # beforehand (e.g. when the container is built).
    matGen.SigProfilerMatrixGeneratorFunc(
        args.name, args.reference, vcf_folder, plot=True)
def load(mut_class):
    """Run the SigProfiler matrix generator for the project in *mut_class*.

    Reads ``project_name``, ``reference_genome``, ``vcf`` and ``exome`` from
    *mut_class* and forwards them to ``matGen.SigProfilerMatrixGeneratorFunc``.
    The matrices produced by that call are not handed back to the caller;
    this function always returns ``None``.
    """
    # Extract mutational matrices via SigProfiler
    generator_kwargs = dict(
        project=mut_class.project_name,
        genome=mut_class.reference_genome,
        vcfFiles=mut_class.vcf,
        exome=mut_class.exome,
    )
    matGen.SigProfilerMatrixGeneratorFunc(**generator_kwargs)
    return None
def main():
    """Generate mutational matrices from the MAF file given on the command line.

    ``parse_arguments()`` supplies ``maf``, ``project`` and ``ref``. The
    matrix generator is run on the directory that contains the MAF file
    (its output is written there) with ``tsb_stat=True``.

    Returns:
        An empty tuple.
    """
    args = parse_arguments()

    # Output is made in the MAF directory. dirname() is "" when only a bare
    # file name was given, so fall back to the current directory in that case.
    outdir = os.path.dirname(args.maf) or "."

    # Generates matrix from MAF found in outdir
    mg.SigProfilerMatrixGeneratorFunc(args.project, args.ref, outdir,
                                      tsb_stat=True)
    return ()
def _count_failing_columns(data_orig, data_new, is_failure):
    """Count columns whose cosine similarity satisfies ``is_failure``."""
    # Only columns present in both tables are compared. Column 0 is skipped,
    # as in the original loop (presumably the mutation-type labels - confirm).
    n_columns = min(data_orig.shape[1], data_new.shape[1])
    failures = 0
    for i in range(1, n_columns):
        orig_values = data_orig.iloc[:, i]
        new_values = data_new.iloc[:, i]
        similarity = 1 - spatial.distance.cosine(orig_values, new_values)
        if is_failure(similarity):
            failures += 1
    return failures


def benchmark(genome, ref_dir):
    """Run the matrix generator on the bundled benchmark and verify the output.

    The generator is run on ``references/vcf_files/<genome>_bench/`` (relative
    to the directory of this file) and the resulting SBS96 and SBS6144
    matrices are compared, column by column, with the stored reference
    matrices in ``scripts/Benchmark/`` using cosine similarity.

    Args:
        genome: reference genome name; also used to build the benchmark
            project name and file names.
        ref_dir: ignored. The value is overwritten with the directory of this
            file; the parameter is kept for interface compatibility.

    A warning is printed for each comparison that finds mismatching columns.
    The "Installation was succesful" message is printed only when both
    comparisons pass; the elapsed time is printed in either case.
    """
    ref_dir = os.path.dirname(os.path.abspath(__file__))
    vcf_path = ref_dir + "/references/vcf_files/" + genome + "_bench/"

    start_time = time.time()
    matGen.SigProfilerMatrixGeneratorFunc(genome + "_bench", genome, vcf_path)

    original_matrix_96 = ref_dir + "/scripts/Benchmark/" + genome + "_bench_orig_96.txt"
    original_matrix_3072 = ref_dir + "/scripts/Benchmark/" + genome + "_bench_orig_3072.txt"
    new_matrix_96 = vcf_path + "output/SBS/" + genome + "_bench.SBS96.all"
    # NOTE(review): the reference file is named *_3072 while the generated
    # file is SBS6144 - confirm that both tables have the same rows.
    new_matrix_3072 = vcf_path + "output/SBS/" + genome + "_bench.SBS6144.all"

    warning = ("There seems to be some errors in the newly generated matrix. "
               "The installation may not have been successful.")
    passed = True

    ############# Cosine Test ###################################################
    # SBS96 must match the reference. A tiny tolerance absorbs floating-point
    # rounding, which can make identical columns score 0.9999999999999999.
    # NaN (e.g. an all-zero column) still counts as a failure, as before.
    data_orig = pd.read_csv(original_matrix_96, sep='\t', header=0)
    data_new = pd.read_csv(new_matrix_96, sep='\t', header=0)
    if _count_failing_columns(data_orig, data_new,
                              lambda s: not abs(s - 1) <= 1e-9) != 0:
        passed = False
        print(warning)

    # The larger context only has to stay above a similarity of 0.85.
    data_orig = pd.read_csv(original_matrix_3072, sep='\t', header=0)
    data_new = pd.read_csv(new_matrix_3072, sep='\t', header=0)
    if _count_failing_columns(data_orig, data_new,
                              lambda s: s <= 0.85) != 0:
        passed = False
        print(warning)

    # As in the original, the reported time includes the comparisons above.
    end_time = time.time()
    elapsed = str(end_time - start_time)
    if passed:
        print("Installation was succesful.\nSigProfilerMatrixGenerator took " + elapsed + " seconds to complete.")
    else:
        # Do not claim success right after reporting mismatching matrices.
        print("SigProfilerMatrixGenerator took " + elapsed + " seconds to complete.")
def generate():
    """Run the analysis pipeline on ``uploads/`` and yield progress messages.

    This is a generator. Each yielded string has the form ``data:<value>``
    followed by a blank line (this looks like a server-sent-events payload -
    confirm against the route that consumes it). The values are 1, 1 and,
    when ``uploads/*.vcf`` matches at least one file, 34, 67 and 100 after
    the matrix generation, the R script and the plotting/packaging steps.

    When VCF files are present the function:
      1. runs ``matGen.SigProfilerMatrixGeneratorFunc`` on ``uploads``;
      2. calls ``Rscript ../meta_sig_main_flask.r``;
      3. calls ``python3.8 ../plot_graphs.py``;
      4. deletes ``uploads/input``, ``uploads/logs``, ``uploads/output`` and
         every ``*.vcf`` file directly inside ``uploads``;
      5. zips ``./uploads/`` into ``metaMutationalSignatures_results.zip``;
      6. recreates ``app.config['UPLOAD_FOLDER']`` if it is missing.

    ``genome_ref``, ``mutationalPattern``, ``sigflow``, ``sigfit`` and
    ``deconstructSigs`` are read from the enclosing scope.
    """
    print(genome_ref, mutationalPattern, sigflow, sigfit, deconstructSigs)
    x = 1
    yield "data:" + str(x) + "\n\n"
    yield "data:" + str(x) + "\n\n"

    if glob.glob("uploads/*.vcf"):
        # NOTE(review): the genome is hard-coded here although genome_ref is
        # passed to the R script below - confirm this is intended.
        matGen.SigProfilerMatrixGeneratorFunc("MetaMutationalSigs", 'GRCh37', "uploads")
        x = x + 33
        yield "data:" + str(x) + "\n\n"

        subprocess.call([
            'Rscript', "../meta_sig_main_flask.r", "uploads", genome_ref,
            mutationalPattern, sigflow, sigfit, deconstructSigs
        ])
        x = x + 33
        yield "data:" + str(x) + "\n\n"

        subprocess.call([
            'python3.8', "../plot_graphs.py", "uploads", mutationalPattern,
            sigflow, sigfit, deconstructSigs
        ])

        # Drop the generator's working directories and the uploaded VCFs so
        # that they do not end up in the result archive.
        shutil.rmtree("uploads" + "/input")
        shutil.rmtree("uploads" + "/logs")
        shutil.rmtree("uploads" + "/output")
        for entry in os.listdir("uploads"):
            if entry.endswith(".vcf"):
                os.remove(os.path.join("uploads", entry))

        # The context manager closes the archive even if zipdir() raises.
        with zipfile.ZipFile("metaMutationalSignatures_results.zip", 'w',
                             zipfile.ZIP_DEFLATED) as zipf:
            zipdir("./uploads/", zipf)

        if not os.path.isdir(app.config['UPLOAD_FOLDER']):
            os.mkdir(app.config['UPLOAD_FOLDER'])
        x = x + 33
        yield "data:" + str(x) + "\n\n"
def SigProfilerSimulator(project, project_path, genome, contexts, exome=None, simulations=1, updating=False, bed_file=None, overlap=False, gender='female', seqInfo=False, chrom_based=False, seed_file=None, spacing=1, noisePoisson=False, noiseAWGN=0, cushion=100, region=None, vcf=False, mask=None):
    """
    Simulate mutations for a project and write the results below ``project_path``.

    The function checks that the chromosome strings, chromosome proportions,
    mutational matrices and context-distribution files exist (creating the
    ones it can), distributes the chromosomes round-robin over a process pool
    running ``simScript.simulator`` and finally merges the output with
    ``simScript.combine_simulation_files``.

    Parameters:
        project: project name; used in matrix, log and output names.
        project_path: project directory; a trailing "/" is added if missing.
        genome: reference genome. GRCh37/GRCh38 and mm9/mm10 are recognised
            (case-insensitively); anything else is handled as a custom genome
            whose chromosomes are listed from the reference directory.
        contexts: mutation contexts -> [] must be a list. It is sorted in
            place in descending order.
        exome: use the ``.exome`` matrices and the exome interval list.
        simulations: number of simulation iterations.
        bed_file: BED file; selects the ``.region`` matrices and ``_BED``
            context distributions.
        gender: 'female' (any case) removes chromosome Y.
        seqInfo: (re)create ``output/vcf_files/simulations/<context>/``.
        chrom_based: use per-chromosome matrices.
        seed_file: file with one integer seed per line; defaults to
            ``SigProfilerSimulator/seeds.txt`` next to this package. The
            current time is added to every seed that is read.
        cushion: forwarded to the matrix generator and to the simulator.
        region: restrict the simulation to this single chromosome.
        vcf: create one output directory per sample.
        updating, overlap, spacing, noisePoisson, noiseAWGN, mask:
            forwarded unchanged to ``simScript.simulator``.

    Side effects:
        ``sys.stderr`` is redirected to ``logs/SigProfilerSimulator_*.err``
        while the function runs and restored afterwards. Run metadata is
        written to the matching ``.out`` file. Calls ``sys.exit()`` when the
        chromosome strings are not installed. Any exception raised by a
        simulation or merge worker is re-raised here.
    """
    print("\n======================================\n SigProfilerSimulator \n======================================\n\nChecking for all reference files and relevant matrices...")
    start_run = time.time()

    # Ensures proper string for the project's path
    if project_path[-1] != "/":
        project_path += "/"

    # Sorts the user-provided contexts
    contexts.sort(reverse=True)

    bed = bool(bed_file)
    exome_file = None

    # Assigns a species based on the genome parameter
    if genome.upper() in ('GRCH37', 'GRCH38'):
        species = "homo_sapiens"
    elif genome.upper() in ('MM10', 'MM9'):
        species = "mus_musculus"
    else:
        species = "custom"

    ############################## References ###########################################################################################################
    chromosomes = ['X', 'Y', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
                   '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']

    tsb_ref = {0:['N','A'], 1:['N','C'], 2:['N','G'], 3:['N','T'],
               4:['T','A'], 5:['T','C'], 6:['T','G'], 7:['T','T'],
               8:['U','A'], 9:['U','C'], 10:['U','G'], 11:['U','T'],
               12:['B','A'], 13:['B','C'], 14:['B','G'], 15:['B','T'],
               16:['N','N'], 17:['T','N'], 18:['U','N'], 19:['B','N']}

    tsb_ref_rev = {'N':{'A':0, 'C':1, 'G':2, 'T':3, 'N':16},
                   'T':{'A':4, 'C':5, 'G':6, 'T':7, 'N':17},
                   'U':{'A':8, 'C':9, 'G':10, 'T':11, 'N':18},
                   'B':{'A':12, 'C':13, 'G':14, 'T':15, 'N':19}}

    if species == 'mus_musculus':
        chromosomes = chromosomes[:21]

    chromosome_string_path, ref_dir = matRef.reference_paths(genome)
    if species == 'custom':
        # Custom genomes: take the chromosome names from the reference files
        chromosomes = os.listdir(chromosome_string_path)
        if ".DS_Store" in chromosomes:
            chromosomes.remove(".DS_Store")
        chromosomes = [x.split(".")[0] for x in chromosomes if len(x.split(".")[0]) < 8]
        if genome == 'yeast':
            yeast_order = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI']
            chromosomes = sorted(chromosomes, key=lambda x: yeast_order.index(x))

    if gender == 'female' or gender.upper() == 'FEMALE':
        if "Y" in chromosomes:
            chromosomes.remove('Y')
    if region:
        chromosomes = [region]

    ############################## Log and Error Files ##################################################################################################
    time_stamp = datetime.date.today()
    log_prefix = project_path + 'logs/SigProfilerSimulator_' + project + "_" + genome + "_" + str(time_stamp)
    error_file = log_prefix + ".err"
    log_file = log_prefix + ".out"
    if not os.path.exists(project_path + "logs/"):
        os.makedirs(project_path + "logs/")

    if os.path.exists(error_file):
        os.remove(error_file)
    if os.path.exists(log_file):
        os.remove(log_file)

    # Keep the caller's stderr so it can be restored; previously the process
    # was left with sys.stderr pointing at a closed file.
    original_stderr = sys.stderr
    sys.stderr = open(error_file, 'w')
    log_out = open(log_file, 'w')
    log_out.write("THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
    log_out.write("-------System Info-------\n")
    log_out.write("Operating System Name: "+ platform.uname()[0]+"\n"+"Nodename: "+ platform.uname()[1]+"\n"+"Release: "+ platform.uname()[2]+"\n"+"Version: "+ platform.uname()[3]+"\n")
    log_out.write("\n-------Python and Package Versions------- \n")
    log_out.write("Python Version: "+str(sys.version_info.major)+"."+str(sys.version_info.minor)+"."+str(sys.version_info.micro)+"\n")
    log_out.write("SigProfilerSimulator Version: "+sigSim.__version__+"\n")
    log_out.write("SigProfilerMatrixGenerator Version: "+sig.__version__+"\n")
    log_out.write("numpy version: "+np.__version__+"\n")

    log_out.write("\n-------Vital Parameters Used for the execution -------\n")
    # Argument order matches the labels: Project, Genome, Input File Path
    # (genome and project_path used to be swapped).
    log_out.write("Project: {}\nGenome: {}\nInput File Path: {}\ncontexts: {}\nexome: {}\nsimulations: {}\nupdating: {}\nbed_file: {}\noverlap: {}\ngender: {}\nseqInfo: {}\nchrom_based: {}\nseed_file: {}\n".format(project, genome, project_path, contexts, str(exome), str(simulations), str(updating), str(bed_file), str(overlap), gender, str(seqInfo), str(chrom_based), str(seed_file)))
    log_out.write("\n-------Date and Time Data------- \n")
    tic = datetime.datetime.now()
    log_out.write("Date and Clock time when the execution started: "+str(tic)+"\n\n\n")

    ############################## Pre-simulation Checks ##################################################################################################
    # Ensures that the chromosome strings are saved properly:
    chromosome_string_path, ref_dir = matRef.reference_paths(genome)
    if not os.path.exists(chromosome_string_path) or len(os.listdir(chromosome_string_path)) < len(chromosomes):
        print(" The chromosome strings were not saved properly or have not been created yet. Please refer to the SigProfilerMatrixGenerator README for installation instructions:\n\thttps://github.com/AlexandrovLab/SigProfilerMatrixGenerator")
        log_out.close()
        sys.stderr.close()
        sys.stderr = original_stderr
        sys.exit()

    # Ensures that the chromosome proportions are saved:
    if not os.path.exists(chromosome_string_path + genome + "_proportions.txt"):
        print(" Chromosome proportion file does not exist. Creating now...", end='')
        simScript.chrom_proportions(chromosome_string_path, genome, chromosomes)
        print("Completed!")

    if bed_file:
        print(" Creating a chromosome proportion file for the given BED file ranges...", end='')
        simScript.chrom_proportions_BED(bed_file, chromosome_string_path, genome, chromosomes)
        print("Completed!")

    # Ensures that the mutational matrices exist:
    catalogue_files = {}
    for context in contexts:
        matrix_path = project_path + "output/"
        if context == 'DINUC' or 'DBS' in context:
            context_folder = 'DBS'
            matrix_path = matrix_path + context_folder + "/"
            if context == 'DBS' or context == 'DINUC' or context == '78':
                file_name = ".DBS78"
            else:
                file_name = '.' + context
        elif context == 'INDEL' or 'ID' in context or '415' in context:
            context_folder = 'ID'
            matrix_path = matrix_path + context_folder + "/"
            if context == 'INDEL' or context == 'ID' or context == '83':
                file_name = '.ID83'
            else:
                file_name = "." + context
        else:
            context_folder = 'SBS'
            matrix_path = matrix_path + context_folder + "/"
            file_name = '.SBS' + context

        if exome:
            catalogue_file = matrix_path + project + file_name + '.exome'
        elif bed_file:
            catalogue_file = matrix_path + project + file_name + '.region'
        else:
            catalogue_file = matrix_path + project + file_name + '.all'

        catalogue_files[context] = catalogue_file

        vcf_files_1 = project_path
        vcf_files_2 = project_path + "input/"

        if chrom_based and not os.path.exists(catalogue_file + '.chr1'):
            if not os.path.exists(vcf_files_2) and len(os.listdir(vcf_files_1)) == 0:
                print(" Please place your vcf files for each sample into the 'references/vcf_files/[project]/' directory. Once you have done that, rerun this script.")
            else:
                print(" Matrices per chromosomes do not exist. Creating the matrix files now.")
                matGen.SigProfilerMatrixGeneratorFunc(project, genome, project_path, plot=False, exome=exome, bed_file=bed_file, chrom_based=True, cushion=cushion)

        # The genome-wide matrix is required with and without chrom_based.
        if not os.path.exists(catalogue_file):
            if not os.path.exists(vcf_files_2) and len(os.listdir(vcf_files_1)) == 0:
                print(" Please place your vcf files for each sample into the 'references/vcf_files/[project]/' directory. Once you have done that, rerun this script.")
            else:
                print(" " + catalogue_file + " does not exist. Creating the matrix file now.")
                matGen.SigProfilerMatrixGeneratorFunc(project, genome, project_path, plot=False, exome=exome, bed_file=bed_file, cushion=cushion)

    if exome:
        exome_file = ref_dir + "/references/chromosomes/exome/" + genome + "/" + genome + "_exome.interval_list"

    # Ensures that the nucleotide context files are saved properly
    nucleotide_context_files = {}
    for context in contexts:
        # The context distributions live three levels above the chromosome strings
        ref_path = '/'.join(chromosome_string_path.split("/")[:-3])
        nucleotide_context_file = ref_path + '/context_distributions/'

        if bed_file:
            if region:
                nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + ".csv"
            else:
                nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + "_BED.csv"
        else:
            if exome:
                nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + "_exome.csv"
            else:
                nucleotide_context_file += "context_distribution_" + genome + "_" + context + "_" + gender + ".csv"

        nucleotide_context_files[context] = nucleotide_context_file

        # A BED-specific distribution is always rebuilt for the current BED file
        if os.path.exists(nucleotide_context_file) and bed and not region:
            os.remove(nucleotide_context_file)

        if not os.path.exists(nucleotide_context_file) and (context != 'INDEL' and context != 'ID' and context != 'ID415'):
            print(" The context distribution file does not exist. This file needs to be created before simulating. This may take several hours...")
            if bed:
                output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '_BED.csv'
                context_dist.context_distribution_BED(context, output_file, chromosome_string_path, chromosomes, bed, bed_file, exome, exome_file, genome, ref_path, tsb_ref, gender)
            elif exome:
                output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '_exome.csv'
                context_dist.context_distribution_BED(context, output_file, chromosome_string_path, chromosomes, bed, bed_file, exome, exome_file, genome, ref_dir, tsb_ref, gender)
            else:
                output_file = ref_path + '/context_distributions/context_distribution_' + genome + "_" + context + "_" + gender + '.csv'
                context_dist.context_distribution(context, output_file, chromosome_string_path, chromosomes, tsb_ref, genome)
            print(" The context distribution file has been created!")
            if gender == 'female' or gender.upper() == 'FEMALE':
                if "Y" in chromosomes:
                    chromosomes.remove('Y')

    ############################## Set-up output files ##################################################################################################
    context_string = "_".join(contexts)
    if bed_file:
        output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '_BED/'
    elif exome:
        output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '_exome/'
    else:
        output_path = project_path + "output/simulations/" + project + '_simulations_' + genome + '_' + context_string + '/'

    # Always start from an empty output directory
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    if "M" in chromosomes:
        chromosomes.remove("M")
    if "MT" in chromosomes:
        chromosomes.remove("MT")

    ############################## Begin the simulation process ##################################################################################################
    print()
    if chrom_based:
        sample_names, mut_prep, mut_dict = simScript.mutation_preparation_chromosomes(catalogue_files, matrix_path, chromosomes, project, log_file)
        reference_sample = sample_names[0]
    elif region:
        sample_names, mut_prep, mut_dict = simScript.mutation_preparation_region(catalogue_files, matrix_path, project, log_file, region)
        reference_sample = sample_names[0]
    else:
        sample_names, mut_prep = simScript.mutation_preparation(catalogue_files, log_file)
        reference_sample = sample_names[0]
        mut_dict = simScript.mut_tracker(sample_names, mut_prep, reference_sample, nucleotide_context_files, chromosome_string_path, genome, chromosomes, bed_file, log_file)

    if vcf:
        if "" in sample_names:
            sample_names.remove("")
        for sample in sample_names:
            if not os.path.exists(output_path + sample + "/"):
                os.makedirs(output_path + sample + "/")

    # Set-up parallelization: never start more workers than chromosomes.
    # (The pool itself is created right before it is used; the original
    # created an extra pool here that was never used or closed.)
    processors = mp.cpu_count()
    max_seed = processors
    if processors > len(chromosomes):
        max_seed = len(chromosomes)

    # Deal chromosomes and iteration numbers round-robin into one bin per worker
    chromosomes_parallel = [[] for _ in range(max_seed)]
    chrom_bin = 0
    for chrom in chromosomes:
        if chrom_bin == max_seed:
            chrom_bin = 0
        chromosomes_parallel[chrom_bin].append(chrom)
        chrom_bin += 1

    iterations_parallel = [[] for _ in range(max_seed)]
    iter_bin = 0
    for i in range(1, simulations + 1, 1):
        if iter_bin == max_seed:
            iter_bin = 0
        iterations_parallel[iter_bin].append(i)
        iter_bin += 1

    # Generate unique seeds for each process
    log_out.write("\n-------Seeds for random number generation per process------- \n")
    seeds = []
    if seed_file is None:
        # NOTE(review): this rebinds ref_dir, which is also used for the exome
        # interval list below; with a user-supplied seed_file ref_dir keeps
        # the value from matRef.reference_paths - confirm both are intended.
        ref_dir = os.path.split(os.path.dirname(os.path.abspath(__file__)))[0]
        seed_file = ref_dir + "/SigProfilerSimulator/seeds.txt"
    with open(seed_file) as f:
        for i in range(0, max_seed, 1):
            # The current time is added so that each run uses different seeds
            new_seed = int(f.readline().strip()) + time.time()
            seeds.append(new_seed)
            log_out.write("Process " + str(i) + ": " + str(new_seed) + "\n")

    log_out.write("\n\n\n-------Runtime Checkpoints------- \n")
    log_out.close()

    if exome:
        bed = True
        bed_file = ref_dir + "/SigProfilerMatrixGenerator/references/chromosomes/exome/" + genome + "/" + genome + "_exome.interval_list"

    if seqInfo:
        seqOut_path = project_path + "output/vcf_files/simulations/"
        if not os.path.exists(seqOut_path):
            os.makedirs(seqOut_path)

        for context in contexts:
            if os.path.exists(seqOut_path + context + "/"):
                print(seqOut_path+ context + "/")
                shutil.rmtree(seqOut_path+ context + "/")
            os.makedirs(seqOut_path+ context + "/")

    # Simulate: one task per chromosome bin
    pool = mp.Pool(max_seed)
    results = []
    for i in range(0, len(chromosomes_parallel), 1):
        # Restrict the innermost (per-chromosome) level of mut_dict to this bin
        mut_dict_parallel = {k1:{k2:{k3:{k4:v4 for k4, v4 in v3.items() if k4 in chromosomes_parallel[i]} for k3, v3 in v2.items()} for k2, v2 in v1.items()} for k1, v1 in mut_dict.items()}
        r = pool.apply_async(simScript.simulator, args=(sample_names, mut_dict_parallel, chromosome_string_path, tsb_ref, tsb_ref_rev, simulations, seeds[i], cushion, output_path, updating, chromosomes_parallel[i], project, genome, bed, bed_file, contexts, overlap, project_path, seqInfo, log_file, spacing, noisePoisson, noiseAWGN, vcf, mask))
        results.append(r)
    pool.close()
    pool.join()
    for r in results:
        r.wait()
        if not r.successful():
            # Raises an error when not successful
            r.get()

    # Merge: one task per iteration bin
    pool = mp.Pool(max_seed)
    #if region: bed=False
    combine_results = []
    for i in range(0, len(iterations_parallel), 1):
        r = pool.apply_async(simScript.combine_simulation_files, args=(iterations_parallel[i], output_path, chromosomes, sample_names, bed, exome, vcf))
        combine_results.append(r)
    pool.close()
    pool.join()
    # Check the merge tasks themselves (the simulator results were re-checked
    # here before, so a failed merge went unnoticed).
    for r in combine_results:
        r.wait()
        if not r.successful():
            # Raises an error when not successful
            r.get()

    end_run = time.time()
    run_time = end_run - start_run
    log_out = open(log_file, 'a')
    print("Simulation completed\nJob took " , run_time, " seconds", file=log_out)
    print("Simulation completed\nJob took " , run_time, " seconds")
    log_out.close()
    sys.stderr.close()
    sys.stderr = original_stderr
def single_sample(data, output, ref="GRCh37", sig_database="default", check_rules=True, exome=False):
    """
    Decompose the query samples into the global signatures.

    Parameters
    ----------
    data: string or dataframe. If a string, the path of the folder containing
        the vcf files; the mutational matrices are then generated with
        SigProfilerMatrixGenerator. If a dataframe, a mutational catalogue
        where the row index holds the names of the mutations and the column
        names are the sample names. With a dataframe ``check_rules`` is
        forced to False.

    output: string. The name of the output folder. It is created if it does
        not exist.

    ref: string. The name of the reference genome, which should have been
        installed previously through "SigProfilerMatrixGenerator". The
        default reference genome is "GRCh37". Only used when ``data`` is a
        string.

    sig_database: string or dataframe. Any string selects the signature set
        bundled with the package. A dataframe is used as a custom signature
        catalogue where the row index holds the names of the mutations and
        the column names are the signature names. The sum of each column
        should be one. The rows should match the mutational catalogue in
        number and order.

    check_rules: boolean. If True, check the signature rules. Not functional
        for the custom signature database.

    exome: boolean. If True, the mutational profile is generated only for
        the exome; if False, for the whole genome. Only used when ``data``
        is a string.

    Returns
    -------
    None. The output folder will contain the following files:
        -decomposition_profile.csv
        -signatures.txt
        -Sig_activities.txt
        -Mutation_Probabilities.txt
        -the signature plot (when the context can be plotted)
        -the dendrogram plot (when it can be created)

    Raises
    ------
    ValueError: if the mutational catalogue contains no samples.

    Example
    -------
    >>> from sigproSS import spss
    >>> data = spss.importdata()
    >>> spss.single_sample(data, "results", ref="GRCh37", exome=False)
    """
    if not os.path.exists(output):
        os.makedirs(output)

    # get the path for files
    paths = cosmic.__path__[0]

    # set the signature database (loaded once; it used to be read twice)
    if isinstance(sig_database, str):
        signatures_names = paths + '/input/signaturesSet.txt'
        wholegenome_singnatures = paths + '/input/genomeSignatures.txt'
        exome_signatures = paths + '/input/exomeSignatures.txt'

        # extract data from the signature database
        with open(signatures_names, 'r') as names_file:
            signaturesNames = names_file.read().split('\n')
        allGenomeSignatures = np.loadtxt(wholegenome_singnatures)
        allExomeSignatures = np.loadtxt(exome_signatures)
    else:
        # Kept as a pandas Index: this is the value the analysis received
        # before the duplicate loading code was removed.
        signaturesNames = sig_database.columns
        allGenomeSignatures = np.array(sig_database)
        allExomeSignatures = np.array(sig_database)

    # take the inputs: check if the input type is a vcf folder or a dataframe
    if isinstance(data, str):
        vcf = data
        # The project name is the last component of the folder path
        if vcf[-1] != "/":
            vcf_name = vcf.split("/")[-1]
        else:
            vcf_name = vcf.split("/")[-2]
        data = matGen.SigProfilerMatrixGeneratorFunc(vcf_name, ref, vcf, exome=exome, tsb_stat=True)
        p_value = data["7_pvalue"]
        data = data["96"]
    else:
        p_value = "none"
        check_rules = False

    if data.shape[1] == 0:
        # Without samples there are no results to build the outputs from
        raise ValueError("The mutational catalogue does not contain any samples.")

    # totalExposures has the dimension (total signatures, total samples)
    number_of_signatures = len(signaturesNames)
    totalExposures = np.zeros([number_of_signatures, data.shape[1]])
    listOfSamples = list(data.columns)

    # open a file to profile the signatures
    profile_path = output + "/decomposition_profile.csv"
    with open(profile_path, "w") as fh:
        fh.write("Sample_Names,Global_NMF_Signatures,Similarity\n")

    for i in range(data.shape[1]):
        print("##########################################################")
        print("Exacting Profile for " + "Sample " + str(i + 1))
        samples = np.array(data.iloc[:, i:i + 1])
        sampleNames = list(data.head(0))[i:i + 1]
        cancerType = ['Breast Cancer'] * samples.shape[1]
        seqType = ['WGS'] * samples.shape[1]
        totalMutations = np.sum(samples, axis=0)

        # results contains [indices, exposures, signatureNames, allSignatures, similarity]
        results = analysis_individual_samples(
            samples, check_rules, signaturesNames, allGenomeSignatures,
            allExomeSignatures, sampleNames, cancerType, cancerType, seqType,
            totalMutations, p_value,
            paths + '/input/20181108_Signature_Rules.xml')

        totalExposures[results[0], i] = results[1]
        profile = decomposition_profile(totalExposures[:, i], results[4],
                                        results[2], sampleNames[0])

        # write the profiles into file
        with open(profile_path, "a") as fh:
            fh.write(profile)

    # The signature names and matrix of the last sample describe the catalogue
    listOfSignatures = results[2]

    # prepare the exposures dataframe
    totalExposures = pd.DataFrame(totalExposures)
    totalExposures = totalExposures.set_index(listOfSignatures)
    totalExposures.columns = listOfSamples
    totalExposures = totalExposures.rename_axis("Samples", axis="columns")

    # Convert the floats to integers
    totalExposures = totalExposures.astype(np.int64)

    # remove the rows with all zeros to create the final exposure dataframe
    exposures = totalExposures.loc[~(totalExposures == 0).all(axis=1)]

    # prepare the signatures dataframe
    signatures = pd.DataFrame(results[3])
    signatures.columns = listOfSignatures
    signatures = signatures.set_index(data.index)
    signatures = signatures.rename_axis("Signatures", axis="columns")

    # Filter the signatures by the exposures rows to get the final signature dataframe
    signatures = signatures.loc[:, list(exposures.index)]

    # create the probabilities
    probability = sub.probabilities(signatures, exposures, data.index,
                                    signatures.columns, totalExposures.columns)
    probability = probability.set_index("Sample Names")
    probability = probability.rename_axis("", axis="columns")

    try:
        # create the dendrogram (best effort: the analysis continues without it)
        sub.dendrogram(exposures, 0.05, output)
    except Exception as err:
        print("The dendrogram could not be created: " + str(err))

    # export results (sep is passed by keyword; pandas deprecates passing it
    # positionally)
    signatures.to_csv(output + "/signatures.txt", sep="\t",
                      index_label=[signatures.columns.name])
    exposures.to_csv(output + "/Sig_activities.txt", sep="\t",
                     index_label=[exposures.columns.name])
    probability.to_csv(output + "/Mutation_Probabilities.txt", sep="\t")

    try:
        plot.plotSBS(output + "/signatures.txt", output + "/Signature_plot",
                     "", "96", True, custom_text_upper=" ")
    except Exception:
        print(
            "SORRY! THE MUTATION CONTEXT YOU PROVIDED COULD NOT BE PLOTTED\n\n"
        )

    print(
        "CONGRATULATIONS! THE SIGPROFILER SINGLE SAMPLE ANALYSIS ENDED SUCCESSFULLY"
    )
if __name__ == "__main__":
    # Copy the MAF file into a working directory and build the mutational
    # matrices for it (GRCh37, whole genome).
    args = parse_args()

    # Default working directory: ./sigprof_input below the current directory
    if args.directory is None:
        current = getcwd()
        args.directory = current + "/" + "sigprof_input"

    try:
        if not isdir(args.directory):
            mkdir(args.directory)
    except OSError as err:
        # Without the working directory nothing below can succeed, so stop
        # here instead of printing a message and carrying on.
        raise SystemExit(
            "ERROR: creation of directory {} failed. Please use the -d option "
            "to create a valid directory. ({})".format(args.directory, err))

    destination = args.directory + "/" + basename(args.maf)
    try:
        shutil.copyfile(args.maf, destination)
    except shutil.SameFileError:
        # The MAF file already lives in the working directory: nothing to copy
        pass
    except OSError as err:
        raise SystemExit("File copy failed. {} {} ({})".format(
            args.maf, destination, err))

    matrices = matGen.SigProfilerMatrixGeneratorFunc(
        args.project, "GRCh37", args.directory, plot=args.plot, exome=False,
        bed_file=None, chrom_based=False, tsb_stat=False, seqInfo=False,
        cushion=args.cushion)
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# Build the GRCh37 mutational matrices for the files in ./data/, with plots.
# The arguments are, in order: project name, reference genome, input folder.
# NOTE(review): "./output/" is passed as the project name - confirm that a
# path-like value is intended there.
matrices = matGen.SigProfilerMatrixGeneratorFunc(
    "./output/",
    "GRCh37",
    "./data/",
    plot=True,
    exome=False,
    bed_file=None,
    chrom_based=False,
    tsb_stat=False,
    seqInfo=False,
    cushion=100,
)
# Stand-alone test configuration. In the workflow these values are meant to
# come from snakemake instead:
#   out_dir = snakemake.params["out_dir"]
#   bed = snakemake.input["bed"]
sample = "SJCBF"
genome_version = "GRCh37"
out_dir = ".tests/"
bed = None

# Install the reference genome
from SigProfilerMatrixGenerator import install as genInstall
genInstall.install(genome_version, rsync=False, bash=True)

# Generate the exome-restricted mutational matrices (with plots)
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen
matrices = matGen.SigProfilerMatrixGeneratorFunc(
    sample, genome_version, out_dir, plot=True, exome=True, bed_file=bed,
    chrom_based=False, tsb_stat=False, seqInfo=False, cushion=100)

matrix_path = out_dir + "output/SBS/{sample}.SBS96.exome".format(sample=sample)

# Draw the sample portrait. The call used to reference three names that are
# never defined in this script (sample_matrices_path, output_path, project)
# and therefore failed with a NameError.
# NOTE(review): the values defined above are passed instead - confirm that
# samplePortrait expects the matrix file (not its folder) as first argument.
from sigProfilerPlotting import sample_portrait as sP
sP.samplePortrait(matrix_path, out_dir, sample, percentage=False)
def _remove_output_dir(path):
    """Delete every file in *path* and then *path* itself, if it exists."""
    if os.path.exists(path):
        for f in os.listdir(path):
            os.remove(os.path.join(path, f))
        os.rmdir(path)


def _has_mutations(matrices, matrix_key, project):
    """Return True when ``matrices[matrix_key][project]`` sums to more than 0."""
    try:
        return bool(matrices[matrix_key][project].sum() > 0)
    except Exception:
        # A missing matrix or sample column is treated like an empty one
        return False


def main():
    """Generate the mutational matrices and extract signatures per class.

    ``parse_arguments()`` supplies ``project``, ``genome``, ``vcfpath``,
    ``exome``, ``threads`` and ``output``. After the matrices are generated,
    each mutation class (SBS/96, DBS/78, ID/83) is kept only if its matrix
    has a column for the project with a positive sum; for every other class
    the folder ``<vcfpath>/output/<class>`` is deleted. The remaining classes
    are processed concurrently by ``extractSignatures``, with the available
    threads divided evenly between them (at least one each).

    Raises:
        Any exception raised by ``extractSignatures`` is re-raised.
    """
    # Parse and validate arguments
    args = parse_arguments()
    matrices = matGen.SigProfilerMatrixGeneratorFunc(
        args.project, args.genome, args.vcfpath, plot=True, exome=args.exome,
        bed_file=None, chrom_based=False, tsb_stat=False, seqInfo=False,
        cushion=100)

    # (key in the matrices dict, signature class, signature context)
    candidates = (
        ('96', 'SBS', '96'),
        ('DINUC', 'DBS', '78'),
        ('ID', 'ID', '83'),
    )
    sig_list = []
    for matrix_key, sig_class, sig_context in candidates:
        if _has_mutations(matrices, matrix_key, args.project):
            sig_list.append((sig_class, sig_context))
        else:
            _remove_output_dir(args.vcfpath + "/output/" + sig_class)

    num_tasks = len(sig_list)
    if num_tasks > 0:
        cpus_per_task = max(int(args.threads / num_tasks), 1)
        with ThreadPoolExecutor(max_workers=3) as e:
            futures = [
                e.submit(extractSignatures, args.output, args.vcfpath,
                         args.genome, args.project, sigClass, sigContext,
                         args.exome, cpus_per_task)
                for sigClass, sigContext in sig_list
            ]
        # Surface errors from the workers; exceptions stored in a future that
        # is never inspected are silently dropped.
        for future in futures:
            future.result()
import argparse
from os.path import dirname

from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen


def _str2bool(value):
    """Convert a command-line word such as "true" or "0" into a bool."""
    text = str(value).strip().lower()
    if text in ("1", "true", "t", "yes", "y"):
        return True
    if text in ("0", "false", "f", "no", "n"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def parse_args():
    """Parse the command-line options of the MAF matrix extraction script.

    Options:
        -i/--input: MAF file from which to extract the matrix (required).
        -e/--exome: restrict the genome to exome regions. May be given
            without a value (``-e``) or with an explicit boolean
            (``-e true`` / ``-e false``). Default False.
        -p/--project: project name for the output (default "PROJECT").
        -d/--directory: input/output directory (default "input").
        -P/--plot: output plots of the input data.

    Returns:
        The ``argparse.Namespace`` with the parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", dest="input", required=True,
                        help="MAF file from which to extract matrix.")
    # nargs="?" keeps the old "-e VALUE" form working while allowing a bare
    # "-e"; the value is parsed so that "-e false" really means False.
    parser.add_argument("-e", "--exome", dest="exome", nargs="?", const=True,
                        default=False, type=_str2bool,
                        help="Exome data - restrict genome to exome regions")
    parser.add_argument("-p", "--project", dest="project", default="PROJECT",
                        help="Project name for output.")
    parser.add_argument("-d", "--directory", dest="directory", default="input",
                        help="Input/Output directory")
    parser.add_argument("-P", "--plot", dest="plot", action="store_true",
                        help="Output plots of input data.")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # NOTE(review): --directory defaults to "input", so this branch never runs,
    # and args.directory is not used below - confirm the intended role of -d.
    if args.directory is None:
        args.directory = dirname(args.input)

    # The generator works on the folder that contains the MAF file; dirname()
    # is "" for a bare file name, which means the current directory.
    input_dir = dirname(args.input) or "."
    matrices = matGen.SigProfilerMatrixGeneratorFunc(
        args.project, "GRCh37", input_dir, plot=args.plot, exome=args.exome,
        bed_file=None, chrom_based=False, tsb_stat=False, seqInfo=True,
        cushion=100)
def sigProfilerExtractor(input_type,
                         out_put,
                         input_data,
                         refgen="GRCh37",
                         genome_build='GRCh37',
                         startProcess=1,
                         endProcess=10,
                         totalIterations=8,
                         cpu=-1,
                         hierarchy=False,
                         mtype=["default"],
                         exome=False,
                         par_h=0.90,
                         penalty=0.05,
                         resample=True):
    """
    Extracts mutational signatures from an array of samples.

    Parameters
    ----------
    input_type: A string. Type of input. One of:
            - "vcf": input_data is a folder of vcf (or text format) files; the
              matrices are generated with SigProfilerMatrixGenerator.
            - "text" or "table": input_data is a tab separated matrix file
              (first column is the mutation type, the rest are samples).
            - "csv": input_data is a file read through sub.read_csv.
            - "matobj": input_data is a MATLAB .mat file.

    out_put: A string. The name of the output folder. It is created if it
            does not exist.

    input_data: A string. Name of the input folder (for "vcf") or input file
            (for the other input types).

    refgen: A string, optional. The name of the reference genome. The default
            is "GRCh37". Only used when input_type is "vcf".

    genome_build: A string, optional. Genome build handed to the signature
            decomposition step. For "vcf" input it is overwritten by refgen.

    startProcess: A positive integer, optional. The minimum number of
            signatures to be extracted. The default value is 1.

    endProcess: A positive integer, optional. The maximum number of
            signatures to be extracted. The default value is 10.

    totalIterations: A positive integer, optional. The number of iterations
            performed to extract each number of signatures. Default is 8.

    cpu: An integer, optional. The number of processors passed to the
            extraction step. The default value is -1.

    hierarchy: Boolean, optional. If True the signatures are extracted in a
            hierarchical (layer by layer) fashion. The default is False.
            Must be exactly True or False.

    par_h: Float, optional. Default is 0.90. Active only if hierarchy is
            True. Samples whose cosine similarity between the original and
            the reconstructed sample is below this cutoff are carried over
            to the next hierarchical layer.

    mtype: A list of strings, optional. The mutational contexts to extract
            for "vcf" input. With the default ["default"], the contexts are
            chosen from what the matrix generator produced: "96", "DINUC"
            and "ID" when all are present, else "96" and "DINUC", else "ID".

    exome: Boolean, optional. Passed to the matrix generator for "vcf"
            input. The default value is False.

    penalty: Float, optional. Default is 0.05. Passed to the final solution
            step as the cutoff used to assign signatures to a sample.

    resample: Boolean, optional. Default is True. Passed to the extraction
            step (adds poisson noise to samples by resampling).

    Returns
    -------
    None. Results are written below the out_put folder:

        -JOB_METADATA.txt (system info, parameters, progress, timing)
        -<mutation type> folder ("SBS<context>", "DBS78" or "ID83")
            If hierarchy is False:
                -All_solutions_stat.csv
                -Samples.txt
                -Suggested_Solution/De_Novo_Solution
                -Suggested_Solution/Decomposed_Solution
            If hierarchy is True:
                -All_Solution_Layer/L<layer>/All_solutions_stat.csv
                -All_Solution_Layer/L<layer>/Samples_in_Layer_<layer>.text
                -All_Solution_Layer/L<layer>/L<layer>_Solution
                -Suggested_Solution/De_Novo_Solution
                -Suggested_Solution/Decomposed_Solution

    The per-signature-count solution files and the plots are written by the
    helper functions in ``sub``. The Decomposed_Solution folder is removed
    again when the decomposition step fails.

    Raises
    ------
    ValueError: if input_type is not recognised, or if (for "vcf" input with
            the default mtype) none of the default context sets is available.
    Exception: if an explicitly requested mtype was not generated.

    Examples
    --------
    >>> from sigproextractor import sigpro as sig
    >>> data = sig.importdata("vcf")
    >>> sig.sigProfilerExtractor("vcf", "example_output", data, startProcess=1, endProcess=3)

    Wait until the execution is finished. The process may take a couple of
    hours based on the size of the data. Check the "example_output" folder.
    """
    memory_usage()

    metadata_path = out_put + "/JOB_METADATA.txt"
    # Matrices with 78 / 83 rows are the DINUC / ID contexts.
    row_count_alias = {"78": "DINUC", "83": "ID"}

    #################################### At first create the system data file ####################################
    os.makedirs(out_put, exist_ok=True)

    # format the project_name first:
    project = input_data  # used as the project argument in SigprofilerMatrixGenerator
    if project[-1] != "/":
        project_name = project.split("/")[-1]  # used as the project_name argument in SigprofilerMatrixGenerator
    else:
        project_name = project.split("/")[-2]

    with open(metadata_path, "w") as sysdata:
        sysdata.write(
            "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
        sysdata.write("-------System Info-------\n")
        sysdata.write("Operating System Name: " + platform.uname()[0] + "\n" +
                      "Nodename: " + platform.uname()[1] + "\n" +
                      "Release: " + platform.uname()[2] + "\n" +
                      "Version: " + platform.uname()[3] + "\n")
        sysdata.write("\n-------Python and Package Versions------- \n")
        sysdata.write("Python Version: " +
                      str(platform.sys.version_info.major) + "." +
                      str(platform.sys.version_info.minor) + "." +
                      str(platform.sys.version_info.micro) + "\n")
        sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n")
        sysdata.write("SigprofilerPlotting Version: " +
                      sigProfilerPlotting.__version__ + "\n")
        sysdata.write("SigprofilerMatrixGenerator Version: " +
                      SigProfilerMatrixGenerator.__version__ + "\n")
        sysdata.write("Pandas version: " + pd.__version__ + "\n")
        sysdata.write("Numpy version: " + np.__version__ + "\n")
        sysdata.write("Scipy version: " + scipy.__version__ + "\n")
        sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n")
        sysdata.write("Nimfa version: " + nimfa.__version__ + "\n")
        sysdata.write(
            "\n-------Vital Parameters Used for the execution -------\n")
        sysdata.write(
            "input_type: {}\ninputdata: {}\nstartProcess: {}\nendProcess: {}\ntotalIterations: {}\ncpu: {}\nhierarchy: {}\nrefgen: {}\ngenome_build: {}\nmtype: {}\n"
            .format(input_type, project_name, startProcess, endProcess,
                    totalIterations, cpu, hierarchy, refgen, genome_build,
                    mtype))
        sysdata.write("\n-------Date and Time Data------- \n")
        tic = datetime.datetime.now()
        sysdata.write("Date and Clock time when the execution started: " +
                      str(tic) + "\n")

    hierarchi = hierarchy

    if input_type == "text" or input_type == "table":
        ################################### For text input files ######################################################
        text_file = project
        title = ""  # set the title for plotting

        data = pd.read_csv(text_file, sep="\t").iloc[:, :]
        data = data.dropna(axis=1, inplace=False)
        data = data.loc[:, (data != 0).any(axis=0)]
        genomes = data.iloc[:, 1:]
        genomes = np.array(genomes)
        allgenomes = genomes.copy()  # save the allgenomes for the final results

        # index and column names of processAvg and exposureAvg
        index = data.iloc[:, 0]
        colnames = data.columns[1:]
        allcolnames = colnames.copy()  # save the allcolnames for the final results

        # creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] in row_count_alias:
            mtypes = [row_count_alias[mtypes[0]]]

    elif input_type == "csv":
        ################################# For csv input files #######################################################
        filename = project
        title = ""  # set the title for plotting

        genomes, index, colnames, mtypes = sub.read_csv(filename)
        allgenomes = genomes.copy()
        allcolnames = colnames.copy()

        # Define the mtypes
        if mtypes[0] in row_count_alias:
            mtypes = [row_count_alias[mtypes[0]]]

    elif input_type == "matobj":
        ################################# For matlab input files #######################################################
        mat_file = project
        title = ""  # set the title for plotting

        mat = scipy.io.loadmat(mat_file)
        mat = sub.extract_input(mat)
        genomes = mat[1]
        allgenomes = genomes.copy()  # save the allgenomes for the final results

        # Build the mutation type labels, e.g. "A[C>A]A"
        index1 = mat[3]
        index2 = mat[4]
        index = []
        for i, j in zip(index1, index2):
            index.append(i[0] + "[" + j + "]" + i[2])
        colnames = np.array(pd.Series(mat[2]))
        allcolnames = colnames.copy()  # save the allcolnames for the final results
        index = np.array(pd.Series(index))

        # creating list of mutational type to sync with the vcf type input
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] in row_count_alias:
            mtypes = [row_count_alias[mtypes[0]]]

    elif input_type == "vcf":
        ################################# For vcf input files #######################################################
        title = project  # set the title for plotting

        data = datadump.SigProfilerMatrixGeneratorFunc(project_name,
                                                       refgen,
                                                       project,
                                                       exome=exome,
                                                       bed_file=None,
                                                       chrom_based=False,
                                                       plot=False,
                                                       gs=False)

        # Selecting the mutation types
        if mtype != ["default"]:
            mkeys = data.keys()
            mtypes = mtype
            if any(x not in mkeys for x in mtypes):
                raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
                                "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters")
        else:
            if set(["96", "DINUC", "ID"]).issubset(data):
                mtypes = ["96", "DINUC", "ID"]
            elif set(["96", "DINUC"]).issubset(data):
                mtypes = ["96", "DINUC"]
            elif set(["ID"]).issubset(data):
                mtypes = ["ID"]
            else:
                # Previously fell through and failed later on an unbound name.
                raise ValueError(
                    "None of the default mutation contexts could be selected "
                    "from the generated matrices; pass mtype explicitly")

        # set the genome_build
        genome_build = refgen

    else:
        raise ValueError(
            "Please provide a correct input_type. Check help for more details")

    ###########################################################################################################################################################################################
    for m in mtypes:

        # Determine the type of mutation, used for exporting and copying the files
        if m == "DINUC":
            mutation_type = "DBS78"
        elif m == "ID":
            mutation_type = "ID83"
        else:
            mutation_type = "SBS" + m

        if input_type == "vcf":
            genomes = pd.DataFrame(data[m])

            # check if the genome is a nonzero matrix
            shape = genomes.shape
            if shape == (0, 0):
                with open(metadata_path, "a") as sysdata:
                    sysdata.write(
                        "Sample is not a nonzero matrix for the mutation context "
                        + m + "\n")
                print(
                    "Sample is not a nozero matrix for the mutation context " +
                    m)
                continue

            # drop the samples (columns) that carry no mutation at all
            genomes = genomes.loc[:, (genomes != 0).any(axis=0)]
            allgenomes = genomes.copy()  # save the allgenomes for the final results
            index = genomes.index.values
            colnames = genomes.columns
            allcolnames = colnames.copy()  # save the allcolnames for the final results

        # in the plotting function "ID" is used as "INDEL"
        if m == "ID":
            m = "INDEL"  # for plotting

        # output directory that stores all the results of this context
        output = out_put + "/" + mutation_type

        est_genomes = np.zeros([1, 1])
        listofsignatures = []
        listofsignaturesSTE = []
        list_of_signature_stabilities = []
        list_of_signature_total_mutations = []
        H_iteration = 1
        flag = True  # the first pass of the while loop always runs

        # One pass per hierarchical layer (a single pass when hierarchy is False)
        while flag:
            genomes = np.array(genomes)
            information = []
            if hierarchi is True:
                layer_directory = output + "/All_Solution_Layer/L" + str(
                    H_iteration)
            elif hierarchi is False:
                layer_directory = output

            try:
                os.makedirs(layer_directory, exist_ok=True)
            except OSError:
                print("The {} folder could not be created".format("output"))

            with open(layer_directory + "/All_solutions_stat.csv", "w") as fh:
                fh.write("Total Signatures,Stability,Matrix Frobenius%\n")

            # similarity dataframes, one item per number of signatures
            all_similirities_list = []
            minimum_stabilities = []

            # normalize the genomes before running nmf
            genomes = sub.normalize_samples(genomes,
                                            normalize=False,
                                            all_samples=False,
                                            number=30000)

            # Extract the signatures for each requested number of signatures
            for num_signatures in range(startProcess, endProcess + 1):
                current_time_start = datetime.datetime.now()

                (processAvg,
                 exposureAvg,
                 processStd,
                 exposureStd,
                 avgSilhouetteCoefficients,
                 clusterSilhouetteCoefficients,
                 finalgenomeErrors,
                 finalgenomesReconstructed,
                 finalWall,
                 finalHall,
                 reconstruction_error,
                 processes) = sub.decipher_signatures(
                     genomes=genomes,
                     i=num_signatures,
                     totalIterations=totalIterations,
                     cpu=cpu,
                     mut_context=m,
                     resample=resample)

                ################################ add sparsity in the exposureAvg ################################
                # NOTE(review): an older comment mentions a stability
                # threshold of 0.85, but the cutoff actually applied is -1.0.
                if avgSilhouetteCoefficients > -1.0:
                    stic = time.time()

                    # refit every sample against the extracted signatures
                    pool = mp.Pool()
                    try:
                        results = [
                            pool.apply_async(ss.fit_signatures_pool,
                                             args=(
                                                 genomes,
                                                 processAvg,
                                                 x,
                                             ))
                            for x in range(genomes.shape[1])
                        ]
                        pooloutput = [p.get() for p in results]
                    finally:
                        pool.close()

                    for sample_idx, fitted in enumerate(pooloutput):
                        exposureAvg[:, sample_idx] = fitted[0]

                    stoc = time.time()
                    print("Optimization time is {} seconds".format(stoc -
                                                                   stic))

                # report progress to the system file:
                current_time_end = datetime.datetime.now()
                with open(metadata_path, "a") as sysdata:
                    if hierarchi is True:
                        sysdata.write(
                            "\nSignature extraction for {} completed for layer {} {} signatures for {}! TimeStamp: {}\n"
                            .format(mutation_type, H_iteration, processes,
                                    current_time_end - current_time_start,
                                    current_time_end))
                    else:
                        sysdata.write(
                            "\nSignature extraction for {} completed for {} signatures for {}! TimeStamp: {}\n"
                            .format(mutation_type, processes,
                                    current_time_end - current_time_start,
                                    current_time_end))

                # Get total mutations for each signature
                signature_total_mutations = np.sum(exposureAvg,
                                                   axis=1).astype(int)
                signature_stats = pd.DataFrame({
                    "Stability": clusterSilhouetteCoefficients,
                    "Total Mutations": signature_total_mutations
                })
                # NOTE: despite the name, this stores the AVERAGE stability
                minimum_stabilities.append(
                    round(np.mean(clusterSilhouetteCoefficients), 2))

                # Compute the estimated genome from the processAvg and exposureAvg
                est_genomes = np.dot(processAvg, exposureAvg)

                # similarities between the original and the estimated genomes
                all_similarities, cosine_similarities = sub.calculate_similarities(
                    genomes, est_genomes, colnames)

                # store the results of the loop. Here, processStd and
                # exposureStd are standard errors, NOT standard deviations.
                loopResults = [
                    genomes, processAvg, exposureAvg, processStd, exposureStd,
                    avgSilhouetteCoefficients, clusterSilhouetteCoefficients,
                    signature_total_mutations, all_similarities,
                    signature_stats, reconstruction_error, finalgenomeErrors,
                    finalgenomesReconstructed, finalWall, finalHall, processes
                ]
                # Looked up again once the best solution is known
                information.append([
                    processAvg, exposureAvg, processStd, exposureStd,
                    clusterSilhouetteCoefficients, signature_total_mutations,
                    signature_stats, all_similarities
                ])

                ################################# Export the results ###########################################################
                sub.export_information(loopResults, m, layer_directory, index,
                                       colnames)

                all_similirities_list.append(all_similarities)

            ########################################## Plot Stability vs Reconstruction Error #############################
            # Also yields the selected number of signatures (`solution`)
            solution, all_stats = sub.stabVsRError(
                layer_directory + "/All_solutions_stat.csv", layer_directory,
                title, all_similirities_list, mutation_type)
            # NOTE: the values inserted here are the average stabilities
            all_stats.insert(0, 'Stability (Avg Silhouette)',
                             minimum_stabilities)
            all_stats.to_csv(layer_directory + "/All_solutions_stat.csv",
                             sep=",")

            # the samples and matrix participating in this layer
            layer_genome = pd.DataFrame(genomes)
            layer_genome = layer_genome.set_index(index)
            layer_genome.columns = colnames
            layer_genome = layer_genome.rename_axis("Mutation Types",
                                                    axis="columns")

            ################################### Hierarchical Extraction #########################
            if hierarchi is True:
                layer_genome.to_csv(layer_directory + "/Samples_in_Layer_" +
                                    str(H_iteration) + ".text",
                                    sep="\t",
                                    index_label=[layer_genome.columns.name])
                del layer_genome

                # Remove a stale copy first. The check must use the same
                # spelling ("_Solution") as the folder created below,
                # otherwise copytree fails on case-sensitive filesystems.
                layer_solution_dir = layer_directory + "/L" + str(
                    H_iteration) + "_Solution"
                if os.path.exists(layer_solution_dir):
                    shutil.rmtree(layer_solution_dir)

                # Copy the best solution to the layer solution folder
                solutionFolderFrom = layer_directory + "/All_solutions/" + mutation_type + "_" + str(
                    solution) + "_Signatures"
                solutionFolderTo = layer_solution_dir + "/" + mutation_type + "_" + str(
                    solution) + "_Signatures"
                shutil.copytree(solutionFolderFrom, solutionFolderTo)

                # load the best processAvg, exposureAvg and processSTE based on the solution
                best = information[solution - startProcess]
                processAvg = best[0]
                exposureAvg = best[1]
                processSTE = best[2]
                list_of_signature_stabilities = list_of_signature_stabilities + list(
                    best[4])
                list_of_signature_total_mutations = list_of_signature_total_mutations + list(
                    best[5])

                # Compute the estimated genome from the processAvg and exposureAvg
                est_genomes = np.dot(processAvg, exposureAvg)

                # samples reconstructed with a similarity below par_h move on to the next layer
                low_similarity_idx = []
                for sample_idx in range(genomes.shape[1]):
                    similarity = sub.cos_sim(genomes[:, sample_idx],
                                             est_genomes[:, sample_idx])
                    if similarity < par_h:
                        low_similarity_idx.append(sample_idx)

                # Accumulate the signatures and signaturesSTE for the final results
                listofsignatures.append(processAvg)
                listofsignaturesSTE.append(processSTE)

                genomes = genomes[:, low_similarity_idx]
                colnames = colnames[low_similarity_idx]
                H_iteration = H_iteration + 1

                #########################################################################################################
                # Stop when fewer than 10 samples remain or when no sample
                # was explained in this layer; then write the final solution.
                if genomes.shape[1] < 10 or est_genomes.shape[
                        1] == genomes.shape[1]:
                    flag = False  # update the flag for the while loop

                    # create the folder for the final solution/ De Novo Solution
                    layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
                    try:
                        os.makedirs(layer_directory1, exist_ok=True)
                    except OSError:
                        print("The {} folder could not be created".format(
                            "output"))

                    # stack the signatures of all layers side by side
                    count = 0
                    for p, q in zip(listofsignatures, listofsignaturesSTE):
                        if count == 0:
                            processAvg = p
                            processSTE = q
                        else:
                            processAvg = np.hstack([processAvg, p])
                            processSTE = np.hstack([processSTE, q])
                        count += 1

                    # make the texts for signature plotting
                    signature_stabilities = sub.signature_plotting_text(
                        list_of_signature_stabilities, "Stability", "float")
                    signature_total_mutations = sub.signature_plotting_text(
                        list_of_signature_total_mutations, "Total Mutations",
                        "integer")
                    signature_stats = pd.DataFrame({
                        "Stability": signature_stabilities,
                        "Total Mutations": signature_total_mutations
                    })

                    # make de novo solution
                    listOfSignatures = sub.make_letter_ids(
                        idlenth=processAvg.shape[1])
                    exposureAvg = sub.make_final_solution(
                        processAvg,
                        allgenomes,
                        listOfSignatures,
                        layer_directory1,
                        m,
                        index,
                        allcolnames,
                        process_std_error=processSTE,
                        signature_stabilities=signature_stabilities,
                        signature_total_mutations=signature_total_mutations,
                        signature_stats=signature_stats,
                        penalty=penalty)

                    try:
                        # create the folder for the final solution/ Decomposed Solution
                        layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
                        try:
                            os.makedirs(layer_directory2, exist_ok=True)
                        except OSError:
                            print("The {} folder could not be created".format(
                                "output"))

                        final_signatures = sub.signature_decomposition(
                            processAvg,
                            m,
                            layer_directory2,
                            genome_build=genome_build)

                        # extract the global signatures and new signatures from the final_signatures dictionary
                        globalsigs = final_signatures["globalsigs"]
                        globalsigs = np.array(globalsigs)
                        newsigs = final_signatures["newsigs"]
                        processAvg = np.hstack([globalsigs, newsigs])
                        allsigids = final_signatures[
                            "globalsigids"] + final_signatures["newsigids"]
                        attribution = final_signatures["dictionary"]
                        background_sigs = final_signatures["background_sigs"]
                        exposureAvg = sub.make_final_solution(
                            processAvg,
                            allgenomes,
                            allsigids,
                            layer_directory2,
                            m,
                            index,
                            allcolnames,
                            remove_sigs=True,
                            attribution=attribution,
                            denovo_exposureAvg=exposureAvg,
                            background_sigs=background_sigs,
                            penalty=penalty,
                            genome_build=genome_build)
                    except Exception:
                        # Best effort: decomposition is skipped when it fails
                        print(
                            "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
                        )
                        shutil.rmtree(layer_directory2, ignore_errors=True)

            #######################################################################################################
            elif hierarchi is False:
                # record the samples
                layer_genome.to_csv(output + "/Samples.txt",
                                    sep="\t",
                                    index_label=[layer_genome.columns.name])
                del layer_genome

                ################################### Decompose the new signatures into global signatures #########################
                best = information[solution - startProcess]
                processAvg = best[0]
                processSTE = best[2]
                signature_stabilities = best[4]
                signature_total_mutations = best[5]
                signature_stats = best[6]

                # create the folder for the final solution/ De Novo Solution
                layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
                try:
                    os.makedirs(layer_directory1, exist_ok=True)
                except OSError:
                    print(
                        "The {} folder could not be created".format("output"))

                # make the texts for signature plotting
                signature_stabilities = sub.signature_plotting_text(
                    signature_stabilities, "Stability", "float")
                signature_total_mutations = sub.signature_plotting_text(
                    signature_total_mutations, "Total Mutations", "integer")

                # make de novo solution
                listOfSignatures = sub.make_letter_ids(
                    idlenth=processAvg.shape[1])
                exposureAvg = sub.make_final_solution(
                    processAvg,
                    allgenomes,
                    listOfSignatures,
                    layer_directory1,
                    m,
                    index,
                    allcolnames,
                    process_std_error=processSTE,
                    signature_stabilities=signature_stabilities,
                    signature_total_mutations=signature_total_mutations,
                    signature_stats=signature_stats,
                    penalty=penalty)

                try:
                    # create the folder for the final solution/ Decomposed Solution
                    layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
                    try:
                        os.makedirs(layer_directory2, exist_ok=True)
                    except OSError:
                        print("The {} folder could not be created".format(
                            "output"))

                    if processAvg.shape[0] == 1536:
                        # collapse the 1536 context into 96, only for the decomposition
                        processAvg = pd.DataFrame(processAvg, index=index)
                        processAvg = processAvg.groupby(
                            processAvg.index.str[1:8]).sum()
                        genomes = pd.DataFrame(genomes, index=index)
                        genomes = genomes.groupby(genomes.index.str[1:8]).sum()
                        index = genomes.index
                        processAvg = np.array(processAvg)
                        genomes = np.array(genomes)

                    final_signatures = sub.signature_decomposition(
                        processAvg,
                        m,
                        layer_directory2,
                        genome_build=genome_build)

                    # extract the global signatures and new signatures from the final_signatures dictionary
                    globalsigs = final_signatures["globalsigs"]
                    globalsigs = np.array(globalsigs)
                    newsigs = final_signatures["newsigs"]
                    processAvg = np.hstack([globalsigs, newsigs])
                    allsigids = final_signatures[
                        "globalsigids"] + final_signatures["newsigids"]
                    attribution = final_signatures["dictionary"]
                    background_sigs = final_signatures["background_sigs"]
                    exposureAvg = sub.make_final_solution(
                        processAvg,
                        genomes,
                        allsigids,
                        layer_directory2,
                        m,
                        index,
                        colnames,
                        remove_sigs=True,
                        attribution=attribution,
                        denovo_exposureAvg=exposureAvg,
                        background_sigs=background_sigs,
                        penalty=penalty,
                        genome_build=genome_build)
                except Exception:
                    # Best effort: decomposition is skipped when it fails
                    print(
                        "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
                    )
                    shutil.rmtree(layer_directory2, ignore_errors=True)

                # a non-hierarchical run needs a single pass only
                break

    toc = datetime.datetime.now()
    with open(metadata_path, "a") as sysdata:
        sysdata.write("\nDate and Clock time when the execution ended: " +
                      str(toc) + "\n")
        sysdata.write("-------Job Status------- \n")
        sysdata.write(
            "Analysis of mutational signatures completed successfully! Total execution time: "
            + str(toc - tic) + ". Results can be found in: [" + out_put +
            "] folder")

    print(
        "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n "
    )
# Turn the boolean switches into the "TRUE"/"FALSE" strings handed to the
# downstream scripts; values that are neither True nor False pass through.
# NOTE(review): the mapping is inverted (False -> "TRUE", True -> "FALSE");
# presumably deliberate, confirm against what the scripts expect.
if runsigflow == False:
    runsigflow = "TRUE"
elif runsigflow == True:
    runsigflow = "FALSE"

if runsigfit == False:
    runsigfit = "TRUE"
elif runsigfit == True:
    runsigfit = "FALSE"

if runDeconstructSigs == False:
    runDeconstructSigs = "TRUE"
elif runDeconstructSigs == True:
    runDeconstructSigs = "FALSE"

# Build the mutation matrices for the input directory.
matGen.SigProfilerMatrixGeneratorFunc("MetaMutationalSigs", genome_ref,
                                      input_dir)

# Both scripts receive the same four tool switches, in this order.
tool_switches = [
    runMutationalPatterns, runsigflow, runsigfit, runDeconstructSigs
]
subprocess.call(['Rscript', "meta_sig_main_flask.r", input_dir, genome_ref] +
                tool_switches)
subprocess.call(['python3.8', "errors_pie_heatmap.py", input_dir] +
                tool_switches)

# Remove these three folders from the input directory once both scripts ran.
for leftover in ("input", "logs", "output"):
    shutil.rmtree(input_dir + "/" + leftover)

files_in_directory = os.listdir(input_dir)
def _ensure_output_folder(folder):
    """Create ``folder`` (with parents) if missing; report but never raise."""
    try:
        os.makedirs(folder, exist_ok=True)
    except OSError:
        print("The {} folder could not be created".format("output"))


def _default_mutation_contexts(data):
    """Pick the default contexts available among the keys of ``data``."""
    if {"96", "DINUC", "ID"}.issubset(data):
        return ["SBS96", "DBS78", "ID83"]
    if {"96", "DINUC"}.issubset(data):
        return ["SBS96", "DBS78"]
    if {"ID"}.issubset(data):
        return ["ID83"]
    # Previously this case left `mtypes` unbound and crashed later with an
    # UnboundLocalError; fail early with an explicit message instead.
    raise ValueError(
        "None of the default mutation contexts (96, DINUC, ID) were generated "
        "from the input VCF files; please pass mtype explicitly")


def sigProfilerExtractor(input_type,
                         out_put,
                         input_data,
                         refgen="GRCh37",
                         genome_build='GRCh37',
                         startProcess=1,
                         endProcess=10,
                         totalIterations=100,
                         init="alexandrov-lab-custom",
                         cpu=-1,
                         mtype="default",
                         exome=False,
                         penalty=0.05,
                         resample=True,
                         wall=False,
                         gpu=False):
    """
    Extracts mutational signatures from an array of samples.

    Parameters
    ----------
    input_type: A string. Type of input. One of:
        - "vcf": ``input_data`` is a folder of VCF files; the matrices are
          generated with SigProfilerMatrixGenerator.
        - "matrix" (also "text" or "table"): ``input_data`` is the path of a
          tab separated file, or an already loaded DataFrame.
        - "csv": ``input_data`` is read with ``sub.read_csv``.
        - "matobj": ``input_data`` is a MATLAB file read with
          ``scipy.io.loadmat``.
        Any other value raises ``ValueError``.

    out_put: A string. The output folder. It is created if it does not exist
        and receives JOB_METADATA.txt plus one sub-folder per mutation type.

    input_data: A string (path of the input folder or file), or a DataFrame
        for the matrix type input.

    refgen: A string, optional. The name of the reference genome. The default
        is "GRCh37". Only used when input_type is "vcf".

    genome_build: A string, optional. Passed to the signature decomposition.
        The default is "GRCh37". For "vcf" input it is replaced by ``refgen``.

    startProcess: A positive integer, optional. The minimum number of
        signatures to be extracted. The default value is 1. It is capped at
        the number of samples.

    endProcess: A positive integer, optional. The maximum number of
        signatures to be extracted. The default value is 10. It is capped at
        the number of samples.

    totalIterations: A positive integer, optional. The number of iterations
        performed to extract each number of signatures. The default is 100.

    init: A string. The initialization algorithm for the W and H matrices of
        NMF.

    cpu: An integer, optional. The number of processors to be used to extract
        the signatures. The default value is -1.

    mtype: A string, optional. Only used when input_type is "vcf". Either
        "default" (or ["default"]), which selects SBS96, DBS78 and ID83
        depending on what the matrix generator produced, or a comma separated
        string of contexts such as "SBS96,DBS78".

    exome: Boolean, optional. Forwarded to the matrix generator for "vcf"
        input. The default value is False.

    penalty: Float, optional. Default is 0.05. Forwarded to
        ``sub.make_final_solution``.

    resample: Boolean, optional. Default is True. Forwarded to
        ``sub.decipher_signatures``.

    wall: A Boolean. Forwarded to ``sub.export_information``.

    gpu: Boolean, optional. Default is False. If true, ``torch`` is imported
        and a ``RuntimeError`` is raised when no CUDA device is available.

    Returns
    -------
    None. All results are written below ``out_put``.
    To learn about the output, please visit https://osf.io/t6j7u/wiki/home/

    Raises
    ------
    RuntimeError: if ``gpu`` is requested but no CUDA device is found.
    ValueError: if ``input_type`` is not recognised, or if ``mtype`` is the
        default and no default context was generated from the VCF files.
    Exception: if a requested mutation context is missing from the matrices
        generated from the VCF files.

    Examples
    --------
    >>> from SigProfilerExtractor import sigpro as sig
    >>> data = sig.importdata("vcf")
    >>> sig.sigProfilerExtractor("vcf", "example_output", data, startProcess=1, endProcess=3)

    Wait until the execution is finished. The process may take a couple of
    hours based on the size of the data. Check the results in the
    "example_output" folder.
    """
    memory_usage()

    if gpu:
        # Imported lazily so that torch is only required for GPU runs.
        import torch
        if torch.cuda.device_count() == 0:
            raise RuntimeError("GPU not available!")

    metadata_path = out_put + "/JOB_METADATA.txt"

    #################################### At first create the system data file ####################################
    os.makedirs(out_put, exist_ok=True)

    # `project` is forwarded to SigProfilerMatrixGenerator as the project
    # path; `project_name` is the last path component.
    project = input_data
    with open(metadata_path, "w") as sysdata:
        uname = platform.uname()
        sysdata.write(
            "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n")
        sysdata.write("-------System Info-------\n")
        sysdata.write("Operating System Name: " + uname[0] + "\n" +
                      "Nodename: " + uname[1] + "\n" + "Release: " + uname[2] +
                      "\n" + "Version: " + uname[3] + "\n")
        sysdata.write("\n-------Python and Package Versions------- \n")
        sysdata.write("Python Version: " +
                      str(platform.sys.version_info.major) + "." +
                      str(platform.sys.version_info.minor) + "." +
                      str(platform.sys.version_info.micro) + "\n")
        sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n")
        sysdata.write("SigprofilerPlotting Version: " +
                      sigProfilerPlotting.__version__ + "\n")
        sysdata.write("SigprofilerMatrixGenerator Version: " +
                      SigProfilerMatrixGenerator.__version__ + "\n")
        sysdata.write("Pandas version: " + pd.__version__ + "\n")
        sysdata.write("Numpy version: " + np.__version__ + "\n")
        sysdata.write("Scipy version: " + scipy.__version__ + "\n")
        sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n")
        sysdata.write(
            "\n-------Vital Parameters Used for the execution -------\n")

        # Non-string input (e.g. a DataFrame) cannot be split like a path.
        try:
            if project[-1] != "/":
                project_name = project.split("/")[-1]
            else:
                project_name = project.split("/")[-2]
        except Exception:
            project_name = "Input from DataFrame"

        sysdata.write(
            "input_type: {}\ninputdata: {}\nstartProcess: {}\nendProcess: {}\ntotalIterations: {}\ncpu: {}\nrefgen: {}\ngenome_build: {}\nmtype: {} \ninit: {}\n"
            .format(input_type, project_name, startProcess, endProcess,
                    totalIterations, cpu, refgen, genome_build, mtype, init))
        sysdata.write("\n-------Date and Time Data------- \n")
        tic = datetime.datetime.now()
        sysdata.write("Date and Clock time when the execution started: " +
                      str(tic) + "\n")

    hierarchi = False  # No use

    if input_type == "text" or input_type == "table" or input_type == "matrix":
        ################################### For text input files ###################################
        title = ""  # set the title for plotting
        if isinstance(project, str):
            data = pd.read_csv(project, sep="\t").iloc[:, :]
        else:
            data = project
        data = data.dropna(axis=1, inplace=False)
        # drop the columns that contain only zeros
        data = data.loc[:, (data != 0).any(axis=0)]
        genomes = np.array(data.iloc[:, 1:])
        allgenomes = genomes.copy()  # save the allgenomes for the final results

        # index and column names of processAvg and exposureAvg
        index = data.iloc[:, 0]
        colnames = data.columns[1:]
        allcolnames = colnames.copy()  # save the allcolnames for the final results

        # list of mutational types, derived from the number of rows
        row_count = str(genomes.shape[0])
        if row_count == "78":
            mtypes = ["DBS78"]
        elif row_count == "83":
            mtypes = ["ID83"]
        else:
            mtypes = ["SBS" + row_count]

    elif input_type == "csv":
        ################################# For csv input files #####################################
        title = ""  # set the title for plotting
        genomes, index, colnames, mtypes = sub.read_csv(project)
        allgenomes = genomes.copy()
        allcolnames = colnames.copy()

        # Define the mtypes from the number of rows
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    elif input_type == "matobj":
        ################################# For matlab input files ##################################
        title = ""  # set the title for plotting
        mat = sub.extract_input(scipy.io.loadmat(project))
        genomes = mat[1]
        allgenomes = genomes.copy()  # save the allgenomes for the final results

        # Build the row labels in the form X[Y]Z from mat[3] and mat[4].
        index = [
            context[0] + "[" + change + "]" + context[2]
            for context, change in zip(mat[3], mat[4])
        ]
        colnames = np.array(pd.Series(mat[2]))
        allcolnames = colnames.copy()  # save the allcolnames for the final results
        index = np.array(pd.Series(index))

        # list of mutational types, derived from the number of rows
        mtypes = [str(genomes.shape[0])]
        if mtypes[0] == "78":
            mtypes = ["DINUC"]
        elif mtypes[0] == "83":
            mtypes = ["ID"]

    elif input_type == "vcf":
        ################################# For vcf input files #####################################
        title = project  # set the title for plotting
        data = datadump.SigProfilerMatrixGeneratorFunc(project_name,
                                                       refgen,
                                                       project,
                                                       exome=exome,
                                                       bed_file=None,
                                                       chrom_based=False,
                                                       plot=False,
                                                       gs=False)

        # Selecting the mutation types
        if mtype == ["default"] or mtype == "default":
            mtypes = _default_mutation_contexts(data)
        else:
            mtypes = mtype.upper().replace(" ", "").split(",")

        # for vcf input the genome build follows the reference genome
        genome_build = refgen

    else:
        raise ValueError(
            "Please provide a correct input_type. Check help for more details")

    for m in mtypes:
        mutation_context = m

        # we need to rename the m because users input could be SBS96, SBS1536, DBS78, ID83 etc
        if m.startswith("SBS"):
            m = m[3:]  # removing "SBS"
        elif m.startswith("DBS"):
            m = "DINUC"
        elif m.startswith("ID"):
            m = "ID"

        # Determine the types of mutation which will be needed for exporting and copying the files
        if not (m == "DINUC" or m.startswith("DBS") or m.startswith("ID")):
            if m.startswith("SBS"):
                mutation_type = m
            else:
                mutation_type = "SBS" + m
        else:
            if m == "DINUC" or m.startswith("DBS"):
                mutation_type = "DBS78"
            elif m == "ID" or m.startswith("ID"):  # was misspelled `stratswith`
                mutation_type = "ID83"

        if input_type == "vcf":
            try:
                genomes = pd.DataFrame(data[m])
            except Exception as err:
                raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\
                                "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters") from err

            # skip the context when the generated matrix is empty
            if genomes.shape == (0, 0):
                with open(metadata_path, "a") as sysdata:
                    sysdata.write(
                        "Sample is not a nonzero matrix for the mutation context "
                        + m + "\n")
                print(
                    "Sample is not a nozero matrix for the mutation context " +
                    m)
                continue

            # drop the samples that contain only zeros
            genomes = genomes.loc[:, (genomes != 0).any(axis=0)]
            allgenomes = genomes.copy()  # save the allgenomes for the final results
            index = genomes.index.values
            colnames = genomes.columns
            allcolnames = colnames.copy()  # save the allcolnames for the final results

        # the start and end processes cannot be bigger than the number of samples
        startProcess = min(startProcess, genomes.shape[1])
        endProcess = min(endProcess, genomes.shape[1])

        # in the plotting function "ID" is used as "INDEL"
        if m == "ID":
            m = "INDEL"

        # create output directories to store all the results
        output = out_put + "/" + mutation_type
        H_iteration = 1
        genomes = np.array(genomes)
        information = []
        layer_directory = output
        _ensure_output_folder(layer_directory)

        with open(layer_directory + "/All_solutions_stat.csv", "w") as fh:
            fh.write(
                "Total Signatures,Stability,Matrix Frobenius%,avgStability\n")

        # stores the similarity dataframes, one per number of signatures
        all_similirities_list = []
        # NOTE: despite the name, this holds the *average* stability.
        minimum_stabilities = []

        # the same resampling seeds are used for every number of signatures
        seeds = np.random.randint(0, 10000000, size=totalIterations)

        # get the cutoff for normalization to handle the hypermutators
        normalization_cutoff = sub.get_normalization_cutoff(genomes)

        for signature_count in range(startProcess, endProcess + 1):
            current_time_start = datetime.datetime.now()

            processAvg, \
            exposureAvg, \
            processStd, \
            exposureStd, \
            avgSilhouetteCoefficients, \
            clusterSilhouetteCoefficients, \
            finalgenomeErrors, \
            finalgenomesReconstructed, \
            finalWall, \
            finalHall, \
            converge_information, \
            reconstruction_error, \
            processes = sub.decipher_signatures(genomes=genomes,
                                                i=signature_count,
                                                totalIterations=totalIterations,
                                                cpu=cpu,
                                                mut_context=m,
                                                resample=resample,
                                                seeds=seeds,
                                                init=init,
                                                normalization_cutoff=normalization_cutoff,
                                                gpu=gpu)

            ################################ add sparsity in the exposureAvg ################################
            if avgSilhouetteCoefficients > -1.0:
                stic = time.time()

                # refit the signatures of every sample in a worker pool
                pool = mp.Pool()
                try:
                    results = [
                        pool.apply_async(ss.fit_signatures_pool,
                                         args=(
                                             genomes,
                                             processAvg,
                                             x,
                                         )) for x in range(genomes.shape[1])
                    ]
                    pooloutput = [p.get() for p in results]
                finally:
                    pool.close()

                for sample_idx, fitted in enumerate(pooloutput):
                    exposureAvg[:, sample_idx] = fitted[0]

                stoc = time.time()
                print("Optimization time is {} seconds".format(stoc - stic))

            # report progress to the system file
            current_time_end = datetime.datetime.now()
            with open(metadata_path, "a") as sysdata:
                if hierarchi is True:
                    sysdata.write(
                        "\nSignature extraction for {} completed for layer {} {} signatures for {}! TimeStamp: {}\n"
                        .format(mutation_type, H_iteration, processes,
                                current_time_end - current_time_start,
                                current_time_end))
                else:
                    sysdata.write(
                        "\nSignature extraction for {} completed for {} signatures for {}! TimeStamp: {}\n"
                        .format(mutation_type, processes,
                                current_time_end - current_time_start,
                                current_time_end))

            # order the signatures from high to low total mutations
            signature_total_mutations = np.sum(exposureAvg,
                                               axis=1).astype(int)
            sorted_idx = np.argsort(-signature_total_mutations)
            processAvg = np.take(processAvg, sorted_idx, axis=1)
            exposureAvg = np.take(exposureAvg, sorted_idx, axis=0)
            signature_total_mutations = np.sum(exposureAvg,
                                               axis=1).astype(int)
            signature_stats = pd.DataFrame({
                "Stability": clusterSilhouetteCoefficients,
                "Total Mutations": signature_total_mutations
            })
            minimum_stabilities.append(
                round(np.mean(clusterSilhouetteCoefficients), 2))

            # Compute the estimated genome from the processAvg and exposureAvg
            est_genomes = np.dot(processAvg, exposureAvg)

            # similarities between the original and the estimated genome
            all_similarities, cosine_similarities = sub.calculate_similarities(
                genomes, est_genomes, colnames)

            # store the results of the loop. Here, processStd and exposureStd
            # are standard errors, NOT standard deviations.
            loopResults = [
                genomes, processAvg, exposureAvg, processStd, exposureStd,
                avgSilhouetteCoefficients, clusterSilhouetteCoefficients,
                signature_total_mutations, all_similarities, signature_stats,
                reconstruction_error, finalgenomeErrors,
                finalgenomesReconstructed, converge_information, finalWall,
                finalHall, processes
            ]
            information.append([
                processAvg, exposureAvg, processStd, exposureStd,
                clusterSilhouetteCoefficients, signature_total_mutations,
                signature_stats, all_similarities
            ])

            ################################# Export the results #################################
            sub.export_information(loopResults,
                                   m,
                                   layer_directory,
                                   index,
                                   colnames,
                                   wall=wall)
            all_similirities_list.append(all_similarities)

        ################## Plot Stability vs Reconstruction Error and get the solution ##################
        solution, all_stats = sub.stabVsRError(
            layer_directory + "/All_solutions_stat.csv", layer_directory,
            title, all_similirities_list, mutation_type)
        all_stats.insert(0, 'Stability (Avg Silhouette)', minimum_stabilities)
        all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",")

        # record the samples and matrix participating in this layer
        layer_genome = pd.DataFrame(genomes)
        layer_genome = layer_genome.set_index(index)
        layer_genome.columns = colnames
        layer_genome = layer_genome.rename_axis("Mutation Types",
                                                axis="columns")
        layer_genome.to_csv(output + "/Samples.txt",
                            sep="\t",
                            index_label=[layer_genome.columns.name])
        del layer_genome

        ################## Decompose the new signatures into global signatures ##################
        chosen = information[solution - startProcess]
        processAvg = chosen[0]
        processSTE = chosen[2]
        signature_stabilities = chosen[4]
        signature_total_mutations = chosen[5]
        signature_stats = chosen[6]

        # create the folder for the final solution/ De Novo Solution
        layer_directory1 = output + "/Suggested_Solution/De_Novo_Solution"
        _ensure_output_folder(layer_directory1)

        # make the texts for signature plotting
        signature_stabilities = sub.signature_plotting_text(
            signature_stabilities, "Stability", "float")
        signature_total_mutations = sub.signature_plotting_text(
            signature_total_mutations, "Total Mutations", "integer")

        # make de novo solution(processAvg, allgenomes, layer_directory1)
        listOfSignatures = sub.make_letter_ids(idlenth=processAvg.shape[1],
                                               mtype=mutation_context)
        allgenomes = pd.DataFrame(allgenomes)
        exposureAvg = sub.make_final_solution(
            processAvg,
            allgenomes,
            listOfSignatures,
            layer_directory1,
            m,
            index,
            allcolnames,
            process_std_error=processSTE,
            signature_stabilities=signature_stabilities,
            signature_total_mutations=signature_total_mutations,
            signature_stats=signature_stats,
            penalty=penalty)

        # The decomposition is best effort: on any failure the folder is
        # removed and a warning is printed instead of aborting the job.
        try:
            # create the folder for the final solution/ Decomposed Solution
            layer_directory2 = output + "/Suggested_Solution/Decomposed_Solution"
            _ensure_output_folder(layer_directory2)

            if processAvg.shape[0] == 1536:
                # collapse the 1536 context into 96 only for the decomposition
                processAvg = pd.DataFrame(processAvg, index=index)
                processAvg = processAvg.groupby(
                    processAvg.index.str[1:8]).sum()
                genomes = pd.DataFrame(genomes, index=index)
                genomes = genomes.groupby(genomes.index.str[1:8]).sum()
                index = genomes.index
                processAvg = np.array(processAvg)
                genomes = np.array(genomes)

            final_signatures = sub.signature_decomposition(
                processAvg,
                m,
                layer_directory2,
                genome_build=genome_build,
                mutation_context=mutation_context)

            # extract the global signatures and new signatures from the final_signatures dictionary
            globalsigs = np.array(final_signatures["globalsigs"])
            newsigs = final_signatures["newsigs"]
            processAvg = np.hstack([globalsigs, newsigs])
            allsigids = final_signatures["globalsigids"] + final_signatures[
                "newsigids"]
            attribution = final_signatures["dictionary"]
            background_sigs = final_signatures["background_sigs"]
            genomes = pd.DataFrame(genomes)

            exposureAvg = sub.make_final_solution(
                processAvg,
                genomes,
                allsigids,
                layer_directory2,
                m,
                index,
                colnames,
                remove_sigs=True,
                attribution=attribution,
                denovo_exposureAvg=exposureAvg,
                background_sigs=background_sigs,
                penalty=penalty,
                genome_build=genome_build)
        except Exception:
            print(
                "\nWARNING!!! We apolozize we don't have a global signature database for the mutational context you provided. We have a database only for SBS96, DINUC and INDELS.\nTherefore no result for signature Decomposition is generated."
            )
            # the folder may not exist if its creation failed above
            shutil.rmtree(layer_directory2, ignore_errors=True)

    with open(metadata_path, "a") as sysdata:
        toc = datetime.datetime.now()
        sysdata.write("\nDate and Clock time when the execution ended: " +
                      str(toc) + "\n")
        sysdata.write("-------Job Status------- \n")
        sysdata.write(
            "Analysis of mutational signatures completed successfully! Total execution time: "
            + str(toc - tic) + ". Results can be found in: [" + out_put +
            "] folder")

    print(
        "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n "
    )
def sigProfilerExtractor(input_type, output, input_data, reference_genome="GRCh37", opportunity_genome="GRCh37", context_type="default", exome=False, minimum_signatures=1, maximum_signatures=25, nmf_replicates=100, resample=True, batch_size=1, cpu=-1, gpu=False, nmf_init="nndsvd_min", precision="single", matrix_normalization="gmm", seeds="random", min_nmf_iterations=10000, max_nmf_iterations=1000000, nmf_test_conv=10000, nmf_tolerance=1e-15, nnls_add_penalty=0.05, nnls_remove_penalty=0.01, de_novo_fit_penalty=0.02, initial_remove_penalty=0.05, refit_denovo_signatures=True, clustering_distance="cosine", export_probabilities=True, make_decomposition_plots=True, stability=0.8, min_stability=0.2, combined_stability=1.0, get_all_signature_matrices=False): memory_usage() """ Extracts mutational signatures from an array of samples. Parameters ---------- INPUT DATA:- input_type: A string. Type of input. The type of input should be one of the following: - "vcf": used for vcf format inputs. - "matrix": used for table format inputs using a tab seperated file. output: A string. The name of the output folder. The output folder will be generated in the current working directory. input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "table" type input). The project file or folder should be inside the current working directory. For the "vcf" type input,the project has to be a folder which will contain the vcf files in vcf format or text formats. The "text"type projects have to be a file. reference_genome: A string, optional. The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf". opportunity_genome: The build or version of the reference signatures for the reference genome. The default opportunity genome is GRCh37. If the input_type is "vcf", the genome_build automatically matches the input reference genome value. 
context_type: A list of strings, optional. The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is "SBS96,DBS78,ID83". exome: Boolean, optional. Defines if the exomes will be extracted. The default value is "False". NMF RUNS:- minimum_signature: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1 maximum_signatures: A positive integer, optional. The maximum number of signatures to be extracted. The default value is 10 nmf_replicates: A positive integer, optional. The number of iteration to be performed to extract each number signature. The default value is 100 resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling. seeds: Boolean. Default is "random". If random, then the seeds for resampling will be random for different analysis. If not random, then seeds will be obtained from a given path of a .txt file that contains a list of seed. NMF RUNS:- matrix_normalization: A string. Method of normalizing the genome matrix before it is analyzed by NMF. Default is "log2". Other options are "gmm", "100X" or "no_normalization". nmf_init: A String. The initialization algorithm for W and H matrix of NMF. Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min' Default is 'nndsvd_min'. precision: A string. Values should be single or double. Default is single. min_nmf_iterations: An integer. Value defines the minimum number of iterations to be completed before NMF converges. Default is 2000. max_nmf_iterations: An integer. Value defines the maximum number of iterations to be completed before NMF converges. Default is 200000 nmf_test_conv: An integer. Value definer the number number of iterations to done between checking next convergence. nmf_tolerance: A float. Value defines the tolerance to achieve to converge. EXECUTION:- cpu: An integer, optional. The number of processors to be used to extract the signatures. 
The default value is -1 which will use all available processors. gpu:Boolean, optional. Defines if the GPU resource will used if available. Default is False. If True, the GPU resource will be used in the computation. batch_size: An integer. Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing. Default is 1. SOLUTION ESTIMATION THRESH-HOLDS:- stability: A float. Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. min_stability: A float. Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. combined_stability: A float. Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. DECOMPOSITION:- de_novo_fit_penalty: Float, optional. Takes any positive float. Default is 0.02. Defines the weak (remove) thresh-hold cutoff to be assigned denovo signatures to a sample. nnls_add_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the strong (add) thresh-hold cutoff to be assigned COSMIC signatures to a sample. nnls_remove_penalty: Float, optional. Takes any positive float. Default is 0.01. Defines the weak (remove) thresh-hold cutoff to be assigned COSMIC signatures to a sample. initial_remove_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the initial weak (remove) thresh-hold cutoff to be COSMIC assigned signatures to a sample. refit_denovo_signatures: Boolean, optional. Default is False. If True, then refit the denovo signatures with nnls. make_decomposition_plots: Boolean, optional. Defualt is True. If True, Denovo to Cosmic sigantures decompostion plots will be created as a part the results. 
OTHERS:- get_all_signature_matrices: A Boolean. If true, the Ws and Hs from all the NMF iterations are generated in the output. export_probabilities: A Boolean. Defualt is True. If False, then doesn't create the probability matrix. Returns ------- To learn about the output, please visit https://osf.io/t6j7u/wiki/home/ Examples -------- Examples -------- >>> from SigProfilerExtractor import sigpro as sig # to get input from vcf files >>> path_to_example_folder_containing_vcf_files = sig.importdata("vcf") >>> data = path_to_example_folder_containing_vcf_files # you can put the path to your folder containing the vcf samples >>> sig.sigProfilerExtractor("vcf", "example_output", data, minimum_signatures=1, maximum_signatures=3) Wait untill the excecution is finished. The process may a couple of hours based on the size of the data. Check the current working directory for the "example_output" folder. # to get input from table format (mutation catalog matrix) >>> path_to_example_table = sig.importdata("matrix") >>> data = path_to_example_table # you can put the path to your tab delimited file containing the mutational catalog matrix/table >>> sig.sigProfilerExtractor("matrix", "example_output", data, opportunity_genome="GRCh38", minimum_signatures=1, maximum_signatures=3) Wait untill the excecution is finished. The process may a couple of hours based on the size of the data. Check the results in the "example_output" folder. 
""" #record the start time start_time = datetime.datetime.now() #set the output variable out_put = output if gpu == True: import torch if gpu and (torch.cuda.device_count() == 0): raise RuntimeError("GPU not available!") #################################### At first create the system data file #################################### if not os.path.exists(out_put): os.makedirs(out_put) sysdata = open(out_put + "/JOB_METADATA.txt", "w") sysdata.write( "THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n") sysdata.write("-------System Info-------\n") sysdata.write("Operating System Name: " + platform.uname()[0] + "\n" + "Nodename: " + platform.uname()[1] + "\n" + "Release: " + platform.uname()[2] + "\n" + "Version: " + platform.uname()[3] + "\n") sysdata.write("\n-------Python and Package Versions------- \n") sysdata.write("Python Version: " + str(platform.sys.version_info.major) + "." + str(platform.sys.version_info.minor) + "." + str(platform.sys.version_info.micro) + "\n") sysdata.write("Sigproextractor Version: " + cosmic.__version__ + "\n") sysdata.write("SigprofilerPlotting Version: " + sigProfilerPlotting.__version__ + "\n") sysdata.write("SigprofilerMatrixGenerator Version: " + SigProfilerMatrixGenerator.__version__ + "\n") sysdata.write("Pandas version: " + pd.__version__ + "\n") sysdata.write("Numpy version: " + np.__version__ + "\n") sysdata.write("Scipy version: " + scipy.__version__ + "\n") sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n") #sysdata.write("Nimfa version: "+nimfa.__version__+"\n") #format the project_name first: project = input_data #will use this variable as the parameter for project argument in SigprofilerMatrixGenerator try: if project[-1] != "/": project_name = project.split( "/" )[-1] #will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator else: project_name = project.split("/")[-2] except: project_name = "Input from DataFrame" excecution_parameters = { "input_type": 
input_type, "output": output, "input_data": input_data, "reference_genome": reference_genome, "opportunity_genome": opportunity_genome, "context_type": context_type, "exome": exome, "minimum_signatures": minimum_signatures, "maximum_signatures": maximum_signatures, "NMF_replicates": nmf_replicates, "cpu": cpu, "gpu": gpu, "batch_size": batch_size, "NMF_init": nmf_init, "precision": precision, "matrix_normalization": matrix_normalization, "resample": resample, "seeds": seeds, "min_NMF_iterations": min_nmf_iterations, "max_NMF_iterations": max_nmf_iterations, "NMF_test_conv": nmf_test_conv, "NMF_tolerance": nmf_tolerance, "nnls_add_penalty": nnls_add_penalty, "nnls_remove_penalty": nnls_remove_penalty, "initial_remove_penalty": initial_remove_penalty, "de_novo_fit_penalty": de_novo_fit_penalty, "refit_denovo_signatures": refit_denovo_signatures, "dist": clustering_distance, "export_probabilities": export_probabilities, "make_decompostion_plots": make_decomposition_plots, "stability": stability, "min_stability": min_stability, "combined_stability": combined_stability, "get_all_signature_matrices": get_all_signature_matrices } ################################ take the inputs from the mandatory arguments #################################### input_type = input_type #project = input_data #the variable was already set above ################################ take the inputs from the general optional arguments #################################### startProcess = minimum_signatures endProcess = maximum_signatures #totalIterations=nmf_replicates cpu = cpu hierarchy = False #No use mtype = context_type #init=nmf_init wall = get_all_signature_matrices add_penalty = nnls_add_penalty remove_penalty = nnls_remove_penalty genome_build = opportunity_genome refgen = reference_genome refit_denovo_signatures #set the squence type ("genome" or "exome") for the tmb plot inside the make_final_solution function if exome == False: sequence = "genome" if exome == True: sequence = "exome" 
#setting seeds if seeds == "random": excecution_parameters["seeds"] = seeds replicates = list(range(1, nmf_replicates + 1)) seed = np.random.randint(0, 10000000, size=nmf_replicates) seeds = pd.DataFrame(list(zip(replicates, seed)), columns=["Replicates", "Seeds"]) seeds = seeds.set_index("Replicates") seeds.to_csv(out_put + "/Seeds.txt", sep="\t") else: try: excecution_parameters["seeds"] = seeds seeds = pd.read_csv(seeds, sep="\t", index_col=0) seeds.to_csv(out_put + "/Seeds.txt", sep="\t") seed = np.array(seeds["Seeds"]) except: "Please set valid seeds" if input_type == "text" or input_type == "table" or input_type == "matrix": ################################### For text input files ###################################################### text_file = project title = "" # set the title for plotting if type(text_file) != str: data = text_file excecution_parameters["input_data"] = "Matrix[" + str( data.shape[0]) + " rows X " + str(data.shape[1]) + " columns]" else: data = pd.read_csv(text_file, sep="\t").iloc[:, :] data = data.dropna(axis=1, inplace=False) data = data.loc[:, (data != 0).any(axis=0)] genomes = data.iloc[:, 1:] genomes = np.array(genomes) allgenomes = genomes.copy( ) # save the allgenomes for the final results #Contruct the indeces of the matrix #setting index and columns names of processAvg and exposureAvg index = data.iloc[:, 0] colnames = data.columns[1:] allcolnames = colnames.copy( ) # save the allcolnames for the final results #creating list of mutational type to sync with the vcf type input mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": mtypes = ["DBS78"] elif mtypes[0] == "83": mtypes = ["ID83"] else: mtypes = ["SBS" + mtypes[0]] ############################################################################################################### ########################################################################################################################################################################################### elif 
input_type == "csv": ################################# For matlab input files ####################################################### filename = project title = "" # set the title for plotting genomes, index, colnames, mtypes = sub.read_csv(filename) allgenomes = genomes.copy() allcolnames = colnames.copy() # Define the mtypes mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": mtypes = ["DINUC"] elif mtypes[0] == "83": mtypes = ["ID"] ################################################################################################################# ########################################################################################################################################################################################### elif input_type == "matobj": ################################# For matlab input files ####################################################### mat_file = project title = "" # set the title for plotting mat = scipy.io.loadmat(mat_file) mat = sub.extract_input(mat) genomes = mat[1] allgenomes = genomes.copy( ) # save the allgenomes for the final results #Contruct the indeces of the matrix #setting index and columns names of processAvg and exposureAvg index1 = mat[3] index2 = mat[4] index = [] for i, j in zip(index1, index2): index.append(i[0] + "[" + j + "]" + i[2]) colnames = np.array(pd.Series(mat[2])) allcolnames = colnames.copy( ) # save the allcolnames for the final results index = np.array(pd.Series(index)) #creating list of mutational type to sync with the vcf type input mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": mtypes = ["DINUC"] elif mtypes[0] == "83": mtypes = ["ID"] ################################################################################################################# elif input_type == "vcf": ################################# For vcf input files ####################################################### project = project title = project # set the title for plotting refgen = refgen exome = exome 
#project_name = project.split("/")[-1] data = datadump.SigProfilerMatrixGeneratorFunc(project_name, refgen, project, exome=exome, bed_file=None, chrom_based=False, plot=False, gs=False) # Selecting the mutation types if mtype == ["default"]: if set(["96", "DINUC", "ID"]).issubset(data): mtypes = ["SBS96", "DBS78", "ID83"] elif set(["96", "DINUC"]).issubset(data): mtypes = ["SBS96", "DBS78"] elif set(["ID"]).issubset(data): mtypes = ["ID83"] elif mtype == "default": if set(["96", "DINUC", "ID"]).issubset(data): mtypes = ["SBS96", "DBS78", "ID83"] elif set(["96", "DINUC"]).issubset(data): mtypes = ["SBS96", "DBS78"] elif set(["ID"]).issubset(data): mtypes = ["ID83"] else: #mkeys = data.keys() mtype = mtype.upper() mtype = mtype.replace(" ", "") mtypes = mtype.split(",") # ============================================================================= # if any(x not in mkeys for x in mtypes): # raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\ # "what mutation contexts should be generated by your VCF files. Also please use the uppercase characters") # ============================================================================= #change working directory #set the genome_build genome_build = refgen else: raise ValueError( "Please provide a correct input_type. 
Check help for more details") #recording context types excecution_parameters["context_type"] = ",".join(mtypes) record_parameters(sysdata, excecution_parameters, start_time) sysdata.close() ########################################################################################################################################################################################### for m in mtypes: mutation_context = m # we need to rename the m because users input could be SBS96, SBS1536, DBS78, ID83 etc if m.startswith("SBS"): m = m[3:] #removing "SBS" elif m.startswith("DBS"): m = "DINUC" elif m.startswith("ID"): m = "ID" # Determine the types of mutation which will be needed for exporting and copying the files if not (m == "DINUC" or m.startswith("DBS") or m.startswith("ID")): if m.startswith("SBS"): mutation_type = m else: mutation_type = "SBS" + m else: if m == "DINUC" or m.startswith("DBS"): mutation_type = "DBS78" elif m == "ID" or m.stratswith("ID"): mutation_type = "ID83" if input_type == "vcf": try: genomes = pd.DataFrame(data[m]) except: raise Exception("Please pass valid mutation types seperated by comma with no space. Carefully check (using SigProfilerMatrixGenerator)"\ "what mutation contexts should be generated by your VCF files. 
Also please use the uppercase characters") #check if the genome is a nonzero matrix shape = genomes.shape if shape == (0, 0): sysdata = open(out_put + "/JOB_METADATA.txt", "a") sysdata.write( "Sample is not a nonzero matrix for the mutation context " + m + "\n") print( "Sample is not a nozero matrix for the mutation context " + m) sysdata.close() continue genomes = genomes.loc[:, (genomes != 0).any(axis=0)] allgenomes = genomes.copy( ) # save the allgenomes for the final results index = genomes.index.values colnames = genomes.columns allcolnames = colnames.copy( ) # save the allcolnames for the final results #check if start and end processes are bigger than the number of samples startProcess = min(startProcess, genomes.shape[1]) endProcess = min(endProcess, genomes.shape[1]) #in the plotting funciton "ID" is used as "INDEL" if m == "ID": m = "INDEL" #for plotting #create output directories to store all the results output = out_put + "/" + mutation_type est_genomes = np.zeros([1, 1]) H_iteration = 1 genomes = np.array(genomes) information = [] layer_directory = output try: if not os.path.exists(layer_directory): os.makedirs(layer_directory) #os.makedirs(output+"/pickle_objects") #os.makedirs(output+"/All solutions") except: print("The {} folder could not be created".format("output")) fh = open(layer_directory + "/All_solutions_stat.csv", "w") fh.write("Total Signatures,Stability,Matrix Frobenius%,avgStability\n") fh.close() # The following for loop operates to extract data from each number of signature all_similirities_list = [ ] #this list is going to store the dataframes of different similirieties as items minimum_stabilities = [] #similarity_dataframe = pd.DataFrame({"Sample Name": list(colnames)}) # get the cutoff for normatization to handle the hypermutators normalization_cutoff = sub.get_normalization_cutoff(genomes, manual_cutoff=100 * genomes.shape[0]) #print("Normalization Cutoff is :", normalization_cutoff) excecution_parameters["normalization_cutoff"] = 
normalization_cutoff #pass the seed values to inner funtions: excecution_parameters["seeds"] = seed if genomes.shape[1] < endProcess: endProcess = genomes.shape[1] #report the notmatlization criteria sysdata = open(out_put + "/JOB_METADATA.txt", "a") context_start_time = datetime.datetime.now() sysdata.write("\n##################################\n") sysdata.write( "\n[{}] Analysis started for {}. Matrix size [{} rows x {} columns]\n" .format( str(context_start_time).split(".")[0], mutation_type, genomes.shape[0], genomes.shape[1])) if excecution_parameters["matrix_normalization"] == "gmm": sysdata.write("\n[{}] Normalization GMM with cutoff value set at {}\n". \ format(str(datetime.datetime.now()).split(".")[0], normalization_cutoff)) elif excecution_parameters["matrix_normalization"] == "100X": sysdata.write("\n[{}] Normalization 100X with cutoff value set at {}\n". \ format(str(datetime.datetime.now()).split(".")[0],(genomes.shape[0]*100))) elif excecution_parameters["matrix_normalization"] == "log2": sysdata.write("\n[{}] Normalization Log2\n". \ format(str(datetime.datetime.now()).split(".")[0])) elif excecution_parameters["matrix_normalization"] == "none": sysdata.write("\n[{}] Analysis is proceeding without normalization\n". \ format(str(datetime.datetime.now()).split(".")[0])) else: sysdata.write("\n[{}] Normalization Custom with cutoff value set at {}\n". 
\ format(str(datetime.datetime.now()).split(".")[0],excecution_parameters["matrix_normalization"])) sysdata.close() for i in range(startProcess, endProcess + 1): current_time_start = datetime.datetime.now() #memory_usage() processAvg, \ exposureAvg, \ processStd, \ exposureStd, \ avgSilhouetteCoefficients, \ clusterSilhouetteCoefficients, \ finalgenomeErrors, \ finalgenomesReconstructed, \ finalWall, \ finalHall, \ converge_information, \ reconstruction_error, \ processes = sub.decipher_signatures(excecution_parameters, genomes= genomes, mut_context=m, i = i) #denormalize the genomes and exposures #genomes = sub.denormalize_samples(genomes, totalMutations, normalization_value=100000) #exposureStd = sub.denormalize_samples(exposureStd, totalMutations, normalization_value=100000) ####################################################################### add sparsity in the exposureAvg ################################################################# # remove signatures only if the process stability is above a thresh-hold of 0.85 if avgSilhouetteCoefficients > -1.0: stic = time.time() #removing signatures: # ============================================================================= # pool = mp.Pool() # results = [pool.apply_async(sub.remove_all_single_signatures_pool, args=(x,processAvg,exposureAvg,genomes,)) for x in range(genomes.shape[1])] # pooloutput = [p.get() for p in results] # # #print(results) # pool.close() # # for i in range(len(pooloutput)): # #print(results[i]) # exposureAvg[:,i]=pooloutput[i] # ============================================================================= #refitting signatures: #removing signatures: pool = mp.Pool() results = [ pool.apply_async(ss.fit_signatures_pool, args=( genomes, processAvg, x, )) for x in range(genomes.shape[1]) ] pooloutput = [p.get() for p in results] pool.close() for i in range(len(pooloutput)): exposureAvg[:, i] = pooloutput[i][0] stoc = time.time() print("Optimization time is {} seconds".format(stoc - stic)) 
#sysdata.write("\nAnalysis of context type {} is ended successfully\n".format(m)) #report progress to the system file: #Get total mutationation for each signature in reverse order and order the signatures from high to low mutation barden signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int) sorted_idx = np.argsort(-signature_total_mutations) processAvg = np.take(processAvg, sorted_idx, axis=1) exposureAvg = np.take(exposureAvg, sorted_idx, axis=0) signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int) processStd = np.take(processStd, sorted_idx, axis=1) exposureStd = np.take(exposureStd, sorted_idx, axis=0) clusterSilhouetteCoefficients = np.take( clusterSilhouetteCoefficients, sorted_idx, axis=0) signature_stats = pd.DataFrame({ "Stability": clusterSilhouetteCoefficients, "Total Mutations": signature_total_mutations }) minimum_stabilities.append( round(np.mean(clusterSilhouetteCoefficients), 2) ) #here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # Compute the estimated genome from the processAvg and exposureAvg est_genomes = np.dot(processAvg, exposureAvg) #check the similarities between the original and estimated genome for each number of signatures all_similarities, cosine_similarities = sub.calculate_similarities( genomes, est_genomes, colnames) #print(totalMutations) ########################################################################################################################################################################## # store the resutls of the loop. Here, processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS. 
loopResults = [ genomes, processAvg, exposureAvg, processStd, exposureStd, avgSilhouetteCoefficients, clusterSilhouetteCoefficients, signature_total_mutations, all_similarities, signature_stats, reconstruction_error, finalgenomeErrors, finalgenomesReconstructed, converge_information, finalWall, finalHall, processes ] information.append([ processAvg, exposureAvg, processStd, exposureStd, clusterSilhouetteCoefficients, signature_total_mutations, signature_stats, all_similarities ]) #Will be used during hierarchycal approach ################################# Export the results ########################################################### sub.export_information(loopResults, m, layer_directory, index, colnames, wall=wall, sequence=sequence) all_similirities_list.append(all_similarities) # #similarity_dataframe["Total Signatures "+str(processes)] = cosine_similarities current_time_end = datetime.datetime.now() sysdata = open(out_put + "/JOB_METADATA.txt", "a") sysdata.write("\n[{}] {} de novo extraction completed for a total of {} signatures! \nExecution time:{}\n". 
\ format(str(datetime.datetime.now()).split(".")[0],mutation_type,processes,str(current_time_end-current_time_start).split(".")[0], current_time_end)) sysdata.close() ################################################################################################################ ########################################## Plot Stabiltity vs Reconstruction Error ############################# ################################################################################################################ # Print the Stabiltity vs Reconstruction Error as get the solution as well solution, all_stats = sub.stabVsRError( layer_directory + "/All_solutions_stat.csv", layer_directory, title, all_similirities_list, mtype=mutation_type, stability=stability, min_stability=min_stability, combined_stability=combined_stability) all_stats.insert( 1, 'Stability (Avg Silhouette)', minimum_stabilities ) #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability all_stats = all_stats.set_index(["Signatures"]) all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",") # add more information to results_stat.csv #Set index for the the Similarity Dataframe #similarity_dataframe = similarity_dataframe.set_index("Sample Name") #Add the total mutations of each sample #sample_total_mutations = list(np.sum(genomes, axis =0)) #similarity_dataframe.insert(loc=0, column = "Total Mutations", value = sample_total_mutations) # write the name of Samples and Matrix participating in each Layer. 
layer_genome = pd.DataFrame(genomes) layer_genome = layer_genome.set_index(index) layer_genome.columns = colnames layer_genome = layer_genome.rename_axis("Mutation Types", axis="columns") # ============================================================================= # data_stat_folder = output+"/Data_Stats" # try: # if not os.path.exists(data_stat_folder): # os.makedirs(data_stat_folder) # except: # print ("The {} folder could not be created".format("Data_Stats")) # # layer_genome.to_csv(data_stat_folder+"/Samples.text", sep = "\t", index_label=[layer_genome.columns.name]) # similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs.text", sep = "\t") # del layer_genome # for i in range(startProcess,endProcess+1): # all_similirities_list[i-startProcess].to_csv(data_stat_folder+"/Similatiry_Data_Sig_"+str(i)+".text", sep="\t") # ============================================================================= # record the samples layer_genome.to_csv(output + "/Samples.txt", sep="\t", index_label=[layer_genome.columns.name]) #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t") del layer_genome ################################### Decompose the new signatures into global signatures ######################### processAvg = information[solution - startProcess][0] exposureAvg = information[solution - startProcess][1] processSTE = information[solution - startProcess][2] signature_stabilities = information[solution - startProcess][4] signature_total_mutations = information[solution - startProcess][5] signature_stats = information[solution - startProcess][6] all_similarities = information[solution - startProcess][7] # create the folder for the final solution/ De Novo Solution layer_directory1 = output + "/Suggested_Solution/" + mutation_type + "_De_Novo_Solution" try: if not os.path.exists(layer_directory1): os.makedirs(layer_directory1) except: print("The {} folder could not be created".format("output")) # 
make the texts for signature plotting signature_stabilities = sub.signature_plotting_text( signature_stabilities, "Stability", "float") signature_total_mutations = sub.signature_plotting_text( signature_total_mutations, "Total Mutations", "integer") # make de novo solution(processAvg, allgenomes, layer_directory1) listOfSignatures = sub.make_letter_ids(idlenth=processAvg.shape[1], mtype=mutation_context) allgenomes = pd.DataFrame(allgenomes) exposureAvg = sub.make_final_solution(processAvg, allgenomes, listOfSignatures, layer_directory1, m, index, \ allcolnames, process_std_error = processSTE, signature_stabilities = signature_stabilities, \ signature_total_mutations = signature_total_mutations,denovo_exposureAvg = exposureAvg, \ signature_stats = signature_stats, add_penalty=add_penalty, remove_penalty=remove_penalty, \ initial_remove_penalty=initial_remove_penalty, refit_denovo_signatures=refit_denovo_signatures, de_novo_fit_penalty=de_novo_fit_penalty, sequence=sequence) #try: # create the folder for the final solution/ Decomposed Solution layer_directory2 = output + "/Suggested_Solution/COSMIC_" + mutation_type + "_Decomposed_Solution" try: if not os.path.exists(layer_directory2): os.makedirs(layer_directory2) except: print("The {} folder could not be created".format("output")) originalProcessAvg = pd.DataFrame(processAvg, index=index) if processAvg.shape[ 0] == 1536: #collapse the 1596 context into 96 only for the deocmposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[1:8]).sum() genomes = pd.DataFrame(genomes, index=index) genomes = genomes.groupby(genomes.index.str[1:8]).sum() index = genomes.index processAvg = np.array(processAvg) genomes = np.array(genomes) if processAvg.shape[ 0] == 288: #collapse the 288 context into 96 only for the deocmposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[2:9]).sum() genomes = pd.DataFrame(genomes, 
index=index) genomes = genomes.groupby(genomes.index.str[2:9]).sum() index = genomes.index processAvg = np.array(processAvg) genomes = np.array(genomes) originalProcessAvg.columns = listOfSignatures final_signatures = sub.signature_decomposition( processAvg, m, layer_directory2, genome_build=genome_build, add_penalty=add_penalty, remove_penalty=remove_penalty, mutation_context=mutation_context, make_decomposition_plots=make_decomposition_plots, originalProcessAvg=originalProcessAvg) # extract the global signatures and new signatures from the final_signatures dictionary globalsigs = final_signatures["globalsigs"] globalsigs = np.array(globalsigs) newsigs = final_signatures["newsigs"] try: processAvg = np.hstack([globalsigs, newsigs]) allsigids = final_signatures["globalsigids"] + final_signatures[ "newsigids"] except: processAvg = newsigs allsigids = final_signatures["newsigids"] attribution = final_signatures["dictionary"] background_sigs = final_signatures["background_sigs"] genomes = pd.DataFrame(genomes) exposureAvg = sub.make_final_solution(processAvg, genomes, allsigids, layer_directory2, m, index, colnames, \ cosmic_sigs=True, attribution = attribution, denovo_exposureAvg = exposureAvg , background_sigs=background_sigs, add_penalty=add_penalty, remove_penalty=remove_penalty, initial_remove_penalty=initial_remove_penalty, genome_build=genome_build, sequence=sequence,export_probabilities=export_probabilities) sysdata = open(out_put + "/JOB_METADATA.txt", "a") end_time = datetime.datetime.now() sysdata.write("\n[{}] Analysis ended: \n".format( str(end_time).split(".")[0])) sysdata.write("\n-------Job Status------- \n") sysdata.write( "Analysis of mutational signatures completed successfully! \nTotal execution time: " + str(end_time - start_time).split(".")[0] + " \nResults can be found in: " + " " + out_put + " " + " folder") sysdata.close() print( "\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfiler Extractor.\n " )
# One-time setup, left disabled here: the GRCh37 reference can be installed with
#   from SigProfilerMatrixGenerator import install as genInstall
#   genInstall.install('GRCh37')
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# Hard-coded prefix of the upload folder; `user_file` is appended to it directly,
# with no path separator inserted in between.
# NOTE(review): `user_file` is not defined in this snippet -- presumably it is
# provided by the surrounding upload code and already starts with a separator;
# confirm against the caller.
_upload_prefix = "C:\\Users\\pande\\OneDriveDrexelUniversity\\Documents\\Fall-2021\\Coop\\CGC\\SanjeeVCFFiles\\PLOS_review_paper\\metaSignatures\\flaskmultiplefileupload\\uploads"
_input_location = _upload_prefix + user_file

# Run the matrix generator for the "MetaMutationalSigs" project against GRCh37,
# reading the input from the location built above. The return value is not used.
matGen.SigProfilerMatrixGeneratorFunc("MetaMutationalSigs", 'GRCh37',
                                      _input_location)

# Earlier invocation, kept for reference only:
# matrices = matGen.SigProfilerMatrixGeneratorFunc("Sigprofiler",'GRCh37' , "C:\\Users\\pande\\OneDrive - Drexel University\\Documents\\Fall-2021\\Coop\\CGC\\SanjeeVCFFiles\kidney_vcf\\indels\\plink")