def multiprocess_permutation(bed_dict, mut_df, opts, fs_cts_df=None, p_inactivating=None):
    """Run the permutation step for every chromosome and concatenate the results.

    Work is split by chromosome. When ``opts['processes']`` is greater than
    zero, chromosomes are handled in batches of that many using a fresh
    worker pool per batch; otherwise each chromosome is processed in the
    current process.

    Parameters
    ----------
    bed_dict : dict
        Maps a chromosome key to a sized collection (``len`` is taken of
        each value); each value is forwarded to ``singleprocess_permutation``.
    mut_df, fs_cts_df, p_inactivating
        Forwarded unchanged to ``singleprocess_permutation``.
    opts : dict
        Must contain ``'processes'``; also forwarded to the worker.

    Returns
    -------
    list
        Per-chromosome results extended into one flat list, in the order
        chromosomes were processed.
    """
    # Largest chromosomes (by number of entries) are scheduled first.
    chroms = sorted(bed_dict.keys(), key=lambda x: len(bed_dict[x]), reverse=True)
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1

    result_list = []
    for i in range(0, len(chroms), num_processes):
        if not multiprocess_flag:
            info = (bed_dict[chroms[i]], mut_df, opts, fs_cts_df, p_inactivating)
            result_list += singleprocess_permutation(info)
            continue

        pool = Pool(processes=num_processes)
        # The slice is naturally shorter for the final, partial batch.
        batch = chroms[i:i + num_processes]
        info_repeat = ((bed_dict[chrom], mut_df, opts, fs_cts_df, p_inactivating)
                       for chrom in batch)
        process_results = pool.imap(singleprocess_permutation, info_repeat)
        # NOTE(review): presumably makes ctrl-c interrupt a blocked result
        # fetch -- confirm against utils.keyboard_exit_wrapper.
        process_results.next = utils.keyboard_exit_wrapper(process_results.next)
        try:
            try:
                for chrom_result in process_results:
                    result_list += chrom_result
            finally:
                # Always release the workers, including when a worker
                # raised something other than KeyboardInterrupt.
                pool.close()
                pool.join()
        except KeyboardInterrupt:
            logger.info('Exited by user. ctrl-c')
            sys.exit(0)

    return result_list
def _add_chrom_counts(result_list, chrom_result):
    """Add one chromosome's per-permutation counters into ``result_list`` in place."""
    for j, totals in enumerate(result_list):
        counts = chrom_result[j]
        # Each row holds 7 counters, or 9 when scores are enabled.
        for k in range(len(totals)):
            totals[k] += counts[k]


def _merge_observed(obs_result, obs_mutations, by_sample):
    """Fold one chromosome's observed mutations into the running result and return it."""
    if not by_sample:
        obs_result.append(obs_mutations)
        return obs_result
    return obs_result + obs_mutations


def multiprocess_permutation(bed_dict, mut_df, opts):
    """Run the permutation step per chromosome and sum the results.

    Work is split by chromosome. When ``opts['processes']`` is greater than
    zero, chromosomes are handled in batches of that many using a fresh
    worker pool per batch; otherwise each chromosome is processed in the
    current process.

    Parameters
    ----------
    bed_dict : dict
        Maps a chromosome key to the value forwarded to
        ``singleprocess_permutation``.
    mut_df
        Forwarded to the worker; its ``'Tumor_Sample'`` column is read when
        ``opts['by_sample']`` is truthy.
    opts : dict
        Reads ``'processes'``, ``'num_permutations'``, ``'by_sample'`` and
        ``'score_dir'``; also forwarded to the worker.

    Returns
    -------
    tuple
        ``(result_list, obs_result)``. ``result_list`` has one row per
        permutation with 7 counters (9 when ``opts['score_dir']`` is
        truthy), summed over chromosomes. ``obs_result`` is a list with one
        entry per chromosome, or, when ``opts['by_sample']`` is truthy, a
        DataFrame indexed by sample to which each chromosome's observed
        result has been added with ``+``.
    """
    chroms = sorted(bed_dict.keys())
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']

    if not by_sample:
        obs_result = []
    else:
        uniq_samp = mut_df['Tumor_Sample'].unique()
        # NOTE(review): `cols` is not defined in this function; it must be
        # provided at module level.
        obs_result = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)

    # initialize list containing output
    num_counters = 9 if opts['score_dir'] else 7
    result_list = [[0] * num_counters for _ in range(num_permutations)]

    # iterate over each chromosome
    for i in range(0, len(chroms), num_processes):
        if not multiprocess_flag:
            info = (bed_dict[chroms[i]], mut_df, opts)
            chrom_result, obs_mutations = singleprocess_permutation(info)
            _add_chrom_counts(result_list, chrom_result)
            obs_result = _merge_observed(obs_result, obs_mutations, by_sample)
            continue

        pool = Pool(processes=num_processes)
        # The slice is naturally shorter for the final, partial batch.
        batch = chroms[i:i + num_processes]
        info_repeat = ((bed_dict[chrom], mut_df, opts) for chrom in batch)
        process_results = pool.imap(singleprocess_permutation, info_repeat)
        # NOTE(review): presumably makes ctrl-c interrupt a blocked result
        # fetch -- confirm against utils.keyboard_exit_wrapper.
        process_results.next = utils.keyboard_exit_wrapper(process_results.next)
        try:
            try:
                for chrom_result, obs_mutations in process_results:
                    _add_chrom_counts(result_list, chrom_result)
                    obs_result = _merge_observed(obs_result, obs_mutations,
                                                 by_sample)
            finally:
                # Always release the workers, including when a worker
                # raised something other than KeyboardInterrupt.
                pool.close()
                pool.join()
        except KeyboardInterrupt:
            logger.info('Exited by user. ctrl-c')
            sys.exit(0)

    return result_list, obs_result
def _output_header(opts):
    """Return the output column names selected by the ``opts`` flags."""
    if opts['maf'] and opts['num_iterations']:
        return [
            'Gene', 'strand', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Allele', 'Context',
            'DNA_Change', 'Protein_Change', 'Variant_Classification',
        ]
    if opts['maf']:
        return [
            'Gene', 'strand', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Allele',
            'DNA_Change', 'Protein_Change', 'Variant_Classification',
            'Tumor_Sample', 'Tumor_Type',
        ]
    header = [
        'Gene', 'ID', 'gene length', 'non-silent snv', 'silent snv',
        'nonsense', 'lost stop', 'splice site', 'lost start', 'missense',
        'recurrent missense', 'normalized missense position entropy',
    ]
    # add column header for scores, if user provided one
    if opts['score_dir']:
        header += ['Total Missense MGAEntropy', 'Total Missense VEST Score']
    # add indel columns
    header += ['frameshift indel', 'inframe indel', 'normalized mutation entropy']
    return header


def _count_observed_indels(bed_dict, indel_df):
    """Count observed frameshift / other indels per gene named in ``bed_dict``.

    Returns ``(fs_cts, inframe_cts, name2ix)`` where both count arrays have
    shape ``(1, number of genes)`` and ``name2ix`` maps gene name to column.
    """
    gene_names = [mybed.gene_name
                  for chrom in bed_dict
                  for mybed in bed_dict[chrom]]
    name2ix = {name: ix for ix, name in enumerate(gene_names)}

    # initiate count vectors
    inframe_cts = np.zeros((1, len(gene_names)))
    fs_cts = np.zeros((1, len(gene_names)))

    # populate observed counts
    indel_cts_dict = indel_df['Gene'].value_counts().to_dict()
    is_fs = indel.is_frameshift_annotation(indel_df)
    fs_cts_dict = indel_df[is_fs]['Gene'].value_counts().to_dict()
    for mygene, total in indel_cts_dict.items():
        if mygene not in name2ix:
            # gene is absent from the BED file annotation; skip it
            continue
        ix = name2ix[mygene]
        fs_cts[0, ix] = fs_cts_dict.get(mygene, 0)
        # every indel that is not a frameshift is counted as inframe
        inframe_cts[0, ix] = total - fs_cts[0, ix]
    return fs_cts, inframe_cts, name2ix


def _append_indel_columns(chrom_result, name2ix, fs_cts, inframe_cts):
    """Return rows of ``chrom_result`` extended with the three indel columns.

    Consecutive rows sharing the same first element (gene name) are treated
    as successive iterations of that gene. Each row's last element is popped
    (mutating the row) and its ``.values()`` used as missense position counts.
    """
    annotated = []
    for gname, grp in it.groupby(chrom_result, lambda x: x[0]):
        gene_ix = name2ix[gname]
        for iter_ix, row in enumerate(grp):
            fs_count = fs_cts[iter_ix, gene_ix]
            inframe_count = inframe_cts[iter_ix, gene_ix]
            missense_pos_ct = list(row.pop(-1).values())  # missense codon counts
            silent_pos_ct = [1 for _ in range(row[4])]
            inactivating_ct = sum(row[5:9]) + fs_count
            tmp_count_list = (missense_pos_ct + silent_pos_ct
                              + [inactivating_ct, inframe_count])
            # NOTE: `math` must be the project module that provides
            # normalized_mutation_entropy, not the standard-library math.
            norm_ent = math.normalized_mutation_entropy(tmp_count_list)
            annotated.append(row + [fs_count, inframe_count, norm_ent])
    return annotated


def multiprocess_permutation(bed_dict, mut_df, opts, indel_df=None):
    """Run the permutation step per chromosome and write rows to a file.

    Work is split by chromosome. When ``opts['processes']`` is greater than
    zero, chromosomes are handled in batches of that many using a fresh
    worker pool per batch; otherwise each chromosome is processed in the
    current process. A header row followed by every chromosome's result
    rows is written tab-delimited to ``opts['output']``.

    Parameters
    ----------
    bed_dict : dict
        Maps a chromosome key to a sized collection of BED entries
        (``gene_name`` is read from each entry in summary mode).
    mut_df
        Forwarded unchanged to ``singleprocess_permutation``.
    opts : dict
        Reads ``'processes'``, ``'output'``, ``'maf'``, ``'num_iterations'``,
        ``'score_dir'``, ``'summary'`` and, when simulating indels,
        ``'seed'``; also forwarded to the worker.
    indel_df : optional
        Indel table; only used when ``opts['summary']`` is truthy.

    Returns
    -------
    None
    """
    # Largest chromosomes (by number of entries) are scheduled first.
    chroms = sorted(bed_dict.keys(), key=lambda x: len(bed_dict[x]), reverse=True)
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1

    # `with` guarantees the output is flushed and closed on every exit path,
    # including the sys.exit(0) taken on ctrl-c.
    with open(opts['output'], 'w') as file_handle:
        mywriter = csv.writer(file_handle, delimiter='\t', lineterminator='\n')
        mywriter.writerow(_output_header(opts))

        num_iterations = opts['num_iterations']
        name2ix = fs_cts = inframe_cts = None
        if opts['summary'] and num_iterations:
            # simulate indel counts
            fs_cts, inframe_cts, gene_names = indel.simulate_indel_counts(
                indel_df, bed_dict, num_iterations, opts['seed'])
            name2ix = {name: ix for ix, name in enumerate(gene_names)}
        elif opts['summary']:
            # just count observed indels
            fs_cts, inframe_cts, name2ix = _count_observed_indels(
                bed_dict, indel_df)

        def write_chrom(chrom_result):
            # add columns for indels, then write output to file
            if opts['summary']:
                chrom_result = _append_indel_columns(
                    chrom_result, name2ix, fs_cts, inframe_cts)
            mywriter.writerows(chrom_result)

        # simulate snvs
        for i in range(0, len(chroms), num_processes):
            if not multiprocess_flag:
                info = (bed_dict[chroms[i]], mut_df, opts)
                write_chrom(singleprocess_permutation(info))
                continue

            pool = Pool(processes=num_processes)
            # The slice is naturally shorter for the final, partial batch.
            batch = chroms[i:i + num_processes]
            info_repeat = ((bed_dict[chrom], mut_df, opts) for chrom in batch)
            process_results = pool.imap(singleprocess_permutation, info_repeat)
            # NOTE(review): presumably makes ctrl-c interrupt a blocked
            # result fetch -- confirm against utils.keyboard_exit_wrapper.
            process_results.next = utils.keyboard_exit_wrapper(
                process_results.next)
            try:
                try:
                    # iterate through each chromosome result
                    for chrom_result in process_results:
                        write_chrom(chrom_result)
                finally:
                    # Always release the workers, including when a worker
                    # raised something other than KeyboardInterrupt.
                    pool.close()
                    pool.join()
            except KeyboardInterrupt:
                logger.info('Exited by user. ctrl-c')
                sys.exit(0)
def _add_chrom_counts(result_list, chrom_result):
    """Add one chromosome's per-permutation counters into ``result_list`` in place."""
    for j, totals in enumerate(result_list):
        counts = chrom_result[j]
        # Each row holds 7 counters, or 9 when scores are enabled.
        for k in range(len(totals)):
            totals[k] += counts[k]


def _merge_observed(obs_result, obs_mutations, by_sample):
    """Fold one chromosome's observed mutations into the running result and return it."""
    if not by_sample:
        obs_result.append(obs_mutations)
        return obs_result
    return obs_result + obs_mutations


def multiprocess_permutation(bed_dict, mut_df, opts):
    """Run the permutation step per chromosome and sum the results.

    Work is split by chromosome. When ``opts['processes']`` is greater than
    zero, chromosomes are handled in batches of that many using a fresh
    worker pool per batch; otherwise each chromosome is processed in the
    current process.

    Parameters
    ----------
    bed_dict : dict
        Maps a chromosome key to the value forwarded to
        ``singleprocess_permutation``.
    mut_df
        Forwarded to the worker; its ``'Tumor_Sample'`` column is read when
        ``opts['by_sample']`` is truthy.
    opts : dict
        Reads ``'processes'``, ``'num_permutations'``, ``'by_sample'`` and
        ``'score_dir'``; also forwarded to the worker.

    Returns
    -------
    tuple
        ``(result_list, obs_result)``. ``result_list`` has one row per
        permutation with 7 counters (9 when ``opts['score_dir']`` is
        truthy), summed over chromosomes. ``obs_result`` is a list with one
        entry per chromosome, or, when ``opts['by_sample']`` is truthy, a
        DataFrame indexed by sample to which each chromosome's observed
        result has been added with ``+``.
    """
    chroms = sorted(bed_dict.keys())
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']

    if not by_sample:
        obs_result = []
    else:
        uniq_samp = mut_df['Tumor_Sample'].unique()
        # NOTE(review): `cols` is not defined in this function; it must be
        # provided at module level.
        obs_result = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)

    # initialize list containing output
    num_counters = 9 if opts['score_dir'] else 7
    result_list = [[0] * num_counters for _ in range(num_permutations)]

    # iterate over each chromosome
    for i in range(0, len(chroms), num_processes):
        if not multiprocess_flag:
            info = (bed_dict[chroms[i]], mut_df, opts)
            chrom_result, obs_mutations = singleprocess_permutation(info)
            _add_chrom_counts(result_list, chrom_result)
            obs_result = _merge_observed(obs_result, obs_mutations, by_sample)
            continue

        pool = Pool(processes=num_processes)
        # The slice is naturally shorter for the final, partial batch.
        batch = chroms[i:i + num_processes]
        info_repeat = ((bed_dict[chrom], mut_df, opts) for chrom in batch)
        process_results = pool.imap(singleprocess_permutation, info_repeat)
        # NOTE(review): presumably makes ctrl-c interrupt a blocked result
        # fetch -- confirm against utils.keyboard_exit_wrapper.
        process_results.next = utils.keyboard_exit_wrapper(process_results.next)
        try:
            try:
                for chrom_result, obs_mutations in process_results:
                    _add_chrom_counts(result_list, chrom_result)
                    obs_result = _merge_observed(obs_result, obs_mutations,
                                                 by_sample)
            finally:
                # Always release the workers, including when a worker
                # raised something other than KeyboardInterrupt.
                pool.close()
                pool.join()
        except KeyboardInterrupt:
            logger.info('Exited by user. ctrl-c')
            sys.exit(0)

    return result_list, obs_result