Ejemplo n.º 1
0
def multiprocess_permutation(bed_dict, mut_df, opts,
                             fs_cts_df=None, p_inactivating=None):
    """Handles parallelization of permutations by splitting work
    by chromosome.

    Every key of ``bed_dict`` is one unit of work. Keys are visited in
    descending order of ``len(bed_dict[key])`` and handed to
    ``singleprocess_permutation``: directly when ``opts['processes']`` is
    not positive, otherwise in batches of ``opts['processes']`` through a
    ``Pool`` that is created for each batch.

    Parameters
    ----------
    bed_dict : dict
        Maps a chromosome key to a sized collection (``len`` is taken of
        every value to order the work).
    mut_df, fs_cts_df, p_inactivating
        Forwarded unchanged to ``singleprocess_permutation`` as part of the
        ``info`` tuple.
    opts : dict
        Must contain ``'processes'``; also forwarded to the worker.

    Returns
    -------
    list
        The per-chromosome results appended one after another in
        processing order.
    """
    chroms = sorted(bed_dict.keys(), key=lambda x: len(bed_dict[x]), reverse=True)
    num_processes = opts['processes']
    multiprocess_flag = num_processes > 0
    if not multiprocess_flag:
        # serial mode still walks the chromosomes one at a time
        num_processes = 1

    result_list = []
    for i in range(0, len(chroms), num_processes):
        if not multiprocess_flag:
            info = (bed_dict[chroms[i]], mut_df, opts, fs_cts_df, p_inactivating)
            result_list += singleprocess_permutation(info)
            continue

        # slicing clamps at the end of the list, so the last batch may be short
        batch = chroms[i:i + num_processes]
        pool = Pool(processes=num_processes)
        try:
            info_repeat = ((bed_dict[chrom], mut_df, opts, fs_cts_df, p_inactivating)
                           for chrom in batch)
            process_results = pool.imap(singleprocess_permutation, info_repeat)
            # NOTE(review): a ``for`` loop on Python 3 calls ``__next__`` on
            # the type, so this instance-level wrapper presumably only takes
            # effect on Python 2 -- confirm.
            process_results.next = utils.keyboard_exit_wrapper(process_results.next)
            for chrom_result in process_results:
                result_list += chrom_result
        except KeyboardInterrupt:
            logger.info('Exited by user. ctrl-c')
            sys.exit(0)
        finally:
            # always release the workers, including when a worker raised
            pool.close()
            pool.join()

    return result_list
def multiprocess_permutation(bed_dict, mut_df, opts,
                             fs_cts_df=None, p_inactivating=None):
    """Handles parallelization of permutations by splitting work
    by chromosome.

    The keys of ``bed_dict`` are ordered by decreasing ``len`` of their
    value and each one is passed, together with the remaining arguments,
    to ``singleprocess_permutation``. When ``opts['processes']`` is
    positive the calls are issued in batches of that size through a
    ``Pool`` (one pool per batch); otherwise they run in this process.

    Parameters
    ----------
    bed_dict : dict
        Chromosome key -> sized collection.
    mut_df, fs_cts_df, p_inactivating
        Passed through to ``singleprocess_permutation`` untouched.
    opts : dict
        Reads ``opts['processes']``; the dict itself is passed through to
        the worker as well.

    Returns
    -------
    list
        All per-chromosome results accumulated with ``+=`` in the order
        the chromosomes were processed.
    """
    chroms = sorted(bed_dict.keys(), key=lambda x: len(bed_dict[x]), reverse=True)
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1
    num_chroms = len(chroms)

    result_list = []
    for start in range(0, num_chroms, num_processes):
        if multiprocess_flag:
            # the final batch is truncated to the chromosomes that remain
            stop = min(start + num_processes, num_chroms)
            pool = Pool(processes=num_processes)
            try:
                info_repeat = ((bed_dict[chroms[ix]], mut_df, opts,
                                fs_cts_df, p_inactivating)
                               for ix in range(start, stop))
                process_results = pool.imap(singleprocess_permutation, info_repeat)
                # NOTE(review): Python 2 style hook; on Python 3 iteration
                # goes through ``__next__``, so this likely has no effect
                # there -- confirm.
                process_results.next = utils.keyboard_exit_wrapper(process_results.next)
                for chrom_result in process_results:
                    result_list += chrom_result
            except KeyboardInterrupt:
                logger.info('Exited by user. ctrl-c')
                sys.exit(0)
            finally:
                # runs on success, ctrl-c and worker errors alike, so the
                # pool is never left open
                pool.close()
                pool.join()
        else:
            info = (bed_dict[chroms[start]], mut_df, opts, fs_cts_df, p_inactivating)
            result_list += singleprocess_permutation(info)

    return result_list
def multiprocess_permutation(bed_dict, mut_df, opts):
    """Handles parallelization of permutations by splitting work
    by chromosome.

    ``singleprocess_permutation`` is called once per key of ``bed_dict``
    (keys in sorted order) and is expected to return a pair
    ``(chrom_result, obs_mutations)``. The per-permutation rows of every
    ``chrom_result`` are summed column by column into one running total.

    Parameters
    ----------
    bed_dict : dict
        Chromosome key -> value forwarded to the worker.
    mut_df
        Forwarded to the worker. When ``opts['by_sample']`` is set, its
        ``'Tumor_Sample'`` column supplies the index of the observed
        result frame.
    opts : dict
        Reads ``'processes'`` (positive -> use a ``Pool`` of that size),
        ``'num_permutations'``, ``'by_sample'`` and ``'score_dir'``
        (truthy -> 9 summed columns instead of 7).

    Returns
    -------
    tuple
        ``(result_list, obs_result)``. ``result_list`` holds one row of
        column sums per permutation. ``obs_result`` is a list with one
        ``obs_mutations`` entry per chromosome, or, when
        ``opts['by_sample']`` is set, the running ``+`` of a zero-filled
        ``DataFrame`` and every ``obs_mutations``.
    """
    chroms = sorted(bed_dict.keys())
    num_processes = opts['processes']
    multiprocess_flag = num_processes > 0
    if not multiprocess_flag:
        num_processes = 1
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']

    if not by_sample:
        obs_result = []
    else:
        uniq_samp = mut_df['Tumor_Sample'].unique()
        # ``cols`` is not defined in this function; presumably a
        # module-level list of column names -- TODO confirm.
        obs_result = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp,
                                  columns=cols)

    # initialize list containing output (two extra columns with scores)
    num_cols = 9 if opts['score_dir'] else 7
    result_list = [[0] * num_cols for _ in range(num_permutations)]

    def _merge(chrom_result, obs_mutations, obs_so_far):
        # fold one chromosome into the running totals; returns the updated
        # observed-result accumulator
        for j in range(num_permutations):
            total_row = result_list[j]
            chrom_row = chrom_result[j]
            for k in range(num_cols):
                total_row[k] += chrom_row[k]
        if by_sample:
            return obs_so_far + obs_mutations
        obs_so_far.append(obs_mutations)
        return obs_so_far

    # iterate over each chromosome
    for i in range(0, len(chroms), num_processes):
        if not multiprocess_flag:
            info = (bed_dict[chroms[i]], mut_df, opts)
            chrom_result, obs_mutations = singleprocess_permutation(info)
            obs_result = _merge(chrom_result, obs_mutations, obs_result)
            continue

        pool = Pool(processes=num_processes)
        try:
            # slicing clamps, so the last batch may hold fewer chromosomes
            info_repeat = ((bed_dict[chrom], mut_df, opts)
                           for chrom in chroms[i:i + num_processes])
            process_results = pool.imap(singleprocess_permutation, info_repeat)
            # NOTE(review): Python 2 style hook; on Python 3 the ``for``
            # loop uses ``__next__`` so this presumably has no effect --
            # confirm.
            process_results.next = utils.keyboard_exit_wrapper(
                process_results.next)
            for chrom_result, obs_mutations in process_results:
                obs_result = _merge(chrom_result, obs_mutations, obs_result)
        except KeyboardInterrupt:
            logger.info('Exited by user. ctrl-c')
            sys.exit(0)
        finally:
            # release workers on every exit path, including worker errors
            pool.close()
            pool.join()

    return result_list, obs_result
Ejemplo n.º 4
0
def multiprocess_permutation(bed_dict, mut_df, opts, indel_df=None):
    """Handles parallelization of permutations by splitting work
    by chromosome.

    Writes a tab-delimited file to ``opts['output']``: first a header row
    chosen from ``opts['maf']`` / ``opts['num_iterations']`` /
    ``opts['score_dir']``, then the rows returned by
    ``singleprocess_permutation`` for every key of ``bed_dict`` (keys
    ordered by decreasing ``len`` of their value). When ``opts['summary']``
    is set, each row additionally receives frameshift indel, inframe indel
    and normalized mutation entropy columns.

    Parameters
    ----------
    bed_dict : dict
        Chromosome key -> sized collection; in the summary path without
        iterations its elements are read for ``.gene_name``.
    mut_df
        Forwarded to ``singleprocess_permutation``.
    opts : dict
        Reads ``'processes'``, ``'output'``, ``'maf'``,
        ``'num_iterations'``, ``'score_dir'``, ``'summary'`` and, for
        simulated indels, ``'seed'``.
    indel_df : optional
        Only used when ``opts['summary']`` is set: either handed to
        ``indel.simulate_indel_counts`` or counted per ``'Gene'``.

    Returns
    -------
    None
        All results go to the output file.
    """
    chroms = sorted(bed_dict.keys(),
                    key=lambda x: len(bed_dict[x]),
                    reverse=True)
    num_processes = opts['processes']
    multiprocess_flag = num_processes > 0
    if not multiprocess_flag:
        num_processes = 1

    # the context manager closes the file on every exit path, including
    # the ctrl-c ``sys.exit`` below and unexpected worker errors
    with open(opts['output'], 'w') as file_handle:
        mywriter = csv.writer(file_handle, delimiter='\t', lineterminator='\n')
        if opts['maf'] and opts['num_iterations']:
            header = [
                'Gene', 'strand', 'Chromosome', 'Start_Position', 'End_Position',
                'Reference_Allele', 'Tumor_Allele', 'Context', 'DNA_Change',
                'Protein_Change', 'Variant_Classification'
            ]
        elif opts['maf']:
            header = [
                'Gene', 'strand', 'Chromosome', 'Start_Position', 'End_Position',
                'Reference_Allele', 'Tumor_Allele', 'DNA_Change', 'Protein_Change',
                'Variant_Classification', 'Tumor_Sample', 'Tumor_Type'
            ]
        else:
            header = [
                'Gene',
                'ID',
                'gene length',
                'non-silent snv',
                'silent snv',
                'nonsense',
                'lost stop',
                'splice site',
                'lost start',
                'missense',
                'recurrent missense',
                'normalized missense position entropy',
            ]
            # add column headers for scores, if the user provided them
            if opts['score_dir']:
                header += [
                    'Total Missense MGAEntropy', 'Total Missense VEST Score'
                ]
            # add indel columns
            header += [
                'frameshift indel', 'inframe indel', 'normalized mutation entropy'
            ]
        mywriter.writerow(header)
        num_iterations = opts['num_iterations']

        # simulate indel counts
        if opts['summary'] and num_iterations:
            fs_cts, inframe_cts, gene_names = indel.simulate_indel_counts(
                indel_df, bed_dict, num_iterations, opts['seed'])
            name2ix = {name: ix for ix, name in enumerate(gene_names)}
        # just count observed indels
        elif opts['summary']:
            # get gene names
            gene_names = [
                mybed.gene_name for chrom in bed_dict for mybed in bed_dict[chrom]
            ]
            name2ix = {name: ix for ix, name in enumerate(gene_names)}

            # initiate count vectors (a single row: the observed data)
            inframe_cts = np.zeros((1, len(gene_names)))
            fs_cts = np.zeros((1, len(gene_names)))

            # populate observed counts
            indel_cts_dict = indel_df['Gene'].value_counts().to_dict()
            fs_cts_dict = indel_df[indel.is_frameshift_annotation(
                indel_df)]['Gene'].value_counts().to_dict()
            for mygene in indel_cts_dict:
                if mygene in name2ix:
                    # gene should be found in BED file annotation
                    ix = name2ix[mygene]
                    fs_cts[0, ix] = fs_cts_dict.get(mygene, 0)
                    inframe_cts[0, ix] = indel_cts_dict[mygene] - fs_cts[0, ix]

        # simulate snvs
        for i in range(0, len(chroms), num_processes):
            if not multiprocess_flag:
                # perform simulation
                info = (bed_dict[chroms[i]], mut_df, opts)
                chrom_results = singleprocess_permutation(info)

                # add indel columns
                if opts['summary']:
                    chrom_results = _append_indel_columns(
                        chrom_results, name2ix, fs_cts, inframe_cts)

                # write to file
                mywriter.writerows(chrom_results)
                continue

            pool = Pool(processes=num_processes)
            try:
                # slicing clamps, so the last batch may be short
                info_repeat = ((bed_dict[chrom], mut_df, opts)
                               for chrom in chroms[i:i + num_processes])
                process_results = pool.imap(singleprocess_permutation,
                                            info_repeat)
                # NOTE(review): Python 2 style hook; on Python 3 iteration
                # uses ``__next__`` so this presumably has no effect --
                # confirm.
                process_results.next = utils.keyboard_exit_wrapper(
                    process_results.next)
                # iterate through each chromosome result
                for chrom_result in process_results:
                    # add columns for indels
                    if opts['summary']:
                        chrom_result = _append_indel_columns(
                            chrom_result, name2ix, fs_cts, inframe_cts)

                    # write output to file
                    mywriter.writerows(chrom_result)
            except KeyboardInterrupt:
                logger.info('Exited by user. ctrl-c')
                sys.exit(0)
            finally:
                # release workers on every exit path
                pool.close()
                pool.join()


def _append_indel_columns(chrom_result, name2ix, fs_cts, inframe_cts):
    """Return the rows of ``chrom_result`` extended with indel columns.

    Consecutive rows sharing the same first element (the gene name) are
    grouped; the ``l``-th row of a group is paired with row ``l`` of
    ``fs_cts`` / ``inframe_cts`` at that gene's column ``name2ix[gene]``.
    The last element of every row is removed (it is read as a mapping
    whose values are missense codon counts) and replaced by the
    frameshift count, the inframe count and the normalized mutation
    entropy. Rows are mutated by the ``pop``.
    """
    augmented = []
    for gname, grp in it.groupby(chrom_result, lambda x: x[0]):
        gene_ix = name2ix[gname]
        for l, row in enumerate(grp):
            fs_count = fs_cts[l, gene_ix]
            inframe_count = inframe_cts[l, gene_ix]
            missense_pos_ct = list(row.pop(-1).values())  # missense codon counts
            # one pseudo-position per count in row[4]
            # (assumes the summary header layout, where index 4 is
            # 'silent snv' and 5:9 are the inactivating SNV classes --
            # TODO confirm against singleprocess_permutation)
            silent_pos_ct = [1 for _ in range(row[4])]
            inactivating_ct = sum(row[5:9]) + fs_count
            tmp_count_list = missense_pos_ct + silent_pos_ct + [
                inactivating_ct, inframe_count
            ]
            norm_ent = math.normalized_mutation_entropy(tmp_count_list)
            augmented.append(row + [fs_count, inframe_count, norm_ent])
    return augmented
def multiprocess_permutation(bed_dict, mut_df, opts):
    """Handles parallelization of permutations by splitting work
    by chromosome.

    For every key of ``bed_dict`` (sorted) ``singleprocess_permutation``
    is invoked with ``(bed_dict[key], mut_df, opts)`` and must yield a
    ``(chrom_result, obs_mutations)`` pair. Row ``j`` of each
    ``chrom_result`` is added, column by column, onto row ``j`` of the
    running totals.

    Parameters
    ----------
    bed_dict : dict
        Chromosome key -> value handed to the worker.
    mut_df
        Handed to the worker. With ``opts['by_sample']`` set, the unique
        values of ``mut_df['Tumor_Sample']`` index the observed frame.
    opts : dict
        Reads ``'processes'`` (positive enables a ``Pool`` of that many
        workers per batch), ``'num_permutations'``, ``'by_sample'`` and
        ``'score_dir'`` (truthy widens the totals from 7 to 9 columns).

    Returns
    -------
    tuple
        ``(result_list, obs_result)``: the per-permutation column sums,
        and either a list of every ``obs_mutations`` or, in by-sample
        mode, their running ``+`` onto a zero-filled ``DataFrame``.
    """
    chroms = sorted(bed_dict.keys())
    multiprocess_flag = opts['processes'] > 0
    num_processes = opts['processes'] if multiprocess_flag else 1
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']
    # two extra score columns are tracked when a score directory is given
    width = 9 if opts['score_dir'] else 7

    if by_sample:
        uniq_samp = mut_df['Tumor_Sample'].unique()
        # ``cols`` comes from outside this function; presumably a
        # module-level list of column names -- TODO confirm.
        obs_result = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)
    else:
        obs_result = []

    # initialize list containing output
    result_list = [[0] * width for _ in range(num_permutations)]

    def _absorb(worker_output, observed):
        # add one chromosome's output to the totals and hand back the
        # updated observed-mutation accumulator
        chrom_result, obs_mutations = worker_output
        for j in range(num_permutations):
            totals, increments = result_list[j], chrom_result[j]
            for col in range(width):
                totals[col] += increments[col]
        if by_sample:
            return observed + obs_mutations
        observed.append(obs_mutations)
        return observed

    # iterate over each chromosome
    num_chroms = len(chroms)
    for i in range(0, num_chroms, num_processes):
        if multiprocess_flag:
            # the final batch only covers the chromosomes that remain
            stop = min(i + num_processes, num_chroms)
            pool = Pool(processes=num_processes)
            try:
                info_repeat = ((bed_dict[chroms[tmp_ix]], mut_df, opts)
                               for tmp_ix in range(i, stop))
                process_results = pool.imap(singleprocess_permutation, info_repeat)
                # NOTE(review): Python 2 style hook; Python 3 iteration
                # goes through ``__next__``, so this likely does nothing
                # there -- confirm.
                process_results.next = utils.keyboard_exit_wrapper(process_results.next)
                for worker_output in process_results:
                    obs_result = _absorb(worker_output, obs_result)
            except KeyboardInterrupt:
                logger.info('Exited by user. ctrl-c')
                sys.exit(0)
            finally:
                # close the pool on success, ctrl-c and worker errors alike
                pool.close()
                pool.join()
        else:
            info = (bed_dict[chroms[i]], mut_df, opts)
            obs_result = _absorb(singleprocess_permutation(info), obs_result)

    return result_list, obs_result