Example #1
0
def readAllNucleosomeOccupancyDataAndWriteChrBasedSignalCountArraysSequentially(
        genome, quantileValue, nucleosomeFilename):
    """Read nucleosome occupancy data and write chromosome-based signal
    arrays sequentially (one chromosome at a time, no multiprocessing).

    Does nothing if nucleosomeFilename does not exist.

    :param genome: genome identifier passed to getChromSizesDict
    :param quantileValue: quantile (< 1.0 enables outlier elimination)
        forwarded to readNucleosomeOccupancyData
    :param nucleosomeFilename: path to the tab-separated occupancy file
    """
    chromSizesDict = getChromSizesDict(genome)

    print('Before nucleosome occupancy is loaded into memory')
    print('Memory usage in %s MB' % memory_usage())

    if os.path.exists(nucleosomeFilename):
        nucleosome_df_grouped, max_signal, min_signal = readNucleosomeOccupancyData(
            quantileValue, nucleosomeFilename)
        # Signals are floats (np.float32 in the reader); %f avoids the
        # silent truncation that %d would cause.
        print('max_signal:%f min_signal:%f' % (max_signal, min_signal))
        print('np.finfo(np.float16).min:%f np.finfo(np.float16).max:%f' %
              (np.finfo(np.float16).min, np.finfo(np.float16).max))

        print('After nucleosome occupancy grouped by')
        print('Memory usage in %s MB' % memory_usage())

        for chrLong, chromBasedNucleosomeDF in nucleosome_df_grouped:
            print('For %s write nucleosome signal and count array' % (chrLong))
            chromSize = chromSizesDict[chrLong]
            # Positional argument list consumed by
            # writeChrBasedOccupancySignalArray; the first two slots are
            # presumably unused placeholders here — TODO confirm with callee.
            inputList = [
                None,
                None,
                chrLong,
                chromSize,
                chromBasedNucleosomeDF,
                nucleosomeFilename,
                NUCLEOSOMEOCCUPANCY,
                max_signal,
                min_signal,
            ]
            writeChrBasedOccupancySignalArray(inputList)

        print('After all chr based files are written')
        print('Memory usage in %s MB' % memory_usage())
Example #2
0
def readNucleosomeOccupancyData(quantileValue, nucleosomeFilename):
    """Load a tab-separated nucleosome occupancy file and group it by
    chromosome, optionally removing high-signal outliers first.

    :param quantileValue: if < 1.0, rows with SIGNAL >= that quantile of
        the signal distribution are dropped before grouping
    :param nucleosomeFilename: path to the occupancy file
        (columns: CHROM, START, END, SIGNAL; '#' lines are comments)
    :return: (groupby-by-CHROM object, max signal, min signal) where the
        extremes reflect the data after any outlier elimination
    """

    def _report_signal_stats(df, title):
        # Print the standard signal summary under the given title and hand
        # back the (max, min) pair so the caller never recomputes them.
        signal_column = df[SIGNAL]
        maximum = signal_column.max()
        minimum = signal_column.min()
        print('\n#########################################')
        print(title)
        print('Max Signal: %f' % maximum)
        print('Min Signal: %f' % minimum)
        print('Mean Signal: %f' % signal_column.mean())
        print('Std Signal: %f' % signal_column.std())
        print('Memory usage in %s MB' % memory_usage())
        return maximum, minimum

    nucleosome_df = pd.read_csv(
        nucleosomeFilename,
        sep='\t',
        header=None,
        comment='#',
        names=[CHROM, START, END, SIGNAL],
        dtype={
            CHROM: 'category',
            START: np.int32,
            END: np.int32,
            SIGNAL: np.float32,
        })

    max_signal, min_signal = _report_signal_stats(nucleosome_df,
                                                  'Before outlier elimination')

    #########################################################
    if (quantileValue < 1.0):
        # remove the outliers: keep only rows strictly below the quantile
        q = nucleosome_df[SIGNAL].quantile(quantileValue)
        print('\n#########################################')
        print('q:%f' % q)
        print('before outlier elimination number of rows: %d' %
              (nucleosome_df.shape[0]))
        nucleosome_df = nucleosome_df[nucleosome_df[SIGNAL] < q]
        print('after outlier elimination number of rows: %d' %
              (nucleosome_df.shape[0]))

        max_signal, min_signal = _report_signal_stats(
            nucleosome_df, 'After outlier elimination')
    #########################################################

    nucleosome_df_grouped = nucleosome_df.groupby(CHROM)

    print('After nucleosome occupancy grouped by')
    print('Memory usage in %s MB' % memory_usage())

    return nucleosome_df_grouped, max_signal, min_signal
Example #3
0
def readAllNucleosomeOccupancyDataAndWriteChrBasedSignalCountArraysInParallel(
        genome, quantileValue, nucleosomeFilename):
    """Read nucleosome occupancy data and write chromosome-based signal
    arrays in parallel, one pool task per chromosome.

    Does nothing if nucleosomeFilename does not exist.

    :param genome: genome identifier passed to getChromSizesDict
    :param quantileValue: quantile (< 1.0 enables outlier elimination)
        forwarded to readNucleosomeOccupancyData
    :param nucleosomeFilename: path to the tab-separated occupancy file
    """
    chromSizesDict = getChromSizesDict(genome)

    # Start the pool with one worker per available core
    numofProcesses = multiprocessing.cpu_count()
    print('Number of processors:%d' % (numofProcesses))

    print('Before nucleosome occupancy is loaded into memory')
    print('Memory usage in %s MB' % memory_usage())

    if os.path.exists(nucleosomeFilename):
        nucleosome_df_grouped, max_signal, min_signal = readNucleosomeOccupancyData(
            quantileValue, nucleosomeFilename)
        # Signals are floats (np.float32 in the reader); %f avoids the
        # silent truncation that %d would cause.
        print('max_signal:%f min_signal:%f' % (max_signal, min_signal))
        print('np.finfo(np.float16).min:%f np.finfo(np.float16).max:%f' %
              (np.finfo(np.float16).min, np.finfo(np.float16).max))

        pool = multiprocessing.Pool(numofProcesses)

        print('After pool is initialized')
        print('Memory usage in %s MB' % memory_usage())
        poolInputList = []

        for chrLong, chromBasedNucleosomeDF in nucleosome_df_grouped:
            print('for %s write nucleosome signal and count array' % (chrLong))
            chromSize = chromSizesDict[chrLong]
            # Positional argument list consumed by
            # writeChrBasedOccupancySignalArray; the first two slots are
            # presumably unused placeholders here — TODO confirm with callee.
            poolInputList.append([
                None,
                None,
                chrLong,
                chromSize,
                chromBasedNucleosomeDF,
                nucleosomeFilename,
                NUCLEOSOMEOCCUPANCY,
                max_signal,
                min_signal,
            ])

        try:
            pool.map(writeChrBasedOccupancySignalArray, poolInputList)
        finally:
            ################################
            # Always release the worker processes, even if map() raises;
            # otherwise the pool leaks on failure.
            pool.close()
            pool.join()
            ################################

        print('After pool is closed and joined')
        print('Memory usage in %s MB' % memory_usage())
Example #4
0
def fill_signal_array_dict(chrname, chr_based_df_list, chromSizesDict,
                           occupancy_type, verbose):
    """Accumulate the signal of all dataframes in chr_based_df_list into one
    chromosome-length float32 array and return it in a dict.

    :param chrname: chromosome name (key into chromSizesDict and the result)
    :param chr_based_df_list: list of dataframes, each with a SIGNAL column
    :param chromSizesDict: chromosome name -> size in bases
    :param occupancy_type: label used only in verbose log messages
    :param verbose: when True, print progress/memory diagnostics
    :return: dict with keys 'min_signal', 'max_signal', and chrname
        (the filled numpy signal array)
    """
    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict starts:  current_mem_usage %.2f (mb)'
            % (occupancy_type, str(os.getpid()), memory_usage()))
    process_signal_array_dict = {}

    # One float32 slot per base of the chromosome.
    process_signal_array = np.zeros(chromSizesDict[chrname], dtype=np.float32)
    # Start the extremes at the opposite ends so any real value replaces them.
    max_signal = np.finfo(np.float32).min
    min_signal = np.finfo(np.float32).max

    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict --- len(chr_based_df_list):%d current_mem_usage %.2f (mb)'
            % (occupancy_type, str(
                os.getpid()), len(chr_based_df_list), memory_usage()))
    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict --- chrname:%s current_mem_usage %.2f (mb)'
            % (occupancy_type, str(os.getpid()), chrname, memory_usage()))

    for chr_based_df in chr_based_df_list:
        # Hoist the aggregations: .max()/.min() scan the whole column, so
        # compute each once per dataframe instead of twice.
        df_max = chr_based_df[SIGNAL].max()
        df_min = chr_based_df[SIGNAL].min()
        if df_max > max_signal:
            max_signal = df_max
        if df_min < min_signal:
            min_signal = df_min
        # updateSignalArraysForListComprehension is called purely for its
        # side effect on process_signal_array; a plain loop is the idiomatic
        # form and avoids building a throwaway list of None results.
        for row in chr_based_df.values:
            updateSignalArraysForListComprehension(row, process_signal_array)

    # Initialize the dict that is returned to the caller
    process_signal_array_dict['min_signal'] = min_signal
    process_signal_array_dict['max_signal'] = max_signal
    process_signal_array_dict[chrname] = process_signal_array

    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict: min_signal: %f max_signal: %f current_mem_usage %.2f (mb)'
            % (occupancy_type, str(
                os.getpid()), process_signal_array_dict['min_signal'],
               process_signal_array_dict['max_signal'], memory_usage()))
    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict: signal_array_dict.keys():%s current_mem_usage %.2f (mb)'
            % (occupancy_type, str(os.getpid()),
               process_signal_array_dict.keys(), memory_usage()))
    if verbose:
        print(
            '\tVerbose %s Worker pid %s Fill Signal Array Dict ends: current_mem_usage %.2f (mb)\n'
            % (occupancy_type, str(os.getpid()), memory_usage()))

    return process_signal_array_dict
def findProcessiveGroupsWithDistance(
        processivity_inter_mutational_distance, simNum, chrLong, sample,
        sorted_sampleBased_chrBased_subs_df,
        considerProbabilityInProcessivityAnalysis,
        signature_cutoff_numberofmutations_averageprobability_df, verbose):
    """Find processive mutation groups in a position-sorted, sample-specific,
    chromosome-based substitutions dataframe.

    A processive group is a run of consecutive rows that share the same
    mutation type, signature and pyrimidine strand (and, when
    processivity_inter_mutational_distance is truthy, whose consecutive
    start positions are no farther apart than that distance).

    :param processivity_inter_mutational_distance: max allowed distance
        between consecutive mutations in a group; falsy disables the check
    :param simNum: simulation number (used only in verbose messages)
    :param chrLong: chromosome name, stored in the 'Chr' output column
    :param sample: sample name (used only in verbose messages)
    :param sorted_sampleBased_chrBased_subs_df: dataframe sorted by START;
        NOTE(review): mutated in place (a 'subgroup' column is added)
    :param considerProbabilityInProcessivityAnalysis: unused here; the
        probability filtering is done earlier (see commented block below)
    :param signature_cutoff_numberofmutations_averageprobability_df: unused
        here for the same reason
    :param verbose: when True, print start/end diagnostics
    :return: (my_dict, mutations_loci_df) where my_dict maps
        (Signature, ProcessiveGroupLength) to a stacked array of group
        distances, and mutations_loci_df aggregates start-position lists and
        distances per (Signature, ProcessiveGroupLength); ({}, None) when
        the input dataframe is None
    """

    my_dict = {}
    mutations_loci_df = None

    if verbose:
        print(
            '\tVerbose Worker pid %s memory_usage %.2f MB simNum:%d chrLong:%s sample:%s findProcessiveGroups starts'
            % (str(os.getpid()), memory_usage(), simNum, chrLong, sample))
    # They must be coming from the same sample
    # They must be same type of mutation e.g.: T>A
    # They must be resulted from same signature
    # They must be on the same strand
    if (sorted_sampleBased_chrBased_subs_df is not None):
        # As long as mutation, signature, and pyrimidine strands are the same continue to accumulate, if one of them is different calculate cumsum and start again
        # e.g.: If there are 4 consecutive rows with same mutation, signature and pyrimidine strand, subgroup will be X(THE SAME) for these 4 rows
        # The ne(shift()) comparisons are True exactly where a new run
        # starts, so the cumulative sum yields one integer id per run.
        if processivity_inter_mutational_distance:
            sorted_sampleBased_chrBased_subs_df['subgroup'] = (
                (sorted_sampleBased_chrBased_subs_df[MUTATION].ne(
                    sorted_sampleBased_chrBased_subs_df[MUTATION].shift())) |
                (sorted_sampleBased_chrBased_subs_df['Signature'].ne(
                    sorted_sampleBased_chrBased_subs_df['Signature'].shift()))
                | (sorted_sampleBased_chrBased_subs_df[PYRAMIDINESTRAND].ne(
                    sorted_sampleBased_chrBased_subs_df[PYRAMIDINESTRAND].
                    shift())) |
                (sorted_sampleBased_chrBased_subs_df[START] -
                 sorted_sampleBased_chrBased_subs_df[START].shift() >
                 processivity_inter_mutational_distance)).cumsum()
        else:
            sorted_sampleBased_chrBased_subs_df['subgroup'] = (
                (sorted_sampleBased_chrBased_subs_df[MUTATION].ne(
                    sorted_sampleBased_chrBased_subs_df[MUTATION].shift())) |
                (sorted_sampleBased_chrBased_subs_df['Signature'].ne(
                    sorted_sampleBased_chrBased_subs_df['Signature'].shift()))
                | (sorted_sampleBased_chrBased_subs_df[PYRAMIDINESTRAND].ne(
                    sorted_sampleBased_chrBased_subs_df[PYRAMIDINESTRAND].
                    shift()))).cumsum()

        # Former way
        # df = sorted_sampleBased_chrBased_subs_df.groupby("subgroup").agg(
        #     Signature=pd.NamedAgg(column='Signature', aggfunc="first"),
        #     ProcessiveGroupLength=pd.NamedAgg(column=MUTATION, aggfunc="count"),
        #     Probability=pd.NamedAgg(column='Probability', aggfunc="max"),
        #     LastDistance=pd.NamedAgg(column='Start', aggfunc="last"),
        #     FirstDistance=pd.NamedAgg(column='Start', aggfunc="first"),
        #     ).assign(Distance=lambda x: x.pop('LastDistance') - x.pop('FirstDistance'))
        # #agg().reset_index() can be used. not necessary

        # Current way facilitates printing mutation start positions
        # One row per processive group; group size comes from counting the
        # Mutation column, and every start position is kept as a list.
        df = sorted_sampleBased_chrBased_subs_df.groupby('subgroup').agg({
            'Sample':
            'first',
            'Signature':
            'first',
            'Probability':
            'max',
            'Mutation':
            'count',
            'PyramidineStrand':
            'first',
            # 'Start': lambda x: x.unique().tolist(),
            # 'Start': lambda x: x.tolist(),
            'Start':
            list,
        }).rename(columns={
            'Start': 'Start_List',
            'Mutation': 'ProcessiveGroupLength'
        }).reset_index()

        # Expanding the per-group start lists into a frame gives row-wise
        # min/max even though the lists have different lengths (shorter
        # lists are NaN-padded, which min/max skip).
        df['FirstLoci'] = pd.DataFrame(
            df['Start_List'].values.tolist()).min(axis=1)
        df['LastLoci'] = pd.DataFrame(
            df['Start_List'].values.tolist()).max(axis=1)

        df['Distance'] = df['LastLoci'] - df['FirstLoci']
        df.drop(columns=['FirstLoci', 'LastLoci'], inplace=True)

        # Remove rows with processive groups of length 1
        df = df[df['ProcessiveGroupLength'].ne(1)]

        # ################################
        # This is done earlier
        # if considerProbabilityInProcessivityAnalysis:
        #     df=df.loc[df['Probability'] >= df['Signature'].map(signature_cutoff_numberofmutations_averageprobability_df.set_index('signature')['cutoff'])]
        # ################################

        #'df columns: subgroup' 'Signature' 'Probability' 'ProcessiveGroupLength' 'PyramidineStrand' 'Start_List' 'Distance'
        mutations_loci_df = df.groupby(['Signature',
                                        'ProcessiveGroupLength']).agg({
                                            'Sample':
                                            'first',
                                            'Start_List':
                                            list,
                                            'Distance':
                                            list
                                        }).reset_index()

        mutations_loci_df['Chr'] = chrLong
        mutations_loci_df = mutations_loci_df[[
            'Sample', 'Chr', 'Signature', 'ProcessiveGroupLength',
            'Start_List', 'Distance'
        ]]

        # (Signature, ProcessiveGroupLength) -> flat array of all distances
        # observed for groups of that signature and length.
        my_dict = {
            k: np.hstack(v)
            for k, v in df.groupby(['Signature', 'ProcessiveGroupLength'])
            ['Distance']
        }

    if verbose:
        print(
            '\tVerbose Worker pid %s memory_usage %.2f MB simNum:%d chrLong:%s sample:%s findProcessiveGroups ends'
            % (str(os.getpid()), memory_usage(), simNum, chrLong, sample))

    return (my_dict, mutations_loci_df)