Example 1
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

#NOTE: reindexDataFrame, label_index, draw_borders, label_columns,
#label_headers, dataTypeObservableRangeDict, and includeLevelValues2 are
#assumed to be defined elsewhere in this module


def draw_faceted_heatmap(data, indexingdf, xaxis, yaxis, zaxis, lognorm,
                         cbarticks, logarithmic, symlog, symlognorm, linthresh,
                         **kwargs):
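    """Pivot `data` on xaxis/yaxis and draw a seaborn heatmap of the `zaxis`
    values, optionally with log (`logarithmic`) or symmetric-log (`symlog`)
    color normalization; remaining **kwargs are passed to sns.heatmap."""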
    originalColumnOrder = list(
        pd.unique(indexingdf.index.get_level_values(xaxis)))
    unsortedPivotedData = data.pivot_table(index=yaxis,
                                           columns=xaxis,
                                           values=zaxis)
    indexdf = indexingdf.groupby(level=yaxis, sort=False).first()
    data = reindexDataFrame(unsortedPivotedData, indexdf, False)
    data = data[originalColumnOrder]
    data.columns.name = xaxis
    plt.axis('off')
    if logarithmic:
        g = sns.heatmap(data,
                        norm=lognorm,
                        **kwargs,
                        cbar=True,
                        cbar_kws={
                            "ticks": cbarticks,
                            'label': zaxis
                        })
    elif symlog:
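        #Compute decade bounds for the symmetric-log tick marks; assumes the
        #data contains both positive and negative values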
        linthresh = int(linthresh)
        maxlog = int(np.ceil(np.log10(data.values.max())))
        minlog = int(np.ceil(np.log10(-1 * data.values.min())))
        tick_locations = ([-(10**x)
                           for x in range(-linthresh, minlog + 1, 1)][::-1] +
                          [0.0] + [(10**x)
                                   for x in range(-linthresh, maxlog + 1, 1)])
        #generate logarithmic ticks
        g = sns.heatmap(data,
                        norm=symlognorm,
                        cbar_kws={
                            'label': zaxis,
                            'ticks': tick_locations,
                            'format': ticker.LogFormatterMathtext()
                        },
                        **kwargs)
    else:
        g = sns.heatmap(data, **kwargs, cbar=True, cbar_kws={'label': zaxis})

    #Add hierarchical level names and borders to heatmap
    ax1 = plt.gca()
    label_index(ax1, data)
    draw_borders(g, data)
    label_columns(ax1, data)

    label_headers(ax1, data)
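

#A minimal usage sketch (hypothetical dataframes and parameter values; cmap is
#forwarded to sns.heatmap through **kwargs):
#    import matplotlib.colors as mcolors
#    draw_faceted_heatmap(tidyDf, hierarchicalDf, 'Time', 'Condition', 'MFI',
#                         lognorm=mcolors.LogNorm(), cbarticks=[1, 10, 100],
#                         logarithmic=True, symlog=False, symlognorm=None,
#                         linthresh=1, cmap='viridis')
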
def mergeTimepointLabels(df, reindexingDf, timepointLabelMergingList):
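    """Relabel the 'Time' column level of df by mapping each timepoint into
    the first matching 'low-high' range string in timepointLabelMergingList."""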

    dfToReindex = df.unstack('Time')
    reindexedDf = reindexDataFrame(dfToReindex, reindexingDf, False)

    newcolumnsTuples = []
    for col in range(reindexedDf.shape[1]):
        names = list(reindexedDf.iloc[:, col].name)
        for timeRange in timepointLabelMergingList:
            lowerTimebound = float(timeRange.split('-')[0])
            upperTimebound = float(timeRange.split('-')[1])
            if float(names[-1]) > lowerTimebound and float(
                    names[-1]) <= upperTimebound:
                names[-1] = timeRange
                break
        newcolumnsTuples.append(names)
    newcolumns = pd.MultiIndex.from_tuples(
        newcolumnsTuples,
        names=['DataType', 'Population', 'Statistic', 'Feature', 'Time'])
    newdf = pd.DataFrame(reindexedDf.values,
                         index=reindexedDf.index,
                         columns=newcolumns)
    return newdf
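

#Hypothetical example: each 'low-high' string captures timepoints t with
#low < t <= high, so hourly timepoints can be merged into coarse bins:
#    mergedDf = mergeTimepointLabels(df, reindexingDf, ['0-24', '24-48', '48-72'])
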
def createSubsettedDataFrame(dimensionDict, dataTypeDfDict, finalDimensions):
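    """Subset the samples (index) and measurables (columns) of each dataframe
    in dataTypeDfDict using the supplied selection masks, then combine the
    rows shared across data types into a single post-processing dataframe."""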
    dimensionReductionMatrixList = []

    #Subset index (samples)
    for dataType in dataTypeDfDict:
        tempdf = dataTypeDfDict[dataType].stack().to_frame('temp')
        k = 0
        subsettingList = []
        for i in range(len(tempdf.index.names)):
            if i < dataTypeObservableRangeDict[dataType] or tempdf.index.names[
                    i] in ['Event', 'event']:
                subsettingList.append(slice(None))
            else:
                levelName = tempdf.index.names[i]
                levelValues = list(
                    pd.unique(tempdf.index.get_level_values(levelName)))
                levelValueBooleanList = includeLevelValues2[k]
                levelValueList = []
                for j in range(len(levelValues)):
                    if levelValueBooleanList[j]:
                        levelValueList.append(levelValues[j])
                subsettingList.append(levelValueList)
                k += 1

        newdf = tempdf.loc[tuple(subsettingList), :]
        #Undo the stacking performed at the beginning of the loop
        if dataType != 'singlecell':
            stackingVariable = 'Time'
        else:
            stackingVariable = 'Marker'
        newdf2 = newdf.unstack(stackingVariable).droplevel(axis=1, level=0)
        groupedDf = newdf.groupby(newdf.index.names[:-1], sort=False).first()
        unstackedDf = pd.DataFrame(newdf2.values,
                                   index=groupedDf.index,
                                   columns=newdf2.columns)
        unstackedDf.columns.name = stackingVariable
        if dataType != 'singlecell':
            newdf3 = reindexDataFrame(newdf2,
                                      unstackedDf,
                                      False,
                                      sortDataTypeLevels=False)
        else:
            newdf3 = newdf2.copy()
        dataTypeDfDict[dataType] = newdf3

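    #Note: dataType still holds the last key processed in the loop above, so
    #the branch below is chosen by the last-processed data type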
    if dataType != 'singlecell':
        #Subset columns (measurables)
        postProcessingMatrices = []
        postProcessingFeatures = []
        allIndexList = []
        allIndexDictList = []
        #Iterate through datatypes
        for dataType in dataTypeDfDict:
            rowList = []
            #Iterate through each row in datatype df; grab dimension names
            for row in range(dataTypeDfDict[dataType].shape[0]):
                names = dataTypeDfDict[dataType].iloc[row, :].name
                dimensionNames = names[:dataTypeObservableRangeDict[dataType]]
                rowList.append(dimensionNames)
            #Go through each level that was selected, add level values of each level
            selectedLevelList = []
            for i, level in enumerate(dataTypeDfDict[dataType].index.
                                      names[:len(dimensionDict[dataType])]):
                levelList = []
                for levelValue, includeLevelValue in zip(
                        list(
                            pd.unique(dataTypeDfDict[dataType].index.
                                      get_level_values(level))),
                        dimensionDict[dataType][i]):
                    if includeLevelValue:
                        levelList.append(levelValue)
                selectedLevelList.append(levelList)

            #Get all possible combinations of level values from each dimension
            allPossibleSelectedLevelCombinations = itertools.product(
                *selectedLevelList)
            rowindexlist = []
            #From the original dataframe, select all rows that appear in the
            #list of possible level combinations
            for levelCombination in allPossibleSelectedLevelCombinations:
                if levelCombination in rowList and levelCombination in finalDimensions:
                    indices = [
                        i for i, x in enumerate(rowList)
                        if x == levelCombination
                    ]
                    rowindexlist += indices
            subsettedDf = dataTypeDfDict[dataType].iloc[rowindexlist, :]
            #Move measurables to columns
            postProcessingDf = subsettedDf.stack().unstack(
                dataTypeDfDict[dataType].index.
                names[:dataTypeObservableRangeDict[dataType]])
            indexList = []
            indexDict = {}
            for row in range(postProcessingDf.shape[0]):
                key = ','.join(
                    list(map(str, list(postProcessingDf.iloc[row, :].name))))
                indexDict[key] = row
                indexList.append(key)
            allIndexList.append(indexList)
            allIndexDictList.append(indexDict)
            postProcessingCommonIndex = postProcessingDf.index
            postProcessingFeatures.append(list(postProcessingDf.columns))
            postProcessingMatrices.append(postProcessingDf.values)
        if len(dataTypeDfDict.keys()) > 1:
            result = list(set(allIndexList[0]).intersection(*allIndexList[1:]))
            reorderedResult = []
            for value in allIndexList[0]:
                if value in result:
                    reorderedResult.append(value)
            result = reorderedResult
        else:
            result = allIndexList[0]
        for i, (postProcessingMatrix, indexDict) in enumerate(
                zip(postProcessingMatrices, allIndexDictList)):
            rows = []
            for key in result:
                rows.append(indexDict[key])
            postProcessingMatrices[i] = postProcessingMatrix[rows, :]
        fullPostProcessingMatrix = np.hstack(postProcessingMatrices)
        commonFeatures = [
            item for sublist in postProcessingFeatures for item in sublist
        ]
        fullPostProcessingDf = pd.DataFrame(fullPostProcessingMatrix,
                                            index=postProcessingCommonIndex,
                                            columns=commonFeatures)
    else:
        subsettingDimensions = []
        for dim in finalDimensions:
            subsettingDimensions.append(dim[0])
        fullPostProcessingDf = newdf3.loc[subsettingDimensions]
    return fullPostProcessingDf
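

#Hypothetical call shape (dataTypeObservableRangeDict and includeLevelValues2
#must already be defined at module scope; finalDimensions lists the observable
#tuples to keep):
#    subsetDf = createSubsettedDataFrame({'cyt': [[True, False, True]]},
#                                        {'cyt': cytokineDf},
#                                        [('IFNg',), ('IL-2',)])
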
Example 4
import os
import json

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

#NOTE: reindexDataFrame, reorderDfByExperimentParameters, the label_*/
#draw_borders helpers, returnTimePointEndpoints, and kineticFeatureDictionary
#are assumed to be defined elsewhere in this module


def draw_faceted_heatmap(data, indexingdf, xaxis, yaxis, zaxis, lognorm,
                         cbarticks, logarithmic, symlog, symlognorm, linthresh,
                         **kwargs):
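    """Revised version of draw_faceted_heatmap from Example 1: numeric axis
    labels are additionally sorted, and if an experimentParameters file exists
    under misc/ the dataframe is reordered to match the recorded layout."""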
    unsortedPivotedData = data.pivot_table(index=yaxis,
                                           columns=xaxis,
                                           values=zaxis)
    indexdf = indexingdf.groupby(level=yaxis, sort=False).first()
    data = reindexDataFrame(unsortedPivotedData, indexdf, False)
    if not isinstance(xaxis, list):
        originalColumnOrder = list(
            pd.unique(indexingdf.index.get_level_values(xaxis)))
        if str(originalColumnOrder[0]).isnumeric():
            originalColumnOrder.sort(key=float)
        data = data[originalColumnOrder]
    if not isinstance(yaxis, list):
        originalRowOrder = list(
            pd.unique(indexingdf.index.get_level_values(yaxis)))
        if str(originalRowOrder[0]).isnumeric():
            originalRowOrder.sort(key=float)
        data = data.reindex(originalRowOrder)
    plt.axis('off')
    experimentParametersBool = False
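    #Scan misc/ for an experimentParameters JSON file; if one is found, use it
    #to restore the experiment's original level ordering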
    for fn in os.listdir('misc'):
        if 'experimentParameters' in fn:
            experimentParametersBool = True
            with open('misc/' + fn, 'r') as f:
                experimentParameters = json.load(f)
    if experimentParametersBool:
        data = reorderDfByExperimentParameters(data, experimentParameters)

    data.columns.name = xaxis
    if logarithmic:
        g = sns.heatmap(data,
                        norm=lognorm,
                        **kwargs,
                        cbar=True,
                        cbar_kws={
                            "ticks": cbarticks,
                            'label': zaxis
                        })
    elif symlog:
        linthresh = int(linthresh)
        maxlog = int(np.ceil(np.log10(data.values.max())))
        minlog = int(np.ceil(np.log10(-1 * data.values.min())))
        tick_locations = ([-(10**x)
                           for x in range(-linthresh, minlog + 1, 1)][::-1] +
                          [0.0] + [(10**x)
                                   for x in range(-linthresh, maxlog + 1, 1)])
        #generate logarithmic ticks
        g = sns.heatmap(data,
                        norm=symlognorm,
                        cbar_kws={
                            'label': zaxis,
                            'ticks': tick_locations,
                            'format': ticker.LogFormatterMathtext()
                        },
                        **kwargs)
    else:
        g = sns.heatmap(data, **kwargs, cbar=True, cbar_kws={'label': zaxis})

    #Add hierarchical level names and borders to heatmap
    ax1 = plt.gca()
    label_index(ax1, data)
    draw_borders(g, data)
    label_columns(ax1, data)

    label_headers(ax1, data)


def returnFeatureDataStatisticList(inputStatisticDf, dataType,
                                   minTimePointScaleFactor):
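    """For each statistic in inputStatisticDf, combine kinetic features
    computed over every time slice with the raw per-timepoint observations,
    returning one feature dataframe per statistic."""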
    featureStatisticDfList = []
    statisticList = list(
        pd.unique(inputStatisticDf.index.get_level_values('Statistic')))
    for statistic in statisticList:
        #Make statistic-sliced dataframes (only one dummy statistic for cytokines/proliferation; many statistics (GFI, CV, % Positive etc.) for cells)
        featureStatisticDf = inputStatisticDf.xs([statistic],
                                                 level=['Statistic'])
        if dataType == 'cyt':
            observableName = 'Cytokine'
        elif dataType == 'cell':
            observableName = 'Marker'
        else:
            observableName = 'Metric'

        ###Add individual timepoints of each observable in the datatype as features,
        #primarily to compare their deconvolution against the best kinetic features
        #(to make the point that the time aspect of the data is needed for good deconvolution)###
        #Unstack Observable (move observable to columns; columns now have time-observable)
        individualTimepointDfToReindex = featureStatisticDf.unstack(
            observableName)
        #Grab a dataframe containing the first observable from the statisticDf
        firstObservable = list(
            pd.unique(
                featureStatisticDf.index.get_level_values(observableName)))[0]
        reindexingDf = featureStatisticDf.xs([firstObservable],
                                             level=[observableName])
        #Unstacking automatically sorts the dataframe in lexicographic order. We use this method to recover the original ordering of the index
        individualTimepointDfBeforeNewColumns = reindexDataFrame(
            individualTimepointDfToReindex, reindexingDf, False)
        #Make new column index for individualtimepoint df
        newDfList = []
        timeslicelist = []
        #Go through each timepoint and observable, constructing a list with the timepoint as both TimeSliceStart and TimeSliceEnd,
        #the observable, and 'IndividualObservation' as the feature type (allows for easy subsetting later on)
        for timepoint in pd.unique(
                individualTimepointDfBeforeNewColumns.columns.get_level_values(
                    'Time')):
            currentTimeDf = individualTimepointDfBeforeNewColumns.loc[:, timepoint]
            for observable in currentTimeDf:
                timeslicelist.append([
                    timepoint, timepoint, 'IndividualObservation', observable
                ])
        #Construct new column multindex and dataframe with previously constructed list
        newMultiIndexColumns = pd.MultiIndex.from_tuples(timeslicelist,
                                                         names=[
                                                             'TimeSliceStart',
                                                             'TimeSliceEnd',
                                                             'FeatureType',
                                                             'Observable'
                                                         ])
        individualTimepointDf = pd.DataFrame(
            individualTimepointDfBeforeNewColumns.values,
            index=individualTimepointDfBeforeNewColumns.index,
            columns=newMultiIndexColumns)

        #Grab all "timepoint regions" possible for the timeseries (5-20 hours, 5-25 hours etc.) and start iterating through them
        timepointRegions = returnTimePointEndpoints(featureStatisticDf,
                                                    minTimePointScaleFactor)
        timepointRegionDfList = []
        for timepointRegion in timepointRegions:
            timeStart = timepointRegion[0]
            timeEnd = timepointRegion[1]
            timeStartIndex = list(featureStatisticDf.columns).index(timeStart)
            timeEndIndex = list(featureStatisticDf.columns).index(timeEnd)

            #Get all observables for this datatype and statistic (doesn't change in cyt/prolif, but does change per statistic in cells)
            observableList = list(
                pd.unique(
                    featureStatisticDf.index.get_level_values(observableName)))
            #Slice the time kinetics data into specified region
            df = featureStatisticDf.iloc[:, timeStartIndex:timeEndIndex + 1]
            #Start calculating kinetic features from kinetic feature dictionary, and start adding the returned dataframes to a list
            kineticFeatureList = []
            for kineticFeature in kineticFeatureDictionary:
                kineticFeatureDf = kineticFeatureDictionary[kineticFeature](
                    df, observableList)
                kineticFeatureList.append(kineticFeatureDf)
            #Combine all kinetic features for all observables for a particular time slice into single dataframe
            timepointRegionFeatureDf = pd.concat(kineticFeatureList, axis=1)
            timepointRegionDfList.append(timepointRegionFeatureDf)
            print('\t\t\t' + str(timeStart) + 'hrs-' + str(timeEnd) +
                  'hrs done!')
        #all feature dataframes for all observables in a statistic get concatenated into a single dataframe
        featureStatisticDf = pd.concat(
            timepointRegionDfList,
            axis=1,
            keys=timepointRegions,
            names=['TimeSliceStart', 'TimeSliceEnd'])
        #The kinetic feature values and the individual timepoint df constructed earlier are joined columnwise to produce the final feature df for the statistic
        featureStatisticDfWithIndividualTimepoints = pd.concat(
            [featureStatisticDf, individualTimepointDf], axis=1)
        featureStatisticDfList.append(
            featureStatisticDfWithIndividualTimepoints)
        print('\t\t' + str(statistic) + ' done!')
    return featureStatisticDfList
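
#Hypothetical usage, assuming kineticFeatureDictionary and
#returnTimePointEndpoints are defined at module scope:
#    featureDfs = returnFeatureDataStatisticList(cytokineStatisticDf, 'cyt',
#                                                minTimePointScaleFactor=2)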