import itertools
import json
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# reindexDataFrame, reorderDfByExperimentParameters, label_index, draw_borders,
# label_columns, label_headers, returnTimePointEndpoints,
# kineticFeatureDictionary, dataTypeObservableRangeDict, and
# includeLevelValues2 are assumed to be defined elsewhere in this package.


# NOTE: an extended version of this function, defined later in this module,
# shadows this definition at import time.
def draw_faceted_heatmap(data, indexingdf, xaxis, yaxis, zaxis, lognorm,
                         cbarticks, logarithmic, symlog, symlognorm, linthresh,
                         **kwargs):
    originalColumnOrder = list(
        pd.unique(indexingdf.index.get_level_values(xaxis)))
    unsortedPivotedData = data.pivot_table(index=yaxis,
                                           columns=xaxis,
                                           values=zaxis)
    indexdf = indexingdf.groupby(level=yaxis, sort=False).first()
    data = reindexDataFrame(unsortedPivotedData, indexdf, False)
    data = data[originalColumnOrder]
    data.columns.name = xaxis
    plt.axis('off')
    if logarithmic:
        g = sns.heatmap(data,
                        norm=lognorm,
                        **kwargs,
                        cbar=True,
                        cbar_kws={'ticks': cbarticks, 'label': zaxis})
    elif symlog:
        linthresh = int(linthresh)
        maxlog = int(np.ceil(np.log10(data.values.max())))
        minlog = int(np.ceil(np.log10(-1 * data.values.min())))
        # Generate symmetric logarithmic ticks: negative decades (reversed so
        # they ascend), zero, then positive decades
        tick_locations = ([-(10**x)
                           for x in range(-linthresh, minlog + 1)][::-1] +
                          [0.0] +
                          [10**x for x in range(-linthresh, maxlog + 1)])
        g = sns.heatmap(data,
                        norm=symlognorm,
                        cbar_kws={
                            'label': zaxis,
                            'ticks': tick_locations,
                            'format': ticker.LogFormatterMathtext()
                        },
                        **kwargs)
    else:
        g = sns.heatmap(data, **kwargs, cbar=True, cbar_kws={'label': zaxis})
    # Add hierarchical level names and borders to the heatmap
    ax1 = plt.gca()
    label_index(ax1, data)
    draw_borders(g, data)
    label_columns(ax1, data)
    label_headers(ax1, data)
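# The symlog branch above builds its colorbar ticks by hand: a run of negative
# decades (reversed so they ascend), a zero, then a run of positive decades.
# A minimal standalone sketch of that construction, using a hypothetical data
# range rather than a real heatmap:
def _example_symlog_ticks():
    linthresh, vmin, vmax = 1, -500.0, 2000.0
    maxlog = int(np.ceil(np.log10(vmax)))
    minlog = int(np.ceil(np.log10(-vmin)))
    ticks = ([-(10**x) for x in range(-linthresh, minlog + 1)][::-1] + [0.0] +
             [10**x for x in range(-linthresh, maxlog + 1)])
    print(ticks)
    # [-1000, -100, -10, -1, -0.1, 0.0, 0.1, 1, 10, 100, 1000, 10000]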
def mergeTimepointLabels(df, reindexingDf, timepointLabelMergingList):
    dfToReindex = df.unstack('Time')
    reindexedDf = reindexDataFrame(dfToReindex, reindexingDf, False)
    newcolumnsTuples = []
    for col in range(reindexedDf.shape[1]):
        names = list(reindexedDf.iloc[:, col].name)
        # Replace the exact timepoint (last element of the column tuple) with
        # the first 'lower-upper' range that contains it
        for timeRange in timepointLabelMergingList:
            lowerTimebound = float(timeRange.split('-')[0])
            upperTimebound = float(timeRange.split('-')[1])
            if lowerTimebound < float(names[-1]) <= upperTimebound:
                names[-1] = timeRange
                break
        newcolumnsTuples.append(names)
    newcolumns = pd.MultiIndex.from_tuples(
        newcolumnsTuples,
        names=['DataType', 'Population', 'Statistic', 'Feature', 'Time'])
    newdf = pd.DataFrame(reindexedDf.values,
                         index=reindexedDf.index,
                         columns=newcolumns)
    return newdf
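# The merging rule above assigns each exact timepoint to the first
# 'lower-upper' range that contains it (exclusive lower bound, inclusive
# upper bound). A minimal sketch of that rule with hypothetical labels:
def _example_timepoint_merging():
    timepointLabelMergingList = ['0-24', '24-48']
    time = 36.0
    for timeRange in timepointLabelMergingList:
        lower, upper = map(float, timeRange.split('-'))
        if lower < time <= upper:
            print(time, '->', timeRange)  # 36.0 -> 24-48
            break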
def createSubsettedDataFrame(dimensionDict, dataTypeDfDict, finalDimensions):
    # Subset index (samples)
    for dataType in dataTypeDfDict:
        tempdf = dataTypeDfDict[dataType].stack().to_frame('temp')
        k = 0
        subsettingList = []
        for i in range(len(tempdf.index.names)):
            if i < dataTypeObservableRangeDict[dataType] or tempdf.index.names[
                    i] in ['Event', 'event']:
                subsettingList.append(slice(None))
            else:
                # Keep only the level values whose boolean flag is True
                # (includeLevelValues2 is assumed to be a module-level list of
                # boolean masks, one per subsettable level)
                levelName = tempdf.index.names[i]
                levelValues = list(
                    pd.unique(tempdf.index.get_level_values(levelName)))
                levelValueBooleanList = includeLevelValues2[k]
                levelValueList = []
                for j in range(len(levelValues)):
                    if levelValueBooleanList[j]:
                        levelValueList.append(levelValues[j])
                subsettingList.append(levelValueList)
                k += 1
        newdf = tempdf.loc[tuple(subsettingList), :]
        # Undo the stacking performed at the beginning of the loop
        if dataType != 'singlecell':
            stackingVariable = 'Time'
        else:
            stackingVariable = 'Marker'
        newdf2 = newdf.unstack(stackingVariable).droplevel(axis=1, level=0)
        # Collapse the stacked level to recover the original (unsorted) row
        # index, then rebuild the unstacked frame on that index
        groupedIndexDf = newdf.groupby(newdf.index.names[:-1],
                                       sort=False).first()
        reindexingTemplateDf = pd.DataFrame(newdf2.values,
                                            index=groupedIndexDf.index,
                                            columns=newdf2.columns)
        reindexingTemplateDf.columns.name = stackingVariable
        if dataType != 'singlecell':
            newdf3 = reindexDataFrame(newdf2,
                                      reindexingTemplateDf,
                                      False,
                                      sortDataTypeLevels=False)
        else:
            newdf3 = newdf2.copy()
        dataTypeDfDict[dataType] = newdf3
    # dataType retains its value from the last loop iteration
    if dataType != 'singlecell':
        # Subset columns (measurables)
        postProcessingMatrices = []
        postProcessingFeatures = []
        allIndexList = []
        allIndexDictList = []
        # Iterate through datatypes
        for dataType in dataTypeDfDict:
            rowList = []
            # Iterate through each row in the datatype df; grab dimension names
            for row in range(dataTypeDfDict[dataType].shape[0]):
                names = dataTypeDfDict[dataType].iloc[row, :].name
                dimensionNames = names[:dataTypeObservableRangeDict[dataType]]
                rowList.append(dimensionNames)
            # Go through each level that was selected; add the selected level
            # values of each level
            selectedLevelList = []
            for i, level in enumerate(dataTypeDfDict[dataType].index.
                                      names[:len(dimensionDict[dataType])]):
                levelList = []
                for levelValue, includeLevelValue in zip(
                        list(
                            pd.unique(dataTypeDfDict[dataType].index.
                                      get_level_values(level))),
                        dimensionDict[dataType][i]):
                    if includeLevelValue:
                        levelList.append(levelValue)
                selectedLevelList.append(levelList)
            # Get all possible combinations of level values from each dimension
            allPossibleSelectedLevelCombinations = itertools.product(
                *selectedLevelList)
            rowindexlist = []
            # From the original dataframe, select all rows that appear in the
            # all-possible-combinations list
            for levelCombination in allPossibleSelectedLevelCombinations:
                if levelCombination in rowList and levelCombination in finalDimensions:
                    indices = [
                        i for i, x in enumerate(rowList)
                        if x == levelCombination
                    ]
                    rowindexlist += indices
            subsettedDf = dataTypeDfDict[dataType].iloc[rowindexlist, :]
            # Move measurables to columns
            postProcessingDf = subsettedDf.stack().unstack(
                dataTypeDfDict[dataType].index.
                names[:dataTypeObservableRangeDict[dataType]])
            indexList = []
            indexDict = {}
            for row in range(postProcessingDf.shape[0]):
                key = ','.join(
                    list(map(str, list(postProcessingDf.iloc[row, :].name))))
                indexDict[key] = row
                indexList.append(key)
            allIndexList.append(indexList)
            allIndexDictList.append(indexDict)
            postProcessingCommonIndex = postProcessingDf.index
            postProcessingFeatures.append(list(postProcessingDf.columns))
            postProcessingMatrices.append(postProcessingDf.values)
        # Keep only samples common to every datatype, preserving the row order
        # of the first datatype
        if len(dataTypeDfDict.keys()) > 1:
            result = list(set(allIndexList[0]).intersection(*allIndexList[1:]))
            reorderedResult = []
            for value in allIndexList[0]:
                if value in result:
                    reorderedResult.append(value)
            result = reorderedResult
        else:
            result = allIndexList[0]
        for i, (postProcessingMatrix, indexDict) in enumerate(
                zip(postProcessingMatrices, allIndexDictList)):
            rows = [indexDict[key] for key in result]
            postProcessingMatrices[i] = postProcessingMatrix[rows, :]
        fullPostProcessingMatrix = np.hstack(postProcessingMatrices)
        commonFeatures = [
            item for sublist in postProcessingFeatures for item in sublist
        ]
        fullPostProcessingDf = pd.DataFrame(fullPostProcessingMatrix,
                                            index=postProcessingCommonIndex,
                                            columns=commonFeatures)
    else:
        subsettingDimensions = []
        for dim in finalDimensions:
            subsettingDimensions.append(dim[0])
        fullPostProcessingDf = newdf3.loc[subsettingDimensions]
    return fullPostProcessingDf
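# The sample-subsetting step above keeps only the level values whose boolean
# flag is True, then enumerates every combination of the surviving values with
# itertools.product. A minimal sketch of that pattern, with hypothetical level
# values and inclusion masks mirroring the structure of dimensionDict:
def _example_level_subsetting():
    levelValues = [['A', 'B'], ['1', '2', '3']]
    inclusionMasks = [[True, False], [True, True, False]]
    selected = [[value for value, keep in zip(values, mask) if keep]
                for values, mask in zip(levelValues, inclusionMasks)]
    print(list(itertools.product(*selected)))
    # [('A', '1'), ('A', '2')]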
def draw_faceted_heatmap(data, indexingdf, xaxis, yaxis, zaxis, lognorm,
                         cbarticks, logarithmic, symlog, symlognorm, linthresh,
                         **kwargs):
    unsortedPivotedData = data.pivot_table(index=yaxis,
                                           columns=xaxis,
                                           values=zaxis)
    indexdf = indexingdf.groupby(level=yaxis, sort=False).first()
    data = reindexDataFrame(unsortedPivotedData, indexdf, False)
    # Restore the original column and row orderings (pivot_table sorts
    # lexicographically); purely numeric level values are sorted numerically
    if not isinstance(xaxis, list):
        originalColumnOrder = list(
            pd.unique(indexingdf.index.get_level_values(xaxis)))
        if str(originalColumnOrder[0]).isnumeric():
            originalColumnOrder.sort(key=float)
        data = data[originalColumnOrder]
    if not isinstance(yaxis, list):
        originalRowOrder = list(
            pd.unique(indexingdf.index.get_level_values(yaxis)))
        if str(originalRowOrder[0]).isnumeric():
            originalRowOrder.sort(key=float)
        data = data.reindex(originalRowOrder)
    plt.axis('off')
    # If an experimentParameters file exists, use it to reorder the dataframe
    experimentParametersBool = False
    for fn in os.listdir('misc'):
        if 'experimentParameters' in fn:
            experimentParametersBool = True
            with open('misc/' + fn, 'r') as f:
                experimentParameters = json.load(f)
    if experimentParametersBool:
        data = reorderDfByExperimentParameters(data, experimentParameters)
    data.columns.name = xaxis
    if logarithmic:
        g = sns.heatmap(data,
                        norm=lognorm,
                        **kwargs,
                        cbar=True,
                        cbar_kws={'ticks': cbarticks, 'label': zaxis})
    elif symlog:
        linthresh = int(linthresh)
        maxlog = int(np.ceil(np.log10(data.values.max())))
        minlog = int(np.ceil(np.log10(-1 * data.values.min())))
        # Generate symmetric logarithmic ticks (see _example_symlog_ticks)
        tick_locations = ([-(10**x)
                           for x in range(-linthresh, minlog + 1)][::-1] +
                          [0.0] +
                          [10**x for x in range(-linthresh, maxlog + 1)])
        g = sns.heatmap(data,
                        norm=symlognorm,
                        cbar_kws={
                            'label': zaxis,
                            'ticks': tick_locations,
                            'format': ticker.LogFormatterMathtext()
                        },
                        **kwargs)
    else:
        g = sns.heatmap(data, **kwargs, cbar=True, cbar_kws={'label': zaxis})
    # Add hierarchical level names and borders to the heatmap
    ax1 = plt.gca()
    label_index(ax1, data)
    draw_borders(g, data)
    label_columns(ax1, data)
    label_headers(ax1, data)
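# draw_faceted_heatmap leaves norm construction to the caller: lognorm drives
# the logarithmic branch, while symlognorm plus linthresh drive the symlog
# branch. A hypothetical call sketch (tidyDf and indexedDf are placeholder
# names, and the axis/value labels are illustrative only):
def _example_heatmap_call():
    from matplotlib.colors import SymLogNorm
    # linthresh here must match the linthresh argument passed below
    symlognorm = SymLogNorm(linthresh=1, vmin=-1000, vmax=1000)
    # draw_faceted_heatmap(tidyDf, indexedDf, xaxis='Time',
    #                      yaxis='Concentration', zaxis='GFI', lognorm=None,
    #                      cbarticks=None, logarithmic=False, symlog=True,
    #                      symlognorm=symlognorm, linthresh=1, cmap='viridis')
    return symlognorm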
def returnFeatureDataStatisticList(inputStatisticDf, dataType,
                                   minTimePointScaleFactor):
    featureStatisticDfList = []
    statisticList = list(
        pd.unique(inputStatisticDf.index.get_level_values('Statistic')))
    for statistic in statisticList:
        # Make statistic-sliced dataframes (only one dummy statistic for
        # cytokines/proliferation; many statistics (GFI, CV, % Positive, etc.)
        # for cells)
        featureStatisticDf = inputStatisticDf.xs([statistic],
                                                 level=['Statistic'])
        if dataType == 'cyt':
            observableName = 'Cytokine'
        elif dataType == 'cell':
            observableName = 'Marker'
        else:
            observableName = 'Metric'
        # Add individual timepoints of each observable in the datatype as
        # features, primarily to compare their deconvolution against the best
        # kinetic features (to make the point that we need the time aspect of
        # the data to get good deconvolution)
        # Unstack the observable (move it to columns; columns now hold
        # time-observable pairs)
        individualTimepointDfToReindex = featureStatisticDf.unstack(
            observableName)
        # Grab a dataframe containing the first observable from the statisticDf
        reindexingDf = featureStatisticDf.xs([
            list(
                pd.unique(
                    featureStatisticDf.index.get_level_values(observableName)))
            [0]
        ],
                                             level=[observableName])
        # Unstacking automatically sorts the dataframe in lexicographic order;
        # use reindexDataFrame to recover the original ordering of the index
        individualTimepointDfBeforeNewColumns = reindexDataFrame(
            individualTimepointDfToReindex, reindexingDf, False)
        # Make a new column index for the individual-timepoint df: record each
        # timepoint as both the time-slice start and end, and tag the feature
        # type as 'IndividualObservation' (allows for easy subsetting later on)
        timeslicelist = []
        for timepoint in pd.unique(
                individualTimepointDfBeforeNewColumns.columns.get_level_values(
                    'Time')):
            currentTimeDf = individualTimepointDfBeforeNewColumns.loc[:,
                                                                      timepoint]
            for observable in currentTimeDf:
                timeslicelist.append(
                    [timepoint, timepoint, 'IndividualObservation', observable])
        # Construct the new column multiindex and dataframe from that list
        newMultiIndexColumns = pd.MultiIndex.from_tuples(
            timeslicelist,
            names=['TimeSliceStart', 'TimeSliceEnd', 'FeatureType',
                   'Observable'])
        individualTimepointDf = pd.DataFrame(
            individualTimepointDfBeforeNewColumns.values,
            index=individualTimepointDfBeforeNewColumns.index,
            columns=newMultiIndexColumns)
        # Grab all "timepoint regions" possible for the timeseries (5-20 hours,
        # 5-25 hours, etc.) and start iterating through them
        timepointRegions = returnTimePointEndpoints(featureStatisticDf,
                                                    minTimePointScaleFactor)
        timepointRegionDfList = []
        for timepointRegion in timepointRegions:
            timeStart = timepointRegion[0]
            timeEnd = timepointRegion[1]
            timeStartIndex = list(featureStatisticDf.columns).index(timeStart)
            timeEndIndex = list(featureStatisticDf.columns).index(timeEnd)
            # Get all observables for this datatype and statistic (doesn't
            # change in cyt/prolif, but does change per statistic in cells)
            observableList = list(
                pd.unique(
                    featureStatisticDf.index.get_level_values(observableName)))
            # Slice the time-kinetics data into the specified region
            df = featureStatisticDf.iloc[:, timeStartIndex:timeEndIndex + 1]
            # Calculate kinetic features from the kinetic feature dictionary,
            # adding the returned dataframes to a list
            kineticFeatureList = []
            for kineticFeature in kineticFeatureDictionary:
                kineticFeatureDf = kineticFeatureDictionary[kineticFeature](
                    df, observableList)
                kineticFeatureList.append(kineticFeatureDf)
            # Combine all kinetic features for all observables for a particular
            # time slice into a single dataframe
            timepointRegionFeatureDf = pd.concat(kineticFeatureList, axis=1)
            timepointRegionDfList.append(timepointRegionFeatureDf)
            print('\t\t\t' + str(timeStart) + 'hrs-' + str(timeEnd) +
                  'hrs done!')
        # All feature dataframes for all observables in a statistic get
        # concatenated into a single dataframe
        featureStatisticDf = pd.concat(timepointRegionDfList,
                                       axis=1,
                                       keys=timepointRegions,
                                       names=['TimeSliceStart',
                                              'TimeSliceEnd'])
        # The kinetic feature values and the individual-timepoint df
        # constructed earlier are joined columnwise to produce the final
        # feature df for the statistic
        featureStatisticDfWithIndividualTimepoints = pd.concat(
            [featureStatisticDf, individualTimepointDf], axis=1)
        featureStatisticDfList.append(
            featureStatisticDfWithIndividualTimepoints)
        print('\t\t' + str(statistic) + ' done!')
    return featureStatisticDfList
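# The TimeSliceStart/TimeSliceEnd column levels above come from the keys
# argument of pd.concat: each time region's (start, end) tuple is prepended to
# that region's feature columns. A minimal sketch with hypothetical
# single-feature frames for two regions:
def _example_keyed_concat():
    region1 = pd.DataFrame({'Max': [1.0]}, index=['sampleA'])
    region2 = pd.DataFrame({'Max': [2.0]}, index=['sampleA'])
    combined = pd.concat([region1, region2],
                         axis=1,
                         keys=[(5, 20), (5, 25)],
                         names=['TimeSliceStart', 'TimeSliceEnd'])
    print(combined.columns.tolist())
    # [(5, 20, 'Max'), (5, 25, 'Max')]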