def main(args=None):
    args = process_args(args)

    # if more than one bed file is given, they are concatenated into one file.
    if len(args.regionsFileName) > 1:
        bed_file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        for bed in args.regionsFileName:
            bed.close()
            # concatenate all intermediate tempfiles into one
            shutil.copyfileobj(open(bed.name, 'U'), bed_file)
            # append hash and label based on the file name
            label = os.path.basename(bed.name)
            if label.endswith(".bed"):
                label = label[:-4]
            bed_file.write("# {}\n".format(label))
        bed_file.seek(0)
    else:
        bed_file = args.regionsFileName[0]

    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing
                  }

    hm = heatmapper.heatmapper()

    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, bed_file, parameters, verbose=args.verbose)
    if args.sortRegions != 'no':
        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions)

    hm.save_matrix(args.outFileName)
    bed_file.close()

    if len(args.regionsFileName) > 1:
        os.remove(bed_file.name)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    # TODO This isn't implemented
    # if args.outFileNameData:
    #     hm.saveTabulatedValues(args.outFileNameData)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

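# The "# <label>" line written above marks the end of each group of regions in
# the concatenated BED file, with the label derived from the file name. A
# standalone sketch of that derivation (the path below is invented):
import os

def bed_group_label(path):
    label = os.path.basename(path)
    if label.endswith(".bed"):
        label = label[:-4]
    return label

assert bed_group_label("/tmp/housekeeping_genes.bed") == "housekeeping_genes"
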
def __compute_matrix(regions, bigwigs, configfile, parameters, refIndex=None):
    """
    Compute the corresponding matrix using deeptools/computeMatrix
    """
    hm = heatmapper()
    if refIndex:
        bigwigs = [bigwigs[int(i) - 1] for i in refIndex]
    index = 1 if not refIndex else 0
    matrix_args = argparse.Namespace()
    matrix_args.transcriptID = configfile['transcriptID'][index]
    matrix_args.exonID = configfile['exonID'][index]
    matrix_args.transcript_id_designator = configfile['transcript_id_designator'][index]
    matrix_args.samplesLabel = configfile['samplesLabel']
    hm.computeMatrix(score_file_list=bigwigs,
                     regions_file=regions,
                     parameters=parameters,
                     blackListFileName=configfile["blackListFileName"][index],
                     verbose=configfile["verbose"][index],
                     allArgs=matrix_args)
    return hm

def main():
    """
    If an annotation file is provided, the closest genes are found and a
    deeptools-like matrix is created; otherwise the matrix is built directly
    from the provided name-based files of enriched regions. In either case
    the output matrix is ordered and appended to the input deeptools matrix.
    """
    parser = parse_args()
    args = parser.parse_args()
    # Check if the feature names are consistent between all the tables
    __read_tables_columns(args.tables, args.Features)
    hm = heatmapper()
    hm.read_matrix_file(args.deeptoolsMatrix)
    regions = parseMatrixRegions(hm.matrix.get_regions())
    if args.annotation:
        # Use bedtools closest to map annotations to regions
        closestMapping = find_closest_genes(regions, args.annotation,
                                            args.annotationFeature,
                                            args.annotationOutput,
                                            args.referencePoint,
                                            args.closestGenesOutput)
        # XXX instead of all these arguments I can simply pass args.
        # paste an extra column per table to the input matrix
        extract_ge_folchange_per_peak(regions, args.tables, closestMapping,
                                      args.Features, args.idcolumn, hm)
    else:
        # No closest gene is involved in this case; each enrichment id is
        # checked individually and the values are updated.
        update_matrix_values(regions, args.tables, args.Features,
                             args.idcolumn, hm)
    # save the joint matrix obtained in either case
    hm.save_matrix(args.outputMatrix)

def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file)

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')
    else:
        if args.hclust is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large datasets.\n")
            hm.matrix.hmcluster(args.hclust, method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / float(len(hm.matrix.regions))
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, "
                         "you might want to remove it.\n".format(hm.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.outFileNameData:
        hm.save_tabulated_values(args.outFileNameData,
                                 reference_point_label=args.refPointLabel,
                                 start_label=args.startLabel,
                                 end_label=args.endLabel,
                                 averagetype=args.averageType)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    prof = Profile(hm, args.outFileName,
                   plot_title=args.plotTitle,
                   y_axis_label=args.yAxisLabel,
                   y_min=args.yMin, y_max=args.yMax,
                   averagetype=args.averageType,
                   reference_point_label=args.refPointLabel,
                   start_label=args.startLabel,
                   end_label=args.endLabel,
                   plot_height=args.plotHeight,
                   plot_width=args.plotWidth,
                   per_group=args.perGroup,
                   plot_type=args.plotType,
                   image_format=args.plotFileFormat,
                   color_list=args.colors,
                   legend_location=args.legendLocation,
                   plots_per_row=args.numPlotsPerRow,
                   dpi=args.dpi)

    if args.plotType == 'heatmap':
        prof.plot_heatmap()
    elif args.plotType == 'overlapped_lines':
        prof.plot_hexbin()
    else:
        prof.plot_profile()

def main(args=None):
    if len(sys.argv) == 1:
        args = ["-h"]
    if len(sys.argv) == 2:
        args = [sys.argv[1], "-h"]
    args = parse_arguments().parse_args(args)

    hm = heatmapper.heatmapper()
    if not isinstance(args.matrixFile, list):
        hm.read_matrix_file(args.matrixFile)

    if args.command == 'info':
        printInfo(hm)
    elif args.command == 'subset':
        sIdx = getSampleBounds(args, hm)
        gIdx, gBounds = getGroupBounds(args, hm)

        # groups
        hm.matrix.regions = subsetRegions(hm, gIdx)
        # matrix
        hm.matrix.matrix = hm.matrix.matrix[gIdx, :]
        hm.matrix.matrix = hm.matrix.matrix[:, sIdx]
        # boundaries
        if args.samples is None:
            args.samples = hm.matrix.sample_labels
        hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1]
        hm.matrix.group_boundaries = gBounds.tolist()
        # special params
        keepIdx = set()
        for i, sample in enumerate(hm.matrix.sample_labels):
            if sample in args.samples:
                keepIdx.add(i)
        for param in hm.special_params:
            hm.parameters[param] = [v for k, v in enumerate(hm.parameters[param]) if k in keepIdx]
        # labels
        hm.matrix.sample_labels = args.samples
        if args.groups is None:
            args.groups = hm.matrix.group_labels
        hm.matrix.group_labels = args.groups
        # save
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterStrand':
        filterHeatmap(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'rbind':
        rbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'cbind':
        cbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'sort':
        sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)
        hm.save_matrix(args.outFileName)
    else:
        sys.exit("Unknown command {0}!\n".format(args.command))

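# How sample_boundaries map samples to column blocks in the matrix that the
# 'subset' command slices above: each consecutive pair of boundaries delimits
# one sample's bins. The numbers and labels below are invented for
# illustration only.
sample_boundaries = [0, 100, 200, 300]    # three samples, 100 bins each
sample_labels = ['input', 'ChIP_rep1', 'ChIP_rep2']
for label, s, e in zip(sample_labels, sample_boundaries, sample_boundaries[1:]):
    print('{} occupies matrix columns {}..{}'.format(label, s, e - 1))
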
def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file)

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')
    else:
        if args.hclust is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large datasets.\n")
            hm.matrix.hmcluster(args.hclust, method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, "
                         "you might want to remove it. "
                         "There will likely be an error message from matplotlib "
                         "regarding this below.\n".format(hm.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.sortRegions != 'no':
        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    colormap_dict = {'colorMap': args.colorMap,
                     'colorList': args.colorList,
                     'colorNumber': args.colorNumber,
                     'missingDataColor': args.missingDataColor}

    plotMatrix(hm,
               args.outFileName,
               colormap_dict,
               args.plotTitle,
               args.xAxisLabel, args.yAxisLabel,
               args.regionsLabel,
               args.zMin, args.zMax,
               args.yMin, args.yMax,
               args.averageTypeSummaryPlot,
               args.refPointLabel,
               args.startLabel,
               args.endLabel,
               args.heatmapHeight,
               args.heatmapWidth,
               args.perGroup,
               args.whatToShow,
               image_format=args.plotFileFormat,
               legend_location=args.legendLocation)

def zMn_set(f):
    # initialize before the test so the function always returns a value
    zMn = 0
    if zMinTest:
        content = heatmapper.heatmapper()
        content.readMatrixFile(f)
        matrixFlatten = flattenMatrix(content.matrixDict)
        zMn = np.percentile(matrixFlatten, 1.0)
    return f, zMn

def cbindMatrices(hm, args):
    """
    Bind columns from different matrices according to the group and region names

    Missing regions are left as NA
    """
    hm2 = heatmapper.heatmapper()

    # Make a dict of region name:row associations
    hm.read_matrix_file(args.matrixFile[0])
    d = dict({x: dict() for x in hm.parameters["group_labels"]})
    for idx, group in enumerate(hm.parameters["group_labels"]):
        s = hm.parameters["group_boundaries"][idx]
        e = hm.parameters["group_boundaries"][idx + 1]
        for idx2, reg in enumerate(hm.matrix.regions[s:e]):
            d[group][reg[2]] = idx2 + s

    # Iterate through the other matrices
    for idx in range(1, len(args.matrixFile)):
        hm2.read_matrix_file(args.matrixFile[idx])

        # Add the sample labels
        hm.parameters['sample_labels'].extend(hm2.parameters['sample_labels'])
        # Add the sample boundaries
        lens = [x + hm.parameters['sample_boundaries'][-1] for x in hm2.parameters['sample_boundaries']][1:]
        hm.parameters['sample_boundaries'].extend(lens)

        # Add on additional NA initialized columns
        ncol = hm.matrix.matrix.shape[1]
        hm.matrix.matrix = np.hstack((hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape)))
        hm.matrix.matrix[:, ncol:] = np.nan

        # Update the values
        for idx2, group in enumerate(hm2.parameters["group_labels"]):
            if group not in d:
                continue
            s = hm2.parameters["group_boundaries"][idx2]
            e = hm2.parameters["group_boundaries"][idx2 + 1]
            for idx3, reg in enumerate(hm2.matrix.regions[s:e]):
                if reg[2] not in d[group]:
                    continue
                hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[s + idx3, :]

        # Append the special params
        for s in hm.special_params:
            hm.parameters[s].extend(hm2.parameters[s])

    # Update the sample parameters
    hm.matrix.sample_labels = hm.parameters['sample_labels']
    hm.matrix.sample_boundaries = hm.parameters['sample_boundaries']

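# A minimal, self-contained sketch of the NA-padding pattern cbindMatrices
# uses above: append NaN-initialized columns, then fill rows only for regions
# present in both matrices. The arrays and the row mapping are invented for
# illustration.
import numpy as np

m1 = np.arange(6, dtype=float).reshape(3, 2)   # existing matrix: 3 regions x 2 bins
m2 = np.arange(4, dtype=float).reshape(2, 2)   # incoming matrix: 2 regions x 2 bins

ncol = m1.shape[1]
m1 = np.hstack((m1, np.full((m1.shape[0], m2.shape[1]), np.nan)))
for dest, src in [(0, 0), (2, 1)]:             # only regions 0 and 2 match
    m1[dest, ncol:] = m2[src, :]
print(m1)                                      # row 1 keeps NaN in the new columns
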
def zMx_set(f):
    # initialize before the test so the function always returns a value
    zMx = 0
    if zMaxTest:
        content = heatmapper.heatmapper()
        content.readMatrixFile(f)
        matrixFlatten = flattenMatrix(content.matrixDict)
        # try to avoid outliers by using np.percentile
        zMx = np.percentile(matrixFlatten, 98.0)
        print(np.shape(content.matrixDict["genes"])[0])
    return f, zMx

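# Why zMn_set/zMx_set use the 1st and 98th percentiles rather than min/max:
# a single outlier would otherwise stretch the color scale. A self-contained
# demo with synthetic data.
import numpy as np

rng = np.random.default_rng(0)
vals = np.concatenate([rng.normal(0, 1, 1000), [50.0]])      # one extreme outlier
print(vals.min(), vals.max())                                # raw range is blown up
print(np.percentile(vals, 1.0), np.percentile(vals, 98.0))   # robust limits
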
def main(args=None):
    if len(sys.argv) == 1:
        args = ["-h"]
    if len(sys.argv) == 2:
        args = [sys.argv[1], "-h"]
    args = parse_arguments().parse_args(args)

    hm = heatmapper.heatmapper()
    if not isinstance(args.matrixFile, list):
        hm.read_matrix_file(args.matrixFile)

    if args.command == 'info':
        printInfo(hm)
    elif args.command == 'subset':
        sIdx = getSampleBounds(args, hm)
        gIdx, gBounds = getGroupBounds(args, hm)

        # groups
        hm.matrix.regions = subsetRegions(hm, gIdx)
        # matrix
        hm.matrix.matrix = hm.matrix.matrix[gIdx, :]
        hm.matrix.matrix = hm.matrix.matrix[:, sIdx]
        # boundaries
        if args.samples is None:
            args.samples = hm.matrix.sample_labels
        hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1]
        hm.matrix.group_boundaries = gBounds.tolist()
        # labels
        hm.matrix.sample_labels = args.samples
        if args.groups is None:
            args.groups = hm.matrix.group_labels
        hm.matrix.group_labels = args.groups
        # save
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterStrand':
        filterHeatmap(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'rbind':
        rbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'cbind':
        cbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'sort':
        sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)
        hm.save_matrix(args.outFileName)
    else:
        sys.exit("Unknown command {0}!\n".format(args.command))

def main(args=None):
    args = process_args(args)

    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    hm = heatmapper.heatmapper()
    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, args.regionsFileName, parameters,
                     blackListFileName=args.blackListFileName,
                     verbose=args.verbose, allArgs=args)
    if args.sortRegions != 'no':
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. "
                         "Only values from 1 to {1} are allowed.".format(args.sortUsingSamples,
                                                                         hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)

    hm.save_matrix(args.outFileName)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

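# The --sortUsingSamples values are 1-based on the command line and are
# converted to 0-based indices before being handed to sort_groups. A
# standalone sketch of that validation; num_samples stands in for
# hm.matrix.get_num_samples().
def to_zero_based(sort_using_samples, num_samples):
    out = []
    for i in sort_using_samples:
        if not (0 < i <= num_samples):
            raise SystemExit("The value {0} is not valid. Only values from 1 "
                             "to {1} are allowed.".format(i, num_samples))
        out.append(i - 1)
    return out

assert to_zero_based([1, 3], 3) == [0, 2]
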
def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file,
                        default_group_name=args.regionsLabel)

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')

    if len(args.regionsLabel):
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    # if args.outFileNameData:
    #     hm.saveTabulatedValues(args.outFileNameData)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    prof = Profile(hm, args.outFileName,
                   plot_title=args.plotTitle,
                   y_axis_label=args.yAxisLabel,
                   y_min=args.yMin, y_max=args.yMax,
                   averagetype=args.averageType,
                   reference_point_label=args.refPointLabel,
                   start_label=args.startLabel,
                   end_label=args.endLabel,
                   plot_height=args.plotHeight,
                   plot_width=args.plotWidth,
                   per_group=args.perGroup,
                   plot_type=args.plotType,
                   image_format=args.plotFileFormat,
                   color_list=args.colors,
                   legend_location=args.legendLocation,
                   plots_per_row=args.numPlotsPerRow)

    if args.plotType == 'heatmap':
        prof.plot_heatmap()
    elif args.plotType == 'overlapped_lines':
        prof.plot_hexbin()
    else:
        prof.plot_profile()

def rbindMatrices(hm, args):
    """
    This only supports a single group at this point.
    It's assumed that the same samples are present in all files, in the
    exact same order.
    """
    hm2 = heatmapper.heatmapper()
    hm.read_matrix_file(args.matrixFile[0])
    for idx in range(1, len(args.matrixFile)):
        hm2.read_matrix_file(args.matrixFile[idx])
        for group in hm2.parameters["group_labels"]:
            if group in hm.parameters["group_labels"]:
                insertMatrix(hm, hm2, group)
            else:
                appendMatrix(hm, hm2, group)

    # Update the group boundaries attribute
    hm.matrix.group_labels = hm.parameters['group_labels']
    hm.matrix.group_boundaries = hm.parameters['group_boundaries']

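# A minimal sketch of driving rbindMatrices above directly, assuming the
# deeptools heatmapper module is importable; the matrix file names are
# placeholders, not real files.
from argparse import Namespace
from deeptools import heatmapper

hm = heatmapper.heatmapper()
args = Namespace(matrixFile=['matrix1.gz', 'matrix2.gz'])
rbindMatrices(hm, args)           # hm now holds the row-bound matrix
hm.save_matrix('combined.gz')
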
def main(args):
    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  }

    hm = heatmapper.heatmapper()
    hm.computeMatrix(args.scoreFileName.name, args.regionsFileName,
                     parameters, verbose=args.verbose)
    if args.sortRegions != 'no':
        hm.sortMatrix(sort_using=args.sortUsing,
                      sort_method=args.sortRegions)

    hm.saveMatrix(args.outFileName)

    if args.outFileNameMatrix:
        hm.saveMatrixValues(args.outFileNameMatrix)

    if args.outFileNameData:
        hm.saveTabulatedValues(args.outFileNameData)

    if args.outFileSortedRegions:
        hm.saveBED(args.outFileSortedRegions)

def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file)

    if args.sortRegions == 'keep':
        args.sortRegions = 'no'  # These are the same thing

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')
    else:
        if args.hclust is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large datasets.\n")
            hm.matrix.hmcluster(args.hclust, method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, "
                         "you might want to remove it. "
                         "There will likely be an error message from matplotlib "
                         "regarding this below.\n".format(hm.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.sortRegions != 'no':
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. "
                         "Only values from 1 to {1} are allowed.".format(args.sortUsingSamples,
                                                                         hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    colormap_dict = {'colorMap': args.colorMap,
                     'colorList': args.colorList,
                     'colorNumber': args.colorNumber,
                     'missingDataColor': args.missingDataColor,
                     'alpha': args.alpha}

    plotMatrix(hm,
               args.outFileName,
               colormap_dict,
               args.plotTitle,
               args.xAxisLabel, args.yAxisLabel,
               args.regionsLabel,
               args.zMin, args.zMax,
               args.yMin, args.yMax,
               args.averageTypeSummaryPlot,
               args.refPointLabel,
               args.startLabel,
               args.endLabel,
               args.heatmapHeight,
               args.heatmapWidth,
               args.perGroup,
               args.whatToShow,
               image_format=args.plotFileFormat,
               legend_location=args.legendLocation,
               box_around_heatmaps=args.boxAroundHeatmaps,
               label_rotation=args.label_rotation,
               dpi=args.dpi,
               interpolation_method=args.interpolationMethod)

def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file)

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')
    else:
        if args.hclust is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large datasets.\n")
            hm.matrix.hmcluster(args.hclust, method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, "
                         "you might want to remove it. "
                         "There will likely be an error message from matplotlib "
                         "regarding this below.\n".format(hm.matrix.group_labels[problem[0]]))

    if args.regionsLabel:
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.sortRegions != 'no':
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. "
                         "Only values from 1 to {1} are allowed.".format(args.sortUsingSamples,
                                                                         hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    colormap_dict = {'colorMap': args.colorMap,
                     'colorList': args.colorList,
                     'colorNumber': args.colorNumber,
                     'missingDataColor': args.missingDataColor,
                     'alpha': args.alpha}

    plotMatrix(hm,
               args.outFileName,
               colormap_dict,
               args.plotTitle,
               args.xAxisLabel, args.yAxisLabel,
               args.regionsLabel,
               args.zMin, args.zMax,
               args.yMin, args.yMax,
               args.averageTypeSummaryPlot,
               args.refPointLabel,
               args.startLabel,
               args.endLabel,
               args.heatmapHeight,
               args.heatmapWidth,
               args.perGroup,
               args.whatToShow,
               image_format=args.plotFileFormat,
               legend_location=args.legendLocation,
               box_around_heatmaps=args.boxAroundHeatmaps,
               dpi=args.dpi)

def main(args=None):
    args = process_args(args)
    hm = heatmapper.heatmapper()
    matrix_file = args.matrixFile.name
    args.matrixFile.close()
    hm.read_matrix_file(matrix_file,
                        default_group_name=args.regionsLabel)

    if args.kmeans is not None:
        hm.matrix.hmcluster(args.kmeans, method='kmeans')
    else:
        if args.hclust is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large datasets.\n")
            hm.matrix.hmcluster(args.hclust, method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        group_len = np.diff(hm.matrix.group_boundaries)
        print("Group '{}' contains too few regions {}. It can't "
              "be plotted. Try removing this group.\n".format(hm.matrix.group_labels[problem[0]],
                                                              group_len[problem]))
        if args.outFileSortedRegions:
            hm.save_BED(args.outFileSortedRegions)
            print('Clustered output written in : ' + args.outFileSortedRegions.name)
        else:
            print("No output file defined for sorted regions. Please re-run "
                  "heatmapper with --outFileSortedRegions to save the clustered output.")
        exit(1)

    if len(args.regionsLabel):
        hm.matrix.set_group_labels(args.regionsLabel)

    if args.samplesLabel and len(args.samplesLabel):
        hm.matrix.set_sample_labels(args.samplesLabel)

    if args.sortRegions != 'no':
        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    # if args.outFileNameData:
    #     hm.saveTabulatedValues(args.outFileNameData)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    colormap_dict = {'colorMap': args.colorMap,
                     'colorList': args.colorList,
                     'colorNumber': args.colorNumber,
                     'missingDataColor': args.missingDataColor}

    plotMatrix(hm,
               args.outFileName,
               colormap_dict,
               args.plotTitle,
               args.xAxisLabel, args.yAxisLabel,
               args.regionsLabel,
               args.zMin, args.zMax,
               args.yMin, args.yMax,
               args.averageTypeSummaryPlot,
               args.refPointLabel,
               args.startLabel,
               args.endLabel,
               args.heatmapHeight,
               args.heatmapWidth,
               args.perGroup,
               args.whatToShow,
               image_format=args.plotFileFormat,
               legend_location=args.legendLocation)

stats = {}
sample = ""
for filename in snakemake.input.profiles:
    kind = kind_from_filename(filename)
    sample = sample_from_filename(filename, kind)

    # if the file is empty, we need to decide what to do with it. For now,
    # just continue.
    if os.stat(filename).st_size == 0:
        continue

    # use deeptools' parsing to handle loading the matrix. They actually
    # support some pretty complex features (concatenating upstream, gene
    # body, downstream; concatenating multiple sets of features). The
    # Snakefile is running the simpler mode of a single set of features,
    # so we can simply grab the (0, 0) matrix.
    h = heatmapper()
    h.read_matrix_file(filename)

    # `matrix` is a numpy array, with one row per feature (in this context,
    # one row per peak) and one column for each bin
    matrix = h.matrix.get_matrix(0, 0)['matrix']

    # take the mean of each column
    y = matrix.mean(axis=0)

    # deeptools.Heatmapper.read_matrix_file also parses the parameters,
    # which is pretty nice. We can use that to build an x-axis.
    if snakemake.wildcards.scaling == 'reference-point':
        x = np.linspace(
            -h.parameters['upstream'],

def compute_matrix(args):
    args.samplesLabel = [scoreFname.replace(args.scoreFileNamePlusSuffix, '')
                         for scoreFname in args.scoreFileNamePlus]

    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    # Preload deepBlue files (@MS: any .wiggle, .wig or .bedgraph file in the
    # deeptools context, afaics), which then need to be deleted
    # deepBlueFilesPlus = load_deepblue_files(args.regionsFileNamePlus, args.scoreFileNamePlus)
    # deepBlueFilesMinus = load_deepblue_files(args.regionsFileNameMinus, args.scoreFileNameMinus)

    hm = heatmapper.heatmapper()
    hm.computeMatrix(args.scoreFileNamePlus, args.regionsFileNamePlus,
                     parameters, blackListFileName=args.blackListFileName,
                     verbose=args.verbose, allArgs=args)
    hm.matrix.group_labels = [grp_label.replace(args.regionFileNamePlusSuffix, '')
                              for grp_label in hm.matrix.group_labels]

    hm_minus = heatmapper.heatmapper()
    hm_minus.computeMatrix(args.scoreFileNameMinus, args.regionsFileNameMinus,
                           parameters, blackListFileName=args.blackListFileName,
                           verbose=args.verbose, allArgs=args)
    hm_minus.matrix.group_labels = [grp_label.replace(args.regionFileNameMinusSuffix, '')
                                    for grp_label in hm_minus.matrix.group_labels]

    hm = rbindMatrices(hm, hm_minus)

    if args.sortRegions not in ['no', 'keep']:
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. "
                         "Only values from 1 to {1} are allowed.".format(args.sortUsingSamples,
                                                                         hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)
    elif args.sortRegions == 'keep':
        hm.parameters['group_labels'] = hm.matrix.group_labels
        hm.parameters["group_boundaries"] = hm.matrix.group_boundaries
        # cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)

    # Clean up temporary bigWig files, if applicable
    # if not args.deepBlueKeepTemp:
    #     for k, v in deepBlueFilesPlus:
    #         os.remove(args.scoreFileNamePlus[v])
    #     for k, v in deepBlueFilesMinus:
    #         os.remove(args.scoreFileNameMinus[v])
    # else:
    #     for k, v in deepBlueFilesPlus:
    #         print("{} is stored in {}".format(k, args.scoreFileNamePlus[v]))
    #     for k, v in deepBlueFilesMinus:
    #         print("{} is stored in {}".format(k, args.scoreFileNameMinus[v]))

    return hm

def tabbed_BED_region_to_deeptools(region):
    '''
    Convert a tab-separated BED region to the list form used by deeptools:
    [chrom, start, end, name]
    '''
    return [region[0], region[1], region[2], region[3]]


def deeptools_region_str(region):
    '''
    Convert a deeptools region (dict or list form) to a string:
    chrom:start-end(strand)_name
    '''
    if isinstance(region, dict):
        return region['chrom'] + ':' + str(region['start']) + '-' + str(region['end']) + \
            '(' + region['strand'] + ')_' + region['name']
    elif isinstance(region, list):
        return region[0] + ':' + str(region[1][0][0]) + '-' + str(region[1][-1][1]) + \
            '(' + region[4] + ')_' + region[2]


if __name__ == '__main__':
    args = parse_arguments().parse_args(sys.argv[1:])

    hm = heatmapper.heatmapper()
    hm.read_matrix_file(args.matrixFile)

    perform_operations(args, hm.matrix)

    hm.parameters['group_labels'] = hm.matrix.group_labels
    hm.parameters['group_boundaries'] = hm.matrix.group_boundaries
    hm.save_matrix(args.outFileName)

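# A quick, self-contained check of deeptools_region_str above. The dict keys
# come from the function body itself; the list layout ([chrom, [(start, end),
# ...], name, score, strand]) is inferred from the indexing, not from any
# deeptools documentation.
def _check_region_str():
    region_as_dict = {'chrom': 'chr1', 'start': 100, 'end': 200,
                      'strand': '+', 'name': 'peak_1'}
    region_as_list = ['chr1', [(100, 150), (160, 200)], 'peak_1', 0, '+']
    assert deeptools_region_str(region_as_dict) == 'chr1:100-200(+)_peak_1'
    assert deeptools_region_str(region_as_list) == 'chr1:100-200(+)_peak_1'
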
def __clustering(hm, indexList, configfile):
    """
    Cluster the heatmap matrix (k-means or hierarchical, per configfile),
    sort the groups, and export the sorted regions.
    """
    for index, i in enumerate(indexList):
        indexList[index] = i - 1
    if hm.parameters['min threshold'] is not None or \
       hm.parameters['max threshold'] is not None:
        hm.filterHeatmapValues(hm.parameters['min threshold'],
                               hm.parameters['max threshold'])
    if configfile["kmeans"] is not None:
        hm.matrix.hmcluster(configfile["kmeans"], method='kmeans')
    else:
        if configfile["hclust"] is not None:
            print("Performing hierarchical clustering. "
                  "Please note that it might be very slow for large "
                  "datasets.\n")
            hm.matrix.hmcluster(configfile["hclust"], method='hierarchical')

    group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
    if np.any(group_len_ratio < 5.0 / 1000):
        problem = np.flatnonzero(group_len_ratio < 5.0 / 1000)
        sys.stderr.write("WARNING: Group '{}' is too small for plotting, "
                         "you might want to remove it. "
                         "There will likely be an error message from "
                         "matplotlib regarding this "
                         "below.\n".format(hm.matrix.group_labels[problem[0]]))
    # TODO set sample & region labels!!

    if configfile["sortRegions"][0] != 'keep':
        hm.matrix.sort_groups(sort_using=configfile["sortUsing"][0],
                              sort_method=configfile["sortRegions"][0],
                              sample_list=indexList)
    outputMatrix_path = ""
    if configfile["outputReferenceMatrix"] is not None:
        outputMatrix_path = os.path.join(configfile["outputReferenceMatrix"])
        hm.save_matrix(outputMatrix_path)

    # TODO: figure out how to do this directly from hm. At the moment some
    # parameters have incompatible types: for example 'upstream' is a value
    # when read directly but a list when read from a file.
    if configfile["plotOutput"] is not None:
        if configfile["outputReferenceMatrix"] is None:
            outputMatrix_path = os.path.dirname(os.path.abspath(configfile["matrixOutput"]))
            outputMatrix_path += "/outputReferenceMatrix.gz"
            hm.save_matrix(outputMatrix_path)
        hm1 = heatmapper()
        hm1.read_matrix_file(outputMatrix_path)
        color_dict = {'colorMap': ['RdYlBu'],
                      'colorList': None,
                      'colorNumber': int(256),
                      'missingDataColor': 'black',
                      'alpha': float(1.0)}
        plotMatrix(hm1,
                   os.path.join(configfile["plotOutput"]),
                   colorMapDict=color_dict)

    assert configfile["outFileSortedRegions"]
    hm.save_BED(open(configfile["outFileSortedRegions"], "w"))

def main(args=None):
    args = process_args(args)

    # if more than one bed file is given, they are concatenated into one file.
    if len(args.regionsFileName) > 1:
        bed_file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        for bed in args.regionsFileName:
            bed.close()
            # concatenate all intermediate tempfiles into one
            print("appending {} file".format(bed.name))
            shutil.copyfileobj(open(bed.name, 'U'), bed_file)
            # append hash and label based on the file name
            label = os.path.basename(bed.name)
            if label.endswith(".bed"):
                label = label[:-4]
            bed_file.write("# {}\n".format(label))
        bed_file.seek(0)
    else:
        bed_file = args.regionsFileName[0]

    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing
                  }

    hm = heatmapper.heatmapper()

    scores_file_list = [x.name for x in args.scoreFileName]
    hm.computeMatrix(scores_file_list, bed_file, parameters, verbose=args.verbose)
    if args.sortRegions != 'no':
        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions)

    hm.save_matrix(args.outFileName)
    bed_file.close()

    if len(args.regionsFileName) > 1:
        os.remove(bed_file.name)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    # TODO This isn't implemented
    # if args.outFileNameData:
    #     hm.saveTabulatedValues(args.outFileNameData)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

def main(args=None):
    args = process_args(args)

    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    hm = heatmapper.heatmapper()

    # Preload deepBlue files, which need to then be deleted
    deepBlueFiles = []
    for idx, fname in enumerate(args.scoreFileName):
        if db.isDeepBlue(fname):
            deepBlueFiles.append([fname, idx])
    if len(deepBlueFiles) > 0:
        sys.stderr.write("Preloading the following deepBlue files: {}\n".format(
                         ",".join([x[0] for x in deepBlueFiles])))
        regs = db.makeRegions(args.regionsFileName, args)
        for x in deepBlueFiles:
            x.extend([args, regs])
        if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999)
        else:
            res = list(map(db.preloadWrapper, deepBlueFiles))

        # substitute the file names with the temp files
        for (ftuple, r) in zip(deepBlueFiles, res):
            args.scoreFileName[ftuple[1]] = r
        deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles]
        del regs

    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, args.regionsFileName, parameters,
                     blackListFileName=args.blackListFileName,
                     verbose=args.verbose, allArgs=args)
    if args.sortRegions not in ['no', 'keep']:
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. "
                         "Only values from 1 to {1} are allowed.".format(args.sortUsingSamples,
                                                                         hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing,
                              sort_method=args.sortRegions,
                              sample_list=sortUsingSamples)
    elif args.sortRegions == 'keep':
        hm.parameters['group_labels'] = hm.matrix.group_labels
        hm.parameters["group_boundaries"] = hm.matrix.group_boundaries
        cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID,
                       args.transcript_id_designator, verbose=not args.quiet)

    hm.save_matrix(args.outFileName)

    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)

    # Clean up temporary bigWig files, if applicable
    if not args.deepBlueKeepTemp:
        for k, v in deepBlueFiles:
            os.remove(args.scoreFileName[v])
    else:
        for k, v in deepBlueFiles:
            print("{} is stored in {}".format(k, args.scoreFileName[v]))