def _output(section, subsection, valuef, dtype):
    '''write the matrix built from ``results`` twice.

    The matrix and its row headers come from
    ``buildMatrix(results, valuef=valuef, dtype=dtype)``.

    Two outputs are requested through ``getFileName`` and
    written with ``IOTools.writeMatrix``:

    ``<subsection>_all``
        rows are labelled with the plain row headers.

    ``<subsection>_alldesc``
        rows are labelled ``<header>:<description>``, the
        description being ``go2info[header].mDescription``.

    ``results``, ``options``, ``test_ontology``, ``col_headers``
    and ``go2info`` are taken from the enclosing scope.
    '''
    # matrix of the values selected by valuef
    matrix, row_headers = buildMatrix(results, valuef=valuef, dtype=dtype)

    # first output: plain row headers
    plain_target = getFileName(
        options,
        go=test_ontology,
        section=section,
        set='%s_all' % subsection)
    IOTools.writeMatrix(
        plain_target,
        matrix,
        row_headers,
        col_headers,
        row_header="category")

    # second output: row headers annotated with their description
    described_target = getFileName(
        options,
        go=test_ontology,
        section=section,
        set='%s_alldesc' % subsection)
    described_headers = []
    for term in row_headers:
        described_headers.append(
            "%s:%s" % (term, go2info[term].mDescription))
    IOTools.writeMatrix(
        described_target,
        matrix,
        described_headers,
        col_headers,
        row_header="category")
def _output(section, subsection, valuef, dtype):
    '''build a matrix from ``results`` and write two labelled copies.

    ``buildMatrix`` supplies the matrix and its row headers. The
    matrix is then written with ``IOTools.writeMatrix`` to the
    targets that ``getFileName`` returns for the sets
    ``<subsection>_all`` (plain row headers) and
    ``<subsection>_alldesc`` (row headers followed by
    ``:`` and ``go2info[header].mDescription``).

    Relies on ``results``, ``options``, ``test_ontology``,
    ``col_headers`` and ``go2info`` from the enclosing scope.
    '''
    matrix, row_headers = buildMatrix(results, valuef=valuef, dtype=dtype)

    def _plain_labels():
        return row_headers

    def _described_labels():
        return ["%s:%s" % (x, go2info[x].mDescription)
                for x in row_headers]

    # The label factories are only called once the respective target
    # has been obtained, so the plain matrix is written before any
    # description is looked up.
    variants = (('%s_all', _plain_labels),
                ('%s_alldesc', _described_labels))

    for set_pattern, make_labels in variants:
        target = getFileName(options,
                             go=test_ontology,
                             section=section,
                             set=set_pattern % subsection)
        IOTools.writeMatrix(target,
                            matrix,
                            make_labels(),
                            col_headers,
                            row_header="category")
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    For every file in *infiles* a track name is derived by
    removing ``.tsv.gz`` from the file's base name. The
    parameters ``<track>_foreground_*`` and
    ``<track>_background_*`` (``field``, ``min_threshold``,
    ``max_threshold``) select the rows that make up the
    foreground and background set of that track.

    Four files are written:

    *outfile*
        foreground sets, one label per track.
    *outfile* + ``.bg.tsv.gz``
        background sets.
    *outfile* + ``.matrix.gz``
        union/intersection matrix of the foreground sets.
    *outfile* + ``.bg.matrix.gz``
        union/intersection matrix of the background sets.
    '''

    def _select(genelist, track, section):
        '''return the set of index labels of all rows in *genelist*
        whose configured field lies within the configured
        [min_threshold, max_threshold] range of *section*
        ("foreground" or "background").'''
        field = PARAMS[P.matchParameter(
            "%s_%s_field" % (track, section))]
        min_threshold = PARAMS[P.matchParameter(
            "%s_%s_min_threshold" % (track, section))]
        max_threshold = PARAMS[P.matchParameter(
            "%s_%s_max_threshold" % (track, section))]

        E.info('%s: %s: %f <= %s <= %f' %
               (track, section, min_threshold, field, max_threshold))

        values = genelist[field]
        within = (values >= min_threshold) & (values <= max_threshold)
        return set(genelist[within].index)

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # The table is read completely by read_csv, so the input
        # handle can be released right away instead of staying open
        # for every input file.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        genesets.append(_select(genelist, track, "foreground"))
        backgrounds.append(_select(genelist, track, "background"))

        E.info("%s: fg=%i, bg=%i" %
               (track,
                len(genesets[-1]),
                len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
def writeMatricesForSortOrder(features_per_interval,
                              bins,
                              foreground_track,
                              control_tracks,
                              shifted,
                              sort_order):
    '''output one or more matrices for each sort order.

    For each sort order output the foreground. If there
    are additional controls and shifted section, output
    these as well

    The files will be named:
    matrix_<track>_<sortorder>

    Rows are labelled with the interval names if the first
    interval has a ``name``, otherwise with the 1-based row
    number. If there is at least one control track or a shifted
    section, an additional ``matrix_sidebyside_<sortorder>``
    matrix is written in which the counts of foreground,
    controls and shifted are concatenated per row.
    '''
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        # Must be a real list: ``names`` is handed to several
        # writeMatrix calls below, and under Python 3 a map() object
        # is a one-shot iterator that cannot be reused or indexed.
        names = [str(x) for x in range(1, len(features_per_interval) + 1)]

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    # NOTE(review): the handles returned by E.openOutputFile are not
    # closed here; who owns them is not visible from this function.

    # write foreground
    IOTools.writeMatrix(
        E.openOutputFile("matrix_%s_%s.gz" % (foreground_track, sort_order)),
        [x.foreground.counts for x in features_per_interval],
        row_headers=names,
        col_headers=bins,
        row_header="name")

    # write controls
    for idx, track in enumerate(control_tracks):
        IOTools.writeMatrix(
            E.openOutputFile("matrix_%s_%s.gz" % (track, sort_order)),
            [x.controls[idx].counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # write shifted matrix
    if shifted:
        IOTools.writeMatrix(
            E.openOutputFile("matrix_shift_%s.gz" % (sort_order)),
            [x.shifted.counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            parts = [row.foreground.counts]
            parts.extend(
                [row.controls[x].counts for x in range(len(control_tracks))])
            if shifted:
                parts.append(row.shifted.counts)
            rows.append(numpy.concatenate(parts))

        n = 1 + len(control_tracks)
        if shifted:
            n += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for x in range(n):
            all_bins.extend(["%i:%s" % (x, b) for b in bins])

        IOTools.writeMatrix(
            E.openOutputFile("matrix_sidebyside_%s.gz" % (sort_order)),
            rows,
            row_headers=names,
            col_headers=all_bins,
            row_header="name")
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Each file in *infiles* is read as a tab-separated table whose
    first column is the index. The track name is the file's base
    name without ``.tsv.gz``. Foreground and background sets are the
    index labels of the rows whose ``<track>_foreground_field`` /
    ``<track>_background_field`` column lies between the matching
    ``min_threshold`` and ``max_threshold`` parameters (inclusive).

    Output is written to *outfile* (foreground sets),
    *outfile*.bg.tsv.gz (background sets), *outfile*.matrix.gz and
    *outfile*.bg.matrix.gz (union/intersection matrices of the
    foreground and background sets, respectively).
    '''
    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # read_csv loads the whole table, so close the input handle
        # immediately rather than leaving one open per input file.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        # foreground selection
        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        in_range = ((genelist[field] >= min_threshold) &
                    (genelist[field] <= max_threshold))
        genesets.append(set(genelist[in_range].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        # background selection
        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        in_range = ((genelist[field] >= min_threshold) &
                    (genelist[field] <= max_threshold))
        backgrounds.append(set(genelist[in_range].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
def writeMatricesForSortOrder(features_per_interval,
                              bins,
                              foreground_track,
                              control_tracks,
                              shifted,
                              sort_order):
    '''output one or more matrices for each sort order.

    For each sort order output the foreground. If there
    are additional controls and shifted section, output
    these as well

    The files will be named:
    matrix_<track>_<sortorder>

    Row labels are the interval names when the first interval
    carries a ``name``, else consecutive numbers starting at 1.
    Dashes in *sort_order* are replaced by underscores before the
    output names are built. When controls or a shifted section
    are present, a combined ``matrix_sidebyside_<sortorder>``
    matrix with all counts concatenated per row is written too.
    '''
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        # list(...) is required: the labels are reused for every
        # matrix written below, and a bare map() is a single-use
        # iterator under Python 3.
        names = list(map(str, range(1, len(features_per_interval) + 1)))

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    # write foreground
    IOTools.writeMatrix(
        E.openOutputFile("matrix_%s_%s.gz" % (foreground_track, sort_order)),
        [x.foreground.counts for x in features_per_interval],
        row_headers=names,
        col_headers=bins,
        row_header="name")

    # write controls
    for idx, track in enumerate(control_tracks):
        IOTools.writeMatrix(
            E.openOutputFile("matrix_%s_%s.gz" % (track, sort_order)),
            [x.controls[idx].counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # write shifted matrix
    if shifted:
        IOTools.writeMatrix(
            E.openOutputFile("matrix_shift_%s.gz" % (sort_order)),
            [x.shifted.counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            segments = [row.foreground.counts]
            segments.extend(
                [row.controls[x].counts for x in range(len(control_tracks))])
            if shifted:
                segments.append(row.shifted.counts)
            rows.append(numpy.concatenate(segments))

        n = 1 + len(control_tracks)
        if shifted:
            n += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for x in range(n):
            all_bins.extend(["%i:%s" % (x, b) for b in bins])

        IOTools.writeMatrix(
            E.openOutputFile("matrix_sidebyside_%s.gz" % (sort_order)),
            rows,
            row_headers=names,
            col_headers=all_bins,
            row_header="name")