Example #1
0
    def _output(section, subsection, valuef, dtype):
        '''write the result matrix of one section/subsection twice.

        The matrix is built from ``results`` with ``buildMatrix`` using
        *valuef* and *dtype*. It is written once with the plain row
        headers (set ``<subsection>_all``) and once with every row
        header extended by the ``mDescription`` looked up in
        ``go2info`` (set ``<subsection>_alldesc``).
        '''
        values, categories = buildMatrix(results, valuef=valuef, dtype=dtype)

        # first pass: rows are labelled with the bare category key
        target = getFileName(
            options,
            go=test_ontology,
            section=section,
            set='%s_all' % subsection)
        IOTools.writeMatrix(
            target, values, categories, col_headers, row_header="category")

        # second pass: same values, rows labelled "<key>:<description>"
        target = getFileName(
            options,
            go=test_ontology,
            section=section,
            set='%s_alldesc' % subsection)
        described = []
        for category in categories:
            described.append(
                "%s:%s" % (category, go2info[category].mDescription))
        IOTools.writeMatrix(
            target, values, described, col_headers, row_header="category")
Example #2
0
    def _output(section, subsection, valuef, dtype):
        '''output a matrix over ``results`` with and without descriptions.

        *valuef* and *dtype* are handed to ``buildMatrix``. The resulting
        matrix is written to the file for set ``<subsection>_all`` using
        the row headers as they are, and to the file for set
        ``<subsection>_alldesc`` using ``<header>:<description>`` labels,
        where the description is ``go2info[header].mDescription``.
        '''
        data, keys = buildMatrix(results, valuef=valuef, dtype=dtype)

        # matrix labelled by key only
        set_name = '%s_all' % subsection
        destination = getFileName(
            options, go=test_ontology, section=section, set=set_name)
        IOTools.writeMatrix(destination,
                            data,
                            keys,
                            col_headers,
                            row_header="category")

        # matrix labelled by key and description
        set_name = '%s_alldesc' % subsection
        destination = getFileName(
            options, go=test_ontology, section=section, set=set_name)
        labels = ["%s:%s" % (key, go2info[key].mDescription) for key in keys]
        IOTools.writeMatrix(destination,
                            data,
                            labels,
                            col_headers,
                            row_header="category")
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Each file in *infiles* is read as a tab-separated table whose
    first column is the index. The track name is obtained with
    ``P.snip`` from the file's basename and the suffix ``.tsv.gz``.
    For every track the parameters ``<track>_foreground_field``,
    ``<track>_foreground_min_threshold`` and
    ``<track>_foreground_max_threshold`` (and the ``background``
    equivalents) are looked up in ``PARAMS`` via ``P.matchParameter``.
    Index values of rows with ``min <= field <= max`` form the
    foreground and background sets, respectively.

    The following files are written:

    outfile
        foreground sets, one label per track
    outfile + ".bg.tsv.gz"
        background sets, one label per track
    outfile + ".matrix.gz"
        result of ``SetTools.unionIntersectionMatrix`` on the foregrounds
    outfile + ".bg.matrix.gz"
        result of ``SetTools.unionIntersectionMatrix`` on the backgrounds
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # read_csv loads the complete table, so the input handle can be
        # released right away instead of being left open per input file.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        # foreground: rows whose value lies within the closed interval
        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        genesets.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))

        # background: same selection with the background parameters
        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))
        backgrounds.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" % (track,
                                     len(genesets[-1]),
                                     len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
Example #4
0
def writeMatricesForSortOrder(features_per_interval, bins, foreground_track,
                              control_tracks, shifted, sort_order):
    '''output one or more matrices for each sort order.

    For each sort order output the foreground. If there
    are additional controls and a shifted section, output
    these as well.

    The files will be named:
    matrix_<track>_<sortorder>

    In addition, ``matrix_shift_<sortorder>`` is written if *shifted*
    is true, and ``matrix_sidebyside_<sortorder>`` (foreground,
    controls and shifted counts concatenated per row) is written if
    there is at least one control track or *shifted* is true.

    *features_per_interval* must contain at least one element as the
    first element is inspected in order to decide how rows are named:
    by ``interval.name`` if ``"name" in interval``, by running number
    (starting at 1) otherwise. Dashes in *sort_order* are replaced by
    underscores in the file names.
    '''
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        # Must be a list: the headers are reused by every matrix written
        # below, and on Python 3 map() would return a one-shot iterator.
        names = [str(x) for x in range(1, len(features_per_interval) + 1)]

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    # NOTE(review): the handles returned by E.openOutputFile are not
    # closed here - confirm that this is handled elsewhere.

    # write foreground
    IOTools.writeMatrix(E.openOutputFile("matrix_%s_%s.gz" %
                                         (foreground_track, sort_order)),
                        [x.foreground.counts for x in features_per_interval],
                        row_headers=names,
                        col_headers=bins,
                        row_header="name")

    # write controls
    for idx, track in enumerate(control_tracks):
        IOTools.writeMatrix(
            E.openOutputFile("matrix_%s_%s.gz" % (track, sort_order)),
            [x.controls[idx].counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # write shifted matrix
    if shifted:
        IOTools.writeMatrix(E.openOutputFile("matrix_shift_%s.gz" %
                                             (sort_order)),
                            [x.shifted.counts for x in features_per_interval],
                            row_headers=names,
                            col_headers=bins,
                            row_header="name")

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            parts = [row.foreground.counts]
            parts.extend(
                [row.controls[x].counts for x in range(len(control_tracks))])
            if shifted:
                parts.append(row.shifted.counts)
            rows.append(numpy.concatenate(parts))

        # number of blocks placed side by side
        n = 1 + len(control_tracks)
        if shifted:
            n += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for x in range(n):
            all_bins.extend(["%i:%s" % (x, b) for b in bins])

        IOTools.writeMatrix(E.openOutputFile("matrix_sidebyside_%s.gz" %
                                             (sort_order)),
                            rows,
                            row_headers=names,
                            col_headers=all_bins,
                            row_header="name")
Example #5
0
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Every file in *infiles* is read as a tab-separated table with the
    first column as index; its track name is obtained with ``P.snip``
    from the basename and the suffix ``.tsv.gz``. Per track, the
    foreground and the background set contain the index values of all
    rows for which ``min_threshold <= field <= max_threshold``, where
    field and thresholds are the ``PARAMS`` entries found through
    ``P.matchParameter`` for ``<track>_foreground_*`` and
    ``<track>_background_*``.

    Written are *outfile* (foreground sets), ``outfile.bg.tsv.gz``
    (background sets) and the output of
    ``SetTools.unionIntersectionMatrix`` for both collections
    (``outfile.matrix.gz`` and ``outfile.bg.matrix.gz``).
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # The table is read completely inside the with-block so that the
        # input handle is closed instead of leaking one per input file.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf,
                                       index_col=0,
                                       sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        # select foreground genes
        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_foreground_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_foreground_max_threshold" %
                                                track)]
        genesets.append(
            set(genelist[(genelist[field] >= min_threshold)
                         & (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        # select background genes
        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_background_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_background_max_threshold" %
                                                track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))
        backgrounds.append(
            set(genelist[(genelist[field] >= min_threshold)
                         & (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
Example #6
0
def writeMatricesForSortOrder(features_per_interval,
                              bins,
                              foreground_track,
                              control_tracks,
                              shifted,
                              sort_order):
    '''output one or more matrices for each sort order.

    For each sort order output the foreground. If there
    are additional controls and a shifted section, output
    these as well.

    The files will be named:
    matrix_<track>_<sortorder>

    The shifted counts go to ``matrix_shift_<sortorder>``. If there is
    any control track or *shifted* is true, a combined matrix with
    foreground, control and shifted counts concatenated per row is
    written to ``matrix_sidebyside_<sortorder>``.

    Rows are labelled with ``interval.name`` if ``"name"`` is found in
    the interval of the first element of *features_per_interval*
    (which therefore must not be empty) and with the running number
    starting at 1 otherwise. Dashes in *sort_order* are replaced by
    underscores in the file names.
    '''
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        # A real list is needed here: the row headers are shared by all
        # writeMatrix calls below, while map() yields a single-use
        # iterator on Python 3.
        names = [str(x) for x in range(1, len(features_per_interval) + 1)]

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    # NOTE(review): the handles returned by E.openOutputFile are not
    # closed here - confirm that this is handled elsewhere.

    # write foreground
    IOTools.writeMatrix(
        E.openOutputFile("matrix_%s_%s.gz" % (foreground_track, sort_order)),
        [x.foreground.counts for x in features_per_interval],
        row_headers=names,
        col_headers=bins,
        row_header="name")

    # write controls
    for idx, track in enumerate(control_tracks):
        IOTools.writeMatrix(
            E.openOutputFile("matrix_%s_%s.gz" % (track, sort_order)),
            [x.controls[idx].counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # write shifted matrix
    if shifted:
        IOTools.writeMatrix(
            E.openOutputFile("matrix_shift_%s.gz" % (sort_order)),
            [x.shifted.counts for x in features_per_interval],
            row_headers=names,
            col_headers=bins,
            row_header="name")

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            blocks = [row.foreground.counts]
            blocks.extend([row.controls[x].counts for x in
                           range(len(control_tracks))])
            if shifted:
                blocks.append(row.shifted.counts)
            rows.append(numpy.concatenate(blocks))

        # number of blocks placed side by side
        n = 1 + len(control_tracks)
        if shifted:
            n += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for x in range(n):
            all_bins.extend(["%i:%s" % (x, b) for b in bins])

        IOTools.writeMatrix(
            E.openOutputFile("matrix_sidebyside_%s.gz" % (sort_order)),
            rows,
            row_headers=names,
            col_headers=all_bins,
            row_header="name")