コード例 #1
0
ファイル: matrix2stats.py プロジェクト: gsc0107/cgat
def main(argv=None):
    """Script main: run chi-squared style tests on matrices read from stdin.

    Input lines starting with ``#`` are discarded.  Lines starting with
    ``>`` split the input into several matrices; the text following the
    ``>`` is written in front of the results in a leading ``test`` column.
    Without any ``>`` line all of the input is read as a single matrix and
    no ``test`` column is written.

    Depending on ``--method``:

    * ``chi-squared``: ``Stats.doChiSquaredTest`` is applied to pairs of
      matrix rows.  ``--iteration=pairwise`` tests every unordered pair of
      rows once, ``--iteration=all-vs-all`` tests every ordered pair of
      distinct rows.  Without ``--iteration`` no pairs are tested.  Pairs
      for which the test raises ``ValueError`` are counted as skipped.
    * ``pearson-chi-squared``: ``Stats.doPearsonChiSquaredTest`` is applied
      to every column of a matrix with exactly two rows.  The first
      ``--parameters`` value supplies the probability: it is read as a
      file mapping matrix labels to probabilities if the input contains
      ``>`` lines, and converted with ``float`` otherwise.

    :param argv: command line; defaults to ``sys.argv``.
    :raises ValueError: if ``pearson-chi-squared`` is requested without a
        ``--parameters`` value, for a matrix that does not have two rows,
        or for a ``>`` line without a label to look up.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("chi-squared", "pearson-chi-squared"),
                      help="statistical methods to apply.")

    parser.add_option("-t", "--header-names", dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers", dest="headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option("-p", "--parameters", dest="parameters",
                      action="append", type="string",
                      help="parameters for various functions.")

    parser.add_option("-a", "--iteration", dest="iteration", type="choice",
                      choices=("pairwise", "all-vs-all"),
                      help="""how to compute stats [%default].""")

    parser.set_defaults(
        method="chi-squared",
        headers=True,
        value_format="%6.4f",
        pvalue_format="%6.4e",
        input_format="full",
        write_separators=True,
        parameters=[],
        iteration=None,
    )

    # NOTE(review): argv is not forwarded to E.Start here, so an explicitly
    # passed *argv* has no visible effect in this function - confirm whether
    # E.Start should receive it.
    options, _args = E.Start(parser)

    # Drop comment lines, then locate the ">" lines that start a new matrix.
    lines = [line for line in sys.stdin if not line.startswith("#")]
    chunks = [index for index, line in enumerate(lines)
              if line.startswith(">")]

    if not chunks:
        # No separators: treat all of the input as one unlabelled matrix.
        # The sentinel -1 makes the first slice below start at line 0.
        options.write_separators = False
        chunks = [-1]

    # Closing sentinel so that consecutive entries delimit each matrix.
    chunks.append(len(lines))

    ninput, noutput, nskipped = 0, 0, 0

    if options.write_separators:
        options.stdout.write("test\t")

    header_prefix = ""

    if options.method == "chi-squared":
        header_prefix = "observed\texpected"
        options.stdout.write("\t".join(
            (header_prefix, "n", "min", "max", "chi", "df", "P", "passed",
             "phi")) + "\n")

    elif options.method == "pearson-chi-squared":
        # NOTE(review): header_prefix is empty here, so the header contains
        # an empty column after "column" that the data rows written below
        # do not have.  Kept as is so the output stays unchanged.
        options.stdout.write("column\t")
        options.stdout.write("\t".join(
            (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P",
             "passed", "phi")) + "\n")

        if not options.parameters:
            # A bare string cannot be raised; use a proper exception.
            raise ValueError(
                "out of parameters - please supply probability or filename with probabilities.")

        param = options.parameters.pop(0)

        if options.write_separators:
            # Several labelled matrices: the parameter names a file that
            # maps each label to its probability.
            infile = IOTools.openFile(param, "r")
            try:
                probabilities = IOTools.ReadMap(
                    infile, map_functions=(str, float))
            finally:
                infile.close()
        else:
            probability = float(param)

    for start, end in zip(chunks[:-1], chunks[1:]):
        ninput += 1
        matrix, row_headers, col_headers = MatlabTools.readMatrix(
            StringIO("".join(lines[start + 1:end])),
            format=options.input_format,
            headers=options.headers)
        nrows, ncols = matrix.shape

        if options.loglevel >= 2:
            options.stdlog.write(
                "# read matrix: %i x %i, %i row titles, %i colum titles.\n" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        # Text of the ">" line without the marker and the trailing newline.
        # Only defined when separators exist (start is -1 otherwise).
        chunk_label = lines[start][1:-1] if options.write_separators else None

        # NOTE(review): in chi-squared mode the label is written once per
        # matrix and not once per result row - confirm this is intended.
        if options.write_separators:
            options.stdout.write(chunk_label + "\t")

        nheaders = len(row_headers)
        if options.iteration == "pairwise":
            pairs = [(row1, row2)
                     for row1 in range(nheaders)
                     for row2 in range(row1 + 1, nheaders)]
        elif options.iteration == "all-vs-all":
            pairs = [(row1, row2)
                     for row1 in range(nheaders)
                     for row2 in range(nheaders)
                     if row1 != row2]
        else:
            pairs = []

        if options.method == "chi-squared":

            # The extremes of the whole matrix are the same for every pair.
            # They are computed once, and only after a first successful
            # test, so a matrix whose pairs are all skipped is never scanned.
            matrix_min = matrix_max = None

            for row1, row2 in pairs:
                try:
                    result = Stats.doChiSquaredTest(
                        numpy.vstack((matrix[row1], matrix[row2])))
                except ValueError:
                    nskipped += 1
                    continue

                if matrix_min is None:
                    matrix_min = min(matrix.flat)
                    matrix_max = max(matrix.flat)

                noutput += 1
                options.stdout.write("\t".join((
                    "%s" % row_headers[row1],
                    "%s" % row_headers[row2],
                    "%i" % result.mSampleSize,
                    "%i" % matrix_min,
                    "%i" % matrix_max,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)) + "\n")

        elif options.method == "pearson-chi-squared":

            if nrows != 2:
                raise ValueError("only implemented for 2xn table")

            if options.write_separators:
                # The first whitespace-delimited word of the label is the
                # key into the probabilities read from file.
                label_match = re.match(r"(\S+)", chunk_label)
                if label_match is None:
                    raise ValueError(
                        "no identifier found in separator line %r" %
                        lines[start])
                probability = probabilities[label_match.group(1)]

            for col in range(ncols):
                options.stdout.write("%s\t" % col_headers[col])
                result = Stats.doPearsonChiSquaredTest(
                    probability, sum(matrix[:, col]), matrix[0, col])
                options.stdout.write("\t".join((
                    "%i" % result.mSampleSize,
                    "%f" % probability,
                    "%i" % result.mObserved,
                    "%f" % result.mExpected,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)))
                if col < ncols - 1:
                    # Start the next row and repeat the label in front of it.
                    options.stdout.write("\n")
                    if options.write_separators:
                        options.stdout.write(chunk_label + "\t")

            options.stdout.write("\n")

    E.info("# ninput=%i, noutput=%i, nskipped=%i\n" %
           (ninput, noutput, nskipped))

    E.Stop()
コード例 #2
0
ファイル: matrix2stats.py プロジェクト: siping/cgat
                for row2 in range( row1+1, len(row_headers) ):
                    pairs.append( (row1, row2) )
        elif options.iteration == "all-vs-all":
            pairs = []
            for row1 in range( 0, len(row_headers) ):
                for row2 in range( 0, len(row_headers) ):
                    if row1 == row2: continue
                    pairs.append( (row1, row2) )
    
        if options.method == "chi-squared":
            
            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest( numpy.vstack( (matrix[row1], matrix[row2] ) ) )
                except ValueError:
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write( "\t".join( ( "%s" % row_header1,
                                                   "%s" % row_header2,
                                                   "%i" % result.mSampleSize,
                                                   "%i" % min(matrix.flat),
                                                   "%i" % max(matrix.flat),
                                                   options.value_format % result.mChiSquaredValue,
                                                   "%i" % result.mDegreesFreedom,
                                                   options.pvalue_format % result.mProbability,
                                                   "%s" % result.mSignificance,
                                                   options.value_format % result.mPhi ) ) + "\n" )