def main():
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.compile('[a-zA-Z]+').findall(expression):
        if word and word not in math_allowed:
            stop_err(
                "Invalid expression '%s': term '%s' is not recognized or allowed"
                % (expression, word))
    symbols = set()
    for symbol in re.compile('[^a-z0-9\s]+').findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err(
                "Invalid expression '%s': operator '%s' is not recognized or allowed"
                % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err(
            "Invalid columns '%s': this tool requires a single column or expression"
            % expression)

    # Find all column references in the expression
    cols = []
    for col in re.compile('c[0-9]+').findall(expression):
        try:
            cols.append(int(col[1:]) - 1)
        except:
            pass

    tmp_file = tempfile.NamedTemporaryFile('w+b')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)
    skipped_lines = 0
    first_invalid_line = 0
    i = 0
    for i, line in enumerate(open(datafile)):
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            valid = True
            fields = line.split('\t')
            # Write the R data row to the temporary file
            for col in cols:
                try:
                    float(fields[col])
                except:
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    valid = False
                    break
            if valid:
                data_str = "\t".join(fields[col] for col in cols)
                tmp_file.write("%s\n" % data_str)
    tmp_file.flush()

    if skipped_lines == i + 1:
        stop_err(
            "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements."
        )
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r(
            "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }"
        )
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')

        for col in re.compile('c[0-9]+').findall(expression):
            r.assign(col, r["$"](r_data_frame, col))
        try:
            summary = summary_func(r(expression))
        except RException as s:
            outfile.close()
            stop_err("Computation resulted in the following error: %s" %
                     str(s))
        summary = summary.as_py(BASIC_CONVERSION)
        outfile.write("#%s\n" % headings_str)
        if type(summary) is dict:
            # using rpy
            outfile.write("%s\n" %
                          "\t".join(["%g" % summary[k] for k in headings]))
        else:
            # using rpy2
            outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        outfile.close()

        if skipped_lines:
            print "Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (
                skipped_lines, first_invalid_line)
Ejemplo n.º 2
0
def main():
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except Exception:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.compile('[a-zA-Z]+').findall(expression):
        if word and word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    for symbol in re.compile('[^a-z0-9\s]+').findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression
    cols = []
    for col in re.compile('c[0-9]+').findall(expression):
        try:
            cols.append(int(col[1:]) - 1)
        except Exception:
            pass

    tmp_file = tempfile.NamedTemporaryFile('w+')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)
    skipped_lines = 0
    first_invalid_line = 0
    i = 0
    for i, line in enumerate(open(datafile)):
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            valid = True
            fields = line.split('\t')
            # Write the R data row to the temporary file
            for col in cols:
                try:
                    float(fields[col])
                except Exception:
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    valid = False
                    break
            if valid:
                data_str = "\t".join(fields[col] for col in cols)
                tmp_file.write("%s\n" % data_str)
    tmp_file.flush()

    if skipped_lines == i + 1:
        stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')

        for col in re.compile('c[0-9]+').findall(expression):
            r.assign(col, r["$"](r_data_frame, col))
        try:
            summary = summary_func(r(expression))
        except RException as s:
            outfile.close()
            stop_err("Computation resulted in the following error: %s" % str(s))
        summary = summary.as_py(BASIC_CONVERSION)
        outfile.write("#%s\n" % headings_str)
        if type(summary) is dict:
            # using rpy
            outfile.write("%s\n" % "\t".join(["%g" % summary[k] for k in headings]))
        else:
            # using rpy2
            outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        outfile.close()

        if skipped_lines:
            print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))