예제 #1
0
def process(args, raw_info_lines, raw_input_headers, raw_output_headers):
    """
    Convert a whitespace-delimited table into a tab-delimited R table.
    @param args: options object; the attributes read here are
    star_missing_in, NULL_missing_in, clean_isolates, remove_missing_out,
    NA_missing_out and add_indices
    @param raw_info_lines: raw lines of the input table, header row first
    @param raw_input_headers: raw lines giving one (re)name per input column
    @param raw_output_headers: raw lines naming the output columns in order
    @return: the table as a single string, one tab-separated line per row
    @raise ValueError: if the table is empty or ragged, if the renamed
    headers do not match the input columns one-to-one, if a renamed header
    is invalid, or if an output header is not among the renamed headers
    """
    info_lines = Util.get_stripped_lines(raw_info_lines)
    rows = [line.split() for line in info_lines]
    # an empty table would otherwise be reported as inconsistent columns
    if not rows:
        msg = 'the table should have at least a header row'
        raise ValueError(msg)
    # the number of columns should be consistent among rows
    if len(set(len(row) for row in rows)) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data
    if args.star_missing_in:
        data_rows = [
                [None if v == '*' else v for v in r] for r in data_rows]
    elif args.NULL_missing_in:
        data_rows = [
                [None if v == 'NULL' else v for v in r] for r in data_rows]
    # define the renamed input headers
    input_headers = Util.get_stripped_lines(raw_input_headers)
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # define the ordered output headers
    output_headers = Util.get_stripped_lines(raw_output_headers)
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        # sort so that the error message is deterministic
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(sorted(bad_output_headers))
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    # build the output data rows by reordering the columns
    data_rows = [[row[h_to_i[h]] for h in output_headers] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested;
    # the header line deliberately gets no extra label for this column
    if args.add_indices:
        table = [[i+1] + row for i, row in enumerate(table)]
    # begin writing the R table
    out = StringIO()
    # write the table header;
    # explicit write() calls behave the same under Python 2 and Python 3
    out.write('\t'.join(output_headers) + '\n')
    # write the table
    for row in table:
        out.write('\t'.join(str(x) for x in row) + '\n')
    # return the table
    return out.getvalue()
예제 #2
0
파일: 20100611a.py 프로젝트: BIGtigr/xgcode
def process(args, raw_info_lines, input_headers, output_headers):
    """
    Convert a comma-separated table into a tab-delimited R table.
    @param args: options object; the attributes read here are
    star_missing_in, NULL_missing_in, clean_isolates, remove_missing_out,
    NA_missing_out and add_indices
    @param raw_info_lines: raw lines of the .csv input, header row first
    @param input_headers: sequence giving one (re)name per input column
    @param output_headers: sequence naming the output columns in order
    @return: the table as a single string, one tab-separated line per row
    @raise ValueError: if the table is empty or ragged, if the renamed
    headers do not match the input columns one-to-one, if a renamed header
    is invalid, or if an output header is not among the renamed headers
    """
    info_lines = Util.get_stripped_lines(raw_info_lines)
    # extract info from the .csv file
    rows = list(csv.reader(info_lines))
    # an empty table would otherwise be reported as inconsistent columns
    if not rows:
        msg = 'the table should have at least a header row'
        raise ValueError(msg)
    # the number of columns should be consistent among rows
    if len(set(len(row) for row in rows)) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data
    if args.star_missing_in:
        data_rows = [[None if v == '*' else v for v in r] for r in data_rows]
    elif args.NULL_missing_in:
        data_rows = [[None if v == 'NULL' else v for v in r]
                     for r in data_rows]
    # define the renamed input headers
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # define the ordered output headers
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        # sort so that the error message is deterministic
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(sorted(bad_output_headers))
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    # build the output data rows by reordering the columns
    data_rows = [[row[h_to_i[h]] for h in output_headers] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested;
    # the header line deliberately gets no extra label for this column
    if args.add_indices:
        table = [[i + 1] + row for i, row in enumerate(table)]
    # begin writing the R table
    out = StringIO()
    # write the table header;
    # explicit write() calls behave the same under Python 2 and Python 3
    out.write('\t'.join(output_headers) + '\n')
    # write the table
    for row in table:
        out.write('\t'.join(str(x) for x in row) + '\n')
    # return the table
    return out.getvalue()
예제 #3
0
def get_rtable_info(rtable, cluster_header, axis_headers):
    """
    Validate the table headers and extract the requested axes as points.
    The cluster header is only validated here; no column is added.
    @param rtable: a RUtil.RTable object
    @param cluster_header: header of the new column to add
    @param axis_headers: a tuple of column headers
    @return: points as rows in a numpy array
    @raise ValueError: if the cluster header is invalid or already present,
    if no axes are given, if an axis is not a table header,
    or if an axis column is not numeric
    """
    header_row = rtable.headers
    data_rows = rtable.data
    # do header validation
    Carbone.validate_headers(header_row)
    if not Carbone.is_valid_header(cluster_header):
        raise ValueError('invalid column header: %s' % cluster_header)
    if cluster_header in header_row:
        raise ValueError(
                'the column header %s '
                'is already in the table' % cluster_header)
    # get the numpy array of conformant points
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        # sort so that the error message is deterministic
        raise ValueError('invalid axes: ' + ', '.join(sorted(bad_axes)))
    # data column indices are offset by one relative to the header row;
    # presumably column 0 of each data row is the R row label -- TODO confirm
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    # materialize the zip so numpy sees a sequence of rows even where
    # zip() is lazy (Python 3); under Python 2 this is a harmless copy
    points = np.array(list(zip(*axis_lists)))
    return points