def process(args, raw_info_lines, raw_input_headers, raw_output_headers):
    """
    Convert a whitespace-delimited table into a tab-separated R table.
    The first row of the input is treated as the header row, and every
    remaining row is treated as data.
    @param args: options object; the attributes read here are
    star_missing_in, NULL_missing_in, clean_isolates, remove_missing_out,
    NA_missing_out, and add_indices
    @param raw_info_lines: raw lines of the whitespace-delimited table
    @param raw_input_headers: raw lines giving a name for each input column
    @param raw_output_headers: raw lines giving the ordered output columns
    @return: the R table as a single newline-terminated string
    @raise ValueError: if the table is empty or ragged, if the number of
    renamed headers does not match the number of input columns, if a renamed
    header is invalid, or if an output header is not a renamed input header
    """
    info_lines = Util.get_stripped_lines(raw_info_lines)
    rows = [line.split() for line in info_lines]
    # an empty table has no header row, so report it explicitly
    # instead of letting the column consistency check report it
    if not rows:
        raise ValueError('the input table has no rows')
    # the number of columns should be consistent among rows
    column_counts = set(len(row) for row in rows)
    if len(column_counts) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data;
    # the star option takes precedence over the NULL option
    if args.star_missing_in:
        missing_token = '*'
    elif args.NULL_missing_in:
        missing_token = 'NULL'
    else:
        missing_token = None
    if missing_token is not None:
        data_rows = [
                [None if v == missing_token else v for v in r]
                for r in data_rows]
    # define the renamed input headers
    input_headers = Util.get_stripped_lines(raw_input_headers)
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # define the ordered output headers
    output_headers = Util.get_stripped_lines(raw_output_headers)
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        # sort the names so that the message is reproducible
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(sorted(bad_output_headers))
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns;
    # if a renamed header is repeated then its last position is used
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    output_indices = [h_to_i[h] for h in output_headers]
    # build the output data rows by reordering the columns
    data_rows = [[row[i] for i in output_indices] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested;
    # NOTE(review): the header line gets no matching label column, which
    # looks like the row-name layout that R accepts -- confirm
    if args.add_indices:
        table = [[i + 1] + row for i, row in enumerate(table)]
    # write the header line followed by one line per table row;
    # explicit writes are used so the code does not depend on the
    # Python 2 print statement
    out = StringIO()
    out.write('\t'.join(output_headers) + '\n')
    for row in table:
        out.write('\t'.join(str(x) for x in row) + '\n')
    # return the table
    return out.getvalue()
def process(args, raw_info_lines, input_headers, output_headers):
    """
    Convert a comma-separated table into a tab-separated R table.
    The first row of the input is treated as the header row, and every
    remaining row is treated as data.
    @param args: options object; the attributes read here are
    star_missing_in, NULL_missing_in, clean_isolates, remove_missing_out,
    NA_missing_out, and add_indices
    @param raw_info_lines: raw lines of the .csv table
    @param input_headers: a sequence giving a name for each input column
    @param output_headers: a sequence giving the ordered output columns
    @return: the R table as a single newline-terminated string
    @raise ValueError: if the table is empty or ragged, if the number of
    renamed headers does not match the number of input columns, if a renamed
    header is invalid, or if an output header is not a renamed input header
    """
    info_lines = Util.get_stripped_lines(raw_info_lines)
    # extract info from the .csv file
    rows = list(csv.reader(info_lines))
    # an empty table has no header row, so report it explicitly
    # instead of letting the column consistency check report it
    if not rows:
        raise ValueError('the input table has no rows')
    # the number of columns should be consistent among rows
    column_counts = set(len(row) for row in rows)
    if len(column_counts) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data;
    # the star option takes precedence over the NULL option
    if args.star_missing_in:
        missing_token = '*'
    elif args.NULL_missing_in:
        missing_token = 'NULL'
    else:
        missing_token = None
    if missing_token is not None:
        data_rows = [
                [None if v == missing_token else v for v in r]
                for r in data_rows]
    # check the renamed input headers
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # check the ordered output headers
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        # sort the names so that the message is reproducible
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(sorted(bad_output_headers))
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns;
    # if a renamed header is repeated then its last position is used
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    output_indices = [h_to_i[h] for h in output_headers]
    # build the output data rows by reordering the columns
    data_rows = [[row[i] for i in output_indices] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested;
    # NOTE(review): the header line gets no matching label column, which
    # looks like the row-name layout that R accepts -- confirm
    if args.add_indices:
        table = [[i + 1] + row for i, row in enumerate(table)]
    # write the header line followed by one line per table row;
    # explicit writes are used so the code does not depend on the
    # Python 2 print statement
    out = StringIO()
    out.write('\t'.join(output_headers) + '\n')
    for row in table:
        out.write('\t'.join(str(x) for x in row) + '\n')
    # return the table
    return out.getvalue()
def get_rtable_info(rtable, cluster_header, axis_headers):
    """
    Validate the table headers and extract the axis columns as points.
    @param rtable: a RUtil.RTable object
    @param cluster_header: header of the new column to add
    @param axis_headers: a tuple of column headers
    @return: points as rows in a numpy array
    @raise ValueError: if the cluster header is invalid or already in the
    table, if no axes are given, if an axis is not a table header,
    or if an axis column is not numeric
    """
    header_row = rtable.headers
    data_rows = rtable.data
    # do header validation
    Carbone.validate_headers(header_row)
    if not Carbone.is_valid_header(cluster_header):
        raise ValueError('invalid column header: %s' % cluster_header)
    if cluster_header in header_row:
        raise ValueError(
                'the column header %s '
                'is already in the table' % cluster_header)
    # check the requested axes against the table headers
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        # sort the names so that the message is reproducible
        raise ValueError('invalid axes: ' + ', '.join(sorted(bad_axes)))
    # map each header to its column index within a data row;
    # NOTE(review): the offset of one presumably skips a leading row label
    # in each data row -- confirm against RUtil.RTable
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    # get one list of numeric values per requested axis
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    # transpose the axis lists so that each row is one point;
    # the zip result is made into a list because numpy does not expand
    # an iterator into array rows
    points = np.array(list(zip(*axis_lists)))
    return points