def process(line_sources):
    """
    Merge .hud tables from several sources, keeping rows common to all.
    @param line_sources: sources of line iterables
    @return: the merged .hud table text, newline terminated
    """
    # decode each source into a (headers, data) pair
    pairs = [hud.decode(source) for source in line_sources]
    headers_per_source, data_per_source = zip(*pairs)
    # per-source lookup from header name to row index
    lookups = [Util.inverse_map(names) for names in headers_per_source]
    # names that appear in every source
    common = set.intersection(*[set(names) for names in headers_per_source])
    # all names in first-seen order across the concatenated header lists
    seen_order = list(iterutils.unique_everseen(
        itertools.chain.from_iterable(headers_per_source)))
    # shared names, in first-seen order
    out_headers = [name for name in seen_order if name in common]
    out_data = []
    for name in out_headers:
        merged_row = []
        for rows, lookup in zip(data_per_source, lookups):
            if name in lookup:
                merged_row.extend(rows[lookup[name]])
        out_data.append(merged_row)
    return hud.encode(out_headers, out_data) + '\n'
def get_response_content(fs):
    """
    Validate diploid genotype data and optionally impute missing values.
    Values must be in {-1, 0, 1, 2}, where -1 marks missing data.
    @param fs: an object providing .data text and a .use_mode flag
    @return: a .hud table string
    @raise ValueError: on malformed input or an all-missing variable
    """
    lines = Util.get_stripped_lines(fs.data.splitlines())
    if len(lines) < 2:
        raise ValueError('expected at least two lines')
    rows = [line.split() for line in lines]
    headers = rows[0]
    data_rows = [[int(x) for x in row] for row in rows[1:]]
    # reject anything outside the allowed diploid alphabet
    for row in data_rows:
        for value in row:
            if value not in (-1, 0, 1, 2):
                msg = 'invalid diploid data value: %d' % value
                raise ValueError(msg)
    if fs.use_mode:
        # replace each -1 with the most common observed value in its row
        filled_rows = []
        for row in data_rows:
            observed = [v for v in row if v != -1]
            if not observed:
                msg = 'a variable has missing data for each individual'
                raise ValueError(msg)
            tallies = [0, 0, 0]
            for v in observed:
                tallies[v] += 1
            # max returns the first maximal index, so ties resolve to the
            # smallest genotype, matching counts.index(max(counts))
            mode = max(range(3), key=tallies.__getitem__)
            filled_rows.append([mode if v == -1 else v for v in row])
        data_rows = filled_rows
    # return the hud table (transposed so columns align with headers)
    return hud.encode(headers, zip(*data_rows))
def process(line_sources):
    """
    Combine multiple .hud inputs into a single table of shared rows.
    NOTE(review): this definition duplicates an earlier process() in this
    file and shadows it at import time — confirm which one is intended.
    @param line_sources: sources of line iterables
    @return: merged .hud text with a trailing newline
    """
    decoded_pairs = [hud.decode(lines) for lines in line_sources]
    header_lists, data_lists = zip(*decoded_pairs)
    # one name->row-index map per input source
    index_maps = [Util.inverse_map(hs) for hs in header_lists]
    # accumulate the set of names present in every source
    shared_names = set(header_lists[0])
    for hs in header_lists[1:]:
        shared_names &= set(hs)
    # every name, ordered by first appearance across all sources
    ordered_names = list(iterutils.unique_everseen(
        itertools.chain.from_iterable(header_lists)))
    out_headers = [name for name in ordered_names if name in shared_names]
    out_data = []
    for name in out_headers:
        # concatenate this name's row from each source that has it
        row = [v for rows, imap in zip(data_lists, index_maps)
               if name in imap for v in rows[imap[name]]]
        out_data.append(row)
    return hud.encode(out_headers, out_data) + '\n'
def process(fs, raw_lines):
    """
    Convert a phylip alignment into a binary character table.
    @param fs: an object with mutually exclusive .hud / .phy format flags
    @param raw_lines: the raw phylip input lines
    @return: the encoded table as a newline-terminated string
    """
    headers, sequences = Phylip.decode(raw_lines)
    binary_rows = Carbone.get_binary_rows(sequences)
    if fs.hud:
        return hud.encode(headers, binary_rows) + '\n'
    if fs.phy:
        # phylip output wants each row flattened into a 0/1 string
        joined = [''.join(str(bit) for bit in row) for row in binary_rows]
        return Phylip.encode(headers, joined) + '\n'
    # NOTE(review): falls through to an implicit None when neither flag is
    # set — presumably the caller guarantees exactly one of fs.hud / fs.phy
def get_response_content(fs):
    """
    Aggregate .hud rows into grouped rows and return the resulting table.
    @param fs: an object with .hud text plus .combine_exist and
        .remove_invariant flags
    @return: the aggregated .hud table string with a trailing newline
    """
    # get the headers and data from all of the input sources
    headers, sequences = hud.decode(fs.hud.splitlines())
    h_to_s = dict(zip(headers, sequences))
    headers_out = []
    sequences_out = []
    for p, hs in process_headers(headers):
        headers_out.append(p)
        # np.vstack requires a sequence of arrays; passing a bare generator
        # was deprecated and then removed in modern NumPy, so materialize
        # the list before stacking
        data = np.vstack([h_to_s[h] for h in hs]).sum(axis=0)
        if fs.combine_exist:
            # clamp summed counts down to presence/absence
            data = np.minimum(1, data)
        sequences_out.append(data)
    if fs.remove_invariant:
        sequences_out = remove_invariant_columns(sequences_out)
    return hud.encode(headers_out, sequences_out) + '\n'
def process(raw_lines):
    """
    Convert raw microsatellite lines into a .hud table.
    @param raw_lines: raw microsatellite input lines
    @return: .hud text ending with a newline
    """
    names, rows = read_microsatellite_lines(raw_lines)
    encoded = hud.encode(names, rows)
    return encoded + '\n'
def get_response_content(fs):
    """
    Build a .hud table from the microsatellite data posted in fs.data.
    @param fs: an object whose .data attribute holds the raw input text
    @return: .hud text ending with a newline
    """
    names, rows = read_microsatellite_lines(fs.data.splitlines())
    encoded = hud.encode(names, rows)
    return encoded + '\n'