def read_microsatellite_lines(raw_lines):
    """
    Merge interleaved 'a'/'b' haploid rows into one row per base label.

    Rows are expected to alternate: a row whose label ends in 'a'
    followed by a row whose label ends in 'b'. The first whitespace
    separated token of each row is the label and the rest is data.
    Each haploid half is converted separately by
    Carbone.get_binary_rows_helper, using the distinct values seen in
    each column over all rows, and the two results are added entrywise.
    NOTE(review): this module defines read_microsatellite_lines again
    further down; that later definition replaces this one.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    @raise ValueError: if the line count is odd or below two, if there
    are fewer than two columns, if rows differ in length, or if a label
    lacks the expected 'a' or 'b' suffix
    """
    lines = Util.get_stripped_lines(raw_lines)
    line_count = len(lines)
    if line_count % 2 != 0:
        raise ValueError('expected an even number of lines')
    if line_count < 2:
        raise ValueError('expected at least two lines')
    full_rows = [line.split() for line in lines]
    width = len(full_rows[0])
    if width < 2:
        raise ValueError('expected at least two columns')
    if any(len(row) != width for row in full_rows):
        raise ValueError('each row should have the same number of elements')
    # even positions hold the 'a' rows and odd positions the 'b' rows
    a_labels = [row[0] for row in full_rows[0::2]]
    b_labels = [row[0] for row in full_rows[1::2]]
    if any(not label.endswith('a') for label in a_labels):
        raise ValueError('each odd row label should end with the letter a')
    if any(not label.endswith('b') for label in b_labels):
        raise ValueError('each even row label should end with the letter b')
    # the shared header is the 'a' label without its one-letter suffix
    headers = [label[:-1] for label in a_labels]
    # distinct values per data column, taken over both halves together
    data_rows = [row[1:] for row in full_rows]
    uniques = [
            list(iterutils.unique_everseen(column))
            for column in zip(*data_rows)]
    # convert each haploid half column-wise against the shared uniques
    a_binary_rows = Carbone.get_binary_rows_helper(
            zip(*data_rows[0::2]), uniques)
    b_binary_rows = Carbone.get_binary_rows_helper(
            zip(*data_rows[1::2]), uniques)
    # entrywise sum of the two halves, returned as a list of lists
    stacked = np.array([a_binary_rows, b_binary_rows])
    return headers, stacked.sum(axis=0).tolist()
def read_microsatellite_lines(raw_lines):
    """
    Merge interleaved per-haplotype rows into one row per individual.

    The first whitespace separated token of each row is its label and
    the remaining tokens are data. The ploidy is obtained from the
    labels through get_ploidy, and consecutive runs of that many rows
    are treated as one individual. For each offset within a run the
    rows are converted by Carbone.get_binary_rows_helper, using the
    distinct values seen in each column over all rows, and the
    per-offset results are added entrywise.
    @param raw_lines: raw input lines
    @return: headers, summed binary rows as a list of lists
    @raise ValueError: if there are no lines, fewer than two columns,
    or rows of differing length
    """
    lines = Util.get_stripped_lines(raw_lines)
    full_rows = [line.split() for line in lines]
    # report empty input as a ValueError instead of a bare IndexError
    if not full_rows:
        raise ValueError('expected at least one line')
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if not all(len(row) == nfullcols for row in full_rows):
        raise ValueError('expected the same number of elements in each row')
    # Built directly as a tuple: subscripting zip(...) only works where
    # zip returns a list, i.e. on Python 2.
    full_names = tuple(row[0] for row in full_rows)
    ploidy = get_ploidy(full_names)
    headers = list(gen_headers(full_names, ploidy))
    # get the unique elements of each column
    rows = [row[1:] for row in full_rows]
    uniques = [
            list(iterutils.unique_everseen(column))
            for column in zip(*rows)]
    # Floor division keeps the count an int on both Python 2 and 3.
    # NOTE(review): rows past n * ploidy are left out of the groups;
    # presumably get_ploidy rejects such input -- confirm.
    n = len(rows) // ploidy
    # get the rows for each offset
    groups = [
            [rows[j * ploidy + offset] for j in range(n)]
            for offset in range(ploidy)]
    # Materialize the column groups so the helper receives real
    # sequences instead of one-shot iterators.
    col_groups = [list(zip(*group)) for group in groups]
    # get the binary row groups
    bin_row_groups = [
            Carbone.get_binary_rows_helper(group_cols, uniques)
            for group_cols in col_groups]
    # get the entrywise sum
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows