def process(line_sources):
    """
    Merge multiple hud sources, keeping only the headers shared by all.
    @param line_sources: sources of line iterables
    @return: the encoded output table followed by a newline
    """
    # decode every input source into a (headers, data) pair
    pairs = [hud.decode(lines) for lines in line_sources]
    header_list, data_list = zip(*pairs)
    # per-source lookup from header name to row index
    index_maps = [Util.inverse_map(headers) for headers in header_list]
    # headers common to every source
    shared = set(header_list[0])
    for headers in header_list[1:]:
        shared &= set(headers)
    # all headers in first-seen order, filtered down to the shared ones
    ordered = iterutils.unique_everseen(
            itertools.chain.from_iterable(header_list))
    out_headers = [h for h in ordered if h in shared]
    # concatenate the per-source rows for each shared header
    out_data = []
    for h in out_headers:
        merged_row = []
        for data, h_to_i in zip(data_list, index_maps):
            if h in h_to_i:
                merged_row.extend(data[h_to_i[h]])
        out_data.append(merged_row)
    return hud.encode(out_headers, out_data) + '\n'
def process(line_sources):
    """
    Combine several hud sources into one table restricted to common headers.
    @param line_sources: sources of line iterables
    @return: encoded table text terminated by a newline
    """
    decoded = [hud.decode(lines) for lines in line_sources]
    header_list, data_list = zip(*decoded)
    # map each header to its row index, per source
    h_to_i_list = [Util.inverse_map(headers) for headers in header_list]
    # the headers present in every source
    common = set.intersection(*[set(headers) for headers in header_list])
    # keep the common headers in order of first appearance anywhere
    every_header = itertools.chain.from_iterable(header_list)
    out_headers = [h for h in iterutils.unique_everseen(every_header)
            if h in common]
    # for each output header, chain together the rows from all sources
    out_data = []
    for header in out_headers:
        row = []
        for data, h_to_i in zip(data_list, h_to_i_list):
            if header in h_to_i:
                row.extend(data[h_to_i[header]])
        out_data.append(row)
    return hud.encode(out_headers, out_data) + '\n'
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved haploid rows and combine them into diploid binary data.

    Each individual contributes ploidy consecutive rows; the per-offset
    matrices are expanded to binary rows against the shared column uniques
    and then summed entrywise.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    @raise ValueError: if the table is empty, too narrow, or ragged
    """
    lines = Util.get_stripped_lines(raw_lines)
    if not lines:
        # the original indexed full_rows[0] and crashed on empty input
        raise ValueError('expected at least one line')
    full_rows = [line.split() for line in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if not all(len(row) == nfullcols for row in full_rows):
        raise ValueError('expected the same number of elements in each row')
    # list() so the transposed columns stay subscriptable under Python 3
    full_cols = list(zip(*full_rows))
    full_names = full_cols[0]
    ploidy = get_ploidy(full_names)
    headers = list(gen_headers(full_names, ploidy))
    # get the unique elements of each column
    rows = [row[1:] for row in full_rows]
    cols = list(zip(*rows))
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the rows for each offset; floor division so this also works
    # under true division (Python 3 or a __future__ import)
    n = len(rows) // ploidy
    groups = [[rows[j * ploidy + i] for j in range(n)] for i in range(ploidy)]
    # get the column groups
    col_groups = [list(zip(*m)) for m in groups]
    # get the binary row groups
    # (loop variable renamed so it no longer shadows cols above)
    bin_row_groups = [
            Carbone.get_binary_rows_helper(group_cols, uniques)
            for group_cols in col_groups]
    # get the entrywise sum
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
def get_binary_rows(multivalued_rows):
    """
    Convert multivalued data to binary data.
    @param multivalued_rows: elements of each rows can be anything
    @return: longer rows of binary elements
    """
    # transpose the rows into columns
    columns = list(zip(*multivalued_rows))
    # the distinct values of each column, in first-seen order
    uniques = []
    for column in columns:
        uniques.append(list(iterutils.unique_everseen(column)))
    # expand to simple binary rows
    return get_binary_rows_helper(columns, uniques)
def __init__(self, header, column):
    """
    @param header: the name of the variable to be represented by shape
    @param column: a list of categorical values
    """
    self.header = header
    # keep a private copy of the raw values
    self.values = list(column)
    # distinct values in order of first appearance
    self.unique_values = list(iterutils.unique_everseen(column))
    # for now just use sequential integers starting at zero
    self.unique_pchs = range(len(self.unique_values))
    # assign each categorical value its pch code
    lookup = dict(zip(self.unique_values, self.unique_pchs))
    self.pchs = [lookup[value] for value in column]
def __init__(self, header, column):
    """
    @param header: the name of the variable to be represented by color
    @param column: a list of categorical values
    """
    self.header = header
    # keep a private copy of the raw values
    self.values = list(column)
    # distinct values in first-seen order, each mapped to an R color
    self.unique_values = list(iterutils.unique_everseen(column))
    self.unique_colors = create_R_palette(len(self.unique_values))
    lookup = dict(zip(self.unique_values, self.unique_colors))
    self.colors = [lookup[value] for value in column]
    # pch fifteen is a solid block
    self.pch = 15
def get_labels(sqdists):
    """
    Label each point with its nearest cluster, using dense labels.

    Inputs and outputs are numpy arrays.
    Account for the fact that sometimes a cluster will go away.
    That is, if no point is in the voronoi region of a centroid,
    then in the next iteration this cluster should disappear.
    @param sqdists: for each point, the squared distance to each center
    @return: for each point, the label of the nearest cluster
    """
    labels = np.argmin(sqdists, axis=1)
    # relabel by order of first appearance in a single pass, so the
    # output labels are dense integers starting at zero
    # (replaces the two-pass iterutils.unique_everseen + dict build)
    old_to_new = {}
    for old in labels:
        if old not in old_to_new:
            old_to_new[old] = len(old_to_new)
    return np.array([old_to_new[old] for old in labels])
def gen_categories(self, tags, form_objects, form_out):
    """
    Yield the category strings describing this entry.
    @param tags: tag strings to emit when show_tags is enabled
    @param form_objects: input form objects, categorized by class name
    @param form_out: output form object, or a falsy value for none
    @return: an iterable of category strings
    """
    if self.show_io_types:
        # one category per distinct input form class, streamed in
        # first-seen order without building an intermediate list
        seen = set()
        for obj in form_objects:
            name = obj.__class__.__name__
            if name not in seen:
                seen.add(name)
                yield 'input:' + name
        if form_out:
            yield 'output:' + form_out.__class__.__name__
    if self.show_tags:
        for tag in tags:
            yield tag
    if self.universal:
        yield self.universal
def read_microsatellite_lines(raw_lines):
    """
    Combine two interleaved haploid data sources into diploid data.

    Rows alternate between the 'a' haplotype and the 'b' haplotype of
    each individual; both halves are binarized against the column
    uniques of the whole table and summed entrywise.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    @raise ValueError: on a malformed table
    """
    lines = Util.get_stripped_lines(raw_lines)
    if len(lines) % 2:
        raise ValueError('expected an even number of lines')
    if len(lines) < 2:
        raise ValueError('expected at least two lines')
    full_rows = [line.split() for line in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if any(len(row) != nfullcols for row in full_rows):
        raise ValueError('each row should have the same number of elements')
    # split the interleaved rows into the 'a' and 'b' sources
    a_full_rows = full_rows[0::2]
    b_full_rows = full_rows[1::2]
    a_headers = [row[0] for row in a_full_rows]
    b_headers = [row[0] for row in b_full_rows]
    if not all(h.endswith('a') for h in a_headers):
        raise ValueError('each odd row label should end with the letter a')
    if not all(h.endswith('b') for h in b_headers):
        raise ValueError('each even row label should end with the letter b')
    # strip the trailing haplotype letter from the labels
    headers = [h[:-1] for h in a_headers]
    # get the unique elements of each column over both sources
    rows = [row[1:] for row in full_rows]
    uniques = [list(iterutils.unique_everseen(col)) for col in zip(*rows)]
    # binarize each source against the shared uniques
    a_columns = zip(*[row[1:] for row in a_full_rows])
    b_columns = zip(*[row[1:] for row in b_full_rows])
    a_binary_rows = Carbone.get_binary_rows_helper(a_columns, uniques)
    b_binary_rows = Carbone.get_binary_rows_helper(b_columns, uniques)
    # add the elements entrywise and return as a list of lists
    bin_row_groups = [a_binary_rows, b_binary_rows]
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
def process(args, raw_a_lines, raw_b_lines):
    """
    Join two R tables on a shared column.
    @param args: carries a join_header attribute naming the join column
    @param raw_a_lines: lines of the first R table
    @param raw_b_lines: lines of the second R table
    @return: the joined R table as a string
    @raise ValueError: if the join column is missing or not the only
        header shared by the two tables
    """
    a_table = RUtil.RTable(raw_a_lines)
    b_table = RUtil.RTable(raw_b_lines)
    if args.join_header not in a_table.headers:
        msg = 'the first table does not have the requested column'
        raise ValueError(msg)
    if args.join_header not in b_table.headers:
        msg = 'the second table does not have the requested column'
        raise ValueError(msg)
    concat_headers = a_table.headers + b_table.headers
    out_headers = list(iterutils.unique_everseen(concat_headers))
    # the join column must be the only header shared by the two tables
    if len(a_table.headers) + len(b_table.headers) != len(out_headers) + 1:
        msg = 'the tables should share only the requested column'
        raise ValueError(msg)
    # get the column index for each table
    a_index = a_table.header_to_column_index(args.join_header)
    b_index = b_table.header_to_column_index(args.join_header)
    # get the join column for each table
    a_column = a_table.header_to_primary_column(args.join_header)
    b_column = b_table.header_to_primary_column(args.join_header)
    # join elements present in both tables
    shared = set(a_column) & set(b_column)
    # map each join element of the second table to its row index
    b_j_to_i = dict((j, i) for i, j in enumerate(b_column))
    # build the output rows, renumbering the R row labels from one
    out_data = []
    for row in a_table.data:
        join_element = row[a_index]
        if join_element not in shared:
            continue
        b_row = b_table.data[b_j_to_i[join_element]]
        # drop the join column and the R label from the second table's row
        b_out_row = [x for i, x in enumerate(b_row) if i != b_index][1:]
        out_data.append([len(out_data) + 1] + row[1:] + b_out_row)
    # write the R table
    out = StringIO()
    out.write('\t'.join(out_headers) + '\n')
    for row in out_data:
        out.write('\t'.join(str(x) for x in row) + '\n')
    return out.getvalue()
def addSprs(self, sprs):
    """
    Add the given sprs to the model, skipping ones already present.

    Duplicates within the argument are also dropped (first occurrence
    wins).  When sorting is enabled the new sprs are spliced into their
    sorted positions; otherwise they are appended at the end.
    @param sprs: an iterable of spr objects to add
    """
    sprs0 = set(self._sprs)
    sprs = [spr for spr in sprs if spr not in sprs0]
    sprs = list(unique_everseen(sprs))
    if not sprs:
        return
    if not self.sortingEnabled():
        startrow = len(self._sprs)
        # beginInsertRows takes INCLUSIVE first/last row numbers, so the
        # last new row is startrow + len(sprs) - 1; the original omitted
        # the -1 and announced one row too many to attached views
        lastrow = len(self._sprs) + len(sprs) - 1
        self.beginInsertRows(QModelIndex(), startrow, lastrow)
        self._sprs.extend(sprs)
        self.endInsertRows()
    else:
        # tag existing and new sprs, then sort them together so the new
        # ones land at their final sorted positions
        allsprs = [(spr, False) for spr in self._sprs] + \
            [(spr, True) for spr in sprs]
        allsprs.sort(key=lambda x: self._key(x[0]))
        from itertools import groupby
        def isNewSpr(x):
            # key on the "is new" tag of an (index, (spr, tag)) item
            i, (spr, t) = x
            return bool(t)
        # insert each contiguous run of new sprs as one row span; the
        # enumerated positions stay valid because runs are processed in
        # increasing order and earlier insertions realign the list
        for k, group in groupby(enumerate(allsprs), key=isNewSpr):
            if not k:
                continue
            group = list(group)
            startrow, _ = group[0]
            lastrow, _ = group[-1]
            self.beginInsertRows(QModelIndex(), startrow, lastrow)
            self._sprs[startrow:startrow] = [spr for _, (spr, _) in group]
            self.endInsertRows()
        assert [spr for spr, _ in allsprs] == self._sprs, self._sprs
def _init_unique_shapes(self):
    """Cache the distinct shapes of shape_list in first-seen order."""
    seen = set()
    uniques = []
    for shape in self.shape_list:
        if shape not in seen:
            seen.add(shape)
            uniques.append(shape)
    self.unique_shapes = uniques
def __init__(self, alist):
    """
    Initialize as the deduplicated version of alist.

    The list itself keeps only the first occurrence of each element.
    @param alist: an iterable of hashable elements
    """
    # NOTE(review): _indexDict is built from the ORIGINAL alist, so a
    # duplicated element maps to its LAST input position, which need not
    # match its position in this deduplicated list — presumably callers
    # always pass duplicate-free input; confirm against call sites.
    list.__init__(self, unique_everseen(alist))
    self._indexDict = dict((x, i) for i, x in enumerate(alist))