Ejemplo n.º 1
0
def process(line_sources):
    """
    Merge multiple hud sources, keeping only the shared headers.
    @param line_sources: sources of line iterables
    @return: the encoded hud text, terminated by a newline
    """
    # decode every source into a (headers, data) pair
    pairs = [hud.decode(src) for src in line_sources]
    header_list, data_list = zip(*pairs)
    # map each header to its row index, per source
    index_maps = [Util.inverse_map(headers) for headers in header_list]
    # headers common to every source
    common = set.intersection(*[set(headers) for headers in header_list])
    # all headers in first-seen order, filtered down to the common ones
    ordered = iterutils.unique_everseen(
            itertools.chain.from_iterable(header_list))
    out_headers = [h for h in ordered if h in common]
    # concatenate the per-source rows for each shared header
    out_data = []
    for h in out_headers:
        row = []
        for data, h_to_i in zip(data_list, index_maps):
            if h in h_to_i:
                row.extend(data[h_to_i[h]])
        out_data.append(row)
    return hud.encode(out_headers, out_data) + '\n'
Ejemplo n.º 2
0
def process(line_sources):
    """
    Join hud inputs on the headers present in all of them.
    @param line_sources: sources of line iterables
    @return: the merged hud string, newline terminated
    """
    # pull (headers, data) out of each input source
    decoded = [hud.decode(lines) for lines in line_sources]
    header_list, data_list = zip(*decoded)
    h_to_i_list = [Util.inverse_map(headers) for headers in header_list]
    # intersect the header sets, one source at a time
    shared = set(header_list[0])
    for headers in header_list[1:]:
        shared &= set(headers)
    # all headers in the order they first appear across the sources
    seen_in_order = list(iterutils.unique_everseen(
        itertools.chain.from_iterable(header_list)))
    out_headers = [h for h in seen_in_order if h in shared]
    # build one concatenated output row per surviving header
    out_data = []
    for h in out_headers:
        row = []
        for data, h_to_i in zip(data_list, h_to_i_list):
            if h in h_to_i:
                row.extend(data[h_to_i[h]])
        out_data.append(row)
    return hud.encode(out_headers, out_data) + '\n'
Ejemplo n.º 3
0
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved haploid microsatellite rows as diploid data.

    How can i combine the two haploid data sources?
    Maybe create each data matrix separately from the interleaved input.
    The per-offset binary matrices are summed entrywise at the end.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    full_rows = [line.split() for line in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if not all(len(row) == nfullcols for row in full_rows):
        raise ValueError('expected the same number of elements in each row')
    # NOTE: Python 2 zip returns a list, so full_cols[0] indexing works here
    full_cols = zip(*full_rows)
    full_names = full_cols[0]
    ploidy = get_ploidy(full_names)
    headers = list(gen_headers(full_names, ploidy))
    # get the unique elements of each column, in first-seen order,
    # pooled over all rows so every offset shares the same expansion
    rows = [row[1:] for row in full_rows]
    cols = zip(*rows)
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the rows for each offset
    # NOTE: Python 2 integer division; assumes rows interleave the
    # haploid sources so row j*ploidy+i belongs to offset i -- confirm
    n = len(rows) / ploidy
    groups = [[rows[j * ploidy + i] for j in range(n)] for i in range(ploidy)]
    # get the column groups
    col_groups = [zip(*m) for m in groups]
    # get the binary row groups, one matrix per haploid offset,
    # all aligned on the shared unique-value columns
    bin_row_groups = [
        Carbone.get_binary_rows_helper(cols, uniques) for cols in col_groups
    ]
    # get the entrywise sum
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
Ejemplo n.º 4
0
def get_binary_rows(multivalued_rows):
    """
    Convert multivalued data to binary data.
    @param multivalued_rows: elements of each rows can be anything
    @return: longer rows of binary elements
    """
    # transpose to columns, collect each column's distinct values in
    # first-seen order, then expand back into simple binary rows
    columns = zip(*multivalued_rows)
    uniques = [list(iterutils.unique_everseen(column)) for column in columns]
    return get_binary_rows_helper(columns, uniques)
Ejemplo n.º 5
0
 def __init__(self, header, column):
     """
     @param header: the name of the variable to be represented by shape
     @param column: a list of categorical values
     """
     self.header = header
     # keep a private copy of the raw categorical values
     self.values = column[:]
     self.unique_values = list(iterutils.unique_everseen(column))
     # for now just use sequential integers starting at zero as pch codes
     self.unique_pchs = range(len(self.unique_values))
     lookup = dict(zip(self.unique_values, self.unique_pchs))
     self.pchs = [lookup[value] for value in column]
Ejemplo n.º 6
0
def get_binary_rows(multivalued_rows):
    """
    Expand multivalued rows into rows of binary indicator elements.
    @param multivalued_rows: elements of each rows can be anything
    @return: longer rows of binary elements
    """
    cols = zip(*multivalued_rows)
    # distinct values per column, preserving first-seen order
    per_col_uniques = [list(iterutils.unique_everseen(c)) for c in cols]
    return get_binary_rows_helper(cols, per_col_uniques)
Ejemplo n.º 7
0
 def __init__(self, header, column):
     """
     @param header: the name of the variable to be represented by color
     @param column: a list of categorical values
     """
     self.header = header
     self.values = column[:]
     # one palette color per distinct value, in first-seen order
     self.unique_values = list(iterutils.unique_everseen(column))
     self.unique_colors = create_R_palette(len(self.unique_values))
     to_color = dict(zip(self.unique_values, self.unique_colors))
     self.colors = [to_color[v] for v in column]
     # pch fifteen is a solid block
     self.pch = 15
Ejemplo n.º 8
0
def get_labels(sqdists):
    """
    Label each point by its nearest center, compacting empty clusters.
    Inputs and outputs are numpy arrays.
    Account for the fact that sometimes a cluster will go away.
    That is, if no point is in the voronoi region of a centroid,
    then in the next iteration this cluster should disappear.
    @param sqdists: for each point, the squared distance to each center
    @return: for each point, the label of the nearest cluster
    """
    labels = np.argmin(sqdists, axis=1)
    # renumber the surviving labels contiguously, in first-seen order
    old_to_new = {}
    for old in labels:
        if old not in old_to_new:
            old_to_new[old] = len(old_to_new)
    return np.array([old_to_new[old] for old in labels])
Ejemplo n.º 9
0
 def gen_categories(self, tags, form_objects, form_out):
     """
     Yield the category strings that apply to this object.
     @return: an iterable of category strings
     """
     if self.show_io_types:
         # one input category per distinct form-object class
         type_names = [obj.__class__.__name__ for obj in form_objects]
         for name in iterutils.unique_everseen(type_names):
             yield 'input:' + name
         # the output type, when an output form is present
         if form_out:
             yield 'output:' + form_out.__class__.__name__
     if self.show_tags:
         for tag in tags:
             yield tag
     # an optional category attached to everything
     if self.universal:
         yield self.universal
Ejemplo n.º 10
0
 def gen_categories(self, tags, form_objects, form_out):
     """
     Generate every category string describing this object.
     @return: an iterable of category strings
     """
     if self.show_io_types:
         # deduplicated input class names, first occurrence wins
         class_names = [x.__class__.__name__ for x in form_objects]
         for class_name in iterutils.unique_everseen(class_names):
             yield 'input:' + class_name
         if form_out:
             yield 'output:' + form_out.__class__.__name__
     if self.show_tags:
         for t in tags:
             yield t
     if self.universal:
         yield self.universal
Ejemplo n.º 11
0
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved haploid rows ('...a' / '...b') as diploid data.

    How can i combine the two haploid data sources?
    Maybe create each data matrix separately from the interleaved input.
    Odd-position rows (labels ending 'a') and even-position rows
    (labels ending 'b') form two haploid matrices whose binary
    expansions are summed entrywise.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    if len(lines) % 2:
        raise ValueError('expected an even number of lines')
    if len(lines) < 2:
        raise ValueError('expected at least two lines')
    full_rows = [x.split() for x in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    for row in full_rows:
        if len(row) != nfullcols:
            msg = 'each row should have the same number of elements'
            raise ValueError(msg)
    # split the interleaved input into the 'a' and 'b' haploid halves
    a_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 0]
    b_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 1]
    a_headers = [row[0] for row in a_full_rows]
    b_headers = [row[0] for row in b_full_rows]
    for h in a_headers:
        if not h.endswith('a'):
            msg = 'each odd row label should end with the letter a'
            raise ValueError(msg)
    for h in b_headers:
        if not h.endswith('b'):
            msg = 'each even row label should end with the letter b'
            raise ValueError(msg)
    # strip the trailing 'a' to recover the shared sample names
    headers = [h[:-1] for h in a_headers]
    # get the unique elements of each column, pooled over both halves
    # so the two binary expansions are aligned on the same values
    rows = [row[1:] for row in full_rows]
    cols = zip(*rows)
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the results for each row
    a_rows = [row[1:] for row in a_full_rows]
    b_rows = [row[1:] for row in b_full_rows]
    a_columns = zip(*a_rows)
    b_columns = zip(*b_rows)
    a_binary_rows = Carbone.get_binary_rows_helper(a_columns, uniques)
    b_binary_rows = Carbone.get_binary_rows_helper(b_columns, uniques)
    # add the elements entrywise and return as a list of lists
    bin_row_groups = [a_binary_rows, b_binary_rows]
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
Ejemplo n.º 12
0
def read_microsatellite_lines(raw_lines):
    """
    Combine interleaved 'a'/'b' haploid rows into diploid data.

    How can i combine the two haploid data sources?
    Maybe create each data matrix separately from the interleaved input.
    The two haploid binary matrices share one set of per-column unique
    values, so their entrywise sum gives diploid counts.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    if len(lines) % 2:
        raise ValueError('expected an even number of lines')
    if len(lines) < 2:
        raise ValueError('expected at least two lines')
    full_rows = [x.split() for x in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    for row in full_rows:
        if len(row) != nfullcols:
            msg = 'each row should have the same number of elements'
            raise ValueError(msg)
    # alternate rows belong to the 'a' and 'b' haploid sources
    a_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 0]
    b_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 1]
    a_headers = [row[0] for row in a_full_rows]
    b_headers = [row[0] for row in b_full_rows]
    for h in a_headers:
        if not h.endswith('a'):
            msg = 'each odd row label should end with the letter a'
            raise ValueError(msg)
    for h in b_headers:
        if not h.endswith('b'):
            msg = 'each even row label should end with the letter b'
            raise ValueError(msg)
    # drop the trailing 'a' suffix to get the shared sample names
    headers = [h[:-1] for h in a_headers]
    # get the unique elements of each column, over the pooled rows,
    # so both halves expand against identical value columns
    rows = [row[1:] for row in full_rows]
    cols = zip(*rows)
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the results for each row
    a_rows = [row[1:] for row in a_full_rows]
    b_rows = [row[1:] for row in b_full_rows]
    a_columns = zip(*a_rows)
    b_columns = zip(*b_rows)
    a_binary_rows = Carbone.get_binary_rows_helper(a_columns, uniques)
    b_binary_rows = Carbone.get_binary_rows_helper(b_columns, uniques)
    # add the elements entrywise and return as a list of lists
    bin_row_groups = [a_binary_rows, b_binary_rows]
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
Ejemplo n.º 13
0
def process(args, raw_a_lines, raw_b_lines):
    """
    Inner-join two R tables on one shared column.

    Output rows follow the first table's order, keeping only join
    elements present in both tables, and are relabeled 1..n.
    NOTE: Python 2 only (print >> out syntax).
    @param args: has a join_header attribute naming the join column
    @param raw_a_lines: lines of the first R table
    @param raw_b_lines: lines of the second R table
    @return: the joined R table as a string
    """
    a_table = RUtil.RTable(raw_a_lines)
    b_table = RUtil.RTable(raw_b_lines)
    if args.join_header not in a_table.headers:
        msg = 'the first table does not have the requested column'
        raise ValueError(msg)
    if args.join_header not in b_table.headers:
        msg = 'the second table does not have the requested column'
        raise ValueError(msg)
    # the join column must be the ONLY header the two tables share
    concat_headers = a_table.headers + b_table.headers
    out_headers = list(iterutils.unique_everseen(concat_headers))
    nunique_headers = len(out_headers)
    if len(a_table.headers) + len(b_table.headers) != nunique_headers + 1:
        msg = 'the tables should share only the requested column'
        raise ValueError(msg)
    # get the column index for each table
    a_index = a_table.header_to_column_index(args.join_header)
    b_index = b_table.header_to_column_index(args.join_header)
    # get the join column for each table
    a_column = a_table.header_to_primary_column(args.join_header)
    b_column = b_table.header_to_primary_column(args.join_header)
    # get the set of join elements common to both tables
    common_join_element_set = set(a_column) & set(b_column)
    # for the second table get the map from join elements to row indices
    # (duplicate join elements keep only the last matching b row)
    b_j_to_i = dict((j, i) for i, j in enumerate(b_column))
    # create the output table without the R row labels
    out_data = []
    out_r_label = 1
    for row in a_table.data:
        a_join_element = row[a_index]
        if a_join_element not in common_join_element_set:
            continue
        # drop the first element of the a row -- presumably the R row
        # label -- TODO confirm against RTable's data layout
        a_out_row = row[1:]
        b_row_index = b_j_to_i[a_join_element]
        b_row = b_table.data[b_row_index]
        # remove the join column from the b row, then drop its leading
        # element as well
        b_out_row = [x for i, x in enumerate(b_row) if i != b_index][1:]
        out_row = [out_r_label] + a_out_row + b_out_row
        out_data.append(out_row)
        out_r_label += 1
    # write the R table as tab separated lines
    out = StringIO()
    print >> out, '\t'.join(out_headers)
    for row in out_data:
        print >> out, '\t'.join(str(x) for x in row)
    return out.getvalue()
Ejemplo n.º 14
0
 def addSprs(self, sprs):
     """
     Append new sprs to the model, announcing insertions to views.

     Items already in the model, and duplicates within sprs, are
     silently dropped. With sorting enabled, new items are merged at
     their sorted positions, one insertion per contiguous run.
     @param sprs: an iterable of spr items to add
     """
     sprs0 = set(self._sprs)
     sprs = [spr for spr in sprs if spr not in sprs0]
     sprs = list(unique_everseen(sprs))
     if not sprs:
         return

     if not self.sortingEnabled():
         # simple append: after insertion the new rows occupy the
         # INCLUSIVE range [startrow, startrow + len(sprs) - 1].
         # BUG FIX: beginInsertRows takes inclusive first/last rows;
         # the previous lastrow was one past the final new row.
         startrow = len(self._sprs)
         lastrow = len(self._sprs) + len(sprs) - 1
         self.beginInsertRows(QModelIndex(), startrow, lastrow)
         self._sprs.extend(sprs)
         self.endInsertRows()
     else:
         # merge old and new into sorted order, tagging each element
         # with whether it is newly inserted
         allsprs = [(spr, False) for spr in self._sprs] + \
                   [(spr, True) for spr in sprs]
         allsprs.sort(key=lambda x: self._key(x[0]))

         from itertools import groupby

         def isNewSpr(x):
             # groupby key: True over runs of newly added sprs
             i, (spr, t) = x
             return bool(t)

         for k, group in groupby(enumerate(allsprs), key=isNewSpr):
             if not k:
                 continue

             # enumerate() indices are the FINAL row positions; since
             # earlier runs are inserted first, each startrow is valid
             # in the current list state
             group = list(group)
             startrow, _ = group[0]
             lastrow, _ = group[-1]

             self.beginInsertRows(QModelIndex(), startrow, lastrow)
             self._sprs[startrow:startrow] = [spr for _, (spr, _) in group]
             self.endInsertRows()

         assert [spr for spr, _ in allsprs] == self._sprs, self._sprs
Ejemplo n.º 15
0
 def _init_unique_shapes(self):
     """Cache the distinct shapes, preserving first-seen order."""
     distinct = iterutils.unique_everseen(self.shape_list)
     self.unique_shapes = list(distinct)
Ejemplo n.º 16
0
 def _init_unique_shapes(self):
     """Record each shape once, in the order shapes first appear."""
     self.unique_shapes = list(
         iterutils.unique_everseen(self.shape_list))
Ejemplo n.º 17
0
 def __init__(self, alist):
     """
     Initialize as the deduplicated list plus a value-to-index map.
     @param alist: an iterable; first-seen ordering is preserved
     """
     list.__init__(self, unique_everseen(alist))
     # BUG FIX: index by position in the deduplicated list itself.
     # Enumerating the raw input recorded each duplicate's LAST
     # position in alist, which need not be a valid index into self.
     self._indexDict = dict((x, i) for i, x in enumerate(self))