Example no. 1
def process(args, raw_info_lines, input_headers, output_headers):
    info_lines = Util.get_stripped_lines(raw_info_lines)
    # extract info from the .csv file
    rows = list(csv.reader(info_lines))
    # the number of columns should be consistent among rows
    if len(set(len(row) for row in rows)) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data
    if args.star_missing_in:
        data_rows = [[None if v == '*' else v for v in r] for r in data_rows]
    elif args.NULL_missing_in:
        data_rows = [[None if v == 'NULL' else v for v in r]
                     for r in data_rows]
    # define the renamed input headers
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # define the ordered output headers
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(bad_output_headers)
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    # build the output data rows by reordering the columns
    data_rows = [[row[h_to_i[h]] for h in output_headers] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested
    if args.add_indices:
        table = [[i + 1] + row for i, row in enumerate(table)]
    # begin writing the R table
    out = StringIO()
    # write the table header
    print >> out, '\t'.join(output_headers)
    # write the table
    for row in table:
        print >> out, '\t'.join(str(x) for x in row)
    # return the table
    return out.getvalue()
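
A minimal, self-contained sketch (modern Python 3) of the core transformation above: normalize the '*' missing-value marker, reorder the columns by header name, and emit a tab-separated table. The toy headers and rows are illustrative, not from the source.

import csv
import io

raw = 'name,height,weight\nIC100,12,*\nIC101,*,30\n'
rows = list(csv.reader(io.StringIO(raw)))
header, data_rows = rows[0], rows[1:]
# normalize the '*' missing-value marker to None
data_rows = [[None if v == '*' else v for v in r] for r in data_rows]
# map each input header to its column index
h_to_i = {h: i for i, h in enumerate(header)}
# reorder the data columns to match the requested output headers
output_headers = ['name', 'weight']
reordered = [[row[h_to_i[h]] for h in output_headers] for row in data_rows]
# replace missing elements with the R-friendly 'NA'
table = [['NA' if x is None else x for x in row] for row in reordered]
out = io.StringIO()
print('\t'.join(output_headers), file=out)
for row in table:
    print('\t'.join(str(x) for x in row), file=out)
print(out.getvalue())  # name/weight header, then rows IC100 NA and IC101 30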
Example no. 2
def process(args, raw_info_lines, raw_input_headers, raw_output_headers):
    info_lines = Util.get_stripped_lines(raw_info_lines)
    rows = [line.split() for line in info_lines]
    # the number of columns should be consistent among rows
    if len(set(len(row) for row in rows)) != 1:
        msg = 'the number of columns should be consistent among rows'
        raise ValueError(msg)
    # break the list of rows into a header row and data rows
    header, data_rows = rows[0], rows[1:]
    # account for missing input data
    if args.star_missing_in:
        data_rows = [[None if v == '*' else v for v in r] for r in data_rows]
    elif args.NULL_missing_in:
        data_rows = [[None if v == 'NULL' else v for v in r]
                     for r in data_rows]
    # define the renamed input headers
    input_headers = Util.get_stripped_lines(raw_input_headers)
    if len(input_headers) < len(header):
        msg = 'each input header should be explicitly (re)named'
        raise ValueError(msg)
    if len(header) < len(input_headers):
        msg = 'more renamed headers than input headers'
        raise ValueError(msg)
    for h in input_headers:
        if not Carbone.is_valid_header(h):
            msg = 'invalid column header: %s' % h
            raise ValueError(msg)
    # force IC prefix for non-missing elements in the first column if requested
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # define the ordered output headers
    output_headers = Util.get_stripped_lines(raw_output_headers)
    bad_output_headers = set(output_headers) - set(input_headers)
    if bad_output_headers:
        msg_a = 'unrecognized output column headers: '
        msg_b = ', '.join(bad_output_headers)
        raise ValueError(msg_a + msg_b)
    # define the order of the output data columns
    h_to_i = dict((h, i) for i, h in enumerate(input_headers))
    # build the output data rows by reordering the columns
    data_rows = [[row[h_to_i[h]] for h in output_headers] for row in data_rows]
    # deal with missing data by skipping rows or replacing elements
    table = []
    for row in data_rows:
        if args.remove_missing_out and (None in row):
            continue
        elif args.NA_missing_out:
            row = ['NA' if x is None else x for x in row]
        table.append(row)
    # add row index labels for R compatibility if requested
    if args.add_indices:
        table = [[i+1] + row for i, row in enumerate(table)]
    # begin writing the R table
    out = StringIO()
    # write the table header
    print >> out, '\t'.join(output_headers)
    # write the table
    for row in table:
        print >> out, '\t'.join(str(x) for x in row)
    # return the table
    return out.getvalue()
Example no. 3
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError('expected the axis column %s '
                             'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # find the set of indices of duplicate points
    dup_indices = get_dup_indices(points, fs.radius)
    # get the data rows with duplicate indices removed
    new_rows = [row for i, row in enumerate(data_rows) if i not in dup_indices]
    # construct the new table
    out = StringIO()
    print >> out, '\t'.join(header_row)
    print >> out, '\n'.join('\t'.join(row) for row in new_rows)
    return out.getvalue()
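
get_dup_indices is referenced but not defined in this listing. Below is a plausible O(n^2) sketch, assuming the function returns the indices of points that fall within the given radius of an earlier point; the name and behavior are inferred from the call site, not confirmed by the source.

import numpy as np

def get_dup_indices(points, radius):
    # keep the first point of each near-duplicate group, mark later ones
    dup = set()
    for i in range(len(points)):
        if i in dup:
            continue
        dists = np.linalg.norm(points - points[i], axis=1)
        for j in np.nonzero(dists <= radius)[0]:
            if j > i:
                dup.add(int(j))
    return dup

points = np.array([[0.0, 0.0], [0.001, 0.0], [5.0, 5.0]])
print(get_dup_indices(points, radius=0.01))  # {1}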
Example no. 4
def get_response_content(fs):
    # get the r table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # check requested variable names as column headers
    if fs.var_a not in header_row:
        raise ValueError('the first variable name is not a column header')
    if fs.var_b not in header_row:
        raise ValueError('the second variable name is not a column header')
    return RUtil.run_with_table(fs.table, fs, get_script_content)
Example no. 5
def get_response_content(fs):
    # get the r table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # check requested variable names as column headers
    if fs.variable not in header_row:
        msg = 'the variable name was not found as a column in the data table'
        raise ValueError(msg)
    if fs.factor not in header_row:
        msg = 'the factor name was not found as a column in the data table'
        raise ValueError(msg)
    return RUtil.run_with_table(fs.table, fs, get_script_content)
Example no. 6
def read_microsatellite_lines(raw_lines):
    """
    How can I combine the two haploid data sources?
    Maybe create each data matrix separately from the interleaved input.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    if len(lines) % 2:
        raise ValueError('expected an even number of lines')
    if len(lines) < 2:
        raise ValueError('expected at least two lines')
    full_rows = [x.split() for x in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    for row in full_rows:
        if len(row) != nfullcols:
            msg = 'each row should have the same number of elements'
            raise ValueError(msg)
    a_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 0]
    b_full_rows = [row for i, row in enumerate(full_rows) if i % 2 == 1]
    a_headers = [row[0] for row in a_full_rows]
    b_headers = [row[0] for row in b_full_rows]
    for h in a_headers:
        if not h.endswith('a'):
            msg = 'each odd row label should end with the letter a'
            raise ValueError(msg)
    for h in b_headers:
        if not h.endswith('b'):
            msg = 'each even row label should end with the letter b'
            raise ValueError(msg)
    headers = [h[:-1] for h in a_headers]
    # get the unique elements of each column
    rows = [row[1:] for row in full_rows]
    cols = zip(*rows)
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the results for each row
    a_rows = [row[1:] for row in a_full_rows]
    b_rows = [row[1:] for row in b_full_rows]
    a_columns = zip(*a_rows)
    b_columns = zip(*b_rows)
    a_binary_rows = Carbone.get_binary_rows_helper(a_columns, uniques)
    b_binary_rows = Carbone.get_binary_rows_helper(b_columns, uniques)
    # add the elements entrywise and return as a list of lists
    bin_row_groups = [a_binary_rows, b_binary_rows]
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
Example no. 7
def read_microsatellite_lines(raw_lines):
    """
    How can I combine the two haploid data sources?
    Maybe create each data matrix separately from the interleaved input.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    full_rows = [line.split() for line in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if not all(len(row) == nfullcols for row in full_rows):
        raise ValueError('expected the same number of elements in each row')
    full_cols = zip(*full_rows)
    full_names = full_cols[0]
    ploidy = get_ploidy(full_names)
    headers = list(gen_headers(full_names, ploidy))
    # get the unique elements of each column
    rows = [row[1:] for row in full_rows]
    cols = zip(*rows)
    uniques = [list(iterutils.unique_everseen(col)) for col in cols]
    # get the rows for each offset
    n = len(rows) // ploidy
    groups = [[rows[j * ploidy + i] for j in range(n)] for i in range(ploidy)]
    # get the column groups
    col_groups = [zip(*m) for m in groups]
    # get the binary row groups
    bin_row_groups = [
        Carbone.get_binary_rows_helper(cols, uniques) for cols in col_groups
    ]
    # get the entrywise sum
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
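
Carbone.get_binary_rows_helper is not shown here. The sketch below illustrates the encoding it appears to perform: each column expands into a block of indicator slots, one per unique value, and summing the haploid encodings entrywise gives a diploid dosage in {0, 1, 2}. The helper name one_hot_rows and the toy alleles are assumptions for illustration.

import numpy as np

def one_hot_rows(columns, uniques):
    # for each individual, concatenate per-column indicator vectors,
    # one slot per unique value observed in that column
    nrows = len(columns[0])
    rows = []
    for i in range(nrows):
        row = []
        for col, u in zip(columns, uniques):
            row.extend(1 if col[i] == v else 0 for v in u)
        rows.append(row)
    return rows

a_rows = [['101', '7'], ['102', '7']]  # haploid copy "a", two loci
b_rows = [['101', '8'], ['101', '7']]  # haploid copy "b"
cols = list(zip(*(a_rows + b_rows)))
uniques = [sorted(set(c)) for c in cols]
a_bin = one_hot_rows(list(zip(*a_rows)), uniques)
b_bin = one_hot_rows(list(zip(*b_rows)), uniques)
diploid = (np.array(a_bin) + np.array(b_bin)).tolist()
print(diploid)  # [[2, 0, 1, 1], [1, 1, 2, 0]]: allele dosage per individual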
Example no. 8
def process(args, raw_hud_lines):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    names, data = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        names = [Carbone.clean_isolate_element(x) for x in names]
    # get the pcs
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    # check for sufficient number of eigenvectors
    if len(pcs) < args.npcs:
        msg_a = 'the number of requested principal components '
        msg_b = 'must be no more than the number of OTUs'
        raise ValueError(msg_a + msg_b)
    # create the R frame
    headers = ['otu'] + ['pc%d' % (i+1) for i in range(args.npcs)]
    print >> out, '\t'.join(headers)
    for i, name in enumerate(names):
        typed_row = [name] + [pcs[j][i] for j in range(args.npcs)]
        if args.add_indices:
            typed_row = [i+1] + typed_row
        row = [str(x) for x in typed_row]
        print >> out, '\t'.join(row)
    return out.getvalue()
Example no. 9
def process(args, raw_hud_lines):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    names, data = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        names = [Carbone.clean_isolate_element(x) for x in names]
    # get the pcs
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    # check for sufficient number of eigenvectors
    if len(pcs) < args.npcs:
        msg_a = 'the number of requested principal components '
        msg_b = 'must be no more than the number of OTUs'
        raise ValueError(msg_a + msg_b)
    # create the R frame
    headers = ['otu'] + ['pc%d' % (i + 1) for i in range(args.npcs)]
    print >> out, '\t'.join(headers)
    for i, name in enumerate(names):
        typed_row = [name] + [pcs[j][i] for j in range(args.npcs)]
        if args.add_indices:
            typed_row = [i + 1] + typed_row
        row = [str(x) for x in typed_row]
        print >> out, '\t'.join(row)
    return out.getvalue()
Example no. 10
def process(fs, raw_lines):
    headers, sequences = Phylip.decode(raw_lines)
    binary_rows = Carbone.get_binary_rows(sequences)
    if fs.hud:
        return hud.encode(headers, binary_rows) + '\n'
    elif fs.phy:
        binary_seqs = [''.join(str(x) for x in row) for row in binary_rows]
        return Phylip.encode(headers, binary_seqs) + '\n'
Example no. 11
def get_response_content(fs):
    # get the independent variable names
    indep = Util.get_stripped_lines(fs.independent.splitlines())
    dep = fs.dependent
    # get the r table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # check requested variable names as column headers
    bad_indep_names = set(indep) - set(header_row)
    if bad_indep_names:
        raise ValueError('these requested independent variable names '
                         'were not found as columns '
                         'in the data table: ' + str(bad_indep_names))
    if dep not in header_row:
        raise ValueError('the dependent variable name '
                         'was not found as a column in the data table')
    return RUtil.run_with_table(fs.table, (indep, dep), get_script_content)
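
get_script_content is not included in this listing. Below is a hypothetical sketch of the kind of R script such a generator might emit for a multiple regression; the read.table and lm calls are assumptions, not the source's actual script.

def get_script_content_sketch(temp_table_name, indep, dep):
    # build an R script that reads the table and fits dep ~ indep1 + ...
    formula = '%s ~ %s' % (dep, ' + '.join(indep))
    return '\n'.join([
        'd <- read.table("%s")' % temp_table_name,
        'm <- lm(%s, data=d)' % formula,
        'summary(m)',
    ])

print(get_script_content_sketch('my.table', ['pc1', 'pc2'], 'temperature'))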
Example no. 12
def get_response_content(fs):
    # get the independent variable names
    indep = Util.get_stripped_lines(fs.independent.splitlines())
    dep = fs.dependent
    # get the r table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # check requested variable names as column headers
    bad_indep_names = set(indep) - set(header_row)
    if bad_indep_names:
        raise ValueError(
            "these requested independent variable names "
            "were not found as columns "
            "in the data table: " + str(bad_indep_names)
        )
    if dep not in header_row:
        raise ValueError("the dependent variable name " "was not found as a column in the data table")
    return RUtil.run_with_table(fs.table, (indep, dep), get_script_content)
Example no. 13
def process(args, table_lines):
    """
    @param args: command line or web input
    @param table_lines: input lines
    @return: the image data as a string
    """
    rtable = RUtil.RTable(table_lines)
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Read the relevant columns and their labels.
    plot_info = PlotInfo(args, header_row, data_rows)
    # Get info for the temporary data
    augmented_lines = plot_info.get_augmented_table_lines()
    # Create a temporary data table file for R.
    table_string = "\n".join(augmented_lines)
    temp_table_name = Util.create_tmp_file(table_string, suffix=".table")
    # Create a temporary pathname for the plot created by R.
    temp_plot_name = Util.get_tmp_filename()
    # Create a temporary R script file.
    script = plot_info.get_script(args, temp_plot_name, temp_table_name)
    temp_script_name = Util.create_tmp_file(script, suffix=".R")
    # Call R.
    retcode, r_out, r_err = RUtil.run(temp_script_name)
    if retcode:
        raise ValueError("R error:\n" + r_err)
    # Delete the temporary data table file.
    os.unlink(temp_table_name)
    # Delete the temporary script file.
    os.unlink(temp_script_name)
    # Read the image file.
    try:
        with open(temp_plot_name, "rb") as fin:
            image_data = fin.read()
    except IOError:
        raise HandlingError("the R call seems not to have created the plot")
    # Delete the temporary image file.
    os.unlink(temp_plot_name)
    # Return the image data as a string.
    return image_data
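
RUtil.run and the Util temp-file helpers are external to this listing. Below is a rough stand-in for the write-script, run-R, clean-up pattern above, using only the Python standard library; it assumes the Rscript executable is on the PATH and is a sketch, not the RUtil implementation.

import os
import subprocess
import tempfile

def run_r_script_sketch(script_text):
    # write the script to a temp file, run it with Rscript, clean up
    fd, path = tempfile.mkstemp(suffix='.R')
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(script_text)
        proc = subprocess.run(['Rscript', path],
                              capture_output=True, text=True)
        if proc.returncode:
            raise ValueError('R error:\n' + proc.stderr)
        return proc.stdout
    finally:
        os.unlink(path)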
Example no. 14
def _init_colors(self, args, headers, data):
    """
    Colors are numeric, and use whatever gradient is built into R.
    """
    self.color_header = args.color
    if self.color_header not in headers:
        raise ValueError('bad color column header: ' + self.color_header)
    index = self.h_to_i[self.color_header]
    try:
        self.color_list = Carbone.get_numeric_column(data, index)
    except Carbone.NumericError:
        raise ValueError('expected the color column %s '
                         'to be numeric' % self.color_header)
Example no. 15
def get_rtable_info(rtable, cluster_header, axis_headers):
    """
    @param rtable: a RUtil.RTable object
    @param cluster_header: header of the new column to add
    @param axis_headers: a tuple of column headers
    @return: points as rows in a numpy array
    """
    header_row = rtable.headers
    data_rows = rtable.data
    # do header validation
    Carbone.validate_headers(header_row)
    if not Carbone.is_valid_header(cluster_header):
        raise ValueError('invalid column header: %s' % cluster_header)
    if cluster_header in header_row:
        raise ValueError(
                'the column header %s '
                'is already in the table' % cluster_header)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    return points
Example no. 16
def _init_colors(self, args, headers, data):
    """
    Colors are numeric, and use whatever gradient is built into R.
    """
    self.color_header = args.color
    if self.color_header not in headers:
        raise ValueError('bad color column header: ' + self.color_header)
    index = self.h_to_i[self.color_header]
    try:
        self.color_list = Carbone.get_numeric_column(data, index)
    except Carbone.NumericError:
        raise ValueError(
                'expected the color column %s '
                'to be numeric' % self.color_header)
Example no. 17
def process(args, table_lines):
    """
    @param args: command line or web input
    @param table_lines: input lines
    @return: the image data as a string
    """
    rtable = RUtil.RTable(table_lines)
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Read the relevant columns and their labels.
    plot_info = PlotInfo(args, header_row, data_rows)
    # Get info for the temporary data
    augmented_lines = plot_info.get_augmented_table_lines()
    table_string = '\n'.join(augmented_lines)
    temp_table_name = Util.create_tmp_file(table_string, suffix='.table')
    temp_plot_name = Util.get_tmp_filename()
    script = plot_info.get_script(args, temp_plot_name, temp_table_name)
    temp_script_name = Util.create_tmp_file(script, suffix='.R')
    # Call R.
    retcode, r_out, r_err = RUtil.run(temp_script_name)
    if retcode:
        raise ValueError('R error:\n' + r_err)
    # Delete the temporary data table file.
    os.unlink(temp_table_name)
    # Delete the temporary script file.
    os.unlink(temp_script_name)
    # Read the image file.
    try:
        with open(temp_plot_name, 'rb') as fin:
            image_data = fin.read()
    except IOError:
        raise HandlingError('the R call seems not to have created the plot')
    # Delete the temporary image file.
    os.unlink(temp_plot_name)
    # Return the image data as a string.
    return image_data
Example no. 18
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # find the set of indices of duplicate points
    dup_indices = get_dup_indices(points, fs.radius)
    # get the data rows with duplicate indices removed
    new_rows = [row for i, row in enumerate(data_rows) if i not in dup_indices]
    # construct the new table
    out = StringIO()
    print >> out, '\t'.join(header_row)
    print >> out, '\n'.join('\t'.join(row) for row in new_rows)
    return out.getvalue()
Example no. 19
def _init_axes(self, args, headers, data):
    # read the axes
    self.axis_headers = args.axes
    # verify the number of axis headers
    if len(self.axis_headers) != 2:
        raise ValueError('expected two axis column headers')
    # verify the axis header contents
    bad_axis_headers = set(self.axis_headers) - set(headers)
    if bad_axis_headers:
        raise ValueError('bad axis column headers: ' +
                         ', '.join(bad_axis_headers))
    self.axis_lists = []
    for h in self.axis_headers:
        index = self.h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data, index)
        except Carbone.NumericError:
            raise ValueError('expected the axis column %s '
                             'to be numeric' % h)
        self.axis_lists.append(axis_list)
Example no. 20
def _init_axes(self, args, headers, data):
    # read the axes
    self.axis_headers = args.axes
    # verify the number of axis headers
    if len(self.axis_headers) != 2:
        raise ValueError('expected two axis column headers')
    # verify the axis header contents
    bad_axis_headers = set(self.axis_headers) - set(headers)
    if bad_axis_headers:
        raise ValueError(
                'bad axis column headers: ' + ', '.join(bad_axis_headers))
    self.axis_lists = []
    for h in self.axis_headers:
        index = self.h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        self.axis_lists.append(axis_list)
Example no. 21
def _init_axes(self, args, headers, data):
    # read the axes
    self.axis_headers = args.axes
    # verify the number of axis headers
    if len(self.axis_headers) != 3:
        raise ValueError("expected three axis column headers")
    # verify the axis header contents
    bad_axis_headers = set(self.axis_headers) - set(headers)
    if bad_axis_headers:
        msg_a = "bad axis column headers: "
        msg_b = ", ".join(bad_axis_headers)
        raise ValueError(msg_a + msg_b)
    self.axis_lists = []
    for h in self.axis_headers:
        index = self.h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data, index)
        except Carbone.NumericError:
            msg_a = "expected the axis column %s " % h
            msg_b = "to be numeric"
            raise ValueError(msg_a + msg_b)
        self.axis_lists.append(axis_list)
Example no. 22
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, distortion = cluster.vq.kmeans(points,
                                                 k,
                                                 iter=nrestarts,
                                                 thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        if k == n - 1:
            break
        k += 1
    max_k = k
    best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    best_calinski = -best_neg_calinski
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    print >> out, 'searched 2 <= k <= %d clusters' % max_k
    print >> out, '%.2f seconds' % (time.time() - tm)
    if fs.verbose:
        print >> out
        print >> out, '(k_unique, wgss, calinski):'
        for wgss, neg_calinski_k_pair in zip(wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            calinski = -neg_calinski
            row = [k_unique, wgss, calinski]
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
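
kmeans.get_calinski_index is not shown, but the quantities in the loop match the standard Calinski-Harabasz definition, CH(k) = (BGSS / (k - 1)) / (WGSS / (n - k)), which the agglomerative variants below compute explicitly. A self-contained sketch of that formula, assuming this is indeed what the helper computes:

import numpy as np

def calinski_index(points, labels):
    # BGSS and WGSS are the between- and within-group sums of squares
    points = np.asarray(points, dtype=float)
    labels = np.asarray(labels)
    n, k = len(points), len(set(labels.tolist()))
    grand = points.mean(axis=0)
    wgss = bgss = 0.0
    for c in set(labels.tolist()):
        members = points[labels == c]
        center = members.mean(axis=0)
        wgss += ((members - center) ** 2).sum()
        bgss += len(members) * ((center - grand) ** 2).sum()
    return (bgss / (k - 1)) / (wgss / (n - k))

pts = [[0, 0], [0, 1], [10, 10], [10, 11]]
print(calinski_index(pts, [0, 0, 1, 1]))  # 400.0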
Example no. 23
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError('expected the axis column %s '
                             'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        n = len(points)
        k = len(cluster_map)
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    neg_calinski, best_k = min(zip(neg_calinskis, cluster_counts))
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, wgss, calinski):'
        for k, wgss, neg_calinski in zip(cluster_counts, wgss_values,
                                         neg_calinskis):
            row = (k, wgss, -neg_calinski)
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
Example no. 24
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg_a = 'expected the axis column %s ' % h
            msg_b = 'to be numeric'
            raise ValueError(msg_a + msg_b)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # neg because both items in the pair are used for sorting
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, distortion = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        k_unique = len(set(labels))
        neg_calinski_k_pairs.append((-calinski, k_unique))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds:
            break
        if k == n-1:
            break
        k += 1
    max_k = k
    best_neg_calinski, best_k = min(neg_calinski_k_pairs)
    best_calinski = -best_neg_calinski
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    print >> out, 'searched 2 <= k <= %d clusters' % max_k
    print >> out, '%.2f seconds' % (time.time() - tm)
    if fs.verbose:
        print >> out
        print >> out, '(k_unique, wgss, calinski):'
        for wgss, neg_calinski_k_pair in zip(wgss_list, neg_calinski_k_pairs):
            neg_calinski, k_unique = neg_calinski_k_pair
            calinski = -neg_calinski
            row = [k_unique, wgss, calinski]
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
Example no. 25
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError('expected the axis column %s '
                             'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the wgss at each merge
    cluster_counts = []
    wgss_values = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # append to the lists
        cluster_counts.append(len(cluster_map))
        wgss_values.append(wgss)
    # compute the log wgss values
    wlogs = np.log(wgss_values)
    # reverse the log values so that they are by increasing cluster size
    wlogs = list(reversed(wlogs))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
        extents, len(points), fs.nsamples)
    # get the gaps
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        k, kp1 = nclusters_list[i], nclusters_list[ip1]
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0:
            if best_i is None:
                best_i = i
    best_k = nclusters_list[best_i]
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, expected, observed, gap, threshold, criterion):'
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            if i < n - 1:
                row += [criteria[i]]
            else:
                row += ['-']
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
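
do_sampling is not shown; judging from the loop, thresholds plays the role of the simulation error term s_{k+1} in the gap statistic of Tibshirani et al., so the selection rule is: choose the smallest k with Gap(k) >= Gap(k+1) - s_{k+1}. A sketch of that rule in isolation; note that the loop above leaves best_i as None (causing a TypeError at the indexing step) if no k satisfies the criterion, whereas this sketch falls back to the largest k.

def pick_k_by_gap(nclusters_list, gaps, thresholds):
    # smallest k with gaps[i] - gaps[i+1] + thresholds[i+1] > 0
    for i in range(len(nclusters_list) - 1):
        if gaps[i] - gaps[i + 1] + thresholds[i + 1] > 0:
            return nclusters_list[i]
    return nclusters_list[-1]

print(pick_k_by_gap([2, 3, 4], [0.5, 0.9, 0.8], [0.05, 0.05, 0.05]))  # 3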
Example no. 26
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the wgss at each merge
    cluster_counts = []
    wgss_values = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # append to the lists
        cluster_counts.append(len(cluster_map))
        wgss_values.append(wgss)
    # compute the log wgss values
    wlogs = np.log(wgss_values)
    # reverse the log values so that they are by increasing cluster size
    wlogs = list(reversed(wlogs))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
            extents, len(points), fs.nsamples)
    # get the gaps
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        k, kp1 = nclusters_list[i], nclusters_list[ip1]
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0:
            if best_i is None:
                best_i = i
    best_k = nclusters_list[best_i]
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, expected, observed, gap, threshold, criterion):'
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            if i < n-1:
                row += [criteria[i]]
            else:
                row += ['-']
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
Example no. 27
def get_response_content(fs):
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # get the numpy array of conformant points
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    axis_set = set(axis_headers)
    header_set = set(header_row)
    bad_axes = axis_set - header_set
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        n = len(points)
        k = len(cluster_map)
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    neg_calinski, best_k = min(zip(neg_calinskis, cluster_counts))
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, wgss, calinski):'
        for k, wgss, neg_calinski in zip(
                cluster_counts, wgss_values, neg_calinskis):
            row = (k, wgss, -neg_calinski)
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()