def process(args, raw_info_lines, input_headers, output_headers):
    """
    Convert comma separated input lines into a tab separated R table.
    @param args: user options from the web or cmdline
    @param raw_info_lines: raw lines of the .csv input, header row first
    @param input_headers: the (re)named header of each input column
    @param output_headers: the ordered headers of the output columns
    @return: the R table as a multi-line string
    """
    stripped_lines = Util.get_stripped_lines(raw_info_lines)
    csv_rows = list(csv.reader(stripped_lines))
    # every row, including the header row, must have the same width
    widths = set(len(r) for r in csv_rows)
    if len(widths) != 1:
        raise ValueError(
                'the number of columns should be consistent among rows')
    header = csv_rows[0]
    body = csv_rows[1:]
    # translate the selected input marker for missing data into None
    if args.star_missing_in:
        marker = '*'
    elif args.NULL_missing_in:
        marker = 'NULL'
    else:
        marker = None
    if marker is not None:
        body = [[None if v == marker else v for v in r] for r in body]
    # each column of the input needs exactly one renamed header
    n_renamed = len(input_headers)
    n_found = len(header)
    if n_renamed < n_found:
        raise ValueError('each input header should be explicitly (re)named')
    if n_found < n_renamed:
        raise ValueError('more renamed headers than input headers')
    for name in input_headers:
        if not Carbone.is_valid_header(name):
            raise ValueError('invalid column header: %s' % name)
    # optionally let Carbone normalize the isolate names in the table
    if args.clean_isolates:
        body = Carbone.clean_isolate_table(body)
    # the output columns must be chosen among the renamed input columns
    unknown = set(output_headers) - set(input_headers)
    if unknown:
        raise ValueError(
                'unrecognized output column headers: ' + ', '.join(unknown))
    # reorder the columns of each row to follow the output headers
    column_of = dict((name, pos) for pos, name in enumerate(input_headers))
    picks = [column_of[name] for name in output_headers]
    body = [[r[j] for j in picks] for r in body]
    # rows with missing data are either dropped or patched with NA
    table = []
    for r in body:
        if args.remove_missing_out and None in r:
            continue
        if args.NA_missing_out:
            r = ['NA' if v is None else v for v in r]
        table.append(r)
    # the one-based row labels get no header of their own
    if args.add_indices:
        table = [[n] + r for n, r in enumerate(table, 1)]
    # write the header line followed by one line per row
    out_lines = ['\t'.join(output_headers)]
    out_lines.extend('\t'.join(str(v) for v in r) for r in table)
    return '\n'.join(out_lines) + '\n'
def process(args, raw_info_lines, raw_input_headers, raw_output_headers):
    """
    Convert whitespace separated input lines into a tab separated R table.
    @param args: user options from the web or cmdline
    @param raw_info_lines: raw lines of the input table, header row first
    @param raw_input_headers: raw lines, one (re)named input header each
    @param raw_output_headers: raw lines, one output header each
    @return: the R table as a multi-line string
    """

    def with_missing(table_rows, marker):
        # replace each occurrence of the marker by None
        return [[None if v == marker else v for v in r] for r in table_rows]

    rows = [ln.split() for ln in Util.get_stripped_lines(raw_info_lines)]
    # every row, including the header row, must have the same width
    if len(set(len(r) for r in rows)) != 1:
        raise ValueError(
                'the number of columns should be consistent among rows')
    header, data_rows = rows[0], rows[1:]
    # translate the selected input marker for missing data into None
    if args.star_missing_in:
        data_rows = with_missing(data_rows, '*')
    elif args.NULL_missing_in:
        data_rows = with_missing(data_rows, 'NULL')
    # each column of the input needs exactly one renamed header
    input_headers = Util.get_stripped_lines(raw_input_headers)
    if len(input_headers) < len(header):
        raise ValueError('each input header should be explicitly (re)named')
    if len(header) < len(input_headers):
        raise ValueError('more renamed headers than input headers')
    for name in input_headers:
        if not Carbone.is_valid_header(name):
            raise ValueError('invalid column header: %s' % name)
    # optionally let Carbone normalize the isolate names in the table
    if args.clean_isolates:
        data_rows = Carbone.clean_isolate_table(data_rows)
    # the output columns must be chosen among the renamed input columns
    output_headers = Util.get_stripped_lines(raw_output_headers)
    unknown = set(output_headers) - set(input_headers)
    if unknown:
        raise ValueError(
                'unrecognized output column headers: ' + ', '.join(unknown))
    # reorder the columns of each row to follow the output headers
    position = dict((name, i) for i, name in enumerate(input_headers))
    order = [position[name] for name in output_headers]
    data_rows = [[r[i] for i in order] for r in data_rows]
    # rows with missing data are either dropped or patched with NA
    table = []
    for r in data_rows:
        if args.remove_missing_out and None in r:
            continue
        if args.NA_missing_out:
            r = ['NA' if v is None else v for v in r]
        table.append(r)
    # the one-based row labels get no header of their own
    if args.add_indices:
        table = [[n] + r for n, r in enumerate(table, 1)]
    # write the header line followed by one line per row
    pieces = ['\t'.join(output_headers) + '\n']
    for r in table:
        pieces.append('\t'.join(str(v) for v in r) + '\n')
    return ''.join(pieces)
def get_response_content(fs):
    """
    Drop the table rows that get_dup_indices reports as duplicate points.
    @param fs: user input; the table, axes and radius attributes are read
    @return: the filtered R table as a multi-line string
    """
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row, data_rows = rtable.headers, rtable.data
    Carbone.validate_headers(header_row)
    # the requested axes must be a nonempty subset of the table headers
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Columns are looked up at the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    column_of = dict((h, pos + 1) for pos, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = column_of[h]
        try:
            column = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(column)
    # one point per data row, one coordinate per requested axis
    points = np.array(list(zip(*axis_lists)))
    dup_indices = get_dup_indices(points, fs.radius)
    # keep the rows whose index was not flagged
    kept_lines = []
    for i, row in enumerate(data_rows):
        if i not in dup_indices:
            kept_lines.append('\t'.join(row))
    return '\t'.join(header_row) + '\n' + '\n'.join(kept_lines) + '\n'
def get_response_content(fs):
    """
    Validate two requested column names and run the R script on the table.
    @param fs: user input; the table, var_a and var_b attributes are read
    @return: whatever RUtil.run_with_table returns
    """
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # each requested variable must name a column of the table
    requirements = (
            ('var_a', 'the first variable name is not column header'),
            ('var_b', 'the second variable name is not column header'))
    for attr, msg in requirements:
        if getattr(fs, attr) not in header_row:
            raise ValueError(msg)
    return RUtil.run_with_table(fs.table, fs, get_script_content)
def get_response_content(fs):
    """
    Validate the variable and factor names and run the R script.
    @param fs: user input; the table, variable and factor attributes are read
    @return: whatever RUtil.run_with_table returns
    """
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # both requested names must be columns of the table
    requirements = (
            ('variable',
                'the variable name was not found '
                'as a column in the data table'),
            ('factor',
                'the factor name was not found '
                'as a column in the data table'))
    for attr, msg in requirements:
        if getattr(fs, attr) not in header_row:
            raise ValueError(msg)
    return RUtil.run_with_table(fs.table, fs, get_script_content)
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved pairs of haploid rows as diploid data.
    The input alternates between a row whose label ends with 'a'
    and a row whose label ends with 'b'.
    A binary matrix is built separately for the 'a' rows and the 'b' rows,
    and the two matrices are added entrywise.
    @param raw_lines: raw input lines
    @return: headers, diploid data
    """
    lines = Util.get_stripped_lines(raw_lines)
    nlines = len(lines)
    if nlines % 2:
        raise ValueError('expected an even number of lines')
    if nlines < 2:
        raise ValueError('expected at least two lines')
    full_rows = [ln.split() for ln in lines]
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if any(len(row) != nfullcols for row in full_rows):
        raise ValueError('each row should have the same number of elements')
    # de-interleave the rows
    a_full_rows = full_rows[0::2]
    b_full_rows = full_rows[1::2]
    # check the suffix of each row label
    if not all(row[0].endswith('a') for row in a_full_rows):
        raise ValueError('each odd row label should end with the letter a')
    if not all(row[0].endswith('b') for row in b_full_rows):
        raise ValueError('each even row label should end with the letter b')
    # the shared header is the 'a' label without its suffix
    headers = [row[0][:-1] for row in a_full_rows]
    # the unique elements of each data column, over all of the rows
    all_columns = zip(*[row[1:] for row in full_rows])
    uniques = [list(iterutils.unique_everseen(col)) for col in all_columns]
    # one group of binary rows per haploid source
    bin_row_groups = []
    for half in (a_full_rows, b_full_rows):
        columns = list(zip(*[row[1:] for row in half]))
        bin_row_groups.append(
                Carbone.get_binary_rows_helper(columns, uniques))
    # add the groups entrywise and return as a list of lists
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
def read_microsatellite_lines(raw_lines):
    """
    Read interleaved haploid rows and combine them per individual.
    The rows of an individual are consecutive,
    and the ploidy reported by get_ploidy says how many rows that is.
    A binary matrix is built for each offset within an individual,
    and the matrices are added entrywise.
    @param raw_lines: raw input lines
    @return: headers, combined data as a list of lists
    @raise ValueError: if the input is empty, has fewer than two columns,
    is ragged, or has a row count that is not a multiple of the ploidy
    """
    lines = Util.get_stripped_lines(raw_lines)
    full_rows = [line.split() for line in lines]
    # guard the full_rows[0] access below against empty input
    if not full_rows:
        raise ValueError('expected at least one line')
    nfullcols = len(full_rows[0])
    if nfullcols < 2:
        raise ValueError('expected at least two columns')
    if not all(len(row) == nfullcols for row in full_rows):
        raise ValueError('expected the same number of elements in each row')
    # the first column holds the row names
    full_names = tuple(row[0] for row in full_rows)
    ploidy = get_ploidy(full_names)
    headers = list(gen_headers(full_names, ploidy))
    # get the unique elements of each column
    rows = [row[1:] for row in full_rows]
    uniques = [list(iterutils.unique_everseen(col)) for col in zip(*rows)]
    # an incomplete trailing individual would otherwise be silently dropped
    if len(rows) % ploidy:
        raise ValueError(
                'expected the number of rows to be a multiple of the ploidy')
    # get the rows for each offset within an individual
    n = len(rows) // ploidy
    groups = [[rows[j * ploidy + i] for j in range(n)] for i in range(ploidy)]
    # get the binary row group of each offset from its columns
    bin_row_groups = [
            Carbone.get_binary_rows_helper(list(zip(*group)), uniques)
            for group in groups]
    # get the entrywise sum
    binary_rows = np.array(bin_row_groups).sum(axis=0).tolist()
    return headers, binary_rows
def process(args, raw_hud_lines):
    """
    Write the requested principal components of .hud data as an R table.
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    names, data = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        names = [Carbone.clean_isolate_element(nm) for nm in names]
    # get the pcs
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    # check for sufficient number of eigenvectors
    npcs = args.npcs
    if len(pcs) < npcs:
        raise ValueError(
                'the number of requested principal components '
                'must be no more than the number of OTUs')
    # the header line names the otu column and each principal component
    pc_range = range(npcs)
    out_lines = ['\t'.join(['otu'] + ['pc%d' % (j + 1) for j in pc_range])]
    # one line per otu, optionally preceded by a one-based row label
    for i, name in enumerate(names):
        typed_row = [name]
        typed_row.extend(pcs[j][i] for j in pc_range)
        if args.add_indices:
            typed_row.insert(0, i + 1)
        out_lines.append('\t'.join(str(v) for v in typed_row))
    return '\n'.join(out_lines) + '\n'
def process(args, raw_hud_lines):
    """
    Write the requested principal components of .hud data as an R table.
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    names, data = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        names = [Carbone.clean_isolate_element(x) for x in names]
    # get the scaled eigenvectors of the data matrix
    pcs = eigenpop.get_scaled_eigenvectors(
            np.array(data, dtype=float), args.diploid_and_biallelic)
    # check for sufficient number of eigenvectors
    if len(pcs) < args.npcs:
        raise ValueError(
                'the number of requested principal components '
                'must be no more than the number of OTUs')
    # create the R frame, beginning with its header line
    headers = ['otu']
    for j in range(args.npcs):
        headers.append('pc%d' % (j + 1))
    pieces = ['\t'.join(headers) + '\n']
    for i, name in enumerate(names):
        typed_row = [name] + [pcs[j][i] for j in range(args.npcs)]
        # the optional one-based row label goes in front of the name
        if args.add_indices:
            typed_row = [i + 1] + typed_row
        pieces.append('\t'.join(map(str, typed_row)) + '\n')
    return ''.join(pieces)
def process(fs, raw_lines):
    """
    Recode phylip sequences as binary rows in the requested format.
    @param fs: user input; the hud and phy flags select the output format
    @param raw_lines: raw lines of the phylip input
    @return: the encoded text with a trailing newline,
    or None when neither format flag is set
    """
    headers, sequences = Phylip.decode(raw_lines)
    binary_rows = Carbone.get_binary_rows(sequences)
    if fs.hud:
        encoded = hud.encode(headers, binary_rows)
    elif fs.phy:
        # each binary row becomes a string of its digits
        binary_seqs = [''.join(map(str, row)) for row in binary_rows]
        encoded = Phylip.encode(headers, binary_seqs)
    else:
        return None
    return encoded + '\n'
def get_response_content(fs):
    """
    Validate the regression variable names and run the R script.
    @param fs: user input; the table, independent and dependent
    attributes are read
    @return: whatever RUtil.run_with_table returns
    """
    # one independent variable name per line of input
    indep = Util.get_stripped_lines(fs.independent.splitlines())
    dep = fs.dependent
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # every requested name must be a column of the table
    known = set(header_row)
    bad_indep_names = set(indep) - known
    if bad_indep_names:
        msg = (
                'these requested independent variable names '
                'were not found as columns '
                'in the data table: ')
        raise ValueError(msg + str(bad_indep_names))
    if dep not in header_row:
        raise ValueError(
                'the dependent variable name '
                'was not found as a column in the data table')
    return RUtil.run_with_table(fs.table, (indep, dep), get_script_content)
def get_response_content(fs):
    """
    Check the requested regression variables and hand the table to R.
    @param fs: user input; the table, independent and dependent
    attributes are read
    @return: whatever RUtil.run_with_table returns
    """
    indep_lines = fs.independent.splitlines()
    indep = Util.get_stripped_lines(indep_lines)
    dep = fs.dependent
    # parse the table and validate its headers
    table_lines = fs.table.splitlines()
    rtable = RUtil.RTable(table_lines)
    header_row, data_rows = rtable.headers, rtable.data
    Carbone.validate_headers(header_row)
    # report all unknown independent variable names together
    bad_indep_names = set(indep).difference(header_row)
    if bad_indep_names:
        prefix = (
            "these requested independent variable names "
            "were not found as columns "
            "in the data table: "
        )
        raise ValueError(prefix + str(bad_indep_names))
    if dep not in header_row:
        raise ValueError(
            "the dependent variable name "
            "was not found as a column in the data table"
        )
    variables = (indep, dep)
    return RUtil.run_with_table(fs.table, variables, get_script_content)
def process(args, table_lines):
    """
    Plot the table by running a temporary R script.
    The temporary table, script and plot files are removed
    whether or not the R call succeeds.
    @param args: command line or web input
    @param table_lines: input lines
    @return: the image data as a string
    @raise ValueError: if R exits with a nonzero return code
    @raise HandlingError: if the plot file could not be read
    """
    rtable = RUtil.RTable(table_lines)
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Read the relevant columns and their labels.
    plot_info = PlotInfo(args, header_row, data_rows)
    # Get info for the temporary data
    augmented_lines = plot_info.get_augmented_table_lines()
    # Create a temporary data table file for R.
    table_string = "\n".join(augmented_lines)
    temp_table_name = Util.create_tmp_file(table_string, suffix=".table")
    # Paths that are not created yet stay None so that cleanup skips them.
    temp_plot_name = None
    temp_script_name = None
    try:
        # Create a temporary pathname for the plot created by R.
        temp_plot_name = Util.get_tmp_filename()
        # Create a temporary R script file.
        script = plot_info.get_script(args, temp_plot_name, temp_table_name)
        temp_script_name = Util.create_tmp_file(script, suffix=".R")
        # Call R.
        retcode, r_out, r_err = RUtil.run(temp_script_name)
        if retcode:
            raise ValueError("R error:\n" + r_err)
        # Read the image file.
        try:
            with open(temp_plot_name, "rb") as fin:
                image_data = fin.read()
        except IOError:
            raise HandlingError(
                    "the R call seems to not have created the plot")
    finally:
        # Delete the temporary files, including on the error paths.
        # The existence check matters because R may not have made the plot.
        for path in (temp_table_name, temp_script_name, temp_plot_name):
            if path is not None and os.path.exists(path):
                os.unlink(path)
    # Return the image data as a string.
    return image_data
def _init_colors(self, args, headers, data):
    """
    Read the color column named by args.color.
    Colors are numeric, and use whatever gradient is built into R.
    Sets self.color_header and self.color_list, and reads self.h_to_i.
    @param args: user options; the color attribute names the column
    @param headers: the column headers of the table
    @param data: the data rows of the table
    """
    header = args.color
    self.color_header = header
    if header not in headers:
        raise ValueError('bad color column header: ' + header)
    index = self.h_to_i[header]
    try:
        column = Carbone.get_numeric_column(data, index)
    except Carbone.NumericError:
        raise ValueError(
                'expected the color column %s to be numeric' % header)
    self.color_list = column
def get_rtable_info(rtable, cluster_header, axis_headers):
    """
    Validate the headers and extract the axis columns as points.
    @param rtable: a RUtil.RTable object
    @param cluster_header: header of the new column to add
    @param axis_headers: a tuple of column headers
    @return: points as rows in a numpy array
    """
    header_row, data_rows = rtable.headers, rtable.data
    # do header validation
    Carbone.validate_headers(header_row)
    if not Carbone.is_valid_header(cluster_header):
        raise ValueError('invalid column header: %s' % cluster_header)
    # the new column must not clash with an existing one
    if cluster_header in header_row:
        raise ValueError(
                'the column header %s is already in the table'
                % cluster_header)
    # the requested axes must be a nonempty subset of the table headers
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers).difference(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Columns are looked up at the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    column_of = dict((h, pos + 1) for pos, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = column_of[h]
        try:
            column = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(column)
    # one point per data row, one coordinate per requested axis
    return np.array(list(zip(*axis_lists)))
def _init_colors(self, args, headers, data):
    """
    Colors are numeric, and use whatever gradient is built into R.
    The column is named by args.color and located through self.h_to_i;
    the results are stored as self.color_header and self.color_list.
    @param args: user options; the color attribute names the column
    @param headers: the column headers of the table
    @param data: the data rows of the table
    """
    self.color_header = args.color
    name = self.color_header
    # the color column must exist
    if name not in headers:
        raise ValueError('bad color column header: ' + name)
    # the color column must be numeric
    position = self.h_to_i[name]
    try:
        numbers = Carbone.get_numeric_column(data, position)
    except Carbone.NumericError:
        msg = 'expected the color column %s to be numeric' % name
        raise ValueError(msg)
    self.color_list = numbers
def process(args, table_lines):
    """
    Plot the table by running a temporary R script.
    The temporary table, script and plot files are removed
    whether or not the R call succeeds.
    @param args: command line or web input
    @param table_lines: input lines
    @return: the image data as a string
    @raise ValueError: if R exits with a nonzero return code
    @raise HandlingError: if the plot file could not be read
    """
    rtable = RUtil.RTable(table_lines)
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Read the relevant columns and their labels.
    plot_info = PlotInfo(args, header_row, data_rows)
    # Get info for the temporary data
    augmented_lines = plot_info.get_augmented_table_lines()
    table_string = '\n'.join(augmented_lines)
    temp_table_name = Util.create_tmp_file(table_string, suffix='.table')
    # Paths that are not created yet stay None so that cleanup skips them.
    temp_plot_name = None
    temp_script_name = None
    try:
        temp_plot_name = Util.get_tmp_filename()
        script = plot_info.get_script(args, temp_plot_name, temp_table_name)
        temp_script_name = Util.create_tmp_file(script, suffix='.R')
        # Call R.
        retcode, r_out, r_err = RUtil.run(temp_script_name)
        if retcode:
            raise ValueError('R error:\n' + r_err)
        # Read the image file.
        try:
            with open(temp_plot_name, 'rb') as fin:
                image_data = fin.read()
        except IOError:
            raise HandlingError(
                    'the R call seems to not have created the plot')
    finally:
        # Delete the temporary files, including on the error paths.
        # The existence check matters because R may not have made the plot.
        for path in (temp_table_name, temp_script_name, temp_plot_name):
            if path is not None and os.path.exists(path):
                os.unlink(path)
    # Return the image data as a string.
    return image_data
def get_response_content(fs):
    """
    Filter out the rows that get_dup_indices flags as duplicate points.
    @param fs: user input; the table, axes and radius attributes are read
    @return: the filtered R table as a multi-line string
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # validate the requested axes
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers).difference(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Map each header to its column, which is the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    h_to_i = {}
    for i, h in enumerate(header_row):
        h_to_i[h] = i + 1
    # read each axis as a numeric column
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            numbers = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            msg = 'expected the axis column %s to be numeric' % h
            raise ValueError(msg)
        axis_lists.append(numbers)
    points = np.array(list(zip(*axis_lists)))
    # find the set of indices of duplicate points
    dup_indices = get_dup_indices(points, fs.radius)
    # write the header line and then the surviving rows
    body = '\n'.join(
            '\t'.join(row) for i, row in enumerate(data_rows)
            if i not in dup_indices)
    return '\n'.join(['\t'.join(header_row), body]) + '\n'
def _init_axes(self, args, headers, data):
    """
    Read the two axis columns named by args.axes.
    Sets self.axis_headers and self.axis_lists, and reads self.h_to_i.
    @param args: user options; the axes attribute names the columns
    @param headers: the column headers of the table
    @param data: the data rows of the table
    """
    axis_headers = args.axes
    self.axis_headers = axis_headers
    # exactly two axes are expected
    if len(axis_headers) != 2:
        raise ValueError('expected two axis column headers')
    # each axis must name a column of the table
    unknown = set(axis_headers) - set(headers)
    if unknown:
        raise ValueError('bad axis column headers: ' + ', '.join(unknown))
    # each axis column must be numeric
    columns = self.axis_lists = []
    for h in axis_headers:
        index = self.h_to_i[h]
        try:
            column = Carbone.get_numeric_column(data, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        columns.append(column)
def _init_axes(self, args, headers, data):
    """
    Store the two requested axis headers and their numeric columns.
    The headers come from args.axes and the columns are located
    through self.h_to_i; the results are stored as self.axis_headers
    and self.axis_lists.
    @param args: user options; the axes attribute names the columns
    @param headers: the column headers of the table
    @param data: the data rows of the table
    """
    # read the axes
    self.axis_headers = args.axes
    names = self.axis_headers
    # verify the number of axis headers
    if len(names) != 2:
        raise ValueError('expected two axis column headers')
    # verify the axis header contents
    bad_axis_headers = set(names).difference(headers)
    if bad_axis_headers:
        msg = 'bad axis column headers: ' + ', '.join(bad_axis_headers)
        raise ValueError(msg)
    # read the columns one axis at a time
    self.axis_lists = []
    for name in names:
        position = self.h_to_i[name]
        try:
            numbers = Carbone.get_numeric_column(data, position)
        except Carbone.NumericError:
            msg = 'expected the axis column %s to be numeric' % name
            raise ValueError(msg)
        self.axis_lists.append(numbers)
def _init_axes(self, args, headers, data):
    """
    Read the three axis columns named by args.axes.
    Sets self.axis_headers and self.axis_lists, and reads self.h_to_i.
    @param args: user options; the axes attribute names the columns
    @param headers: the column headers of the table
    @param data: the data rows of the table
    """
    axis_headers = args.axes
    self.axis_headers = axis_headers
    # exactly three axes are expected
    if len(axis_headers) != 3:
        raise ValueError("expected three axis column headers")
    # each axis must name a column of the table
    unknown = set(axis_headers) - set(headers)
    if unknown:
        raise ValueError("bad axis column headers: " + ", ".join(unknown))
    # each axis column must be numeric
    columns = self.axis_lists = []
    for h in axis_headers:
        index = self.h_to_i[h]
        try:
            column = Carbone.get_numeric_column(data, index)
        except Carbone.NumericError:
            raise ValueError(
                    "expected the axis column %s to be numeric" % h)
        columns.append(column)
def get_response_content(fs):
    """
    Pick a k-means cluster count by searching for the best calinski index.
    The search starts at k = 2 and stops when the time budget runs out
    or when k reaches the number of points minus one.
    @param fs: user input; the table, axes and verbose attributes are read
    @return: a multi-line report of the search
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row, data_rows = rtable.headers, rtable.data
    Carbone.validate_headers(header_row)
    # the requested axes must be a nonempty subset of the table headers
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Columns are looked up at the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    column_of = dict((h, pos + 1) for pos, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = column_of[h]
        try:
            column = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(column)
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    tm = time.time()
    n = len(points)
    wgss_list = []
    # negated so that the minimum pair has the greatest calinski index,
    # with ties broken towards the smaller cluster count
    neg_calinski_k_pairs = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, _ = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        calinski = kmeans.get_calinski_index(allmeandist - wgss, wgss, k, n)
        # the number of labels that are actually used
        neg_calinski_k_pairs.append((-calinski, len(set(labels))))
        wgss_list.append(wgss)
        if time.time() - tm > nseconds or k == n - 1:
            break
        k += 1
    max_k = k
    best_k = min(neg_calinski_k_pairs)[1]
    # create the response
    report = [
            'best cluster count: k = %d' % best_k,
            'searched 2 <= k <= %d clusters' % max_k,
            '%.2f seconds' % (time.time() - tm)]
    if fs.verbose:
        report.append('')
        report.append('(k_unique, wgss, calinski):')
        for w, (neg_calinski, k_unique) in zip(
                wgss_list, neg_calinski_k_pairs):
            fields = [k_unique, w, -neg_calinski]
            report.append('\t'.join(str(x) for x in fields))
    # return the response
    return '\n'.join(report) + '\n'
def get_response_content(fs):
    """
    Pick an agglomerative cluster count using the calinski index.
    The index is computed after each merge, down to two clusters.
    @param fs: user input; the table, axes and verbose attributes are read
    @return: a multi-line report
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row, data_rows = rtable.headers, rtable.data
    Carbone.validate_headers(header_row)
    # the requested axes must be a nonempty subset of the table headers
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Columns are looked up at the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    column_of = dict((h, pos + 1) for pos, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = column_of[h]
        try:
            column = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(column)
    points = np.array(list(zip(*axis_lists)))
    n = len(points)
    # do the clustering while computing the calinski index at each merge
    records = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        wgss = 0
        for i in cluster_map.keys():
            wgss += w_ssd_map[i] / float(len(cluster_map[i]))
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # get the calinski index
        k = len(cluster_map)
        calinski = (bgss / float(k - 1)) / (wgss / float(n - k))
        records.append((k, wgss, calinski))
    # The minimum over (negated index, k) pairs has the greatest index,
    # with ties broken using the smallest number of clusters.
    best_k = min((-c, k) for k, w, c in records)[1]
    # create the response
    report = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        report.append('')
        report.append('(k, wgss, calinski):')
        for record in records:
            report.append('\t'.join(str(x) for x in record))
    # return the response
    return '\n'.join(report) + '\n'
def get_response_content(fs):
    """
    Search k-means cluster counts for the best calinski index.
    Cluster counts are tried in increasing order from k = 2
    until the time budget is spent or k is the number of points minus one.
    @param fs: user input; the table, axes and verbose attributes are read
    @return: a multi-line report of the search
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # validate the requested axes
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers).difference(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Map each header to its column, which is the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    h_to_i = {}
    for i, h in enumerate(header_row):
        h_to_i[h] = i + 1
    # read each axis as a numeric column
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            numbers = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(numbers)
    points = np.array(list(zip(*axis_lists)))
    # precompute some stuff
    allmeandist = kmeans.get_allmeandist(points)
    nrestarts = 10
    nseconds = 2
    n = len(points)
    tm = time.time()
    # one (k_unique, wgss, calinski) triple per cluster count tried
    trials = []
    # look for the best calinski index in a small amount of time
    k = 2
    while True:
        codebook, _ = cluster.vq.kmeans(
                points, k, iter=nrestarts, thresh=1e-9)
        sqdists = kmeans.get_point_center_sqdists(points, codebook)
        labels = kmeans.get_labels_without_cluster_removal(sqdists)
        wgss = kmeans.get_wcss(sqdists, labels)
        bgss = allmeandist - wgss
        calinski = kmeans.get_calinski_index(bgss, wgss, k, n)
        trials.append((len(set(labels)), wgss, calinski))
        out_of_time = time.time() - tm > nseconds
        if out_of_time or k == n - 1:
            break
        k += 1
    max_k = k
    # The minimum over (negated index, k_unique) pairs has the greatest
    # index, and both items in the pair are used for sorting.
    best_k = min((-c, ku) for ku, w, c in trials)[1]
    # create the response
    pieces = [
            'best cluster count: k = %d\n' % best_k,
            'searched 2 <= k <= %d clusters\n' % max_k,
            '%.2f seconds\n' % (time.time() - tm)]
    if fs.verbose:
        pieces.append('\n')
        pieces.append('(k_unique, wgss, calinski):\n')
        for ku, w, c in trials:
            # the index is negated twice, as it would be in a stored pair
            row = [ku, w, -(-c)]
            pieces.append('\t'.join(str(x) for x in row) + '\n')
    # return the response
    return ''.join(pieces)
def get_response_content(fs):
    """
    Pick an agglomerative cluster count using the gap statistic.
    The log of the within group sum of squares at each cluster count
    is compared with the expectations returned by do_sampling.
    The best cluster count is the first one whose gap is greater than
    the next gap minus the next threshold.
    @param fs: user input; the table, axes, nsamples and verbose
    attributes are read
    @return: a multi-line report
    @raise ValueError: if the axes are invalid or not numeric,
    or if no cluster count satisfies the gap criterion
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # the requested axes must be a nonempty subset of the table headers
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # Columns are looked up at the header position plus one.
    # NOTE(review): presumably each data row starts with a row label;
    # confirm against RUtil.RTable.
    h_to_i = dict((h, i + 1) for i, h in enumerate(header_row))
    axis_lists = []
    for h in axis_headers:
        index = h_to_i[h]
        try:
            axis_list = Carbone.get_numeric_column(data_rows, index)
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(list(zip(*axis_lists)))
    # do the clustering while computing the wgss at each merge
    wgss_values = []
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        indices = cluster_map.keys()
        wgss = sum(w_ssd_map[i] / float(len(cluster_map[i])) for i in indices)
        wgss_values.append(wgss)
    # Merging reduces the cluster count, so the log values are reversed
    # to put them in order of increasing cluster count.
    wlogs = list(reversed(np.log(wgss_values)))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
            extents, len(points), fs.nsamples)
    # NOTE(review): the subtraction assumes that do_sampling reports
    # one expectation per entry of wlogs, in the same order; confirm.
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic,
    # which is the first index whose criterion is positive.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if best_i is None and criterion > 0:
            best_i = i
    # Without this check a failed search would index the list with None.
    if best_i is None:
        raise ValueError(
                'no cluster count satisfied the gap statistic criterion')
    best_k = nclusters_list[best_i]
    # create the response
    report = ['best cluster count: k = %d' % best_k]
    if fs.verbose:
        report.append('')
        report.append('(k, expected, observed, gap, threshold, criterion):')
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            # the last cluster count has no successor to compare against
            row.append(criteria[i] if i < n - 1 else '-')
            report.append('\t'.join(str(x) for x in row))
    # return the response
    return '\n'.join(report) + '\n'
def get_response_content(fs):
    """
    Pick a cluster count using the gap statistic and report it as text.
    The points are built from the numeric columns named in fs.axes,
    clustered agglomeratively, and the log within-group sum of squares
    at each cluster count is compared with the expectations and
    thresholds returned by do_sampling.
    @param fs: an object providing the attributes
    table (multi-line text), axes (column headers),
    nsamples (forwarded to do_sampling), and verbose (flag)
    @return: the response text as a string
    @raise ValueError: if no axes are given, an axis is not a header,
    an axis column is not numeric, no merge step could be performed,
    or no cluster count satisfies the gap criterion
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Map each header to its column index in the data rows.
    # NOTE(review): the offset of one presumably skips a leading
    # row label column in each data row; confirm against RUtil.RTable.
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    # validate the requested axes
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # get the numpy array of conformant points
    axis_lists = []
    for h in axis_headers:
        try:
            axis_list = Carbone.get_numeric_column(data_rows, h_to_i[h])
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # do the clustering while computing the wgss at each merge
    wgss_values = []
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map)
        wgss_values.append(wgss)
    # Without at least one merge step there is nothing to compare,
    # so fail here with a clear message instead of further down.
    if not wgss_values:
        raise ValueError('too few points to evaluate any cluster count')
    # Compute the log wgss values and reverse them
    # so that they are ordered by increasing cluster count.
    wlogs = list(reversed(np.log(wgss_values)))
    # sample from the null distribution
    extents = np.max(points, axis=0) - np.min(points, axis=0)
    nclusters_list, expectations, thresholds = do_sampling(
            extents, len(points), fs.nsamples)
    # Get the gaps.
    # NOTE(review): this assumes that the do_sampling outputs are
    # aligned element by element with wlogs; confirm against do_sampling.
    gaps = np.array(expectations) - wlogs
    # Get the best cluster count according to the gap statistic.
    # The first index with a positive criterion wins.
    best_i = None
    criteria = []
    for i, ip1 in iterutils.pairwise(range(len(nclusters_list))):
        criterion = gaps[i] - gaps[ip1] + thresholds[ip1]
        criteria.append(criterion)
        if criterion > 0 and best_i is None:
            best_i = i
    # Indexing with None would raise an opaque TypeError,
    # so report the failure to find a cluster count explicitly.
    if best_i is None:
        raise ValueError(
                'no cluster count satisfied the gap statistic criterion')
    best_k = nclusters_list[best_i]
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, expected, observed, gap, threshold, criterion):'
        n = len(nclusters_list)
        for i, k in enumerate(nclusters_list):
            row = [k, expectations[i], wlogs[i], gaps[i], thresholds[i]]
            # the last cluster count has no successor and so no criterion
            if i < n-1:
                row.append(criteria[i])
            else:
                row.append('-')
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()
def get_response_content(fs):
    """
    Pick a cluster count using the Calinski index and report it as text.
    The points are built from the numeric columns named in fs.axes
    and clustered agglomeratively; after each merge the index is
    computed as (bgss / (k - 1)) / (wgss / (n - k)).
    The cluster count with the largest index is reported,
    with ties broken in favor of the smaller cluster count.
    @param fs: an object providing the attributes
    table (multi-line text), axes (column headers), and verbose (flag)
    @return: the response text as a string
    @raise ValueError: if no axes are given, an axis is not a header,
    an axis column is not numeric, or no merge step could be performed
    """
    # read the table
    rtable = RUtil.RTable(fs.table.splitlines())
    header_row = rtable.headers
    data_rows = rtable.data
    Carbone.validate_headers(header_row)
    # Map each header to its column index in the data rows.
    # NOTE(review): the offset of one presumably skips a leading
    # row label column in each data row; confirm against RUtil.RTable.
    h_to_i = dict((h, i+1) for i, h in enumerate(header_row))
    # validate the requested axes
    axis_headers = fs.axes
    if not axis_headers:
        raise ValueError('no Euclidean axes were provided')
    bad_axes = set(axis_headers) - set(header_row)
    if bad_axes:
        raise ValueError('invalid axes: ' + ', '.join(bad_axes))
    # get the numpy array of conformant points
    axis_lists = []
    for h in axis_headers:
        try:
            axis_list = Carbone.get_numeric_column(data_rows, h_to_i[h])
        except Carbone.NumericError:
            raise ValueError(
                    'expected the axis column %s '
                    'to be numeric' % h)
        axis_lists.append(axis_list)
    points = np.array(zip(*axis_lists))
    # the number of points does not change during the clustering
    n = len(points)
    # do the clustering while computing the calinski index at each merge
    cluster_counts = []
    wgss_values = []
    neg_calinskis = []
    allmeandist = kmeans.get_allmeandist(points)
    cluster_map = agglom.get_initial_cluster_map(points)
    w_ssd_map = agglom.get_initial_w_ssd_map(points)
    b_ssd_map = agglom.get_initial_b_ssd_map(points)
    q = agglom.get_initial_queue(b_ssd_map)
    while len(cluster_map) > 2:
        # do an agglomeration step
        pair = agglom.get_pair_fast(cluster_map, q)
        agglom.merge_fast(cluster_map, w_ssd_map, b_ssd_map, q, pair)
        # compute the within group sum of squares
        wgss = sum(
                w_ssd_map[i] / float(len(cluster_map[i]))
                for i in cluster_map)
        # compute the between group sum of squares
        bgss = allmeandist - wgss
        # Get the calinski index.
        # NOTE(review): a zero wgss gives a zero denominator here;
        # whether that raises or yields inf depends on the numeric
        # type returned by the agglom helpers.
        k = len(cluster_map)
        numerator = bgss / float(k - 1)
        denominator = wgss / float(n - k)
        calinski = numerator / denominator
        # append to the lists
        cluster_counts.append(k)
        wgss_values.append(wgss)
        neg_calinskis.append(-calinski)
    # Without at least one merge step no index has been computed,
    # so fail with a clear message instead of the one from min().
    if not cluster_counts:
        raise ValueError('too few points to evaluate any cluster count')
    # Get the best cluster count according to the calinski index.
    # Do this trickery with negs so that it breaks ties
    # using the smallest number of clusters.
    _, best_k = min(zip(neg_calinskis, cluster_counts))
    # create the response
    out = StringIO()
    print >> out, 'best cluster count: k = %d' % best_k
    if fs.verbose:
        print >> out
        print >> out, '(k, wgss, calinski):'
        for k, wgss, neg_calinski in zip(
                cluster_counts, wgss_values, neg_calinskis):
            row = (k, wgss, -neg_calinski)
            print >> out, '\t'.join(str(x) for x in row)
    # return the response
    return out.getvalue()