def train(self, train_vects):
    """Build the regularised kernel matrix for the training set and invert it.

    The training data is loaded through ``get_train_data`` into ``self.D``
    and ``self.t``.  A symmetric ``n x n`` matrix ``C`` is then filled with
    ``self.kernel.value(D[i], D[j])``, with ``1 / self.beta`` added to every
    diagonal entry, and its inverse is stored in ``self.Cinv``.

    Args:
        train_vects: mapping with at least the keys ``"vects"`` (its length
            becomes ``self.n``) and ``"featIDs"`` (its length becomes
            ``self.m``).  It is also handed unchanged to ``get_train_data``.

    Side effects:
        Sets ``self.D``, ``self.t``, ``self.n``, ``self.m`` and ``self.Cinv``.
        Prints progress to the terminal when ``self.verbose`` is true.
    """
    # load the train data to a matrix.
    if self.verbose:
        print("Loading the training data to memory...")
    (self.D, self.t) = get_train_data(train_vects)

    # create co-variance matrix C.
    self.n = len(train_vects["vects"])
    self.m = len(train_vects["featIDs"])

    # The progress bar is only ever updated in verbose mode, so it is only
    # created in verbose mode as well.
    progress = None
    if self.verbose:
        print("Creating the covariance matrix...")
        progress = ProgressBar(TerminalController(),
                               "Train instances = %d" % self.n)

    C = zeros((self.n, self.n))
    for i in range(self.n):
        if progress is not None:
            progress.update(float(i + 1) / self.n,
                            "Processing instance no. %d" % (i + 1))
        # Row i does not change inside the inner loop, so slice it once.
        x_i = self.D[i, :]
        # Only the upper triangle is computed; each value is mirrored.
        for j in range(i, self.n):
            val = self.kernel.value(x_i, self.D[j, :])
            if i == j:
                val += 1.0 / float(self.beta)
            C[i, j] = val
            C[j, i] = val

    # compute the inverse.
    if self.verbose:
        print("Computing the inverse of the matrix...")
    self.Cinv = inv(C)
def train(self, train_vects):
    """Build the regularised kernel matrix for the training set and invert it.

    The training data is loaded through ``get_train_data`` into ``self.D``
    and ``self.t``.  A symmetric ``n x n`` matrix ``C`` is then filled with
    ``self.kernel.value(D[i], D[j])``, with ``1 / self.beta`` added to every
    diagonal entry, and its inverse is stored in ``self.Cinv``.

    Args:
        train_vects: mapping with at least the keys ``"vects"`` (its length
            becomes ``self.n``) and ``"featIDs"`` (its length becomes
            ``self.m``).  It is also handed unchanged to ``get_train_data``.

    Side effects:
        Sets ``self.D``, ``self.t``, ``self.n``, ``self.m`` and ``self.Cinv``.
        Prints progress to the terminal when ``self.verbose`` is true.
    """
    # load the train data to a matrix.
    if self.verbose:
        print("Loading the training data to memory...")
    (self.D, self.t) = get_train_data(train_vects)

    # create co-variance matrix C.
    self.n = len(train_vects["vects"])
    self.m = len(train_vects["featIDs"])

    # The progress bar is only ever updated in verbose mode, so it is only
    # created in verbose mode as well.
    progress = None
    if self.verbose:
        print("Creating the covariance matrix...")
        progress = ProgressBar(TerminalController(),
                               "Train instances = %d" % self.n)

    C = zeros((self.n, self.n))
    for i in range(self.n):
        if progress is not None:
            progress.update(float(i + 1) / self.n,
                            "Processing instance no. %d" % (i + 1))
        # Row i does not change inside the inner loop, so slice it once.
        x_i = self.D[i, :]
        # Only the upper triangle is computed; each value is mirrored.
        for j in range(i, self.n):
            val = self.kernel.value(x_i, self.D[j, :])
            if i == j:
                val += 1.0 / float(self.beta)
            C[i, j] = val
            C[j, i] = val

    # compute the inverse.
    if self.verbose:
        print("Computing the inverse of the matrix...")
    self.Cinv = inv(C)
def cluster(self, m, theta):
    """Sequentially cluster the rows of ``m`` in a single greedy pass.

    Every row id is paired with the sum of the values in its row (taken
    before normalisation) and the pairs are ordered with the comparison
    function ``self.patsort``.  The rows of ``m`` are then L2-normalised in
    place.  Each row, in that order, is compared with every existing cluster
    through ``self.sim``; it is merged into the most similar cluster when
    that similarity exceeds ``theta``, otherwise it starts a new cluster.

    Args:
        m: matrix object providing ``get_row_id_list()``, ``get_row(id)`` and
            ``L2_normalize_rows()``.  Rows must support iteration over keys
            and lookup by key.
        theta: similarity threshold; a merge requires ``similarity > theta``.

    Returns:
        List of ``SEQ_CLUST_DATA`` clusters in order of creation.
    """
    # Function-scope import keeps this block self-contained.
    from functools import cmp_to_key

    # first sort patterns according to the total frequency
    # of all word-pairs in which they appear.
    pats = []  # (pat_id, total_frequency_in_wpairs)
    for pat in m.get_row_id_list():
        row = m.get_row(pat)
        pats.append((pat, sum(row[k] for k in row)))
    N = len(pats)
    # self.patsort is a two-argument comparison function; cmp_to_key gives
    # the same ordering on Python 2.7 and Python 3.
    pats.sort(key=cmp_to_key(self.patsort))

    # initialize clusters.
    clusts = []
    m.L2_normalize_rows()
    progress = ProgressBar(TerminalController(),
                           "Clustering total rows = %d" % N)

    for count, (pat, total) in enumerate(pats, 1):
        # Fetch the row once; it is the same for every cluster comparison.
        v = m.get_row(pat)
        maxsim = 0
        maxclust = None
        for c in clusts:
            s = self.sim(c, v)
            if s > maxsim:
                maxsim = s
                maxclust = c

        # With a negative theta and no positively similar cluster there is
        # nothing to merge into, so require an actual best cluster as well.
        if maxclust is not None and maxsim > theta:
            # %s for the row id, matching the NEW message, so non-integer
            # ids do not break the format.
            progress.update(float(count) / N,
                            "MERGED %d: row = %s freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            maxclust.merge(pat, v)
        else:
            progress.update(float(count) / N,
                            " NEW %d: %s freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            clusts.append(SEQ_CLUST_DATA(pat, v))
    return clusts
def cluster(self, m, theta):
    """Sequentially cluster the rows of ``m`` in a single greedy pass.

    Every row id is paired with the sum of the values in its row (taken
    before normalisation) and the pairs are ordered with the comparison
    function ``self.patsort``.  The rows of ``m`` are then L2-normalised in
    place.  Each row, in that order, is compared with every existing cluster
    through ``self.sim``; it is merged into the most similar cluster when
    that similarity exceeds ``theta``, otherwise it starts a new cluster.

    Args:
        m: matrix object providing ``get_row_id_list()``, ``get_row(id)`` and
            ``L2_normalize_rows()``.  Rows must support iteration over keys
            and lookup by key.
        theta: similarity threshold; a merge requires ``similarity > theta``.

    Returns:
        List of ``SEQ_CLUST_DATA`` clusters in order of creation.
    """
    # Function-scope import keeps this block self-contained.
    from functools import cmp_to_key

    # first sort patterns according to the total frequency
    # of all word-pairs in which they appear.
    pats = []  # (pat_id, total_frequency_in_wpairs)
    for pat in m.get_row_id_list():
        row = m.get_row(pat)
        pats.append((pat, sum(row[k] for k in row)))
    N = len(pats)
    # self.patsort is a two-argument comparison function; cmp_to_key gives
    # the same ordering on Python 2.7 and Python 3.
    pats.sort(key=cmp_to_key(self.patsort))

    # initialize clusters.
    clusts = []
    m.L2_normalize_rows()
    progress = ProgressBar(TerminalController(),
                           "Clustering total rows = %d" % N)

    for count, (pat, total) in enumerate(pats, 1):
        # Fetch the row once; it is the same for every cluster comparison.
        v = m.get_row(pat)
        maxsim = 0
        maxclust = None
        for c in clusts:
            s = self.sim(c, v)
            if s > maxsim:
                maxsim = s
                maxclust = c

        # With a negative theta and no positively similar cluster there is
        # nothing to merge into, so require an actual best cluster as well.
        if maxclust is not None and maxsim > theta:
            # %s for the row id, matching the NEW message, so non-integer
            # ids do not break the format.
            progress.update(float(count) / N,
                            "MERGED %d: row = %s freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            maxclust.merge(pat, v)
        else:
            progress.update(float(count) / N,
                            " NEW %d: %s freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            clusts.append(SEQ_CLUST_DATA(pat, v))
    return clusts
def coclustering(self, M, theta, phi):
    """
    Implements sequential co-clustering. (alternation variant)

    Rows and columns of ``M`` are each paired with their sum
    (``M.get_row_sum`` / ``M.get_column_sum``) and ordered with the
    comparison function ``self.patsort``.  The two queues are then consumed
    alternately, one column followed by one row, until both are empty.

    For each item the candidate clusters come from ``self.get_clusters``.
    The item is merged into the candidate with the highest positive
    ``self.cosine`` similarity when that similarity exceeds the threshold
    (``theta`` for columns, ``phi`` for rows); otherwise it starts a new
    cluster keyed by its own id.  A merge also calls ``M.merge``, so ``M``
    is modified in place.

    Args:
        M: matrix object providing ``get_row_id_list``, ``get_column_id_list``,
            ``get_row_sum``, ``get_column_sum``, ``get_row``, ``get_column``
            and ``merge(axis, target_id, source_id)``.
        theta: column similarity threshold (merge when similarity > theta).
        phi: row similarity threshold (merge when similarity > phi).

    Returns:
        ``(rowclusts, colclusts)``: two dicts mapping a cluster id to the
        list of member ids.

    Progress is printed line by line when ``self.VERBOSE`` is true and shown
    on a ``ProgressBar`` otherwise.
    """
    # Function-scope import keeps this block self-contained.
    from functools import cmp_to_key

    # self.patsort is a two-argument comparison function; cmp_to_key gives
    # the same ordering on Python 2.7 and Python 3.
    by_patsort = cmp_to_key(self.patsort)

    # Initialization. sorting row counts.
    rows = [(rowid, M.get_row_sum(rowid)) for rowid in M.get_row_id_list()]
    rows.sort(key=by_patsort)
    no_rows = len(rows)

    # sorting column counts.
    cols = [(colid, M.get_column_sum(colid))
            for colid in M.get_column_id_list()]
    cols.sort(key=by_patsort)
    no_cols = len(cols)

    # rowIndex is maintained from column vectors and columnIndex from row
    # vectors; both are only touched through get_clusters / update_index.
    columnIndex = {}
    rowIndex = {}
    colclusts = {}
    rowclusts = {}
    total = no_rows + no_cols

    progress = None
    if not self.VERBOSE:
        progress = ProgressBar(
            TerminalController(),
            "Clustering rows = %d, columns = %d" % (no_rows, no_cols))

    def place(item, vector_of, index, clusters, threshold, axis):
        """Merge ``item`` into its best cluster or open a new one.

        Returns ``(action, best_similarity)`` with action "MRG" or "NEW".
        """
        # Nothing mutates M before the merge below, so fetch the vector once.
        vec = vector_of(item)
        best = None
        best_sim = 0
        for cand in self.get_clusters(index, vec):
            s = self.cosine(vec, vector_of(cand))
            if s > best_sim:
                best_sim = s
                best = cand
        # A negative threshold with no positively similar candidate must not
        # be treated as a merge: there is no cluster to merge into.
        if best is not None and best_sim > threshold:
            clusters[best].append(item)
            self.update_index(index, vec, best)
            M.merge(axis, best, item)
            return "MRG", best_sim
        clusters[item] = [item]
        self.update_index(index, vec, item)
        return "NEW", best_sim

    def report(label, item, action, sim, count):
        """Emit one progress message, printed or on the progress bar."""
        values = (label, item, action, sim,
                  len(rowclusts), len(colclusts), count, total)
        if progress is None:
            print("%s\t%d\t%s\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % values)
        else:
            progress.update(
                float(count) / total,
                "%s %d %s SIM=%f Total=(%d,%d) [%d/%d]" % values)

    # start alternative clustering.
    # Positions are advanced instead of deleting the list head, which would
    # shift the whole list on every step.
    col_pos = 0
    row_pos = 0
    count = 0
    while col_pos < no_cols or row_pos < no_rows:
        if col_pos < no_cols:
            # column clustering.
            count += 1
            current_column = cols[col_pos][0]
            col_pos += 1
            action, sim = place(current_column, M.get_column, rowIndex,
                                colclusts, theta, "COLUMNS")
            report("COL", current_column, action, sim, count)
        if row_pos < no_rows:
            # row clustering.
            count += 1
            current_row = rows[row_pos][0]
            row_pos += 1
            action, sim = place(current_row, M.get_row, columnIndex,
                                rowclusts, phi, "ROWS")
            report("ROW", current_row, action, sim, count)

    # Final steps.
    return (rowclusts, colclusts)