def write_distribution(M, result_fname):
    """
    Compute the row similarity distribution of M and write it to
    result_fname. To compute the column similarity distribution,
    transpose the matrix first.

    The work is fanned out to NO_OF_PROCESSORS worker processes that
    consume row ids from a shared queue (see do_work).
    """
    work_queue = Queue()
    lock = Lock()
    # All workers share this handle; do_work is expected to serialize
    # writes with the lock.
    distFile = open(result_fname, "w")
    row_ids = M.get_row_id_list()
    (no_rows, no_cols) = M.shape()
    # Enqueue every row id for the workers.
    for i in row_ids:
        work_queue.put(i)
    term = TerminalController()
    progress = ProgressBar(term, "Total rows = %d, columns = %d"
                           % (no_rows, no_cols))
    # Spawn the workers and wait until the queue is drained.
    procs = [Process(target=do_work,
                     args=(work_queue, lock, M, no_rows, distFile, progress))
             for i in range(NO_OF_PROCESSORS)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    distFile.close()
def train(self, train_vects):
    """
    Load the training data and precompute the inverse of the
    regularized kernel (covariance) matrix.

    Builds the symmetric n x n matrix C with C[i,j] = kernel(x_i, x_j),
    adds 1/beta to the diagonal, and stores its inverse in self.Cinv.
    """
    if self.verbose:
        print("Loading the training data to memory...")
    (self.D, self.t) = get_train_data(train_vects)
    self.n = len(train_vects["vects"])
    self.m = len(train_vects["featIDs"])
    if self.verbose:
        print("Creating the covariance matrix...")
    term = TerminalController()
    progress = ProgressBar(term, "Train instances = %d" % self.n)
    C = zeros((self.n, self.n))
    for row in range(self.n):
        if self.verbose:
            progress.update(float(row + 1) / self.n,
                            "Processing instance no. %d" % (row + 1))
        # Only the upper triangle is computed; the value is mirrored.
        for col in range(row, self.n):
            kval = self.kernel.value(self.D[row, :], self.D[col, :])
            if row == col:
                # Regularization / noise term on the diagonal.
                kval += 1.0 / float(self.beta)
            C[row, col] = C[col, row] = kval
    if self.verbose:
        print("Computing the inverse of the matrix...")
    self.Cinv = inv(C)
def write_distribution(M, result_fname, DIelements):
    """
    Compute the row similarity distribution of M and write it to
    result_fname. To compute the column similarity distribution,
    transpose the matrix first. If domain independent row elements
    are given (DIelements), then compute the similarity between those
    elements and all the row elements.
    """
    work_queue = Queue()
    lock = Lock()
    # All workers share this handle; do_work is expected to serialize
    # writes with the lock.
    distFile = open(result_fname, "w")
    # Keep only the DI elements that actually occur as rows in M.
    row_ids = [rowid for rowid in DIelements if M.row_exists(rowid)]
    (no_rows, no_cols) = M.shape()
    for i in row_ids:
        work_queue.put(i)
    term = TerminalController()
    progress = ProgressBar(term, "Total rows = %d, columns = %d"
                           % (no_rows, no_cols))
    # NOTE(review): workers are told len(row_ids) (rows actually queued)
    # while the banner shows the full matrix dimensions — intentional?
    procs = [Process(target=do_work,
                     args=(work_queue, lock, M, len(row_ids),
                           distFile, progress))
             for i in range(NO_OF_PROCESSORS)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    distFile.close()
def cluster(self, m, theta):
    """
    Greedy single-pass sequential clustering of the rows of m.

    Rows (patterns) are processed in the order given by self.patsort
    (by total frequency over all word-pairs). Each pattern is merged
    into the most similar existing cluster if that similarity exceeds
    theta; otherwise it seeds a new cluster. Returns the list of
    SEQ_CLUST_DATA clusters.
    """
    # First sort patterns according to the total frequency of all
    # word-pairs in which they appear.
    pats = []  # (pat_id, total_frequency_in_wpairs)
    for pat in m.get_row_id_list():
        row = m.get_row(pat)
        pats.append((pat, sum(row[k] for k in row)))
    N = len(pats)
    pats.sort(self.patsort)
    # Initialize clusters.
    clusts = []
    count = 0
    m.L2_normalize_rows()
    term = TerminalController()
    progress = ProgressBar(term, "Clustering total rows = %d" % N)
    for (pat, total) in pats:
        maxsim = 0
        maxclust = None
        count += 1
        # The row vector is loop-invariant: fetch it once instead of
        # once per candidate cluster (and again on merge/append).
        v = m.get_row(pat)
        for c in clusts:
            s = self.sim(c, v)
            if s > maxsim:
                maxsim = s
                maxclust = c
        if maxsim > theta:
            progress.update(float(count)/N,
                            "MERGED %d: row = %d freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            maxclust.merge(pat, v)
        else:
            progress.update(float(count)/N,
                            " NEW %d: %s freq = %d clusts = %d"
                            % (count, pat, total, len(clusts)))
            clusts.append(SEQ_CLUST_DATA(pat, v))
    return clusts
def coclustering(self, M, theta, phi): """ Implements sequential co-clustering. (alternation variant) """ # Initialization. sorting row counts. cols = [] rows = [] columnIndex = {} rowIndex = {} for rowid in M.get_row_id_list(): rows.append((rowid,M.get_row_sum(rowid))) rows.sort(self.patsort) no_rows = len(rows) # sorting column counts. for colid in M.get_column_id_list(): cols.append((colid, M.get_column_sum(colid))) cols.sort(self.patsort) no_cols = len(cols) colclusts = {} rowclusts = {} theta_max = -1 phi_max = -1 if not self.VERBOSE: term = TerminalController() progress = ProgressBar(term, "Clustering rows = %d, columns = %d" % \ (no_rows, no_cols)) total = no_rows + no_cols count = 0 # start alternative clustering. while(cols or rows): if cols: # column clustering. count += 1 current_column = cols[0][0] del cols[0] theta_max = 0 max_col_clust = -1 validClusts = self.get_clusters(rowIndex, M.get_column(current_column)) for c in validClusts: s = self.cosine(M.get_column(current_column), M.get_column(c)) if s > theta_max: theta_max = s max_col_clust = c if theta_max > theta: colclusts[max_col_clust].append(current_column) self.update_index(rowIndex, M.get_column(current_column), max_col_clust) M.merge("COLUMNS",max_col_clust,current_column) if self.VERBOSE: print "COL\t%d\tMRG\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % \ (current_column,theta_max, len(rowclusts), len(colclusts), count, total) else: progress.update(float(count)/total,\ "COL %d MRG SIM=%f Total=(%d,%d) [%d/%d]" %\ (current_column,theta_max,\ len(rowclusts), len(colclusts),\ count, total)) pass else: colclusts[current_column] = [current_column] self.update_index(rowIndex, M.get_column(current_column), current_column) if self.VERBOSE: print "COL\t%d\tNEW\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % \ (current_column,theta_max, len(rowclusts), len(colclusts), count, total) else: progress.update(float(count)/total,\ "COL %d NEW SIM=%f Total=(%d,%d) [%d/%d]" % \ (current_column,theta_max,\ len(rowclusts), 
len(colclusts),\ count, total)) pass if rows: # row clustering. count += 1 current_row = rows[0][0] del rows[0] phi_max = 0 max_row_clust = -1 validClusts = self.get_clusters(columnIndex, M.get_row(current_row)) for c in validClusts: s = self.cosine(M.get_row(current_row), M.get_row(c)) if s > phi_max: phi_max = s max_row_clust = c if phi_max > phi: rowclusts[max_row_clust].append(current_row) self.update_index(columnIndex, M.get_row(current_row), max_row_clust) M.merge("ROWS",max_row_clust,current_row) if self.VERBOSE: print "ROW\t%d\tMRG\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % \ (current_row,phi_max, len(rowclusts), len(colclusts), count, total) else: progress.update(float(count)/total,\ "ROW %d MRG SIM=%f Total=(%d,%d) [%d/%d]" % \ (current_row,phi_max, len(rowclusts), len(colclusts), count, total)) pass else: rowclusts[current_row] = [current_row] self.update_index(columnIndex, M.get_row(current_row), current_row) if self.VERBOSE: print "ROW\t%d\tNEW\tSIM=%f\tTotal=(%d,%d) [%d,%d]" % \ (current_row,phi_max, len(rowclusts), len(colclusts), count, total) else: progress.update(float(count)/total,\ "ROW %d NEW SIM=%f Total=(%d,%d) [%d/%d]" % \ (current_row,phi_max, len(rowclusts), len(colclusts), count, total)) pass # Final steps. return (rowclusts,colclusts)