Ejemplo n.º 1
0
 def train(self, train_vects):
     """
     Build the regularised kernel matrix of the training set and invert it.

     The training data is loaded through get_train_data() into self.D and
     self.t. self.n and self.m are set to the lengths of
     train_vects["vects"] and train_vects["featIDs"]. A symmetric
     n x n matrix C is then filled with
     C[i, j] = self.kernel.value(D[i, :], D[j, :]), 1 / self.beta is
     added to every diagonal entry, and inv(C) is stored in self.Cinv.

     train_vects must provide "vects" and "featIDs" entries that support
     len(); anything else about its layout is defined by get_train_data().
     Progress is written to the terminal only when self.verbose is true.
     Returns None.
     """
     # load the train data to a matrix.
     if self.verbose:
         print("Loading the training data to memory...")
     (self.D, self.t) = get_train_data(train_vects)
     self.n = len(train_vects["vects"])
     self.m = len(train_vects["featIDs"])
     # create co-variance matrix C.
     progress = None
     if self.verbose:
         print("Creating the covariance matrix...")
         term = TerminalController()
         progress = ProgressBar(term, "Train instances = %d" % self.n)
     C = zeros((self.n, self.n))
     for i in range(0, self.n):
         if self.verbose:
             progress.update(
                 float(i + 1) / self.n,
                 "Processing instance no. %d" % (i + 1))
         # Row i does not depend on j, so fetch it once per outer iteration.
         x_i = self.D[i, :]
         # Only the upper triangle is evaluated; the value is mirrored
         # into the lower triangle.
         for j in range(i, self.n):
             val = self.kernel.value(x_i, self.D[j, :])
             if i == j:
                 # diagonal term 1 / beta.
                 val += 1.0 / float(self.beta)
             C[i, j] = val
             C[j, i] = val
     # compute the inverse.
     if self.verbose:
         print("Computing the inverse of the matrix...")
     self.Cinv = inv(C)
Ejemplo n.º 2
0
 def train(self, train_vects):
     """
     Build the regularised kernel matrix of the training set and invert it.

     The training data is loaded through get_train_data() into self.D and
     self.t. self.n and self.m are set to the lengths of
     train_vects["vects"] and train_vects["featIDs"]. A symmetric
     n x n matrix C is then filled with
     C[i, j] = self.kernel.value(D[i, :], D[j, :]), 1 / self.beta is
     added to every diagonal entry, and inv(C) is stored in self.Cinv.

     train_vects must provide "vects" and "featIDs" entries that support
     len(); anything else about its layout is defined by get_train_data().
     Progress is written to the terminal only when self.verbose is true.
     Returns None.
     """
     # load the train data to a matrix.
     if self.verbose:
         print("Loading the training data to memory...")
     (self.D, self.t) = get_train_data(train_vects)
     self.n = len(train_vects["vects"])
     self.m = len(train_vects["featIDs"])
     # create co-variance matrix C.
     progress = None
     if self.verbose:
         print("Creating the covariance matrix...")
         term = TerminalController()
         progress = ProgressBar(term, "Train instances = %d" % self.n)
     C = zeros((self.n, self.n))
     for i in range(0, self.n):
         if self.verbose:
             progress.update(
                 float(i + 1) / self.n,
                 "Processing instance no. %d" % (i + 1))
         # Row i does not depend on j, so fetch it once per outer iteration.
         x_i = self.D[i, :]
         # Only the upper triangle is evaluated; the value is mirrored
         # into the lower triangle.
         for j in range(i, self.n):
             val = self.kernel.value(x_i, self.D[j, :])
             if i == j:
                 # diagonal term 1 / beta.
                 val += 1.0 / float(self.beta)
             C[i, j] = val
             C[j, i] = val
     # compute the inverse.
     if self.verbose:
         print("Computing the inverse of the matrix...")
     self.Cinv = inv(C)
Ejemplo n.º 3
0
 def cluster(self, m, theta):
     """
     Sequentially cluster the rows of m.

     Every row id of m is paired with the sum of that row's values
     (taken before normalisation) and the pairs are ordered with
     self.patsort. After m.L2_normalize_rows() the rows are visited in
     that order: each row is compared with every existing cluster via
     self.sim(cluster, row) and merged into the best-scoring cluster when
     that score is strictly greater than theta; otherwise it starts a new
     SEQ_CLUST_DATA cluster. Progress is reported through a ProgressBar.

     m     -- matrix object providing get_row_id_list(), get_row() and
              L2_normalize_rows(); its rows are normalised in place.
     theta -- similarity threshold for merging.

     Returns the list of clusters in creation order.
     """
     # first rank patterns according to the total frequency
     # of all word-pairs in which they appear.
     pats = []  # (pat_id, total_frequency_in_wpairs)
     for pat in m.get_row_id_list():
         row = m.get_row(pat)
         pats.append((pat, sum(row[k] for k in row)))
     N = len(pats)
     # ordering is defined by self.patsort (cmp-style comparator).
     pats.sort(self.patsort)
     # initialize clusters.
     clusts = []
     m.L2_normalize_rows()
     term = TerminalController()
     progress = ProgressBar(term, "Clustering total rows = %d" % N)
     for (count, (pat, total)) in enumerate(pats, 1):
         # The row as it stands after L2_normalize_rows(); fetched once
         # instead of once per cluster. Assumes get_row() has no side
         # effects -- TODO confirm.
         v = m.get_row(pat)
         maxsim = 0
         maxclust = None
         for c in clusts:
             s = self.sim(c, v)
             if s > maxsim:
                 maxsim = s
                 maxclust = c
         # maxclust stays None when no cluster scored above 0; without
         # the explicit check a negative theta would call None.merge().
         if maxclust is not None and maxsim > theta:
             # %s (as in the NEW message) so non-integer row ids do not
             # raise; integer ids print exactly as before.
             progress.update(float(count) / N,
                             "MERGED %d: row = %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             maxclust.merge(pat, v)
         else:
             progress.update(float(count) / N,
                             "   NEW %d: %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             clusts.append(SEQ_CLUST_DATA(pat, v))
     return (clusts)
Ejemplo n.º 4
0
 def cluster(self, m, theta):
     """
     Sequentially cluster the rows of m.

     Every row id of m is paired with the sum of that row's values
     (taken before normalisation) and the pairs are ordered with
     self.patsort. After m.L2_normalize_rows() the rows are visited in
     that order: each row is compared with every existing cluster via
     self.sim(cluster, row) and merged into the best-scoring cluster when
     that score is strictly greater than theta; otherwise it starts a new
     SEQ_CLUST_DATA cluster. Progress is reported through a ProgressBar.

     m     -- matrix object providing get_row_id_list(), get_row() and
              L2_normalize_rows(); its rows are normalised in place.
     theta -- similarity threshold for merging.

     Returns the list of clusters in creation order.
     """
     # first rank patterns according to the total frequency
     # of all word-pairs in which they appear.
     pats = []  # (pat_id, total_frequency_in_wpairs)
     for pat in m.get_row_id_list():
         row = m.get_row(pat)
         pats.append((pat, sum(row[k] for k in row)))
     N = len(pats)
     # ordering is defined by self.patsort (cmp-style comparator).
     pats.sort(self.patsort)
     # initialize clusters.
     clusts = []
     m.L2_normalize_rows()
     term = TerminalController()
     progress = ProgressBar(term, "Clustering total rows = %d" % N)
     for (count, (pat, total)) in enumerate(pats, 1):
         # The row as it stands after L2_normalize_rows(); fetched once
         # instead of once per cluster. Assumes get_row() has no side
         # effects -- TODO confirm.
         v = m.get_row(pat)
         maxsim = 0
         maxclust = None
         for c in clusts:
             s = self.sim(c, v)
             if s > maxsim:
                 maxsim = s
                 maxclust = c
         # maxclust stays None when no cluster scored above 0; without
         # the explicit check a negative theta would call None.merge().
         if maxclust is not None and maxsim > theta:
             # %s (as in the NEW message) so non-integer row ids do not
             # raise; integer ids print exactly as before.
             progress.update(float(count) / N,
                             "MERGED %d: row = %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             maxclust.merge(pat, v)
         else:
             progress.update(float(count) / N,
                             "   NEW %d: %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             clusts.append(SEQ_CLUST_DATA(pat, v))
     return (clusts)
Ejemplo n.º 5
0
 def coclustering(self, M, theta, phi):
     """
     Implements sequential co-clustering.
     (alternation variant)

     Rows and columns of M are each paired with their sum
     (M.get_row_sum / M.get_column_sum) and ordered with self.patsort.
     The two orderings are then consumed alternately, one column followed
     by one row, until both are exhausted. Each item is compared, using
     self.cosine, with the candidate clusters returned by
     self.get_clusters() for its vector. If the best similarity is
     strictly greater than the threshold (theta for columns, phi for
     rows) the item is appended to that cluster and M.merge() is called;
     otherwise the item starts a new cluster keyed by its own id. After
     every assignment self.update_index() is called with the item's
     vector and the id of the cluster it ended up in.

     M     -- matrix object; it is modified in place by M.merge().
     theta -- similarity threshold for merging columns.
     phi   -- similarity threshold for merging rows.

     Output goes to stdout when self.VERBOSE is true and to a ProgressBar
     otherwise.

     Returns (rowclusts, colclusts): two dicts mapping the id of the item
     that started each cluster to the list of member ids.
     """
     # Initialization. sorting row counts.
     rows = [(rowid, M.get_row_sum(rowid)) for rowid in M.get_row_id_list()]
     rows.sort(self.patsort)
     no_rows = len(rows)
     # sorting column counts.
     cols = [(colid, M.get_column_sum(colid))
             for colid in M.get_column_id_list()]
     cols.sort(self.patsort)
     no_cols = len(cols)
     total = no_rows + no_cols
     # Indexes handed to get_clusters()/update_index(): columns are looked
     # up through rowIndex and rows through columnIndex. Their internal
     # layout is owned by those helpers.
     columnIndex = {}
     rowIndex = {}
     colclusts = {}
     rowclusts = {}
     progress = None
     if not self.VERBOSE:
         term = TerminalController()
         progress = ProgressBar(term,
                                "Clustering rows = %d, columns = %d" %
                                (no_rows, no_cols))

     def _report(label, item, action, sim, count):
         # Emit one status line for the item that was just processed.
         values = (label, item, action, sim,
                   len(rowclusts), len(colclusts), count, total)
         if self.VERBOSE:
             print("%s\t%d\t%s\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % values)
         else:
             progress.update(float(count) / total,
                             "%s %d %s SIM=%f Total=(%d,%d) [%d/%d]" % values)

     def _place(label, kind, item, fetch, index, clusters, threshold,
                count):
         # Merge `item` into its most similar cluster or start a new one.
         # Fetched once; M is not modified until M.merge() below. Assumes
         # the getter has no side effects -- TODO confirm.
         vector = fetch(item)
         best_sim = 0
         best = None
         for candidate in self.get_clusters(index, vector):
             s = self.cosine(vector, fetch(candidate))
             if s > best_sim:
                 best_sim = s
                 best = candidate
         # `best` stays None when nothing scored above 0, so a negative
         # threshold can no longer index a non-existent cluster.
         if best is not None and best_sim > threshold:
             clusters[best].append(item)
             # The index is updated before the merge, while the item's
             # vector is still the one that was compared.
             self.update_index(index, vector, best)
             M.merge(kind, best, item)
             action = "MRG"
         else:
             clusters[item] = [item]
             self.update_index(index, vector, item)
             action = "NEW"
         _report(label, item, action, best_sim, count)

     count = 0
     # start alternative clustering: step k handles the k-th column (if
     # any) and then the k-th row (if any).
     for step in range(max(no_rows, no_cols)):
         if step < no_cols:
             # column clustering.
             count += 1
             _place("COL", "COLUMNS", cols[step][0], M.get_column,
                    rowIndex, colclusts, theta, count)
         if step < no_rows:
             # row clustering.
             count += 1
             _place("ROW", "ROWS", rows[step][0], M.get_row,
                    columnIndex, rowclusts, phi, count)
     # Final steps.
     return (rowclusts, colclusts)