import collections

import numpy as np
from scipy.spatial.distance import cdist


def compute_local_affinity(V):
    global damping

    # Cluster the words by the pairwise cosine distances of their vectors
    cluster_args = {"damping": damping}
    cluster = cluster_clf(**cluster_args)

    DV = cdist(V, V, metric="cosine")
    z_labels = cluster.fit_predict(DV)

    # print("{} unique labels found".format(np.unique(z_labels).shape))

    return V, z_labels
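# compute_local_affinity relies on two module-level names, cluster_clf and
# damping, that are set elsewhere in the project. A minimal driver sketch,
# assuming cluster_clf is scikit-learn's AffinityPropagation (a plausible
# reading of the damping argument, but an assumption here); the toy data
# is purely illustrative.
from sklearn.cluster import AffinityPropagation

cluster_clf = AffinityPropagation
damping = 0.5

V = np.random.rand(50, 64)  # 50 word vectors, 64 dimensions each
V, z_labels = compute_local_affinity(V)
print("{} unique labels found".format(np.unique(z_labels).size))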
def compute_affinity(item):
    text, f_idx, table_name, f_sql = item
    tokens = text.split()

    # Find out which tokens are defined in the embedding model M
    valid_tokens = [w for w in tokens if w in M]
    local_counts = collections.Counter(valid_tokens)  # currently unused

    labels = np.array(list(set(valid_tokens)))
    token_clf_index = np.array([M.word2index[w] for w in labels])

    if not labels.size:
        msg = "Document has no valid tokens! This is a problem."
        raise ValueError(msg)

    V = np.array([M[w] for w in labels])
    DV = cdist(V, V, metric="cosine")

    # Values are sometimes "slightly" less than zero due to rounding
    DV[DV < 0] = 0

    cluster_args = {"damping": damping}
    cluster = cluster_clf(**cluster_args)

    y_labels = cluster.fit_predict(DV)

    data = {
        "token_clf_index": token_clf_index,
        "y_labels": y_labels,
    }

    return f_idx, f_sql, data
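# compute_affinity expects a global embedding model M supporting `w in M`,
# vector lookup M[w], and a word2index mapping. The wrapper below is
# hypothetical, written only to make that interface concrete; the real M
# comes from elsewhere in the project. Reuses the cluster_clf and damping
# globals from the sketch above.
class EmbeddingModel(object):
    def __init__(self, words, vectors):
        self.word2index = {w: i for i, w in enumerate(words)}
        self.vectors = vectors

    def __contains__(self, w):
        return w in self.word2index

    def __getitem__(self, w):
        return self.vectors[self.word2index[w]]


M = EmbeddingModel(["cat", "dog", "fish"], np.random.rand(3, 64))
item = ("cat dog cat fish", 0, "documents", "doc_0000")
f_idx, f_sql, data = compute_affinity(item)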
def tweet_iterator(max_t, skip=None):
    # Only the body of this loop survived in the source; the generator
    # head is reconstructed, and f_tweets (one tweet per line) is a
    # hypothetical filename.
    counter = 0
    with open(f_tweets) as FIN:
        for k, line in enumerate(FIN):
            if counter >= max_t:
                break
            if skip and k % skip == 0:
                counter += 1
                yield line.split()


# Represent each tweet by the mean of its in-vocabulary word vectors;
# clf, max_t, and skip_t are expected to be defined earlier in the script.
V = []
for tokens in tweet_iterator(max_t, skip=skip_t):
    v = np.array([clf[t] for t in tokens if t in clf])
    if len(v):  # guard against tweets with no in-vocabulary tokens
        V.append(v.mean(axis=0))
V = np.array(V)

print("Size of V {}".format(V.shape))

cluster = cluster_clf(**cluster_args)
y_labels = cluster.fit_predict(V)

print("Number of clusters {}".format(y_labels.max() + 1))
print("Cluster sizes", collections.Counter(y_labels).most_common())

Z = []
WORDS = []
for i in range(y_labels.max() + 1):  # cluster labels run 0..max inclusive
    idx = y_labels == i
    z = V[idx].mean(axis=0)
    z /= np.linalg.norm(z)
    Z.append(z)
    # Mean projection of the cluster's members onto the unit centroid
    dispersion = V[idx].dot(z).mean()
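# A possible setup for the fragment above; the embedding file, the choice
# of AffinityPropagation, and every parameter value here are assumptions,
# not taken from the original script.
from gensim.models import KeyedVectors
from sklearn.cluster import AffinityPropagation

f_tweets = "tweets.txt"  # hypothetical input, one tweet per line
clf = KeyedVectors.load_word2vec_format("embeddings.bin", binary=True)
cluster_clf = AffinityPropagation
cluster_args = {"damping": 0.6}
max_t, skip_t = 20000, 5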
import itertools

import pandas as pd

names = [label_maker(w) for w in EM]
df = pd.DataFrame(0.0, columns=names, index=names)

# Fill in the pairwise embedding similarity for every distinct emoji pair
for w1, w2 in itertools.product(EM, repeat=2):
    if w1 == w2:
        continue
    name1 = label_maker(w1)
    name2 = label_maker(w2)
    df.loc[name2, name1] = clf.similarity("EMOJI_" + w1, "EMOJI_" + w2)

A = df.values

cluster_args = {"n_clusters": CLUSTER_N}
cluster = cluster_clf(**cluster_args)
y_labels = cluster.fit_predict(A)

# Reorder rows and columns so members of the same cluster sit together
idx = np.argsort(y_labels)
y_labels = y_labels[idx]
A = A[idx, :][:, idx]
labels = np.array(names)[idx]

df2 = pd.DataFrame(A, columns=labels, index=labels)

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
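# A plausible continuation given the plotting imports above: render the
# cluster-sorted similarity matrix as a heatmap. The figure size and
# colormap are illustrative choices, not from the original script.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df2, ax=ax, square=True, cmap="viridis",
            xticklabels=True, yticklabels=True)
plt.tight_layout()
plt.show()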