Code example #1
import numpy as np
from scipy.spatial.distance import cdist


def compute_local_affinity(V):
    # `damping` and `cluster_clf` are module-level globals in the
    # source project; cluster_clf is a scikit-learn clustering class.
    global damping

    cluster_args = {"damping": damping}
    cluster = cluster_clf(**cluster_args)

    # Pairwise cosine distances between the row vectors of V
    DV = cdist(V, V, metric='cosine')
    z_labels = cluster.fit_predict(DV)

    # print("{} unique labels found".format(np.unique(z_labels).shape))
    return V, z_labels
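A minimal way to exercise this function, assuming `cluster_clf` is scikit-learn's `AffinityPropagation` (whose `damping` parameter matches the global used above); the bindings below are assumptions, not part of the original project:

import numpy as np
from sklearn.cluster import AffinityPropagation

# Assumed bindings for the snippet's module-level globals
cluster_clf = AffinityPropagation
damping = 0.5

V = np.random.rand(20, 50)  # 20 row vectors of dimension 50
V, z_labels = compute_local_affinity(V)
print("{} unique labels found".format(np.unique(z_labels).size))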
Code example #2
import numpy as np
from scipy.spatial.distance import cdist


def compute_affinity(item):
    # `M`, `damping`, and `cluster_clf` are module-level globals in the
    # source project; M maps tokens to embedding vectors and exposes a
    # word2index lookup.
    text, f_idx, table_name, f_sql = item
    tokens = text.split()

    # Find out which tokens are defined in the embedding model
    valid_tokens = [w for w in tokens if w in M]
    labels = np.array(list(set(valid_tokens)))

    if not labels.size:
        msg = "Document has no valid tokens! This is a problem."
        raise ValueError(msg)

    token_clf_index = np.array([M.word2index[w]
                                for w in labels])

    V = np.array([M[w] for w in labels])
    DV = cdist(V, V, metric='cosine')

    # Values are sometimes "slightly" less than zero due to rounding
    DV[DV < 0] = 0

    cluster_args = {"damping": damping}
    cluster = cluster_clf(**cluster_args)

    y_labels = cluster.fit_predict(DV)

    data = {
        "token_clf_index": token_clf_index,
        "y_labels": y_labels,
    }

    return f_idx, f_sql, data
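The embedding model `M` is not shown in the source; a minimal stand-in that satisfies the interface the function relies on (`in`, item lookup, and `word2index`) is sketched below, purely for illustration, again assuming `cluster_clf` is `AffinityPropagation`:

import numpy as np
from sklearn.cluster import AffinityPropagation

class ToyEmbedding:
    """Minimal stand-in for the project's embedding model M."""
    def __init__(self, vectors):
        self.vectors = vectors
        self.word2index = {w: i for i, w in enumerate(vectors)}

    def __contains__(self, w):
        return w in self.vectors

    def __getitem__(self, w):
        return self.vectors[w]

M = ToyEmbedding({w: np.random.rand(8) for w in ["red", "green", "blue"]})
cluster_clf, damping = AffinityPropagation, 0.5

# item = (text, f_idx, table_name, f_sql); all values here are hypothetical
item = ("red green blue purple", 0, "documents", "doc_0001.sql")
f_idx, f_sql, data = compute_affinity(item)
print(data["y_labels"])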
Code example #3
File: polysem.py Project: thoppe/polysemous-emoji
                if skip and k % skip == 0:
                    counter += 1
                    yield line.split()


V = []
for tokens in tweet_iterator(max_t, skip=skip_t):
    # Average the embedding vectors of the tokens known to the model
    v = np.array([clf[t] for t in tokens if t in clf])
    V.append(v.mean(axis=0))

V = np.array(V)

print("Size of V {}".format(V.shape))

cluster = cluster_clf(**cluster_args)
y_labels = cluster.fit_predict(V)

# Labels run from 0 to max, so the cluster count is max() + 1
print("Number of clusters {}".format(y_labels.max() + 1))
print("Cluster sizes", collections.Counter(y_labels).most_common())

Z = []
WORDS = []
for i in range(y_labels.max() + 1):
    idx = y_labels == i

    # Normalized centroid of cluster i
    z = V[idx].mean(axis=0)
    z /= np.linalg.norm(z)
    Z.append(z)

    # Mean cosine similarity of cluster members to their centroid
    dispersion = V[idx].dot(z).mean()
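The opening of `tweet_iterator` is cut off in this excerpt; only its innermost lines survive. A plausible reconstruction consistent with that tail (the line counter `k`, the `skip` stride, the yielded-tweet counter, and the `max_t` cap) might look like the following; the file handling and the `f_tweets` name are assumptions:

def tweet_iterator(max_t, skip=0, f_tweets="tweets.txt"):
    # Hypothetical reconstruction: yield every skip-th tokenized line,
    # stopping once max_t tweets have been yielded.
    counter = 0
    with open(f_tweets) as FIN:
        for k, line in enumerate(FIN):
            if counter >= max_t:
                break
            if skip and k % skip == 0:
                counter += 1
                yield line.split()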
Code example #4
File: plot_diffs.py Project: thoppe/polysemous-emoji
names = list(map(label_maker, EM))

df = pd.DataFrame(0.0, columns=names, index=names)
for w1, w2 in itertools.product(EM, repeat=2):
    if w1 == w2:
        continue

    name1 = label_maker(w1)
    name2 = label_maker(w2)

    # .loc avoids chained-indexing assignment, which can silently
    # write to a copy of the frame
    df.loc[name2, name1] = clf.similarity("EMOJI_" + w1, "EMOJI_" + w2)

A = df.values

cluster_args = {"n_clusters": CLUSTER_N}
cluster = cluster_clf(**cluster_args)

y_labels = cluster.fit_predict(A)

# Reorder rows and columns so members of the same cluster are adjacent
idx = np.argsort(y_labels)
y_labels = y_labels[idx]
A = A[idx, :][:, idx]

labels = np.array(names)[idx]
df2 = pd.DataFrame(A, columns=labels, index=labels)
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
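The excerpt stops right after the plotting imports; the plot itself is not shown. One natural continuation, offered only as an illustrative sketch (the figure size, colormap, and output filename are assumptions), would be a heatmap of the reordered similarity matrix:

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df2, ax=ax, square=True, cmap="viridis")
plt.tight_layout()
plt.savefig("emoji_similarity.png")  # hypothetical output path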