# ---------------------------------------------------------------------------
# Example 1: p-mean (power mean) sentence embeddings
# ---------------------------------------------------------------------------
# dataset name is passed on the command line
import sys

name = sys.argv[1]

# load data
from py.utils.load_data import read_dataset

X_train, _, X_test, _ = read_dataset(name)

from py.utils.sent2vec import sent2vec
from py.utils.safe_pickle import pickle_dump
from tqdm import tqdm

# p-mean (power mean) embedding calculator
import numpy


def p_mean_vector(powers, vectors):
    """Concatenate one 300-d power-mean block per p in `powers`."""
    if len(vectors) <= 1:
        # too few word vectors: fall back to an all-zeros embedding
        return numpy.zeros(300 * len(powers))
    embeddings = []
    for p in powers:
        # complex dtype lets negative components survive fractional powers;
        # the real part is taken after the p-th root of the mean
        embeddings.append(
            numpy.power(
                numpy.mean(numpy.power(numpy.array(vectors, dtype=complex), p),
                           axis=0), 1 / p).real)
    return numpy.hstack(embeddings)


powers_list = [[1.0], [1.0, 2.0], [1.0, 2.0, 3.0],
               [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]]
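
# Quick sanity check for p_mean_vector (illustrative addition, not part of
# the original script): two powers over 300-d vectors concatenate into a
# 600-d embedding.
_toy_vectors = [numpy.random.rand(300) for _ in range(5)]
assert p_mean_vector([1.0, 2.0], _toy_vectors).shape == (600,)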

# ---------------------------------------------------------------------------
# Example 2: 1-NN evaluation on a precomputed distance matrix
# ---------------------------------------------------------------------------

dataset = "amazon"      # assumed values, mirroring Example 3 below
distname = "wmddist"

fname = "../../../produced/" + distname + "_" + dataset + ".numpyz.npz"

from numpy import load, nan_to_num
from scipy.spatial.distance import squareform

# condensed pairwise distances -> square matrix; replace NaNs with 0
distmat = squareform(load(fname)["dist"])
distmat = nan_to_num(distmat)

# clip spurious negative distances and rescale to [0, 1]
distmat[distmat < 0] = 0.0
distmat = distmat / distmat.max()
print(distmat.shape)
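
# Illustrative note on squareform: a condensed vector of n*(n-1)/2 pairwise
# distances becomes a symmetric n x n matrix, e.g.
#   squareform([1.0, 2.0, 3.0]) -> [[0, 1, 2],
#                                   [1, 0, 3],
#                                   [2, 3, 0]]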

# split the distance matrix back into train/test blocks; reload the
# original dataset to recover the train/test boundary
from py.utils.load_data import read_dataset

X_train, Y_train, X_test, Y_test = read_dataset(dataset)
train_idx = len(X_train)

# train-train distances (square block) and test-train distances (rectangular)
X_train_mat = distmat[:train_idx, :train_idx]
X_test_train_mat = distmat[train_idx:, :train_idx]

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import classification_report, accuracy_score

knn = KNN(n_neighbors=1, metric="precomputed")
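# with metric="precomputed", fit() expects the train-train distance matrix
# and predict() expects test-train distances (rows = test samples)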

knn.fit(X_train_mat, Y_train)
predict = knn.predict(X_test_train_mat)

report = classification_report(Y_test, predict, digits=5)
acc = accuracy_score(Y_test, predict)
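
# Report the results (illustrative addition; the original snippet stops
# after computing the metrics).
print(report)
print("accuracy:", acc)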

# ---------------------------------------------------------------------------
# Example 3: nearest-neighbour sentence lookup with exact WMD embeddings
# ---------------------------------------------------------------------------

dataset = "amazon"
distname = "wmddist"
best_params = "100-1.0-1.0"

from py.utils.safe_pickle import pickle_load

embeddings = pickle_load("../../../exact_embeddings/" + distname + "_" +
                         dataset + "/" + best_params + ".p")
print(embeddings.shape)

from py.utils.load_data import read_dataset

X_train, _, X_test, _ = read_dataset(dataset)
all_sent = X_train + X_test
print(len(all_sent))
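
# Consistency check (illustrative addition): the precomputed embeddings are
# assumed to have one row per sentence, in train + test order.
assert len(embeddings) == len(all_sent)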

from scipy.spatial.distance import cosine


def sim_sent(embeddings, query_idx):
    """Return (cosine similarity, index) of the closest other sentence."""
    dist = float("inf")
    best_match_idx = None
    query_emb = embeddings[query_idx]
    for i in range(len(embeddings)):
        if i != query_idx:  # skip the query sentence itself
            d = cosine(query_emb, embeddings[i])
            if d < dist:
                dist = d
                best_match_idx = i
    return 1 - dist, best_match_idx
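
# Example query (illustrative addition): find the sentence most similar to
# the first one in the dataset.
_sim, _match = sim_sent(embeddings, 0)
print("query:", all_sent[0])
print("closest ({:.4f}):".format(_sim), all_sent[_match])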

# ---------------------------------------------------------------------------
# Example 4: export a dataset and word vectors for the WMD code
# ---------------------------------------------------------------------------
# dataset name is passed on the command line
import sys

name = sys.argv[1]

# load the data and write it out in the label<TAB>sentence format expected
# by the WMD code
from py.utils.load_data import read_dataset
X_train, Y_train, X_test, Y_test = read_dataset(name)
with open("../../../data/" + name + "_for_wmd.txt", "w") as f:
    for i in range(len(X_train)):
        f.write("{}\t{}\n".format(Y_train[i], X_train[i]))
    for i in range(len(X_test)):
        f.write("{}\t{}\n".format(Y_test[i], X_test[i]))

from py.distances.wmd.get_word_vectors import read_line_by_line
import gensim
# load word2vec model (trained on Google News)
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../../resources/GoogleNews-vectors-negative300.bin.gz', binary=True)
vec_size = 300
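
# Quick check (illustrative; assumes "king" is in the vocabulary):
# model["king"].shape == (vec_size,)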

# specify train/test datasets
train_dataset = "../../../data/" + name + "_for_wmd.txt"  # e.g.: 'twitter.txt'
save_file = "../../../produced/" + name + "_vec.pk"  # e.g.: 'twitter.pk'

# read document data
(X, BOW_X, y, C, words) = read_line_by_line(train_dataset, [], model, vec_size)

# save pickle of extracted variables
import pickle
with open(save_file, 'wb') as f:
    pickle.dump([X, BOW_X, y, C, words], f)
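
# Round-trip check (illustrative addition): reload what was just written.
with open(save_file, 'rb') as f:
    X2, BOW_X2, y2, C2, words2 = pickle.load(f)
assert len(X2) == len(X)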