Ejemplo n.º 1
0
def run_word_vectors():
    """Build the word-id -> embedding-vector mapping and dump it to JSON.

    Loads the pre-trained vectors and the vocabulary through the project's
    ``utils`` helpers, pairs them up with ``utils.word_vectors``, and writes
    the result to ``Const.words_id2vector_filename``.
    """
    print('reading nyt_vec.bin')
    all_w2vec = utils.read_vec_bin()
    words2id = utils.load_words()
    print('prepare w2vec')
    w2vec = utils.word_vectors(words2id, all_w2vec)
    print('dumping')
    # Context manager guarantees the handle is flushed and closed; the
    # original `json.dump(w2vec, open(...))` leaked the file object.
    with open(Const.words_id2vector_filename, 'w') as f:
        json.dump(w2vec, f)
Ejemplo n.º 2
0
import argparse  # FIX: used below (ArgumentParser, FileType) but was never imported

import numpy as np
from chinese_whispers import chinese_whispers, aggregate_clusters

from utils import triples, grouper, word_vectors, words_vec

# CLI: clustering parameters, the triples input file, and exactly one
# word-vector backend (--w2v binary file OR --pyro endpoint name).
parser = argparse.ArgumentParser()
parser.add_argument('--neighbors', '-n', type=int, default=10)
parser.add_argument('--min-weight', type=float, default=0.)
parser.add_argument('--pickle', type=argparse.FileType('wb'))
parser.add_argument('triples', type=argparse.FileType('r', encoding='UTF-8'))
group = parser.add_mutually_exclusive_group()
group.add_argument('--w2v', default=None, type=argparse.FileType('rb'))
group.add_argument('--pyro', default=None, type=str)
args = parser.parse_args()

# Resolve the embedding backend from the parsed options; the callback is
# invoked when neither --w2v nor --pyro was supplied and aborts via the parser.
w2v = word_vectors(
    args, lambda args: parser.error('Please set the --w2v or --pyro option.'))

spos, _ = triples(args.triples, min_weight=args.min_weight, build_index=False)

# Every distinct word occurring in any subject/predicate/object slot.
vocabulary = {
    word
    for triple in spos
    for word in (triple.subject, triple.predicate, triple.object)
}

vectors = {}

# Fetch embeddings from the backend in batches of 512 words.
for words in grouper(vocabulary, 512):
    vectors.update(words_vec(w2v, words))
spos = [
Ejemplo n.º 3
0
# Map each top trigram to an integer id.
trigram_map = make_map_from_nested(top_tokens['top_trigrams'].values())

# Collect every whitespace-separated token in the training text and assign
# each a unique integer id. (Dropped the redundant `list(...)` wrapper —
# enumerate iterates the set directly.)
results = set()
for sen in train['text']:
    results.update(sen.split())
all_map = {w: i for i, w in enumerate(results)}

# Integer class labels for each split, via the shared class_map.
y_train_ints = np.array([class_map[label] for label in train['class']])
y_dev_ints = np.array([class_map[label] for label in dev['class']])
y_test_ints = np.array([class_map[label] for label in test['class']])

# One-hot encode the labels (10 classes).
y_train = to_categorical(y_train_ints, 10)
y_dev = to_categorical(y_dev_ints, 10)
y_test = to_categorical(y_test_ints, 10)

# Three feature representations per split: word ids, trigram ids, and the
# combined all-token ids.
X_train_words = word_vectors(train['text'], word_map)
X_train_trigrams = trigram_vectors(train['text'], trigram_map)
X_train_all = all_vectors(train['text'], all_map)
X_dev_words = word_vectors(dev['text'], word_map)
X_dev_trigrams = trigram_vectors(dev['text'], trigram_map)
X_dev_all = all_vectors(dev['text'], all_map)
X_test_words = word_vectors(test['text'], word_map)
X_test_trigrams = trigram_vectors(test['text'], trigram_map)
X_test_all = all_vectors(test['text'], all_map)

# Longest sequence across all splits (presumably used later for padding —
# TODO confirm). Generator expressions avoid building throwaway lists.
max_word_len = max(max(len(s) for s in X_train_words),
                   max(len(s) for s in X_dev_words),
                   max(len(s) for s in X_test_words))
max_tri_len = max(max(len(s) for s in X_train_trigrams),
                  max(len(s) for s in X_dev_trigrams),
                  max(len(s) for s in X_test_trigrams))