from gensim.models import LdaModel


def main(vocab_file, inv_vocab_file, infiles):
    # vocab maps token -> id; inv_vocab is the id -> token mapping
    # that gensim expects for id2word.
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    lda = LdaModel(id2word=inv_vocab, num_topics=200)

    # Stream each input file as a bag-of-words corpus and update the
    # topic model online.
    for f in infiles:
        tc = TweetCorpus(f, vocab)
        lda.update(tc)

    lda.save('topics.lda')
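The helpers load_pickled and TweetCorpus come from elsewhere in this project and are not shown in this listing. A minimal sketch consistent with the call sites above, assuming TweetCorpus streams one tweet per line and yields the gensim-style (token_id, count) documents that lda.update() expects:

import pickle
from collections import Counter


def load_pickled(path):
    # Read a single pickled object back from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)


class TweetCorpus:
    # Streams a tweet file as a gensim bag-of-words corpus; tokens
    # missing from the vocabulary are silently dropped.
    def __init__(self, path, vocab):
        self.path = path
        self.vocab = vocab

    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                ids = [self.vocab[tok] for tok in line.split()
                       if tok in self.vocab]
                yield sorted(Counter(ids).items())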
Example 2
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    # Label 1 marks positive training tweets.
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    return pos_provider
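MemoizedStemmer is also defined elsewhere in the project. A plausible sketch, assuming an NLTK PorterStemmer backend behind a cache (the backend choice is a guess):

from nltk.stem import PorterStemmer


class MemoizedStemmer:
    # Wraps a stemmer with a dict cache so repeated tokens cost one
    # dictionary lookup instead of a fresh stemming pass.
    def __init__(self):
        self._stemmer = PorterStemmer()
        self._cache = {}

    def stem(self, word):
        if word not in self._cache:
            self._cache[word] = self._stemmer.stem(word)
        return self._cache[word]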
Example 3
import pickle


def train_kmeans(embedding_file, label_out, kmeans_random_seed):
    embeddings = load_pickled(embedding_file)

    labels = run_kmeans(embeddings, kmeans_random_seed)

    # protocol=-1 selects the highest pickle protocol available.
    with open(label_out, 'wb') as f:
        pickle.dump(labels, f, protocol=-1)
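run_kmeans is not shown here either. A minimal sketch using scikit-learn; the cluster count is an assumption, since only the random seed is visible at the call site:

from sklearn.cluster import KMeans


def run_kmeans(embeddings, random_seed=None, n_clusters=20):
    # Cluster the embedding rows; returns one integer label per row.
    km = KMeans(n_clusters=n_clusters, random_state=random_seed)
    return km.fit_predict(embeddings)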
Example 4
import pickle


def main(embedding_file, label_out):
    embeddings = load_pickled(embedding_file)

    # Same as train_kmeans above, but with the library's default seed.
    labels = run_kmeans(embeddings)

    with open(label_out, 'wb') as f:
        pickle.dump(labels, f, protocol=-1)
Example 5
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file,
                validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()

    # Positive tweets get label 1, negative tweets label -1. This
    # variant wires the providers without LDA topic features.
    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)

    # Interleave the two labelled streams into one training stream.
    merged = SampleMerger(pos_provider, neg_provider)

    # Validation samples carry no label, hence None.
    validation_provider = ValidationSampleProvider(validation_file, None,
                                                   vocab, tfidf,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
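SampleMerger's implementation is not part of this listing. A hedged sketch of the idea, assuming each provider exposes an iterable samples() method (that interface is a guess): it interleaves the positive and negative streams at random so the classifier never sees a long single-label run.

import random


class SampleMerger:
    # Randomly interleaves two labelled sample streams.
    def __init__(self, pos_provider, neg_provider, seed=None):
        self.providers = (pos_provider, neg_provider)
        self.rng = random.Random(seed)

    def samples(self):
        streams = [iter(p.samples()) for p in self.providers]
        while streams:
            stream = self.rng.choice(streams)
            try:
                yield next(stream)
            except StopIteration:
                streams.remove(stream)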
Example 6
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    # Positive tweets get label 1, negative tweets label -1; this
    # variant also feeds LDA topic features to each provider.
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    merged = SampleMerger(pos_provider, neg_provider)

    # Test tweets have no label, hence None.
    validation_provider = ValidationSampleProvider('./data/test_data.txt',
                                                   None, vocab, tfidf, lda,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
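LdaLoader pairs with the LDA trainer in Example 1. A sketch assuming it wraps gensim's LdaModel.load and densifies per-document topic distributions (the method name topic_vector is an assumption):

from gensim.models import LdaModel


class LdaLoader:
    # Loads a saved LDA model and exposes fixed-length topic vectors.
    def __init__(self, path, num_topics):
        self.model = LdaModel.load(path)
        self.num_topics = num_topics

    def topic_vector(self, bow):
        # Densify gensim's sparse (topic_id, weight) list.
        vec = [0.0] * self.num_topics
        for topic_id, weight in self.model.get_document_topics(
                bow, minimum_probability=0.0):
            vec[topic_id] = weight
        return vec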
Example 7
def main():
    # Input corpora.
    pos_file = './data/train_pos.txt'
    neg_file = './data/train_neg.txt'
    validation = './data/test_data.txt'
    stopwords = './data/stopwords.txt'

    # Intermediate artifacts produced by the pipeline stages.
    vocab_file = 'vocab.dat'
    inv_vocab_file = 'inv_vocab.dat'
    cooc_file = 'cooc.dat'
    embeddings_file = 'embeddings.dat'
    label_file = 'labels.dat'

    submission_file = 'submission.csv'

    # Fixed seeds keep every stage reproducible.
    glove_seed = 1234
    kmeans_seed = 4321
    xgb_seed = 1337
    sampler_seed = 7331

    # Stage 1: build the vocabulary with stopword removal and a
    # frequency cutoff.
    build_vocab([pos_file, neg_file],
                stopwords,
                vocab_file,
                inv_vocab_file,
                cutoff=5)

    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    # Stage 2: co-occurrence counts, then GloVe embeddings.
    build_cooc([pos_file, neg_file], vocab, cooc_file)
    train_glove(cooc_file, embeddings_file, glove_seed)

    # Stage 3: cluster the embeddings into word labels.
    train_kmeans(embeddings_file, label_file, kmeans_seed)

    # Stage 4: train the XGBoost classifier and write the submission.
    train_xgb(vocab_file, pos_file, neg_file, label_file, validation,
              submission_file, xgb_seed, sampler_seed)
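build_vocab is called with a stopword list, two output paths, and cutoff=5, but its body is not shown. A sketch consistent with that call site: count tokens, drop stopwords and tokens seen fewer than cutoff times, then pickle both directions of the mapping.

import pickle
from collections import Counter


def build_vocab(infiles, stopword_file, vocab_out, inv_vocab_out, cutoff=5):
    with open(stopword_file) as f:
        stopwords = set(f.read().split())

    counts = Counter()
    for path in infiles:
        with open(path) as f:
            for line in f:
                counts.update(tok for tok in line.split()
                              if tok not in stopwords)

    # Keep only tokens at or above the frequency cutoff.
    vocab = {}
    for word, n in counts.items():
        if n >= cutoff:
            vocab[word] = len(vocab)
    inv_vocab = {idx: word for word, idx in vocab.items()}

    with open(vocab_out, 'wb') as f:
        pickle.dump(vocab, f, protocol=-1)
    with open(inv_vocab_out, 'wb') as f:
        pickle.dump(inv_vocab, f, protocol=-1)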
Example 8
def main():
    cooc = load_pickled('cooc.dat')

    glove = GloVe(cooc)

    # Ten full passes over the co-occurrence statistics.
    for epoch_num in range(10):
        print("start epoch " + str(epoch_num))
        glove.training_run()

    print("finished")

    glove.save('embeddings.dat')
Example 9
import numpy as np


def train_glove(cooc_file, embeddings_file, numpy_random_seed):
    # Seed NumPy so the embedding initialisation is reproducible.
    np.random.seed(numpy_random_seed)

    cooc = load_pickled(cooc_file)

    glove = GloVe(cooc)

    for epoch_num in range(10):
        print("start epoch " + str(epoch_num))
        glove.training_run()

    print("finished")

    glove.save(embeddings_file)
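The GloVe class itself is defined elsewhere; only training_run() and save() are visible here. A compact sketch of what such a class could look like, fitting w_i . c_j ~= log X_ij with the standard GloVe weighting (the dimension, learning rate, and weighting constants are all assumptions):

import pickle

import numpy as np


class GloVe:
    # One SGD pass per training_run() over the nonzero co-occurrence
    # counts, stored as a mapping (i, j) -> count.
    def __init__(self, cooc, dim=20, eta=0.001, x_max=100.0, alpha=0.75):
        n = 1 + max(max(i, j) for i, j in cooc)
        self.cooc = cooc
        self.W = np.random.normal(scale=0.1, size=(n, dim))
        self.C = np.random.normal(scale=0.1, size=(n, dim))
        self.eta, self.x_max, self.alpha = eta, x_max, alpha

    def training_run(self):
        for (i, j), x in self.cooc.items():
            # Clipped weighting function from the GloVe paper.
            weight = min(1.0, (x / self.x_max) ** self.alpha)
            err = self.W[i] @ self.C[j] - np.log(x)
            step = 2.0 * self.eta * weight * err
            wi = self.W[i].copy()
            self.W[i] -= step * self.C[j]
            self.C[j] -= step * wi

    def save(self, path):
        # Sum word and context vectors, as is common for GloVe.
        with open(path, 'wb') as f:
            pickle.dump(self.W + self.C, f, protocol=-1)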
Example 10
def main(vocab_file, infiles):
    vocab = load_pickled(vocab_file)

    # Accumulate co-occurrence counts over all input corpora.
    build_cooc(infiles, vocab, 'cooc.dat')
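build_cooc is the remaining unshown helper. A sketch under the assumption that it accumulates symmetric within-window co-occurrence counts and pickles them in the (i, j) -> count form the GloVe sketch above expects (the window size is a guess):

import pickle
from collections import Counter


def build_cooc(infiles, vocab, cooc_out, window=5):
    cooc = Counter()
    for path in infiles:
        with open(path) as f:
            for line in f:
                ids = [vocab[tok] for tok in line.split() if tok in vocab]
                for pos, wi in enumerate(ids):
                    # Count each in-window pair once per direction.
                    for wj in ids[pos + 1:pos + 1 + window]:
                        cooc[(wi, wj)] += 1
                        cooc[(wj, wi)] += 1

    with open(cooc_out, 'wb') as f:
        pickle.dump(dict(cooc), f, protocol=-1)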