from gensim.models import LdaModel


def main(vocab_file, inv_vocab_file, infiles):
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)
    # Train a 200-topic LDA model incrementally, updating it one corpus file at a time.
    lda = LdaModel(id2word=inv_vocab, num_topics=200)
    for f in infiles:
        tc = TweetCorpus(f, vocab)
        lda.update(tc)
    lda.save('topics.lda')
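# A minimal sketch of what TweetCorpus could look like, assuming it streams
# bag-of-words documents in the (token_id, count) format that gensim's
# LdaModel.update() expects. The tokenization (whitespace split, vocab filter)
# is an assumption; only the iterable-of-BOW interface is required by gensim.
from collections import Counter


class TweetCorpus:
    def __init__(self, path, vocab):
        self.path = path
        self.vocab = vocab  # assumed: token -> integer id mapping

    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                ids = [self.vocab[tok] for tok in line.split() if tok in self.vocab]
                yield list(Counter(ids).items())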
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    return pos_provider
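# A hypothetical sketch of the TfIdf helper constructed above, assuming it
# precomputes inverse document frequencies over the given corpus files with
# each tweet (one line) treated as a document. The weight() interface and the
# log formula are assumptions, not the project's confirmed implementation.
import math
from collections import Counter


class TfIdf:
    def __init__(self, vocab, corpus_files):
        self.vocab = vocab
        df = Counter()
        n_docs = 0
        for path in corpus_files:
            with open(path) as f:
                for line in f:
                    n_docs += 1
                    # Count each in-vocabulary token at most once per tweet.
                    df.update(set(tok for tok in line.split() if tok in vocab))
        self.idf = {tok: math.log(n_docs / df[tok]) for tok in df}

    def weight(self, token, count, doc_len):
        # tf-idf of a token within a single tweet (hypothetical interface).
        return (count / doc_len) * self.idf.get(token, 0.0)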
import pickle


def train_kmeans(embedding_file, label_out, kmeans_random_seed):
    embeddings = load_pickled(embedding_file)
    labels = run_kmeans(embeddings, kmeans_random_seed)
    # protocol=-1 selects the highest pickle protocol available.
    with open(label_out, 'wb') as f:
        pickle.dump(labels, f, protocol=-1)
import pickle


def main(embedding_file, label_out):
    embeddings = load_pickled(embedding_file)
    labels = run_kmeans(embeddings)
    with open(label_out, 'wb') as f:
        pickle.dump(labels, f, protocol=-1)
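# A minimal sketch of run_kmeans under the assumption that it wraps
# scikit-learn's KMeans; the cluster count (30 here) is an assumption. The
# optional seed matches both call sites above: the seeded train_kmeans and
# the unseeded main.
import numpy as np
from sklearn.cluster import KMeans


def run_kmeans(embeddings, seed=None, n_clusters=30):
    X = np.asarray(embeddings)
    # fit_predict returns one cluster label per embedding row.
    return KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(X)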
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file, validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()
    # Positive tweets are labelled 1, negative tweets -1.
    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    # Validation samples carry no label, hence None.
    validation_provider = ValidationSampleProvider(validation_file, None, vocab,
                                                   tfidf, label_vectorizer, stemmer)
    return merged, validation_provider
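# A hypothetical sketch of SampleMerger: it is assumed to interleave samples
# from the positive and negative providers at random, so the merged training
# stream is roughly class-balanced. The samples() iterator interface on the
# providers is an assumption.
import random


class SampleMerger:
    def __init__(self, pos_provider, neg_provider, seed=None):
        self.providers = [pos_provider, neg_provider]
        self.rng = random.Random(seed)

    def samples(self):
        streams = [iter(p.samples()) for p in self.providers]
        while streams:
            s = self.rng.choice(streams)
            try:
                yield next(s)
            except StopIteration:
                # This provider is exhausted; keep draining the other one.
                streams.remove(s)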
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    validation_provider = ValidationSampleProvider('./data/test_data.txt', None, vocab,
                                                   tfidf, lda, label_vectorizer, stemmer)
    return merged, validation_provider
def main():
    pos_file = './data/train_pos.txt'
    neg_file = './data/train_neg.txt'
    validation = './data/test_data.txt'
    stopwords = './data/stopwords.txt'
    vocab_file = 'vocab.dat'
    inv_vocab_file = 'inv_vocab.dat'
    cooc_file = 'cooc.dat'
    embeddings_file = 'embeddings.dat'
    label_file = 'labels.dat'
    submission_file = 'submission.csv'

    # Fixed seeds make every stage of the pipeline reproducible.
    glove_seed = 1234
    kmeans_seed = 4321
    xgb_seed = 1337
    sampler_seed = 7331

    # Full pipeline: vocabulary -> co-occurrence counts -> GloVe embeddings
    # -> k-means cluster labels -> XGBoost training and submission file.
    build_vocab([pos_file, neg_file], stopwords, vocab_file, inv_vocab_file, cutoff=5)
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)
    build_cooc([pos_file, neg_file], vocab, cooc_file)
    train_glove(cooc_file, embeddings_file, glove_seed)
    train_kmeans(embeddings_file, label_file, kmeans_seed)
    train_xgb(vocab_file, pos_file, neg_file, label_file, validation,
              submission_file, xgb_seed, sampler_seed)
def main():
    cooc = load_pickled('cooc.dat')
    glove = GloVe(cooc)
    for epoch_num in range(10):
        print(f"start epoch {epoch_num}")
        glove.training_run()
    print("finished")
    glove.save('embeddings.dat')
import numpy as np


def train_glove(cooc_file, embeddings_file, numpy_random_seed):
    # Seed numpy so embedding initialisation and training are reproducible.
    np.random.seed(numpy_random_seed)
    cooc = load_pickled(cooc_file)
    glove = GloVe(cooc)
    for epoch_num in range(10):
        print(f"start epoch {epoch_num}")
        glove.training_run()
    print("finished")
    glove.save(embeddings_file)
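# A minimal sketch of what a GloVe class with this interface could look like:
# each training_run() is one SGD pass over the nonzero co-occurrence entries,
# minimising f(x_ij) * (w_i . c_j - log x_ij)^2 with the standard GloVe
# weighting f(x) = min(1, (x / x_max)^alpha). Bias terms are omitted for
# brevity; the embedding size, learning rate, x_max, alpha, and the
# scipy.sparse layout of cooc are all assumptions. Using the global
# np.random state keeps it consistent with the seeding in train_glove above.
import pickle
import numpy as np


class GloVe:
    def __init__(self, cooc, dim=20, eta=0.001, x_max=100.0, alpha=0.75):
        self.cooc = cooc.tocoo()  # assumed: sparse matrix of co-occurrence counts
        self.eta = eta
        self.x_max = x_max
        self.alpha = alpha
        self.W = np.random.normal(size=(cooc.shape[0], dim)) / dim  # word vectors
        self.C = np.random.normal(size=(cooc.shape[1], dim)) / dim  # context vectors

    def training_run(self):
        for i, j, x in zip(self.cooc.row, self.cooc.col, self.cooc.data):
            weight = min(1.0, (x / self.x_max) ** self.alpha)
            err = self.W[i].dot(self.C[j]) - np.log(x)
            grad = 2.0 * weight * err
            # Keep the pre-update word vector so both updates use consistent values.
            w_i = self.W[i].copy()
            self.W[i] -= self.eta * grad * self.C[j]
            self.C[j] -= self.eta * grad * w_i

    def save(self, path):
        # Assumption: word and context vectors are summed, matching load_pickled.
        with open(path, 'wb') as f:
            pickle.dump(self.W + self.C, f, protocol=-1)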
def main(vocab_file, infiles):
    vocab = load_pickled(vocab_file)
    build_cooc(infiles, vocab, 'cooc.dat')
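# A likely definition of the load_pickled helper used throughout these
# scripts; it is assumed to be a thin wrapper around pickle.load.
import pickle


def load_pickled(path):
    with open(path, 'rb') as f:
        return pickle.load(f)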