def main(run=None):
    init_random()
    log_to_info('starting main')
    if run:
        slave_options = run['algo_params']
        run_with(**slave_options)
    else:
        start_manual_run(run_and_say)
def classify(mp, x_test, x_train, y_train):
    log_to_info("Starting classification")
    classifiers = dict(
        logistic=LogisticClassifier,
        logistic2=LogisticClassifier2,
        rbf=RbfClassifier,
        rbf_scv=RbfSVCClassifier,
        svc=LinearSVCClassifier,
    )
    return classifiers[mp.classifier_name]().classify(mp, x_train, y_train, x_test)
def classify(self, mp, train_centroids, training_data_sentiment, test_centroids, testing_data_ids):
    # ****** Fit a random forest and extract predictions
    clf = RandomForestClassifier(n_estimators=mp.random_forest_estimators)
    # Fitting the forest may take a few minutes
    log_to_info('Fitting a random forest to labeled training data...')
    clf = clf.fit(train_centroids, training_data_sentiment)
    result = clf.predict(test_centroids)
    # Return the test results as an id/sentiment frame
    return pd.DataFrame(data={'id': testing_data_ids, 'sentiment': result})
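# Hedged usage sketch: the 'id'/'sentiment' frame returned above matches the
# Kaggle "Bag of Words Meets Bags of Popcorn" submission format, so the caller
# presumably writes it out with to_csv. The classifier instance and
# 'output_path' below are hypothetical stand-ins, not names from this excerpt.
submission = centroid_classifier.classify(
    mp, train_centroids, training_data_sentiment, test_centroids, testing_data_ids)
submission.to_csv(output_path, index=False, quoting=3)  # quoting=3 == csv.QUOTE_NONE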
def slave_main():
    log_handler, log_buffer = setup_logger(True)
    run = get_random_pending_run()
    # noinspection PyBroadException
    try:
        if not run:
            log_to_info('Nothing to start, exiting')
            return
        main(run=run)
    except Exception:
        logging.exception('Unknown error')
    output = get_log_output(log_handler, log_buffer)
    report_results(run['id'], output)
def train_tfidf_vectors(mp, testing_reviews, training_reviews, x_train, x_test):
    log_to_info('Training tf-idf vectors')
    tfv = TfidfVectorizer(max_features=mp.tfid_features)
    tr = list(training_reviews['words'].values)
    te = list(testing_reviews['words'].values)
    space_separated_words = tr + te
    tfid_vectors = tfv.fit_transform(space_separated_words)
    x_train_tfid = tfid_vectors[:len(tr)].toarray()
    x_test_tfid = tfid_vectors[len(tr):].toarray()
    if x_train is None:
        x_train = x_train_tfid
        x_test = x_test_tfid
    else:
        # Append the tf-idf columns to the existing (doc2vec) feature columns.
        x_train = np.concatenate([np.array(list(x_train)), x_train_tfid], axis=1)
        x_test = np.concatenate([np.array(list(x_test)), x_test_tfid], axis=1)
    return x_train, x_test
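# A minimal, self-contained sketch of the concatenation step above, using toy
# data (shapes and values are illustrative only): tf-idf columns are appended
# to any existing doc2vec feature columns, so each review row simply gets wider.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs_train = ['good fun movie', 'terrible boring plot']
docs_test = ['great fun plot']
m = TfidfVectorizer(max_features=5).fit_transform(docs_train + docs_test)
x_train_tfidf = m[:len(docs_train)].toarray()     # rows for the training docs
x_test_tfidf = m[len(docs_train):].toarray()      # rows for the test docs
x_train_d2v = np.random.rand(len(docs_train), 4)  # stand-in for doc2vec vectors
combined = np.concatenate([x_train_d2v, x_train_tfidf], axis=1)
print(combined.shape)  # (2, 4 + number of tf-idf features)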
def classify(self, model, y_test_reviews, reviews, nearest_reviews_count=2000):
    y_test = np.zeros(len(y_test_reviews))
    i = 0
    for d2v_numeric_id in y_test_reviews['d2v_id']:
        if i % 100 == 0:
            log_to_info('Processing {0} of {1} ({2}%)'.format(
                i, len(y_test_reviews), 1.0 * i / len(y_test_reviews) * 100.0))
        d2v_id = get_d2v_identifier(d2v_numeric_id)
        arr = model.most_similar(d2v_id, topn=10000)
        sentiment_sum = 0.0
        total_nearness = 0.0
        total_sentiments = 0
        for key in arr:
            if key[0].startswith('REVIEW_'):
                most_similar_review = key[0]
                most_similar_id = int(most_similar_review.split('_')[1])
                r = reviews[reviews['d2v_id'].eq(most_similar_id)]
                if r['use_for_classifier_training'].all():
                    sentiment = r['sentiment'].values[0]
                    nearness = key[1]
                    sentiment_sum += sentiment * nearness
                    total_nearness += nearness
                    total_sentiments += 1
                    if total_sentiments >= nearest_reviews_count:
                        break
        # log_to_info('{0} predicts {1}'.format(d2v_id, sentiment))
        if total_nearness == 0:
            log_to_info('key {0} has no similar review!'.format(d2v_id))
            y_test[i] = 0
        else:
            y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1
        i += 1
    return y_test
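# Worked toy example of the weighted vote above (hypothetical numbers).
# Three training neighbors with sentiments 1, 1, 0 and similarities 0.9, 0.8, 0.5:
#   sentiment_sum  = 1*0.9 + 1*0.8 + 0*0.5 = 1.7
#   total_nearness = 0.9 + 0.8 + 0.5       = 2.2
# Since 1.7 > 0.5 * 2.2 = 1.1, the predicted sentiment is 1. In effect the
# similarity-weighted mean sentiment of the nearest training reviews is
# thresholded at 0.5.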
def build_train_and_test_x(mp, reviews):
    log_to_info('Starting to load data')
    training_reviews = reviews[reviews['use_for_classifier_training'].eq(True)]
    testing_reviews = reviews[reviews['predict_sentiment'].eq(True)]
    x_train = None
    x_test = None
    if mp.word_vector_dimensionality:
        log_to_info('Starting to load the doc2vec model')
        model_dm = Doc2VecFactory(mp, reviews, 1).get_word2vec_model()
        model_dbow = Doc2VecFactory(mp, reviews, 0).get_word2vec_model()
        x_train = training_reviews['d2v_id'].map(
            lambda d2v_id: convert_to_vector(d2v_id, model_dm, model_dbow)).values
        x_test = testing_reviews['d2v_id'].map(
            lambda d2v_id: convert_to_vector(d2v_id, model_dm, model_dbow)).values
    if mp.tfid_features:
        x_train, x_test = train_tfidf_vectors(mp, testing_reviews, training_reviews, x_train, x_test)
    y_train = list(training_reviews['sentiment'].values)
    return list(x_train), y_train, list(x_test)
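# convert_to_vector is referenced above but not shown in this excerpt. A
# plausible reconstruction (an assumption, not the repo's code): concatenate a
# review's PV-DM and PV-DBOW paragraph vectors, using the old gensim API in
# which document tags are indexed like word keys (consistent with the
# model.most_similar(d2v_id, ...) calls elsewhere in this code).
import numpy as np

def convert_to_vector(d2v_id, model_dm, model_dbow):
    tag = get_d2v_identifier(d2v_id)  # e.g. 'REVIEW_123'
    return np.concatenate([model_dm[tag], model_dbow[tag]])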
def classify(self, mp, x_train, y_train, x_test):
    # SVC defaults to an RBF kernel; the large cache speeds up training.
    clf = SVC(cache_size=6000, verbose=True)
    log_to_info('Fitting a RBF SVC to labeled training data...')
    clf = clf.fit(x_train, y_train)
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test)
    log_to_info('Done!')
    return y_test
def classify(self, mp, x_train, y_train, x_test):
    clf = LinearSVC(dual=False, verbose=False, C=1.0)
    log_to_info('Fitting a LinearSVC to labeled training data...')
    clf = clf.fit(x_train, y_train)
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test)
    log_to_info('Done!')
    return y_test
def classify(self, mp, x_train, y_train, x_test):
    # Approximate an RBF kernel with a Nystroem feature map, then fit a linear SVM.
    feature_map_nystroem = Nystroem(random_state=1, gamma=1.1, n_components=1000)  # gamma=0.00005,
    clf = pipeline.Pipeline([("feature_map", feature_map_nystroem), ("svm", LinearSVC())])
    log_to_info('Fitting a RBF SVM to labeled training data...')
    clf = clf.fit(x_train, y_train)
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test)
    log_to_info('Done!')
    return y_test
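# Why the pipeline above: Nystroem builds an explicit (approximate) RBF feature
# map, so a linear SVM trained on the mapped data approximates a kernel SVM at
# much lower cost on large data. A self-contained sketch on toy data (the gamma
# and n_components values here are illustrative, not the tuned ones):
from sklearn import pipeline
from sklearn.datasets import make_classification
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC, SVC

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
approx = pipeline.Pipeline([
    ('feature_map', Nystroem(gamma=0.1, n_components=100, random_state=1)),
    ('svm', LinearSVC(dual=False)),
]).fit(X, y)
exact = SVC(gamma=0.1).fit(X, y)
print(approx.score(X, y), exact.score(X, y))  # the two scores should be close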
def classify(self, mp, x_train, y_train, x_test):
    x_train_reg = sm.add_constant(x_train)
    x_test_reg = sm.add_constant(x_test)
    logit = sm.Logit(y_train, x_train_reg)
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    clf = logit.fit(disp=0)
    # print(clf.summary())
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test_reg)  # predicted probabilities
    log_to_info('Done!')
    return numpy.rint(y_test)  # round probabilities to 0/1 labels
def run_with(**kwargs):
    log_to_info(str(kwargs))
    check_paths_and_create_directories()
    start = time.time()
    mp = ModelParameters(**kwargs)
    reviews = DataPreparer(mp).convert()
    x_train, y_train, x_test = build_train_and_test_x(mp, reviews)
    y_test_reviews = reviews[reviews['predict_sentiment'].eq(True)].copy()
    y_test_reviews['predicted_sentiment'] = classify(mp, x_test, x_train, y_train)
    log_to_info(str(kwargs))
    ScoreCalculator().print_score(y_test_reviews)
    end = time.time()
    log_to_info('It took {0} seconds'.format(end - start))
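# A direct invocation of run_with, for reference. The keyword names and values
# are copied from the active experiment in start_manual_run below; normally
# run_with is reached via main(), either from a pending run's algo_params or
# through start_manual_run.
run_with(algorithm_version=77.8, word_vector_dimensionality=48, word_context_window=10,
         frequent_words_downsampling_dm=0.0001, frequent_words_downsampling_dbow=0.01,
         negative=25, hierarchical_paragraph_vectors=0, epochs=3, epochs_total=3,
         classifier_name='svc', classifier_c=0.0195, classifier_penalty='l2',
         tfid_features=0, learning_rate_type='exp', experiment_number=1)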
def classify(self, x_train, y_train, x_test):
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    # nn = Classifier(
    #     layers=[
    #         Layer("Maxout", units=100, pieces=2),
    #         Layer("Softmax")],
    #     learning_rate=0.001,
    #     n_iter=25)
    # nn.fit(x_train, y_train)
    # y_test = nn.predict(x_test)
    nn = Regressor(layers=[Layer('Rectifier', units=400), Layer('Linear')],
                   learning_rate=0.02, n_iter=10)
    log_to_info('Fitting a NN to labeled training data...')
    nn.fit(x_train, y_train)
    log_to_info('Predicting test value')
    y_test = nn.predict(x_test)
    log_to_info('Done!')
    return y_test
def classify(self, mp, x_train, y_train, x_test):
    x_train = sm.add_constant(x_train)
    x_test = sm.add_constant(x_test)
    clf = LogisticRegressionCV(verbose=1, cv=5)
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    clf = clf.fit(x_train, y_train)
    log_to_info('Training details')
    log_to_info('Classifier parameters: {}'.format(clf.get_params()))
    log_to_info('On training: {}'.format(clf.score(x_train, y_train) * 100.0))
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test)
    log_to_info('Done!')
    return y_test
def start_manual_run(func):
    # Earlier sweeps, kept for reference. All used word_vector_dimensionality=48,
    # word_context_window=16, frequent_words_downsampling=1e-4, negative=17,
    # hierarchical_paragraph_vectors=0, classifier_name='rbf', classifier_c=0.0195,
    # classifier_penalty='l2', tfid_features=0:
    #   learning_rate_type='linear', (epochs, epochs_total) in
    #     (3, 3), (2, 3), (1, 3), (20, 20), (15, 20), (10, 20), (5, 20), (4, 20),
    #     (3, 20), (2, 20), (1, 20)
    #   learning_rate_type='exp', (epochs, epochs_total) in (3, 3), (2, 3), (1, 3)
    # run_and_say(algorithm_version=3, word_vector_dimensionality=48, word_context_window=10,
    #             frequent_words_downsampling=1e-4, negative=17, hierarchical_paragraph_vectors=1,
    #             epochs=1, classifier_name='rbf')
    version = 77.8
    # epochs = list(range(1, 5)) + list(range(5, 76, 5))
    epochs = list(range(1, 6))
    # for epoch in epochs:
    #     log_to_info('Epoch exp: {}'.format(epoch))
    #     func(algorithm_version=version, word_vector_dimensionality=48, word_context_window=10,
    #          frequent_words_downsampling=0, negative=5, hierarchical_paragraph_vectors=0,
    #          epochs=epoch, epochs_total=max(epochs), classifier_name='rbf', classifier_c=0.0195,
    #          classifier_penalty='l2', tfid_features=0, learning_rate_type='exp')
    #     log_to_info('Epoch was exp: {}'.format(epoch))
    epochs = [3]
    epoch = max(epochs)
    log_to_info('Epoch linear: {}'.format(epoch))
    hpv = 0
    func(algorithm_version=version, word_vector_dimensionality=48, word_context_window=10,
         frequent_words_downsampling_dm=0.0001, frequent_words_downsampling_dbow=0.01,
         negative=25, hierarchical_paragraph_vectors=hpv, epochs=epoch, epochs_total=max(epochs),
         classifier_name='svc', classifier_c=0.0195, classifier_penalty='l2', tfid_features=0,
         learning_rate_type='exp', experiment_number=1)
    # func(algorithm_version=version, word_vector_dimensionality=100, word_context_window=10,
    #      frequent_words_downsampling=0, negative=5, hierarchical_paragraph_vectors=5,
    #      epochs=epoch, epochs_total=max(epochs), classifier_name='svc', classifier_c=0.0195,
    #      classifier_penalty='l2', tfid_features=0, learning_rate_type='linear')
    log_to_info('Epoch was linear: {}'.format(epoch))
def train_model(self):
    log_to_info('Loading training sentences')
    review_d2v_id_list = zip(self.reviews['words'], self.reviews['d2v_id'],
                             self.reviews['best_topics'], self.reviews['second_best_topics'])
    labeled_reviews = []
    if self.dm == 0:
        log_to_info('applying dbow with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dbow))
    elif self.dm == 1:
        log_to_info('applying dm with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dm))
    for space_separated_words, d2v_id, best_topic, second_best_topic in review_d2v_id_list:
        if self.dm == 0:
            labeled_reviews.extend(
                convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dbow,
                                          space_separated_words, d2v_id, best_topic, second_best_topic))
        elif self.dm == 1:
            labeled_reviews.extend(
                convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dm,
                                          space_separated_words, d2v_id, best_topic, second_best_topic))
    log_to_info('Loading Doc2Vec model...')
    # Resume from the highest cached epoch, if any.
    start_epoch = self.epochs + 1
    model = None
    for epoch in range(self.epochs, 0, -1):
        model = self.load_model(epoch)
        if model:
            log_to_info('Found model in cache!')
            break
        start_epoch = epoch
    if not model:
        if self.dm == 0:
            # PV-DBOW
            log_to_info('Yep, this is DBOW!')
            model = Doc2Vec(dm=self.dm, hs=0, workers=self.workers,
                            size=self.mp.word_vector_dimensionality, min_count=self.min_count,
                            window=self.mp.word_context_window,
                            sample=self.mp.frequent_words_downsampling_dbow,
                            seed=random_int(), negative=self.mp.negative)
            # model = Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=8)
        elif self.dm == 1:
            # PV-DM with averaging
            log_to_info('Yep, this is DM!')
            model = Doc2Vec(dm=self.dm, dm_mean=1, hs=0, workers=self.workers,
                            size=self.mp.word_vector_dimensionality, min_count=self.min_count,
                            window=self.mp.word_context_window,
                            sample=self.mp.frequent_words_downsampling_dm,
                            seed=random_int(), negative=self.mp.negative)
            # model = Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=8)
        start1 = time.time()
        model.build_vocab(labeled_reviews)
        end1 = time.time()
        log_to_info('Vocab building for dm{0} took {1} seconds'.format(self.dm, end1 - start1))
    log_to_info('Training Doc2Vec model...')
    for epoch in range(start_epoch, self.epochs + 1):
        log_to_info('Epoch {0} of {1}'.format(epoch, self.epochs))
        m = self.load_model(epoch)
        if m is not None:
            log_to_info('Found model in cache!')
            model = m
            continue
        permuted_labeled_reviews = labeled_reviews[:]
        random.shuffle(permuted_labeled_reviews)
        # Decay the learning rate across epochs (linear or exponential schedule).
        alpha = alpha_for_epoch(epoch, self.mp.epochs_total, self.mp.alpha_max,
                                self.mp.alpha_min, self.mp.learning_rate_type)
        model.min_alpha, model.alpha = alpha, alpha
        start2 = time.time()
        model.train(permuted_labeled_reviews)
        end2 = time.time()
        log_to_info('DM HPV is {0}, DBOW HPV is {1}'.format(
            self.mp.hierarchical_paragraph_vectors_dm, self.mp.hierarchical_paragraph_vectors_dbow))
        log_to_info('Model training for dm{0} took {1} seconds'.format(self.dm, end2 - start2))
        self.model = model
        self.store_model(epoch)
    self.model = model
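# alpha_for_epoch is referenced above but not shown in this excerpt. A plausible
# sketch of the two schedules the parameters imply (a reconstruction under that
# assumption, not the repo's code): decay the learning rate from alpha_max to
# alpha_min, either linearly or geometrically ('exp'), over epochs_total epochs.
def alpha_for_epoch(epoch, epochs_total, alpha_max, alpha_min, learning_rate_type):
    progress = float(epoch - 1) / max(epochs_total - 1, 1)  # 0.0 at epoch 1, 1.0 at the last epoch
    if learning_rate_type == 'linear':
        # Straight-line interpolation from alpha_max down to alpha_min.
        return alpha_max - (alpha_max - alpha_min) * progress
    # 'exp': geometric interpolation between the same endpoints.
    return alpha_max * (alpha_min / alpha_max) ** progress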
def main():
    setup_logger()
    init_random()
    log_to_info('starting main')
    start = time.time()
    reviews = _get_reviews()
    if 'best_topics' in reviews:
        log_to_info('best topics already set, aborting!')
        return
    log_to_info('getting reviews done')
    if not os.path.exists(dictionary_cache_name) or not os.path.exists(mm_cache_name):
        documents = _get_documents(reviews)
    log_to_info('dictionary')
    if os.path.exists(dictionary_cache_name):
        dictionary = gensim.corpora.Dictionary.load(dictionary_cache_name)
    else:
        dictionary = gensim.corpora.Dictionary(documents)
        dictionary.save(dictionary_cache_name)
    log_to_info('mm')
    if os.path.exists(mm_cache_name):
        corpus = gensim.corpora.MmCorpus(mm_cache_name)
    else:
        corpus = [dictionary.doc2bow(text) for text in documents]
        gensim.corpora.MmCorpus.serialize(mm_cache_name, corpus)
    log_to_info('lda')
    if os.path.exists(lda_cache_name):
        if model_type == 'lsi':
            lda = gensim.models.LsiModel.load(lda_cache_name)
        else:
            lda = gensim.models.LdaModel.load(lda_cache_name)
    else:
        if model_type == 'lsi':
            lda = gensim.models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        elif single_pass:
            lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
        else:
            lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20,
                                         update_every=0, passes=20)
        lda.save(lda_cache_name)
    log_to_info('it took {0} seconds'.format(time.time() - start))
    infer_topics(lda, reviews)
    log_to_info('everything took {0} seconds'.format(time.time() - start))
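# infer_topics is not shown in this excerpt. A hedged sketch of what the
# surrounding code implies it does: store each review's two most probable
# topics in the 'best_topics' / 'second_best_topics' columns consumed by
# train_model above. A reconstruction, not the repo's code; the extra
# dictionary/documents parameters are added here for self-containment (the
# repo version, called as infer_topics(lda, reviews), presumably uses
# module-level state instead).
def infer_topics(lda, reviews, dictionary, documents):
    best, second_best = [], []
    for text in documents:
        topics = sorted(lda[dictionary.doc2bow(text)], key=lambda t: -t[1])  # (topic_id, prob)
        best.append(topics[0][0] if topics else -1)
        second_best.append(topics[1][0] if len(topics) > 1 else -1)
    reviews['best_topics'] = best
    reviews['second_best_topics'] = second_best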
def print_score(self, reviews):
    score, correct, total = self.calculate_score(reviews)
    log_to_info("Score: {0}%, which is {1}/{2}".format(score, correct, total))
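# calculate_score is not shown in this excerpt. A plausible sketch, assuming it
# compares the 'predicted_sentiment' column (set in run_with) against the known
# 'sentiment' labels and reports accuracy as a percentage (a reconstruction,
# not the repo's code).
def calculate_score(self, reviews):
    correct = int((reviews['predicted_sentiment'] == reviews['sentiment']).sum())
    total = len(reviews)
    score = 100.0 * correct / total if total else 0.0
    return score, correct, total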
def force_exists(path):
    if not os.path.exists(path):
        log_to_info('The file {0} does not exist!'.format(path))
        exit(1)