def train_kaggle(dataset, alg="rig", data="bow"):
    """Fit one classifier on a Kaggle split, save its predictions, return scores.

    Args:
        dataset: 3-tuple ``(train_x, train_y, test_x)``.
        alg: Classifier key. One of ``"svm"`` (SGDClassifier, default loss),
            ``"svm_sq"`` (SGDClassifier, squared hinge loss), ``"log"``
            (LogisticRegression), ``"per"`` (Perceptron), ``"rig"``
            (RidgeClassifier), ``"pa"`` (PassiveAggressiveClassifier) or
            ``"nb"`` (MultinomialNB).
        data: Label of the feature set; it is only used to build the name
            handed to ``save_csv`` (``alg + "_" + data``).

    Returns:
        A pair ``(train_scores, test_scores)``: the ``decision_function``
        output on ``train_x`` and ``test_x``, or the ``predict_proba``
        output for ``"nb"``.

    Raises:
        NotImplementedError: If ``alg`` is not one of the keys above.
    """
    train_x, train_y, test_x = dataset
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("shape for training data is %s" % (train_x.shape,))

    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20,
                            loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    elif alg == "nb":
        # The "nb" return path below used to be unreachable because this key
        # fell through to NotImplementedError. MultinomialNB is assumed to be
        # imported at module level, as the doc2vec `train` function in this
        # file already uses it.
        clf = MultinomialNB()
    else:
        raise NotImplementedError("unknown alg: %r" % (alg,))

    print("training with %s..." % alg)
    clf.fit(train_x, train_y)

    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    # Naive Bayes exposes no decision_function; return class probabilities.
    return clf.predict_proba(train_x), clf.predict_proba(test_x)
def multi_learner(n_estimators=200, alg="et"):
    """Fit a second-stage classifier on stacked features and save predictions.

    The features returned by ``read_all_predict_score()`` are horizontally
    stacked (as sparse matrices) in front of the features returned by
    ``bow_kaggle_dataset()``. The chosen classifier is fitted on the stacked
    training matrix and its test predictions are passed to
    ``save_csv(..., "test")``.
    NOTE(review): ``read_all_predict_score`` presumably loads scores saved by
    earlier models -- confirm against its definition.

    Args:
        n_estimators: Number of trees for the ``"rf"`` / ``"et"`` ensembles
            (not used by ``"log"``).
        alg: ``"rf"`` (RandomForestClassifier), ``"et"``
            (ExtraTreesClassifier) or ``"log"`` (LogisticRegression).

    Raises:
        NotImplementedError: If ``alg`` is none of the keys above.
    """
    bow_train, labels, bow_test = bow_kaggle_dataset()
    score_train, score_test = read_all_predict_score()

    stacked_train = sparse.hstack([score_train, bow_train])
    stacked_test = sparse.hstack([score_test, bow_test])

    print("training with %s %s" % (alg, n_estimators))

    # Lazy factories: only the requested estimator is ever constructed.
    builders = {
        "rf": lambda: RandomForestClassifier(
            n_estimators=n_estimators, oob_score=True, verbose=1),
        "et": lambda: ExtraTreesClassifier(
            n_estimators=n_estimators, verbose=1),
        "log": lambda: LogisticRegression(verbose=1, n_jobs=2),
    }
    if alg not in builders:
        raise NotImplementedError
    model = builders[alg]()

    model.fit(stacked_train, labels)
    save_csv(model.predict(stacked_test), "test")
def train(data=SST_KAGGLE, alg='logcv'):
    """Train a classifier on aggregated word vectors and save test predictions.

    Vectors come from ``read_aggregated_vectors(google=True, data=data)`` and
    are converted to NumPy arrays before fitting. Predictions for the test
    set are handed to ``save_csv``.

    NOTE(review): a second function named ``train`` is defined later in this
    file; if both stay in the same module, the later definition replaces
    this one at import time.

    Args:
        data: Dataset selector forwarded to ``read_aggregated_vectors``.
        alg: ``'svm'`` (SVC), ``'log'`` (LogisticRegression) or ``'logcv'``
            (LogisticRegressionCV with 5 folds).

    Raises:
        NotImplementedError: If ``alg`` is none of the keys above.
    """
    raw_train, raw_labels, raw_test = read_aggregated_vectors(google=True,
                                                              data=data)
    features = np.asarray(raw_train)
    labels = np.asarray(raw_labels)
    held_out = np.asarray(raw_test)

    print("shape for training data is %s" % (features.shape,))

    # Lazy factories: only the requested estimator is ever constructed.
    builders = {
        'svm': lambda: SVC(verbose=1),
        'log': lambda: LogisticRegression(verbose=1),
        'logcv': lambda: LogisticRegressionCV(cv=5, verbose=1),
    }
    if alg not in builders:
        raise NotImplementedError
    model = builders[alg]()

    print("training...")
    model.fit(features, labels)
    save_csv(model.predict(held_out))
def train(data=SST_KAGGLE, alg='log'):
    """Train a classifier on pickled doc2vec features and save test predictions.

    Labels are the second element returned by ``vectorize_text(data=data)``;
    the train/test feature matrices come from
    ``read_doc2vec_pickle(dm=False)``. Predictions for the test set are
    handed to ``save_csv``.

    Args:
        data: Dataset selector forwarded to ``vectorize_text``.
        alg: ``'svm'`` (SVC), ``'log'`` (LogisticRegression) or ``'nb'``
            (MultinomialNB).

    Raises:
        NotImplementedError: If ``alg`` is none of the keys above.
    """
    labels = vectorize_text(data=data)[1]
    features, held_out = read_doc2vec_pickle(dm=False)

    # An earlier variant (left disabled in the original) instead hstacked the
    # tf-idf outputs of senti_lexicon_vectorizor and senti_wordnet_vectorizer.

    print("shape for training data is %s" % (features.shape,))

    # Lazy factories: only the requested estimator is ever constructed.
    builders = {
        'svm': lambda: SVC(verbose=1),
        # Original author's note on this setting: "61.756, no phrase".
        'log': lambda: LogisticRegression(verbose=1),
        'nb': lambda: MultinomialNB(),
    }
    if alg not in builders:
        raise NotImplementedError
    model = builders[alg]()

    print("training...")
    model.fit(features, labels)
    save_csv(model.predict(held_out))