print 'Downsample favor: ' + str(downsample_rate_favor) print 'Downsample none: ' + str(downsample_rate_none) train_data = ptd.getTrainingData() validate_data = ptd.getValidationData() #test_data = ptd.getTestData() sub_none = ptd.getDownsample2_0(train_data, "NONE", strength, downsample_rate_none) sub_favor = ptd.getDownsample2_0(train_data, "FAVOR", strength, downsample_rate_favor) against = train_data[train_data.Stance == "AGAINST"] train_data = pd.concat([sub_favor, sub_none, against]) else: print("using nothing") train_data = ptd.getTrainingData() validate_data = ptd.getValidationData() test_data = ptd.getTestData() if use_upsample: print("using up sampling") train_data = pd.concat([train_data, train_data[train_data.Stance == "AGAINST"]]) cv = StratifiedKFold(train_data.Stance, n_folds=10, shuffle=True, random_state=1) # Select classifiers to use classifiers = [ #LinearSVC(C=1.178), SVC(C=6.9183097091893631, kernel='linear', shrinking=True) #MultinomialNB(alpha=0.1, fit_prior=False) #LogisticRegression() ]
from sklearn.metrics import classification_report
# NOTE(review): `sklearn.cross_validation` is the legacy module (removed in
# modern scikit-learn in favour of `sklearn.model_selection`) — kept here to
# match the version this script was written against; confirm before upgrading.
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import pandas as pd
import sklearn

# Downsampling strength setting consumed elsewhere in this script.
strength = 'soft'

# ***** LOAD DATA *****
# Pull the three pre-defined splits from the project's data module.
train_data = ptd.getTrainingData()
validate_data = ptd.getValidationData()
test_data = ptd.getTestData()

# 10-fold stratified cross-validation over the stance labels,
# seeded for reproducibility (legacy API: y first, n_folds=).
cv = StratifiedKFold(train_data.Stance, n_folds=10, shuffle=True, random_state=1)

# Select classifiers to use
# Exactly one classifier is active; the commented-out alternatives record
# previously tried models and their tuned hyperparameters.
classifiers = [
    LinearSVC(C=2.3988329190194899, multi_class='crammer_singer'),
    #SVC(C=5.2, kernel='linear')
    #MultinomialNB(alpha=0.63, fit_prior=True)
    #LogisticRegression(C=22.759, penalty='l2', solver='lbfgs')
    #SGDClassifier(alpha=0.0001, loss='squared_hinge')
    #BernoulliNB(alpha=0.1, fit_prior=True)
]
import json ################# # Parameters # ################# store_to_file = 0 ################ # Load Data # ################ print("Loading data...") train_data = pd.concat([ptd.getTrainingData(), ptd.getValidationData(), ptd.getTestData()]) unlabelled_data = ptd.getUnlabelledData() ######################### # Train classifier # ######################### print("Training classifier") best_classifier = LinearSVC(C=1.178) pipeline = Pipeline([('vect', CountVectorizer(decode_error='ignore', analyzer='word', ngram_range=(1, 2), stop_words= None,