from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Handle for this experiment's results file; every summary line printed below
# is also written to it via write.writeTextToFile.
file = write.initFile("ex12-linearSVC-part2")

# Configure the root logger: INFO level, timestamped records.
# (With no stream/filename given, basicConfig's default handler writes to
# stderr, not stdout.)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

###############################################################################
# Load
strength = 'soft'

#data = pd.read_csv('../../TextFiles/data/tcp_train.csv', sep='\t')
data = ptd.getTrainingData()
# Keep only the rows whose Stance is not 'NONE'.
data = data[data.Stance != 'NONE']

# 10 shuffled folds built from the Stance labels; random_state is fixed so the
# split is reproducible between runs.
cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1)

print("%d training documents" % len(data.Abstract))
write.writeTextToFile("%d training documents" % len(data.Abstract), file)
# NOTE(review): the category count is hard-coded to 3 even though the 'NONE'
# rows were filtered out of `data` above -- confirm this is intentional.
print("%d categories" % 3)
write.writeTextToFile("%d categories" % 3, file)
# Blank separator line. Written as print("") rather than print() because the
# file also uses print statements (Python 2), where print() would emit the
# literal text "()" instead of an empty line.
print("")

###############################################################################
# Classifiers
# MultinomialNB(), BernoulliNB(), SVM(), LinearSVM(), SGDClassifier(), LogisticRegression()
clf = MultinomialNB()
# Candidate downsample rates; every (favor, none) pair from this grid is
# visited by the nested loops below.
rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
training_scores = []
validation_scores = []

for downsample_rate_favor in rates:
    # Fresh per-favor-rate lists.
    # NOTE(review): nothing visible here appends to them; presumably they are
    # filled further down the loop body -- confirm.
    tmp = []
    tmp2 = []
    for downsample_rate_none in rates:
        # Visual separator between grid points. The single-argument,
        # parenthesised print form gives identical output under Python 2
        # and Python 3.
        print(120 * '*')

        # ***** LOAD DATA *****
        if use_downsample:
            print("using down sampling")
            print('Downsample favor: ' + str(downsample_rate_favor))
            print('Downsample none: ' + str(downsample_rate_none))
            train_data = ptd.getTrainingData()
            validate_data = ptd.getValidationData()
            # NOTE(review): test_data is only bound in the else-branch below;
            # any later use on this path would hit an unbound/stale name.
            #test_data = ptd.getTestData()

            # Subsets for the NONE and FAVOR stances, each requested with its
            # own rate from the grid; AGAINST rows are taken unfiltered.
            sub_none = ptd.getDownsample2_0(train_data, "NONE", strength,
                                            downsample_rate_none)
            sub_favor = ptd.getDownsample2_0(train_data, "FAVOR", strength,
                                             downsample_rate_favor)
            against = train_data[train_data.Stance == "AGAINST"]

            # Rebuild the training set from the three per-stance pieces.
            train_data = pd.concat([sub_favor, sub_none, against])
        else:
            print("using nothing")
            train_data = ptd.getTrainingData()
            validate_data = ptd.getValidationData()
            test_data = ptd.getTestData()