clf = svm.LinearSVC()

if True:
    # clf = svm.LinearSVC()  # 92009
    # Weight the classes unevenly to counter the label imbalance.
    clf = svm.LinearSVC(class_weight={'1': 0.1, '2': 0.5, '3': 0.12,
                                      '4': 0.2, '5': 0.08})  # 91236
    target, data = helper.get_train_data('../../Data/train_prep.csv',
                                         vectorizer=helper.get_vectorizer(
                                             stop_words=stopwordlist,
                                             min_df=3,
                                             ngram_range=(2, 2)),
                                         pred_pos=0, text_pos=1,
                                         tf_idf=True, remove_header=False)
    print("LOW + MIN3 + BIG + TFIDF + NS")
    x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                        random_state=0)
    clf.fit(x_train, y_train)
    y_predicted = clf.predict(x_test)
    print(confusion_matrix(y_test, y_predicted))

if False:
    # [ 0.82027483  0.82085535  0.82068147]
    target, data = helper.get_train_data('../../Data/train_prep_emot.csv',
                                         vectorizer=helper.get_vectorizer(
                                             # assumed: call was truncated in the
                                             # original; completed to mirror the
                                             # vectorizer settings of the run above
                                             stop_words=stopwordlist,
                                             min_df=3,
                                             ngram_range=(2, 2)),
                                         pred_pos=0, text_pos=1,
                                         tf_idf=True, remove_header=False)
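# Sketch (assumption, not one of the original runs): rather than the
# hand-tuned class_weight dict above, scikit-learn can derive weights
# inversely proportional to class frequency from `target` itself.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(target)
weights = compute_class_weight('balanced', classes=classes, y=target)
balanced_clf = svm.LinearSVC(class_weight=dict(zip(classes, weights)))
# Equivalent shortcut: svm.LinearSVC(class_weight='balanced')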
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import helpers as helper
from nltk.corpus import stopwords

stopwordlist = stopwords.words('english')

clf = AdaBoostClassifier(n_estimators=100)
x_target, x_data = helper.get_train_data(
    '../../Data/train_prep.csv',
    vectorizer=helper.get_vectorizer(stop_words=stopwordlist, min_df=3),
    pred_pos=0, text_pos=1, tf_idf=True, remove_header=False)
scores = cross_val_score(clf, x_data, x_target)
print(scores.mean())
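# Sketch (assumption, not in the original script): a small sweep over
# n_estimators to check whether 100 boosting rounds is actually the
# sweet spot for this data; reuses x_data/x_target loaded above.
for n in (50, 100, 200):
    n_scores = cross_val_score(AdaBoostClassifier(n_estimators=n),
                               x_data, x_target)
    print(n, n_scores.mean())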
import argparse

import numpy as np
import matplotlib.pyplot as plt
# sklearn.grid_search was removed; GridSearchCV lives in model_selection now.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

import helpers

# Block: setting up arguments
ap = argparse.ArgumentParser()
ap.add_argument('--seed', help='random seed', default=11, type=int)
ap.add_argument('--summary', help='show data summaries', action='store_true')
ap.add_argument('--tuning', help='do algorithm tuning', action='store_true')
args = vars(ap.parse_args())
seed = args['seed']

# Block: seed the np random number generator
np.random.seed(seed)

# Block: get the data
df, Xtrn, Xval, Ytrn, Yval = helpers.get_train_data()

# Block: high-level summaries of the data
if args['summary']:
    print(df.head(10))
    print(df.describe())
    print(df.dtypes)
    print(df.groupby('survived').size())
    print(df.shape)
    # Histogram to show distribution
    df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
    plt.savefig('tmp-histograms.png')
    # Density plot to show distribution
    df.plot(kind='density', subplots=True, layout=(3, 4),
            sharex=False)  # assumed: trailing arguments were truncated in the original
    plt.savefig('tmp-density.png')  # assumed, mirroring the histogram save above
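# Sketch (assumption: the original tuning branch is not shown in this
# fragment, though GridSearchCV is imported for it). A minimal pass over
# an SVC using the Xtrn/Ytrn split loaded above; the estimator and the
# parameter grid are illustrative, not the project's actual choice.
if args['tuning']:
    from sklearn.svm import SVC
    grid = GridSearchCV(SVC(),
                        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                        cv=5)
    grid.fit(Xtrn, Ytrn)
    print(grid.best_params_)
    print(accuracy_score(Yval, grid.predict(Xval)))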
if True:
    clf = MLPClassifier(verbose=True, tol=0.001, learning_rate="adaptive",
                        max_iter=10, early_stopping=True, alpha=0.0001,
                        hidden_layer_sizes=(5, 5, 10, 5), random_state=1)
    train_y, train_x = helper.get_train_data('../../Data/train_prep.csv',
                                             vectorizer=helper.get_vectorizer(
                                                 stop_words=stopwordlist,
                                                 min_df=3,
                                                 ngram_range=(2, 2)),
                                             pred_pos=0, text_pos=1,
                                             tf_idf=True, remove_header=False)
    ids, test_x = helper.get_train_data('../../Data/test_prep.csv',
                                        vectorizer=helper.get_vectorizer(
                                            stop_words=stopwordlist,
                                            min_df=3,
                                            ngram_range=(2, 2)),
                                        tf_idf=True, remove_header=True)
    print("LOW + MIN3 + BIG + TFIDF")
    # x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=0)
    clf.fit(train_x, train_y)
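    # Sketch (assumption: the original prediction/output step is not shown).
    # Predict the held-out test set and pair each prediction with its id;
    # the output path and row layout are illustrative.
    import csv

    test_pred = clf.predict(test_x)
    with open('../../Data/mlp_predictions.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        for row_id, pred in zip(ids, test_pred):
            writer.writerow([row_id, pred])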
print(__doc__)

from time import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import helpers as helper

target, data = helper.get_train_data('../../Data/train_prep.csv',
                                     vectorizer=helper.get_vectorizer(min_df=3),
                                     pred_pos=0, text_pos=1,
                                     tf_idf=True, remove_header=False)

n_digits = len(np.unique(target))
labels = target
sample_size = 300

print(79 * '_')
print('% 9s' % 'init'
      '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')


def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             # Metric arguments below were truncated in the original and are
             # completed from the scikit-learn k-means digits benchmark this
             # script is adapted from.
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
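# Usage sketch, following the scikit-learn k-means digits example this script
# is adapted from: benchmark the two standard initialisations on the TF-IDF
# matrix loaded above.
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name="k-means++", data=data)
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=data)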
for x in voc[c]:
    if count % 1000 == 0:
        print(count)
    # Keep only terms that never appear in class '5' and are not already
    # in the vocabulary.
    if x not in voc['5'] and x not in vocab:
        vocab[x] = 1
        count += 1

vocabulario = []
with open(train_path + 'voc--.csv', 'w', newline='') as csvo:
    writero = csv.writer(csvo)
    for x in vocab:
        vocabulario.append(x)
        writero.writerow([x])

if True:
    print("Fetching the data")
    x_target, x_data = helper.get_train_data(train_path + '.csv',
                                             vectorizer=helper.get_vectorizer(
                                                 vocabulary=vocabulario,
                                                 min_df=3),
                                             pred_pos=0, text_pos=1,
                                             tf_idf=True, remove_header=False)
    y_target, y_data = helper.get_train_data(test_path + '.csv',
                                             vectorizer=helper.get_vectorizer(
                                                 vocabulary=vocabulario,
                                                 min_df=3),
                                             pred_pos=0, text_pos=1,
                                             tf_idf=True, remove_header=False)
    print("LOW + MIN3 + BIG + TFIDF")
    clf = MultinomialNB(alpha=0.01)
    clf.fit(x_data, x_target)
    y_predicted = clf.predict(y_data)
    print(confusion_matrix(y_target, y_predicted))
    print(accuracy_score(y_target, y_predicted))

if False:
    count = 0
    with open('../../Data/train_prep_voc1.csv', 'r') as csv4:
        with open('../../Data/train_prep_voc5.csv', 'r') as csv5:
            with open('../../Data/train_prep_voc1-5.csv', 'w', newline='') as csvo:
                reader4 = csv.reader(csv4)
                reader5 = csv.reader(csv5)
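# Sketch (assumption, not in the original script): because get_vectorizer was
# given vocabulary=vocabulario, feature column i corresponds to vocabulario[i],
# so the fitted Naive Bayes model can be inspected for the highest-weighted
# terms per class.
import numpy as np

for cls, log_probs in zip(clf.classes_, clf.feature_log_prob_):
    top = np.argsort(log_probs)[::-1][:10]
    print(cls, [vocabulario[i] for i in top])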