Example #1
# Assumed imports for this snippet; they mirror the ones shown in Example #2.
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import helpers as helper

stopwordlist = stopwords.words('english')

clf = svm.LinearSVC()
if True:
    # clf = svm.LinearSVC()  #92009
    clf = svm.LinearSVC(class_weight={
        '1': 0.1,
        '2': 0.5,
        '3': 0.12,
        '4': 0.2,
        '5': 0.08
    })  #91236
    target, data = helper.get_train_data('../../Data/train_prep.csv',
                                         vectorizer=helper.get_vectorizer(
                                             stop_words=stopwordlist,
                                             min_df=3,
                                             ngram_range=(2, 2)),
                                         pred_pos=0,
                                         text_pos=1,
                                         tf_idf=True,
                                         remove_header=False)
    print "LOW + MIN3 + BIG + TFIDF + NS"
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        random_state=0)
    clf.fit(x_train, y_train)
    y_predicted = clf.predict(x_test)
    print(confusion_matrix(y_test, y_predicted))
if False:
    # [0.82027483  0.82085535  0.82068147]
    # (Source cut off inside this call; the remaining arguments are assumed to
    # mirror the call in the branch above.)
    target, data = helper.get_train_data(
        '../../Data/train_prep_emot.csv',
        vectorizer=helper.get_vectorizer(stop_words=stopwordlist, min_df=3,
                                         ngram_range=(2, 2)),
        pred_pos=0, text_pos=1, tf_idf=True, remove_header=False)
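    # (The three scores in the comment above look like the output of a 3-fold
    # cross_val_score, so the branch presumably continued with something along
    # these lines; this is an assumption, not the original code.)
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(clf, data, target, cv=3)
    print(scores)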
Example #2
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import helpers as helper
from nltk.corpus import stopwords

stopwordlist = stopwords.words('english')

clf = AdaBoostClassifier(n_estimators=100)
x_target, x_data = helper.get_train_data(
    '../../Data/train_prep.csv',
    vectorizer=helper.get_vectorizer(stop_words=stopwordlist, min_df=3),
    pred_pos=0,
    text_pos=1,
    tf_idf=True,
    remove_header=False)

scores = cross_val_score(clf, x_data, x_target)
print(scores.mean())
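
Every text-classification snippet here leans on the same helpers module, which is never shown (the Titanic-style Example #3 below uses its own variant). A minimal sketch of what get_vectorizer and get_train_data might look like, inferred purely from the call sites; all names and behavior below are assumptions, not the original code:

# Hypothetical reconstruction of helpers.py, inferred from the call sites.
import csv

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_vectorizer(**kwargs):
    # Forward options such as stop_words, min_df, ngram_range, vocabulary.
    return CountVectorizer(**kwargs)


def get_train_data(path, vectorizer, pred_pos=0, text_pos=1,
                   tf_idf=False, remove_header=False):
    # Read (label, text) columns from a CSV and vectorize the text column.
    labels, texts = [], []
    with open(path, newline='') as f:
        reader = csv.reader(f)
        if remove_header:
            next(reader)
        for row in reader:
            labels.append(row[pred_pos])
            texts.append(row[text_pos])
    matrix = vectorizer.fit_transform(texts)
    if tf_idf:
        matrix = TfidfTransformer().fit_transform(matrix)
    return labels, matrix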
Example #3
# Assumed imports added for this snippet, inferred from the names used below.
import argparse

import matplotlib.pyplot as plt
import numpy as np

import helpers

# sklearn.grid_search was removed in modern scikit-learn; use model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Block: setting up arguments
ap = argparse.ArgumentParser()
ap.add_argument('--seed', help='random seed', default=11, type=int)
ap.add_argument('--summary', help='show data summaries', action='store_true')
ap.add_argument('--tuning', help='do algorithm tuning', action='store_true')
args = vars(ap.parse_args())
seed = args['seed']

# Block: seed the np random number generator
np.random.seed(seed)

# Block: get the data
df, Xtrn, Xval, Ytrn, Yval = helpers.get_train_data()

# Block: High level summaries of the data
if args['summary']:
    print(df.head(10))
    print(df.describe())
    print(df.dtypes)
    print(df.groupby('survived').size())
    print(df.shape)
    # Histogram to show distribution
    df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
    plt.savefig('tmp-histograms.png')
    # Density plot to show distribution
    df.plot(kind='density', subplots=True, layout=(3, 4),
            # (source cut off here; closing argument and savefig assumed,
            # mirroring the histogram block above)
            sharex=False)
    plt.savefig('tmp-density.png')
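
The --tuning branch that would use the imported GridSearchCV is cut off above. A minimal sketch of how that step could look; the RandomForestClassifier and the parameter grid are placeholders, not the original author's choices:

# Hypothetical tuning step; the estimator and grid are illustrative only.
from sklearn.ensemble import RandomForestClassifier

if args['tuning']:
    param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, None]}
    grid = GridSearchCV(RandomForestClassifier(random_state=seed),
                        param_grid, cv=5)
    grid.fit(Xtrn, Ytrn)
    print(grid.best_params_)
    print(classification_report(Yval, grid.predict(Xval)))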
Example #4
# Assumed imports and setup for this snippet; they mirror Example #2.
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
import helpers as helper

stopwordlist = stopwords.words('english')

if True:
    clf = MLPClassifier(verbose=True,
                        tol=0.001,
                        learning_rate="adaptive",
                        max_iter=10,
                        early_stopping=True,
                        alpha=0.0001,
                        hidden_layer_sizes=(5, 5, 10, 5),
                        random_state=1)

    train_y, train_x = helper.get_train_data('../../Data/train_prep.csv',
                                             vectorizer=helper.get_vectorizer(
                                                 stop_words=stopwordlist,
                                                 min_df=3,
                                                 ngram_range=(2, 2)),
                                             pred_pos=0,
                                             text_pos=1,
                                             tf_idf=True,
                                             remove_header=False)
    ids, test_x = helper.get_train_data('../../Data/test_prep.csv',
                                        vectorizer=helper.get_vectorizer(
                                            stop_words=stopwordlist,
                                            min_df=3,
                                            ngram_range=(2, 2)),
                                        tf_idf=True,
                                        remove_header=True)

    print "LOW + MIN3 + BIG + TFIDF"
    #x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=0)
    clf.fit(train_x, train_y)
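    # (Snippet ends after the fit. A hypothetical continuation: predict on the
    # test matrix and pair predictions with the ids; the file name and header
    # row are assumptions. Note that the test vectorizer above is fit
    # independently of the training one, so the feature spaces may not line
    # up; sharing a single fitted vectorizer would be the safer design.)
    import csv

    test_pred = clf.predict(test_x)
    with open('submission.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['id', 'prediction'])
        writer.writerows(zip(ids, test_pred))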
Example #5
print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import helpers as helper

target, data = helper.get_train_data('../../Data/train_prep.csv',
                                     vectorizer=helper.get_vectorizer(min_df=3),
                                     pred_pos=0,
                                     text_pos=1,
                                     tf_idf=True,
                                     remove_header=False)


n_digits = len(np.unique(target))
labels = target

sample_size = 300

print(79 * '_')
print('% 9s' % 'init'
      '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')


def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    # (tail restored from the upstream scikit-learn k-means digits demo that
    # this snippet follows; the columns match the header printed above)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
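
The upstream demo then pushes a few initializations through bench_k_means. A sketch of those calls, trimmed to the initializations that accept the sparse tf-idf matrix used here (the demo's PCA-seeded variant would need dense input):

# Benchmark two initialization strategies, as in the upstream demo.
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name='k-means++', data=data)
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name='random', data=data)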
Example #6
# Assumed imports for this snippet, inferred from the names used below;
# train_path, test_path and the per-class word lists in voc come from the
# snippet's missing head.
import csv
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import helpers as helper

if True:
    vocab = {}  # words seen in classes '1'-'4' but never in '5' (assumed setup)
    count = 0
    for c in ['1', '2', '3', '4']:  # assumed outer loop over class vocabularies
        for x in voc[c]:
            if count % 1000 == 0:
                print(count)
            if x not in voc['5'] and x not in vocab:
                vocab[x] = 1
            count += 1

    vocabulario = []
    with open(train_path + 'voc--.csv', 'w', newline='') as csvo:
        writero = csv.writer(csvo)
        for x in vocab:
            vocabulario.append(x)
            writero.writerow([x])
if True:
    print("Obtaining the data")
    x_target, x_data = helper.get_train_data(
        train_path + '.csv',
        vectorizer=helper.get_vectorizer(vocabulary=vocabulario, min_df=3),
        pred_pos=0, text_pos=1, tf_idf=True, remove_header=False)
    y_target, y_data = helper.get_train_data(
        test_path + '.csv',
        vectorizer=helper.get_vectorizer(vocabulary=vocabulario, min_df=3),
        pred_pos=0, text_pos=1, tf_idf=True, remove_header=False)
    print("LOW + MIN3 + BIG + TFIDF")
    clf = MultinomialNB(alpha=0.01)
    clf.fit(x_data, x_target)
    y_predicted = clf.predict(y_data)
    print(confusion_matrix(y_target, y_predicted))
    print(accuracy_score(y_target, y_predicted))
    
if False:
    count = 0
    with open('../../Data/train_prep_voc1.csv', 'r') as csv4:
        with open('../../Data/train_prep_voc5.csv', 'r') as csv5:
            with open('../../Data/train_prep_voc1-5.csv', 'w', newline='') as csvo:
                reader4 = csv.reader(csv4)
                reader5 = csv.reader(csv5)
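                # (Snippet is cut off here. A hypothetical completion,
                # mirroring the vocabulary logic above: keep the voc1 words
                # that never occur in voc5. One word per row is assumed.)
                writero = csv.writer(csvo)
                words5 = {row[0] for row in reader5 if row}
                for row in reader4:
                    if row and row[0] not in words5:
                        writero.writerow(row)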