def worker(X_train, y_train, X_test, y_test, pm, rs):
    p = PreProcessor(stop_words=pm[0], tf=pm[1], idf=pm[2], scale=pm[3])
    print 'Test', p.get_name()

    m = SGDClassifier()
    m.fit(p.fit_training(X_train), y_train)
    y_pred = m.predict(p.fit_test(X_test))

    f1 = metrics.f1_score(y_test, y_pred)
    rs[float(f1)] = p.get_name()
Ejemplo n.º 2
0
        tf = bool(row[3])
        idf = bool(row[4])
        scale = bool(row[5])
        params[category] = [stop_words, tf, idf, scale]


with open(RESULT, 'w') as out:
    rows = csv.writer(out)
    for category in categories:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):

            X_train, y_train = load(source, category)
            pms = params[category]
            p = PreProcessor(stop_words=pms[0],
                             tf=pms[1],
                             idf=pms[2],
                             scale=pms[3])

            X_train = p.fit_training(X_train)

            print '.fit()'
            m = SGDClassifier()
            m.fit(X_train, y_train)

            print '.predict()'
            for test_id, text in test_entries():
                text = p.fit_test(np.array([text]))
                result = [category, test_id, m.predict(text)[0]]
                rows.writerow(result)