#!/usr/bin/env python
# First prediction, using sklearn's RandomForestClassifier
import lib.loader as ld
import sklearn.ensemble as ens


def main():
    """Train a 20-tree entropy random forest and write test predictions."""
    features, labels = ld.loadtrain('data/trainingdata.txt')
    test_features = ld.loadtest('data/testingdata.txt')
    forest = ens.RandomForestClassifier(criterion='entropy', n_estimators=20)
    forest.fit(features, labels)
    ld.write('predictions/Prediction1.txt', forest.predict(test_features))


if __name__ == '__main__':
    main()

# Score: 0.52515
Beispiel #2
0
                                           n_jobs=n_jobs))
            # 0.0420 + 0.16 - 0.04
        elif rand == 5:
            numChange = comp.append(
                lm.SGDClassifier(loss='hinge',
                                 alpha=0.000631 + 0.01 * r.random()))
            # 0.000631 + 0.01
        elif rand == 6:
            numChange = comp.append(
                svm.LinearSVC(dual=False, C=0.01 + 0.1 * r.random()))
            # 0.01 + 0.1
        elif rand == 7:
            numChange = comp.append(
                ens.RandomForestClassifier(n_estimators=100,
                                           max_features=0.120 +
                                           0.4 * r.random(),
                                           min_samples_split=7,
                                           n_jobs=n_jobs))
            # 0.32 +- 0.2
        elif rand == 8:
            numChange = comp.append(
                svm.LinearSVC(dual=False, C=100 + 900 * r.random()))
            # 1000 - 900
        if numChange == 0:
            num_consec += 1
        else:
            num_consec = 0
        print('Number CV Changes: ' + str(numChange))
        if len(comp.clfs) % save_interval == 0:  # don't save every time
            ld.write('predictions/Prediction7.txt', comp.predict())
                # 0.7 +- 0.3
        elif rand == 4:
            numChange = comp.append(ens.RandomForestClassifier(n_estimators=450,
                max_features=0.0020 + 0.2 * r.random(), min_samples_split=7,
                n_jobs=n_jobs))
                # 0.0420 + 0.16 - 0.04
        elif rand == 5:
            numChange = comp.append(lm.SGDClassifier(loss='hinge',
                alpha=0.000631 + 0.01 * r.random()))
                # 0.000631 + 0.01
        elif rand == 6:
            numChange = comp.append(svm.LinearSVC(dual=False, C=0.01 + 0.1 *
                r.random()))
                # 0.01 + 0.1
        elif rand == 7:
            numChange = comp.append(ens.RandomForestClassifier(n_estimators=100,
                max_features=0.120 + 0.4 * r.random(), min_samples_split=7,
                n_jobs=n_jobs))
                # 0.32 +- 0.2
        elif rand == 8:
            numChange = comp.append(svm.LinearSVC(dual=False, C=100 + 900 *
                r.random()))
                # 1000 - 900
        if numChange == 0:
            num_consec += 1
        else:
            num_consec = 0
        print('Number CV Changes: ' + str(numChange))
        if len(comp.clfs) % save_interval == 0: # don't save every time
            ld.write('predictions/Prediction7.txt', comp.predict())
#!/usr/bin/env python
# prediction 3 with more features. Totally guessing by now

import numpy as np
import lib.loader as ld
import sklearn.ensemble as ens
import sklearn.feature_extraction.text as tfidf

if __name__ == '__main__':
    trainx, trainy = ld.loadtrain('data/trainingdata.txt')
    testx = ld.loadtest('data/testingdata.txt')

    # Fit the tf-idf transform on the training data only, then apply the SAME
    # fitted transform to the test data.  The original fit a second,
    # independent transformer on the test set, so train and test features were
    # scaled with different IDF weights (train/test inconsistency + leakage).
    transformer = tfidf.TfidfTransformer().fit(trainx)
    trainx2 = transformer.transform(trainx)
    testx2 = transformer.transform(testx)

    # n_jobs=-1 uses all cores.  The original constructed (and immediately
    # discarded) a throwaway RandomForestClassifier(n_jobs=-1), which had no
    # effect on the model actually trained below.
    clf = ens.RandomForestClassifier(max_features=0.38, criterion='entropy',
                                     n_estimators=5000, min_samples_split=7,
                                     n_jobs=-1)
    clf.fit(trainx2, trainy)
    ld.write('predictions/Prediction5.txt', clf.predict(testx2))

# score=0.51
Beispiel #5
0
#!/usr/bin/env python
# prediction 3, with gridsearch'd parameters

import lib.loader as ld
import sklearn.ensemble as ens
import sklearn.feature_extraction.text as tfidf

if __name__ == '__main__':
    trainx, trainy = ld.loadtrain('data/trainingdata.txt')
    testx = ld.loadtest('data/testingdata.txt')

    # Fit the tf-idf transform on the training data only, then apply the SAME
    # fitted transform to the test data.  The original fit a second,
    # independent transformer on the test set, so train and test features were
    # scaled with different IDF weights (train/test inconsistency + leakage).
    transformer = tfidf.TfidfTransformer().fit(trainx)
    trainx2 = transformer.transform(trainx)
    testx2 = transformer.transform(testx)

    # n_jobs=-1 uses all cores.  The original constructed (and immediately
    # discarded) a throwaway RandomForestClassifier(n_jobs=-1), which had no
    # effect on the model actually trained below.
    clf = ens.RandomForestClassifier(max_features=0.0420,
                                     criterion='entropy',
                                     n_estimators=5000,
                                     min_samples_split=7,
                                     n_jobs=-1)
    clf.fit(trainx2, trainy)
    ld.write('predictions/Prediction3.txt', clf.predict(testx2))

# score 0.51