def main(neval=30,
         nfolds=5,
         ncvjobs=1,
         nreps=5,
         kbest=None,
         ngram_hi=3,
         jobs=1,
         seed=1,
         *event_sel):
    print(locals())
    #return
    #event_sel=[70, 71]
    #nreps=1
    df = read_train()
    X, y = preprocess(df, event_sel=event_sel, ngrams=(2, ngram_hi))

    best = evaluate_hyper(
        X,
        y,
        hyper_objective,
        neval=neval,
        nfolds=nfolds,
        ncvjobs=ncvjobs,
        nreps=nreps,
        nbest=kbest,
        njobs=jobs,
        seed=seed,
    )

    print('Final best: {}'.format(best))

    return
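

# NOTE: `evaluate_hyper` and `hyper_objective` are project helpers that are not
# part of this listing. The sketch below is only an assumption about their
# contract (names and parameters here are hypothetical): the objective maps one
# sampled hyperparameter setting to a cross-validated score for the search
# driver to maximise.
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC


def hyper_objective_sketch(params, X, y, nfolds=5, ncvjobs=1, seed=1):
    # `params` is assumed to hold the sampled values, e.g. {'C': 0.01}.
    clf = LinearSVC(C=params['C'], dual=False, tol=1e-3, random_state=seed)
    return cross_val_score(clf, X, y, cv=nfolds, n_jobs=ncvjobs).mean()
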
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.linear_model import Perceptron
    clf = Perceptron(max_iter=50, tol=1e-3, random_state=1)

    return benchmark(clf, X_train, y_train)
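

# NOTE: `benchmark` is defined elsewhere in the project. A minimal sketch of the
# assumed behaviour (cross-validated accuracy plus timing); the real helper may
# report different metrics or use a train/validation split instead.
from time import time
from sklearn.model_selection import cross_val_score


def benchmark_sketch(clf, X, y, cv=5):
    t0 = time()
    scores = cross_val_score(clf, X, y, cv=cv)
    print('{}: accuracy {:.4f} +/- {:.4f} ({:.1f}s)'.format(
        clf.__class__.__name__, scores.mean(), scores.std(), time() - t0))
    return scores.mean()
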
Example #3
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.naive_bayes import BernoulliNB, MultinomialNB
    clf = BernoulliNB(alpha=.01)

    return benchmark(clf, X_train, y_train)
Example #4
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=10)

    return benchmark(clf, X_train, y_train)
Example #5
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.linear_model import RidgeClassifier
    clf = RidgeClassifier(tol=1e-2, solver="sag", random_state=1)

    return benchmark(clf, X_train, y_train)
Example #6
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid()

    return benchmark(clf, X_train, y_train)
Example #7
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=event_sel)
    #    X_train, y_train = preprocess(df, event_sel=[31, 78])
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42, 55, 11])
    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])

    from lightgbm import LGBMClassifier
    clf = LGBMClassifier(verbose=1, random_state=1, silent=0, n_estimators=400)

    return benchmark(clf, X_train.astype(float), y_train)
Example #8
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])
    X_train, y_train = preprocess(df, event_sel=event_sel)

    from sklearn.linear_model import SGDClassifier
    clf = SGDClassifier(alpha=.0001,
                        max_iter=50,
                        tol=1e-3,
                        penalty='l2',
                        random_state=1)

    return benchmark(clf, X_train, y_train)
Example #9
def main(event_sel=None):
    df = read_train()

    X_train, y_train = preprocess(df, event_sel=event_sel)
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42, 55, 11])
    #    X_train, y_train = preprocess(df, event_sel=[62, 63, 60])

    from sklearn.svm import LinearSVC
    clf = LinearSVC(loss='squared_hinge',
                    penalty='l2',
                    dual=False,
                    tol=1e-3,
                    verbose=0,
                    random_state=1)

    return benchmark(clf, X_train, y_train)
Example #10
def main(event_sel=None):
    df = read_train()

    #    X_train, y_train = preprocess(df, event_sel=event_sel)
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42])
    #    X_train, y_train = preprocess(df, event_sel=[71, 62, 42, 55, 11])
    X_train, y_train = preprocess(df, event_sel=[62, 63, 60], ngrams=(2, 4))

    print('Extracting best features by a chi-squared test')
    from sklearn.feature_selection import SelectKBest, chi2
    ch2 = SelectKBest(chi2, k=12000)
    X_train = ch2.fit_transform(X_train, y_train)
    print('Extracting done, {}'.format(X_train.shape))

    from sklearn.svm import LinearSVC
    clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3, verbose=0, random_state=1)

    return benchmark(clf, X_train, y_train)
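
# If the indices of the chi2-selected features are needed later (for inspection,
# or to apply the same reduction to a test matrix), SelectKBest exposes them.
# Shown only as an illustration; `X_test` below is hypothetical and the original
# vectorizer lives inside `preprocess`:
#
#     selected = ch2.get_support(indices=True)  # column indices of the kept features
#     X_test_reduced = ch2.transform(X_test)    # same 12000 columns, applied to test data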
Example #11
def main(
    kbest=46000,
    ngram_hi=3,
    jobs=1,
    seed=1,
):

    print(locals())

    df_tr = read_train()
    df_te = read_test()

    # Pipeline examples:
    # https://github.com/sfu-natlang/typed-align/blob/3fdab03765524a93831b31c7f21f3e8a11dfe54d/srcs/LogisticRegression.py
    # https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html

    # These imports are assumed to live at module level in the original script;
    # repeated here so the example reads self-contained.
    import numpy as np
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.svm import LinearSVC

    clf = Pipeline([
        (
            'union',
            FeatureUnion([

                # vectorized text
                ('vect',
                 Pipeline([
                     ('selector', ItemSelector(key='text')),
                     ('whitespace_remover',
                      FunctionTransformer(
                          lambda col: col.apply(lambda s: ''.join(s.split())),
                          validate=False)),
                     ('digits_remover',
                      FunctionTransformer(lambda col: col.apply(
                          lambda s: ''.join(i for i in s if not i.isdigit())),
                                          validate=False)),
                     ('vec',
                      CountVectorizer(binary=True,
                                      ngram_range=(2, ngram_hi),
                                      analyzer='char',
                                      dtype=np.uint8)),
                 ])),

                # sex
                ('sex',
                 Pipeline([
                     ('selector', ItemSelector(key='sex')),
                     ('less_1',
                      FunctionTransformer(lambda col: col.values[:, None] - 1,
                                          validate=False)),
                 ])),
            ])),
        ('kbest', SelectKBest(chi2, k=kbest)),
        ('classifier',
         LinearSVC(loss='squared_hinge',
                   penalty='l2',
                   dual=False,
                   tol=1e-4,
                   verbose=0,
                   random_state=1,
                   C=0.007283075704523443))
    ])

    #    foo = clf.fit_transform(df_tr, df_tr.event)
    clf.fit(df_tr, df_tr.event)
    #    foo = clf.transform(df_te)
    yhat = clf.predict(df_te)
    #    print(foo.shape)
    #    print(foo.dtype)
    #    print(foo)

    df_te['event'] = yhat

    df_te.to_csv('solution.csv', index=False)
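

# NOTE: `ItemSelector` is not defined in this listing. A minimal sketch, assuming
# it follows the scikit-learn hetero_feature_union example linked above: it picks
# one column of the incoming DataFrame so the downstream transformers receive a
# single Series.
from sklearn.base import BaseEstimator, TransformerMixin


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Return the selected column (a pandas Series) for the next step.
        return X[self.key]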