def main(neval=30, nfolds=5, ncvjobs=1, nreps=5, kbest=None, ngram_hi=3,
         jobs=1, seed=1, *event_sel):
    """Run a hyperparameter search over the preprocessed training data.

    Extra positional arguments are collected into ``event_sel`` and passed
    through to preprocess().  All keyword parameters are forwarded to
    evaluate_hyper().
    """
    print(locals())
    frame = read_train()
    X, y = preprocess(frame, event_sel=event_sel, ngrams=(2, ngram_hi))
    best = evaluate_hyper(
        X,
        y,
        hyper_objective,
        neval=neval,
        nfolds=nfolds,
        ncvjobs=ncvjobs,
        nreps=nreps,
        nbest=kbest,
        njobs=jobs,
        seed=seed,
    )
    print('Final best: {}'.format(best))
    return
def main(event_sel=None):
    """Benchmark a Perceptron classifier on the preprocessed training set."""
    from sklearn.linear_model import Perceptron

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = Perceptron(max_iter=50, tol=1e-3, random_state=1)
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a Bernoulli naive-Bayes classifier on the training data.

    Args:
        event_sel: optional event selection forwarded to preprocess().

    Returns:
        The result of benchmark() for the fitted classifier.
    """
    # Fixed: MultinomialNB was imported alongside BernoulliNB but never
    # used; import only the class this variant actually benchmarks.
    from sklearn.naive_bayes import BernoulliNB

    df = read_train()
    X_train, y_train = preprocess(df, event_sel=event_sel)
    clf = BernoulliNB(alpha=.01)
    return benchmark(clf, X_train, y_train)
def main(event_sel=None):
    """Benchmark a 10-nearest-neighbours classifier on the training data."""
    from sklearn.neighbors import KNeighborsClassifier

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = KNeighborsClassifier(n_neighbors=10)
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a ridge classifier (SAG solver) on the training data."""
    from sklearn.linear_model import RidgeClassifier

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = RidgeClassifier(tol=1e-2, solver="sag", random_state=1)
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a nearest-centroid classifier on the training data."""
    from sklearn.neighbors import NearestCentroid

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = NearestCentroid()
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a LightGBM classifier on a fixed event subset.

    NOTE(review): the ``event_sel`` parameter is accepted but ignored;
    the event subset [62, 63, 60] is hard-coded below -- confirm this is
    intentional before reusing.
    """
    from lightgbm import LGBMClassifier

    frame = read_train()
    features, labels = preprocess(frame, event_sel=[62, 63, 60])
    model = LGBMClassifier(verbose=1, random_state=1, silent=0,
                           n_estimators=400)
    # LightGBM needs a dense/float-compatible matrix here.
    return benchmark(model, features.astype(float), labels)
def main(event_sel=None):
    """Benchmark an L2-penalised SGD linear classifier on the training data."""
    from sklearn.linear_model import SGDClassifier

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = SGDClassifier(alpha=.0001, max_iter=50, tol=1e-3,
                          penalty='l2', random_state=1)
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a linear SVM (squared hinge, primal) on the training data."""
    from sklearn.svm import LinearSVC

    frame = read_train()
    features, labels = preprocess(frame, event_sel=event_sel)
    model = LinearSVC(loss='squared_hinge', penalty='l2', dual=False,
                      tol=1e-3, verbose=0, random_state=1)
    return benchmark(model, features, labels)
def main(event_sel=None):
    """Benchmark a linear SVM on chi2-selected character n-gram features.

    NOTE(review): the ``event_sel`` parameter is accepted but ignored;
    the event subset [62, 63, 60] and ngrams=(2, 4) are hard-coded below.
    """
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.svm import LinearSVC

    frame = read_train()
    features, labels = preprocess(frame, event_sel=[62, 63, 60],
                                  ngrams=(2, 4))

    print('Extracting best features by a chi-squared test')
    selector = SelectKBest(chi2, k=12000)
    features = selector.fit_transform(features, labels)
    print('Extracting done, {}'.format(features.shape))

    model = LinearSVC(loss='squared_hinge', penalty='l2', dual=False,
                      tol=1e-3, verbose=0, random_state=1)
    return benchmark(model, features, labels)
def main(
        kbest=46000,
        ngram_hi=3,
        jobs=1,
        seed=1,
):
    """Fit the final text pipeline and write test predictions to solution.csv.

    Builds a FeatureUnion of character n-gram counts and the sex column,
    selects the ``kbest`` features by chi2, fits a tuned LinearSVC on the
    training set, and writes predictions for the test set.
    """
    print(locals())
    train_df = read_train()
    test_df = read_test()

    # Example pipelines:
    # https://github.com/sfu-natlang/typed-align/blob/3fdab03765524a93831b31c7f21f3e8a11dfe54d/srcs/LogisticRegression.py
    # https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html

    # Branch 1: binary character n-gram counts of the text column with
    # all whitespace and digits stripped out first.
    text_branch = Pipeline([
        ('selector', ItemSelector(key='text')),
        ('whitespace_remover', FunctionTransformer(
            lambda col: col.apply(lambda s: ''.join(s.split())),
            validate=False)),
        ('digits_remover', FunctionTransformer(
            lambda col: col.apply(
                lambda s: ''.join(i for i in s if not i.isdigit())),
            validate=False)),
        ('vec', CountVectorizer(binary=True, ngram_range=(2, ngram_hi),
                                analyzer='char', dtype=np.uint8)),
    ])

    # Branch 2: the sex column shifted down by one as a single feature
    # column.  NOTE(review): assumes sex is coded starting at 1 -- confirm
    # against read_train().
    sex_branch = Pipeline([
        ('selector', ItemSelector(key='sex')),
        ('less_1', FunctionTransformer(
            lambda col: col.values[:, None] - 1,
            validate=False)),
    ])

    clf = Pipeline([
        ('union', FeatureUnion([
            ('vect', text_branch),
            ('sex', sex_branch),
        ])),
        ('kbest', SelectKBest(chi2, k=kbest)),
        ('classifier', LinearSVC(loss='squared_hinge', penalty='l2',
                                 dual=False, tol=1e-4, verbose=0,
                                 random_state=1, C=0.007283075704523443)),
    ])

    clf.fit(train_df, train_df.event)
    yhat = clf.predict(test_df)
    test_df['event'] = yhat
    test_df.to_csv('solution.csv', index=False)