Beispiel #1
0
    # CLI option: directory where pickled artifacts are written.
    parser.add_argument('-o',
                        '--output_dir',
                        default="models/",
                        help='Path to the output folder')

    args = parser.parse_args()

    # --w2v: load GloVe embeddings and dump the full word->vector dict.
    # NOTE(review): Glove.load() / get_dict() are project helpers; assumed to
    # return a plain dict keyed by token — confirm against their definitions.
    if args.w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
        print('w2v dict size:', len(w2v))
        with open(os.path.join(args.output_dir, 'w2v_full.pkl'), 'wb') as f:
            pickle.dump(w2v, f, protocol=pickle.HIGHEST_PROTOCOL)

    # --index: fit a Keras-style Tokenizer on train+test text and pickle
    # its word_index (token -> integer id) mapping.
    if args.index:
        X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
        tokenizer = Tokenizer(num_words=MAX_FEATURES)
        # Fit on the union of train and test so the index covers all tokens.
        X_data = pd.concat((X_train, X_test), ignore_index=True)

        tokenizer.fit_on_texts(X_data)
        print("Word index dict size:", len(tokenizer.word_index))
        outfile = os.path.join(args.output_dir, 'word_index_full.pkl')
        # NOTE(review): this "...wrote to" message prints before the file is
        # actually written below.
        print("...wrote to", outfile)
        with open(outfile, 'wb') as f:
            pickle.dump(tokenizer.word_index,
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Fallback: when --w2v was not requested, reuse a previously dumped
        # embedding dict from a hard-coded path instead of rebuilding it.
        if not args.w2v:
            W2V_DICT_PATH = './models/w2v.pkl'
            with open(W2V_DICT_PATH, 'rb') as f:
                # (body truncated in this excerpt — continues beyond view)
Beispiel #2
0
from database.utils import get_train_test_data
from pipelines.feature_extractor import get_feature_extractor
from pipelines.models import get_ensemble_model
from feature_extraction.features import get_glove_w2v
from evaluation.metrics import class_report

from copy import deepcopy
import numpy as np
import pandas as pd
import pprint
import time

if __name__ == '__main__':
    # Build the shared resources for the ensemble experiment:
    # embeddings, data split, feature pipeline, and the ensemble wrapper.
    # NOTE(review): return types of these project helpers are not visible
    # here — confirm shapes/contracts against their modules.
    w2v = get_glove_w2v()
    train_test_data = get_train_test_data()
    feature_extractor = get_feature_extractor(w2v)
    ensemble = get_ensemble_model(w2v)

    # Candidate base estimators as (name, estimator) pairs, the format
    # expected by sklearn ensemble wrappers such as VotingClassifier.
    # NOTE(review): LogisticRegression, BernoulliNB, RandomForestClassifier,
    # XGBClassifier, ExtraTreesClassifier are not in the imports visible in
    # this excerpt — verify they are imported elsewhere in the file.
    models = [
        ("lr",
         LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', n_jobs=-1)),
        ("nb", BernoulliNB(alpha=5.0)),
        ("rf",
         RandomForestClassifier(n_estimators=300,
                                max_depth=10,
                                min_samples_split=5,
                                n_jobs=-1)),
        ("xgb", XGBClassifier(n_estimators=300, max_depth=8, n_jobs=-1)),
        ("et",
         ExtraTreesClassifier(n_estimators=300,