def get_ensemble_model(w2v=None):
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    n_jobs = -1
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        # ('normalize', Normalizer(norm='l2')),
        # Predicted-probability features from several base classifiers
        ('proba', ProbExtractor([
            RandomForestClassifier(n_estimators=300, max_depth=10,
                                   min_samples_split=5, n_jobs=n_jobs),
            # ExtraTreesClassifier(n_estimators=300, max_depth=10,
            #                      min_samples_split=10, n_jobs=n_jobs),
            XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
            LogisticRegression(C=0.1, solver='lbfgs', penalty='l2',
                               n_jobs=n_jobs),
            BernoulliNB(alpha=5.0)
        ])),
        # Pairwise interactions between the base-model probabilities
        ('polynomial', PolynomialFeatures(degree=2)),
        # Meta-learner. NOTE: `params` (the LR hyperparameter grid) is not
        # defined in this function and must exist at module scope.
        ('logistic_regression',
         GridSearchCV(LogisticRegression(penalty='l2', random_state=42),
                      param_grid=params))
    ])
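# Usage sketch (hypothetical): `X_train`, `y_train`, `X_test` and the
# `params` grid are assumptions, not defined in this module.
#
#   params = {'C': [0.01, 0.1, 1.0]}
#   model = get_ensemble_model()
#   model.fit(X_train, y_train)
#   y_proba = model.predict_proba(X_test)[:, 1]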
def get_feature_extractor(w2v=None):
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif))
    ])
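# Usage sketch (X_train/y_train assumed): the extractor alone can be fit
# to materialize the selected feature matrix, e.g. for a model outside
# sklearn.
#
#   fx = get_feature_extractor()
#   X_feats = fx.fit_transform(X_train, y_train)  # y is needed by SelectFpr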
def get_features(w2v=None):
    # Word-level TF-IDF over 1-4 grams
    tfidf_words = TfidfVectorizer(ngram_range=(1, 4), max_features=5000,
                                  lowercase=True, tokenizer=tokenize,
                                  stop_words='english', min_df=3, max_df=0.9,
                                  strip_accents='unicode', use_idf=True,
                                  norm='l2', sublinear_tf=True)
    # Character-level TF-IDF over 1-4 grams
    tfidf_chars = TfidfVectorizer(ngram_range=(1, 4), max_features=5000,
                                  lowercase=False, analyzer='char',
                                  min_df=3, max_df=0.9, use_idf=True,
                                  norm='l2', sublinear_tf=True)
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    return FeatureUnion([
        # Average length of word in a sentence
        ('avg_word_len', AverageWordLengthExtractor()),
        # Number of words
        ('num_words', NumWordExtractor()),
        # Number of characters in a sentence
        ('num_chars', CharLengthExtractor()),
        # Number of unique words used
        ('num_unique', NumUniqueWordExtractor()),
        # Naive Bayes TF-IDF features
        ('tfidf_nbf', Pipeline([
            ('wc_tfidf', FeatureUnion([
                # TF-IDF over tokens
                ('tfidf_token_ngrams', tfidf_words),
                # TF-IDF over characters
                ('tfidf_token_chars', tfidf_chars)
            ])),
            ('nbf', NBFeaturer(alpha=10))
        ])),
        # Averaged word embedding, weighted by tfidf
        ('w2v', TfidfEmbeddingVectorizer(w2v))
        # Averaged word embedding
        # ('w2v', MeanEmbeddingVectorizer(w2v))
    ])
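# The custom extractors above follow the standard sklearn transformer
# protocol. A minimal sketch of what one might look like (assumed; the
# real implementations live elsewhere in this repo):
#
#   import numpy as np
#   from sklearn.base import BaseEstimator, TransformerMixin
#
#   class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
#       def fit(self, X, y=None):
#           return self  # stateless: nothing to learn
#
#       def transform(self, X):
#           # one column: mean token length per document
#           return [[np.mean([len(w) for w in doc.split()] or [0])]
#                   for doc in X]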
def get_basic_model(model, w2v=None):
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        # ('normalize', StandardScaler(with_mean=False)),
        # ('normalize', MaxAbsScaler()),
        ('model', model)
    ])
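# Usage sketch (hypothetical choice of estimator): any sklearn-compatible
# classifier can be dropped into the shared feature pipeline.
#
#   from sklearn.svm import LinearSVC
#   clf = get_basic_model(LinearSVC(C=1.0))
#   clf.fit(X_train, y_train)
#   preds = clf.predict(X_test)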
                        help='Save word index dictionary')
    parser.add_argument('-t', '--tfidf', action='store_true',
                        help='Save TfidfVectorizer object')
    parser.add_argument('-o', '--output_dir', default="models/",
                        help='Path to the output folder')
    args = parser.parse_args()

    if args.w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
        print('w2v dict size:', len(w2v))
        with open(os.path.join(args.output_dir, 'w2v_full.pkl'), 'wb') as f:
            pickle.dump(w2v, f, protocol=pickle.HIGHEST_PROTOCOL)

    if args.index:
        X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
        tokenizer = Tokenizer(num_words=MAX_FEATURES)
        X_data = pd.concat((X_train, X_test), ignore_index=True)
        tokenizer.fit_on_texts(X_data)
        print("Word index dict size:", len(tokenizer.word_index))
        outfile = os.path.join(args.output_dir, 'word_index_full.pkl')
        with open(outfile, 'wb') as f:
            pickle.dump(tokenizer.word_index, f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        print("...wrote to", outfile)
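# Invocation sketch (script name assumed; long-flag names inferred from the
# argparse dests used above):
#   python prepare.py --w2v --index --output_dir models/
# writes w2v_full.pkl and word_index_full.pkl into models/.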