def get_ensemble_model(w2v=None):
    """Full stacking ensemble: features -> base classifiers' probabilities -> logistic meta-learner."""
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    n_jobs = -1
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        #('normalize', Normalizer(norm='l2')),
        ('proba', ProbExtractor([
            RandomForestClassifier(n_estimators=300, max_depth=10,
                                   min_samples_split=5, n_jobs=n_jobs),
            # ExtraTreesClassifier(n_estimators=300, max_depth=10,
            #                      min_samples_split=10, n_jobs=n_jobs),
            XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
            LogisticRegression(C=0.1, solver='lbfgs', penalty='l2',
                               n_jobs=n_jobs),
            BernoulliNB(alpha=5.0)
        ])),
        ('polynomial', PolynomialFeatures(degree=2)),
        # `params` is the parameter grid for the meta-learner, defined elsewhere in this module
        ('logistic_regression',
         GridSearchCV(LogisticRegression(penalty='l2', random_state=42),
                      param_grid=params))
    ])
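# `ProbExtractor` is defined elsewhere in this repo and is not shown here. The sketch
# below is a hypothetical illustration (named ProbExtractorSketch so it does not clash
# with the real class), assuming the step simply fits each base classifier and stacks
# their positive-class probabilities as features for the polynomial + logistic-regression
# meta-learner; the actual implementation may differ (e.g. use out-of-fold predictions).
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone


class ProbExtractorSketch(BaseEstimator, TransformerMixin):
    """Stack predict_proba outputs of several classifiers into one feature matrix."""

    def __init__(self, classifiers):
        self.classifiers = classifiers

    def fit(self, X, y=None):
        # Fit every base classifier on the same (selected) feature matrix.
        self.fitted_ = [clone(clf).fit(X, y) for clf in self.classifiers]
        return self

    def transform(self, X):
        # One column per base classifier: estimated P(class == 1 | x).
        probas = [clf.predict_proba(X)[:, 1] for clf in self.fitted_]
        return np.column_stack(probas)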
def get_feature_extractor(w2v=None):
    """Feature extraction and selection steps only, without a classifier."""
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif))
    ])
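# Example usage (a sketch; assumes `X_train`/`y_train` are the raw tweet texts and
# binary labels returned by `get_train_test_data`, as in the training script below):
#
#     extractor = get_feature_extractor()
#     X_feats = extractor.fit_transform(X_train, y_train)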
def get_features(w2v=None):
    """Build the FeatureUnion of hand-crafted stats, TF-IDF/NB features and word embeddings."""
    tfidf_words = TfidfVectorizer(ngram_range=(1, 4), max_features=5000,
                                  lowercase=True, tokenizer=tokenize,
                                  stop_words='english', min_df=3, max_df=0.9,
                                  strip_accents='unicode', use_idf=True,
                                  norm='l2', sublinear_tf=True)
    tfidf_chars = TfidfVectorizer(ngram_range=(1, 4), max_features=5000,
                                  lowercase=False, analyzer='char',
                                  min_df=3, max_df=0.9, use_idf=True,
                                  norm='l2', sublinear_tf=True)
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    return FeatureUnion([
        # Average length of word in a sentence
        ('avg_word_len', AverageWordLengthExtractor()),
        # Number of words
        ('num_words', NumWordExtractor()),
        # Number of characters in a sentence
        ('num_chars', CharLengthExtractor()),
        # Number of unique words used
        ('num_unique', NumUniqueWordExtractor()),
        # Naive Bayes tf-idf features
        ('tfidf_nbf', Pipeline([
            ('wc_tfidf', FeatureUnion([
                # TF-IDF over tokens
                ('tfidf_token_ngrams', tfidf_words),
                # TF-IDF over characters
                ('tfidf_token_chars', tfidf_chars)
            ])),
            ('nbf', NBFeaturer(alpha=10))
        ])),
        # Averaged word embedding, weighted by tf-idf
        ('w2v', TfidfEmbeddingVectorizer(w2v))
        # Averaged word embedding
        #('w2v', MeanEmbeddingVectorizer(w2v))
    ])
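# `TfidfEmbeddingVectorizer` (like the simple stats extractors above) is defined
# elsewhere in this repo. The sketch below illustrates the tf-idf-weighted embedding
# idea under the hypothetical name TfidfEmbeddingVectorizerSketch: each text becomes
# the average of its word vectors, each weighted by the word's idf. It assumes `w2v`
# maps words to numpy vectors; the repo's class may tokenize and handle OOV words
# differently.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer


class TfidfEmbeddingVectorizerSketch(BaseEstimator, TransformerMixin):
    """Idf-weighted average of pre-trained word vectors per document."""

    def __init__(self, w2v):
        self.w2v = w2v
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X, y=None):
        tfidf = TfidfVectorizer(analyzer=lambda doc: doc.split())
        tfidf.fit(X)
        # Words unseen at fit time get the largest idf (treated as maximally rare).
        self.max_idf_ = max(tfidf.idf_)
        self.word2weight_ = {w: tfidf.idf_[i]
                             for w, i in tfidf.vocabulary_.items()}
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] * self.word2weight_.get(w, self.max_idf_)
                     for w in doc.split() if w in self.w2v]
                    or [np.zeros(self.dim)], axis=0)
            for doc in X
        ])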
def get_basic_model(model, w2v=None):
    """Wrap a single classifier with the shared feature extraction/selection steps."""
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
    n_jobs = -1
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        #('normalize', StandardScaler(with_mean=False)),
        #('normalize', MaxAbsScaler()),
        ('model', model)
    ])
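# Example usage (a sketch; assumes raw tweet texts from `get_train_test_data` and
# that the label frame exposes the single target column used here):
#
#     X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
#     clf = get_basic_model(LogisticRegression(C=0.1, solver='lbfgs'))
#     clf.fit(X_train, y_train['label_pa'])
#     preds = clf.predict(X_test)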
                    help='Save word index dictionary')
parser.add_argument('-t', '--tfidf', action='store_true',
                    help='Save TfidfVectorizer object')
parser.add_argument('-o', '--output_dir', default="models/",
                    help='Path to the output folder')
args = parser.parse_args()

if args.w2v:
    glove = Glove.load()
    w2v = glove.get_dict()
    print('w2v dict size:', len(w2v))
    with open(os.path.join(args.output_dir, 'w2v_full.pkl'), 'wb') as f:
        pickle.dump(w2v, f, protocol=pickle.HIGHEST_PROTOCOL)

if args.index:
    X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
    tokenizer = Tokenizer(num_words=MAX_FEATURES)
    X_data = pd.concat((X_train, X_test), ignore_index=True)
    tokenizer.fit_on_texts(X_data)
    print("Word index dict size:", len(tokenizer.word_index))
    outfile = os.path.join(args.output_dir, 'word_index_full.pkl')
    print("...wrote to", outfile)
    with open(outfile, 'wb') as f:
        pickle.dump(tokenizer.word_index, f, protocol=pickle.HIGHEST_PROTOCOL)
MAX_LEN = 75  # Max number of words in a tweet
W2V_DICT_PATH = 'models/w2v.pkl'

if __name__ == "__main__":
    ps = time.time()
    Xr_train, y_train, Xr_test, y_test = get_train_test_data(merge=True)

    if W2V_DICT_PATH:
        with open(W2V_DICT_PATH, "rb") as f:
            w2v = pickle.load(f)
        print('...loaded w2v dict')
    else:
        glove = Glove.load()
        w2v = glove.get_dict()

    ensemble = get_ensemble_model(w2v)
    #ensemble.steps = ensemble.steps[2:]
    feature_extractor = get_feature_extractor(w2v)

    n_jobs = 12
    cols_target = ['label_pa', 'label_sb', 'label_sleep']
    models = [
        ("lr", LogisticRegression(C=0.1, penalty='l2', solver='lbfgs',
                                  n_jobs=n_jobs)),
        ("nb", BernoulliNB(alpha=5.0)),
        ("rf",