'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title', 'bg_xlm_text' ]), ('top_8', [ 'bg_lsa_title', 'bg_lsa_text', 'en_elmo_title', 'en_elmo_text', 'en_use_title', 'en_use_text', 'en_bert_title', 'en_bert_text', 'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title', 'bg_xlm_text', 'en_nela_title', 'en_nela_text' ]), ('top_9', [ 'bg_lsa_title', 'bg_lsa_text', 'en_elmo_title', 'en_elmo_text', 'en_use_title', 'en_use_text', 'en_bert_title', 'en_bert_text', 'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title', 'bg_xlm_text', 'en_nela_title', 'en_nela_text', 'bg_styl_title', 'bg_styl_textx' ]), ] oversampler = None models = [] for name, feature_list in features: clf = LogisticRegression() clf_params = {'clf__C': 1.5, 'clf__solver': 'liblinear', 'clf__tol': 0.01} model = pipelines.make(clf, feature_list, clf_params=clf_params) # evaluation models.append((f'{name}', model)) compare_classifiers(models, df, df['label'], silent=False, plot=False)
multi_class="auto", solver='liblinear', max_iter=20000) features = [ 'bg_bert_title', 'bg_bert_text', 'bg_xlm_title', 'bg_xlm_text', 'meta_media', 'bg_styl_title', 'bg_styl_text', 'bg_lsa_title', 'bg_lsa_text', 'en_use_title', 'en_use_text', 'en_nela_title', 'en_nela_text', 'en_bert_title', 'en_bert_text', 'en_elmo_title', 'en_elmo_text', ] all_features = [f'{x}_pred' for x in features] # oversamplers = [None, SMOTE(), ADASYN(), RandomOverSampler(random_state=0)] oversamplers = [None] for oversampler in oversamplers: model = ('pred_all', pipelines.make(clf, all_features, oversampler)) compare_classifiers([model], df, df['label'], silent=False, plot=False)
# Load all articles once, then evaluate the bg_lsa pipeline under several
# class-rebalancing strategies (none, SMOTE, ADASYN, random oversampling).
db = database.MongoDB()
clf = LogisticRegression(C=1.5, tol=0.01, random_state=0,
                         multi_class="auto", solver='liblinear',
                         max_iter=20000)
df = get_df(list(db.get_articles()))
for oversampler in [
        None, SMOTE(), ADASYN(),
        RandomOverSampler(random_state=0)
]:
    models = [
        ('bg_lsa', pipelines.make(clf, ['bg_lsa_title', 'bg_lsa_text'],
                                  oversampler=oversampler)),
        # NOTE(review): the disabled entries below previously used an outdated
        # argument order — make(clf, oversampler, [features]) — which does not
        # match the live call's make(clf, features, oversampler=...) signature.
        # Rewritten so they can be re-enabled without breaking.
        # ('bg_bert_title_text', pipelines.make(clf, ['bg_bert_title', 'bg_bert_text'], oversampler=oversampler)),
        # ('bg_xlm_title_text', pipelines.make(clf, ['bg_xlm_title', 'bg_xlm_text', 'bg_xlm_cos'], oversampler=oversampler)),
        # ('bg_styl_title_text', pipelines.make(clf, ['bg_styl_title', 'bg_styl_text'], oversampler=oversampler)),
        # ('meta_media', pipelines.make(clf, ['meta_media'], oversampler=oversampler)),
        # ('en_use_title_text', pipelines.make(clf, ['en_use_title', 'en_use_text', 'en_use_cos'], oversampler=oversampler)),
        # ('en_nela_title_text', pipelines.make(clf, ['en_nela_title', 'en_nela_text', 'en_nela_cos'], oversampler=oversampler)),
        # ('en_bert_title_text', pipelines.make(clf, ['en_bert_title', 'en_bert_text', 'en_bert_cos'], oversampler=oversampler)),
        # ('en_elmo_title_text', pipelines.make(clf, ['en_elmo_title', 'en_elmo_text', 'en_elmo_cos'], oversampler=oversampler)),
    ]
    # FIX: removed stray space inside the f-string placeholder ('{ oversampler}')
    print(f'Oversampler: {oversampler}')
    compare_classifiers(models, df, df['label'], silent=False, plot=False)
# Grid-search the classifier hyper-parameters independently for every
# single-feature pipeline (title and text variants of each feature set).
all_feats = []
for feature_set in feature_sets:
    all_feats.append(feature_set + '_title')
    all_feats.append(feature_set + '_text')
# all_feats.append('meta_media')

param_grid = {
    'clf__tol': [1e-10, 1e-8, 1e-4, 1e-2, 1e-1],  # 1e-4
    'clf__C': [0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2],  # 1,
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # lbfgs
}

print('All features count: ', len(all_feats))
for feature_set in all_feats:
    model = pipelines.make(clf, [feature_set])
    # FIX: dropped iid=False — the parameter was deprecated in scikit-learn
    # 0.22 (where False became the default, so this is behavior-preserving)
    # and removed in 0.24, where passing it raises TypeError.
    gs = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=5,
                      error_score=-1, verbose=1, n_jobs=-1,
                      return_train_score=True)
    gs.fit(df, df['label'])
    print(f"{feature_set} | BEST SCORE: {gs.best_score_}")
    print(f"{feature_set} | BEST PARAMS: {gs.best_params_}")
# ('all_en', en_feats), ('all', all_feats) ] oversampler = None # evaluation param_grid = { # 'clf__tol': [1e-2], # [1e-10, 1e-8, 1e-4, 1e-2, 1e-1], # 1e-4 'clf__C': [0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2], # 1, 'clf__solver': ['liblinear'] #['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] } for name, feature_list in features: model = pipelines.make( LogisticRegression(random_state=0, multi_class="auto", max_iter=1000), feature_list) print(name) gs = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=5, error_score=-1, verbose=10000, n_jobs=-1, iid=False, return_train_score=True) gs.fit(df, df['label'])
]

# Expand each feature-set prefix into its concrete column names:
# '<set>_title', '<set>_text', and — except for the stylometric and LSA
# sets — a '<set>_cos' title/text cosine-similarity column.
all_feats = []
for feature_set in feature_sets:
    all_feats.append(feature_set + '_title')
    all_feats.append(feature_set + '_text')
    if feature_set not in ['bg_styl', 'bg_lsa']:
        all_feats.append(feature_set + '_cos')
all_feats.append('meta_media')

# Language-specific subsets; each keeps the media metadata feature.
bg_feats = [x for x in all_feats if x.startswith('bg_')] + ['meta_media']
en_feats = [x for x in all_feats if x.startswith('en_')] + ['meta_media']

# Build (name, pipeline) pairs per feature set: title-only, text-only,
# title+text, and title+text+cos variants.
models = []
for feature_set in feature_sets:
    title_model = (feature_set + '_title',
                   pipelines.make(clf, [feature_set + '_title']))
    text_model = (feature_set + '_text',
                  pipelines.make(clf, [feature_set + '_text']))
    title_text_model = (feature_set + '_title_text', pipelines.make(
        clf, [feature_set + '_title', feature_set + '_text']))
    # NOTE(review): this references '<set>_cos' even for 'bg_styl'/'bg_lsa',
    # which are excluded from the cos columns above — confirm those sets
    # either have the column or this model is filtered out downstream.
    title_text_cos_model = (feature_set + '_title_text_cos',
                            pipelines.make(clf, [
                                feature_set + '_title', feature_set + '_text',
                                feature_set + '_cos'
                            ]))
    models.append(title_model)
    # NOTE(review): title_text_model and title_text_cos_model are built but
    # not appended within this visible chunk — presumably appended in code
    # that continues past this view.
    models.append(text_model)