Esempio n. 1
0
        'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title',
        'bg_xlm_text'
    ]),
    ('top_8', [
        'bg_lsa_title', 'bg_lsa_text', 'en_elmo_title', 'en_elmo_text',
        'en_use_title', 'en_use_text', 'en_bert_title', 'en_bert_text',
        'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title',
        'bg_xlm_text', 'en_nela_title', 'en_nela_text'
    ]),
    ('top_9', [
        'bg_lsa_title', 'bg_lsa_text', 'en_elmo_title', 'en_elmo_text',
        'en_use_title', 'en_use_text', 'en_bert_title', 'en_bert_text',
        'bg_bert_title', 'bg_bert_text', 'meta_media', 'bg_xlm_title',
        'bg_xlm_text', 'en_nela_title', 'en_nela_text', 'bg_styl_title',
        'bg_styl_textx'
    ]),
]

# No oversampler is selected here; the statements below do not read this name.
oversampler = None

# Collect one (name, pipeline) entry per feature subset listed in `features`.
models = []
for name, feature_list in features:
    # A fresh parameter dict and a fresh estimator are created for every
    # pipeline, so no object is shared between the entries.
    clf_params = dict(clf__C=1.5, clf__solver='liblinear', clf__tol=0.01)
    clf = LogisticRegression()
    model = pipelines.make(clf, feature_list, clf_params=clf_params)

    # evaluation
    models.append((name, model))

# Hand every pipeline to compare_classifiers together with `df` and its
# 'label' column.
compare_classifiers(models, df, df['label'], silent=False, plot=False)
Esempio n. 2
0
                         multi_class="auto",
                         solver='liblinear',
                         max_iter=20000)

# Base feature column names. Every embedding family contributes a
# '<family>_title' and a '<family>_text' entry (in that order); the single
# 'meta_media' entry sits between the first two families and the rest.
features = [
    f'{family}_{part}'
    for family in ('bg_bert', 'bg_xlm')
    for part in ('title', 'text')
] + ['meta_media'] + [
    f'{family}_{part}'
    for family in ('bg_styl', 'bg_lsa', 'en_use', 'en_nela', 'en_bert',
                   'en_elmo')
    for part in ('title', 'text')
]

# Column names used by the 'pred_all' pipeline: every entry of `features`
# with '_pred' appended.
all_features = [base + '_pred' for base in features]

# Only the "no oversampler" case is active; the fuller list is kept for
# reference.
# oversamplers = [None, SMOTE(), ADASYN(), RandomOverSampler(random_state=0)]
oversamplers = [None]
for oversampler in oversamplers:
    # The oversampler is passed as the third positional argument of
    # pipelines.make.
    model = (
        'pred_all',
        pipelines.make(clf, all_features, oversampler),
    )
    compare_classifiers([model], df, df['label'], silent=False, plot=False)
Esempio n. 3
0
# Estimator handed to pipelines.make by the loop below.
clf = LogisticRegression(
    solver='liblinear',
    C=1.5,
    tol=0.01,
    max_iter=20000,
    multi_class="auto",
    random_state=0,
)

# Fetch all articles through the MongoDB wrapper and turn them into `df`
# via get_df; the iterable is materialised into a list first.
db = database.MongoDB()
df = get_df(list(db.get_articles()))

# Repeat the comparison once without an oversampler (None) and once for each
# of the three oversampler instances.
for oversampler in (
        None,
        SMOTE(),
        ADASYN(),
        RandomOverSampler(random_state=0),
):
    models = [
        ('bg_lsa',
         pipelines.make(clf, ['bg_lsa_title', 'bg_lsa_text'],
                        oversampler=oversampler)),
        # Disabled entries, kept verbatim. NOTE(review): most of them pass
        # `oversampler` as the second positional argument, unlike the active
        # call above -- verify the argument order before re-enabling.
        #('bg_bert_title_text', pipelines.make(clf, ['bg_bert_title', 'bg_bert_text'], oversampler=oversampler)),
        #        ('bg_xlm_title_text', pipelines.make(clf, oversampler, ['bg_xlm_title', 'bg_xlm_text', 'bg_xlm_cos'])),
        #        ('bg_styl_title_text', pipelines.make(clf, oversampler, ['bg_styl_title', 'bg_styl_text'])),
        #        ('meta_media', pipelines.make(clf, oversampler, ['meta_media'])),
        #        ('en_use_title_text', pipelines.make(clf, oversampler, ['en_use_title', 'en_use_text', 'en_use_cos'])),
        #        ('en_nela_title_text', pipelines.make(clf, oversampler, ['en_nela_title', 'en_nela_text', 'en_nela_cos'])),
        #        ('en_bert_title_text', pipelines.make(clf, oversampler, ['en_bert_title', 'en_bert_text', 'en_bert_cos'])),
        #        ('en_elmo_title_text', pipelines.make(clf, oversampler, ['en_elmo_title', 'en_elmo_text', 'en_elmo_cos'])),
    ]
    # Label the output that follows with the oversampler in use.
    print(f'Oversampler: { oversampler}')
    compare_classifiers(models, df, df['label'], silent=False, plot=False)
# Expand every entry of `feature_sets` into its '<set>_title' and
# '<set>_text' column names, in that order.
all_feats = []
for feature_set in feature_sets:
    all_feats.extend((feature_set + '_title', feature_set + '_text'))
#all_feats.append('meta_media')

# Grid handed to GridSearchCV below. The 'clf__' prefix presumably addresses
# the classifier step of the pipeline; the trailing comments look like the
# library defaults -- confirm against the installed scikit-learn.
param_grid = dict(
    clf__tol=[1e-10, 1e-8, 1e-4, 1e-2, 1e-1],  # 1e-4
    clf__C=[0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2],  # 1,
    clf__solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # lbfgs
)

print('All features count: ', len(all_feats))

# Run one grid search per single feature column and report its best result.
for feature_set in all_feats:
    model = pipelines.make(clf, [feature_set])

    # NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in
    # 0.24, where passing it raises TypeError -- confirm the pinned version
    # before upgrading.
    gs = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        iid=False,
        error_score=-1,  # a failing fit is scored -1 instead of raising
        return_train_score=True,
        n_jobs=-1,
        verbose=1,
    )

    # The full frame is passed as X; 'label' is the target column.
    gs.fit(df, df['label'])

    print(f"{feature_set} | BEST SCORE: {gs.best_score_}")
    print(f"{feature_set} | BEST PARAMS: {gs.best_params_}")
Esempio n. 5
0
    # ('all_en', en_feats),
    ('all', all_feats)
]
# No oversampler is selected for this run.
oversampler = None

# evaluation
# Only C is varied and the solver is pinned to 'liblinear'; the commented
# alternatives record the wider ranges that can be swapped back in.
param_grid = dict(
    # clf__tol=[1e-2],  # [1e-10, 1e-8, 1e-4, 1e-2, 1e-1],  # 1e-4
    clf__C=[0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2],  # 1,
    clf__solver=['liblinear'],  # ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
)

# Run one grid search per named feature list in `features`.
for name, feature_list in features:
    # A new estimator is created for every pipeline.
    model = pipelines.make(
        LogisticRegression(max_iter=1000, multi_class="auto", random_state=0),
        feature_list)

    print(name)

    # NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in
    # 0.24, where passing it raises TypeError -- confirm the pinned version
    # before upgrading.
    gs = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        iid=False,
        error_score=-1,  # a failing fit is scored -1 instead of raising
        return_train_score=True,
        n_jobs=-1,
        verbose=10000,
    )

    # The full frame is passed as X; 'label' is the target column.
    gs.fit(df, df['label'])
Esempio n. 6
0
]

# Expand every feature set into its column names: always '<set>_title' and
# '<set>_text', plus '<set>_cos' for every set except 'bg_styl' and 'bg_lsa'.
all_feats = []
for feature_set in feature_sets:
    all_feats.extend((feature_set + '_title', feature_set + '_text'))
    if feature_set not in ('bg_styl', 'bg_lsa'):
        all_feats.append(feature_set + '_cos')
all_feats.append('meta_media')

# Per-prefix subsets; 'meta_media' matches neither prefix, so it is added to
# each subset explicitly.
bg_feats = [feat for feat in all_feats if feat.startswith('bg_')]
bg_feats.append('meta_media')
en_feats = [feat for feat in all_feats if feat.startswith('en_')]
en_feats.append('meta_media')

# Filled with (name, pipeline) tuples by the loop that follows.
models = []

for feature_set in feature_sets:
    title_model = (feature_set + '_title',
                   pipelines.make(clf, [feature_set + '_title']))
    text_model = (feature_set + '_text',
                  pipelines.make(clf, [feature_set + '_text']))
    title_text_model = (feature_set + '_title_text',
                        pipelines.make(
                            clf,
                            [feature_set + '_title', feature_set + '_text']))

    title_text_cos_model = (feature_set + '_title_text_cos',
                            pipelines.make(clf, [
                                feature_set + '_title', feature_set + '_text',
                                feature_set + '_cos'
                            ]))

    models.append(title_model)
    models.append(text_model)