}
# Hyper-parameter search space for the SGD classifier.  Categorical
# choices are given as lists; continuous ones as scipy.stats frozen
# distributions that ParameterSampler draws from.
# NOTE(review): the 'log' loss was renamed 'log_loss' in newer
# scikit-learn releases — confirm against the pinned sklearn version.
clf_param_grid = dict(
    loss=['hinge', 'log', 'squared_hinge', 'modified_huber'],
    penalty=['l2', 'l1', 'elasticnet'],
    alpha=uniform(0.0001 / 10, 0.0001 * 1.4),  # loc=1e-5, scale=1.4e-4
    shuffle=[False, True],
    learning_rate=['constant', 'optimal', 'invscaling', 'adaptive'],
    eta0=uniform(0.01, 1),
)
# Batch-size choices for the streaming configuration.
cfg_param_grid = dict(batch_size=[12500, 25000, 50000])
# One-shot sampler for the (externally defined) hashing-vectorizer grid.
HashSampler = ParameterSampler(hash_param_grid, n_iter=1)
# Effectively unbounded draws from the classifier grid.
clfSampler = ParameterSampler(clf_param_grid, n_iter=50000000)
# Config sampler seeded with the externally defined `rng`.
cfgSampler = ParameterSampler(cfg_param_grid, n_iter=1, random_state=rng)

# Draw and print 15 samples from the config grid.  Each pass restarts
# the sampler via iter(), so every draw is a fresh single-sample pass
# over cfgSampler (n_iter=1).  `next(iter(x))` replaces the unidiomatic
# direct dunder call `x.__iter__()`.
for _ in range(15):
    print(next(iter(cfgSampler)))

# +
# 2^18 = 262,144
# 2^21 = 2,097,152
###### CONFIG #####
xml_lst = glob("/mnt/training_defs/math*/*.xml.gz")
# THIS CONFIGURATION WORKS BEAUTIFULLY BUT STILL TRYING TO MAKE IT BETTER
#cfg = {'batch_size': 25000,
#      'hash_vect': {'decode_error':'ignore',
#                    'n_features': 2 ** 23,
#                    'alternate_sign': False,
#                    'ngram_range': (1,3)}, }
#hash_param = next(HashSampler.__iter__())
# Esempio n. 2  (scraper artifact: page header separating two pasted examples)
# 0             (scraper artifact: vote/score count from the source page)
# Second copy of the search space (duplicate example from the same page).
# Same grids as above, but every sampler here is a one-shot draw
# (n_iter=1) with no explicit random seed.
clf_param_grid = {
    'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': uniform(0.0001 / 10, 0.0001 * 1.4),  # loc=1e-5, scale=1.4e-4
    'shuffle': [False, True],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': uniform(0.01, 1),
}
cfg_param_grid = {'batch_size': [12500, 25000, 50000]}
HashSampler = ParameterSampler(hash_param_grid, n_iter=1)
clfSampler = ParameterSampler(clf_param_grid, n_iter=1)
cfgSampler = ParameterSampler(cfg_param_grid, n_iter=1)

# Best accuracy seen so far across random-search iterations.
tboy_acc = 0
# Number of search iterations completed.
cnt = 0
# Endless random-search loop: each pass draws a fresh hyper-parameter
# sample, builds a vectorizer + classifier, and holds out one batch of
# streamed paragraphs as a test set.  (Loop body continues beyond this
# excerpt.)
while True:
    # NOTE(review): `cfg` is only defined in commented-out code above —
    # confirm it is set elsewhere, otherwise this raises NameError.
    xml_lst = glob(cfg['train_data'])
    # Fresh one-shot draw of hashing-vectorizer parameters.
    hash_param = next(HashSampler.__iter__())
    vectorizer = HashingVectorizer(**hash_param)

    # Fresh one-shot draw of classifier parameters.
    clf_param = next(clfSampler.__iter__())
    # Here are some classifiers that support the `partial_fit` method
    partial_fit_classifiers = {
        'SGD': SGDClassifier(**clf_param),
    }

    # test data statistics
    test_stats = {'n_test': 0, 'n_test_pos': 0}

    # First we hold out a number of examples to estimate accuracy
    cfg_param = next(cfgSampler.__iter__())
    # Stream paragraphs from the xml.gz corpus in batches of
    # `batch_size`; the first batch becomes the held-out test split.
    stream = stream_arxiv_paragraphs(xml_lst, samples=cfg_param['batch_size'])
    X_test_text, y_test = next(stream)