Esempio n. 1
0
def main(sentences, model, language, outfile, processes, gazetteer):
    """ Classifies sentences with a previously trained model

    The ``(model, extractor)`` pair stored at `model` is loaded with joblib
    and wrapped in a `SentenceClassifier`. The JSON-encoded `sentences` are
    dispatched through `parallel.map` in batches of 1000 and every classified
    sentence is written to `outfile` as one JSON document per line.
    """
    if gazetteer:
        gazetteer = reverse_gazetteer(json.load(gazetteer))
    else:
        gazetteer = {}

    logger.info("Loading model from '%s' ...", model)
    trained_model, extractor = joblib.load(model)
    classifier = SentenceClassifier(trained_model, extractor, language, gazetteer)

    def worker(batch):
        # every item of the batch is a JSON-encoded sentence
        decoded = (json.loads(raw) for raw in batch)
        for result in classifier.classify_sentences(decoded):
            yield json.dumps(result)

    results = parallel.map(worker, sentences, batch_size=1000,
                           flatten=True, processes=processes)

    # stays at zero when nothing at all was classified
    count = 0
    for count, line in enumerate(results, 1):
        outfile.write(line)
        outfile.write('\n')

        # periodic progress report
        if count % 1000 == 0:
            logger.info('Classified %d sentences', count)

    logger.info('Done, classified %d sentences', count)
    if count > 0:
        logger.info("Dumped classified sentences to '%s'", outfile.name)
Esempio n. 2
0
def main(training_set, language, outfile, gazetteer, **kwargs):
    """ Trains the classifier

    Every row of `training_set` is a JSON document with the keys `sentence`
    and `fes`; the rows are fed to a `FactExtractorFeatureExtractor` and a
    `LinearSVC` is fitted on the resulting features. The fitted model and the
    extractor are dumped together, as a tuple, to `outfile`.

    :param training_set: open file with one JSON-encoded sample per line
    :param language: language handed to the feature extractor
    :param outfile: destination handed to `joblib.dump`
    :param gazetteer: optional open JSON file with the gazetteer
    :param kwargs: parameters of `LinearSVC`; the regularization parameter
     may be spelled either `c` or `C`
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    extractor = FactExtractorFeatureExtractor(language)

    # let the logging module do the formatting, like every other call here
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'],
                                   data['fes'],
                                   add_unknown=True,
                                   gazetteer=gazetteer)

    logger.info('Finalizing training set ...')
    x, y = extractor.get_features()

    logger.info('Got %d samples with %d features each', *x.shape)

    logger.info('Fitting model ...')
    # The model expects an upper-case `C`. Translate the lower-case spelling
    # only when it is present, so that callers passing `C` directly (or
    # relying on the model default) do not fail with a KeyError
    if 'c' in kwargs:
        kwargs['C'] = kwargs.pop('c')
    svc = LinearSVC(**kwargs)
    svc.fit(x, y)

    joblib.dump((svc, extractor), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
Esempio n. 3
0
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters

    A 10-fold grid search over `C` and `multi_class` of a `LinearSVC` is run
    on the training set, scored with the weighted F1, and the best
    configuration is logged. When `gold_standard` is given, both a stratified
    dummy classifier and the best model are scored against it.
    """
    if gazetteer:
        gazetteer = reverse_gazetteer(json.load(gazetteer))
    else:
        gazetteer = {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for line in training_set:
        sample = json.loads(line)
        extractor.process_sentence(sample['sentence'], sample['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    logger.info('Searching for the best model parameters')
    grid = [{
        'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        'multi_class': ['ovr', 'crammer_singer'],
    }]
    search = GridSearchCV(LinearSVC(), param_grid=grid,
                          scoring='f1_weighted', cv=10)
    search.fit(x, y)

    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    # NOTE(review): the extractor that already holds the training sentences
    # is reused here; whether get_features() then returns the gold samples
    # only depends on the extractor implementation - TODO confirm
    for line in gold_standard:
        sample = json.loads(line)
        extractor.process_sentence(sample['sentence'], sample['fes'])
    x_gold, y_gold = extractor.get_features()

    # baseline fitted on the training data
    baseline = DummyClassifier(strategy='stratified')
    baseline.fit(x, y)

    baseline_f1 = metrics.f1_score(y_gold, baseline.predict(x_gold),
                                   average='weighted')
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                baseline_f1)

    best_f1 = metrics.f1_score(y_gold, search.predict(x_gold),
                               average='weighted')
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                best_f1)
Esempio n. 4
0
def main(
    training_set,
    language,
    outfile,
    model_class,
    model_param,
    extractor_class,
    extractor_param,
    gazetteer,
    folds,
    scoring,
    skip_majority,
    evaluate_gold,
):
    """ Trains the classifier

    Every row of `training_set` is a JSON document with the keys `sentence`,
    `lu` and `fes`. The rows are fed to the configured feature extractor, a
    `FeatureSelectedClassifier` is fitted on the resulting features and then
    dumped to `outfile` together with a dictionary holding the extractor.

    :param training_set: open, seekable file with one JSON sample per line
    :param language: language handed to the extractor as `language`
    :param outfile: destination handed to `joblib.dump`
    :param model_class: model specification handed to `initialize`
    :param model_param: model parameters handed to `initialize`
    :param extractor_class: extractor specification handed to `initialize`
    :param extractor_param: iterable of extractor parameters, appended after
     the `language` one
    :param gazetteer: optional open JSON file with the gazetteer
    :param folds: k-folds evaluation is performed only when greater than 1
    :param scoring: forwarded to `kfolds_evaluation`
    :param skip_majority: forwarded to `kfolds_evaluation`
    :param evaluate_gold: when true, `gold_evaluation` is run beforehand
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    model_cls, model_args = initialize(model_class, model_param, False)

    # Materialized once: a one-shot iterable of parameters would otherwise be
    # already exhausted when the second extractor is built
    extractor_args = [("language", language)] + list(extractor_param)

    if evaluate_gold:
        # each extractor receives its own copy of the arguments
        gold_extractor = initialize(extractor_class, list(extractor_args), True)

        gold_evaluation(map(json.loads, training_set), gold_extractor, gazetteer, model_cls, model_args)

        # the evaluation consumed the file, rewind it for the actual training
        training_set.seek(0)

    extractor = initialize(extractor_class, list(extractor_args), True)

    # let the logging module do the formatting, like every other call here
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data["sentence"], data["lu"], data["fes"], add_unknown=True, gazetteer=gazetteer)
    x, y = extractor.get_features(refit=True)
    logger.info("Got %d samples with %d features each", *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)

    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info("Fitting model ...")
    model.fit(x, y)

    joblib.dump((model, {"extractor": extractor}), outfile)

    logger.info("Done, dumped model to '%s'", outfile)
Esempio n. 5
0
def main(training_set, language, outfile, model_class, model_param, extractor_class,
         extractor_param, gazetteer, folds, scoring, skip_majority, evaluate_gold):
    """ Trains the classifier

    Every row of `training_set` is a JSON document with the keys `sentence`,
    `lu` and `fes`. The rows are fed to the configured feature extractor, a
    `FeatureSelectedClassifier` is fitted on the resulting features and then
    dumped to `outfile` together with a dictionary holding the extractor.

    :param training_set: open, seekable file with one JSON sample per line
    :param language: language handed to the extractor as `language`
    :param outfile: destination handed to `joblib.dump`
    :param model_class: model specification handed to `initialize`
    :param model_param: model parameters handed to `initialize`
    :param extractor_class: extractor specification handed to `initialize`
    :param extractor_param: iterable of extractor parameters, appended after
     the `language` one
    :param gazetteer: optional open JSON file with the gazetteer
    :param folds: k-folds evaluation is performed only when greater than 1
    :param scoring: forwarded to `kfolds_evaluation`
    :param skip_majority: forwarded to `kfolds_evaluation`
    :param evaluate_gold: when true, `gold_evaluation` is run beforehand
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    model_cls, model_args = initialize(model_class, model_param, False)

    # Materialized once: a one-shot iterable of parameters would otherwise be
    # already exhausted when the second extractor is built
    extractor_args = [('language', language)] + list(extractor_param)

    if evaluate_gold:
        # each extractor receives its own copy of the arguments
        gold_extractor = initialize(extractor_class, list(extractor_args), True)

        gold_evaluation(
            map(json.loads, training_set), gold_extractor,
            gazetteer, model_cls, model_args
        )

        # the evaluation consumed the file, rewind it for the actual training
        training_set.seek(0)

    extractor = initialize(extractor_class, list(extractor_args), True)

    # let the logging module do the formatting, like every other call here
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)
    x, y = extractor.get_features(refit=True)
    logger.info('Got %d samples with %d features each', *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)

    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info('Fitting model ...')
    model.fit(x, y)

    joblib.dump((model, {
        'extractor': extractor
    }), outfile)

    logger.info("Done, dumped model to '%s'", outfile)
Esempio n. 6
0
def get_training_sets(training_set, language, gazetteer, word2vec_model,
                      independent_lus):
    """ Yields a populated feature extractor for every tried configuration

    A configuration is a combination of a gazetteer (each of the given ones,
    then none at all), an extractor class with its arguments, and a lexical
    unit. For each of them the training set is read from the beginning and
    the matching sentences are fed to a fresh extractor.

    :param training_set: open, seekable file with one JSON sample per line,
     each with the keys `sentence`, `lu` and `fes`
    :param language: first argument handed to every extractor
    :param gazetteer: iterable of open JSON files with the gazetteers
    :param word2vec_model: when given, `Word2VecFeatureExtractor`
     configurations are tried as well
    :param independent_lus: when true, a separate training set is built for
     every lexical unit found in the data, otherwise a single one labelled
     `$all` holds every sentence
    :return: generator of `(meta, extractor)` pairs
    """
    # Materialized because `itertools.chain` is a one-shot iterator: it would
    # be exhausted after the first gazetteer, silently skipping the others
    extractor_args = list(itertools.chain(
        itertools.product([BagOfTermsFeatureExtractor], [True, False],
                          [0, 1, 2]),
        itertools.product([Word2VecFeatureExtractor], [word2vec_model],
                          [True, False], [0, 1, 2]) if word2vec_model else []))

    if independent_lus:
        lus = set(json.loads(row)['lu'] for row in training_set)
    else:
        lus = ['$all']

    count = 0
    for gaz in list(gazetteer) + [None]:
        # Load the current file (not the whole `gazetteer` collection) and do
        # it only once, since the file can be read a single time
        reversed_gaz = reverse_gazetteer(json.load(gaz)) if gaz else {}

        for args in extractor_args:
            for lu in lus:
                logger.debug('%d) gazetteer: %s, extractor params: %s, lu: %s',
                             count, gaz.name if gaz else None, args, lu)
                count += 1

                extractor_cls, init_args = args[0], args[1:]
                extractor = extractor_cls(language, *init_args)

                training_set.seek(0)
                for row in training_set:
                    data = json.loads(row)
                    # with independent lexical units only the sentences of
                    # the current one belong to this training set
                    if not independent_lus or data['lu'] == lu:
                        extractor.process_sentence(data['sentence'],
                                                   data['lu'],
                                                   data['fes'],
                                                   add_unknown=True,
                                                   gazetteer=reversed_gaz)

                meta = {
                    'lu': lu,
                    'gazetteer': gaz,
                    'extractor_cls': extractor_cls,
                    'extractor_args': [language] + list(init_args),
                    'extractor': extractor
                }

                yield meta, extractor
Esempio n. 7
0
def get_training_sets(training_set, language, gazetteer, word2vec_model, independent_lus):
    """ Yields a populated feature extractor for every tried configuration

    A configuration is a combination of a gazetteer (each of the given ones, then none at all),
    an extractor class with its arguments, and a lexical unit. For each of them the training
    set is read from the beginning and the matching sentences are fed to a fresh extractor.

    :param training_set: open, seekable file with one JSON sample per line, each with the
     keys `sentence`, `lu` and `fes`
    :param language: first argument handed to every extractor
    :param gazetteer: iterable of open JSON files with the gazetteers
    :param word2vec_model: when given, `Word2VecFeatureExtractor` configurations are tried
     as well
    :param independent_lus: when true, a separate training set is built for every lexical
     unit found in the data, otherwise a single one labelled `$all` holds every sentence
    :return: generator of `(meta, extractor)` pairs
    """
    # Materialized because `itertools.chain` is a one-shot iterator: it would be exhausted
    # after the first gazetteer, silently skipping all the others
    extractor_args = list(itertools.chain(
        itertools.product([BagOfTermsFeatureExtractor], [True, False], [0, 1, 2]),

        itertools.product([Word2VecFeatureExtractor], [word2vec_model], [True, False], [0, 1, 2])
        if word2vec_model else []
    ))

    if independent_lus:
        lus = set(json.loads(row)['lu'] for row in training_set)
    else:
        lus = ['$all']

    count = 0
    for gaz in list(gazetteer) + [None]:
        # Load the current file (not the whole `gazetteer` collection) and do it only once,
        # since the file can be read a single time
        reversed_gaz = reverse_gazetteer(json.load(gaz)) if gaz else {}

        for args in extractor_args:
            for lu in lus:
                logger.debug('%d) gazetteer: %s, extractor params: %s, lu: %s',
                             count, gaz.name if gaz else None, args, lu)
                count += 1

                extractor_cls, init_args = args[0], args[1:]
                extractor = extractor_cls(language, *init_args)

                training_set.seek(0)
                for row in training_set:
                    data = json.loads(row)
                    # with independent lexical units only the sentences of the current one
                    # belong to this training set
                    if not independent_lus or data['lu'] == lu:
                        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                                   add_unknown=True, gazetteer=reversed_gaz)

                meta = {
                    'lu': lu,
                    'gazetteer': gaz,
                    'extractor_cls': extractor_cls,
                    'extractor_args': [language] + list(init_args),
                    'extractor': extractor
                }

                yield meta, extractor
Esempio n. 8
0
def main(training_set, language, outfile, gazetteer, **kwargs):
    """ Trains the classifier

    Every row of `training_set` is a JSON document with the keys `sentence`
    and `fes`; the rows are fed to a `FactExtractorFeatureExtractor` and a
    `LinearSVC` is fitted on the resulting features. The fitted model and the
    extractor are dumped together, as a tuple, to `outfile`.

    :param training_set: open file with one JSON-encoded sample per line
    :param language: language handed to the feature extractor
    :param outfile: destination handed to `joblib.dump`
    :param gazetteer: optional open JSON file with the gazetteer
    :param kwargs: parameters of `LinearSVC`; the regularization parameter
     may be spelled either `c` or `C`
    """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    extractor = FactExtractorFeatureExtractor(language)

    # let the logging module do the formatting, like every other call here
    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set ...')
    x, y = extractor.get_features()

    logger.info('Got %d samples with %d features each', *x.shape)

    logger.info('Fitting model ...')
    # The model expects an upper-case `C`. Translate the lower-case spelling
    # only when it is present, so that callers passing `C` directly (or
    # relying on the model default) do not fail with a KeyError
    if 'c' in kwargs:
        kwargs['C'] = kwargs.pop('c')
    svc = LinearSVC(**kwargs)
    svc.fit(x, y)

    joblib.dump((svc, extractor), outfile)
    logger.info("Done, dumped model to '%s'", outfile)