def __init__(self,
                 samples,
                 results_csv_path,
                 tuner_domains=TUNER_DOMAINS,
                 validation_samples=None,
                 show_progress=True,
                 show_epoch_eval=True,
                 dump_models=False,
                 dump_pss_eval=False,
                 evaluator=PSSClasifierEvaluator(),
                 tuner_score_getter=lambda evaluations: max([e['f1'] or 0 for e in evaluations]),
                 tuner_results_getter=extract_classifier_evaluator_results,
                 task_name=''):
        """Create the hyperparameter tuner and the keyword arguments used for fitting.

        Args:
            samples: Stored under the 'samples' key of ``self.fit_kwargs``.
            results_csv_path: Forwarded to the tuner as its results CSV path;
                the tuner's lock file path is this value with '.lock' appended.
            tuner_domains: Forwarded to the tuner as ``params_settings``.
            validation_samples: Stored in ``self.fit_kwargs`` and also passed
                to ``StreusleEvaluator.evaluate`` when ``dump_pss_eval`` is on.
            show_progress: Stored in ``self.fit_kwargs``.
            show_epoch_eval: Stored in ``self.fit_kwargs``.
            dump_models: When true, each dumped result has its predictor saved
                to ``<output_dir>/model``.
            dump_pss_eval: When true, each dumped result is evaluated with
                ``StreusleEvaluator`` into ``<output_dir>/psseval_out.tsv``.
            evaluator: Stored in ``self.fit_kwargs``; must not be None.
                NOTE: the default instance is built once, when this function
                is defined, and is shared by every call relying on the default.
            tuner_score_getter: Kept on the instance unchanged.
            tuner_results_getter: Kept on the instance unchanged.
            task_name: Kept on the instance and forwarded to the tuner.
        """
        # Plain configuration copied onto the instance.
        self.task_name = task_name
        self.tuner_score_getter = tuner_score_getter
        self.tuner_results_getter = tuner_results_getter
        self.dump_pss_eval = dump_pss_eval
        self.dump_models = dump_models
        # Placeholder so the attribute exists while the tuner is being built;
        # the real value is assigned at the end of this method.
        self.fit_kwargs = None

        assert evaluator is not None

        def _dump_trial_outputs(output_dir, result, params):
            # ``params`` is part of the callback signature but is not used.
            # The dump flags are read from the instance at call time.
            if self.dump_models:
                result.predictor.save(output_dir + '/model')
            if not self.dump_pss_eval:
                return
            pss_evaluator = StreusleEvaluator(result.predictor)
            pss_evaluator.evaluate(validation_samples,
                                   output_tsv_path=output_dir + '/psseval_out.tsv',
                                   ident='autoid')

        self.tuner = HyperparametersTuner(
            task_name=task_name,
            results_csv_path=results_csv_path,
            params_settings=tuner_domains,
            executor=self._execute,
            csv_row_builder=build_csv_rows,
            shared_csv=True,
            lock_file_path=results_csv_path + '.lock',
            dump_result=_dump_trial_outputs,
        )

        self.fit_kwargs = {
            'samples': samples,
            'validation_samples': validation_samples,
            'show_progress': show_progress,
            'show_epoch_eval': show_epoch_eval,
            'evaluator': evaluator,
        }
コード例 #2
0
def run():
    """Train a supersense model on STREUSLE and tag the Boknilev samples.

    Steps, in order: load the train/dev/test STREUSLE records for the
    'goldid.goldsyn' task, convert them to model samples, call
    ``test_features()``, fit an ``LstmMlpSupersensesModel`` with a fixed
    hyperparameter set, evaluate it on the test samples, predict over every
    Boknilev sample and pass the collected predictions to
    ``dump_boknilev_pss``.

    The dataset root comes from the ``STREUSLE_BASE`` environment variable,
    falling back to a hard-coded path when it is unset or empty.
    """
    loader = StreusleLoader()
    streusle_root = (
        os.environ.get('STREUSLE_BASE')
        or '/cs/usr/aviramstern/lab/nlp/datasets/streusle_v4/release'
    )
    variant = 'goldid.goldsyn'

    def load_records(path_prefix):
        # Every split file is named <root><prefix><variant>.json.
        return loader.load(streusle_root + path_prefix + variant + '.json',
                           input_format='json')

    # All three splits are loaded before any conversion takes place.
    train_records = load_records('/train/streusle.ud_train.')
    dev_records = load_records('/dev/streusle.ud_dev.')
    test_records = load_records('/test/streusle.ud_test.')

    train_samples = list(map(streusle_record_to_lstm_model_sample, train_records))
    dev_samples = list(map(streusle_record_to_lstm_model_sample, dev_records))
    test_samples = list(map(streusle_record_to_lstm_model_sample, test_records))

    test_features()

    hyperparameter_values = json.loads("""{
  "mask_mwes": false,
  "learning_rate_decay": 0.0001,
  "lstm_h_dim": 100,
  "mlp_layers": 2,
  "is_bilstm": true,
  "num_lstm_layers": 2,
  "dynet_random_seed": "7564313",
  "use_ud_xpos": true,
  "ner_embd_dim": 10,
  "allow_empty_prediction": false,
  "learning_rate": 0.15848931924611143,
  "mlp_activation": "relu",
  "use_lexcat": true,
  "use_govobj": true,
  "token_embd_dim": 300,
  "update_lemmas_embd": true,
  "govobj_config_embd_dim": 3,
  "ud_deps_embd_dim": 10,
  "mlp_layer_dim": 100,
  "mlp_dropout_p": 0.37,
  "ud_xpos_embd_dim": 25,
  "use_ner": true,
  "update_token_embd": false,
  "epochs": 1,
  "lstm_dropout_p": 0.38,
  "use_ud_dep": true,
  "lexcat_embd_dim": 3,
  "use_prep_onehot": false,
  "use_token": true,
  "use_token_internal": true,
  "token_internal_embd_dim": 10,
  "labels_to_predict": [
    "supersense_role",
    "supersense_func"
  ]
}""")

    print('Training model..')
    hyperparameters = LstmMlpSupersensesModel.HyperParameters(
        **hyperparameter_values)
    model = LstmMlpSupersensesModel(hyperparameters=hyperparameters)
    predictor = model.fit(train_samples, dev_samples)
    test_evaluator = PSSClasifierEvaluator(predictor.model)
    # The evaluation result is not captured here.
    test_evaluator.evaluate(
        [model.sample_to_lowlevel(sample) for sample in test_samples])

    boknilev_train, boknilev_dev, boknilev_test = load_boknilev()
    all_samples = boknilev_train + boknilev_dev + boknilev_test
    total = len(all_samples)
    predictions = {}
    for sample_ind, sample in enumerate(all_samples):
        # Progress counter is zero-based: prints 0/total .. (total-1)/total.
        print("%d/%d" % (sample_ind, total))
        sample_xs = boknilev_record_to_lstm_model_sample_xs(sample)
        sample_ys = model.predict(sample_xs)
        sent_id = sample['sent_id']
        # Keep (role, func) only for tokens flagged identified_for_pss,
        # keyed by the token's position within the sample.
        predictions[sent_id] = {
            token_ind: (token_y.supersense_role, token_y.supersense_func)
            for token_ind, (token_x, token_y)
            in enumerate(zip(sample_xs, sample_ys))
            if token_x.identified_for_pss
        }

    dump_boknilev_pss(predictions)
コード例 #3
0
import os
import copy
from collections import defaultdict

from datasets.pp_attachement.boknilev.load_boknilev import load_boknilev, dump_boknilev_pss
from datasets.streusle_v4 import StreusleLoader
from evaluators.pss_classifier_evaluator import PSSClasifierEvaluator
from models.supersenses.boknilev_integration import boknilev_record_to_lstm_model_sample_xs
from models.supersenses.features.features_test import test_features
from models.supersenses.lstm_mlp_supersenses_model import LstmMlpSupersensesModel
from models.supersenses.streusle_integration import streusle_record_to_lstm_model_sample
import json
# Module-level evaluator, constructed with no arguments when the module is imported.
# NOTE(review): the visible run() does not read this name — confirm it is still needed.
evaluator = PSSClasifierEvaluator()


def run():
    """Load the STREUSLE train, dev and test records for 'goldid.goldsyn'.

    The dataset root comes from the ``STREUSLE_BASE`` environment variable,
    falling back to a hard-coded path when it is unset or empty.

    NOTE(review): the loaded records are neither used nor returned in the
    visible code; this function looks truncated — confirm against the
    original file.
    """
    loader = StreusleLoader()
    STREUSLE_BASE = (
        os.environ.get('STREUSLE_BASE')
        or '/cs/usr/aviramstern/lab/nlp/datasets/streusle_v4/release'
    )
    task = 'goldid.goldsyn'
    # Splits are loaded in train, dev, test order; each file is named
    # <root><prefix><task>.json.
    split_prefixes = (
        '/train/streusle.ud_train.',
        '/dev/streusle.ud_dev.',
        '/test/streusle.ud_test.',
    )
    train_records, dev_records, test_records = [
        loader.load(STREUSLE_BASE + prefix + task + '.json',
                    input_format='json')
        for prefix in split_prefixes
    ]
コード例 #4
0
def run():
    """Compare dev-set F1 with and without NER features over repeated runs.

    Loads the STREUSLE 'goldid.goldsyn' train/dev/test records, converts the
    train and dev records to model samples, calls ``test_features()``, then
    trains an ``LstmMlpSupersensesModel`` three times for each of two
    hyperparameter sets that differ only in ``use_ner``. The 'f1' entry of
    each dev evaluation is collected and, at the end, the individual values
    and their mean are printed per configuration.

    The dataset root comes from the ``STREUSLE_BASE`` environment variable,
    falling back to a hard-coded path when it is unset or empty.
    """
    loader = StreusleLoader()
    streusle_root = (
        os.environ.get('STREUSLE_BASE')
        or '/cs/usr/aviramstern/lab/nlp/datasets/streusle_v4/release'
    )
    variant = 'goldid.goldsyn'

    def load_records(path_prefix):
        # Every split file is named <root><prefix><variant>.json.
        return loader.load(streusle_root + path_prefix + variant + '.json',
                           input_format='json')

    train_records = load_records('/train/streusle.ud_train.')
    dev_records = load_records('/dev/streusle.ud_dev.')
    # The test split is still loaded, but no samples are built from it here.
    test_records = load_records('/test/streusle.ud_test.')

    train_samples = list(map(streusle_record_to_lstm_model_sample, train_records))
    dev_samples = list(map(streusle_record_to_lstm_model_sample, dev_records))

    test_features()

    with_ner_params = json.loads("""{
  "mask_mwes": false,
  "learning_rate_decay": 0.00031622776601683794,
  "lstm_h_dim": 100,
  "mlp_layers": 2,
  "is_bilstm": true,
  "num_lstm_layers": 2,
  "dynet_random_seed": "3857654",
  "use_ud_xpos": true,
  "ner_embd_dim": 5,
  "allow_empty_prediction": false,
  "learning_rate": 0.15848931924611143,
  "mlp_activation": "relu",
  "use_lexcat": true,
  "use_govobj": true,
  "token_embd_dim": 300,
  "update_lemmas_embd": false,
  "govobj_config_embd_dim": 3,
  "ud_deps_embd_dim": 25,
  "mlp_layer_dim": 100,
  "mlp_dropout_p": 0.42,
  "ud_xpos_embd_dim": 5,
  "use_ner": true,
  "update_token_embd": false,
  "epochs": 80,
  "lstm_dropout_p": 0.49,
  "use_ud_dep": true,
  "lexcat_embd_dim": 3,
  "use_prep_onehot": false,
  "use_token": true,
  "use_token_internal": true,
  "token_internal_embd_dim": 10,
  "labels_to_predict": [
    "supersense_role",
    "supersense_func"
  ]
}""")

    # Same settings with the NER feature switched off; a deep copy keeps the
    # nested labels list independent between the two configurations.
    without_ner_params = copy.deepcopy(with_ner_params)
    without_ner_params['use_ner'] = False
    configurations = {
        'GOLD_ID_GOLD_PREP_WITH_NER': with_ner_params,
        'GOLD_ID_GOLD_PREP_WITHOUT_NER': without_ner_params,
    }

    scores_by_config = defaultdict(list)

    repetitions = 3
    for _ in range(repetitions):
        for config_name, params in configurations.items():
            hyperparameters = LstmMlpSupersensesModel.HyperParameters(**params)
            model = LstmMlpSupersensesModel(hyperparameters=hyperparameters)
            predictor = model.fit(train_samples, dev_samples)
            dev_evaluator = PSSClasifierEvaluator(predictor.model)
            lowlevel_dev = [model.sample_to_lowlevel(s) for s in dev_samples]
            evaluation = dev_evaluator.evaluate(lowlevel_dev)
            scores_by_config[config_name].append(evaluation['f1'])

    for config_name, scores in scores_by_config.items():
        formatted_scores = ["%2.2f" % score for score in scores]
        print(config_name + ": " + ", ".join(formatted_scores))
        mean_score = sum(scores) / len(scores)
        print(config_name + ": Mean is %2.2f" % mean_score)