Ejemplo n.º 1
0
def test(model_filename: str,
         test_corpus: str,
         window_size: int = 5,
         _run: Run = None,
         _log: logger = None):
    """
    running crf evaluation on a test corpus

    Registers the test corpus and ``{model_filename}.pkl`` as run resources,
    builds features and gold labels for every test sentence, predicts with a
    ``sklearn_crfsuite.CRF`` constructed from ``model_filename`` and stores
    the overall and per-type scores on the run.

    :param model_filename: base path of the model; ``.pkl`` is appended for
        the registered resource and the log message, while the CRF itself
        receives the bare name
    :param test_corpus: path of the corpus to evaluate on
    :param window_size: forwarded to ``sent2features``
    :param _run: run object providing ``add_resource``, ``info`` and
        ``log_scalar``; dereferenced unconditionally, so it must be supplied
    :param _log: logger used for the info messages; must be supplied as well
    """
    pickle_path = f'{model_filename}.pkl'
    _run.add_resource(test_corpus)
    _run.add_resource(pickle_path)
    sentences, _ = get_tagged_sents_and_words(test_corpus)

    X_test = [sent2features(sentence, window_size) for sentence in sentences]
    y_test = [sent2labels(sentence) for sentence in sentences]

    _log.info(f'load from: {pickle_path}')
    tagger = sklearn_crfsuite.CRF(model_filename=model_filename)

    overall, by_type = evaluate(y_test, tagger.predict(X_test))

    # Every score goes both to the run's info dict and to its scalar log.
    overall_scores = (
        ('overall_f1', overall.f1_score),
        ('overall_precision', overall.precision),
        ('overall_recall', overall.recall),
    )
    for name, value in overall_scores:
        _run.info[name] = value
        _run.log_scalar(name, value)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for key in sorted(by_type.keys()):
        scores = by_type[key]
        for field in scores._fields:
            value = getattr(scores, field)
            name = f'{key}-{field}'
            _run.info[name] = value
            _run.log_scalar(name, value)
            _log.info(f'{name}: {value}')
Ejemplo n.º 2
0
def train(train_corpus: str,
          dev_corpus: str,
          c1: float = 0.0,
          c2: float = 0.0,
          algorithm: str = 'lbfgs',
          max_iterations: int = 100,
          all_possible_transitions: bool = False,
          window_size: int = 1,
          model_filename: str = None,
          _run: Run = None,
          _log: logger = None):
    """
    running crf experiment

    Fits a ``sklearn_crfsuite.CRF`` on the training corpus, scores it on the
    dev corpus, stores the overall and per-type scores on the run and, when
    ``model_filename`` is given, dumps the fitted model with joblib to
    ``{model_filename}.pkl`` and attaches that file as a run artifact.

    :param train_corpus: path of the training corpus
    :param dev_corpus: path of the development corpus used for scoring
    :param c1: forwarded to the CRF constructor
    :param c2: forwarded to the CRF constructor
    :param algorithm: forwarded to the CRF constructor
    :param max_iterations: forwarded to the CRF constructor
    :param all_possible_transitions: forwarded to the CRF constructor
    :param window_size: forwarded to ``sent2features``
    :param model_filename: forwarded to the CRF constructor and, if not
        ``None``, used as the base name of the pickle file
    :param _run: run object providing ``add_resource``, ``add_artifact``,
        ``info`` and ``log_scalar``; must be supplied
    :param _log: logger used for the info messages; must be supplied
    """
    _run.add_resource(train_corpus)
    _run.add_resource(dev_corpus)
    training_sentences, _ = get_tagged_sents_and_words(train_corpus)
    dev_sentences, _ = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(sent, window_size) for sent in training_sentences]
    y_train = [sent2labels(sent) for sent in training_sentences]

    X_dev = [sent2features(sent, window_size) for sent in dev_sentences]
    y_dev = [sent2labels(sent) for sent in dev_sentences]

    crf_options = {
        'algorithm': algorithm,
        'c1': c1,
        'c2': c2,
        'max_iterations': max_iterations,
        'all_possible_transitions': all_possible_transitions,
        'model_filename': model_filename,
    }
    model = sklearn_crfsuite.CRF(**crf_options)
    model.fit(X_train, y_train)

    overall, by_type = evaluate(y_dev, model.predict(X_dev))

    # Every score goes both to the run's info dict and to its scalar log.
    overall_scores = (
        ('overall_f1', overall.f1_score),
        ('overall_precision', overall.precision),
        ('overall_recall', overall.recall),
    )
    for name, value in overall_scores:
        _run.info[name] = value
        _run.log_scalar(name, value)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for key in sorted(by_type.keys()):
        scores = by_type[key]
        for field in scores._fields:
            value = getattr(scores, field)
            name = f'{key}-{field}'
            _run.info[name] = value
            _run.log_scalar(name, value)
            _log.info(f'{name}: {value}')

    # Persisting the model is optional.
    if model_filename is None:
        return
    pickle_path = f'{model_filename}.pkl'
    _log.info(f'saving to: {pickle_path}')
    joblib.dump(model, pickle_path)
    _run.add_artifact(pickle_path)
Ejemplo n.º 3
0
def train(train_corpus: str,
          dev_corpus: str,
          char_int: int,
          save_path: str,
          test_corpus: str = None,
          dropout: float = 0.5,
          num_epochs: int = 10,
          lm_loss_scale=0.1,
          device: int = 0,
          save=False,
          _run: Run = None):
    """
    running a ``TrainerMod`` training and, optionally, the ``test`` command

    Registers the training and dev corpora as run resources, builds a
    ``TrainerMod`` from the arguments and runs it. When ``test_corpus`` is
    truthy it is registered as a resource too and the experiment's ``test``
    command is invoked with ``save_path``, ``test_corpus`` and ``device``.

    :param train_corpus: path of the training corpus
    :param dev_corpus: path of the development corpus
    :param char_int: forwarded to the trainer as ``char_integration_method``
    :param save_path: forwarded to the trainer and to the ``test`` command
    :param test_corpus: optional corpus; enables the follow-up ``test`` run
    :param dropout: forwarded to the trainer
    :param num_epochs: forwarded to the trainer
    :param lm_loss_scale: forwarded to the trainer
    :param device: forwarded to the trainer and to the ``test`` command
    :param save: forwarded to the trainer
    :param _run: run object providing ``add_resource``; must be supplied
    """
    for corpus in (train_corpus, dev_corpus):
        _run.add_resource(corpus)

    trainer_options = {
        'num_epochs': num_epochs,
        'dropout': dropout,
        'char_integration_method': char_int,
        'lm_loss_scale': lm_loss_scale,
        'save': save,
        'device': device,
    }
    TrainerMod(_run, train_corpus, save_path, dev_corpus,
               **trainer_options).run()

    # Without a test corpus there is nothing left to do.
    if not test_corpus:
        return

    _run.add_resource(test_corpus)
    test_config = {
        'save_path': save_path,
        'test_corpus': test_corpus,
        'device': device,
    }
    ex.run_command('test', config_updates=test_config)
Ejemplo n.º 4
0
def train_w_pretrained(train_corpus: str,
                       dev_corpus: str,
                       char_int: int,
                       pretrained_embeddings: str,
                       save_path: str,
                       test_corpus: str = None,
                       word_embedding_size: int = 300,
                       update_pretrained_embedding: bool = True,
                       dropout: float = 0.5,
                       num_epochs: int = 10,
                       lm_loss_scale=0.1,
                       device: int = 0,
                       save=False,
                       _run: Run = None):
    """
    running a ``TrainerMod`` training with pretrained embeddings

    Registers the training and dev corpora as run resources, builds a
    ``TrainerMod`` with ``NewSequenceLabeler`` as its model class and runs
    it. When ``test_corpus`` is truthy it is registered as a resource too and
    the experiment's ``test_w_pretrained`` command is invoked with
    ``save_path``, ``test_corpus`` and ``device``.

    :param train_corpus: path of the training corpus
    :param dev_corpus: path of the development corpus
    :param char_int: forwarded to the trainer as ``char_integration_method``
    :param pretrained_embeddings: forwarded to the trainer
    :param save_path: forwarded to the trainer and to the test command
    :param test_corpus: optional corpus; enables the follow-up test run
    :param word_embedding_size: forwarded to the trainer
    :param update_pretrained_embedding: forwarded to the trainer
    :param dropout: forwarded to the trainer
    :param num_epochs: forwarded to the trainer
    :param lm_loss_scale: forwarded to the trainer
    :param device: forwarded to the trainer and to the test command
    :param save: forwarded to the trainer
    :param _run: run object providing ``add_resource``; must be supplied
    """
    for corpus in (train_corpus, dev_corpus):
        _run.add_resource(corpus)

    trainer_options = {
        'word_embedding_size': word_embedding_size,
        'num_epochs': num_epochs,
        'dropout': dropout,
        'char_integration_method': char_int,
        'lm_loss_scale': lm_loss_scale,
        'save': save,
        'device': device,
        'pretrained_embeddings': pretrained_embeddings,
        'update_pretrained_embedding': update_pretrained_embedding,
        'model_class': NewSequenceLabeler,
    }
    TrainerMod(_run, train_corpus, save_path, dev_corpus,
               **trainer_options).run()

    # Without a test corpus there is nothing left to do.
    if not test_corpus:
        return

    _run.add_resource(test_corpus)
    test_config = {
        'save_path': save_path,
        'test_corpus': test_corpus,
        'device': device,
    }
    ex.run_command('test_w_pretrained', config_updates=test_config)
Ejemplo n.º 5
0
def train(train_corpus: str,
          dev_corpus: str,
          pacrf: str,
          model_filename: str,
          labels: List,
          c1: float = 0.0,
          c2: float = 1.0,
          algorithm: str = 'lbfgs',
          max_iterations: int = None,
          all_possible_transitions: bool = False,
          window_size: int = 0,
          _run: Run = None,
          _log: logger = None):
    """
    running partial-crf experiment through the external ``pacrf`` binary

    The training and dev corpora are written as feature files into temporary
    files with ``print_corpus``. ``pacrf learn`` is run on the training file,
    ``pacrf tag`` is run on the dev file with its standard output captured in
    a third temporary file, and the labels parsed from that file are scored
    against the dev labels with ``evaluate``. Overall and per-type scores are
    stored on the run.

    :param train_corpus: path of the training corpus
    :param dev_corpus: path of the development corpus used for scoring
    :param pacrf: path of the partial-crf executable
    :param model_filename: path the executable writes the model to; attached
        to the run as an artifact after training
    :param labels: forwarded to ``print_corpus``
    :param c1: passed to the learner as ``-p c1=<value>``
    :param c2: passed to the learner as ``-p c2=<value>``
    :param algorithm: passed to the learner as ``-a <value>``
    :param max_iterations: accepted for interface compatibility; currently
        not passed to the executable
    :param all_possible_transitions: accepted for interface compatibility;
        currently not passed to the executable
    :param window_size: forwarded to ``print_corpus``
    :param _run: run object providing ``add_resource``, ``add_artifact``,
        ``info`` and ``log_scalar``; must be supplied
    :param _log: logger used for the info messages; must be supplied
    :raises RuntimeError: if the learn or tag invocation exits non-zero
    """
    _run.add_resource(train_corpus)
    _run.add_resource(dev_corpus)
    train_sents, _ = get_tagged_sents_and_words(train_corpus)
    dev_sents, _ = get_tagged_sents_and_words(dev_corpus)

    # The context manager removes all three temp files even if a step fails.
    with tempfile.NamedTemporaryFile(mode='w+') as tmp_train, \
            tempfile.NamedTemporaryFile(mode='w+') as tmp_dev, \
            tempfile.NamedTemporaryFile(mode='w+') as tmp_pred:
        print_corpus(train_sents, labels, tmp_train, window_size=window_size)
        # The external process opens the file by name, so buffered output
        # has to reach the disk before it starts.
        tmp_train.flush()

        y_dev = [sent2labels_colmap(s, col=1) for s in dev_sents]

        print_corpus(dev_sents, labels, tmp_dev, window_size=window_size)
        tmp_dev.flush()

        # Training output goes straight to this process' stdout/stderr.
        learn_proc = Popen([pacrf, "learn", "-m", model_filename,
                            "-a", algorithm,
                            "-p", f"c1={c1}", "-p", f"c2={c2}",
                            tmp_train.name])
        learn_status = learn_proc.wait()
        if learn_status != 0:
            raise RuntimeError(
                f'{pacrf} learn exited with status {learn_status}')

        _run.add_artifact(model_filename)

        # NOTE(review): the invocation mirrors the previously commented-out
        # command (`tag -m <model> <dev> > <pred>`); it assumes the tagger
        # writes its predictions to standard output -- confirm against the
        # partial-crf command-line documentation.
        tag_proc = Popen([pacrf, "tag", "-m", model_filename, tmp_dev.name],
                         stdout=tmp_pred)
        tag_status = tag_proc.wait()
        if tag_status != 0:
            raise RuntimeError(
                f'{pacrf} tag exited with status {tag_status}')

        # The helper returns a pair; only the sentences carry the labels.
        pred_sents, _ = get_tagged_sents_and_words(tmp_pred.name)
        y_pred = [sent2labels_colmap(s, 0) for s in pred_sents]

    overall, by_type = evaluate(y_dev, y_pred)

    # Every score goes both to the run's info dict and to its scalar log.
    overall_scores = (
        ('overall_f1', overall.f1_score),
        ('overall_precision', overall.precision),
        ('overall_recall', overall.recall),
    )
    for name, value in overall_scores:
        _run.info[name] = value
        _run.log_scalar(name, value)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for key in sorted(by_type.keys()):
        scores = by_type[key]
        for metric_key in scores._fields:
            metric_val = getattr(scores, metric_key)
            _run.info[f'{key}-{metric_key}'] = metric_val
            _run.log_scalar(f'{key}-{metric_key}', metric_val)
            _log.info(f'{key}-{metric_key}: {metric_val}')