Beispiel #1
0
def test_filter_messages():
    """Check the sizes of both AskUbuntu message sets and the first non-training intent."""
    corpus_path = get_project_root() / 'data' / 'askubuntu' / 'askubuntu.tsv'

    training_messages = get_filtered_messages(corpus_path, training=True)
    assert len(training_messages) == 53

    held_out_messages = get_filtered_messages(corpus_path, training=False)
    assert len(held_out_messages) == 109

    first_intent = held_out_messages[0].data['intent']
    assert first_intent == 'Software Recommendation'
Beispiel #2
0
def test_train_evaluate():
    """Smoke-test train_evaluate using the debug hyperparameters.

    The output directory is redirected to tmp/test_my_classifier_train_eval
    under the project root.
    """
    reduce_output()
    validate_debug_params()

    base_hparams = get_debug_hparams()
    run_dir = get_project_root() / 'tmp' / 'test_my_classifier_train_eval'
    hparams = base_hparams._replace(output_dir=str(run_dir))

    # Only the estimator is needed here; the model function is discarded.
    _model_fn, estimator = get_model_fn_and_estimator(hparams)
    train_evaluate(hparams, estimator)
Beispiel #3
0
def test_main():
    """Run main for ten training steps with tmp/custom_estimator as model directory."""
    reduce_output()

    model_dir = get_project_root() / 'tmp' / 'custom_estimator'
    # Clean a directory left over from a previous run before training.
    if model_dir.is_dir():
        clean_folder(model_dir)

    model_dir_flag = '--model_dir={}'.format(model_dir)
    tf.logging.set_verbosity(tf.logging.WARN)

    # The leading None presumably stands in for the program name (argv[0]).
    main([None, '--train_steps=10', model_dir_flag])
Beispiel #4
0
def test_clean_folder():
    """Verify that clean_folder leaves the folder in place with no entries in it."""
    target = get_project_root() / 'tmp' / 'test_clean_folder'
    target_path = str(target)

    # Remove a leftover folder from an earlier run, otherwise create it.
    if not target.is_dir():
        tf.gfile.MakeDirs(target_path)
    else:
        shutil.rmtree(target_path)

    # Give clean_folder something to delete.
    tf.gfile.MakeDirs(str(target / 'some_dir'))

    clean_folder(target)

    remaining_entries = os.listdir(target_path)
    assert len(remaining_entries) == 0
Beispiel #5
0
def test_train():
    """Smoke-test train for two steps, then clean the output directory.

    The output directory is redirected to tmp/test_my_classifier_train
    under the project root.
    """
    reduce_output()
    validate_debug_params()

    base_hparams = get_debug_hparams()
    run_dir = get_project_root() / 'tmp' / 'test_my_classifier_train'
    hparams = base_hparams._replace(output_dir=str(run_dir))

    # Only the estimator is needed here; the model function is discarded.
    _model_fn, estimator = get_model_fn_and_estimator(hparams)
    train(hparams, estimator, max_steps=2)

    clean_folder(hparams.output_dir)
Beispiel #6
0
def classify_sentence():
    """Set the rc flags for a one-epoch train+eval run on TASK, run it and print the duration.

    The elapsed wall-clock time is printed in minutes, rounded to two
    decimals.
    """
    flag_values = (
        ('task_name', TASK),
        ('do_train', True),
        ('do_eval', True),
        ('data_dir', str(get_project_root() / 'data' / TASK)),
        ('vocab_file', str(BERT_BASE_DIR / 'vocab.txt')),
        ('bert_config_file', str(BERT_BASE_DIR / 'bert_config.json')),
        ('init_checkpoint', str(BERT_BASE_DIR / 'bert_model.ckpt')),
        ('max_seq_length', 128),
        # using too much memory at 32 batches
        ('train_batch_size', 16),
        ('learning_rate', 2e-5),
        ('num_train_epochs', 1),
        ('output_dir', str(get_project_root() / 'output' / '1_epochs')),
    )
    for flag_name, flag_value in flag_values:
        setattr(rc.FLAGS, flag_name, flag_value)

    started_at = time.time()
    rc.main('')
    elapsed_minutes = (time.time() - started_at) / 60

    print('Execution took {} minutes'.format(round(elapsed_minutes, 2)))
Beispiel #7
0
def get_debug_hparams() -> HParams:
    """Return HParams for a small, fast BERT run intended for debugging.

    The pretrained model files are looked up in ~/Downloads/<bert_model>;
    data is read from data/_chatbot and results go to tmp/_chatbot/debug,
    both below the project root.
    """
    from improv.utils import get_project_root

    task_name = '_chatbot'
    bert_model = 'uncased_L-12_H-768_A-12'
    pretrained_dir = Path.home() / 'Downloads' / bert_model

    project_root = get_project_root()
    # output_dir and local_dir point to the same debug folder.
    debug_dir = str(project_root / 'tmp' / task_name / 'debug')

    return HParams(
        # Task and file locations.
        data_dir=project_root / 'data' / task_name,
        bert_config_file=pretrained_dir / 'bert_config.json',
        task='ner_intent',
        task_name=task_name,
        vocab_file=pretrained_dir / 'vocab.txt',
        output_dir=debug_dir,
        local_dir=debug_dir,
        init_checkpoint=pretrained_dir / 'bert_model.ckpt',
        # Lower-casing follows the name of the pretrained model.
        do_lower_case=bert_model.startswith('uncased'),
        max_seq_length=128,
        # Which phases to run.
        do_train_eval=False,
        do_train=True,
        do_eval=True,
        do_predict=True,
        # Tiny batch and step counts keep the run short.
        train_batch_size=1,
        eval_batch_size=8,
        predict_batch_size=8,
        learning_rate=2e-5,
        num_train_steps=2,
        warmup_proportion=0.1,
        save_checkpoints_steps=1,
        save_summary_steps=1,
        iterations_per_loop=1,
        # TPU settings.
        use_tpu=False,
        tpu_name='',  # is used as tpu_address in Colab script
        tpu_zone=None,
        gcp_project=None,
        master=None,
        num_tpu_cores=8)
Beispiel #8
0
from improv.utils import get_project_root
from improv.read_ner import get_ner_lines, get_unique_labels, get_interesting_labels_indexes

# Directory with the chatbot NER data files read by the tests in this module.
data_dir = get_project_root() / 'data' / 'chatbot'


def test_get_ner_lines():
    """Check the number of entries read from test.txt and the two fields of the first one."""
    ner_lines = get_ner_lines(data_dir / 'test.txt')
    assert len(ner_lines) == 105

    first_line = ner_lines[0]
    assert first_line[0] == 'C O O O O B-StationDest'
    assert first_line[1] == 'FindConnection i want to go marienplatz'


def test_get_unique_labels():
    """Check that the label set of the chatbot data contains the expected entries."""
    labels = get_unique_labels(data_dir)

    # The order might change for each execution, so only membership is checked.
    entity_names = ('Criterion', 'Line', 'StationDest', 'StationStart',
                    'Vehicle')
    for entity_name in entity_names:
        # Some I-<label> might be missing, which could be caused by the
        # (small) dataset; therefore only the B-<label> form is required.
        assert 'B-{}'.format(entity_name) in labels

    for special_label in ('[CLS]', '[SEP]', 'X'):
        assert special_label in labels

    # changed by the intent examples, two intents occur
    assert len(labels) > 16

    # this is a sentence intent example
    assert 'DepartureTime' in labels


def test_get_interesting_labels_indexes():
Beispiel #9
0
def test_find_tf_events():
    """Check that find_tf_events locates the events file inside the tests folder."""
    tests_dir = get_project_root() / 'tests'
    events_file = find_tf_events(tests_dir)
    assert events_file.name == 'events.out.tfevents.000000'
Beispiel #10
0
def test_get_y_true():
    """Check the first value and the number of values returned for the non-training AskUbuntu data."""
    corpus_path = get_project_root() / 'data' / 'askubuntu' / 'askubuntu.tsv'
    true_intents = get_y_true(corpus_path, training=False)

    assert true_intents[0] == 'Software Recommendation'
    assert len(true_intents) == 109
Beispiel #11
0
from improv.utils import get_project_root
from improv.evaluate import parse_file, print_scores

# Results file used as input by the tests below. This filename should be
# updated when the output format changes (which will possibly happen).
filename = get_project_root() / 'runs' / '2018-12-20 chatbot' / 'results.txt'


def test_parse_file():
    """Smoke test: parse the results file and print every parsed record."""
    for record in parse_file(filename):
        print(record)


def test_evaluate():
    """Smoke test: print_scores must run on the results file without raising."""
    print_scores(filename)
Beispiel #12
0
def get_debug_filename() -> Path:
    """Return the AskUbuntu corpus filename joined onto the project root."""
    project_root = get_project_root()
    corpus_file = get_filename(Corpus.ASKUBUNTU)
    return project_root / corpus_file
Beispiel #13
0
def get_filename(corpus: Corpus) -> Path:
    """Return the .tsv data file of a Corpus.

    The path is <project root>/data/<name>/<name>.tsv, where <name> is the
    lower-cased corpus name. Keeping this in one place avoids re-defining
    the corpus location all over the code base.
    """
    from improv.utils import get_project_root

    corpus_name = corpus.name.lower()
    corpus_file = corpus_name + '.tsv'
    return get_project_root() / 'data' / corpus_name / corpus_file
Beispiel #14
0
    return round(f1_score(y_true, y_pred, average='weighted'), 3)


def is_ner_not_empty(ner_datas: Tuple[NERData, ...]) -> bool:
    """Return True if any true label, skipping each record's first one, is not 'O'.

    Args:
        ner_datas: Records whose ``true`` attribute is a sliceable sequence
            of label strings. Any number of records is accepted, including
            none.

    Returns:
        True as soon as one label other than 'O' is found after the first
        position of some record, False otherwise (also for no records).
    """
    # The first label of every record is skipped; judging by print_scores it
    # presumably holds the sentence intent rather than an entity tag.
    entity_labels = chain.from_iterable(
        ner_data.true[1:] for ner_data in ner_datas)
    return any(label != 'O' for label in entity_labels)


def print_scores(filename: Path):
    """Determine intent and NER accuracy (weighted f1 score) and print them.

    Args:
        filename: Results file handed to parse_file.

    The intent score is printed when the first token of the first record is
    'INTENT'; the entity score is printed when is_ner_not_empty reports at
    least one entity label. A file without any records only produces a short
    notice instead of failing with an IndexError.
    """
    ner_datas = tuple(parse_file(filename))

    # Without records there is no first entry to inspect and nothing to score.
    if not ner_datas:
        print('no records found in {}'.format(filename))
        return

    if ner_datas[0].text[0] == 'INTENT':
        y_true, y_pred = get_intents(ner_datas)
        print('intents weighted f1: {}'.format(rounded_f1(y_true, y_pred)))

    if is_ner_not_empty(ner_datas):
        y_true, y_pred = get_entities(ner_datas)
        print('entities weighted f1: {}'.format(rounded_f1(y_true, y_pred)))


if __name__ == '__main__':
    # Script entry point: print the scores of the joint chatbot results file.
    run_dir = get_project_root() / 'runs' / '2018-12-20 chatbot'
    fn = run_dir / '2018-12-20 chatbot joint.txt'
    print_scores(fn)