Example 1
def main(ckpt_path, tokenizer_path, text_path, criterion, max_len, gpu='0'):
    """

    :param ckpt_path:
    :param tokenizer_path:
    :param lexsub_path:
    :param cand_path:
    :param output_dir:
    :param metric:
    :param gpu:
    :return:
    """
    import dgm4nlp.device as dev
    config = dev.tf_config(gpu, allow_growth=True)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    sent_embedding(
        ckpt_path=ckpt_path,
        graph_file='%s.meta' % (ckpt_path),
        tokenizer_path=tokenizer_path,
        text_path=text_path,
        criterion=criterion,
        max_len=max_len,
        config=config,
    )
    return
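
A minimal usage sketch for the entry point above. The checkpoint, tokenizer and text paths, the criterion value and max_len are hypothetical placeholders, not taken from the source.

if __name__ == '__main__':
    # Hypothetical paths and settings; adjust to a model actually trained with dgm4nlp.
    main(
        ckpt_path='models/embedalign/model.best',
        tokenizer_path='models/embedalign/tokenizer.pickle',
        text_path='data/sentences.en',
        criterion='mean',  # assumed criterion name
        max_len=50,
        gpu='0',
    )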
Example 2

def main(ckpt_path,
         tokenizer_path,
         lexsub_path,
         cand_path,
         output_dir,
         metric='kl',
         gpu='0'):
    """

    :param ckpt_path:
    :param tokenizer_path:
    :param lexsub_path:
    :param cand_path:
    :param output_dir:
    :param metric:
    :param gpu:
    :return:
    """
    import dgm4nlp.device as dev
    config = dev.tf_config(gpu, allow_growth=True)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    os.makedirs(output_dir, exist_ok=True)  # make sure output_dir exists
    #output_dir = tempfile.mkdtemp(dir=output_dir)
    logging.info('Workspace: %s', output_dir)
    #shutil.copy(os.path.abspath(__file__), output_dir)
    if metric == 'kl':
        _ = lex_subs_KL(
            ckpt_path=ckpt_path,
            graph_file='%s.meta' % (ckpt_path),
            tokenizer_path=tokenizer_path,
            lexsub_path=lexsub_path,
            cand_path=cand_path,
            output_dir=output_dir,
            config=config,
        )
    elif metric == 'cos':
        _ = lex_subs(
            ckpt_path=ckpt_path,
            graph_file='%s.meta' % (ckpt_path),
            tokenizer_path=tokenizer_path,
            lexsub_path=lexsub_path,
            cand_path=cand_path,
            output_dir=output_dir,
            heuristic='cos',  # add, mult, cos
            config=config,
        )
    return
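
A hypothetical invocation of this entry point. The file paths are placeholders; metric selects between the KL-based scorer ('kl') and the cosine heuristic ('cos') handled above.

if __name__ == '__main__':
    # Placeholder paths; metric must be 'kl' or 'cos', any other value is silently ignored above.
    main(
        ckpt_path='models/embedalign/model.best',
        tokenizer_path='models/embedalign/tokenizer.pickle',
        lexsub_path='data/lexsub/sentences.txt',
        cand_path='data/lexsub/candidates.txt',
        output_dir='results/lexsub',
        metric='kl',
        gpu='0',
    )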
Example 3
def main(ckpt_path,
         tokenizer_path,
         test_path,
         output_dir,
         batch_size=100,
         dz=100,
         gpu='0'):
    """

    :param ckpt_path:
    :param tokenizer_path:
    :param test_path:
    :param output_dir:
    :param batch_size:
    :param dz:
    :param gpu:
    :return:
    """
    import dgm4nlp.device as dev
    config = dev.tf_config(gpu, allow_growth=True)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    os.makedirs(output_dir, exist_ok=True)  # make sure output_dir exists
    #output_dir = tempfile.mkdtemp(dir=output_dir)
    logging.info('Workspace: %s', output_dir)
    #shutil.copy(os.path.abspath(__file__), output_dir)
    extract_vec_dict(
        ckpt_path=ckpt_path,
        graph_file='%s.meta' % (ckpt_path),
        tokenizer_path=tokenizer_path,
        test_x=test_path,  # e.g. '/mnt/data/mrios/dgm4nlp/test_data/1k.e'
        batch_size=batch_size,
        output_dir=output_dir,
        output_type='word2vec',
        dz=dz,
        config=config)
    return
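
A usage sketch with placeholder paths. The dz argument should match the latent dimensionality the checkpoint was trained with; 100 is only the default shown above.

if __name__ == '__main__':
    # Placeholder paths; the vectors are written to output_dir in word2vec format.
    main(
        ckpt_path='models/embedalign/model.best',
        tokenizer_path='models/embedalign/tokenizer.pickle',
        test_path='data/sentences.en',
        output_dir='results/vectors',
        batch_size=100,
        dz=100,
        gpu='0',
    )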
Example 4
def main(ckpt_path,
         tokenizer_path,
         test_x,
         test_y,
         test_naacl,
         gpu='0',
         batch_size=100):
    """
    :param ckpt_path: path to checkpoint saved by trainer (without suffix .meta)
    :param tokenizer_path: path to tokenizer saved by trainer
    :param test_x: a path or a list of paths (for multiple test sets)
    :param test_y: a path or a list of paths (for multiple test sets)
    :param test_naacl: a path or a list of paths (for multiple test sets)
    :param gpu: which GPU to use (as a string)
    :return:
    """
    import dgm4nlp.device as dev
    config = dev.tf_config(gpu, allow_growth=True)
    tks = dill.load(open(tokenizer_path, 'rb'))
    if isinstance(test_x, str):  # a single test set may be given as plain paths
        test_x = [test_x]
        test_y = [test_y]
        test_naacl = [test_naacl]
    decoder = EmbedAlignDecoder(ckpt_path=ckpt_path,
                                config=config,
                                vx=tks[0].vocab_size(),
                                vy=tks[1].vocab_size())
    header = ['dataset', 'loss', 'perp_x', 'perp_y', 'acc_x', 'acc_y', 'AER']
    rows = []
    for x, y, a in zip(test_x, test_y, test_naacl):
        test, test_wa = prepare_test(tks, x, y, a)
        aer, acc_x, acc_y, loss, perp_x, perp_y = decoder.evaluate(
            test, iter(test_wa), batch_size=batch_size)
        rows.append(
            [os.path.basename(a), loss, perp_x, perp_y, acc_x, acc_y, aer])
    print(tabulate(rows, header))
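
A hedged usage sketch: evaluating one checkpoint on two hypothetical test sets at once, relying on the list handling above. All paths are placeholders.

if __name__ == '__main__':
    # Each position pairs an x-side file, a y-side file and an alignment file in NAACL format.
    main(
        ckpt_path='models/embedalign/model.best',
        tokenizer_path='models/embedalign/tokenizer.pickle',
        test_x=['data/dev.en', 'data/test.en'],
        test_y=['data/dev.fr', 'data/test.fr'],
        test_naacl=['data/dev.naacl', 'data/test.naacl'],
        gpu='0',
        batch_size=100,
    )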
"""
:Authors: - Wilker Aziz
"""
import sys

if len(sys.argv) != 7:
    raise ValueError('Usage: %s data-dir x-lang y-lang architecture working-dir gpu' % sys.argv[0])

data_dir, x_lang, y_lang, architecture, output_dir, gpu = sys.argv[1:]


def make_file_name(stem, suffix, basedir=''):
    return '%s/%s.%s' % (basedir, stem, suffix) if basedir else '%s.%s' % (stem, suffix)
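
Illustrative only: what make_file_name produces for hypothetical arguments.

assert make_file_name('training', 'en', 'data/hansards') == 'data/hansards/training.en'
assert make_file_name('dev', 'naacl') == 'dev.naacl'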

import dgm4nlp.device as dev
config = dev.tf_config(gpu, allow_growth=True)

import os
import logging
from dgm4nlp.tf.embedalign.trainer import test_embedalign
from dgm4nlp.tf.embedalign.architectures import get_hparams

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

training_x = make_file_name('training', x_lang, data_dir)
training_y = make_file_name('training', y_lang, data_dir)
#val_x = make_file_name('dev', x_lang, data_dir)
#val_y = make_file_name('dev', y_lang, data_dir)
#val_naacl = make_file_name('dev', 'naacl', data_dir)
#test_x = make_file_name('test', x_lang, data_dir)
#test_y = make_file_name('test', y_lang, data_dir)
Example 6
"""
:Authors: - Wilker Aziz
"""
import dgm4nlp.device as dev
config = dev.tf_config('0', allow_growth=True)

import os
import numpy as np
import logging
import tempfile
import shutil
from datetime import datetime
from dgm4nlp.annealing import AnnealingSchedule
from embedalign.trainer import test_embedalign
from embedalign.infnet import get_infnet

import tensorflow as tf

np.random.seed(42)

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# 1. Get a unique working directory and save script for reproducibility
base_dir = 'debug/hansards'
os.makedirs(base_dir, exist_ok=True)  # make sure base_dir exists
output_dir = tempfile.mkdtemp(prefix=datetime.now().strftime("%y-%m-%d.%Hh%Mm%Ss."), dir=base_dir)
logging.info('Workspace: %s', output_dir)
shutil.copy(os.path.abspath(__file__), output_dir)


test_embedalign(