Example #1
import os

from src.logger import create_logger
from src.data.dictionary import Dictionary


def preprocess(voc_path, txt_path):

    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    logger = create_logger(None, 0)

    bin_path = txt_path + ".pth"

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['sentences']) - len(data['positions']),
                 len(data['dico']), len(data['positions'])))
    if len(data['unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (sum(data['unk_words'].values()), len(
                data['unk_words']), sum(data['unk_words'].values()) * 100. /
             (len(data['sentences']) - len(data['positions']))))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(),
                               key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
Example #2
def main(args):
    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"

    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)

    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)

    table_data = Dictionary.index_table(args.table, args.table_label,
                                        table_dico, args.table + ".pth")
Example #3
def main(args):
    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"

    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)

    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary,
                                            args.summary_label,
                                            summary_dico,
                                            args.summary + ".pth",
                                            max_len=args.summary_max_length)
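These main() functions are normally driven by argparse (see the later examples), but a direct call with an argparse.Namespace behaves the same way. A sketch with hypothetical paths, relying on the "_vocab"/"_label" defaulting shown above:

from argparse import Namespace

# Only the summary path is given; summary_vocab and summary_label fall back to
# "<summary>_vocab" and "<summary>_label", and the output goes to "<summary>.pth".
main(Namespace(summary="data/train_summary",
               summary_vocab=None,
               summary_label=None,
               summary_max_length=600))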
Example #4

def print_args(args):
    print("table:\t{}".format(args.table))
    print("table_label:\t{}".format(args.table_label))
    print("table_vocab:\t{}".format(args.table_vocab))


if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--table', help="table data")
    parser.add_argument('--table_label', help="table label")
    parser.add_argument('--table_vocab', help="table vocab")
    args = parser.parse_args()

    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"

    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)

    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label,
                                        table_dico, args.table + ".pth")
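A hypothetical command line for this script (the script name and path are placeholders, not from the source); omitting --table_label and --table_vocab makes the script look for the "_label" and "_vocab" files next to the table file:

# python preprocess_table.py --table data/train_table
#   reads:  data/train_table, data/train_table_label, data/train_table_vocab
#   writes: data/train_table.pth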
Example #5
import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

if __name__ == "__main__":

    logger = create_logger(None, 0)

    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))
    if len(data["unk_words"]) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." % (
                sum(data["unk_words"].values()),
                len(data["unk_words"]),
                sum(data["unk_words"].values()) * 100.0 /
                (len(data["sentences"]) - len(data["positions"])),
Example #6
    # src_txt_path = 'data/all.zh.bpe'
    # tgt_voc_path = 'data/vocab.en'
    # tgt_txt_path = 'data/all.en.bpe'

    # bin_path = 'data/cwmt.bin'
    src_voc_path = sys.argv[3]
    src_txt_path = sys.argv[1]
    tgt_voc_path = sys.argv[4]
    tgt_txt_path = sys.argv[2]
    bin_path = sys.argv[5]
    assert os.path.isfile(src_voc_path)
    assert os.path.isfile(src_txt_path)
    assert os.path.isfile(tgt_voc_path)
    assert os.path.isfile(tgt_txt_path)

    src_dico = Dictionary.read_vocab(src_voc_path)
    tgt_dico = Dictionary.read_vocab(tgt_voc_path)

    data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico,
                                 tgt_dico, bin_path)
    if data is None:
        exit(0)
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['src_sentences']) - len(data['src_positions']),
                 len(data['src_dico']), len(data['src_positions'])))
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['tgt_sentences']) - len(data['tgt_positions']),
                 len(data['tgt_dico']), len(data['tgt_positions'])))
    if len(data['src_unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (sum(data['src_unk_words'].values()),
             len(data['src_unk_words']),
             sum(data['src_unk_words'].values()) * 100. /
             (len(data['src_sentences']) - len(data['src_positions']))))
Example #7
if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--summary', help="summary data")
    parser.add_argument('--summary_vocab', help="summary data vocab")
    parser.add_argument('--summary_label', help="summary data label")
    parser.add_argument('--summary_max_length',
                        type=int,
                        default=600,
                        help="summmary maximum length")
    args = parser.parse_args()

    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"

    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)

    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary,
                                            args.summary_label,
                                            summary_dico,
                                            args.summary + ".pth",
                                            max_len=args.summary_max_length)
Example #8
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + '.pth'
    if len(sys.argv) > 3:
        special_tokens = sys.argv[3].split(",")
    else:
        special_tokens = []

    if True:
        # bias corpus
        special_tokens.extend(["<url>", "<email>", "<phone>", "<number>", "<digit>", "<cur>"])

    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path, special_tokens)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(),
                               key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
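The optional third command-line argument supplies extra special tokens as a comma-separated list; a sketch of the equivalent direct calls (paths and token names are placeholders):

# What the script does when invoked as, e.g.:
#   python preprocess.py vocab.bpe train.bpe "<mask>,<sep>"
special_tokens = "<mask>,<sep>".split(",")
special_tokens.extend(["<url>", "<email>", "<phone>", "<number>", "<digit>", "<cur>"])
dico = Dictionary.read_vocab("vocab.bpe", special_tokens)
data = Dictionary.index_data("train.bpe", "train.bpe.pth", dico)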
Example #9
                    default=4,
                    help="beam search size")
parser.add_argument("--length_penalty",
                    type=float,
                    default=1.0,
                    help="length penalty")
parser.add_argument("--clip_grad_norm",
                    type=float,
                    default=5.0,
                    help="clip grad norm")
parser.add_argument("--checkpoint_dir", type=str, default='all_models')
params = parser.parse_args()
params.gpu_num = 1
params.seed = 1234
params.reload_model = 'all_models/pe2zh_model_epoch4_update210000.pt'
params.src_dico = Dictionary.read_vocab('data_pe2zh/vocab.pe')
params.tgt_dico = Dictionary.read_vocab('data_pe2zh/vocab.zh')
params.eos_index = params.src_dico.index(EOS_WORD)  # 1
params.pad_index = params.src_dico.index(PAD_WORD)  # 2
params.unk_index = params.src_dico.index(UNK_WORD)  # 3
params.bos_index = params.src_dico.index(BOS_WORD)  # 0
params.src_n_words = len(params.src_dico)
params.tgt_n_words = len(params.tgt_dico)
encoder, decoder, _ = build_mt_model(params)
encoder.eval()
decoder.eval()


def preprocess(s, iszh=False, table=None):
    # Moses tool chain:
    # normalize punctuation, tokenize, ...
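The body of preprocess() is cut off here; its comments point to a Moses-style pipeline (punctuation normalization, then tokenization). A minimal sketch of that idea, assuming the sacremoses and jieba packages, neither of which is confirmed by the source; the iszh flag switches to Chinese word segmentation and the table argument is left unused:

from sacremoses import MosesPunctNormalizer, MosesTokenizer
import jieba

_normalizer = MosesPunctNormalizer(lang="en")
_tokenizer = MosesTokenizer(lang="en")


def preprocess_sketch(s, iszh=False, table=None):
    # Hypothetical stand-in for the truncated preprocess() above.
    s = _normalizer.normalize(s)
    if iszh:
        # Chinese has no whitespace word boundaries; segment with jieba instead.
        return " ".join(jieba.cut(s))
    return _tokenizer.tokenize(s, return_str=True)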