Ejemplo n.º 1
0
def preprocess(voc_path, txt_path):
    """Index a text file against a vocabulary and log summary statistics.

    Both paths must point at existing files. The vocabulary is read with
    ``Dictionary.read_vocab`` and handed, together with ``txt_path`` and
    ``txt_path + ".pth"``, to ``Dictionary.index_data``. The ``.pth`` path
    is presumably where the indexed data gets written -- confirm against
    ``Dictionary.index_data``.

    Logged afterwards:
      * the word count (``len(sentences) - len(positions)``), the size of
        ``data['dico']`` and the number of sentences;
      * when unknown words exist, their total count, how many distinct ones
        there are, and their share of the word count;
      * when fewer than 30 distinct unknown words exist, each one with its
        count, highest count first.

    Args:
        voc_path: Path of the vocabulary file.
        txt_path: Path of the text file to index.

    Raises:
        AssertionError: If either path is not an existing file.
    """
    for required_file in (voc_path, txt_path):
        assert os.path.isfile(required_file)

    log = create_logger(None, 0)

    vocabulary = Dictionary.read_vocab(voc_path)
    log.info("")

    data = Dictionary.index_data(txt_path, txt_path + ".pth", vocabulary)

    # Both totals are reused below, so name them once.
    sentence_total = len(data['positions'])
    word_total = len(data['sentences']) - sentence_total
    log.info("%i words (%i unique) in %i sentences." %
             (word_total, len(data['dico']), sentence_total))

    unknown = data['unk_words']
    if len(unknown) == 0:
        return

    unknown_total = sum(unknown.values())
    log.info("%i unknown words (%i unique), covering %.2f%% of the data." %
             (unknown_total, len(unknown), unknown_total * 100. / word_total))

    # Only spell out the individual words when the list is short.
    if len(unknown) >= 30:
        return

    # Ascending sort followed by a reversal keeps the original tie order.
    ascending = sorted(unknown.items(), key=lambda entry: entry[1])
    for word, count in reversed(ascending):
        log.info("%s: %i" % (word, count))
Ejemplo n.º 2
0
def create_binary(txt_path, bin_path, dico):
	"""Index ``txt_path`` with ``dico`` and log statistics about the result.

	The three arguments are forwarded unchanged to ``Dictionary.index_data``.
	``bin_path`` is presumably the output location of the indexed data --
	confirm against ``Dictionary.index_data``.

	Logged through the module-level ``logger``:
	  * the word count (``len(sentences) - len(positions)``), the size of
	    ``data['dico']`` and the number of sentences;
	  * either "0 unknown word." or the total / distinct unknown-word counts
	    and their share of the word count;
	  * when fewer than 30 distinct unknown words exist, each one with its
	    count, highest count first.

	Args:
		txt_path: Path of the text file to index.
		bin_path: Path passed to ``Dictionary.index_data`` as its second argument.
		dico: Vocabulary object passed to ``Dictionary.index_data``.
	"""
	data = Dictionary.index_data(txt_path, bin_path, dico)

	# Both totals are reused below, so name them once.
	sentence_total = len(data['positions'])
	word_total = len(data['sentences']) - sentence_total
	logger.info("%i words (%i unique) in %i sentences." % (
		word_total, len(data['dico']), sentence_total
	))

	unknown = data['unk_words']
	if len(unknown) == 0:
		logger.info("0 unknown word.")
		return

	unknown_total = sum(unknown.values())
	logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
		unknown_total, len(unknown), unknown_total * 100. / word_total
	))

	# Only spell out the individual words when the list is short.
	if len(unknown) < 30:
		# Ascending sort followed by an in-place reversal keeps the original tie order.
		ranked = sorted(unknown.items(), key=lambda entry: entry[1])
		ranked.reverse()
		for word, count in ranked:
			logger.info("%s: %i" % (word, count))
Ejemplo n.º 3
0
from src.data.dictionary import Dictionary

if __name__ == "__main__":

    logger = create_logger(None, 0)

    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))
    if len(data["unk_words"]) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." % (
                sum(data["unk_words"].values()),
                len(data["unk_words"]),
                sum(data["unk_words"].values()) * 100.0 /
                (len(data["sentences"]) - len(data["positions"])),
            ))
        if len(data["unk_words"]) < 30:
            for w, c in sorted(data["unk_words"].items(),
Ejemplo n.º 4
0
    # bin_path = 'data/cwmt.bin'
    src_voc_path = sys.argv[3]
    src_txt_path = sys.argv[1]
    tgt_voc_path = sys.argv[4]
    tgt_txt_path = sys.argv[2]
    bin_path = sys.argv[5]
    assert os.path.isfile(src_voc_path)
    assert os.path.isfile(src_txt_path)
    assert os.path.isfile(tgt_voc_path)
    assert os.path.isfile(tgt_txt_path)

    src_dico = Dictionary.read_vocab(src_voc_path)
    tgt_dico = Dictionary.read_vocab(tgt_voc_path)

    data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico,
                                 tgt_dico, bin_path)
    if data is None:
        exit(0)
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['src_sentences']) - len(data['src_positions']),
                 len(data['src_dico']), len(data['src_positions'])))
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['tgt_sentences']) - len(data['tgt_positions']),
                 len(data['tgt_dico']), len(data['tgt_positions'])))
    if len(data['src_unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (sum(data['src_unk_words'].values()), len(data['src_unk_words']),
             sum(data['src_unk_words'].values()) * 100. /
             (len(data['src_sentences']) - len(data['src_positions']))))
        if len(data['src_unk_words']) < 30:
    attr_cols.insert(0, review_col)
    logger.info(attr_list)
    logger.info(attr_cols)
    assert attr_list == sorted(attr_list)

    # read vocabulary
    dico = Dictionary.read_vocab(voc_path)

    # read attribute labels
    attr_values = read_attr_values(lbl_path)
    print(sorted(attr_values.keys()), attr_list)
    assert sorted(attr_values.keys()) == attr_list

    # index and export data
    attr_cols = [int(x) for x in attr_cols]
    data = Dictionary.index_data(txt_path, bin_path, dico, attr_list, attr_cols, attr_values)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))

    # print unknown words
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
Ejemplo n.º 6
0
    # Optional flag: when present anywhere after the program name, switch
    # allow_masked on and strip the flag from argv so the positional
    # arguments read below stay at indices 1 and 2.
    # NOTE(review): allow_masked must already be bound before this point
    # (presumably to False); its default is not visible here -- confirm.
    if '--allow-masked' in sys.argv[1:]:
        allow_masked = True
        sys.argv.remove('--allow-masked')

    # Positional arguments: vocabulary path first, text path second. The
    # third path handed to index_data is the text path with ".pth" appended.
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + '.pth'
    # Both input files must exist (asserts are skipped under `python -O`).
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    # Read the vocabulary, then log an empty message.
    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    # Index the text file; allow_masked is forwarded as `allow_special`.
    data = Dictionary.index_data(txt_path,
                                 bin_path,
                                 dico,
                                 allow_special=allow_masked)
    # The "words" figure is len(sentences) minus len(positions).
    # NOTE(review): presumably `sentences` is a flat index sequence holding
    # one extra entry per sentence -- confirm in Dictionary.index_data.
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['sentences']) - len(data['positions']),
                 len(data['dico']), len(data['positions'])))
    # Unknown-word summary: sum of the recorded counts, number of distinct
    # entries, and that sum as a percentage of the word figure above.
    if len(data['unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (sum(data['unk_words'].values()), len(
                data['unk_words']), sum(data['unk_words'].values()) * 100. /
             (len(data['sentences']) - len(data['positions']))))
        # With fewer than 30 distinct unknown words, list each with its
        # count: sorted ascending by count, then reversed (highest first).
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(),
                               key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))