def preprocess(voc_path, txt_path):
    """Binarize a monolingual text file with an existing vocabulary.

    Reads the vocabulary at `voc_path`, indexes the corpus at `txt_path`,
    and exports the binarized data to `txt_path + ".pth"`. Logs corpus
    statistics and (when few enough) the individual unknown words.

    Args:
        voc_path: path to an existing vocabulary file.
        txt_path: path to an existing tokenized text file.

    Raises:
        AssertionError: if either input path is not an existing file.
    """
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)
    logger = create_logger(None, 0)
    bin_path = txt_path + ".pth"
    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)

    # Word count: total stored tokens minus one entry per sentence
    # (presumably a per-sentence delimiter — confirm in Dictionary.index_data).
    # Hoisted once instead of recomputing it in every log statement.
    n_words = len(data['sentences']) - len(data['positions'])
    n_unk = sum(data['unk_words'].values())

    logger.info("%i words (%i unique) in %i sentences." % (
        n_words, len(data['dico']), len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            n_unk, len(data['unk_words']), n_unk * 100. / n_words
        ))
        if len(data['unk_words']) < 30:
            # Most frequent unknown words first (log-ordering only).
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1], reverse=True):
                logger.info("%s: %i" % (w, c))
    else:
        # Consistent with create_binary(), which reports the zero-unknown case.
        logger.info("0 unknown word.")
def create_binary(txt_path, bin_path, dico):
    """Binarize a text file with an already-loaded dictionary.

    Indexes the corpus at `txt_path` with `dico`, exports the result to
    `bin_path`, and logs corpus / unknown-word statistics.

    Args:
        txt_path: path to an existing tokenized text file.
        bin_path: output path for the binarized data.
        dico: a Dictionary instance (e.g. from Dictionary.read_vocab).
    """
    # Bug fix: the original relied on a module-global `logger` that is only
    # bound inside the `__main__` section, so calling this function from an
    # importing module raised NameError. Create a local logger, the same way
    # preprocess() does.
    logger = create_logger(None, 0)
    data = Dictionary.index_data(txt_path, bin_path, dico)

    # Word count: total stored tokens minus one entry per sentence
    # (presumably a per-sentence delimiter — confirm in Dictionary.index_data).
    n_words = len(data['sentences']) - len(data['positions'])
    n_unk = sum(data['unk_words'].values())

    logger.info("%i words (%i unique) in %i sentences." % (
        n_words, len(data['dico']), len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            n_unk, len(data['unk_words']), n_unk * 100. / n_words
        ))
        if len(data['unk_words']) < 30:
            # Most frequent unknown words first (log-ordering only).
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1], reverse=True):
                logger.info("%s: %i" % (w, c))
    else:
        logger.info("0 unknown word.")
from src.data.dictionary import Dictionary

if __name__ == "__main__":
    # CLI entry point: binarize a monolingual corpus.
    # Usage: <script> VOCAB_PATH TXT_PATH  (output written to TXT_PATH + ".pth")
    logger = create_logger(None, 0)
    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    # Load the vocabulary, then index and export the corpus.
    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)

    # Word count = sentences minus positions — presumably one delimiter token
    # per sentence is excluded; confirm against Dictionary.index_data.
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))

    # Report out-of-vocabulary statistics.
    if len(data["unk_words"]) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." % (
                sum(data["unk_words"].values()),
                len(data["unk_words"]),
                sum(data["unk_words"].values()) * 100.0
                / (len(data["sentences"]) - len(data["positions"])),
            ))
        if len(data["unk_words"]) < 30:
            for w, c in sorted(data["unk_words"].items(),
            # NOTE(review): the visible source is cut off mid-statement here —
            # the sorted(...) call and the loop body continue beyond this chunk.
# bin_path = 'data/cwmt.bin' src_voc_path = sys.argv[3] src_txt_path = sys.argv[1] tgt_voc_path = sys.argv[4] tgt_txt_path = sys.argv[2] bin_path = sys.argv[5] assert os.path.isfile(src_voc_path) assert os.path.isfile(src_txt_path) assert os.path.isfile(tgt_voc_path) assert os.path.isfile(tgt_txt_path) src_dico = Dictionary.read_vocab(src_voc_path) tgt_dico = Dictionary.read_vocab(tgt_voc_path) data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico, tgt_dico, bin_path) if data is None: exit(0) logger.info("%i words (%i unique) in %i sentences." % (len(data['src_sentences']) - len(data['src_positions']), len(data['src_dico']), len(data['src_positions']))) logger.info("%i words (%i unique) in %i sentences." % (len(data['tgt_sentences']) - len(data['tgt_positions']), len(data['tgt_dico']), len(data['tgt_positions']))) if len(data['src_unk_words']) > 0: logger.info( "%i unknown words (%i unique), covering %.2f%% of the data." % (sum(data['src_unk_words'].values()), len(data['src_unk_words']), sum(data['src_unk_words'].values()) * 100. / (len(data['src_sentences']) - len(data['src_positions'])))) if len(data['src_unk_words']) < 30:
# NOTE(review): this chunk starts mid-script — `attr_cols`, `review_col`,
# `attr_list`, `voc_path`, `lbl_path`, `txt_path`, `bin_path`, `logger`, and
# `read_attr_values` are all defined above the visible region.
attr_cols.insert(0, review_col)
logger.info(attr_list)
logger.info(attr_cols)
# Attribute names are expected pre-sorted so columns and labels line up.
assert attr_list == sorted(attr_list)

# read vocabulary
dico = Dictionary.read_vocab(voc_path)

# read attribute labels
attr_values = read_attr_values(lbl_path)
print(sorted(attr_values.keys()), attr_list)
# Every labeled attribute must be one we were asked to index, and vice versa.
assert sorted(attr_values.keys()) == attr_list

# index and export data
attr_cols = [int(x) for x in attr_cols]
data = Dictionary.index_data(txt_path, bin_path, dico, attr_list, attr_cols, attr_values)

# Word count = sentences minus positions — presumably one delimiter token per
# sentence is excluded; confirm against Dictionary.index_data.
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['sentences']) - len(data['positions']),
    len(data['dico']),
    len(data['positions'])
))

# print unknown words
if len(data['unk_words']) > 0:
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        sum(data['unk_words'].values()),
        len(data['unk_words']),
        sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
    ))
    if len(data['unk_words']) < 30:
        # Most frequent unknown words first.
        for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
        # NOTE(review): the visible source is cut off here — the loop body
        # continues beyond this chunk.
# NOTE(review): this chunk starts mid-script — `allow_masked` is presumably
# initialized to False, and `logger` is created, above the visible region.
# Optional flag: --allow-masked is stripped from argv before positional parsing.
if '--allow-masked' in sys.argv[1:]:
    allow_masked = True
    sys.argv.remove('--allow-masked')

# CLI shape: <script> [--allow-masked] VOCAB_PATH TXT_PATH
# (output written to TXT_PATH + '.pth')
voc_path = sys.argv[1]
txt_path = sys.argv[2]
bin_path = sys.argv[2] + '.pth'
assert os.path.isfile(voc_path)
assert os.path.isfile(txt_path)

# Load the vocabulary, then index and export the corpus; allow_special
# presumably permits special/masked tokens in the input — confirm in
# Dictionary.index_data.
dico = Dictionary.read_vocab(voc_path)
logger.info("")
data = Dictionary.index_data(txt_path, bin_path, dico, allow_special=allow_masked)

# Word count = sentences minus positions — presumably one delimiter token per
# sentence is excluded; confirm against Dictionary.index_data.
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['sentences']) - len(data['positions']),
    len(data['dico']),
    len(data['positions'])))

# Report out-of-vocabulary statistics.
if len(data['unk_words']) > 0:
    logger.info(
        "%i unknown words (%i unique), covering %.2f%% of the data."
        % (sum(data['unk_words'].values()),
           len(data['unk_words']),
           sum(data['unk_words'].values()) * 100.
           / (len(data['sentences']) - len(data['positions']))))
    if len(data['unk_words']) < 30:
        # Most frequent unknown words first (log-ordering only).
        for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
            logger.info("%s: %i" % (w, c))