import os

from src.logger import create_logger
from src.data.dictionary import Dictionary


def preprocess(voc_path, txt_path):
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)
    logger = create_logger(None, 0)
    bin_path = txt_path + ".pth"

    # Index the corpus with the vocabulary and serialize it next to the text file.
    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data['sentences']) - len(data['positions']),
        len(data['dico']),
        len(data['positions'])
    ))
    if len(data['unk_words']) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data['unk_words'].values()),
            len(data['unk_words']),
            sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
        ))
        if len(data['unk_words']) < 30:
            # Log the unknown words, most frequent first.
            for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
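# Minimal usage sketch; the file names below are placeholders, not paths from
# this repository. The indexed corpus is written next to the text file, here
# as "data/train.en.pth".
if __name__ == "__main__":
    preprocess("data/vocab.en", "data/train.en")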
def main(args):
    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"
    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)
    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label, table_dico, args.table + ".pth")
def main(args):
    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"
    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)
    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary, args.summary_label, summary_dico,
                                            args.summary + ".pth", max_len=args.summary_max_length)
import os
import argparse

from src.data.dictionary import Dictionary


def print_args(args):
    print("table:\t{}".format(args.table))
    print("table_label:\t{}".format(args.table_label))
    print("table_vocab:\t{}".format(args.table_vocab))


if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--table', help="table data")
    parser.add_argument('--table_label', help="table label")
    parser.add_argument('--table_vocab', help="table vocab")
    args = parser.parse_args()

    # Default the label / vocab paths to "<table>_label" and "<table>_vocab".
    if args.table_label is None:
        args.table_label = args.table + "_label"
    if args.table_vocab is None:
        args.table_vocab = args.table + "_vocab"
    assert os.path.isfile(args.table)
    assert os.path.isfile(args.table_label)
    assert os.path.isfile(args.table_vocab)
    print_args(args)

    table_dico = Dictionary.read_vocab(args.table_vocab)
    table_data = Dictionary.index_table(args.table, args.table_label, table_dico, args.table + ".pth")
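# Example invocation (the script and data file names are placeholders;
# --table_label and --table_vocab default to "<table>_label" and
# "<table>_vocab" when omitted):
#
#   python preprocess_table.py --table data/train_table.txt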
import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary


if __name__ == "__main__":
    logger = create_logger(None, 0)

    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")
    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
    ))
    if len(data["unk_words"]) > 0:
        logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
            sum(data["unk_words"].values()),
            len(data["unk_words"]),
            sum(data["unk_words"].values()) * 100.0 / (len(data["sentences"]) - len(data["positions"])),
        ))
import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

logger = create_logger(None, 0)

# src_txt_path = 'data/all.zh.bpe'
# tgt_voc_path = 'data/vocab.en'
# tgt_txt_path = 'data/all.en.bpe'
# bin_path = 'data/cwmt.bin'
src_voc_path = sys.argv[3]
src_txt_path = sys.argv[1]
tgt_voc_path = sys.argv[4]
tgt_txt_path = sys.argv[2]
bin_path = sys.argv[5]
assert os.path.isfile(src_voc_path)
assert os.path.isfile(src_txt_path)
assert os.path.isfile(tgt_voc_path)
assert os.path.isfile(tgt_txt_path)

src_dico = Dictionary.read_vocab(src_voc_path)
tgt_dico = Dictionary.read_vocab(tgt_voc_path)
data = Dictionary.index_data(src_txt_path, tgt_txt_path, src_dico, tgt_dico, bin_path)
if data is None:
    exit(0)

logger.info("%i words (%i unique) in %i sentences." % (
    len(data['src_sentences']) - len(data['src_positions']),
    len(data['src_dico']),
    len(data['src_positions'])
))
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['tgt_sentences']) - len(data['tgt_positions']),
    len(data['tgt_dico']),
    len(data['tgt_positions'])
))
if len(data['src_unk_words']) > 0:
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        sum(data['src_unk_words'].values()),
        len(data['src_unk_words']),
        sum(data['src_unk_words'].values()) * 100. / (len(data['src_sentences']) - len(data['src_positions']))
    ))
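# Example invocation. Note the positional-argument order: the two text files
# come first, then the two vocabularies, then the output path (the script
# name is a placeholder):
#
#   python preprocess_para.py all.zh.bpe all.en.bpe vocab.zh vocab.en data/cwmt.bin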
import os
import argparse

from src.data.dictionary import Dictionary


if __name__ == '__main__':
    readme = ""
    parser = argparse.ArgumentParser(description=readme)
    parser.add_argument('--summary', help="summary data")
    parser.add_argument('--summary_vocab', help="summary data vocab")
    parser.add_argument('--summary_label', help="summary data label")
    parser.add_argument('--summary_max_length', type=int, default=600, help="summary maximum length")
    args = parser.parse_args()

    if args.summary_vocab is None:
        args.summary_vocab = args.summary + "_vocab"
    if args.summary_label is None:
        args.summary_label = args.summary + "_label"
    assert os.path.isfile(args.summary)
    assert os.path.isfile(args.summary_vocab)
    assert os.path.isfile(args.summary_label)
    print_args(args)

    summary_dico = Dictionary.read_vocab(args.summary_vocab)
    summary_data = Dictionary.index_summary(args.summary, args.summary_label, summary_dico,
                                            args.summary + ".pth", max_len=args.summary_max_length)
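# Example invocation (the script and file names are placeholders; the vocab
# and label paths default to "<summary>_vocab" and "<summary>_label"):
#
#   python preprocess_summary.py --summary data/train_summary.txt --summary_max_length 600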
import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

logger = create_logger(None, 0)

voc_path = sys.argv[1]
txt_path = sys.argv[2]
bin_path = sys.argv[2] + '.pth'

# Optional third argument: comma-separated extra special tokens.
if len(sys.argv) > 3:
    special_tokens = sys.argv[3].split(",")
else:
    special_tokens = []
if True:  # bias corpus
    special_tokens.extend(["<url>", "<email>", "<phone>", "<number>", "<digit>", "<cur>"])

assert os.path.isfile(voc_path)
assert os.path.isfile(txt_path)

dico = Dictionary.read_vocab(voc_path, special_tokens)
logger.info("")
data = Dictionary.index_data(txt_path, bin_path, dico)
logger.info("%i words (%i unique) in %i sentences." % (
    len(data['sentences']) - len(data['positions']),
    len(data['dico']),
    len(data['positions'])
))
if len(data['unk_words']) > 0:
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        sum(data['unk_words'].values()),
        len(data['unk_words']),
        sum(data['unk_words'].values()) * 100. / (len(data['sentences']) - len(data['positions']))
    ))
    if len(data['unk_words']) < 30:
        for w, c in sorted(data['unk_words'].items(), key=lambda x: x[1])[::-1]:
            logger.info("%s: %i" % (w, c))
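# Example invocation; the third argument is optional and is split on commas
# into extra special tokens, on top of the bias-corpus tokens appended above
# (paths and extra tokens here are placeholders):
#
#   python preprocess_bias.py vocab.txt train.txt "<sep>,<mask>"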
default=4, help="beam search size") parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty") parser.add_argument("--clip_grad_norm", type=float, default=5.0, help="clip grad norm") parser.add_argument("--checkpoint_dir", type=str, default='all_models') params = parser.parse_args() params.gpu_num = 1 params.seed = 1234 params.reload_model = 'all_models/pe2zh_model_epoch4_update210000.pt' params.src_dico = Dictionary.read_vocab('data_pe2zh/vocab.pe') params.tgt_dico = Dictionary.read_vocab('data_pe2zh/vocab.zh') params.eos_index = params.src_dico.index(EOS_WORD) # 1 params.pad_index = params.src_dico.index(PAD_WORD) # 2 params.unk_index = params.src_dico.index(UNK_WORD) # 3 params.bos_index = params.src_dico.index(BOS_WORD) # 0 params.src_n_words = len(params.src_dico) params.tgt_n_words = len(params.tgt_dico) encoder, decoder, _ = build_mt_model(params) encoder.eval() decoder.eval() def preprocess(s, iszh=False, table=None): # moses tools chain # norm punc, tokenize, ...