def main():
    """Train a dual-encoder FAQ model and save the best checkpoint.

    Reads train/dev CSVs (each must contain "title" and "reply" columns),
    builds a tokenizer from the training text, trains a DualEncoder for
    --num_epochs epochs, and writes the tokenizer plus the best model
    (selected by dev accuracy) into --output_dir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="training file")
    parser.add_argument("--dev_file", default=None, type=str, required=True,
                        help="development file")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="output directory for tokenizers and models")
    parser.add_argument("--num_epochs", default=10, type=int, required=False,
                        help="number of epochs for training")
    parser.add_argument("--vocab_size", default=50000, type=int, required=False,
                        help="vocabulary size")
    parser.add_argument("--hidden_size", default=300, type=int, required=False,
                        help="hidden size of GRU")
    parser.add_argument("--embed_size", default=300, type=int, required=False,
                        help="word embedding size")
    parser.add_argument("--batch_size", default=64, type=int, required=False,
                        help="batch size for train and eval")
    parser.add_argument("--loss_function", default="hinge", type=str,
                        required=False, choices=["CrossEntropy", "hinge"],
                        help="which loss function to choose")
    args = parser.parse_args()

    # Load dataset; only the title/reply columns are used.
    train_df = pd.read_csv(args.train_file)[["title", "reply"]]
    dev_df = pd.read_csv(args.dev_file)[["title", "reply"]]

    # Tokenizer is fit on training titles and replies only.
    texts = list(train_df["title"]) + list(train_df["reply"])
    tokenizer = create_tokenizer(texts, args.vocab_size)

    # Two independent encoders: one for questions (titles), one for replies.
    title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size,
                               args.hidden_size)
    reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size,
                               args.hidden_size)
    model = DualEncoder(title_encoder, reply_encoder, type=args.loss_function)

    if args.loss_function == "CrossEntropy":
        loss_fn = nn.BCEWithLogitsLoss()
    elif args.loss_function == "hinge":
        loss_fn = nn.CosineEmbeddingLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_dir, exist_ok=True)

    # Use a context manager so the file handle is always closed (the
    # original leaked the handle returned by open()).
    with open(os.path.join(args.output_dir, "tokenizer.pickle"), "wb") as f:
        pickle.dump(tokenizer, f)

    # Keep only the checkpoint with the best dev accuracy.
    best_acc = 0.
    for epoch in range(args.num_epochs):
        print("start epoch {}".format(epoch))
        train(train_df, model, loss_fn, optimizer, device, tokenizer, args)
        acc = evaluate(dev_df, model, loss_fn, device, tokenizer, args)
        if acc > best_acc:
            best_acc = acc
            print("saving best model")
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, "faq_model.pth"))
def main():
    """Interactive FAQ answering loop using a trained dual-encoder model.

    Loads the tokenizer and model weights produced by the training script
    from --output_dir, pre-encodes (or loads cached) reply candidates from
    the training file, then answers typed questions by cosine similarity
    between the encoded question and the candidate reply vectors.
    Runs forever until interrupted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="training file")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="output directory for tokenizers and models")
    parser.add_argument("--batch_size", default=64, type=int, required=False,
                        help="batch size for train and eval")
    parser.add_argument("--hidden_size", default=300, type=int, required=False,
                        help="hidden size of GRU")
    parser.add_argument("--embed_size", default=300, type=int, required=False,
                        help="word embedding size")
    args = parser.parse_args()

    # Load dataset; only the title/reply columns are used.
    train_df = pd.read_csv(args.train_file)[["title", "reply"]]

    # NOTE(review): pickle.load on artifacts we wrote ourselves is fine,
    # but never point --output_dir at untrusted files.
    # Context managers ensure every file handle is closed (the original
    # leaked the handles returned by open()).
    with open(os.path.join(args.output_dir, "tokenizer.pickle"), "rb") as f:
        tokenizer = pickle.load(f)

    # Model architecture must match the one used at training time.
    title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size,
                               args.hidden_size)
    reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size,
                               args.hidden_size)
    model = DualEncoder(title_encoder, reply_encoder)
    model.load_state_dict(torch.load(os.path.join(args.output_dir,
                                                  "faq_model.pth")))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode all candidate replies once and cache them on disk.
    candidate_file = os.path.join(args.output_dir, "reply_candidates.pickle")
    if not os.path.isfile(candidate_file):
        replies, vectors = prepare_replies(train_df, model, device,
                                           tokenizer, args)
        with open(candidate_file, "wb") as f:
            pickle.dump([replies, vectors], f)
    else:
        with open(candidate_file, "rb") as f:
            replies, vectors = pickle.load(f)

    while True:
        title = input("你的问题是?\n")
        if len(title.strip()) == 0:
            continue
        title = [title]
        x, x_mask = list2tensor(title, tokenizer)
        x = x.to(device)
        x_mask = x_mask.to(device)
        # encoder2 is the title/question encoder of the DualEncoder.
        # TODO(review): confirm against DualEncoder's definition.
        x_rep = model.encoder2(x, x_mask).data.cpu().numpy()
        # Pick the candidate reply most similar to the encoded question.
        scores = cosine_similarity(x_rep, vectors)[0]
        index = np.argmax(scores)
        print("可能的答案:", replies[index])
def createModel(self):
    """Build and return the (encoder, decoder) pair described by self.config."""
    cfg = self.config
    encoder = GRUEncoder.Encoder(
        cfg.num_encoder_tokens,
        cfg.embedding_dim,
        cfg.hidden_size,
        cfg.batch_size,
    )
    decoder = GRUDecoder.Decoder(
        cfg.num_decoder_tokens,
        cfg.embedding_dim,
        cfg.hidden_size,
        cfg.batch_size,
    )
    return encoder, decoder
hidden_size = 8

# Build the character-to-id mapping tables for the input and output sides.
# Both are built from the combined corpus, so the two vocabularies coincide.
source_int_to_letter, source_letter_to_int = extract_character_vocab(
    source_data + target_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(
    source_data + target_data)

# Convert each line into a list of character ids; unknown characters map
# to the <UNK> id.
source_int = [
    [source_letter_to_int.get(ch, source_letter_to_int['<UNK>']) for ch in line]
    for line in source_data
]

# Output sequences additionally get an <EOS> tag appended at the end.
target_int = [
    [target_letter_to_int.get(ch, target_letter_to_int['<UNK>']) for ch in line]
    + [target_letter_to_int['<EOS>']]
    for line in target_data
]

# inputs, outputs, seq_pairs
encoder = GRUEncoder(len(target_letter_to_int), hidden_size)
attn_decoder = AttnDecoder(hidden_size, len(target_letter_to_int),
                           dropout_p=0.1)

# all 32000
trainIters(encoder, attn_decoder, 30000, print_every=500)