Example #1
import argparse
import os
import pickle

import pandas as pd
import torch
import torch.nn as nn

# create_tokenizer, GRUEncoder, DualEncoder, train and evaluate are helpers
# defined elsewhere in the same project (a sketch follows this example).


def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("--train_file", default=None, type=str, required=True,
						help="training file")
	parser.add_argument("--dev_file", default=None, type=str, required=True,
						help="development file")
	parser.add_argument("--output_dir", default=None, type=str, required=True,
						help="output directory for tokenizers and models")
	parser.add_argument("--num_epochs", default=10, type=int, required=False,
						help="number of epochs for training")
	parser.add_argument("--vocab_size", default=50000, type=int, required=False,
						help="vocabulary size")
	parser.add_argument("--hidden_size", default=300, type=int, required=False,
						help="hidden size of GRU")
	parser.add_argument("--embed_size", default=300, type=int, required=False,
						help="word embedding size")
	parser.add_argument("--batch_size", default=64, type=int, required=False,
						help="batch size for train and eval")
	parser.add_argument("--loss_function", default="hinge", type=str, required=False,
						choices=["CrossEntropy", "hinge"],
						help="which loss function to choose")
	args = parser.parse_args()

	# load dataset
	train_df = pd.read_csv(args.train_file)[["title", "reply"]]
	dev_df = pd.read_csv(args.dev_file)[["title", "reply"]]
	texts = list(train_df["title"]) + list(train_df["reply"])
	tokenizer = create_tokenizer(texts, args.vocab_size)

	title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
	reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
	model = DualEncoder(title_encoder, reply_encoder, type=args.loss_function)
	if args.loss_function == "CrossEntropy":
		# the "CrossEntropy" option trains with a binary cross-entropy criterion over logits
		loss_fn = nn.BCEWithLogitsLoss()
	elif args.loss_function == "hinge":
		# the "hinge" option trains with a cosine-embedding (margin-based) criterion
		loss_fn = nn.CosineEmbeddingLoss()
	optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
	model = model.to(device)
	
	if not os.path.exists(args.output_dir):
		os.makedirs(args.output_dir)
	pickle.dump(tokenizer, open(os.path.join(args.output_dir, "tokenizer.pickle"), "wb"))

	best_acc = 0.
	for epoch in range(args.num_epochs):
		print("start epoch {}".format(epoch))
		train(train_df, model, loss_fn, optimizer, device, tokenizer, args)	
		acc = evaluate(dev_df, model, loss_fn, device, tokenizer, args)
		if acc > best_acc:
			best_acc = acc
			print("saving best model")
			torch.save(model.state_dict(), os.path.join(args.output_dir, "faq_model.pth"))
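The snippet above relies on project helpers that are not shown (create_tokenizer, GRUEncoder, DualEncoder, train, evaluate). The sketch below is one plausible way the tokenizer and the two encoder classes could look, so the shapes flowing through the example are concrete; the whitespace tokenization, the mean-pooled GRU states, and the attribute names inside DualEncoder are assumptions for illustration, not the original implementation.

from collections import Counter

import torch
import torch.nn as nn


class Tokenizer:
    # Hypothetical word-level tokenizer exposing the vocab_size attribute used above.
    def __init__(self, texts, vocab_size):
        counts = Counter(w for t in texts for w in t.split())
        words = [w for w, _ in counts.most_common(vocab_size - 2)]
        self.word2id = {"<pad>": 0, "<unk>": 1}
        self.word2id.update({w: i + 2 for i, w in enumerate(words)})
        self.vocab_size = len(self.word2id)

    def encode(self, text):
        return [self.word2id.get(w, 1) for w in text.split()]


def create_tokenizer(texts, vocab_size):
    return Tokenizer(texts, vocab_size)


class GRUEncoder(nn.Module):
    # Encodes a padded batch of token ids into one fixed-size vector per example.
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, x, x_mask):
        out, _ = self.gru(self.embed(x))
        mask = x_mask.unsqueeze(-1).float()
        # mean-pool the GRU outputs over the non-padded positions
        return (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)


class DualEncoder(nn.Module):
    # Wraps the two encoders; the loss type only changes how their vectors are compared.
    def __init__(self, encoder, encoder2, type="hinge"):
        super().__init__()
        self.encoder = encoder      # built from title_encoder above
        self.encoder2 = encoder2    # built from reply_encoder above
        self.type = type

    def forward(self, x, x_mask, y, y_mask):
        return self.encoder(x, x_mask), self.encoder2(y, y_mask)

Any pooling that turns the variable-length GRU outputs into a fixed vector would work here; mean pooling over the mask is just one simple choice.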
Example #2
import argparse
import os
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity is assumed to be sklearn's pairwise helper; GRUEncoder,
# DualEncoder, prepare_replies and list2tensor are helpers from the same project
# (the latter two are sketched after this example).


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="training file")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="output directory for tokenizers and models")
    parser.add_argument("--batch_size", default=64, type=int, required=False,
                        help="batch size for train and eval")
    parser.add_argument("--hidden_size", default=300, type=int, required=False,
                        help="hidden size of GRU")
    parser.add_argument("--embed_size", default=300, type=int, required=False,
                        help="word embedding size")
    args = parser.parse_args()

    # load dataset
    train_df = pd.read_csv(args.train_file)[["title", "reply"]]
    tokenizer = pickle.load(open(os.path.join(args.output_dir, "tokenizer.pickle"), "rb"))

    title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
    reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
    model = DualEncoder(title_encoder, reply_encoder)
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "faq_model.pth")))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    model = model.to(device)

    candidate_file = os.path.join(args.output_dir, "reply_candidates.pickle")
    if not os.path.isfile(candidate_file):
        replies, vectors = prepare_replies(train_df, model, device, tokenizer, args) 
        pickle.dump([replies, vectors], open(candidate_file, "wb"))
    else:
        replies, vectors = pickle.load(open(candidate_file, "rb"))


    while True:
        title = input("你的问题是?\n")
        if len(title.strip()) == 0:
            continue
        title = [title] 
        x, x_mask = list2tensor(title, tokenizer)
        x = x.to(device)	
        x_mask = x_mask.to(device)
        x_rep = model.encoder2(x, x_mask).data.cpu().numpy()
        scores = cosine_similarity(x_rep, vectors)[0]
        index = np.argmax(scores)
        print("可能的答案:", replies[index])
Example #3
    def createModel(self):

        encoder = GRUEncoder.Encoder(self.config.num_encoder_tokens,
                                     self.config.embedding_dim,
                                     self.config.hidden_size,
                                     self.config.batch_size)
        decoder = GRUDecoder.Decoder(self.config.num_decoder_tokens,
                                     self.config.embedding_dim,
                                     self.config.hidden_size,
                                     self.config.batch_size)

        return encoder, decoder
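self.config is not shown here. A hypothetical container with just the fields createModel reads might look like this; the default values are placeholders, not settings from the original project.

from dataclasses import dataclass


@dataclass
class Config:
    # Hypothetical settings object; the field names mirror the attributes
    # accessed in createModel, the values are only placeholders.
    num_encoder_tokens: int = 10000
    num_decoder_tokens: int = 10000
    embedding_dim: int = 256
    hidden_size: int = 512
    batch_size: int = 64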
Example #4
    hidden_size = 8

    # Build the character-to-id mapping tables for the inputs and outputs
    # (both are built over source_data + target_data, i.e. a shared vocabulary)

    source_int_to_letter, source_letter_to_int = extract_character_vocab(
        source_data + target_data)

    target_int_to_letter, target_letter_to_int = extract_character_vocab(
        source_data + target_data)

    # Convert each line into a list of character ids
    source_int = [[
        source_letter_to_int.get(letter, source_letter_to_int['<UNK>'])
        for letter in line
    ] for line in source_data]

    # Append the <EOS> tag to the end of each output sequence
    target_int = [[
        target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
        for letter in line
    ] + [target_letter_to_int['<EOS>']] for line in target_data]

    # inputs, outputs, seq_pairs
    encoder = GRUEncoder(len(target_letter_to_int), hidden_size)

    attn_decoder = AttnDecoder(hidden_size,
                               len(target_letter_to_int),
                               dropout_p=0.1)

    # all 32000
    trainIters(encoder, attn_decoder, 30000, print_every=500)
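extract_character_vocab is not defined in this snippet. A plausible sketch that yields the <UNK> and <EOS> entries the code looks up is shown below; reserving <PAD> and <GO> as well is an assumption, added because character-level seq2seq vocabularies commonly include them.

def extract_character_vocab(data):
    # Hypothetical helper: map every character in the corpus to an integer id,
    # reserving a few special tokens at the start of the vocabulary.
    special_tokens = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    chars = sorted(set(ch for line in data for ch in line))
    int_to_letter = {idx: ch for idx, ch in enumerate(special_tokens + chars)}
    letter_to_int = {ch: idx for idx, ch in int_to_letter.items()}
    return int_to_letter, letter_to_int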