# Build bert model.
model = build_model(args)

# Load pretrained model.
# map_location="cpu" lets a checkpoint that was saved from a GPU run be read
# on a host without CUDA. load_state_dict copies the values into the model's
# own parameters, and every input tensor built below is a CPU tensor.
pretrained_model_dict = torch.load(
    args.pretrained_model_path, map_location="cpu")
model.load_state_dict(pretrained_model_dict, strict=False)

model = GenerateModel(args, model)

# Build tokenizer.
# The class is resolved by name from this module's globals, e.g. a value of
# "bert" selects a class called "BertTokenizer".
tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

# Only the first line of the input file is used as the prompt.
with open(args.input_path, mode="r", encoding="utf-8") as f:
    line = f.readline().strip()

src = [vocab.get(t) for t in tokenizer.tokenize(line)]
seg = [1] * len(src)
# Recorded before truncation: a prompt longer than seq_length makes the
# range() below empty, so nothing is generated for it.
start_length = len(src)
if len(src) > args.seq_length:
    src = src[:args.seq_length]
    seg = seg[:args.seq_length]

# Wrap in an outer list to form a batch containing a single sequence.
src = [src]
seg = [seg]

src_tensor = torch.LongTensor(src)
seg_tensor = torch.LongTensor(seg)

# NOTE(review): the handle is left open here; presumably it is written to and
# closed after the generation loop, outside this view -- confirm.
f_output = open(args.output_path, mode="w", encoding="utf-8")

for i in range(args.seq_length - start_length):
    prob = model(src_tensor, seg_tensor)
    # argsort of the negated scores orders token ids from highest to lowest
    # score; random.randint(0, 2) is inclusive on both ends, so one of the
    # three best-scoring ids is picked at random.
    top_token = (-prob[0][-1]).argsort()[random.randint(0, 2)]
# NOTE(review): the next two statements look like the body of a multi-GPU
# guard whose header is above this view -- confirm the nesting when merging.
print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
seq_encoder = nn.DataParallel(seq_encoder)
seq_encoder = seq_encoder.to(device)

# Build tokenizer
if args.tokenizer == "mixed":
    tokenizer = MixedTokenizer(vocab)
else:
    # Resolve "<Name>Tokenizer" by name from this module's globals.
    tokenizer_cls = globals()[args.tokenizer.capitalize() + "Tokenizer"]
    tokenizer = tokenizer_cls(args)

# Turn every non-empty input line into a (token ids, segment ids) pair whose
# two lists are both exactly args.seq_length long.
dataset = []
with open(args.input_path, mode="r", encoding="utf-8") as f:
    for line in f:
        tokens = [vocab.get(t) for t in tokenizer.tokenize(line)]
        if not tokens:
            continue

        # Prepend CLS, then clip; slicing a shorter list leaves it unchanged.
        tokens = ([CLS_ID] + tokens)[:args.seq_length]
        seg = [1] * len(tokens)

        # Fill the remainder of both lists with PAD_ID.
        pad_len = args.seq_length - len(tokens)
        tokens.extend([PAD_ID] * pad_len)
        seg.extend([PAD_ID] * pad_len)

        dataset.append((tokens, seg))

input_ids = torch.LongTensor([pair[0] for pair in dataset])
seg_ids = torch.LongTensor([pair[1] for pair in dataset])
# Build the vocabulary, either from a SentencePiece model or a vocab file.
if args.spm_model_path:
    try:
        import sentencepiece as spm
    except ImportError as err:
        # The two message parts were previously concatenated without a
        # separator, fusing the URL and the pip command into one word.
        raise ImportError(
            "You need to install SentencePiece to use XLNetTokenizer: "
            "https://github.com/google/sentencepiece "
            "pip install sentencepiece"
        ) from err
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(args.spm_model_path)
    vocab = Vocab()
    # NOTE(review): only i2w is populated in this branch, yet vocab.get(word)
    # is called below -- confirm Vocab.get resolves pieces in that case.
    vocab.i2w = {i: sp_model.IdToPiece(i) for i in range(sp_model.GetPieceSize())}
else:
    vocab = Vocab()
    vocab.load(args.vocab_path)

# map_location="cpu" lets a GPU-saved checkpoint be read on a CUDA-less host;
# the rows are converted to NumPy on the CPU below in any case.
pretrained_model = torch.load(args.load_model_path, map_location="cpu")
embedding = pretrained_model["embedding.word_embedding.weight"]

# Write a text file: a "<rows> <dim>" header line, then one line per word
# holding the word followed by its space-separated vector components.
with open(args.word_embedding_path, mode="w", encoding="utf-8") as f:
    f.write("{} {}\n".format(embedding.size(0), embedding.size(1)))
    # Index loop on purpose: i2w is a dict in the SentencePiece branch, so it
    # is addressed by integer id rather than iterated directly.
    for i in range(len(vocab.i2w)):
        word = vocab.i2w[i]
        word_embedding = embedding[vocab.get(word), :].cpu().numpy().tolist()
        fields = [str(word)]
        fields.extend(str(value) for value in word_embedding)
        f.write(" ".join(fields) + "\n")
# Fragment from the interior of a per-line loop: the `continue` statements
# below rely on a loop header that lies above this view, and the trailing
# `for` over cand_vocab.i2w continues past it. The code is left untouched.
#
# Steps performed for each input line:
#   1. Split the stripped line on a tab into (target_word, context); lines
#      that do not yield exactly two fields are skipped.
#   2. Tokenize the context with args.tokenizer and convert tokens to ids;
#      seg starts as a list of 1s of the same length. Both lists are clipped
#      to args.seq_length and then padded up to it with PAD_ID.
#   3. Look up the target word id with vocab.get and take the index of its
#      first occurrence in src (list.index); if it is absent, a message is
#      printed and the line is skipped.
#      NOTE(review): the search runs after padding, so an id equal to PAD_ID
#      could match a padded slot -- confirm this cannot happen.
#   4. Run the model on a one-example batch moved to `device`, bring the
#      result back as a NumPy array reshaped to (args.seq_length, -1), and
#      take the row at `position` as a (1, -1) array cast with
#      astype("float").
#   5. Begin collecting candidate word ids via vocab.w2i.get for every word
#      in cand_vocab.i2w (presumably None for words missing from w2i, if w2i
#      is a plain dict -- confirm).
line = line.strip().split("\t") if len(line) != 2: continue target_word, context = line[0], line[1] print("Original sentence: " + context) print("Target word: " + target_word) src = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(context)) seg = [1] * len(src) if len(src) > args.seq_length: src = src[:args.seq_length] seg = seg[:args.seq_length] while len(src) < args.seq_length: src.append(PAD_ID) seg.append(PAD_ID) target_word_id = vocab.get(target_word) if target_word_id in src: position = src.index(target_word_id) else: print("The target word is not in the sentence.") continue output = model(torch.LongTensor([src]).to(device), torch.LongTensor([seg]).to(device)) output = output.cpu().data.numpy() output = output.reshape([args.seq_length, -1]) target_embedding = output[position, :] target_embedding = target_embedding.reshape(1,-1).astype("float") cand_words_batch, cand_embeddings = [], [] for i, word in enumerate(cand_vocab.i2w): cand_words_batch.append(vocab.w2i.get(word))
parser.add_argument("--pretrained_model_path", help=".")

# Number of nearest neighbours printed for each target word.
parser.add_argument("--topn", type=int, default=20)

args = parser.parse_args()

vocab = Vocab()
vocab.load(args.vocab_path)

# map_location="cpu" lets a GPU-saved checkpoint be read on a CUDA-less host;
# only the word-embedding matrix is used from it.
pretrained_model = torch.load(args.pretrained_model_path, map_location="cpu")
embedding = pretrained_model["embedding.word_embedding.weight"]

# Gather the embedding rows of all candidate words once, up front.
cand_vocab = Vocab()
cand_vocab.load(args.cand_vocab_path)
cand_vocab_id = [vocab.get(w) for w in cand_vocab.i2w]
cand_embedding = embedding[cand_vocab_id, :]

# The context manager closes the target-words file, which was previously
# opened and never released.
with open(args.target_words_path, mode="r", encoding="utf-8") as f_word:
    for line in f_word:
        fields = line.strip().split()
        if not fields:
            # A blank line would otherwise raise IndexError on fields[0].
            continue
        word = fields[0]
        print("Target word: " + word)
        target_embedding = embedding[vocab.get(word), :]

        # Cosine similarity between the target row and every candidate row.
        sims = torch.nn.functional.cosine_similarity(
            target_embedding.view(1, -1), cand_embedding)
        sorted_id = torch.argsort(sims, descending=True)

        # Rank 0 is skipped -- presumably because the target word itself is
        # expected to be the best match among the candidates; confirm.
        # tolist() yields plain ints for indexing i2w and sims.
        for j in sorted_id[1:args.topn + 1].tolist():
            print(cand_vocab.i2w[j].strip() + "\t" + str(sims[j].item()))
        print()