Example #1
    # Build BERT model.
    model = build_model(args)

    # Load pretrained model.
    pretrained_model_dict = torch.load(args.pretrained_model_path)
    model.load_state_dict(pretrained_model_dict, strict=False)

    model = GenerateModel(args, model)

    # Build tokenizer.
    tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Load vocabulary (assumed setup: `vocab` is used below but never built in this snippet).
    vocab = Vocab()
    vocab.load(args.vocab_path)

    with open(args.input_path, mode="r", encoding="utf-8") as f:
        line = f.readline().strip()
        src = [vocab.get(t) for t in tokenizer.tokenize(line)]
        seg = [1] * len(src)
        start_length = len(src)
        if len(src) > args.seq_length:
            src = src[:args.seq_length]
            seg = seg[:args.seq_length]
    src = [src]
    seg = [seg]
    src_tensor = torch.LongTensor(src)
    seg_tensor = torch.LongTensor(seg)

    f_output = open(args.output_path, mode="w", encoding="utf-8")

    for i in range(args.seq_length - start_length):
        prob = model(src_tensor, seg_tensor)
        top_token = (-prob[0][-1]).argsort()[random.randint(0, 2)]
        # Feed the sampled token back in for the next step (reconstructed loop tail).
        src_tensor = torch.cat([src_tensor, top_token.view(1, 1)], dim=1)
        seg_tensor = torch.cat([seg_tensor, torch.LongTensor([[1]])], dim=1)

    f_output.write(" ".join(vocab.i2w[t.item()] for t in src_tensor[0]) + "\n")
    f_output.close()
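
The sampling step above draws uniformly from the three highest-probability tokens at each position. A minimal standalone sketch of that scheme (the function name sample_top_k is ours, not from the snippet):

    import random
    import torch

    def sample_top_k(logits, k=3):
        # Pick uniformly among the k highest-scoring token ids.
        top_ids = (-logits).argsort()[:k]
        return top_ids[random.randint(0, k - 1)]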
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        seq_encoder = nn.DataParallel(seq_encoder)

    seq_encoder = seq_encoder.to(device)

    # Build tokenizer
    if args.tokenizer == "mixed":
        tokenizer = MixedTokenizer(vocab)
    else:
        tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    dataset = []
    with open(args.input_path, mode="r", encoding="utf-8") as f:
        for line in f:
            tokens = [vocab.get(t) for t in tokenizer.tokenize(line)]
            if len(tokens) == 0:
                continue
            tokens = [CLS_ID] + tokens
            seg = [1] * len(tokens)

            if len(tokens) > args.seq_length:
                tokens = tokens[:args.seq_length]
                seg = seg[:args.seq_length]
            while len(tokens) < args.seq_length:
                tokens.append(PAD_ID)
                seg.append(PAD_ID)
            dataset.append((tokens, seg))

    input_ids = torch.LongTensor([e[0] for e in dataset])
    seg_ids = torch.LongTensor([e[1] for e in dataset])
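
With the corpus tensorized this way, features are typically extracted in fixed-size batches rather than one forward pass over everything. A minimal sketch under the same names as above (the batch size of 32 is an arbitrary choice of ours):

    features = []
    with torch.no_grad():
        for i in range(0, input_ids.size(0), 32):
            batch_src = input_ids[i:i + 32].to(device)
            batch_seg = seg_ids[i:i + 32].to(device)
            features.append(seq_encoder(batch_src, batch_seg).cpu())
    features = torch.cat(features, dim=0)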
Example #3
    if args.spm_model_path:
        try:
            import sentencepiece as spm
        except ImportError:
            raise ImportError("You need to install SentencePiece to use XLNetTokenizer: "
                              "https://github.com/google/sentencepiece "
                              "pip install sentencepiece")
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(args.spm_model_path)
        vocab = Vocab()
        vocab.i2w = {i: sp_model.IdToPiece(i) for i in range(sp_model.GetPieceSize())}
    else:
        vocab = Vocab()
        vocab.load(args.vocab_path)

    pretrained_model = torch.load(args.load_model_path)
    embedding = pretrained_model["embedding.word_embedding.weight"]

    with open(args.word_embedding_path, mode="w", encoding="utf-8") as f:
        head = "{} {}\n".format(embedding.size(0), embedding.size(1))
        f.write(head)

        for i in range(len(vocab.i2w)):
            word = vocab.i2w[i]
            word_embedding = embedding[vocab.get(word), :]
            word_embedding = word_embedding.cpu().numpy().tolist()
            line = str(word) + " " + " ".join(str(v) for v in word_embedding) + "\n"
            f.write(line)
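
The file written above follows the plain-text word2vec format (a "rows cols" header, then one word plus its vector per line), so it can be consumed by standard tooling. A small usage sketch, assuming gensim is installed and "good" is actually in the vocabulary:

    from gensim.models import KeyedVectors

    vectors = KeyedVectors.load_word2vec_format(args.word_embedding_path, binary=False)
    print(vectors.most_similar("good", topn=5))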
Example #4
            line = line.strip().split("\t")
            if len(line) != 2:
                continue
            target_word, context = line[0], line[1]
            print("Original sentence: " + context)
            print("Target word: " + target_word)
            src = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(context))
            seg = [1] * len(src)
            if len(src) > args.seq_length:
                src = src[:args.seq_length]
                seg = seg[:args.seq_length]
            while len(src) < args.seq_length:
                src.append(PAD_ID)
                seg.append(PAD_ID)

            target_word_id = vocab.get(target_word)
            if target_word_id in src:
                position = src.index(target_word_id)
            else:
                print("The target word is not in the sentence.")
                continue

            output = model(torch.LongTensor([src]).to(device), torch.LongTensor([seg]).to(device))
            output = output.cpu().data.numpy()
            output = output.reshape([args.seq_length, -1])
            target_embedding = output[position, :]
            target_embedding = target_embedding.reshape(1,-1).astype("float")

            cand_words_batch, cand_embeddings = [], []
            for i, word in enumerate(cand_vocab.i2w):
                cand_words_batch.append(vocab.w2i.get(word))
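
The snippet above is cut off mid-loop. As a rough standalone sketch of the ranking step it is building toward (all names here are ours, not the original code), cosine similarity between the contextual target vector and each candidate vector:

    import numpy as np

    def rank_candidates(target_embedding, cand_embeddings, cand_words, topn=20):
        # target_embedding: (1, h); cand_embeddings: (n, h). Highest cosine first.
        cand = np.asarray(cand_embeddings, dtype="float64")
        target = np.asarray(target_embedding, dtype="float64").reshape(1, -1)
        sims = (cand @ target.T).ravel() / (
            np.linalg.norm(cand, axis=1) * np.linalg.norm(target) + 1e-12)
        order = np.argsort(-sims)[:topn]
        return [(cand_words[i], float(sims[i])) for i in order]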
    parser.add_argument("--pretrained_model_path", help=".")

    # Number of nearest neighbors to print.
    parser.add_argument("--topn", type=int, default=20)

    args = parser.parse_args()

    vocab = Vocab()
    vocab.load(args.vocab_path)

    pretrained_model = torch.load(args.pretrained_model_path)
    embedding = pretrained_model["embedding.word_embedding.weight"]

    cand_vocab = Vocab()
    cand_vocab.load(args.cand_vocab_path)
    cand_vocab_id = [vocab.get(w) for w in cand_vocab.i2w]
    cand_embedding = embedding[cand_vocab_id, :]

    with open(args.target_words_path, mode="r", encoding="utf-8") as f_word:
        for line in f_word:
            word = line.strip().split()[0]
            print("Target word: " + word)
            target_embedding = embedding[vocab.get(word), :]
            sims = torch.nn.functional.cosine_similarity(
                target_embedding.view(1, -1), cand_embedding)
            sorted_id = torch.argsort(sims, descending=True)
            # Skip index 0: the top match is the target word itself when it
            # appears in the candidate vocabulary.
            for j in sorted_id[1:args.topn + 1]:
                print(cand_vocab.i2w[j].strip() + "\t" + str(sims[j].item()))
            print()
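
Since cand_embedding is a single matrix, several target words can also be scored in one call instead of once per loop iteration. A small sketch under the same names as above (the batching is our addition; broadcasting in cosine_similarity needs a reasonably recent PyTorch):

    with open(args.target_words_path, encoding="utf-8") as f:
        target_words = [line.strip().split()[0] for line in f]
    target_matrix = embedding[[vocab.get(w) for w in target_words], :]
    # One (num_targets, num_candidates) similarity matrix instead of a Python loop.
    sims = torch.nn.functional.cosine_similarity(
        target_matrix.unsqueeze(1), cand_embedding.unsqueeze(0), dim=-1)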