def targs_to_idx(col_name):
    # Build token-to-index (and inverse) mappings for the values in this column,
    # using a dedicated AllenNLP vocabulary namespace per column.
    values = set(rows[col_name].values)
    vocab = vocabulary.Vocabulary(counter=None, non_padded_namespaces=[col_name])
    for value in values:
        vocab.add_token_to_namespace(value, col_name)
    idx_to_word = vocab.get_index_to_token_vocabulary(col_name)
    word_to_idx = vocab.get_token_to_index_vocabulary(col_name)
    # Replace each cell with a one-element index list (empty strings map to []).
    rows[col_name] = rows[col_name].apply(lambda x: [word_to_idx[x]] if x != "" else [])
    return word_to_idx, idx_to_word, rows[col_name]
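A minimal, hypothetical usage sketch for targs_to_idx, assuming it runs in the same module as the function above, where `rows` is a module-level pandas DataFrame and `vocabulary` is allennlp.data.vocabulary (the "pos" column and its values are made up for illustration):

import pandas as pd
from allennlp.data import vocabulary

rows = pd.DataFrame({"pos": ["NOUN", "VERB", "", "NOUN"]})

word_to_idx, idx_to_word, encoded = targs_to_idx("pos")
# Mapping over the distinct column values; exact indices depend on set iteration
# order, e.g. {"NOUN": 0, "VERB": 1, "": 2}.
print(word_to_idx)
# Each cell is now a one-element index list; empty strings become [].
print(encoded.tolist())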
Example #2
            if num_occurrences[i] == 0:
                continue

            token = vocab.get_token_from_index(i)
            to_dump = token + ' ' + ' '.join([str(v)
                                              for v in embeds[i, :]]) + '\n'
            f.write(to_dump)


if __name__ == '__main__':
    args = parse_config(args)
    if os.path.exists(args.out_dir):
        print("Output dir already exists: {}".format(args.out_dir))
        sys.exit(1)

    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token='<UNK>')
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    anchors, norms, num_occurrences = run_elmo(args.txt_files,
                                               args.elmo_options_path,
                                               args.elmo_weights_path, vocab,
                                               args.layers, args.batch_size,
                                               args.cuda_device)

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(args.layers):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d',
                        '--emb_dim',
                        type=int,
                        default=1024,
                        help="Embeddings size")
    parser.add_argument(
        '--vocab_file',
        type=str,
        default='vocabs/en_50k.vocab',
        help=
        "Path to vocab file with tokens (one per line) to include in output. Should also include <UNK> token. Can use $l as a placeholder for language"
    )

    args = parser.parse_args()

    lang = args.lang

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    vocab = vocabulary.Vocabulary()

    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)

    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    # Request hidden states from all layers so per-layer outputs can be indexed below.
    model = AutoModel.from_pretrained(args.bert_model, output_hidden_states=True)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))

    oov_ind = vocab.get_token_index(vocab._oov_token)
    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask)
        # Tuple of hidden states (embedding output, layer 1, ..., layer N),
        # each of shape (batch_size, seq_length, hidden_dim).
        all_encoder_layers = outputs.hidden_states

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]

            for (i, token) in enumerate(feature.tokens):
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue

                n = num_occurrences[w_id]
                for layer_index in layer_indexes:
                    # Hidden states of this layer for this example in the batch.
                    layer_output = all_encoder_layers[layer_index].detach().cpu().numpy()[b]
                    values = layer_output[i]

                    # Running mean of this token's embedding (its "anchor") and of
                    # the per-layer vector norm, updated incrementally.
                    anchors[layer_index][w_id, :] = (
                        anchors[layer_index][w_id, :] * (n / (n + 1)) +
                        values / (n + 1))
                    norm = np.linalg.norm(values)
                    norms[layer_index] = (
                        norms[layer_index] * (total_words / (total_words + 1)) +
                        norm / (total_words + 1))

                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences,
                    args.emb_dim)

    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    with open(file_path, 'w') as f:
        json.dump(norm_dict, f)
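The per-layer files written above are plain text: one token per line, followed by its averaged vector. A minimal, hypothetical sketch for reading one of them back, along with norms.json (the out_dir value and layer index 0 are assumptions matching the loop above; numpy is the only extra dependency):

import json
import os

import numpy as np

out_dir = 'out'  # assumed to match args.out_dir used above
layer = 0

avg_embeds = {}
with open(os.path.join(out_dir, 'avg_embeds_{}.txt'.format(layer))) as f:
    for line in f:
        parts = line.rstrip('\n').split(' ')
        # First field is the token, the rest are the vector components.
        avg_embeds[parts[0]] = np.array([float(v) for v in parts[1:]])

with open(os.path.join(out_dir, 'norms.json')) as f:
    norm_dict = json.load(f)
print('avg norm for layer {}: {}'.format(layer, norm_dict['avg_norm_layer_{}'.format(layer)]))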