def targs_to_idx(col_name):
    # Build token-to-index (and inverse) mappings for the values in this column,
    # using a dedicated AllenNLP vocabulary namespace per column.
    values = set(rows[col_name].values)
    vocab = vocabulary.Vocabulary(counter=None, non_padded_namespaces=[col_name])
    for value in values:
        vocab.add_token_to_namespace(value, col_name)
    idx_to_word = vocab.get_index_to_token_vocabulary(col_name)
    word_to_idx = vocab.get_token_to_index_vocabulary(col_name)
    # Replace each cell with a one-element index list (empty strings map to []).
    rows[col_name] = rows[col_name].apply(lambda x: [word_to_idx[x]] if x != "" else [])
    return word_to_idx, idx_to_word, rows[col_name]
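A minimal, hypothetical usage sketch for targs_to_idx, assuming it runs in the same module as the function above, where `rows` is a module-level pandas DataFrame and `vocabulary` is allennlp.data.vocabulary (the "pos" column and its values are made up for illustration):

import pandas as pd
from allennlp.data import vocabulary

rows = pd.DataFrame({"pos": ["NOUN", "VERB", "", "NOUN"]})

word_to_idx, idx_to_word, encoded = targs_to_idx("pos")
# Mapping over the distinct column values; exact indices depend on set iteration
# order, e.g. {"NOUN": 0, "VERB": 1, "": 2}.
print(word_to_idx)
# Each cell is now a one-element index list; empty strings become [].
print(encoded.tolist())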
Example #2
            if num_occurrences[i] == 0:
                continue

            token = vocab.get_token_from_index(i)
            to_dump = token + ' ' + ' '.join([str(v)
                                              for v in embeds[i, :]]) + '\n'
            f.write(to_dump)


if __name__ == '__main__':
    args = parse_config(args)
    if os.path.exists(args.out_dir):
        print("Output dir already exists: {}".format(args.out_dir))
        sys.exit(1)

    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token='<UNK>')
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    anchors, norms, num_occurrences = run_elmo(args.txt_files,
                                               args.elmo_options_path,
                                               args.elmo_weights_path, vocab,
                                               args.layers, args.batch_size,
                                               args.cuda_device)

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(args.layers):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d',
                        '--emb_dim',
                        type=int,
                        default=1024,
                        help="Embeddings size")
    parser.add_argument(
        '--vocab_file',
        type=str,
        default='vocabs/en_50k.vocab',
        help=
        "Path to vocab file with tokens (one per line) to include in output. Should also include <UNK> token. Can use $l as a placeholder for language"
    )

    args = parser.parse_args()

    lang = args.lang

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    vocab = vocabulary.Vocabulary()

    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)

    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    # Request hidden states from all layers so per-layer outputs can be indexed below.
    model = AutoModel.from_pretrained(args.bert_model, output_hidden_states=True)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))

    oov_ind = vocab.get_token_index(vocab._oov_token)
    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask)
        # Tuple of hidden states (embedding output, layer 1, ..., layer N),
        # each of shape (batch_size, seq_length, hidden_dim).
        all_encoder_layers = outputs.hidden_states

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]

            for (i, token) in enumerate(feature.tokens):
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue

                n = num_occurrences[w_id]
                for layer_index in layer_indexes:
                    # Hidden states of this layer for this example in the batch.
                    layer_output = all_encoder_layers[layer_index].detach().cpu().numpy()[b]
                    values = layer_output[i]

                    # Running mean of this token's embedding (its "anchor") and of
                    # the per-layer vector norm, updated incrementally.
                    anchors[layer_index][w_id, :] = (
                        anchors[layer_index][w_id, :] * (n / (n + 1)) +
                        values / (n + 1))
                    norm = np.linalg.norm(values)
                    norms[layer_index] = (
                        norms[layer_index] * (total_words / (total_words + 1)) +
                        norm / (total_words + 1))

                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences,
                    args.emb_dim)

    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    with open(file_path, 'w') as f:
        json.dump(norm_dict, f)
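The per-layer files written above are plain text: one token per line, followed by its averaged vector. A minimal, hypothetical sketch for reading one of them back, along with norms.json (the out_dir value and layer index 0 are assumptions matching the loop above; numpy is the only extra dependency):

import json
import os

import numpy as np

out_dir = 'out'  # assumed to match args.out_dir used above
layer = 0

avg_embeds = {}
with open(os.path.join(out_dir, 'avg_embeds_{}.txt'.format(layer))) as f:
    for line in f:
        parts = line.rstrip('\n').split(' ')
        # First field is the token, the rest are the vector components.
        avg_embeds[parts[0]] = np.array([float(v) for v in parts[1:]])

with open(os.path.join(out_dir, 'norms.json')) as f:
    norm_dict = json.load(f)
print('avg norm for layer {}: {}'.format(layer, norm_dict['avg_norm_layer_{}'.format(layer)]))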