def test_language_model():
    """Build a LanguageModel with dictionary retrieval on a toy vocabulary,
    print the per-word perplexities and check the definition-embedding mask."""
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]), dtype=floatX)
        return data, mask

    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'], ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val
    mask_def_emb_val = np.asarray([[0, 1], [0, 0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab,
                       retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()

    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)

    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)

    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)

    for v, p in zip(perplexities_v, perplexities):
        print p.name, ":", v

    assert np.allclose(mask_def_v, mask_def_emb_val)
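# How this test might be run (a sketch; the module path
# tests/test_language_model.py is an assumption, not stated above):
#
#   python -m pytest tests/test_language_model.py -s
#
# or simply call test_language_model() from an interpreter; it only relies on
# the in-repo TEST_VOCAB / TEST_DICT_JSON fixtures, no external data.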
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("vocab_out", help="output vocabulary")
    parser.add_argument("--vocab", default="",
                        help="Performs subsetting based on passed vocab")
    # OOV handling
    parser.add_argument("--try-lowercase", action="store_true",
                        help="Try lowercase")

    args = parser.parse_args()

    if args.vocab == "":
        raise NotImplementedError("Not implemented")
        # NOTE: unreachable; conversion without a vocabulary is disabled
        # by the raise above.
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = map(float, tokens[1:])
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print i
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        np.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        print('Computing GloVe')

        # Load the GloVe text file into a word -> vector index
        embeddings_index = {}
        f = open(args.txt)
        print('Reading GloVe file')
        for line in f:
            values = line.split(' ')
            word = values[0]
            dim = len(values[1:])
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # Embedding matrix: larger than necessary
        f_out = open(args.vocab_out, 'w')
        n_specials = len(Vocabulary.SPECIAL_TOKEN_MAP.values())
        embedding_matrix = np.zeros((vocab.size() + n_specials, dim))

        # Special tokens come first and keep zero vectors
        for special_token in Vocabulary.SPECIAL_TOKEN_MAP.values():
            line = '<' + special_token + '>' + " 0\n"
            f_out.write(line.encode('utf-8'))

        i = n_specials
        for word, count in zip(vocab.words, vocab.frequencies):
            embedding_vector = embeddings_index.get(word)
            if args.try_lowercase and not isinstance(embedding_vector,
                                                     np.ndarray):
                embedding_vector = embeddings_index.get(word.lower())

            in_glove = embedding_vector is not None
            if in_glove:
                try:
                    embedding_matrix[i] = embedding_vector
                except:
                    print "error idx", i
            # else: leave the null vector

            line = word + " " + str(count) + "\n"
            f_out.write(line.encode('utf-8'))
            i += 1
            if i and i % 10000 == 0:
                print "i:", i

        f_out.close()
        np.save(args.npy, embedding_matrix[:i])
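# A hypothetical invocation of the converter above (the script name
# pack_glove_vocab.py and the file paths are assumptions; the flags match the
# argparse definition in main()):
#
#   python pack_glove_vocab.py glove.840B.300d.txt glove_snli.npy vocab_glove.txt \
#       --vocab data/snli/vocab.txt --try-lowercase
#
# The resulting .npy holds one zero row per special token followed by one row
# per vocabulary word, and vocab_out lists those words with their counts.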
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text", type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def", type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal. Adding words in chunks of step_size is a
    # further approximation, which is fine because the vocabularies are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set([])

    # Of course binary search could be used instead of this linear scan
    for id in range(vocab_def.size() / args.step_size):
        # Take the next step_size most frequent definition words
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0
        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        # Then add text words until the text coverage target is met
        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")
            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

    print(
        "After adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab_mod),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe, recheck that the shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[
            vocab_text.word_to_id(w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word) for word in current_vocab})
    vocab_result.save(args.target)
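# A hypothetical invocation of the shortlist builder above (the script name
# build_shortlist_vocab.py and the paths are assumptions; the flags match the
# argparse definition in main()):
#
#   python build_shortlist_vocab.py --vocab_text data/snli/vocab.txt \
#       --vocab_def data/snli/vocab_def.txt \
#       --target_coverage_text 0.95 --target_coverage_def 0.95 \
#       --step_size 30 --target data/snli/vocab_shortlist.txt
#
# The saved vocabulary covers at least the requested fraction of both the text
# and the definition corpora, built greedily from the most frequent words.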
def evaluate(c, tar_path, *args, **kwargs):
    """
    Performs rudimentary evaluation of a SNLI/MNLI run

    * Runs on valid and test given network
    * Saves all predictions
    * Saves embedding matrix
    * Saves results.json and predictions.csv
    """
    # Load and configure
    model = kwargs['model']
    assert c.endswith("json")
    c = json.load(open(c))

    # Very ugly absolute path fix
    ABS_PATHS = [
        "data/",
        "/mnt/users/jastrzebski/local/dict_based_learning/data/",
        "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/"
    ]
    from six import string_types
    for abs_path in ABS_PATHS:
        for k in c:
            if isinstance(c[k], string_types):
                if c[k].startswith(abs_path):
                    c[k] = c[k][len(abs_path):]

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    logging.info("Updating config with " + str(kwargs))
    c.update(**kwargs)

    # NOTE: This ensures we don't miss a crucial definition for
    # definition-heavy words; usually it is a good idea.
    c['max_def_per_word'] = c['max_def_per_word'] * 2

    assert tar_path.endswith("tar")
    dest_path = os.path.dirname(tar_path)
    prefix = os.path.splitext(os.path.basename(tar_path))[0]

    s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')

    if model == 'simple':
        model, data, used_dict, used_retrieval, used_vocab = \
            _initialize_simple_model_and_data(c)
    elif model == 'esim':
        model, data, used_dict, used_retrieval, used_vocab = \
            _initialize_esim_model_and_data(c)
    else:
        raise NotImplementedError()

    pred = model.apply(
        s1_decoded, s1_mask, s2_decoded, s2_mask,
        def_mask=def_mask, defs=defs,
        s1_def_map=s1_def_map, s2_def_map=s2_def_map,
        train_phase=False)
    cg = ComputationGraph([pred])
    if c.get("bn", True):
        bn_params = [
            p for p in VariableFilter(bricks=[BatchNormalization])(cg)
            if hasattr(p, "set_value")
        ]
    else:
        bn_params = []

    # Load model
    model = Model(cg.outputs)
    parameters = model.get_parameter_dict()  # Blocks version mismatch
    logging.info(
        "Trainable parameters" + "\n" +
        pprint.pformat(
            [(key, parameters[key].get_value().shape)
             for key in sorted([
                 get_brick(param).get_hierarchical_name(param)
                 for param in cg.parameters
             ])],
            width=120))
    logging.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted([
                get_brick(param).get_hierarchical_name(param)
                for param in cg.parameters
            ])
        ])))

    with open(tar_path) as src:
        params = load_parameters(src)

        loaded_params_set = set(params.keys())
        model_params_set = set([
            get_brick(param).get_hierarchical_name(param)
            for param in cg.parameters
        ])

        logging.info("Loaded extra parameters")
        logging.info(loaded_params_set - model_params_set)
        logging.info("Missing parameters")
        logging.info(model_params_set - loaded_params_set)
    model.set_parameter_values(params)

    if c.get("bn", True):
        logging.info("Loading " + str([
            get_brick(param).get_hierarchical_name(param)
            for param in bn_params
        ]))
        for param in bn_params:
            param.set_value(
                params[get_brick(param).get_hierarchical_name(param)])
        for p in bn_params:
            model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    # Read logs
    logs = pd.read_csv(os.path.join(dest_path, "logs.csv"))
    best_val_acc = logs['valid_misclassificationrate_apply_error_rate'].min()
    logging.info("Best measured valid acc: " + str(best_val_acc))

    # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores
    reference_vocab = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt'))
    vocab_all = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'],
                     'vocab_all.txt'))  # Can include OOV words, which is interesting
    retrieval_all = Retrieval(
        vocab_text=used_vocab,
        dictionary=used_dict,
        max_def_length=c['max_def_length'],
        exclude_top_k=0,
        max_def_per_word=c['max_def_per_word'])

    # NOTE: a commented-out block here originally computed dict and plain word
    # embeddings for vocab.txt and vocab_all.txt (via construct_dict_embedder /
    # construct_embedder on the s1_word_embeddings / s1_dict_word_embeddings
    # variables); it is left disabled.

    # Predict
    predict_fnc = theano.function(cg.inputs, pred)
    results = {}
    batch_size = 14
    for subset in ['valid', 'test']:
        logging.info("Predicting on " + subset)
        stream = data.get_stream(subset, batch_size=batch_size, seed=778)
        it = stream.get_epoch_iterator()
        rows = []
        for ex in tqdm.tqdm(it, total=10000 / batch_size):
            ex = dict(zip(stream.sources, ex))
            inp = [ex[v.name] for v in cg.inputs]
            prob = predict_fnc(*inp)
            label_pred = np.argmax(prob, axis=1)

            for id in range(len(prob)):
                s1_decoded = used_vocab.decode(ex['sentence1'][id]).split()
                s2_decoded = used_vocab.decode(ex['sentence2'][id]).split()

                assert used_vocab == data.vocab

                # Mark words outside the input shortlist with asterisks
                s1_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s1_decoded
                ]
                s2_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s2_decoded
                ]

                # Different difficulty metrics

                # text_unk_percentage
                s1_no_pad = [w for w in ex['sentence1'][id] if w != 0]
                s2_no_pad = [w for w in ex['sentence2'][id] if w != 0]

                s1_unk_percentage = sum([
                    1. for w in s1_no_pad if w == used_vocab.unk
                ]) / len(s1_no_pad)
                s2_unk_percentage = sum([
                    1. for w in s2_no_pad if w == used_vocab.unk
                ]) / len(s2_no_pad)
                # mean freq word
                s1_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s1_no_pad
                ])
                s2_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s2_no_pad
                ])

                # mean rank word (UNK is max rank)
                # NOTE(kudkudak): Will break if we reindex unk between vocabs :P
                s1_mean_rank = np.mean([
                    reference_vocab.size()
                    if reference_vocab.word_to_id(used_vocab.id_to_word(w)) ==
                    reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s1_no_pad
                ])
                s2_mean_rank = np.mean([
                    reference_vocab.size()
                    if reference_vocab.word_to_id(used_vocab.id_to_word(w)) ==
                    reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s2_no_pad
                ])

                rows.append({
                    "pred": label_pred[id],
                    "true_label": ex['label'][id],
                    "s1": ' '.join(s1_decoded),
                    "s2": ' '.join(s2_decoded),
                    "s1_unk_percentage": s1_unk_percentage,
                    "s2_unk_percentage": s2_unk_percentage,
                    "s1_mean_freq": s1_mean_freq,
                    "s2_mean_freq": s2_mean_freq,
                    "s1_mean_rank": s1_mean_rank,
                    "s2_mean_rank": s2_mean_rank,
                    "p_0": prob[id, 0],
                    "p_1": prob[id, 1],
                    "p_2": prob[id, 2]
                })

        preds = pd.DataFrame(rows, columns=rows[0].keys())
        preds.to_csv(
            os.path.join(dest_path,
                         prefix + '_predictions_{}.csv'.format(subset)))

        results[subset] = {}
        results[subset]['misclassification'] = 1 - np.mean(
            preds.pred == preds.true_label)

        if subset == "valid" and np.abs(
                (1 - np.mean(preds.pred == preds.true_label)) -
                best_val_acc) > 0.001:
            logging.error("!!!")
            logging.error(
                "Found different best_val_acc. Probably due to changed "
                "specification of the model class.")
            logging.error("Discrepancy {}".format(
                (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc))
            logging.error("!!!")

    logging.info(results)
    json.dump(results,
              open(os.path.join(dest_path, prefix + '_results.json'), "w"))
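# A hedged usage sketch: evaluate() expects a JSON config, a .tar checkpoint
# with trained parameters, and the model family passed through kwargs
# ('simple' or 'esim'). The paths below are placeholders, not files from this
# repository:
#
#   evaluate("results/esim_run/config.json", "results/esim_run/model.tar",
#            model="esim")
#
# This writes <prefix>_predictions_valid.csv, <prefix>_predictions_test.csv
# and <prefix>_results.json next to the tar file.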
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("--vocab", default="",
                        help="Performs subsetting based on passed vocab")
    parser.add_argument("--dict", default="",
                        help="Performs subsetting based on passed dict")
    # OOV handling
    parser.add_argument("--try-lemma", action="store_true", help="Try lemma")
    parser.add_argument("--try-lowercase", default="", help="Try lowercase")

    args = parser.parse_args()

    if args.dict and not args.vocab:
        # Usually you'd want to use both
        raise NotImplementedError("Not implemented")
    if args.try_lemma or args.try_lowercase:
        # TODO(kudkudak): Implement
        raise NotImplementedError("Not implemented yet")

    if args.vocab == "":
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = map(float, tokens[1:])
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print i
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        numpy.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        if args.dict:
            dict_ = Dictionary(args.dict)

        print('Computing GloVe')

        # Load the GloVe text file into a word -> vector index
        embeddings_index = {}
        f = open(args.txt)
        print('Reading GloVe file')
        for line in f:
            values = line.split(' ')
            word = values[0]
            dim = len(values[1:])
            coefs = numpy.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # Embedding matrix
        embedding_matrix = numpy.zeros((vocab.size(), dim))
        for word in vocab._word_to_id:
            embedding_vector = embeddings_index.get(word)
            in_glove = embedding_vector is not None
            if args.dict:
                in_dict = len(dict_.get_definitions(word)) > 0

            if in_glove and (not args.dict or in_dict):
                embedding_matrix[vocab.word_to_id(word)] = embedding_vector
            else:
                # Words not found in GloVe (or the dict) keep all-zero rows
                if not in_glove:
                    print(u'Missing from GloVe: {}'.format(word))
                else:
                    print(u'Missing from dict: {}'.format(word))

        numpy.save(args.npy, embedding_matrix)
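# A hypothetical invocation of this variant (the script name pack_glove.py and
# the paths are assumptions; the arguments match the argparse definition in
# main()):
#
#   python pack_glove.py glove.840B.300d.txt glove_subset.npy \
#       --vocab data/snli/vocab.txt --dict data/snli/dict.json
#
# With --vocab (and optionally --dict) the output matrix has vocab.size() rows
# indexed by the vocabulary's word ids; rows for words missing from GloVe or
# from the dictionary stay all-zero.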