Example No. 1
0
def train_eval(mode,
               model_file,
               descriptions_file,
               neg_words_mult=2.,
               lbda=50,
               min_words=50,
               eval_lines=5000,
               eval_words=10):
    """Train an entity model and evaluate retrieval on held-out words.

    Args:
        mode: ``'centroid'`` or ``'lr'`` -- selects the entity-model type.
        model_file: path to a saved word2vec model (opened memory-mapped,
            read-only).
        descriptions_file: path to the entity descriptions input.
        neg_words_mult: negative-word multiplier passed to the LR model
            (``'lr'`` mode only).
        lbda: regularization strength for the LR model (``'lr'`` mode only).
        min_words: minimum word count for an entity to be included.
        eval_lines: number of leading entities whose held-out words form
            the evaluation set.
        eval_words: number of shuffled words reserved per entity for
            evaluation.

    Raises:
        ValueError: if ``mode`` is not a supported value.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts act as sampling bins for the LR model.
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        # ValueError is the idiomatic type for a bad argument; callers that
        # caught Exception still catch it.
        raise ValueError('unsupported mode %s' % mode)

    # Fixed seed so the train/eval word split is reproducible across runs.
    rng = random.Random(1729)

    eval_items = []

    def sampled_word_seqs():
        # Shuffle each entity's word indices; for the first `eval_lines`
        # entities, hold out the first `eval_words` for evaluation and
        # train on the remainder.
        # NOTE(review): the [eval_words:] slice is applied to *every*
        # entity, including those past `eval_lines` -- presumably
        # intentional so all entities train on equally-sized sequences;
        # confirm before changing.
        for i, (entity, t, word_idxs) in \
            enumerate(read_entity_word_seqs(descriptions_file, model, min_words)):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append(
                    (entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())

    evaluate_retrieval(model, entity_model, eval_items)
Example No. 2
0
def train_eval(mode, model_file, descriptions_file,
               neg_words_mult=2., lbda=50, min_words=50,
               eval_lines=5000, eval_words=10):
    """Train an entity model and evaluate retrieval on held-out words.

    Args:
        mode: ``'centroid'`` or ``'lr'`` -- selects the entity-model type.
        model_file: path to a saved word2vec model (opened memory-mapped,
            read-only).
        descriptions_file: path to the entity descriptions input.
        neg_words_mult: negative-word multiplier for the LR model
            (``'lr'`` mode only).
        lbda: regularization strength for the LR model (``'lr'`` mode only).
        min_words: minimum word count for an entity to be included.
        eval_lines: number of leading entities whose held-out words form
            the evaluation set.
        eval_words: number of shuffled words reserved per entity for
            evaluation.

    Raises:
        ValueError: if ``mode`` is not a supported value.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts act as sampling bins for the LR model.
        bins = np.cumsum([model.vocab[word].count
                          for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        # ValueError is the idiomatic type for a bad argument; callers that
        # caught Exception still catch it.
        raise ValueError('unsupported mode %s' % mode)

    # Fixed seed so the train/eval word split is reproducible across runs.
    rng = random.Random(1729)

    eval_items = []

    def sampled_word_seqs():
        # Shuffle each entity's word indices; for the first `eval_lines`
        # entities, hold out the first `eval_words` for evaluation and
        # train on the remainder.
        for i, (entity, t, word_idxs) in \
            enumerate(read_entity_word_seqs(descriptions_file, model, min_words)):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append((entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())

    evaluate_retrieval(model, entity_model, eval_items)
Example No. 3
0
def train(mode, model_file, descriptions_file, output_file=None,
          neg_words_mult=2., lbda=50, min_words=1):
    """Train an entity model over all entity descriptions, optionally saving it.

    Args:
        mode: ``'centroid'`` or ``'lr'`` -- selects the entity-model type.
        model_file: path to a saved word2vec model (opened memory-mapped,
            read-only).
        descriptions_file: path to the entity descriptions input.
        output_file: where to save the trained model; skipped when ``None``.
        neg_words_mult: negative-word multiplier for the LR model
            (``'lr'`` mode only).
        lbda: regularization strength for the LR model (``'lr'`` mode only).
        min_words: minimum word count for an entity to be included.

    Raises:
        ValueError: if ``mode`` is not a supported value.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts act as sampling bins for the LR model.
        bins = np.cumsum([model.vocab[word].count
                          for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        # ValueError is the idiomatic type for a bad argument; callers that
        # caught Exception still catch it.
        raise ValueError('unsupported mode %s' % mode)

    entity_model.train(model,
                       read_entity_word_seqs(descriptions_file, model, min_words))

    if output_file is not None:
        entity_model.save(output_file)
Example No. 4
0
def train(mode,
          model_file,
          descriptions_file,
          output_file=None,
          neg_words_mult=2.,
          lbda=50,
          min_words=1):
    """Train an entity model over all entity descriptions, optionally saving it.

    Args:
        mode: ``'centroid'`` or ``'lr'`` -- selects the entity-model type.
        model_file: path to a saved word2vec model (opened memory-mapped,
            read-only).
        descriptions_file: path to the entity descriptions input.
        output_file: where to save the trained model; skipped when ``None``.
        neg_words_mult: negative-word multiplier for the LR model
            (``'lr'`` mode only).
        lbda: regularization strength for the LR model (``'lr'`` mode only).
        min_words: minimum word count for an entity to be included.

    Raises:
        ValueError: if ``mode`` is not a supported value.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts act as sampling bins for the LR model.
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        # ValueError is the idiomatic type for a bad argument; callers that
        # caught Exception still catch it.
        raise ValueError('unsupported mode %s' % mode)

    entity_model.train(
        model, read_entity_word_seqs(descriptions_file, model, min_words))

    if output_file is not None:
        entity_model.save(output_file)