Example #1
def build_custom3(initial_model=None,
                  lexicon_name='',
                  a_i=0.5, b_ij=0.5, n_iter=10, in_place=True, **kwargs):
    """Retrofit a model using faruqui:2014:NIPS-DLRLW method.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')

    old_lexicon = lexicons.get_lexicon(lexicon_name)

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for w in old_lexicon:
        if w in model:
            lexicon[w] = old_lexicon[w]
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            num_neighbours = len(word_neighbours)

            # No neighbours: keep the data estimate and move on.
            if num_neighbours == 0:
                continue

            # Use a local weight so the b_ij parameter is not
            # permanently overwritten with the first word's degree.
            beta = 1 / num_neighbours if b_ij == 'degree' else b_ij

            # The weight of the data estimate is the number of neighbours.
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # Loop over neighbours and add them to the new vector:
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += beta * model.wv.syn0[j]

            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + beta * np.sum(
                model.wv.syn0[word_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours
                                                   * (beta + a_i))
    return model
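For reference, the inner loop implements the retrofitting update of Faruqui et al.: with alpha_i = num_neighbours * a_i and a constant beta_ij = b_ij over the neighbour set N(i), the three assignments above are algebraically equivalent to

    q_i \leftarrow \frac{\alpha_i \hat{q}_i + \sum_{j \in N(i)} \beta_{ij} q_j}{\alpha_i + \sum_{j \in N(i)} \beta_{ij}}

where \hat{q}_i is the word's vector in initial_model and q_j are the current neighbour vectors.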
Example #2
def build_custom1(train_path,
                  word2vec_param=default_word2vec_param,
                  lexicon_name='', **kwargs):
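    """Train a Word2Vec model on a corpus run through LexiconProjecter."""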
    logger.info('Train custom1 model')
    lexicon = lexicons.get_lexicon(lexicon_name)
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source)
    source = Splitter(source)
    source = LexiconProjecter(source, lexicon)
    return gensim.models.Word2Vec(source, **word2vec_param)
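A minimal usage sketch (the corpus path and lexicon name are placeholders; the word2vec_param keys mirror the ones the other examples pass to word2vec):

model = build_custom1('/path/to/tweets.log',
                      word2vec_param={'size': 100, 'window': 5,
                                      'sample': 1e-3, 'hs': 1,
                                      'iter': 5, 'min_count': 5},
                      lexicon_name='my_lexicon')
model.save('/path/to/custom1.model')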
Example #3
def build_custom3_2(initial_model=None,
                    lexicon_name='',
                    a_i=1, b_ij=1,
                    n_iter=10, in_place=True,
                    d=1, topn=50, **kwargs):
    """Derived from faruqui:2014:NIPS-DLRLW method.
Put same class closer.

Also moves the topn neighboors by d x <the actual translation>

    Args:
        in_place: Modify the given model instead of copying it if True."""
    logger.info('Customize 3_2 with %s', lexicon_name)
    if initial_model is None:
        raise ValueError('Need an initial model')

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_lex_neighbours = [w for w in lexicon_inv[lexicon[word]]
                                   if w != word]

            word_model_neighbours = model.most_similar(word, topn=topn)

            num_lex_neighbours = len(word_lex_neighbours)

            # FIXME: use not_lex_neighbours
            # No lexicon neighbours: keep the data estimate and move on.
            if num_lex_neighbours == 0:
                continue

            # Use a local weight so the b_ij parameter is not
            # permanently overwritten with the first word's degree.
            beta = 1 / num_lex_neighbours if b_ij == 'degree' else b_ij

            # The weight of the data estimate is the number of
            # lexicon neighbours.
            model.wv.syn0[i] = num_lex_neighbours * a_i * initial_model.wv.syn0[i]

            # Loop over lexicon neighbours and add them to the new vector:
            # for pp_word in word_lex_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += beta * model.wv.syn0[j]

            # Vectorized version of the above
            word_lex_neighbours = [model.wv.vocab[w].index
                                   for w in word_lex_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + beta * np.sum(
                model.wv.syn0[word_lex_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_lex_neighbours
                                                   * (beta + a_i))

            # Drag the topn model neighbours along, scaled by d; use a
            # separate index so i (the current word) is not clobbered.
            for (neighbour, _) in word_model_neighbours:
                j = model.wv.vocab[neighbour].index
                model.wv.syn0[j] = d * num_lex_neighbours * a_i * initial_model.wv.syn0[j]
                model.wv.syn0[j] = d * (model.wv.syn0[j] + beta * np.sum(
                    model.wv.syn0[word_lex_neighbours], axis=0))

    return model
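Every retrofitting variant above indexes lexicon_inv by class label. A plausible sketch of utils.invert_dict_nonunique, assuming it simply groups keys by their (non-unique) value; the real helper may differ:

from collections import defaultdict

def invert_dict_nonunique(d):
    """Invert a word->class mapping into a class->[words] index."""
    inv = defaultdict(list)
    for key, value in d.items():
        inv[value].append(key)
    return dict(inv)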
Example #4
def build_custom3_1(initial_model=None,
                    lexicon_name='',
                    a_i=1, b_ij=1, c_ij=1, n_iter=10, in_place=True, **kwargs):
    """Derived from faruqui:2014:NIPS-DLRLW method.
Put same class closer and other classes away.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]

            # Non-neighbours are words with different classes than WORD
            word_not_neighbours = []
            for c in lexicon_inv:
                if c != lexicon[word]:
                    word_not_neighbours.extend(lexicon_inv[c])
            # Remove duplicates
            word_not_neighbours = list(set(word_not_neighbours))

            num_neighbours = len(word_neighbours)
            num_not_neighbours = len(word_not_neighbours)

            # FIXME: use not_neighbours
            # No neighbours at all: keep the data estimate and move on.
            if num_neighbours == 0 and num_not_neighbours == 0:
                continue

            # Use local weights so the b_ij/c_ij parameters are not
            # permanently overwritten with the first word's degrees;
            # fall back to 0.0 when a neighbour set is empty to avoid
            # a ZeroDivisionError.
            if b_ij == 'degree':
                beta = 1 / num_neighbours if num_neighbours else 0.0
            else:
                beta = b_ij
            if c_ij == 'degree':
                gamma = 1 / num_not_neighbours if num_not_neighbours else 0.0
            else:
                gamma = c_ij

            # The weight of the data estimate is the number of neighbours.
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # Loop over neighbours and add them to the new vector:
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += beta * model.wv.syn0[j]

            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + beta * np.sum(
                model.wv.syn0[word_neighbours], axis=0)
            word_not_neighbours = [model.wv.vocab[w].index
                                   for w in word_not_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] - gamma * np.sum(
                model.wv.syn0[word_not_neighbours], axis=0)
            # Guard the denominator: the word may only have
            # non-neighbours.
            model.wv.syn0[i] = model.wv.syn0[i] / ((num_neighbours or 1)
                                                   * (beta + a_i))
    return model
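A hypothetical call (the lexicon name is a placeholder) showing that the weights accept either fixed floats or the string 'degree' for per-word 1/degree weighting:

retrofitted = build_custom3_1(initial_model=base_model,
                              lexicon_name='my_lexicon',
                              a_i=1, b_ij='degree', c_ij=0.1,
                              n_iter=10, in_place=False)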
Example #5
def build_custom_mce(train_path,
                     word2vec_param=default_word2vec_param,
                     lexicon_name='', valid_num=0.1, top=10,
                     clean_after=True, **kwargs):
    """Build a Word2Vec model using MCE method.

    Args:
        lexicon: The lexicon used to build the inequalities.
        valid_num: How much inequations should be used for cross-validation (either a floar between 0 and 1 or an integer.
        top: See feat.build_ineq_for_model
        clean_after: Clean the files after building the model if True."""
    lexicon = lexicons.get_lexicon(lexicon_name)

    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    for line in source:
        input_file.write(' '.join(line))
        input_file.write('\n')
    input_file.close()

    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()

    syn_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='syn')
    ant_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='ant')

    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    # Generate syn_file
    for c in lexicon_inv:
        syn_file.write('\t'.join(lexicon_inv[c]))
        syn_file.write('\n')
    syn_file.close()

    # Generate ant_file
    for cur_c in lexicon_inv:
        for word in lexicon_inv[cur_c]:
            for c in lexicon_inv:
                # skip current observed class
                if c == cur_c:
                    continue
                ant_file.write(word + '\t' + '\t'.join(lexicon_inv[c]))
                ant_file.write('\n')
    ant_file.close()

    cmd = ['./word2vec',
           '-train', input_file.name,
           '-output', output_file.name,
           '-size', str(word2vec_param['size']),
           '-window', str(word2vec_param['window']),
           '-sample', str(word2vec_param['sample']),
           '-hs', str(word2vec_param['hs']),
           '-iter', str(word2vec_param['iter']),
           '-min-count', str(word2vec_param['min_count']),
           '-read-syn', syn_file.name,
           '-read-ant', ant_file.name,
    ]
    logger.info(' '.join(cmd))
    p = Popen(cmd,
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              cwd=res.MCE_PATH)
    out, err = p.communicate()
    err = err.decode()
    out = out.decode()
    logger.info(out)
    logger.error(err)
    # Only load the model when the external word2vec run succeeded;
    # otherwise return None instead of raising a NameError.
    model = None
    if p.returncode == 0:
        model = gensim.models.Word2Vec.load_word2vec_format(output_file.name,
                                                            binary=False,
                                                            unicode_errors='replace')
    if clean_after:
        os.remove(output_file.name)
        os.remove(input_file.name)
        os.remove(syn_file.name)
        os.remove(ant_file.name)
    return model
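Tracing the two writing loops with a toy lexicon {'good': 'pos', 'great': 'pos', 'bad': 'neg'} (assuming insertion-ordered dicts, lexicon_inv is {'pos': ['good', 'great'], 'neg': ['bad']}), the generated files contain:

# syn file: the members of each class, tab-separated, one class per line
good	great
bad

# ant file: each word followed by the members of every other class
good	bad
great	bad
bad	good	great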
Example #6
def build_custom2(train_path,
                  word2vec_param=default_word2vec_param,
                  lexicon_name='', valid_num=0.1, top=10,
                  clean_after=True, **kwargs):
    """Build a Word2Vec model using SWE method (optimization with
inequalities).

    Args:
        lexicon: The lexicon used to build the inequalities.
        valid_num: How much inequations should be used for cross-validation (either a floar between 0 and 1 or an integer.
        top: See feat.build_ineq_for_model
        clean_after: Clean the files after building the model if True."""
    lexicon = lexicons.get_lexicon(lexicon_name)
    model = None
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()

    ineq_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                            delete=False, prefix='ineq')
    ineq_file.close()

    vocab_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='vocab')

    try:
        logger.info('Build vocabulary file')
        vocab = Counter()
        for line in source:
            vocab.update(line)
            input_file.write(' '.join(line))
            input_file.write('\n')
        vocab = OrderedDict(sorted(vocab.items(), key=lambda t: t[1],
                                   reverse=True))
        # Iterate over a copy: deleting from a dict while iterating
        # over it raises a RuntimeError in Python 3.
        for word in list(vocab):
            # Ignore words with freq < min_count
            if ('min_count' in word2vec_param
                    and vocab[word] < word2vec_param['min_count']):
                del vocab[word]
                continue
            vocab_file.write('%s\t%d\n' % (word, vocab[word]))
        vocab_file.close()

        model0 = get_custom0(word2vec_param=word2vec_param)
        new_lexicon = {}
        for w in lexicon:
            # Ignore word not in vocab nor in model
            if w not in model0 or w not in vocab:
                continue
            new_lexicon[w] = lexicon[w]
        feat.build_ineq_for_model(model0, new_lexicon,
                                  output_path=ineq_file.name,
                                  vocab=list(vocab),
                                  top=top)
        utils.split_train_valid(ineq_file.name, valid_num=valid_num)

        input_file.close()
        cmd = ['bin/SWE_Train',
               '-train', input_file.name,
               '-read-vocab', vocab_file.name,
               '-output', output_file.name,
               '-size', str(word2vec_param['size']),
               '-window', str(word2vec_param['window']),
               '-sample', str(word2vec_param['sample']),
               '-hs', str(word2vec_param['hs']),
               '-iter', str(word2vec_param['iter']),
               '-min-count', str(word2vec_param['min_count']),
               '-sem-train', ineq_file.name + '.train',
               '-sem-valid', ineq_file.name + '.valid',
        ]
        logger.info(' '.join(cmd))
        p = Popen(cmd,
                  stdin=PIPE,
                  stdout=PIPE,
                  stderr=PIPE,
                  cwd=res.SWE_PATH)
        out, err = p.communicate()
        err = err.decode()
        out = out.decode()
        logger.info(out)
        logger.error(err)
        if p.returncode == 0:
            model = gensim.models.Word2Vec.load_word2vec_format(output_file.name,
                                                                binary=False,
                                                                unicode_errors='replace')
    finally:
        if clean_after:
            os.remove(vocab_file.name)
            os.remove(input_file.name)
            os.remove(output_file.name)
            os.remove(ineq_file.name)
            os.remove(ineq_file.name + '.train')
            os.remove(ineq_file.name + '.valid')
    return model
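A minimal usage sketch (paths and lexicon name are placeholders); build_custom2 returns None when the external SWE_Train run fails, so check before saving:

model = build_custom2('/path/to/tweets.log',
                      lexicon_name='my_lexicon',
                      valid_num=0.1, top=10)
if model is not None:
    model.save('/path/to/custom2.model')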