Example #1
def build_all_ineq(lexicon, output_path, vocab=None, truncate=None):
    """Write inequalities for SWE like models in output_path.

    For each word of each class in lexicon, build inqualities like
this :
    lexicon['c1'][i] lexicon['c1'][j] lexicon['c1'][i] lexicon['c2'][l]

    If vocab is supplied, do not build inequalities for words that
aren't in the vocabulary.
    """
    with codecs.open(output_path, 'w+', 'utf-8') as ofile:
        lexicon_inv = utils.invert_dict_nonunique(lexicon)
        for (c1, c2) in itertools.combinations(lexicon_inv, 2):
            lst_c1 = list(lexicon_inv[c1])
            if vocab is not None:
                # Filter with a comprehension: removing items from a list
                # while iterating over it skips elements.
                lst_c1 = [w for w in lst_c1 if w in vocab]
            random.shuffle(lst_c1)
            lst_c1 = lst_c1[:truncate]

            lst_c2 = list(lexicon_inv[c2])
            if vocab is not None:
                lst_c2 = [w for w in lst_c2 if w in vocab]
            random.shuffle(lst_c2)
            lst_c2 = lst_c2[:truncate]

            for (c1_w1, c1_w2) in itertools.combinations(lst_c1, 2):
                for c2_w1 in lst_c2:
                    ofile.write('%s %s %s %s\n' % (c1_w1, c1_w2, c1_w1, c2_w1))
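A minimal usage sketch for build_all_ineq; the toy lexicon, vocabulary and output path below are invented for illustration, and utils.invert_dict_nonunique is assumed to map each class back to its list of words.

# Hypothetical call with a tiny word -> class lexicon.
lexicon = {'good': 'positive', 'great': 'positive', 'bad': 'negative'}
vocab = {'good', 'great', 'bad'}
build_all_ineq(lexicon, '/tmp/ineq.txt', vocab=vocab, truncate=100)
# /tmp/ineq.txt then contains lines such as:
# good great good bad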
Example #2
def compare_model_with_lexicon_class(model, lexicon,
                                     **kwargs):
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for c in lexicon_inv:
        # Restrict the lexicon to the words of this class.
        c_lexicon = {w: c for w in lexicon_inv[c]}
        logger.info('Compare with class %s', c)
        compare_model_with_lexicon(model, c_lexicon, **kwargs)
Example #3
def build_custom3(initial_model=None,
                  lexicon_name='',
                  a_i=0.5, b_ij=0.5, n_iter=10, in_place=True, **kwargs):
    """Retrofit a model using faruqui:2014:NIPS-DLRLW method.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')

    old_lexicon = lexicons.get_lexicon(lexicon_name)

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for w in old_lexicon:
        if w in model:
            lexicon[w] = old_lexicon[w]
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            num_neighbours = len(word_neighbours)

            # no neighbours, pass - use data estimate
            if num_neighbours == 0:
                continue

            # Check the neighbour count first so the 'degree' weighting
            # cannot divide by zero.
            if b_ij == 'degree':
                b_ij = 1/num_neighbours

            # the weight of the data estimate is the number of neighbours
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # loop over neighbours and add to new vector
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b_ij * model.wv.syn0[j]

            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b_ij * np.sum(model.wv.syn0[word_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours * (b_ij + a_i))
    return model
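A minimal sketch of the single-word update performed above, written against plain numpy arrays instead of a gensim model; the toy vectors, neighbour indices and weights are invented for illustration.

import numpy as np

a_i, b_ij = 0.5, 0.5
initial = np.array([[1.0, 0.0], [0.8, 0.2], [0.0, 1.0]])  # original vectors
vectors = initial.copy()                                   # retrofitted vectors

i = 0              # index of the word being updated
neighbours = [1]   # indices of its same-class neighbours
n = len(neighbours)

# new_i = (n * a_i * initial_i + b_ij * sum_j vectors_j) / (n * (b_ij + a_i))
vectors[i] = n * a_i * initial[i]
vectors[i] += b_ij * np.sum(vectors[neighbours], axis=0)
vectors[i] /= n * (b_ij + a_i)
print(vectors[i])  # [0.9 0.1]: pulled towards its neighbour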
Example #4
def old_compare_model_with_lexicon(model, lexicon,
                               topn=100,
                               sample_size=None,
                               clean_after=True,
                               normalize_word=True):
    """Compare model with lexicon with trec_eval script.

https://faculty.washington.edu/levow/courses/ling573_SPR2011/hw/trec_eval_desc.htm
http://trec.nist.gov/trec_eval/

./trec_eval qrel top

TOP reponse
| QID2  | ITER        | DOCNO | RANK        | SIM                    | RUN_ID |
|-------+-------------+-------+-------------+------------------------+--------|
|       | 0 (ignored) | word  | 1 (ignored) | similarity score float | RUN_ID |

QREL verite terrain
| QID | ITER        | DOCNO | REL |
|-----+-------------+-------+-----|
|     | 0 (ignored) |       |     |

QID = ID du mot

    Args:
        model: variable documentation.
        lexicon: variable documentation.
        topn: variable documentation.
        sample_size: variable documentation.

    Returns:
        Returns information

    Raises:
        IOError: An error occurred.
    """
    logger.info('Build lexicon_index for qid (%s)', sample_size)
    if sample_size is None:
        sample_size = len(list(lexicon))
    else:
        sample_size = min(sample_size, len(list(lexicon)))

    if normalize_word:
        model_vocab = [w_norm(w) for w in model.wv.vocab]
    else:
        model_vocab = list(model.wv.vocab)

    lexicon_index = list(enumerate([word for word
                                    in random.sample(list(lexicon),
                                                     sample_size)
                                    if word in model_vocab]))

    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    qrel_file = tempfile.NamedTemporaryFile(mode='w+',
                                            encoding='utf-8',
                                            delete=False,
                                            prefix='qrel')
    logger.info('Build Ground Truth Qrel file (%s)', qrel_file.name)

    for qid, word in lexicon_index:
        for docno in lexicon_inv[lexicon[word]]:
            qrel_file.write('%d 0 %s 1\n' % (qid, docno))

    top_file = tempfile.NamedTemporaryFile(mode='w+',
                                           encoding='utf-8',
                                           delete=False,
                                           prefix='top')
    logger.info('Build Top (%d) answer from the model (%s)',
                topn, top_file.name)

    for qid, word in lexicon_index:
        seen_docno = {}
        word_in_vocab = list(model.wv.vocab)[model_vocab.index(word)]
        for (rank, (docno, sim)) in enumerate(model.most_similar(word_in_vocab,
                                                                 topn=topn)):
            if docno == '' or docno in seen_docno or not re.match(r'^[a-z]+$', docno):
                continue
            seen_docno[docno] = 1
            top_file.write('%d 0 %s %d %f runid\n' % (qid, docno, rank, sim))
            if len(seen_docno) == topn:
                break

    logger.info('Run trec_eval script')
    ret = None
    try:
        p = Popen(['./trec_eval',
                   '-m', 'all_trec',
                   '-m', 'P.1,2,5,10,25,50,100,200,500,1000',
                   qrel_file.name,
                   top_file.name],
                  stdout=PIPE, stderr=PIPE, cwd=res.TREC_EVAL_PATH)
        out, err = p.communicate()
        ret = out + err
        ret = ret.decode()
        logger.info(ret)
    except Exception:
        logger.exception('trec_eval failed')
    if clean_after:
        os.remove(qrel_file.name)
        os.remove(top_file.name)

    return ret
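A hypothetical call, assuming a trained gensim model and a word -> class lexicon are already in scope, and that the trec_eval binary is available under res.TREC_EVAL_PATH.

report = old_compare_model_with_lexicon(
    model, lexicon, topn=100, sample_size=500)
if report is not None:
    print(report)  # raw trec_eval output (MAP, P@1, P@5, ...)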
Example #5
def build_custom3_2(initial_model=None,
                    lexicon_name='',
                    a_i=1, b_ij=1,
                    n_iter=10, in_place=True,
                    d=1, topn=50, **kwargs):
    """Derived from faruqui:2014:NIPS-DLRLW method.
Put same class closer.

Also moves the topn neighboors by d x <the actual translation>

    Args:
        in_place: Modify the given model instead of copying it if True."""
    logger.info('Customize 3_2 with %s', lexicon_name)
    if initial_model is None:
        raise ValueError('Need an initial model')

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_lex_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]

            word_model_neighbours = model.most_similar(word, topn=topn)

            num_lex_neighbours = len(word_lex_neighbours)

            # FIXME: use not_lex_neighbours
            # no lex_neighbours, pass - use data estimate
            if num_lex_neighbours == 0:
                continue

            # Check the neighbour count first so the 'degree' weighting
            # cannot divide by zero.
            if b_ij == 'degree':
                b_ij = 1/num_lex_neighbours

            # the weight of the data estimate is the number of lex_neighbours
            model.wv.syn0[i] = num_lex_neighbours * a_i * initial_model.wv.syn0[i]

            # loop over lex_neighbours and add to new vector
            # for pp_word in word_lex_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b_ij * model.wv.syn0[j]

            # Vectorized version of the above
            word_lex_neighbours = [model.wv.vocab[w].index for w in word_lex_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b_ij * np.sum(model.wv.syn0[word_lex_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_lex_neighbours * (b_ij + a_i))

            for (neighbour, _) in word_model_neighbours:
                i = model.wv.vocab[neighbour].index
                model.wv.syn0[i] = d * num_lex_neighbours * a_i * initial_model.wv.syn0[i]
                model.wv.syn0[i] = d * (model.wv.syn0[i] + b_ij * np.sum(model.wv.syn0[word_lex_neighbours], axis=0))

    return model
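A hypothetical call; the model variable and the lexicon name are placeholders. d scales how far the topn model neighbours are dragged along, and in_place=False works on a copy of the model.

retrofitted = build_custom3_2(
    initial_model=model, lexicon_name='my_lexicon',
    a_i=1, b_ij=1, n_iter=10, in_place=False, d=0.5, topn=50)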
Example #6
def build_custom3_1(initial_model=None,
                    lexicon_name='',
                    a_i=1, b_ij=1, c_ij=1, n_iter=10, in_place=True, **kwargs):
    """Derived from faruqui:2014:NIPS-DLRLW method.
Put same class closer and other classes away.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')

    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(mode='w+',
                                                         encoding='utf-8',
                                                         delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]

            # Non-neighbours are words with different classes than WORD
            word_not_neighbours = []
            for c in lexicon_inv:
                if c != lexicon[word]:
                    word_not_neighbours.extend(lexicon_inv[c])
            # Remove duplicates
            word_not_neighbours = list(set(word_not_neighbours))

            num_neighbours = len(word_neighbours)
            num_not_neighbours = len(word_not_neighbours)

            # FIXME: use not_neighbours
            # no neighbours, pass - use data estimate
            if num_neighbours == 0 and num_not_neighbours == 0:
                continue

            # Guard the 'degree' weightings so they cannot divide by zero.
            if b_ij == 'degree' and num_neighbours != 0:
                b_ij = 1/num_neighbours

            if c_ij == 'degree' and num_not_neighbours != 0:
                c_ij = 1/num_not_neighbours

            # the weight of the data estimate is the number of neighbours
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # loop over neighbours and add to new vector
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b_ij * model.wv.syn0[j]

            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b_ij * np.sum(model.wv.syn0[word_neighbours], axis=0)
            word_not_neighbours = [model.wv.vocab[w].index for w in word_not_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] - c_ij * np.sum(model.wv.syn0[word_not_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours * (b_ij + a_i))
    return model
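A hypothetical call; the model variable and the lexicon name are again placeholders. c_ij weights the push away from words of the other classes.

retrofitted = build_custom3_1(
    initial_model=model, lexicon_name='my_lexicon',
    a_i=1, b_ij=1, c_ij=0.1, n_iter=10, in_place=False)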
Example #7
def build_custom_mce(train_path,
                     word2vec_param=default_word2vec_param,
                     lexicon_name='', valid_num=0.1, top=10,
                     clean_after=True, **kwargs):
    """Build a Word2Vec model using MCE method.

    Args:
        lexicon: The lexicon used to build the inequalities.
        valid_num: How much inequations should be used for cross-validation (either a floar between 0 and 1 or an integer.
        top: See feat.build_ineq_for_model
        clean_after: Clean the files after building the model if True."""
    lexicon = lexicons.get_lexicon(lexicon_name)

    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    for line in source:
        input_file.write(' '.join(line))
        input_file.write('\n')
    input_file.close()

    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()

    syn_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='syn')
    ant_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='ant')

    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    # Generate syn_file
    for c in lexicon_inv:
        syn_file.write('\t'.join(lexicon_inv[c]))
        syn_file.write('\n')
    syn_file.close()

    # Generate ant_file
    for cur_c in lexicon_inv:
        for word in lexicon_inv[cur_c]:
            for c in lexicon_inv:
                # skip current observed class
                if c == cur_c:
                    continue
                ant_file.write(word + '\t' + '\t'.join(lexicon_inv[c]))
                ant_file.write('\n')
    ant_file.close()

    cmd = ['./word2vec',
           '-train', input_file.name,
           '-output', output_file.name,
           '-size', str(word2vec_param['size']),
           '-window', str(word2vec_param['window']),
           '-sample', str(word2vec_param['sample']),
           '-hs', str(word2vec_param['hs']),
           '-iter', str(word2vec_param['iter']),
           '-min-count', str(word2vec_param['min_count']),
           '-read-syn', syn_file.name,
           '-read-ant', ant_file.name,
    ]
    logger.info(' '.join(cmd))
    p = Popen(cmd,
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              cwd=res.MCE_PATH)
    out, err = p.communicate()
    err = err.decode()
    out = out.decode()
    logger.info(out)
    logger.error(err)
    model = None
    if p.returncode == 0:
        model = gensim.models.Word2Vec.load_word2vec_format(
            output_file.name, binary=False, unicode_errors='replace')
    if clean_after:
        os.remove(output_file.name)
        os.remove(input_file.name)
        os.remove(syn_file.name)
        os.remove(ant_file.name)
    return model
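For illustration, with a toy lexicon {'good': 'pos', 'great': 'pos', 'bad': 'neg'} (invented here), the generated files would contain the following, with tab-separated fields:

syn file, one class per line:
good    great
bad

ant file, each word followed by the words of every other class:
good    bad
great   bad
bad     good    great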
Example #8
def build_ineq_for_model(model,
                         lexicon,
                         output_path=None,
                         vocab=None,
                         truncate=None,
                         top=None,
                         fmt_str='{c1_w1} {c1_w2} {c1_w1} {c2_w1}\n'):
    """Write inequalities for SWE-like models in OUTPUT_PATH based on the
words in MODEL that are not properly positioned according to the
LEXICON.

    For all words in the LEXICON build inequalities to constrain the
    words that belong to a class but are closer to words in another
    class.

    If VOCAB is supplied, do not build inequalities for words that
    aren't in the vocabulary.

    TOP is used to keep only the closest/farthest words to the
    other/same class.

    TRUNCATE is used to limit the number of words to consider in the
    LEXICON for each class.

    If OUTPUT_PATH is None, return a list of inequalities instead.

    """
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    ineq = []
    cpt = 0
    if output_path is not None:
        ofile = codecs.open(output_path, 'w+', 'utf-8')
    try:
        for (c1, c2) in itertools.combinations(lexicon_inv, 2):
            # c2, c1 = c1, c2
            # All vectors of words in c1
            c1_w = [w for w in lexicon_inv[c1][:truncate] if w in model]
            c1_v = np.array([model[w] for w in c1_w])
            # All vectors of words in c2
            c2_w = [w for w in lexicon_inv[c2][:truncate] if w in model]
            c2_v = np.array([model[w] for w in c2_w])

            logger.info('c1_v = %s', c1_v.shape)
            logger.info('c2_v = %s', c2_v.shape)
            # Concatenate c1 and c2 vectors
            c1_c2_v = np.append(c1_v, c2_v, 0)

            # The index of the first word of c2 in c1_c2_v
            first_c2_i = c1_v.shape[0]

            # The distances between each words in c1 to c1 and c2
            cdist = scipy.spatial.distance.cdist(c1_v,
                                                 c1_c2_v,
                                                 metric='cosine')
            # We only consider the strict upper triangle of the cdist
            # matrix; everything under the diagonal has already been
            # considered
            cdist = np.triu(cdist, 1)

            # For each c1 word, the column indices sorted by distance
            sorted_cdist_idx = np.argsort(cdist)

            # Iter on rows (words of c1)
            for i in range(c1_v.shape[0]):
                c1_to_reorder = []
                c2_to_reorder = []
                # Iter on columns (ordered index of dist(c1_v[i], c2_v))
                for (idx_j, j) in enumerate(sorted_cdist_idx[i][i:]):
                    idx_j += i
                    # If the current index belongs to c2 but is in the c1 segment
                    if j >= first_c2_i and idx_j < first_c2_i:
                        # Save it
                        c2_to_reorder.append(j)
                    # If the current index belongs to c1 but is in the c2 segment
                    elif j < first_c2_i and idx_j >= first_c2_i:
                        # Save it
                        c1_to_reorder.append(j)

                # The column loop is over; generate the inequalities to
                # reorder c1 words before c2 words

                # The word everything started from (the one this row was
                # built for)
                c1_w1 = c1_w[i]
                products = itertools.product(c1_to_reorder[::-1][:top],
                                             c2_to_reorder[:top])
                for (c1_w2_idx, c2_w1_idx) in products:
                    cpt += 1
                    c1_w2 = c1_w[c1_w2_idx]
                    # Map the concatenated-array index back into c2_w
                    c2_w1 = c2_w[c2_w1_idx - first_c2_i]
                    if output_path is not None:
                        ofile.write(
                            fmt_str.format(c1_w1=c1_w1,
                                           c1_w2=c1_w2,
                                           c2_w1=c2_w1))
                    else:
                        ineq.append([c1_w1, c1_w2, c2_w1])
                # Switch to a 1-based count for the progress logging below
                i += 1
                if i % 100 == 0 or i == c1_v.shape[0]:
                    logger.info('(%d/%d) %d inequalities', i, c1_v.shape[0],
                                cpt)
    finally:
        if output_path is not None:
            ofile.close()
    return ineq
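A hypothetical call that returns the inequalities as a list instead of writing a file; model and lexicon are assumed to exist.

ineq = build_ineq_for_model(
    model, lexicon, output_path=None, truncate=500, top=10)
# Each entry is [c1_w1, c1_w2, c2_w1]: c1_w1 should end up closer to
# c1_w2 than to c2_w1.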