def build_all_ineq(lexicon, output_path, vocab=None, truncate=None):
    """Write inequalities for SWE-like models in output_path.

    For each word of each class in lexicon, build inequalities like this:
    lexicon['c1'][i] lexicon['c1'][j] lexicon['c1'][i] lexicon['c2'][l]

    If vocab is supplied, do not build inequalities for words that aren't
    in the vocabulary.
    """
    with codecs.open(output_path, 'w+', 'utf-8') as ofile:
        lexicon_inv = utils.invert_dict_nonunique(lexicon)
        for (c1, c2) in itertools.combinations(lexicon_inv, 2):
            lst_c1 = lexicon_inv[c1]
            if vocab is not None:
                # Filter with a comprehension instead of removing items from
                # the list while iterating over it.
                lst_c1 = [w for w in lst_c1 if w in vocab]
            random.shuffle(lst_c1)
            lst_c1 = lst_c1[:truncate]
            lst_c2 = lexicon_inv[c2]
            if vocab is not None:
                lst_c2 = [w for w in lst_c2 if w in vocab]
            random.shuffle(lst_c2)
            lst_c2 = lst_c2[:truncate]
            for (c1_w1, c1_w2) in itertools.combinations(lst_c1, 2):
                for c2_w1 in lst_c2:
                    ofile.write('%s %s %s %s\n' % (c1_w1, c1_w2, c1_w1, c2_w1))
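

# Hedged usage sketch for build_all_ineq: the toy lexicon and output path
# below are purely illustrative (not part of the original module). Each
# emitted line "w1 w2 w1 w3" encodes the constraint that w1 should stay
# closer to w2 (same class) than to w3 (other class).
def _example_build_all_ineq():
    toy_lexicon = {'good': 'positive', 'great': 'positive',
                   'bad': 'negative', 'awful': 'negative'}
    build_all_ineq(toy_lexicon, '/tmp/toy_ineq.txt')
    # /tmp/toy_ineq.txt then contains lines such as:
    #   good great good bad
    #   good great good awful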


def compare_model_with_lexicon_class(model, lexicon, **kwargs):
    """Compare the model against each class of the lexicon separately."""
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for c in lexicon_inv:
        # Restrict the lexicon to the words of the current class.
        c_lexicon = {}
        for w in lexicon_inv[c]:
            c_lexicon[w] = c
        logger.info('Compare with class %s', c)
        compare_model_with_lexicon(model, c_lexicon, **kwargs)


def build_custom3(initial_model=None, lexicon_name='', a_i=0.5, b_ij=0.5,
                  n_iter=10, in_place=True, **kwargs):
    """Retrofit a model using the faruqui:2014:NIPS-DLRLW method.

    Args:
        in_place: Modify the given model instead of copying it if True.
    """
    if initial_model is None:
        raise ValueError('Need an initial model')
    old_lexicon = lexicons.get_lexicon(lexicon_name)
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon words that appear in the model vocabulary.
    lexicon = {}
    for w in old_lexicon:
        if w in model:
            lexicon[w] = old_lexicon[w]
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # Loop through every node also in the ontology (else just keep the
        # data estimate).
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            num_neighbours = len(word_neighbours)
            # No neighbours: keep the data estimate.
            if num_neighbours == 0:
                continue
            # Use a local weight so that 'degree' does not overwrite b_ij
            # for the following words.
            b = 1 / num_neighbours if b_ij == 'degree' else b_ij
            # The data estimate is weighted by the number of neighbours.
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # Loop over the neighbours and add them to the new vector
            # (vectorized version of a loop over word_neighbours).
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b * np.sum(
                model.wv.syn0[word_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours * (b + a_i))
    return model
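

# Minimal numpy sketch of the retrofitting update applied in build_custom3
# for one word and one iteration (illustrative helper, not part of the
# original module). vec_hat is the original embedding and neighbour_vecs the
# embeddings of the word's same-class lexicon neighbours; the weights follow
# the code above (data estimate weighted by num_neighbours * a_i, every
# neighbour weighted by b_ij).
def _example_retrofit_update(vec_hat, neighbour_vecs, a_i=0.5, b_ij=0.5):
    num_neighbours = len(neighbour_vecs)
    if num_neighbours == 0:
        # No neighbours: keep the data estimate.
        return vec_hat
    new_vec = num_neighbours * a_i * vec_hat
    new_vec = new_vec + b_ij * np.sum(neighbour_vecs, axis=0)
    return new_vec / (num_neighbours * (b_ij + a_i))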


def old_compare_model_with_lexicon(model, lexicon, topn=100, sample_size=None,
                                   clean_after=True, normalize_word=True):
    """Compare a model with a lexicon using the trec_eval script.

    https://faculty.washington.edu/levow/courses/ling573_SPR2011/hw/trec_eval_desc.htm
    http://trec.nist.gov/trec_eval/

    ./trec_eval qrel top

    TOP (answer file)
    | QID | ITER        | DOCNO | RANK        | SIM                    | RUN_ID |
    |-----+-------------+-------+-------------+------------------------+--------|
    |     | 0 (ignored) | word  | 1 (ignored) | similarity score float | RUN_ID |

    QREL (ground truth)
    | QID | ITER        | DOCNO | REL |
    |-----+-------------+-------+-----|
    |     | 0 (ignored) |       |     |

    QID = ID of the queried word.

    Args:
        model: The gensim word2vec model to evaluate.
        lexicon: A word-to-class dictionary.
        topn: The number of most similar words retrieved for each query.
        sample_size: The number of lexicon words sampled as queries
            (defaults to the whole lexicon).

    Returns:
        The output of trec_eval as a string, or None if it failed.

    Raises:
        IOError: An error occurred.
    """
    logger.info('Build lexicon_index for qid (%s)', sample_size)
    if sample_size is None:
        sample_size = len(list(lexicon))
    else:
        sample_size = min(sample_size, len(list(lexicon)))
    if normalize_word:
        model_vocab = [w_norm(w) for w in model.wv.vocab]
    else:
        model_vocab = list(model.wv.vocab)
    lexicon_index = list(enumerate(
        [word for word in random.sample(list(lexicon), sample_size)
         if word in model_vocab]))
    lexicon_inv = utils.invert_dict_nonunique(lexicon)

    qrel_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                            delete=False, prefix='qrel')
    logger.info('Build Ground Truth Qrel file (%s)', qrel_file.name)
    for qid, word in lexicon_index:
        for docno in lexicon_inv[lexicon[word]]:
            qrel_file.write('%d 0 %s 1\n' % (qid, docno))
    # Flush to disk before trec_eval reads the file.
    qrel_file.close()

    top_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='top')
    logger.info('Build Top (%d) answer from the model (%s)',
                topn, top_file.name)
    for qid, word in lexicon_index:
        seen_docno = {}
        word_in_vocab = list(model.wv.vocab)[model_vocab.index(word)]
        for (rank, (docno, sim)) in enumerate(
                model.most_similar(word_in_vocab, topn=topn)):
            if (docno == '' or docno in seen_docno
                    or not re.match(r'^[a-z]+$', docno)):
                continue
            seen_docno[docno] = 1
            top_file.write('%d 0 %s %d %f runid\n' % (qid, docno, rank, sim))
            if len(seen_docno) == topn:
                break
    # Flush to disk before trec_eval reads the file.
    top_file.close()

    logger.info('Run trec_eval script')
    ret = None
    try:
        p = Popen(['./trec_eval', '-m', 'all_trec',
                   '-m', 'P.1,2,5,10,25,50,100,200,500,1000',
                   qrel_file.name, top_file.name],
                  stdout=PIPE, stderr=PIPE, cwd=res.TREC_EVAL_PATH)
        out, err = p.communicate()
        ret = out + err
        ret = ret.decode()
        logger.info(ret)
    except Exception:
        logger.exception('trec_eval failed')
    if clean_after:
        os.remove(qrel_file.name)
        os.remove(top_file.name)
    return ret
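

# Illustrative sketch of the two files handed to trec_eval (toy content, not
# produced from real data). QIDs index the sampled lexicon words; DOCNOs are
# the words returned by the model. The line formats match the writes in
# old_compare_model_with_lexicon above.
def _example_trec_eval_files():
    # Ground-truth qrel lines: QID ITER DOCNO REL
    qrel_lines = ['0 0 great 1',
                  '0 0 nice 1']
    # Answer ("top") lines: QID ITER DOCNO RANK SIM RUN_ID
    top_lines = ['0 0 great 0 0.812345 runid',
                 '0 0 awful 1 0.754321 runid']
    return qrel_lines, top_lines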


def build_custom3_2(initial_model=None, lexicon_name='', a_i=1, b_ij=1,
                    n_iter=10, in_place=True, d=1, topn=50, **kwargs):
    """Derived from the faruqui:2014:NIPS-DLRLW method.

    Put words of the same class closer. Also move the topn model neighbours
    of each word by d times the same translation.

    Args:
        in_place: Modify the given model instead of copying it if True.
    """
    logger.info('Customize 3_2 with %s', lexicon_name)
    if initial_model is None:
        raise ValueError('Need an initial model')
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon words that appear in the model vocabulary.
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # Loop through every node also in the ontology (else just keep the
        # data estimate).
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_lex_neighbours = [w for w in lexicon_inv[lexicon[word]]
                                   if w != word]
            word_model_neighbours = model.most_similar(word, topn=topn)
            num_lex_neighbours = len(word_lex_neighbours)
            # FIXME: use not_lex_neighbours.
            # No lexicon neighbours: keep the data estimate.
            if num_lex_neighbours == 0:
                continue
            # Use a local weight so that 'degree' does not overwrite b_ij
            # for the following words.
            b = 1 / num_lex_neighbours if b_ij == 'degree' else b_ij
            # The data estimate is weighted by the number of lexicon
            # neighbours.
            model.wv.syn0[i] = (num_lex_neighbours * a_i
                                * initial_model.wv.syn0[i])
            # Loop over the lexicon neighbours and add them to the new
            # vector (vectorized version of a loop over
            # word_lex_neighbours).
            word_lex_neighbours = [model.wv.vocab[w].index
                                   for w in word_lex_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b * np.sum(
                model.wv.syn0[word_lex_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_lex_neighbours
                                                   * (b + a_i))
            # Also move the topn model neighbours, scaled by d.
            for (neighbour, _) in word_model_neighbours:
                i = model.wv.vocab[neighbour].index
                model.wv.syn0[i] = (d * num_lex_neighbours * a_i
                                    * initial_model.wv.syn0[i])
                model.wv.syn0[i] = d * (model.wv.syn0[i] + b * np.sum(
                    model.wv.syn0[word_lex_neighbours], axis=0))
    return model


def build_custom3_1(initial_model=None, lexicon_name='', a_i=1, b_ij=1,
                    c_ij=1, n_iter=10, in_place=True, **kwargs):
    """Derived from the faruqui:2014:NIPS-DLRLW method.

    Put words of the same class closer and words of other classes away.

    Args:
        in_place: Modify the given model instead of copying it if True.
    """
    if initial_model is None:
        raise ValueError('Need an initial model')
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon words that appear in the model vocabulary.
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # Loop through every node also in the ontology (else just keep the
        # data estimate).
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            # Non-neighbours are the words of the other classes.
            word_not_neighbours = []
            for c in lexicon_inv:
                if c != lexicon[word]:
                    word_not_neighbours.extend(lexicon_inv[c])
            # Remove duplicates.
            word_not_neighbours = list(set(word_not_neighbours))
            num_neighbours = len(word_neighbours)
            num_not_neighbours = len(word_not_neighbours)
            # FIXME: use not_neighbours when there are no neighbours.
            # No neighbours at all: keep the data estimate.
            if num_neighbours == 0 and num_not_neighbours == 0:
                continue
            # Use local weights so that 'degree' does not overwrite b_ij
            # and c_ij for the following words.
            b_weight = 1 / num_neighbours if b_ij == 'degree' else b_ij
            c_weight = 1 / num_not_neighbours if c_ij == 'degree' else c_ij
            # The data estimate is weighted by the number of neighbours.
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # Add the neighbours (attract) and subtract the non-neighbours
            # (repel), vectorized.
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] + b_weight * np.sum(
                model.wv.syn0[word_neighbours], axis=0)
            word_not_neighbours = [model.wv.vocab[w].index
                                   for w in word_not_neighbours]
            model.wv.syn0[i] = model.wv.syn0[i] - c_weight * np.sum(
                model.wv.syn0[word_not_neighbours], axis=0)
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours
                                                   * (b_weight + a_i))
    return model


def build_custom_mce(train_path, word2vec_param=default_word2vec_param,
                     lexicon_name='', valid_num=0.1, top=10,
                     clean_after=True, **kwargs):
    """Build a Word2Vec model using the MCE method.

    Args:
        lexicon_name: The name of the lexicon used to build the constraints.
        valid_num: How many inequations should be used for cross-validation
            (either a float between 0 and 1 or an integer).
        top: See feat.build_ineq_for_model.
        clean_after: Clean the files after building the model if True.
    """
    lexicon = lexicons.get_lexicon(lexicon_name)
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    # Write the training corpus to a temporary file, one document per line.
    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    for line in source:
        input_file.write(' '.join(line))
        input_file.write('\n')
    input_file.close()

    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()

    syn_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='syn')
    ant_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='ant')
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    # Generate syn_file: one line per class, listing its words.
    for c in lexicon_inv:
        syn_file.write('\t'.join(lexicon_inv[c]))
        syn_file.write('\n')
    syn_file.close()
    # Generate ant_file: each word paired with the words of the other classes.
    for cur_c in lexicon_inv:
        for word in lexicon_inv[cur_c]:
            for c in lexicon_inv:
                # Skip the currently observed class.
                if c == cur_c:
                    continue
                ant_file.write(word + '\t' + '\t'.join(lexicon_inv[c]))
                ant_file.write('\n')
    ant_file.close()

    cmd = ['./word2vec',
           '-train', input_file.name,
           '-output', output_file.name,
           '-size', str(word2vec_param['size']),
           '-window', str(word2vec_param['window']),
           '-sample', str(word2vec_param['sample']),
           '-hs', str(word2vec_param['hs']),
           '-iter', str(word2vec_param['iter']),
           '-min-count', str(word2vec_param['min_count']),
           '-read-syn', syn_file.name,
           '-read-ant', ant_file.name,
           ]
    logger.info(' '.join(cmd))
    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=res.MCE_PATH)
    out, err = p.communicate()
    err = err.decode()
    out = out.decode()
    logger.info(out)
    logger.error(err)
    # Initialize model so the function returns None if the command failed.
    model = None
    if p.returncode == 0:
        model = gensim.models.Word2Vec.load_word2vec_format(
            output_file.name, binary=False, unicode_errors='replace')
    if clean_after:
        os.remove(output_file.name)
        os.remove(input_file.name)
        os.remove(syn_file.name)
        os.remove(ant_file.name)
    return model
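

# Illustrative sketch of the syn/ant constraint files fed to the MCE word2vec
# binary (toy lexicon, not part of the original module). Each syn line lists
# the words of one class; each ant line pairs a word with every word of
# another class, all tab-separated, matching the writes in build_custom_mce
# above.
def _example_mce_constraint_files():
    toy_lexicon_inv = {'positive': ['good', 'great'],
                       'negative': ['bad', 'awful']}
    syn_lines = ['\t'.join(words) for words in toy_lexicon_inv.values()]
    ant_lines = []
    for cur_c, words in toy_lexicon_inv.items():
        for word in words:
            for c, others in toy_lexicon_inv.items():
                if c != cur_c:
                    ant_lines.append(word + '\t' + '\t'.join(others))
    # syn_lines == ['good\tgreat', 'bad\tawful']
    # ant_lines[0] == 'good\tbad\tawful'
    return syn_lines, ant_lines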


def build_ineq_for_model(model, lexicon, output_path=None,
                         vocab=None, truncate=None, top=None,
                         fmt_str='{c1_w1} {c1_w2} {c1_w1} {c2_w1}\n'):
    """Write inequalities for SWE-like models in OUTPUT_PATH based on the
    words in MODEL that are not properly positioned according to the
    LEXICON.

    For all words in the LEXICON, build inequalities to constrain the words
    that belong to a class but are closer to words of another class.

    If VOCAB is supplied, do not build inequalities for words that aren't
    in the vocabulary.

    TOP is used to keep only the closest/farthest words to the other/same
    class.

    TRUNCATE is used to limit the number of words to consider in the
    LEXICON for each class.

    When OUTPUT_PATH is None, return a list of inequalities instead.
    """
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    ineq = []
    cpt = 0
    if output_path is not None:
        ofile = codecs.open(output_path, 'w+', 'utf-8')
    try:
        for (c1, c2) in itertools.combinations(lexicon_inv, 2):
            # c2, c1 = c1, c2
            # All vectors of words in c1
            c1_w = [w for w in lexicon_inv[c1][:truncate] if w in model]
            c1_v = np.array([model[w] for w in c1_w])
            # All vectors of words in c2
            c2_w = [w for w in lexicon_inv[c2][:truncate] if w in model]
            c2_v = np.array([model[w] for w in c2_w])
            logger.info('c1_v = %s', c1_v.shape)
            logger.info('c2_v = %s', c2_v.shape)
            # Concatenate c1 and c2 vectors
            c1_c2_v = np.append(c1_v, c2_v, 0)
            # The index of the first word of c2 in c1_c2_v
            first_c2_i = c1_v.shape[0]
            # The distances between each word in c1 and the words of c1
            # and c2
            cdist = scipy.spatial.distance.cdist(c1_v, c1_c2_v,
                                                 metric='cosine')
            # We only consider the strict upper triangle of the cdist
            # matrix; everything under the diagonal has already been
            # considered
            cdist = np.triu(cdist, 1)
            # The indexes of the distances, ordered
            sorted_cdist_idx = np.argsort(cdist)
            # Iterate on rows (words of c1)
            for i in range(c1_v.shape[0]):
                c1_to_reorder = []
                c2_to_reorder = []
                # Iterate on columns (ordered index of dist(c1_v[i], c2_v))
                for (idx_j, j) in enumerate(sorted_cdist_idx[i][i:]):
                    idx_j += i
                    # If the current index belongs to c2 but is in the c1
                    # segment
                    if j >= first_c2_i and idx_j < first_c2_i:
                        # Save it
                        c2_to_reorder.append(j)
                    # If the current index belongs to c1 but is in the c2
                    # segment
                    elif j < first_c2_i and idx_j >= first_c2_i:
                        # Save it
                        c1_to_reorder.append(j)
                # Columns are done; generate the inequalities to reorder
                # the c1 words before the c2 words.
                # The word from which everything started (the one whose
                # row was built)
                c1_w1 = c1_w[i]
                products = itertools.product(c1_to_reorder[::-1][:top],
                                             c2_to_reorder[:top])
                for (c1_w2_idx, c2_w1_idx) in products:
                    cpt += 1
                    c1_w2 = c1_w[c1_w2_idx]
                    # Map the concatenated index back into c2_w
                    c2_w1 = c2_w[c2_w1_idx - first_c2_i]
                    if output_path is not None:
                        ofile.write(fmt_str.format(c1_w1=c1_w1,
                                                   c1_w2=c1_w2,
                                                   c2_w1=c2_w1))
                    else:
                        ineq.append([c1_w1, c1_w2, c2_w1])
                i += 1
                if i % 100 == 0 or i == c1_v.shape[0]:
                    logger.info('(%d/%d) %d inequalities',
                                i, c1_v.shape[0], cpt)
    finally:
        if output_path is not None:
            ofile.close()
    return ineq
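

# Hedged usage sketch for build_ineq_for_model: the toy lexicon is
# illustrative and its words are assumed to be in the model vocabulary. With
# output_path=None the function returns [c1_w1, c1_w2, c2_w1] triples meaning
# "c1_w1 should stay closer to c1_w2 (same class) than to c2_w1 (other
# class)"; with an output_path it writes them using fmt_str instead.
def _example_build_ineq_for_model(model):
    toy_lexicon = {'good': 'positive', 'great': 'positive',
                   'bad': 'negative', 'awful': 'negative'}
    ineq = build_ineq_for_model(model, toy_lexicon, output_path=None,
                                truncate=1000, top=10)
    return ineq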