def build_custom3(initial_model=None, lexicon_name='',
                  a_i=0.5, b_ij=0.5, n_iter=10, in_place=True, **kwargs):
    """Retrofit a model using the faruqui:2014:NIPS-DLRLW method.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')
    old_lexicon = lexicons.get_lexicon(lexicon_name)
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon entries whose word is in the model's vocabulary
    lexicon = {}
    for w in old_lexicon:
        if w in model:
            lexicon[w] = old_lexicon[w]
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            num_neighbours = len(word_neighbours)
            # no neighbours, pass - use data estimate
            if num_neighbours == 0:
                continue
            # use a local weight so that b_ij == 'degree' is recomputed for
            # each word instead of permanently overwriting the parameter
            if b_ij == 'degree':
                b = 1 / num_neighbours
            else:
                b = b_ij
            # the weight of the data estimate is the number of neighbours
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # loop over neighbours and add to new vector
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b * model.wv.syn0[j]
            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = (model.wv.syn0[i]
                                + b * np.sum(model.wv.syn0[word_neighbours],
                                             axis=0))
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours
                                                   * (b + a_i))
    return model
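# A minimal numpy sketch of the update performed in build_custom3 above
# (illustration only, not part of the pipeline).  With a_i = b_ij = 1 the
# retrofitted vector is a convex combination of the word's original (data)
# vector, weighted by its number of lexicon neighbours, and the sum of the
# current neighbour vectors, as in Faruqui et al. (2014):
#
#     import numpy as np
#     q_hat = np.array([1.0, 0.0])      # original vector of the word
#     neigh = np.array([[0.0, 1.0],
#                       [0.5, 0.5]])    # current vectors of its 2 neighbours
#     n = len(neigh)
#     q = (n * 1 * q_hat + 1 * neigh.sum(axis=0)) / (n * (1 + 1))
#     # q == array([0.625, 0.375])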
def build_custom1(train_path, word2vec_param=default_word2vec_param,
                  lexicon_name='', **kwargs):
    """Train a Word2Vec model on a corpus run through LexiconProjecter.

    Args:
        train_path: Path to the training corpus.
        lexicon_name: Name of the lexicon handed to the projecter."""
    logger.info('Train custom1 model')
    lexicon = lexicons.get_lexicon(lexicon_name)
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source)
    source = Splitter(source)
    source = LexiconProjecter(source, lexicon)
    return gensim.models.Word2Vec(source, **word2vec_param)
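# Hedged usage sketch for build_custom1 (the corpus path and lexicon name
# are hypothetical; word2vec_param is forwarded verbatim to gensim):
#
#     model = build_custom1('/path/to/tweets.log',
#                           word2vec_param=default_word2vec_param,
#                           lexicon_name='my_polarity_lexicon')
#     model.save('/path/to/custom1.w2v')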
def build_custom3_2(initial_model=None, lexicon_name='',
                    a_i=1, b_ij=1, n_iter=10, in_place=True,
                    d=1, topn=50, **kwargs):
    """Derived from the faruqui:2014:NIPS-DLRLW method.

    Bring words of the same class closer together, and also move each
    word's topn nearest model neighbours by d times the same translation.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    logger.info('Customize 3_2 with %s', lexicon_name)
    if initial_model is None:
        raise ValueError('Need an initial model')
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon entries whose word is in the model's vocabulary
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_lex_neighbours = [w for w in lexicon_inv[lexicon[word]]
                                   if w != word]
            word_model_neighbours = model.most_similar(word, topn=topn)
            num_lex_neighbours = len(word_lex_neighbours)
            # no lex_neighbours, pass - use data estimate
            if num_lex_neighbours == 0:
                continue
            # use a local weight so that b_ij == 'degree' is recomputed for
            # each word instead of permanently overwriting the parameter
            if b_ij == 'degree':  # FIXME: use not_lex_neighbours
                b = 1 / num_lex_neighbours
            else:
                b = b_ij
            # the weight of the data estimate is the number of lex_neighbours
            model.wv.syn0[i] = (num_lex_neighbours * a_i
                                * initial_model.wv.syn0[i])
            # loop over lex_neighbours and add to new vector
            # for pp_word in word_lex_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b * model.wv.syn0[j]
            # Vectorized version of the above
            word_lex_neighbours = [model.wv.vocab[w].index
                                   for w in word_lex_neighbours]
            model.wv.syn0[i] = (model.wv.syn0[i]
                                + b * np.sum(
                                    model.wv.syn0[word_lex_neighbours],
                                    axis=0))
            model.wv.syn0[i] = model.wv.syn0[i] / (num_lex_neighbours
                                                   * (b + a_i))
            # drag the topn model neighbours along, scaled by d
            for (neighbour, _) in word_model_neighbours:
                j = model.wv.vocab[neighbour].index
                model.wv.syn0[j] = (d * num_lex_neighbours * a_i
                                    * initial_model.wv.syn0[j])
                model.wv.syn0[j] = d * (model.wv.syn0[j]
                                        + b * np.sum(
                                            model.wv.syn0[word_lex_neighbours],
                                            axis=0))
    return model
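# Hedged usage sketch for build_custom3_2 (the model path and lexicon name
# are hypothetical, values are illustrative): topn controls how many cosine
# neighbours of each lexicon word get dragged along, and d scales that
# extra translation:
#
#     base = gensim.models.Word2Vec.load('/path/to/base.w2v')
#     model = build_custom3_2(initial_model=base,
#                             lexicon_name='my_polarity_lexicon',
#                             a_i=1, b_ij=1, n_iter=5, d=0.5, topn=50,
#                             in_place=False)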
def build_custom3_1(initial_model=None, lexicon_name='',
                    a_i=1, b_ij=1, c_ij=1, n_iter=10, in_place=True,
                    **kwargs):
    """Derived from the faruqui:2014:NIPS-DLRLW method.

    Bring words of the same class closer together and push words of the
    other classes away.

    Args:
        in_place: Modify the given model instead of copying it if True."""
    if initial_model is None:
        raise ValueError('Need an initial model')
    if not in_place:
        initial_model_file = tempfile.NamedTemporaryFile(
            mode='w+', encoding='utf-8', delete=False)
        initial_model_file.close()
        initial_model.save(initial_model_file.name)
        model = gensim.models.Word2Vec.load(initial_model_file.name)
        os.remove(initial_model_file.name)
    else:
        model = initial_model
    # Keep only the lexicon entries whose word is in the model's vocabulary
    lexicon = {}
    for (w, v) in lexicons.get_lexicon(lexicon_name).items():
        if w in model:
            lexicon[w] = v
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    for it in range(n_iter):
        # loop through every node also in ontology (else just use data
        # estimate)
        for word in lexicon:
            if word not in model:
                continue
            i = model.wv.vocab[word].index
            word_neighbours = [w for w in lexicon_inv[lexicon[word]]
                               if w != word]
            # Non-neighbours are words with classes different from WORD's
            word_not_neighbours = []
            for cl in lexicon_inv:
                if cl != lexicon[word]:
                    word_not_neighbours.extend(lexicon_inv[cl])
            # Remove duplicates
            word_not_neighbours = list(set(word_not_neighbours))
            num_neighbours = len(word_neighbours)
            num_not_neighbours = len(word_not_neighbours)
            # no neighbours at all, pass - use data estimate
            if num_neighbours == 0 and num_not_neighbours == 0:
                continue
            # use local weights so that 'degree' values are recomputed for
            # each word instead of permanently overwriting the parameters,
            # and guard against empty neighbour sets
            if b_ij == 'degree':
                b = 1 / num_neighbours if num_neighbours else 0
            else:
                b = b_ij
            if c_ij == 'degree':  # FIXME: use not_neighbours
                c = 1 / num_not_neighbours if num_not_neighbours else 0
            else:
                c = c_ij
            # the weight of the data estimate is the number of neighbours
            model.wv.syn0[i] = num_neighbours * a_i * initial_model.wv.syn0[i]
            # loop over neighbours and add to new vector
            # for pp_word in word_neighbours:
            #     j = model.wv.vocab[pp_word].index
            #     model.wv.syn0[i] += b * model.wv.syn0[j]
            # Vectorized version of the above
            word_neighbours = [model.wv.vocab[w].index
                               for w in word_neighbours]
            model.wv.syn0[i] = (model.wv.syn0[i]
                                + b * np.sum(model.wv.syn0[word_neighbours],
                                             axis=0))
            word_not_neighbours = [model.wv.vocab[w].index
                                   for w in word_not_neighbours]
            model.wv.syn0[i] = (model.wv.syn0[i]
                                - c * np.sum(
                                    model.wv.syn0[word_not_neighbours],
                                    axis=0))
            # FIXME: divides by zero when num_neighbours == 0
            model.wv.syn0[i] = model.wv.syn0[i] / (num_neighbours
                                                   * (b + a_i))
    return model
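# A minimal numpy sketch of the attract/repel update in build_custom3_1
# above (illustration only, a_i = b_ij = c_ij = 1): same-class neighbours
# are added and other-class vectors subtracted before the same
# normalisation as in build_custom3:
#
#     import numpy as np
#     q_hat = np.array([1.0, 0.0])
#     same = np.array([[0.0, 1.0]])      # one same-class neighbour
#     other = np.array([[-1.0, 0.0]])    # one other-class word
#     n = len(same)
#     q = (n * q_hat + same.sum(axis=0) - other.sum(axis=0)) / (n * (1 + 1))
#     # q == array([1. , 0.5])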
def build_custom_mce(train_path, word2vec_param=default_word2vec_param,
                     lexicon_name='', valid_num=0.1, top=10,
                     clean_after=True, **kwargs):
    """Build a Word2Vec model using the MCE method.

    Args:
        lexicon_name: Name of the lexicon used to build the synonym and
            antonym files.
        valid_num: How many inequalities should be used for
            cross-validation (either a float between 0 and 1 or an
            integer).
        top: See feat.build_ineq_for_model.
        clean_after: Clean the files after building the model if True."""
    model = None
    lexicon = lexicons.get_lexicon(lexicon_name)
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    for line in source:
        input_file.write(' '.join(line))
        input_file.write('\n')
    input_file.close()

    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()

    syn_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='syn')
    ant_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                           delete=False, prefix='ant')
    lexicon_inv = utils.invert_dict_nonunique(lexicon)
    # Generate syn_file: one class per line, words tab-separated
    for c in lexicon_inv:
        syn_file.write('\t'.join(lexicon_inv[c]))
        syn_file.write('\n')
    syn_file.close()
    # Generate ant_file: each word followed by the words of every other class
    for cur_c in lexicon_inv:
        for word in lexicon_inv[cur_c]:
            for c in lexicon_inv:
                # skip current observed class
                if c == cur_c:
                    continue
                ant_file.write(word + '\t' + '\t'.join(lexicon_inv[c]))
                ant_file.write('\n')
    ant_file.close()

    cmd = ['./word2vec',
           '-train', input_file.name,
           '-output', output_file.name,
           '-size', str(word2vec_param['size']),
           '-window', str(word2vec_param['window']),
           '-sample', str(word2vec_param['sample']),
           '-hs', str(word2vec_param['hs']),
           '-iter', str(word2vec_param['iter']),
           '-min-count', str(word2vec_param['min_count']),
           '-read-syn', syn_file.name,
           '-read-ant', ant_file.name,
           ]
    logger.info(' '.join(cmd))
    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=res.MCE_PATH)
    out, err = p.communicate()
    out = out.decode()
    err = err.decode()
    logger.info(out)
    logger.error(err)
    if p.returncode == 0:
        model = gensim.models.Word2Vec.load_word2vec_format(
            output_file.name, binary=False, unicode_errors='replace')
    if clean_after:
        os.remove(output_file.name)
        os.remove(input_file.name)
        os.remove(syn_file.name)
        os.remove(ant_file.name)
    return model
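# Shape of the intermediate files handed to the MCE word2vec binary, as
# produced by the loops in build_custom_mce above.  For a toy lexicon
# {'good': 'pos', 'fine': 'pos', 'bad': 'neg'} (word order within a class
# is not guaranteed):
#
#     syn file, one class per line:
#         good\tfine
#         bad
#
#     ant file, one line per (word, other class) pair:
#         good\tbad
#         fine\tbad
#         bad\tgood\tfine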
def build_custom2(train_path, word2vec_param=default_word2vec_param,
                  lexicon_name='', valid_num=0.1, top=10, clean_after=True,
                  **kwargs):
    """Build a Word2Vec model using the SWE method (optimization with
    inequalities).

    Args:
        lexicon_name: Name of the lexicon used to build the inequalities.
        valid_num: How many inequalities should be used for
            cross-validation (either a float between 0 and 1 or an
            integer).
        top: See feat.build_ineq_for_model.
        clean_after: Clean the files after building the model if True."""
    lexicon = lexicons.get_lexicon(lexicon_name)
    model = None
    source = TwitterLoggerTextReader(train_path)
    source = GenericTextReader(source, lower=True)
    source = Splitter(source)

    input_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='input')
    output_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                              delete=False, prefix='output')
    output_file.close()
    ineq_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                            delete=False, prefix='ineq')
    ineq_file.close()
    vocab_file = tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8',
                                             delete=False, prefix='vocab')
    try:
        logger.info('Build vocabulary file')
        vocab = Counter()
        for line in source:
            vocab.update(line)
            input_file.write(' '.join(line))
            input_file.write('\n')
        vocab = OrderedDict(sorted(vocab.items(),
                                   key=lambda t: t[1], reverse=True))
        # iterate over a copy so entries can be deleted while looping
        for word in list(vocab):
            # Ignore words with freq < min_count
            if ('min_count' in word2vec_param
                    and vocab[word] < word2vec_param['min_count']):
                del vocab[word]
                continue
            vocab_file.write('%s\t%d\n' % (word, vocab[word]))
        vocab_file.close()
        model0 = get_custom0(word2vec_param=word2vec_param)
        new_lexicon = {}
        for w in lexicon:
            # Ignore words missing from the model or the vocabulary
            if w not in model0 or w not in vocab:
                continue
            new_lexicon[w] = lexicon[w]
        feat.build_ineq_for_model(model0, new_lexicon,
                                  output_path=ineq_file.name,
                                  vocab=list(vocab), top=top)
        utils.split_train_valid(ineq_file.name, valid_num=valid_num)
        input_file.close()
        cmd = ['bin/SWE_Train',
               '-train', input_file.name,
               '-read-vocab', vocab_file.name,
               '-output', output_file.name,
               '-size', str(word2vec_param['size']),
               '-window', str(word2vec_param['window']),
               '-sample', str(word2vec_param['sample']),
               '-hs', str(word2vec_param['hs']),
               '-iter', str(word2vec_param['iter']),
               '-min-count', str(word2vec_param['min_count']),
               '-sem-train', ineq_file.name + '.train',
               '-sem-valid', ineq_file.name + '.valid',
               ]
        logger.info(' '.join(cmd))
        p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE,
                  cwd=res.SWE_PATH)
        out, err = p.communicate()
        out = out.decode()
        err = err.decode()
        logger.info(out)
        logger.error(err)
        if p.returncode == 0:
            model = gensim.models.Word2Vec.load_word2vec_format(
                output_file.name, binary=False, unicode_errors='replace')
    finally:
        if clean_after:
            os.remove(vocab_file.name)
            os.remove(input_file.name)
            os.remove(output_file.name)
            os.remove(ineq_file.name)
            os.remove(ineq_file.name + '.train')
            os.remove(ineq_file.name + '.valid')
    return model
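# Hedged usage sketch for build_custom2 (paths and lexicon name are
# hypothetical; assumes the SWE_Train binary is available under
# res.SWE_PATH):
#
#     model = build_custom2('/path/to/tweets.log',
#                           word2vec_param=default_word2vec_param,
#                           lexicon_name='my_polarity_lexicon',
#                           valid_num=0.1, top=10)
#     if model is not None:
#         model.save('/path/to/custom2.w2v')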