def build_model(params, with_dis):
    """
    Build all components of the model.
    """
    # source embeddings
    src_dico, _src_emb = load_embeddings(params, source=True)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, _tgt_emb = load_embeddings(params, source=False)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(_tgt_emb)
    else:
        tgt_emb = None

    # mapping
    mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    if getattr(params, 'map_id_init', True):
        mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data, params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data, params.normalize_embeddings)

    return src_emb, tgt_emb, mapping
def create_word_emb(bert_service_ip=None, reduced_size=100, normalize=False):
    # Load dict
    word_dictionary = utils.load('word_dictionary')

    # Show length info
    words = show_words_length_info(word_dictionary)

    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)
    vecs = bc.encode(words)
    print('vecs type:', type(vecs))
    print('vecs shape:', vecs.shape)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    # Save
    utils.save(vecs, 'bert_word_embeddings')

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)
    utils.save(vecs, 'bert_word_embeddings_' + str(reduced_size))
def create_detail_emb(bert_service_ip=None, reduced_size=100, normalize=False):
    # Load docs
    all_docs = utils.load('all_docs')
    print('docs num:', len(all_docs))

    # Show length info
    details = show_details_length_info(all_docs)

    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)
    print('Converting detail')
    vecs = bc.encode(details)
    print('detail_vecs shape:', vecs.shape)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)
    utils.save(vecs, 'bert_detail_embeddings')

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)
    utils.save(vecs, 'bert_detail_embeddings_' + str(reduced_size))
def export(self):
    """
    Export embeddings.
    """
    params = self.params

    # load all embeddings
    params.src_dico, src_emb = load_embeddings(params, source=True, full_vocab=True)
    params.tgt_dico, tgt_emb = load_embeddings(params, source=False, full_vocab=True)

    # apply same normalization as during training
    normalize_embeddings(src_emb, params.normalize_embeddings, mean=params.src_mean)
    normalize_embeddings(tgt_emb, params.normalize_embeddings, mean=params.tgt_mean)

    # map source embeddings to the target space
    bs = 4096
    for i, k in enumerate(range(0, len(src_emb), bs)):
        x = Variable(src_emb[k:k + bs], volatile=True)
        src_emb[k:k + bs] = self.mapping(x).data

    # write embeddings to the disk
    export_embeddings(src_emb, tgt_emb, params)
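# Note: `Variable(..., volatile=True)` is the legacy (pre-0.4) PyTorch inference API.
# The sketch below shows the equivalent batched mapping step on modern PyTorch using
# `torch.no_grad()`; `map_in_batches` is a hypothetical helper added here for
# illustration only, not part of the original code.
import torch


def map_in_batches(mapping, src_emb, bs=4096):
    """Apply `mapping` to `src_emb` in place, batch by batch, without tracking gradients."""
    with torch.no_grad():
        for k in range(0, len(src_emb), bs):
            src_emb[k:k + bs] = mapping(src_emb[k:k + bs])
    return src_emb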
def load_embeddings(embeddings_path, vocabulary_path=None, generate=True,
                    load_extra_from=None, normalize=True):
    """
    Load and return an embedding model in either text format or numpy binary
    format. The text format is used if vocabulary_path is None (because the
    vocabulary is in the same file as the embeddings).

    :param embeddings_path: path to embeddings file
    :param vocabulary_path: path to text file with vocabulary, if needed
    :param generate: whether to generate random embeddings for unknown,
        padding and null
    :param load_extra_from: path to directory with embeddings file with
        vectors for unknown, padding and null
    :param normalize: whether to normalize embeddings
    :return: a tuple (defaultdict, array)
    """
    assert not (generate and load_extra_from), \
        'Either load or generate extra vectors'
    logging.debug('Loading embeddings')

    if vocabulary_path is None:
        wordlist, embeddings = load_text_embeddings(embeddings_path)
    else:
        wordlist, embeddings = load_binary_embeddings(embeddings_path, vocabulary_path)

    if generate or load_extra_from:
        mapping = zip(wordlist, range(3, len(wordlist) + 3))

        # always map OOV words to 0
        wd = defaultdict(int, mapping)
        wd[utils.UNKNOWN] = 0
        wd[utils.PADDING] = 1
        wd[utils.GO] = 2

        if generate:
            vector_size = embeddings.shape[1]
            extra = [_generate_random_vector(vector_size),
                     _generate_random_vector(vector_size),
                     _generate_random_vector(vector_size)]
        else:
            path = os.path.join(load_extra_from, 'extra-embeddings.npy')
            extra = np.load(path)

        embeddings = np.append(extra, embeddings, 0)
    else:
        mapping = zip(wordlist, range(0, len(wordlist)))
        wd = defaultdict(int, mapping)

    logging.debug('Embeddings have shape {}'.format(embeddings.shape))
    if normalize:
        embeddings = utils.normalize_embeddings(embeddings)

    return wd, embeddings
def concat_embeddings():
    emb1_name = 'dbow_bert_ver2_500_200'
    emb2_name = 'bert_doc_embeddings_100'
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
def create_emb(seq_list, save_file_name, bert_service_ip=None, reduced_size=100, normalize=False):
    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)
    vecs = bc.encode(seq_list)
    print('vecs type:', type(vecs))
    print('vecs shape:', vecs.shape)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    # Save
    utils.save(vecs, save_file_name)

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)
    if normalize:
        vecs = utils.normalize_embeddings(vecs)
    utils.save(vecs, save_file_name + '_' + str(reduced_size))
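# The `utils.normalize_embeddings` and `utils.reduce_dim` helpers used above are not
# shown in this section. A minimal sketch of what they might look like, assuming L2
# row normalization and a plain scikit-learn PCA; these implementations are an
# assumption for illustration, not the repo's actual code.
import numpy as np
from sklearn.decomposition import PCA


def normalize_embeddings(embeddings):
    """L2-normalize each row of a 2D embedding matrix."""
    embeddings = np.asarray(embeddings, dtype=np.float32)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return embeddings / norms


def reduce_dim(embeddings, reduced_size):
    """Project embeddings down to `reduced_size` dimensions with PCA."""
    return PCA(n_components=reduced_size).fit_transform(embeddings)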
def main():
    if len(sys.argv) > 1:
        emb_path = sys.argv[1]
        if not os.path.exists(emb_path):
            print('Error. Embeddings file is not found')
            return
    else:
        print('Error. Specify path to embeddings file')
        return

    embeddings, words2ids = read_embeddings(emb_path)
    embeddings = normalize_embeddings(embeddings)

    print('SIMILARITY test:')
    human_vs_cos_sim_correlation('datasets/tt_similarity.csv', embeddings, words2ids)

    print('RELATEDNESS test:')
    human_vs_cos_sim_correlation('datasets/tt_relatedness.csv', embeddings, words2ids)

    print('ANALOGIES test:')
    top_k = 10
    answer_analogy_questions('datasets/tt_analogies.txt', embeddings, words2ids, top_k)
def train(gpu_no, show_loss, train_data, label_data, window_size, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, is_concat, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)
    print('window_size', window_size)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_word_embeddings = utils.load('bert_word_embeddings_100')
    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    if is_concat:
        final_embedding_size = word_embedding_size * window_size + doc_embedding_size
    else:
        final_embedding_size = doc_embedding_size
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')
    # Define Embeddings:
    with tf.name_scope('embeddings'):
        special_word_embeddings = tf.Variable(tf.random_uniform([2, word_embedding_size], -1.0, 1.0))
        word_embeddings = tf.concat([special_word_embeddings, tf.constant(bert_word_embeddings[2:])], axis=0)
        # word_embeddings = tf.constant(bert_word_embeddings)
        doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, final_embedding_size],
                                                  stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])  # plus 1 for doc index
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the word embedding
    # Add together element embeddings in window:
    # Concat all embeddings
    if is_concat:
        word_embed = [tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
                      for element in range(window_size)]
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)
        final_embed = tf.concat([*word_embed, tf.squeeze(doc_embed, axis=1)], 1)
    else:
        word_embed = tf.zeros([batch_size, word_embedding_size])
        for element in range(window_size):
            word_embed += tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
        final_embed = (word_embed + doc_embed) / (window_size + 1)

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                                             negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated in TF >= 0.12

    with tf.Session() as sess:
        sess.run(init)
        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset_imdb.generate_batch_data(train_data, label_data,
                                                                            batch_size, generation)
                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings, proj_name, is_concat=is_concat,
                                  window_size=window_size, batch_size=batch_size,
                                  negative_size=negative_sample_size)
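# A hypothetical invocation of the trainer above, assuming `train_data` / `label_data`
# have already been produced by the repo's batching module; all hyperparameter values
# below are illustrative assumptions, not values taken from the original scripts.
#
# train(gpu_no=0, show_loss=True, train_data=train_data, label_data=label_data,
#       window_size=3, word_embedding_size=100, doc_embedding_size=100,
#       batch_size=256, negative_sample_size=64, is_concat=False, epochs=20)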
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size, doc_embedding_size,
          batch_size, negative_sample_size, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_title_embeddings = utils.load('bert_title_embeddings')
    bert_detail_sentence_embeddings = utils.load('bert_detail_sentence_embeddings')
    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')
    # Define Embeddings:
    with tf.name_scope('embeddings'):
        detail_sentence_embeddings = tf.constant(bert_detail_sentence_embeddings)
        title_embeddings = tf.constant(bert_title_embeddings)
        doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        title_weights = tf.Variable(tf.random_uniform([docs_size, 1], 0.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, final_embedding_size],
                                                  stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 2])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Look up the doc, sentence and weighted title embeddings and average them
    doc_indices = tf.slice(x_inputs, [0, 0], [batch_size, 1])
    sentence_indices = tf.slice(x_inputs, [0, 1], [batch_size, 1])
    doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
    sentence_embed = tf.squeeze(tf.nn.embedding_lookup(detail_sentence_embeddings, sentence_indices), axis=1)
    title_embed = tf.squeeze(tf.nn.embedding_lookup(title_embeddings, doc_indices), axis=1)
    title_weight = tf.squeeze(tf.nn.embedding_lookup(title_weights, doc_indices), axis=1)
    title_embed_weighted = tf.math.multiply(title_embed, title_weight)
    final_embed = (title_embed_weighted + sentence_embed + doc_embed) / 3

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                                             negative_sample_size, vocabulary_size))

    # Create optimizer
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated in TF >= 0.12

    with tf.Session() as sess:
        sess.run(init)
        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(train_data, label_data,
                                                                       batch_size, generation)
                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)
        title_weights = sess.run(title_weights)

        title_embeddings_weighted = np.multiply(bert_title_embeddings, title_weights)
        detail_emb = utils.mean_embeddings([title_embeddings_weighted, doc_embeddings])
        detail_emb_norm = utils.normalize_embeddings(detail_emb)
        utils.save_doc_embeddings(detail_emb_norm, 'memory_dbow_detail', batch_size=batch_size,
                                  negative_size=negative_sample_size)
        emb = utils.concat_embeddings([bert_title_embeddings, detail_emb_norm])
        utils.save_doc_embeddings(emb, 'memory_dbow', batch_size=batch_size,
                                  negative_size=negative_sample_size)
parser.add_argument('load', help='Directory with saved model files')
parser.add_argument('embeddings', help='Text or numpy file with word embeddings')
parser.add_argument('--vocab', help='Vocabulary file (only needed if numpy '
                                    'embedding file is given)')
args = parser.parse_args()

utils.config_logger(verbose=False)
logger = utils.get_logger()
logger.info('Reading model')

sess = tf.InteractiveSession()
model = multimlp.MultiFeedForward.load(args.load, sess)
word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab,
                                                 generate=False,
                                                 load_extra_from=args.load)
embeddings = utils.normalize_embeddings(embeddings)
model.initialize_embeddings(sess, embeddings)

number_to_label = {v: k for (k, v) in utils.label_map.items()}

while True:
    sent1 = raw_input('Type sentence 1: ')
    sent2 = raw_input('Type sentence 2: ')
    tokens1 = utils.tokenize(sent1)
    tokens2 = utils.tokenize(sent2)
    vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1)
    vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2,
                             prepend=word_dict[utils.GO])
    feeds = {model.sentence1: vector1,
             model.sentence2: vector2,
             model.sentence1_size: [len(tokens1)],
def build_model(params, with_dis):
    """
    Build all components of the model.
    """
    # source embeddings
    src_dico, src_adj, src_features = load_src_data(params.src_file, params.src_nns, params)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_features), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(src_features)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, tgt_adj, tgt_features = load_tgt_data(params.tgt_file, params.tgt_nns, params)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_features), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(tgt_features)
    else:
        tgt_emb = None
        tgt_adj = None

    # mapping
    if params.sparse:
        src_mapping = SpGAT(nfeat=params.emb_dim, nhid=params.emb_dim, nclass=params.enc_dim,
                            dropout=params.dropout, nheads=params.nb_heads, alpha=params.alpha)
        tgt_mapping = SpGAT(nfeat=params.emb_dim, nhid=params.emb_dim, nclass=params.enc_dim,
                            dropout=params.dropout, nheads=params.nb_heads, alpha=params.alpha)
    else:
        src_mapping = GAT(nfeat=params.emb_dim, nhid=params.emb_dim, nclass=params.enc_dim,
                          dropout=params.dropout, nheads=params.nb_heads, alpha=params.alpha)
        tgt_mapping = GAT(nfeat=params.emb_dim, nhid=params.emb_dim, nclass=params.enc_dim,
                          dropout=params.dropout, nheads=params.nb_heads, alpha=params.alpha)
    src_decoder = Decoder(params)
    tgt_decoder = Decoder(params)

    # discriminator
    discriminator = Discriminator(params) if with_dis else None

    # cuda
    if params.cuda:
        src_emb.cuda()
        src_adj = src_adj.cuda()  # adjacency tensors must be reassigned: .cuda() is not in-place for tensors
        src_mapping.cuda()
        src_decoder.cuda()
        if params.tgt_lang:
            tgt_emb.cuda()
            tgt_adj = tgt_adj.cuda()
            tgt_mapping.cuda()
            tgt_decoder.cuda()
        if with_dis:
            discriminator.cuda()

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data, params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data, params.normalize_embeddings)

    return (src_emb, tgt_emb, src_adj, tgt_adj, src_mapping, tgt_mapping,
            src_decoder, tgt_decoder, discriminator)
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size, doc_embedding_size,
          batch_size, negative_sample_size, epochs_step_1, epochs_step_2):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs_step_1:', epochs_step_1)
    print('epochs_step_2:', epochs_step_2)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    # bert_title_embeddings = utils.load('bert_title_embeddings')
    # bert_detail_embeddings = utils.load('bert_detail_embeddings_100')
    bert_embeddings = utils.load_doc_embeddings('bert_doc_embeddings')
    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size
    print('docs_size:', docs_size)
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')
    # Define Embeddings:
    with tf.name_scope('embeddings'):
        # doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        # doc_embeddings = tf.Variable(bert_detail_embeddings)
        doc_embeddings = tf.Variable(bert_embeddings)

    # NCE loss parameters
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, final_embedding_size],
                                                  stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the embedding
    final_embed = tf.nn.embedding_lookup(doc_embeddings, x_inputs[:, 0])

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                                             negative_sample_size, vocabulary_size))

    # Create optimizers: step 1 trains only the NCE parameters,
    # step 2 fine-tunes the document embeddings themselves
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss, var_list=[nce_weights, nce_biases])
    optimizer_2 = tf.train.GradientDescentOptimizer(learning_rate=0.005)
    train_step_2 = optimizer_2.minimize(loss, var_list=[doc_embeddings])

    # Add variable initializer.
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated in TF >= 0.12

    with tf.Session() as sess:
        sess.run(init)
        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)

        # Step 1: train the NCE weights and biases only
        for epoch in range(epochs_step_1):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(train_data, label_data,
                                                                       batch_size, generation)
                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        # Step 2: fine-tune the document embeddings
        for epoch in range(epochs_step_2):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(train_data, label_data,
                                                                       batch_size, generation)
                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step_2, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings, proj_name, batch_size=batch_size,
                                  negative_size=negative_sample_size)
def train(doc_embedding_size, negative_sample_size, epochs):
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)
    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')
    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
    model = Doc2Vec(dm=0, vector_size=doc_embedding_size, negative=negative_sample_size, hs=0,
                    min_count=2, sample=0, epochs=epochs, workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])
    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)
    utils.save_doc_embeddings(title_emb, 'gensim_dbow_title', negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb, 'gensim_dbow_detail', negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb, 'gensim_dbow', negative_size=negative_sample_size)
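# `utils.split_embeddings` is not shown here. Judging from the tag layout above
# (titles tagged 0..N-1, details tagged N..2N-1), it presumably stacks the doc vectors
# in tag order and splits the matrix into equal blocks; the sketch below is an
# assumption for illustration, not the repo's implementation.
import numpy as np


def split_embeddings(docvecs, num_parts):
    """Split a stacked embedding matrix (ordered by tag) into `num_parts` equal blocks."""
    matrix = np.asarray([docvecs[i] for i in range(len(docvecs))])
    return np.split(matrix, num_parts, axis=0)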
def train(doc_embedding_size, window_size, negative_sample_size, is_concat, epochs):
    print('window_size:', window_size)
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)
    ImdbDocument = namedtuple('ImdbDocument', 'words tags')
    for i in range(corpus_size):
        words = all_docs[i].words
        tags = [i]
        alldocs.append(ImdbDocument(words, tags))
    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
    if is_concat:
        model = Doc2Vec(dm=1, vector_size=doc_embedding_size, negative=negative_sample_size,
                        window=window_size, hs=0, min_count=2, sample=0, epochs=epochs,
                        workers=cores, alpha=0.05, dm_concat=1)
    else:
        model = Doc2Vec(dm=1, vector_size=doc_embedding_size, negative=negative_sample_size,
                        window=window_size, hs=0, min_count=2, sample=0, epochs=epochs,
                        workers=cores, alpha=0.05)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    emb = []
    for i in range(corpus_size):
        emb.append(model.docvecs[i])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, 'gensim_dm', is_concat=is_concat, window_size=window_size,
                              negative_size=negative_sample_size)

    # Sample words
    sample_words = ['engineer']
    for word in sample_words:
        similars = model.wv.most_similar(word, topn=10)
        print(similars)
def concat_embeddings(emb1_name, emb2_name):
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
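# `utils.concat_embeddings` and `utils.mean_embeddings` are referenced throughout this
# section but not defined here. Minimal sketches of plausible implementations, assuming
# each input is a (num_docs, dim) matrix; these are assumptions, not the repo's code.
import numpy as np


def concat_embeddings(embedding_list):
    """Concatenate several (num_docs, dim_i) matrices along the feature axis."""
    return np.concatenate([np.asarray(e) for e in embedding_list], axis=1)


def mean_embeddings(embedding_list):
    """Element-wise mean of several equally shaped (num_docs, dim) matrices."""
    return np.mean(np.stack([np.asarray(e) for e in embedding_list], axis=0), axis=0)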