# Shared imports for the snippets below. The module providing
# load_elmo_embeddings / get_elmo_vectors is assumed; tokenize, lemmatize,
# preproc and similar helpers are project-specific and expected to be in scope.
import os
import time
import pickle
import logging

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

from elmo_helpers import load_elmo_embeddings, get_elmo_vectors  # module name assumed


def elmo_indexing(cleaned, batcher, sentence_character_ids, elmo_sentence_input):  # preprocessing
    """
    Indexing the corpus
    :param cleaned: list of lists of str, tokenized documents from the corpus
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :return: matrix of document vectors
    """
    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())
        indexed = []
        # Iterate over batch starts so a final partial batch is not dropped.
        for start in range(0, len(cleaned), 200):
            sentences = cleaned[start:start + 200]
            elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            for vect, sent in zip(elmo_vectors, sentences):
                cropped_vector = crop_vec(vect, sent)
                indexed.append(cropped_vector)
        data_elmo = pd.DataFrame(indexed)
        data_elmo.to_csv('elmo_index.csv', index=False)
        # with open('ELMO_model.pickle', 'wb') as f:
        #     pickle.dump((batcher, sentence_character_ids, elmo_sentence_input), f)
        return indexed
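# `crop_vec` is not defined in these snippets. Judging by how its output is
# used (one fixed-size vector per document) and by the explicit cropping code
# in the later snippets, a plausible sketch is: drop the zero padding that
# batching adds, then mean-pool the remaining token vectors. Hypothetical:
def crop_vec(vect, sent):
    """Crop padded token vectors to the true sentence length and average them."""
    return np.mean(vect[:len(sent), :], axis=0)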
def elmo_query2vec(query, batcher, sentence_character_ids, elmo_sentence_input):
    # Wrap the token list so get_elmo_vectors receives a batch of one
    # sentence, matching how the other snippets call it.
    query = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = get_vect(
            get_elmo_vectors(sess, query, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            query[0])
    return vector
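# `get_vect` (also used by create_elmo_matrix below) appears to play the same
# role as `crop_vec` above; a minimal sketch under that assumption:
def get_vect(vect, sent):
    return np.mean(vect[:len(sent), :], axis=0)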
def search(self, query, n=5):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Mean-pool the token vectors of the lemmatized query into one vector;
        # batcher, ids and elmo_input are assumed to be module-level globals.
        query_vec = np.transpose(
            np.mean(get_elmo_vectors(sess, [lemmatize(query)], batcher, ids,
                                     elmo_input), axis=1)).flatten()
        # Rank documents by dot product; self.texts is assumed to be a numpy
        # array so that it supports indexing with a list of positions.
        result = np.matmul(self.vec, query_vec)
        indices = np.argsort(result)[::-1].tolist()[:n]
        return list(zip(self.texts[indices], result[indices]))
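# search() above ranks by raw dot product, so document vector magnitudes
# influence the ranking. If cosine ranking is wanted instead, a common variant
# L2-normalizes the index once at build time (a sketch, not part of the
# original code):
def normalize_rows(matrix):
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.clip(norms, 1e-12, None)  # avoid division by zero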
def build_vec(self, sentences):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_vectors = get_elmo_vectors(
            sess, sentences, self.batcher, self.sentence_character_ids,
            self.elmo_sentence_input)
        results = []
        for vect, sent in zip(elmo_vectors, sentences):
            # Crop the batch padding and mean-pool tokens into one vector
            # per sentence.
            results.append(np.mean(vect[:len(sent), :], axis=0))
        return results
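# Usage sketch for build_vec (the instance name and documents are made up):
# pooled = searcher.build_vec([["first", "doc"], ["a", "longer", "document"]])
# doc_matrix = np.vstack(pooled)  # shape: (n_docs, elmo_dim)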
def transform(self, query):
    """
    Gets the vector of a query
    :param query: str
    :return: vector of the query
    """
    batcher, sentence_character_ids, elmo_sentence_input = self.model
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            q[0])
    return vector
def create_elmo_matrix(corpus, batcher, sentence_character_ids, elmo_sentence_input):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        matrix = []
        # Iterate over batch starts so a final partial batch is not dropped.
        for start in range(0, len(corpus), 200):
            sentences = corpus[start:start + 200]
            elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            for vect, sent in zip(elmo_vectors, sentences):
                vector = get_vect(vect, sent)
                matrix.append(vector)
        return matrix
def _search_elmo(self, query, top=10):
    logging.info("Query: %s", query)
    query = self._preprocess_string(query)
    _, index = self.index_structs["elmo"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True,
                                          gpu_options=gpu_options)) as sess:
        with tf.device('/gpu:0'):
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())
            start = time.time()
            elmo_vectors = get_elmo_vectors(
                sess, [query],
                self.models["elmo"].batcher,
                self.models["elmo"].sentence_character_ids,
                self.models["elmo"].elmo_sentence_input)
            # Due to batch processing, the above code produces for each sentence
            # the same number of token vectors, equal to the length of the longest
            # sentence (the 2nd dimension of the elmo_vectors tensor). If a
            # sentence is shorter, the vectors for non-existent words are filled
            # with zeroes. Make a version without these redundant vectors:
            cropped_vectors = []
            for vect, sent in zip(elmo_vectors, [query]):
                cropped_vector = vect[:len(sent), :]
                cropped_vectors.append(cropped_vector)
            vec = cropped_vectors[0]
            # Mean-pool the query tokens, then score every indexed document by
            # cosine similarity (the einsum terms supply the norm products).
            entity = np.sum(vec, axis=0) / len(vec)
            den = np.sqrt(
                np.einsum('ij,ij->i', index, index) *
                np.einsum('j,j', entity, entity))
            match = index.dot(entity) / den.flatten()
            # Convert similarity to distance (1 - cos) and keep the top results.
            match_dict = {
                self.orig_strings[i]: 1 - score
                for i, score in enumerate(match)
            }
            match = tuple(sorted(match_dict.items(), key=lambda x: x[1])[:top])
            print(query)
            return match
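# The einsum-based scoring in _search_elmo is cosine similarity written out by
# hand: the first einsum gives the squared norms of the index rows, the second
# the squared norm of the query. An equivalent, more readable sketch:
def cosine_scores(index, entity):
    return index @ entity / (np.linalg.norm(index, axis=1) *
                             np.linalg.norm(entity))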
def process(self, docs, index_path):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True,
                                          gpu_options=gpu_options)) as sess:
        with tf.device('/gpu:0'):
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())
            start = time.time()
            vectors = []
            batch_size = 50
            bs, be = 0, batch_size
            # Loop on the batch start so the final partial batch is processed too.
            while bs < len(docs):
                batch = docs[bs:be]
                elmo_vectors = get_elmo_vectors(
                    sess, batch, self.model.batcher,
                    self.model.sentence_character_ids,
                    self.model.elmo_sentence_input)
                # Due to batch processing, the above code produces for each
                # sentence the same number of token vectors, equal to the length
                # of the longest sentence (the 2nd dimension of the elmo_vectors
                # tensor). If a sentence is shorter, the vectors for non-existent
                # words are filled with zeroes. Make a version without these
                # redundant vectors:
                cropped_vectors = []
                for vect, sent in zip(elmo_vectors, batch):
                    cropped_vector = vect[:len(sent), :]
                    cropped_vectors.append(cropped_vector)
                vectors += cropped_vectors
                bs += batch_size
                be += batch_size
            # Mean-pool each document and persist the index.
            index = np.array([np.sum(v, axis=0) / len(v) for v in vectors])
            struct = [{}, index]
            with open(os.path.join(index_path, self.label + ".pickle"), "wb") as pfile:
                pickle.dump(struct, pfile)
    return True
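# _search_elmo above unpacks the saved struct as `_, index = ...`, so a loader
# counterpart might look like this (a sketch; the path layout mirrors process()):
def load_index(index_path, label):
    with open(os.path.join(index_path, label + ".pickle"), "rb") as pfile:
        return pickle.load(pfile)  # -> [{}, index]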
def prepare_elmo_query(query, batcher, sentence_character_ids, elmo_sentence_input):
    """
    Gets the vector of a query
    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :return: vector of the query
    """
    query = preproc(query)
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            q[0])
    return vector
def search(self, query):
    # The ELMo graph is rebuilt on every call, so reset the default graph first.
    tf.reset_default_graph()
    batcher, sentence_character_ids, elmo_sentence_input = \
        load_elmo_embeddings(self.elmo_path)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_vectors = get_elmo_vectors(sess, [query], batcher,
                                        sentence_character_ids,
                                        elmo_sentence_input)
        results = []
        for vect, sent in zip(elmo_vectors, [query]):
            results.append(np.mean(vect[:len(sent), :], axis=0))
    vec = results
    # Cosine similarity of the pooled query vector against the indexed corpus.
    res = cosine_similarity(vec, self.vectors)
    docs = [(idx, doc) for idx, doc in enumerate(res[0])]
    docs = sorted(docs, key=lambda x: x[1], reverse=True)
    docs = [(x[0], x[1], self.corpus['question2'][x[0]]) for x in docs[:10]]
    return docs
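# Note that search() above reloads the ELMo weights on every query, which is
# slow. A common refactor (assumed, not shown in the original) loads the model
# once in __init__ and reuses it across calls:
# self.model = load_elmo_embeddings(self.elmo_path)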
def indexing(self, cleaned):
    """
    Indexing the corpus
    :param cleaned: list of lists of str, tokenized documents from the corpus
    :return: matrix of document vectors
    """
    batcher, sentence_character_ids, elmo_sentence_input = self.model
    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())
        indexed = []
        # Iterate over batch starts so a final partial batch is not dropped.
        for start in range(0, len(cleaned), 200):
            sentences = cleaned[start:start + 200]
            elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            for vect, sent in zip(elmo_vectors, sentences):
                cropped_vector = crop_vec(vect, sent)
                indexed.append(cropped_vector)
        return indexed
def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    raw_sentences = []
    with open(data_path, 'r') as f:
        for line in f:
            res = line.strip()
            raw_sentences.append(res)
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = list()
    averaged_vectors = list()

    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())
    for batch in [sentences[i * batch_size:(i + 1) * batch_size]
                  for i in range((len(sentences) + batch_size - 1) // batch_size)]:
        elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                              sentence_character_ids,
                                              elmo_sentence_input)
        # print('ELMo embeddings for your input are ready')
        # print('Tensor shape:', elmo_vectors.shape)

        # Due to batch processing, the above code produces for each sentence
        # the same number of token vectors, equal to the length of the longest
        # sentence (the 2nd dimension of the elmo_vectors tensor). If a sentence
        # is shorter, the vectors for non-existent words are filled with zeroes.
        # Make a version without these redundant vectors, zipping against the
        # current batch so each vector is cropped to its own sentence length:
        cropped_vectors_batch = []
        for vect, sent in zip(elmo_vectors_batch, batch):
            cropped_vector = vect[:len(sent), :]
            cropped_vectors_batch.append(cropped_vector)
            averaged_vectors.append(np.mean(cropped_vector, axis=0))
        cropped_vectors.extend(cropped_vectors_batch)

    averaged_vectors_np = np.ndarray(
        (len(averaged_vectors), averaged_vectors[0].shape[0]),
        averaged_vectors[0].dtype)
    for i, avg_vector in enumerate(averaged_vectors):
        averaged_vectors_np[i] = averaged_vectors[i]

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')
    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)
    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)
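# Example invocation (paths are placeholders):
# make_elmo_vectors_ruwordnet('data/ruwordnet_senses.txt', 'models/elmo_ru',
#                             batch_size=25)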