def get_data_elmo(corpus, stop=5000): """ Проходит по корпусу и токенизирует тексты. :param corpus: path to csv file with corpus :param stop: int, how many lines we want to get :return: indexed -> list of list of strings id_to_text -> dict, map of text_id to raw text. query_to_dupl -> dict, query:id of its duplicate """ indexed = [] id_to_text = {} query_to_id = {} counter = 0 for idx, doc in enumerate(corpus): #sent = preproc(doc) doc = str(doc) indexed.append(tokenize(doc)) id_to_text[idx] = doc counter += 1 query_to_id[doc] = idx if counter >= stop: break return indexed, id_to_text, query_to_id
def get_data_elmo(corpus, stop=5000): """ Проходит по корпусу и токенизирует тексты. :param corpus: path to csv file with corpus :param stop: int, how many lines we want to get :return: indexed -> list of list of strings id_to_text -> dict, map of text_id to raw text. query_to_dupl -> dict, query:id of its duplicate """ indexed = [] counter = 0 with open(corpus, 'r', encoding='utf-8') as f: r = csv.reader(f) for line in r: if line[0] == '': continue _id, text, query, isduplicate = line indexed.append(tokenize(text)) counter += 1 if counter >= stop: break return indexed
def elmo_query2vec(query, batcher, sentence_character_ids, elmo_sentence_input):
    # get_elmo_vectors expects a list of tokenized sentences,
    # so the single query is wrapped in a list.
    query = [tokenize(query)]
    with tensorflow.Session() as sess:
        sess.run(tensorflow.global_variables_initializer())
        vector = get_vect(
            get_elmo_vectors(sess, query, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            query[0])
    return vector
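# Hedged usage sketch (not from the original code): assumes an ELMo model that
# load_elmo_embeddings (defined elsewhere in this project) can read; the model
# path and query string are placeholders.
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings('elmo_model')
query_vector = elmo_query2vec('sample query text', batcher,
                              sentence_character_ids, elmo_sentence_input)
print(query_vector.shape)  # one averaged ELMo vector for the whole query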
def indexing(self, sentences):
    self.collection = sentences
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        start = time()
        sentences = [tokenize(sent) for sent in sentences]
        for sent in sentences:
            sent_vec = self.build_vec([sent])
            self.vectors.append(sent_vec[0])
        print(f'=====\n'
              f'ElmoSearch Indexing takes {time() - start} sec '
              f'for {len(sentences)} docs')
    return self.vectors
def transform(self, query):
    """
    Gets vector of query
    :param query: str
    :return: vector of query
    """
    batcher, sentence_character_ids, elmo_sentence_input = self.model
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            q[0])
    return vector
def prepare_elmo_query(query, batcher, sentence_character_ids, elmo_sentence_input):
    """
    Gets vector of query
    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :return: vector of query
    """
    query = preproc(query)
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            q[0])
    return vector
def fit(self, n=0):
    LOGGER.info('Wait: indexing of elmo')
    if os.path.isfile(Elmo_model):
        return np.load(Elmo_model)
    vectors = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        start = time()
        sentences = [
            tokenize(sent) for sent in self.collection[n:n + 1000]
        ]
        for idx, sent in enumerate(sentences):
            sent_vec = self.build_vec([sent])
            vectors.append(sent_vec)
            print(idx)
        LOGGER.info(f'ElmoSearch Indexing takes {time() - start} '
                    f'sec for {len(sentences)} docs')
    np.save(Elmo_model, vectors)
    return vectors
if not os.path.exists('documents.pickle'):
    # skip the header row and keep roughly the first 5000 raw documents
    # (the text is in the third column)
    docs = []
    for idx, line in enumerate(file):
        if idx != 0 and idx < 5002:
            docs.append(line[2])
    with open("documents.pickle", "wb") as c:
        pickle.dump(docs, c)
else:
    with open("documents.pickle", "rb") as c:
        docs = pickle.load(c)

if not os.path.exists('elmo_corpus.pickle'):
    # tokenize roughly the first 1000 documents for ELMo
    corpus = []
    for idx, sent in enumerate(docs):
        if idx < 1001:
            corpus.append(tokenize(sent))
        else:
            break
    with open("elmo_corpus.pickle", "wb") as c:
        pickle.dump(corpus, c)
else:
    with open("elmo_corpus.pickle", "rb") as c:
        corpus = pickle.load(c)


def get_vect(vect, sent):
    # drop the zero padding rows added by batching and average the token vectors
    vector = vect[:len(sent), :]
    vector = np.mean(vector, axis=0)
    return vector
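# Hedged illustration (not in the original code): get_vect crops the padded
# ELMo output for one sentence and mean-pools it into a single vector.
# The 1024-dimensional shape below is an assumption about the ELMo model.
padded = np.zeros((10, 1024))        # 10 rows = longest sentence in the batch
tokens = ['example', 'sentence']     # the actual sentence has only 2 tokens
sentence_vector = get_vect(padded, tokens)
print(sentence_vector.shape)         # (1024,)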
def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    raw_sentences = []
    with open(data_path, 'r') as f:
        for line in f:
            res = line.strip()
            raw_sentences.append(res)
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = list()
    averaged_vectors = list()

    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    for batch in [
            sentences[i * batch_size:(i + 1) * batch_size]
            for i in range((len(sentences) + batch_size - 1) // batch_size)
    ]:
        elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                              sentence_character_ids,
                                              elmo_sentence_input)
        # Due to batch processing, the code above produces the same number of
        # token vectors for every sentence, equal to the length of the longest
        # sentence in the batch (the 2nd dimension of the tensor). For shorter
        # sentences the vectors of non-existent words are filled with zeroes,
        # so we crop these redundant rows and also keep an averaged vector.
        cropped_vectors_batch = []
        for vect, sent in zip(elmo_vectors_batch, batch):
            cropped_vector = vect[:len(sent), :]
            cropped_vectors_batch.append(cropped_vector)
            averaged_vectors.append(np.mean(cropped_vector, axis=0))
        cropped_vectors.extend(cropped_vectors_batch)

    averaged_vectors_np = np.ndarray(
        (len(averaged_vectors), averaged_vectors[0].shape[0]),
        averaged_vectors[0].dtype)
    for i, avg_vector in enumerate(averaged_vectors):
        averaged_vectors_np[i] = avg_vector

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')
    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)
    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)
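# Hedged usage sketch (placeholder paths, not from the original script): the data
# file is expected to contain one raw sentence per line, and model_directory an
# ELMo model that load_elmo_embeddings can read.
make_elmo_vectors_ruwordnet('data/ruwordnet_senses.txt', 'models/ru_elmo', batch_size=25)
# Afterwards two files appear next to the data file:
#   <data>_elmo_vectors_<model>.pkl      - per-sentence token matrices (cropped)
#   <data>_elmo_avg_vectors_<model>.npy  - one averaged vector per sentence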