Beispiel #1
0
def elmo_indexing(cleaned, batcher, sentence_character_ids,
                  elmo_sentence_input):  # preprocessing
    """
    Index the corpus with ELMo and dump the result to ``elmo_index.csv``.

    Documents are embedded in batches of 200. Every document is processed,
    including those in a final batch that is shorter than 200.

    :param cleaned: list of lists of str, tokenized documents from the corpus
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model

    :return: list with one ``crop_vec`` result per document, in corpus order
    """
    batch_size = 200
    indexed = []
    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())
        # Iterate over batch *start* offsets so the trailing partial batch
        # (len(cleaned) % batch_size documents) is not silently dropped.
        for start in range(0, len(cleaned), batch_size):
            sentences = cleaned[start:start + batch_size]
            elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            indexed.extend(
                crop_vec(vect, sent)
                for vect, sent in zip(elmo_vectors, sentences))
    # Persist the index next to the working directory for later reuse.
    data_elmo = pd.DataFrame(indexed)
    data_elmo.to_csv('elmo_index.csv', index=False)
    return indexed
Beispiel #2
0
def elmo_query2vec(query, batcher, sentence_character_ids,
                   elmo_sentence_input):
    """
    Embed a query with ELMo and reduce it with ``get_vect``.

    :param query: raw query, passed through ``tokenize`` first
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model

    :return: result of ``get_vect`` for the first embedded item
    """
    tokens = tokenize(query)
    with tensorflow.Session() as session:
        # Variables must be initialized before running inference.
        session.run(tensorflow.global_variables_initializer())
        # NOTE(review): the tokenized query itself is passed as the batch and
        # only its first element is used -- confirm that ``tokenize`` returns
        # a list of sentences rather than a flat list of tokens.
        embedded = get_elmo_vectors(session, tokens, batcher,
                                    sentence_character_ids,
                                    elmo_sentence_input)
        return get_vect(embedded[0], tokens[0])
Beispiel #3
0
 def search(self, query, n=5):
     """
     Rank the indexed texts against a query by dot product.

     The lemmatized query is embedded with ELMo, averaged over axis 1 and
     flattened, then multiplied with ``self.vec``.

     :param query: query passed to ``lemmatize``
     :param n: number of best matches to return

     :return: list of (text, score) pairs, highest score first
     """
     with tf.Session() as session:
         # Variables must be initialized before running inference.
         session.run(tf.global_variables_initializer())
         # ``batcher``, ``ids`` and ``elmo_input`` come from the enclosing
         # module scope, not from ``self``.
         token_vectors = get_elmo_vectors(session, [lemmatize(query)],
                                          batcher, ids, elmo_input)
         pooled = np.mean(token_vectors, axis=1)
         query_vec = np.transpose(pooled).flatten()
         scores = np.matmul(self.vec, query_vec)
         # argsort is ascending; reverse it and keep the first n positions.
         best = np.argsort(scores)[::-1].tolist()[:n]
     # assumes self.texts supports indexing with a list of positions
     # (e.g. a numpy array) -- TODO confirm
     return list(zip(self.texts[best], scores[best]))
Beispiel #4
0
 def build_vec(self, sentences):
     """
     Build one averaged ELMo vector per sentence.

     :param sentences: sequence of sentences handed to ``get_elmo_vectors``;
         ``len()`` of each one decides how many token vectors are averaged

     :return: list of vectors, one per sentence, in input order
     """
     with tf.Session() as session:
         # Variables must be initialized before running inference.
         session.run(tf.global_variables_initializer())
         padded = get_elmo_vectors(
             session, sentences, self.batcher, self.sentence_character_ids,
             self.elmo_sentence_input)
         # Keep only the first len(sent) rows of each result before
         # averaging over axis 0.
         return [
             np.mean(vect[:len(sent), :], axis=0)
             for vect, sent in zip(padded, sentences)
         ]
    def transform(self, query):
        """
        Turn a query string into its vector.

        :param query: str
        :return: ``crop_vec`` result for the tokenized query
        """
        batcher, sentence_character_ids, elmo_sentence_input = self.model
        tokens = tokenize(query)
        with tf.Session() as session:
            # Variables must be initialized before running inference.
            session.run(tf.global_variables_initializer())
            # The model works on batches, so wrap the query in a
            # one-element batch and take the single result back out.
            embedded = get_elmo_vectors(session, [tokens], batcher,
                                        sentence_character_ids,
                                        elmo_sentence_input)
            return crop_vec(embedded[0], tokens)
Beispiel #6
0
def create_elmo_matrix(corpus, batcher, sentence_character_ids,
                       elmo_sentence_input):
    """
    Embed every document of the corpus with ELMo.

    Documents are processed in batches of 200. Every document is processed,
    including those in a final batch that is shorter than 200.

    :param corpus: sequence of documents handed to ``get_elmo_vectors``
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model

    :return: list with one ``get_vect`` result per document, in corpus order
    """
    batch_size = 200
    matrix = []
    with tensorflow.Session() as sess:
        # Variables must be initialized once before running inference.
        sess.run(tensorflow.global_variables_initializer())
        # Iterate over batch *start* offsets so the trailing partial batch
        # (len(corpus) % batch_size documents) is not silently dropped.
        for start in range(0, len(corpus), batch_size):
            sentences = corpus[start:start + batch_size]
            elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            matrix.extend(
                get_vect(vect, sent)
                for vect, sent in zip(elmo_vectors, sentences))
    return matrix
Beispiel #7
0
    def _search_elmo(self, query, top=10):
        """
        Find the indexed strings closest to a query by cosine distance.

        The query is preprocessed, embedded with ELMo and averaged into one
        vector, which is compared against every row of the stored index.

        :param query: raw query string (logged before preprocessing)
        :param top: maximum number of matches to return

        :return: tuple of (original string, 1 - cosine similarity) pairs,
            smallest distance first
        """
        logging.log(logging.INFO, "Запрос: " + query)
        query = self._preprocess_string(query)
        _, index = self.index_structs["elmo"]
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True,
                                        gpu_options=gpu_options)

        with tf.Session(config=session_config) as sess:
            with tf.device('/gpu:0'):
                # Variables must be initialized once before running inference.
                sess.run(tf.global_variables_initializer())

                # Timestamp kept from the original code; its value is not
                # read anywhere in this method.
                started_at = time.time()
                batch = [query]
                elmo_vectors = get_elmo_vectors(
                    sess, batch, self.models["elmo"].batcher,
                    self.models["elmo"].sentence_character_ids,
                    self.models["elmo"].elmo_sentence_input)

                # Keep only the first len(sent) rows of each result.
                cropped_vectors = [
                    vect[:len(sent), :]
                    for vect, sent in zip(elmo_vectors, batch)
                ]

                query_rows = cropped_vectors[0]
                query_vec = np.sum(query_rows, axis=0) / len(query_rows)
                # Product of squared norms (each index row times the query),
                # square-rooted to give the cosine denominator.
                norms = np.sqrt(
                    np.einsum('ij,ij->i', index, index) *
                    np.einsum('j,j', query_vec, query_vec))
                similarities = index.dot(query_vec) / norms.flatten()

                # Keyed by original string: a repeated string keeps only the
                # distance of its last occurrence.
                distances = {}
                for position, score in enumerate(similarities):
                    distances[self.orig_strings[position]] = 1 - score

                ranked = sorted(distances.items(), key=lambda pair: pair[1])
                print(query)
                return tuple(ranked[:top])
Beispiel #8
0
    def process(self, docs, index_path):
        """
        Embed all documents with ELMo and pickle the resulting index.

        Documents are embedded in batches of 50. Every document is processed,
        including those in a final batch that is shorter than 50. Each
        document is reduced to the mean of its cropped token vectors, and
        ``[{}, index]`` is pickled to ``<index_path>/<self.label>.pickle``.

        :param docs: sequence of documents handed to ``get_elmo_vectors``;
            ``len()`` of each one decides how many token vectors are kept
        :param index_path: directory in which the pickle file is written

        :return: True once the index has been written
        """
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
        batch_size = 50
        vectors = []

        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True,
                                      gpu_options=gpu_options)) as sess:
            with tf.device('/gpu:0'):
                # It is necessary to initialize variables once before running inference.
                sess.run(tf.global_variables_initializer())

                # Iterate over batch *start* offsets so the trailing partial
                # batch (len(docs) % batch_size documents) is indexed too.
                for batch_start in range(0, len(docs), batch_size):
                    batch = docs[batch_start:batch_start + batch_size]

                    elmo_vectors = get_elmo_vectors(
                        sess, batch, self.model.batcher,
                        self.model.sentence_character_ids,
                        self.model.elmo_sentence_input)

                    # Due to batch processing, each sentence gets as many
                    # token vectors as the longest sentence of the batch;
                    # shorter sentences are padded with zero vectors.
                    # Drop that padding:
                    vectors.extend(
                        vect[:len(sent), :]
                        for vect, sent in zip(elmo_vectors, batch))

            # One row per document: mean of its token vectors.
            index = np.array([np.sum(v, axis=0) / len(v) for v in vectors])
            struct = [{}, index]
            with open(os.path.join(index_path, self.label + ".pickle"),
                      "wb") as pfile:
                pickle.dump(struct, pfile)
            return True
Beispiel #9
0
def prepare_elmo_query(query, batcher, sentence_character_ids,
                       elmo_sentence_input):
    """
    Turn a query string into its vector.

    The query goes through ``preproc`` and ``tokenize`` before being
    embedded.

    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model

    :return: ``crop_vec`` result for the tokenized query
    """
    tokens = tokenize(preproc(query))
    with tf.Session() as session:
        # Variables must be initialized before running inference.
        session.run(tf.global_variables_initializer())
        # The model works on batches, so wrap the query in a one-element
        # batch and take the single result back out.
        embedded = get_elmo_vectors(session, [tokens], batcher,
                                    sentence_character_ids,
                                    elmo_sentence_input)
        return crop_vec(embedded[0], tokens)
Beispiel #10
0
 def search(self, query):
     """
     Return the ten corpus entries most similar to the query.

     The default graph is reset and the ELMo model is reloaded from
     ``self.elmo_path`` on every call.

     :param query: query handed to ``get_elmo_vectors`` as a one-element
         batch; ``len(query)`` decides how many token vectors are averaged

     :return: list of up to ten (index, cosine similarity,
         ``self.corpus['question2'][index]``) tuples, most similar first
     """
     tf.reset_default_graph()
     model = load_elmo_embeddings(self.elmo_path)
     batcher, sentence_character_ids, elmo_sentence_input = model
     batch = [query]
     with tf.Session() as session:
         # Variables must be initialized before running inference.
         session.run(tf.global_variables_initializer())
         padded = get_elmo_vectors(session, batch, batcher,
                                   sentence_character_ids,
                                   elmo_sentence_input)
         # Keep the first len(sent) rows of each result and average them.
         query_vecs = [
             np.mean(vect[:len(sent), :], axis=0)
             for vect, sent in zip(padded, batch)
         ]
     similarities = cosine_similarity(query_vecs, self.vectors)
     ranked = sorted(enumerate(similarities[0]),
                     key=lambda pair: pair[1],
                     reverse=True)
     return [(position, score, self.corpus['question2'][position])
             for position, score in ranked[:10]]
    def indexing(self, cleaned):
        """
        Index the corpus with the ELMo model stored in ``self.model``.

        Documents are embedded in batches of 200. Every document is
        processed, including those in a final batch that is shorter than 200.

        :param cleaned: list of lists of str, tokenized documents from the corpus

        :return: list with one ``crop_vec`` result per document, in corpus order
        """
        batcher, sentence_character_ids, elmo_sentence_input = self.model
        batch_size = 200
        indexed = []
        with tf.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())
            # Iterate over batch *start* offsets so the trailing partial
            # batch (len(cleaned) % batch_size documents) is not dropped.
            for start in range(0, len(cleaned), batch_size):
                sentences = cleaned[start:start + batch_size]
                elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                                sentence_character_ids,
                                                elmo_sentence_input)
                indexed.extend(
                    crop_vec(vect, sent)
                    for vect, sent in zip(elmo_vectors, sentences))
        return indexed
def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    """
    Embed every line of a text file with ELMo and store the results.

    Each stripped line of ``data_path`` is tokenized and embedded in batches.
    Two files are written next to the input file:

    * ``<data>_elmo_vectors_<model>.pkl`` -- pickled list of per-sentence
      token-vector arrays with the batch padding removed;
    * ``<data>_elmo_avg_vectors_<model>.npy`` -- array with one row per
      sentence, the mean of its token vectors.

    :param data_path: path to the input text file, one sentence per line
    :param model_directory: directory passed to ``load_elmo_embeddings``
    :param batch_size: number of sentences embedded per call
    """
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    # NOTE(review): the file is read with the platform default encoding --
    # confirm whether an explicit encoding should be passed here.
    with open(data_path, 'r') as f:
        raw_sentences = [line.strip() for line in f]
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = []
    averaged_vectors = []
    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    try:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())

        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                                  sentence_character_ids,
                                                  elmo_sentence_input)

            # Due to batch processing, each sentence gets as many token
            # vectors as the longest sentence of the batch; shorter sentences
            # are padded with zero vectors. Drop that padding. The lengths
            # must come from the *current batch*, not from the start of the
            # whole corpus, otherwise every batch after the first is cropped
            # with the wrong sentence lengths.
            for vect, sent in zip(elmo_vectors_batch, batch):
                cropped_vector = vect[:len(sent), :]
                cropped_vectors.append(cropped_vector)
                averaged_vectors.append(np.mean(cropped_vector, axis=0))
    finally:
        # Release the session (and its device memory) even if embedding fails.
        sess.close()

    # One row per sentence; also well-defined (empty) for an empty input file.
    averaged_vectors_np = np.array(averaged_vectors)

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir,
        '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')

    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)

    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)