Code example #1
def get_data_elmo(corpus, stop=5000):
    """
    Проходит по корпусу и токенизирует тексты.

    :param corpus: path to csv file with corpus
    :param stop: int, how many lines we want to get
    :return: 
        indexed -> list of list of strings
        id_to_text -> dict, map of text_id to raw text. 
        query_to_dupl -> dict, query:id of its duplicate

    """
    indexed = []
    id_to_text = {}
    query_to_id = {}
    counter = 0

    for idx, doc in enumerate(corpus):
        doc = str(doc)
        indexed.append(tokenize(doc))
        id_to_text[idx] = doc
        counter += 1
        query_to_id[doc] = idx

        if counter >= stop:
            break

    return indexed, id_to_text, query_to_id
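
All of the snippets on this page call a project-level tokenize() helper that the excerpts never define. A minimal stand-in, assuming it simply lowercases the text and extracts word-like tokens (the regex is a guess, not the project's actual implementation):

import re

def tokenize(text):
    # hypothetical replacement for the project's tokenize() helper:
    # lowercase and keep alphanumeric tokens
    return re.findall(r'\w+', str(text).lower())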
Code example #2
def get_data_elmo(corpus, stop=5000):
    """
    Проходит по корпусу и токенизирует тексты.

    :param corpus: path to csv file with corpus
    :param stop: int, how many lines we want to get
    :return:
        indexed -> list of list of strings
        id_to_text -> dict, map of text_id to raw text.
        query_to_dupl -> dict, query:id of its duplicate

    """
    indexed = []
    counter = 0

    with open(corpus, 'r', encoding='utf-8') as f:
        r = csv.reader(f)
        for line in r:

            if line[0] == '':
                continue

            _id, text, query, isduplicate = line
            indexed.append(tokenize(text))

            counter += 1
            if counter >= stop:
                break
    return indexed
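
A self-contained usage sketch for the CSV variant; the column layout (id, text, query, is_duplicate) is inferred from the unpacking _id, text, query, isduplicate = line, and the file name is a placeholder. It assumes csv is imported at module level and a tokenize() helper like the sketch above is available:

import csv

# build a tiny CSV with the expected (id, text, query, is_duplicate) columns
with open('sample.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['1', 'how to cook rice', 'cooking rice fast', '1'])

indexed = get_data_elmo('sample.csv', stop=1)
print(indexed)  # [['how', 'to', 'cook', 'rice']]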
Code example #3
File: via_elmo.py Project: mcqueenaa/infosearch2019
def elmo_query2vec(query, batcher, sentence_character_ids,
                   elmo_sentence_input):
    query = [tokenize(query)]  # wrap into a batch of one tokenized sentence
    with tensorflow.Session() as sess:
        sess.run(tensorflow.global_variables_initializer())
        vector = get_vect(get_elmo_vectors(sess, query, batcher,
                                           sentence_character_ids,
                                           elmo_sentence_input)[0], query[0])
    return vector
Code example #4
    def indexing(self, sentences):
        self.collection = sentences
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            start = time()
            sentences = [tokenize(sent) for sent in sentences]
            for sent in sentences:
                sent_vec = self.build_vec([sent])
                self.vectors.append(sent_vec[0])

            print(f'=====\n'
                  f'ElmoSearch Indexing takes {time() - start} sec '
                  f'for {len(sentences)} docs')
            return self.vectors
Code example #5
    def transform(self, query):
        """
        Gets vector of query

        :param query: str
        :return: vector of query
        """
        batcher, sentence_character_ids, elmo_sentence_input = self.model
        q = [tokenize(query)]
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            vector = crop_vec(
                get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                                 elmo_sentence_input)[0], q[0])
        return vector
Code example #6
def prepare_elmo_query(query, batcher, sentence_character_ids,
                       elmo_sentence_input):
    """ 
    Gets vector of query

    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    
    :return: vector of query
    """
    query = preproc(query)
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0], q[0])
    return vector
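
crop_vec() is not included in these excerpts. Judging from get_vect() in example #8 below, it presumably drops the zero-padding rows that batching adds and mean-pools the remaining token vectors; a sketch under that assumption:

import numpy as np

def crop_vec(vect, sent):
    # assumed behaviour, mirroring get_vect() from example #8:
    # keep only the rows for real tokens, then average them
    cropped = vect[:len(sent), :]
    return np.mean(cropped, axis=0)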
Code example #7
    def fit(self, n=0):
        LOGGER.info('Wait: indexing of elmo')
        if os.path.isfile(Elmo_model):
            return np.load(Elmo_model)
        vectors = []
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            start = time()
            sentences = [
                tokenize(sent) for sent in self.collection[n:n + 1000]
            ]
            for idx, sent in enumerate(sentences):
                sent_vec = self.build_vec([sent])
                vectors.append(sent_vec)
                print(idx)  # crude progress indicator

            LOGGER.info(f'ElmoSearch Indexing takes {time() - start} '
                        f'sec for {len(sentences)} docs')
            np.save(Elmo_model, vectors)
            return vectors
Code example #8
File: via_elmo.py Project: mcqueenaa/infosearch2019
if not os.path.exists('documents.pickle'):
    docs = []
    for idx, line in enumerate(file):
        if idx != 0 and idx < 5002:
            docs.append(line[2])
    with open("documents.pickle", "wb") as c:
        pickle.dump(docs, c)
else:
    with open("documents.pickle", "rb") as c:
        docs = pickle.load(c)

if not os.path.exists('elmo_corpus.pickle'):
    corpus = []
    for idx, sent in enumerate(docs):
        if idx < 1001:
            corpus.append(tokenize(sent))
        else:
            break
    # write the pickle once, after the corpus is fully built
    with open("elmo_corpus.pickle", "wb") as c:
        pickle.dump(corpus, c)
else:
    with open("elmo_corpus.pickle", "rb") as c:
        corpus = pickle.load(c)


def get_vect(vect, sent):
    vector = vect[:len(sent), :]
    vector = np.mean(vector, axis=0)
    return vector

Code example #9
def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    raw_sentences = []
    with open(data_path, 'r') as f:
        for line in f:
            res = line.strip()
            raw_sentences.append(res)
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = list()
    averaged_vectors = list()
    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    for batch in [
            sentences[i * batch_size:(i + 1) * batch_size]
            for i in range((len(sentences) + batch_size - 1) // batch_size)
    ]:
        elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                              sentence_character_ids,
                                              elmo_sentence_input)

        # print('ELMo embeddings for your input are ready')
        # print('Tensor shape:', elmo_vectors.shape)

        # Due to batch processing, the above code produces for each sentence
        # the same number of token vectors, equal to the length of the longest sentence
        # (the 2nd dimension of the elmo_vector tensor).
        # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
        # Let's make a version without these redundant vectors:
        cropped_vectors_batch = []
        # pair each vector matrix with its own sentence from this batch
        for vect, sent in zip(elmo_vectors_batch, batch):
            cropped_vector = vect[:len(sent), :]
            cropped_vectors_batch.append(cropped_vector)
            averaged_vectors.append(np.mean(cropped_vector, axis=0))

        cropped_vectors.extend(cropped_vectors_batch)

    averaged_vectors_np = np.stack(averaged_vectors)

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir,
        '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')

    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)

    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)
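
A hypothetical invocation; both paths are placeholders, and load_elmo_embeddings() / get_elmo_vectors() are expected to come from the project's ELMo helper module:

make_elmo_vectors_ruwordnet(
    data_path='data/ruwordnet_sentences.txt',  # one sentence per line
    model_directory='models/ru_elmo',          # directory with the ELMo model files
    batch_size=25,
)
# writes <data_name>_elmo_vectors_<model_name>.pkl and
# <data_name>_elmo_avg_vectors_<model_name>.npy next to the input file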