Example #1
def _encode_corpus(path, corpus):
    model = ElmoModel()
    tf.reset_default_graph()
    model.load(path)
    elmo_vectors = model.get_elmo_vectors(corpus, layers='top')
    return elmo_vectors
Example #2
def prepare_elmo(load_from_file=False):
    logging.info('Preparing sequential data (Elmo)...')

    if load_from_file:
        logging.info('Loading from file...')
        x_train = pickle.load(open('embeddings/X_train_elmo.pickle', 'rb'))
        y_train = pickle.load(open('embeddings/y_train_elmo.pickle', 'rb'))
        x_test = pickle.load(open('embeddings/X_test_elmo.pickle', 'rb'))
        y_test = pickle.load(open('embeddings/y_test_elmo.pickle', 'rb'))
        return x_train, y_train, x_test, y_test

    elmo_model = ElmoModel()
    elmo_model.load('embeddings/193.zip')

    df_train = load_data(dataset_name='train')
    df_test = load_data(dataset_name='test')

    df_train = df_train[['user_id', 'post_title', 'post_body', 'label']]
    df_test = df_test[['user_id', 'post_title', 'post_body', 'label']]

    #x_train = []
    texts_train = []
    for doc in nlp.pipe(df_train.post_body):
        #texts_train.append([spacy_tokenize(doc) for sent in doc.sents])
        texts_train.append(spacy_tokenize(doc))
        #x_train.append(elmo_model.get_elmo_vector_average(spacy_tokenize(doc)))

    #x_test = []
    texts_test = []
    for doc in nlp.pipe(df_test.post_body):
        #texts_test.append([spacy_tokenize(sent) for sent in doc.sents])
        texts_test.append(spacy_tokenize(doc))
        #x_test.append(elmo_model.get_elmo_vector_average(spacy_tokenize(doc)))

    print('x_train:', np.asarray(texts_train).shape)
    print('x_test :', np.asarray(texts_test).shape)

    x_train = elmo_model.get_elmo_vector_average(texts_train)
    x_test = elmo_model.get_elmo_vector_average(texts_test)

    print('x_train.shape:' + str(x_train.shape))
    print('x_test.shape :' + str(x_test.shape))

    logging.info('Preparing train data...')
    lb = LabelBinarizer()
    lb.fit(df_train.label)
    y_train = lb.transform(df_train.label)

    logging.info('Preparing test data...')
    # Reuse the binarizer fitted on the training labels so the class order stays consistent
    y_test = lb.transform(df_test.label)

    logging.info('Saving data to files...')
    pickle.dump(x_train, open('embeddings/X_train_elmo.pickle', 'wb'))
    pickle.dump(y_train, open('embeddings/y_train_elmo.pickle', 'wb'))
    pickle.dump(x_test, open('embeddings/X_test_elmo.pickle', 'wb'))
    pickle.dump(y_test, open('embeddings/y_test_elmo.pickle', 'wb'))

    return x_train, y_train, x_test, y_test
Example #3
class ELMo_Embedder(nn.Module):
    """
    Transform tokens to embeddings
    """
    def __init__(self, embeddings_path: str):
        super().__init__()
        self.model = ElmoModel()
        self.model.load(embeddings_path)
        self.sess = self.model.get_elmo_session()
        print('ELMo Embedding Model is Loaded')

    def forward(self, x: List) -> torch.Tensor:
        # embed = self.model.get_elmo_vectors(x)
        embed = self.model.get_elmo_vectors_session(x, self.sess)
        embed = torch.Tensor(embed)
        return embed
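
# Hypothetical usage sketch (not in the original source): feed a batch of
# tokenized sentences and get back a torch.Tensor of contextual vectors.
# The archive path below is a placeholder for a simple_elmo model file.
if __name__ == "__main__":
    embedder = ELMo_Embedder("embeddings/193.zip")
    batch = [["first", "example", "sentence"], ["second", "one"]]
    vectors = embedder(batch)  # approx. shape: (batch, max_sentence_length, elmo_dim)
    print(vectors.shape)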
Example #4
def load_elmo(path: str, batch_size: int, method=None):
    """
        loads elmo model and returns it with its graph
        to handle multiple graph issues.

        Both are used in extract_embeddings()
    """
    if method == "simple":
        elmo_model = ElmoModel()
        elmo_model.load(path, batch_size)
        elmo_graph = None
    else:
        graph = Graph()
        with graph.as_default() as elmo_graph:
            elmo_model = ElmoModel()
            elmo_model.load(path, batch_size)
    return elmo_model, elmo_graph
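
# Hypothetical usage sketch (extract_embeddings() itself is not shown in the
# original): consume the returned (model, graph) pair, entering the graph
# context only when one was created. The path and batch size are placeholders.
def embed_with_loaded_elmo(elmo_model, elmo_graph, tokenized_sentences):
    if elmo_graph is not None:
        with elmo_graph.as_default():
            return elmo_model.get_elmo_vectors(tokenized_sentences)
    return elmo_model.get_elmo_vectors(tokenized_sentences)

# elmo_model, elmo_graph = load_elmo("195.zip", batch_size=64)
# vectors = embed_with_loaded_elmo(elmo_model, elmo_graph, [["a", "test", "sentence"]])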
Example #5
import csv

import pandas as pd
from simple_elmo import ElmoModel

batch_size = 32

model = ElmoModel()
model.load('../data/195.zip', max_batch_size=batch_size)

data = pd.read_csv('../data/documents_advanced.csv', delimiter=',', index_col=0)
messages = data['text'].tolist()

with open('../data/documents_advanced_vectors.csv', 'w') as out:
    csvwriter = csv.writer(out, delimiter=',', quotechar="\"", quoting=csv.QUOTE_ALL)
    res = ['', 'text'] + [str(i) for i in range(0, 1024)]
    csvwriter.writerow(res)

    i = 0
    while i < len(messages):
        nxt = min(len(messages), i + batch_size)

        batch = messages[i:nxt]
        sentences = [s.split(' ') for s in batch]
        vectors = model.get_elmo_vector_average(sentences)

        for j, vector in enumerate(vectors):
            res = [str(i+j), batch[j]] + [str(val) for val in vector]
            csvwriter.writerow(res)

        i = nxt
Example #6
class ConsultantPlusAnalyzer:
    def __init__(self, is_elmo_used=False):
        self.config = get_config('config.yml')
        self.parser = ConsultantPlusParser(config=self.config)
        self.model = ElmoModel()
        self.mystem = Mystem()
        self.spec_chars = string.punctuation + '\n\xa0«»\t—…'
        self.stop_words = stopwords.words("russian")
        self.stop_words.extend([
            'и',
            'в',
            'на',
            'n',
            'рф',
            'гк',
            'юридического',
            ' ',
            '1',
            'ред',
            '2',
            'ст',
            'также',
            'свой',
            'либо',
            'это',
            'текст',
            'закон',
            'который',
            'иной',
            'год',
            'мочь',
        ])
        if is_elmo_used:
            self.model.load(self.config['model_info_file'])
        self.navec = Navec.load(self.config['navec_news_v1_1B_250K_300d_100q'])
        self.syntax = Syntax.load(self.config['slovnet_syntax_news_v1'])
        self.syntax.navec(self.navec)

    def save_information_about_target_words_by_codex_type(
            self, codex_type, codex_id):
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        if os.path.exists(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id))):
            os.remove(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id)))
        with open(generate_file_name_with_postfix(
                self.config['information_about_target_words'], str(codex_id)),
                  mode='w') as information_about_target_words_file:
            information_about_target_words_writer = csv.writer(
                information_about_target_words_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            information_about_target_words_writer.writerow([
                'article_id', 'article_title', 'parts_after_target_words',
                'sentences'
            ])
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                if text.find('если иное не предусмотрено') != -1:
                    text_parts = text.split('если иное не предусмотрено')
                    parts_before_target_words = list()
                    for i in range(0, len(text_parts) - 1):
                        parts_before_target_words.append(
                            text_parts[i].split('.')[-1])
                    parts_after_target_words = list()
                    for i in range(1, len(text_parts)):
                        parts_after_target_words.append(
                            text_parts[i].split('.')[0])
                    sentences = list()
                    for i in range(len(parts_before_target_words)):
                        sentences.append(parts_before_target_words[i] +
                                         'если иное не предусмотрено' +
                                         parts_after_target_words[i])
                    information_about_target_words_writer.writerow([
                        article_info.id, article_info.title,
                        '~'.join(parts_after_target_words), '~'.join(sentences)
                    ])

    def plot_word_vectors_graph(self,
                                proximity_threshold,
                                count_of_words=None):
        # TODO: find a proximity threshold for each article
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        for article_id, article_vectors in tqdm(articles_vectors_info.items()):
            n = len(article_vectors) if not count_of_words else count_of_words
            dist_matrix = np.zeros((n, n))
            for i in range(n):
                for j in range(n):
                    dist_matrix[i][j] = self.get_euclidean_distance(
                        article_vectors[i], article_vectors[j])
            G = nx.Graph()
            edges_info = dict()
            for i in range(n):
                for j in range(n):
                    if dist_matrix[i][j] > proximity_threshold and (
                        (articles_words_info[article_id][i],
                         articles_words_info[article_id][j]) not in edges_info
                            or (articles_words_info[article_id][j],
                                articles_words_info[article_id][i])
                            not in edges_info):
                        edges_info[(
                            articles_words_info[article_id][i],
                            articles_words_info[article_id][j])] = round(
                                dist_matrix[i][j], 2)
            G.add_weighted_edges_from([(item[0][0], item[0][1], item[1])
                                       for item in edges_info.items()])
            pos = nx.spring_layout(G)
            plt.figure(figsize=(50, 50))
            nx.draw(G, pos, node_size=10000, with_labels=True)
            nx.draw_networkx_edge_labels(G, pos, edge_labels=edges_info)
            plt.show()

    def save_syntax_analysis_by_text(self,
                                     text,
                                     file,
                                     is_many_sentences=False):
        f = open(file, 'a')
        sys.stdout = f
        print('-' * 100)
        if text != 'None':
            if not is_many_sentences:
                chunk = list()
                for sent in sentenize(text):
                    tokens = [_.text for _ in tokenize(sent.text)]
                    chunk.append(tokens)
                markup = next(self.syntax.map(chunk))
                words, deps = list(), list()
                for token in markup.tokens:
                    words.append(token.text)
                    source = int(token.head_id) - 1
                    target = int(token.id) - 1
                    if source > 0 and source != target:
                        deps.append([source, target, token.rel])
                show_markup(words, deps)
            else:
                for sentence in text.split('.'):
                    if len(sentence.split()) > 5:
                        chunk = list()
                        for sent in sentenize(sentence):
                            tokens = [_.text for _ in tokenize(sent.text)]
                            chunk.append(tokens)
                        markup = next(self.syntax.map(chunk))
                        words, deps = list(), list()
                        for token in markup.tokens:
                            words.append(token.text)
                            source = int(token.head_id) - 1
                            target = int(token.id) - 1
                            if source > 0 and source != target:
                                deps.append([source, target, token.rel])
                        show_markup(words, deps)
        else:
            print('None')
        print('-' * 100)

    def get_words_matrix_variance(self):
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        for article_id, article_vectors in tqdm(articles_vectors_info.items()):
            mat = np.array(article_vectors)
            print(f'The variance in article {article_id} is {mat.var()}')

    def get_prediction(self, words=None, file_with_vectors=None):
        words_vectors = list()
        if file_with_vectors:
            with open(file_with_vectors) as file:
                reader = csv.reader(file)
                for row in reader:
                    words_vectors.append(
                        self.convert_vector_from_string_value(row))
        else:
            for word in tqdm(words):
                words_vectors.append(self.model.get_elmo_vectors(word)[0][0])
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        articles_distance_info = list()
        for word_vector in tqdm(words_vectors):
            articles_distance_info.append(dict())
            for article_id in tqdm(articles_vectors_info):
                if article_id not in articles_distance_info[-1]:
                    articles_distance_info[-1][article_id] = list()
                for vector in tqdm(articles_vectors_info[article_id]):
                    articles_distance_info[-1][article_id].append(
                        self.get_euclidean_distance(word_vector, vector))
        articles_average_distance_info = list()
        for info in tqdm(articles_distance_info):
            articles_average_distance_info.append(dict())
            for article_id in tqdm(info):
                articles_average_distance_info[-1][article_id] = np.average(
                    np.array(info[article_id]))
        prediction_articles_id = list()
        for info in articles_average_distance_info:
            id = -1
            min_dist = sys.maxsize
            for article_id, dist in info.items():
                if dist < min_dist:
                    id = article_id
                    min_dist = dist
            prediction_articles_id.append(id)
        print(prediction_articles_id)

    def save_unique_words_in_articles_analysis(self, codex_type, codex_id):
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        articles_info = list()
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            text = Text(article_tokens)
            f_dist = FreqDist(text)
            f_dist = list(filter(lambda item: item[1] == 1, f_dist.items()))
            articles_info.append(
                (article_info.id, len(f_dist) / len(article_tokens)))
        if os.path.exists(
                generate_file_name_with_postfix(
                    self.config['unique_words_in_articles_analysis_file'],
                    str(codex_id))):
            os.remove(
                generate_file_name_with_postfix(
                    self.config['unique_words_in_articles_analysis_file'],
                    str(codex_id)))
        with open(generate_file_name_with_postfix(
                self.config['unique_words_in_articles_analysis_file'],
                str(codex_id)),
                  mode='w') as unique_words_in_articles_analysis_file:
            unique_words_in_articles_analysis_writer = csv.writer(
                unique_words_in_articles_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            unique_words_in_articles_analysis_writer.writerow(
                ['article_id', 'unique_words_frequency'])
            for frequency_info in articles_info:
                unique_words_in_articles_analysis_writer.writerow(
                    [frequency_info[0], frequency_info[1]])

    def save_most_popular_words_analysis(self, most_common_quantity):
        articles_tokens = list()
        for (codex_type, _) in tqdm(self.parser.codex_urls):
            raw_articles_info = self.parser.sorted_articles_info[codex_type]
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                text = text.lower()
                text = self.remove_chars_from_text(text, self.spec_chars)
                article_tokens = word_tokenize(' '.join(
                    self.mystem.lemmatize(text)))
                for stop_word in self.stop_words:
                    while stop_word in article_tokens:
                        article_tokens.remove(stop_word)
                articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        if os.path.exists(self.config['most_popular_words_analysis_file']):
            os.remove(self.config['most_popular_words_analysis_file'])
        with open(self.config['most_popular_words_analysis_file'],
                  mode='w') as most_popular_words_analysis_file:
            most_popular_words_analysis_writer = csv.writer(
                most_popular_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            most_popular_words_analysis_writer.writerow(
                ['word', 'word_count', 'frequency'])
            for info in f_dist.most_common(most_common_quantity):
                most_popular_words_analysis_writer.writerow(
                    [info[0], info[1], info[1] / len(articles_tokens)])

    def save_unique_words_analysis(self, uniqueness_threshold):
        """Сохраняем информацию о количестве уникальных слов и количестве статей, в которых эти слова встречаются, а также информацию о заданном количестве уникальных слов"""
        articles_tokens = list()
        articles_words_info = dict()
        for (codex_type, _) in tqdm(self.parser.codex_urls):
            raw_articles_info = self.parser.sorted_articles_info[codex_type]
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                text = text.lower()
                text = self.remove_chars_from_text(text, self.spec_chars)
                article_tokens = word_tokenize(' '.join(
                    self.mystem.lemmatize(text)))
                for stop_word in self.stop_words:
                    while stop_word in article_tokens:
                        article_tokens.remove(stop_word)
                articles_words_info[self.get_unique_article_identifier(
                    codex_type, article_info.id)] = list(set(article_tokens))
                articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        f_dist = list(
            filter(lambda item: item[1] <= uniqueness_threshold,
                   f_dist.items()))
        unique_words_info = dict()
        # Store the information as: 'unique word': [count in the whole corpus, number of articles in which the word occurs]
        for word_info in f_dist:
            if word_info[0] not in unique_words_info:
                unique_words_info[word_info[0]] = [word_info[1], 0]
            for article_id in tqdm(articles_words_info):
                if word_info[0] in articles_words_info[article_id]:
                    unique_words_info[word_info[0]][1] += 1
        if os.path.exists(self.config['articles_unique_words_info_file']):
            os.remove(self.config['articles_unique_words_info_file'])
        with open(self.config['articles_unique_words_info_file'],
                  mode='w') as articles_unique_words_info_file:
            articles_unique_words_info_writer = csv.writer(
                articles_unique_words_info_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_info_writer.writerow(
                ['word', 'word_count', 'articles_count'])
            for info in unique_words_info.items():
                articles_unique_words_info_writer.writerow(
                    [info[0], info[1][0], info[1][1]])
        unique_words_metrics = dict()
        # Store the information as: 'a given corpus-wide word count': 'the number of such words in the whole corpus'
        for value in unique_words_info.values():
            if value[0] not in unique_words_metrics:
                unique_words_metrics[value[0]] = value[1]
            else:
                unique_words_metrics[value[0]] += value[1]
        if os.path.exists(self.config['articles_unique_words_analysis_file']):
            os.remove(self.config['articles_unique_words_analysis_file'])
        with open(self.config['articles_unique_words_analysis_file'],
                  mode='w') as articles_unique_words_analysis_file:
            articles_unique_words_analysis_writer = csv.writer(
                articles_unique_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_analysis_writer.writerow([
                'count_unique_words_frequency', 'count_unique_words_in_corpus'
            ])
            for info in unique_words_metrics.items():
                articles_unique_words_analysis_writer.writerow(
                    [info[0], info[1]])
        if os.path.exists(
                self.
                config['articles_unique_words_analysis_file_with_frequency']):
            os.remove(
                self.
                config['articles_unique_words_analysis_file_with_frequency'])
        with open(self.
                  config['articles_unique_words_analysis_file_with_frequency'],
                  mode='w') as articles_unique_words_analysis_file:
            articles_unique_words_analysis_writer = csv.writer(
                articles_unique_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_analysis_writer.writerow([
                'count_unique_words_frequency',
                'count_unique_words_in_corpus_frequency'
            ])
            for info in unique_words_metrics.items():
                articles_unique_words_analysis_writer.writerow(
                    [info[0], info[1] / len(articles_tokens)])

    def save_codex_hist_info(self, codex_type, codex_id, constraint=None):
        """Сохранение частотности слов во всем корпусе"""
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        articles_tokens = list()
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        if not constraint:
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.config['articles_frequency_info_file'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.config['articles_frequency_info_file'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.config['articles_frequency_info_file'],
                    str(codex_id)),
                      mode='w') as articles_frequency_info_file:
                articles_frequency_info_writer = csv.writer(
                    articles_frequency_info_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                articles_frequency_info_writer.writerow(['word', 'frequency'])
                for frequency_info in f_dist.most_common(100):
                    articles_frequency_info_writer.writerow([
                        frequency_info[0],
                        frequency_info[1] / len(articles_tokens)
                    ])
        else:
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.
                        config['articles_frequency_info_file_with_constraint'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.
                        config['articles_frequency_info_file_with_constraint'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.
                    config['articles_frequency_info_file_with_constraint'],
                    str(codex_id)),
                      mode='w') as articles_frequency_info_file:
                articles_frequency_info_writer = csv.writer(
                    articles_frequency_info_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                articles_frequency_info_writer.writerow(['word', 'frequency'])
                f_dist = list(
                    filter(lambda item: item[1] > constraint, f_dist.items()))
                for frequency_info in f_dist:
                    articles_frequency_info_writer.writerow([
                        frequency_info[0],
                        frequency_info[1] / len(articles_tokens)
                    ])

    def save_word_vectors_analysis_info(self, codex_type, most_common_count):
        articles_info = dict()
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        i = 0
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            article_vectors = list()
            article_words = list()
            text = Text(article_tokens)
            f_dist = FreqDist(text)
            for token in tqdm(f_dist.most_common(most_common_count)):
                vector = self.model.get_elmo_vectors(token[0])
                article_words.append(token[0])
                article_vectors.append(vector[0][0])
            articles_info[article_info.id] = [article_vectors, article_words]
            i += 1
            if i == 20:
                break
        if os.path.exists(self.config['articles_vectors_info_file']):
            os.remove(self.config['articles_vectors_info_file'])
        with open(self.config['articles_vectors_info_file'],
                  mode='w') as articles_vectors_info_file:
            articles_vectors_info_writer = csv.writer(
                articles_vectors_info_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            for article_id, article_vectors_info in articles_info.items():
                for i, article_vector_info in enumerate(
                        article_vectors_info[0]):
                    articles_vectors_info_writer.writerow([
                        article_id, *article_vector_info,
                        article_vectors_info[1][i]
                    ])

    def frequency_analysis_of_words(self):
        articles_tokens = list()
        for i in tqdm(range(10)):
            text = self.parser.get_article_text_by_id(i)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            articles_tokens.extend(
                word_tokenize(' '.join(self.mystem.lemmatize(text))))
        for stop_word in self.stop_words:
            while stop_word in articles_tokens:
                articles_tokens.remove(stop_word)
        raw_text = ' '.join(articles_tokens)
        word_cloud = WordCloud().generate(raw_text)
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        f_dist.plot(30, cumulative=False)

    def links_on_target_words_analysis(self):
        for codex_id in tqdm(range(len(self.parser.codex_urls))):
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.
                        config['information_about_target_words_with_links'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.
                        config['information_about_target_words_with_links'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.config['information_about_target_words_with_links'],
                    str(codex_id)),
                      mode='w') as information_about_target_words_file:
                information_about_target_words_writer = csv.writer(
                    information_about_target_words_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                information_about_target_words_writer.writerow([
                    'article_id', 'article_title', 'article_url',
                    'links_on_target'
                ])
                target_words_info = pd.read_csv(
                    generate_file_name_with_postfix(
                        self.config['information_about_target_words'],
                        str(codex_id)))
                for row in tqdm(target_words_info.itertuples()):
                    links_on_target = list()
                    for part_of_target_words in row[3].split('~'):
                        if self.parser.get_links_on_target_words_by_id_and_target_words(
                                row[1], part_of_target_words):
                            links_on_target.append(
                                self.parser.
                                get_links_on_target_words_by_id_and_target_words(
                                    row[1], part_of_target_words))
                        else:
                            links_on_target.append('None')
                    information_about_target_words_writer.writerow([
                        row[1], row[2],
                        self.parser.get_article_url_by_id(row[1]),
                        ' '.join(links_on_target)
                    ])

    @staticmethod
    def save_syntax_analysis(analyzer):
        for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    analyzer.config['information_about_target_words'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                for sentence in row[-1].split('~'):
                    analyzer.save_syntax_analysis_by_text(
                        sentence,
                        generate_file_name_with_postfix(
                            analyzer.
                            config['article_target_words_realation_info'],
                            str(row[1])))

    @staticmethod
    def save_syntax_analysis_in_links(analyzer):
        for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    analyzer.
                    config['information_about_target_words_with_links'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                for url in row[-1].split(' '):
                    if url != 'None':
                        analyzer.save_syntax_analysis_by_text(
                            analyzer.parser.get_text_by_url(url),
                            generate_file_name_with_postfix(
                                analyzer.config[
                                    'article_target_words_in_links_realation_info'],
                                str(row[1])),
                            is_many_sentences=True)
                    else:
                        analyzer.save_syntax_analysis_by_text(
                            'None',
                            generate_file_name_with_postfix(
                                analyzer.config[
                                    'article_target_words_in_links_realation_info'],
                                str(row[1])))

    @staticmethod
    def plot_frequency_analysis_of_words(analyzer, is_constraint=None):
        """Построение частотности слов во всем корпусе"""
        if not is_constraint:
            for i in range(10):
                data = pd.read_csv(generate_file_name_with_postfix(
                    analyzer.config['articles_frequency_info_file'], str(i)),
                                   delimiter=',')
                data.plot(x='word',
                          y='frequency',
                          figsize=(50, 7),
                          kind='scatter')
                plt.xticks(rotation=60)
                plt.show()
        else:
            for i in range(10):
                data = pd.read_csv(generate_file_name_with_postfix(
                    analyzer.
                    config['articles_frequency_info_file_with_constraint'],
                    str(i)),
                                   delimiter=',')
                data = data.sort_values(by='frequency', axis='index')
                data.plot(x='word',
                          y='frequency',
                          figsize=(50, 7),
                          kind='scatter')
                plt.xticks(rotation=60)
                plt.show()

    @staticmethod
    def plot_unique_words_in_articles_analysis(analyzer):
        """Графики частотности уникальных слов в каждом кодексе по article_id"""
        for i in range(10):
            data = pd.read_csv(generate_file_name_with_postfix(
                analyzer.config['unique_words_in_articles_analysis_file'],
                str(i)),
                               delimiter=',')
            data = data.sort_values('unique_words_frequency')
            data.plot(x='article_id',
                      y='unique_words_frequency',
                      kind='scatter')
            plt.show()

    @staticmethod
    def plot_unique_words_in_articles_analysis_on_one_graph(analyzer):
        """График частотности уникальных слов в каждом кодексе на одном графике с отсортированной частотностью"""
        data = pd.read_csv(generate_file_name_with_postfix(
            analyzer.config['unique_words_in_articles_analysis_file'], str(0)),
                           delimiter=',')
        for i in range(1, 10):
            data = pd.concat([
                data,
                pd.read_csv(generate_file_name_with_postfix(
                    analyzer.config['unique_words_in_articles_analysis_file'],
                    str(i)),
                            delimiter=',')
            ])
        data['article_id'] = data.apply(
            lambda row: row['article_id'] / data['article_id'].max(), axis=1)
        data = data.sort_values('unique_words_frequency')
        data = data.reset_index()
        data.drop('article_id', axis='columns', inplace=True)
        data.drop('index', axis='columns', inplace=True)
        data.plot()
        plt.show()

    @staticmethod
    def plot_unique_words_analysis(analyzer, is_frequency_analysis=False):
        """Построение графика анализа уникальных слов"""
        if not is_frequency_analysis:
            data = pd.read_csv(
                analyzer.config['articles_unique_words_analysis_file'])
        else:
            data = pd.read_csv(
                analyzer.
                config['articles_unique_words_analysis_file_with_frequency'])
        data.plot(x='count_unique_words_frequency',
                  y='count_unique_words_in_corpus',
                  kind='scatter')
        plt.show()
        plt.hist(data.count_unique_words_frequency,
                 weights=data.count_unique_words_in_corpus)
        plt.show()

    @staticmethod
    def plot_most_popular_words_analysis(analyzer):
        """Построение графика частотности самых популярных во всем корпусе слов"""
        data = pd.read_csv(analyzer.config['most_popular_words_analysis_file'])
        plt.hist(data.word_count, weights=data.frequency)
        plt.show()
        data.plot(x='word', y='frequency', kind='scatter', figsize=(50, 7))
        plt.xticks(rotation=60)
        plt.show()

    @staticmethod
    def remove_chars_from_text(text, chars):
        return ''.join([ch for ch in text if ch not in chars])

    @staticmethod
    def convert_vector_from_string_value(vector):
        return list(map(lambda value: float(value), vector))

    @staticmethod
    def get_euclidean_distance(vector1, vector2):
        if len(vector1) != len(vector2):
            raise ConsultantPlusAnalyzerException(
                'It is not possible to compare vectors of different dimensions'
            )
        v1 = np.array(vector1)
        v2 = np.array(vector2)
        return np.linalg.norm(v1 - v2)

    @staticmethod
    def get_unique_article_identifier(codex_type, article_id):
        return codex_type + '_' + str(article_id)
Example #7
    logger.info("=====")
    logger.info(
        f"Average precision value for all words: {float(np.mean([x[0] for x in scores])):.3f} "
        f"(+/- {np.std([x[0] for x in scores]) * 2:.3f})")
    logger.info(
        f"Average recall value for all words: {float(np.mean([x[1] for x in scores])):.3f} "
        f"(+/- {np.std([x[1] for x in scores]) * 2:.3f})")
    logger.info(
        f"Average F1 value for all words: {float(np.mean([x[2] for x in scores])):.3f}"
        f"(+/- {np.std([x[2] for x in scores]) * 2:.3f})")
    return scores


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg("--input",
        "-i",
        help="Path to tab-separated file with WSD data",
        required=True)
    arg("--elmo", "-e", help="Path to ELMo model", required=True)

    args = parser.parse_args()
    data_path = args.input

    model = ElmoModel()
    model.load(args.elmo)

    eval_scores = classify(data_path, model)
    with open(data_path, 'r') as f:
        for line in f:
            res = line.strip()
            raw_sentences.append(res)
            if len(raw_sentences) > max_sentences:
                break
    sentences = [s.split()[:100] for s in raw_sentences]

    print('=====')
    print(f'{len(sentences)} sentences total')
    print('=====')

    model = ElmoModel()

    model.load(args.elmo, top=False)

    # Actually producing ELMo embeddings for our data:

    elmo_vectors = model.get_elmo_vectors(sentences)

    print('ELMo embeddings for your input are ready')
    print(f'Tensor shape: {elmo_vectors.shape}')

    # Due to batch processing, the above code produces for each sentence
    # the same number of token vectors, equal to the length of the longest sentence
    # (the 2nd dimension of the elmo_vector tensor).
    # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
    # Let's make a version without these redundant vectors:
    cropped_vectors = []
    for vect, sent in zip(elmo_vectors, sentences):
        # Keep only the vectors for real tokens, dropping the zero padding
        cropped_vectors.append(vect[: len(sent), :])

    print(f"Precision: {scores[0]:.3f}")
    print(f"Recall: {scores[1]:.3f}")
    print(f"F1: {scores[2]:.3f}")

    print("Random choice scores:")
    print("=====")
    print(f"Precision: {dummy_scores[0]:.3f}")
    print(f"Recall: {dummy_scores[1]:.3f}")
    print(f"F1: {dummy_scores[2]:.3f}")
    return scores


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg("--input",
        "-i",
        help="Path to tab-separated file with input data",
        required=True)
    arg("--elmo", "-e", required=True, help="Path to ELMo model")
    arg("--batch", "-b", type=int, help="Max batch size", default=300)

    args = parser.parse_args()
    data_path = args.input
    max_batch_size = args.batch

    emb_model = ElmoModel()
    emb_model.load(args.elmo, top=False, max_batch_size=max_batch_size)

    eval_scores = classify(data_path, elmo=emb_model)
Example #10
import math
import json
import string

from simple_elmo import ElmoModel

model = ElmoModel()

model.load("199")
#not_specified_string="не указано"
#not_specified_string_in="неуказано"

key_vectors = []

key_lengths = []
key_words = []
keys = []
keys_grouped = []


def calculate_keys(keysIn):
    global key_vectors, key_words, key_lengths, keys_grouped, keys
    keys_grouped = keysIn
    for key_group in keys_grouped:
        #key_group.append(not_specified_string)
        for key in key_group:
            keys.append(key)
            key_words.append(sentence_to_words(key))
            key_lengths.append(len(key_words[len(key_words) - 1]))
    key_vectors = []
    #key_vectors_pre=model.get_elmo_vectors_shit(key_words, layers="average")
Example #11
    logger.info("Counting occurrences...")

    wordcount = 0
    with open(data_path, "r") as corpus:
        for line in corpus:
            res = line.strip().split()[:WORD_LIMIT]
            for word in res:
                if word in vect_dict:
                    vect_dict[word] += 1
                    wordcount += 1
    logger.info(f"Total occurrences of target words: {wordcount}")
    logger.info(vect_dict)

    # Loading a pre-trained ELMo model:
    model = ElmoModel()
    model.load(args.elmo, max_batch_size=batch_size)

    vect_dict = {
        word: np.zeros((int(vect_dict[word]), model.vector_size))
        for word in vect_dict
    }
    target_words = set(vect_dict)

    counters = {w: 0 for w in vect_dict}

    # Actually producing ELMo embeddings for our data:
    start = time.time()

    CACHE = 12800

    lines_processed = 0
Example #12
import sys
from simple_elmo import ElmoModel
from blimp_utils import get_ppl
from spacy.lang.en import English

model = ElmoModel()
model.load(sys.argv[1], max_batch_size=1, full=True)

tokenizer = English().tokenizer

pairs = [("Could that window ever shut?", "That window could ever shut."),
         ("could that window ever shut?", "that window could ever shut."),
         ("Could that window ever shut.", "That window could ever shut?"),
         ("Can it mean something?", "Piece does mean something."),
         ("Has the river ever frozen?", "The river has ever frozen."),
         ("Is it really good?", "It is really good?"),
         ("Is it really good.", "It is really good.")]

print("=========================")
for pair in pairs:
    good, bad = pair
    print(good)
    print(get_ppl(model.get_elmo_substitutes([" ".join([token.text for token in tokenizer(good)])],
                                             topn=model.vocab.size)[0], "bidirectional"))
    print(bad)
    print(get_ppl(model.get_elmo_substitutes([" ".join([token.text for token in tokenizer(bad)])],
                                             topn=model.vocab.size)[0], "bidirectional"))
    print("=========================")


Example #13
)

# Contextualized models:
contextualized = config.getboolean("Token", "use_contextualized")
if contextualized:
    import tensorflow as tf
    from simple_elmo import ElmoModel

    token_model_file = config.get("Token", "token_model")
    type_model_file = config.get("Token", "type_model")
    frequency_file = config.get("Token", "freq_file")
    type_model = gensim.models.KeyedVectors.load_word2vec_format(type_model_file, binary=True)
    graph = tf.compat.v1.get_default_graph()
    with graph.as_default():
        token_model = ElmoModel()
        token_model.load(token_model_file)
    elmo_frequency = {}
    for line in open(frequency_file, "r"):
        if "\t" not in line:
            elmo_frequency["corpus_size"] = int(line.strip())
        else:
            (external_word, frequency) = line.strip().split("\t")
            elmo_frequency[external_word] = int(frequency)

our_models = {}
with open(root + config.get("Files and directories", "models"), "r") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="\t")
    for row in reader:
        our_models[row["identifier"]] = {}
        our_models[row["identifier"]]["path"] = row["path"]
        our_models[row["identifier"]]["default"] = row["default"]
Example #14
if __name__ == "__main__":

    # Process only the first k sentences
    max_sentences = 100

    raw_sentences = ["Привет, мир!", "Надо отправить сообщение. В цехе до сих пор не была установлена седьмая центрифуга, а по плану мы должны уже на ней работать"]

    sentences = [s.split()[:100] for s in raw_sentences]

    print("=====")
    print(f"{len(sentences)} sentences total")
    print("=====")

    model = ElmoModel()

    model.load("212")

    # Actually producing ELMo embeddings for our data:
    start = time.time()
    elmo_vectors = model.get_elmo_vectors(sentences, layers="average")
    end = time.time()

    processing_time = int(end - start)

    print(f"ELMo embeddings for your input are ready in {processing_time} seconds")
    print(f"Tensor shape: {elmo_vectors.shape}")

    # Due to batch processing, the above code produces for each sentence
    # the same number of token vectors, equal to the length of the longest sentence
    # (the 2nd dimension of the elmo_vector tensor).
    # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
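    # The cropping step itself is missing at this point in the excerpt; a minimal
    # sketch, assuming the variable names used above:
    cropped_vectors = [vect[: len(sent), :] for vect, sent in zip(elmo_vectors, sentences)]
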
    wordcount = 0
    with open(data_path, "r") as corpus:
        for line in corpus:
            res = line.strip().split()[:WORD_LIMIT]
            for word in res:
                if word in lemma_targets:
                    # targets[word] += 1
                    wordcount += 1
    logger.info(f"Total occurrences of target words: {wordcount}")
    logger.info(lemma_targets)

    targets = set(lemma_targets)

    model = ElmoModel()

    model.load(args.elmo, full=True, max_batch_size=args.batch)

    target_substitutes = {w: [] for w in targets}

    start = time.time()
    CACHE = 1000
    lines_processed = 0
    lines_cache = []

    with open(data_path, "r") as dataset:
        for line in dataset:
            res = line.strip().split()[:WORD_LIMIT]
            if targets & set(res):
                lines_cache.append(" ".join(res))
            lines_processed += 1
            if len(lines_cache) == CACHE:
from joblib import load
import pandas as pd
from simple_elmo import ElmoModel
import csv

test = pd.read_csv('../data/test.csv', delimiter=',', index_col=0)

elmo = ElmoModel()
elmo.load('../data/195.zip')


# Tokenize each document so every row gets its own averaged vector (as in the training script)
vectors = elmo.get_elmo_vector_average([t.split(' ') for t in test['text'].tolist()])

model = load('../data/advanced_kmeans.joblib')
clusters = model.predict(vectors)

test['cluster'] = pd.DataFrame(clusters)
test.to_csv('../data/test.csv', index=True, quoting=csv.QUOTE_ALL)