Example no. 1
def get_tm_index(**kwargs):
    from util.service_es import search
    from nlpmonitor.settings import ES_CLIENT
    name = kwargs['name']
    index_tm = kwargs['index_tm']

    # Check if already exists
    if ES_CLIENT.indices.exists(index_tm):
        query = {
            "name": name,
        }
        if 'perform_actualize' in kwargs:
            query['is_ready'] = True

        s = search(ES_CLIENT, index_tm, query, source=[], get_search_obj=True)
        s = s.filter('exists', field="number_of_topics")
        s = s.execute()
        if s:
            return s[-1]
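        # Fall back to an exact match on the "name.keyword" sub-field if the analyzed "name" field finds nothing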
        query = {
            "name.keyword": name,
        }
        if 'perform_actualize' in kwargs:
            query['is_ready'] = True

        s = search(ES_CLIENT, index_tm, query, source=[], get_search_obj=True)
        s = s.filter('exists', field="number_of_topics")
        s = s.execute()
        if s:
            return s[-1]
    raise TMNotFoundException("Topic Modelling index not found!")
Example no. 2
def validator(mappings_dict, client, index_theta_one, index_theta_two, datetime_from_tm_2, datetime_to_tm_1,
              number_of_topics):
    """
    pass
    """
    from sklearn.preprocessing import MinMaxScaler
    from nltk.metrics import jaccard_distance
    scaler = MinMaxScaler()
    scores = dict(zip(mappings_dict.keys(), [0] * len(mappings_dict)))
    scores_for_normalization = []
    for threshhold, map_dict in mappings_dict.items():
        cnt_matches_for_threshhold = 0
        for topic_parent, topic_childs_list in map_dict.items():

            theta_1 = search(client=client, index=index_theta_one,
                             query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                    'topic_id': topic_parent, 'topic_weight__gte': 0.05},
                             source=['document_es_id'],
                             start=0,
                             end=1000000,
                             get_scan_obj=True
                             )
            scanned_parent = set([elem.document_es_id for elem in theta_1])

            for topic_child in topic_childs_list:
                theta_2 = search(client=client, index=index_theta_two,
                                 query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                        'topic_id': topic_child, 'topic_weight__gte': 0.05},
                                 source=['document_es_id'],
                                 start=0,
                                 end=1000000,
                                 get_scan_obj=True
                                 )
                jaccard_score = 1 - jaccard_distance(scanned_parent, set([elem.document_es_id for elem in theta_2]))

                scores[threshhold] += jaccard_score
                cnt_matches_for_threshhold += 1
        try:
            avg_score = scores[threshhold] / cnt_matches_for_threshhold

            scores_for_normalization.append(avg_score)
            scores[threshhold] = [len(map_dict) / number_of_topics, avg_score]

        except ZeroDivisionError:
            scores[threshhold] = [len(map_dict) / number_of_topics, 0]

    scores_normalized = [score[0] for score in scaler.fit_transform(np.array(scores_for_normalization).reshape(-1, 1))]

    for i, items in enumerate(scores.items()):
        scores[items[0]] += [scores_normalized[i]]

    return scores
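A hedged usage sketch for validator; the index names, dates, and topic ids below are illustrative assumptions, not values taken from the project:

# Hypothetical call (all concrete values are made up): map parent topics of the earlier
# topic model onto candidate child topics of the later one at two thresholds.
from nlpmonitor.settings import ES_CLIENT

mappings_dict = {
    "0.3": {0: [1, 5], 2: [3]},
    "0.5": {0: [1]},
}
scores = validator(mappings_dict,
                   client=ES_CLIENT,
                   index_theta_one="dynamic_topic_document_tm_1",  # assumed index name
                   index_theta_two="dynamic_topic_document_tm_2",  # assumed index name
                   datetime_from_tm_2="2019-01-01",
                   datetime_to_tm_1="2019-06-01",
                   number_of_topics=200)
# Each scores[threshold] ends up as [share_of_mapped_topics, avg_jaccard, normalized_avg_jaccard]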
Example no. 3
def pool_embeddings(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from util.service_es import search

    start = kwargs['start']
    end = kwargs['end']
    corpus = kwargs['corpus']
    from_embedding_name = kwargs['from_embedding_name']
    from_embedding_by_unit = kwargs['from_embedding_by_unit']
    to_embedding_name = kwargs['to_embedding_name']
    to_embedding_by_unit = kwargs['to_embedding_by_unit']
    pooling = kwargs['pooling']

    # Get embedding object
    query = {
        "corpus": corpus.lower(),
        # "is_ready": False, # TODO Uncomment
        "name": to_embedding_name.lower(),
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    number_of_documents = embedding['number_of_documents']

    # Get documents
    documents = search(ES_CLIENT,
                       ES_INDEX_DOCUMENT, {"corpus": corpus.lower()},
                       start=int(start / 100 * number_of_documents),
                       end=int(end / 100 * number_of_documents),
                       source=['id', from_embedding_name],
                       sort=['id'])

    embeddings_to_write = []
    documents_to_write = []
    batch_size = 10000
    # Pooling
    for document in documents:
        embeddings_to_write.append([])
        pool_document(document, embeddings_to_write, documents_to_write,
                      from_embedding_name, to_embedding_by_unit,
                      from_embedding_by_unit, pooling)
        # Update to ES
        if len(embeddings_to_write) >= batch_size:
            persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT,
                                     documents_to_write, embeddings_to_write,
                                     to_embedding_name)
            embeddings_to_write = []
            documents_to_write = []
    persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT, documents_to_write,
                             embeddings_to_write, to_embedding_name)
Example no. 4
def generate_meta_dtm(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_META_DTM
    from mainapp.documents import META_DTM
    from util.service_es import search
    from elasticsearch_dsl import Index

    meta_name = kwargs['meta_dtm_name']
    volume_days = kwargs['tm_volume_days']
    delta_days = kwargs['delta_days']
    reset_index = kwargs['reset_index']
    from_date = kwargs['from_date']
    to_date = kwargs['to_date']

    if reset_index:
        Index(ES_INDEX_META_DTM).delete(using=ES_CLIENT, ignore=404)

    if not ES_CLIENT.indices.exists(ES_INDEX_META_DTM):
        ES_CLIENT.indices.create(index=ES_INDEX_META_DTM,
                                 body={
                                     "settings": META_DTM.Index.settings,
                                     "mappings": META_DTM.Index.mappings
                                 })

    s = search(client=ES_CLIENT,
               index=ES_INDEX_META_DTM,
               query={
                   'meta_name': meta_name,
                   'volume_days': volume_days,
                   'delta_days': delta_days,
                   'from_date': from_date,
                   'to_date': to_date
               })

    if s:
        ES_CLIENT.update(index=ES_INDEX_META_DTM,
                         id=s[-1].meta.id,
                         body={
                             "doc": {
                                 "meta_name": meta_name,
                                 "volume_days": volume_days,
                                 "delta_days": delta_days,
                                 'from_date': from_date,
                                 'to_date': to_date,
                                 'reset_index': reset_index
                             }
                         })
    else:
        index = META_DTM(
            **{
                "meta_name": meta_name,
                "volume_days": volume_days,
                "delta_days": delta_days,
                'from_date': from_date,
                'to_date': to_date,
                'reset_index': reset_index
            })
        index.save()

    return 'META DTM GENERATED'
Example no. 5
def init_dictionary_index(**kwargs):
    from elasticsearch_dsl import Search, Index

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD
    from mainapp.documents import Dictionary, DictionaryWord

    from util.service_es import search

    name = kwargs['name']
    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}", using=ES_CLIENT)
    es_index.delete(ignore=404)
    settings = DictionaryWord.Index.settings
    ES_CLIENT.indices.create(
        index=f"{ES_INDEX_DICTIONARY_WORD}_{name}",
        body={
            "settings": settings,
            "mappings": DictionaryWord.Index.mappings
        }
    )

    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp", using=ES_CLIENT)
    es_index.delete(ignore=404)
    settings = DictionaryWord.Index.settings
    ES_CLIENT.indices.create(
        index=f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp",
        body={
            "settings": settings,
            "mappings": DictionaryWord.Index.mappings
        }
    )

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=kwargs['corpuses'])
    number_of_documents = s.count()

    kwargs['corpuses'] = ",".join(kwargs['corpuses'])
    # Check if already exists
    if ES_CLIENT.indices.exists(ES_INDEX_DICTIONARY_INDEX):
        query = {
            "corpus": kwargs['corpuses'],
            "name": kwargs['name']
        }
        if search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, query):
            return "Already exists"

    kwargs["number_of_documents"] = number_of_documents
    kwargs["is_ready"] = False
    dictionary = Dictionary(**kwargs)
    dictionary.save()
    return "Created"
Example no. 6
def es_etl(**kwargs):
    from util.service_es import search, update_generator
    from util.constants import BASE_DAG_DIR

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING

    stuff = kwargs['stuff']

    # Extract
    query = {
        "corpus": "main",
    }
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT, query)
    print("!!!", len(documents))

    # Transform
    for document in documents:
        # if 'num_views' in document:
        #     document.num_views += 1
        document.any_stuff = stuff
        document.literally_any_stuff = {
            "literally": [{
                "any_stuff": [1, 2, 3, 4, 5, 6]
            }]
        }
    print("!!!", list(documents[0].to_dict().keys()))
    print("!!!", documents[0].any_stuff)
    print("!!!", documents[0].literally_any_stuff)

    # Load
    from elasticsearch.helpers import streaming_bulk

    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT,
                                                      documents),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=1000,
                                     raise_on_error=True,
                                     max_retries=10):
        print(ok, result)
Example no. 7
def init_embedding_index(**kwargs):
    from elasticsearch_dsl import Search

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from mainapp.documents import EmbeddingIndex
    from util.service_es import search

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
    number_of_documents = s.count()

    # Check if already exists
    if ES_CLIENT.indices.exists(ES_INDEX_EMBEDDING):
        query = {
            "corpus": kwargs['corpus'],
            "name": kwargs['name'],
            "number_of_documents": number_of_documents,
        }
        if search(ES_CLIENT, ES_INDEX_EMBEDDING, query):
            return ("!!!", "Already exists")

    kwargs["number_of_documents"] = number_of_documents
    index = EmbeddingIndex(**kwargs)
    index.save()
Example no. 8
def preprocessing_raw_data(**kwargs):
    import re
    import requests

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT

    from util.service_es import search, update_generator
    from util.util import is_word, is_kazakh

    start = kwargs['start']
    end = kwargs['end']

    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents_kz", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")

    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['text'],
               sort=['id'],
               get_search_obj=True)
    s = s.exclude('exists', field="is_kazakh")
    s = s[int(start / 100 *
              number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()

    print('!!! len docs', len(documents))
    for doc in documents:
        if not is_kazakh(doc.text):
            doc['is_kazakh'] = False
            continue
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                       doc.text).split()).split()
        ]
        result = ""
        for i in range(len(cleaned_doc) // 10000 + 1):
            req_text = ' '.join(cleaned_doc[i * 10000:(i + 1) * 10000])
            r = requests.get(f"http://apertium-flask:8005?text={req_text}")
            result += r.json()['result']
        doc['text_lemmatized_kz_apertium'] = result
        doc['is_kazakh'] = True

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT,
                                                      documents),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=5000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    return f"{documents_processed} Processed"
Example no. 9
def topic_modelling(**kwargs):
    import os
    import datetime

    import numpy as np
    import numba as nb
    from util.util import save_obj, load_obj
    from util.service_es import search
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DICTIONARY_WORD
    from .clustering_util import unique_clots, clots_binding, object_weighting1, cluster_weighting1, full_weighting1, \
        n_extra_objects, n_key_objects, save_clustering
    '''
    CLOT IN THE NEIGHBORHOOD
    Find the clot in the neighborhood of an object, defined by dm - the square matrix of pairwise
    distances from the analyzed object to all objects in its neighborhood.
    The maximal intra-cluster distance is specified by the parameter d1:
    none of the pairwise distances in the formed clots exceeds d1.
    start_ind - index of the object used as the starting point for growing the clot.
    Returns the indices of the objects included in the built clot.
    '''
    @nb.jit(nopython=True)
    def single_clot(dm, d1, start_ind):
        n = dm.shape[0]

        if start_ind < 0 or start_ind > n - 1:
            raise ValueError('start_ind is out of bounds')

        R = np.array([start_ind])
        C = np.delete(np.arange(n), start_ind)

        while len(C) > 0:
            C = C[np.sum(dm[R][:, C] <= d1, axis=0) == len(R)]

            if len(C) > 0:
                dist_sum = np.sum(dm[R][:, C], axis=0)

                best_ind = np.argsort(dist_sum)[0]

                R = np.append(R, C[best_ind])

                C = np.delete(C, best_ind)

        return R
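
    # A hedged sanity check of single_clot on a hand-made 4x4 distance matrix
    # (values are illustrative only): with d1 = 1.0 the clot grown from index 0
    # contains the mutually close points {0, 1, 2} and excludes index 3.
    # dm_demo = np.array([[0.0, 0.5, 0.9, 3.0],
    #                     [0.5, 0.0, 0.8, 3.1],
    #                     [0.9, 0.8, 0.0, 2.9],
    #                     [3.0, 3.1, 2.9, 0.0]], dtype=np.float32)
    # assert set(single_clot(dm_demo, 1.0, 0)) == {0, 1, 2}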

    '''
    CLOTS FOR ALL OBJECTS
    Start the process of finding a clot for every object in its neighbourhood,
    defined by a circle centered at the considered object with radius d2.
    The maximal pairwise intra-clot distance is specified by the parameter d1.
    D - symmetric square matrix of pairwise distances between objects.
    Each clot is defined by the indices of the included objects.
    use_medoid - if True, the medoids are used as initial growing points; otherwise,
    the circle centers are used.
    Returns an array where the i-th element is the clot in the neighbourhood of the i-th object.
    '''

    @nb.njit(parallel=True)
    def all_clots(D, d1, d2, use_medoid=True):
        n = D.shape[0]
        global_inds = np.arange(n)

        clots = [np.array([0])] * n

        for i in nb.prange(n):
            local_inds = global_inds[D[i] <= d2]

            if len(local_inds) > 0:

                dm = D[local_inds][:, local_inds]

                if use_medoid:
                    start_ind = np.argmin(np.sum(dm, axis=0))
                else:
                    start_ind = np.where(local_inds == i)[0][0]

                clot = single_clot(dm, d1, start_ind)

                if len(clot) > 0:
                    clots[i] = local_inds[clot]
                else:
                    clots[i] = np.empty(0, dtype=nb.int64)
            else:
                clots[i] = np.empty(0, dtype=nb.int64)

        return clots

    print("!!!", "Initial stuff", datetime.datetime.now())
    max_dict_size = 10000000
    if 'max_dict_size' in kwargs:
        max_dict_size = kwargs['max_dict_size']
    name = kwargs['name']
    d1 = kwargs['d1']
    d2 = kwargs['d2']
    d3 = kwargs['d3']
    min_clot_size = kwargs['min_clot_size']
    use_medoid = kwargs['use_medoid']

    dictionary_words = search(ES_CLIENT,
                              ES_INDEX_DICTIONARY_WORD,
                              query=kwargs['dictionary_filters'],
                              source=("word_normal", ),
                              sort=('_id', ),
                              get_search_obj=True,
                              end=max_dict_size)
    dictionary_words.aggs.bucket('unique_word_normals',
                                 'terms',
                                 field='word_normal.keyword')
    vocab = [
        dw.key for dw in
        dictionary_words.execute().aggregations.unique_word_normals.buckets
    ]

    data_folder = os.path.join(BASE_DAG_DIR, "mussabayev_tm_temp", name)
    distance_matrix = np.array(load_obj(
        os.path.join(data_folder, 'distance_matrix.pkl')),
                               dtype=np.float32)
    cooccurrence_matrix = load_obj(
        os.path.join(data_folder, 'cooc_sparse_matrix.pkl'))
    matrix_dimensions = distance_matrix.shape[0]

    print("!!!", "Start all_clots", datetime.datetime.now())
    # Start the clot search for each object
    # Increasing d1 enlarges the search neighbourhood, the number and size of the resulting clots,
    # the computation time and the amount of memory used
    a_clots = all_clots(distance_matrix, d1, d2, use_medoid)
    print('!!!', 'All clot count: ' + str(len(a_clots)))

    print("!!!", "Start unique_clots", datetime.datetime.now())
    # Keep only the unique clots
    clots = unique_clots(a_clots, min_clot_size)
    save_obj(clots, os.path.join(data_folder, 'clots.pkl'))
    print('!!!', 'Count of unique clots: ' + str(len(clots)))

    print("!!!", "Start clots_binding", datetime.datetime.now())
    # Bind overlapping clots into single clusters
    clusters = clots_binding(clots, d3, -1)
    print('!!!', 'Cluster count: ' + str(len(clusters)))

    print("!!!", "Start object_weighting1", datetime.datetime.now())
    # Weight the objects inside each cluster
    # Method 1: based on mutual distances
    object_weights1 = object_weighting1(clusters, distance_matrix)
    # For each cluster pick the given number nk of key objects
    # with the highest weight coefficients
    nk = 3
    key_objects1 = n_key_objects(nk, clusters, object_weights1)

    # Compute cluster-correspondence coefficients for the full set of objects over the whole set
    # of clusters, i.e. for every object of the full set we obtain a coefficient of correspondence
    # to each of the resulting clusters
    # For each cluster pick the given number ne of extra objects (candidates for inclusion in the cluster)
    ne = 3
    # Method 1: based on mutual distances
    full_weights1 = full_weighting1(clusters, distance_matrix)

    # Based on these coefficients, select the ne objects with the highest correspondence to a cluster
    # among those not yet included in that cluster
    extra_objects1 = n_extra_objects(ne, clusters, full_weights1)

    print("!!!", "Start cluster_weighting1", datetime.datetime.now())
    # Weight each of the resulting clusters by how well it corresponds to the full original set of objects
    # Each cluster is assigned a weight coefficient reflecting how well the objects included in the cluster
    # agree with the full set of all objects
    cluster_weights1 = cluster_weighting1(clusters,
                                          distance_matrix)  # Method 1

    save_clustering(clusters, cluster_weights1, object_weights1, key_objects1,
                    extra_objects1, vocab,
                    os.path.join(data_folder, "result_example.txt"))

    return f"Dictionary len={matrix_dimensions}, documents_len={'???TODO'}"
Example no. 10
def generate_cooccurrence_codistance(**kwargs):
    import os
    import datetime

    import numpy as np
    from util.util import save_obj
    from util.constants import BASE_DAG_DIR
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import pairwise_distances
    from util.service_es import search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_WORD

    max_dict_size = 30000
    if 'max_dict_size' in kwargs:
        max_dict_size = kwargs['max_dict_size']

    dictionary_words = search(ES_CLIENT,
                              ES_INDEX_DICTIONARY_WORD,
                              query=kwargs['dictionary_filters'],
                              source=("word_normal", ),
                              sort=('_id', ),
                              get_search_obj=True,
                              start=0,
                              end=10)
    dictionary_words.aggs.bucket('unique_word_normals',
                                 'terms',
                                 field='word_normal.keyword',
                                 size=max_dict_size)
    dictionary_words = dictionary_words.execute()
    documents_scan = search(ES_CLIENT,
                            ES_INDEX_DOCUMENT,
                            query=kwargs['document_filters'],
                            source=("text_lemmatized", ),
                            get_scan_obj=True,
                            end=5000000)

    print("!!!", "Start count_vectorizing", datetime.datetime.now())
    vectorizer = CountVectorizer(vocabulary=(
        dw.key
        for dw in dictionary_words.aggregations.unique_word_normals.buckets))
    documents_vectorized = vectorizer.fit_transform(
        (d.text_lemmatized for d in documents_scan))

    print("!!!", "Start dot product for coocurance matrix",
          datetime.datetime.now())
    coocurance_matrix = documents_vectorized.T.dot(
        documents_vectorized).astype(np.uint32)
    print("!!!", "Saving coocurance matrix", datetime.datetime.now())
    data_folder = os.path.join(BASE_DAG_DIR, "mussabayev_tm_temp")
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    data_folder = os.path.join(data_folder, kwargs['name'])
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    save_obj(coocurance_matrix,
             os.path.join(data_folder, 'cooc_sparse_matrix.pkl'))

    print("!!!", "Start distance matrix calc", datetime.datetime.now())
    distance_matrix = pairwise_distances(coocurance_matrix,
                                         metric='cosine',
                                         n_jobs=4)
    print("!!!", "Save distance matrix ", datetime.datetime.now())
    save_obj(distance_matrix, os.path.join(data_folder, 'distance_matrix.pkl'))

    return f"Dictionary len={len(vectorizer.vocabulary_.keys())}, documents_len={documents_vectorized.shape[0]}"
Example no. 11
def preprocessing_raw_data(**kwargs):
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']

    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents_eng", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")

    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['id', 'text'],
               sort=['id'],
               get_search_obj=True)
    s = s.exclude('exists', field="is_english")

    stopwords = set(
        get_stop_words('ru') + get_stop_words('en') +
        stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        cleaned_doc = [
            x.lower() for x in ' '.join(
                re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                       doc.text).split()).split()
            if not x in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            try:
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                result += list(getAllLemmasOOV(
                    word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
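
    # NOTE: the snippet appears to stop before persisting the results. A hedged sketch of the
    # write-back step that presumably follows, mirroring the streaming_bulk pattern used by the
    # other lemmatization examples in this collection:
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=5000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
    return f"{success} documents processed"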
Example no. 12
def persist_embeddings(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from mainapp.models import Corpus, Document
    from preprocessing.models import ProcessedCorpus, ProcessedDocument, AnalysisUnit

    from elasticsearch_dsl import Search
    from util.service_es import search

    corpus = kwargs['corpus']
    embedding_name = kwargs['embedding_name']
    by_unit = kwargs['by_unit']
    type_unit_int = kwargs['type_unit_int']
    algorithm = kwargs['algorithm']
    pooling = kwargs['pooling']
    description = kwargs['description']

    # Update embedding object to is_ready
    query = {
        "corpus": corpus.lower(),
        "name": embedding_name.lower(),
        # "is_ready": False,  # TODO uncomment
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    ES_CLIENT.update(index=ES_INDEX_EMBEDDING,
                     id=embedding.meta.id,
                     body={"doc": {
                         "is_ready": True
                     }})

    # Init processedCorpus
    pcs = ProcessedCorpus.objects.filter(
        corpus=Corpus.objects.get(name=corpus), name=embedding_name)
    if pcs.exists():
        for pc in pcs:
            pc.delete()
    pc = ProcessedCorpus.objects.create(corpus=Corpus.objects.get(name=corpus),
                                        name=embedding_name,
                                        description=description)

    s = Search(using=ES_CLIENT,
               index=ES_INDEX_DOCUMENT).source(['id', embedding_name
                                                ]).filter("term",
                                                          corpus=corpus)

    def persist(batch_docs, batch_units, type):
        batch_docs = ProcessedDocument.objects.bulk_create(batch_docs)
        batch_units_objs = []
        batch_size = 10000
        for doc, embs in zip(batch_docs, batch_units):
            ind = 0
            for emb in embs:
                batch_units_objs.append(
                    AnalysisUnit(type=type,
                                 processed_document=doc,
                                 value=emb[by_unit],
                                 index=ind,
                                 embedding=emb['values']))
                ind += 1
            if len(batch_units_objs) >= batch_size:
                AnalysisUnit.objects.bulk_create(batch_units_objs)
                batch_units_objs = []
        AnalysisUnit.objects.bulk_create(batch_units_objs)

    batch_size = 10000
    batch_docs = []
    batch_units = []
    for document in s.scan():
        batch_docs.append(
            ProcessedDocument(processed_corpus=pc,
                              original_document_id=document.id))
        embeddings = document[embedding_name]
        document_embeddings = []
        if type_unit_int in [0, 1, 2]:
            for sent in embeddings:
                for token in sent:
                    document_embeddings.append({
                        by_unit: token[by_unit],
                        "values": token.layers[0].values
                    })
        elif type_unit_int in [3, 4]:
            for elem in embeddings:
                document_embeddings.append({
                    by_unit: elem[by_unit],
                    "values": elem.layers[0].values
                })
        elif type_unit_int in [5]:
            document_embeddings.append({
                by_unit: embeddings[by_unit],
                "values": embeddings.layers[0].values
            })
        else:
            raise Exception("Unknown Unit_by type")
        batch_units.append(document_embeddings)
        if len(batch_docs) >= batch_size:
            persist(batch_docs, batch_units, type_unit_int)
            batch_docs = []
            batch_units = []
    persist(batch_docs, batch_units, type=type_unit_int)
Example no. 13
def generate_dictionary_batch(**kwargs):
    import datetime
    import re

    from elasticsearch.helpers import streaming_bulk
    from stop_words import get_stop_words
    from nltk.corpus import stopwords

    from util.util import is_kazakh, is_latin
    from util.service_es import search

    from nlpmonitor.settings import ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD, ES_CLIENT

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    name = kwargs['name']
    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    corpuses = kwargs['corpuses']
    max_n_gram_len = kwargs['max_n_gram_len']
    min_relative_document_frequency = kwargs['min_relative_document_frequency']
    field_to_parse = kwargs['field_to_parse']

    query = {
        "name": name,
        "is_ready": False,
    }
    dictionary = search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, query)[-1]
    number_of_documents = dictionary.number_of_documents
    if not number_of_documents:
        raise Exception("No variable!")

    print("!!!", "Getting documents from ES", datetime.datetime.now())
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT,
                       query={"corpus": corpuses},
                       source=[field_to_parse, 'id'],
                       sort=['id'],
                       get_search_obj=True,
                       )
    documents = documents.filter("exists", field=field_to_parse)
    number_of_documents = documents.count()

    # stopwords = set(get_stop_words('ru') + get_stop_words('en') + stopwords.words('english'))
    dictionary_words = {}
    print("!!!", "Iterating through documents", datetime.datetime.now())
    for i, doc in enumerate(documents.params(raise_on_error=False).scan()):
        if i % 100_000 == 0:
            print(f"Processed {i} documents")
            print(f"Dictionary length is {len(dictionary_words)}")
        if int(doc.id) % total_proc != process_num:
            continue
        if len(doc[field_to_parse]) == 0:
            print("!!! WTF", doc.meta.id)
            continue
        if is_kazakh(doc[field_to_parse]):
            continue
        word_in_doc = set()
        cleaned_words = [x for x in ' '.join(re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc[field_to_parse]).split()).split()]
        if is_latin(doc[field_to_parse]):
            lang = "eng"
        elif is_kazakh((doc[field_to_parse])):
            lang = "kaz"
        else:
            lang = "rus"
        for n_gram_len in range(1, max_n_gram_len + 1):
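            # Slide a window of length n_gram_len over the cleaned words to enumerate every n-gram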
            for n_gram in (cleaned_words[i:i + n_gram_len] for i in range(len(cleaned_words) - n_gram_len + 1)):
                word = "_".join(n_gram)
                is_first_upper = word[0].isupper()
                word = word.lower()
                # TEMP - DISABLED lemmatization
                # if lang == "eng":
                #     parse = lemmatize_eng(word)
                # elif lang == "kaz":
                #     continue # raise NotImplemented()
                # elif lang == "rus":
                #     parse = lemmatize_ru(word)
                # else:
                #     raise NotImplemented()
                if word not in dictionary_words:
                    dictionary_words[word] = {
                        "dictionary": name,
                        "word": word,
                        # "word_normal": parse["normal_form"],
                        "word_normal": word,
                        # "is_in_pymorphy2_dict": parse["is_known"],
                        "is_in_pymorphy2_dict": True,
                        # "is_multiple_normals_in_pymorphy2": parse["is_multiple_forms"],
                        "is_multiple_normals_in_pymorphy2": False,
                        # "is_stop_word": word in stopwords or parse["normal_form"] in stopwords,
                        "is_stop_word": False,
                        "is_latin": any([c in "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM" for c in word]),
                        "is_kazakh": any([c in "ӘәҒғҚқҢңӨөҰұҮүІі" for c in word]) or lang == "kaz",
                        "n_gram_len": n_gram_len,
                        # "pos_tag": parse["pos_tag"],
                        "pos_tag": "NA",
                        "word_len": len(word),
                        "word_frequency": 1,
                        "document_frequency": 1,
                        "word_first_capital_ratio": 1 if is_first_upper else 0,
                    }
                else:
                    dictionary_words[word]['word_frequency'] += 1
                    dictionary_words[word]['word_first_capital_ratio'] += 1 if word[0].isupper() else 0
                    if word not in word_in_doc:
                        dictionary_words[word]['document_frequency'] += 1
                word_in_doc.add(word)
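
    # NOTE: the snippet appears to stop before persisting the collected words. A hedged sketch of
    # the step that presumably follows: bulk-index the per-process dictionary into the temporary
    # per-dictionary index that aggregate_dicts (Example no. 14) later reads and merges.
    for ok, result in streaming_bulk(ES_CLIENT,
                                     dictionary_words.values(),
                                     index=f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp",
                                     chunk_size=10000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            print("!!! Failed to index dictionary word", result)
    return f"{len(dictionary_words)} unique words collected"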
Example no. 14
def aggregate_dicts(**kwargs):
    import datetime

    from util.service_es import search
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Index
    from nlpmonitor.settings import ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD, ES_CLIENT, ES_INDEX_DOCUMENT

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    name = kwargs['name']
    corpuses = kwargs['corpuses']

    min_relative_document_frequency = kwargs['min_relative_document_frequency']

    query = {
        "dictionary": name,
    }
    dictionary_scan = search(ES_CLIENT, f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp" , query, get_scan_obj=True)
    dictionary_index = search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, {"name": name})[-1]
    dictionary_words_final = {}
    dictionary_normal_words = {}
    print("!!!", "Iteration through scan", datetime.datetime.now())
    for word in dictionary_scan:
        key = word['word']
        key_normal = word['word_normal']
        if not key in dictionary_words_final:
            dictionary_words_final[key] = word.to_dict()
        else:
            dictionary_words_final[key]['word_frequency'] += word['word_frequency']
            dictionary_words_final[key]['word_first_capital_ratio'] += word['word_first_capital_ratio']
            dictionary_words_final[key]['document_frequency'] += word['document_frequency']

        if not key_normal in dictionary_normal_words:
            dictionary_normal_words[key_normal] = {
                "word_normal_frequency": word['word_frequency'],
                "word_normal_first_capital_ratio": word['word_first_capital_ratio'],
                "document_normal_frequency": word['document_frequency']
            }
        else:
            dictionary_normal_words[key_normal]['word_normal_frequency'] += word['word_frequency']
            dictionary_normal_words[key_normal]['word_normal_first_capital_ratio'] += word['word_first_capital_ratio']
            dictionary_normal_words[key_normal]['document_normal_frequency'] += word['document_frequency']

    print("!!!", "Forming final words dict", datetime.datetime.now())
    for key in dictionary_words_final.keys():
        dictionary_words_final[key]['word_normal_frequency'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['word_normal_frequency']
        dictionary_words_final[key]['word_normal_first_capital_ratio'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['word_normal_first_capital_ratio']
        dictionary_words_final[key]['document_normal_frequency'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['document_normal_frequency']

        dictionary_words_final[key]['word_first_capital_ratio'] /= \
            dictionary_words_final[key]['word_frequency']
        dictionary_words_final[key]['word_normal_first_capital_ratio'] /= \
            dictionary_words_final[key]['word_normal_frequency']

        dictionary_words_final[key]['word_frequency_relative'] = \
            dictionary_words_final[key]['word_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['word_normal_frequency_relative'] = \
            dictionary_words_final[key]['word_normal_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['document_frequency_relative'] = \
            dictionary_words_final[key]['document_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['document_normal_frequency_relative'] = \
            dictionary_words_final[key]['document_normal_frequency'] / dictionary_index.number_of_documents

    success = 0
    failed = 0
    print("!!!", "Writing to ES", datetime.datetime.now())
    len_dictionary = len(dictionary_words_final)
    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpuses).source([])[:0]
    number_of_documents = s.count()
    print("!!!", "Number of documents", number_of_documents)
    print("!!! Min documents threshold", number_of_documents * min_relative_document_frequency)
    dictionary_words_final = filter(lambda x: x['document_frequency'] > number_of_documents * min_relative_document_frequency, dictionary_words_final.values())
    for ok, result in streaming_bulk(ES_CLIENT, dictionary_words_final,
                                    index=f"{ES_INDEX_DICTIONARY_WORD}_{name}",
                                    chunk_size=1000, raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        else:
            success += 1
        if success % 1000 == 0:
            print(f"{success}/{len_dictionary} processed, {datetime.datetime.now()}")
        if failed > 3:
            raise Exception("Too many failed!!")
    ES_CLIENT.update(index=ES_INDEX_DICTIONARY_INDEX, id=dictionary_index.meta.id, body={"doc": {"is_ready": True}})
    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp", using=ES_CLIENT)
    es_index.delete(ignore=404)
    return success
Example no. 15
def generate_word_embeddings(**kwargs):
    import os
    import json
    import tempfile
    import subprocess

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from util.service_es import search
    from util.constants import BASE_DAG_DIR

    start = kwargs['start']
    end = kwargs['end']

    # Get embedding object
    query = {
        "corpus": "main",
        # "is_ready": False, # TODO Uncomment
        "name": WORD_EMBEDDING_NAME.lower(),
        "by_unit": "word",
        "algorithm": "BERT".lower(),
        "pooling": "None".lower(),
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    number_of_documents = embedding['number_of_documents']

    # Get documents
    documents = search(ES_CLIENT,
                       ES_INDEX_DOCUMENT, {"corpus": "main"},
                       start=int(start / 100 * number_of_documents),
                       end=int(end / 100 * number_of_documents),
                       source=['id', 'text'],
                       sort=['id'])

    # Embeddings themselves
    from textblob import TextBlob
    embeddings = []
    documents_to_write = []
    input_file_name = f"input-{start}-{end}.txt"
    output_file_name = f"output-{start}-{end}.json"
    batch_size = 10000
    with tempfile.TemporaryDirectory() as tmpdir:
        for document in documents:
            # Write to input.txt
            with open(os.path.join(tmpdir, input_file_name),
                      "w",
                      encoding='utf-8') as f:
                text = TextBlob(document.text)
                for sentence in text.sentences:
                    f.write(sentence.string.replace("\n", " ") + "\n")
            # Run bert
            subprocess.run([
                "python",
                f"{os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings', 'bert', 'extract_features.py')}",
                f"--input_file={os.path.join(tmpdir, input_file_name)}",
                f"--output_file={os.path.join(tmpdir, output_file_name)}",
                f"--vocab_file={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings','bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'vocab.txt')}",
                f"--bert_config_file={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings','bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'bert_config.json')}",
                f"--init_checkpoint={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings','bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'bert_model.ckpt')}",
                "--layers=-2", "--max_seq_length=128", "--batch_size=1000"
            ])

            # Read from output.json
            document_embeddings = []
            with open(os.path.join(tmpdir, output_file_name),
                      "r",
                      encoding='utf-8') as f:
                for line in f.readlines():
                    embedding = json.loads(line)
                    tokens = embedding['features']
                    words = []
                    # Pool tokens into words
                    cur_token = ""
                    cur_embed = []
                    for token in tokens[1:-1]:
                        token_str = token['token']
                        token_emb = token['layers'][0]['values']
                        if not cur_token and not cur_embed:
                            cur_token = token_str
                            cur_embed.append(token_emb)
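                        # BERT WordPiece continuation tokens start with "##"; merge them into the current word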
                        elif "##" in token_str:
                            cur_token += token_str.replace("##", "")
                            cur_embed.append(token_emb)
                        else:
                            cur_embed = pool_vectors(cur_embed, "Average")
                            words.append({
                                "layers": [{
                                    "values": cur_embed,
                                    "index": -2
                                }],
                                "word":
                                cur_token
                            })
                            cur_token = token_str
                            cur_embed = [token_emb]
                    if cur_token and cur_embed:
                        cur_embed = pool_vectors(cur_embed, "Average")
                        words.append({
                            "layers": [{
                                "values": cur_embed,
                                "index": -2
                            }],
                            "word":
                            cur_token
                        })
                    document_embeddings.append(words)
            embeddings.append(document_embeddings)
            documents_to_write.append(document)
            if len(embeddings) >= batch_size:
                persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT,
                                         documents_to_write, embeddings,
                                         WORD_EMBEDDING_NAME)
                embeddings = []
                documents_to_write = []
        persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT,
                                 documents_to_write, embeddings,
                                 WORD_EMBEDDING_NAME)
Example no. 16
def preprocess_data(**kwargs):
    """

    :param kwargs:
    :return:
    """
    import os
    import pickle

    import numpy as np

    from scipy.io import savemat
    from sklearn.feature_extraction.text import CountVectorizer

    from util.service_es import search
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT

    from .utils import split_bow, remove_empty, create_bow, create_doc_indices, create_list_words

    corpus = kwargs.get('corpus', 'main')
    test_size = kwargs.get('test_size', 0.1)

    max_df = 0.7
    min_df = 100  # choose desired value for min_df

    # Read data
    print('reading text file...')
    docs = search(client=ES_CLIENT,
                  index=ES_INDEX_DOCUMENT,
                  start=0,
                  end=1_000_000,
                  query={'corpus': corpus},
                  source=['text_lemmatized'],
                  get_scan_obj=True)
    docs = [doc.text_lemmatized for doc in docs]
    #  Create count vectorizer
    print('counting document frequency of words...')
    cvectorizer = CountVectorizer(min_df=min_df,
                                  max_df=max_df,
                                  stop_words=None)
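    # .sign() binarizes the counts, so each entry records word presence in a document rather than its count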
    cvz = cvectorizer.fit_transform(docs).sign()

    #  Get vocabulary
    print('building the vocabulary...')
    sum_counts = cvz.sum(axis=0)
    v_size = sum_counts.shape[1]
    sum_counts_np = np.zeros(v_size, dtype=int)
    for v in range(v_size):
        sum_counts_np[v] = sum_counts[0, v]
    word2id = dict([(w, cvectorizer.vocabulary_.get(w))
                    for w in cvectorizer.vocabulary_])
    del cvectorizer
    print('  initial vocabulary size: {}'.format(v_size))

    #  Split in train/test/valid
    print('tokenizing documents and splitting into train/test/valid...')
    num_docs = cvz.shape[0]

    #  Remove words not in train_data
    vocab = [word for word in word2id.keys()]
    print('  vocabulary after removing words not in train: {}'.format(
        len(vocab)))

    docs_tr = [[word2id[w] for w in docs[idx_d].split() if w in word2id]
               for idx_d in range(num_docs)]
    docs_ts = docs_tr[:int(num_docs * test_size)]

    del docs
    print('  number of documents (train): {} [this should be equal to {}]'.
          format(len(docs_tr), num_docs))

    # Getting lists of words and doc_indices
    print('creating lists of words...')

    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)

    # Get doc indices
    print('getting doc indices...')

    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)

    #  Remove empty documents
    print('removing empty documents...')

    docs_tr = remove_empty(docs_tr)
    docs_ts = remove_empty(docs_ts)

    # Number of documents in each set
    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)

    # Create bow representation
    print('creating bow representation...')

    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))

    # Save vocabulary to file
    path_save = os.path.join(BASE_DAG_DIR, 'etm_temp')
    if not os.path.isdir(path_save):
        os.system('mkdir -p ' + path_save)

    with open(os.path.join(path_save, 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)

    # Split bow intro token/value pairs
    print('splitting bow intro token/value pairs and saving to disk...')

    bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr)
    savemat(os.path.join(path_save, 'bow_tr_tokens.mat'),
            {'tokens': bow_tr_tokens},
            do_compression=True)
    savemat(os.path.join(path_save, 'bow_tr_counts.mat'),
            {'counts': bow_tr_counts},
            do_compression=True)

    bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts)
    savemat(os.path.join(path_save, 'bow_ts_tokens.mat'),
            {'tokens': bow_ts_tokens},
            do_compression=True)
    savemat(os.path.join(path_save, 'bow_ts_counts.mat'),
            {'counts': bow_ts_counts},
            do_compression=True)

    print('Data ready !!')
Example no. 17
def ngramize(**kwargs):
    import datetime

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD, \
        ES_INDEX_DICTIONARY_WORD

    from util.service_es import search, update_generator

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    corpus = kwargs['corpus']
    dict_name = kwargs['dict_name']
    source_field = kwargs['source_field']
    min_document_frequency_relative = kwargs['min_document_frequency_relative']
    max_n_gram_len = kwargs['max_n_gram_len']

    print("!!!", "Getting documents", datetime.datetime.now())
    documents = search(ES_CLIENT,
                       ES_INDEX_DOCUMENT,
                       query={},
                       source=(source_field, 'id'),
                       sort=('id', ),
                       get_search_obj=True)
    documents = documents.exclude('exists',
                                  field=f'text_ngramized_{dict_name}')
    documents = documents.filter('exists', field=source_field)
    documents = documents.filter('terms', corpus=corpus)

    print("!!!", "Getting dictionary", datetime.datetime.now())
    s = Search(using=ES_CLIENT,
               index=f"{ES_INDEX_DICTIONARY_WORD}_{dict_name}")
    s = s.filter(
        "range",
        document_frequency_relative={"gt": min_document_frequency_relative})
    s = s.filter("range", n_gram_len={"gte": 2})
    s = s.source(("word", ))
    dict_words = set(w.word for w in s.scan())
    print('!!! len dict', len(dict_words))

    print("!!!", "Processing documents", datetime.datetime.now())
    success = 0
    documents_to_process = []
    for doc in documents.params(raise_on_error=False).scan():
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        text_ngramized = doc[source_field]
        text_ngramized_split = text_ngramized.split()
        n_grams_to_append = []
        for n_gram_len in range(2, max_n_gram_len + 1):
            n_grams = [
                text_ngramized_split[i:i + n_gram_len]
                for i in range(len(text_ngramized_split) - n_gram_len + 1)
            ]
            for n_gram in n_grams:
                word = "_".join(n_gram)
                if word in dict_words:
                    n_grams_to_append.append(word)
        doc[f'text_ngramized_{dict_name}'] = text_ngramized + " " + " ".join(
            n_grams_to_append)
        documents_to_process.append(doc)
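
    # NOTE: the snippet appears to stop before persisting the results. A hedged sketch of the
    # write-back step that presumably follows, using the streaming_bulk/update_generator pattern
    # shared by the other examples in this collection:
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT, documents_to_process),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=1000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            failed += 1
    return f"{len(documents_to_process)} documents ngramized, {failed} failed"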
Example no. 18
def preprocessing_raw_data(**kwargs):
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from pymorphy2 import MorphAnalyzer
    from pymystem3 import Mystem
    from stop_words import get_stop_words

    from util.service_es import search, update_generator
    from util.util import is_latin, is_word

    start = kwargs['start']
    end = kwargs['end']

    number_of_documents = int(
        Variable.get("lemmatize_number_of_documents", default_var=None))
    if number_of_documents is None:
        raise Exception("No variable!")

    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['text'],
               sort=['id'],
               get_search_obj=True)
    s = s.query(~Q('exists', field="text_lemmatized_yandex")
                | ~Q('exists', field="text_lemmatized"))
    s = s[int(start / 100 *
              number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()

    print('!!! len docs', len(documents))
    stopwords_ru = set(get_stop_words('ru'))
    stopwords_eng = set(get_stop_words('en') + stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    morph = MorphAnalyzer()
    m = Mystem()

    s = Search(using=ES_CLIENT, index=ES_INDEX_CUSTOM_DICTIONARY_WORD)
    r = s[:1000000].scan()
    custom_dict = dict((w.word, w.word_normal) for w in r)

    for doc in documents:
        cleaned_doc = " ".join(x.lower() for x in ' '.join(
            re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                   doc.text).split()).split())
        if is_latin(cleaned_doc):
            cleaned_words_list = [
                lemmatizer.lemmatize(word) for word in cleaned_doc.split()
                if len(word) > 3 and word not in stopwords_eng
            ]
            doc['text_lemmatized_yandex'] = ""
        else:
            cleaned_words_list = [
                morph_with_dictionary(morph, word, custom_dict)
                for word in cleaned_doc.split()
                if len(word) > 2 and word not in stopwords_ru
            ]
            cwl_yandex = filter(
                lambda word: is_word(word) and len(word) > 2 and word not in
                stopwords_ru, m.lemmatize(cleaned_doc))
            cleaned_doc_yandex = " ".join(cwl_yandex)
            doc['text_lemmatized_yandex'] = cleaned_doc_yandex
        cleaned_doc = " ".join(cleaned_words_list)
        doc['text_lemmatized'] = cleaned_doc

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT,
                                                      documents),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=5000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
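    # known_counter, custom_dict_counter and not_in_dict_counter are assumed to be module-level
    # counters maintained by the morph_with_dictionary helper; they are not defined in this snippet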
    return f"{documents_processed} Processed, {known_counter} in pymorphie dict, {custom_dict_counter} in custom dict, {not_in_dict_counter} not found"
Example no. 19
def mapper(**kwargs):
    """
    идем в мета дтм, тянем оттуда дтм для которого хотим получить маппинги потом идем по этому meta_dtm_name тянем все
    топик моделлинги, формируем два листа набора слов, скорим

    """
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DYNAMIC_TOPIC_MODELLING, ES_INDEX_DYNAMIC_TOPIC_DOCUMENT
    from mainapp.documents import Mappings
    from util.service_es import search
    from util.util import parse_topics_field, mapper, validator
    import json

    meta_dtm_name = kwargs['meta_dtm_name']
    datetime_from_tm_1 = kwargs['datetime_from_tm_1']
    datetime_to_tm_1 = kwargs['datetime_to_tm_1']
    datetime_from_tm_2 = kwargs['datetime_from_tm_2']
    datetime_to_tm_2 = kwargs['datetime_to_tm_2']
    number_of_topics = kwargs['number_of_topics']
    theta_name_1 = ES_INDEX_DYNAMIC_TOPIC_DOCUMENT + "_" + kwargs['name_immutable'] + "_" + str(
        datetime_from_tm_1) + "_" + str(datetime_to_tm_1)
    theta_name_2 = ES_INDEX_DYNAMIC_TOPIC_DOCUMENT + "_" + kwargs['name_immutable'] + "_" + str(
        datetime_from_tm_2) + "_" + str(datetime_to_tm_2)
    # TODO fix meta_dtm_name issue
    tm_1 = search(client=ES_CLIENT, index=ES_INDEX_DYNAMIC_TOPIC_MODELLING,
                  query={
                      'meta_dtm_name.keyword': meta_dtm_name,
                      'datetime_from__gte': datetime_from_tm_1,
                      'datetime_to__lte': datetime_to_tm_1
                  },
                  source=['name', 'meta_dtm_name', 'datetime_from', 'datetime_to', 'topics', 'topic_doc'],
                  )

    tm_2 = search(client=ES_CLIENT, index=ES_INDEX_DYNAMIC_TOPIC_MODELLING,
                  query={
                      'meta_dtm_name.keyword': meta_dtm_name,
                      'datetime_from__gte': datetime_from_tm_2,
                      'datetime_to__lte': datetime_to_tm_2
                  },
                  source=['name', 'meta_dtm_name', 'datetime_from', 'datetime_to', 'topics', 'topic_doc'],
                  )

    tm_1_dict, tm_1_name = parse_topics_field(tm_1[0])
    tm_2_dict, tm_2_name = parse_topics_field(tm_2[0])

    topic_modelling_first_from = tm_1_name.split('_')[-2]
    topic_modelling_second_to = tm_2_name.split('_')[-1]

    thresholds = list(map(str, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]))

    mappings_dict, delta_words_dict, delta_count_dict = mapper(topic_seq_1=tm_1_dict,
                                                               topic_seq_2=tm_2_dict,
                                                               threshold_list=thresholds)

    scores = validator(mappings_dict=mappings_dict,
                       client=ES_CLIENT,
                       index_theta_one=theta_name_1,
                       index_theta_two=theta_name_2,
                       datetime_from_tm_2=datetime_from_tm_2,
                       datetime_to_tm_1=datetime_to_tm_1,
                       number_of_topics=number_of_topics)

    for threshold in thresholds:
        index = Mappings(
                threshold=threshold,
                meta_dtm_name=meta_dtm_name,
                topic_modelling_first=tm_1_name,
                topic_modelling_second=tm_2_name,
                topic_modelling_first_from=topic_modelling_first_from,
                topic_modelling_second_to=topic_modelling_second_to,
                mappings_dict=json.dumps(mappings_dict[threshold]),
                scores_list=scores[threshold],
                delta_words_dict=json.dumps(delta_words_dict[threshold]),
                delta_count_dict=json.dumps(delta_count_dict[threshold]),
        )
        index.save()

    return 'Mapping created'