Example 1
def cache_geo_config():
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list(
            'name', 'pk', 'priority'):
        entity = dict_entities.DictionaryEntry(pk,
                                               name,
                                               priority or 0,
                                               name_is_alias=True)
        geo_config[pk] = entity
    for alias_id, alias_text, alias_type, entity_id, alias_lang \
            in GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale'):
        entity = geo_config.get(entity_id)
        if entity:
            is_abbrev = alias_type.startswith('iso') or alias_type.startswith(
                'abbrev')
            for alias in alias_text.split(';'):
                entity.aliases.append(
                    dict_entities.DictionaryEntryAlias(
                        alias,
                        language=alias_lang,
                        is_abbreviation=is_abbrev,
                        alias_id=alias_id))
    res = list(geo_config.values())

    # DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, res)
    redis.push(CACHE_KEY_GEO_CONFIG, res)
    return res
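All of these examples go through a project-level redis helper module rather than calling redis-py directly. Below is a minimal sketch of what such a wrapper could look like, inferred only from the call sites on this page (push with optional ex expiration and pickle_value, plus exists); the connection settings, module layout, and function bodies are illustrative assumptions, not the project's actual implementation.

import pickle

from redis import Redis  # standard redis-py client

_client = Redis(host='localhost', port=6379)  # placeholder connection settings


def push(key, value, ex=None, pickle_value=True):
    # Serialize arbitrary Python objects unless the caller passes ready-made bytes/str.
    data = pickle.dumps(value) if pickle_value else value
    _client.set(key, data, ex=ex)  # ex: expiration in seconds, as in redis-py SET


def exists(key):
    return bool(_client.exists(key))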
Example 2
    def create(self, user):
        token_user_key = self.get_token_user_key(user.id)
        token = self.gen_auth_token()
        token_key = self.get_token_key(token)
        # create token pair - because redis has no support for quick search by value (token)
        redis.push(token_user_key, token_key, ex=self.key_expiration_time)
        redis.push(token_key, token_user_key, ex=self.key_expiration_time)
        logger.info(
            f'Cached auth token "{token_user_key}:{token}" for user {user}')
        return token_user_key, self.get_token_object(user, token)
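The comment above hints at why two keys are written: Redis has no efficient lookup by value, so the reverse entry lets a bare token be resolved back to its user key. A hedged read-side sketch of that reverse lookup; get_token_key comes from the example, while the method name and the direct redis-py client.get call are illustrative assumptions.

    def find_user_key_by_token(self, token, client):
        # Resolve the token to its user key via the reverse entry written in create().
        token_key = self.get_token_key(token)
        user_key = client.get(token_key)  # plain redis-py GET; returns None if expired
        return user_key.decode() if user_key else None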
Example 3
    def rewrite_cache(cls):
        records = list(BanListRecord.objects.all())
        records_str = pickle.dumps(records)
        m = hashlib.md5()
        m.update(records_str)
        records_checksum = m.hexdigest()
        redis.push(f'{cls.CACHE_KEY}_data', records_str, pickle_value=False)
        redis.push(f'{cls.CACHE_KEY}_hash', records_checksum, pickle_value=False)
        cls.LAST_CACHED_HASH = records_checksum
        return records
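Storing the pickled records and their MD5 digest under separate keys lets readers detect staleness cheaply: compare the small hash key first and only unpickle the data key when it changed. A read-side sketch under that assumption; the method name and the direct redis-py client access are illustrative, only the key names mirror the example above.

    def read_cache_if_changed(cls, client):
        cached_hash = client.get(f'{cls.CACHE_KEY}_hash')
        if cached_hash and cached_hash.decode() == cls.LAST_CACHED_HASH:
            return None  # in-memory copy is still current
        raw = client.get(f'{cls.CACHE_KEY}_data')
        return pickle.loads(raw) if raw else None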
Example 4
def cache_court_config():
    res = [
        dict_entities.DictionaryEntry(id=i.id,
                                      name=i.name,
                                      priority=0,
                                      aliases=[
                                          dict_entities.DictionaryEntryAlias(a)
                                          for a in i.alias.split(';')
                                      ] if i.alias else [])
        for i in Court.objects.all()
    ]

    # DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, res)
    redis.push(CACHE_KEY_COURT_CONFIG, res)
    return res
Example 5
def cache_term_stems(project_id=None):
    term_stems = {}

    terms_qs = Term.objects
    key = CACHE_KEY_TERM_STEMS

    if project_id is not None:
        qs = ProjectTermConfiguration.objects.filter(project_id=project_id)
        if qs.exists():
            terms_qs = qs.last().terms
            key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)

    for t, pk in terms_qs.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))

    # DbCache.put_to_db(key, term_stems)
    redis.push(key, term_stems)
    return term_stems
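The cached structure maps each space-padded stem phrase to its source terms and their count, so a consumer can scan stemmed text for known terms with simple substring checks. A small usage sketch; the lookup helper itself is illustrative, while get_stems and the dict layout come from the example above.

def find_terms_in_text(text, term_stems):
    # term_stems: {' stem phrase ': {'values': [[term, pk], ...], 'length': n}}
    stemmed_text = ' %s ' % ' '.join(get_stems(text))
    hits = []
    for stem, entry in term_stems.items():
        if stem in stemmed_text:  # space padding keeps matches on word boundaries
            hits.extend(entry['values'])
    return hits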
Example 6
    def cache_users(self, users):
        # cache user qs for 5 min
        redis.push(self.users_cache_key,
                   users,
                   ex=self.cached_users_expiration_time)
Example 7
    def process(self, **kwargs):
        search_similar_documents = kwargs['search_similar_documents']
        search_similar_text_units = kwargs['search_similar_text_units']

        project = kwargs['project']
        project_id = project['pk'] if project else None
        unit_type = kwargs['unit_type']
        feature_source = kwargs['feature_source']
        use_tfidf = kwargs['use_tfidf']
        distance_type = kwargs['distance_type']
        similarity_threshold = kwargs['similarity_threshold'] / 100

        self.log_info('Min similarity: {}'.format(similarity_threshold))

        if search_similar_documents:
            engine_class = DocumentSimilarityEngine
        elif search_similar_text_units:
            engine_class = TextUnitSimilarityEngine
        else:
            self.log_error("Classify task target (documents or text units) is not specified.")
            return

        if kwargs['delete']:
            if search_similar_text_units:
                if project_id:
                    deleted = TextUnitSimilarity.objects.filter(
                        Q(project_a__id=project_id) |
                        Q(project_b__id=project_id)).delete()
                else:
                    deleted = TextUnitSimilarity.objects.all().delete()
            else:
                if project_id:
                    deleted = DocumentSimilarity.objects.filter(
                        Q(document_a__project__id=project_id) |
                        Q(document_b__project__id=project_id)).delete()
                else:
                    deleted = DocumentSimilarity.objects.all().delete()

            self.log_info('Deleted "{}"'.format(deleted[1]))

        similarity_engine_kwargs = dict(
            project_id=project_id,
            unit_type=unit_type,
            feature_source=feature_source,
            use_tfidf=use_tfidf,
            distance_type=distance_type,
            threshold=similarity_threshold
        )
        similarity_engine = engine_class(**similarity_engine_kwargs)
        features = similarity_engine.get_features()
        feature_matrix = features.term_frequency_matrix
        feature_records = feature_matrix.shape[0]

        subtasks_args = []

        for block_i_start in range(0, feature_records, similarity_engine.block_step):

            block_i_end = block_i_start + similarity_engine.block_step
            df1_redis_key = f'{self.task.pk}_{block_i_start}_{block_i_end}'
            if not redis.exists(df1_redis_key):
                df1_data = (feature_matrix[block_i_start:block_i_end],
                            features.item_index[block_i_start:block_i_end],
                            features.feature_names)
                redis.push(key=df1_redis_key, value=df1_data, pickle_value=True)

            for block_j_start in range(0, feature_records, similarity_engine.block_step):

                block_j_end = block_j_start + similarity_engine.block_step
                self.log_info(f'Cache data for blocks: '
                              f'{block_i_start}:{block_i_end} - {block_j_start}:{block_j_end}')

                df2_redis_key = f'{self.task.pk}_{block_j_start}_{block_j_end}'
                if not redis.exists(df2_redis_key):
                    df2_data = (feature_matrix[block_j_start:block_j_end],
                                features.item_index[block_j_start:block_j_end],
                                features.feature_names)
                    redis.push(key=df2_redis_key, value=df2_data, pickle_value=True)

                subtasks_args.append((
                    df1_redis_key,
                    df2_redis_key,
                    search_similar_documents,
                    similarity_engine_kwargs,
                    project_id
                ))

        self.run_sub_tasks(
            'Calculate similarities for feature_df blocks',
            self.calc_block_similarity,
            subtasks_args
        )
        self.run_after_sub_tasks_finished('Clear redis keys.', self.finalize, [()])
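Example 7 leaves the pickled feature blocks in Redis until all subtasks finish, then schedules a finalize step to clear them. A cleanup sketch reusing the illustrative _client connection from the sketch after Example 1; scan_iter and delete are standard redis-py calls, but the project's real helper and the finalize signature may differ.

    def finalize(self, *args, **kwargs):
        # Drop every block key written for this task (keys are prefixed with the task pk).
        for key in _client.scan_iter(match=f'{self.task.pk}_*'):
            _client.delete(key)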
Example 8
    def cache(self):
        redis.push(self.cache_key, self.value)
Example 9
    def process(self, **kwargs):
        search_similar_documents = kwargs['search_similar_documents']
        search_similar_text_units = kwargs['search_similar_text_units']

        project = kwargs['project']
        project_id = project['pk'] if project else None
        unit_type = kwargs['unit_type']
        feature_source = kwargs['feature_source']
        use_tfidf = kwargs['use_tfidf']
        distance_type = kwargs['distance_type']
        similarity_threshold = kwargs['similarity_threshold'] / 100

        self.log_info('Min similarity: {}'.format(similarity_threshold))

        if search_similar_documents:
            db_model = DocumentSimilarity
            engine_class = DocumentSimilarityEngine
        elif search_similar_text_units:
            db_model = TextUnitSimilarity
            engine_class = TextUnitSimilarityEngine
        else:
            self.log_error(
                "Classify task target (documents or text units) is not specified."
            )
            return

        if kwargs['delete']:
            # TODO: delete all Similarity db objects OR filter by unit_type/project_id
            deleted = db_model.objects.filter().delete()
            self.log_info('Deleted "{}"'.format(deleted[1]))

        similarity_engine_kwargs = dict(project_id=project_id,
                                        unit_type=unit_type,
                                        feature_source=feature_source,
                                        use_tfidf=use_tfidf,
                                        distance_type=distance_type,
                                        threshold=similarity_threshold)
        similarity_engine = engine_class(**similarity_engine_kwargs)
        feature_df = similarity_engine.get_features().feature_df

        subtasks_args = []

        for block_i_start in range(0, feature_df.shape[0],
                                   similarity_engine.block_step):
            for block_j_start in range(0, feature_df.shape[0],
                                       similarity_engine.block_step):

                df1 = feature_df.iloc[block_i_start:block_i_start +
                                      similarity_engine.block_step, :]
                df1_redis_key = f'{self.task.pk}_{block_i_start}_{block_i_start + similarity_engine.block_step}'
                redis.push(key=df1_redis_key, value=df1, pickle_value=True)

                df2 = feature_df.iloc[block_j_start:block_j_start +
                                      similarity_engine.block_step, :]
                df2_redis_key = f'{self.task.pk}_{block_j_start}_{block_j_start + similarity_engine.block_step}'
                redis.push(key=df2_redis_key, value=df2, pickle_value=True)

                subtasks_args.append(
                    (df1_redis_key, df2_redis_key, search_similar_documents,
                     similarity_engine_kwargs))

        self.run_sub_tasks('Calculate similarities for feature_df blocks',
                           self.calc_block_similarity, subtasks_args)
        self.run_after_sub_tasks_finished('Clear redis keys.', self.finalize,
                                          [()])