Exemple #1
0
def index_distribution(distribution_id,
                       node_id,
                       task_id,
                       read_local=False,
                       index=None,
                       force=False):
    """Validate a distribution and reindex it when its data has changed.

    Args:
        distribution_id: ``identifier`` of the ``Distribution`` to process.
        node_id: ``id`` of the ``Node`` whose ``catalog_id`` scopes the lookup.
        task_id: ``id`` of the ``IndexDataTask`` handed to the error handler.
        read_local: forwarded to ``DistributionValidator``.
        index: index name given to ``DistributionIndexer``. ``None`` (the
            default) means "use ``settings.TS_INDEX`` as it is at call time".
        force: reindex even when the stored hash matches the current one.

    The node, task and distribution lookups run outside the ``try`` block,
    so any exception they raise propagates to the caller. Exceptions raised
    afterwards are handed to ``_handle_exception`` instead of propagating.
    """
    if index is None:
        # Resolved lazily: a default of ``settings.TS_INDEX`` in the
        # signature would be frozen at import time and would ignore later
        # settings overrides.
        index = settings.TS_INDEX

    node = Node.objects.get(id=node_id)
    task = IndexDataTask.objects.get(id=task_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id,
        present=True)

    try:
        DistributionValidator(read_local).run(distribution_model)

        # Without a stored hash the distribution counts as changed.
        changed = True
        _hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).reindex(distribution_model)

        update_distribution_metadata(changed, distribution_model)

    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node,
                          task)
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=None, force=False):
    """Scrape a distribution and index it when its data has changed.

    Args:
        distribution_id: ``identifier`` of the ``Distribution`` to process.
        node_id: ``id`` of the ``Node`` holding the serialized catalog.
        task_id: ``id`` of the ``ReadDataJsonTask`` handed to the error handler.
        read_local: forwarded to ``Scraper``.
        index: index name given to ``DistributionIndexer``. ``None`` (the
            default) means "use ``settings.TS_INDEX`` as it is at call time".
        force: index even when the stored hash matches the current one.

    The node, task, catalog and distribution are loaded outside the ``try``
    block, so any exception raised there propagates to the caller. Exceptions
    raised afterwards are handed to ``_handle_exception``.
    """
    if index is None:
        # Resolved lazily: a default of ``settings.TS_INDEX`` in the
        # signature would be frozen at import time and would ignore later
        # settings overrides.
        index = settings.TS_INDEX

    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    catalog = DataJson(json.loads(node.catalog))
    distribution_model = Distribution.objects.get(identifier=distribution_id,
                                                  dataset__catalog__identifier=node.catalog_id)

    try:
        Scraper(read_local).run(distribution_model, catalog)

        # Without a stored hash the distribution counts as changed.
        changed = True
        _hash = distribution_model.enhanced_meta.filter(key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).run(distribution_model)

        # Record the hash just processed and whether it differed from the
        # previously stored one.
        distribution_model.enhanced_meta.update_or_create(key=meta_keys.LAST_HASH,
                                                          defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(key=meta_keys.CHANGED,
                                                          defaults={'value': str(changed)})

    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node, task)
    def _index_catalog(self, catalog_path):
        """Read the catalog at ``catalog_path`` and reindex every distribution.

        ``parallel_bulk`` is patched in the indexer module while the
        distributions are reindexed into ``'some_index'``.
        """
        self.read_data(catalog_path)

        bulk_target = 'series_tiempo_ar_api.libs.indexing.indexer.distribution_indexer.parallel_bulk'
        with mock.patch(bulk_target):
            for dist in Distribution.objects.all():
                indexer = DistributionIndexer('some_index')
                indexer.reindex(dist)
    def test_index_series_no_identifier(self):
        """No ES actions are generated for the no-identifier sample catalog."""
        sample_path = os.path.join(SAMPLES_DIR, 'series_metadata_no_identifier.json')
        self._index_catalog(sample_path)

        first_distribution = Distribution.objects.first()
        indexer = DistributionIndexer('some_index')
        es_actions = indexer.generate_es_actions(first_distribution)

        self.assertFalse(es_actions)
Exemple #5
0
    def test_init_dataframe_columns(self):
        """Every field identifier of distribution 212.1 is a dataframe column."""
        self._index_catalog('full_ts_data.json')

        distribution = Distribution.objects.get(identifier="212.1")

        # Map each field title to its identifier, as init_df receives it.
        identifiers_by_title = {}
        for field in distribution.field_set.all():
            identifiers_by_title[field.title] = field.identifier

        indexer = DistributionIndexer(self.test_index)
        df = indexer.init_df(distribution, identifiers_by_title)

        for identifier in identifiers_by_title.values():
            self.assertTrue(identifier in df.columns)
Exemple #6
0
    def test_reindex_distribution_no_time_index_identifier(self):
        """Reindexing still yields hits for the series in the sample catalog."""
        self._index_catalog('distribution_time_index_no_identifier.json')

        indexer = DistributionIndexer(index=self.test_index)
        indexer.reindex(Distribution.objects.first())
        self.elastic.indices.forcemerge(index=self.test_index)

        query = Search(index=self.test_index).filter(
            'match', series_id='89.2_TS_INTEALL_0_D_18')
        hits = query.execute()

        self.assertTrue(list(hits))
 def init_df(self):
     """Build a dataframe for the field's distribution from the daily sample CSV.

     Attaches 'daily_periodicity.csv' as the distribution's data file, adds
     an 'indice_tiempo' field with daily periodicity metadata, and returns
     whatever ``DistributionIndexer.init_df`` produces for those fields.
     """
     distribution = self.field.distribution

     csv_path = os.path.join(SAMPLES_DIR, 'daily_periodicity.csv')
     # The handle is left open on purpose: it is owned by data_file from here.
     distribution.data_file = File(open(csv_path))

     distribution.field_set.create(
         identifier='indice_tiempo',
         metadata='{"specialTypeDetail": "R/P1D"}',
     )

     indexer = DistributionIndexer('test_index')
     fields = {
         'tasas_interes_call': self.field.identifier,
         'indice_tiempo': 'indice_tiempo',
     }
     return indexer.init_df(distribution, fields)
Exemple #8
0
    def test_reindex_same_distribution(self):
        """Reindexing an unchanged distribution leaves the search results equal."""
        self._index_catalog('single_data.json')
        self.assertEqual(Field.objects.count(), 2)

        def search_series():
            # Run the same match query against the test index each time.
            query = Search(index=self.test_index).filter(
                'match', series_id='102.1_I2NG_ABRI_M_22')
            return query.execute()

        before = search_series()

        indexer = DistributionIndexer(index=self.test_index)
        indexer.reindex(Distribution.objects.first())
        after = search_series()

        # Nothing changes
        self.assertEqual(list(before), list(after))
    def _index_catalog(self, catalog_path):
        """Register a test node for ``catalog_path``, read it and reindex.

        ``parallel_bulk`` is patched in the indexer module while every
        distribution is reindexed into ``'some_index'``.
        """
        node_kwargs = {
            'catalog_id': 'test_catalog',
            'catalog_url': catalog_path,
            'indexable': True,
        }
        Node.objects.create(**node_kwargs)

        read_task = ReadDataJsonTask.objects.create()
        read_datajson(read_task, whitelist=True)

        bulk_target = 'series_tiempo_ar_api.libs.indexing.indexer.distribution_indexer.parallel_bulk'
        with mock.patch(bulk_target):
            for dist in Distribution.objects.all():
                indexer = DistributionIndexer('some_index')
                indexer.reindex(dist)
Exemple #10
0
    def test_reindex_remove_value(self):
        """Reindexing with a shorter data file drops the extra value."""
        self._index_catalog('single_data_updated.json')
        self.assertEqual(Field.objects.count(), 2)

        def search_series():
            # Run the same match query against the test index each time.
            query = Search(index=self.test_index).filter(
                'match', series_id='102.1_I2NG_ABRI_M_22')
            return query.execute()

        results = search_series()
        self.assertEqual(len(list(results)), 3)

        # Swap in the original data file and reindex.
        distribution = Distribution.objects.get(identifier="102.1")
        csv_path = os.path.join(SAMPLES_DIR, 'single_data.csv')
        distribution.data_file = File(open(csv_path, 'rb'))
        distribution.save()

        indexer = DistributionIndexer(index=self.test_index)
        indexer.reindex(distribution)
        self.elastic.indices.forcemerge(index=self.test_index)

        updated_results = search_series()
        self.assertEqual(len(list(updated_results)), 2)

        self.assertEqual(list(results[:2]), list(updated_results))
Exemple #11
0
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Index a catalog. Useful for tests.

    When no ``node`` is given, an indexable one is built from ``catalog_id``
    and ``catalog_path``. The node's catalog is serialized and saved, the
    data.json is read, and every distribution whose catalog identifier
    equals ``catalog_id`` is indexed into ``index`` before a force-merge.
    """
    node = node or Node(catalog_id=catalog_id,
                        catalog_url=catalog_path,
                        indexable=True)

    node.catalog = json.dumps(DataJson(node.catalog_url))
    node.save()

    read_task = ReadDataJsonTask()
    read_task.save()
    read_datajson(read_task, read_local=True, whitelist=True)

    catalog_distributions = Distribution.objects.filter(
        dataset__catalog__identifier=catalog_id)
    for dist in catalog_distributions:
        indexer = DistributionIndexer(index=index)
        indexer.run(dist)

    ElasticInstance.get().indices.forcemerge(index=index)
Exemple #12
0
def index_distribution(distribution_id,
                       node_id,
                       task_id,
                       read_local=False,
                       index=None,
                       force=False):
    """Validate a distribution with the configured thresholds and reindex it.

    ``index`` falls back to ``settings.TS_INDEX`` at call time when it is
    ``None``. Validation options come from the ``DistributionValidatorConfig``
    singleton. The distribution is reindexed when its data hash differs from
    the stored one, when no hash is stored, or when ``force`` is set.
    Exceptions raised after the initial lookups are handed to
    ``_handle_exception``.
    """
    # Lazy loading of the default index name
    target_index = settings.TS_INDEX if index is None else index

    node = Node.objects.get(id=node_id)
    task = IndexDataTask.objects.get(id=task_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id,
        present=True)

    try:
        config = DistributionValidatorConfig().get_solo()
        # Each option takes the value of the same-named config attribute.
        option_names = (
            'minimum_values',
            'max_missing_proportion',
            'max_too_small_proportion',
            'max_field_title_len',
            'max_null_series_proportion',
        )
        option_values = {name: getattr(config, name) for name in option_names}
        options = ValidationOptions.create_with_defaults(**option_values)

        data_validator = DataValidator(get_distribution_errors, options)
        distribution_validator = DistributionValidator(
            read_local, data_validator=data_validator)
        distribution_validator.run(distribution_model)

        # Without a stored hash the distribution counts as changed.
        stored_hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        if stored_hash:
            changed = stored_hash[0].value != distribution_model.data_hash
        else:
            changed = True

        if changed or force:
            indexer = DistributionIndexer(index=target_index)
            indexer.reindex(distribution_model)

        update_distribution_metadata(changed, distribution_model)

    except Exception as error:
        _handle_exception(distribution_model.dataset, distribution_id, error,
                          node, task)
Exemple #13
0
def reindex_distribution(distribution: Distribution):
    """Reindex ``distribution`` into the index named by ``settings.TS_INDEX``."""
    indexer = DistributionIndexer(index=settings.TS_INDEX)
    indexer.reindex(distribution)