Code example #1
 def test_ibatch_with_list_no_progress(self):
     input = list(range(0, 100))
     # Evenly divisible batches
     iterator = ibatch(input, batch_size=20)
     self.assertIsInstance(iterator, Iterator)
     batches = [batch for batch in iterator]
     self.assertEqual(len(batches), 5)
     self.assertEqual(batches[0], list(range(0, 20)))
     self.assertEqual(batches[-1], list(range(80, 100)))
     # Batches with a remainder
     iterator = ibatch(input, batch_size=11)
     self.assertIsInstance(iterator, Iterator)
     batches = [batch for batch in iterator]
     self.assertEqual(len(batches), 10)
     self.assertEqual(batches[0], list(range(0, 11)))
     self.assertEqual(batches[-1], list(range(99, 100)))
     # Batch size equals list length
     input = list(range(0, 10))
     iterator = ibatch(input, batch_size=10)
     self.assertIsInstance(iterator, Iterator)
     batches = [batch for batch in iterator]
     self.assertEqual(len(batches), 1)
     self.assertEqual(batches[0], list(range(0, 10)))
     # Batch size larger than list length
     input = list(range(0, 11))
     iterator = ibatch(input, batch_size=20)
     self.assertIsInstance(iterator, Iterator)
     batches = [batch for batch in iterator]
     self.assertEqual(len(batches), 1)
     self.assertEqual(batches[0], list(range(0, 11)))
Code example #2
 def test_ibatch_with_list_progress_no_total(self):
     progress_bar_mock = Mock(spec=tqdm(disable=True, total=100))
     input = range(0, 100)
     with patch("datagrowth.utils.iterators.tqdm",
                return_value=progress_bar_mock) as tqdm_mock:
         # Evenly divisible batches
         iterator = ibatch(input, batch_size=20, progress_bar=True)
         self.assertIsInstance(iterator, Iterator)
         batches = [batch for batch in iterator]
         self.assertEqual(len(batches), 5)
         self.assertEqual(batches[0], list(range(0, 20)))
         self.assertEqual(batches[-1], list(range(80, 100)))
         tqdm_mock.assert_called_once_with()
         self.assertEqual(progress_bar_mock.update.call_count, 5)
         self.assertEqual(progress_bar_mock.close.call_count, 1)
     progress_bar_mock.reset_mock()
     with patch("datagrowth.utils.iterators.tqdm",
                return_value=progress_bar_mock) as tqdm_mock:
         # Batches with a remainder
         iterator = ibatch(input, batch_size=11, progress_bar=True)
         self.assertIsInstance(iterator, Iterator)
         batches = [batch for batch in iterator]
         self.assertEqual(len(batches), 10)
         self.assertEqual(batches[0], list(range(0, 11)))
         self.assertEqual(batches[-1], list(range(99, 100)))
         tqdm_mock.assert_called_once_with()
         self.assertEqual(progress_bar_mock.update.call_count, 10)
         self.assertEqual(progress_bar_mock.close.call_count, 1)
     progress_bar_mock.reset_mock()
     input = range(0, 10)
     with patch("datagrowth.utils.iterators.tqdm",
                return_value=progress_bar_mock) as tqdm_mock:
         # Batch size equals list length
         iterator = ibatch(input, batch_size=10, progress_bar=True)
         self.assertIsInstance(iterator, Iterator)
         batches = [batch for batch in iterator]
         self.assertEqual(len(batches), 1)
         self.assertEqual(batches[0], list(range(0, 10)))
         tqdm_mock.assert_called_once_with()
         self.assertEqual(progress_bar_mock.update.call_count, 1)
         self.assertEqual(progress_bar_mock.close.call_count, 1)
     progress_bar_mock.reset_mock()
     input = range(0, 11)
     with patch("datagrowth.utils.iterators.tqdm",
                return_value=progress_bar_mock) as tqdm_mock:
         # Batch size larger than list length
         iterator = ibatch(input, batch_size=20, progress_bar=True)
         self.assertIsInstance(iterator, Iterator)
         batches = [batch for batch in iterator]
         self.assertEqual(len(batches), 1)
         self.assertEqual(batches[0], list(range(0, 11)))
         tqdm_mock.assert_called_once_with()
         self.assertEqual(progress_bar_mock.update.call_count, 1)
         self.assertEqual(progress_bar_mock.close.call_count, 1)
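Taken together, these two tests pin down the contract of ibatch: it returns an iterator that yields lists of at most batch_size items (with a shorter final batch for any remainder), and with progress_bar=True it creates a tqdm bar, calls update() once per batch and close() once at the end. The following is a minimal sketch consistent with that behaviour, written here purely for illustration; it is not the actual datagrowth.utils.iterators implementation.

    from itertools import islice

    from tqdm import tqdm


    def ibatch(iterable, batch_size, progress_bar=False, total=None):
        """Yield lists of at most batch_size items from any iterable."""
        iterator = iter(iterable)
        # With progress_bar=True and no total, the bar is created without arguments,
        # mirroring tqdm_mock.assert_called_once_with() in the test above.
        bar = None
        if progress_bar:
            bar = tqdm(total=total) if total is not None else tqdm()
        while True:
            batch = list(islice(iterator, batch_size))
            if not batch:
                break
            yield batch
            if bar is not None:
                bar.update(1)
        if bar is not None:
            bar.close()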
Code example #3
 def __call__(self, queryset):
     # Prepare some values for serialization
     processor = self.__class__.__name__
     config = self.config.to_dict(private=True, protected=True)
     # Allow derived classes to filter the target Documents
     queryset = self.filter_documents(queryset)
     # Only target Documents that have no ProcessResult associated
     queryset = queryset.exclude(
         processresult__result_type=self.result_type)
     # Create batches of documents with no processing results
     batches = []
     for document_batch in ibatch(queryset,
                                  batch_size=self.config.batch_size):
         batch = self.Batch.objects.create(processor=processor)
         results = [
             self.ProcessResult(document=document, batch=batch)
             for document in document_batch
         ]
         self.ProcessResult.objects.bulk_create(results)
         batches.append(batch)
     # Create tasks and dispatch
     tasks = [
         process_and_merge.s(batch.id, config=config) for batch in batches
     ]
     finish = full_merge.s(processor, config=config)
     return self._dispatch_tasks(tasks,
                                 finish,
                                 asynchronous=self.config.asynchronous)
Code example #4
File: base.py Project: surfedushare/search-portal
 def batchify(self, phase, iterator, total):
     batches = int(math.floor(total / self.batch_size))
     rest = total % self.batch_size
     if rest:
         batches += 1
     for batch in ibatch(iterator, batch_size=self.batch_size):
         self.logger.progress(phase, batches)
         yield batch
Code example #5
    def add(self,
            data,
            reset=False,
            batch_size=500,
            collection=None,
            modified_at=None,
            validate=True):
        """
        Add new data to the Collection in batches, possibly deleting all existing data before adding.

        :param data: The data to add
        :param reset: (optional) whether to delete existing data before adding (default: False)
        :param batch_size: (optional) how many instances to add in a single batch (default: 500)
        :param collection: (optional) a collection instance to add the data to (default: self)
        :param modified_at: (optional) the datetime to use as modified_at value for the collection (default: now)
        :param validate: (deprecated) used to allow JSON schema validation before addition
        :return: The number of instances that were added
        """
        collection = collection or self
        modified_at = modified_at or make_aware(datetime.now())
        Document = collection.get_document_model()
        assert isinstance(data, (Iterator, list, tuple, dict, Document)), \
            f"Collection.add expects data to be formatted as iteratable, dict or {type(Document)} not {type(data)}"

        if reset:
            self.documents.all().delete()

        def prepare_additions(data):

            prepared = []
            if isinstance(data, dict):
                document = self.init_document(data, collection=collection)
                document.clean()
                prepared.append(document)
            elif isinstance(data, Document):
                data = self.init_document(data.properties,
                                          collection=collection)
                data.clean()
                prepared.append(data)
            else:  # type is list
                for instance in data:
                    prepared += prepare_additions(instance)
            return prepared

        count = 0
        for additions in ibatch(data, batch_size=batch_size):
            additions = prepare_additions(additions)
            count += len(additions)
            Document.objects.bulk_create(
                additions,
                batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE)

        if collection.modified_at.replace(
                microsecond=0) != modified_at.replace(microsecond=0):
            collection.modified_at = modified_at
            collection.save()
        return count
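A hypothetical call against this method, assuming collection is an existing Collection instance; the property names in the dicts are purely illustrative:

    added = collection.add(
        [{"external_id": "doc-1", "title": "First"},
         {"external_id": "doc-2", "title": "Second"}],
        batch_size=100,
    )
    # `added` holds the number of Documents that were created.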
Code example #6
File: dataset.py Project: surfedushare/search-portal
 def copy_collection(self, collection):
     Document = collection.get_document_model()
     source_id = collection.id
     collection.pk = None
     collection.id = None
     collection.dataset_version = self
     collection.save()
     for batch in ibatch(Document.objects.filter(collection_id=source_id),
                         batch_size=100):
         for doc in batch:
             doc.collection_id = collection.id
             doc.dataset_version = self
             doc.pk = None
             doc.id = None
         Document.objects.bulk_create(batch)
     return collection
Code example #7
    def handle_label(self, label, **options):
        try:
            Model = apps.get_model(label)
        except LookupError as exc:
            log.error("Failed to find '{}': {}".format(label, exc))
            return
        assert issubclass(Model, HttpFileResource)
        log.info("Stripping from {}\r".format(Model.__name__))

        batch_size = 500
        queryset = Model.objects.filter(status=200)
        count = queryset.count()
        for batch in ibatch(queryset.iterator(), batch_size, progress_bar=True, total=count):
            for instance in batch:
                if instance.body and instance.body.startswith(options["path"]):
                    instance.body = instance.body.replace(datagrowth_settings.DATAGROWTH_MEDIA_ROOT, "", 1)
                    instance.save()
Code example #8
File: documents.py Project: SURFpol/pol-harvester
    def update(self,
               data,
               by_reference,
               validate=True,
               batch_size=32,
               collection=None):
        collection = collection or self
        Document = collection.get_document_model()
        assert isinstance(data, (Iterator, list, tuple, dict, Document)), \
            f"Collection.update expects data to be formatted as iteratable, dict or {type(Document)} not {type(data)}"

        count = 0
        for updates in ibatch(data, batch_size=batch_size):
            # First we bulk update by getting all objects whose identifier value matches
            # any update's "by" value and then updating these source objects.
            # One update object can potentially target multiple sources
            # if multiple objects share the same identifier value.
            updated = set()
            hashed = {update[by_reference]: update for update in updates}
            sources = {
                source[by_reference]: source
                for source in collection.documents.filter(
                    reference__in=hashed.keys())
            }
            for source in sources.values():
                source.update(hashed[source.reference], validate=validate)
                count += 1
                updated.add(source.reference)
            Document.objects.bulk_update(
                sources.values(), ["properties"],
                batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE)
            # After all updates we add all data that hasn't been used in any update operation
            additions = [
                update for identify, update in hashed.items()
                if identify not in updated
            ]
            if len(additions):
                count += self.add(additions,
                                  validate=validate,
                                  batch_size=batch_size,
                                  collection=collection)

        return count
Code example #9
 def handle_deletion_seeds(self, collection, deletion_seeds):
     self.info(f"Deleting for {collection.name} ...")
     document_delete_total = 0
     for seeds in ibatch(deletion_seeds,
                         32,
                         progress_bar=self.show_progress):
         ids = [seed["external_id"] for seed in seeds]
         for id in ids:
             for doc in collection.documents.filter(
                     collection=collection,
                     properties__contains={"external_id": id}):
                 doc.delete()
                 document_delete_total += 1
     arrangement_delete_count = 0
     for arrangement in Arrangement.objects.annotate(num_docs=Count('document')) \
             .filter(collection=collection, num_docs=0):
         arrangement.delete()
         arrangement_delete_count += 1
     return arrangement_delete_count, document_delete_total
Code example #10
    def handle_label(self, label, **options):
        try:
            Model = apps.get_model(label)
        except LookupError as exc:
            log.error("Failed to find '{}': {}".format(label, exc))
            return
        assert issubclass(Model, HttpFileResource)
        log.info("Stripping from {}\r".format(Model.__name__))

        batch_size = 500
        queryset = Model.objects.filter(status=200)
        count = queryset.count()
        for batch in ibatch(queryset.iterator(),
                            batch_size,
                            progress_bar=True,
                            total=count):
            for instance in batch:
                if instance.body and instance.body.startswith(options["path"]):
                    instance.body = instance.body.replace(
                        options["path"], "", 1)
                    instance.save()
Code example #11
    def update(self,
               data,
               by_property,
               batch_size=32,
               collection=None,
               modified_at=None,
               validate=True):
        """
        Update data in the Collection in batches, using a property value to identify which Documents to update.

        :param data: The data to use for the update
        :param by_property: The property to identify a Document with
        :param batch_size: (optional) how many instances to update in a single batch (default: 32)
        :param collection: (optional) a collection instance to update the data in (default: self)
        :param modified_at: (optional) the datetime to use as modified_at value for the collection (default: now)
        :param validate: (deprecated) used to allow JSON schema validation before updates
        :return: The number of instances that were updated or added
        """
        collection = collection or self
        modified_at = modified_at or make_aware(datetime.now())
        Document = collection.get_document_model()
        assert isinstance(data, (Iterator, list, tuple,)), \
            f"Collection.update expects data to be formatted as iteratable not {type(data)}"

        count = 0
        for updates in ibatch(data, batch_size=batch_size):
            # We bulk update by getting all objects whose property matches
            # any update's "by_property" property value and then updating these source objects.
            # One update object can potentially target multiple sources
            # if multiple objects with the same value for the by_property property exist.
            updated = set()
            prepared = []
            sources_by_lookup = defaultdict(list)
            for update in updates:
                sources_by_lookup[update[by_property]].append(update)
            target_filters = Q()
            for lookup_value in sources_by_lookup.keys():
                target_filters |= Q(
                    **{f"properties__{by_property}": lookup_value})
            for target in collection.documents.filter(target_filters):
                for update_value in sources_by_lookup[
                        target.properties[by_property]]:
                    target.update(update_value, commit=False)
                count += 1
                updated.add(target.properties[by_property])
                prepared.append(target)
            Document.objects.bulk_update(
                prepared,
                ["properties", "identity", "reference", "modified_at"],
                batch_size=datagrowth_settings.DATAGROWTH_MAX_BATCH_SIZE)
            # After all updates we add all data that hasn't been used in any update operation
            additions = []
            for lookup_value, sources in sources_by_lookup.items():
                if lookup_value not in updated:
                    additions += sources
            if len(additions):
                count += self.add(additions,
                                  batch_size=batch_size,
                                  collection=collection,
                                  modified_at=modified_at)

        if collection.modified_at.replace(
                microsecond=0) != modified_at.replace(microsecond=0):
            collection.modified_at = modified_at
            collection.save()
        return count
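A hypothetical call matching this signature, identifying Documents by an external_id property (the property names are illustrative):

    updated = collection.update(
        [{"external_id": "doc-1", "title": "Updated title"}],
        by_property="external_id",
        batch_size=100,
    )
    # Matching Documents are updated in bulk; entries whose external_id does not
    # match any existing Document are passed on to Collection.add.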
Code example #12
File: dump_dataset.py Project: fako/datascope
 def queryset_to_disk(queryset, json_file, batch_size=100):
     count = queryset.all().count()
     batch_iterator = ibatch(queryset.iterator(), batch_size=batch_size, progress_bar=True, total=count)
     for batch in batch_iterator:
         batch_data = serialize("json", batch, use_natural_foreign_keys=True)
         json_file.writelines([batch_data + "\n"])