def link_to_log(self, log, datum_ids):
    if not datum_ids:
        return True

    logger.debug('Linking RawData to %r', log)
    with connection.cursor() as cursor:
        for chunk in chunked(datum_ids, size=500):
            if not chunk:
                break
            cursor.execute('''
                INSERT INTO "{table}" ("{rawdatum}", "{harvestlog}")
                VALUES {values}
                ON CONFLICT ("{rawdatum}", "{harvestlog}") DO NOTHING;
            '''.format(
                values=', '.join('%s' for _ in range(len(chunk))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                table=RawDatum.logs.through._meta.db_table,
                rawdatum=RawDatum.logs.through._meta.get_field('rawdatum').column,
                harvestlog=RawDatum.logs.through._meta.get_field('harvestlog').column,
            ), [(raw_id, log.id) for raw_id in chunk])
    return True
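# These methods all lean on a chunked() helper from the project's util module to batch
# work into fixed-size pieces. Its real implementation is not shown here; the following
# is a minimal sketch, assumed from how it is called in this section (a `size` keyword
# and a possibly-empty final chunk, which is why callers guard with `if not chunk: break`).
import itertools


def chunked(iterable, size=25):
    iterator = iter(iterable)
    while True:
        # Materialize up to `size` items; an empty tuple marks exhaustion.
        chunk = tuple(itertools.islice(iterator, size))
        yield chunk
        if not chunk:
            return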
def bulk_get_or_create(self, objs, defaults=None, using='default'):
    if len(self.model._meta.unique_together) != 1:
        raise ValueError('Cannot determine the constraint to use for ON CONFLICT')

    if not objs:
        return []

    columns = []
    defaults = defaults or {}

    for field in self.model._meta.concrete_fields:
        if field is not self.model._meta.pk:
            columns.append(field.column)
        if field in defaults:
            continue
        if field.default is not models.NOT_PROVIDED or field.null:
            defaults[field] = field._get_default()
        elif isinstance(field, models.DateField) and (field.auto_now or field.auto_now_add):
            defaults[field] = timezone.now()

    if any(obj.pk for obj in objs):
        raise ValueError('Cannot bulk_get_or_create objects with primary keys')

    constraint = ', '.join(
        '"{}"'.format(self.model._meta.get_field(field).column)
        for field in self.model._meta.unique_together[0]
    )

    loaded = []
    with transaction.atomic(using):
        for chunk in chunked(objs, 500):
            if not chunk:
                break
            # The no-op "DO UPDATE SET id = id" makes conflicting rows visible to
            # RETURNING, so existing rows come back alongside the newly inserted ones.
            loaded.extend(self.raw('''
                INSERT INTO "{model._meta.db_table}"
                    ({columns})
                VALUES
                    {values}
                ON CONFLICT
                    ({constraint})
                DO UPDATE SET
                    id = "{model._meta.db_table}".id
                RETURNING *
            '''.format(
                model=self.model,
                columns=', '.join(columns),
                constraint=constraint,
                values=', '.join(['%s'] * len(chunk)),
            ), [
                tuple(getattr(obj, field.attname, None) or defaults[field] for field in self.model._meta.concrete_fields[1:])
                for obj in chunk
            ]))
    return loaded
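# A hypothetical call to bulk_get_or_create() above; Tag and its name field are
# illustrative stand-ins for any model with exactly one unique_together constraint.
# Existing rows are returned alongside the newly inserted ones thanks to the no-op
# update in the ON CONFLICT clause.
tags = Tag.objects.bulk_get_or_create([
    Tag(name='science'),
    Tag(name='art'),
])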
def __iter__(self):
    opts = {'_index': self._index, '_type': self._model._meta.verbose_name_plural.replace(' ', '')}

    for chunk in util.chunked(self._flatten(), size=250):
        for result in self._fetcher(chunk):
            if result is None:
                yield None
            elif result.pop('is_deleted', False):
                yield {'_id': result['id'], '_op_type': 'delete', **opts}
            else:
                yield {'_id': result['id'], '_op_type': 'index', **opts, **result}
def _bulk_query(self, query, default_values, data, db_alias):
    fields = [field.name for field in self.model._meta.concrete_fields]

    with connection.cursor() as cursor:
        for chunk in chunked(data, 500):
            if not chunk:
                break
            cursor.execute(query.format(
                values=', '.join('%s' for _ in range(len(chunk))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
            ), [c + default_values for c in chunk])

            for row in cursor.fetchall():
                yield self.model.from_db(db_alias, fields, row)
def __iter__(self):
    opts = {
        '_index': self._index,
        '_type': self._model._meta.verbose_name_plural.replace(' ', ''),
    }

    for chunk in util.chunked(self._flatten(), size=self._size):
        for result in self._fetcher(chunk):
            if result is None:
                yield None
            elif result.pop('is_deleted', False):
                yield {'_id': result['id'], '_op_type': 'delete', **opts}
            else:
                yield {'_id': result['id'], '_op_type': 'index', **opts, **result}
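# The dicts produced by the two __iter__ implementations above are shaped as actions
# for elasticsearch-py's bulk helper (_op_type, _index, _type, _id). A minimal sketch
# of how such a generator might be consumed; index_all() and its arguments are
# hypothetical, not part of the original code.
from elasticsearch import helpers


def index_all(client, action_generator):
    # Drop the None placeholders the generators may yield before bulk indexing.
    actions = (action for action in action_generator if action is not None)
    helpers.bulk(client, actions)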
def archive_queryset(self, task_name, queryset):
    if self.bucket is None:
        logger.warning('%r.bucket is None. Results will NOT be archived', self)
        return None

    if task_name in self.NO_ARCHIVE:
        logger.info('Found %s in NO_ARCHIVE, archival will be skipped', task_name)
        return None

    total = queryset.count()
    logger.info('Found %s %ss eligible for archiving', total, task_name)
    logger.info('Archiving in chunks of %d', self.chunk_size)

    i = 0
    for chunk in chunked(queryset.iterator(), size=self.chunk_size):
        compressed = self.compress_and_serialize(chunk)
        self.put_s3(task_name, compressed)
        i += len(chunk)
        logger.info('Archived %d of %d', i, total)
def delete_queryset(self, queryset):
    if not self.delete:
        logger.warning('%r.delete is False. Results will NOT be deleted', self)
        return 0

    total_deleted = 0
    try:
        with transaction.atomic():
            # .delete loads the entire queryset and can't be sliced... Hooray
            for ids in chunked(queryset.values_list('id', flat=True).iterator(), size=self.chunk_size):
                num_deleted, _ = queryset.model.objects.filter(id__in=ids).delete()
                total_deleted += num_deleted
    except Exception as e:
        logger.exception('Failed to delete queryset with exception %s', e)
        raise

    logger.info('Deleted %s CeleryTasks', total_deleted)
    return total_deleted
def _consume_job(self, job, force, superfluous, limit=None, ingest=True):
    try:
        if ingest:
            datum_gen = (datum for datum in self._harvest(job, force, limit) if datum.created or superfluous)
            for chunk in chunked(datum_gen, 500):
                self._bulk_schedule_ingest(job, chunk)
        else:
            for _ in self._harvest(job, force, limit):
                pass
    except HarvesterConcurrencyError as e:
        if not self.task:
            raise
        # If job_id was specified there's a chance that the advisory lock was not, in fact, acquired.
        # If so, retry indefinitely to preserve existing functionality.
        # Use random to add jitter to help break up locking issues.
        # Kinda hacky, but allow a stupidly large number of retries, as there is no option for infinite retries.
        raise self.task.retry(
            exc=e,
            max_retries=99999,
            countdown=(random.random() + 1) * min(settings.CELERY_RETRY_BACKOFF_BASE ** self.task.request.retries, 60 * 15),
        )
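# The countdown above is exponential backoff with jitter, capped at 15 minutes.
# Assuming CELERY_RETRY_BACKOFF_BASE = 5 (an illustrative value, not necessarily the
# project's actual setting):
#   retries=2 -> (random.random() + 1) * min(5 ** 2, 900) = 25 to 50 seconds
#   retries=5 -> (random.random() + 1) * min(5 ** 5, 900) = 900 to 1800 seconds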
def link_to_log(self, log, datum_ids):
    if not datum_ids:
        return True

    logger.debug('Linking RawData to %r', log)
    with connection.cursor() as cursor:
        for chunk in chunked(datum_ids, size=500):
            if not chunk:
                break
            cursor.execute('''
                INSERT INTO "{table}" ("{rawdatum}", "{harvestlog}")
                VALUES {values}
                ON CONFLICT ("{rawdatum}", "{harvestlog}") DO NOTHING;
            '''.format(
                values=', '.join('%s' for _ in range(len(chunk))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                table=RawDatum.logs.through._meta.db_table,
                rawdatum=RawDatum.logs.through._meta.get_field('rawdatum').column,
                harvestlog=RawDatum.logs.through._meta.get_field('harvestlog').column,
            ), [(raw_id, log.id) for raw_id in chunk])
    return True
def delete_queryset(self, queryset):
    if not self.delete:
        logger.warning('%r.delete is False. Results will NOT be deleted', self)
        return 0

    total_deleted = 0
    try:
        with transaction.atomic():
            # .delete loads the entire queryset and can't be sliced... Hooray
            for ids in chunked(queryset.values_list('id', flat=True).iterator(), size=self.chunk_size):
                num_deleted, _ = queryset.model.objects.filter(id__in=ids).delete()
                total_deleted += num_deleted
    except Exception as e:
        logger.exception('Failed to delete queryset with exception %s', e)
        raise

    logger.info('Deleted %s CeleryTasks', total_deleted)
    return total_deleted
def store_chunk(self, source_config, data, limit=None, db=DEFAULT_DB_ALIAS):
    """Store a large amount of data for a single source_config.

    Data MUST be a utf-8 encoded string (Just a str type).
    Take special care to make sure you aren't destroying data by mis-encoding it.

    Args:
        source_config (SourceConfig):
        data Generator[(str, str)]: (identifier, datum)

    Returns:
        Generator[RawDatum]
    """
    hashes = {}
    identifiers = {}
    now = timezone.now()

    if limit == 0:
        return []

    for chunk in chunked(data, 500):
        if not chunk:
            break

        new = []
        new_identifiers = set()

        for fr in chunk:
            if limit and len(hashes) >= limit:
                break

            if fr.sha256 in hashes:
                if hashes[fr.sha256] != fr.identifier:
                    raise ValueError(
                        '{!r} has already been seen or stored with identifier "{}". '
                        'Perhaps your identifier extraction is incorrect?'.format(fr, hashes[fr.sha256])
                    )
                logger.warning('Received duplicate datum %s from %s', fr, source_config)
                continue

            new.append(fr)
            hashes[fr.sha256] = fr.identifier
            new_identifiers.add(fr.identifier)

        if new_identifiers:
            suids = SourceUniqueIdentifier.objects.raw('''
                INSERT INTO "{table}"
                    ("{identifier}", "{source_config}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{identifier}", "{source_config}")
                DO UPDATE SET
                    id = "{table}".id
                RETURNING {fields}
            '''.format(
                table=SourceUniqueIdentifier._meta.db_table,
                identifier=SourceUniqueIdentifier._meta.get_field('identifier').column,
                source_config=SourceUniqueIdentifier._meta.get_field('source_config').column,
                values=placeholders(len(new_identifiers)),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                fields=', '.join('"{}"'.format(field.column) for field in SourceUniqueIdentifier._meta.concrete_fields),
            ), [(identifier, source_config.id) for identifier in new_identifiers])

            for suid in suids:
                identifiers[suid.identifier] = suid.pk

        if new:
            # Defer 'datum' by omitting it from the returned fields
            yield from RawDatum.objects.raw('''
                INSERT INTO "{table}"
                    ("{suid}", "{hash}", "{datum}", "{datestamp}", "{date_modified}", "{date_created}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{suid}", "{hash}")
                DO UPDATE SET
                    "{datestamp}" = EXCLUDED."{datestamp}",
                    "{date_modified}" = EXCLUDED."{date_modified}"
                RETURNING id, "{suid}", "{hash}", "{datestamp}", "{date_modified}", "{date_created}"
            '''.format(
                table=RawDatum._meta.db_table,
                suid=RawDatum._meta.get_field('suid').column,
                hash=RawDatum._meta.get_field('sha256').column,
                datum=RawDatum._meta.get_field('datum').column,
                datestamp=RawDatum._meta.get_field('datestamp').column,
                date_modified=RawDatum._meta.get_field('date_modified').column,
                date_created=RawDatum._meta.get_field('date_created').column,
                values=', '.join('%s' for _ in range(len(new))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
            ), [
                (identifiers[fr.identifier], fr.sha256, fr.datum, fr.datestamp or now, now, now)
                for fr in new
            ])

        if limit and len(hashes) >= limit:
            break
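# placeholders() above is a small project helper whose implementation is not shown
# here. Judging from the equivalent inline expressions used elsewhere in this section,
# it most likely expands to a comma-separated run of psycopg2 '%s' placeholders.
# A minimal sketch under that assumption:
def placeholders(count):
    return ', '.join('%s' for _ in range(count))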
def store_chunk(self, source_config, data, limit=None, db=DEFAULT_DB_ALIAS):
    """Store a large amount of data for a single source_config.

    Data MUST be a utf-8 encoded string (Just a str type).
    Take special care to make sure you aren't destroying data by mis-encoding it.

    Args:
        source_config (SourceConfig):
        data Generator[(str, str)]: (identifier, datum)

    Returns:
        Generator[MemoryFriendlyRawDatum]
    """
    unique_data = set()
    now = timezone.now()

    with connection.cursor() as cursor:
        for chunk in chunked(data, 500):
            chunk_data = []
            for identifier, datum in chunk:
                if limit is not None and len(unique_data) >= limit:
                    break
                hash_ = sha256(datum.encode('utf-8')).hexdigest()
                chunk_data.append((identifier, hash_, datum))
                unique_data.add((identifier, hash_))

            if not chunk_data:
                break

            identifiers = list({(identifier, source_config.id) for identifier, _, _ in chunk_data})

            cursor.execute('''
                INSERT INTO "{table}"
                    ("{identifier}", "{source_config}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{identifier}", "{source_config}")
                DO UPDATE SET
                    id = "{table}".id
                RETURNING {fields}
            '''.format(
                table=SourceUniqueIdentifier._meta.db_table,
                identifier=SourceUniqueIdentifier._meta.get_field('identifier').column,
                source_config=SourceUniqueIdentifier._meta.get_field('source_config').column,
                values=', '.join('%s' for _ in range(len(identifiers))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                fields=', '.join('"{}"'.format(field.column) for field in SourceUniqueIdentifier._meta.concrete_fields),
            ), identifiers)

            suids = {}
            fields = [field.attname for field in SourceUniqueIdentifier._meta.concrete_fields]
            for row in cursor.fetchall():
                suid = SourceUniqueIdentifier.from_db(db, fields, row)
                suids[suid.pk] = suid
                suids[suid.identifier] = suid

            raw_data = {}
            for identifier, hash_, datum in chunk_data:
                raw_data[identifier, hash_] = (suids[identifier].pk, hash_, datum, now, now)

            cursor.execute('''
                INSERT INTO "{table}"
                    ("{suid}", "{hash}", "{datum}", "{date_created}", "{date_modified}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{suid}", "{hash}")
                DO UPDATE SET
                    "{date_modified}" = %s
                RETURNING id, "{suid}", "{hash}", "{date_created}", "{date_modified}"
            '''.format(
                table=RawDatum._meta.db_table,
                suid=RawDatum._meta.get_field('suid').column,
                hash=RawDatum._meta.get_field('sha256').column,
                datum=RawDatum._meta.get_field('datum').column,
                date_created=RawDatum._meta.get_field('date_created').column,
                date_modified=RawDatum._meta.get_field('date_modified').column,
                values=', '.join('%s' for _ in range(len(raw_data))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
            ), list(raw_data.values()) + [now])

            for row in cursor.fetchall():
                yield MemoryFriendlyRawDatum.from_db(
                    db,
                    ('id', 'suid', 'sha256', 'date_created', 'date_modified'),
                    row[:1] + (suids[row[1]], ) + row[2:]
                )

            if limit is not None and len(unique_data) >= limit:
                break
def store_chunk(self, source_config, data, limit=None, db=DEFAULT_DB_ALIAS):
    """Store a large amount of data for a single source_config.

    Data MUST be a utf-8 encoded string (Just a str type).
    Take special care to make sure you aren't destroying data by mis-encoding it.

    Args:
        source_config (SourceConfig):
        data Generator[(str, str)]: (identifier, datum)

    Returns:
        Generator[RawDatum]
    """
    hashes = {}
    identifiers = {}
    now = timezone.now()

    if limit == 0:
        return []

    for chunk in chunked(data, 500):
        if not chunk:
            break

        new = []
        new_identifiers = set()

        for fr in chunk:
            if limit and len(hashes) >= limit:
                break

            if fr.sha256 in hashes:
                if hashes[fr.sha256] != fr.identifier:
                    raise ValueError(
                        '{!r} has already been seen or stored with identifier "{}". '
                        'Perhaps your identifier extraction is incorrect?'.format(fr, hashes[fr.sha256])
                    )
                logger.warning('Received duplicate datum %s from %s', fr, source_config)
                continue

            new.append(fr)
            hashes[fr.sha256] = fr.identifier
            new_identifiers.add(fr.identifier)

        if new_identifiers:
            suids = SourceUniqueIdentifier.objects.raw('''
                INSERT INTO "{table}"
                    ("{identifier}", "{source_config}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{identifier}", "{source_config}")
                DO UPDATE SET
                    id = "{table}".id
                RETURNING {fields}
            '''.format(
                table=SourceUniqueIdentifier._meta.db_table,
                identifier=SourceUniqueIdentifier._meta.get_field('identifier').column,
                source_config=SourceUniqueIdentifier._meta.get_field('source_config').column,
                values=placeholders(len(new_identifiers)),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                fields=', '.join('"{}"'.format(field.column) for field in SourceUniqueIdentifier._meta.concrete_fields),
            ), [(identifier, source_config.id) for identifier in new_identifiers])

            for suid in suids:
                identifiers[suid.identifier] = suid.pk

        if new:
            # Defer 'datum' by omitting it from the returned fields
            yield from RawDatum.objects.raw('''
                INSERT INTO "{table}"
                    ("{suid}", "{hash}", "{datum}", "{datestamp}", "{date_modified}", "{date_created}")
                VALUES
                    {values}
                ON CONFLICT
                    ("{suid}", "{hash}")
                DO UPDATE SET
                    "{datestamp}" = EXCLUDED."{datestamp}",
                    "{date_modified}" = EXCLUDED."{date_modified}"
                RETURNING id, "{suid}", "{hash}", "{datestamp}", "{date_modified}", "{date_created}"
            '''.format(
                table=RawDatum._meta.db_table,
                suid=RawDatum._meta.get_field('suid').column,
                hash=RawDatum._meta.get_field('sha256').column,
                datum=RawDatum._meta.get_field('datum').column,
                datestamp=RawDatum._meta.get_field('datestamp').column,
                date_modified=RawDatum._meta.get_field('date_modified').column,
                date_created=RawDatum._meta.get_field('date_created').column,
                values=', '.join('%s' for _ in range(len(new))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
            ), [
                (identifiers[fr.identifier], fr.sha256, fr.datum, fr.datestamp, now, now)
                for fr in new
            ])

        if limit and len(hashes) >= limit:
            break
def bulk_get_or_create(self, objs, defaults=None, using='default', update_fields=None, defer_fields=None, chunk_size=500):
    if len(self.model._meta.unique_together) != 1:
        raise ValueError('Cannot determine the constraint to use for ON CONFLICT')

    def col(field_name):
        return self.model._meta.get_field(field_name).column

    columns = []
    field_names = []
    defaults = defaults or {}

    for field in self.model._meta.concrete_fields:
        if field is not self.model._meta.pk:
            columns.append(field.column)
            field_names.append(field.attname)
        if field.attname in defaults:
            continue
        if field.default is not models.NOT_PROVIDED or field.null:
            defaults[field.attname] = field._get_default()
        elif isinstance(field, models.DateField) and (field.auto_now or field.auto_now_add):
            defaults[field.attname] = timezone.now()

    constraint = ', '.join('"{}"'.format(col(f)) for f in self.model._meta.unique_together[0])

    if update_fields:
        update = ['"{0}" = EXCLUDED."{0}"'.format(col(f)) for f in update_fields]
    else:
        # No-op update so conflicting rows still come back from RETURNING.
        update = ['id = "{}".id'.format(self.model._meta.db_table)]

    returning = '*'
    if defer_fields:
        defer_columns = {col(f) for f in defer_fields}
        returning = ', '.join(['id'] + [c for c in columns if c not in defer_columns])

    loaded = []
    with transaction.atomic(using):
        for chunk in chunked(objs, chunk_size):
            if not chunk:
                break
            loaded.extend(self.raw('''
                INSERT INTO "{model._meta.db_table}"
                    ({columns})
                VALUES
                    {values}
                ON CONFLICT
                    ({constraint})
                DO UPDATE SET
                    {update}
                RETURNING {returning}
            '''.format(
                model=self.model,
                columns=', '.join(columns),
                constraint=constraint,
                values=', '.join(['%s'] * len(chunk)),
                update=', '.join(update),
                returning=returning,
            ), [
                tuple(getattr(obj, f, None) or defaults[f] for f in field_names)
                for obj in chunk
            ]))
    return loaded
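# A hypothetical call to the extended bulk_get_or_create() above; new_datums is an
# illustrative list of unsaved RawDatum instances. update_fields controls what the
# upsert overwrites on conflict, and defer_fields keeps large columns (here the raw
# datum payload) out of the RETURNING clause.
data = RawDatum.objects.bulk_get_or_create(
    new_datums,
    update_fields=['date_modified'],
    defer_fields=['datum'],
)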