def ingest_edits(cls, json_batch):
    """
    Ingest a batch of change events: create the corresponding Edit rows
    and update the statistics of the Batch each edit belongs to.

    :param json_batch: iterable of event dicts. Falsy entries and events
        whose 'namespace' is not in settings.WATCHED_NAMESPACES are skipped.
        The keys read from each event are 'timestamp' (seconds since the
        epoch), 'comment', 'user' and 'title', plus the optional
        'namespace', 'log_action' and 'length' ({'old': ..., 'new': ...}).
    :returns: None; the results are written through the Batch, Edit and
        Tag models.

    Besides creating edits, the batch is scanned for three kinds of undo:
    comments matching cls.reverted_re (undo of a revision id), 'delete' log
    actions (undo of a creation or restoration of the same title) and
    'restore' log actions (undo of a deletion of the same title).
    """
    # Map from (tool shortid, batch uid) to Batch object
    batches = {}
    model_edits = []
    reverted_ids = []
    deleted_pages = {}  # map: title -> latest deletion timestamp
    restored_pages = {}  # map: title -> latest restoration timestamp
    modified_pages = defaultdict(set)  # map: batch_key -> set of touched pages
    new_tags = defaultdict(set)
    tools = Tool.objects.all()

    for edit_json in json_batch:
        if (not edit_json
                or edit_json.get('namespace') not in settings.WATCHED_NAMESPACES):
            continue
        timestamp = datetime.fromtimestamp(edit_json['timestamp'], tz=UTC)

        # First, check if this is a revert
        revert_match = cls.reverted_re.match(edit_json['comment'])
        if revert_match:
            reverted_ids.append(int(revert_match.group(1)))

        # or a deletion / a restore. We keep the *latest* timestamp per title
        # (not merely the last one listed) so that out-of-order events in the
        # batch cannot shrink the window of edits considered undone.
        log_action = edit_json.get('log_action')
        if log_action == 'delete':
            title = edit_json['title']
            deleted_pages[title] = max(
                deleted_pages.get(title, timestamp), timestamp)
        if log_action == 'restore':
            title = edit_json['title']
            restored_pages[title] = max(
                restored_pages.get(title, timestamp), timestamp)

        # Then, try to match the edit with a tool
        match = None
        matching_tool = None
        for tool in tools:
            match = tool.match(edit_json['user'], edit_json['comment'])
            if match is not None:
                matching_tool = tool
                break

        if match is None:
            continue

        # The owner is stored truncated, so every comparison against a Batch
        # must use the same truncated value. Comparing against the raw
        # match.user would reject (and delete) every freshly created batch
        # whose user name exceeds MAX_CHARFIELD_LENGTH.
        batch_user = match.user[:MAX_CHARFIELD_LENGTH]

        # Try to find an existing batch for that edit
        batch_key = (matching_tool.shortid, match.uid)
        batch = batches.get(batch_key)

        created = False
        if not batch:
            batch, created = Batch.objects.get_or_create(
                tool=matching_tool,
                uid=match.uid,
                defaults={
                    'user': batch_user,
                    'summary': match.summary[:MAX_CHARFIELD_LENGTH],
                    'started': timestamp,
                    'ended': timestamp,
                    'nb_edits': 0,
                    'nb_distinct_pages': 0,
                    'nb_new_pages': 0,
                    'nb_reverted_edits': 0,
                    'total_diffsize': 0,
                })

        # Check that the batch is owned by the right user
        if batch.user != batch_user:
            if created:
                batch.delete()
            continue

        batch.nb_edits += 1
        length_obj = edit_json.get('length') or {}
        batch.total_diffsize += (
            (length_obj.get('new') or 0) - (length_obj.get('old') or 0))
        batch.ended = max(batch.ended, timestamp)
        batches[batch_key] = batch

        # Create the edit object
        model_edit = Edit.from_json(edit_json, batch)
        model_edits.append(model_edit)

        # Extract tags from the edit
        edit_tags = Tag.extract(model_edit)
        missing_tags = [
            tag.id for tag in edit_tags if tag.id not in batch.tag_ids
        ]
        new_tags[batch.id].update(missing_tags)

        # Take note of the modified page, for computation of the number of
        # entities edited by a batch
        modified_pages[batch_key].add(edit_json['title'])

        # And the number of new pages
        if model_edit.changetype == 'new':
            batch.nb_new_pages += 1

    # If we saw some deletions which match any creations or undeletions we
    # know of, mark them as deleted.
    # We do this before creating the edits of the current batch, because
    # deletions and restorations do not come with unique ids to identify the
    # creation, deletion or restoration that they undo (this is a notion that
    # we introduce ourselves), so if a deletion and the corresponding revert
    # happen in the same batch we need to inspect the order in which they
    # happened.
    if deleted_pages:
        cls.mark_as_reverted(
            Edit.objects.filter(title__in=deleted_pages.keys(),
                                changetype__in=['new', 'restore']))
        for edit in model_edits:
            if (edit.title in deleted_pages
                    and edit.changetype in ['new', 'restore']
                    and edit.timestamp < deleted_pages.get(edit.title)):
                edit.reverted = True
                edit.batch.nb_reverted_edits += 1

    # Finally, if we saw some undeletions which match any deletions we know
    # of, mark them as undone
    if restored_pages:
        cls.mark_as_reverted(
            Edit.objects.filter(title__in=restored_pages.keys(),
                                changetype='delete'))
        for edit in model_edits:
            if (edit.title in restored_pages
                    and edit.changetype == 'delete'
                    and edit.timestamp < restored_pages.get(edit.title)):
                edit.reverted = True
                edit.batch.nb_reverted_edits += 1

    # Create all Edit objects and update all the batch objects
    if batches:
        # Update the number of modified pages: only titles that the batch has
        # no stored edit for yet count as newly touched pages.
        for batch_key, pages in modified_pages.items():
            batch = batches.get(batch_key)
            existing_pages = set(
                batch.edits.filter(title__in=pages).values_list(
                    'title', flat=True))
            unseen_pages = pages - existing_pages
            batch.nb_distinct_pages += len(unseen_pages)

        # Create all the edit objects
        try:
            with transaction.atomic():
                Edit.objects.bulk_create(model_edits)
        except IntegrityError:
            # Oops! Some of them existed already!
            # Let's add them one by one instead.
            for edit in model_edits:
                try:
                    Edit.objects.get(id=edit.id)
                except Edit.DoesNotExist:
                    edit.save()
                    continue

                # This edit was already seen: we need to remove it from the
                # counters of the associated batch, which were incremented
                # above as if the edit were new.
                batch_key = (edit.batch.tool.shortid, edit.batch.uid)
                batch = batches.get(batch_key)
                if batch:
                    batch.nb_edits -= 1
                    batch.total_diffsize -= edit.newlength - edit.oldlength
                    if edit.changetype == 'new':
                        batch.nb_new_pages -= 1
                    if edit.reverted:
                        batch.nb_reverted_edits -= 1

        # update batch objects
        Batch.objects.bulk_update(
            list(batches.values()),
            update_fields=[
                'ended', 'nb_edits', 'nb_distinct_pages',
                'nb_reverted_edits', 'nb_new_pages', 'total_diffsize'
            ])

        # update tags for batches
        if new_tags:
            Tag.add_tags_to_batches(new_tags)

    # If we saw any "undo" edit, mark all matching edits as reverted.
    # We do this after creating the latest edits because it could be possible
    # that an edit from the batch we just processed was undone in the same go.
    if reverted_ids:
        cls.mark_as_reverted(
            Edit.objects.filter(newrevid__in=reverted_ids))
def ingest_edits(cls, json_batch):
    """
    Store the edits found in a batch of change events and refresh the
    'ended' and 'nb_edits' fields of the batches they belong to.

    :param json_batch: iterable of event dicts; falsy entries are skipped.
        The keys read from each event are 'timestamp' (seconds since the
        epoch), 'comment' and 'user'.
    :returns: None; the results are written through the Batch, Edit and
        Tag models.

    Events whose comment matches cls.reverted_re additionally flag the
    edits carrying the referenced revision id as reverted.
    """
    batch_by_key = {}  # (tool shortid, batch uid) -> Batch
    pending_edits = []
    undone_revids = []
    tags_by_batch = defaultdict(set)
    known_tools = Tool.objects.all()

    for event in json_batch:
        if not event:
            continue

        when = datetime.fromtimestamp(event['timestamp'], tz=UTC)
        comment = event['comment']

        # An "undo" comment names the revision it reverts.
        undo = cls.reverted_re.match(comment)
        if undo:
            undone_revids.append(int(undo.group(1)))

        # Find the first tool recognising this edit; events that no tool
        # claims are not part of any batch and are dropped here.
        for candidate in known_tools:
            hit = candidate.match(event['user'], comment)
            if hit is not None:
                break
        else:
            continue

        # Reuse the batch seen earlier in this run, or load/create it.
        key = (candidate.shortid, hit.uid)
        current = batch_by_key.get(key)
        freshly_created = False
        if not current:
            current, freshly_created = Batch.objects.get_or_create(
                tool=candidate,
                uid=hit.uid,
                defaults={
                    'user': hit.user,
                    'summary': hit.summary,
                    'started': when,
                    'ended': when,
                    'nb_edits': 0,
                },
            )

        # A batch only accepts edits made by its owner.
        if current.user != hit.user:
            if freshly_created:
                current.delete()
            continue

        current.nb_edits += 1
        current.ended = max(current.ended, when)
        batch_by_key[key] = current

        new_edit = Edit.from_json(event, current)
        pending_edits.append(new_edit)

        # Remember the tags of this edit that the batch does not carry yet.
        absent_tag_ids = {
            tag.id
            for tag in Tag.extract(new_edit)
            if tag.id not in current.tag_ids
        }
        tags_by_batch[current.id] |= absent_tag_ids

    if batch_by_key:
        try:
            with transaction.atomic():
                Edit.objects.bulk_create(pending_edits)
        except IntegrityError:
            # At least one edit is already stored: fall back to saving them
            # one at a time, and take the known ones out of the edit count
            # of their batch since they were counted above.
            for stale_or_new in pending_edits:
                try:
                    Edit.objects.get(id=stale_or_new.id)
                except Edit.DoesNotExist:
                    stale_or_new.save()
                    continue

                owner_key = (stale_or_new.batch.tool.shortid,
                             stale_or_new.batch.uid)
                owner = batch_by_key.get(owner_key)
                if owner:
                    owner.nb_edits -= 1

        Batch.objects.bulk_update(list(batch_by_key.values()),
                                  update_fields=['ended', 'nb_edits'])

        if tags_by_batch:
            Tag.add_tags_to_batches(tags_by_batch)

    # Flag every stored edit that an "undo" in this run referred to.
    if undone_revids:
        Edit.objects.filter(newrevid__in=undone_revids).update(
            reverted=True)