def harvest_seeds(self, harvest, current_time):
    send_config = create_config("http_resource", {
        "resource": harvest.source.repository,
        "continuation_limit": 10000,
    })
    set_specification = harvest.source.spec
    scc, err = send(set_specification, f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                    config=send_config, method="get")
    if len(err):
        Resource = apps.get_model(harvest.source.repository)
        error_counter = Counter([
            error.status for error in Resource.objects.filter(id__in=err)
        ])
        raise CommandError(f"Failed to harvest seeds from {harvest.source.name}: {error_counter}")
    harvest.harvested_at = current_time
    harvest.save()
    return len(scc), len(err)
def sync_sharekit_metadata():
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials halfway through a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago this command will run a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning("Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    for harvest in harvest_queryset.filter(is_syncing=True):
        # Check that a non-valid harvest source didn't slip through the lock
        if harvest.stage != HarvestStages.COMPLETE:
            logger.warning("Encountered a non-complete harvest source during sync")
            continue
        # Record which time will become latest_update_at
        current_time = make_aware(datetime.now())
        # Get metadata from Sharekit and stop immediately if anything went wrong
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })
        set_specification = harvest.source.spec
        scc, err = send(set_specification, f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                        config=send_config, method="get")
        if len(err) or not len(scc):
            continue
        # Now parse the metadata and update the current Collection for this Harvest
        seeds = get_harvest_seeds(Repositories.SHAREKIT, set_specification, harvest.latest_update_at,
                                  include_no_url=True)
        collection = dataset_version.collection_set.filter(name=harvest.source.spec).last()
        for seeds_batch in ibatch(seeds, batch_size=32):
            collection.update(seeds_batch, "external_id")
        # Last but not least we update the harvest update time to get a different delta later
        harvest.latest_update_at = current_time
        harvest.save()
    # And we release the syncing lock
    with atomic():
        harvest_queryset.filter(is_syncing=True).select_for_update().update(is_syncing=False)
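# The locking used in sync_sharekit_metadata() in isolation: a minimal, hedged sketch of the
# same claim/release pattern, assuming a queryset whose model has an is_syncing boolean field.
# The helper names below are illustrative and not part of the harvester code.
from django.db import DatabaseError
from django.db.transaction import atomic


def claim_sync_lock(queryset):
    # select_for_update(nowait=True) raises DatabaseError right away if another process
    # already holds the row locks, instead of blocking until that process finishes
    try:
        with atomic():
            queryset.filter(is_syncing=False).select_for_update(nowait=True).update(is_syncing=True)
    except DatabaseError:
        return False
    return True


def release_sync_lock(queryset):
    # Flip the flag back so a next run can claim the rows again
    with atomic():
        queryset.filter(is_syncing=True).select_for_update().update(is_syncing=False)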
def transcribe_video_resources(self, video_download_ids, seeds):
    no_paths_count, invalid_paths_count, success_count, error_count = 0, 0, 0, 0
    kaldi_file_paths = defaultdict(list)
    # Preprocess the videos
    for video_download_resource in YouTubeDLResource.objects.filter(id__in=video_download_ids, status=0):
        # Make sure that the video has a valid audio file
        _, data = video_download_resource.content
        file_path = data.get("file_path", None)
        if not file_path:
            no_paths_count += 1
            continue
        if not os.path.exists(file_path):
            invalid_paths_count += 1
            continue
        # Try to transcribe the file based on metadata
        video_url = video_download_resource.variables()["url"]
        # TODO: determine the Kaldi model through metadata.
        # It's possible to pass the URL through get_edurep_basic_resources.
        # With the Tika and File resources it's much easier to determine a proper language.
        seed = seeds[video_url]
        title = seed.get("title", None)
        kaldi_model = get_kaldi_model_from_snippet(title)
        kaldi_file_paths[kaldi_model].append(file_path)
    no_language_count = len(kaldi_file_paths.pop(None, []))
    # Actual transcribing
    for kaldi_model, paths in kaldi_file_paths.items():
        config = create_config("shell_resource", {"resource": kaldi_model})
        sccs, errs = run_serie(
            self.progress([[path] for path in paths]),
            [{} for _ in paths],
            config=config
        )
        success_count += len(sccs)
        error_count += len(errs)
    return no_paths_count, invalid_paths_count, no_language_count, success_count, error_count
def extract_seeds(self, set_specification, latest_update):
    queryset = self.get_queryset().filter(
        set_specification=set_specification,
        since__date__gte=latest_update.date(),
        status=200,
        is_extracted=False
    )
    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)
    results = []
    for harvest in queryset:
        seed_resource = {
            "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
            "id": harvest.id,
            "success": True
        }
        try:
            for seed in prc.extract_from_resource(harvest):
                seed["seed_resource"] = seed_resource
                results.append(seed)
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, harvest.uri)
    return results
def download_seed_videos(self, video_seeds):
    config = create_config("http_resource", {"resource": "pol_harvester.YouTubeDLResource"})
    return run_serie(  # TODO: make this parallel
        self.progress([[seed["url"]] for seed in video_seeds]),
        [{} for _ in video_seeds],
        config=config
    )
def process_batch(self, batch):
    config = create_config(self.resource_type, self.config.retrieve_data)
    app_label, resource_model = config.resource.split(".")
    resource_type = ContentType.objects.get_by_natural_key(app_label, resource_model)
    updates = []
    creates = []
    for process_result in batch.processresult_set.all():
        args, kwargs = process_result.document.output(config.args, config.kwargs)
        successes, fails = self.dispatch_resource(config, *args, **kwargs)
        results = successes + fails
        if not len(results):
            continue
        result_id = results.pop(0)
        process_result.result_type = resource_type
        process_result.result_id = result_id
        updates.append(process_result)
        for result_id in results:
            # TODO: create docs here where necessary
            creates.append(
                self.ProcessResult(
                    document=process_result.document,
                    batch=batch,
                    result_id=result_id,
                    result_type=resource_type
                )
            )
    self.ProcessResult.objects.bulk_create(creates)
    self.ProcessResult.objects.bulk_update(updates, ["result_type", "result_id"])
def extract_seeds(self, latest_update):
    queryset = self.get_queryset() \
        .filter(since__date__gte=latest_update.date(), status=200)
    metadata_objective = {
        "@": "$.items",
        "external_id": "$.uuid",
        "state": BuasMetadataExtraction.get_record_state
    }
    metadata_objective.update(BuasMetadataExtraction.OBJECTIVE)
    extract_config = create_config("extract_processor", {
        "objective": metadata_objective
    })
    prc = ExtractProcessor(config=extract_config)
    results = []
    for harvest in queryset:
        seed_resource = {
            "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
            "id": harvest.id,
            "success": True
        }
        for seed in prc.extract_from_resource(harvest):
            seed["seed_resource"] = seed_resource
            results.append(seed)
    return results
def get_edurep_query_seeds(query):
    queryset = EdurepSearch.objects.filter(request__contains=query)
    api_objective = {
        "@": EdurepDataExtraction.get_api_records,
        "external_id": EdurepDataExtraction.get_api_external_id,
        "state": EdurepDataExtraction.get_api_record_state
    }
    api_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": api_objective})
    prc = ExtractProcessor(config=extract_config)
    results = []
    for search in queryset.filter(status=200):
        try:
            results += list(prc.extract_from_resource(search))
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, search.uri)
    seeds = {}
    for seed in sorted(results, key=lambda rsl: rsl["publisher_date"] or ""):
        # Some records in Edurep do not have any known URL.
        # As we can't possibly process those we ignore them (silently).
        # If we want to fix this it should happen on Edurep's or Sharekit's side.
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL.
        # We should improve data extraction to always get source files.
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # And deduplicate entire seeds based on URL
        seeds[seed["url"]] = seed
    return seeds.values()
def test_purge_after(self):
    instance = self.get_test_instance()
    instance.config = create_config("global", {
        "purge_after": {"days": 30}
    })
    instance.clean()
    self.assertIsNotNone(instance.purge_at)
    self.assertEqual(instance.purge_at.date() - date.today(), timedelta(days=30))
def test_create_config(self):
    test_config = create_config("name", {
        "test": "public",
        "_test2": "protected",
        "_test3": "protected 2"
    })
    self.assertIsNone(test_config._defaults)
    self.assertIsInstance(test_config, ConfigurationType)
    self.assertEqual(test_config.test, "public")
    self.assertEqual(test_config.test2, "protected")
    self.assertEqual(test_config.test3, "protected 2")
    self.assertEqual(test_config._test2, "protected")
    self.assertEqual(test_config._test3, "protected 2")
def test_create_config_registered_defaults(self):
    register_defaults("name", {"test4": "namespaced default"})
    test_config = create_config("name", {
        "test": "public",
        "_test2": "protected",
        "_test3": "protected 2"
    })
    self.assertIsNotNone(test_config._defaults)
    self.assertIsInstance(test_config, ConfigurationType)
    self.assertEqual(test_config._namespace, "name")
    self.assertEqual(test_config.test4, "namespaced default")
    self.assertEqual(test_config._defaults, DATAGROWTH_DEFAULT_CONFIGURATION)
def test_create_config_registered_defaults(self):
    register_defaults("name", {
        "test4": "namespaced default"
    })
    test_config = create_config("name", {
        "test": "public",
        "_test2": "protected",
        "_test3": "protected 2"
    })
    self.assertIsNotNone(test_config._defaults)
    self.assertIsInstance(test_config, ConfigurationType)
    self.assertEqual(test_config._namespace, "name")
    self.assertEqual(test_config.test4, "namespaced default")
    self.assertEqual(test_config._defaults, DEFAULT_CONFIGURATION)
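# A short usage sketch (not one of the tests above) restating what they verify about
# create_config() and register_defaults(): "_"-prefixed keys become protected attributes
# that remain readable without the prefix, and registered namespaced defaults fill in
# missing keys. The namespace and keys below are made up for illustration.
register_defaults("example", {"timeout": 30})
example_config = create_config("example", {
    "resource": "app_label.SomeResource",
    "_secret": "protected value",
})
assert example_config.resource == "app_label.SomeResource"
assert example_config.secret == example_config._secret == "protected value"
assert example_config.timeout == 30  # falls back to the registered namespaced default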
def get_edurep_oaipmh_seeds(set_specification, latest_update, include_deleted=True):
    queryset = EdurepOAIPMH.objects \
        .filter(set_specification=set_specification, since__date__gte=latest_update.date(), status=200)
    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)
    results = []
    for harvest in queryset:
        try:
            results += list(prc.extract_from_resource(harvest))
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, harvest.uri)
    seeds = []
    for seed in results:
        # Some records in Edurep do not have any known URL.
        # As we can't possibly process those we ignore them (silently).
        # If we want to fix this it should happen on Edurep's or Sharekit's side.
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if seed["state"] == "active" and not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL.
        # We should improve data extraction to always get source files.
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement" and seed.get("url", None):
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # We deduplicate based on the external_id, a UID by Edurep
        seeds.append(seed)
    # Now we'll mark any invalid seeds as deleted to make sure they disappear.
    # Invalid seeds have a copyright or are of insufficient education level.
    for seed in seeds:
        if not seed["copyright"] or seed["copyright"] == "no":
            seed["state"] = "deleted"
        if seed["lowest_educational_level"] < 1:  # lower level than MBO
            seed["state"] = "deleted"
    # And we return the seeds based on whether to include deleted or not
    return seeds if include_deleted else \
        [result for result in seeds if result.get("state", "active") == "active"]
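# A hedged usage sketch of get_edurep_oaipmh_seeds(); the set specification and cut-off date
# below are made up. By default deleted seeds are included so removals can be synced; pass
# include_deleted=False when only active material is needed.
from datetime import datetime

latest_update = datetime(2020, 1, 1)
all_seeds = get_edurep_oaipmh_seeds("example-set", latest_update)
active_seeds = get_edurep_oaipmh_seeds("example-set", latest_update, include_deleted=False)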
def handle(self, *args, **options):
    freeze_name = options["freeze"]
    dummy = options["dummy"]
    if not dummy:
        self.prepare_harvest(freeze_name)
    harvest_queryset = EdurepHarvest.objects.filter(
        freeze__name=freeze_name,
        stage=HarvestStages.NEW
    )
    if not harvest_queryset.exists():
        raise EdurepHarvest.DoesNotExist(
            f"There are no NEW EdurepHarvest objects for '{freeze_name}'"
        )
    self.header("EDUREP SEEDS HARVEST", options)
    # Call the Edurep OAI-PMH interface to get the Edurep metadata about learning materials
    self.info("Fetching metadata for sources ...")
    send_config = create_config("http_resource", {
        "resource": "edurep.EdurepOAIPMH",
        "continuation_limit": 1000,
    })
    current_time = now()
    successes = defaultdict(int)
    fails = defaultdict(int)
    for harvest in self.progress(harvest_queryset, total=harvest_queryset.count()):
        set_specification = harvest.source.collection_name
        scc, err = send(set_specification, f"{harvest.latest_update_at:%Y-%m-%d}",
                        config=send_config, method="get")
        if len(err):
            raise CommandError("Failed to harvest seeds from Edurep OAI-PMH")
        successes[set_specification] += len(scc)
        fails[set_specification] += len(err)
        if not dummy:
            harvest.harvested_at = current_time
            harvest.save()
    self.info('Failed OAI-PMH calls: ', fails)
    self.info('Successful OAI-PMH calls: ', successes)
    success_count = sum(successes.values())
    fail_count = sum(fails.values())
    return f'OAI-PMH: {success_count}/{success_count + fail_count}'
def merge_batch(self, batch):
    pipeline_phase = self.config.pipeline_phase
    config = create_config("extract_processor", self.config.contribute_data)
    contribution_processor = config.extractor
    contribution_property = config.to_property
    while True:
        documents = []
        for process_result in batch.processresult_set.filter(result_id__isnull=False):
            result = process_result.result
            # Write results to the pipeline
            process_result.document.pipeline[pipeline_phase] = {
                "success": result.success,
                "resource": f"{result._meta.app_label}.{result._meta.model_name}",
                "id": result.id
            }
            documents.append(process_result.document)
            # Write data to the Document
            extractor_name, method_name = Processor.get_processor_components(contribution_processor)
            extractor_class = Processor.get_processor_class(extractor_name)
            extractor = extractor_class(config)
            extractor_method = getattr(extractor, method_name)
            contributions = list(extractor_method(result))
            if not len(contributions):
                continue
            contribution = contributions.pop(0)  # TODO: create docs here where necessary
            if contribution_property is None:
                process_result.document.properties.update(contribution)
            else:
                process_result.document.properties[contribution_property] = contribution
        # We'll be locking the Documents for update to prevent accidental overwrite of parallel results
        with transaction.atomic():
            try:
                list(self.Document.objects.filter(id__in=[doc.id for doc in documents]).select_for_update())
            except transaction.DatabaseError:
                continue
            self.Document.objects.bulk_update(documents, ["pipeline", "properties"])
            break
def extract_seeds(self, latest_update):
    latest_update = latest_update.replace(microsecond=0)
    queryset = self.get_queryset().filter(since__gte=latest_update, status=200, is_extracted=False)
    extract_config = create_config("extract_processor", {"objective": self._create_objective()})
    prc = HanzeResourceObjectExtraction(config=extract_config)
    results = []
    for harvest in queryset:
        seed_resource = {
            "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
            "id": harvest.id,
            "success": True
        }
        for seed in prc.extract_from_resource(harvest):
            seed["seed_resource"] = seed_resource
            results.append(seed)
    return results
def edit_document_webhook(request, channel, secret):
    # Webhook validation
    if str(secret) != settings.HARVESTER_WEBHOOK_SECRET:
        return HttpResponse(status=403, reason="Webhook not allowed in this environment")
    if request.META["HTTP_X_FORWARDED_FOR"] not in settings.SHAREKIT_WEBHOOK_ALLOWED_IPS:
        capture_message(
            f"edit_document_webhook called from invalid IP: {request.META['HTTP_X_FORWARDED_FOR']}",
            level="warning"
        )
        return HttpResponse(status=403, reason="Webhook not allowed from source")
    try:
        data = json.loads(request.body)
    except json.decoder.JSONDecodeError:
        return HttpResponse(status=400, reason="Invalid JSON")
    # Patch data coming from Sharekit to be consistent
    if isinstance(data["attributes"], list):
        data["attributes"] = {}
    # Processing of incoming data
    extract_config = create_config(
        "extract_processor",
        {"objective": create_objective(root="$", include_is_restricted=False)}
    )
    prc = SharekitMetadataExtraction(config=extract_config)
    seed = next(prc.extract("application/json", data))
    seed["is_restricted"] = channel == "edusourcesprivate"
    prepare_seed(seed)
    # Commit changes to the database
    dataset_version = DatasetVersion.objects.get_current_version()
    collection = dataset_version.collection_set.filter(name=channel).last()
    collection.update([seed], "external_id")
    # Finish webhook request
    logger = HarvestLogger(dataset_version.dataset.name, "edit_document_webhook", {})
    logger.report_material(seed["external_id"], title=seed["title"], url=seed["url"])
    return HttpResponse("ok")
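# Hypothetical URL wiring for the webhook view above; the route, the channel value and the use
# of a UUID path converter for the secret are assumptions for illustration, not taken from the
# project's actual urls module.
from django.urls import path

urlpatterns = [
    path("webhook/<str:channel>/<uuid:secret>/", edit_document_webhook, name="edit-document-webhook"),
]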
def handle(self, *args, **options):
    freeze = Freeze.objects.get(name=options["freeze"])
    videos = [
        (doc.reference, doc.properties["url"],)
        for doc in freeze.documents.filter(reference__in=HBOVPK_TEST_REFERENCES)
    ]
    successes = []
    errors = []
    for ref, url in tqdm(videos):
        try:
            download = YouTubeDLResource().run(url)
        except DGShellError:
            print("Download does not exist")
            continue
        if not download.success:
            print("Download error")
            continue
        _, data = download.content
        file_path = data.get("file_path", None)
        if not file_path:
            print("Download missing file in output")
            continue
        config = create_config("shell_resource", {
            "resource": "pol_harvester.kaldinlresource",
            "reference": ref
        })
        if not os.path.exists(file_path):
            print("Download missing file")
            continue
        sccs, errs = run(file_path, config=config)
        successes += sccs
        errors += errs
def handle(self, *args, **options):
    language = options["language"]
    category_namespace = self.CATEGORY_NAMESPACES[language]
    categories = options["categories"]
    corpus_name = "-".join(sorted([
        category.replace(category_namespace, "") for category in categories
    ]))
    results = []
    for category in categories:
        category_name = category.replace(category_namespace, "")
        send_config = create_config("http_resource", {
            "resource": "pol_harvester.wikipediacategorymembers",
            "wiki_country": language,
            "continuation_limit": 100
        })
        scc, err = send(category, config=send_config, method="get")
        print(f"Send {category_name}:", scc, err)
        resources = WikipediaCategoryMembers.objects.filter(id__in=scc)
        extract_config = {
            "objective": {
                "@": "$.query.pages",
                "pageid": "$.pageid",
                "title": "$.title",
                "categories": "$.categories",
                "wikidata": "$.pageprops.wikibase_item",
                "wikitext": "$.revisions.0.slots.main.*"
            }
        }
        prc = ExtractProcessor(config=extract_config)
        for resource in resources:
            results += prc.extract_from_resource(resource)
    corpus, created = Corpus.objects.get_or_create(name=corpus_name, identifier="pageid", schema={})
    articles = []
    for result in results:
        if not result["wikitext"]:
            continue
        result["text"] = self.clean_text(
            mwparserfromhell.parse(result["wikitext"]).strip_code(),
            category_namespace
        )
        articles.append(Article(properties=result, collection=corpus, schema={}))
    corpus.add(articles, reset=True)
    vectorizer = CountVectorizer()
    vectorizer.fit_transform([
        self.clean_text(doc.properties["text"], category_namespace)
        for doc in corpus.documents.all()
    ])
    dst = os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR, "custom_vocabulary", language)
    os.makedirs(dst, exist_ok=True)
    joblib.dump(vectorizer, os.path.join(dst, corpus_name + ".pkl"))
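# A hedged sketch of reading the persisted vocabulary back; the import path for
# datagrowth_settings, the language and the corpus name are assumptions that mirror the
# names used in the command above.
import os

import joblib

from datagrowth import settings as datagrowth_settings

dst = os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR, "custom_vocabulary", "en")
vectorizer = joblib.load(os.path.join(dst, "example-corpus.pkl"))
counts = vectorizer.transform(["cleaned article text to vectorize"])  # sparse document-term counts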