def extract_seeds(self, set_specification, latest_update):
    queryset = self.get_queryset().filter(
        set_specification=set_specification,
        since__date__gte=latest_update.date(),
        status=200,
        is_extracted=False
    )

    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        seed_resource = {
            "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
            "id": harvest.id,
            "success": True
        }
        try:
            for seed in prc.extract_from_resource(harvest):
                seed["seed_resource"] = seed_resource
                results.append(seed)
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, harvest.uri)
    return results
def extract_seeds(self, latest_update):
    queryset = self.get_queryset() \
        .filter(since__date__gte=latest_update.date(), status=200)

    metadata_objective = {
        "@": "$.items",
        "external_id": "$.uuid",
        "state": BuasMetadataExtraction.get_record_state
    }
    metadata_objective.update(BuasMetadataExtraction.OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": metadata_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        seed_resource = {
            "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
            "id": harvest.id,
            "success": True
        }
        for seed in prc.extract_from_resource(harvest):
            seed["seed_resource"] = seed_resource
            results.append(seed)
    return results
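# Hedged sketch (not from the source): the seed_resource bookkeeping shared by
# both extract_seeds methods above, demonstrated with a stand-in harvest object.
# The app label and model name are illustrative; in the real code they come
# from Django's harvest._meta.
from types import SimpleNamespace

harvest = SimpleNamespace(
    id=42,
    _meta=SimpleNamespace(app_label="sources", model_name="buasoaipmh")  # hypothetical names
)
seed_resource = {
    "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
    "id": harvest.id,
    "success": True
}
assert seed_resource["resource"] == "sources.buasoaipmh"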
def get_edurep_query_seeds(query):
    queryset = EdurepSearch.objects.filter(request__contains=query)

    api_objective = {
        "@": EdurepDataExtraction.get_api_records,
        "external_id": EdurepDataExtraction.get_api_external_id,
        "state": EdurepDataExtraction.get_api_record_state
    }
    api_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": api_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for search in queryset.filter(status=200):
        try:
            results += list(prc.extract_from_resource(search))
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, search.uri)

    seeds = {}
    for seed in sorted(results, key=lambda rsl: rsl["publisher_date"] or ""):
        # Some records in Edurep do not have any known URL.
        # As we can't possibly process those we ignore them (silently).
        # If we want to fix this it should happen on Edurep's or Sharekit's side.
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL.
        # We should improve data extraction to always get source files.
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # And deduplicate entire seeds based on URL
        seeds[seed["url"]] = seed

    return seeds.values()
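# Hedged illustration of the Wikiwijs rewrite and URL-based deduplication in
# get_edurep_query_seeds above; the seed dicts are made up for the example.
seeds = {}
for seed in [
    {"url": "https://example.com/a", "mime_type": "text/html"},
    {"url": "https://example.com/pkg", "mime_type": "application/x-Wikiwijs-Arrangement"},
    {"url": "https://example.com/a", "mime_type": "text/html"},  # duplicate URL, overwrites
]:
    if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
        seed["package_url"] = seed["url"]  # keep the package location
        seed["url"] += "?p=imscp"          # point url at the source files
    seeds[seed["url"]] = seed

assert set(seeds) == {"https://example.com/a", "https://example.com/pkg?p=imscp"}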
def get_edurep_oaipmh_seeds(set_specification, latest_update, include_deleted=True):
    queryset = EdurepOAIPMH.objects \
        .filter(set_specification=set_specification, since__date__gte=latest_update.date(), status=200)

    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor", {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        try:
            results += list(prc.extract_from_resource(harvest))
        except ValueError as exc:
            logger.warning("Invalid XML: %s (%s)", exc, harvest.uri)

    seeds = []
    for seed in results:
        # Some records in Edurep do not have any known URL.
        # As we can't possibly process those we ignore them (silently).
        # If we want to fix this it should happen on Edurep's or Sharekit's side.
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if seed["state"] == "active" and not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL.
        # We should improve data extraction to always get source files.
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement" and seed.get("url", None):
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # We deduplicate based on the external_id, a UID assigned by Edurep
        seeds.append(seed)

    # Now we'll mark any invalid seeds as deleted to make sure they disappear.
    # Invalid seeds lack a usable copyright or are below the minimum education level.
    for seed in seeds:
        if not seed["copyright"] or seed["copyright"] == "no":
            seed["state"] = "deleted"
        if seed["lowest_educational_level"] < 1:  # lower level than MBO
            seed["state"] = "deleted"

    # And we return the seeds based on whether to include deleted or not
    return seeds if include_deleted else \
        [result for result in seeds if result.get("state", "active") == "active"]
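# Hedged sketch of the invalid-seed marking above, with illustrative data:
# seeds without a usable copyright, or below MBO level, end up "deleted".
def mark_invalid(seed):
    if not seed["copyright"] or seed["copyright"] == "no":
        seed["state"] = "deleted"
    if seed["lowest_educational_level"] < 1:  # lower level than MBO
        seed["state"] = "deleted"
    return seed

examples = [
    {"state": "active", "copyright": "cc-by-40", "lowest_educational_level": 2},
    {"state": "active", "copyright": "no", "lowest_educational_level": 2},
    {"state": "active", "copyright": "cc-by-40", "lowest_educational_level": 0},
]
assert [mark_invalid(s)["state"] for s in examples] == ["active", "deleted", "deleted"]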
def get_xml_processor(self, callables=False):
    at = "soup.find_all('result')" if not callables else ExtractTextImplementation.get_xml_elements
    link = "el.find('url').text" if not callables else ExtractTextImplementation.get_xml_link
    page = "soup.find('title').text" if not callables else ExtractTextImplementation.get_page_text
    objective = {
        "@": at,
        "text": "el.find('label').text",
        "link": link,
        "#page": page,
    }
    return ExtractProcessor(config={"objective": objective})
def get_json_processor(self, callables=False, object_values=False, from_dict=False):
    if not object_values and not from_dict:
        at = "$.records" if not callables else ExtractJSONImplementation.get_nodes
    elif from_dict:
        at = "$.records.0" if not callables else ExtractJSONImplementation.get_dict
    elif object_values:
        at = "$.keys" if not callables else ExtractJSONImplementation.get_keys_nodes
    unicode = "$.unicode.0" if not callables else ExtractJSONImplementation.get_json_unicode
    id = "$.id" if not callables else ExtractJSONImplementation.get_json_id
    objective = {
        "@": at,
        "#unicode": unicode,
        "#goal": "$.dict.dict.test",
        "id": id,
        "record": "$.record"
    }
    return ExtractProcessor(config={
        "objective": objective,
        "extract_from_object_values": object_values
    })
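# Hedged illustration (not datagrowth itself): a minimal re-implementation of
# the objective convention the processors above rely on. "@" names the node
# selector, "#key" entries become document-level context, and plain keys are
# extracted once per node, matching the _at/_context/_objective split asserted
# in test_init_and_load_objective below. Paths like "$.records" are simplified
# to single dict keys here.
def apply_objective(objective, data):
    resolve = lambda path, obj: obj[path.replace("$.", "")]
    nodes = resolve(objective["@"], data)
    context = {key[1:]: resolve(path, data)
               for key, path in objective.items() if key.startswith("#")}
    for node in nodes:
        values = {key: resolve(path, node)
                  for key, path in objective.items()
                  if not key.startswith(("@", "#"))}
        values.update(context)
        yield values

data = {"records": [{"id": 1}, {"id": 2}], "title": "demo"}
objective = {"@": "$.records", "#page": "$.title", "id": "$.id"}
assert list(apply_objective(objective, data)) == [
    {"id": 1, "page": "demo"},
    {"id": 2, "page": "demo"},
]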
class TestExtractProcessor(TestCase):

    def setUp(self):
        super().setUp()
        self.content_types = ["text/html", "text/xml", "application/json", "nothing/quantum"]
        self.html_obj = {
            "@": "soup.find_all('a')",
            "text": "el.text",
            "link": "el['href']",
            "#page": "soup.find('title').text",
        }
        self.html_prc = ExtractProcessor(config={"objective": self.html_obj})
        self.soup = BeautifulSoup(MOCK_HTML, "html5lib")
        self.xml_obj = {
            "@": "soup.find_all('result')",
            "text": "el.find('label').text",
            "link": "el.find('url').text",
            "#page": "soup.find('title').text",
        }
        self.xml_prc = ExtractProcessor(config={"objective": self.xml_obj})
        self.xml = BeautifulSoup(MOCK_XML, "lxml")
        self.json_obj = {
            "@": "$.records",
            "#unicode": "$.unicode.0",
            "#goal": "$.dict.dict.test",
            "id": "$.id",
            "record": "$.record"
        }
        self.json_prc = ExtractProcessor(config={"objective": self.json_obj})
        self.json_records = MOCK_DATA_WITH_RECORDS
        self.json_dict = MOCK_DATA_WITH_KEYS
        self.test_resources_data = [self.soup, self.xml, self.json_records, None]
        self.test_resources_extractions = [MOCK_SCRAPE_DATA, MOCK_SCRAPE_DATA, MOCK_JSON_DATA, None]
        self.test_resources = [
            (Mock(content=(content_type, data)), processor)
            for content_type, data, processor in zip(
                self.content_types,
                self.test_resources_data,
                [self.html_prc, self.xml_prc, self.json_prc, self.html_prc]
            )
        ]

    def test_init_and_load_objective(self):
        self.assertEqual(self.html_prc._at, "soup.find_all('a')")
        self.assertEqual(self.html_prc._context, {"page": "soup.find('title').text"})
        self.assertEqual(self.html_prc._objective, {"text": "el.text", "link": "el['href']"})

    def test_extract(self):
        self.html_prc.text_html = Mock()
        self.html_prc.text_xml = Mock()
        self.html_prc.application_json = Mock()
        for content_type in self.content_types:
            try:
                self.html_prc.extract(content_type, {"test": "test"})
            except TypeError:
                self.assertEqual(
                    content_type, "nothing/quantum",
                    "{} does not exist as a method on ExtractProcessor.".format(content_type)
                )
        self.assertTrue(self.html_prc.text_html.called)
        self.assertTrue(self.html_prc.text_xml.called)
        self.assertTrue(self.html_prc.application_json.called)
        self.assertEqual(self.html_prc.extract(None, None), [])

    def test_extract_from_resource(self):
        data = []
        try:
            for test_resource in self.test_resources:
                resource, processor = test_resource
                data.append(processor.extract_from_resource(resource))
            self.fail("Wrong content_type did not raise exception")
        except TypeError:
            pass
        for test_result, expected_data in zip(data, self.test_resources_extractions):
            self.assertIsInstance(test_result, GeneratorType)
            self.assertEqual(list(test_result), expected_data)

    def test_pass_resource_through(self):
        for test_resource, expected_data in zip(self.test_resources, self.test_resources_data):
            resource, processor = test_resource
            data = processor.pass_resource_through(resource)
            self.assertNotIsInstance(data, GeneratorType)
            self.assertIs(data, expected_data)

    def test_html_text(self):
        rsl = self.html_prc.text_html(self.soup)
        self.assertEqual(list(rsl), MOCK_SCRAPE_DATA)
        self.assertIsInstance(rsl, GeneratorType, "Extractors are expected to return generators.")

    def test_xml_text(self):
        rsl = self.xml_prc.text_xml(self.xml)
        self.assertEqual(list(rsl), MOCK_SCRAPE_DATA)
        self.assertIsInstance(rsl, GeneratorType, "Extractors are expected to return generators.")

    def test_application_json_records(self):
        rsl = self.json_prc.application_json(self.json_records)
        self.assertEqual(list(rsl), MOCK_JSON_DATA)
        self.assertIsInstance(rsl, GeneratorType, "Extractors are expected to return generators.")

    def test_application_json_dict(self):
        self.json_obj["@"] = "$.keys"
        keys_processor = ExtractProcessor(config={"objective": self.json_obj})
        rsl = keys_processor.application_json(self.json_dict)
        self.assertEqual(list(rsl), MOCK_JSON_DATA)
        self.assertIsInstance(rsl, GeneratorType, "Extractors are expected to return generators.")