Example #1
0
 def test_application_json_dict(self):
     """Extraction from a keyed JSON dict should yield the mocked records."""
     # Point the root extractor at the dict's keys instead of a record list.
     self.json_obj["@"] = "$.keys"
     processor = ExtractProcessor(config={"objective": self.json_obj})
     extracted = processor.application_json(self.json_dict)
     self.assertIsInstance(extracted, GeneratorType,
                           "Extractors are expected to return generators.")
     self.assertEqual(list(extracted), MOCK_JSON_DATA)
Example #2
0
    def extract_seeds(self, set_specification, latest_update):
        """
        Extract seeds from successfully harvested, not-yet-extracted resources.

        :param set_specification: OAI-PMH set specification to filter harvests on
        :param latest_update: datetime; only harvests since this date are used
        :return: list of seed dicts, each annotated with its source resource
        """
        queryset = self.get_queryset().filter(
            set_specification=set_specification,
            since__date__gte=latest_update.date(),
            status=200,
            is_extracted=False)

        # Callables for the record root, id and state; merged with the
        # declarative field objective before building the processor.
        oaipmh_objective = {
            "@": EdurepDataExtraction.get_oaipmh_records,
            "external_id": EdurepDataExtraction.get_oaipmh_external_id,
            "state": EdurepDataExtraction.get_oaipmh_record_state
        }
        oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
        extract_config = create_config("extract_processor",
                                       {"objective": oaipmh_objective})
        prc = ExtractProcessor(config=extract_config)

        results = []
        for harvest in queryset:
            seed_resource = {
                "resource":
                f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            try:
                for seed in prc.extract_from_resource(harvest):
                    seed["seed_resource"] = seed_resource
                    results.append(seed)
            except ValueError as exc:
                # Lazy %-style args so the exception and URI actually appear
                # in the log record; extra positional args to logger.warning
                # are interpolation values for the message, not printed text.
                logger.warning("Invalid XML: %s %s", exc, harvest.uri)
        return results
Example #3
0
    def extract_seeds(self, latest_update):
        """
        Collect seeds from every successfully fetched harvest since
        ``latest_update`` and tag each seed with its source resource.
        """
        harvests = self.get_queryset() \
            .filter(since__date__gte=latest_update.date(), status=200)

        # Declarative root/id extractors plus a callable for record state,
        # merged with the class-level field objective.
        objective = {
            "@": "$.items",
            "external_id": "$.uuid",
            "state": BuasMetadataExtraction.get_record_state
        }
        objective.update(BuasMetadataExtraction.OBJECTIVE)
        config = create_config("extract_processor", {
            "objective": objective
        })
        processor = ExtractProcessor(config=config)

        seeds = []
        for harvest in harvests:
            resource_info = {
                "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            for seed in processor.extract_from_resource(harvest):
                seed["seed_resource"] = resource_info
                seeds.append(seed)
        return seeds
Example #4
0
def get_edurep_query_seeds(query):
    """
    Extract and deduplicate seeds from Edurep search results matching a query.

    :param query: substring to match against stored search requests
    :return: dict_values of seed dicts, deduplicated by URL (later
             publisher_date wins because results are sorted before insertion)
    """
    queryset = EdurepSearch.objects.filter(request__contains=query)

    api_objective = {
        "@": EdurepDataExtraction.get_api_records,
        "external_id": EdurepDataExtraction.get_api_external_id,
        "state": EdurepDataExtraction.get_api_record_state
    }
    api_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": api_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for search in queryset.filter(status=200):
        try:
            results += list(prc.extract_from_resource(search))
        except ValueError as exc:
            # Lazy %-style args so the exception and URI actually appear in
            # the log; extra positional args to warning() are interpolation
            # values for the message, not extra printed text.
            err.warning("Invalid XML: %s %s", exc, search.uri)
    seeds = {}
    for seed in sorted(results, key=lambda rsl: rsl["publisher_date"] or ""):
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if not seed["url"]:
            continue
        # We adjust url's of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # And deduplicate entire seeds based on URL
        seeds[seed["url"]] = seed
    return seeds.values()
Example #5
0
def get_edurep_oaipmh_seeds(set_specification,
                            latest_update,
                            include_deleted=True):
    """
    Extract seeds from Edurep OAI-PMH harvests for a set since a given date.

    :param set_specification: OAI-PMH set specification to filter harvests on
    :param latest_update: datetime; only harvests since this date are used
    :param include_deleted: when False, seeds marked "deleted" are filtered out
    :return: list of seed dicts
    """
    queryset = EdurepOAIPMH.objects\
        .filter(set_specification=set_specification, since__date__gte=latest_update.date(), status=200)

    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        try:
            results += list(prc.extract_from_resource(harvest))
        except ValueError as exc:
            # Lazy %-style args so the exception and URI actually appear in
            # the log; extra positional args to warning() are interpolation
            # values for the message, not extra printed text.
            err.warning("Invalid XML: %s %s", exc, harvest.uri)
    seeds = []
    for seed in results:
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if seed["state"] == "active" and not seed["url"]:
            continue
        # We adjust url's of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement" and seed.get(
                "url", None):
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # NOTE(review): the original comment claimed deduplication by
        # external_id here, but no deduplication happens in this loop —
        # presumably it occurs downstream; verify against callers.
        seeds.append(seed)
    # Now we'll mark any invalid seeds as deleted to make sure they disappear
    # Invalid seeds have a copyright or are of insufficient education level
    for seed in seeds:
        if not seed["copyright"] or seed["copyright"] == "no":
            seed["state"] = "deleted"
        if seed["lowest_educational_level"] < 1:  # lower level than MBO
            seed["state"] = "deleted"
    # And we return the seeds based on whether to include deleted or not
    return seeds if include_deleted else \
        [result for result in seeds if result.get("state", "active") == "active"]
Example #6
0
 def get_xml_processor(self, callables=False):
     """Build an XML ExtractProcessor, optionally with callable extractors."""
     if callables:
         at = ExtractTextImplementation.get_xml_elements
         link = ExtractTextImplementation.get_xml_link
         page = ExtractTextImplementation.get_page_text
     else:
         at = "soup.find_all('result')"
         link = "el.find('url').text"
         page = "soup.find('title').text"
     objective = {
         "@": at,
         "text": "el.find('label').text",
         "link": link,
         "#page": page,
     }
     return ExtractProcessor(config={"objective": objective})
Example #7
0
    def setUp(self):
        """Build the processors and mock resources shared by the tests."""
        # Zero-argument super() follows this class's own MRO; the original
        # super(TestCase, self) skipped TestCase.setUp entirely by starting
        # the lookup *after* TestCase.
        super().setUp()

        self.content_types = [
            "text/html", "text/xml", "application/json", "nothing/quantum"
        ]

        # HTML objective: anchors as records, page title as context.
        self.html_obj = {
            "@": "soup.find_all('a')",
            "text": "el.text",
            "link": "el['href']",
            "#page": "soup.find('title').text",
        }
        self.html_prc = ExtractProcessor(config={"objective": self.html_obj})
        self.soup = BeautifulSoup(MOCK_HTML, "html5lib")

        # XML objective: <result> elements as records.
        self.xml_obj = {
            "@": "soup.find_all('result')",
            "text": "el.find('label').text",
            "link": "el.find('url').text",
            "#page": "soup.find('title').text",
        }
        self.xml_prc = ExtractProcessor(config={"objective": self.xml_obj})
        self.xml = BeautifulSoup(MOCK_XML, "lxml")

        # JSON objective: JSONPath strings, "#"-prefixed keys are context.
        self.json_obj = {
            "@": "$.records",
            "#unicode": "$.unicode.0",
            "#goal": "$.dict.dict.test",
            "id": "$.id",
            "record": "$.record"
        }
        self.json_prc = ExtractProcessor(config={"objective": self.json_obj})
        self.json_records = MOCK_DATA_WITH_RECORDS
        self.json_dict = MOCK_DATA_WITH_KEYS

        self.test_resources_data = [
            self.soup, self.xml, self.json_records, None
        ]
        self.test_resources_extractions = [
            MOCK_SCRAPE_DATA, MOCK_SCRAPE_DATA, MOCK_JSON_DATA, None
        ]
        # Pair each mocked resource with the processor that should handle it;
        # the unknown content type is paired with the HTML processor to test
        # failure behavior.
        self.test_resources = [(
            Mock(content=(content_type, data)),
            processor,
        ) for content_type, data, processor in zip(
            self.content_types, self.test_resources_data,
            [self.html_prc, self.xml_prc, self.json_prc, self.html_prc])]
Example #8
0
 def get_json_processor(self,
                        callables=False,
                        object_values=False,
                        from_dict=False):
     """
     Build a JSON ExtractProcessor for the mock data.

     :param callables: use callable extractors instead of JSONPath strings
     :param object_values: extract from the values of a keyed object
     :param from_dict: extract from a single dict instead of a record list
     :return: configured ExtractProcessor
     """
     # Select the "@" (record root) extractor. When both from_dict and
     # object_values are passed, from_dict wins (same branch order as before).
     if not object_values and not from_dict:
         at = "$.records" if not callables else ExtractJSONImplementation.get_nodes
     elif from_dict:
         at = "$.records.0" if not callables else ExtractJSONImplementation.get_dict
     elif object_values:
         at = "$.keys" if not callables else ExtractJSONImplementation.get_keys_nodes
     # Renamed locals: the original "id" shadowed the id() builtin.
     unicode_extractor = "$.unicode.0" if not callables else ExtractJSONImplementation.get_json_unicode
     id_extractor = "$.id" if not callables else ExtractJSONImplementation.get_json_id
     objective = {
         "@": at,
         "#unicode": unicode_extractor,
         "#goal": "$.dict.dict.test",
         "id": id_extractor,
         "record": "$.record"
     }
     return ExtractProcessor(config={
         "objective": objective,
         "extract_from_object_values": object_values
     })
Example #9
0
class TestExtractProcessor(TestCase):
    """Tests for ExtractProcessor across HTML, XML and JSON content types."""

    def setUp(self):
        # Zero-argument super() follows this class's own MRO; the original
        # super(TestCase, self) skipped TestCase.setUp entirely by starting
        # the lookup *after* TestCase.
        super().setUp()

        self.content_types = [
            "text/html", "text/xml", "application/json", "nothing/quantum"
        ]

        self.html_obj = {
            "@": "soup.find_all('a')",
            "text": "el.text",
            "link": "el['href']",
            "#page": "soup.find('title').text",
        }
        self.html_prc = ExtractProcessor(config={"objective": self.html_obj})
        self.soup = BeautifulSoup(MOCK_HTML, "html5lib")

        self.xml_obj = {
            "@": "soup.find_all('result')",
            "text": "el.find('label').text",
            "link": "el.find('url').text",
            "#page": "soup.find('title').text",
        }
        self.xml_prc = ExtractProcessor(config={"objective": self.xml_obj})
        self.xml = BeautifulSoup(MOCK_XML, "lxml")

        self.json_obj = {
            "@": "$.records",
            "#unicode": "$.unicode.0",
            "#goal": "$.dict.dict.test",
            "id": "$.id",
            "record": "$.record"
        }
        self.json_prc = ExtractProcessor(config={"objective": self.json_obj})
        self.json_records = MOCK_DATA_WITH_RECORDS
        self.json_dict = MOCK_DATA_WITH_KEYS

        self.test_resources_data = [
            self.soup, self.xml, self.json_records, None
        ]
        self.test_resources_extractions = [
            MOCK_SCRAPE_DATA, MOCK_SCRAPE_DATA, MOCK_JSON_DATA, None
        ]
        # Pair each mocked resource with the processor that should handle it;
        # the unknown content type gets the HTML processor to test failures.
        self.test_resources = [(
            Mock(content=(content_type, data)),
            processor,
        ) for content_type, data, processor in zip(
            self.content_types, self.test_resources_data,
            [self.html_prc, self.xml_prc, self.json_prc, self.html_prc])]

    def test_init_and_load_objective(self):
        """The objective splits into root ("@"), context ("#") and fields."""
        self.assertEqual(self.html_prc._at, "soup.find_all('a')")
        self.assertEqual(self.html_prc._context,
                         {"page": "soup.find('title').text"})
        self.assertEqual(self.html_prc._objective, {
            "text": "el.text",
            "link": "el['href']"
        })

    def test_extract(self):
        """extract() dispatches to a method named after the content type."""
        self.html_prc.text_html = Mock()
        self.html_prc.text_xml = Mock()
        self.html_prc.application_json = Mock()
        for content_type in self.content_types:
            try:
                self.html_prc.extract(content_type, {"test": "test"})
            except TypeError:
                self.assertEqual(
                    content_type, "nothing/quantum",
                    "{} does not exist as a method on ExtractProcessor.".
                    format(content_type))
        self.assertTrue(self.html_prc.text_html.called)
        self.assertTrue(self.html_prc.text_xml.called)
        self.assertTrue(self.html_prc.application_json.called)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.html_prc.extract(None, None), [])

    def test_extract_from_resource(self):
        """Resources extract lazily; unknown content types raise TypeError."""
        data = []
        try:
            for test_resource in self.test_resources:
                resource, processor = test_resource
                data.append(processor.extract_from_resource(resource))
            self.fail("Wrong content_type did not raise exception")
        except TypeError:
            pass
        for test_result, expected_data in zip(data,
                                              self.test_resources_extractions):
            self.assertIsInstance(test_result, GeneratorType)
            self.assertEqual(list(test_result), expected_data)

    def test_pass_resource_through(self):
        """pass_resource_through returns the raw data object unmodified."""
        for test_resource, expected_data in zip(self.test_resources,
                                                self.test_resources_data):
            resource, processor = test_resource
            data = processor.pass_resource_through(resource)
            self.assertNotIsInstance(data, GeneratorType)
            self.assertIs(data, expected_data)

    def test_html_text(self):
        rsl = self.html_prc.text_html(self.soup)
        self.assertEqual(list(rsl), MOCK_SCRAPE_DATA)
        self.assertIsInstance(rsl, GeneratorType,
                              "Extractors are expected to return generators.")

    def test_xml_text(self):
        rsl = self.xml_prc.text_xml(self.xml)
        self.assertEqual(list(rsl), MOCK_SCRAPE_DATA)
        self.assertIsInstance(rsl, GeneratorType,
                              "Extractors are expected to return generators.")

    def test_application_json_records(self):
        rsl = self.json_prc.application_json(self.json_records)
        self.assertEqual(list(rsl), MOCK_JSON_DATA)
        self.assertIsInstance(rsl, GeneratorType,
                              "Extractors are expected to return generators.")

    def test_application_json_dict(self):
        self.json_obj["@"] = "$.keys"
        keys_processor = ExtractProcessor(config={"objective": self.json_obj})
        rsl = keys_processor.application_json(self.json_dict)
        self.assertEqual(list(rsl), MOCK_JSON_DATA)
        self.assertIsInstance(rsl, GeneratorType,
                              "Extractors are expected to return generators.")