Example #1
    def complete(self, initiative: InitiativeImport):
        initiative_url = self.config.get_initiative_url(initiative.source_id)

        try:
            detail = PlatformSource.get(initiative_url)

            soup = BeautifulSoup(detail.content, 'html.parser')

            table = soup.find("dl")
            records = table.findAll(["dd", "dt"])
            initiative.description = soup.find("p").text.strip('\t\n\r')
            initiative.group = self.config.group
            initiative.source = initiative_url

            setcount = 0
            for i in range(0, len(records), 2):
                # TODO: Error prevention
                label = records[i].contents[1].strip("\":").lower()
                if label in self.config.field_map:
                    setattr(initiative, self.config.field_map[label], records[i + 1].contents[0])
                    setcount += 1

            if self.config.group == InitiativeGroup.DEMAND:
                title = soup.find("h2", "result__title")
                initiative.organiser = title.contents[0]

            # TODO: Log if no values are assigned
        except ScrapeException as e:
            # This except should not swallow the error; Example #3 re-raises instead.
            # print('error scraping ' + initiative_url + ':' + e.args[0])
            if initiative is not None:
                initiative.state = "processing_error"
Example #2
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        response = self.get(self.config.list_endpoint)

        data = json.loads(response.content,
                          object_hook=lambda d: namedtuple('X', d.keys())
                          (*d.values()))

        for item in data:
            initiative = InitiativeImport(
                source_id=item.id,
                source_uri=f"https://wijamsterdam.nl/initiatief/{item.id}",
                # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
                created_at=parser.parse(item.createdAt),
                name=item.title,
                description=f"{item.summary}"
                f"\n--------\n"
                f"{item.description}",
                location=item.extraData.area,
                organiser=item.extraData.isOrganiserName,
                group=InitiativeGroup.SUPPLY,
                category=item.extraData.theme,
                url=item.extraData.isOrganiserWebsite,
                extra_fields=response.content.decode("utf-8")
                # Probably better to leave email / phone empty;
                # even the name is tricky, albeit open data.
            )
            if hasattr(item, "position"):
                initiative.latitude = item.position.lat
                initiative.longitude = item.position.lng
            yield initiative
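
The object_hook in Example #2 converts every JSON object, nested ones included, into an anonymous namedtuple so fields read as attributes. A standalone sketch of the pattern (the payload is made up; note it only works while all keys are valid Python identifiers):

    import json
    from collections import namedtuple

    # Hypothetical payload illustrating the deserialization from Example #2.
    payload = '{"id": 7, "title": "Buurthulp", "position": {"lat": 52.37, "lng": 4.89}}'
    item = json.loads(payload,
                      object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))

    print(item.title)         # Buurthulp
    print(item.position.lat)  # 52.37 -- nested objects are converted first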
Example #3
    def complete(self, initiative: InitiativeImport):
        initiative_url = self.config.get_initiative_url(initiative.source_id)
        # This already raises ScrapeExceptions
        detail = PlatformSource.get(initiative_url)

        try:
            soup = BeautifulSoup(detail.content, 'html.parser')

            table = soup.find("dl")
            records = table.findAll(["dd", "dt"])
            initiative.description = soup.find("p").text.strip('\t\n\r ')
            initiative.group = self.config.group
            initiative.source = initiative_url

            set_count = self.extract_details_table(initiative, records)

            if self.config.group == InitiativeGroup.DEMAND:
                title = soup.find("h2", "result__title")
                initiative.organiser = title.contents[0]

            if not initiative.location:
                self.try_alternative_place(soup, initiative)
        except Exception as ex:
            msg = f"Error reading contents from {initiative_url}"
            raise ScrapeException(msg) from ex

        if set_count == 0:
            raise ScrapeException("Failed to load field map details table")
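
Example #3 factors the dt/dd loop of Examples #1 and #4 into extract_details_table. The helper itself is not shown; reconstructed from the inline version, it presumably looks roughly like this (a sketch, not the actual implementation):

    def extract_details_table(self, initiative: InitiativeImport, records) -> int:
        # Sketch based on the inline loop in Examples #1 and #4:
        # records alternates <dt> label / <dd> value pairs.
        set_count = 0
        for i in range(0, len(records), 2):
            label = records[i].contents[1].strip("\":").lower()
            if label in self.config.field_map:
                setattr(initiative, self.config.field_map[label],
                        records[i + 1].contents[0])
                set_count += 1
        return set_count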
Example #4
    def scrape_group(self, config: InitiativeGroupConfig, batch: ImportBatch):
        print('scraping ' + config.group)
        page = requests.get(config.url)
        # TODO: Handle http error codes
        result = page.json()
        parsed_markers = []

        for marker in result['markers']:
            if marker['id'] not in parsed_markers:
                # TODO: Error handling and possibly a retry
                parsed_markers.append(marker['id'])
                markerurl = config.get_marker_url(marker['id'])
                print('scraping ' + markerurl)

                initiative = None
                try:
                    detail = requests.get(markerurl)
                    # TODO: Handle http error codes
                    soup = BeautifulSoup(detail.content, 'html.parser')

                    table = soup.find("dl")
                    records = table.findAll(["dd", "dt"])
                    description = soup.find("p").text.strip('\t\n\r')
                    initiative = InitiativeImport(description=description,
                                                  group=config.group,
                                                  source=markerurl,
                                                  source_id=marker['id'])

                    setcount = 0
                    for i in range(0, len(records), 2):
                        # TODO: Error prevention
                        label = records[i].contents[1].strip("\":").lower()
                        if label in config.field_map:
                            setattr(initiative, config.field_map[label],
                                    records[i + 1].contents[0])
                            setcount += 1

                    if config.group == InitiativeGroup.DEMAND:
                        title = soup.find("h2", "result__title")
                        initiative.organiser = title.contents[0]

                    # TODO: Log if no values are assigned
                except Exception as e:
                    print(f'error scraping {markerurl}: {e}')
                    if initiative is not None:
                        initiative.state = "processing_error"

                if initiative is not None:
                    batch.initiatives.append(initiative)

                # debugging
                if not self.should_continue(len(parsed_markers)):
                    break

        self._db.session.commit()
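
Example #4 checks marker ids against a list, which costs O(n) per lookup; a set keeps the same first-seen semantics with O(1) lookups, and len(parsed_markers) still works for should_continue. A minimal sketch of that change:

    parsed_markers = set()

    for marker in result['markers']:
        if marker['id'] not in parsed_markers:
            parsed_markers.add(marker['id'])
            # ... rest of the loop body unchanged ...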
Example #5
    def map_initiative(response, item):
        initiative = InitiativeImport(
            source_id=item.id,
            source_uri=f"https://wijamsterdam.nl/initiatief/{item.id}",
            # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
            created_at=parser.parse(item.createdAt),
            name=item.title,
            description=f"{item.summary}"
            f"\n--------\n"
            f"{item.description}",
            group=InitiativeGroup.SUPPLY,
            extra_fields=response.content.decode("utf-8")
            # Probably better to leave email / phone empty;
            # even the name is tricky, albeit open data.
        )

        if hasattr(item.extraData, "area"):
            initiative.location = item.extraData.area
        if hasattr(item.extraData, "isOrganiserName"):
            initiative.organiser = item.extraData.isOrganiserName
        if hasattr(item.extraData, "theme"):
            initiative.category = item.extraData.theme
        if hasattr(item.extraData, "isOrganiserWebsite"):
            initiative.url = item.extraData.isOrganiserWebsite
        if hasattr(item, "position"):
            initiative.latitude = item.position.lat
            initiative.longitude = item.position.lng

        return initiative
Example #6
    def map_initiative(item):
        org = json.dumps(item)
        initiative = InitiativeImport(
            source_id=item["id"],
            source_uri=f"https://wijamsterdam.nl/initiatief/{item['id']}",
            # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
            created_at=parser.parse(item["createdAt"]),
            name=item["title"],
            description=f"{item['summary']}"
            f"\n--------\n"
            f"{item['description']}",
            group=InitiativeGroup.SUPPLY,
            extra_fields=org
            # Probably better to leave email / phone empty;
            # even the name is tricky, albeit open data.
        )

        extra_data = item["extraData"]
        if "area" in extra_data:
            initiative.location = extra_data["area"]
        if "isOrganiserName" in extra_data:
            initiative.organiser = extra_data["isOrganiserName"]
        if "theme" in extra_data:
            initiative.category = extra_data["theme"]
        if "isOrganiserWebsite" in extra_data:
            initiative.url = extra_data["isOrganiserWebsite"]
        if "position" in item:
            initiative.latitude = item["position"]["lat"]
            initiative.longitude = item["position"]["lng"]

        return initiative
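
The per-key checks in Examples #5 and #6 can be collapsed into a data-driven loop while keeping the skip-if-absent semantics; a sketch (the mapping table simply restates the four pairs above):

    EXTRA_FIELD_MAP = {
        "area": "location",
        "isOrganiserName": "organiser",
        "theme": "category",
        "isOrganiserWebsite": "url",
    }

    extra_data = item.get("extraData", {})
    for key, attr in EXTRA_FIELD_MAP.items():
        if key in extra_data:
            setattr(initiative, attr, extra_data[key])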
Example #7
    def scrape(self):
        super().scrape()
        page = requests.get(self.URL)

        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(class_='ideas-list')

        questions = results.find_all(class_='idea-item')
        count = 0
        for card in questions:
            title = card.find('h3').text.strip(' \t\n\r')
            rawlocation = card.find(class_='gebied').text.strip(' \t\n\r')
            description = card.find('p').text.strip(' \t\n\r')
            link = card.find('a')['href']
            self._db.session.add(
                InitiativeImport(
                    name=title,
                    description=description,
                    group="unknown",
                    source='https://wijamsterdam.nl' + link,
                    # str.strip removes characters, not a substring; drop the path prefix instead
                    source_id=link.replace('/initiatief/', '', 1),
                    location=rawlocation,
                ))
            count += 1
            if not self.should_continue(count):
                break

        self._db.session.commit()
Example #8
    def complete(self, initiative: InitiativeImport):
        post_url = self.config.get_api_post_url(initiative.source_id)
        detail = self.get(post_url)

        try:
            # GUID of the form field that holds the initiative's URL
            initiative_url_guid = '75aa5e4d-fe98-4a7a-94ec-adab2f7f9b88'

            result = detail.json()
            initiative.created_at = parser.parse(result['created'])
            initiative.scraped_at = datetime.datetime.now()

            initiative.name = result['title']
            initiative.description = result['content']

            if initiative_url_guid in result['values']:
                initiative.url = result['values'][initiative_url_guid][0]

            initiative.extra_fields = self.parse_extra_fields(result)

            initiative.category = ", ".join(
                self.category_dict[tag['id']] for tag in result['tags'])

        except Exception as ex:
            msg = f"Error in complete function for initiative {initiative.source_id}"
            raise ScrapeException(msg) from ex
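
Note that the category join above raises KeyError (surfacing as a ScrapeException) whenever a tag id is missing from category_dict. If unknown tags should be skipped instead, a hedged variant:

            initiative.category = ", ".join(
                self.category_dict[tag['id']]
                for tag in result['tags']
                if tag['id'] in self.category_dict)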
Example #9
    def _collect_initiative(self, initiative: InitiativeImport, source):
        if initiative is None:
            raise ValueError("Expecting an initiative instance!")

        try:
            source.complete(initiative)
            initiative.scraped_at = datetime.utcnow()
            initiative.source = self.platform_url
            self.add_initiative(initiative)
            self.get_logger().debug(f"Scraped {initiative.source_uri}")
        except ScrapeException as e:
            self.get_logger()\
                .exception(f"Error while collecting initiative {initiative.source_uri}")
            # There's little point in doing this unless the initiative is saved
            # or at least counted; the state actually indicates an error in
            # downstream processing.
            initiative.state = "processing_error"
            # Could probably be done more neatly with a context manager.
            if self._collect_recovery.should_raise(e):
                raise
Example #10
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        url = self.config.get_list_url()
        page = PlatformSource.get(url)
        result = page.json()

        for marker in result['markers']:
            initiative = InitiativeImport(
                source_id=marker['id'],
                source_uri=self.config.get_marker_url(marker['id']),
                latitude=marker['lat'],
                longitude=marker['lon'],
            )
            yield initiative
Example #11
    def _collect_initiative(self, initiative: InitiativeImport, source):
        if initiative is None:
            raise ValueError("Expecting an initiative instance!")

        try:
            source.complete(initiative)
            initiative.scraped_at = datetime.utcnow()
            initiative.source = self.platform_url
            self.get_logger().debug(f"Scraped {initiative.source_uri}")
        except ScrapeException as e:
            self.get_logger()\
                .exception(f"Error while collecting initiative {initiative.source_uri}")
            # There's little point in doing this unless the initiative is saved
            # or at least counted; the state actually indicates an error in
            # downstream processing.
            initiative.state = InitiativeImportState.IMPORT_ERROR
            ex_info = sys.exc_info()
            initiative.error_reason = "".join(
                traceback.format_exception(*ex_info))
            # Could probably be done more neatly with a context manager.
            if self._collect_recovery.should_raise(e):
                raise
        finally:
            # Always store initiative for traceability.
            self.add_initiative(initiative)
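
The _collect_recovery object used in Examples #9 and #11 is not shown. Its should_raise(e) contract suggests a policy that tolerates a bounded number of failures before aborting the run; a minimal hypothetical sketch:

    class CollectRecovery:
        # Hypothetical sketch: allow up to max_errors scrape failures,
        # then abort the run. The real policy object is not shown here.
        def __init__(self, max_errors: int = 10):
            self.max_errors = max_errors
            self.error_count = 0

        def should_raise(self, error: Exception) -> bool:
            self.error_count += 1
            return self.error_count > self.max_errors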
Example #12
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        url = self.config.get_list_url()
        page = PlatformSource.get(url)

        try:
            result = page.json()
            for marker in result['markers']:
                initiative = InitiativeImport(
                    source_id=marker['id'],
                    source_uri=self.config.get_marker_url(marker['id']),
                    latitude=marker['lat'],
                    longitude=marker['lon'],
                )
                yield initiative
        except Exception as ex:
            msg = f"Error reading contents from {url}"
            raise ScrapeException(msg) from ex
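
Examples #11 and #12 compose naturally: a driver iterates initiatives() and hands each item to _collect_initiative. A hedged sketch of that loop (the method name is an assumption):

    def scrape_source(self, source):
        # Hypothetical driver combining initiatives() with _collect_initiative().
        for initiative in source.initiatives():
            self._collect_initiative(initiative, source)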
Example #13
    def createInitiativeFromDeedDetails(self, deedDetails):
        logging.info("Creating initiative from deed details")

        deedID = self.getDeedIDFromJSON(deedDetails)
        coordinates = self.getCoordinatesFromDeedDetails(deedDetails)

        initiative = InitiativeImport(
            category=deedDetails["fullType"],
            group="supply",
            description=deedDetails["summary"],
            # name = deedDetails[""],
            source=self.getAPIDeedDetailsURL(deedID),
            # frequency = deedDetails["subtype"],
            location=deedDetails["address"],
            latitude=coordinates["lat"],
            longitude=coordinates["lng"])

        return initiative
Example #14
    def create_initiative_from_deed_details(self, deed_details):
        logging.info("Creating initiative from deed details")

        deed_id = self.get_deed_id_from_json(deed_details)
        coordinates = self.get_coordinates_from_deed_details(deed_details)

        initiative = InitiativeImport(
            category=deed_details["fullType"],
            group="supply",
            description=deed_details["summary"],
            # name = deedDetails[""],
            source=self.get_api_deed_details_url(deed_id),
            # frequency = deedDetails["subtype"],
            location=deed_details["address"],
            latitude=coordinates["lat"],
            longitude=coordinates["lng"])

        return initiative
Example #15
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        self.category_dict = self.get_category_dict()

        url = self.config.get_api_list_url()
        page = self.get(url)

        try:
            result = page.json()
            for feature in result['features']:
                initiative = InitiativeImport(
                    name=feature['properties']['title'],
                    description=feature['properties']['description'],
                    group=self.config.group,
                    source=self.config.url,
                    source_id=feature['properties']['id'],
                    source_uri=feature['properties']['url'].replace('/api/v3', ""),
                    # GeoJSON coordinate order is [longitude, latitude]
                    longitude=feature['geometry']['geometries'][0]['coordinates'][0],
                    latitude=feature['geometry']['geometries'][0]['coordinates'][1]
                )
                yield initiative
        except Exception as ex:
            msg = f"Error reading contents from {url}"
            raise ScrapeException(msg) from ex
Example #16
    def complete(self, initiative: InitiativeImport):
        initiative_url = self.config.get_initiative_url(initiative.source_id)
        # This already raises ScrapeExceptions
        detail = self.get(initiative_url)

        try:
            soup = BeautifulSoup(detail.content, 'html.parser')

            table = soup.find("dl")
            records = table.findAll(["dd", "dt"])
            initiative.description = soup.find("p").text.strip('\t\n\r ')
            initiative.group = self.config.group
            initiative.source = initiative_url

            set_count = self.extract_details_table(initiative, records)

            if self.config.group == InitiativeGroup.DEMAND:
                title = soup.find("h2", "result__title")
                initiative.name = title.contents[0]

            h5_node_organization = soup.find("h5", text="Aangesloten bij:")
            if h5_node_organization:
                initiative.organiser = (h5_node_organization
                                        .find_next_sibling().get_text(strip=True))
            else:
                h5_node_person = soup.find("h5", text="Geplaatst door:")
                if h5_node_person:
                    initiative.organiser = (h5_node_person
                                            .find_next_sibling().get_text(strip=True))

            if not initiative.location:
                self.try_alternative_place(soup, initiative)
        except Exception as ex:
            msg = f"Error reading contents from {initiative_url}"
            raise ScrapeException(msg) from ex

        if set_count == 0:
            raise ScrapeException("Failed to load field map details table")