Example #1
    def test_should_flag_batch_imported_on_success(self):
        first_import = InitiativeImport()
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))

        self.scraper.scrape()

        assert self.scraper.get_current_batch().state == BatchImportState.IMPORTED
Example #2
    def assert_running_batch_on_iter(self):
        started_batch = self.scraper.get_current_batch()
        assert started_batch is not None
        assert started_batch.platform == self.db_platform
        assert started_batch.state == "running"

        return iter([InitiativeImport()])
Example #3
    def test_should_set_batch_stopped_time(self):
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([InitiativeImport()]))

        self.scraper.scrape()

        batch = self.scraper.get_current_batch()
        assert batch.started_at < batch.stopped_at
Example #4
    def initiatives(self) -> Generator[InitiativeImport, None, None]:
        page_counter = 1
        try:
            while page_counter < 100:
                list_page_url = self.config.list_endpoint + f"&page={page_counter}"

                # schemas defines the fields to be scraped;
                # each entry maps fieldname -> {xpath, all, cast, transform}
                schemas = {'initiatives':
                               {'xpath': '//a[@href and contains(@class, "postpreview-content")]',
                                'all': True,
                                'transform': lambda elements: self.find_initiative_links(elements)}}

                # initialize TreeParser with the url and schemas; it loads and parses the html tree
                initiative_parser = TreeParser(list_page_url, None, schemas)
                if initiative_parser.tree is None:
                    break

                output = initiative_parser.apply_schemas()
                for uri in output['initiatives']:
                    yield InitiativeImport(source_uri=uri[0])

                page_counter = page_counter + 1
        except Exception as ex:
            raise ScrapeException("Error loading list of initiatives") from ex
Example #5
    def test_should_add_completed_initiative_to_batch(self):
        first_import = InitiativeImport()
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))

        self.scraper.scrape()

        assert self.scraper.get_current_batch().initiatives[0] == first_import
Example #6
    def scrape(self):
        db = Db()

        # read data
        response = requests.get(
            'https://api-server-271218.appspot.com/v1/tasks?zipcode=')
        result = json.loads(response.content)
        # print(result)

        questions = result['data']['tasks']
        for card in questions:
            db.session.add(
                InitiativeImport(
                    name=card['firstName'],
                    category=card['TaskType']['name'],
                    description=card['description'],
                    group="demand",
                    source='https://www.gewoonmensendiemensenwillenhelpen.nl/ik-wil-helpen',
                    source_id=card['id'],
                    location=card['zipcode'] + ' ' + card['city'],
                    frequency=card['when'],
                ))

        db.session.commit()
Example #7
    def test_should_iterate_platform_source(self):
        first_import = InitiativeImport()
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))

        self.scraper.scrape()

        self.pf_source_mock.complete.assert_called_once_with(first_import)
Example #8
    def scrape(self):
        db = Db()
        page = requests.get(self.URL)

        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(class_='container')

        questions = results.find_all(class_='card')
        for card in questions:
            title = card.find('h5').text.strip(' \t\n\r')
            rawlocation = card.find('h6').text.strip(' \t\n\r')
            # remove the '(Maps)' link text from rawlocation
            rawlocation = rawlocation.replace('(Maps)', '').strip()

            description = card.find('p',
                                    class_='card-text').text.strip(' \t\n\r')

            db.session.add(
                InitiativeImport(
                    category=title,
                    description=description,
                    group="demand",
                    source=self.URL,
                    location=rawlocation,
                ))

        db.session.commit()
Example #9
    def test_should_set_location(self, request_mock):
        self.setup_item_request(request_mock, self.item_nolocation_response)
        test_source = MijnBuurtjeSource(self.config)
        actual = InitiativeImport(source_uri=self.config.details_endpoint +
                                  "1234/loc-test")
        test_source.complete(actual)

        assert actual.location == self.config.location
Example #10
    def scrape_collection_exception(self):
        self.pf_source_mock.initiatives = MagicMock(
            return_value=iter([InitiativeImport(source_uri="test/123")]))

        self.pf_source_mock.complete = Mock(
            side_effect=ScrapeException("Test"))

        self.scraper.scrape()
Example #11
    def test_should_set_platform_url_as_source(self):
        self.pf_source_mock.initiatives = MagicMock(
            return_value=iter([InitiativeImport(source_uri="test/123")]))

        self.scraper.scrape()

        actual = self.scraper.get_current_batch().initiatives[0]
        assert self.scraper.platform_url == actual.source
Example #12
    def test_invalid_stop_throws_error(self):
        self.pf_source_mock.initiatives = MagicMock(
            return_value=iter([InitiativeImport()]))
        self.scraper.scrape()

        batch = self.scraper.get_current_batch()
        with self.assertRaises(ValueError):
            batch.stop(BatchImportState.PROCESSED)
Example #13
    def test_should_log_item_exception(self):
        self.pf_source_mock.initiatives = MagicMock(return_value=iter([InitiativeImport(
            source_uri="test/123"
        )]))
        self.pf_source_mock.complete = \
            MagicMock(side_effect=ScrapeException("Failed loading item"))

        self.scraper.scrape()

        self.logger_mock.exception.assert_called_once_with("Error while collecting initiative test/123")
Example #14
    def test_should_have_set_scraped_at(self):
        self.pf_source_mock.initiatives = MagicMock(
            return_value=iter([InitiativeImport(source_uri="test/123")]))

        self.scraper.scrape()

        now = datetime.utcnow()
        actual = self.scraper.get_current_batch().initiatives[0].scraped_at
        # can't mock datetime.utcnow so this is my workaround.
        datediff = now - actual
        assert datediff.seconds < 1
Example #15
    def scrape(self):
        db = Db()
        for company in self.zorgheldenautos:
            db.session.add(
                InitiativeImport(
                    name=company,
                    group="zorgheldenauto",
                    source='https://www.auto.nl/zorgheldenauto',
                ))

        db.session.commit()
Example #16
    def test_should_wrap_complete_exceptions(self, request_mock):
        test_source = MijnBuurtjeSource(self.config)
        self.setup_item_request(request_mock, self.item_response)

        with patch.object(TreeParser,
                          'apply_schemas',
                          side_effect=HtmlParseError()):
            with self.assertRaises(ScrapeException):
                _ = test_source.complete(
                    InitiativeImport(source_uri=self.config.details_endpoint +
                                     "1234"))
Example #17
    def setUp(self, request_mock):
        self.response = responses.read("nlvoorelkaar_demand.html")

        scraper = NLvoorElkaar()
        self.source = scraper._sources[1]

        self.url = "https://www.nlvoorelkaar.nl/hulpvragen/183242"
        request_mock.get(self.url, text=self.response, status_code=200)
        self.request_mock = request_mock

        self.actual = InitiativeImport(source_id=183242, source_uri=self.url)
        scraper._sources[1].complete(self.actual)
Example #18
    def setUp(self, request_mock):
        self.response = responses.read("nlvoorelkaar_supply.html")

        scraper = NLvoorElkaar()
        self.source = scraper._sources[0]

        self.url = "https://www.nlvoorelkaar.nl/hulpaanbod/179582"
        request_mock.get(self.url, text=self.response, status_code=200)
        self.request_mock = request_mock

        self.actual = InitiativeImport(source_id=179582, source_uri=self.url)
        scraper._sources[0].complete(self.actual)
Example #19
    def complete(self, initiative: InitiativeImport):
        try:
            # Robots.txt mentions 10 secs crawl delay.
            time.sleep(10)

            session_metadata = self.item_parser.get_session_metadata(initiative.source_uri)
            full_initiative = self.item_parser.apply_schemas(metadata=session_metadata,
                                                             url=initiative.source_uri)
            for key, value in full_initiative.items():
                setattr(initiative, key, value)

            if not initiative.location:
                initiative.location = self.config.location

        except Exception as ex:
            raise ScrapeException(f"Error scraping {initiative.source_uri}") from ex
Example #20
    def scrape(self):
        db = Db()
        counter = 1
        while counter > 0:
            # print(self.URL + str(counter))
            page = requests.get(self.URL + str(counter))
            soup = BeautifulSoup(page.content, 'html.parser')
            results = soup.find_all(class_='postpreview')

            if len(results) > 0:
                counter += 1
                for card in results:
                    try:
                        title = card.find(class_='heading3 heading3--semibold'
                                          ).text.strip(' \t\n\r')
                        name = card.find(class_='entity-content-title').text
                        description = card.find(
                            class_='paragraph').text.strip(' \t\n\r')
                        rawtheme = card.find(
                            class_='postpreview-subtitle').text
                        link = card.find(class_='postpreview-content')
                        final_link = link['href']
                        source_id = final_link.split('/')[-2]

                        db.session.add(
                            InitiativeImport(
                                name=name + " - " + title,
                                description=description,
                                group=rawtheme,
                                source=final_link,
                                source_id=source_id,
                            ))
                    except Exception:
                        # show the card that could not be parsed and move on
                        print(card)
            else:
                counter = -1

        db.session.commit()
Example #21
    def test_missing_plaats(self):
        scraper = NLvoorElkaar()
        item = scraper._sources[0].complete(
            InitiativeImport(
                source_id=179582,
                source_uri="https://www.nlvoorelkaar.nl/hulpvragen/183242"))
Example #22
    'notes': {
        'xpath': ''
    }
}
# the description xpath returns multiple elements, so define clean_description
# and set it as schemas['description']['transform']
clean_description = lambda x: ''.join(
    [re.sub(r'\n|\n  |\n    |\s{2,}', '', e) for e in x])
schemas['description']['transform'] = clean_description

records = []  # store scraper output in records
c = 1
MAX_URLS = 10
for url in initiatief_urls:
    metadata = {
        'source_url': re.findall(r'https://([A-Za-z0-9.]+)/', url)[0],
        'source_uri': url,
        'scraped_at': str(dt.datetime.now()),
        'created_at': dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')
    }
    TreeParser0 = TreeParser(url, None, schemas)
    output = TreeParser0.apply_schemas(metadata)
    records.append(output)
    if c >= MAX_URLS:  # scraping all 300+ initiatives took a bit long
        break
    c = c + 1

# Step 3 insert into db
for r in records:
    # TODO: determine which fields are inserted into DB
    db.session.add(InitiativeImport())