class FirstPage(Page):
    """Entry page that fans out to three SecondPage subpages, one per seed."""

    source = NullSource()

    def process_page(self):
        # One SecondPage per seed value; each subpage augments this input dict.
        for seed in (1, 2, 3):
            yield SecondPage({"first": seed})
class SkipOddList(ListPage):
    """List page that hands each element to a SkipOddDetail subpage."""

    source = NullSource()

    def process_page(self):
        # Delegate each element to process_item via the skip-aware loop.
        yield from self._process_or_skip_loop(list(range(1, 6)))

    def process_item(self, item):
        """Wrap *item* in a SkipOddDetail subpage backed by a null source."""
        detail = SkipOddDetail(item, source=NullSource())
        return detail
class SingleReturnPaginatedPage(Page):
    """Page returning a single dict, paginated exactly once via a fake source."""

    source = NullSource()

    def process_page(self):
        return {"dummy": "value"}

    def get_next_source(self):
        # a hack to fake a second identical page: only the initial
        # NullSource-backed visit chains to a real URL; the second visit
        # falls through to None, which stops pagination.
        if not isinstance(self.source, NullSource):
            return None
        return "https://httpbin.org/get"
class ExampleListPage(ListPage):
    # need this here to test that default source is used
    source = NullSource()

    def process_page(self):
        """Yield five numbered rows, "1" through "5"."""
        for n in range(1, 6):
            yield {"val": str(n)}
class ResultJsonListPage(JsonListPage):
    """
    Parse each row of our JSON results file supplemented with additional
    page information so that we know which race we have parsed.
    """

    example_source = "https://ultrasignup.com/service/events.svc/results/63105/1/json"
    source = NullSource()

    def process_item(self, item):
        """Merge the page-level race metadata (self.input) into each row.

        Uses the ``{**a, **b}`` merge rather than ``dict(**a, **b)``: the
        keyword-argument form raises TypeError whenever the two mappings
        share a key (and requires all-string keys), whereas the unpacking
        form lets the row's value win on collision.
        """
        return {**self.input, **item}
class LegPageGenerator(ListPage):
    """
    NE is an interesting test case for Spatula, since there are individual
    senator pages but no real index that's useful at all.

    Right now this is using a dummy source page to spawn the 49 subpage
    scrapers.
    """

    # NOTE: this string previously appeared *after* the `source` assignment,
    # where it was a discarded expression (not the class docstring); it now
    # sits first so it actually populates __doc__.
    source = NullSource()

    def process_page(self):
        # Districts are numbered 1-49; zero-pad to match the site's URLs.
        for n in range(1, 50):
            yield LegPage(
                source=f"http://news.legislature.ne.gov/dist{n:02d}/")
class SkipOddPage(ListPage):
    """List page that passes even items through and skips odd ones."""

    source = NullSource()

    def process_page(self):
        # Route each element through process_item via the skip-aware loop.
        yield from self._process_or_skip_loop(list(range(1, 6)))

    def process_item(self, item):
        """Return even items unchanged; raise SkipItem for odd ones."""
        if item % 2 == 0:
            return item
        raise SkipItem(f"{item} is odd!")
class ExamplePaginatedPage(Page):
    """Page yielding three rows, paginated once onto a fake second source."""

    source = NullSource()
    another_page = True

    def process_page(self):
        rows = ("a man", "a plan", "panama")
        for row in rows:
            yield {"val": row}

    def get_next_source(self):
        # a hack to fake a second identical page: chain exactly once, from
        # the initial NullSource to a real URL, then return None to stop.
        if isinstance(self.source, NullSource):
            return "https://httpbin.org/get"
        return None
class RaceListFromDjango(Page):
    """Seed page that spawns one RaceResultListPage per race stored in Django."""

    source = NullSource()

    def process_error_response(self, exception):
        # Errors are logged as warnings and otherwise ignored.
        self.logger.warning(exception)

    def process_page(self):
        # Only races that have been assigned an ultrasignup id are scrapable.
        for race in Race.objects.exclude(ultrasignup_id=None):
            yield RaceResultListPage(
                dict(
                    did=race.ultrasignup_id,
                    race_url=f"https://trailhawks.com{race.get_absolute_url()}",
                    year=race.start_datetime.year,
                ),
                source=f"https://ultrasignup.com/results_event.aspx?did={race.ultrasignup_id}",
            )
class RaceResultListPage(HtmlListPage):
    """
    Every race may have zero or more distances which have their own
    unique `did` race number.
    """

    selector = XPath(
        "//a[@class='event_link' or @class='event_selected_link']",
        min_items=None)
    source = NullSource()

    def process_error_response(self, exception):
        # Errors are logged as warnings and otherwise ignored.
        self.logger.warning(exception)

    def process_item(self, item):
        """Build a RaceResultDetail subpage from one distance link."""
        href = XPath("@href").match_one(item)
        # Relative links need the site prefix restored.
        if not href.startswith("http"):
            href = f"https://ultrasignup.com{href}"
        # The race id is the trailing query-parameter value of the href.
        race_id = href.split("=")[-1]
        data = dict(race_id=race_id, race_results_url=href, **self.input)
        return RaceResultDetail(data, source=href)
class RaceResultDetail(HtmlPage):
    """
    Process the main race information including individual information
    about the event.
    """

    example_source = "https://ultrasignup.com/results_event.aspx?did=63105"
    source = NullSource()

    def process_error_response(self, exception):
        # Errors are logged as warnings and otherwise ignored.
        self.logger.warning(exception)

    def _match_one_or_none(self, xpath):
        """Return the first node matching *xpath*, or None when absent.

        Collapses the repeated try/except-SelectorError boilerplate the
        original five lookups each carried.
        """
        try:
            return XPath(xpath).match_one(self.root)
        except SelectorError:
            return None

    def process_page(self):
        # Presence of these marker spans flags a cancelled / virtual event;
        # the matched node itself is irrelevant, only its existence matters.
        cancellation = (
            self._match_one_or_none(
                "//span[contains(@class,'cancellation_text')]") is not None
        )
        virtual = (
            self._match_one_or_none(
                "//span[contains(@class,'virtual_text')]") is not None
        )

        # The selected distance link carries both the `did` id (in its href)
        # and the human-readable distance name (its text). The original
        # evaluated this same XPath twice; one lookup suffices.
        selected = self._match_one_or_none("//a[@class='event_selected_link']")
        did = selected.get("href").split("=")[-1] if selected is not None else ""
        distance = selected.text if selected is not None else ""

        date_node = self._match_one_or_none("//span[@class='event-date']")
        event_date = date_node.text if date_node is not None else ""

        # These two are intentionally unguarded: a missing title or website
        # should surface as a SelectorError, exactly as before.
        title = XPath("//h1").match_one(self.root)
        website = XPath("//a[@class='websiteitem']").match_one(
            self.root).get("href")
        return ResultJsonListPage(
            dict(
                cancellation=cancellation,
                date=event_date,
                distance=distance,
                title=title.text,
                virtual=virtual,
                website=website,
                **self.input,
            ),
            source=
            f"https://ultrasignup.com/service/events.svc/results/{did}/1/json",
        )
def process_item(self, item):
    """Wrap *item* in a SkipOddDetail subpage backed by a null source."""
    # NOTE(review): orphan fragment — duplicates SkipOddList.process_item;
    # confirm which class this belongs to.
    detail = SkipOddDetail(item, source=NullSource())
    return detail
class SecondPage(Page):
    """Subpage that copies its input dict and tags it as processed."""

    source = NullSource()

    def process_page(self):
        # Copy first so the caller's input mapping is never mutated.
        result = dict(self.input)
        result["second"] = "appended"
        return result
class SimpleInputPage(Page):
    """Page that echoes the `name` and `number` fields of its typed input."""

    source = NullSource()
    input_type = Input

    def process_page(self):
        data = self.input
        return {"name": data.name, "number": data.number}
class ExamplePage(Page):
    # need this here to test example_sources are picked up
    example_source = NullSource()

    def process_page(self):
        """Report which source this page was instantiated with."""
        source_repr = str(self.source)
        return {"source": source_repr}
class Subpage(Page):
    """Trivial subpage that passes its input straight through unchanged."""

    source = NullSource()

    def process_page(self):
        # Identity transform: the parent's payload is the result.
        return self.input