Beispiel #1
0
class FirstPage(Page):
    """Page fixture that emits three ``SecondPage`` instances."""

    source = NullSource()

    def process_page(self):
        """Yield a ``SecondPage`` whose input is ``{"first": n}`` for n = 1, 2, 3."""
        for number in (1, 2, 3):
            yield SecondPage({"first": number})
Beispiel #2
0
    class SkipOddList(ListPage):
        """List page fixture that passes the integers 1-5 to ``_process_or_skip_loop``."""

        source = NullSource()

        def process_page(self):
            """Yield whatever ``_process_or_skip_loop`` produces for 1 through 5."""
            numbers = list(range(1, 6))
            yield from self._process_or_skip_loop(numbers)

        def process_item(self, item):
            """Wrap *item* in a ``SkipOddDetail`` page that uses a ``NullSource``."""
            detail_page = SkipOddDetail(item, source=NullSource())
            return detail_page
Beispiel #3
0
    class SingleReturnPaginatedPage(Page):
        """Page fixture that returns a single dict and reports one follow-up source."""

        source = NullSource()

        def process_page(self):
            """Return the fixed payload ``{"dummy": "value"}``."""
            payload = {"dummy": "value"}
            return payload

        def get_next_source(self):
            """Return the httpbin URL while the current source is a ``NullSource``, else None."""
            if not isinstance(self.source, NullSource):
                return None
            # a hack to fake a second identical page
            return "https://httpbin.org/get"
Beispiel #4
0
class ExampleListPage(ListPage):
    """List page fixture that yields five ``{"val": ...}`` dicts."""

    # need this here to test that default source is used
    source = NullSource()

    def process_page(self):
        """Yield ``{"val": "1"}`` through ``{"val": "5"}`` in order."""
        for number in range(1, 6):
            yield {"val": str(number)}
class ResultJsonListPage(JsonListPage):
    """
    Parse each row of our JSON results file and supplement it with the page
    input, so that every row records which race it was parsed from.
    """

    example_source = "https://ultrasignup.com/service/events.svc/results/63105/1/json"
    source = NullSource()

    def process_item(self, item):
        """Return a new dict combining the page input with the fields of *item*."""
        race_info = self.input
        # Keyword expansion is deliberate: a key present in both mappings
        # raises TypeError instead of being silently overwritten.
        return dict(**race_info, **item)
Beispiel #6
0
class LegPageGenerator(ListPage):
    """
    NE is an interesting test case for Spatula, since there are individual senator pages
    but no real index that's useful at all.  Right now this is using a dummy source page
    to spawn the 49 subpage scrapers.
    """

    source = NullSource()

    def process_page(self):
        """Yield one ``LegPage`` per district number, 1 through 49."""
        for n in range(1, 50):
            # District numbers are zero-padded to two digits in the URL.
            yield LegPage(source=f"http://news.legislature.ne.gov/dist{n:02d}/")
Beispiel #7
0
    class SkipOddPage(ListPage):
        """List page fixture that keeps even numbers and raises ``SkipItem`` for odd ones."""

        source = NullSource()

        def process_page(self):
            """Yield whatever ``_process_or_skip_loop`` produces for 1 through 5."""
            numbers = list(range(1, 6))
            yield from self._process_or_skip_loop(numbers)

        def process_item(self, item):
            """Return *item* unchanged when it is even; raise ``SkipItem`` otherwise."""
            if not item % 2:
                return item
            raise SkipItem(f"{item} is odd!")
Beispiel #8
0
class ExamplePaginatedPage(Page):
    """Page fixture that yields three items and reports one follow-up source."""

    source = NullSource()
    another_page = True

    def process_page(self):
        """Yield ``{"val": ...}`` for "a man", "a plan" and "panama", in that order."""
        for phrase in ("a man", "a plan", "panama"):
            yield {"val": phrase}

    def get_next_source(self):
        """Return the httpbin URL while the current source is a ``NullSource``, else None."""
        if not isinstance(self.source, NullSource):
            return None
        # a hack to fake a second identical page
        return "https://httpbin.org/get"
class RaceListFromDjango(Page):
    """Entry page that spawns one ``RaceResultListPage`` per stored race."""

    source = NullSource()

    def process_error_response(self, exception):
        """Log *exception* at warning level."""
        self.logger.warning(exception)

    def process_page(self):
        """
        Yield a ``RaceResultListPage`` for every ``Race`` left after excluding
        rows whose ``ultrasignup_id`` is None.
        """
        for race in Race.objects.exclude(ultrasignup_id=None):
            results_url = (
                f"https://ultrasignup.com/results_event.aspx?did={race.ultrasignup_id}"
            )
            page_input = {
                "did": race.ultrasignup_id,
                "race_url": f"https://trailhawks.com{race.get_absolute_url()}",
                "year": race.start_datetime.year,
            }
            yield RaceResultListPage(page_input, source=results_url)
class RaceResultListPage(HtmlListPage):
    """
    Every race may have zero or more distances, each of which has its own
    unique `did` race number.
    """

    selector = XPath(
        "//a[@class='event_link' or @class='event_selected_link']",
        min_items=None,
    )
    source = NullSource()

    def process_error_response(self, exception):
        """Log *exception* at warning level."""
        self.logger.warning(exception)

    def process_item(self, item):
        """
        Build a ``RaceResultDetail`` page for the link element *item*.

        The link's ``href`` is prefixed with the UltraSignup host unless it
        already starts with "http"; the race id is the text after the last "=".
        """
        link = XPath("@href").match_one(item)
        results_url = link if link.startswith("http") else f"https://ultrasignup.com{link}"
        race_id = results_url.rsplit("=", 1)[-1]
        # Keyword expansion is kept so a clash with keys already present in
        # self.input raises TypeError rather than being silently overwritten.
        detail_input = dict(
            race_id=race_id,
            race_results_url=results_url,
            **self.input,
        )
        return RaceResultDetail(detail_input, source=results_url)
class RaceResultDetail(HtmlPage):
    """
    Process the main race information including individual information about the event.
    """

    example_source = "https://ultrasignup.com/results_event.aspx?did=63105"
    source = NullSource()

    def process_error_response(self, exception):
        """Log *exception* at warning level."""
        self.logger.warning(exception)

    def _has_element(self, expression):
        """Return True when ``match_one`` succeeds for *expression* on the page root."""
        try:
            XPath(expression).match_one(self.root)
        except SelectorError:
            return False
        return True

    def process_page(self):
        """
        Collect event metadata from the page and return a ``ResultJsonListPage``.

        ``cancellation`` and ``virtual`` are booleans telling whether their
        marker span was matched.  ``did``, ``distance`` and ``date`` fall back
        to "" when their selector raises ``SelectorError``.  The title and
        website lookups are not guarded, so a ``SelectorError`` raised there
        propagates to the caller.
        """
        cancellation = self._has_element(
            "//span[contains(@class,'cancellation_text')]"
        )

        # The selected distance link supplies both the id (last "=" segment of
        # its href) and the distance label, so it is looked up only once.
        try:
            selected_link = XPath("//a[@class='event_selected_link']").match_one(
                self.root
            )
        except SelectorError:
            did = ""
            distance = ""
        else:
            did = selected_link.get("href").split("=")[-1]
            distance = selected_link.text

        try:
            date_element = XPath("//span[@class='event-date']").match_one(self.root)
        except SelectorError:
            event_date = ""
        else:
            event_date = date_element.text

        virtual = self._has_element("//span[contains(@class,'virtual_text')]")

        title = XPath("//h1").match_one(self.root)
        website = XPath("//a[@class='websiteitem']").match_one(self.root).get("href")

        # NOTE(review): when no selected link was found, ``did`` is "" and the
        # results URL below contains an empty id segment — confirm this is
        # acceptable for the downstream page.
        return ResultJsonListPage(
            dict(
                cancellation=cancellation,
                date=event_date,
                distance=distance,
                title=title.text,
                virtual=virtual,
                website=website,
                **self.input,
            ),
            source=f"https://ultrasignup.com/service/events.svc/results/{did}/1/json",
        )
Beispiel #12
0
 def process_item(self, item):
     """Wrap *item* in a ``SkipOddDetail`` page that uses a ``NullSource``."""
     detail_page = SkipOddDetail(item, source=NullSource())
     return detail_page
Beispiel #13
0
class SecondPage(Page):
    """Page fixture that returns its input with a ``"second"`` key set."""

    source = NullSource()

    def process_page(self):
        """Return a copy of the page input with ``"second": "appended"`` set."""
        combined = {**self.input}
        combined.update(second="appended")
        return combined
Beispiel #14
0
class SimpleInputPage(Page):
    """Page fixture that declares ``Input`` as its input type."""

    source = NullSource()
    input_type = Input

    def process_page(self):
        """Return the input's ``name`` and ``number`` attributes as a dict."""
        page_input = self.input
        return dict(name=page_input.name, number=page_input.number)
Beispiel #15
0
class ExamplePage(Page):
    """Page fixture that reports the string form of its source."""

    # need this here to test example_sources are picked up
    example_source = NullSource()

    def process_page(self):
        """Return ``{"source": str(self.source)}``."""
        source_text = str(self.source)
        return {"source": source_text}
Beispiel #16
0
class Subpage(Page):
    """Page fixture that hands its input back unchanged."""

    source = NullSource()

    def process_page(self):
        """Return the page input object itself (not a copy)."""
        page_input = self.input
        return page_input