def scrape_bills_and_urls_from_table(table, response_url):
    def get_bill_category_or_none(caption):
        if caption == '閣法の一覧':
            return BillCategory.KAKUHOU
        elif caption == '衆法の一覧':
            return BillCategory.SHUHOU
        elif caption == '参法の一覧':
            return BillCategory.SANHOU
        else:
            return None

    bills, urls = [], []
    caption = extract_text(table.xpath('./caption')).strip()
    maybe_bill_category = get_bill_category_or_none(caption)
    if not maybe_bill_category:
        return bills, urls
    bill_category = maybe_bill_category

    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build keika URL if exists
        maybe_keika_href = extract_full_href_or_none(cells[4], response_url)
        if maybe_keika_href:
            url = build_url(maybe_keika_href, UrlTitle.KEIKA, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

        # build honbun URL if exists
        maybe_honbun_href = extract_full_href_or_none(cells[5], response_url)
        if maybe_honbun_href:
            url = build_url(maybe_honbun_href, UrlTitle.HONBUN, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

    return bills, urls
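# extract_full_href_or_none is used throughout but not defined in this
# module; a minimal sketch of what it is assumed to do (resolve the first
# <a href> under a Scrapy selector against the page URL). The real helper
# may differ.
from urllib.parse import urljoin

def extract_full_href_or_none(selector, response_url):
    href = selector.xpath('.//a/@href').get()
    if href:
        return urljoin(response_url, href)
    return None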
def scrape_minutes_urls_from_response(response):
    urls = []
    for li in response.xpath('//div[@id="mainlayout"]/li'):
        url = extract_full_href_or_none(li, response.url)
        if url:
            urls.append(url)
    return urls
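# A hypothetical caller for the listing scraper above; the method name and
# callback wiring are assumptions, written as a spider method in the same
# style as parse_minutes below.
def parse_minutes_list(self, response):
    for minutes_url in scrape_minutes_urls_from_response(response):
        yield response.follow(minutes_url, callback=self.parse_minutes)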
def parse_minutes(self, response):
    # merge url if exists
    maybe_href = extract_full_href_or_none(response.xpath('//h4'), response.url)
    if not maybe_href:
        LOGGER.warning(f'failed to find url in {response.url}')
        return
    url = build_url(maybe_href, title=UrlTitle.GAIYOU_PDF, domain=self.domain)
    self.gql_client.merge(url)
    LOGGER.debug(f'merged {url.id}')

    # link to minutes
    title = extract_text(response.xpath('//title'))
    committee_name = response.meta['committee_name']
    date_time = self.extract_datetime_from_title(title)
    minutes = build_minutes(committee_name, date_time)
    try:
        self.gql_client.get(minutes.id, ['id'])  # minutes should already exist
        self.gql_client.link(url.id, minutes.id)
    except GraphQLException:
        LOGGER.warning(f'failed to find minutes ({committee_name}, {date_time})')
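# extract_datetime_from_title is not shown here; a minimal sketch, assuming
# the page title embeds a Japanese era date such as 令和3年2月10日. The
# actual title format and the set of eras handled are assumptions.
import re
from datetime import datetime

ERA_BASE_YEAR = {'令和': 2018, '平成': 1988, '昭和': 1925}  # era year 1 = base + 1

def extract_datetime_from_title(title):
    match = re.search(r'(令和|平成|昭和)(元|\d+)年(\d+)月(\d+)日', title)
    if not match:
        raise ValueError(f'no date found in title: {title}')
    era, year, month, day = match.groups()
    era_year = 1 if year == '元' else int(year)
    return datetime(ERA_BASE_YEAR[era] + era_year, int(month), int(day))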
def scrape_bills_and_urls_from_table(self, table, bill_category, response_url):
    bills, urls = [], []
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 5

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build URL if exists (the 明細 link sits in the same cell as the bill name)
        maybe_meisai_href = extract_full_href_or_none(cells[2], response_url)
        if maybe_meisai_href:
            url = build_url(maybe_meisai_href, UrlTitle.GIAN_ZYOUHOU, self.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

    return bills, urls
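# Hypothetical persistence step for the bills and URLs returned above,
# mirroring the gql_client.merge/link calls used in parse_minutes; the
# function name is an assumption.
def store_bills_and_urls(gql_client, bills, urls):
    for bill in bills:
        gql_client.merge(bill)
    for url in urls:
        gql_client.merge(url)
        gql_client.link(url.id, url.meta['bill_id'])  # attach each URL to its bill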
def scrape_committees_from_table(table, root_url):
    committees = []
    for row in table.xpath('./tr'):
        for cell in row.xpath('./td'):
            committee = Committee(None)
            committee.name = '衆議院' + extract_text(cell.xpath('./span/a'))
            committee.url = extract_full_href_or_none(cell, root_url)
            committees.append(committee)
    return committees
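# Hypothetical follow-up requests for each committee page; the table
# selector and callback name are assumptions. The committee name is
# forwarded via request meta, matching the response.meta['committee_name']
# lookup in parse_minutes above.
def parse(self, response):
    table = response.xpath('//table')[0]  # assumed location of the committee table
    for committee in scrape_committees_from_table(table, response.url):
        if committee.url:
            yield response.follow(
                committee.url,
                callback=self.parse_committee,  # hypothetical callback
                meta={'committee_name': committee.name})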
def scrape_members_and_urls(self, response):
    members, urls = [], []
    table = response.xpath('//table[@summary="議員一覧(50音順)"]')[0]
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6

        name = ''.join(extract_text(cells[0]).strip().split())
        tags = [  # store 会派 and 選挙区 as tags for now
            extract_text(cells[2]).strip(),
            extract_text(cells[3]).strip(),
        ]
        member = build_member(name)
        member.tags = tags
        member.house = 'COUNCILORS'
        members.append(member)

        maybe_href = extract_full_href_or_none(cells[0], response.url)
        if maybe_href:
            url = build_url(maybe_href, UrlTitle.GIIN_ZYOUHOU, self.domain)
            url.meta = {'member_id': member.id}
            urls.append(url)

    return members, urls
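# The join/split normalization above strips internal whitespace from the
# scraped name, presumably the full-width space between family and given
# name; a quick illustration (the sample name is made up):
assert ''.join('山田 太郎'.split()) == '山田太郎'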