Example #1
0
    def _parse_search_item(self, html1: _Element, html2: _Element,
                           html3: _Element,
                           metadata: dict) -> Optional[ChinaCdcItem]:
        """Build a ``ChinaCdcItem`` from the three elements of one search hit.

        Args:
            html1: Link element. Its string content (via
                ``get_element_str``) becomes the title and its ``href``
                attribute becomes the URL.
            html2: Element whose string content becomes the abstract.
            html3: Element whose ``text`` carries the publish date.
            metadata: Request metadata; only the ``'keyword'`` entry is read
                (``''`` when absent).

        Returns:
            The populated item, or ``None`` when the publish date cannot be
            extracted from ``html3``.

        Note:
            A missing ``href`` on ``html1`` is not handled here; the lookup
            error propagates to the caller.
        """
        title = get_element_str(html1)
        url = html1.attrib['href']
        abstract = get_element_str(html2)

        try:
            publish_str = html3.text.strip()
            matches = self.publish_time_pattern.findall(publish_str)
            if not matches:
                return None
            # Entries 1..3 of the first match are used as year, month, day.
            # NOTE(review): entry 0 is skipped - presumably the pattern's
            # first group wraps the whole date; confirm against the regex.
            year, month, day = (int(part) for part in matches[0][1:4])
            publish = datetime.datetime(year, month, day)
        except Exception:
            # Best effort: an item without a parseable date is dropped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            return None

        item = ChinaCdcItem()
        item.title = title
        item.url = url
        item.keyword = metadata.get('keyword', '')
        item.abstract = abstract
        item.publish = publish

        return item
Example #2
0
    def _parse_search_item(self, html: _Element,
                           metadata: dict) -> Optional[ChinaNewsItem]:
        """Build a ``ChinaNewsItem`` from one search-result element.

        The title and URL come from the first ``<a>`` under an ``<li>`` whose
        class contains ``news_title``; the abstract comes from the first
        ``<li class="news_content">``; the publish time is read from the
        first ``<li class="news_other">``.

        Args:
            html: Element wrapping a single search result.
            metadata: Request metadata; only the ``'keyword'`` entry is read
                (``''`` when absent).

        Returns:
            The populated item, or ``None`` when the publish time is missing
            or does not match ``%Y-%m-%d %H:%M:%S``.

        Note:
            A result without the title link or the content ``<li>`` is not
            handled here; indexing the empty XPath result raises.
        """
        title_link = html.xpath('.//li[contains(@class, "news_title")]/a')[0]
        title = utility.get_element_str(title_link)
        url = title_link.attrib['href']

        abstract = utility.get_element_str(
            html.xpath('.//li[@class="news_content"]')[0])

        try:
            other = html.xpath('.//li[@class="news_other"]')[0]
            # The text is tab-separated; the timestamp is the last field.
            publish_str = other.text.strip().split('\t')[-1]
            publish = datetime.datetime.strptime(publish_str,
                                                 '%Y-%m-%d %H:%M:%S')
        except Exception:
            # Best effort: an item without a parseable time is dropped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            return None

        item = ChinaNewsItem()
        item.title = title
        item.url = url
        item.keyword = metadata.get('keyword', '')
        item.abstract = abstract
        item.publish = publish

        return item
Example #3
0
    def _parse_search_item(self, html: _Element,
                           metadata: dict) -> Optional[CnrItem]:
        """Build a ``CnrItem`` from one search-result element.

        The title and URL come from the first ``div[1]/a`` link, the abstract
        from ``div[2]`` when present (``''`` otherwise), and the publish time
        from the ``tail`` of the first ``span.searchresulturl`` found under a
        child ``div``.

        Args:
            html: Element wrapping a single search result.
            metadata: Request metadata; only the ``'keyword'`` entry is read
                (``''`` when absent).

        Returns:
            The populated item, or ``None`` when the publish time is missing
            or does not match ``%Y.%m.%d %H:%M:%S``.

        Note:
            A result without the ``div[1]/a`` link is not handled here;
            indexing the empty XPath result raises.
        """
        title_link = html.xpath('div[1]/a')[0]
        title = utility.get_element_str(title_link)
        url = title_link.attrib['href']

        abstract = ''
        abstract_nodes = html.xpath('div[2]')
        if abstract_nodes:
            abstract = utility.get_element_str(abstract_nodes[0])

        try:
            url_span = html.xpath('div/span[@class="searchresulturl"]')[0]
            # The timestamp is read from the span's ``tail``, not its text.
            publish_str = url_span.tail.strip()
            publish = datetime.datetime.strptime(publish_str,
                                                 '%Y.%m.%d %H:%M:%S')
        except Exception:
            # Best effort: an item without a parseable time is dropped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            return None

        item = CnrItem()
        item.title = title
        item.url = url
        item.abstract = abstract
        item.keyword = metadata.get('keyword', '')
        item.publish = publish

        return item
Example #4
0
    def _parse_search_item(self, html: _Element,
                           metadata: dict) -> Optional[GovItem]:
        """Build a ``GovItem`` from one search-result element.

        The title and URL come from the first ``h3/a`` link, the abstract
        from the first ``p.res-sub`` when present (``''`` otherwise), and the
        publish date from the text of the first ``span`` under
        ``p.res-other``.

        Args:
            html: Element wrapping a single search result.
            metadata: Request metadata; only the ``'keyword'`` entry is read
                (``''`` when absent).

        Returns:
            The populated item, or ``None`` when the publish date cannot be
            extracted.

        Raises:
            Exception: If the result has no ``h3/a`` link.
        """
        links = html.xpath('h3/a')
        if not links:
            raise Exception('Failed to parse item')

        link = links[0]
        title = get_element_str(link)
        url = link.attrib['href']

        abstract = ''
        abstract_elements = html.xpath('p[@class="res-sub"]')
        if abstract_elements:
            abstract = get_element_str(abstract_elements[0])

        try:
            date_span = html.xpath('.//p[@class="res-other"]/span')[0]
            publish_str = date_span.text.strip()
            matches = self.publish_time_pattern.findall(publish_str)
            if not matches:
                return None
            # Entries 1..3 of the first match are used as year, month, day.
            # NOTE(review): entry 0 is skipped - presumably the pattern's
            # first group wraps the whole date; confirm against the regex.
            year, month, day = (int(part) for part in matches[0][1:4])
            publish = datetime.datetime(year, month, day)
        except Exception:
            # Best effort: an item without a parseable date is dropped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            return None

        item = GovItem()
        item.title = title
        item.url = url
        item.keyword = metadata.get('keyword', '')
        item.abstract = abstract
        item.publish = publish

        return item