Example #1
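# Assumed imports: unittest is standard library; WikiQuery, WikiFetch,
# XPathExtractor, ElasticSearchOperate and the __wiki_*__ field constants are
# expected to come from the surrounding project.
from unittest import TestCase
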
class TestWikiScraper(TestCase):
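    """Integration tests for the wiki pipeline: querying page ids, fetching raw
    pages into Elasticsearch, and parsing them with XPathExtractor."""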

    es_ops = None

    def __init__(self, *args, **kwargs):
        super(TestWikiScraper, self).__init__(*args, **kwargs)
        self.es_ops = ElasticSearchOperate()

    def test_query_wiki_pages(self):
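        """Every search query should return exactly wiki_max_results pages."""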
        query_set = [
            "Alan Turing", "Harry Potter and the Deathly Hallows", "Tiger",
            "Melbourne"
        ]
        for query in query_set:
            wikiq = WikiQuery(query)
            page_list = wikiq.fetch_wiki_pages()
            assert len(page_list) == wikiq.wiki_max_results

    def test_fetch_wiki_pages(self):
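        """Fetched pages should be stored in Elasticsearch with a title, raw HTML and a revision."""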
        query_set = [736]
        wikif = WikiFetch(query_set)
        wikif.parse_wiki_page()

        for pageid in query_set:
            wiki_data = self.es_ops.get_wiki_article(pageid)
            assert wiki_data is not None
            assert wiki_data[__wiki_title__]
            assert wiki_data[__wiki_raw__]
            assert wiki_data[__wiki_revision__]

    def test_parse_wiki_pages(self):
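        """Text extracted from a parsed page should contain no leftover HTML tags."""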
        query_set = [736]

        html_tag_expr = r'<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>(.*?)</\1>'

        for page in query_set:
            with self.subTest(page):
                xpe = XPathExtractor(page)
                xpe.strip_tag()
                xpe.strip_headings()
                img = xpe.img_extract()
                info = xpe.extract_info()
                table = xpe.extract_tables()
                text = xpe.extract_text()
                self.assertNotRegex(text, html_tag_expr)
Example #2
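# Assumed imports: json, re and lxml are the only external requirements shown
# here; ElasticSearchOperate, WikiInfo, WikiTable, logger, OUTPUT_DIR and the
# __wiki_*__ field constants are expected to come from the surrounding project.
import json
import re

from lxml import etree
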
class XPathExtractor:
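    """Clean the raw Wikipedia HTML stored in Elasticsearch for a page and
    extract images, infobox data, tables and plain text, writing the parsed
    results back to the same Elasticsearch document."""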

    regex_expressions = "http://exslt.org/regular-expressions"
    non_break_space = u'\xa0'
    new_line_non_break_regex = r'(\n+)|(\xa0)'

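    # XPath selectors for the Wikipedia boilerplate that strip_tag() removes:
    # table of contents, hatnotes, reference lists, navboxes, metadata boxes,
    # citation superscripts and "[edit]" section links.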
    toc_pattern = '''//*[@id="toc"]'''
    non_searchable_pattern = '''/html/body/div/div[starts-with(@class, "hatnote")]'''
    description_list_pattern = '''/html/body/div/dl'''
    references_pattern = '''/html/body/div/div[starts-with(@class, "refbegin")]'''
    references_list_pattern = '''/html/body/div/div[starts-with(@class, "reflist")]'''
    meta_data_box_pattern = '''/html/body/div/div[starts-with(@class, "metadata")]'''
    nav_boxes_pattern = '''/html/body/div/div[@class="navbox"]'''
    vertical_nav_boxes_pattern = '''/html/body/div/table[starts-with(@class, "vertical-navbox")]'''
    no_print_metadata_pattern = '''/html/body/div/div[starts-with(@class, "noprint")]'''
    subscript_pattern = '''//sup[@class="reference"]'''
    edit_pattern = '''//span[@class="mw-editsection"]'''
    meta_data_table = '''/html/body/div/table[contains(@class, "metadata")]'''

    see_also_pattern = '''//*[@id="See_also"]'''
    external_links_pattern = '''//*[@id="External_links"]'''

    img_pattern = '''/html/body/div//div[starts-with(@class, "thumb ")]'''
    img_href = '''./div//a/@href'''
    img_caption = '''.//div[@class="thumbcaption"]/text()'''

    info_box_pattern = '''/html/body/div/table[starts-with(@class, "infobox")]'''
    info_box_item = '''./tr'''
    info_key_pattern = '''./th//text()'''
    info_value_pattern = '''./td//text()'''

    table_pattern = '''/html/body/div/table[@class="wikitable"]'''
    table_row_pattern = '''./tr'''
    table_key_pattern = '''./th'''
    table_value_pattern = '''./td'''
    all_text_pattern = '''.//text()'''

    irrelevant_headlines = [
        '''//*[@id="See_also"]''', '''//*[@id="Notes_and_references"]''',
        '''//*[@id="Explanatory_notes"]''', '''//*[@id="Citations"]''',
        '''//*[@id="Further_reading"]''', '''//*[@id="External_links"]''',
        '''//*[@id="References"]'''
    ]

    html_data = ''
    extracted_img = {}
    html_tree = None
    is_file = False
    page_id = None
    es_ops = None
    new_line_non_break_pattern = None

    def __init__(self, pageid=None, html_data=None, is_file=False):
        """Build the HTML tree either from a page id whose raw HTML is stored
        in Elasticsearch, or directly from raw HTML (optionally a file path)."""
        self.es_ops = ElasticSearchOperate()
        self.new_line_non_break_pattern = re.compile(
            self.new_line_non_break_regex)
        self.extracted_img = {}  # per-instance image url -> caption store
        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        if pageid is not None:
            self.page_id = pageid
            wiki_data = self.es_ops.get_wiki_article(pageid)
            if wiki_data is not None and __wiki_raw__ in wiki_data:
                self.html_data = wiki_data[__wiki_raw__]
                self.html_tree = etree.fromstring(self.html_data, parser)
        elif is_file:
            self.html_data = html_data
            self.html_tree = etree.parse(self.html_data, parser)
        else:
            self.html_data = html_data
            self.html_tree = etree.fromstring(self.html_data, parser)

    def strip_tag(self):
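        """Remove boilerplate from the HTML tree: TOC, hatnotes, description
        lists, reference sections, navboxes, metadata, citation superscripts,
        edit links, and the "See also" / "External links" blocks."""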

        toc_list = self.html_tree.xpath(self.toc_pattern)
        for toc in toc_list:
            toc.getparent().remove(toc)

        non_searchable_list = self.html_tree.xpath(self.non_searchable_pattern)
        for non_searchable in non_searchable_list:
            non_searchable.getparent().remove(non_searchable)

        dl_list = self.html_tree.xpath(self.description_list_pattern)
        for dl in dl_list:
            dl.getparent().remove(dl)

        ref_begin_list = self.html_tree.xpath(self.references_pattern)
        for ref_begin in ref_begin_list:
            ref_begin.getparent().remove(ref_begin)

        ref_list = self.html_tree.xpath(self.references_list_pattern)
        for ref in ref_list:
            ref.getparent().remove(ref)

        meta_data_list = self.html_tree.xpath(self.meta_data_box_pattern)
        for meta_data in meta_data_list:
            meta_data.getparent().remove(meta_data)

        meta_data_table = self.html_tree.xpath(self.meta_data_table)
        for meta_data in meta_data_table:
            meta_data.getparent().remove(meta_data)

        nav_box_list = self.html_tree.xpath(self.nav_boxes_pattern)
        for nav_box in nav_box_list:
            nav_box.getparent().remove(nav_box)

        vnav_box_list = self.html_tree.xpath(self.vertical_nav_boxes_pattern)
        for vnav_box in vnav_box_list:
            vnav_box.getparent().remove(vnav_box)

        no_print_list = self.html_tree.xpath(self.no_print_metadata_pattern)
        for no_print in no_print_list:
            no_print.getparent().remove(no_print)

        sub_ref_list = self.html_tree.xpath(self.subscript_pattern)
        for sub_ref in sub_ref_list:
            sub_ref.getparent().remove(sub_ref)

        edit_list = self.html_tree.xpath(self.edit_pattern)
        for edit in edit_list:
            edit.getparent().remove(edit)

        see_also_list = self.html_tree.xpath(self.see_also_pattern)
        for see_also in see_also_list:
            see_also_data = see_also.getparent().getnext()
            see_also_data.getparent().remove(see_also_data)

        external_link_list = self.html_tree.xpath(self.external_links_pattern)
        for external_link in external_link_list:
            external_link_data = external_link.getparent().getnext()
            external_link_data.getparent().remove(external_link_data)

    def strip_headings(self):
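        """Drop the headings ("See also", "References", "External links", ...)
        whose sections hold no searchable content."""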
        for heading in self.irrelevant_headlines:
            heading_parent_list = self.html_tree.xpath(heading)
            if len(heading_parent_list) > 0:
                heading_parent = heading_parent_list[0].getparent()
                heading_parent.getparent().remove(heading_parent)

    def img_extract(self):
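        """Collect thumbnail image URLs and captions, drop the thumbnails from
        the tree, and return a {url: caption} mapping."""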
        img_list = self.html_tree.xpath(self.img_pattern)
        for img in img_list:
            img_url, img_caption = "", ""
            img_url_list = img.xpath(self.img_href)
            if len(img_url_list) > 0:
                img_url = str(img_url_list[0])
            img_caption_list = img.xpath(self.img_caption)
            if len(img_caption_list) > 0:
                img_caption = ''.join(img_caption_list).strip()
            img.getparent().remove(img)
            if img_url != "":
                self.extracted_img[img_url] = img_caption
        logger.debug("Extracted Images: %d", len(self.extracted_img))
        return self.extracted_img

    def extract_info(self):
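        """Parse infobox rows into key/value pairs, store them as JSON in
        Elasticsearch, and return the collected data."""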
        info_box = self.html_tree.xpath(self.info_box_pattern)
        wiki_info = WikiInfo()
        for info in info_box:
            info_key = info.xpath(self.info_box_item)
            info_list = []
            info_title = ""
            for ikey in info_key:
                # &nbsp; handling: https://stackoverflow.com/a/33829869/8646414
                info_key = ''.join(ikey.xpath(self.info_key_pattern)).strip()
                info_value = ''.join(ikey.xpath(
                    self.info_value_pattern)).strip()
                info_value = info_value.split('\n')
                info_value = [item.strip() for item in info_value]
                if info_key != "" and len(info_value) >= 1:
                    if info_title == "":
                        info_title = info_key
                    if info_value[0] != '':
                        info_pair = {info_key: info_value}
                        info_list.append(info_pair)
            wiki_info.add_info(info_title, info_list)
            info.getparent().remove(info)
        res = self.es_ops.update_wiki_article(self.page_id,
                                              content_info=json.dumps(
                                                  wiki_info.info_data))
        if res:
            logger.info("Inserted parsed content info for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content info failed")
        logger.debug("Extracted Bios: %d", len(wiki_info.info_data))
        return wiki_info.info_data

    def extract_tables(self):
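        """Parse "wikitable" headers and rows, store them as JSON in
        Elasticsearch, and return the collected table data."""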
        table_list = self.html_tree.xpath(self.table_pattern)
        wikit = WikiTable()
        for table in table_list:
            table_row_list = table.xpath(self.table_row_pattern)
            for table_row in table_row_list:
                table_head_list = table_row.xpath(self.table_key_pattern)
                for table_head in table_head_list:
                    wikit.add_header(''.join(
                        table_head.xpath(self.all_text_pattern)))
                tab_data = []
                table_data_list = table_row.xpath(self.table_value_pattern)
                for table_data in table_data_list:
                    tab_data.append(''.join(
                        table_data.xpath(self.all_text_pattern)))
                wikit.set_values(tab_data)
            table.getparent().remove(table)
        res = self.es_ops.update_wiki_article(self.page_id,
                                              content_table=json.dumps(
                                                  wikit.tab_data))
        if res:
            logger.info("Inserted parsed content table for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content table failed")
        logger.debug("Extracted Tables: %d", len(wikit.tab_data))
        return wikit.tab_data

    def extract_text(self):
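        """Join the remaining text nodes, collapse newlines and non-breaking
        spaces, and store the plain text in Elasticsearch."""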
        text_data = ''.join(self.html_tree.xpath(
            self.all_text_pattern)).strip()
        text_data = re.sub(self.new_line_non_break_pattern, ' ', text_data)
        res = self.es_ops.update_wiki_article(self.page_id, content=text_data)
        logger.debug("Parsed content length: %d", len(text_data))
        if res:
            logger.info("Inserted parsed content for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content failed")
        return text_data

    def save_html(self, page=0):
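        """Write the cleaned HTML tree to OUTPUT_DIR for inspection."""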
        html_str = etree.tostring(self.html_tree, pretty_print=True)
        with open(OUTPUT_DIR + '/wiki_content_cleaned_' + str(page) + '.html',
                  'wb') as fp:
            fp.write(html_str)