コード例 #1
0
    def test_empty(self):
        """
        Test a page with no links: the extracted link set is expected
        to be empty.
        """
        page = WikiPage("Test 2", 2, "No one put any links inside me!")

        # Plain text without any [[...]] markup should produce no links.
        self.assertSetEqual(extractLinks(page), set())
コード例 #2
0
    def test_cat_text(self):
        """
        Test links that carry a '#' suffix after the page title
        [[Page Title#Displayed Text]]
        Only the part before the '#' is expected in the result.
        """
        page = WikiPage(
            "Test 4",
            4,
            "Sometimes I worry about [[Supermajority#Majority of the entire membership]]",
        )

        expected = set(['Supermajority'])
        self.assertSetEqual(extractLinks(page), expected)
コード例 #3
0
    def test_basic(self):
        """
        Test the simplest link form
        [[Page Title]]
        Each bracketed title is expected to appear in the result as-is.
        """
        page = WikiPage("Test 1", 1, "[[Cats]] are the [[shit]] and everyone knows it.")

        expected = set(['Cats', 'shit'])
        actual = extractLinks(page)
        self.assertSetEqual(actual, expected)
コード例 #4
0
    def test_categories(self):
        """
        Test that category links are ignored
        [[Category:Name|Displayed Text]] and [[:Category:Name|Displayed Text]]
        Only the ordinary [[Athena]] link is expected in the result.
        """
        # Implicit concatenation inside parentheses instead of a backslash
        # continuation; the resulting string is unchanged.
        source = (
            "Other commonly received and repeated types of [[Athena]] in sculpture may be found in [[:Category:"
            "Athena types|this list]] or fun stuff like [[Category:Fun Stuff|these categories!]]."
        )
        page = WikiPage("Test 6", 6, source)

        expected = set(['Athena'])
        self.assertSetEqual(extractLinks(page), expected)
コード例 #5
0
    def test_alt_text(self):
        """
        Test links that carry alternative display text
        [[Page Title|Displayed Text]]
        The page title (left of the '|') is expected, not the shown text.
        """
        # Implicit concatenation inside parentheses instead of a backslash
        # continuation; the resulting string is unchanged.
        source = (
            "At first, this was a play on words, as [[Abba Seafood|Abba]] is also the name of a well-known fish-"
            "canning company in Sweden"
        )
        page = WikiPage("Test 3", 3, source)

        expected = set(['Abba Seafood'])
        self.assertSetEqual(extractLinks(page), expected)
コード例 #6
0
    def test_end_sections(self):
        """
        Test that links inside common trailing sections are ignored
        ==References== (also possibly ==External Links==)
        The only link in this page sits below the References heading, so
        an empty result is expected.
        """
        # NOTE: the literal below is intentionally left untouched; its inner
        # lines keep their leading whitespace as part of the page text.
        text = """Done Talking.
        ==References==
        [[My references]]
        """
        page = WikiPage("Test 7", 7, text)

        self.assertSetEqual(extractLinks(page), set())
コード例 #7
0
    def test_files(self):
        """
        Test ignoring of embedded files
        [[File:some-pic.jpg]]
        Is that embedded link in the 'File:' tag an issue? Should probably check that.
        """
        text = """Anna Kournikova was a [[Russian]] I think, and she played tennis.
[[File:Kournikova-Hingis-SYD-1.jpg|thumb|300px|right|Kournikova (left) with doubles partner Martina Hingis.]]
While [[Kournikova]] had a successful singles season, she was even more successful in doubles.
        """
        wiki_page = WikiPage("Test 5", 5, text)

        correct_links = {'Russian', 'Kournikova'}
        found_links = extractLinks(wiki_page)
        self.assertSetEqual(found_links, correct_links)
コード例 #8
0
    def test_small_example(self):
        """
        Test full page with several things present.
        """
        text = """==Events in August==
[[August]] is a boring month. Nothing fun in [[August]]. School I guess.
[[File:Les Tres Riches Heures du duc de Berry aout.jpg|right|thumb|August, from the Tres Riches Heures du Duc de Berry showing a group of travelers]]
* In the [[neopaganism|neopagan]] [[wheel of the year]] August begins at or near [[Lughnasadh#Location]] (also known as Lammas) in the [[northern hemisphere]] and [[Imbolc]] (also known as Candlemas) in the [[southern hemisphere]].
==External links==
[[Some link]]
[[Another link|Not a fun link]]
        """
        wiki_page = WikiPage("Test 7", 7, text)

        correct_links = {'August', 'neopaganism', 'wheel of the year', 'Lughnasadh', 'northern hemisphere',
                         'Imbolc', 'southern hemisphere'}
        found_links = extractLinks(wiki_page)
        self.assertSetEqual(found_links, correct_links)
コード例 #9
0
    def extractLinksFromArticle(self, wikiPage):
        """
        Extract all links from a WikiPage and, if the article has at least
        MIN_LINKS links, store the links and the article data through
        linksModel and indexModel.

        Used as callback method with WikiDumpHandler.

        Pages whose id is not greater than self.lastId are treated as already
        added and skipped without any work.

        :param wikiPage: a page_parser.WikiPage object; its ``id`` must be
            convertible with int()
        :return: None
        """
        # If the current WikiPage has been added already, skip it
        if int(wikiPage.id) <= self.lastId:
            return

        # Extract links from the current article; the count is reused below,
        # so compute it once.
        links = extractLinks(wikiPage=wikiPage)
        num_links = len(links)

        # Articles that are not significant enough are not stored
        if num_links < MIN_LINKS:
            return

        self.linksModel.storeLinks(wikiPage.id, links)
        # NOTE(review): the meaning of the trailing -1 is defined by
        # indexModel.storeWikiArticle, which is not visible here -- confirm.
        self.indexModel.storeWikiArticle(wikiPage, num_links, -1)

        self.total_articles += 1
        self.total_links += num_links
        # Single parenthesised argument: valid as a Python 2 print statement
        # and as a Python 3 print() call, with identical output.
        print("Inserted %s; Number of links: %d" % (wikiPage.__str__(), num_links))