Ejemplo n.º 1
0
    def __init__(self, indexTableName=INDEX_TABLE, linksTableName=LINKS_TABLE):
        """
        Set up the section-heading patterns, the two table models and the
        statistics counters.

        indexTableName -- forwarded to IndexModel as its indexTable argument
        linksTableName -- forwarded to LinksModel as its linksTable argument
        """
        # Patterns marking the end of the main wiki article body.
        # Raw strings keep the regex escapes out of Python's own string-escape
        # processing (a bare '\s' in a normal literal is an invalid escape).
        self.reference_pattern = re.compile(r'=+\s?References\s?=+')
        self.extlink_pattern = re.compile(r'=+\s?External links\s?=+')

        # Model objects for the links table and the index table.
        # NOTE(review): presumably these open the database connections and
        # create missing tables -- confirm in LinksModel / IndexModel.
        self.linksModel = LinksModel(linksTable=linksTableName)
        self.indexModel = IndexModel(indexTable=indexTableName)

        # Stats counters
        self.total_articles = 0
        self.total_links = 0
Ejemplo n.º 2
0
class ArticleLinkExtractor:
    """
    Class to read through a MySQL index of wiki articles, extract the set of all links contained
    in each article, and store them in the links table
    """

    def __init__(self, indexTableName=INDEX_TABLE, linksTableName=LINKS_TABLE):
        """
        Set up the section-heading patterns, the two table models and the
        statistics counters.

        indexTableName -- forwarded to IndexModel as its indexTable argument
        linksTableName -- forwarded to LinksModel as its linksTable argument
        """
        # Patterns marking the end of the main wiki article body.
        # Raw strings keep the regex escapes out of Python's own string-escape
        # processing (a bare '\s' in a normal literal is an invalid escape).
        self.reference_pattern = re.compile(r'=+\s?References\s?=+')
        self.extlink_pattern = re.compile(r'=+\s?External links\s?=+')

        # Model objects for the links table and the index table.
        # NOTE(review): presumably these open the database connections and
        # create missing tables -- confirm in LinksModel / IndexModel.
        self.linksModel = LinksModel(linksTable=linksTableName)
        self.indexModel = IndexModel(indexTable=indexTableName)

        # Stats counters
        self.total_articles = 0
        self.total_links = 0

    def extractLinks(self, wikiPage):
        """
        Takes a WikiPage object as an argument (only its .text attribute is read)
        Returns a set of the link targets found within the current article

        Scanning stops at the "References" heading, or, when there is none,
        at the "External links" heading. Links whose target starts with
        'File:' are skipped. Alternate display text (after '|') and section
        anchors (after '#') are removed, and each target is whitespace-stripped.
        """
        text = wikiPage.text
        links = set()

        # End string at Reference or External Links section if possible
        reference_match = self.reference_pattern.search(text)
        if reference_match:
            text = text[:reference_match.start()]
        else:
            extlink_match = self.extlink_pattern.search(text)
            if extlink_match:
                text = text[:extlink_match.start()]

        # Hunt for links (items surrounded by double square brackets), walking
        # an offset through the text instead of re-slicing it after every link.
        link_start = text.find("[[")
        while link_start != -1:
            target_start = link_start + 2

            # Ignore links that start with 'File:'
            if text.startswith("File:", target_start):
                # Step over the opening brackets only, so that links nested
                # inside the file caption are still picked up.
                resume_at = target_start
            #TODO: Check for "Category:" also
            else:
                # Find first matching closing bracket pair
                link_end = text.find("]]", target_start)
                if link_end == -1:
                    # Unterminated link: no "]]" remains anywhere in the text,
                    # so no further complete link can exist. Stop here rather
                    # than recording a truncated target.
                    break
                new_link = text[target_start:link_end]

                # Remove alternate display text
                alt_text = new_link.find("|")
                if alt_text > 0:
                    new_link = new_link[:alt_text]
                # Remove section anchor
                anchor_text = new_link.find("#")
                if anchor_text > 0:
                    new_link = new_link[:anchor_text]

                # Add new link to current set of links
                links.add(new_link.strip())
                resume_at = link_end + 2

            # Continue searching after the part already handled
            link_start = text.find("[[", resume_at)
        return links

    def parseAllNewArticles(self):
        """
        Determine last index row ID added to DB
        Iterate through all articles starting from this row in the indexTable
        Create a WikiPage from the row and pass to extractLinks(WikiPage)
        Store all links found in linksTable
        """
        # Find number of possible objects in DB.
        # NOTE(review): a None result (presumably an empty table -- confirm in
        # IndexModel) is treated as 0 so the loop below is simply skipped.
        max_count = self.indexModel.getMaxRowId() or 0

        # Determine last article ID added to DB (None/0 means nothing stored yet)
        last_id = self.linksModel.getMaxIndexId()
        if last_id and last_id > 0:
            curr_row = last_id + 1
            print("Indexing all articles starting from row %d in indexTable %s"
                  % (curr_row, self.indexModel.indexTable))
        else:
            curr_row = 1
            print("No previous articles found in linksTable %s, adding all new articles from the beginning."
                  % self.linksModel.linksTable)

        # Fetch every possible row in the database
        while curr_row <= max_count:
            wiki_page = self.indexModel.getWikiPage(curr_row)
            # Rows for which no page is returned are skipped
            if wiki_page:
                links = self.extractLinks(wiki_page)
                self.total_articles += 1
                self.total_links += len(links)

                # Store links in the links table
                self.linksModel.storeLinks(curr_row, links)
                print("Links within %s (rowid %d): %d" % (wiki_page.title, curr_row, len(links)))
            curr_row += 1

    def exitHandler(self):
        """
        Called when program is killed
        Closes both tables and prints summary statistics for this run
        """
        self.linksModel.closeTable()
        self.indexModel.closeTable()
        # Show off the output
        print("")
        print("ArticleLinkExtractor closing. Parsed the following new articles:")
        if self.total_articles > 0:
            print("Total articles:  %d" % self.total_articles)
            print("Total links:     %d" % self.total_links)
            # 1.0 * forces float division on Python 2 as well
            print("Avg links/art:   %f" % (1.0 * self.total_links / self.total_articles))
        else:
            print("No new articles added.")