Beispiel #1
0
def calcReverseLinks(fileName):
    """Build the reverse-link index for a converted dump and store it in the db.

    For every non-redirect article yielded by
    wikipediasql.iterConvertedArticles(fileName), the links found in its text
    are extracted and, per lower-cased link, the titles of the articles that
    contain it are collected. Each collected entry is then written to the
    reverse_links table, with the collected titles joined by LINK_SEPARATOR.

    fileName -- handed to wikipediasql.iterConvertedArticles and to
                getDbNameFromFileName

    Returns None. Progress and statistics go to stdout and stderr.
    """
    print("Calculating reverse links")
    count = 0
    # lower-cased link -> [link as first seen, linking title, linking title, ...]
    reverseLinks = {}
    totalLinksCount = 0
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        count += 1
        title = article.getTitle()
        if LINK_SEPARATOR in title:
            # such a title could not be told apart from the separator once the
            # titles are joined below, so it is left out entirely
            print("rejected title '%s', has link separator (%d)" % (title, ord(LINK_SEPARATOR)))
            continue
        body = article.getText()
        links = articleconvert.articleExtractLinksSimple(body)
        totalLinksCount += len(links)
        for link in links:
            linkLower = link.lower()
            currentLinks = reverseLinks.get(linkLower)
            if currentLinks is None:
                reverseLinks[linkLower] = [link, title]
            elif len(currentLinks) < REVERSE_LINK_LIMIT:
                # the length checked against the limit includes the leading
                # link text stored at index 0
                if title not in currentLinks:  # TODO: only needed because of duplicates?
                    currentLinks.append(title)
        if count % 20000 == 0:
            sys.stderr.write("processed %d articles\n" % count)
    print("number of articles with reverse links: %d" % len(reverseLinks))
    # guard against a dump that produced no links at all, which used to raise
    # ZeroDivisionError here
    if reverseLinks:
        avgLinksCount = float(totalLinksCount) / float(len(reverseLinks))
    else:
        avgLinksCount = 0.0
    print("average number of links: %.2f" % avgLinksCount)
    # now dump them into a database
    print("started inserting data into reverse_links table")
    dbName = getDbNameFromFileName(fileName)
    cur = getNamedCursor(getIpediaConnection(dbName), "rev_links_write_cur")
    for rLinks in reverseLinks.values():
        title = rLinks[0]
        links = rLinks[1:]
        assert len(links) > 0
        # titles containing LINK_SEPARATOR were rejected above, so the joined
        # string can be split on the separator without any escaping
        body = LINK_SEPARATOR.join(links)
        try:
            sql = "INSERT INTO reverse_links (title,links_to_it) VALUES ('%s', '%s');" % (dbEscape(title), dbEscape(body))
            cur.execute(sql)
        except Exception:
            # assuming that the exception happened because of trying to insert
            # an item with a duplicate title (the original comment attributes
            # this to lower-case conversion mapping 2 different titles onto
            # the same title)
            try:
                sql = "UPDATE reverse_links SET links_to_it='%s' WHERE title='%s';" % (dbEscape(body), dbEscape(title))
                cur.execute(sql)
            except Exception:
                # nothing we can do about it; report and carry on with the rest
                sys.stderr.write("Exception in UPDATE article '%s' with body of len %d\n" % (title, len(body)))
    print("finished inserting data into reverse_links table")
Beispiel #2
0
def findConvertedArticle(fileName,titleToFind):
    """Return the first converted article whose title matches titleToFind.

    The requested title is lower-cased and its spaces are replaced with
    underscores; every article title is lower-cased before the comparison.

    fileName    -- handed to wikipediasql.iterConvertedArticles
    titleToFind -- title to look for

    Returns the matching article object, or None when nothing matches.
    """
    wanted = titleToFind.lower().replace(" ", "_")
    print("looking for converted article with title '%s'" % wanted)
    for candidate in wikipediasql.iterConvertedArticles(fileName):
        candidateTitle = candidate.getTitle().lower()
        if candidateTitle != wanted:
            continue
        print("found converted article with title '%s'" % candidateTitle)
        return candidate
    return None
Beispiel #3
0
def findConvertedArticle(fileName, titleToFind):
    """Look up a converted article by title, ignoring case.

    The title being searched for is lower-cased and has its spaces turned
    into underscores; article titles are lower-cased before comparing.

    fileName    -- handed to wikipediasql.iterConvertedArticles
    titleToFind -- title to look for

    Returns the first article whose title matches, or None if there is none.
    """
    normalizedTitle = titleToFind.lower().replace(" ", "_")
    print("looking for converted article with title '%s'" % normalizedTitle)
    for entry in wikipediasql.iterConvertedArticles(fileName):
        entryTitle = entry.getTitle().lower()
        if entryTitle != normalizedTitle:
            continue
        print("found converted article with title '%s'" % entryTitle)
        return entry
    return None
Beispiel #4
0
def findConvertedArticlesUnderThreshold(fileName,thresholdSize):
    """Collect non-redirect converted articles whose text is shorter than thresholdSize.

    While scanning, also counts the non-redirect articles whose text contains
    no comma and prints their number and average text length to stdout.

    fileName      -- handed to wikipediasql.iterConvertedArticles
    thresholdSize -- articles with len(text) strictly below this are collected

    Returns the list of collected article objects.
    """
    print("looking for converted articles smaller than %d bytes" % thresholdSize)
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        body = article.getText()
        bodySize = len(body)
        if bodySize < thresholdSize:
            articles.append(article)
        if "," not in body:
            countNoComma += 1
            totalSizeNoComma += bodySize
        count += 1
        if count % 20000 == 0:
            print("processed %d articles, found %d small" % (count, len(articles)))
    print("Articles without comma in converted: %d" % countNoComma)
    # when every article has a comma (or there were no articles) the average
    # used to raise ZeroDivisionError and the collected list was lost
    if countNoComma:
        avgSize = float(totalSizeNoComma) / float(countNoComma)
    else:
        avgSize = 0.0
    print("Average size: %.2f" % avgSize)
    return articles
Beispiel #5
0
def findConvertedArticlesUnderThreshold(fileName, thresholdSize):
    """Collect non-redirect converted articles whose text is shorter than thresholdSize.

    While scanning, also counts the non-redirect articles whose text contains
    no comma and prints their number and average text length to stdout.

    fileName      -- handed to wikipediasql.iterConvertedArticles
    thresholdSize -- articles with len(text) strictly below this are collected

    Returns the list of collected article objects.
    """
    print("looking for converted articles smaller than %d bytes" % thresholdSize)
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        body = article.getText()
        bodySize = len(body)
        if bodySize < thresholdSize:
            articles.append(article)
        if "," not in body:
            countNoComma += 1
            totalSizeNoComma += bodySize
        count += 1
        if count % 20000 == 0:
            print("processed %d articles, found %d small" % (count,
                                                             len(articles)))
    print("Articles without comma in converted: %d" % countNoComma)
    # when every article has a comma (or there were no articles) the average
    # used to raise ZeroDivisionError and the collected list was lost
    if countNoComma:
        avgSize = float(totalSizeNoComma) / float(countNoComma)
    else:
        avgSize = 0.0
    print("Average size: %.2f" % avgSize)
    return articles
Beispiel #6
0
def calcReverseLinks(fileName):
    """Build the reverse-link index for a converted dump and store it in the db.

    For every non-redirect article yielded by
    wikipediasql.iterConvertedArticles(fileName), the links found in its text
    are extracted and, per lower-cased link, the titles of the articles that
    contain it are collected. Each collected entry is then written to the
    reverse_links table, with the collected titles joined by LINK_SEPARATOR.

    fileName -- handed to wikipediasql.iterConvertedArticles and to
                getDbNameFromFileName

    Returns None. Progress and statistics go to stdout and stderr.
    """
    print("Calculating reverse links")
    count = 0
    # lower-cased link -> [link as first seen, linking title, linking title, ...]
    reverseLinks = {}
    totalLinksCount = 0
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        count += 1
        title = article.getTitle()
        if LINK_SEPARATOR in title:
            # such a title could not be told apart from the separator once the
            # titles are joined below, so it is left out entirely
            print("rejected title '%s', has link separator (%d)" % (
                title, ord(LINK_SEPARATOR)))
            continue
        body = article.getText()
        links = articleconvert.articleExtractLinksSimple(body)
        totalLinksCount += len(links)
        for link in links:
            linkLower = link.lower()
            currentLinks = reverseLinks.get(linkLower)
            if currentLinks is None:
                reverseLinks[linkLower] = [link, title]
            elif len(currentLinks) < REVERSE_LINK_LIMIT:
                # the length checked against the limit includes the leading
                # link text stored at index 0
                if title not in currentLinks:  # TODO: only needed because of duplicates?
                    currentLinks.append(title)
        if count % 20000 == 0:
            sys.stderr.write("processed %d articles\n" % count)
    print("number of articles with reverse links: %d" % len(reverseLinks))
    # guard against a dump that produced no links at all, which used to raise
    # ZeroDivisionError here
    if reverseLinks:
        avgLinksCount = float(totalLinksCount) / float(len(reverseLinks))
    else:
        avgLinksCount = 0.0
    print("average number of links: %.2f" % avgLinksCount)
    # now dump them into a database
    print("started inserting data into reverse_links table")
    dbName = getDbNameFromFileName(fileName)
    cur = getNamedCursor(getIpediaConnection(dbName), "rev_links_write_cur")
    for rLinks in reverseLinks.values():
        title = rLinks[0]
        links = rLinks[1:]
        assert len(links) > 0
        # titles containing LINK_SEPARATOR were rejected above, so the joined
        # string can be split on the separator without any escaping
        body = LINK_SEPARATOR.join(links)
        try:
            sql = "INSERT INTO reverse_links (title,links_to_it) VALUES ('%s', '%s');" % (
                dbEscape(title), dbEscape(body))
            cur.execute(sql)
        except Exception:
            # assuming that the exception happened because of trying to insert
            # an item with a duplicate title (the original comment attributes
            # this to lower-case conversion mapping 2 different titles onto
            # the same title)
            try:
                sql = "UPDATE reverse_links SET links_to_it='%s' WHERE title='%s';" % (
                    dbEscape(body), dbEscape(title))
                cur.execute(sql)
            except Exception:
                # nothing we can do about it; report and carry on with the rest
                sys.stderr.write(
                    "Exception in UPDATE article '%s' with body of len %d\n" %
                    (title, len(body)))
    print("finished inserting data into reverse_links table")