def calcReverseLinks(fileName):
    """Build the reverse-link index for fileName and store it in reverse_links.

    Walks the converted articles returned by
    wikipediasql.iterConvertedArticles(fileName), skipping redirects and
    articles whose title contains LINK_SEPARATOR. For every link returned by
    articleconvert.articleExtractLinksSimple, the article title is recorded
    under the lower-cased link text.

    Each index entry is a list: element 0 is the link text as first seen,
    the remaining elements are titles of articles linking to it. An entry
    stops growing once its length (head element included) reaches
    REVERSE_LINK_LIMIT.

    The index is then written to the reverse_links table of the database
    derived from fileName, one row per entry, with the titles joined by
    LINK_SEPARATOR. Returns None.
    """
    print("Calculating reverse links")
    count = 0
    reverseLinks = {}
    totalLinksCount = 0
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        count += 1
        title = article.getTitle()
        if LINK_SEPARATOR in title:
            print("rejected title '%s', has link separator (%d)" % (title, ord(LINK_SEPARATOR)))
            continue
        body = article.getText()
        links = articleconvert.articleExtractLinksSimple(body)
        totalLinksCount += len(links)
        for link in links:
            linkLower = link.lower()
            currentLinks = reverseLinks.get(linkLower)
            if currentLinks is None:
                # first time this link is seen: keep its original spelling
                # as the head element, followed by the linking title
                reverseLinks[linkLower] = [link, title]
            elif len(currentLinks) < REVERSE_LINK_LIMIT and title not in currentLinks:
                # TODO: is the membership test only needed because of duplicates?
                currentLinks.append(title)
        if count % 20000 == 0:
            sys.stderr.write("processed %d articles\n" % count)

    print("number of articles with reverse links: %d" % len(reverseLinks))
    # Guard the average: with no links collected the division would raise
    # ZeroDivisionError and abort before the (empty) database step.
    if reverseLinks:
        avgLinksCount = float(totalLinksCount) / float(len(reverseLinks))
    else:
        avgLinksCount = 0.0
    print("average number of links: %.2f" % avgLinksCount)

    # now dump them into a database
    print("started inserting data into reverse_links table")
    dbName = getDbNameFromFileName(fileName)
    cur = getNamedCursor(getIpediaConnection(dbName), "rev_links_write_cur")
    for rLinks in reverseLinks.values():
        title = rLinks[0]
        links = rLinks[1:]
        # every entry is created as [link, title], so at least one title exists
        assert len(links) > 0
        # Titles containing LINK_SEPARATOR were rejected above, so the joined
        # string can be split on the separator without any escaping.
        body = LINK_SEPARATOR.join(links)
        try:
            sql = "INSERT INTO reverse_links (title,links_to_it) VALUES ('%s', '%s');" % (dbEscape(title), dbEscape(body))
            cur.execute(sql)
        except Exception:
            # Assume the INSERT failed because a row with this title already
            # exists (two different titles may collapse to the same one after
            # lower-casing), so overwrite that row instead. Catching
            # Exception rather than everything lets Ctrl-C still interrupt.
            try:
                sql = "UPDATE reverse_links SET links_to_it='%s' WHERE title='%s';" % (dbEscape(body), dbEscape(title))
                cur.execute(sql)
            except Exception:
                # nothing we can do about it; report and carry on
                sys.stderr.write("Exception in UPDATE article '%s' with body of len %d\n" % (title, len(body)))
    print("finished inserting data into reverse_links table")
def findConvertedArticle(fileName,titleToFind):
    """Return the first converted article whose title matches titleToFind.

    titleToFind is lower-cased and its spaces are replaced with underscores
    before the comparison; each article title is only lower-cased. Articles
    come from wikipediasql.iterConvertedArticles(fileName). Returns None when
    nothing matches.
    """
    wanted = titleToFind.lower().replace(" ", "_")
    print("looking for converted article with title '%s'" % wanted)
    for candidate in wikipediasql.iterConvertedArticles(fileName):
        candidateTitle = candidate.getTitle().lower()
        if candidateTitle != wanted:
            continue
        print("found converted article with title '%s'" % candidateTitle)
        return candidate
    return None
def findConvertedArticle(fileName, titleToFind):
    """Look up a converted article by title, ignoring case.

    The requested title is normalized by lower-casing it and turning spaces
    into underscores; article titles read from
    wikipediasql.iterConvertedArticles(fileName) are lower-cased only.
    The first article whose title equals the normalized request is returned;
    None is returned if the iteration ends without a match.
    """
    normalizedTitle = titleToFind.lower()
    normalizedTitle = normalizedTitle.replace(" ", "_")
    print("looking for converted article with title '%s'" % normalizedTitle)
    found = None
    for article in wikipediasql.iterConvertedArticles(fileName):
        loweredTitle = article.getTitle().lower()
        if loweredTitle == normalizedTitle:
            print("found converted article with title '%s'" % loweredTitle)
            found = article
            break
    return found
def findConvertedArticlesUnderThreshold(fileName,thresholdSize):
    """Return the non-redirect converted articles shorter than thresholdSize.

    Iterates wikipediasql.iterConvertedArticles(fileName), skips redirects and
    collects every article whose text length is below thresholdSize. Progress
    is printed every 20000 non-redirect articles. At the end it prints how
    many of the collected articles contain no comma and their average size
    (0.00 when there are none).

    Returns the list of collected article objects.
    """
    print("looking for converted articles smaller than %d bytes" % thresholdSize)
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        body = article.getText()
        bodySize = len(body)
        if bodySize < thresholdSize:
            articles.append(article)
            # NOTE(review): the comma statistics are assumed to cover only the
            # small articles (nested under the size check) -- confirm against
            # the intended layout.
            if "," not in body:
                countNoComma += 1
                totalSizeNoComma += bodySize
        count += 1
        if count % 20000 == 0:
            print("processed %d articles, found %d small" % (count,len(articles)))
    print("Articles without comma in converted: %d" % countNoComma)
    # Guard the average: dividing by a zero count would raise
    # ZeroDivisionError and lose the list that was just collected.
    if countNoComma:
        avgSize = float(totalSizeNoComma)/float(countNoComma)
    else:
        avgSize = 0.0
    print("Average size: %.2f" % avgSize)
    return articles
def findConvertedArticlesUnderThreshold(fileName, thresholdSize):
    """Collect converted, non-redirect articles whose text is under thresholdSize.

    Articles are read from wikipediasql.iterConvertedArticles(fileName).
    Redirects are ignored. A progress line is printed for every 20000
    non-redirect articles seen. When the scan ends, the number of collected
    articles without a comma and their average text length are printed; the
    average is reported as 0.00 when no such article was found.

    Returns the list of collected article objects.
    """
    print("looking for converted articles smaller than %d bytes" % thresholdSize)
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        body = article.getText()
        if len(body) < thresholdSize:
            articles.append(article)
            # NOTE(review): assumes the comma check applies to small articles
            # only (nested under the size check) -- confirm.
            if body.find(",") == -1:
                countNoComma += 1
                totalSizeNoComma += len(body)
        count += 1
        if count % 20000 == 0:
            print("processed %d articles, found %d small" % (count, len(articles)))
    print("Articles without comma in converted: %d" % countNoComma)
    # Avoid ZeroDivisionError when no comma-less small article was found;
    # otherwise the function would crash instead of returning its result.
    avgSize = 0.0
    if countNoComma > 0:
        avgSize = float(totalSizeNoComma) / float(countNoComma)
    print("Average size: %.2f" % avgSize)
    return articles
def calcReverseLinks(fileName):
    """Compute which articles link to each link target and save the result.

    Phase 1 scans wikipediasql.iterConvertedArticles(fileName). Redirects are
    skipped, as are articles whose title contains LINK_SEPARATOR. Links are
    extracted from the article text with
    articleconvert.articleExtractLinksSimple and grouped by lower-cased link
    text. A group is a list whose first element is the link text as first
    encountered and whose remaining elements are the linking article titles;
    no title is appended once the list length reaches REVERSE_LINK_LIMIT.

    Phase 2 writes one row per group into the reverse_links table (database
    name obtained from getDbNameFromFileName), storing the titles joined with
    LINK_SEPARATOR. A failed INSERT is retried as an UPDATE; a failed UPDATE
    is reported on stderr and skipped.

    Returns None.
    """
    print("Calculating reverse links")
    count = 0
    reverseLinks = {}
    totalLinksCount = 0
    for article in wikipediasql.iterConvertedArticles(fileName):
        if article.fRedirect():
            continue
        count += 1
        title = article.getTitle()
        if title.find(LINK_SEPARATOR) != -1:
            print("rejected title '%s', has link separator (%d)" % (
                title, ord(LINK_SEPARATOR)))
            continue
        body = article.getText()
        links = articleconvert.articleExtractLinksSimple(body)
        totalLinksCount += len(links)
        for link in links:
            linkLower = link.lower()
            if linkLower not in reverseLinks:
                # head element keeps the link's original spelling
                reverseLinks[linkLower] = [link, title]
                continue
            currentLinks = reverseLinks[linkLower]
            if len(currentLinks) >= REVERSE_LINK_LIMIT:
                continue
            # TODO: only needed because of duplicates?
            if title not in currentLinks:
                currentLinks.append(title)
        if count % 20000 == 0:
            sys.stderr.write("processed %d articles\n" % count)

    linkTargetCount = len(reverseLinks)
    print("number of articles with reverse links: %d" % linkTargetCount)
    # An empty index would make the division raise ZeroDivisionError;
    # report an average of zero instead.
    avgLinksCount = 0.0
    if linkTargetCount > 0:
        avgLinksCount = float(totalLinksCount) / float(linkTargetCount)
    print("average number of links: %.2f" % avgLinksCount)

    # now dump them into a database
    print("started inserting data into reverse_links table")
    dbName = getDbNameFromFileName(fileName)
    cur = getNamedCursor(getIpediaConnection(dbName), "rev_links_write_cur")
    for rLinks in reverseLinks.values():
        title = rLinks[0]
        links = rLinks[1:]
        # groups are always created with one title, so this cannot be empty
        assert len(links) > 0
        # No escaping is needed for the glue character: titles containing
        # LINK_SEPARATOR never make it into the index (rejected above).
        body = LINK_SEPARATOR.join(links)
        try:
            sql = "INSERT INTO reverse_links (title,links_to_it) VALUES ('%s', '%s');" % (
                dbEscape(title), dbEscape(body))
            cur.execute(sql)
        except Exception:
            # Assume the exception happened because a row with a duplicate
            # title already exists (lower-case conversion might map two
            # different titles onto the same one) and overwrite it.
            # Exception, not a bare except, so Ctrl-C is not swallowed.
            try:
                sql = "UPDATE reverse_links SET links_to_it='%s' WHERE title='%s';" % (
                    dbEscape(body), dbEscape(title))
                cur.execute(sql)
            except Exception:
                # nothing we can do about it
                sys.stderr.write(
                    "Exception in UPDATE article '%s' with body of len %d\n" %
                    (title, len(body)))
    print("finished inserting data into reverse_links table")