def findOrigArticlesUnderThreshold(fileName, thresholdSize):
    print "looking for original articles smaller than %d bytes" % thresholdSize
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterWikipediaArticles(fileName, None, fUseCache=True, fRecreateCache=False):
        if article.fRedirect():
            continue
        body = article.getText()
        if len(body) < thresholdSize:
            #print "size: %d, title: '%s'" % (len(body),article.getTitle())
            articles.append(article)
            if -1 == body.find(","):
                countNoComma += 1
                totalSizeNoComma += len(body)
        count += 1
        if count % 20000 == 0:
            print "processed %d articles, found %d small" % (count, len(articles))
    print "Articles without comma in orig: %d" % countNoComma
    # guard against division by zero when no comma-less articles were found
    if countNoComma > 0:
        avgSize = float(totalSizeNoComma) / float(countNoComma)
        print "Average size: %.2f" % avgSize
    return articles
def findOrigArticleNoRedirect(fileName, titleToFind):
    titleToFind = titleToFind.lower()
    titleToFind = titleToFind.replace(" ", "_")
    print "looking for article with title %s" % titleToFind
    count = 0
    for article in wikipediasql.iterWikipediaArticles(fileName, None, fUseCache=True, fRecreateCache=False):
        title = article.getTitle().lower()
        if title == titleToFind:
            return article
        #if count % 50000 == 0:
        #    print "processed %d articles, last title %s" % (count,title)
        count += 1
    return None
def getRandomArticle(fileName):
    articleNum = 0
    randomArticle = None
    for article in wikipediasql.iterWikipediaArticles(fileName, None, fUseCache=True, fRecreateCache=False):
        if article.fRedirect():
            continue
        articleNum += 1
        # How likely is it that this is the last line of the file ?
        # kjk note: I don't get it
        if random.uniform(0, articleNum) < 1:
            randomArticle = article
        if articleNum % 30000 == 0:
            print "processed %d articles" % articleNum
    return randomArticle
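# The "random.uniform(0, articleNum) < 1" test in getRandomArticle() above is
# single-element reservoir sampling: the i-th non-redirect article replaces the
# current pick with probability 1/i, which leaves every article selected with
# equal probability 1/N even though N isn't known until the loop finishes.
# A minimal self-contained sketch of the same idea; the helper name and the use
# of a plain iterable are illustrative only, not part of this module.
def _reservoirPickOne(items):
    picked = None
    seen = 0
    for item in items:
        seen += 1
        # keep the new item with probability 1/seen
        if random.uniform(0, seen) < 1:
            picked = item
    return picked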
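# Illustrative usage sketch for the three helpers above; it is not called
# anywhere in this module. The dump file name is a placeholder, everything
# else uses only functions defined here.
def _exampleFinderUsage():
    dumpFileName = "enwiki-latest-pages-articles.sql"  # placeholder path
    smallArticles = findOrigArticlesUnderThreshold(dumpFileName, 128)
    print "articles under 128 bytes: %d" % len(smallArticles)
    article = findOrigArticleNoRedirect(dumpFileName, "Computer science")
    if article:
        print "found '%s' (%d bytes)" % (article.getTitle(), len(article.getText()))
    article = getRandomArticle(dumpFileName)
    if article:
        print "random pick: '%s'" % article.getTitle()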
def convertArticles(sqlDump, articleLimit):
    count = 0
    redirects = {}
    articleTitles = {}
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, fUseCache, fRecreateCache):
        # we only convert articles from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            sys.stderr.write("processed %d rows, last title=%s\n" % (count, title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (len(redirects), len(articleTitles) + len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title, redirect) in redirects.items():
        redirectResolved = resolveRedirect(title, redirect, redirects, articleTitles)
        if None == redirectResolved:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount
    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName), "ipedia_write_cur")
    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, True, False):
        title = article.getTitle()
        articleSize = 0  # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(converted, redirects, articleTitles)
            except:
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted
                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(), article.getTitle(), converted)
            articleSize = len(converted)
        if article.fRedirect():
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    ipedia_write_cur.execute("""INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(redirect)))
                except:
                    print "DUP REDIRECT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute("""INSERT INTO articles (title, body) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming that the exception happened because of trying to insert
                # an item with a duplicate title (lower-case conversion might turn
                # 2 different titles into the same, lower-cased title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute("""UPDATE articles SET body='%s' WHERE title='%s'""" % (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize] + 1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" % (count, article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size, count))
    statsFo.close()