Example #1
def findOrigArticlesUnderThreshold(fileName, thresholdSize):
    print "looking for original articles smaller than %d bytes" % thresholdSize
    count = 0
    countNoComma = 0
    totalSizeNoComma = 0
    articles = []
    for article in wikipediasql.iterWikipediaArticles(fileName,
                                                      None,
                                                      fUseCache=True,
                                                      fRecreateCache=False):
        if article.fRedirect():
            continue
        body = article.getText()
        if len(body) < thresholdSize:
            #print "size: %d, title: '%s'" % (len(body),article.getTitle())
            articles.append(article)
        if -1 == body.find(","):
            countNoComma += 1
            totalSizeNoComma += len(body)
        count += 1
        if count % 20000 == 0:
            print "processed %d articles, found %d small" % (count,
                                                             len(articles))
    print "Articles without comma in orig: %d" % countNoComma
    if countNoComma > 0:  # guard against division by zero
        avgSize = float(totalSizeNoComma) / float(countNoComma)
        print "Average size: %.2f" % avgSize
    return articles
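
A hedged usage sketch (the dump filename and the 500-byte threshold below are hypothetical):

# Collect non-redirect articles smaller than 500 bytes from a local dump.
small = findOrigArticlesUnderThreshold("enwiki-latest.sql", 500)
print "found %d articles under the threshold" % len(small)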
Example #2
def findOrigArticleNoRedirect(fileName, titleToFind):
    titleToFind = titleToFind.lower()
    titleToFind = titleToFind.replace(" ", "_")
    print "looking for article with title %s" % titleToFind
    count = 0
    for article in wikipediasql.iterWikipediaArticles(fileName,
                                                      None,
                                                      fUseCache=True,
                                                      fRecreateCache=False):
        title = article.getTitle().lower()
        if title == titleToFind:
            return article
        #if count % 50000 == 0:
        #    print "processed %d articles, last title %s" % (count,title)
        count += 1
    return None
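
A hedged usage sketch (the dump filename is hypothetical): the function lower-cases the title and replaces spaces with underscores, so "Albert Einstein" and "albert_einstein" match the same article.

# Hypothetical lookup against a local dump file.
article = findOrigArticleNoRedirect("enwiki-latest.sql", "Albert Einstein")
if article is not None:
    print article.getTitle()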
Example #3
def getRandomArticle(fileName):
    articleNum = 0
    randomArticle = None
    for article in wikipediasql.iterWikipediaArticles(fileName,
                                                      None,
                                                      fUseCache=True,
                                                      fRecreateCache=False):
        if article.fRedirect():
            continue
        articleNum += 1
        # Reservoir sampling with a reservoir of size one: the current
        # article replaces the pick with probability 1/articleNum, so every
        # non-redirect article ends up selected with equal probability.
        if random.uniform(0, articleNum) < 1:
            randomArticle = article
        if articleNum % 30000 == 0:
            print "processed %d articles" % articleNum
    return randomArticle
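
The selection above is reservoir sampling with a reservoir of size one: the i-th non-redirect article replaces the current pick with probability 1/i, which leaves every article chosen with probability 1/N after a single pass, without knowing N up front. A minimal self-contained sketch of the same idea (reservoir_pick and the trial counts are illustrative, not part of the original code):

import random

def reservoir_pick(stream):
    # The i-th element (1-based) replaces the current pick with
    # probability 1/i, so each of N elements is selected with 1/N.
    pick = None
    for i, item in enumerate(stream, 1):
        if random.uniform(0, i) < 1:
            pick = item
    return pick

# Quick empirical check: the counts come out roughly uniform.
counts = {}
for _ in range(10000):
    chosen = reservoir_pick(range(5))
    counts[chosen] = counts.get(chosen, 0) + 1
print counts  # each of 0..4 should appear about 2000 times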
Example #4
def convertArticles(sqlDump, articleLimit):
    count = 0
    redirects = {}
    articleTitles = {}
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False

    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit,
                                                      fUseCache,
                                                      fRecreateCache):
        # we only convert articles from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            sys.stderr.write("processed %d rows, last title=%s\n" %
                             (count, title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (
        len(redirects), len(articleTitles) + len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title, redirect) in redirects.items():
        redirectResolved = resolveRedirect(title, redirect, redirects,
                                           articleTitles)
        if None == redirectResolved:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount

    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName),
                                      "ipedia_write_cur")

    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit,
                                                      True, False):
        title = article.getTitle()
        articleSize = 0  # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(
                article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(
                    converted, redirects, articleTitles)
            except:
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted

                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(),
                                                article.getTitle(), converted)
            articleSize = len(converted)

        if article.fRedirect():
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    ipedia_write_cur.execute(
                        """INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')"""
                        % (dbEscape(title), dbEscape(redirect)))
                except:
                    print "DUP REDERICT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute(
                    """INSERT INTO articles (title, body) VALUES ('%s', '%s')"""
                    % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming the exception happened because we tried to insert
                # an item with a duplicate title (lower-casing can map two
                # different titles to the same lower-cased title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute(
                    """UPDATE articles SET body='%s' WHERE title='%s'""" %
                    (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize] + 1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" %
                             (count, article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size, count))
    statsFo.close()
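
The INSERT-then-UPDATE-on-exception pattern above handles titles that collide after lower-casing. A minimal alternative sketch, assuming a MySQLdb-style cursor and a unique index on articles.title (upsertArticle is a hypothetical helper, not part of the original code); parameter binding replaces the manual dbEscape calls:

def upsertArticle(cur, title, body):
    # The driver quotes the bound parameters, so no manual escaping is
    # needed; ON DUPLICATE KEY UPDATE collapses the insert/update pair.
    cur.execute(
        "INSERT INTO articles (title, body) VALUES (%s, %s) "
        "ON DUPLICATE KEY UPDATE body = VALUES(body)",
        (title.replace("_", " "), body))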