Example #1
0
def runTests(fileName):
    # failedList, dumpFailed() and diffFirstFailed() are assumed to be
    # defined at module level; they are not part of this snippet
    redirects = {}
    articleTitles = {}
    testCount = 0
    failedCount = 0
    for test in iterTests(fileName):
        orig = test.orig
        expected = test.expected
        converted = articleconvert.convertArticle(test.name,orig)
        expected  = arsutils.normalizeNewlines(expected)
        converted = arsutils.normalizeNewlines(converted)
        if converted != expected:
            failedCount += 1
            test.setConverted(converted)
            failedList.append(test)
            sys.stdout.write("-")
        else:
            sys.stdout.write(".")
        # the return value is unused; this only exercises removeInvalidLinks
        noLinks = articleconvert.removeInvalidLinks(converted,redirects,articleTitles)
        testCount += 1
    print
    print "Total  tests: %d" % testCount
    print "Failed tests: %d" % failedCount
    dumpFailed()
    diffFirstFailed()
Example #2
0
def runTests(fileName):
    # failedList, dumpFailed() and diffFirstFailed() are assumed to be
    # defined at module level; they are not part of this snippet
    redirects = {}
    articleTitles = {}
    testCount = 0
    failedCount = 0
    for test in iterTests(fileName):
        orig = test.orig
        expected = test.expected
        converted = articleconvert.convertArticle(test.name, orig)
        expected = arsutils.normalizeNewlines(expected)
        converted = arsutils.normalizeNewlines(converted)
        if converted != expected:
            failedCount += 1
            test.setConverted(converted)
            failedList.append(test)
            sys.stdout.write("-")
        else:
            sys.stdout.write(".")
        # the return value is unused; this only exercises removeInvalidLinks
        noLinks = articleconvert.removeInvalidLinks(converted, redirects,
                                                    articleTitles)
        testCount += 1
    print
    print "Total  tests: %d" % testCount
    print "Failed tests: %d" % failedCount
    dumpFailed()
    diffFirstFailed()
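For context, a minimal command-line driver for runTests might look like the sketch below; the argument handling and script layout are assumptions, not part of the original examples.

import sys

if __name__ == "__main__":
    # hypothetical driver: expects the tests file name as the only argument
    if len(sys.argv) != 2:
        print "usage: %s <testsFileName>" % sys.argv[0]
        sys.exit(1)
    runTests(sys.argv[1])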
Example #3
0
def dumpArticle(fileName,title):
    for article in iterArticlesExactTitle(fileName,title):
        title = article.getTitle().strip() + "\n"
        txt = article.getTxt()
        converted = articleconvert.convertArticle(title,txt)
        print "TITLE: %s" % title
        print "ORIGINAL: %s" % txt
        print "CONVERTED: %s" % converted
        return
    # reached only when the iterator yielded no article; a falsy check
    # inside the loop body can never detect an empty result
    print "couldn't find the body of article %s" % title
Example #4
0
def dumpArticle(fileName, title):
    for article in iterArticlesExactTitle(fileName, title):
        title = article.getTitle().strip() + "\n"
        txt = article.getTxt()
        converted = articleconvert.convertArticle(title, txt)
        print "TITLE: %s" % title
        print "ORIGINAL: %s" % txt
        print "CONVERTED: %s" % converted
        return
    # reached only when the iterator yielded no article; a falsy check
    # inside the loop body can never detect an empty result
    print "couldn't find the body of article %s" % title
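A note on the "not found" fallthrough in dumpArticle: it works because the loop body always returns. When a loop cannot simply return, Python's for/else idiom expresses the same pattern; a self-contained illustration (demoForElse is made up for this sketch):

def demoForElse(values, target):
    # the else clause of a for loop runs only when the loop
    # finished without hitting a break
    for v in values:
        if v == target:
            print "found %s" % v
            break
    else:
        print "%s not found" % target

demoForElse([1, 2, 3], 2)  # prints: found 2
demoForElse([1, 2, 3], 9)  # prints: 9 not found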
Example #5
0
def showDiffTitle(fileName, title, fSave=False, fForceConvert=False):
    article = findOrigArticle(fileName, title)
    if not article:
        print "couldn't find article with the title %s" % title
        return
    origTxt = article.getText()
    origTxt = arsutils.normalizeNewlines(origTxt)

    if fForceConvert:
        convertedTxt = articleconvert.convertArticle(article.getTitle(),
                                                     article.getText())
    else:
        title = article.getTitle()  # re-get the title in case this was a redirect
        convertedArticle = None
        if wikipediasql.fConvertedCacheExists(fileName):
            convertedArticle = findConvertedArticle(fileName, title)
        else:
            print "Converted cache for '%s' doesn't exist" % fileName
            sys.exit(0)

        if convertedArticle is None:
            print "didn't find converted article, generating it myself"
            convertedTxt = articleconvert.convertArticle(
                article.getTitle(), article.getText())
        else:
            convertedTxt = convertedArticle.getText()

    convertedTxt = arsutils.normalizeNewlines(convertedTxt)
    if fSave:
        title = article.getTitle()
        title = title.replace(" ", "_")
        fo = open("%s_orig.txt" % title, "wb")
        fo.write(origTxt)
        fo.close()
        fo = open("%s_conv.txt" % title, "wb")
        fo.write(convertedTxt)
        fo.close()
    arsutils.showTxtDiff(origTxt, convertedTxt)
Example #6
0
def showDiffTitle(fileName,title,fSave=False,fForceConvert=False):
    article = findOrigArticle(fileName,title)
    if not article:
        print "couldn't find article with the title %s" % title
        return
    origTxt = article.getText()
    origTxt = arsutils.normalizeNewlines(origTxt)

    if fForceConvert:
        convertedTxt = articleconvert.convertArticle(article.getTitle(), article.getText())
    else:
        title = article.getTitle() # re-get the title in case this was a redirect
        convertedArticle = None
        if wikipediasql.fConvertedCacheExists(fileName):
            convertedArticle = findConvertedArticle(fileName,title)
        else:
            print "Converted cache for '%s' doesn't exist" % fileName
            sys.exit(0)

        if convertedArticle is None:
            print "didn't find converted article, generating it myself"
            convertedTxt = articleconvert.convertArticle(article.getTitle(), article.getText())
        else:
            convertedTxt = convertedArticle.getText()

    convertedTxt = arsutils.normalizeNewlines(convertedTxt)
    if fSave:
        title = article.getTitle()
        title = title.replace(" ", "_")
        fo = open("%s_orig.txt" % title, "wb")
        fo.write(origTxt)
        fo.close()
        fo = open("%s_conv.txt" % title, "wb")
        fo.write(convertedTxt)
        fo.close()
    arsutils.showTxtDiff(origTxt, convertedTxt)
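arsutils.showTxtDiff is not shown in these examples. A minimal stand-in built on the standard difflib module could behave like the sketch below; the exact output format of the real helper is unknown, so this is an assumption.

import difflib
import sys

def showTxtDiff(origTxt, convertedTxt):
    # hypothetical replacement for arsutils.showTxtDiff: prints a
    # unified diff of the two texts to stdout
    diff = difflib.unified_diff(origTxt.splitlines(True),
                                convertedTxt.splitlines(True),
                                fromfile="orig", tofile="converted")
    for line in diff:
        sys.stdout.write(line)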
Example #7
0
def showDiffRandom(fileName):
    # shows the diff for a single randomly picked article
    article = getRandomArticle(fileName)
    title = article.getTitle()
    convertedArticle = None
    if wikipediasql.fConvertedCacheExists(fileName):
        convertedArticle = findConvertedArticle(fileName,title)

    if not convertedArticle:
        print "didn't find article '%s' in the converted cache" % title
    origTxt = article.getText()
    origTxt = arsutils.normalizeNewlines(origTxt)
    if convertedArticle:
        converted = arsutils.normalizeNewlines(convertedArticle.getText())
        arsutils.showTxtDiff(origTxt, converted)
    else:
        converted = articleconvert.convertArticle(article.getTitle(), article.getText())
        converted = arsutils.normalizeNewlines(converted)
        arsutils.showTxtDiff(origTxt,converted)
Example #8
0
def showDiffRandom(fileName):
    # shows the diff for a single randomly picked article
    article = getRandomArticle(fileName)
    title = article.getTitle()
    convertedArticle = None
    if wikipediasql.fConvertedCacheExists(fileName):
        convertedArticle = findConvertedArticle(fileName, title)

    if not convertedArticle:
        print "didn't find article '%s' in the converted cache" % title
    origTxt = article.getText()
    origTxt = arsutils.normalizeNewlines(origTxt)
    if convertedArticle:
        converted = arsutils.normalizeNewlines(convertedArticle.getText())
        arsutils.showTxtDiff(origTxt, converted)
    else:
        converted = articleconvert.convertArticle(article.getTitle(),
                                                  article.getText())
        converted = arsutils.normalizeNewlines(converted)
        arsutils.showTxtDiff(origTxt, converted)
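getRandomArticle is likewise not shown. If articles are only reachable through an iterator, one hypothetical single-pass implementation is reservoir sampling; iterArticles below stands in for whatever iteration helper the module actually uses.

import random

def getRandomArticle(fileName):
    # reservoir sampling: after n items each one has been kept with
    # probability 1/n, so the survivor is uniformly random
    chosen = None
    count = 0
    for article in iterArticles(fileName):  # hypothetical iterator
        count += 1
        if random.randint(1, count) == 1:
            chosen = article
    return chosen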
Example #9
0
def convertArticles(sqlDump, articleLimit):
    count = 0
    redirects = {}
    articleTitles = {}
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False
        
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, fUseCache, fRecreateCache):
        # we only convert articles from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            sys.stderr.write("processed %d rows, last title=%s\n" % (count,title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (len(redirects), len(articleTitles)+len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title,redirect) in redirects.items():
        redirectResolved = resolveRedirect(title,redirect,redirects,articleTitles)
        if redirectResolved is None:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount

    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName), "ipedia_write_cur")
        
    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, True, False):
        title = article.getTitle()
        articleSize = 0 # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(converted,redirects,articleTitles)
            except:
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted

                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(), article.getTitle(), converted)
            articleSize = len(converted)

        if article.fRedirect():
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    ipedia_write_cur.execute("""INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(redirect)))
                except:
                    print "DUP REDERICT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute("""INSERT INTO articles (title, body) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming that the exception happened because we tried to
                # insert an item with a duplicate title (lower-case conversion
                # might turn two different titles into the same, lower-cased
                # title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute("""UPDATE articles SET body='%s' WHERE title='%s'""" % (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize]+1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" % (count,article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size,count))
    statsFo.close()
Example #10
0
def convertArticles(sqlDump, articleLimit):
    count = 0
    redirects = {}
    articleTitles = {}
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False

    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit,
                                                      fUseCache,
                                                      fRecreateCache):
        # we only convert articles from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            sys.stderr.write("processed %d rows, last title=%s\n" %
                             (count, title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (
        len(redirects), len(articleTitles) + len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title, redirect) in redirects.items():
        redirectResolved = resolveRedirect(title, redirect, redirects,
                                           articleTitles)
        if redirectResolved is None:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount

    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName),
                                      "ipedia_write_cur")

    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit,
                                                      True, False):
        title = article.getTitle()
        articleSize = 0  # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(
                article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(
                    converted, redirects, articleTitles)
            except:
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted

                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(),
                                                article.getTitle(), converted)
            articleSize = len(converted)

        if article.fRedirect():
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    ipedia_write_cur.execute(
                        """INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')"""
                        % (dbEscape(title), dbEscape(redirect)))
                except:
                    print "DUP REDERICT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute(
                    """INSERT INTO articles (title, body) VALUES ('%s', '%s')"""
                    % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming that the exception happened because we tried to
                # insert an item with a duplicate title (lower-case conversion
                # might turn two different titles into the same, lower-cased
                # title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute(
                    """UPDATE articles SET body='%s' WHERE title='%s'""" %
                    (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize] + 1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" %
                             (count, article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size, count))
    statsFo.close()
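One design note on convertArticles: the INSERT and UPDATE statements build SQL by string interpolation through dbEscape. If ipedia_write_cur is a standard DB-API cursor (the MySQL-style escaping suggests MySQLdb, though that is an assumption), parameterized queries let the driver do the quoting instead; a sketch:

# sketch, assuming ipedia_write_cur is a DB-API cursor (e.g. MySQLdb);
# the driver fills the %s placeholders, so dbEscape becomes unnecessary
ipedia_write_cur.execute(
    "INSERT INTO articles (title, body) VALUES (%s, %s)",
    (title, converted))

ipedia_write_cur.execute(
    "UPDATE articles SET body=%s WHERE title=%s",
    (converted, title))

On MySQL the insert-then-update-on-duplicate dance can also collapse into a single INSERT ... ON DUPLICATE KEY UPDATE statement.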