def main():
    """Dump Simple-English and English Wikipedia page contents side by side.

    Steps, in order:

    1. Configure logging to ``getSimpleAndRegularEnglish.log`` (DEBUG level).
    2. Build or load the training set through ``generateTraining`` and
       ``ArffHandler.readArffFile``, then train an ``NB`` classifier on the
       returned feature vector.
    3. Walk the sub-categories of every entry in ``initialCategories`` on the
       Simple-English API (up to ``maxDepth``), passing the classifier and
       the ARFF handler along.
    4. Collect the page titles of every visited category.
    5. For each title, fetch the page from both APIs and, when both contents
       are non-empty, write them to ``data/<title>.en`` and
       ``data/<title>.simple`` (spaces in the title replaced by ``_``).
    6. Print how many page pairs had identical content and how many pairs
       were written in total.

    The ``data`` directory must already exist; ``open`` fails otherwise.
    """
    #production
    #logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.INFO, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    #testing
    logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.DEBUG, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')

    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    maxDepth = 15
    dataDir = "data"

    #Create or load training set
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    # Only the feature vector is needed here; the feature names are ignored.
    [_featureNames, featureVector] = arffh.readArffFile(arffFileCreated)

    nb = NB()
    nb.trainClassifier(featureVector)

    # Each entry is itself a list of category names, handed over as-is.
    initialCategories = [["Category:Medicine"]]

    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)

    simpleCategories = simpleApi.getVisitedCategories()
    simplePageTitles = []

    logging.info("Total number of simple categories used: %d", len(simpleCategories))
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)

    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)

        # NOTE(review): a title containing "/" would be treated as a
        # sub-path of dataDir -- confirm whether such titles can occur.
        pageFileName = page.replace(" ", "_")
        basePath = dataDir + "/" + pageFileName

        # Both files are created (truncated) before fetching, exactly as
        # before; the context manager guarantees they are flushed and closed
        # after every page instead of leaking one pair of handles per title.
        with open(basePath + ".simple", "w") as simpleFile, \
                open(basePath + ".en", "w") as enFile:
            simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
            enContent = enApi.getPageContent(page, bagOfWords=False)

            # Skip pages that are missing or empty on either wiki.
            if not (simpleContent and enContent):
                continue

            for w in enContent:
                enFile.write("%s" % w)
            for w in simpleContent:
                simpleFile.write("%s" % w)

            #print "simpleContent:", simpleContent
            #print "enContent:", enContent
            isEqual = simpleContent == enContent
            print("Equal? %s" % (isEqual,))
            if isEqual:
                equals += 1
            total += 1

    print ("Number of equals = %d . Total number = %d" % (equals, total))
Beispiel #2
0
    #print "original =", name
    de = de.decode("utf-8")
    #print "decoded utf-8 =", de
    de = de.replace("\u2013", "-")
    #print "replaced =", de
    de = de.encode("utf-8")
    #print "encoded utf-8 =", de
    de = de.decode('unicode-escape')
    #print "decoded unicode-escap =", de
    de = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
                lambda m: m.group(0).encode('latin1').decode('utf8'), de)
    return de.encode("utf-8")


if __name__ == "__main__":
    api = myWikiApi("http://" + wikibase + ".wikipedia.org/w/api.php?")

    #categories = ["Category:B-Class_medicine_articles"]
    categories = [
        "Category:FA-Class_medicine_articles",
        "Category:FL-Class_medicine_articles",
        "Category:FM-Class_medicine_articles",
        "Category:A-Class_medicine_articles",
        "Category:GA-Class_medicine_articles",
        "Category:B-Class_medicine_articles",
        "Category:C-Class_medicine_articles",
        "Category:Start-Class_medicine_articles",
        "Category:Book-Class_medicine_articles",
        "Category:Category-Class_medicine_articles",
        "Category:List-Class_medicine_articles",
        "Category:NA-Class_medicine_articles",
def main():
    """Log to ``generateBigGraph.log`` and print the big category graph.

    Configures DEBUG-level logging (the log file is overwritten on each run),
    creates an API wrapper for the English Wikipedia endpoint and calls its
    ``printBigGraph`` with the top-level category as the starting point.
    """
    logging.basicConfig(
        filename='generateBigGraph.log',
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s-%(levelname)s: %(message)s',
        datefmt='%d/%m/%Y-%H:%M:%S'
    )

    rootCategory = "Category:Main_topic_classifications"
    wikipedia = myWikiApi("http://en.wikipedia.org/w/api.php?")
    wikipedia.printBigGraph(rootCategory)