def main():
    """Download paired Simple/regular English Wikipedia pages into ``data/``.

    Steps, in order:

    1. Configure file logging (DEBUG level, log file overwritten each run).
    2. Build or load a training ARFF file via ``generateTraining`` and train
       an ``NB`` classifier on the feature vector read back from it.
    3. Call ``getAllSubCategories`` on the Simple English API for
       ``Category:Medicine`` with a maximum depth of 15, passing it the
       classifier and the ARFF handler.
    4. Collect the page titles of every visited category.
    5. For each title, fetch the page from both wikis and, when both have
       content, write them to ``data/<title>.simple`` and ``data/<title>.en``.
    6. Print how many of the written pairs were identical.

    The ``data`` directory must already exist; this function does not
    create it.
    """
    # production
    # logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.INFO, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    # testing
    logging.basicConfig(
        filename='getSimpleAndRegularEnglish.log',
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s-%(levelname)s: %(message)s',
        datefmt='%d/%m/%Y-%H:%M:%S',
    )

    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    maxDepth = 15
    dataDir = "data"

    # Create or load training set
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    # readArffFile returns a (featureNames, featureVector) pair; only the
    # vector is needed to train the classifier.
    _, featureVector = arffh.readArffFile(arffFileCreated)
    nb = NB()
    nb.trainClassifier(featureVector)

    # Each entry is itself a list, which is what getAllSubCategories is given.
    initialCategories = [["Category:Medicine"]]
    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)

    simpleCategories = simpleApi.getVisitedCategories()
    logging.info("Total number of simple categories used: %d", len(simpleCategories))

    simplePageTitles = []
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)

    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)
        pageFileName = re.sub(" ", "_", page)
        simplePath = dataDir + "/" + pageFileName + ".simple"
        enPath = dataDir + "/" + pageFileName + ".en"

        # Both files are opened (and therefore created/truncated) before the
        # content is fetched, exactly as before; the `with` block guarantees
        # they are flushed and closed even if a fetch or write raises.
        with open(simplePath, "w") as simpleFile, open(enPath, "w") as enFile:
            simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
            enContent = enApi.getPageContent(page, bagOfWords=False)

            if simpleContent and enContent:
                for w in enContent:
                    enFile.write("%s" % w)
                for w in simpleContent:
                    simpleFile.write("%s" % w)

                # NOTE(review): the source's indentation was lost; this
                # assumes the comparison and both counters belong inside the
                # "both pages have content" branch -- confirm.
                isEqual = simpleContent == enContent
                print("Equal? %s" % (isEqual,))
                if isEqual:
                    equals += 1
                total += 1

    print("Number of equals = %d . Total number = %d" % (equals, total))
#print "original =", name de = de.decode("utf-8") #print "decoded utf-8 =", de de = de.replace("\u2013", "-") #print "replaced =", de de = de.encode("utf-8") #print "encoded utf-8 =", de de = de.decode('unicode-escape') #print "decoded unicode-escap =", de de = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+', lambda m: m.group(0).encode('latin1').decode('utf8'), de) return de.encode("utf-8") if __name__ == "__main__": api = myWikiApi("http://" + wikibase + ".wikipedia.org/w/api.php?") #categories = ["Category:B-Class_medicine_articles"] categories = [ "Category:FA-Class_medicine_articles", "Category:FL-Class_medicine_articles", "Category:FM-Class_medicine_articles", "Category:A-Class_medicine_articles", "Category:GA-Class_medicine_articles", "Category:B-Class_medicine_articles", "Category:C-Class_medicine_articles", "Category:Start-Class_medicine_articles", "Category:Book-Class_medicine_articles", "Category:Category-Class_medicine_articles", "Category:List-Class_medicine_articles", "Category:NA-Class_medicine_articles",
def main():
    """Configure logging and run ``printBigGraph`` on English Wikipedia.

    Logging goes to ``generateBigGraph.log`` at DEBUG level, overwriting any
    previous log file. A ``myWikiApi`` client for the English Wikipedia API
    endpoint is then created and its ``printBigGraph`` method is invoked with
    ``Category:Main_topic_classifications`` as the argument.
    """
    logFormat = '%(asctime)s-%(levelname)s: %(message)s'
    timestampFormat = '%d/%m/%Y-%H:%M:%S'
    logging.basicConfig(
        filename='generateBigGraph.log',
        filemode='w',
        level=logging.DEBUG,
        format=logFormat,
        datefmt=timestampFormat,
    )

    rootCategory = "Category:Main_topic_classifications"
    wikiApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    wikiApi.printBigGraph(rootCategory)