def buildNBInput(documents, allTokens):
    """Build Naive Bayes instances from documents, then train and test on them.

    Each document becomes a dict keyed by integer position:
      * key 0     -- int(float(doc.readability) + 0.5)
      * key 1     -- doc.name
      * key i + 2 -- int(doc.tokens[word]) for the i-th word of allTokens,
                     present only when that word occurs in doc.tokens

    The instances are shuffled and split at random: an instance goes to the
    test set when random.random() >= 0.8, otherwise to the training set.
    A fresh NB classifier is trained on the training set and evaluated with
    testInBatch on the test set.

    Args:
        documents: iterable of objects exposing `readability`, `name` and
            `tokens` (a container supporting `in` and indexing by word).
        allTokens: ordered sequence of words defining the feature positions.

    Returns:
        None. Results are reported by the print below and by whatever
        NB.testInBatch does.
    """
    nbInput = []
    for doc in documents:
        # Named `instance` rather than `input` so the builtin is not shadowed.
        instance = {}
        instance[0] = int(float(doc.readability) + 0.5)
        instance[1] = doc.name
        # Feature slots start at 2 because 0 and 1 hold readability and name.
        for position, word in enumerate(allTokens, 2):
            if word in doc.tokens:
                instance[position] = int(doc.tokens[word])
        nbInput.append(instance)

    nbTest = []
    nbTraining = []
    random.shuffle(nbInput)
    for instance in nbInput:
        if random.random() >= 0.8:
            nbTest.append(instance)
        else:
            nbTraining.append(instance)

    # A single pre-formatted argument prints the same under Python 2 and 3.
    print("Using %d instances: %d for training and %d for test"
          % (len(nbInput), len(nbTraining), len(nbTest)))

    # nbInput is ready!
    nb = NB()
    nb.trainClassifier(nbTraining)
    nb.testInBatch(nbTest)
def main():
    """Download matching Simple/regular English Wikipedia pages and compare them.

    Steps, in order:
      1. Configure logging to 'getSimpleAndRegularEnglish.log' (DEBUG level).
      2. Create API clients for simple.wikipedia.org and en.wikipedia.org.
      3. Create or load the training ARFF file via generateTraining, read it
         back, and train an NB classifier on its feature vector.
      4. Walk the sub-categories of "Category:Medicine" on the Simple wiki
         (depth limit 15), passing the classifier and ARFF handler along.
      5. Collect the page titles of every visited category.
      6. For each title, fetch the page from both wikis and write the contents
         to data/<title>.simple and data/<title>.en (spaces in the title
         become underscores).
      7. Print how many page pairs had identical content.

    Both output files are created for every title, even when one of the
    fetches yields nothing; in that case they are left empty.
    """
    #production
    #logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.INFO, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')
    #testing
    logging.basicConfig(filename='getSimpleAndRegularEnglish.log',
                        filemode='w',
                        level=logging.DEBUG,
                        format='%(asctime)s-%(levelname)s: %(message)s',
                        datefmt='%d/%m/%Y-%H:%M:%S')

    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")

    maxDepth = 15
    dataDir = "data"

    #Create or load training set
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    # readArffFile returns [featureNames, featureVector]; only the vector is used.
    _, featureVector = arffh.readArffFile(arffFileCreated)

    nb = NB()
    nb.trainClassifier(featureVector)

    initialCategories = [["Category:Medicine"]]
    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)

    simpleCategories = simpleApi.getVisitedCategories()
    logging.info("Total number of simple categories used: %d", len(simpleCategories))

    simplePageTitles = []
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)

    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)
        pageFileName = page.replace(" ", "_")
        basePath = dataDir + "/" + pageFileName

        # `with` closes both handles even if a fetch or write raises; the
        # files are still opened before fetching, as before.
        with open(basePath + ".simple", "w") as simpleFile, \
                open(basePath + ".en", "w") as enFile:
            simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
            enContent = enApi.getPageContent(page, bagOfWords=False)

            if simpleContent and enContent:
                for w in enContent:
                    enFile.write("%s" % w)
                for w in simpleContent:
                    simpleFile.write("%s" % w)

                # NOTE(review): the comparison and counters are kept under
                # this guard so that two missing/empty pages are not counted
                # as "equal" -- confirm this matches the intended statistics.
                isEqual = simpleContent == enContent
                # Pre-formatted so the output is the same on Python 2 and 3.
                print("Equal? %s" % isEqual)
                if isEqual:
                    equals += 1
                total += 1

    print("Number of equals = %d . Total number = %d" % (equals, total))