def buildNBInput(documents, allTokens, trainFraction=0.8):
    """Turn documents into sparse feature dicts, then train and test an NB model.

    Each document becomes a dict with integer keys:
      * key 0     -> doc.readability converted to float and rounded to an int
                     (add 0.5 then truncate, i.e. round-half-up for
                     non-negative scores),
      * key 1     -> doc.name,
      * key i + 2 -> int(doc.tokens[word]) for every word at position i of
                     allTokens that is present in doc.tokens. Words missing
                     from the document get no entry.

    The instances are shuffled and randomly split: each one goes to the test
    set when random.random() >= trainFraction, otherwise to the training set.
    An `NB` classifier is trained on the training part and `testInBatch` is
    called with the test part. Whatever `testInBatch` returns is discarded.

    Args:
        documents: iterable of objects exposing `readability`, `name` and
            `tokens` (indexable by word; presumably a word -> count mapping).
        allTokens: ordered collection of words; the position of a word fixes
            its feature key.
        trainFraction: expected share of instances used for training.
            Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        None.
    """
    nbInput = []
    for doc in documents:
        features = {
            0: int(float(doc.readability) + 0.5),
            1: doc.name,
        }
        # Keys 0 and 1 are reserved above, so token features start at 2.
        for index, word in enumerate(allTokens):
            if word in doc.tokens:
                features[index + 2] = int(doc.tokens[word])
        nbInput.append(features)

    random.shuffle(nbInput)

    # Per-instance random assignment: the split sizes are only approximately
    # trainFraction / (1 - trainFraction) and either side may end up empty.
    nbTraining = []
    nbTest = []
    for instance in nbInput:
        if random.random() >= trainFraction:
            nbTest.append(instance)
        else:
            nbTraining.append(instance)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function call.
    print("Using %d instances: %d for training and %d for test"
          % (len(nbInput), len(nbTraining), len(nbTest)))

    nb = NB()
    nb.trainClassifier(nbTraining)
    nb.testInBatch(nbTest)
def main():
    """Download Simple English and English Wikipedia pages for comparison.

    Steps, in order:
      1. Configure logging to 'getSimpleAndRegularEnglish.log' (DEBUG level).
      2. Build a training ARFF file through generateTraining(), read it back
         and train an `NB` classifier on the feature vectors.
      3. Call getAllSubCategories() on the Simple English API starting from
         "Category:Medicine", passing the depth limit, the classifier and the
         ARFF handler.
      4. Collect the page titles of every visited category.
      5. For each title, fetch the page from both wikis and write the contents
         to '<dataDir>/<title>.simple' and '<dataDir>/<title>.en' (spaces in
         the title replaced by underscores). Contents are written only when
         both fetches return something truthy.
      6. Print how many of the fully fetched pairs have identical content.

    Returns:
        None.
    """
    # Local import: only needed to make sure the output directory exists.
    import os

    # Testing configuration. For production use level=logging.INFO instead.
    logging.basicConfig(filename='getSimpleAndRegularEnglish.log', filemode='w', level=logging.DEBUG, format='%(asctime)s-%(levelname)s: %(message)s', datefmt='%d/%m/%Y-%H:%M:%S')

    simpleApi = myWikiApi("http://simple.wikipedia.org/w/api.php?")
    enApi = myWikiApi("http://en.wikipedia.org/w/api.php?")
    maxDepth = 15
    dataDir = "data"

    # Create or load the training set.
    arffh = ArffHandler()
    arffFileCreated = generateTraining(enApi, "training", arffh)
    # Only the feature vectors are used here; the names are ignored.
    _featureNames, featureVector = arffh.readArffFile(arffFileCreated)

    nb = NB()
    nb.trainClassifier(featureVector)

    # Each entry is itself a list of category names, passed as-is to the API.
    initialCategories = [["Category:Medicine"]]
    for category in initialCategories:
        simpleApi.getAllSubCategories(category, maxDepth, nb, arffh)

    simpleCategories = simpleApi.getVisitedCategories()
    logging.info("Total number of simple categories used: %d", len(simpleCategories))

    simplePageTitles = []
    for category in simpleCategories:
        simplePageTitles += simpleApi.getAllPagesBelongingToACategory(category)

    # Opening the output files fails if the directory is missing.
    if not os.path.isdir(dataDir):
        os.makedirs(dataDir)

    equals = 0
    total = 0
    for page in simplePageTitles:
        print(page)

        pageFileName = page.replace(" ", "_")
        # NOTE(review): a title containing "/" is interpreted as a sub-path
        # here - confirm whether such titles need escaping.
        simplePath = dataDir + "/" + pageFileName + ".simple"
        enPath = dataDir + "/" + pageFileName + ".en"

        # Both files are created (and truncated) for every page, even when a
        # fetch comes back empty. The `with` block guarantees they are
        # flushed and closed instead of leaking one pair of handles per page.
        with open(simplePath, "w") as simpleFile, \
                open(enPath, "w") as enFile:
            simpleContent = simpleApi.getPageContent(page, bagOfWords=False)
            enContent = enApi.getPageContent(page, bagOfWords=False)

            if not (simpleContent and enContent):
                continue

            for w in enContent:
                enFile.write("%s" % w)
            for w in simpleContent:
                simpleFile.write("%s" % w)

            isEqual = simpleContent == enContent
            print("Equal? %s" % (isEqual,))
            if isEqual:
                equals += 1
            total += 1

    print("Number of equals = %d . Total number = %d" % (equals, total))