Example #1
import ngrams   # project-local tokenizer; train, classify, and goodness are defined alongside

def train_on_amazon(goodFilename, badFilename):
    # Articles in each corpus file are separated by a boundary marker.
    with open(goodFilename) as goodFile:
        goodArticles = goodFile.read().split("<!-- BOUNDARY -->")
    with open(badFilename) as badFile:
        badArticles = badFile.read().split("<!-- BOUNDARY -->")

    goodArticles.sort()
    #random.shuffle(goodArticles)
    # Hold out the last tenth of the good corpus for evaluation.
    good_index = int(len(goodArticles) * 9.0 / 10.0)
    goodWords = [tuple(ngrams.getWordsForAnalysis(inputText))
                 for inputText in goodArticles[:good_index]]

    badArticles.sort()
    #random.shuffle(badArticles)
    # Hold out the last tenth of the bad corpus for evaluation.
    bad_index = int(len(badArticles) * 9.0 / 10.0)
    badWords = [tuple(ngrams.getWordsForAnalysis(inputText))
                for inputText in badArticles[:bad_index]]

    # Train on the first 90% of each corpus.
    for document in goodWords:
        train(document, True)
    for document in badWords:
        train(document, False)

    # When run as a script, evaluate on the held-out 10%.
    if __name__ == '__main__':
        print('good results')
        results = [(classify(document), document) for document in goodArticles[good_index:]]
        # Good articles that scored negative were misclassified as bad.
        misclassified = ['%s\n%s' % (result, document) for result, document in results if result < 0]
        print('\n'.join(sorted(misclassified)))
        print(1.0 * sum(result for result, document in results) / len(results))
        print('percent:%s' % (1.0 * len(misclassified) / len(results)))

        print('bad results')
        results = [(classify(document), document) for document in badArticles[bad_index:]]
        # Bad articles that scored positive were misclassified as good.
        misclassified = ['%s\n%s' % (result, document) for result, document in results if result > 0]
        print('\n'.join(sorted(misclassified)))
        print(1.0 * sum(result for result, document in results) / len(results))
        print('percent:%s' % (1.0 * len(misclassified) / len(results)))
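The training files are plain text, one article after another, separated by the marker <!-- BOUNDARY -->. A minimal sketch of producing such a file (the file name and sample articles here are illustrative, not from the project):

# Hypothetical setup: write two articles into a training file in the
# boundary-delimited format that train_on_amazon() expects.
articles = [
    "Great product, arrived on time and works as described.",
    "Excellent build quality; would buy again.",
]
with open("good_reviews.txt", "w") as f:
    f.write("<!-- BOUNDARY -->".join(articles))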
Example #2
def classify(text, testing=False):
    # Tokenize the text and return its signed goodness score
    # (positive reads as good, negative as bad; see Example #1).
    words = tuple(ngrams.getWordsForAnalysis(text))
    return goodness(words, testing)
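The evaluation in Example #1 flags good articles that score below zero and bad articles that score above zero as misclassified, so classify evidently returns a signed score whose sign is the predicted label. A minimal usage sketch, assuming the functions above live in an importable module; the module name classifier and the file names are hypothetical:

# Hypothetical usage: classifier stands in for the module that defines
# train_on_amazon/classify; ngrams, train, and goodness are its internal
# helpers, not shown in these examples.
import classifier

classifier.train_on_amazon("good_reviews.txt", "bad_reviews.txt")

score = classifier.classify("Fast shipping, exactly as described.")
# Sign convention inferred from Example #1: positive => good, negative => bad.
print("good" if score > 0 else "bad", score)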