def train_on_amazon(goodFilename, badFilename): goodFile = open(goodFilename) badFile = open(badFilename) goodArticles = [inputText for inputText in goodFile.read().split("<!-- BOUNDARY -->")] goodArticles.sort() #random.shuffle(goodArticles) good_index = int(len(goodArticles) * 9.0 / 10.0) goodWords = [tuple(ngrams.getWordsForAnalysis(inputText)) for inputText in goodArticles][:good_index] badArticles = [inputText for inputText in badFile.read().split("<!-- BOUNDARY -->")] badArticles.sort() #random.shuffle(badArticles) bad_index = int(len(badArticles) * 9.0 / 10.0) badWords = [tuple(ngrams.getWordsForAnalysis(inputText)) for inputText in badArticles][:bad_index] for document in goodWords: train(document, True) for document in badWords: train(document, False) if __name__ == '__main__': print 'good results' results = [(classify(document), document) for document in goodArticles[good_index:]] percent = ['%s\n%s' % (result, document) for result, document in results if result < 0] print '\n'.join(sorted(percent)) print 1.0 * sum([result for result, document in results]) / len(results) print 'percent:%s' % ( len(percent) *1.0 / len(results)) print 'bad results' results = [(classify(document), document) for document in badArticles[bad_index:]] percent = ['%s\n%s' % (result, document) for result, document in results if result > 0] print '\n'.join(sorted(percent)) print 1.0 * sum([result for result, document in results]) / len(results) print 'percent:%s' % (1.0 * len(percent) / len(results))
def classify(text, testing = False):
    """Score *text* with the trained model.

    Tokenizes the text via ngrams.getWordsForAnalysis and hands the
    resulting token tuple to goodness(), returning its score (used by
    train_on_amazon, where result < 0 is read as "bad" and result > 0
    as "good").
    """
    token_stream = ngrams.getWordsForAnalysis(text)
    return goodness(tuple(token_stream), testing)