    # iterate through all words in this document split by space and create the vector of words
    for word in documents[i].strip().split(' '):
        if len(word) > 0:
            # if this is a word (i.e. not a null character), we add it to the vector
            vector[dictionary[word]] = 1  # the word is in this document, so we set it to 1. For the
                                          # index, we pull it from the word dictionary we created above

    # update the current document we're processing with the new vector of words contained in that document
    documents[i] = vector

# Create Gaussian NB classifier and fit it to the data
gnb = GaussianNB()                   # create the classifier object
gnbfit = gnb.fit(documents, labels)  # fit (/train) the classifier to the data
pred = gnbfit.predict(documents)     # classify the documents again using the classifier to test
                                     # for accuracy

print len(dictionary), "words in the word dictionary"
print len(documents), "data points in the training set"
print "f1-score (weighted) of classifier on training set is:", f1_score(labels, pred, average='weighted')

# vectorise a new document from test.txt and classify it with the trained model
f = open('test.txt', 'r')
test = cleanBody.cleanBody(f.readline())
f.close()
vector = [0] * len(dictionary)
for word in test.strip().split(' '):
    if len(word) > 0 and word in dictionary:
        vector[dictionary[word]] = 1
prediction = gnbfit.predict([vector])[0]  # predict expects a list of vectors; take the single result
print "Predicted as category: %s (id:%d)" % (categories[prediction], prediction)
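
# The f1-score above is measured on the same documents the classifier was trained on,
# so it only sanity-checks the fit. A minimal sketch of scoring on a held-out split
# instead (the split is an assumption, not part of the original flow; on older
# scikit-learn releases train_test_split lives in sklearn.cross_validation):
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB

train_docs, test_docs, train_labels, test_labels = train_test_split(documents, labels, test_size=0.2)
heldout_pred = GaussianNB().fit(train_docs, train_labels).predict(test_docs)
print "f1-score (weighted) on the held-out split is:", f1_score(test_labels, heldout_pred, average='weighted')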
# basketball (3)
# nil (4)
max_per_category = 100
c = 0
print "Size of articles:", len(articles)

# for each article in articles
for arturl in articles:
    # if all categories have reached max_per_category then quit looping. nil category excluded
    print count[2:4]
    if min(count[2:4]) < max_per_category:
        c += 1
        print c, "..", arturl
        category, content = getPage.getPage(arturl)
        if category == False and content == False:
            # no content on this page? add it to the nil category
            count[4] += 1
        else:
            if count[category - 1] < max_per_category:
                count[category - 1] += 1
                data.append("%d %s" % (category, cleanBody.cleanBody(content)))
    else:
        break

print data
print count

file = open("data.txt", "w")
file.write("\n".join(data).encode('utf-8'))
file.close()
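
# How data.txt is read back in for the classifier above is not shown in this fragment.
# A minimal sketch of a loader, assuming each line has the form "<category id> <cleaned body>"
# exactly as written by the join above (the loader itself is an assumption; the variable
# names are chosen to match the `labels`/`documents` used by the classifier code):
labels = []
documents = []
for line in open("data.txt"):
    line = line.strip().decode('utf-8')  # the bytes were written utf-8 encoded above
    if not line:
        continue
    parts = line.split(' ', 1)           # first token is the numeric category id
    labels.append(int(parts[0]))
    documents.append(parts[1] if len(parts) > 1 else '')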