def getNgrams(data):
    # Tokenize with stopword removal (no stemming) and keep only the first 25 tokens.
    unigrams = tokenize(data, en_stem=False, en_stopword_removal=True).split()[:25]
    # Build overlapping bigrams and trigrams from the unigram sequence.
    bigrams = zip(unigrams, unigrams[1:])
    trigrams = zip(unigrams, unigrams[1:], unigrams[2:])
    # Only the bigrams and trigrams are returned as n-gram features.
    grams = list(bigrams) + list(trigrams)
    return grams
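# Usage sketch (illustrative only; assumes tokenize returns a space-separated
# string of cleaned tokens, as the other snippets suggest):
sample = "Stocks rallied on Monday as tech shares led the market higher"
print(getNgrams(sample))
# Expected shape: 2-tuples of adjacent tokens followed by 3-tuples, e.g.
# [('stocks', 'rallied'), ..., ('stocks', 'rallied', 'monday'), ...]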
import sqlite3

import nltk
from nltk.corpus import stopwords

from generals import tokenize

# Defined in the companion snippets; repeated here so this listing runs on its own.
cats = ['sports', 'health', 'entertainment', 'tech', 'business']
connection = sqlite3.connect('data/news_data.db')

cursor = connection.cursor()
total = 1000
accuracy = []
work_data = cursor.execute("select * from News order by title ").fetchall()
stopw = set(stopwords.words('english'))
# print(stopw)

# Fixed test set: one bag-of-words featureset per article, paired with its category label (row[0]).
test_set = []
for row in work_data:
    content = row[3]
    print(row[2])
    if content.strip() != '':
        rdata = tokenize(content.strip(), en_stem=True)
        words = rdata.split()
        # print(words)
        string_words = " ".join(words)
        features = {item: 0 for item in words}
        # print(string_words)
        test_set.append((features, row[0]))

# Learning curve: grow the training set in steps and measure accuracy each time.
while total < 9000:
    n = total // 5  # per-category row limit
    train_set = []
    for cls in cats:
        # The original snippet breaks off inside this call; the second query
        # parameter is assumed to be n, the per-category limit.
        work_data = cursor.execute(
            "select * from News where category = ? limit ? ", (cls, n)).fetchall()
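        # --- Continuation sketch (not in the source): the rest of each iteration
        # is assumed to build featuresets for the training rows, retrain the
        # classifier, record accuracy on the fixed test set, and grow total by
        # 1000; the step size and bookkeeping are assumptions.
        for row in work_data:
            content = row[3]
            if content.strip() != '':
                words = tokenize(content.strip(), en_stem=True).split()
                train_set.append(({item: 0 for item in words}, row[0]))
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy.append((total, nltk.classify.accuracy(classifier, test_set)))
    total += 1000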
import re
import sqlite3

from nltk.corpus import stopwords

from generals import tokenize

# `classifier` is the model trained in the previous snippet; `stem` is the
# project's stemming helper, assumed to be available from earlier in the script.
resSet = []
cats = ['sports', 'health', 'entertainment', 'tech', 'business']
root = './test_data/'
newsSet = []
connection = sqlite3.connect('data/news_data.db')
cursor = connection.cursor()
stopw = set(stopwords.words('english'))

for cat in cats:
    features = {}
    db_out = cursor.execute(
        '''select * from News where category = ? order by title''', (cat,)).fetchall()
    for row in db_out:
        print("Testing : ", row[2], " --- ", row[0])
        data = tokenize(row[3], en_stem=True)
        words = re.findall("[a-zA-Z0-9$]+", data)
        # print(words)
        # Drop stopwords and single-character tokens, then stem what remains.
        words = [stem(item) for item in words if item.lower() not in stopw and len(item) >= 2]
        string_words = " ".join(words)
        features = {item: 0 for item in words}
        # print(string_words)
        newsSet.append((features, cat))
        # Test if the category equals the one predicted by the classifier.
        res = classifier.classify(features)
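        # --- Continuation sketch (not in the source): record whether the
        # predicted category matches the database label so the results can be
        # summarised after the loops; this bookkeeping is an assumption.
        resSet.append((res, cat))

# Assumed summary step: share of test articles whose predicted category matched
# their label.
correct = sum(1 for predicted, expected in resSet if predicted == expected)
print("Accuracy over", len(resSet), "articles:", correct / max(len(resSet), 1))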
import random
import sqlite3

import nltk
from nltk.corpus import stopwords

from generals import tokenize

cats = ['sports', 'health', 'entertainment', 'tech', 'business']
connection = sqlite3.connect('data/news_data.db')
cursor = connection.cursor()
work_data = cursor.execute('''select * from News order by title''').fetchall()
newsSet = []
stopw = set(stopwords.words('english'))
# print(stopw)

# Build one bag-of-words featureset per article, labelled with its category (row[0]).
for row in work_data:
    content = row[3]
    print(row[2])
    if content.strip() != '':
        rdata = tokenize(content.strip(), en_stem=True)
        words = rdata.split()
        # print(words)
        string_words = " ".join(words)
        features = {item: 0 for item in words}
        # print(string_words)
        newsSet.append((features, row[0]))

# Shuffle, split 50/50 into train and test, train a Naive Bayes classifier, and
# report its accuracy along with the split sizes.
random.shuffle(newsSet)
train_set = newsSet[:len(newsSet) // 2]
test_set = newsSet[len(newsSet) // 2:]
cls = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(cls, test_set), len(train_set), len(test_set))
# cls = nltk.NaiveBayesClassifier.train(train_set)
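# Optional follow-up (not in the source): NLTK's Naive Bayes classifier can list
# the word features that most strongly indicate each category, which is useful
# for sanity-checking the bag-of-words features built above.
cls.show_most_informative_features(20)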