def getNgrams(data):
    unigrams = tokenize(data, en_stem=False,
                        en_stopword_removal=True).split()[:25]
    bigrams = zip(unigrams, unigrams[1:])
    trigrams = zip(unigrams, unigrams[1:], unigrams[2:])
    grams = list(bigrams) + list(trigrams)
    return grams
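# getNgrams() keeps only the first 25 tokens (stopwords removed, no stemming) and
# zips each token with its neighbours, so a headline like "stock market rallies on
# tech earnings" would yield pairs such as ('stock', 'market'), ('market', 'rallies')
# plus triples such as ('stock', 'market', 'rallies'). (Illustrative values only;
# the exact tokens depend on the project's tokenize() helper.)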
Example #2
import sqlite3

from nltk.corpus import stopwords
from generals import tokenize

cats = ['sports', 'health', 'entertainment', 'tech', 'business']

connection = sqlite3.connect('data/news_data.db')
cursor = connection.cursor()

total = 1000
accuracy = []

work_data = cursor.execute("select * from News order by title ").fetchall()

stopw = set(stopwords.words('english'))
# print(stopw)
test_set = []

for row in work_data:
    content = row[3]
    print(row[2])
    if content.strip() != '':
        rdata = tokenize(content.strip(), en_stem=True)
        words = rdata.split()
        # Bag-of-words features: every stemmed token becomes a key.
        features = {item: 0 for item in words}
        test_set.append((features, row[0]))
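# Each entry is a (featureset, label) pair, the format nltk.NaiveBayesClassifier
# trains and classifies on; row[0] appears to hold the stored category label.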

while total < 9000:
    n = total // 5
    train_set = []
    for cls in cats:

        work_data = cursor.execute(
            "select * from News where category = ? limit ? ", (
                cls,
                n,  # assumed continuation; the original excerpt is cut off after `cls,`
            )).fetchall()
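        # A plausible rest of this loop (an assumption, the excerpt stops above):
        # extend train_set with (featureset, category) pairs built from these rows,
        # train nltk.NaiveBayesClassifier on train_set, record
        # nltk.classify.accuracy(classifier, test_set) in `accuracy`, and grow
        # `total` before the next pass.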
Example #3
def getNgrams(data):
    unigrams = tokenize(data, en_stem=False, en_stopword_removal=True).split()[:25]
    bigrams = zip(unigrams, unigrams[1:])
    trigrams = zip(unigrams, unigrams[1:], unigrams[2:])
    grams = list(bigrams)+list(trigrams)
    return grams
import re
import sqlite3

from nltk.corpus import stopwords
from generals import tokenize

# This fragment also relies on a stem() helper and an already-trained `classifier`,
# both defined elsewhere in the project and not shown in this excerpt.

stopw = set(stopwords.words('english'))

resSet = []
cats = ['sports', 'health', 'entertainment', 'tech', 'business']
root = './test_data/'
newsSet = []

connection = sqlite3.connect('data/news_data.db')
cursor = connection.cursor()


for cat in cats:
    db_out = cursor.execute(
        '''select * from News where category = ? order by title''',
        (cat,)).fetchall()
    for row in db_out:
        print("Testing : ", row[2], " --- ", row[0])
        data = tokenize(row[3], en_stem=True)
        words = re.findall("[a-zA-Z0-9$]+", data)
        # Drop stopwords and single-character tokens, then stem what is left.
        words = [stem(item) for item in words
                 if item.lower() not in stopw and len(item) >= 2]
        features = {item: 0 for item in words}
        newsSet.append((features, cat))
        # Check whether the known category matches the classifier's prediction.
        res = classifier.classify(features)
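        # The excerpt ends here; a plausible continuation (an assumption, not the
        # original code) would compare res against cat and tally the matches, for
        # example by appending (cat, res) to resSet.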
import random
import sqlite3

import nltk
from nltk.corpus import stopwords

from generals import tokenize

cats = ['sports', 'health', 'entertainment', 'tech', 'business']
connection = sqlite3.connect('data/news_data.db')
cursor = connection.cursor()

work_data = cursor.execute('''select * from News order by title''').fetchall()
newsSet = []
stopw = set(stopwords.words('english'))
# print(stopw)

for row in work_data:
    content = row[3]
    print(row[2])
    if content.strip() != '':
        rdata = tokenize(content.strip(), en_stem=True)
        words = rdata.split()
        # Bag-of-words features keyed by stemmed token; row[0] serves as the label.
        features = {item: 0 for item in words}
        newsSet.append((features, row[0]))


random.shuffle(newsSet)

train_set = newsSet[:len(newsSet) // 2]
test_set = newsSet[len(newsSet) // 2:]
cls = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(cls, test_set), len(train_set), len(test_set))
#cls = nltk.NaiveBayesClassifier.train(train_set)
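# Optional inspection step (not part of the original snippet): NLTK's
# NaiveBayesClassifier exposes show_most_informative_features() for a quick
# look at which tokens drive the predictions.
cls.show_most_informative_features(10)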
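# All of the examples above import tokenize() (and use stem()) from a project-local
# `generals` module that is not shown on this page. A minimal sketch of what such a
# helper might look like, built on NLTK; this is an assumption for illustration,
# not the project's actual implementation:
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def tokenize(text, en_stem=False, en_stopword_removal=False):
    # Keep alphanumeric tokens (plus '$'), mirroring the regex used in the examples above.
    words = re.findall(r"[a-zA-Z0-9$]+", text.lower())
    if en_stopword_removal:
        sw = set(stopwords.words('english'))
        words = [w for w in words if w not in sw]
    if en_stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    # Return a single space-joined string, since the callers do .split() on the result.
    return " ".join(words)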