def test(test_file, classifier, word_frequency_in_class, total_words_in_category,
         categories, filter_):
    """Evaluate a Naive Bayes classifier against a labelled test file.

    Loads the test data, tokenizes each row's tweet text (column 5) with
    *filter_*, classifies the tokens with *classifier*, and compares the
    predicted category against the true category stored in column 0.
    Prints the total number of tweets, the number of correct predictions,
    and the accuracy.

    Parameters
    ----------
    test_file : str
        Path to the CSV test file, loaded via ``Read.load_data``.
    classifier : callable
        ``classifier(categories, tokens, word_frequency_in_class,
        total_words_in_category)`` -> predicted category label.
    word_frequency_in_class, total_words_in_category :
        Model statistics forwarded unchanged to *classifier*.
    categories : iterable
        Candidate category labels, forwarded to *classifier*.
    filter_ : callable
        Tokenizer applied to the raw tweet text.

    Returns
    -------
    defaultdict(Counter)
        Confusion counts: ``conf[true_label][predicted_label] -> count``.
    """
    total_test_tweets = 0
    correct_tweets = 0
    # conf[true_label][predicted_label] accumulates the confusion matrix.
    conf = defaultdict(Counter)
    test_data = Read.load_data(test_file)
    for line in test_data:
        total_test_tweets += 1
        sent = classifier(
            categories,
            filter_(line[5]),
            word_frequency_in_class,
            total_words_in_category,
        )
        if sent == line[0]:
            correct_tweets += 1
        conf[line[0]][sent] += 1
    # Fixed: original message ended with a stray '"' character.
    print('********* Ukupan broj tweetova u testu je {0}, broj tacnih odgovora je {1}'.format(total_test_tweets, correct_tweets))
    # Guard against an empty test file (would raise ZeroDivisionError).
    if total_test_tweets:
        print('********* Procenat tacnosti je {0} '.format(correct_tweets / float(total_test_tweets)))
    return conf
items = db.my_collection

# Prefer the C-accelerated cPickle on Python 2; fall back to the
# built-in pickle module on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# GLOBAL VARIABLES
# --------------------------------------------------------------------
# Directory containing this script; every data path is resolved
# relative to it so the script works regardless of the caller's CWD.
CWD = os.path.dirname(os.path.realpath(__file__))

test_data = CWD + "/trainingandtestdata/testdata.csv"
tr_data = Read.load_data(CWD + "/trainingandtestdata/training.1600000_posneg.csv")

# English stop-word list: one UTF-8 word per line, trailing whitespace
# stripped, collected into a set for O(1) membership tests.
with open(CWD + '/english.stop', 'rb') as stop_w:
    stop_words = {raw.decode('utf-8').rstrip() for raw in stop_w}

# Tokenizer/normalizer applied to every tweet before classification.
tweet_filter = Filter(
    ngram_combo=[1, 2, 3],
    stop_words=stop_words,
    patterns=REGEX_PATTERNS,
    func=stemmatize,
)

START_TIME = time.time()