# Experiment driver: repeated train/test runs of the simple word-list
# classifier. Progress markers are echoed both to stdout and to a results file.
# NOTE: this is Python 2 code (xrange, str written to a file opened in "wb").

# Ensure Amazon readers are defined
list_of_readers = nle_utils.set_up_readers()
number_of_tests = 30
feature_extractor = fe.simple_feature_extractor_stopwords

# File to be written to
# NOTE(review): no matching close() is visible in this part of the script --
# confirm the handle is closed once all results have been written.
fo = open("N:\\Downloads\\NLE\\results_simple_classifier_test.txt", "wb")

for x in xrange(number_of_tests):

    # Build the marker once so stdout and the results file cannot drift apart.
    test_marker = "TEST_NUMBER:" + str(x) + ":"
    sys.stdout.write(test_marker)
    fo.write(test_marker)

    # Split data into ((+ve training),(-ve training),(+ve testing, -ve testing))
    split_data = nle_utils.split_by_classification(list_of_readers, 0.8)

    # List - tuple - tuple - document
    # print type(split_data[0][0][0][0])
    # enumerate() keeps the category index in step with the domain being
    # processed; the previous hand-maintained counter was reset to 0 for each
    # test and never advanced here, so every domain got the first category name.
    for i, domain_split in enumerate(split_data):

        # print "\n for a domain " + nle_utils.list_of_amazon_categories[i] + ":"
        domain_marker = "DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":"
        sys.stdout.write(domain_marker)
        fo.write(domain_marker)

        # Determine frequency distribution of all words in training data positive and negative.
        fdists = nle_utils.calculate_training_freq_dists(domain_split, feature_extractor)

        ### DECISION: top x OR more than x
        # wordlist_tuple = nle_utils.pos_neg_wordlist(fdists,nle_utils.words_as_frequent_as_x,200) # todo 200, 100
        wordlist_tuple = nle_utils.pos_neg_wordlist(fdists, nle_utils.top_x_most_frequent, 100)
# Example no. 2
# 0
# Experiment driver: repeated train/test runs of a naive Bayes classifier.
# Progress markers are echoed both to stdout and to a results file.
# NOTE: this is Python 2 code (xrange, str written to a file opened in "wb").

# The function of feature extraction to be applied.
# NOTE(review): this is still None when it is passed to
# format_for_naive_bayes() below -- confirm it is assigned elsewhere.
feature_extraction = None
# Speeding up the cross domain testing by not extracting it each time.
cross_domain_testing_data = None

# open file to write results out to
# NOTE(review): no matching close() is visible in this part of the script --
# confirm the handle is closed once all results have been written.
fo = open("N:\\Downloads\\NLE\\results_naive_bayes_test.txt", "wb")

# `== False` is kept on purpose: `not k_fold` would also be true for None.
if k_fold == False:  # preserving functionality for when k folding isn't used
    # Main for loop of the experiment
    for x in xrange(number_of_tests):

        # Build the marker once so stdout and the results file cannot drift apart.
        test_marker = 'TEST_NUMBER:' + str(x) + ':'
        sys.stdout.write(test_marker)
        fo.write(test_marker)

        # Split data into ((+ve training),(-ve training),(+ve testing, -ve testing))
        split_data = nle_utils.split_by_classification(list_of_readers, sample_ratio)

        # List - tuple - tuple - document #print type(split_data[0][0][0][0])
        # enumerate() keeps the category index in step with the domain being
        # processed; the previous hand-maintained counter was reset to 0 for
        # each test and never advanced here.
        for i, domain_split in enumerate(split_data):

            # if not cross domain
            if cross_domain == "":
                domain_marker = "DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":"
                sys.stdout.write(domain_marker)
                fo.write(domain_marker)
                train_nb_data, test_nb_data = nle_utils.format_for_naive_bayes(domain_split, feature_extraction)
                # train_nb_data = 1600 reviews when train ratio is 80%
                # to vary training data sizes, reduce train_nb_data here before training
                nb_classifier = NaiveBayesClassifier.train(train_nb_data)
                # NOTE(review): the accuracy is only echoed to stdout here, not
                # to the results file -- confirm it is written to fo further on.
                sys.stdout.write("ACCURACY:" + str(accuracy(nb_classifier, test_nb_data)) + '\n')