Esempio n. 1
0
def format_review(raw_data):
    """Turn raw review strings into nested lists of whitespace-split tokens.

    Each item of ``raw_data`` is passed through ``sent_tokenize`` and
    ``flatten``; the resulting sentences are then split on whitespace.

    Args:
        raw_data: Iterable of review strings.

    Returns:
        A list with one entry per review. Each entry is a list of
        sentences, and each sentence is the list of tokens produced by
        ``str.split()``.

    Raises:
        IndexError: If an item yields no sentences at all, because the
            first sentence is accessed unconditionally.
    """
    reviews = []
    for raw_review in raw_data:
        sentences = list(flatten(sent_tokenize(raw_review)))

        # Keep only what follows the first space of the opening sentence
        # (presumably dropping a leading label/ID token -- confirm against
        # the input format).  With no space present the sentence becomes "".
        opening = sentences[0]
        sentences[0] = opening.partition(" ")[2]

        reviews.append([sentence.split() for sentence in sentences])

    # NOTE(review): debug() is defined outside this block; its effect cannot
    # be determined from here.
    debug()
    return reviews
Esempio n. 2
0
def main():
    """Classify every review in an input file and print summary statistics.

    Command-line arguments:
        sys.argv[1]: Path of the input file to be classified.
        sys.argv[2]: Path of a pickled dictionary whose "freq" and "prior"
            entries are handed to ``classify``.

    Prints, in order: the number of reviews classified as 'final_T', the
    number classified as anything else, the two corresponding fractions,
    the majority label ('T' or 'F'; a tie prints 'F'), and the fraction of
    scored reviews whose predicted label matched the expected one.

    The expected labels are hard-coded as 20 'T' followed by 20 'F', so the
    accuracy figure is only meaningful for input laid out that way
    (presumably a fixed evaluation set -- confirm).  Reviews beyond the
    40th are still classified and counted, but are left out of the
    accuracy figure instead of crashing the run.

    Raises:
        SystemExit: With status 1, after printing a message, when either
            argument is missing or unreadable, or when the input contains
            no reviews.
    """
    try:
        input_reviews = open(sys.argv[1])
    except (IndexError, OSError):
        print("Please supply an input file to be classified as the first argument and try again")
        sys.exit(1)

    with input_reviews:
        try:
            # NOTE(review): pickle.load can execute arbitrary code while
            # unpickling; only point this at data sets you trust.
            with open(sys.argv[2], 'rb') as pickled_file:
                input_dictionary = pickle.load(pickled_file)
        except (IndexError, OSError, EOFError, pickle.UnpicklingError):
            print("Please supply a pickled data set as the second argument and try again ")
            sys.exit(1)

        data_dictionary = input_dictionary["freq"]
        prior_dictionary = input_dictionary["prior"]

        # NOTE(review): debug() is defined outside this block; its effect
        # cannot be determined from here.
        debug()
        # Read all reviews while the file is still open.
        reviews = format_review(input_reviews)

    # Ground truth for the accuracy figure: 20 'T' reviews, then 20 'F'.
    expected_labels = iter(['T'] * 20 + ['F'] * 20)

    t_count = 0
    f_count = 0
    correct = 0
    incorrect = 0

    for review in reviews:
        intermediate = classify(data_dictionary, prior_dictionary, review)
        probable_class = max(intermediate, key=lambda k: intermediate[k])
        if probable_class == 'final_T':
            t_count += 1
        else:
            f_count += 1

        expected = next(expected_labels, None)
        if expected is None:
            # More reviews than known labels: nothing to score against.
            continue

        # Drop the first six characters (the 'final_' part of 'final_T')
        # so the prediction can be compared with the bare label.
        if probable_class[6:] == expected:
            correct += 1
        else:
            incorrect += 1

    total = t_count + f_count
    if total == 0:
        # Avoid dividing by zero below when the input holds no reviews.
        print("No reviews found in the input file")
        sys.exit(1)

    print(t_count)
    print(f_count)
    print(t_count / total)
    print(f_count / total)

    if t_count > f_count:
        print('T')
    else:
        print('F')

    # total > 0 guarantees at least one review was scored, so this
    # denominator cannot be zero.
    print(correct / (correct + incorrect))