def format_review(raw_data):
    """Convert raw reviews into nested word lists.

    Each review is sentence-tokenized, the sentence list is flattened,
    and every sentence is split into words, producing one
    list-of-sentences (each a list of words) per review.

    Args:
        raw_data: iterable of review strings (e.g. an open text file,
            one review per line).

    Returns:
        list where each element is a list of sentences, and each
        sentence is a list of word tokens.
    """
    formatted_data = []
    for review in raw_data:
        tokenized_review = list(flatten(sent_tokenize(review)))
        # Guard: an empty review would make the [0] access below raise
        # IndexError; emit an empty entry instead.
        if not tokenized_review:
            formatted_data.append([])
            continue
        # The first token of the first sentence is a leading label/word
        # that is stripped off; partition keeps everything after the
        # first space.  NOTE(review): assumes the label is
        # space-delimited — confirm against the input file format.
        tokenized_review[0] = tokenized_review[0].partition(" ")[2]
        formatted_data.append([sentence.split() for sentence in tokenized_review])
    # Removed a leftover debug() breakpoint that fired on every call.
    return formatted_data
def main():
    """Classify each review in an input file and print summary stats.

    Command-line arguments:
        sys.argv[1] -- path to the file of reviews to classify
        sys.argv[2] -- path to a pickled dict with "freq" and "prior" keys

    Prints the T/F counts, their ratios, the majority class, and the
    accuracy against a fixed gold labeling (first 20 reviews 'T',
    next 20 'F').
    """
    try:
        input_reviews = open(sys.argv[1])
    except (IndexError, OSError):
        # The original bare except printed and then fell through,
        # guaranteeing a NameError later; exit cleanly instead.
        print("Please supply an input file to be classified as the first argument and try again")
        sys.exit(1)
    try:
        # SECURITY NOTE: pickle.load on an untrusted file can execute
        # arbitrary code — only load model files you trust.
        with open(sys.argv[2], 'rb') as model_file:
            input_dictionary = pickle.load(model_file)
    except (IndexError, OSError, pickle.UnpicklingError):
        print("Please supply a pickled data set as the second argument and try again ")
        sys.exit(1)

    data_dictionary = input_dictionary["freq"]
    prior_dictionary = input_dictionary["prior"]

    TCount = 0
    FCount = 0
    correct = 0
    incorrect = 0
    # Gold labels for the 40-review evaluation set; an iterator avoids
    # the O(n) list.pop(0) per review of the original.
    expected_labels = iter(list(flatten((('T',) * 20, ('F',) * 20))))

    # Removed a leftover debug() breakpoint; close the input file when done.
    with input_reviews:
        for review in format_review(input_reviews):
            intermediate = classify(data_dictionary, prior_dictionary, review)
            # Pick the class whose score is highest.
            probable_class = max(intermediate, key=lambda k: intermediate[k])
            if probable_class == 'final_T':
                TCount += 1
            else:
                FCount += 1
            # Class keys look like 'final_T'/'final_F'; [6:] strips the
            # 'final_' prefix to compare against the gold label.
            if probable_class[6:] == next(expected_labels):
                correct += 1
            else:
                incorrect += 1

    total = TCount + FCount
    print(TCount)
    print(FCount)
    if total:  # guard against an empty input file (ZeroDivisionError)
        print(TCount / total)
        print(FCount / total)
    if TCount > FCount:
        print('T')
    else:
        print('F')
    if correct + incorrect:
        print(correct / (correct + incorrect))