Example #1
from nltk.tokenize import sent_tokenize
# flatten() is assumed here to be nltk.util.flatten; in the original it may be a project helper.
from nltk.util import flatten


def format_review(raw_data):
    """Turn each raw review into a list of sentences, each sentence a list of words."""
    formatted_data = []
    for review in raw_data:
        formatted_review = []
        tokenized_review = sent_tokenize(review)
        tokenized_review = list(flatten(tokenized_review))
        # Strip the leading review id from the first sentence.
        tokenized_review[0] = tokenized_review[0].partition(" ")[2]

        for sentence in tokenized_review:
            formatted_review.append(sentence.split())
        formatted_data.append(formatted_review)

    debug()  # project debugging hook, defined elsewhere
    return formatted_data
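
A hedged usage sketch, assuming each raw review is a single "<id> <text>" string (that leading id is what the partition(" ") call strips) and that debug() is a harmless project hook:

reviews = ["cv00001 Great movie. Would watch again."]
print(format_review(reviews))
# -> [[['Great', 'movie.'], ['Would', 'watch', 'again.']]]  (sentence splitting via NLTK punkt)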
Example #2
# format_review(), classify(), flatten() and debug() are defined elsewhere in the original script.
import pickle
import sys


def main():
    input_dictionary = {}
    try:
        input_reviews = open(sys.argv[1])
    except Exception:
        print("Please supply an input file to be classified as the first argument and try again")
        sys.exit(1)

    try:
        input_dictionary = pickle.load(open(sys.argv[2], 'rb'))
    except Exception:
        print("Please supply a pickled data set as the second argument and try again")
        sys.exit(1)

    data_dictionary = input_dictionary["freq"]
    prior_dictionary = input_dictionary["prior"]

    TCount = 0
    FCount = 0
    correct = 0
    incorrect = 0
    # Gold labels for the test file: the first 20 reviews are 'T', the last 20 are 'F'.
    test_list = list(flatten((('T',) * 20, ('F',) * 20)))

    debug()  # project debugging hook
    for review in format_review(input_reviews):
        intermediate = classify(data_dictionary, prior_dictionary, review)
        # classify() returns one score per class, keyed 'final_T' / 'final_F'; take the best.
        probable_class = max(intermediate, key=lambda k: intermediate[k])
        if probable_class == 'final_T':
            TCount += 1
        else:
            FCount += 1

        # Strip the 'final_' prefix and compare against the expected label.
        if probable_class[6:] == test_list.pop(0):
            correct += 1
        else:
            incorrect += 1

    print(TCount)
    print(FCount)
    print(TCount / (TCount + FCount))
    print(FCount / (TCount + FCount))

    if TCount > FCount:
        print('T')
    else:
        print('F')

    print(correct / (correct + incorrect))
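
classify() is not shown on this page. For orientation only, here is a minimal Naive Bayes scorer consistent with how its return value is used above (a dict of per-class scores keyed 'final_T' / 'final_F', highest score wins); the data layout and the add-one smoothing are assumptions, not the original implementation:

import math

def classify(freq, prior, review):
    # Assumed layout: freq maps word -> {'T': count, 'F': count},
    # prior maps 'T'/'F' -> number of training reviews in that class.
    class_word_totals = {
        label: sum(counts.get(label, 0) for counts in freq.values())
        for label in ("T", "F")
    }
    scores = {}
    for label in ("T", "F"):
        score = math.log(prior[label] / sum(prior.values()))
        for sentence in review:
            for word in sentence:
                word_count = freq.get(word, {}).get(label, 0)
                # Laplace (add-one) smoothing over the known vocabulary.
                score += math.log((word_count + 1) / (class_word_totals[label] + len(freq)))
        scores["final_" + label] = score
    return scores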
Example #3
# process_counts() and debug() are defined elsewhere; input_files and output_file
# are module-level globals in the original script.
import pickle
import sys


def main():
    global input_files
    global output_file

    processed_dictionary = {}
    output_dictionary = {}
    prior_counts = {}

    # Populate input_files: every argument except the last is a training corpus.
    for i in range(1, len(sys.argv) - 1):
        input_files.append(open(sys.argv[i]))

    # Populate output_file: the last argument is the pickle to write.
    output_file = open(sys.argv[len(sys.argv) - 1], 'wb')

    # Count word frequencies and prior counts for each class label ('T' and 'F').
    process_counts(input_files[0], 'T', processed_dictionary, prior_counts)
    debug()  # project debugging hook
    process_counts(input_files[1], 'F', processed_dictionary, prior_counts)
    debug()

    output_dictionary["freq"] = processed_dictionary
    output_dictionary["prior"] = prior_counts

    pickle.dump(output_dictionary, output_file)
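
For orientation, a minimal sketch of reading the model this script writes; "model.pickle" is a hypothetical name standing in for whatever path was passed as the last argument:

import pickle

with open("model.pickle", "rb") as f:   # hypothetical file name
    model = pickle.load(f)

print(sorted(model))    # ['freq', 'prior'] -- the two keys written above
# model["freq"] holds the per-word counts from process_counts();
# model["prior"] holds the per-class prior counts.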
Example #4

import sys

from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2

# stem_word(), vectorize_for_training(), vectorize_for_testing() and debug() are project
# helpers defined elsewhere; input_files and output_file are module-level globals.


def main():
    global input_files
    global output_file

    processed_dictionary = {}
    output_dictionary = {}
    prior_counts = {}

    input_corpi = []
    corpus = {}
    corpus["text"] = []
    corpus["id"] = []
    corpus["class"] = []

    # Populate input_files: every argument except the last two is a training corpus.
    for i in range(1, len(sys.argv) - 2):
        input_files.append(open(sys.argv[i]))
    debug()  # project debugging hook
    # The second-to-last argument is the test file to classify.
    test_file = open(sys.argv[len(sys.argv) - 2])

    # Read each training corpus fully into memory.
    for corpus_file in input_files:
        input_corpi.append([line for line in corpus_file])

    test_set = {}
    test_set["text"] = []
    test_set["id"] = []

    for line in test_file:
        split_line = line.split()
        for k, word in enumerate(split_line):
            if k == 0:
                # The first token is the review id; leave it unstemmed.
                continue
            split_line[k] = stem_word(word)

        line = " ".join(split_line)
        review_id = line[0:7]
        test_set["text"].append(line[8:])
        test_set["id"].append(review_id)

    for i, corpus_file in enumerate(input_corpi):
        # The class label ('T' or 'F') is taken from the first character of the file name.
        current_class = input_files[i].name[0]

        for line in corpus_file:
            split_line = line.split()
            for k, word in enumerate(split_line):
                if k == 0:
                    continue
                split_line[k] = stem_word(word)

            line = " ".join(split_line)
            review_id = line[0:7]
            corpus["text"].append(line[8:])
            corpus["id"].append(review_id)
            corpus["class"].append(current_class)

    # Keep the 100 features with the highest chi-squared score against the class labels.
    selector = SelectKBest(chi2, k=100)

    training_data = vectorize_for_training(corpus["text"])
    test_data = vectorize_for_testing(test_set["text"])

    other_training_data = selector.fit_transform(training_data, corpus["class"])
    filtered_test_data = selector.transform(test_data)

    # Train a linear SVM on the selected features and classify the test reviews.
    linear_svm = svm.LinearSVC()

    linear_svm.fit(other_training_data, corpus["class"])

    predictions = linear_svm.predict(filtered_test_data)
    print(predictions)

    t_count = 0
    f_count = 0

    for c in predictions:
        if c == "T":
            t_count += 1
        else:
            f_count += 1

    print("True Count: " + str(t_count))
    print("False Count: " + str(f_count))

    # Populate output_file: the last argument names the prediction file.
    output_file = open(sys.argv[len(sys.argv) - 1], "w")

    # Write one "<prediction>\t<review id>" line per test review.
    for i, prediction in enumerate(predictions):
        output_file.write(prediction + "\t" + test_set["id"][i] + "\n")