Example #1
def process_dataset(args):
    training_header, training_set = utils.load_csv(args.input)
    if len(training_set) == 0:
        dataset.error("Invalid input: file does not exist or is empty.")
    
    test_set = []
    data = []
    if args.split:
        test_set = core.split_data(data=training_set, percentage=args.split)
    elif args.test_set:
        test_header, test_set = utils.load_csv(args.test_set)
        if len(test_set) == 0:
            dataset.error("Invalid test set: file does not exist or is empty.")
    elif not args.data:
        dataset.error("one of the arguments -d/--data -e/--test_set -s/--split is required")

    if args.data:
        data_header, data = utils.load_csv(args.data)

    # if len(args.category) > 1:
    #     categories = sorted(args.category)
    #     args.category = utils.aglutinate(training_header, categories)
    #     utils.aglutinate(training_set, categories)
    #     utils.aglutinate(test_set, categories)
    #     utils.aglutinate(data, categories)

    if args.ignore:
        utils.ignore_columns(training_set, args.ignore)
        utils.ignore_columns(test_set, args.ignore)
        utils.ignore_columns(data, args.ignore)
        # Shift the category index left by one for each ignored column that
        # precedes it. Count against the original index in a single pass; the
        # original loop decremented args.category while comparing against it,
        # which undercounts when several ignored columns precede the category.
        args.category -= sum(1 for index in args.ignore if index < args.category)


    return (training_header, training_set, test_set, data)
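A minimal sketch of the argument handling this function assumes: the flag names -d/--data, -e/--test_set, and -s/--split come straight from the error message above, while -i/--input and the float type for --split are guesses. argparse's error() prints the usage line to stderr and exits with status 2, which is the behavior the snippet relies on.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True)  # hypothetical flag name
parser.add_argument("-d", "--data")
parser.add_argument("-e", "--test_set")
parser.add_argument("-s", "--split", type=float)     # float type is a guess
args = parser.parse_args(["-i", "train.csv", "-s", "0.2"])

if not (args.data or args.test_set or args.split):
    # Mirrors the snippet's error handling: print usage, then exit(2).
    parser.error("one of the arguments -d/--data -e/--test_set -s/--split is required")
print(args)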
Example #2

# Assumed imports for this snippet; cm, match, label_docs, filter_corpus,
# select_word_features, and split_data are helpers from the surrounding project.
import cytoolz
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy


def doc_features(doc):
    # Term frequencies of the document, after stop-word filtering.
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))

    # initialize to 0
    features = zero_features.copy()

    # Vocabulary words that actually occur in this document.
    word_matches = match(doc_words, word_features)

    for word in word_matches:
        features[word] = doc_words[word]

    return features


def make_features(docs):
    return [(doc_features(d), c) for (d, c) in docs]


if __name__ == "__main__":
    labeled_docs = label_docs()

    filtered = filter_corpus()
    word_features = select_word_features(filtered)
    zero_features = dict.fromkeys(word_features, 0)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)
    print "Accuracy", accuracy(classifier, test_set)
    print classifier.show_most_informative_features()
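The extraction above is a fixed-vocabulary bag of words: every document gets the same keys, and only the counts of matched vocabulary words are nonzero. A self-contained sketch of the same idea, with stand-ins for the project helpers cm.filter_sw and match:

import cytoolz

STOPWORDS = {"the", "a", "of"}       # stand-in for cm's stop-word list
word_features = ["data", "model"]    # stand-in vocabulary
zero_features = dict.fromkeys(word_features, 0)

def doc_features(doc):
    # Count tokens, skipping stop words (the role of cm.filter_sw).
    doc_words = cytoolz.frequencies(w for w in doc if w not in STOPWORDS)
    features = zero_features.copy()
    # Keep counts only for vocabulary words present in the document
    # (the role of match(doc_words, word_features)).
    for word in set(doc_words) & set(word_features):
        features[word] = doc_words[word]
    return features

print(doc_features(["the", "data", "of", "data", "model"]))
# -> {'data': 2, 'model': 1}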
Example #3
def random_subset_of_samples(samples, ratio):
    # split_data returns (first partition, rest); keep only the first,
    # which holds roughly `ratio` of the samples.
    return split_data(samples, ratio)[0]
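How the wrapper behaves, with a hypothetical split_data standing in for the project's own (the real one is imported elsewhere; this stand-in just shuffles a copy and cuts it at the given ratio):

import random

def split_data(samples, ratio):
    # Hypothetical stand-in: shuffle a copy, then split at the ratio.
    shuffled = random.sample(samples, len(samples))
    cut = int(len(shuffled) * ratio)
    return shuffled[:cut], shuffled[cut:]

def random_subset_of_samples(samples, ratio):
    return split_data(samples, ratio)[0]

print(len(random_subset_of_samples(list(range(10)), 0.3)))  # -> 3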