def process_dataset(args):
    """Load the training, test and unlabeled-data CSVs named by *args*.

    Exactly one of ``args.split``, ``args.test_set`` or ``args.data`` must be
    supplied alongside ``args.input``; otherwise an error is reported via
    ``dataset.error``.  Columns listed in ``args.ignore`` are dropped from
    every loaded set and ``args.category`` is shifted to keep pointing at the
    same logical column.

    Returns:
        tuple: (training_header, training_set, test_set, data)
    """
    training_header, training_set = utils.load_csv(args.input)
    if not training_set:
        dataset.error("Invalid input: file does not exist or is empty.")

    test_set = []
    data = []
    if args.split:
        # Carve the test set out of the loaded training data.
        test_set = core.split_data(data=training_set, percentage=args.split)
    elif args.test_set:
        test_header, test_set = utils.load_csv(args.test_set)
        if not test_set:
            dataset.error("Invalid test set: file does not exist or is empty.")
    elif not args.data:
        dataset.error("one of the arguments -d/--data -e/--test_set -s/--split is required")

    if args.data:
        data_header, data = utils.load_csv(args.data)

    if args.ignore:
        utils.ignore_columns(training_set, args.ignore)
        utils.ignore_columns(test_set, args.ignore)
        utils.ignore_columns(data, args.ignore)
        # Shift the category index left once per removed column that precedes
        # it, counting against the ORIGINAL index.  The previous in-place
        # decrement loop re-compared each ignored index against the already
        # decremented value, which under-counted whenever several ignored
        # columns preceded the category (and made the result depend on the
        # order of args.ignore).
        args.category -= sum(1 for index in args.ignore if index < args.category)

    return (training_header, training_set, test_set, data)
def doc_features(doc):
    """Build the feature dict for *doc*.

    Every selected feature word is mapped to its frequency in the document,
    defaulting to 0 for words that do not occur (via the prebuilt
    ``zero_features`` template).
    """
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))
    # Copy the zero-initialized template so absent words stay at 0.
    features = zero_features.copy()
    word_matches = match(doc_words, word_features)
    for word in word_matches:
        features[word] = doc_words[word]
    return features


def make_features(docs):
    """Pair each labeled document with its feature dict."""
    return [(doc_features(d), c) for (d, c) in docs]


if __name__ == "__main__":
    labeled_docs = label_docs()
    filtered = filter_corpus()
    word_features = select_word_features(filtered)
    zero_features = dict.fromkeys(word_features, 0)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)
    # Python 3 print function; the original used the Python 2 print
    # statement, which is a syntax error on any modern interpreter.
    print("Accuracy", accuracy(classifier, test_set))
    # show_most_informative_features() prints its table itself and returns
    # None; wrapping it in print (as the original did) emitted a stray
    # "None" line after the table.
    classifier.show_most_informative_features()
def random_subset_of_samples(samples, ratio):
    """Return the first of the two partitions that ``split_data`` produces
    from *samples* at *ratio* — i.e. a subset of roughly that proportion."""
    subset, _remainder = split_data(samples, ratio)
    return subset
def doc_features(doc):
    """Build the feature dict for *doc*.

    Every selected feature word is mapped to its frequency in the document,
    defaulting to 0 for words that do not occur (via the prebuilt
    ``zero_features`` template).
    """
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))
    # Copy the zero-initialized template so absent words stay at 0.
    features = zero_features.copy()
    word_matches = match(doc_words, word_features)
    for word in word_matches:
        # (Redundant parentheses around the value in the original removed.)
        features[word] = doc_words[word]
    return features


def make_features(docs):
    """Pair each labeled document with its feature dict."""
    return [(doc_features(d), c) for (d, c) in docs]


if __name__ == "__main__":
    labeled_docs = label_docs()
    filtered = filter_corpus()
    word_features = select_word_features(filtered)
    zero_features = dict.fromkeys(word_features, 0)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)
    # Python 3 print function; the original used the Python 2 print
    # statement, which is a syntax error on any modern interpreter.
    print("Accuracy", accuracy(classifier, test_set))
    # show_most_informative_features() prints its table itself and returns
    # None; wrapping it in print (as the original did) emitted a stray
    # "None" line after the table.
    classifier.show_most_informative_features()