def main():
    """Run the self-learning sentiment experiment and stream results to plots.

    Drives two classifier configurations (single vs. hierarchical) over the
    SemEval self-learning train/test split, pushing each chunk's parsed
    performance onto two queues consumed by plotting processes.

    NOTE(review): Python 2 code (print statements). Relies on module-level
    names (multiprocessing, start_plot, SingleClassifier, semeval, ...) that
    are imported elsewhere in this file.
    """
    # Queues feed the plotting side; start_plot presumably spawns the
    # consumer process(es) -- TODO confirm against its definition.
    plot_queue = multiprocessing.Queue()
    confusion_queue = multiprocessing.Queue()
    start_plot(plot_queue, confusion_queue)
    # Two experiment arms sharing the same train/test data, differing only
    # in classifier class.
    first = (SingleClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)
    second = (HierarchicalClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)
    # Unigrams and bigrams only.
    extractor = FeatureExtractor(tokenizer=tokenizer)
    extractor.min_n, extractor.max_n = 1, 2
    # Experiment parameters come from the [sentiment] config section;
    # "titles" is a JSON-encoded list naming the two result columns.
    config = get_config()
    chunk_size = config.getint("sentiment", "chunk_size")
    first_chunk = config.getint("sentiment", "first_chunk")
    titles = json.loads(config.get("sentiment", "titles"))
    experiment = run_experiment(first, second, extractor, chunk_size, first_chunk)
    try:
        for data in experiment:
            # Normalize raw performance records in place before queueing.
            data[titles[0]] = parse_performance(data[titles[0]])
            data[titles[1]] = parse_performance(data[titles[1]])
            plot_queue.put(data)
            confusion_queue.put(data)
            # Progress line: chunk count, both SemEval scores, both vocab sizes.
            print data[titles[0]]["count"], data[titles[0]]["SemEval"], data[titles[1]]["SemEval"], data[titles[0]]["vocab"], data[titles[1]]["vocab"]
    except KeyboardInterrupt:
        # Ctrl-C stops the experiment early; cleanup still runs below.
        pass
    finally:
        # Sentinel None tells each consumer to shut down, then release the
        # queue feeder threads.
        plot_queue.put(None)
        plot_queue.close()
        confusion_queue.put(None)
        confusion_queue.close()
        print "Done processing."
def setup(self):
    """Give each test a pristine extractor and a short sample sentence."""
    self.document = 'I am so happy about this project'
    self.extractor = FeatureExtractor()
class TestFeatureExtractor(object):
    """Extract features from text for use in classification."""

    def setup(self):
        self.extractor = FeatureExtractor()
        self.document = 'I am so happy about this project'

    def test_no_features(self):
        # A default extractor yields nothing.
        assert self.extractor.extract(self.document) == []

    def test_tokenize_default(self):
        # Default tokenization is plain whitespace splitting.
        assert self.extractor.tokenize(self.document) == self.document.split()

    def test_tokenize_custom(self):
        # A tokenizer handed to the constructor is stored verbatim.
        def custom(document):
            return document.lower().split(' ')
        self.extractor = FeatureExtractor(tokenizer=custom)
        assert self.extractor.tokenize is custom

    def test_ngrams(self):
        unigrams = ['I', 'am', 'so', 'happy', 'about', 'this', 'project']
        bigrams = ['__start__ I', 'I am', 'am so', 'so happy', 'happy about',
                   'about this', 'this project', 'project __end__']
        trigrams = ['__start__ I am', 'I am so', 'am so happy',
                    'so happy about', 'happy about this',
                    'about this project', 'this project __end__']
        cases = [
            (1, 1, unigrams),
            (2, 2, bigrams),
            (3, 3, trigrams),
            (1, 2, unigrams + bigrams),
            (2, 3, bigrams + trigrams),
            (1, 3, unigrams + bigrams + trigrams),
        ]
        failures = []
        for low, high, expected in cases:
            self.extractor.min_n, self.extractor.max_n = low, high
            got = sorted(self.extractor.extract(self.document))
            # Features come back as tuples of tokens, so normalize the strings.
            want = sorted(tuple(gram.split()) for gram in expected)
            if got != want:
                failures.append(((low, high), got, want))
        # This explicitly shows the expected tuple return type.
        doc = 'this is a test'
        want = sorted([('__start__', 'this'), ('this', 'is'), ('is', 'a'),
                       ('a', 'test'), ('test', '__end__')])
        self.extractor.min_n, self.extractor.max_n = 2, 2
        got = sorted(self.extractor.extract(doc))
        if got != want:
            failures.append(((2, 2), got, want))
        assert not failures

    def test_ngrams_multinomialnb(self):
        # Integration test with Naive Bayes classifier.
        classifier = MultinomialNB()
        self.extractor.min_n, self.extractor.max_n = 1, 3
        features = self.extractor.extract(self.document)
        classifier.train([features, 'positive'])

    def _assert_bad_ranges(self, ranges):
        # Every (min_n, max_n) pair here must make extract() raise ValueError.
        for low, high in ranges:
            self.extractor.min_n, self.extractor.max_n = low, high
            assert_raises(ValueError, self.extractor.extract, self.document)

    def test_ngrams_non_zero(self):
        self._assert_bad_ranges([(0, 1), (1, 0)])

    def test_ngrams_non_negative(self):
        self._assert_bad_ranges([(-1, 1), (1, -1), (-2, -1), (-1, 0), (0, -1)])

    def test_ngrams_non_reversed(self):
        self._assert_bad_ranges([(2, 1), (3, 1), (3, 2)])
def test_tokenize_custom(self):
    """The tokenizer callable given to the constructor is used as-is."""
    def lowered_split(document):
        return document.lower().split(' ')
    self.extractor = FeatureExtractor(tokenizer=lowered_split)
    assert self.extractor.tokenize is lowered_split
class TestFeatureExtractor(object):
    """Extract features from text for use in classification."""

    def setup(self):
        self.extractor = FeatureExtractor()
        self.document = "I am so happy about this project"

    def test_no_features(self):
        # Out of the box the extractor produces an empty feature list.
        result = self.extractor.extract(self.document)
        assert result == []

    def test_tokenize_default(self):
        # Default tokenizer behaves like str.split().
        result = self.extractor.tokenize(self.document)
        assert result == self.document.split()

    def test_tokenize_custom(self):
        # A user-supplied tokenizer replaces the default one untouched.
        def custom(document):
            return document.lower().split(" ")

        self.extractor = FeatureExtractor(tokenizer=custom)
        assert self.extractor.tokenize is custom

    def test_ngrams(self):
        grams_1 = ["I", "am", "so", "happy", "about", "this", "project"]
        grams_2 = [
            "__start__ I",
            "I am",
            "am so",
            "so happy",
            "happy about",
            "about this",
            "this project",
            "project __end__",
        ]
        grams_3 = [
            "__start__ I am",
            "I am so",
            "am so happy",
            "so happy about",
            "happy about this",
            "about this project",
            "this project __end__",
        ]
        table = [
            (1, 1, grams_1),
            (2, 2, grams_2),
            (3, 3, grams_3),
            (1, 2, grams_1 + grams_2),
            (2, 3, grams_2 + grams_3),
            (1, 3, grams_1 + grams_2 + grams_3),
        ]
        bad = []
        for lo, hi, strings in table:
            self.extractor.min_n, self.extractor.max_n = lo, hi
            actual = sorted(self.extractor.extract(self.document))
            # Extraction returns token tuples, so split the string forms.
            wanted = sorted([tuple(s.split()) for s in strings])
            if actual != wanted:
                bad.append(((lo, hi), actual, wanted))
        # This explicitly shows the expected tuple return type.
        sentence = "this is a test"
        wanted = sorted(
            [
                ("__start__", "this"),
                ("this", "is"),
                ("is", "a"),
                ("a", "test"),
                ("test", "__end__"),
            ]
        )
        self.extractor.min_n, self.extractor.max_n = 2, 2
        actual = sorted(self.extractor.extract(sentence))
        if actual != wanted:
            bad.append(((2, 2), actual, wanted))
        assert not bad

    def test_ngrams_multinomialnb(self):
        # Integration test with Naive Bayes classifier.
        classifier = MultinomialNB()
        self.extractor.min_n, self.extractor.max_n = 1, 3
        features = self.extractor.extract(self.document)
        classifier.train([features, "positive"])

    def test_ngrams_non_zero(self):
        # Zero-length grams are rejected.
        for lo, hi in [(0, 1), (1, 0)]:
            self.extractor.min_n, self.extractor.max_n = lo, hi
            assert_raises(ValueError, self.extractor.extract, self.document)

    def test_ngrams_non_negative(self):
        # Negative gram sizes are rejected.
        for lo, hi in [(-1, 1), (1, -1), (-2, -1), (-1, 0), (0, -1)]:
            self.extractor.min_n, self.extractor.max_n = lo, hi
            assert_raises(ValueError, self.extractor.extract, self.document)

    def test_ngrams_non_reversed(self):
        # min_n greater than max_n is rejected.
        for lo, hi in [(2, 1), (3, 1), (3, 2)]:
            self.extractor.min_n, self.extractor.max_n = lo, hi
            assert_raises(ValueError, self.extractor.extract, self.document)
def test_tokenize_custom(self):
    """Constructor-supplied tokenizers are adopted without wrapping."""
    def split_lowercased(document):
        return document.lower().split(" ")

    self.extractor = FeatureExtractor(tokenizer=split_lowercased)
    assert self.extractor.tokenize is split_lowercased
def setup(self):
    """Provide every test with a fresh extractor and a fixed sample text."""
    self.document = "I am so happy about this project"
    self.extractor = FeatureExtractor()