Example 1
import json
import multiprocessing


def main():
    # The queues stream incremental results to the plotting and
    # confusion-matrix consumers started by start_plot().
    plot_queue = multiprocessing.Queue()
    confusion_queue = multiprocessing.Queue()
    start_plot(plot_queue, confusion_queue)

    # Each configuration pairs a classifier with SemEval training/test data.
    first = (SingleClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)
    second = (HierarchicalClassifier, semeval.TrainSemEvalSelfLearning, semeval.TestSemEval)

    # Extract unigram and bigram features.
    extractor = FeatureExtractor(tokenizer=tokenizer)
    extractor.min_n, extractor.max_n = 1, 2

    config = get_config()
    chunk_size = config.getint("sentiment", "chunk_size")
    first_chunk = config.getint("sentiment", "first_chunk")
    titles = json.loads(config.get("sentiment", "titles"))

    experiment = run_experiment(first, second, extractor, chunk_size, first_chunk)

    try:
        for data in experiment:
            data[titles[0]] = parse_performance(data[titles[0]])
            data[titles[1]] = parse_performance(data[titles[1]])
            plot_queue.put(data)
            confusion_queue.put(data)
            print(data[titles[0]]["count"],
                  data[titles[0]]["SemEval"], data[titles[1]]["SemEval"],
                  data[titles[0]]["vocab"], data[titles[1]]["vocab"])
    except KeyboardInterrupt:
        pass
    finally:
        # A None sentinel tells each consumer process to stop reading.
        plot_queue.put(None)
        plot_queue.close()
        confusion_queue.put(None)
        confusion_queue.close()

    print "Done processing."
Example 2
def setup(self):
    self.extractor = FeatureExtractor()
    self.document = 'I am so happy about this project'
Example 3
class TestFeatureExtractor(object):
    """Tests for FeatureExtractor, which extracts features from text for classification."""

    def setup(self):
        self.extractor = FeatureExtractor()
        self.document = 'I am so happy about this project'

    def test_no_features(self):
        # With min_n and max_n left at their defaults, extract() yields nothing.
        result = self.extractor.extract(self.document)
        assert result == []

    def test_tokenize_default(self):
        result = self.extractor.tokenize(self.document)
        expected = self.document.split()
        assert result == expected

    def test_tokenize_custom(self):
        def custom(document):
            return document.lower().split(' ')

        self.extractor = FeatureExtractor(tokenizer=custom)
        assert self.extractor.tokenize is custom

    def test_ngrams(self):
        test = [(1, 1, ['I', 'am', 'so', 'happy', 'about', 'this', 'project']),
                (2, 2, [
                    '__start__ I', 'I am', 'am so', 'so happy', 'happy about',
                    'about this', 'this project', 'project __end__'
                ]),
                (3, 3, [
                    '__start__ I am', 'I am so', 'am so happy',
                    'so happy about', 'happy about this', 'about this project',
                    'this project __end__'
                ])]
        test.append((1, 2, test[0][2] + test[1][2]))
        test.append((2, 3, test[1][2] + test[2][2]))
        test.append((1, 3, test[0][2] + test[1][2] + test[2][2]))
        failed = []
        for min_n, max_n, expected in test:
            self.extractor.min_n, self.extractor.max_n = min_n, max_n
            result = sorted(self.extractor.extract(self.document))
            # Convert each expected n-gram string to a tuple of tokens.
            expected = [tuple(x.split()) for x in expected]
            expected = sorted(expected)
            if result != expected:
                failed.append(((min_n, max_n), result, expected))

        # This explicitly shows the expected tuple return type.
        document = 'this is a test'
        expected = [('__start__', 'this'), ('this', 'is'), ('is', 'a'),
                    ('a', 'test'), ('test', '__end__')]
        expected = sorted(expected)
        self.extractor.min_n, self.extractor.max_n = 2, 2
        result = sorted(self.extractor.extract(document))
        if result != expected:
            failed.append(((2, 2), result, expected))

        assert not failed

    def test_ngrams_multinomialnb(self):
        # Integration test with Naive Bayes classifier.
        classifier = MultinomialNB()
        self.extractor.min_n, self.extractor.max_n = 1, 3
        features = self.extractor.extract(self.document)
        classifier.train([features, 'positive'])

    def test_ngrams_non_zero(self):
        bad_ranges = [(0, 1), (1, 0)]
        for min_n, max_n in bad_ranges:
            self.extractor.min_n, self.extractor.max_n = min_n, max_n
            assert_raises(ValueError, self.extractor.extract, self.document)

    def test_ngrams_non_negative(self):
        bad_ranges = [(-1, 1), (1, -1), (-2, -1), (-1, 0), (0, -1)]
        for min_n, max_n in bad_ranges:
            self.extractor.min_n, self.extractor.max_n = min_n, max_n
            assert_raises(ValueError, self.extractor.extract, self.document)

    def test_ngrams_non_reversed(self):
        bad_ranges = [(2, 1), (3, 1), (3, 2)]
        for min_n, max_n in bad_ranges:
            self.extractor.min_n, self.extractor.max_n = min_n, max_n
            assert_raises(ValueError, self.extractor.extract, self.document)
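
The FeatureExtractor class itself does not appear on this page. As a reading aid, here is a minimal sketch that would satisfy the tests above; the zero defaults, the single boundary marker per side, and the error message are assumptions inferred from the assertions, not the project's actual implementation:

class FeatureExtractor(object):
    """Sketch of an n-gram feature extractor inferred from the tests."""

    def __init__(self, tokenizer=None):
        # A custom tokenizer callable replaces the default method outright.
        if tokenizer is not None:
            self.tokenize = tokenizer
        # min_n == max_n == 0 is the "no features" default; it is not an error.
        self.min_n = 0
        self.max_n = 0

    def tokenize(self, document):
        # Default tokenizer: split on any whitespace.
        return document.split()

    def extract(self, document):
        if self.min_n == 0 and self.max_n == 0:
            return []
        if self.min_n < 1 or self.max_n < 1 or self.min_n > self.max_n:
            raise ValueError('bad n-gram range (%d, %d)' % (self.min_n, self.max_n))
        tokens = self.tokenize(document)
        features = []
        for n in range(self.min_n, self.max_n + 1):
            # Unigrams are unpadded; longer n-grams get one __start__ and
            # one __end__ marker, matching the expected values in the tests.
            padded = tokens if n == 1 else ['__start__'] + tokens + ['__end__']
            features.extend(tuple(padded[i:i + n])
                            for i in range(len(padded) - n + 1))
        return features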
Example 4
    def test_tokenize_custom(self):
        def custom(document):
            return document.lower().split(' ')

        self.extractor = FeatureExtractor(tokenizer=custom)
        assert self.extractor.tokenize is custom
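
The assertion only checks that the injected callable replaces the extractor's tokenize hook. A short hypothetical usage of the same hook, assuming the tuple-of-tokens output exercised in Example 3:

def lowercase_tokenizer(document):
    return document.lower().split(' ')


extractor = FeatureExtractor(tokenizer=lowercase_tokenizer)
extractor.min_n, extractor.max_n = 1, 1
print(extractor.extract('I am SO happy'))
# [('i',), ('am',), ('so',), ('happy',)]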