Example #1

import tweepy

# Project-local helpers assumed by this snippet (their imports are not shown
# in the original): StopwordRemover, SimpleTokenizer, VocabNormalizer,
# TweetTextFilter.

class BaseClassifier(tweepy.StreamListener):
    def __init__(self):
        # Create the objects to prevent repeated constructions.
        self.remover = StopwordRemover()
        self.remover.build_lists()
        self.tokenizer = SimpleTokenizer()
        self.normalizer = VocabNormalizer()
        self.normalizer.build_map()

        super(BaseClassifier, self).__init__()

    def on_error(self, status_code):
        print("Error: " + repr(status_code))
        return False

    def on_status(self, status):
        # Filter out links and mentions first.
        text_filter = TweetTextFilter()
        text = text_filter.filter(status.text)

        # Tokenize the text.
        tokens = self.tokenizer.tokenize(text)
        tokens = self.remover.remove_all(tokens)

        # Normalize the vocabulary.
        tokens = self.normalizer.normalize(tokens)
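
A minimal sketch of wiring this listener into a stream using the pre-4.0
tweepy API that StreamListener belongs to; the credential strings and the
tracked keyword are hypothetical placeholders, not part of the original
example:

import tweepy

# Hypothetical credentials; substitute real values from a Twitter app.
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')

listener = BaseClassifier()
stream = tweepy.Stream(auth=auth, listener=listener)
stream.filter(track=['python'])  # Blocks; each status reaches on_status().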
Example #2

import json
import logging

import numpy
import tweepy

# Project-local helpers assumed by this snippet (their imports are not shown
# in the original): config, NLTKTokenizer, StopwordRemover, VocabNormalizer,
# TweetTextFilter, ClassifierNotTrainedException.

class BaseClassifier(tweepy.StreamListener):
    def __init__(self, feature_selector, tokenizer=NLTKTokenizer, **kwargs):
        # Set the feature selector.
        self.feature_selector_class = feature_selector

        # Create the objects to prevent repeated constructions.
        self.text_filter = TweetTextFilter()
        self.remover = StopwordRemover()
        self.remover.build_lists()
        self.tokenizer = tokenizer()
        self.normalizer = VocabNormalizer()
        self.normalizer.build_map()
        self.max_features = config.max_features

        # Initialize some state. train() logs through self.logger, which the
        # original never created, so set up a module-level logger here.
        self.logger = logging.getLogger(__name__)
        self.training_data = dict()
        self.trained = False
        self.results = list()

        super(BaseClassifier, self).__init__()

    def train(self, training_sets):
        # Don't allow retraining.
        if self.trained:
            raise RuntimeError('Classifier is already trained')

        for set_name in training_sets:
            training_file = training_sets[set_name]
            set_data = list()

            self.logger.info('Reading training set "{0}" ({1})...'.format(
                set_name, training_file))

            # Read JSON from the set: one JSON-encoded status per line.
            # Use a context manager so the file handle is always closed.
            with open(training_file, 'r') as f:
                for line in f:
                    status = json.loads(line)
                    term_vector = self.get_term_vector(status)
                    set_data.append(term_vector)

            self.training_data[set_name] = set_data

        self.logger.info('Reading training sets complete.')
        self.set_trained(True)

        # Create the feature selector.
        self.feature_selector = self.feature_selector_class(self.training_data)
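
    # After train(), self.training_data maps each set name to a list of
    # term-vector dicts, e.g. (hypothetical values):
    #   {'positive': [{'great': 2, 'phone': 1}, ...], 'negative': [...]}
    # This is the structure handed to the feature selector above.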

    def get_data_count(self):
        data_count = 0

        for category_name in self.training_data:
            category_data = self.training_data[category_name]
            data_count += len(category_data)

        return data_count

    def normalize_term_vector(self, term_vector, features):
        # Build a binary presence vector over the selected features.
        norm = list()
        for feature in features:
            if feature in term_vector:
                norm.append(1)
            else:
                norm.append(0)

        return numpy.array(norm)
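
    # Worked example (hypothetical values): with features ['good', 'bad', 'new']
    # and term_vector {'good': 2, 'new': 1}, this returns array([1, 0, 1]):
    # presence flags only; the raw counts are discarded.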

    def set_max_features(self, max_features):
        self.max_features = max_features

    def get_max_features(self):
        return self.max_features

    def set_trained(self, trained):
        self.trained = trained

    def get_trained(self):
        return self.trained

    def get_term_vector(self, status):
        # Filter out links and mentions first.
        if hasattr(status, '__getitem__'):
            text = self.text_filter.filter(status['text'])
        else:
            text = self.text_filter.filter(status.text)

        # Tokenize the text.
        tokens = self.tokenizer.tokenize(text)
        tokens = self.remover.remove_all(tokens)

        # Normalize the vocabulary.
        tokens = self.normalizer.normalize(tokens)

        # Create the term vector.
        term_vector = dict()
        for token in tokens:
            if token in term_vector:
                term_vector[token] += 1
            else:
                term_vector[token] = 1

        return term_vector
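
    # Worked example (hypothetical tweet): "Great phone, great battery" might
    # tokenize to ['great', 'phone', 'great', 'battery'] after filtering,
    # stopword removal, and normalization, giving the term vector
    # {'great': 2, 'phone': 1, 'battery': 1}.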

    def on_error(self, status_code):
        print("Error: " + repr(status_code))
        return False

    def on_status(self, status):
        if not self.trained:
            raise ClassifierNotTrainedException('Classifier must be trained '
                    'before use.')

    def publish_result(self, status, categories):
        self.print_categories(status, categories)
        self.results.append(categories)

    def get_results(self):
        return self.results

    def print_categories(self, status, categories):
        if not config.quiet_mode:
            if hasattr(status, '__getitem__'):
                status_text = status['text']
            else:
                status_text = status.text

            print(u'{0}: ({1})'.format(categories, status_text))
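
A minimal end-to-end sketch of driving this class, assuming a hypothetical
MutualInformationSelector feature-selector class and hypothetical JSON Lines
training files (one JSON-encoded status per line, each with at least a 'text'
field); none of these names come from the original example. A subclass would
override on_status() to do the actual classification; this only shows setup
and training:

classifier = BaseClassifier(feature_selector=MutualInformationSelector)

# Each value is a path to a JSON Lines file of statuses for that category.
classifier.train({
    'positive': 'data/positive.json',
    'negative': 'data/negative.json',
})

print(classifier.get_data_count())  # Total statuses read across all sets.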