class TweetAnalyzer(StreamListener):
    """Stream listener that classifies each incoming tweet with Jubatus and
    prints the best-scoring label, highlighting tweets whose label matches
    the configured `highlight` label."""

    # Shared Jubatus classifier client; host/port/instance_name are
    # module-level configuration values.
    classifier = client.Classifier(host, port, instance_name)

    def __init__(self, highlight):
        super(TweetAnalyzer, self).__init__()
        # Label whose tweets are printed in red instead of plain text.
        self.highlight = highlight

    def on_status(self, status):
        """Classify the tweet text and print its top label plus the text."""
        if not hasattr(status, 'text'):
            return

        d = Datum({'text': status.text})
        result = self.classifier.classify([d])

        if len(result) > 0 and len(result[0]) > 0:
            # Sort candidate labels by descending score; est[0] is the winner.
            est = sorted(result[0], key=lambda est: est.score, reverse=True)

            print_green(est[0].label, end=" ")
            if est[0].label == self.highlight:
                print_red(status.text)
            else:
                print(status.text)

    def on_error(self, status_code):
        """Print a diagnostic for a Streaming API error.

        Returns False so tweepy stops on the first error (no retry).
        """
        # Map the HTTP status code to its reason phrase when known.
        status_msg = httplib.responses.get(status_code, str(status_code))
        # Fixed: the original used a Python-2 print *statement* here, which is
        # a syntax error on Python 3 and inconsistent with on_status's print().
        print("ERROR: Twitter Streaming API returned %d (%s)" % (status_code,
                                                                 status_msg))

        # return False to stop on first error (do not retry)
        return False
# Example #2
class Trainer(StreamListener):
    """Stream listener that turns geo-tagged tweets into Jubatus training
    examples, labelled by the location area the tweet falls inside."""

    # Shared Jubatus classifier client; host/port/instance_name come from
    # module-level configuration.
    classifier = client.Classifier(host, port, instance_name)

    def __init__(self, locations):
        super(Trainer, self).__init__()
        # Candidate location areas, each exposing .name and .is_inside().
        self.locations = locations

    def on_status(self, status):
        """Train the classifier from one tweet.

        Format of 'status' can be found in:
            https://dev.twitter.com/docs/platform-objects/tweets
        """
        # Ignore tweets without text or without usable coordinates.
        if not hasattr(status, 'text') or not hasattr(status, 'coordinates'):
            return
        if not status.coordinates or 'coordinates' not in status.coordinates:
            return

        lon_lat = status.coordinates['coordinates']
        # First known area containing the tweet's point, or None.
        loc = next(
            (area for area in self.locations
             if area.is_inside(lon_lat[0], lon_lat[1])),
            None)
        if not loc:
            # Tweet is outside every known location — nothing to learn from.
            return

        # Strip hashtags so the classifier learns from plain text only.
        detagged_text = remove_hashtags_from_tweet(
            status.text, status.entities['hashtags'])

        # Send one (label, datum) training pair to Jubatus.
        self.classifier.train([(loc.name, Datum({'text': detagged_text}))])

        # Echo what was trained.
        print_green(loc.name, ' ')
        print(detagged_text)

    def on_error(self, status_code):
        """Print a diagnostic for a Streaming API error.

        Returns False so tweepy stops on the first error (no retry).
        """
        try:
            status_msg = httplib.responses[status_code]
        except KeyError:
            status_msg = str(status_code)
        print(("ERROR: Twitter Streaming API returned %d (%s)" %
               (status_code, status_msg)))

        # return False to stop on first error (do not retry)
        return False
# Example #3
def estimate_location_for(text):
    """Classify `text` with Jubatus and print all candidate locations
    ordered by descending score, or a hint message when nothing is known."""
    classifier = client.Classifier(host, port, instance_name)

    # Create datum for Jubatus
    d = Datum({'text': text})

    # Send estimation query to Jubatus
    result = classifier.classify([d])

    # Fixed: the original indexed result[0] without first checking that
    # `result` itself is non-empty, risking an IndexError.
    if result and result[0]:
        # Sort results by score
        est = sorted(result[0], key=lambda e: e.score, reverse=True)

        # Print the result (print() function for py3 consistency with the
        # rest of the file; the original used py2 print statements here).
        print("Estimated Location for %s:" % text)
        for e in est:
            print("  " + e.label + " (" + str(e.score) + ")")
    else:
        # No estimation results; maybe we haven't trained enough
        print("No estimation results available.")
        print("Train more tweets or try using another text.")
def train_wikipedia_abstract(label, xmlfile):
    """Feed a Wikipedia abstract XML dump through a SAX parser whose
    handler trains the Jubatus classifier under the given `label`."""
    # The Handler does the actual training as elements are parsed.
    handler = Handler(client.Classifier(host, port, instance_name), label)

    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(handler)
    sax_parser.parse(xmlfile)
# Example #5
#!/usr/bin/env python

import sys, json, subprocess
import glob
import random
from jubatus.classifier import client
from jubatus.common import Datum

NAME = "a"
classifier = client.Classifier("127.0.0.1", 9199, NAME)

# One training file per label: ../dat/<label>_train.txt
file_list = glob.glob('../dat/*_train.txt')

# Pair each label with an open handle on its training file.
# Fixed: use a list comprehension instead of map() — on Python 3 map()
# returns an iterator, so `fds != []` was always True and `fds.remove(...)`
# raised AttributeError, even though this script already uses print().
fds = [[path.replace("_train.txt", ""), open(path, "r")] for path in file_list]

# Interleave the labels by sampling a random file each round until
# every file is exhausted.
while fds:
    [label, fd] = random.choice(fds)
    text = fd.readline()
    if text == "":
        # This file is exhausted: close the handle (the original leaked it)
        # and stop sampling from it.
        fd.close()
        fds.remove([label, fd])
        print("finished train of label %s \n" % (label))
        continue
    text_strip = text.rstrip()
    datum = Datum({"text": text_strip})
    print("train %s : %s ..." % (label, text_strip))
    # Send one (label, datum) training pair to Jubatus.
    classifier.train([(label, datum)])