class TweetAnalyzer(StreamListener):
    """Tweepy stream listener that classifies each incoming tweet with
    Jubatus and prints the estimated label next to the tweet text.

    Tweets whose top label equals *highlight* are printed in red.
    """

    # Shared Jubatus client; host/port/instance_name come from module scope.
    classifier = client.Classifier(host, port, instance_name)

    def __init__(self, highlight):
        super(TweetAnalyzer, self).__init__()
        # Label whose tweets should be emphasized in red.
        self.highlight = highlight

    def on_status(self, status):
        """Classify one tweet and print its best-scoring label."""
        if not hasattr(status, 'text'):
            return

        d = Datum({'text': status.text})
        result = self.classifier.classify([d])

        # Guard both the outer result list and the per-datum estimate list.
        if len(result) > 0 and len(result[0]) > 0:
            # sort the result in order of score
            est = sorted(result[0], key=lambda est: est.score, reverse=True)
            print_green(est[0].label, end=" ")
            if est[0].label == self.highlight:
                print_red(status.text)
            else:
                print(status.text)

    def on_error(self, status_code):
        """Report a Streaming API error and stop the stream."""
        if status_code in httplib.responses:
            status_msg = httplib.responses[status_code]
        else:
            status_msg = str(status_code)
        # Parenthesized single-argument print is valid on both Python 2 and 3
        # and matches the sibling Trainer.on_error implementation (the
        # original used a Python-2-only print statement here).
        print("ERROR: Twitter Streaming API returned %d (%s)"
              % (status_code, status_msg))
        # return False to stop on first error (do not retry)
        return False
class Trainer(StreamListener):
    """Tweepy stream listener that feeds geo-tagged tweets to Jubatus as
    labeled training data, using the tweet's coordinates to pick a label."""

    # Shared Jubatus client; host/port/instance_name come from module scope.
    classifier = client.Classifier(host, port, instance_name)

    def __init__(self, locations):
        super(Trainer, self).__init__()
        self.locations = locations

    '''
    Format of 'status' can be found in:
    https://dev.twitter.com/docs/platform-objects/tweets
    '''
    def on_status(self, status):
        # Guard clauses: skip tweets lacking text or usable coordinates.
        if not hasattr(status, 'text'):
            return
        if not hasattr(status, 'coordinates'):
            return
        if not status.coordinates or not 'coordinates' in status.coordinates:
            return

        point = status.coordinates['coordinates']
        loc = None
        for candidate in self.locations:
            if candidate.is_inside(point[0], point[1]):
                loc = candidate
                break
        if not loc:
            # Tweet came from a location we are not tracking.
            return

        # Strip hashtags so the classifier learns from plain text only.
        tags = status.entities['hashtags']
        cleaned = remove_hashtags_from_tweet(status.text, tags)

        # Create datum for Jubatus and send the training example.
        datum = Datum({'text': cleaned})
        self.classifier.train([(loc.name, datum)])

        # Echo the trained tweet with its label.
        print_green(loc.name, ' ')
        print(cleaned)

    def on_error(self, status_code):
        # Fall back to the numeric code when it has no standard reason phrase.
        status_msg = httplib.responses.get(status_code, str(status_code))
        print(("ERROR: Twitter Streaming API returned %d (%s)"
               % (status_code, status_msg)))
        # return False to stop on first error (do not retry)
        return False
def estimate_location_for(text):
    """Classify *text* with Jubatus and print the candidate locations
    ordered by descending score, or a hint when nothing is estimated."""
    classifier = client.Classifier(host, port, instance_name)

    # Create datum for Jubatus
    d = Datum({'text': text})

    # Send estimation query to Jubatus
    result = classifier.classify([d])

    # Guard the outer list as well as the per-datum estimates: the original
    # indexed result[0] unconditionally, which raises IndexError on an empty
    # classify() result. TweetAnalyzer.on_status uses this same double guard.
    if len(result) > 0 and len(result[0]) > 0:
        # Sort results by score
        est = sorted(result[0], key=lambda e: e.score, reverse=True)

        # Print the result (parenthesized prints work on Python 2 and 3;
        # the original used Python-2-only print statements).
        print("Estimated Location for %s:" % text)
        for e in est:
            print(" %s (%s)" % (e.label, str(e.score)))
    else:
        # No estimation results; maybe we haven't trained enough
        print("No estimation results available.")
        print("Train more tweets or try using another text.")
def train_wikipedia_abstract(label, xmlfile):
    """Stream-parse the Wikipedia abstract dump *xmlfile* with SAX and
    train the Jubatus classifier under *label* via the project Handler."""
    jubatus_client = client.Classifier(host, port, instance_name)
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(Handler(jubatus_client, label))
    sax_parser.parse(xmlfile)
#!/usr/bin/env python
"""Interleaved trainer: reads every ../dat/*_train.txt file and feeds each
line to a local Jubatus classifier as a (label, datum) pair, picking files
at random so examples from different labels are mixed during training.
The label is the file name with the `_train.txt` suffix removed."""
import sys, json, subprocess
import glob
import random

from jubatus.classifier import client
from jubatus.common import Datum

NAME = "a"
classifier = client.Classifier("127.0.0.1", 9199, NAME)

file_list = glob.glob('../dat/*_train.txt')
# Materialize [label, file-handle] pairs as a real list: under Python 3,
# map() returns a lazy iterator, so the original `fds != []` loop test was
# always true and random.choice() would raise TypeError.
fds = [[path.replace("_train.txt", ""), open(path, "r")]
       for path in file_list]

while fds:
    [label, fd] = random.choice(fds)
    text = fd.readline()
    if text == "":
        # File exhausted: close the handle (the original leaked it) and
        # stop drawing from it.
        fd.close()
        fds.remove([label, fd])
        print("finished train of label %s \n" % (label))
        continue
    text_strip = text.rstrip()
    datum = Datum({"text": text_strip})
    print("train %s : %s ..." % (label, text_strip))
    classifier.train([(label, datum)])