return set(part[1:] for part in s.split() if part.startswith('#')) def train(stream, filename): i = 0 with open(filename, 'rb') as file_in: for line in file_in: try: tweet = json.loads(line) text = tweet['text'] print text hashtags = extract_hashtags(text) for hashtag in hashtags: event = {'text': text, 'hashtag': hashtag} ok = stream.train(event, types={'text': 'TEXT'}) if not ok: break except KeyError: pass fs.set_endpoint('http://vm:8088/mungio/api') stream = fs.start_stream(targets={'hashtag': 'CATEGORIC'}) print 'training from ', filename train(stream, filename) print 'getting stream info' print stream.get_info() stream.close()
def extract_hashtags(s): return set(part[1:] for part in s.split() if part.startswith('#')) def train(stream, filename): i=0 with open(filename, 'rb') as file_in: for line in file_in: try: tweet = json.loads(line) text = tweet['text'] print text hashtags = extract_hashtags(text) for hashtag in hashtags: event = {'text': text, 'hashtag':hashtag} ok=stream.train(event,types={'text':'TEXT'}) if not ok: break except KeyError: pass fs.set_endpoint('http://vm:8088/mungio/api') stream = fs.start_stream(targets={'hashtag':'CATEGORIC'}) print 'training from ',filename train(stream, filename) print 'getting stream info' print stream.get_info() stream.close()
import featurestream as fs import random import sys import pprint from collections import defaultdict fs.set_endpoint('http://192.168.2.3:8080') def classify_test(f,n=4000,mink=0,maxk=100): stream = fs.start_stream(learner='rf_classifier', target='f') print 'n=',n,'stream_id=',stream.stream_id # train phase for _ in xrange(n): j = random.randint(mink,maxk) event={} event['value']=j event['f']=f(j) ok = stream.train(event) if not ok: return # test phase # todo accuracy = stream.stats('accuracy') stream.close() return accuracy def train_threshold_event(n,label_prob,stream,k,d,attrs,thresholds): for _ in xrange(n): event = dict(('d.'+str(i),random.randint(0,k)) for i in xrange(d)) if random.random() <= label_prob:
import featurestream as fs import random import sys import pprint from collections import defaultdict fs.set_endpoint('http://192.168.2.3:8080') def classify_test(f, n=4000, mink=0, maxk=100): stream = fs.start_stream(learner='rf_classifier', target='f') print 'n=', n, 'stream_id=', stream.stream_id # train phase for _ in xrange(n): j = random.randint(mink, maxk) event = {} event['value'] = j event['f'] = f(j) ok = stream.train(event) if not ok: return # test phase # todo accuracy = stream.stats('accuracy') stream.close() return accuracy def train_threshold_event(n, label_prob, stream, k, d, attrs, thresholds): for _ in xrange(n):