return set(part[1:] for part in s.split() if part.startswith('#'))


def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                tweet = json.loads(line)
                text = tweet['text']
                print text
                hashtags = extract_hashtags(text)
                for hashtag in hashtags:
                    event = {'text': text, 'hashtag': hashtag}
                    ok = stream.train(event, types={'text': 'TEXT'})
                    if not ok: break
            except KeyError:
                pass


fs.set_endpoint('http://vm:8088/mungio/api')
stream = fs.start_stream(targets={'hashtag': 'CATEGORIC'})

print 'training from ', filename
train(stream, filename)

print 'getting stream info'
print stream.get_info()

stream.close()
def extract_hashtags(s):
	"""Return the set of hashtags in `s`, without their leading '#'.

	`s` is split on whitespace and every token that starts with '#'
	contributes the rest of that token. A bare '#' is ignored so the empty
	string is never reported as a hashtag.
	"""
	return set(
		part[1:]
		for part in s.split()
		if part.startswith('#') and len(part) > 1
	)

def train(stream, filename):
	i=0
	with open(filename, 'rb') as file_in:
		for line in file_in:
			try:
				tweet = json.loads(line)
				text = tweet['text']
				print text
				hashtags = extract_hashtags(text)
				for hashtag in hashtags:
					event = {'text': text, 'hashtag':hashtag}
					ok=stream.train(event,types={'text':'TEXT'})
					if not ok: break
			except KeyError:
				pass

fs.set_endpoint('http://vm:8088/mungio/api')
stream = fs.start_stream(targets={'hashtag':'CATEGORIC'})

print 'training from ',filename
train(stream, filename)

print 'getting stream info'
print stream.get_info()

stream.close()
import featurestream as fs
import random
import sys
import pprint
from collections import defaultdict

# Set the endpoint used by the featurestream client (fs); the address is
# hard-coded here.
fs.set_endpoint('http://192.168.2.3:8080')

def classify_test(f,n=4000,mink=0,maxk=100):
	stream = fs.start_stream(learner='rf_classifier', target='f')
	print 'n=',n,'stream_id=',stream.stream_id
	# train phase
	for _ in xrange(n):
		j = random.randint(mink,maxk)
		event={}
		event['value']=j
		event['f']=f(j)
		ok = stream.train(event)
		if not ok: return
	# test phase
	# todo

	accuracy = stream.stats('accuracy')
	stream.close()
	return accuracy

def train_threshold_event(n,label_prob,stream,k,d,attrs,thresholds):

	for _ in xrange(n):
		event = dict(('d.'+str(i),random.randint(0,k)) for i in xrange(d))
		if random.random() <= label_prob:
import featurestream as fs
import random
import sys
import pprint
from collections import defaultdict

# Set the endpoint used by the featurestream client (fs); the address is
# hard-coded here.
fs.set_endpoint('http://192.168.2.3:8080')


def classify_test(f, n=4000, mink=0, maxk=100):
    stream = fs.start_stream(learner='rf_classifier', target='f')
    print 'n=', n, 'stream_id=', stream.stream_id
    # train phase
    for _ in xrange(n):
        j = random.randint(mink, maxk)
        event = {}
        event['value'] = j
        event['f'] = f(j)
        ok = stream.train(event)
        if not ok: return
    # test phase
    # todo

    accuracy = stream.stats('accuracy')
    stream.close()
    return accuracy


def train_threshold_event(n, label_prob, stream, k, d, attrs, thresholds):

    for _ in xrange(n):