Example 1
0
def main():
	print "loading documents..."
	documents	= ['data/'+i.strip() for i in open(sys.argv[1],'r')]
	print documents
	lda, time_model,prior = load_model(9)
	docs = ((' '.join(w[2]),dt) for w,dt in windowed(documents,window_size))
	
	for doc,dt in docs:
		topic_dist = lda.doc_distribution(filter_tokenise(doc))
		dt_dist    = time_dist(topic_dist,time_model,prior,limit=24*3*7)
		print sum((i*(time_bin/2)) * p for i,p in enumerate(dt_dist)), dt
Example 2
0
def main():
    print "loading documents..."
    documents = ['data/' + i.strip() for i in open(sys.argv[1], 'r')]
    print documents
    lda, time_model, prior = load_model(9)
    docs = ((' '.join(w[2]), dt) for w, dt in windowed(documents, window_size))

    for doc, dt in docs:
        topic_dist = lda.doc_distribution(filter_tokenise(doc))
        dt_dist = time_dist(topic_dist, time_model, prior, limit=24 * 3 * 7)
        print sum((i * (time_bin / 2)) * p for i, p in enumerate(dt_dist)), dt
Example 3
0
def evaluate(threadfile,
             model,
             extractor,
             window_size=1,
             bandwidth=1000000,
             LAG_TIME=10,
             offset=0):
    """Simulate a reader revisiting a thread and score the visit schedule.

    Replays the posts of *threadfile* window by window; after each post,
    the inner loop advances simulated visits (intervals predicted by
    *model*) until the next post time.  Every post/visit timestamp is
    written to the 'posts'/'visit' logs and a miss/false-alarm summary to
    the 'sliding_window' log.

    Returns a (Pr_error, visits) pair, where Pr_error is the average of
    the miss and false-alarm rates computed over k=120-wide sliding
    windows, and visits is the number of simulated visits.

    NOTE(review): *bandwidth* is accepted but never used in this body.
    NOTE(review): Pr_miss divides by sum_ref, which is zero when no
    sliding window contains a post -- that would raise ZeroDivisionError;
    confirm callers never pass such input.
    """
    posts_log, visit_log, result_log = timestamp_log('posts', 'visit',
                                                     'sliding_window')
    try:
        time = 0
        # The first visit is scheduled LAG_TIME after the simulation start.
        d_visit = LAG_TIME
        time_visit = time
        time_visit += d_visit
        post_buffer = []
        visits = 0

        visit_times = []
        posts_times = []
        for window, d_t in windowed([threadfile], window_size, offset):

            #post being made
            print "%d\t-->" % time
            posts_log.write("%d\n" % time)
            posts_times.append(time)

            # Invariant: the next scheduled visit is strictly in the future.
            assert (time_visit - time > 0)

            time_post = time + d_t
            post_buffer.append(window)

            last_post_time = time
            # Advance every visit that falls before the next post.
            while time_visit <= time_post:
                #visit being made
                time = time_visit
                print "%d\t<--" % time
                visits += 1
                visit_log.write("%d\n" % time)
                visit_times.append(time)

                # New posts since the last visit: predict from the newest
                # one and clear the buffer; otherwise re-predict.
                if post_buffer:
                    feature_vec = extractor.extract(post_buffer[-1])
                    d_visit = model.predict(feature_vec, d_t)
                    post_buffer = []
                else:
                    d_visit = model.repredict()

                p_from_last_post = last_post_time + d_visit

                # Schedule relative to the last post unless that moment has
                # already passed; in that case re-predict from now.
                if time < p_from_last_post:
                    time_visit = p_from_last_post
                else:
                    d_visit = model.repredict()
                    time_visit = time + d_visit

            time = time_post

        # Scoring: slide a k-wide window over [0, N) and compare the post
        # count (r) against the visit count (h) inside each window.
        k = 120
        N = int(max(visit_times[-1], posts_times[-1]))

        sum_Phi = 0
        sum_Psi = 0
        sum_ref = 0
        for i in range(N - k):
            r = len([j for j in posts_times if j >= i and j < i + k])
            h = len([j for j in visit_times if j >= i and j < i + k])
            if r > 0: sum_ref += 1
            if r > h: sum_Phi += 1
            elif r < h: sum_Psi += 1

        # Miss rate over windows that contained posts; false-alarm rate
        # over all windows.  See ZeroDivisionError note in the docstring.
        Pr_miss = float(sum_Phi) / sum_ref
        Pr_fa = float(sum_Psi) / float(N - k)

        Pr_error = 0.5 * Pr_miss + 0.5 * Pr_fa
        result_log.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
        model.add_experiment('prerror_test', threadfile, Pr_error)
        model.save()

        return Pr_error, visits
    except Exception:
        raise
    finally:
        # Always release the three log handles, even on failure.
        posts_log.close()
        visit_log.close()
        result_log.close()
def evaluate(threadfile, model, extractor, window_size = 1, bandwidth = 1000000, LAG_TIME = 10, offset=0):
	"""Simulate a reader revisiting a thread and score the visit schedule.

	Replays the posts of *threadfile* window by window; after each post,
	the inner loop advances simulated visits (intervals predicted by
	*model*) until the next post time.  Post/visit timestamps are written
	to the 'posts'/'visit' logs and the miss/false-alarm summary to the
	'sliding_window' log.

	Returns (Pr_error, visits): Pr_error averages the miss and
	false-alarm rates over k=120-wide sliding windows; visits is the
	number of simulated visits.

	NOTE(review): *bandwidth* is accepted but never used in this body.
	NOTE(review): Pr_miss divides by sum_ref, which is zero when no
	sliding window contains a post -- ZeroDivisionError; confirm callers
	never pass such input.
	"""
	posts_log, visit_log, result_log = timestamp_log(
			'posts',
			'visit',
			'sliding_window')
	try:
		time = 0
		# The first visit is scheduled LAG_TIME after the simulation start.
		d_visit = LAG_TIME
		time_visit = time
		time_visit += d_visit
		post_buffer = []
		visits = 0
		
		visit_times = []
		posts_times = []
		for window,d_t in windowed([threadfile],window_size,offset):

			#post being made
			print "%d\t-->"%time
			posts_log.write("%d\n"%time)
			posts_times.append(time)

			# Invariant: the next scheduled visit is strictly in the future.
			assert(time_visit - time > 0)

			time_post = time + d_t
			post_buffer.append(window)

			last_post_time = time
			# Advance every visit that falls before the next post.
			while time_visit <= time_post:
				#visit being made
				time = time_visit
				print "%d\t<--"%time
				visits += 1
				visit_log.write("%d\n"%time)
				visit_times.append(time)
				
				# New posts since the last visit: predict from the newest
				# one and clear the buffer; otherwise re-predict.
				if post_buffer:
					feature_vec = extractor.extract(post_buffer[-1])
					d_visit = model.predict(feature_vec,d_t)
					post_buffer = []
				else:
					d_visit = model.repredict()

				p_from_last_post     = last_post_time + d_visit

				# Schedule relative to the last post unless that moment
				# has already passed; then re-predict from now.
				if   time < p_from_last_post:
					time_visit = p_from_last_post
				else:
					d_visit = model.repredict()
					time_visit = time + d_visit

			time = time_post

		# Scoring: slide a k-wide window over [0, N) comparing the post
		# count (r) with the visit count (h) inside each window.
		k = 120
		N = int(max(visit_times[-1],posts_times[-1]))
		
		sum_Phi = 0
		sum_Psi = 0
		sum_ref = 0
		for i in range(N-k):
			r = len([j for j in posts_times if j >= i and j < i + k ])
			h = len([j for j in visit_times if j >= i and j < i + k ])
			if r > 0: sum_ref += 1
			if   r > h: sum_Phi += 1
			elif r < h: sum_Psi += 1
			
		# Miss rate over windows that contained posts; false-alarm rate
		# over all windows.  See ZeroDivisionError note in the docstring.
		Pr_miss = float(sum_Phi)/sum_ref
		Pr_fa   = float(sum_Psi)/float(N-k)
		
		
		Pr_error = 0.5*Pr_miss + 0.5*Pr_fa
		result_log.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
		model.add_experiment('prerror_test',threadfile,Pr_error)
		model.save()

		return Pr_error,visits
	except Exception:
		raise
	finally:
		# Always release the three log handles, even on failure.
		posts_log.close()
		visit_log.close()
		result_log.close()
'''
Created on Jul 19, 2012

@author: shawn
'''
from lib.io.reporting	import get_directory
from lib.options		import read_options
from lib.io.reader		import windowed
from lib.io.util		import load_from_file

import pickle
def save_model(filename,model):
	"""Pickle *model* into <get_directory()>/<filename>.

	The target directory is resolved via get_directory() from
	lib.io.reporting.
	"""
	# Fix: 'with' guarantees the handle is closed even if pickle.dump
	# raises; the original leaked it on failure.
	with open("%s/%s" % (get_directory(), filename), 'wb') as f:
		pickle.dump(model, f)

def unpickle_model(filepath):
	"""Load and return the pickled object stored at *filepath*.

	Bug fix: pickle.load() requires an open file object, but the
	original passed the path string straight through, which fails at
	runtime.  Open the file in binary mode (as pickle requires) and
	close it deterministically.
	"""
	with open(filepath, 'rb') as f:
		return pickle.load(f)

if __name__ == '__main__':
	# Smoke-test driver: for every window of the configured test file,
	# print the extracted feature vector together with its time delta,
	# then persist the extractor's state via its save() method.
	o,args = read_options()
	extractor   = load_from_file(o['extractor_name'], "Extractor")
	for window,d_t in windowed([o['test_file']],o['window_size']):
		print  extractor.extract(window),d_t
	extractor.save()
Example 6
0
def evaluate(threadfile, model, extractor,
			window_size = 1,
			bandwidth = 1000000,
			LAG_TIME = 10,
			offset = 0,
			sliding_window_size = 120,
			verbose = False
			):
	"""Simulate a reader revisiting a thread; score with sliding-window
	and pairwise metrics.

	Replays the posts of *threadfile* window by window, advancing
	simulated visits whose intervals come from model.predict(); every
	post/visit event is fed to a SlidingWindow scorer and a
	PairwiseScoring scorer and logged to the four timestamp logs.

	Returns a dict with 'T-score' (mean post-to-next-visit gap),
	'Pr_error' (the (Pr_miss, Pr_fa, Pr_error) triple from the sliding
	window), 'Visits', 'Posts', and 'Pairwise' (ps.score()).

	NOTE(review): *bandwidth* and *sliding_window_size* are accepted but
	never used in this body (the window scorer is built with K=20,
	alpha=0.5); confirm whether sliding_window_size was meant to feed
	SlidingWindow.
	"""
	posts_log, visit_log, result_log_tscore,result_log_window = timestamp_log(
			'posts',
			'visit',
			't_score',
			'sliding_window')
	try:
		time = 0
		# The first visit is scheduled LAG_TIME after the simulation start.
		d_visit = LAG_TIME
		time_visit = time
		time_visit += d_visit
		post_buffer = []
		
		# Accumulators for the mean post-to-next-visit gap (T-score).
		t_score_cum = 0
		count = 0
		visits = 0
		

		# NOTE(review): correct_count/wrong_count are only ever zero here;
		# the related return entry is commented out below.
		correct_count,wrong_count = 0,0
		w = SlidingWindow(K = 20, alpha = 0.5)
		ps = PairwiseScoring()
		for window,d_t in windowed([threadfile],window_size, offset):
			#post being made
			if verbose: print "%d\t-->"%time
			posts_log.write("%d\n"%time)
			w.event('post',time)
			ps.event('post',time)

			# Invariant: the next scheduled visit is strictly in the future.
			assert(time_visit - time > 0)
			t_score_cum += time_visit-time
			count += 1
			time_post = time + d_t
			post_buffer.append((extractor.extract(window),d_t))

			last_post_time = time
			
			
			# Advance every visit that falls before the next post.
			while time_visit <= time_post:
				#visit being made
				time = time_visit
				if verbose: print "%d\t<--"%time
				visits += 1
				visit_log.write("%d\n"%time)
				w.event('visit',time)
				ps.event('visit',time)
				#start correction
				# NOTE(review): this assignment is dead -- d_visit is
				# overwritten by model.predict() below on every pass.
				d_visit = None
				# When no new post arrived, feature_vec keeps its value
				# from the previous pass (stale feature reuse -- confirm
				# this is intentional).
				if post_buffer: feature_vec,_ = post_buffer[-1]
				d_visit = model.predict(
						feature_vec,d_t,
						current_d_t = time - last_post_time,
						unseen = post_buffer[:-1]
				)

				if post_buffer: post_buffer = []
				time_visit = last_post_time + d_visit
				
				# The model must always push the next visit into the future.
				assert(time < time_visit)
				
				#end correction
			time = time_post

		Pr_miss, Pr_fa, Pr_error = w.pr_error()
		result_log_window.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
		model.add_experiment('prerror_test',threadfile,Pr_error)
		model.add_experiment('pairwise_scoring',threadfile,ps.score())

		# Mean gap between a post and the visit scheduled after it.
		t_score = t_score_cum/float(count)
		result_log_tscore.write(str(t_score)+'\n')
		model.add_experiment('t-score_test',threadfile,t_score)
		#save_model(pickle_file,model)
		model.save()

		return {
			'T-score':  t_score,
			'Pr_error': (Pr_miss,Pr_fa,Pr_error),
			'Visits':   visits,
			'Posts':    count,
			'Pairwise': ps.score()
			#'Invalid Predictions': (correct_count+wrong_count,
							#	wrong_count/float(correct_count+wrong_count))
			}
	except Exception:
		raise
	finally:
		# Always release the four log handles, even on failure.
		posts_log.close()
		visit_log.close()
		result_log_tscore.close()
		result_log_window.close()