def main():
    """Print a weighted time estimate for every window of the listed documents.

    The file named by ``sys.argv[1]`` is read line by line; each stripped
    line is prefixed with ``data/`` to form a document path.  For every
    ``(window, dt)`` pair yielded by ``windowed`` the tokens in ``window[2]``
    are joined with spaces, turned into a topic distribution by the loaded
    LDA model, and converted to ``dt_dist`` by ``time_dist``.  The value
    ``sum(i * (time_bin / 2) * p)`` over ``dt_dist`` is printed next to
    ``dt``.

    Raises:
        IndexError: if no command-line argument was supplied.
    """
    print("loading documents...")
    # Read the listing inside a context manager so the handle is closed
    # even if a later step fails (the original left it open).
    with open(sys.argv[1], 'r') as listing:
        documents = ['data/' + line.strip() for line in listing]
    print(documents)

    lda, time_model, prior = load_model(9)
    for window, dt in windowed(documents, window_size):
        doc = ' '.join(window[2])
        topic_dist = lda.doc_distribution(filter_tokenise(doc))
        dt_dist = time_dist(topic_dist, time_model, prior, limit=24 * 3 * 7)
        # NOTE(review): `time_bin / 2` truncates on Python 2 when time_bin is
        # an int -- confirm that is intended.
        expected = sum((i * (time_bin / 2)) * p for i, p in enumerate(dt_dist))
        # Single formatted argument keeps the "value dt" output of the
        # original two-item print statement.
        print("%s %s" % (expected, dt))
def main():
    """Report, per document window, the weighted sum of its time distribution.

    Document names come from the file given as the first command-line
    argument (one name per line, each prefixed with ``data/``).  Windows are
    produced by ``windowed(documents, window_size)``; for each one the text
    built from ``window[2]`` is tokenised, mapped to a topic distribution and
    passed to ``time_dist``.  Each output line holds the sum of
    ``i * (time_bin / 2) * p`` over the resulting distribution followed by
    the ``dt`` value yielded with the window.

    Raises:
        IndexError: if ``sys.argv`` has no first argument.
    """
    print("loading documents...")
    listing_path = sys.argv[1]
    documents = []
    # `with` guarantees the listing file is closed; the original never
    # closed the handle it iterated over.
    with open(listing_path, 'r') as handle:
        for raw_name in handle:
            documents.append('data/' + raw_name.strip())
    print(documents)

    lda, time_model, prior = load_model(9)
    # Lazily pair the joined text of each window with its dt.
    docs = ((' '.join(w[2]), dt) for w, dt in windowed(documents, window_size))
    horizon = 24 * 3 * 7
    for doc, dt in docs:
        tokens = filter_tokenise(doc)
        topic_dist = lda.doc_distribution(tokens)
        dt_dist = time_dist(topic_dist, time_model, prior, limit=horizon)
        # NOTE(review): on Python 2 an integer time_bin makes `time_bin / 2`
        # floor -- verify against the definition of time_bin.
        weighted = sum((i * (time_bin / 2)) * p for i, p in enumerate(dt_dist))
        print("%s %s" % (weighted, dt))
def _pr_error_rates(posts_times, visit_times, k):
    """Return ``(Pr_miss, Pr_fa)`` over all length-``k`` windows of the timeline.

    Windows start at every integer ``i`` in ``range(N - k)`` and cover
    ``[i, i + k)``, where ``N`` is the larger of the last recorded visit and
    post times.  A window counts as a miss when it holds more posts than
    visits and as a false alarm when it holds fewer.  ``Pr_miss`` is
    normalised by the number of windows containing at least one post,
    ``Pr_fa`` by the total number of windows.  A rate whose denominator is
    zero is reported as 0.0 instead of raising.
    """
    from bisect import bisect_left

    last_events = visit_times[-1:] + posts_times[-1:]
    # No events at all -> empty timeline rather than an IndexError.
    horizon = int(max(last_events)) if last_events else 0
    n_windows = max(horizon - k, 0)

    # Sorted copies let each window be counted with two binary searches
    # instead of a full scan of both lists.
    posts = sorted(posts_times)
    visits = sorted(visit_times)

    misses = 0
    false_alarms = 0
    windows_with_posts = 0
    for start in range(n_windows):
        end = start + k
        r = bisect_left(posts, end) - bisect_left(posts, start)
        h = bisect_left(visits, end) - bisect_left(visits, start)
        if r > 0:
            windows_with_posts += 1
        if r > h:
            misses += 1
        elif r < h:
            false_alarms += 1

    pr_miss = float(misses) / windows_with_posts if windows_with_posts else 0.0
    pr_fa = float(false_alarms) / float(n_windows) if n_windows else 0.0
    return pr_miss, pr_fa


def evaluate(threadfile, model, extractor, window_size=1, bandwidth=1000000,
             LAG_TIME=10, offset=0):
    """Replay a thread, schedule visits from model predictions, and score them.

    Posts are read with ``windowed([threadfile], window_size, offset)``; each
    yields a window and the delay ``d_t`` until the next post.  Visits are
    simulated between posts: the first visit happens ``LAG_TIME`` after time
    0, and each later visit time comes from ``model.predict`` (when unseen
    posts are buffered) or ``model.repredict``.  Post and visit times are
    written to the logs returned by ``timestamp_log`` and echoed to stdout.

    Args:
        threadfile: thread passed to ``windowed`` and used as the experiment
            key in ``model.add_experiment``.
        model: object providing ``predict``, ``repredict``,
            ``add_experiment`` and ``save``.
        extractor: object whose ``extract(window)`` yields the feature vector.
        window_size: forwarded to ``windowed``.
        bandwidth: accepted for interface compatibility; not used here.
        LAG_TIME: delay before the first simulated visit.
        offset: forwarded to ``windowed``.

    Returns:
        ``(Pr_error, visits)`` where ``Pr_error`` is the mean of the miss and
        false-alarm rates over 120-unit windows and ``visits`` is the number
        of simulated visits.
    """
    posts_log, visit_log, result_log = timestamp_log(
        'posts', 'visit', 'sliding_window')
    try:
        now = 0
        time_visit = now + LAG_TIME
        post_buffer = []
        visits = 0
        visit_times = []
        posts_times = []
        for window, d_t in windowed([threadfile], window_size, offset):
            # A post is made at `now`.
            print("%d\t-->" % now)
            posts_log.write("%d\n" % now)
            posts_times.append(now)
            # The next visit must always lie strictly after the current post.
            assert time_visit - now > 0
            time_post = now + d_t
            post_buffer.append(window)
            last_post_time = now
            while time_visit <= time_post:
                # A visit is made before the next post arrives.
                now = time_visit
                print("%d\t<--" % now)
                visits += 1
                visit_log.write("%d\n" % now)
                visit_times.append(now)
                if post_buffer:
                    # Predict from the newest unseen post, then mark all seen.
                    feature_vec = extractor.extract(post_buffer[-1])
                    d_visit = model.predict(feature_vec, d_t)
                    post_buffer = []
                else:
                    d_visit = model.repredict()
                p_from_last_post = last_post_time + d_visit
                if now < p_from_last_post:
                    time_visit = p_from_last_post
                else:
                    # Prediction already lies in the past: ask again and
                    # measure the new delay from the current visit instead.
                    d_visit = model.repredict()
                    time_visit = now + d_visit
            now = time_post

        Pr_miss, Pr_fa = _pr_error_rates(posts_times, visit_times, 120)
        Pr_error = 0.5 * Pr_miss + 0.5 * Pr_fa
        result_log.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
        model.add_experiment('prerror_test', threadfile, Pr_error)
        model.save()
        return Pr_error, visits
    finally:
        posts_log.close()
        visit_log.close()
        result_log.close()
def evaluate(threadfile, model, extractor, window_size=1, bandwidth=1000000,
             LAG_TIME=10, offset=0):
    """Simulate model-driven visits to a thread and return its Pr_error.

    Each ``(window, d_t)`` from ``windowed([threadfile], window_size,
    offset)`` is treated as a post followed by ``d_t`` time units until the
    next post.  While the scheduled visit time falls before the next post a
    visit is recorded and the following visit is scheduled from
    ``model.predict`` (if posts are waiting) or ``model.repredict``.  Times
    are logged via the files from ``timestamp_log`` and printed.

    After the replay, every window ``[i, i + 120)`` for ``i`` in
    ``range(N - 120)`` is compared: more posts than visits is a miss, fewer
    is a false alarm.  Rates with an empty denominator are reported as 0.0.

    Args:
        threadfile: thread to replay; also the key given to
            ``model.add_experiment``.
        model: provides ``predict``, ``repredict``, ``add_experiment``,
            ``save``.
        extractor: provides ``extract(window)``.
        window_size: forwarded to ``windowed``.
        bandwidth: unused in this function; kept for callers.
        LAG_TIME: time of the first visit relative to time 0.
        offset: forwarded to ``windowed``.

    Returns:
        Tuple ``(Pr_error, visits)``.
    """
    from bisect import bisect_left

    def count_between(sorted_times, lo, hi):
        # Number of entries t with lo <= t < hi.
        return bisect_left(sorted_times, hi) - bisect_left(sorted_times, lo)

    posts_log, visit_log, result_log = timestamp_log(
        'posts', 'visit', 'sliding_window')
    try:
        clock = 0
        time_visit = clock + LAG_TIME
        post_buffer = []
        visits = 0
        visit_times = []
        posts_times = []

        for window, d_t in windowed([threadfile], window_size, offset):
            # post being made
            print("%d\t-->" % clock)
            posts_log.write("%d\n" % clock)
            posts_times.append(clock)
            assert time_visit - clock > 0
            time_post = clock + d_t
            post_buffer.append(window)
            last_post_time = clock

            while time_visit <= time_post:
                # visit being made
                clock = time_visit
                print("%d\t<--" % clock)
                visits += 1
                visit_log.write("%d\n" % clock)
                visit_times.append(clock)

                if post_buffer:
                    feature_vec = extractor.extract(post_buffer[-1])
                    d_visit = model.predict(feature_vec, d_t)
                    post_buffer = []
                else:
                    d_visit = model.repredict()

                p_from_last_post = last_post_time + d_visit
                if clock < p_from_last_post:
                    time_visit = p_from_last_post
                else:
                    # Stale prediction: re-predict relative to this visit.
                    d_visit = model.repredict()
                    time_visit = clock + d_visit

            clock = time_post

        k = 120
        # Tolerate runs with no visits and/or no posts instead of indexing
        # into an empty list.
        tail = visit_times[-1:] + posts_times[-1:]
        N = int(max(tail)) if tail else 0
        total_windows = max(N - k, 0)

        # Sort once so each window costs two binary searches per list.
        sorted_posts = sorted(posts_times)
        sorted_visits = sorted(visit_times)

        sum_Phi = 0
        sum_Psi = 0
        sum_ref = 0
        for i in range(total_windows):
            r = count_between(sorted_posts, i, i + k)
            h = count_between(sorted_visits, i, i + k)
            if r > 0:
                sum_ref += 1
            if r > h:
                sum_Phi += 1
            elif r < h:
                sum_Psi += 1

        # Threads no longer than k, or with no post-bearing window, used to
        # divide by zero here.
        Pr_miss = float(sum_Phi) / sum_ref if sum_ref else 0.0
        Pr_fa = float(sum_Psi) / float(total_windows) if total_windows else 0.0
        Pr_error = 0.5 * Pr_miss + 0.5 * Pr_fa

        result_log.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
        model.add_experiment('prerror_test', threadfile, Pr_error)
        model.save()
        return Pr_error, visits
    finally:
        posts_log.close()
        visit_log.close()
        result_log.close()
'''
Created on Jul 19, 2012

@author: shawn
'''
from lib.io.reporting import get_directory
from lib.options import read_options
from lib.io.reader import windowed
from lib.io.util import load_from_file
import pickle


def save_model(filename, model):
    """Pickle ``model`` to ``filename`` inside the directory from ``get_directory()``."""
    path = "%s/%s" % (get_directory(), filename)
    # Context manager closes the file even when pickling fails.
    with open(path, 'wb') as f:
        pickle.dump(model, f)


def unpickle_model(filepath):
    """Return the object pickled at ``filepath``.

    ``filepath`` may be a path, which is opened in binary mode, or an
    already-open binary file object, which is read as-is.  The original
    passed the argument directly to ``pickle.load``, which only works for
    file objects.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    if hasattr(filepath, 'read'):
        return pickle.load(filepath)
    with open(filepath, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    o, args = read_options()
    extractor = load_from_file(o['extractor_name'], "Extractor")
    for window, d_t in windowed([o['test_file']], o['window_size']):
        # One formatted argument reproduces the "features d_t" output of the
        # original two-item print statement.
        print("%s %s" % (extractor.extract(window), d_t))
    # NOTE(review): placed after the loop; the flattened source does not show
    # whether the save was meant to run once or per window.
    extractor.save()
def evaluate(threadfile, model, extractor, window_size=1, bandwidth=1000000,
             LAG_TIME=10, offset=0, sliding_window_size=120, verbose=False):
    """Replay a thread with model-scheduled visits and collect several scores.

    Posts come from ``windowed([threadfile], window_size, offset)`` as
    ``(window, d_t)`` pairs.  Every post and visit time is written to the
    logs from ``timestamp_log`` and fed to a ``SlidingWindow(K=20,
    alpha=0.5)`` and a ``PairwiseScoring`` instance.  For each post the gap
    to the next scheduled visit is accumulated; its mean is the T-score.

    Args:
        threadfile: thread to replay; also the key for
            ``model.add_experiment``.
        model: provides ``predict``, ``add_experiment`` and ``save``.
        extractor: provides ``extract(window)``.
        window_size: forwarded to ``windowed``.
        bandwidth: unused in this function.
        LAG_TIME: time of the first visit relative to time 0.
        offset: forwarded to ``windowed``.
        sliding_window_size: unused in this function.
            NOTE(review): SlidingWindow is built with a fixed K=20 --
            confirm whether this parameter was meant to feed it.
        verbose: when true, echo post/visit times to stdout.

    Returns:
        dict with keys ``'T-score'``, ``'Pr_error'`` (a ``(Pr_miss, Pr_fa,
        Pr_error)`` tuple), ``'Visits'``, ``'Posts'`` and ``'Pairwise'``.

    Raises:
        ZeroDivisionError: if ``windowed`` yields no posts (the T-score mean
            has no samples).
    """
    posts_log, visit_log, result_log_tscore, result_log_window = timestamp_log(
        'posts', 'visit', 't_score', 'sliding_window')
    try:
        now = 0
        time_visit = now + LAG_TIME
        post_buffer = []
        t_score_cum = 0
        count = 0
        visits = 0
        w = SlidingWindow(K=20, alpha=0.5)
        ps = PairwiseScoring()

        for window, d_t in windowed([threadfile], window_size, offset):
            # post being made
            if verbose:
                print("%d\t-->" % now)
            posts_log.write("%d\n" % now)
            w.event('post', now)
            ps.event('post', now)
            assert time_visit - now > 0
            t_score_cum += time_visit - now
            count += 1
            time_post = now + d_t
            post_buffer.append((extractor.extract(window), d_t))
            last_post_time = now

            while time_visit <= time_post:
                # visit being made
                now = time_visit
                if verbose:
                    print("%d\t<--" % now)
                visits += 1
                visit_log.write("%d\n" % now)
                w.event('visit', now)
                ps.event('visit', now)

                # Refresh the feature vector when unseen posts are buffered;
                # otherwise reuse the one from the latest post.  The buffer is
                # always non-empty on the first visit after a post, so
                # feature_vec is bound before it is read.
                if post_buffer:
                    feature_vec, _ = post_buffer[-1]
                # Predict on every visit.  The flattened source does not show
                # whether this call was nested under `if post_buffer`; if it
                # was, a second visit in the same gap would add None to
                # last_post_time.
                d_visit = model.predict(
                    feature_vec, d_t,
                    current_d_t=now - last_post_time,
                    unseen=post_buffer[:-1])
                post_buffer = []
                time_visit = last_post_time + d_visit
                # The model must schedule the next visit strictly later.
                assert now < time_visit

            now = time_post

        Pr_miss, Pr_fa, Pr_error = w.pr_error()
        result_log_window.write(str(Pr_miss) + ' , ' + str(Pr_fa) + '\n')
        model.add_experiment('prerror_test', threadfile, Pr_error)

        # Score once and reuse the value for both the record and the result.
        pairwise = ps.score()
        model.add_experiment('pairwise_scoring', threadfile, pairwise)

        t_score = t_score_cum / float(count)
        result_log_tscore.write(str(t_score) + '\n')
        model.add_experiment('t-score_test', threadfile, t_score)

        model.save()
        return {
            'T-score': t_score,
            'Pr_error': (Pr_miss, Pr_fa, Pr_error),
            'Visits': visits,
            'Posts': count,
            'Pairwise': pairwise,
        }
    finally:
        posts_log.close()
        visit_log.close()
        result_log_tscore.close()
        result_log_window.close()