def predict(self, stats):
    if isinstance(stats, str):
        stats = [stats]
    if not isinstance(stats, list):
        raise Exception('Invalid parameter given: %s' % stats)
    if not stats:
        return None
    # Dump one sparse-feature line per text, with a dummy tag of -1.
    ET.write_file(self.test_tmp_path, 'w', '')
    for txt in stats:
        features = self.model_helper.get_sparse_feature(txt)
        ET.write_file(self.test_tmp_path, 'a', '-1 %s\n' % ' '.join(features))
    cmd = '%s/linear_predict %s %s %s 1>>std.log 2>>err.log' % (
        model_dir, self.test_tmp_path, self.model_path, self.predict_tmp_path)
    linfo('predict cmd: %s' % cmd)
    ret = os.system(cmd)
    linfo('predict finish. return value: %s' % ret)
    if ret != 0:
        raise Exception('Fatal Error-Classifier predict FAIL')
    if os.path.exists(self.predict_tmp_path):
        with open(self.predict_tmp_path, 'r') as f:
            pred_tags = [line.strip() for line in f]
        ldebug('read predict results cnt: %s' % len(pred_tags))
        if len(pred_tags) != len(stats):
            raise Exception('Invalid pred results')
        os.system('rm %s' % self.predict_tmp_path)
        try:
            # Map the integer labels back to sentiment tags.
            return map(lambda x: INDEX2TAG[int(x)], pred_tags)
        except (ValueError, KeyError, IndexError):
            raise Exception('Invalid pred results')
    return None
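# A minimal usage sketch for predict(); `clf` stands for a trained classifier
# instance from this project (hypothetical here), and the 'P'/'N'/'O' tags
# follow the INDEX2TAG mapping used above.
def _demo_predict(clf):
    texts = [u'so happy today', u'this is terrible']
    # Accepts a single string or a list of strings; returns one tag per text.
    tags = clf.predict(texts)
    for tag, txt in zip(tags, texts):
        print tag, '-%s' % txt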
def run(self, detail=False):
    self.train()
    for name, stat in self.stats:
        try:
            for clf, path in self.classifiers:
                linfo('----------roundly predict start-----------')
                raw_stat, stat = ST.preprocess(stat)
                # Keep only pairs whose preprocessed text is non-empty.
                union = [(raw, new) for raw, new in zip(raw_stat, stat) if new]
                raw_stat = map(lambda x: x[0], union)
                stat = map(lambda x: x[1], union)
                pred_tags = clf.predict(stat)
                if not pred_tags or len(pred_tags) != len(stat):
                    raise Exception('Predict Results Exception')
                tag2dist = self.cal_tag_dist(pred_tags)
                linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                save(path, 'a', '%s\t%s\t%s\n' % (name, json.dumps(tag2dist), len(stat)))
                if detail:
                    detail_path = '%s%s' % (stats_predict_detail_prefix, name)
                    if os.path.exists(detail_path):
                        os.system('rm %s' % detail_path)
                    for tag, txt in zip(pred_tags, raw_stat):
                        ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
                linfo('----------roundly predict end-----------')
        except Exception as e:
            lexcept('Unknown exception %s' % e)
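# A minimal reader for the roundly distribution log written by run() above;
# each line is '<name or time>\t<json tag2dist>\t<count>'. The function name
# is a hypothetical illustration, not part of this project.
def _demo_load_dist_log(path):
    records = []
    with open(path, 'r') as f:
        for line in f:
            key, dist_json, cnt = line.rstrip('\n').split('\t')
            records.append((key, json.loads(dist_json), int(cnt)))
    return records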
def auto_tag_check(in_path='../test_data/objective_test_data',
                   out_path='../test_data/objective_test_data_final',
                   tag_log='tagger_check.log'):
    start_line = 0
    if os.path.exists(tag_log):
        with open(tag_log, 'r') as f:
            start_line = int(f.readline())
    linfo('start_line: %s' % start_line)
    with open(in_path, 'r') as f:
        print 'please tag following states with "P:Positive", "N:Negative", "O:Objective"'
        for num, line in enumerate(f):
            if num < start_line:
                continue
            # Persist progress so a later run resumes after this line.
            ET.write_file(tag_log, 'w', '%s' % (num + 1))
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            if '#' in txt:
                continue
            print '--------------'
            print tag, txt
            tag = raw_input()
            if tag in TAGS:
                item = {tag: txt}
                print '%s this state' % tag
                ET.write_file(out_path, 'a', '%s\n' % json.dumps(item))
            elif tag == 'Z':
                print 'exit'
                break
            else:
                print 'ignore this state'
def ProfileRawData(path='../stats/train_public_stats'):
    ''' calculate user, url, retweet, topic, redundant stat '''
    user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0)
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)
    # Count per-character frequency across the corpus.
    w2c = {}
    for txt in lines:
        for x in txt:
            w2c.setdefault(x, 0)
            w2c[x] += 1
    print 'word cnt', len(w2c)
    out_path = 'word2cnt'
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            # ',' would break the CSV layout below, so only report it.
            print 'special word: %s. cnt %s' % (w, c)
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))
    return
    # NOTE: the profiling pass below is unreachable while the early
    # return above is in place.
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        if '#' in txt:
            st_i = txt.find('#')
            if txt.find('#', st_i + 1) != -1:
                topic_cnt += 1
    print 'user_cnt', user_cnt
    print 'url_cnt', url_cnt
    print 'topic_cnt', topic_cnt
    print 'time used', time.time() - st
def debug_topic_public_stats(self, detail=False):
    topic2txt = self.parse_topics_test_data()
    topics = sorted(topic2txt.keys(), key=lambda x: len(topic2txt[x]), reverse=True)
    out_path = 'topic_test_tag_dist_format'
    self.train()
    for tp in topics:
        txts = topic2txt[tp]
        if len(txts) < 100:
            break  # topics are sorted by size, so the rest are smaller still
        for csf, path in self.classifiers:
            # Strip the topic marker itself before predicting so the
            # classifier judges only the surrounding text.
            txts_no_topic = map(lambda x: x.replace(tp, ''), txts)
            pred_tags = csf.predict(txts_no_topic)
            if not pred_tags or len(pred_tags) != len(txts):
                raise Exception('Predict Results Exception')
            tag2dist = self.cal_tag_dist(pred_tags)
            ET.write_file(out_path, 'a', '%s,%s,%.4f,%.4f,%.4f\n' % (
                tp, len(txts), tag2dist['O'], tag2dist['P'], tag2dist['N']))
            if detail:
                detail_path = 'topic_test_simulate'
                for tag, txt in zip(pred_tags, txts):
                    ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data'):
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            txt = dic['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            topic_cnt += 1
            topic2txt.setdefault(topic, list())
            topic2txt[topic].append(txt)
    topics = sorted(topic2txt.keys(), key=lambda x: len(topic2txt[x]), reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > 7000:
            continue  # skip outsized topics
        if len(txts) < 200:
            break  # topics are sorted by size, so the rest are smaller still
        for txt in txts:
            dic = {t: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
    print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
    print 'topic cnt: %s' % len(topic2txt)
    print 'time used: %.2f' % (time.time() - st_t)
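# A minimal reader for the '{topic: text}' JSON lines that
# parse_topic_public_stats() writes; the function name is a hypothetical
# illustration of what the debug helpers above consume.
def _demo_load_topic_test_data(path='../test_data/topic_test_data'):
    topic2txt = {}
    with open(path, 'r') as f:
        for line in f:
            tp, txt = json.loads(line.strip()).items()[0]
            topic2txt.setdefault(tp, []).append(txt)
    return topic2txt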
def parse_city_public_stats(in_path='../stats/train_public_stats',
                            out_path='../test_data/city_test_data'):
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    txts_upperbound = 1000
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            # Skip duplicate statuses by id.
            if dic['id'] in stat_ids:
                continue
            stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            city_stat_cnt += 1
            city2txt.setdefault(city, list())
            if len(city2txt[city]) >= txts_upperbound:
                continue
            city2txt[city].append(dic['text'])
    locs = sorted(city2txt.keys(), key=lambda x: len(city2txt[x]), reverse=True)
    print 'city_stat_cnt', city_stat_cnt
    print 'total_cnt', total_cnt
    print 'time used: %.2f' % (time.time() - st_t)
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    for x in locs:
        for txt in city2txt[x]:
            dic = {x: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
def run(self, sample_enabled=False, profile_enabled=False):
    for csf, path in self.classifiers:
        csf.train()
    action_day = ET.format_time(time.localtime())[:10]
    action_total_cnt = 0
    if profile_enabled:
        self.reset_profile()
    while True:
        try:
            stats = self.psr.online_run(interval=10)
            if not stats:
                continue
            linfo('-------roundly analysis----------')
            citys = map(lambda x: x[0], stats)
            stats = map(lambda x: x[1], stats)
            raw_stats, stats = ST.preprocess(stats)
            # Keep only indices whose preprocessed text is non-empty.
            valid_ids = [i for i, txt in enumerate(stats) if txt]
            stats = map(lambda i: stats[i], valid_ids)
            raw_stats = map(lambda i: raw_stats[i], valid_ids)
            citys = map(lambda i: citys[i], valid_ids)
            f_t = ET.format_time(time.localtime())
            if sample_enabled:
                sample_path = '%srealtime_%s' % (
                    sample_prefix,
                    f_t.replace(' ', '').replace('-', '').replace(':', ''))
                ET.write_file(sample_path, 'a', '%s\n' % json.dumps(raw_stats[:300]))
            # Only one model is supported at a time for now.
            for clf, path in self.classifiers:
                tag2cnt = {'P': 0, 'N': 0, 'O': 0}
                pred_tags = clf.predict(stats)
                for tag in pred_tags:
                    tag2cnt[tag] += 1
                tag2dist = {tag: cnt * 1.0 / len(stats)
                            for tag, cnt in tag2cnt.items()}
                linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                f_time = ET.format_time(time.localtime())
                today = f_time[:10]
                # Accumulate within the same day; reset when the day rolls over.
                action_total_cnt = (action_total_cnt + len(pred_tags)) \
                    if today == action_day else len(pred_tags)
                save(path, 'a', '%s\t%s\t%s\n' % (f_time, json.dumps(tag2dist), len(stats)))
                if profile_enabled:
                    self.update_profile_spatial(citys, pred_tags)
                    self.update_profile_topic(raw_stats, pred_tags)
                    if today != action_day:
                        self.save_profile(action_day)
                        self.reset_profile()
                action_day = today
                break
        except Exception as e:
            lexcept('Unknown exception %s' % e)
def format_sparse(self, _xs, _ys, out_path):
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    for txt, tag in zip(_xs, _ys):
        bags = self.get_feature(txt)
        features = self.discret_feature(bags)
        line = '%s %s' % (tag, ' '.join(features))
        ET.write_file(out_path, 'a', '%s\n' % line)
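# A minimal usage sketch; `helper` stands for the feature-extraction object
# that owns format_sparse() (hypothetical here), and the integer tags are
# placeholders for this project's label encoding.
def _demo_format_sparse(helper):
    xs = [u'good movie', u'bad movie']
    ys = [1, 2]
    helper.format_sparse(xs, ys, 'demo_sparse_file')
    # Each written line has the shape '<tag> <feat> <feat> ...', matching
    # what predict() writes to test_tmp_path (with a dummy tag of -1).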
def visual_stats(in_path='stats/public_stats', out_path='stats/simple_public_stats'):
    ''' Load public states and visualise '''
    lines = []
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            lines.append('user: %s. created at: %s. text: %s\n' % (
                dic['user']['name'], dic['created_at'], dic['text']))
    for line in lines:
        ET.write_file(out_path, 'a', line)
    print 'cnt of lines: %s' % len(lines)
def debug_city_public_stats(self, detail=False):
    city2txt = self.parse_citys_test_data()
    return
    # NOTE: the prediction pass below is unreachable while the early
    # return above is in place.
    out_path = 'city_test_tag_dist_format'
    self.train()
    for tp in city2txt:
        txts = city2txt[tp]
        for csf, path in self.classifiers:
            pred_tags = csf.predict(txts)
            if not pred_tags or len(pred_tags) != len(txts):
                raise Exception('Predict Results Exception')
            tag2dist = self.cal_tag_dist(pred_tags)
            ET.write_file(out_path, 'a', '%s,%s,%.4f,%.4f,%.4f\n' % (
                tp, len(txts), tag2dist['O'], tag2dist['P'], tag2dist['N']))
            if detail:
                detail_path = 'city_test_simulate'
                for tag, txt in zip(pred_tags, txts):
                    ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
def parse_objective_stats(in_path='../stats/public_stats', out_path='../stats/objective_train_data'):
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    st = time.time()
    cnt = 1
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            txt = dic['text']
            if any(x in txt for x in objective_excludes):
                continue
            if any(x in txt for x in objective_includes):
                dic = {'O': txt}
                ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
                cnt += 1
                if cnt > 25000:
                    break
    linfo('time used: %.2f. objective stats cnt: %s' % (time.time() - st, cnt))
def random_shardlize(cls, shard_sz, rand_cnt, path='rand_req', save=False, load=False):
    if shard_sz <= 1:
        raise Exception('invalid shard_sz for cross validation')
    if load:
        # Reuse a previously saved random assignment for reproducibility.
        with open(path, 'r') as f:
            line = f.readline().strip()
        rand_req = map(int, line.split(' '))
        if len(rand_req) != rand_cnt:
            raise Exception('Load rand_req fail. wrong results')
    else:
        rand_req = [random.randint(1, shard_sz) for i in range(rand_cnt)]
        if save:
            ET.write_file(path, 'w', '%s\n' % ' '.join(map(str, rand_req)))
    # Group sample indices by the shard id they were assigned.
    rid2shard = {}
    for i, rid in enumerate(rand_req):
        rid2shard.setdefault(rid, [])
        rid2shard[rid].append(i)
    return rid2shard
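# A minimal sketch of driving shard-based cross validation with
# random_shardlize(); `samples` and the train/evaluate steps are
# hypothetical placeholders.
def _demo_cross_validation(cls, samples, shard_sz=5):
    rid2shard = cls.random_shardlize(shard_sz, len(samples))
    for rid, test_ids in rid2shard.items():
        test_ids = set(test_ids)
        test_set = [s for i, s in enumerate(samples) if i in test_ids]
        train_set = [s for i, s in enumerate(samples) if i not in test_ids]
        # train on train_set, evaluate on test_set ...
        print 'shard %s: %s train / %s test' % (rid, len(train_set), len(test_set))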
def visual_test(in_path='../test_data/tri_test_data', out_path='../test_data/tri_visual_data'):
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    lines = []
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            lines.append('%s -%s\n' % (tag, txt))
    lines = sorted(lines)
    for line in lines:
        ET.write_file(out_path, 'a', line)
def batch_extract_feature(self, word2cnt_path=None, cnt_threshold=10):
    self.w2id = {}
    w2cnt = {}
    for txt in self._train_xs:
        bags = self.get_feature(txt, cache=True)
        for w in bags:
            w2cnt.setdefault(w, 0)
            w2cnt[w] += 1
            if w not in self.w2id:
                # Feature ids are 1-based, assigned in first-seen order.
                self.w2id[w] = len(self.w2id) + 1
    if word2cnt_path:
        if os.path.exists(word2cnt_path):
            os.system('rm %s' % word2cnt_path)
        words = sorted(w2cnt.keys(), key=lambda x: w2cnt[x], reverse=True)
        for w in words:
            ET.write_file(word2cnt_path, 'a', '%s %s\n' % (w, w2cnt[w]))
    # Frequency filtering by cnt_threshold is currently disabled:
    #self.w2id = {w: d for w, d in self.w2id.items() if w2cnt[w] >= cnt_threshold}
    linfo('gram cnt: %s' % len(self.w2id))
    return self.w2id
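# A minimal sketch of inspecting the vocabulary built by
# batch_extract_feature(); `helper` is a hypothetical instance whose
# _train_xs has already been populated.
def _demo_feature_ids(helper):
    w2id = helper.batch_extract_feature()
    # Ids are 1-based and feed the sparse feature lines used elsewhere.
    for w, fid in sorted(w2id.items(), key=lambda x: x[1])[:10]:
        print fid, w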
def save_profile(self, action_day):
    # action_day format: 2016-01-01
    action_day = action_day.replace('-', '')
    city_path = '%scity/city_%s' % (profile_prefix, action_day)
    linfo('save spatial profile: %s' % city_path)
    ET.write_file(city_path, 'w', '')
    for city, dist in self.profile_city.items():
        ET.write_file(city_path, 'a', '%s,%s,%s,%s\n' % (
            city, dist['O'], dist['P'], dist['N']))
    topic_path = '%stopic/topic_%s' % (profile_prefix, action_day)
    linfo('save topic profile: %s' % topic_path)
    ET.write_file(topic_path, 'w', '')
    for tp, dist in self.profile_topic.items():
        ET.write_file(topic_path, 'a', '%s,%s,%s,%s\n' % (
            tp.replace(',', '-'), dist['O'], dist['P'], dist['N']))
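# A minimal reader for the '<key>,O,P,N' CSV lines that save_profile()
# writes for both city and topic profiles; the function name is a
# hypothetical illustration.
def _demo_load_profile(path):
    key2dist = {}
    with open(path, 'r') as f:
        for line in f:
            key, o, p, n = line.strip().split(',')
            key2dist[key] = {'O': float(o), 'P': float(p), 'N': float(n)}
    return key2dist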
def write_records(path, mode, txt):
    # Serialise concurrent writers; the `with` block releases the lock
    # even if write_file raises.
    with local_lock:
        ET.write_file(path, mode, txt)
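# A minimal sketch showing why write_records() serialises on local_lock:
# without it, concurrent appends could interleave partial lines. The demo
# path and thread count are arbitrary illustrations.
import threading

def _demo_concurrent_writes(path='demo_records'):
    threads = [threading.Thread(target=write_records, args=(path, 'a', 'line-%d\n' % i))
               for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()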