def real_test(self):
    self._test_xs, self._test_ys = ST.load_data(self.test_path)
    ST.replace_url(self._test_xs, fill='H')
    ST.replace_target(self._test_xs, fill='T')
    #x_y = [(self.discret_txt(txt), y) for txt, y in zip(self._test_xs, self._test_ys)]
    test_mat = self.build_sparse_X(self._test_xs)
    self.accuracy(test_mat, self._test_ys)
def parse_stats_emoticon(in_path='../stats/train_public_stats', out_path='../stats/emoticon_debug'):
    '''
    Study emoticon info from public stats
    '''
    st = time.time()
    icon2cnt = {}
    config = {"row_num": None}
    lines = ST.load_raw_data(in_path, **config)
    icon_line_cnt = 0
    for txt in lines:
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        icon_line_cnt += 1
        for icon in icons:
            icon2cnt.setdefault(icon, 0)
            icon2cnt[icon] += 1
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    icons = icon2cnt.keys()
    icons = sorted(icons, key=lambda x: icon2cnt[x], reverse=True)
    for icon in icons:
        cnt = icon2cnt[icon]
        write(out_path, 'a', '%s:%s\n' % (icon, cnt))
    linfo('end parse emoticons. total lines: %s. icon lines: %s. icons: %s.' % (len(lines), icon_line_cnt, len(icons)))
def debug(self):
    ws_1 = set(self.batch_extract_feature().keys())
    ST.remove_emoticon(self._train_xs)
    ws_2 = set(self.batch_extract_feature().keys())
    linfo('uni_bi_icon_feature_cnt: %s. no_icon: %s' % (len(ws_1), len(ws_2)))
    rms = ws_2 - ws_1
    for x in rms:
        print x
def train_discret_model(self, **config):
    linfo('begin train helper discret model: %s' % config)
    if not config['emoticon']:
        ST.remove_emoticon(self._train_xs)
    if not config['parenthesis']:
        ST.remove_parenthesis(self._train_xs)
    self.txt2bags = {}
    self.w2id = self.batch_extract_feature()
    linfo('end train helper discret model')
def format_test(self, emoticon=True, parenthesis=True):
    test_path = '../test_data/%s_test_data' % self.classifier_type
    self._test_xs, self._test_ys = ST.load_data(test_path)
    linfo('begin preprocess test data, then sparse')
    self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)
    #ST.replace_url(self._test_xs, fill='H')
    #ST.replace_target(self._test_xs, fill='T')
    self._test_ys = map(lambda x: self.tag2index[x], self._test_ys)
    self.format_sparse(self._test_xs, self._test_ys, '%s/test_data/%s%s_sparse_test_data_%s' % (project_dir, self.flag_prefix, self.classifier_type, 'icon' if emoticon else 'no_icon'))
def train(self):
    self._train_xs, self._train_ys = ST.load_data(self._path)
    if not self._emoticon:
        ST.remove_emoticon(self._train_xs)
    self.gram2gid = self._discretize_gram2gid()
    X = self.build_sparse_X(self._train_xs)
    self.clf.fit(X, self._train_ys)
    self.real_test()
def parse_emoticon_stats(in_path='../stats/train_public_stats', out_path='../stats/train_data_dg'):
    '''
    Parse stats with selected emoticons. Dump or visualise.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()
    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any([x in txt for x in excludes]):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        dis_match = filter(lambda x: x not in pos_icons and x not in neg_icons, icons)
        if dis_match:
            if len(set(dis_match)) >= 2:
                continue
        pos_match = filter(lambda x: x in txt, pos_icons)
        neg_match = filter(lambda x: x in txt, neg_icons)
        if (pos_match and neg_match) or (not pos_match and not neg_match):
            continue
        if pos_match:
            for icon in pos_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break
        if neg_match:
            for icon in neg_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break
    write = ET.write_file
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    pos_cnt = sum([len(icon2stat.get(x, [])) for x in pos_icons])
    neg_cnt = sum([len(icon2stat.get(x, [])) for x in neg_icons])
    icons = copy.copy(pos_icons)
    icons.extend(neg_icons)
    write(out_path, 'a', '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n' % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in icons:
        stats = icon2stat.get(icon, [])
        #write(out_path, 'a', '--------------------------------------\nicon: %s. stats_cnt: %s\n' % (icon, len(stats)))
        for stat in stats:
            dic = {'%s' % ('P' if icon in pos_icons else 'N'): stat}
            write(out_path, 'a', '%s\n' % json.dumps(dic))
def train(self, icon=True, cross=False):
    #word2cnt = BayesClassifier.Word2Cnt()
    #txt = '今天天气就是棒[哈哈] [太阳] [飞起来]#'
    #return
    #self._load_data()
    #self._replace_url(fill=True)
    self._train_xs, self._train_ys = ST.load_data(self._train_path)
    ST.replace_url(self._train_xs, fill=True)
    if not icon:
        ST.remove_emoticon(self._train_xs)
    self._train(cross_validation=cross)
def test():
    obj_stats_path = '../train_data/stat_obj_train_data'
    out_path = '../train_data/Dg_obj_stats'
    txts = []
    with open(obj_stats_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            txts.append(txt)
    linfo('obj stats count: %s' % (len(txts)))
    ST.replace_url(txts, fill='H')
    ST.replace_target(txts, fill='T')
    for x in txts:
        dic = {'O': x}
        write(out_path, 'a', '%s\n' % json.dumps(dic))
def parse_topic_public_stats(in_path='../stats/train_public_stats', out_path='../test_data/topic_test_data'):
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            txt = dic['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            topic_cnt += 1  # count stats that carry a topic
            topic2txt.setdefault(topic, list())
            topic2txt[topic].append(txt)
    topics = sorted(topic2txt.keys(), key=lambda x: len(topic2txt[x]), reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > 7000:
            continue
        #print t, topic2txt[t]
        if len(txts) < 200:
            break
        for txt in txts:
            dic = {t: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
    print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
    print 'topic cnt: %s' % len(topic2txt)
    print 'time used: %.2f' % (time.time() - st_t)
def update_profile_topic(self, raw_stats, tags):
    for txt, tag in zip(raw_stats, tags):
        topic = ST.parse_topic(txt)
        if not topic:
            continue
        self.profile_topic.setdefault(topic, {"P": 0, "N": 0, "O": 0})
        self.profile_topic[topic][tag] += 1
def ProfileRawData(path='../stats/train_public_stats'):
    '''
    calculate user, url, retweet, topic, redundant stat
    '''
    user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0)
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)
    w2c = {}
    for txt in lines:
        for x in txt:
            w2c.setdefault(x, 0)
            w2c[x] += 1
    print 'word cnt', len(w2c)
    out_path = 'word2cnt'
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            print 'special word: %s. cnt %s' % (w, c)
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))
    return
    # unreachable while the early return above is in place
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        if '#' in txt:
            st_i = txt.find('#')
            if txt.find('#', st_i + 1) != -1:
                topic_cnt += 1
    print 'user_cnt', user_cnt
    print 'url_cnt', url_cnt
    print 'topic_cnt', topic_cnt
    print 'time used', time.time() - st
def online_run(self, interval=10, peroid=0.5, quiet=True):
    '''
    return value: [(city, stat)...]
    '''
    stats_set = set()
    stats = []
    now = peroid
    cnt = 0
    while now < interval:
        try:
            rsp = self.retrieve('on', quiet=quiet)
            cnt += 1
            if rsp:
                for dic in rsp:
                    if dic['id'] not in stats_set:
                        city = ST.parse_spatial(dic)
                        item = (city, dic['text'])
                        stats.append(item)
                        stats_set.add(dic['id'])
        except Exception as e:
            logging.exception(e)
        now += peroid
        time.sleep(peroid * 60)
    linfo('online analysis %s new stats retrieved. retrieve cnt: %s' % (len(stats), cnt))
    return stats
def _predict(self, txt, train_w2c, train_t2c, debug=False, emoticon=True):
    #if emoticon and 'emoticon' not in self._ngrams_config:
    #    self._ngrams_config.append('emoticon')
    #elif not emoticon and 'emoticon' in self._ngrams_config:
    #    self._ngrams_config = filter(lambda x: x != 'emoticon', self._ngrams_config)
    #grams = self._retrieve_feature(txt)
    grams = ST.retrieve_feature(txt, feature_extract_config=self._ngrams_config, gram_icon_mixed=emoticon)
    if debug:
        linfo('begin debug case: %s' % txt)
    tag2score = {"P": 0, "N": 0, "O": 0}
    for w in grams:
        for tag in tag2score:
            if not train_t2c[tag]:
                continue
            score = self._cal_likelihood(train_w2c[tag].get(w, 0), train_t2c[tag])
            tag2score[tag] += score
            if debug:
                linfo('DEBUG probability for gram %s when given tag %s is: %.4f. gram cnt: %s.tag cnt: %s' % (w, tag, score, train_w2c[tag].get(w, 0), train_t2c[tag]))
    pred_tag = sorted(tag2score.keys(), key=lambda x: tag2score[x], reverse=True)[0]
    if debug:
        linfo('predict tag2score: %s' % tag2score)
    return pred_tag
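# _cal_likelihood is called in _predict above but is not shown in this section.
# A minimal sketch of what such a scorer could look like, assuming a Laplace-smoothed
# log probability (the per-gram scores are summed per tag in _predict, which suggests
# log space). The name, signature, and vocab_size parameter below are assumptions for
# illustration, not the project's actual implementation.
import math

def _cal_likelihood_sketch(gram_cnt, tag_cnt, vocab_size=100000, smooth=1.0):
    # log P(gram | tag) with add-one (Laplace) smoothing
    return math.log((gram_cnt + smooth) / (tag_cnt + smooth * vocab_size))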
def run(self, detail=False):
    self.train()
    for name, stat in self.stats:
        try:
            for clf, path in self.classifiers:
                linfo('----------roundly predict start-----------')
                raw_stat, stat = ST.preprocess(stat)
                union = [(raw, new) for raw, new in zip(raw_stat, stat) if new]
                raw_stat = map(lambda x: x[0], union)
                stat = map(lambda x: x[1], union)
                pred_tags = clf.predict(stat)
                if not pred_tags or len(pred_tags) != len(stat):
                    raise Exception('Predict Results Exception')
                tag2dist = self.cal_tag_dist(pred_tags)
                linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                save(path, 'a', '%s\t%s\t%s\n' % (name, json.dumps(tag2dist), len(stat)))
                if detail:
                    detail_path = '%s%s' % (stats_predict_detail_prefix, name)
                    if os.path.exists(detail_path):
                        os.system('rm %s' % detail_path)
                    for tag, txt in zip(pred_tags, raw_stat):
                        ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
                        #print tag, '-%s' % txt
                linfo('----------roundly predict end-----------')
        except Exception as e:
            lexcept('Unknown exception %s' % e)
def parse_city_public_stats(in_path='../stats/train_public_stats', out_path='../test_data/city_test_data'):
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    txts_upperbound = 1000
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            if dic['id'] in stat_ids:
                continue
            else:
                stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            city_stat_cnt += 1  # count stats with a resolvable city
            city2txt.setdefault(city, list())
            if len(city2txt[city]) >= txts_upperbound:
                continue
            city2txt[city].append(dic['text'])
    locs = sorted(city2txt.keys(), key=lambda x: len(city2txt[x]), reverse=True)
    print 'city_stat_cnt', city_stat_cnt
    print 'total_cnt', total_cnt
    print 'time used: %.2f' % (time.time() - st_t)
    citys = sorted(city2txt.keys())
    #for x in citys:
    #    print x, len(city2txt[x])
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    for x in locs:
        for txt in city2txt[x]:
            dic = {x: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
def run(self, sample_enabled=False, profile_enabled=False):
    for csf, path in self.classifiers:
        csf.train()
    action_day = ET.format_time(time.localtime())[:10]
    action_total_cnt = 0
    if profile_enabled:
        self.reset_profile()
    while True:
        try:
            stats = self.psr.online_run(interval=10)
            if not stats:
                continue
            linfo('-------roundly analysis----------')
            citys = map(lambda x: x[0], stats)
            stats = map(lambda x: x[1], stats)
            raw_stats, stats = ST.preprocess(stats)
            valid_ids = [i for i, txt in enumerate(stats) if txt]
            stats = map(lambda i: stats[i], valid_ids)
            raw_stats = map(lambda i: raw_stats[i], valid_ids)
            citys = map(lambda i: citys[i], valid_ids)
            f_t = ET.format_time(time.localtime())
            if sample_enabled:
                sample_path = '%srealtime_%s' % (sample_prefix, f_t.replace(' ', '').replace('-', '').replace(':', ''))
                ET.write_file(sample_path, 'a', '%s\n' % json.dumps(raw_stats[:300]))
            #only one model supported at the same time now.
            for clf, path in self.classifiers:
                tag2cnt = {'P': 0, 'N': 0, 'O': 0}
                pred_tags = clf.predict(stats)
                for tag in pred_tags:
                    tag2cnt[tag] += 1
                tag2dist = {tag: cnt * 1.0 / len(stats) for tag, cnt in tag2cnt.items()}
                linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                f_time = ET.format_time(time.localtime())
                today = f_time[:10]
                action_total_cnt = (action_total_cnt + len(pred_tags)) if today == action_day else len(pred_tags)
                save(path, 'a', '%s\t%s\t%s\n' % (f_time, json.dumps(tag2dist), len(stats)))
                if profile_enabled:
                    self.update_profile_spatial(citys, pred_tags)
                    self.update_profile_topic(raw_stats, pred_tags)
                    if today != action_day:
                        self.save_profile(action_day)
                        self.reset_profile()
                action_day = today
                break
        except Exception as e:
            lexcept('Unknown exception %s' % e)
def _feature_encoding(self, txt):
    bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
    #fs = {x:0 for x in gram2gid}
    fs = {}
    for gram in bags:
        if gram in self.gram2gid:
            fs[self.gram2gid[gram]] = 1
    return fs
def get_feature(self, txt, cache=False):
    if txt in self.txt2bags:
        bags = self.txt2bags[txt]
    else:
        bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
        if cache:
            self.txt2bags[txt] = bags
    return bags
def _discretize_gram2gid(self):
    w2id = {}
    for txt in self._train_xs:
        bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
        for w in bags:
            if w not in w2id:
                w2id[w] = len(w2id) + 1  # gram ids are 1-based; id 0 is never assigned
    linfo('grams cnt: %s' % len(w2id))
    return w2id
def discret_txt(self, txt):
    # gram ids are 1-based (see _discretize_gram2gid), so reserve index 0
    fs = [0 for x in range(len(self.gram2gid) + 1)]
    bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
    for w in bags:
        if w in self.gram2gid:
            wid = self.gram2gid[w]
            fs[wid] = 1
    return fs
def _cross_train(self, fold_sz):
    rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
    precision = 0
    for fid, sd in rid2shard.items():
        tmp_train_xs = [self._train_xs[i] for i in sd]
        tmp_train_ys = [self._train_ys[i] for i in sd]
        # NOTE: each fold is both trained and evaluated on the same shard
        test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i]) for i in sd]
        classifier = self._train(tmp_train_xs, tmp_train_ys)
        p = classify.accuracy(classifier, test_set)
        linfo('maxent classifier precision: %.4f' % p)
        precision += p
    linfo('average maxent classifier precision: %.4f' % (precision / fold_sz))
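# ST.random_shardlize is used above and in _train but is not shown in this section.
# Based on its call sites (callers index the result by fold id 1..fold_sz and iterate
# each value as a list of row indices), a minimal sketch of the assumed contract could
# look like the following; the name, seed parameter, and striding scheme are assumptions.
import random

def random_shardlize_sketch(fold_sz, total, seed=0):
    # map fold id (1-based) -> list of shuffled row indices
    ids = list(range(total))
    random.Random(seed).shuffle(ids)
    return {fid + 1: ids[fid::fold_sz] for fid in range(fold_sz)}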
def _all_train(self, total_word2cnt, total_tag2cnt):
    if os.path.exists(self._test_path):
        test_xs, test_ys = ST.load_data(self._test_path)
        #linfo('load manually tagged data count: %s' % len(test_xs))
    else:
        return
    test_t2c = {"P": 0, "N": 0, "O": 0}
    for y in test_ys:
        if y not in test_t2c:
            raise Exception('Key Error in tag2cnt. unknown key: %s' % y)
        test_t2c[y] += 1
    #print test_t2c
    return self._batch_predict(test_xs, test_ys, total_word2cnt, total_tag2cnt, test_t2c)
def parse_topics_realtime(self):
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    for name, txts in self.stats:
        for txt in txts:
            total_cnt += 1
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            topic_cnt += 1
            topic2txt.setdefault(topic, list())
            topic2txt[topic].append(txt)
    print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
    print 'topic cnt: %s' % len(topic2txt)
    return topic2txt
def _train(self, shard_sz=10, cross_validation=True):
    #print self._ngrams_config
    linfo('begin train classifier')
    st = time.time()
    rid2shard = ST.random_shardlize(shard_sz, len(self._train_xs), load=True, path=self.rand_path)
    #rid2word_info = {}
    #total_word2cnt = BayesClassifier.Word2Cnt()
    rid2tag_cnt, rid2word_presence = {}, {}
    total_word2presence = BayesClassifier.Word2Cnt()
    total_tag2cnt = {"P": 0, "N": 0, "O": 0}
    for rid in range(1, shard_sz + 1):
        shard = rid2shard[rid]
        #rid2word_info[rid]
        rid2tag_cnt[rid], rid2word_presence[rid] = self._cal_shard2info(shard)
        #for tag, w2c in rid2word_info[rid].items():
        #    for w, c in w2c.items():
        #        total_word2cnt[tag].setdefault(w, 0)
        #        total_word2cnt[tag][w] += c
        for tag, w2p in rid2word_presence[rid].items():
            for w, c in w2p.items():
                total_word2presence[tag].setdefault(w, 0)
                total_word2presence[tag][w] += c
        for tag, cnt in rid2tag_cnt[rid].items():
            total_tag2cnt[tag] += cnt
    #self._debug_bigram(total_word2presence)
    self._prune(total_word2presence, rid2word_presence, total_tag2cnt)
    self.total_w2c, self.total_t2c = total_word2presence, total_tag2cnt
    linfo(self.total_t2c)
    #cross_validation
    if cross_validation:
        linfo('begin cross validation')
        p, r, f = self._cross_train(total_word2presence, rid2word_presence, total_tag2cnt, rid2tag_cnt, shard_sz, rid2shard)
        linfo('Classifier METRIC trained-precision: %.4f. recall: %.4f. f-value: %.4f. train cost used: %.2f' % (p, r, f, time.time() - st))
    else:
        linfo('begin train and test with manually tagged data set')
        p, r, f = self._all_train(total_word2presence, total_tag2cnt)
        linfo('Manually Tag Data Classifier METRIC trained-precision: %.4f. recall: %.4f. f-value: %.4f. train cost used: %.2f' % (p, r, f, time.time() - st))
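# BayesClassifier.Word2Cnt is used in _train and _cal_shard2info but is not defined in
# this section. Its usage (word2presence[tag].setdefault(w, 0)) implies a per-tag dict
# of gram -> count keyed by the three sentiment tags; a minimal sketch of the assumed
# structure, purely for illustration:
def Word2Cnt_sketch():
    # per-tag gram counters: {"P": {gram: cnt}, "N": {...}, "O": {...}}
    return {"P": {}, "N": {}, "O": {}}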
def train(self, cross_validation=False, fold_sz=10, test_path='../../test_data/tri_test_data'):
    self._train_xs, self._train_ys = ST.load_data(self._path)
    if not self._emoticon:
        ST.remove_emoticon(self._train_xs)
    self.gram2gid = self._discretize_gram2gid()
    if cross_validation:
        linfo('begin to cross train')
        self._cross_train(fold_sz)
    else:
        classifier = self._train(self._train_xs, self._train_ys)
        self._test_xs, self._test_ys = ST.load_data(test_path)
        ST.replace_url(self._test_xs, fill='H')
        ST.replace_target(self._test_xs, fill='T')
        test_set = [(self._feature_encoding(txt), tag) for txt, tag in zip(self._test_xs, self._test_ys)]
        linfo('maxent classifier precision: %.4f' % classify.accuracy(classifier, test_set))
def __init__(self, ct='tri', prefix=''):
    if prefix and prefix != 'Dg_':
        raise Exception('INVALID PREFIX GIVEN!!!')
    self.flag_prefix = prefix
    self.train_data_path = '%s/train_data/%s%s_train_data' % (project_dir, prefix, ct)
    if ct not in ['bi', 'tri']:
        raise Exception('INVALID Classifier Type')
    self.classifier_type = ct
    self.tag2index = TAG2INDEX
    self._train_xs, self._train_ys = ST.load_data(self.train_data_path)
    self._train_ys = map(lambda x: self.tag2index[x], self._train_ys)
    #self._feature_extract_config = ['unigram', 'bigram']
    self._feature_extract_config = feature_config
    linfo('feature extract config: %s' % self._feature_extract_config)
    linfo('classifier type %s' % ct)
    linfo('init %s success' % self)
def _cal_shard2info(self, shard_indexs):
    #word2cnt = BayesClassifier.Word2Cnt()
    word2presence = BayesClassifier.Word2Cnt()
    #word_total_cnt = 0
    tag2cnt = {"P": 0, "N": 0, "O": 0}
    for index in shard_indexs:
        #word_total_cnt += len(x)
        txt = self._train_xs[index]
        tag = self._train_ys[index]
        tag2cnt[tag] += 1
        bags = ST.retrieve_feature(txt, feature_extract_config=self._ngrams_config)
        for w in bags:
            word2presence[tag].setdefault(w, 0)
            word2presence[tag][w] += 1
            #word2cnt[tag].setdefault(w, 0)
            #word2cnt[tag][w] += 1
    return tag2cnt, word2presence
def build_sparse_X(self, _xs):
    row_num = len(_xs)
    # gram ids are 1-based (see _discretize_gram2gid), so reserve column 0
    col_num = len(self.gram2gid) + 1
    rows, cols = [], []
    total_cnt = 0
    for i, txt in enumerate(_xs):
        bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
        for w in bags:
            if w in self.gram2gid:
                wid = self.gram2gid[w]
                rows.append(i)
                cols.append(wid)
                total_cnt += 1
    linfo('build scipy sparse matrix. total_valid_cnt: %s' % (total_cnt))
    row = np.array(rows)
    col = np.array(cols)
    data = np.array([1 for i in range(total_cnt)])
    mtx = sparse.csr_matrix((data, (row, col)), shape=(row_num, col_num))
    return mtx
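# Toy illustration (not project code) of the (data, (row, col)) -> CSR construction
# used by build_sparse_X above: three texts, a five-column gram space (column 0 is the
# reserved slot for the never-assigned id 0), binary presence features.
import numpy as np
from scipy import sparse

rows = np.array([0, 0, 1, 2, 2])      # text index of each feature hit
cols = np.array([1, 4, 2, 1, 3])      # gram id of each feature hit (1-based)
data = np.ones(len(rows), dtype=int)  # presence flag rather than a count
X = sparse.csr_matrix((data, (rows, cols)), shape=(3, 5))
print X.toarray()
# [[0 1 0 0 1]
#  [0 0 1 0 0]
#  [0 1 0 1 0]]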