def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data',
                             max_txts=7000, min_txts=200):
    """Group the texts of a stats file by topic and dump mid-sized topics.

    Every line of ``in_path`` is parsed as a JSON object and its ``'text'``
    value is handed to ``ST.parse_topic``. Texts for which a truthy topic is
    returned are grouped under that topic. Topics are then visited from the
    most texts to the fewest; for each topic whose text count lies within
    ``[min_txts, max_txts]`` one JSON line ``{topic: text}`` per text is
    passed to ``ET.write_file(out_path, 'a', ...)``.

    Args:
        in_path: Path of the input file, one JSON object per line.
        out_path: Destination forwarded to ``ET.write_file``.
        max_txts: Topics with more texts than this are skipped.
        min_txts: Topics with fewer texts than this are not written.

    Returns:
        None. Prints the number of lines read, the number of texts that had
        a topic, the number of distinct topics and the elapsed time.
    """
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            txt = json.loads(line.strip())['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            # Count texts that carry a topic. This counter used to stay at 0
            # forever, which made the printed summary meaningless.
            topic_cnt += 1
            topic2txt.setdefault(topic, []).append(txt)

    # Largest topics first, so the lower bound below can stop the loop early.
    topics = sorted(topic2txt, key=lambda x: len(topic2txt[x]), reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > max_txts:
            continue
        if len(txts) < min_txts:
            # Every remaining topic is at most this large, so stop here.
            break
        for txt in txts:
            # NOTE(review): presumably ET.write_file appends the string to
            # out_path when given mode 'a' -- confirm against the helper.
            ET.write_file(out_path, 'a', '%s\n' % json.dumps({t: txt}))

    print('total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt))
    print('topic cnt: %s' % len(topic2txt))
    print('time used: %.2f' % (time.time() - st_t))
def update_profile_topic(self, raw_stats, tags):
    """Accumulate per-topic tag counts in ``self.profile_topic``.

    ``raw_stats`` and ``tags`` are walked in lockstep. Each text is passed to
    ``ST.parse_topic``; when a truthy topic comes back, the counter stored
    under that topic (created as ``{"P": 0, "N": 0, "O": 0}`` on first sight)
    is incremented at the key given by the paired tag.

    Args:
        raw_stats: Iterable of texts.
        tags: Iterable of tags paired positionally with ``raw_stats``; each
            tag is used as a key into the per-topic counter.
    """
    for text, label in zip(raw_stats, tags):
        topic = ST.parse_topic(text)
        if topic:
            counters = self.profile_topic.setdefault(
                topic, {"P": 0, "N": 0, "O": 0})
            counters[label] += 1
def update_profile_topic(self, raw_stats, tags):
    """Update the per-topic tag tallies kept in ``self.profile_topic``.

    Texts from ``raw_stats`` are paired positionally with ``tags``. For each
    text that ``ST.parse_topic`` maps to a truthy topic, the topic's counter
    dict (initialised to ``{"P": 0, "N": 0, "O": 0}`` when the topic is new)
    gets its entry for the paired tag increased by one.

    Args:
        raw_stats: Iterable of texts.
        tags: Iterable of tags, one per text, used as counter keys.
    """
    for text, label in zip(raw_stats, tags):
        topic = ST.parse_topic(text)
        if topic:
            tally = self.profile_topic.setdefault(
                topic, {"P": 0, "N": 0, "O": 0})
            tally[label] += 1
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data',
                             max_txts=7000, min_txts=200):
    """Group the texts of a stats file by topic and dump mid-sized topics.

    Every line of ``in_path`` is parsed as a JSON object and its ``'text'``
    value is handed to ``ST.parse_topic``. Texts for which a truthy topic is
    returned are grouped under that topic. Topics are then visited from the
    most texts to the fewest; for each topic whose text count lies within
    ``[min_txts, max_txts]`` one JSON line ``{topic: text}`` per text is
    passed to ``ET.write_file(out_path, 'a', ...)``.

    Args:
        in_path: Path of the input file, one JSON object per line.
        out_path: Destination forwarded to ``ET.write_file``.
        max_txts: Topics with more texts than this are skipped.
        min_txts: Topics with fewer texts than this are not written.

    Returns:
        None. Prints the number of lines read, the number of texts that had
        a topic, the number of distinct topics and the elapsed time.
    """
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            txt = json.loads(line.strip())['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            # Count texts that carry a topic. This counter used to stay at 0
            # forever, which made the printed summary meaningless.
            topic_cnt += 1
            topic2txt.setdefault(topic, []).append(txt)

    # Largest topics first, so the lower bound below can stop the loop early.
    topics = sorted(topic2txt, key=lambda x: len(topic2txt[x]), reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > max_txts:
            continue
        if len(txts) < min_txts:
            # Every remaining topic is at most this large, so stop here.
            break
        for txt in txts:
            # NOTE(review): presumably ET.write_file appends the string to
            # out_path when given mode 'a' -- confirm against the helper.
            ET.write_file(out_path, 'a', '%s\n' % json.dumps({t: txt}))

    print('total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt))
    print('topic cnt: %s' % len(topic2txt))
    print('time used: %.2f' % (time.time() - st_t))
def parse_topics_realtime(self):
    """Group the texts found in ``self.stats`` by their parsed topic.

    ``self.stats`` is iterated as ``(name, texts)`` pairs; the name is not
    used. Each text is passed to ``ST.parse_topic`` and, when a truthy topic
    is returned, appended to that topic's list.

    Returns:
        A dict mapping each topic to the list of texts that produced it, in
        the order they were encountered. Also prints the number of texts
        seen, the number that had a topic and the number of distinct topics.
    """
    seen = 0
    with_topic = 0
    grouped = {}
    for _name, texts in self.stats:
        for text in texts:
            seen += 1
            topic = ST.parse_topic(text)
            if topic:
                with_topic += 1
                grouped.setdefault(topic, []).append(text)
    print('total cnt: %s. topic stats cnt: %s' % (seen, with_topic))
    print('topic cnt: %s' % len(grouped))
    return grouped