コード例 #1
0
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data',
                             min_txts=200, max_txts=7000):
    """Group the texts of a JSON-lines file by topic and dump mid-sized topics.

    Every line of ``in_path`` is decoded as a JSON object and its ``'text'``
    value is handed to ``ST.parse_topic``.  Texts for which that call returns
    a falsy value are ignored; the others are grouped under the returned
    topic.

    Topics are then visited from the most to the fewest texts.  Topics with
    more than ``max_txts`` texts are skipped, and the walk stops at the first
    topic with fewer than ``min_txts`` texts.  For each remaining topic one
    ``{topic: text}`` JSON line per text is passed to
    ``ET.write_file(out_path, 'a', ...)``.
    NOTE(review): ``'a'`` is presumably an append mode -- confirm against ET.

    Args:
        in_path: Path of the input file, one JSON object per line.
        out_path: Path forwarded to ``ET.write_file``.
        min_txts: Smallest number of texts a topic needs to be written.
        max_txts: Largest number of texts a topic may have to be written.

    A short summary (counts and elapsed time) is printed at the end.
    """
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            txt = json.loads(line.strip())['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            # Count texts that carry a topic; without this the summary
            # below would always report zero.
            topic_cnt += 1
            topic2txt.setdefault(topic, []).append(txt)

    # Largest topics first, so the first too-small topic ends the walk.
    topics = sorted(topic2txt, key=lambda x: len(topic2txt[x]), reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > max_txts:
            continue
        if len(txts) < min_txts:
            # Every later topic is at most this large.
            break
        for txt in txts:
            ET.write_file(out_path, 'a', '%s\n' % json.dumps({t: txt}))

    # A single parenthesised argument prints the same on Python 2 and 3.
    print('total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt))
    print('topic cnt: %s' % len(topic2txt))
    print('time used: %.2f' % (time.time() - st_t))
コード例 #2
0
 def update_profile_topic(self, raw_stats, tags):
     """Tally, per topic, how often each tag occurs.

     ``raw_stats`` and ``tags`` are walked in lockstep.  The topic of each
     text comes from ``ST.parse_topic``; texts for which it returns a falsy
     value are ignored.  A topic seen for the first time starts with zeroed
     "P", "N" and "O" counters in ``self.profile_topic``, after which the
     counter named by the paired tag is incremented.
     """
     for text, label in zip(raw_stats, tags):
         parsed = ST.parse_topic(text)
         if parsed:
             # setdefault returns the stored counters, creating them on
             # first sight of the topic.
             counters = self.profile_topic.setdefault(
                 parsed, {"P":0,"N":0,"O":0})
             counters[label] += 1
コード例 #3
0
 def update_profile_topic(self, raw_stats, tags):
     """Update the per-topic tag counters from paired texts and tags.

     Each text in ``raw_stats`` is paired with the tag at the same position
     in ``tags``.  ``ST.parse_topic`` supplies the topic of the text; a
     falsy result means the pair is skipped.  ``self.profile_topic`` maps a
     topic to a dict of "P" / "N" / "O" counters, created at zero the first
     time the topic appears, and the counter keyed by the tag is bumped.
     """
     for text, label in zip(raw_stats, tags):
         key = ST.parse_topic(text)
         if key:
             if key not in self.profile_topic:
                 # First occurrence of this topic: start from zero.
                 self.profile_topic[key] = {"P": 0, "N": 0, "O": 0}
             self.profile_topic[key][label] += 1
コード例 #4
0
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data',
                             min_txts=200, max_txts=7000):
    """Bucket texts by topic and write out the topics of moderate size.

    The file at ``in_path`` is read line by line; each line is decoded as
    JSON and its ``'text'`` entry is passed to ``ST.parse_topic``.  A falsy
    result means the text has no topic and it is skipped.  All other texts
    are collected per topic.

    Topics are processed in descending order of their number of texts:
    those above ``max_txts`` are skipped, and processing stops at the first
    topic below ``min_txts``.  Each text of a kept topic is serialised as a
    ``{topic: text}`` JSON line and sent to
    ``ET.write_file(out_path, 'a', ...)``.
    NOTE(review): ``'a'`` looks like an append mode -- verify in ET.

    Args:
        in_path: Input file with one JSON object per line.
        out_path: Destination path given to ``ET.write_file``.
        min_txts: Minimum number of texts for a topic to be written.
        max_txts: Maximum number of texts for a topic to be written.

    Prints the total number of lines, the number of texts that had a
    topic, the number of distinct topics and the elapsed time.
    """
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            txt = dic['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            # Previously never incremented, so the summary always showed 0.
            topic_cnt += 1
            topic2txt.setdefault(topic, []).append(txt)

    # Descending by size: once a topic is too small, all the rest are too.
    topics = sorted(topic2txt,
                    key=lambda x: len(topic2txt[x]),
                    reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > max_txts:
            continue
        if len(txts) < min_txts:
            break
        for txt in txts:
            dic = {t: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))

    # One parenthesised argument behaves identically on Python 2 and 3.
    print('total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt))
    print('topic cnt: %s' % len(topic2txt))
    print('time used: %.2f' % (time.time() - st_t))
コード例 #5
0
ファイル: simulator.py プロジェクト: BrightSirius/sentiment
 def parse_topics_realtime(self):
     """Group every text in ``self.stats`` by its topic.

     ``self.stats`` is iterated as pairs whose second element is an
     iterable of texts; the first element is not used.  Each text is
     passed to ``ST.parse_topic`` and, when the result is truthy, appended
     to the list kept for that topic.  Two summary lines are printed: the
     number of texts seen together with the number that had a topic, and
     the number of distinct topics.

     Returns:
         A dict mapping each topic to the list of texts it was parsed from.
     """
     seen_total = 0
     seen_with_topic = 0
     grouped = {}
     for _, batch in self.stats:
         for text in batch:
             seen_total += 1
             parsed = ST.parse_topic(text)
             if parsed:
                 seen_with_topic += 1
                 grouped.setdefault(parsed, []).append(text)
     print('total cnt: %s. topic stats cnt: %s' % (seen_total, seen_with_topic))
     print('topic cnt: %s' % len(grouped))
     return grouped