Example #1
    def predict(self, stats):
        # Accept a single status string or a list of status strings.
        if isinstance(stats, str):
            stats = [stats]
        if not isinstance(stats, list):
            raise Exception('Invalid parameter given: %s' % stats)
        if not stats:
            return None
        # Dump features in liblinear's sparse format; the leading "-1" is
        # a placeholder label that does not affect the predicted output.
        ET.write_file(self.test_tmp_path, 'w', '')
        for txt in stats:
            features = self.model_helper.get_sparse_feature(txt)
            ET.write_file(self.test_tmp_path, 'a',
                          '-1 %s\n' % ' '.join(features))
        cmd = '%s/linear_predict %s %s %s 1>>std.log 2>>err.log' % (
            model_dir, self.test_tmp_path, self.model_path,
            self.predict_tmp_path)
        linfo('predict cmd: %s' % cmd)
        ret = os.system(cmd)
        linfo('predict finish. return value: %s' % ret)
        if ret != 0:
            raise Exception('Fatal Error-Classifier predict FAIL')
        if os.path.exists(self.predict_tmp_path):
            with open(self.predict_tmp_path, 'r') as f:
                pred_tags = [line.strip() for line in f]
            ldebug('read predict results cnt: %s' % len(pred_tags))
            if len(pred_tags) != len(stats):
                raise Exception('Invalid pred results')
            os.system('rm %s' % self.predict_tmp_path)
            try:
                # Map numeric labels back to sentiment tags.
                return map(lambda x: INDEX2TAG[int(x)], pred_tags)
            except (ValueError, LookupError):
                raise Exception('Invalid pred results')
        return None
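The final map() assumes liblinear wrote one numeric label per line; a self-contained sketch of that decode step (the INDEX2TAG values below are illustrative assumptions, not taken from the source):

INDEX2TAG = {0: 'O', 1: 'P', -1: 'N'}  # assumed mapping, for illustration only
pred_lines = ['1', '-1', '0']
print map(lambda x: INDEX2TAG[int(x)], pred_lines)  # ['P', 'N', 'O']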
Example #2
    def run(self, detail=False):
        self.train()
        for name, stat in self.stats:
            try:
                for clf, path in self.classifiers:
                    linfo('----------roundly predict start-----------')
                    # Drop statuses that become empty after preprocessing,
                    # keeping raw and cleaned texts aligned.
                    raw_stat, stat = ST.preprocess(stat)
                    union = [(raw, new) for raw, new in zip(raw_stat, stat) if new]
                    raw_stat = map(lambda x: x[0], union)
                    stat = map(lambda x: x[1], union)
                    pred_tags = clf.predict(stat)
                    if not pred_tags or len(pred_tags) != len(stat):
                        raise Exception('Predict Results Exception')
                    tag2dist = self.cal_tag_dist(pred_tags)
                    linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                    save(path, 'a', '%s\t%s\t%s\n' % (name, json.dumps(tag2dist), len(stat)))
                    if detail:
                        detail_path = '%s%s' % (stats_predict_detail_prefix, name)
                        if os.path.exists(detail_path):
                            os.system('rm %s' % detail_path)
                        for tag, txt in zip(pred_tags, raw_stat):
                            ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
                    linfo('----------roundly predict end-----------')
            except Exception as e:
                lexcept('Unknown exception %s' % e)
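cal_tag_dist is project code; judging from the equivalent inline computation in Example #8 below, it presumably returns each tag's share of the predictions. A minimal self-contained sketch under that assumption:

def cal_tag_dist(pred_tags):
    # Fraction of P/N/O tags among the predictions.
    tag2cnt = {'P': 0, 'N': 0, 'O': 0}
    for tag in pred_tags:
        tag2cnt[tag] += 1
    return {tag: cnt * 1.0 / len(pred_tags) for tag, cnt in tag2cnt.items()}

print cal_tag_dist(['P', 'P', 'N', 'O'])  # {'P': 0.5, 'O': 0.25, 'N': 0.25}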
Example #3
def auto_tag_check(in_path='../test_data/objective_test_data',
                   out_path='../test_data/objective_test_data_final',
                   tag_log='tagger_check.log'):
    # Resume from the line number recorded by a previous session.
    start_line = 0
    if os.path.exists(tag_log):
        with open(tag_log, 'r') as f:
            line = f.readline()
            start_line = int(line)
    linfo('st_line: %s' % start_line)

    with open(in_path, 'r') as f:
        print 'please tag the following statuses with "P:Positive", "N:Negative", "O:Objective"'
        for num, line in enumerate(f):
            if num < start_line:
                continue
            # Record progress so an interrupted session can resume.
            ET.write_file(tag_log, 'w', '%s' % (num + 1))
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            if '#' in txt:
                continue
            print '--------------'
            print tag, txt
            tag = raw_input()
            if tag in TAGS:
                item = {tag: txt}
                print '%s this status' % tag
                ET.write_file(out_path, 'a', '%s\n' % json.dumps(item))
            elif tag == 'Z':
                print 'exit'
                break
            else:
                print 'ignore this status'
Example #4
def ProfileRawData(path='../stats/train_public_stats'):
    '''
    calculate user, url, retweet, topic, redundant stat
    '''
    user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0)
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)
    # Per-character frequency over the whole corpus.
    w2c = {}
    for txt in lines:
        for x in txt:
            w2c.setdefault(x, 0)
            w2c[x] += 1
    print 'word cnt', len(w2c)
    out_path = 'word2cnt'
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            # Skip the comma itself: it is the CSV separator below.
            print 'special word: %s. cnt %s' % (w, c)
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))
    # Profile mentions, URLs, and topics.
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        if '#' in txt:
            # A topic needs a second '#' to close the '#...#' span.
            st_i = txt.find('#')
            if txt.find('#', st_i + 1) != -1:
                topic_cnt += 1
    print 'user_cnt', user_cnt
    print 'url_cnt', url_cnt
    print 'topic_cnt', topic_cnt
    print 'time used', time.time() - st
Example #5
    def debug_topic_public_stats(self, detail=False):
        topic2txt = self.parse_topics_test_data()

        # Largest topics first; stop once a topic has fewer than 100 statuses.
        topics = sorted(topic2txt.keys(), key=lambda x: len(topic2txt[x]), reverse=True)
        out_path = 'topic_test_tag_dist_format'
        self.train()
        for tp in topics:
            txts = topic2txt[tp]
            if len(txts) < 100:
                break
            for csf, path in self.classifiers:
                # Strip the topic hashtag itself so it does not bias prediction.
                txts_no_topic = map(lambda x: x.replace(tp, ''), txts)
                pred_tags = csf.predict(txts_no_topic)
                if not pred_tags or len(pred_tags) != len(txts):
                    raise Exception('Predict Results Exception')
                tag2dist = self.cal_tag_dist(pred_tags)
                ET.write_file(out_path, 'a', '%s,%s,%.4f,%.4f,%.4f\n' % (tp, len(txts), tag2dist['O'], tag2dist['P'], tag2dist['N']))
                if detail:
                    detail_path = 'topic_test_simulate'
                    for tag, txt in zip(pred_tags, txts):
                        ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
Example #6
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data'):
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            txt = dic['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            topic_cnt += 1
            topic2txt.setdefault(topic, list())
            topic2txt[topic].append(txt)

    # Keep mid-sized topics: skip any with more than 7000 statuses and
    # stop once a topic drops below 200.
    topics = sorted(topic2txt.keys(),
                    key=lambda x: len(topic2txt[x]),
                    reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > 7000:
            continue
        if len(txts) < 200:
            break
        for txt in txts:
            dic = {t: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))

    print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
    print 'topic cnt: %s' % len(topic2txt)
    print 'time used: %.2f' % (time.time() - st_t)
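ST.parse_topic is project code; a plausible self-contained sketch of pulling out a '#...#'-delimited Weibo topic (an assumption, not the project's actual implementation):

def parse_topic(txt):
    # A topic is the first '#...#' span, hash marks included.
    st = txt.find('#')
    ed = txt.find('#', st + 1) if st != -1 else -1
    return txt[st:ed + 1] if ed != -1 else None

print parse_topic(u'rainy day #weather# again')  # prints: #weather#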
Example #7
def parse_city_public_stats(in_path='../stats/train_public_stats',
                            out_path='../test_data/city_test_data'):
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    txts_upperbound = 1000
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            # Skip duplicate statuses by id.
            if dic['id'] in stat_ids:
                continue
            stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            city_stat_cnt += 1
            city2txt.setdefault(city, list())
            # Cap each city at txts_upperbound statuses.
            if len(city2txt[city]) >= txts_upperbound:
                continue
            city2txt[city].append(dic['text'])
    locs = sorted(city2txt.keys(), key=lambda x: len(city2txt[x]), reverse=True)
    print 'city_stat_cnt', city_stat_cnt
    print 'total_cnt', total_cnt
    print 'time used: %.2f' % (time.time() - st_t)
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    for x in locs:
        for txt in city2txt[x]:
            dic = {x: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
Example #8
    def run(self, sample_enabled=False, profile_enabled=False):
        for csf, path in self.classifiers:
            csf.train()
        action_day = ET.format_time(time.localtime())[:10]
        action_total_cnt = 0
        if profile_enabled:
            self.reset_profile()
        while True:
            try:
                stats = self.psr.online_run(interval=10)
                if not stats:
                    continue

                linfo('-------roundly analysis----------')
                citys = map(lambda x: x[0], stats)
                stats = map(lambda x: x[1], stats)
                raw_stats, stats = ST.preprocess(stats)
                valid_ids = [i for i, txt in enumerate(stats) if txt]
                stats = map(lambda i: stats[i], valid_ids)
                raw_stats = map(lambda i: raw_stats[i], valid_ids)
                citys = map(lambda i: citys[i], valid_ids)
                f_t = ET.format_time(time.localtime())
                if sample_enabled:
                    sample_path = '%srealtime_%s' % (
                        sample_prefix, f_t.replace(' ', '').replace(
                            '-', '').replace(':', ''))
                    ET.write_file(sample_path, 'a',
                                  '%s\n' % json.dumps(raw_stats[:300]))

                # Only one model is supported at a time for now.
                for clf, path in self.classifiers:
                    tag2cnt = {'P': 0, 'N': 0, 'O': 0}
                    pred_tags = clf.predict(stats)
                    for tag in pred_tags:
                        tag2cnt[tag] += 1
                    tag2dist = {
                        tag: cnt * 1.0 / len(stats)
                        for tag, cnt in tag2cnt.items()
                    }
                    linfo('%s-roundly online sentiment distribution: %s' %
                          (clf, tag2dist))
                    f_time = ET.format_time(time.localtime())
                    today = f_time[:10]
                    action_total_cnt = (
                        action_total_cnt + len(pred_tags)
                    ) if today == action_day else len(pred_tags)
                    save(
                        path, 'a', '%s\t%s\t%s\n' %
                        (f_time, json.dumps(tag2dist), len(stats)))
                    if profile_enabled:
                        self.update_profile_spatial(citys, pred_tags)
                        self.update_profile_topic(raw_stats, pred_tags)
                        if today != action_day:
                            self.save_profile(action_day)
                            self.reset_profile()
                            action_day = today
                    break
            except Exception as e:
                lexcept('Unknown exception %s' % e)
Example #9
    def format_sparse(self, _xs, _ys, out_path):
        # Write one "label feat1 feat2 ..." line per sample.
        if os.path.exists(out_path):
            os.system('rm %s' % out_path)
        for txt, tag in zip(_xs, _ys):
            bags = self.get_feature(txt)
            features = self.discret_feature(bags)
            line = '%s %s' % (tag, ' '.join(features))
            ET.write_file(out_path, 'a', '%s\n' % line)
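For reference, the layout this produces is liblinear's sparse training format, assuming discret_feature yields 'index:value' strings (an assumption; the real encoding is project-specific):

samples = [('1', ['3:1', '17:1']), ('-1', ['5:1'])]  # toy (label, features) pairs
for tag, features in samples:
    print '%s %s' % (tag, ' '.join(features))
# 1 3:1 17:1
# -1 5:1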
Example #10
def visual_stats(in_path='stats/public_stats',
                 out_path='stats/simple_public_stats'):
    '''
    Load public statuses and write a human-readable summary
    '''
    lines = []
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            lines.append('user: %s. created at: %s. text: %s\n' %
                         (dic['user']['name'], dic['created_at'], dic['text']))
    for line in lines:
        ET.write_file(out_path, 'a', line)
    print 'cnt of lines: %s' % len(lines)
Example #11
    def debug_city_public_stats(self, detail=False):
        city2txt = self.parse_citys_test_data()
        out_path = 'city_test_tag_dist_format'
        self.train()
        for tp in city2txt:
            txts = city2txt[tp]
            for csf, path in self.classifiers:
                pred_tags = csf.predict(txts)
                if not pred_tags or len(pred_tags) != len(txts):
                    raise Exception('Predict Results Exception')
                tag2dist = self.cal_tag_dist(pred_tags)
                ET.write_file(out_path, 'a', '%s,%s,%.4f,%.4f,%.4f\n' % (tp, len(txts), tag2dist['O'], tag2dist['P'], tag2dist['N']))
                if detail:
                    detail_path = 'city_test_simulate'
                    for tag, txt in zip(pred_tags, txts):
                        ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
Example #12
def parse_objective_stats(in_path='../stats/public_stats',
                          out_path='../stats/objective_train_data'):
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    st = time.time()
    cnt = 0
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            txt = dic['text']
            # Keyword filters: drop statuses matching objective_excludes,
            # keep those matching objective_includes.
            if any(x in txt for x in objective_excludes):
                continue
            if any(x in txt for x in objective_includes):
                dic = {'O': txt}
                ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
                cnt += 1
            # Cap the training set at 25000 statuses.
            if cnt >= 25000:
                break
    linfo('time used: %.2f. objective stats cnt: %s' % (time.time() - st, cnt))
Example #13
    def random_shardlize(cls, shard_sz, rand_cnt, path='rand_req', save=False, load=False):
        # Assign each of rand_cnt sample indices a random shard id in
        # [1, shard_sz]; optionally persist or reload the assignment.
        if shard_sz <= 1:
            raise Exception('invalid shard_sz for cross validation')
        if load:
            with open(path, 'r') as f:
                line = f.readline().strip()
                rand_req = map(int, line.split(' '))
                if len(rand_req) != rand_cnt:
                    raise Exception('Load rand_req fail. wrong results')
        else:
            rand_req = [random.randint(1, shard_sz) for i in range(rand_cnt)]
        if save:
            ET.write_file(path, 'w', '%s\n' % ' '.join(map(str, rand_req)))

        # Group sample indices by shard id.
        rid2shard = {}
        for i, rid in enumerate(rand_req):
            rid2shard.setdefault(rid, [])
            rid2shard[rid].append(i)
        return rid2shard
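A self-contained usage sketch of the returned mapping: split nine sample indices into three shards, then hold one shard out for a cross-validation round (toy data, fixed seed):

import random

random.seed(0)
rand_req = [random.randint(1, 3) for i in range(9)]
rid2shard = {}
for i, rid in enumerate(rand_req):
    rid2shard.setdefault(rid, []).append(i)
test_ids = set(rid2shard.get(1, []))
train_ids = [i for i in range(9) if i not in test_ids]
print test_ids, train_ids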
Example #14
def visual_test(in_path='../test_data/tri_test_data',
                out_path='../test_data/tri_visual_data'):
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    lines = []
    with open(in_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            lines.append('%s -%s\n' % (tag, txt))
    # Sort so statuses come out grouped by tag.
    lines.sort()
    for line in lines:
        ET.write_file(out_path, 'a', line)
Example #15
    def batch_extract_feature(self, word2cnt_path=None, cnt_threshold=10):
        # Build the gram -> feature-id mapping over the training set.
        self.w2id = {}
        w2cnt = {}
        for txt in self._train_xs:
            bags = self.get_feature(txt, cache=True)
            for w in bags:
                w2cnt.setdefault(w, 0)
                w2cnt[w] += 1
                if w not in self.w2id:
                    self.w2id[w] = len(self.w2id) + 1
        if word2cnt_path:
            if os.path.exists(word2cnt_path):
                os.system('rm %s' % word2cnt_path)
            words = sorted(w2cnt.keys(), key=lambda x: w2cnt[x], reverse=True)
            for w in words:
                cnt = w2cnt[w]
                ET.write_file(word2cnt_path, 'a', '%s %s\n' % (w, cnt))
        # Frequency filtering is disabled, so cnt_threshold is currently unused:
        #self.w2id = {w: d for w, d in self.w2id.items() if w2cnt[w] >= cnt_threshold}
        linfo('gram cnt: %s' % len(self.w2id))
        return self.w2id
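If the commented-out frequency filter were enabled, it would prune grams rarer than cnt_threshold; a self-contained sketch with toy counts:

w2cnt = {'good': 12, 'bad': 9, 'rare': 1}  # toy gram frequencies
w2id = {'good': 1, 'bad': 2, 'rare': 3}
cnt_threshold = 10
print {w: d for w, d in w2id.items() if w2cnt[w] >= cnt_threshold}  # {'good': 1}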
Example #16
    def save_profile(self, action_day):
        # action_day format: 2016-01-01
        action_day = action_day.replace('-', '')
        city_path = '%scity/city_%s' % (profile_prefix, action_day)
        linfo('save spatial profile: %s' % city_path)
        ET.write_file(city_path, 'w', '')
        for city, dist in self.profile_city.items():
            ET.write_file(city_path, 'a', '%s,%s,%s,%s\n' % (city, dist['O'], dist['P'], dist['N']))

        topic_path = '%stopic/topic_%s' % (profile_prefix, action_day)
        linfo('save topic profile: %s' % topic_path)
        ET.write_file(topic_path, 'w', '')
        for tp, dist in self.profile_topic.items():
            # Topics may contain commas, which would break the CSV layout.
            ET.write_file(topic_path, 'a', '%s,%s,%s,%s\n' % (tp.replace(',', '-'), dist['O'], dist['P'], dist['N']))
Example #17
def write_records(path, mode, txt):
    # Serialize writes across threads; the with-statement releases the
    # lock even if ET.write_file raises.
    with local_lock:
        ET.write_file(path, mode, txt)
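For the snippet to run standalone, local_lock must be a module-level lock shared by all writer threads; a minimal assumed setup:

import threading

# Assumed module-level definition; the project presumably creates this
# once and shares it across writer threads.
local_lock = threading.Lock()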