Exemple #1
0
 def online_run(self, interval=10, peroid=0.5, quiet=True):
     '''
     Poll self.retrieve('on', ...) repeatedly and gather unseen stats.

     A counter starts at peroid and grows by peroid after every poll; polling
     continues while the counter is below interval. Each poll is followed by
     a sleep of peroid * 60 seconds (so both arguments are presumably
     expressed in minutes -- confirm with callers).

     Entries are de-duplicated on their 'id' key across the whole run. Any
     exception raised while polling or processing a response is logged and
     the loop carries on with the next poll.

     return value: [(city, text)...] where city is whatever
     ST.parse_spatial returns for the entry (it is not filtered here).
     '''
     seen_ids = set()
     collected = []
     retrieve_cnt = 0
     elapsed = peroid
     while elapsed < interval:
         try:
             rsp = self.retrieve('on', quiet=quiet)
             retrieve_cnt += 1
             # An empty/None response contributes nothing to this poll.
             for entry in rsp or ():
                 if entry['id'] in seen_ids:
                     continue
                 located = ST.parse_spatial(entry)
                 collected.append((located, entry['text']))
                 seen_ids.add(entry['id'])
         except Exception as e:
             # Best effort: one failed poll must not abort the whole run.
             logging.exception(e)
         elapsed += peroid
         time.sleep(peroid * 60)
     summary = (len(collected), retrieve_cnt)
     linfo('online analysis %s new stats retrieved. retrieve cnt: %s' % summary)
     return collected
def parse_city_public_stats(in_path='../stats/train_public_stats', out_path='../test_data/city_test_data', txts_upperbound=1000):
    '''
    Group the texts of public stats by city and dump them as JSON lines.

    in_path is read as one JSON object per line; every object must carry
    'id' and 'text' keys. Records whose 'id' was already seen are dropped,
    the rest are mapped to a city with ST.parse_spatial, and records for
    which no city is returned are dropped as well.

    in_path: file to read, one JSON object per line (blank lines ignored).
    out_path: file to (re)create; an existing file is removed first, then
        one '{city: text}' JSON object per line is appended through
        ET.write_file, cities with the most kept texts first.
    txts_upperbound: maximum number of texts kept per city.

    Prints the number of unique stats that resolved to a city, the number
    of non-blank input lines, and the elapsed time. Returns None.
    '''
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    with open(in_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads('') raises; tolerate blank or trailing lines.
                continue
            total_cnt += 1
            dic = json.loads(line)
            if dic['id'] in stat_ids:
                continue
            stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            # Count every unique stat that has a city, including those the
            # per-city cap discards below.
            city_stat_cnt += 1
            txts = city2txt.setdefault(city, [])
            if len(txts) < txts_upperbound:
                txts.append(dic['text'])
    locs = sorted(city2txt, key=lambda x: len(city2txt[x]), reverse=True)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('city_stat_cnt %s' % city_stat_cnt)
    print('total_cnt %s' % total_cnt)
    print('time used: %.2f' % (time.time() - st_t))
    if os.path.exists(out_path):
        # os.remove avoids the shell, so paths containing spaces or shell
        # metacharacters are handled correctly.
        os.remove(out_path)
    for x in locs:
        for txt in city2txt[x]:
            ET.write_file(out_path, 'a', '%s\n' % json.dumps({x: txt}))
def parse_city_public_stats(in_path='../stats/train_public_stats',
                            out_path='../test_data/city_test_data',
                            txts_upperbound=1000):
    '''
    Collect per-city stat texts from a JSON-lines file and write them out.

    Each non-blank line of in_path is parsed as a JSON object with 'id' and
    'text' keys. A record is skipped when its 'id' has already been seen or
    when ST.parse_spatial returns no city for it. At most txts_upperbound
    texts are kept for any one city.

    in_path: JSON-lines input file.
    out_path: output file; removed first if it exists, then filled through
        ET.write_file with one '{city: text}' JSON object per line, ordered
        by descending number of kept texts per city.
    txts_upperbound: per-city cap on the number of kept texts.

    Prints 'city_stat_cnt' (unique stats that resolved to a city),
    'total_cnt' (non-blank input lines) and the time used. Returns None.
    '''
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    with open(in_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line is not valid JSON; skip it instead of crashing.
                continue
            total_cnt += 1
            dic = json.loads(line)
            stat_id = dic['id']
            if stat_id in stat_ids:
                continue
            stat_ids.add(stat_id)
            city = ST.parse_spatial(dic)
            if not city:
                continue
            # Tally the located stat even if the cap below drops its text.
            city_stat_cnt += 1
            txts = city2txt.setdefault(city, [])
            if len(txts) >= txts_upperbound:
                continue
            txts.append(dic['text'])
    locs = sorted(city2txt,
                  key=lambda x: len(city2txt[x]),
                  reverse=True)
    # print(...) with one pre-formatted string behaves the same on Python 2
    # and Python 3.
    print('city_stat_cnt %s' % city_stat_cnt)
    print('total_cnt %s' % total_cnt)
    print('time used: %.2f' % (time.time() - st_t))
    if os.path.exists(out_path):
        # Delete directly rather than through a shell 'rm' so unusual
        # characters in out_path cannot break or alter the command.
        os.remove(out_path)
    for x in locs:
        for txt in city2txt[x]:
            dic = {x: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))