Example #1
 def real_test(self):
     self._test_xs, self._test_ys = ST.load_data(self.test_path)
     ST.replace_url(self._test_xs, fill='H')
     ST.replace_target(self._test_xs, fill='T')
     #x_y = [(self.discret_txt(txt), y) for txt, y in zip(self._test_xs, self._test_ys)]
     test_mat = self.build_sparse_X(self._test_xs)
     self.accuracy(test_mat, self._test_ys)
def parse_stats_emoticon(in_path='../stats/train_public_stats',
                         out_path='../stats/emoticon_debug'):
    '''
    Study emoticon info from the public stats.
    '''
    st = time.time()
    icon2cnt = {}
    config = {"row_num": None}
    lines = ST.load_raw_data(in_path, **config)
    icon_line_cnt = 0
    for txt in lines:
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        icon_line_cnt += 1
        for icon in icons:
            icon2cnt.setdefault(icon, 0)
            icon2cnt[icon] += 1
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    icons = icon2cnt.keys()
    icons = sorted(icons, key=lambda x: icon2cnt[x], reverse=True)
    for icon in icons:
        cnt = icon2cnt[icon]
        write(out_path, 'a', '%s:%s\n' % (icon, cnt))
    linfo('end parse emoticons. total lines: %s. icon lines: %s. icons: %s.' %
          (len(lines), icon_line_cnt, len(icons)))
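ST._retrieve_emoticon is project-specific and not included in this listing; a minimal stand-in, assuming Weibo-style bracket emoticons such as [哈哈], could look like the following (the name and regex are illustrative, not the project's actual implementation):

# -*- coding: utf-8 -*-
# Hypothetical stand-in for ST._retrieve_emoticon: pull [xx]-style emoticons
# out of a status text. The regex is an assumption, not the project's code.
import re

_EMOTICON_RE = re.compile(u'\\[[^\\[\\]]{1,8}\\]', re.UNICODE)

def retrieve_emoticon_sketch(txt):
    return _EMOTICON_RE.findall(txt)

# retrieve_emoticon_sketch(u'今天天气就是棒[哈哈] [太阳]')
# -> [u'[哈哈]', u'[太阳]']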
 def debug(self):
     ws_1 = set(self.batch_extract_feature().keys())
     ST.remove_emoticon(self._train_xs)
     ws_2 = set(self.batch_extract_feature().keys())
     linfo('uni_bi_icon_feature_cnt: %s. no_icon: %s' % (len(ws_1), len(ws_2)))
     rms =  ws_2 - ws_1
     for x in rms:
         print x
 def train_discret_model(self, **config):
     linfo('begin train helper discret model: %s' % config)
     if not config['emoticon']:
         ST.remove_emoticon(self._train_xs)
     if not config['parenthesis']:
         ST.remove_parenthesis(self._train_xs)
     self.txt2bags = {}
     self.w2id = self.batch_extract_feature()
     linfo('end train helper discret model')
 def format_test(self, emoticon=True, parenthesis=True):
     test_path = '../test_data/%s_test_data' % self.classifier_type
     self._test_xs, self._test_ys = ST.load_data(test_path)
     linfo('begin preprocess test data, then sparse')
     self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)
     #ST.replace_url(self._test_xs, fill='H')
     #ST.replace_target(self._test_xs, fill='T')
     self._test_ys = map(lambda x: self.tag2index[x], self._test_ys)
     self.format_sparse(
         self._test_xs, self._test_ys,
         '%s/test_data/%s%s_sparse_test_data_%s' %
         (project_dir, self.flag_prefix, self.classifier_type,
          'icon' if emoticon else 'no_icon'))
Example #9
    def train(self):
        self._train_xs, self._train_ys = ST.load_data(self._path)
        if not self._emoticon:
            ST.remove_emoticon(self._train_xs)
        self.gram2gid = self._discretize_gram2gid()
        X = self.build_sparse_X(self._train_xs)

        self.clf.fit(X, self._train_ys)

        self.real_test()
def parse_emoticon_stats(in_path='../stats/train_public_stats',
                         out_path='../stats/train_data_dg'):
    '''
    Parse stats that contain the selected emoticons.
    Dump or visualise the results.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()

    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any([x in txt for x in excludes]):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        dis_match = filter(lambda x: x not in pos_icons and x not in neg_icons,
                           icons)
        if dis_match:
            if len(set(dis_match)) >= 2:
                continue
        pos_match = filter(lambda x: x in txt, pos_icons)
        neg_match = filter(lambda x: x in txt, neg_icons)
        if (pos_match and neg_match) or (not pos_match and not neg_match):
            continue
        if pos_match:
            for icon in pos_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break
        if neg_match:
            for icon in neg_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break

    write = ET.write_file
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    pos_cnt = sum([len(icon2stat.get(x, [])) for x in pos_icons])
    neg_cnt = sum([len(icon2stat.get(x, [])) for x in neg_icons])
    icons = copy.copy(pos_icons)
    icons.extend(neg_icons)
    write(
        out_path, 'a',
        '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n'
        % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in icons:
        stats = icon2stat.get(icon, [])
        #write(out_path, 'a', '--------------------------------------\nicon: %s. stats_cnt: %s\n' % (icon, len(stats)))
        for stat in stats:
            dic = {'%s' % ('P' if icon in pos_icons else 'N'): stat}
            write(out_path, 'a', '%s\n' % json.dumps(dic))
    def train(self, icon=True, cross=False):
        #word2cnt = BayesClassifier.Word2Cnt()

        #txt = '今天天气就是棒[哈哈] [太阳] [飞起来]#'
        #return
        #self._load_data()
        #self._replace_url(fill=True)
        self._train_xs, self._train_ys = ST.load_data(self._train_path)
        ST.replace_url(self._train_xs, fill=True)
        if not icon:
            ST.remove_emoticon(self._train_xs)
        self._train(cross_validation=cross)
def test():
    obj_stats_path = '../train_data/stat_obj_train_data'
    out_path = '../train_data/Dg_obj_stats'
    txts = []
    with open(obj_stats_path, 'r') as f:
        for line in f:
            dic = json.loads(line.strip())
            tag, txt = dic.items()[0]
            txts.append(txt)
    linfo('obj stats count: %s' % (len(txts)))
    ST.replace_url(txts, fill='H')
    ST.replace_target(txts, fill='T')
    for x in txts:
        dic = {'O':x}
        write(out_path, 'a', '%s\n' % json.dumps(dic))
def parse_topic_public_stats(in_path='../stats/train_public_stats',
                             out_path='../test_data/topic_test_data'):
    st_t = time.time()
    topic_cnt, total_cnt = 0, 0
    topic2txt = {}
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip())
            txt = dic['text']
            topic = ST.parse_topic(txt)
            if not topic:
                continue
            topic_cnt += 1
            topic2txt.setdefault(topic, list())
            topic2txt[topic].append(txt)

    topics = sorted(topic2txt.keys(),
                    key=lambda x: len(topic2txt[x]),
                    reverse=True)
    for t in topics:
        txts = topic2txt[t]
        if len(txts) > 7000:
            continue
        #print t, topic2txt[t]
        if len(txts) < 200:
            break
        for txt in txts:
            dic = {t: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))

    print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
    print 'topic cnt: %s' % len(topic2txt)
    print 'time used: %.2f' % (time.time() - st_t)
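ST.parse_topic is not shown in this listing; judging from the '#...#' check in ProfileRawData further down, a Weibo topic is the text enclosed between a pair of '#' marks. A minimal stand-in under that assumption (name and details are illustrative):

def parse_topic_sketch(txt):
    # Hypothetical stand-in for ST.parse_topic: return the text between the
    # first pair of '#' marks, or None when there is no complete pair.
    st_i = txt.find('#')
    if st_i == -1:
        return None
    end_i = txt.find('#', st_i + 1)
    if end_i == -1:
        return None
    return txt[st_i + 1:end_i]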
 def update_profile_topic(self, raw_stats, tags):
     for txt, tag in zip(raw_stats, tags):
         topic = ST.parse_topic(txt)
         if not topic:
             continue
         self.profile_topic.setdefault(topic, {"P":0,"N":0,"O":0})
         self.profile_topic[topic][tag] += 1
def ProfileRawData(path='../stats/train_public_stats'):
    '''
    calculate user, url, retweet, topic, redundant stat
    '''
    user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0)
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False,  row_num=None)
    w2c = {}
    for txt in lines:
        for x in txt:
            w2c.setdefault(x, 0)
            w2c[x] += 1
    print 'word cnt', len(w2c)
    out_path = 'word2cnt'
    ET.write_file(out_path, 'w', '')
    for w,c in w2c.items():
        if w == ',':
            print 'special word: %s. cnt %s' % (w, c)
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))
    return  # NOTE: early return; the per-status profiling pass below never runs
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt: 
            url_cnt += 1
        if '#' in txt:
            st_i = txt.find('#')
            if txt.find('#', st_i+1) != -1:
                topic_cnt += 1
    print 'user_cnt', user_cnt
    print 'url_cnt', url_cnt
    print 'topic_cnt', topic_cnt
    print 'time used', time.time() - st
Example #21
 def online_run(self, interval=10, peroid=0.5, quiet=True):
     '''
     return value: [(city, stat)...]
     '''
     stats_set = set()
     stats = []
     now = peroid
     cnt = 0
     while now < interval:
         try:
             rsp = self.retrieve('on', quiet=quiet)
             cnt += 1
             if rsp:
                 for dic in rsp:
                     if dic['id'] not in stats_set:
                         city = ST.parse_spatial(dic)
                         item = (city, dic['text'])
                         stats.append(item)
                         stats_set.add(dic['id'])
         except Exception as e:
             logging.exception(e)
         now += peroid
         time.sleep(peroid*60)
     linfo('online analysis %s new stats retrieved. retrieve cnt: %s' % (len(stats), cnt))
     return stats
 def _predict(self, txt, train_w2c, train_t2c, debug=False, emoticon=True):
     #if emoticon and 'emoticon' not in self._ngrams_config:
     #    self._ngrams_config.append('emoticon')
     #elif not emoticon and 'emoticon' in self._ngrams_config:
     #    self._ngrams_config = filter(lambda x: x != 'emoticon', self._ngrams_config)
     #grams = self._retrieve_feature(txt)
     grams = ST.retrieve_feature(txt,
                                 feature_extract_config=self._ngrams_config,
                                 gram_icon_mixed=emoticon)
     if debug:
         linfo('begin debug case: %s' % txt)
     tag2score = {"P": 0, "N": 0, "O": 0}
     for w in grams:
         for tag in tag2score:
             if not train_t2c[tag]:
                 continue
             score = self._cal_likelihood(train_w2c[tag].get(w, 0),
                                          train_t2c[tag])
             tag2score[tag] += score
             if debug:
                 linfo(
                     'DEBUG probability for gram %s when given tag %s is: %.4f. gram cnt: %s.tag cnt: %s'
                     % (w, tag, score, train_w2c[tag].get(
                         w, 0), train_t2c[tag]))
     pred_tag = sorted(tag2score.keys(),
                       key=lambda x: tag2score[x],
                       reverse=True)[0]
     if debug:
         linfo('predict tag2score: %s' % tag2score)
     return pred_tag
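_cal_likelihood is referenced above but not included in this listing. A common choice for this kind of per-gram score is an add-one smoothed log-probability; the sketch below is an assumption about its shape, not the author's confirmed implementation:

import math

def cal_likelihood_sketch(gram_cnt, tag_cnt, vocab_size=100000):
    # Hypothetical stand-in for _cal_likelihood: Laplace-smoothed
    # log P(gram | tag). vocab_size is an assumed smoothing constant.
    return math.log((gram_cnt + 1.0) / (tag_cnt + vocab_size))

# _predict then sums these per-gram scores for each tag and picks the
# highest-scoring tag, i.e. a standard naive-Bayes style decision.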
Example #23
 def run(self, detail=False):
     self.train()
     for name, stat in self.stats:
         try:
             for clf, path in self.classifiers:
                 linfo('----------roundly predict start-----------')
                 raw_stat, stat = ST.preprocess(stat)
                 union = [(raw,new) for raw, new in zip(raw_stat, stat) if new]
                 raw_stat = map(lambda x:x[0], union)
                 stat = map(lambda x:x[1], union)
                 pred_tags = clf.predict(stat)
                 if not pred_tags or len(pred_tags) != len(stat):
                     raise Exception('Predict Results Exception')
                 tag2dist = self.cal_tag_dist(pred_tags)
                 linfo('%s-roundly online sentiment distribution: %s' % (clf, tag2dist))
                 save(path, 'a', '%s\t%s\t%s\n' % (name, json.dumps(tag2dist), len(stat)))
                 if detail:
                     detail_path = '%s%s' % (stats_predict_detail_prefix, name)
                     if os.path.exists(detail_path):
                         os.system('rm %s' % detail_path)
                     for tag, txt in zip(pred_tags, raw_stat):
                         ET.write_file(detail_path, 'a', '%s -%s\n' % (tag, txt))
                         #print tag, '-%s' % txt
                 linfo('----------roundly predict end-----------')
         except Exception as e:
             lexcept('Unknown exception %s' % e)
def parse_city_public_stats(in_path='../stats/train_public_stats', out_path='../test_data/city_test_data'):
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()
    txts_upperbound = 1000
    with open(in_path, 'r') as f:
        for line in f:
            total_cnt += 1
            dic = json.loads(line.strip()) 
            if dic['id'] in stat_ids:
                continue
            else:
                stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            city_stat_cnt += 1
            city2txt.setdefault(city, list())
            if len(city2txt[city]) >= txts_upperbound:
                continue
            city2txt[city].append(dic['text'])
    locs = sorted(city2txt.keys(), key=lambda x: len(city2txt[x]), reverse=True)
    print 'city_stat_cnt', city_stat_cnt
    print 'total_cnt', total_cnt
    print 'time used: %.2f' % (time.time() - st_t)
    citys = sorted(city2txt.keys())
    #for x in citys:
    #    print x, len(city2txt[x])
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    for x in locs:
        for txt in city2txt[x]:
            dic = {x: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))
    def run(self, sample_enabled=False, profile_enabled=False):
        for csf, path in self.classifiers:
            csf.train()
        action_day = ET.format_time(time.localtime())[:10]
        action_total_cnt = 0
        if profile_enabled:
            self.reset_profile()
        while True:
            try:
                stats = self.psr.online_run(interval=10)
                if not stats:
                    continue

                linfo('-------roundly analysis----------')
                citys = map(lambda x: x[0], stats)
                stats = map(lambda x: x[1], stats)
                raw_stats, stats = ST.preprocess(stats)
                valid_ids = [i for i, txt in enumerate(stats) if txt]
                stats = map(lambda i: stats[i], valid_ids)
                raw_stats = map(lambda i: raw_stats[i], valid_ids)
                citys = map(lambda i: citys[i], valid_ids)
                f_t = ET.format_time(time.localtime())
                if sample_enabled:
                    sample_path = '%srealtime_%s' % (
                        sample_prefix, f_t.replace(' ', '').replace(
                            '-', '').replace(':', ''))
                    ET.write_file(sample_path, 'a',
                                  '%s\n' % json.dumps(raw_stats[:300]))

                #only one model supported at the same time now.
                for clf, path in self.classifiers:
                    tag2cnt = {'P': 0, 'N': 0, 'O': 0}
                    pred_tags = clf.predict(stats)
                    for tag in pred_tags:
                        tag2cnt[tag] += 1
                    tag2dist = {
                        tag: cnt * 1.0 / len(stats)
                        for tag, cnt in tag2cnt.items()
                    }
                    linfo('%s-roundly online sentiment distribution: %s' %
                          (clf, tag2dist))
                    f_time = ET.format_time(time.localtime())
                    today = f_time[:10]
                    action_total_cnt = (
                        action_total_cnt + len(pred_tags)
                    ) if today == action_day else len(pred_tags)
                    save(
                        path, 'a', '%s\t%s\t%s\n' %
                        (f_time, json.dumps(tag2dist), len(stats)))
                    if profile_enabled:
                        self.update_profile_spatial(citys, pred_tags)
                        self.update_profile_topic(raw_stats, pred_tags)
                        if today != action_day:
                            self.save_profile(action_day)
                            self.reset_profile()
                            action_day = today
                    break
            except Exception as e:
                lexcept('Unknown exception %s' % e)
 def _feature_encoding(self, txt):
     bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
     #fs = {x:0 for x in gram2gid}
     fs = {}
     for gram in bags:
         if gram in self.gram2gid:
             fs[self.gram2gid[gram]] = 1
     return fs
 def get_feature(self, txt, cache=False):
     if txt in self.txt2bags:
         bags = self.txt2bags[txt]
     else:
         bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
         if cache:
             self.txt2bags[txt] = bags
     return bags
 def _discretize_gram2gid(self):
     w2id = {} 
     for txt in self._train_xs:
         bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
         for w in bags:
             if w not in w2id:
                 w2id[w] = len(w2id) + 1
     linfo('grams cnt: %s' % len(w2id))
     return w2id
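ST.retrieve_feature is also project-specific; given the ['unigram', 'bigram'] feature config used elsewhere in this listing, a character n-gram stand-in could look like the following (whether the real implementation works on characters or words is an assumption):

def retrieve_feature_sketch(txt, feature_extract_config=('unigram', 'bigram')):
    # Hypothetical stand-in for ST.retrieve_feature: bag of character
    # uni/bi-grams, controlled by the same config names used above.
    grams = []
    if 'unigram' in feature_extract_config:
        grams.extend(list(txt))
    if 'bigram' in feature_extract_config:
        grams.extend([txt[i:i + 2] for i in range(len(txt) - 1)])
    return grams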
Example #31
 def discret_txt(self, txt):
     fs = [0 for x in range(len(self.gram2gid))]
     bags = ST.retrieve_feature(
         txt, feature_extract_config=self._feature_extract_config)
     for w in bags:
         if w in self.gram2gid:
             wid = self.gram2gid[w]
             fs[wid] = 1
     return fs
 def _cross_train(self, fold_sz):
     rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
     precision = 0
     for fid,sd in rid2shard.items():
         tmp_train_xs = [self._train_xs[i] for i in sd]
         tmp_train_ys = [self._train_ys[i] for i in sd]
         test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i]) for i in sd]
         classifier = self._train(tmp_train_xs, tmp_train_ys)
         p = classify.accuracy(classifier, test_set)
         linfo('maxent classifier precision: %.4f' % p)
         precision += p
     linfo('average maxent classifier precision: %.4f' % (precision / fold_sz))
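ST.random_shardlize is not part of this listing either; _cross_train and _train only rely on it returning a {fold_id: [row indices]} mapping for k folds. A minimal stand-in under that assumption:

import random

def random_shardlize_sketch(fold_sz, n, seed=42):
    # Hypothetical stand-in for ST.random_shardlize: shuffle the row ids and
    # deal them round-robin into fold_sz shards, keyed 1..fold_sz.
    ids = list(range(n))
    random.Random(seed).shuffle(ids)
    return {fid + 1: ids[fid::fold_sz] for fid in range(fold_sz)}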
 def _all_train(self, total_word2cnt, total_tag2cnt):
     if os.path.exists(self._test_path):
         test_xs, test_ys = ST.load_data(self._test_path)
         #linfo('load manually tagged data count: %s' % len(test_xs))
     else:
         return
     test_t2c = {"P":0,"N":0,"O":0}
     for y in test_ys:
         if y not in test_t2c:
             raise Exception('Key Error in tag2cnt. unknown key: %s' % y)
         test_t2c[y] += 1
     #print test_t2c
     return  self._batch_predict(test_xs, test_ys, total_word2cnt, total_tag2cnt, test_t2c)
Example #36
 def parse_topics_realtime(self):
     topic_cnt, total_cnt = 0, 0
     topic2txt = {}
     for name, txts in self.stats:
         for txt in txts:
             total_cnt += 1
             topic = ST.parse_topic(txt)
             if not topic:
                 continue
             topic_cnt += 1
             topic2txt.setdefault(topic, list())
             topic2txt[topic].append(txt)
     print 'total cnt: %s. topic stats cnt: %s' % (total_cnt, topic_cnt)
     print 'topic cnt: %s' % len(topic2txt)
     return topic2txt
    def _train(self, shard_sz=10, cross_validation=True):
        #print self._ngrams_config
        linfo('begin train classifier')
        st = time.time()
        rid2shard = ST.random_shardlize(shard_sz,
                                        len(self._train_xs),
                                        load=True,
                                        path=self.rand_path)

        #rid2word_info = {}
        #total_word2cnt = BayesClassifier.Word2Cnt()
        rid2tag_cnt, rid2word_presence = {}, {}
        total_word2presence = BayesClassifier.Word2Cnt()
        total_tag2cnt = {"P": 0, "N": 0, "O": 0}
        for rid in range(1, shard_sz + 1):
            shard = rid2shard[rid]
            #rid2word_info[rid]
            rid2tag_cnt[rid], rid2word_presence[rid] = self._cal_shard2info(
                shard)
            #for tag, w2c in rid2word_info[rid].items():
            #    for w, c in w2c.items():
            #        total_word2cnt[tag].setdefault(w, 0)
            #        total_word2cnt[tag][w] += c
            for tag, w2p in rid2word_presence[rid].items():
                for w, c in w2p.items():
                    total_word2presence[tag].setdefault(w, 0)
                    total_word2presence[tag][w] += c
            for tag, cnt in rid2tag_cnt[rid].items():
                total_tag2cnt[tag] += cnt
        #self._debug_bigram(total_word2presence)
        self._prune(total_word2presence, rid2word_presence, total_tag2cnt)
        self.total_w2c, self.total_t2c = total_word2presence, total_tag2cnt
        linfo(self.total_t2c)
        #cross_validation
        if cross_validation:
            linfo('begin cross validation')
            p, r, f = self._cross_train(total_word2presence, rid2word_presence,
                                        total_tag2cnt, rid2tag_cnt, shard_sz,
                                        rid2shard)
            linfo(
                'Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
                % (p, r, f, time.time() - st))
        else:
            linfo('begin train and test with manually tagged data set')
            p, r, f = self._all_train(total_word2presence, total_tag2cnt)
            linfo(
                'Manually Tag Data Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
                % (p, r, f, time.time() - st))
    def train(self,cross_validation=False, fold_sz=10, test_path='../../test_data/tri_test_data'):
        self._train_xs, self._train_ys = ST.load_data(self._path)
        if not self._emoticon:
            ST.remove_emoticon(self._train_xs)
        self.gram2gid = self._discretize_gram2gid()
        if cross_validation:
            linfo('begin to cross train')
            self._cross_train(fold_sz)
        else:
            classifier = self._train(self._train_xs, self._train_ys)

            self._test_xs, self._test_ys = ST.load_data(test_path)
            ST.replace_url(self._test_xs, fill='H')
            ST.replace_target(self._test_xs, fill='T')

            test_set = [(self._feature_encoding(txt), tag) for txt, tag in zip(self._test_xs, self._test_ys)]

            linfo('maxent classifier precision: %.4f' % classify.accuracy(classifier, test_set))
    def __init__(self, ct='tri', prefix=''):
        if prefix and prefix != 'Dg_':
            raise Exception('INVALID PREFIX GIVEN!!!')
        self.flag_prefix = prefix
        self.train_data_path = '%s/train_data/%s%s_train_data' % (project_dir, prefix, ct)
        if ct not in ['bi', 'tri']:
            raise Exception('INVALID Classifier Type')
        self.classifier_type = ct
        self.tag2index = TAG2INDEX
        self._train_xs, self._train_ys = ST.load_data(self.train_data_path)
        self._train_ys = map(lambda x: self.tag2index[x], self._train_ys)

        #self._feature_extract_config = ['unigram', 'bigram']
        self._feature_extract_config = feature_config 
        linfo('feature extract config: %s' % self._feature_extract_config)
        linfo('classifier type %s' % ct)
        linfo('init %s success' % self)
 def _cal_shard2info(self, shard_indexs):
     #word2cnt = BayesClassifier.Word2Cnt()
     word2presence = BayesClassifier.Word2Cnt() 
     #word_total_cnt = 0
     tag2cnt = {"P":0,"N":0,"O":0}
     for index in shard_indexs:
         #word_total_cnt += len(x)
         txt = self._train_xs[index] 
         tag = self._train_ys[index]
         tag2cnt[tag] += 1
         bags = ST.retrieve_feature(txt, feature_extract_config=self._ngrams_config)
         for w in bags:
             word2presence[tag].setdefault(w, 0)
             word2presence[tag][w] += 1
             #word2cnt[tag].setdefault(w, 0)
             #word2cnt[tag][w] += 1
         
     return tag2cnt, word2presence
Example #44
    def build_sparse_X(self, _xs):
        row_num = len(_xs)
        col_num = len(self.gram2gid)

        rows, cols = [], []
        total_cnt = 0
        for i, txt in enumerate(_xs):
            bags = ST.retrieve_feature(
                txt, feature_extract_config=self._feature_extract_config)
            for w in bags:
                if w in self.gram2gid:
                    wid = self.gram2gid[w]
                    rows.append(i)
                    cols.append(wid)
                    total_cnt += 1
        linfo('build scipy sparse matrice. total_valid_cnt: %s' % (total_cnt))
        row = np.array(rows)
        col = np.array(cols)
        data = np.array([1 for i in range(total_cnt)])
        mtx = sparse.csr_matrix((data, (row, col)), shape=(row_num, col_num))
        return mtx
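build_sparse_X assembles the design matrix from COO-style (row, col, data) triplets; below is a small self-contained sketch of that scipy pattern with a made-up toy vocabulary and documents (note that duplicate (row, col) pairs are summed by csr_matrix):

import numpy as np
from scipy import sparse

# Toy vocabulary and pre-tokenised documents (illustrative only).
gram2gid = {'good': 0, 'bad': 1, 'movie': 2}
docs = [['good', 'movie'], ['bad', 'movie', 'bad']]

rows, cols = [], []
for i, grams in enumerate(docs):
    for w in grams:
        if w in gram2gid:
            rows.append(i)
            cols.append(gram2gid[w])

data = np.ones(len(rows), dtype=int)
X = sparse.csr_matrix((data, (np.array(rows), np.array(cols))),
                      shape=(len(docs), len(gram2gid)))
print(X.toarray())
# [[1 0 1]
#  [0 2 1]]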