Esempio n. 1
0
def parse_stats_emoticon(in_path='../stats/train_public_stats', out_path='../stats/emoticon_debug'):
    '''
    Count emoticon occurrences in the raw data and dump them to a file.

    Every row of `in_path` is loaded through ST.load_raw_data (row_num=None)
    and passed to ST._retrieve_emoticon. Each returned icon is tallied, and
    one "<icon>:<count>" line per distinct icon is appended to `out_path`,
    highest count first. An existing `out_path` is deleted beforehand because
    the rows are written in append mode. A summary (total lines, lines with
    at least one icon, distinct icons) is reported through linfo.

    in_path: raw data file handed to ST.load_raw_data.
    out_path: file that receives the "<icon>:<count>" lines.
    '''
    lines = ST.load_raw_data(in_path, row_num=None)
    icon2cnt = {}
    icon_line_cnt = 0  # lines for which at least one icon was returned
    for txt in lines:
        found = ST._retrieve_emoticon(txt)
        if not found:
            continue
        icon_line_cnt += 1
        for icon in found:
            icon2cnt[icon] = icon2cnt.get(icon, 0) + 1

    # Remove the stale output through the os API rather than a shell `rm`:
    # a path with spaces or shell metacharacters would break (or abuse) the
    # shell command, and `rm` does not exist on every platform.
    if os.path.exists(out_path):
        os.remove(out_path)

    ranked = sorted(icon2cnt, key=icon2cnt.get, reverse=True)
    for icon in ranked:
        # NOTE(review): `write` is not bound inside this function; presumably
        # a module-level alias of ET.write_file -- confirm.
        write(out_path, 'a', '%s:%s\n' % (icon, icon2cnt[icon]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' % (len(lines), icon_line_cnt, len(ranked)))
Esempio n. 2
0
def ProfileRawData(path='../stats/train_public_stats', tag_stats=False):
    '''
    Dump element frequencies of the raw data to the file 'word2cnt'.

    Each element of each loaded line is counted (presumably single
    characters, i.e. the lines are strings -- confirm against
    ST.load_raw_data) and written as "<element>,<count>" rows through
    ET.write_file. The element ',' is only printed, not written, since it
    would collide with the comma used as the column separator.

    path: raw data file handed to ST.load_raw_data.
    tag_stats: when True, additionally print how many lines contain '@',
        contain 'http', and contain at least two '#', plus the elapsed
        time. Defaults to False, which reproduces the previous behaviour:
        this pass used to sit behind an unconditional `return` and was
        unreachable.
    '''
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)

    w2c = {}
    for txt in lines:
        for x in txt:
            w2c[x] = w2c.get(x, 0) + 1
    # Single pre-formatted argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('word cnt %s' % len(w2c))

    out_path = 'word2cnt'
    # Presumably truncates the file so the appended rows start from empty.
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            print('special word: %s. cnt %s' % (w, c))
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))

    if not tag_stats:
        return

    user_cnt = url_cnt = topic_cnt = 0
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        # A line counts as a topic line only when a second '#' follows the first.
        if txt.count('#') >= 2:
            topic_cnt += 1
    print('user_cnt %s' % user_cnt)
    print('url_cnt %s' % url_cnt)
    print('topic_cnt %s' % topic_cnt)
    print('time used %s' % (time.time() - st))
Esempio n. 3
0
def parse_stats_emoticon(in_path='../stats/train_public_stats',
                         out_path='../stats/emoticon_debug'):
    '''
    Tally the icons found in the raw data and write the ranking to a file.

    All rows of `in_path` are loaded with ST.load_raw_data (row_num=None).
    The icons returned by ST._retrieve_emoticon for each line are counted,
    then one "<icon>:<count>" line per distinct icon is appended to
    `out_path` in descending count order. A pre-existing `out_path` is
    removed first, as the rows are written in append mode. Finally the
    totals (lines, lines with icons, distinct icons) are passed to linfo.

    in_path: raw data file handed to ST.load_raw_data.
    out_path: destination of the "<icon>:<count>" lines.
    '''
    lines = ST.load_raw_data(in_path, row_num=None)
    icon2cnt = {}
    icon_line_cnt = 0  # lines that yielded at least one icon
    for txt in lines:
        line_icons = ST._retrieve_emoticon(txt)
        if not line_icons:
            continue
        icon_line_cnt += 1
        for icon in line_icons:
            icon2cnt[icon] = icon2cnt.get(icon, 0) + 1

    # os.remove instead of os.system('rm ...'): no shell is involved, so
    # paths with spaces or metacharacters are handled correctly and the
    # code no longer depends on an external `rm` binary.
    if os.path.exists(out_path):
        os.remove(out_path)

    by_count = sorted(icon2cnt, key=icon2cnt.get, reverse=True)
    for icon in by_count:
        # NOTE(review): `write` is not bound inside this function; presumably
        # a module-level alias of ET.write_file -- confirm.
        write(out_path, 'a', '%s:%s\n' % (icon, icon2cnt[icon]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' %
          (len(lines), icon_line_cnt, len(by_count)))
Esempio n. 4
0
def ProfileRawData(path='../stats/train_public_stats', tag_stats=False):
    '''
    Write the frequency of every line element to the file 'word2cnt'.

    The loaded lines are iterated element by element (presumably character
    by character, i.e. each line is a string -- confirm against
    ST.load_raw_data). Each distinct element becomes one
    "<element>,<count>" row written through ET.write_file, except ',',
    which is only printed because it doubles as the column separator.

    path: raw data file handed to ST.load_raw_data.
    tag_stats: opt-in switch for the second pass that prints the number of
        lines containing '@', containing 'http', and containing two or
        more '#', followed by the elapsed time. The default (False) keeps
        the earlier behaviour, where an unconditional `return` made that
        pass unreachable.
    '''
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)

    w2c = {}
    for txt in lines:
        for x in txt:
            w2c[x] = w2c.get(x, 0) + 1
    # One formatted argument per print call so the output is identical
    # whether print is the Python 2 statement or the Python 3 function.
    print('word cnt %s' % len(w2c))

    out_path = 'word2cnt'
    # Presumably resets the file before the rows are appended below.
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            print('special word: %s. cnt %s' % (w, c))
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))

    if not tag_stats:
        return

    user_cnt = url_cnt = topic_cnt = 0
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        # Requires a second '#' after the first one to count the line.
        if txt.count('#') >= 2:
            topic_cnt += 1
    print('user_cnt %s' % user_cnt)
    print('url_cnt %s' % url_cnt)
    print('topic_cnt %s' % topic_cnt)
    print('time used %s' % (time.time() - st))
Esempio n. 5
0
def parse_emoticon_stats(in_path='../stats/train_public_stats',
                         out_path='../stats/train_data_dg'):
    '''
    Label raw lines as positive or negative from the icons they contain.

    The positive and negative icon lists come from load_emoticon(). A line
    is kept only when all of the following hold:
      * it contains none of the strings in the module-level `excludes`;
      * ST._retrieve_emoticon returns at least one icon for it;
      * at most one distinct returned icon is missing from both lists;
      * it contains icons from exactly one of the two lists.
    A kept line is filed under the first icon of the matching list that
    occurs in it.

    `out_path` is removed if present, then receives a header line with the
    totals and elapsed time followed by one JSON object per kept line,
    {"P": line} or {"N": line}, ordered by icon (positive list first).

    in_path: raw data file handed to ST.load_raw_data.
    out_path: file that receives the header and the JSON lines.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()

    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any(x in txt for x in excludes):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        unknown = set(x for x in icons
                      if x not in pos_icons and x not in neg_icons)
        if len(unknown) >= 2:
            continue
        # Real lists rather than filter(): on Python 3 a filter object is
        # always truthy, which would make the polarity test below reject
        # every line.
        pos_match = [x for x in pos_icons if x in txt]
        neg_match = [x for x in neg_icons if x in txt]
        # Both polarities present, or neither: the line is ambiguous.
        if bool(pos_match) == bool(neg_match):
            continue
        # Exactly one of the two lists is non-empty at this point.
        first_icon = pos_match[0] if pos_match else neg_match[0]
        icon2stat.setdefault(first_icon, []).append(txt)

    write = ET.write_file
    # os.remove instead of a shell `rm`: safe for paths with spaces or
    # shell metacharacters and independent of an external binary.
    if os.path.exists(out_path):
        os.remove(out_path)
    pos_cnt = sum(len(icon2stat.get(x, [])) for x in pos_icons)
    neg_cnt = sum(len(icon2stat.get(x, [])) for x in neg_icons)
    write(
        out_path, 'a',
        '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n'
        % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in list(pos_icons) + list(neg_icons):
        label = 'P' if icon in pos_icons else 'N'
        for stat in icon2stat.get(icon, []):
            write(out_path, 'a', '%s\n' % json.dumps({label: stat}))
Esempio n. 6
0
def parse_emoticon_stats(in_path='../stats/train_public_stats', out_path='../stats/train_data_dg'):
    '''
    Build polarity-labelled lines from the icons found in the raw data.

    load_emoticon() supplies the positive and the negative icon list. A
    line survives when it contains no string from the module-level
    `excludes`, ST._retrieve_emoticon returns at least one icon for it,
    fewer than two distinct returned icons are absent from both lists, and
    it contains icons of exactly one list. The line is then stored under
    the first icon of that list which occurs in it.

    Output: an existing `out_path` is deleted; a header line with the line
    total, positive/negative counts and elapsed time is appended, followed
    by one JSON object per stored line -- {"P": line} for positive icons,
    {"N": line} for negative ones -- iterating the positive list first.

    in_path: raw data file handed to ST.load_raw_data.
    out_path: file that receives the header and the JSON lines.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()

    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any(x in txt for x in excludes):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        dis_match = set(x for x in icons if x not in pos_icons and x not in neg_icons)
        if len(dis_match) >= 2:
            continue
        # List comprehensions instead of filter(): a Python 3 filter object
        # is truthy even when empty, so the checks below would misfire.
        pos_match = [x for x in pos_icons if x in txt]
        neg_match = [x for x in neg_icons if x in txt]
        # Skip lines with mixed polarity and lines with no listed icon.
        if bool(pos_match) == bool(neg_match):
            continue
        # Only one of the two lists can be non-empty here.
        chosen = pos_match[0] if pos_match else neg_match[0]
        icon2stat.setdefault(chosen, []).append(txt)

    write = ET.write_file
    # Delete through the os API; the former shell `rm` broke on paths with
    # spaces or metacharacters and required a Unix-like environment.
    if os.path.exists(out_path):
        os.remove(out_path)
    pos_cnt = sum(len(icon2stat.get(x, [])) for x in pos_icons)
    neg_cnt = sum(len(icon2stat.get(x, [])) for x in neg_icons)
    write(out_path, 'a', '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n' % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in list(pos_icons) + list(neg_icons):
        tag = 'P' if icon in pos_icons else 'N'
        for stat in icon2stat.get(icon, []):
            write(out_path, 'a', '%s\n' % json.dumps({tag: stat}))