Beispiel #1
0
def parse_stats_emoticon(in_path='../stats/train_public_stats', out_path='../stats/emoticon_debug'):
    '''
    Count emoticon occurrences in the public stats and dump them to a file.

    Every line loaded from ``in_path`` is scanned with
    ``ST._retrieve_emoticon``; each returned emoticon increments its counter.
    The counters are then appended to ``out_path`` as ``icon:count`` lines,
    highest count first, and a summary is logged through ``linfo``.

    in_path:  path handed to ``ST.load_raw_data`` (called with row_num=None).
    out_path: report file; an already existing file is removed beforehand.

    NOTE(review): ``write`` and ``linfo`` are not defined in this function and
    are presumably module-level helpers -- confirm against the rest of the file.
    '''
    lines = ST.load_raw_data(in_path, row_num=None)

    icon2cnt = {}
    icon_line_cnt = 0  # number of lines that contain at least one emoticon
    for txt in lines:
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        icon_line_cnt += 1
        for icon in icons:
            icon2cnt[icon] = icon2cnt.get(icon, 0) + 1

    # Drop the previous report without spawning a shell: `rm <path>` via
    # os.system is non-portable and mis-parses paths with spaces or shell
    # metacharacters.
    if os.path.isfile(out_path):
        os.remove(out_path)

    # Most frequent emoticons first.
    ranked = sorted(icon2cnt, key=icon2cnt.get, reverse=True)
    for icon in ranked:
        write(out_path, 'a', '%s:%s\n' % (icon, icon2cnt[icon]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' % (len(lines), icon_line_cnt, len(ranked)))
Beispiel #2
0
def parse_stats_emoticon(in_path='../stats/train_public_stats',
                         out_path='../stats/emoticon_debug'):
    '''
    Study emoticon frequencies in the public stats.

    Loads all lines from ``in_path`` via ``ST.load_raw_data`` (row_num=None),
    extracts the emoticons of each line with ``ST._retrieve_emoticon`` and
    tallies them. The tally is appended to ``out_path`` as one ``icon:count``
    line per emoticon in descending count order, after removing any existing
    file at that path. A summary line is emitted through ``linfo``.

    NOTE(review): ``write`` and ``linfo`` are resolved from the enclosing
    module scope, which is not visible here -- confirm they exist there.
    '''
    lines = ST.load_raw_data(in_path, row_num=None)

    icon2cnt = {}
    icon_line_cnt = 0  # lines with at least one emoticon
    for txt in lines:
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        icon_line_cnt += 1
        for icon in icons:
            icon2cnt[icon] = icon2cnt.get(icon, 0) + 1

    # Use os.remove instead of os.system('rm ...'): no shell is involved, so
    # unusual characters in out_path cannot be misinterpreted, and it also
    # works on platforms without an `rm` command.
    if os.path.isfile(out_path):
        os.remove(out_path)

    # Highest count first.
    ranked = sorted(icon2cnt, key=icon2cnt.get, reverse=True)
    for icon in ranked:
        write(out_path, 'a', '%s:%s\n' % (icon, icon2cnt[icon]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' %
          (len(lines), icon_line_cnt, len(ranked)))
Beispiel #3
0
def parse_emoticon_stats(in_path='../stats/train_public_stats',
                         out_path='../stats/train_data_dg'):
    '''
    Parse stats with the selected emoticons and dump them as labelled lines.

    The two emoticon collections returned by ``load_emoticon()`` are treated
    as positive and negative icons. A line loaded from ``in_path`` is kept
    only when:
      * it contains none of the module-level ``excludes`` substrings,
      * ``ST._retrieve_emoticon`` finds at least one emoticon in it,
      * fewer than two distinct emoticons in it are outside both collections,
      * it contains icons of exactly one polarity.
    A kept line is filed under the first matching icon of its polarity.

    ``out_path`` is recreated: first a header line with totals and elapsed
    time, then one JSON object per kept line, ``{"P": text}`` or
    ``{"N": text}``, written through ``ET.write_file``.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()

    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any(x in txt for x in excludes):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        # Materialise the matches instead of using filter(): on Python 3 a
        # filter object is always truthy, which would make every line look
        # like it carries both polarities and get skipped.
        unknown = set(x for x in icons
                      if x not in pos_icons and x not in neg_icons)
        if len(unknown) >= 2:
            continue
        pos_match = [x for x in pos_icons if x in txt]
        neg_match = [x for x in neg_icons if x in txt]
        # Keep only lines with exactly one polarity present.
        if bool(pos_match) == bool(neg_match):
            continue
        # Only the first matching icon of the present polarity gets the line.
        first_icon = (pos_match or neg_match)[0]
        icon2stat.setdefault(first_icon, []).append(txt)

    write = ET.write_file
    # Remove the previous dump without a shell; `rm <path>` through os.system
    # is non-portable and unsafe for paths with spaces or metacharacters.
    if os.path.isfile(out_path):
        os.remove(out_path)
    pos_cnt = sum(len(icon2stat.get(x, [])) for x in pos_icons)
    neg_cnt = sum(len(icon2stat.get(x, [])) for x in neg_icons)
    write(
        out_path, 'a',
        '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n'
        % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    # Positive icons are emitted first, then negative ones.
    for icon in list(pos_icons) + list(neg_icons):
        label = 'P' if icon in pos_icons else 'N'
        for stat in icon2stat.get(icon, []):
            write(out_path, 'a', '%s\n' % json.dumps({label: stat}))
Beispiel #4
0
def parse_emoticon_stats(in_path='../stats/train_public_stats', out_path='../stats/train_data_dg'):
    '''
    Select stats by emoticon polarity and dump them as JSON lines.

    ``load_emoticon()`` supplies two icon collections, used here as the
    positive and the negative set. Lines from ``ST.load_raw_data(in_path)``
    are dropped when they contain any substring from the module-level
    ``excludes``, have no emoticon according to ``ST._retrieve_emoticon``,
    carry two or more distinct emoticons that are in neither collection, or
    contain icons of both polarities or of none. Each remaining line is
    stored under the first icon of its polarity found in the text.

    The output file at ``out_path`` is replaced. It starts with a header line
    (total line count, positive count, negative count, elapsed seconds) and
    continues with one ``{"P": text}`` / ``{"N": text}`` JSON object per
    line, written with ``ET.write_file``.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()

    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any(x in txt for x in excludes):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        # Comprehensions rather than filter(): a Python 3 filter object is
        # truthy even when empty, so the polarity checks below would reject
        # every line.
        unknown = set(x for x in icons if x not in pos_icons and x not in neg_icons)
        if len(unknown) >= 2:
            continue
        pos_match = [x for x in pos_icons if x in txt]
        neg_match = [x for x in neg_icons if x in txt]
        # Exactly one of the two polarities must be present.
        if bool(pos_match) == bool(neg_match):
            continue
        # The line is attributed to the first matching icon only.
        first_icon = (pos_match or neg_match)[0]
        icon2stat.setdefault(first_icon, []).append(txt)

    write = ET.write_file
    # os.remove avoids the shell that os.system('rm ...') would spawn, so the
    # path is never word-split or interpreted, and no `rm` binary is needed.
    if os.path.isfile(out_path):
        os.remove(out_path)
    pos_cnt = sum(len(icon2stat.get(x, [])) for x in pos_icons)
    neg_cnt = sum(len(icon2stat.get(x, [])) for x in neg_icons)
    write(out_path, 'a', '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n' % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    # Positive icons first, negative icons afterwards.
    for icon in list(pos_icons) + list(neg_icons):
        label = 'P' if icon in pos_icons else 'N'
        for stat in icon2stat.get(icon, []):
            write(out_path, 'a', '%s\n' % json.dumps({label: stat}))