def parse_stats_emoticon(in_path='../stats/train_public_stats', out_path='../stats/emoticon_debug'):
    '''Count emoticon occurrences in the public-status dump.

    Reads raw lines from `in_path`, extracts emoticons with
    ST._retrieve_emoticon, and writes one "icon:count" line per emoticon
    to `out_path`, most frequent first.
    '''
    icon2cnt = {}
    lines = ST.load_raw_data(in_path, row_num=None)
    icon_line_cnt = 0  # lines containing at least one emoticon
    for txt in lines:
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        icon_line_cnt += 1
        for icon in icons:
            icon2cnt[icon] = icon2cnt.get(icon, 0) + 1
    # Remove any stale report with os.remove instead of shelling out to
    # `rm` — portable, and no shell interpolation of out_path.
    if os.path.exists(out_path):
        os.remove(out_path)
    icons = sorted(icon2cnt.keys(), key=lambda x: icon2cnt[x], reverse=True)
    for icon in icons:
        # NOTE(review): `write` is presumably module-level ET.write_file
        # (see the local alias in parse_emoticon_stats) — confirm.
        write(out_path, 'a', '%s:%s\n' % (icon, icon2cnt[icon]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' % (len(lines), icon_line_cnt, len(icons)))
def ProfileRawData(path='../stats/train_public_stats'):
    '''calculate user, url, retweet, topic, redundant stat

    NOTE(review): the function returns right after dumping the
    per-character counts, so the @/url/topic profiling below the
    `return` is currently dead code — presumably a debug toggle;
    confirm before removing.
    '''
    # retweet_cnt and redundant are initialized but never incremented anywhere.
    user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0)
    st = time.time()
    lines = ST.load_raw_data(path, replace_enabled=False, row_num=None)
    # Per-character (not per-word) frequency table over the whole dump.
    w2c = {}
    for txt in lines:
        for x in txt:
            w2c.setdefault(x, 0)
            w2c[x] += 1
    print 'word cnt', len(w2c)
    out_path = 'word2cnt'
    # Truncate the output file, then append one "char,count" row per entry.
    ET.write_file(out_path, 'w', '')
    for w, c in w2c.items():
        if w == ',':
            # ',' would break the comma-separated output format, so it is
            # only printed, not written.
            print 'special word: %s. cnt %s' % (w, c)
            continue
        ET.write_file(out_path, 'a', '%s,%s\n' % (w, c))
    return
    # --- unreachable from here down (early return above) ---
    for txt in lines:
        if '@' in txt:
            user_cnt += 1
        if 'http' in txt:
            url_cnt += 1
        if '#' in txt:
            st_i = txt.find('#')
            # Requires a second '#', i.e. a "#topic#" style marker.
            if txt.find('#', st_i + 1) != -1:
                topic_cnt += 1
    print 'user_cnt', user_cnt
    print 'url_cnt', url_cnt
    print 'topic_cnt', topic_cnt
    print 'time used', time.time() - st
def parse_stats_emoticon(in_path='../stats/train_public_stats', out_path='../stats/emoticon_debug'):
    '''Tally the emoticons appearing in the public-status dump and write a
    frequency-sorted "icon:count" report to out_path.'''
    started = time.time()
    counts = {}
    matched_lines = 0
    rows = ST.load_raw_data(in_path, row_num=None)
    for row in rows:
        found = ST._retrieve_emoticon(row)
        if found:
            matched_lines += 1
            for emo in found:
                counts[emo] = counts.get(emo, 0) + 1
    # Start from a clean output file.
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    ordered = sorted(counts.keys(), key=counts.get, reverse=True)
    for emo in ordered:
        write(out_path, 'a', '%s:%s\n' % (emo, counts[emo]))
    linfo('end parse emoticons. total lines: %s.icon lines: %s. icons:%s.' % (len(rows), matched_lines, len(ordered)))
def ProfileRawData(path='../stats/train_public_stats'): ''' calculate user, url, retweet, topic, redundant stat ''' user_cnt, url_cnt, retweet_cnt, topic_cnt, redundant = (0, 0, 0, 0, 0) st = time.time() lines = ST.load_raw_data(path, replace_enabled=False, row_num=None) w2c = {} for txt in lines: for x in txt: w2c.setdefault(x, 0) w2c[x] += 1 print 'word cnt', len(w2c) out_path = 'word2cnt' ET.write_file(out_path, 'w', '') for w, c in w2c.items(): if w == ',': print 'special word: %s. cnt %s' % (w, c) continue ET.write_file(out_path, 'a', '%s,%s\n' % (w, c)) return for txt in lines: if '@' in txt: user_cnt += 1 if 'http' in txt: url_cnt += 1 if '#' in txt: st_i = txt.find('#') if txt.find('#', st_i + 1) != -1: topic_cnt += 1 print 'user_cnt', user_cnt print 'url_cnt', url_cnt print 'topic_cnt', topic_cnt print 'time used', time.time() - st
def parse_emoticon_stats(in_path='../stats/train_public_stats', out_path='../stats/train_data_dg'):
    '''Parse statuses with selected emoticons and dump them as labeled JSON lines.

    A status is kept only when its emoticons map unambiguously to one
    polarity: lines containing any `excludes` marker, lines matching both
    (or neither) of the pos/neg icon sets, and lines with two or more
    distinct unknown emoticons are all skipped. Each kept status is
    written to `out_path` as {"P": txt} or {"N": txt}, preceded by a
    summary header line.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()
    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        if any([x in txt for x in excludes]):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        # Emoticons outside both curated sets; tolerate at most one
        # distinct unknown icon per line.
        dis_match = filter(lambda x: x not in pos_icons and x not in neg_icons, icons)
        if dis_match:
            if len(set(dis_match)) >= 2:
                continue
        pos_match = filter(lambda x: x in txt, pos_icons)
        neg_match = filter(lambda x: x in txt, neg_icons)
        # Keep only unambiguous lines: exactly one polarity matched.
        if (pos_match and neg_match) or (not pos_match and not neg_match):
            continue
        if pos_match:
            for icon in pos_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break  # attribute the line to the first matched icon only
        if neg_match:
            for icon in neg_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                break  # same: first matched icon only
    write = ET.write_file
    # os.remove is portable and avoids shell interpolation of out_path
    # (previously os.system('rm %s' % out_path)).
    if os.path.exists(out_path):
        os.remove(out_path)
    pos_cnt = sum([len(icon2stat.get(x, [])) for x in pos_icons])
    neg_cnt = sum([len(icon2stat.get(x, [])) for x in neg_icons])
    icons = copy.copy(pos_icons)
    icons.extend(neg_icons)
    write(out_path, 'a', '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n' % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in icons:
        stats = icon2stat.get(icon, [])
        for stat in stats:
            # Key is the polarity label: 'P' for positive icons, 'N' for negative.
            dic = {'%s' % ('P' if icon in pos_icons else 'N'): stat}
            write(out_path, 'a', '%s\n' % json.dumps(dic))
def parse_emoticon_stats(in_path='../stats/train_public_stats', out_path='../stats/train_data_dg'):
    '''Parse states with selected emocicons. Dumps or visualise

    Keeps only statuses whose emoticons map unambiguously to one
    polarity and dumps them to out_path as {"P": txt} / {"N": txt}
    JSON lines after a summary header.
    '''
    st = time.time()
    pos_icons, neg_icons = load_emoticon()
    icon2stat = {}
    lines = ST.load_raw_data(in_path)
    for txt in lines:
        # Skip lines carrying any excluded marker.
        if any([x in txt for x in excludes]):
            continue
        icons = ST._retrieve_emoticon(txt)
        if not icons:
            continue
        # Emoticons outside both curated sets; two or more distinct
        # unknowns disqualify the line (Py2 filter returns a list).
        dis_match = filter(lambda x: x not in pos_icons and x not in neg_icons, icons)
        if dis_match:
            if len(set(dis_match)) >= 2:
                continue
        pos_match = filter(lambda x: x in txt, pos_icons)
        neg_match = filter(lambda x: x in txt, neg_icons)
        # Keep only unambiguous lines: exactly one polarity matched.
        if (pos_match and neg_match) or (not pos_match and not neg_match):
            continue
        if pos_match:
            for icon in pos_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                # break after the first icon: each line is attributed once.
                break
        if neg_match:
            for icon in neg_match:
                icon2stat.setdefault(icon, [])
                icon2stat[icon].append(txt)
                # same: first matched icon only.
                break
    write = ET.write_file
    # Clear any previous output file before appending.
    if os.path.exists(out_path):
        os.system('rm %s' % out_path)
    pos_cnt = sum([len(icon2stat.get(x, [])) for x in pos_icons])
    neg_cnt = sum([len(icon2stat.get(x, [])) for x in neg_icons])
    icons = copy.copy(pos_icons)
    icons.extend(neg_icons)
    write(out_path, 'a', '----------------\ntotal_cnt: %s. pos_cnt: %s. neg_cnt: %s. time used: %.2fs\n' % (len(lines), pos_cnt, neg_cnt, time.time() - st))
    for icon in icons:
        stats = icon2stat.get(icon, [])
        #write(out_path, 'a', '--------------------------------------\nicon: %s. stats_cnt: %s\n' % (icon, len(stats)))
        for stat in stats:
            # Key is the polarity label: 'P' for positive, 'N' for negative.
            dic = {'%s' % ('P' if icon in pos_icons else 'N'): stat}
            write(out_path, 'a', '%s\n' % json.dumps(dic))