def get_len(file_):
    """Count the leading records of ``file_`` that pass the threshold cut-offs.

    Every line must hold exactly nine tab-separated columns (uid, user_name,
    score, post_id, forum_name, title, content, feature_len, feature); any
    other column count raises ``ValueError``.  Only ``score`` and ``title``
    influence the result.

    Records whose title contains no ``'#'`` are skipped without being counted.
    Scanning stops at the first counted record that trips a cut-off, and that
    record is still included in the returned count.  The cut-offs are read
    from the module-level names ``thre`` and ``thre2``:

    * a value greater than 1 caps the number of counted records;
    * any other value is a minimum score;
    * ``thre2 == 0`` disables the second cut-off.

    Args:
        file_: path of the tab-separated input file.

    Returns:
        The number of records counted before scanning stopped.

    Raises:
        ValueError: if a line does not have nine columns or its score is not
            a valid float.
    """
    count = 0
    # Honour the caller-supplied path and make sure the handle is closed.
    with open(file_) as records:
        for line in records:
            # Unpacking doubles as a column-count check on every line.
            (_uid, _user_name, score, _post_id, _forum_name, title, _content,
             _feature_len, _feature) = line.rstrip('\n').split('\t')
            # Parsed before the title filter, so a malformed score is reported
            # even on records that would have been skipped.
            score = float(score)
            if '#' not in title:
                continue
            count += 1
            if thre > 1:
                if count > thre:
                    break
            elif score < thre:
                break
            if thre2 != 0:
                if thre2 > 1:
                    if count > thre2:
                        break
                elif score < thre2:
                    break
    return count
def deal(li):
    """Print one tab-separated summary line for a group of post rows.

    Each row is indexed positionally: ``row[0]`` is the value echoed as the
    first output column, ``row[1]`` is the id that gets de-duplicated (called
    ``pid`` here), ``row[-2]`` is the title and ``row[-1]`` is the content.

    The printed line has four columns:

    1. ``row[0]`` of the last row in ``li``;
    2. the number of distinct ids;
    3. the number of distinct ids whose title satisfies ``gezi.is_thread``;
    4. the number of distinct ids that satisfy ``gezi.is_thread`` and whose
       content satisfies ``gezi.contains_pic``.

    Nothing is printed when ``li`` yields no rows.

    Args:
        li: iterable of rows (sequences of strings).
    """
    post_ids = set()
    thread_ids = set()
    pic_thread_ids = set()
    last_row = None
    for row in li:
        last_row = row
        pid = row[1]
        title = gezi.to_gbk(row[-2])
        content = gezi.to_gbk(row[-1])
        post_ids.add(pid)
        if gezi.is_thread(title):
            thread_ids.add(pid)
            if gezi.contains_pic(content):
                pic_thread_ids.add(pid)

    # An empty group has no key to report, so stay silent instead of failing.
    if last_row is None:
        return

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('\t'.join([
        last_row[0],
        str(len(post_ids)),
        str(len(thread_ids)),
        str(len(pic_thread_ids)),
    ]))
def deal(li):
    """Print how many non-thread rows each user has, keyed by the thread owner.

    Each row is indexed positionally: ``row[1]`` is a user id and ``row[2]``
    is a title.  A row whose title satisfies ``gezi.is_thread`` sets the
    thread owner ("louzhu") to that row's user id; if several rows qualify,
    the last one wins.  Every other row adds one to its user's count.

    If an owner was found, one line ``owner<TAB>uid<TAB>count`` is printed for
    every counted user other than the owner.  Nothing is printed when no row
    satisfies ``gezi.is_thread``.

    Args:
        li: iterable of rows (sequences of strings).
    """
    louzhu_uid = None
    reply_counts = {}
    for row in li:
        uid = row[1]
        if gezi.is_thread(gezi.to_gbk(row[2])):
            louzhu_uid = uid
        else:
            reply_counts[uid] = reply_counts.get(uid, 0) + 1

    if louzhu_uid is None:
        return

    for uid, count in reply_counts.items():
        # The owner's own non-thread rows are counted but never reported.
        if uid != louzhu_uid:
            # Single-argument parenthesised print works on Python 2 and 3.
            print('\t'.join([louzhu_uid, uid, str(count)]))
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   utf8togbk.py
#        \author   chenghuige
#          \date   2015-03-13 19:53:33.858613
#   \Description   Pass every line of the file named by argv[1] through
#                  gezi.to_gbk and write the result to stdout.
# ==============================================================================

import sys, os
import gezi

# The context manager guarantees the input file is closed, even on error.
with open(sys.argv[1]) as src:
    for line in src:
        # Each line keeps its own terminator, so write it verbatim; nothing is
        # appended after a final line that has no trailing newline.
        # NOTE(review): assumes gezi.to_gbk returns a string - confirm.
        sys.stdout.write(gezi.to_gbk(line))
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   op-filter.py
#        \author   chenghuige
#          \date   2015-03-02 17:15:06.828377
#   \Description   Copy to stdout the stdin lines whose last tab-separated
#                  column, after gezi.to_gbk, starts with a fixed prefix.
# ==============================================================================

import sys,os
sys.path.append('./')
import gezi

for line in sys.stdin:
    # Strip only the line terminator: a bare rstrip() would also eat trailing
    # tabs, so an empty last column would make [-1] pick the previous column.
    columns = line.rstrip('\r\n').split('\t')
    opname = gezi.to_gbk(columns[-1])
    if opname.startswith('Ìù°É'):
        # Matching lines are passed through unchanged, terminator included.
        sys.stdout.write(line)
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   cut-cn.py
#        \author   chenghuige
#          \date   2015-03-06 14:40:14.908346
#   \Description   For each stdin line, take the first tab-separated column,
#                  pass it through gezi.to_gbk, gezi.extract_chinese and
#                  gezi.to_cnvec, and print every resulting item on its own
#                  line.
# ==============================================================================

import sys, os
sys.path.append('./')
import gezi

sep = '\t'

for line in sys.stdin:
    # Only the text before the first separator is used.
    first_column = line.rstrip('\n').split(sep, 1)[0]
    word = gezi.to_gbk(first_column)
    chinese_only = gezi.extract_chinese(word)
    words = gezi.to_cnvec(chinese_only)
    # The loop reuses the name `word`, so it holds the last item afterwards.
    for word in words:
        print(word)
uid = l[0] set_.add(uid) for line in open(sys.argv[1]): uid, user_name, score, post_id, forum_name, title, content, feature_len, feature = line.rstrip( '\n').split('\t') if uid in set_: continue if content > 1000: content = content[:1000] content = content.replace('"', '') title = title.replace('"', '') val = [ monitor_type, str(count) + ' ' + score + ' ' + gezi.to_gbk(title), gezi.to_gbk(content), gezi.to_gbk(user_name), gezi.to_gbk(forum_name), uid ] score = float(score) if len(title.split('#')) < 2: continue count += 1 if thre > 1: if count > thre: break else: if score < thre: break if thre2 != 0: