def sample(): blogs = commdatica.load('output/umtc.txt') has_emo = [] no_emo = [] target = 1000 i = 0 pbar = progbar.start(target) for blog in blogs: if blogger.is_valid(blog.text): if not len(has_emo) >= 500: has_emo.append(blog) i += 1 elif blogger.is_valid(blog.text, check_emo = False): if not len(no_emo) >= 500: no_emo.append(blog) i += 1 pbar.update(i) pbar.finish() print 'writing to umtc_yes_emo.txt ....', open('output/umtc_yes_emo.txt', 'w').write('\n'.join([repr(blog) for blog in has_emo])) print 'OK' print 'writing to umtc_no_emo.txt ....', open('output/umtc_no_emo.txt', 'w').write('\n'.join([repr(blog) for blog in no_emo])) print 'OK' bs = commdatica.load('output/umtc_yes_emo.txt') print len(bs)
def sample(): blogs = commdatica.load("output/umtc.txt") has_emo = [] no_emo = [] target = 1000 i = 0 pbar = progbar.start(target) for blog in blogs: if blogger.is_valid(blog.text): if not len(has_emo) >= 500: has_emo.append(blog) i += 1 elif blogger.is_valid(blog.text, check_emo=False): if not len(no_emo) >= 500: no_emo.append(blog) i += 1 pbar.update(i) pbar.finish() print "writing to umtc_yes_emo.txt ....", open("output/umtc_yes_emo.txt", "w").write("\n".join([repr(blog) for blog in has_emo])) print "OK" print "writing to umtc_no_emo.txt ....", open("output/umtc_no_emo.txt", "w").write("\n".join([repr(blog) for blog in no_emo])) print "OK" bs = commdatica.load("output/umtc_yes_emo.txt") print len(bs)
def select():
    '''
    Select microblogs per user from the database and dump them to
    output/umtc.txt.

    Reads (u, n, m, v) tuples from output/unmv.pkl and keeps those with
    m <= 50 and v <= 100. For each kept tuple it queries up to n microblogs
    of user u whose comments_count lies in [5, m + v], keeps the rows whose
    text passes blogger.is_valid(text, check_emo=False), and retains the
    100 rows with the highest comments_count. Collection stops after the
    first user that brings the total to 400000 rows or more, so the result
    may exceed 400000 slightly. Each row is written as
    repr(BlogInfo(u, mid, text, comments_count)) on its own line.
    '''
    unmv = pkl_load('output/unmv.pkl')

    import db
    con = db.connect()
    cur = con.cursor()
    # NOTE(review): neither the cursor nor the connection is closed here;
    # confirm whether the db wrapper needs an explicit close.

    thr_min = 5           # lower bound on comments_count
    per_user_cap = 100    # rows kept per user after sorting
    total_cap = 400000    # stop collecting once this many rows are gathered

    # t for threshold_max: the upper bound on comments_count, taken as m + v
    unt = [(u, n, m + v) for u, n, m, v in unmv if m <= 50 and v <= 100]

    umtc = []
    for u, n, thr_max in unt:
        # NOTE(review): the query is built by string interpolation; the
        # values come from the local pickle, not from user input.
        cur.execute('select mid, text, comments_count from microblogs where user_id = %s and comments_count >= %d and comments_count <= %d limit %d'%(u, thr_min, thr_max, n))
        rows = [(u, m, t, c) for m, t, c in cur
                if blogger.is_valid(t, check_emo = False)]
        # Most-commented first; the sort is stable, so ties keep query order.
        rows.sort(key = lambda k: -k[3])
        umtc.extend(rows[:per_user_cap])
        if len(umtc) >= total_cap:
            break

    # 'with' closes the file even if a write fails part-way through.
    with open('output/umtc.txt', 'w') as fobj:
        for u, m, t, c in umtc:
            fobj.write(repr(BlogInfo(u, m, t, c)) + '\n')
def sample(eid):
    '''
    Sample texts from MySQL into data/dataset/text/EID.txt and
    data/dataset/muid/EID.txt.

    Users are visited in decreasing order of how many mids they have. For
    each (uid, mid) pair the text is fetched from the microblogs table and
    kept if blogger.is_valid(text) accepts it, up to 100 texts per user.
    Sampling stops once at least 4000 texts are collected and the result is
    cut to exactly 4000; fewer are written if the data runs out first.

    The text file holds one text per line. The muid file holds the matching
    "mid uid" pairs, one per line, in the same order.

    eid -- integer id used to load the mid/uid pairs and to name both
           output files (formatted with %d).
    '''
    import blogger
    import db

    con = db.connect()
    cur = con.cursor()

    mid_uid = load_mid_uid(eid)
    umids = muid2umids(mid_uid)
    # Users with the most mids come first.
    uid_mids = sorted(umids.items(), key = lambda k: -len(k[1]))

    target = 4000        # number of texts wanted overall
    per_user_cap = 100   # at most this many texts from a single user

    n_text = 0
    texts = []
    str_mid_uid = []
    for uid, mids in uid_mids:
        c = 0  # texts accepted for this user
        for mid in mids:
            cur.execute('SELECT text FROM microblogs WHERE user_id=%s AND mid=%s LIMIT 1'%(uid, mid))
            row = cur.fetchone()
            if row is None:
                # No such microblog in the table; skip it instead of
                # failing on None[0].
                continue
            text = row[0]
            if not blogger.is_valid(text):
                continue
            texts.append(text)
            str_mid_uid.append('%s %s'%(mid, uid))
            c += 1
            if c == per_user_cap:
                break
        n_text += c
        if n_text >= target:
            break

    # The last user may push the count past the target, so trim both lists
    # to keep them aligned and at most `target` long.
    texts = texts[:target]
    str_mid_uid = str_mid_uid[:target]

    DIR_DATASET = 'data/dataset/'
    DIR_TEXT = DIR_DATASET + 'text/'
    DIR_MUIDS = DIR_DATASET + 'muid/'

    # 'with' closes each file even if the write fails.
    with open(DIR_TEXT + '%d.txt'%(eid), 'w') as fobj:
        fobj.write('\n'.join(texts))
    with open(DIR_MUIDS + '%d.txt'%(eid), 'w') as fobj:
        fobj.write('\n'.join(str_mid_uid))
def main(): blogs = commdatica.load('output/umtc.txt') print '%d in total'%(len(blogs)) pbar = progbar.start(len(blogs)) c = 0 for i, blog in enumerate(blogs): if blogger.is_valid(blog.text, check_emo = False): c += 1 pbar.update(i + 1) pbar.finish() print '%.2f%%'%(100. * c / len(blogs))