def main():
    # Export microblogs that contain 1-5 emoticons into one raw corpus file per emoticon.
    con = db.connect()
    cur = con.cursor()

    emos = open('../data/emo_top100.txt', 'r').read().decode('utf8').split('\n')

    limit = int(sys.argv[1])
    pbar = progbar.start(limit)
    i = 0

    cur.execute("SELECT text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))

    fobjs = dict([(emo, open('../corpus_raw/%s.txt'%(emo), 'w')) for emo in emos])

    for res in cur:
        blog = res[0]
        text, blog_emos = blogger.extract(blog)
        n_emo = len(blog_emos)
        if n_emo > 0 and n_emo < 6:
            samples = blogger.prepare_sample(text, blog_emos)
            for e, t in samples:
                if e not in fobjs:
                    continue
                fobjs[e].write(t + '\n')
        i += 1
        pbar.update(i)
    pbar.finish()

    for fobj in fobjs.values():
        fobj.close()
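# A minimal usage sketch (not part of the original source): main() above reads the row
# limit from sys.argv[1], so the script is presumably run as `python <script>.py 1000000`,
# with module-level `import sys, db, blogger` and `from utils import progbar` assumed.
if __name__ == '__main__':
    main()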
def main():
    # Scan microblogs and record, for every emoticon, the mids of the blogs it occurs in.
    con = db.connect()
    cur = con.cursor()

    emo_mids = {}
    limit = 25000000
    pbar = progbar.start(limit)
    i = 0

    cur.execute("SELECT mid, text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))
    for mid, text in cur:
        text, emos = blogger.extract(text)
        if len(emos) > 0 and len(emos) < 6:
            samples = blogger.prepare_sample(text, emos)
            for e, t in samples:
                if emo_mids.has_key(e):
                    emo_mids[e].append(mid)
                else:
                    emo_mids[e] = [mid, ]
        i += 1
        pbar.update(i)
    pbar.finish()

    cPickle.dump(emo_mids, open('../output/emo_mids.pkl', 'w'))
def prepare():
    '''
    prepare data for LSTM
    data are structured as [[codes, codes, ...], ...], ordered by eid
    '''
    import blogger
    from utils import progbar

    coder = cPickle.load(open(PKL_TFCODER, 'r'))

    datalist = []
    pbar = progbar.start(N_EMO)
    for eid in range(N_EMO):
        lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
        data = []
        for line in lines:
            text, emo = blogger.extract(line)
            codes = coder.code(text)
            data.append(codes)
        datalist.append(data)
        pbar.update(eid + 1)
    pbar.finish()

    cPickle.dump(datalist, open(PKL_TFDATA, 'w'))
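# A hedged sketch (not in the original source) of inspecting the pickle written by
# prepare() above; it assumes the same PKL_TFDATA constant and relies only on the
# documented structure [[codes, codes, ...], ...] indexed by eid.
def peek_tfdata():
    datalist = cPickle.load(open(PKL_TFDATA, 'r'))
    for eid, data in enumerate(datalist):
        print 'EID %d: %d samples'%(eid, len(data))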
def extract(ifname, ofname):
    # count lines first so the progress bar knows the total
    n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
    if n_lines == 0:
        return

    pbar = progbar.start(n_lines)
    l = 0

    ofobj = open(ofname, 'w')
    with open(ifname, 'r') as ifobj:
        for line in ifobj:
            blog = json.loads(line)
            n_text = len(blog['text'])
            for i in range(1, n_text):
                res = blogger.extract(blog['text'][i])
                if res is not None:
                    datum = {}
                    datum['text'] = res[0]
                    datum['emo'] = res[1]

                    # collect the chain of texts this comment replies to
                    above = []
                    re_id = i
                    while blog['re'][re_id] is not None:
                        re_id = blog['re'][re_id]
                        above.append(blog['text'][re_id])
                    datum['above'] = above

                    # collect the chain of later comments replying to this one
                    follow = []
                    last_id = i
                    for j in range(i + 1, n_text):
                        if blog['re'][j] == last_id:
                            follow.append(blog['text'][j])
                            last_id = j
                    datum['follow'] = follow

                    ofobj.write(json.dumps(datum) + '\n')
            l += 1
            pbar.update(l)
    pbar.finish()
    ofobj.close()
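# A hedged sketch (not in the original source) of reading the JSON-lines file written by
# extract() above; every line is a dict with 'text', 'emo', 'above' (reply chain) and
# 'follow' (later replies in the chain).
def read_extracted(ofname):
    with open(ofname, 'r') as fobj:
        for line in fobj:
            datum = json.loads(line)
            yield datum['text'], datum['emo'], datum['above'], datum['follow']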
def main():
    import db
    import blogger
    from utils import progbar

    # map every emoticon to the index of the group it belongs to
    n_label = len(emos_list)
    emo_map = {}
    for label, emos in enumerate(emos_list):
        for emo in emos:
            emo_map[emo] = label

    odname = 'data/dataset_emo/'
    if not os.path.isdir(odname):
        os.mkdir(odname)
    odname += 'raw/'
    if not os.path.isdir(odname):
        os.mkdir(odname)

    fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
    counts = [0 for i in range(n_label)]

    N = 70000
    all_N = N * n_label

    con = db.connect()
    cur = con.cursor()

    pbar = progbar.start(all_N)
    l = 0

    cur.execute('SELECT text FROM microblogs')
    for t in cur:
        res = blogger.extract(t[0])
        if res is None or not emo_map.has_key(res[1]):
            continue
        label = emo_map[res[1]]
        if counts[label] < N:
            counts[label] += 1
            fobjs[label].write(res[0] + '\n')
            l += 1
            pbar.update(l)
            if counts[label] == N and sum(counts) == all_N:
                break
    pbar.finish()

    cur.close()
    con.close()
    for fobj in fobjs:
        fobj.close()
def extract(dname_dataset, idx):
    idname = 'data/blogs/mtr/'
    dir_dataset = 'data/blogs/%s/'%(dname_dataset)
    odname = dir_dataset + 'tea/'
    init_folders([dir_dataset, odname])

    ifname = idname + '%d.txt'%(idx)
    ofname = odname + '%d.txt'%(idx)

    n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
    if n_lines == 0:
        print >> sys.stderr, '%s is empty'%(ifname)
        return

    pbar = progbar.start(n_lines)
    l = 0

    ofobj = open(ofname, 'w')
    with open(ifname, 'r') as ifobj:
        for line in ifobj:
            blog = json.loads(line)
            n_text = len(blog['text'])
            for i in range(1, n_text):
                res = blogger.extract(blog['text'][i])
                if res is not None:
                    datum = {}
                    datum['text'] = res[0]
                    datum['emo'] = res[1]

                    # above_s: the chain of texts this one replies to
                    above_s = []
                    re_id = i
                    while blog['re'][re_id] is not None:
                        re_id = blog['re'][re_id]
                        above_s.append(blog['text'][re_id])
                    datum['above_s'] = above_s

                    # above_t: all texts posted before this one in the thread
                    above_t = []
                    re_id = i - 1
                    while re_id >= 0:
                        above_t.append(blog['text'][re_id])
                        re_id -= 1
                    datum['above_t'] = above_t

                    ofobj.write(json.dumps(datum) + '\n')
            l += 1
            pbar.update(l)
    pbar.finish()
    ofobj.close()
def sample_seqs():
    import blogger
    import zhtokenizer

    lines = open('data/blogs1000.txt', 'r').readlines()
    for line in lines:
        l = line.strip()
        res = blogger.extract(l, check_emo = False)
        if res is None:
            continue
        t = res[0]
        yield zhtokenizer.tokenize(t)
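# A minimal usage sketch: sample_seqs() is a generator of token lists, so it can be
# consumed directly, e.g. to build a token frequency table (this helper is illustrative,
# not part of the original source).
def token_freq():
    freq = {}
    for tokens in sample_seqs():
        for tok in tokens:
            freq[tok] = freq.get(tok, 0) + 1
    return freq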
def main():
    import db
    import datica
    import blogger
    from utils import progbar

    con = db.connect()
    cur = con.cursor()

    maxN = 70000
    odname = 'data/dataset_emo/'
    if not os.path.isdir(odname):
        os.mkdir(odname)

    config = datica.load_config('data/config4.txt')
    for label, eids in enumerate(config):
        for eid in eids:
            print >> sys.stderr, 'loading LABEL %d - EID %d'%(label, eid)
            ifname = 'data/eid_mids/%d.txt'%(eid)
            ofname = odname + '%d.txt'%(eid)
            ofobj = open(ofname, 'w')

            mids = open(ifname, 'r').read().split('\n')
            if len(mids) > maxN:
                mids = mids[:maxN]

            pbar = progbar.start(len(mids))
            l = 0
            for mid in mids:
                t = load_by_mid(cur, mid)
                res = blogger.extract(t)
                if res is not None:
                    text, emo = res
                    ofobj.write(text + '\n')
                l += 1
                pbar.update(l)
            pbar.finish()
            ofobj.close()

    cur.close()
    con.close()
def emo_rate(blogs):
    # fraction of comments that carry an emoticon, plus text-length and emoticon statistics
    count = 0
    total = 0
    tlen = []
    emolist = []
    for blog in blogs:
        total += blog['comments_count']
        for comm in blog['comments']:
            res = blogger.extract(comm['text'])
            if res is None or len(res[0]) < 2:
                continue
            count += 1
            tlen.append(len(res[0]))
            emolist.append(res[1])
    emohist = tohist(emolist)
    return 100. * count / total, np.mean(tlen), emohist
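# An illustrative sketch of the input emo_rate() expects: a list of blogs, each with its
# comments_count and comment texts (the field names follow the code above; the values
# here are made up).
sample_blogs = [
    {
        'comments_count': 2,
        'comments': [
            {'text': u'[text of a comment that contains an emoticon]'},
            {'text': u'[text of a comment without one]'},
        ],
    },
]
# rate, mean_len, emohist = emo_rate(sample_blogs)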
def prepare(eids = range(N_EMO)):
    '''
    tokenize and unigramize the text under data/dataset/text
    '''
    import blogger
    import zhtokenizer
    from utils import progbar, zhprocessor

    if not os.path.isdir(DIR_UNIGRAM):
        os.mkdir(DIR_UNIGRAM)
    if not os.path.isdir(DIR_TOKEN):
        os.mkdir(DIR_TOKEN)

    for eid in eids:
        lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
        unigram_list = []
        token_list = []

        print 'preparing data for EID-%d'%(eid)
        pbar = progbar.start(len(lines))
        for i, line in enumerate(lines):
            text, emo = blogger.extract(line)
            text = zhprocessor.simplify(text)
            unigrams = zhtokenizer.unigramize(text)
            tokens = zhtokenizer.tokenize(text)

            unigram_list.append(unigrams)
            token_list.append(tokens)
            pbar.update(i + 1)
        pbar.finish()

        cPickle.dump(unigram_list, open(DIR_UNIGRAM + '%d.pkl'%(eid), 'w'))
        cPickle.dump(token_list, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))
def collect_emo_mids():
    '''
    collect {emo: [mid, mid, ..], } from mysql and export to PKL_EMO_MIDS
    '''
    import db
    import blogger
    from utils import progbar

    print 'connecting to MySQL..'
    con = db.connect()
    print 'start..'
    cur = con.cursor()
    cur.execute('SELECT mid, text FROM microblogs WHERE comments_count > 1')

    pbar = progbar.start(TOTAL_BLOGS)
    loop = 0

    emo_mids = {}
    for mid, text in cur:
        res = blogger.extract(text)
        if res is None:
            continue
        text, emo = res
        if emo_mids.has_key(emo):
            emo_mids[emo].append(mid)
        else:
            emo_mids[emo] = [mid, ]
        loop += 1
        pbar.update(loop)
    pbar.finish()

    cPickle.dump(emo_mids, open(PKL_EMO_MIDS, 'w'))
    cur.close()
    con.close()
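# A hedged sketch (not in the original source) of loading the mapping dumped by
# collect_emo_mids() above; it assumes the same PKL_EMO_MIDS constant and that keys are
# emoticons mapped to lists of mids.
def load_emo_mids():
    emo_mids = cPickle.load(open(PKL_EMO_MIDS, 'r'))
    print '%d distinct emoticons'%(len(emo_mids))
    return emo_mids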
def prepare():
    '''
    load data/blogs/blogs_400000.txt and keep the blogs for which blogger.extract
    succeeds; those which pass are saved into data/blogs/blogs_filtered.txt
    '''
    import blogger
    from utils import progbar

    blogs = []
    lines = open(FNAME_BLOGS_RAW, 'r').readlines()
    pbar = progbar.start(len(lines))
    for i, l in enumerate(lines):
        # fields are tab-separated: uid, mid, text, comments_count (empty fields are skipped)
        parts = l[:-1].decode('utf8').split('\t')
        params = []
        for part in parts:
            if not part == '':
                params.append(part)

        text = params[2]
        res = blogger.extract(text)
        if res is not None:
            uid = params[0]
            mid = params[1]
            comments_count = int(params[3])
            blogs.append(BlogInfo(uid, mid, text, comments_count))
        pbar.update(i + 1)
    pbar.finish()

    fobj = open(FNAME_BLOGS_FILTERED, 'w')
    for blog in blogs:
        fobj.write('%s\t%s\t%s\t%d\n'%(blog.uid, blog.mid, blog.text, blog.comments_count))
    fobj.close()
def main():
    import db
    import blogger
    from utils import progbar

    # two symbol-emoticon classes: smiling faces like :) ;) and sad faces like :( ;(
    p1 = re.compile('[:;]\)')
    p2 = re.compile('[:;]\(')
    patterns = [p1, p2]
    n_label = 2
    N = 70000

    odname = 'data/dataset_sym/'
    if not os.path.isdir(odname):
        os.mkdir(odname)
    odname += 'raw/'
    if not os.path.isdir(odname):
        os.mkdir(odname)

    fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
    counts = [0 for i in range(n_label)]
    all_N = N * n_label

    con = db.connect()
    cur = con.cursor()

    st_time = time.time()
    print >> sys.stderr, 'executing... ',
    cur.execute('SELECT text FROM microblogs')
    print >> sys.stderr, time.time() - st_time

    pbar = progbar.start(all_N)
    l = 0
    for t in cur:
        res = blogger.extract(t[0], check_emo = False)
        if res is None:
            continue
        t = res[0]

        # find the first pattern that matches (re.search returns None when there is no match)
        pid = None
        p = None
        for i, pi in enumerate(patterns):
            if pi.search(t) is not None:
                p = pi
                pid = i
                break
        if p is None:
            continue

        text = p.sub('', t)
        if counts[pid] < N:
            counts[pid] += 1
            fobjs[pid].write(text + '\n')
            l += 1
            pbar.update(l)
            if counts[pid] == N and sum(counts) == all_N:
                break
    pbar.finish()

    cur.close()
    con.close()

    print counts
    for fobj in fobjs:
        fobj.close()