Code example #1
File: prepare_data.py Project: liangxh/emozh
def main():
	con = db.connect()
	cur = con.cursor()

	emos = open('../data/emo_top100.txt', 'r').read().decode('utf8').split('\n')

	limit = int(sys.argv[1])
	pbar = progbar.start(limit)
	i = 0

	cur.execute("SELECT text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))

	fobjs = dict([(emo, open('../corpus_raw/%s.txt'%(emo), 'w')) for emo in emos])

	for res in cur:
		blog = res[0]
		# rename to avoid shadowing the top-100 emoticon list loaded above
		text, blog_emos = blogger.extract(blog)

		n_emo = len(blog_emos)
		if 0 < n_emo < 6:
			samples = blogger.prepare_sample(text, blog_emos)
			for e, t in samples:
				if e not in fobjs:
					continue

				fobjs[e].write(t + '\n')
		
		i += 1
		pbar.update(i)
	pbar.finish()

	for fobj in fobjs.values():
		fobj.close()
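All of the listings on this page hinge on blogger.extract, which is not shown here. The sketch below is only a guess at that interface, written to make the examples easier to follow: the emoticon pattern, the check_emo flag and the return shape are assumptions, and the real function differs between the liangxh/emozh, liangxh/weibo and liangxh/idu projects (it may return a list of emoticons or a single one).

# Illustrative sketch of the assumed blogger.extract interface -- not the project's code.
import re

EMO_PATTERN = re.compile(u'\[([^\[\]]+)\]')  # assumed Weibo-style emoticon markup, e.g. [xx]

def extract(text, check_emo=True):
	emos = EMO_PATTERN.findall(text)             # names of the emoticons found in the text
	if check_emo and len(emos) == 0:
		return None                              # no emoticon: callers discard the blog
	clean = EMO_PATTERN.sub(u'', text).strip()   # the text with the emoticon markup stripped
	return clean, emos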
Code example #2
File: statica.py Project: liangxh/emozh
def main():
	con = db.connect()
	cur = con.cursor()
	
	emo_mids = {}

	limit = 25000000
	pbar = progbar.start(limit)
	i = 0

	cur.execute("SELECT mid, text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))
	for mid, text in cur:
		text, emos = blogger.extract(text)
		
		if len(emos) > 0 and len(emos) < 6:
			samples = blogger.prepare_sample(text, emos)
			for e, t in samples:
				if e in emo_mids:
					emo_mids[e].append(mid)
				else:
					emo_mids[e] = [mid]

		i += 1
		pbar.update(i)

	pbar.finish()

	cPickle.dump(emo_mids, open('../output/emo_mids.pkl', 'w'))
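For reference, the mapping written by the dump call above can be read back the same way; this is only an assumed usage example, with the print line purely illustrative.

import cPickle

# reload the {emo: [mid, ...]} mapping written by main() above
emo_mids = cPickle.load(open('../output/emo_mids.pkl', 'rb'))
print '%d emoticons collected' % len(emo_mids)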
Code example #3
File: unidatica.py Project: liangxh/weibo
def prepare():
	'''
	prepare data for the LSTM
	the output is structured as [[codes, codes, ...], ...], with the outer list ordered by eid
	'''

	import blogger	
	from utils import progbar

	coder = cPickle.load(open(PKL_TFCODER, 'r'))

	datalist = []

	pbar = progbar.start(N_EMO)
	for eid in range(N_EMO):
		lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
		data = []
		for line in lines:
			text, emo = blogger.extract(line)
			codes = coder.code(text)
			data.append(codes)
		datalist.append(data)
		pbar.update(eid + 1)

	pbar.finish()

	cPickle.dump(datalist, open(PKL_TFDATA, 'w'))
Code example #4
File: cdataextractor.py Project: liangxh/idu
def extract(ifname, ofname):
	n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
	if n_lines == 0:
		return

	pbar = progbar.start(n_lines)
	l = 0

	datalist = []

	ofobj = open(ofname, 'w')
	with open(ifname, 'r') as ifobj:
		for line in ifobj:
			blog = json.loads(line)
			n_text = len(blog['text'])

			for i in range(1, n_text):
				res = blogger.extract(blog['text'][i])
				if res is not None:
					datum = {}
					datum['text'] = res[0]
					datum['emo'] = res[1]
					
					above = []			
					re_id = i
					while blog['re'][re_id] is not None:
						re_id = blog['re'][re_id]
						above.append(blog['text'][re_id])
					datum['above'] = above

					follow = []
					last_id = i					
					for j in range(i + 1, n_text):
						if blog['re'][j] == last_id:
							follow.append(blog['text'][j])
							last_id = j

					datum['follow'] = follow

					ofobj.write(json.dumps(datum) + '\n')
	
					#for k in range(n_text):
					#	print '%d. (-> %s) %s'%(k, blog['re'][k], blog['text'][k])
					#
					#print 'above:'
					#print '\n'.join(above)
					#print 'follow:'
					#print '\n'.join(follow)
					#print 

				#if i > 100:
				#	return

			l += 1
			pbar.update(l)
		pbar.finish()
	
	ofobj.close()
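Each line that extract() writes to ofname is one JSON object; its shape, reconstructed from the code above with placeholder values, is roughly:

# shape of one output line of extract(); the values here are placeholders
datum = {
	'text': u'...',      # the comment's text with the emoticon removed
	'emo': u'...',       # the emoticon extracted from that comment
	'above': [u'...'],   # texts on the reply chain above the comment (followed via blog['re'])
	'follow': [u'...'],  # texts of later comments that reply down from it
}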
Code example #5
File: textscanner.py Project: liangxh/idu
def main():
	import db
	import blogger
	from utils import progbar

	n_label = len(emos_list)
	emo_map = {}
	for label, emos in enumerate(emos_list):
		for emo in emos:
			emo_map[emo] = label

	odname = 'data/dataset_emo/'
	if not os.path.isdir(odname):
		os.mkdir(odname)

	odname += 'raw/'
	if not os.path.isdir(odname):
		os.mkdir(odname)


	fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
	counts = [0 for i in range(n_label)]
	N = 70000
	all_N = N * n_label

	con = db.connect()
	cur = con.cursor()
	
	pbar = progbar.start(all_N)
	l = 0

	cur.execute('SELECT text FROM microblogs')
	for t in cur:
		res = blogger.extract(t[0])
		if res is None or res[1] not in emo_map:
			continue

		label = emo_map[res[1]]
		if counts[label] < N:
			counts[label] += 1
			fobjs[label].write(res[0] + '\n')

			l += 1
			pbar.update(l)

			if counts[label] == N and sum(counts) == all_N:
				break

	pbar.finish()

	cur.close()
	con.close()

	for fobj in fobjs:
		fobj.close()
Code example #6
File: cdextractor.py Project: liangxh/idu
def extract(dname_dataset, idx):
	idname = 'data/blogs/mtr/'
	dir_dataset = 'data/blogs/%s/'%(dname_dataset)
	odname = dir_dataset + 'tea/'

	init_folders([dir_dataset, odname])

	ifname = idname + '%d.txt'%(idx)
	ofname = odname + '%d.txt'%(idx)
	
	n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
	if n_lines == 0:
		print >> sys.stderr, '%s is empty'%(ifname)
		return

	pbar = progbar.start(n_lines)
	l = 0

	datalist = []

	ofobj = open(ofname, 'w')
	with open(ifname, 'r') as ifobj:
		for line in ifobj:
			blog = json.loads(line)
			n_text = len(blog['text'])

			for i in range(1, n_text):
				res = blogger.extract(blog['text'][i])
				if res is not None:
					datum = {}
					datum['text'] = res[0]
					datum['emo'] = res[1]
					
					above_s = []			
					re_id = i
					while blog['re'][re_id] is not None:
						re_id = blog['re'][re_id]
						above_s.append(blog['text'][re_id])
					datum['above_s'] = above_s

					above_t = []
					re_id = i - 1
					while re_id >= 0:
						above_t.append(blog['text'][re_id])
						re_id -= 1
					datum['above_t'] = above_t

					ofobj.write(json.dumps(datum) + '\n')

			l += 1
			pbar.update(l)
		pbar.finish()

	ofobj.close()
Code example #7
File: gensimtest.py Project: liangxh/idu
def sample_seqs():
	import blogger
	import zhtokenizer
	
	lines = open('data/blogs1000.txt', 'r').readlines()
	for line in lines:
		l = line.strip()
		res = blogger.extract(l, check_emo = False)
		if res is None:
			continue
		t = res[0]
		yield zhtokenizer.tokenize(t)
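Given the file name gensimtest.py, a likely consumer of this generator is gensim's Word2Vec; the snippet below is an assumed usage, not taken from the project (parameter names follow pre-4.0 gensim, and the hyperparameters and output path are made up). The generator is materialised into a list first because Word2Vec iterates over the corpus more than once.

from gensim.models import Word2Vec

# Word2Vec scans the corpus several times, so materialise the generator first
sentences = list(sample_seqs())
model = Word2Vec(sentences, size=100, min_count=5)  # illustrative hyperparameters
model.save('data/w2v_blogs1000.model')              # hypothetical output path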
Code example #8
File: midselector.py Project: liangxh/idu
def main():
	import db
	import datica
	import blogger
	from utils import progbar

	con = db.connect()
	cur = con.cursor()
	maxN = 70000

	odname = 'data/dataset_emo/'
	if not os.path.isdir(odname):
		os.mkdir(odname)

	config = datica.load_config('data/config4.txt')
	for label, eids in enumerate(config):
		for eid in eids:
			print >> sys.stderr, 'loading LABEL %d - EID %d'%(label, eid)			

			ifname = 'data/eid_mids/%d.txt'%(eid)

			ofname = odname + '%d.txt'%(eid)
			ofobj = open(ofname, 'w')

			mids = open(ifname, 'r').read().split('\n')
			if len(mids) > maxN:
				mids = mids[:maxN]

			pbar = progbar.start(len(mids))
			l = 0
			for mid in mids:
				t = load_by_mid(cur, mid)
				res = blogger.extract(t)
				if res is not None:
					text, emo = res
					ofobj.write(text + '\n')
			
				l += 1
				pbar.update(l)
			pbar.finish()
			ofobj.close()

	cur.close()
	con.close()
Code example #9
File: weiboanalyser.py Project: liangxh/idu
	def emo_rate(blogs):
		count = 0
		total = 0

		tlen = []
		emolist = []

		for blog in blogs:
			total += blog['comments_count']
			for comm in blog['comments']:
				res = blogger.extract(comm['text'])
				if res is None or len(res[0]) < 2:
					continue
				count += 1
				tlen.append(len(res[0]))
				emolist.append(res[1])
		emohist = tohist(emolist)

		return 100. * count / total, np.mean(tlen), emohist
Code example #10
File: datica.py Project: liangxh/idu
def prepare(eids = range(N_EMO)):
	'''
	tokenize and unigramize the text under data/dataset/text
	'''

	import blogger
	import zhtokenizer
	from utils import progbar, zhprocessor
	
	if not os.path.isdir(DIR_UNIGRAM):
		os.mkdir(DIR_UNIGRAM)

	if not os.path.isdir(DIR_TOKEN):
		os.mkdir(DIR_TOKEN)

	unigram_list = []
	token_list = []

	for eid in eids:
		lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
		
		unigram_list = []
		token_list = []
		
		print 'preparing data for EID-%d'%(eid)
		pbar = progbar.start(len(lines))
	
		for i, line in enumerate(lines):
			text, emo = blogger.extract(line)
			text = zhprocessor.simplify(text)

			unigrams = zhtokenizer.unigramize(text)	
			tokens = zhtokenizer.tokenize(text)
			
			unigram_list.append(unigrams)
			token_list.append(tokens)
		
			pbar.update(i + 1)
		pbar.finish()

		cPickle.dump(unigram_list, open(DIR_UNIGRAM + '%d.pkl'%(eid), 'w'))
		cPickle.dump(token_list, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))
Code example #11
File: statica_0.py Project: liangxh/idu
def collect_emo_mids():
	'''
	collect {emo: [mid, mid, ...]} from MySQL and export it to PKL_EMO_MIDS
	'''

	import db
	from utils import progbar

	print 'connecting to MySQL..'
	con = db.connect()

	print 'start..'
	cur = con.cursor()
	cur.execute('SELECT mid, text FROM microblogs WHERE comments_count > 1')
	
	pbar = progbar.start(TOTAL_BLOGS)	
	loop = 0
	emo_mids = {}
	for mid, text in cur:
		res = blogger.extract(text)
		if res is None:
			continue

		text, emo = res
		if emo in emo_mids:
			emo_mids[emo].append(mid)
		else:
			emo_mids[emo] = [mid]
	
		loop += 1
		pbar.update(loop)

	pbar.finish()

	cPickle.dump(emo_mids, open(PKL_EMO_MIDS, 'w'))
	
	cur.close()
	con.close()
Code example #12
File: commdatica.py Project: liangxh/weibo
def prepare():
	'''
	load data/blogs/blogs_400000.txt, filter the blogs with blogger.extract,
	and save those that pass to data/blogs/blogs_filtered.txt
	'''

	import blogger
	from utils import progbar
	blogs = []

	lines = open(FNAME_BLOGS_RAW, 'r').readlines()
	
	pbar = progbar.start(len(lines))
	for i, l in enumerate(lines):
		parts = l[:-1].decode('utf8').split('\t')
		params = []
		for part in parts:
			if not part == '':
				params.append(part)

		text = params[2]
		res = blogger.extract(text)
		
		if res is not None:
			uid = params[0]
			mid = params[1]
			comments_count = int(params[3])

			blogs.append(BlogInfo(uid, mid, text, comments_count))
		
		pbar.update(i + 1)
	pbar.finish()

	fobj = open(FNAME_BLOGS_FILTERED, 'w')
	for blog in blogs:
		fobj.write('%s\t%s\t%s\t%d\n'%(blog.uid, blog.mid, blog.text, blog.comments_count))
	fobj.close()
Code example #13
File: symscanner.py Project: liangxh/idu
def main():
	import db
	import blogger
	from utils import progbar

	p1 = re.compile('[:;]\)')
	p2 = re.compile('[:;]\(')
	patterns = [p1, p2]
	n_label = 2
	N = 70000
	
	odname = 'data/dataset_sym/'
	if not os.path.isdir(odname):
		os.mkdir(odname)

	odname += 'raw/'
	if not os.path.isdir(odname):
		os.mkdir(odname)


	fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
	counts = [0 for i in range(n_label)]
	all_N = N * n_label

	con = db.connect()
	cur = con.cursor()

	st_time = time.time()
	print >> sys.stderr, 'executing... ',
	cur.execute('SELECT text FROM microblogs')
	print >> sys.stderr, time.time() - st_time
	
	pbar = progbar.start(all_N)
	l = 0

	for t in cur:
		res = blogger.extract(t[0], check_emo = False)
		if res is None:
			continue
	
		t = res[0]		
		pid = None
		p = None
		for i, pi in enumerate(patterns):
			# re.search returns a match object or None, not an index
			if pi.search(t):
				p = pi
				pid = i
				break
		
		if p is None:
			continue

		
		text = p.sub('', t)
		if counts[pid] < N:
			counts[pid] += 1
			fobjs[pid].write(text + '\n')

			l += 1
			pbar.update(l)

			if counts[pid] == N and sum(counts) == all_N:
				break

	pbar.finish()

	cur.close()
	con.close()

	print counts

	for fobj in fobjs:
		fobj.close()