Example #1
	def prepare_PMI(self, texts, tokens_valid, thr):
		n_tokens = len(tokens_valid)
		n_samples = len(texts)

		p_margin = np.zeros(n_tokens)
		p = np.zeros((n_tokens, n_tokens))
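		# p_margin[t]: number of texts containing token t
		# p[t1][t2] (with t1 < t2): number of texts containing both t1 and t2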
		
		pbar = progbar.start(n_samples)
		l = 0

		for tokens in texts:
			tids = [tokens_valid[token] for token in set(tokens) if tokens_valid.has_key(token)]

			for tid in tids:
				p_margin[tid] += 1

			for i in range(len(tids) - 1):
				t1 = tids[i]
				for j in range(i + 1, len(tids)):
					t2 = tids[j]
					if t1 < t2:
						p[t1][t2] += 1
					else:
						p[t2][t1] += 1
			
			l += 1
			pbar.update(l)
		pbar.finish()
		
		pmi_list = []
		#values = []
	
		n = (n_tokens - 1) * n_tokens / 2
		pbar = progbar.start(n)
		l = 0

		vs = []
		for i in range(n_tokens - 1):
			if p_margin[i] == 0.:
				print i
				
			for j in range(i + 1, n_tokens):
				v = p[i][j] / (p_margin[i] * p_margin[j])
				vs.append(v)

				if v > thr:
					pmi_list.append(((i, j), v))
				
				#values.append(v)
				l += 1
				pbar.update(l)
		pbar.finish()

		print 'sim_value_range: [%f, %f]'%(np.min(vs), np.max(vs))

		#cPickle.dump(values, open('output/pmi_values.pkl', 'w'))
		return pmi_list
Example #2
def export_vote(thr_rate, ofname):
	n_batch = 90

	yhists = []
	pbar = progbar.start(n_batch)
	
	for batch_id in range(n_batch):
		fname = 'data/simrecord_90_%d.pkl'%(batch_id)
		records = cPickle.load(open(fname, 'r'))

		
		for y, x_len, record in records:
			thr = x_len * thr_rate
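			# keep the labels whose score d falls within the threshold, then build a per-label vote histogram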
			
			ys = [yi for yi, d in record if d <= thr]
			yhist = {}
			for yi in ys:
				if yhist.has_key(yi):
					yhist[yi] += 1
				else:
					yhist[yi] = 1.

			yhist = sorted(yhist.items(), key = lambda k: -k[1])
			yhists.append(yhist)
	
		pbar.update(batch_id + 1)

	pbar.finish()

	cPickle.dump(yhists, open(ofname, 'w'))
Example #3
def main():
	con = db.connect()
	cur = con.cursor()
	
	emo_mids = {}

	limit = 25000000
	pbar = progbar.start(limit)
	i = 0

	cur.execute("SELECT mid, text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))
	for mid, text in cur:
		text, emos = blogger.extract(text)
		
		if len(emos) > 0 and len(emos) < 6:
			samples = blogger.prepare_sample(text, emos)
			for e, t in samples:
				if emo_mids.has_key(e):
					emo_mids[e].append(mid)
				else:
					emo_mids[e] = [mid, ]

		i += 1
		pbar.update(i)

	pbar.finish()

	cPickle.dump(emo_mids, open('../output/emo_mids.pkl', 'w'))
Example #4
def sample():
	blogs = commdatica.load('output/umtc.txt')
	
	has_emo = []
	no_emo = []

	target = 1000
	i = 0
	pbar = progbar.start(target)

	for blog in blogs:
		if blogger.is_valid(blog.text):
			if len(has_emo) < 500:
				has_emo.append(blog)
				i += 1
	
		elif blogger.is_valid(blog.text, check_emo = False):
			if len(no_emo) < 500:
				no_emo.append(blog)
				i += 1

		pbar.update(i)

	pbar.finish()

	print 'writing to umtc_yes_emo.txt ....',
	open('output/umtc_yes_emo.txt', 'w').write('\n'.join([repr(blog) for blog in has_emo]))
	print 'OK'

	print 'writing to umtc_no_emo.txt ....',
	open('output/umtc_no_emo.txt', 'w').write('\n'.join([repr(blog) for blog in no_emo]))
	print 'OK'	

	bs = commdatica.load('output/umtc_yes_emo.txt')
	print len(bs)
Example #5
def prepare():
	'''
	prepare data for LSTM
	data are structured as [[codes, codes, ...], ...], which are ordered by eid
	'''

	import blogger	
	from utils import progbar

	coder = cPickle.load(open(PKL_TFCODER, 'r'))

	datalist = []

	pbar = progbar.start(N_EMO)
	for eid in range(N_EMO):
		lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
		data = []
		for line in lines:
			text, emo = blogger.extract(line)
			codes = coder.code(text)
			data.append(codes)
		datalist.append(data)
		pbar.update(eid + 1)

	pbar.finish()

	cPickle.dump(datalist, open(PKL_TFDATA, 'w'))
Example #6
def main():
	con = db.connect()
	cur = con.cursor()

	emos = open('../data/emo_top100.txt', 'r').read().decode('utf8').split('\n')

	limit = int(sys.argv[1])
	pbar = progbar.start(limit)
	i = 0

	cur.execute("SELECT text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))

	fobjs = dict([(emo, open('../corpus_raw/%s.txt'%(emo), 'w')) for emo in emos])

	for res in cur:
		blog = res[0]
		text, blog_emos = blogger.extract(blog)

		n_emo = len(blog_emos)
		if n_emo > 0 and n_emo < 6:
			samples = blogger.prepare_sample(text, blog_emos)
			for e, t in samples:
				if not e in fobjs:
					continue

				fobjs[e].write(t + '\n')
		
		i += 1
		pbar.update(i)
	pbar.finish()

	for fobj in fobjs.values():
		fobj.close()
Example #7
def main():
	optparser = OptionParser()
	
	optparser.add_option('-x', '--dname_x', action='store', type = 'str', dest='dname_x')
	optparser.add_option('-s', '--dname_xsup', action='store', type = 'str', dest='dname_xsup')
	optparser.add_option('-k', '--value_k', dest='value_k', type='float', action = 'store', default = 1.)
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default = N_EMO)

	opts, args = optparser.parse_args()

	print >> sys.stderr, 'nbdatica: [info] loading data for training NaiveBayes ... ',
	train, valid, test = datica.load_data(opts.dname_x, opts.ydim, valid_rate = 0.)
	print >> sys.stderr, 'OK'

	print >> sys.stderr, 'nbdatica: [info] training NaiveBayes ... ',	
	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k)
	print >> sys.stderr, 'OK'

	if not os.path.exists(opts.dname_xsup):
		os.mkdir(opts.dname_xsup)

	pbar = progbar.start(opts.ydim)
	for eid in range(opts.ydim):
		ifname = opts.dname_x + '%d.pkl'%(eid)
		seqs = cPickle.load(open(ifname, 'r'))

		ofname = opts.dname_xsup + '%d.pkl'%(eid)
		proba = [classifier.classify(seq) for seq in seqs]

		cPickle.dump(proba, open(ofname, 'w'))
		pbar.update(eid + 1)
	pbar.finish()
Example #8
def main():
	dir_root = '../'
	
	dir_corpus = dir_root + 'corpus/'
	dir_token = dir_root + 'token/'

	fname_emo = dir_corpus + 'emo.txt'

	emos = open(fname_emo, 'r').read().decode('utf8').split('\n')[:-1]

	if not os.path.isdir(dir_token):
		os.mkdir(dir_token)
	
	for emo in emos:
		fname_output = dir_token + '%s.txt'%(emo)
		fname_input = dir_corpus + '%s.txt'%(emo)

		fobj_out = open(fname_output, 'w')
		with open(fname_input, 'r') as fobj_in:
			print >> sys.stderr, 'main: [info] processing %s'%(fname_input)

			pbar = progbar.start(wc_l(fname_input))
			i = 0

			for line in fobj_in:
				line = line[:-1].decode('utf8')
				toks = zhtokenizer.tokenize(line)
				fobj_out.write(' '.join(toks) + '\n')

				i += 1
				pbar.update(i)

			pbar.finish()

		fobj_out.close()	
Example #9
def main():
	key_wemb = sys.argv[1]
	ydim = int(sys.argv[2])
	size = int(sys.argv[3])
	key_dataset = key_wemb + '%dy%d'%(ydim, size)

	dir_root = '../'
	dir_tokid = dir_root + 'tokid/%s/'%(key_wemb)
	dir_dataset = dir_root + 'dataset/'

	fname_dataset = dir_dataset + '%s.pkl'%(key_dataset)

	fname_emo = dir_tokid + 'emo.txt'
	emos = open(fname_emo, 'r').read().decode('utf8').split('\n')[:-1]
	dataset = []
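	# dataset: one (emo, sampled line indices) pair per label, with `size` random indices drawn from each token-id file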

	pbar = progbar.start(ydim)
	i = 0
	for emo in emos[:ydim]:
		fname_input = dir_tokid + '%s.txt'%(emo)
		ids = range(wc_l(fname_input))
		np.random.shuffle(ids)
		dataset.append((emo, ids[:size]))

		i += 1
		pbar.update(i)

	pbar.finish()

	cPickle.dump(dataset, open(fname_dataset, 'w'))
Example #10
def classify(train, test, ubm, gamma = 1., r = 16.,  w = 1., m = 1., v = 1., n_components = 8):
	x, y = train
	xdim = x.shape[1]
	ydim = np.unique(y).shape[0]

	M = n_components

	gs = []

	print >> sys.stderr, 'classify: [info] building gmm for each label ...'

	pbar = progbar.start(ydim)
	for i in range(ydim):
		xi = x[y == i]                  # matrix[T x xdim]
		T = xi.shape[0]

		weights = ubm.weights_          # matrix[M, ]
		probs = ubm.predict_proba(xi)   # matrix[T x M]

		Pr_t_i = probs * weights
		Pr_t_i = Pr_t_i / np.asmatrix(Pr_t_i.sum(axis = 1)).T    # matrix[T x M]

		n_i = np.asarray(Pr_t_i.sum(axis = 0)).flatten()      # matrix[M, ]
		Ex_i = np.asarray([(np.asarray(Pr_t_i[:, m]) * xi).sum(axis = 0) / n_i[m] if not n_i[m] == 0. else np.zeros(xdim) for m in range(M)])
		# matrix[M x xdim]
		Ex2_i = np.asarray([(np.asarray(Pr_t_i[:, m]) * (xi ** 2)).sum(axis = 0) / n_i[m] if not n_i[m] == 0. else np.zeros(xdim) for m in range(M)])
		# matrix[M x xdim] 

		alpha = lambda p: n_i / (n_i + r ** p)
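		# relevance-factor MAP adaptation of the UBM: r controls how far the adapted
		# weights, means and covariances move toward the class statistics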
		alpha_w = alpha(w)
		alpha_m = alpha(m)
		alpha_v = alpha(v)		

		# weights: matrix[M, ]
		new_weights = (alpha_w * n_i / T + (1. - alpha_w) * ubm.weights_) * gamma

		# means: matrix[M, xdim]
		alpha_m = np.asarray(np.asmatrix(alpha_m).T)
		new_means = alpha_m * Ex_i + (1. - alpha_m) * ubm.means_

		# covar: matrix[M, xdim]
		alpha_v = np.asarray(np.asmatrix(alpha_v).T)
		new_covars = alpha_v * Ex2_i + (1. - alpha_v) * (ubm.covars_ + ubm.means_ **2) - new_means ** 2

		g = GMM(n_components = M)
		g.means_ = new_means
		g.weights_ = new_weights
		g.covars_ = new_covars

		gs.append(g)

		pbar.update(i + 1)
	pbar.finish()

	x, y = test
	scores = [g.score(x) for g in gs]
	proba = np.column_stack(scores) # not probability really	

	return proba
Example #11
	def __iter__(self):
		pbar = progbar.start(self.n_emo)
		for i in range(self.n_emo):
			seqs = cPickle.load(open('data/dataset/unigram/%d.pkl'%(i), 'r'))
			for seq in seqs:
				yield seq
			pbar.update(i + 1)
		pbar.finish()
Example #12
def extract(ifname, ofname):
	n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
	if n_lines == 0:
		return

	pbar = progbar.start(n_lines)
	l = 0

	datalist = []

	ofobj = open(ofname, 'w')
	with open(ifname, 'r') as ifobj:
		for line in ifobj:
			blog = json.loads(line)
			n_text = len(blog['text'])

			for i in range(1, n_text):
				res = blogger.extract(blog['text'][i])
				if res is not None:
					datum = {}
					datum['text'] = res[0]
					datum['emo'] = res[1]
					
					above = []			
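					# walk the reply chain upward via blog['re'] to collect the posts this one replies to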
					re_id = i
					while blog['re'][re_id] is not None:
						re_id = blog['re'][re_id]
						above.append(blog['text'][re_id])
					datum['above'] = above

					follow = []
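					# collect the chain of later posts that (transitively) reply to this one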
					last_id = i					
					for j in range(i + 1, n_text):
						if blog['re'][j] == last_id:
							follow.append(blog['text'][j])
							last_id = j

					datum['follow'] = follow

					ofobj.write(json.dumps(datum) + '\n')
	
					#for k in range(n_text):
					#	print '%d. (-> %s) %s'%(k, blog['re'][k], blog['text'][k])
					#
					#print 'above:'
					#print '\n'.join(above)
					#print 'follow:'
					#print '\n'.join(follow)
					#print 

				#if i > 100:
				#	return

			l += 1
			pbar.update(l)
		pbar.finish()
	
	ofobj.close()
Example #13
def main():
	import db
	import blogger
	from utils import progbar

	n_label = len(emos_list)
	emo_map = {}
	for label, emos in enumerate(emos_list):
		for emo in emos:
			emo_map[emo] = label

	odname = 'data/dataset_emo/'
	if not os.path.isdir(odname):
		os.mkdir(odname)

	odname += 'raw/'
	if not os.path.isdir(odname):
		os.mkdir(odname)


	fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
	counts = [0 for i in range(n_label)]
	N = 70000
	all_N = N * n_label

	con = db.connect()
	cur = con.cursor()
	
	pbar = progbar.start(all_N)
	l = 0

	cur.execute('SELECT text FROM microblogs')
	for t in cur:
		res = blogger.extract(t[0])
		if res is None or not emo_map.has_key(res[1]):
			continue

		label = emo_map[res[1]]
		if counts[label] < N:
			counts[label] += 1
			fobjs[label].write(res[0] + '\n')

			l += 1
			pbar.update(l)

			if counts[label] == N and sum(counts) == all_N:
				break

	pbar.finish()

	cur.close()
	con.close()

	for fobj in fobjs:
		fobj.close()
Example #14
def extract(dname_dataset, idx):
	idname = 'data/blogs/mtr/'
	dir_dataset = 'data/blogs/%s/'%(dname_dataset)
	odname = dir_dataset + 'tea/'

	init_folders([dir_dataset, odname])

	ifname = idname + '%d.txt'%(idx)
	ofname = odname + '%d.txt'%(idx)
	
	n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
	if n_lines == 0:
		print >> sys.stderr, '%s is empty'%(ifname)
		return

	pbar = progbar.start(n_lines)
	l = 0

	datalist = []

	ofobj = open(ofname, 'w')
	with open(ifname, 'r') as ifobj:
		for line in ifobj:
			blog = json.loads(line)
			n_text = len(blog['text'])

			for i in range(1, n_text):
				res = blogger.extract(blog['text'][i])
				if res is not None:
					datum = {}
					datum['text'] = res[0]
					datum['emo'] = res[1]
					
					above_s = []			
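					# above_s: the chain of posts this one replies to (structural context)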
					re_id = i
					while blog['re'][re_id] is not None:
						re_id = blog['re'][re_id]
						above_s.append(blog['text'][re_id])
					datum['above_s'] = above_s

					above_t = []
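					# above_t: all earlier posts in the thread, most recent first (temporal context)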
					re_id = i - 1
					while re_id >= 0:
						above_t.append(blog['text'][re_id])
						re_id -= 1
					datum['above_t'] = above_t

					ofobj.write(json.dumps(datum) + '\n')

			l += 1
			pbar.update(l)
		pbar.finish()

	ofobj.close()
Example #15
	def embed(xy):
		seqs, y = xy
		x_vec = []
		pbar = progbar.start(len(seqs))

		for i, seq in enumerate(seqs):
			x_vec.append(np.mean(embedder.embed(seq), axis = 0))
			pbar.update(i + 1)

		pbar.finish()

		return (x_vec, y)
Example #16
def build(train_x_y, ydim):
	probwe = {}

	x, y = train_x_y
	n_y = np.zeros(ydim)
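	# probwe[token][y] ends up as the fraction of class-y training samples whose token set contains the token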

	n_samples = len(x)

	print >> sys.stderr, 'scanning train dataset'

	pbar = progbar.start(n_samples)
	loop = 0

	for seq, yi in zip(x, y):
		n_y[yi] += 1

		for token in set(seq):
			if not probwe.has_key(token):
				probwe[token] = np.zeros(ydim)
			probwe[token][yi] += 1
	
		loop += 1
		pbar.update(loop)

	pbar.finish()

	print >> sys.stderr, 'normalization'

	pbar = progbar.start(len(probwe))
	loop = 0

	for k in probwe.keys():
		probwe[k] /= n_y

		loop += 1
		pbar.update(loop)

	pbar.finish()

	return probwe
Example #17
def main():
	blogs = commdatica.load('output/umtc.txt')
	print '%d in total'%(len(blogs))

	pbar = progbar.start(len(blogs))
	c = 0
	for i, blog in enumerate(blogs):
		if blogger.is_valid(blog.text, check_emo = False):
			c += 1

		pbar.update(i + 1)
	pbar.finish()

	print '%.2f%%'%(100. * c / len(blogs))
Example #18
def prepare(eids = range(N_EMO)):
	if not os.path.exists(DIR_TOKEN):
		os.mkdir(DIR_TOKEN)

	pbar = progbar.start(len(eids) * 4000)
	c = 0
	for eid in eids:
		seqs = []
		lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
		for line in lines:
			seqs.append(zhpr.tokenize(line))
			c += 1
			pbar.update(c)
		
		cPickle.dump(seqs, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))

	pbar.finish()
Example #19
def reform(idir, odir):
	if not os.path.isdir(odir):
		os.mkdir(odir)
	
	n_emo = 90
	pbar = progbar.start(n_emo)

	for i in range(n_emo):
		seqs = cPickle.load(open(idir + '%d.pkl'%(i), 'r'))	
		fobj = open(odir + '%d.txt'%(i), 'w')
		content = u'\n'.join([u' '.join(seq) for seq in seqs])
		fobj.write(content.encode('utf8'))
		fobj.close()

		pbar.update(i + 1)
	pbar.finish()
Example #20
def test():
	import cPickle
	import zhprocessor
	from utils import logger, progbar

	FNAME_TEXT = 'output/0.txt'
	PKL_TOKENS = 'data/tokens.pkl'

	if not os.path.exists(PKL_TOKENS):
		logger.debug('loading lines...')
		lines = open(FNAME_TEXT, 'r').read().split('\n')
		
		logger.debug('preparing tokens..')
		tokens = []

		pbar = progbar.start(len(lines))
		for i, line in enumerate(lines):
			res = blogger.extract(line)
			if res is None:
				continue
			text, emo = res
			tokens.extend(zhprocessor.tokenize(text))
			pbar.update(i + 1)

		pbar.finish()
		
		cPickle.dump(tokens, open(PKL_TOKENS, 'w'))
	else:
		tokens = cPickle.load(open(PKL_TOKENS, 'r'))

	cclassifier = ConceptClassifier()
	logger.debug('analysing tokens..')
	
	d = 10
	vecs = cclassifier.analyse(tokens, d)
	print len(vecs)
	
	vecs = vecs.items()
	repr_vecs = [vecs[0] for i in range(d)]
	for vec in vecs[1:]:
		for i in range(d):
			if abs(repr_vecs[i][1][i]) < abs(vec[1][i]):
				repr_vecs[i] = vec
	
	for k, v in repr_vecs:
		print k, v
Example #21
def load(fname_dataset, dir_tokid, valid = 0.2, test = 0.1):
	emo_ids = cPickle.load(open(fname_dataset, 'r'))

	dataset = [[[], []] for i in range(3)]
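	# dataset[0] / dataset[1] / dataset[2]: train / valid / test splits, each as ([token-id sequences], [labels])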

	def add_samples(idxs, lines, y, split_id):
		for idx in idxs:
			tids = map(int, lines[idx][:-1].split(' '))

			dataset[split_id][0].append(tids)
			dataset[split_id][1].append(y)
	
	print >> sys.stderr, 'load: [info] loading data...'
	pbar = progbar.start(len(emo_ids))
	
	for i, item in enumerate(emo_ids):
		emo, ids = item 

		n_total = len(ids)
		n_valid = int(valid * n_total)
		n_test = int(test * n_total)
		n_train = n_total - n_valid - n_test 

		fname_tokid = dir_tokid + '%s.txt'%(emo)
		# tids = load_tokid(fname_tokid)
		lines = open(fname_tokid, 'r').readlines()

		add_samples(ids[:n_train], lines, i, 0)
		add_samples(ids[n_train:(-n_test)], lines, i, 1)
		add_samples(ids[-n_test:], lines, i, 2)

		pbar.update(i + 1)
	pbar.finish()

	def shuffle(subset):
		x, y = subset
		ids = range(len(x))
		np.random.shuffle(ids)
		x = [x[i] for i in ids]
		y = [y[i] for i in ids]
		return (x, y)

	dataset = [shuffle(tuple(subset)) for subset in dataset]
	dataset = tuple(dataset)
	
	return dataset
Example #22
def load_blogs():
	def load_blog_lines():
		lines = []
		for i in range(3):
			fname = 'data/blogs/blogs_subset_%d.txt'%(i)
			lines.extend(open(fname, 'r').readlines())
		return lines

	lines = load_blog_lines()
	blogs = []
	
	pbar = progbar.start(len(lines))
	for i, line in enumerate(lines):
		blogs.append(json.loads(line))
		pbar.update(i + 1)
	pbar.finish()

	return blogs
Example #23
def load(fname_blogs = FNAME_BLOGS_FILTERED):
	'''
	load data/blogs_subset.txt as list of BlogInfo
	'''

	lines = open(fname_blogs, 'r').readlines()
	blogs = []	

	pbar = progbar.start(len(lines))

	for i, l in enumerate(lines):
		blogs.append(BlogInfo.loads(l))
		
		pbar.update(i + 1)

	pbar.finish()

	return blogs
Example #24
def main():
	import db
	import datica
	import blogger
	from utils import progbar

	con = db.connect()
	cur = con.cursor()
	maxN = 70000

	odname = 'data/dataset_emo/'
	if not os.path.isdir(odname):
		os.mkdir(odname)

	config = datica.load_config('data/config4.txt')
	for label, eids in enumerate(config):
		for eid in eids:
			print >> sys.stderr, 'loading LABEL %d - EID %d'%(label, eid)			

			ifname = 'data/eid_mids/%d.txt'%(eid)

			ofname = odname + '%d.txt'%(eid)
			ofobj = open(ofname, 'w')

			mids = open(ifname, 'r').read().split('\n')
			if len(mids) > maxN:
				mids = mids[:maxN]

			pbar = progbar.start(len(mids))
			l = 0
			for mid in mids:
				t = load_by_mid(cur, mid)
				res = blogger.extract(t)
				if res is not None:
					text, emo = res
					ofobj.write(text + '\n')
			
				l += 1
				pbar.update(l)
			pbar.finish()
			ofobj.close()

	cur.close()
	con.close()
Example #25
	def calculate(self, x, y, deduplicate = True):
		vecs = {}
		ydim = np.max(y) + 1

		y_count = np.zeros(ydim)
		n_samples = len(x)

		for xi, yi in zip(x, y):
			y_count[yi] += 1

			if deduplicate:
				xi = set(xi)

			for token in xi:
				if not vecs.has_key(token):
					vecs[token] = np.zeros(ydim)
		
				vecs[token][yi] += 1

		chis = {}

		pbar = progbar.start(len(vecs))
		l = 0

		for token, vec in vecs.items():
			chi_values = []
		
			vec_sum = np.sum(vec)

			for i in range(ydim):
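				# 2x2 contingency table for (token, class i):
				# a: class-i samples containing the token, b: other samples containing it,
				# c: class-i samples without it, d: other samples without it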
				a = vec[i]
				b = vec_sum - a
				c = y_count[i] - a
				d = (n_samples - y_count[i]) - b
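				# the value below is the chi-square statistic up to the factor n_samples / ((a + c) * (b + d)),
				# which is constant for a fixed class i and so does not change the ranking of tokens within a class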

				chi_values.append((a * d - b * c) ** 2 / ((a + b) * (c + d)))

			chis[token] = chi_values
			l += 1
			pbar.update(l)
		pbar.finish()

		return chis
Example #26
def prepare(eids = range(N_EMO)):
	'''
	tokenize and unigramize the text under data/dataset/text
	'''

	import blogger
	import zhtokenizer
	from utils import progbar, zhprocessor
	
	if not os.path.isdir(DIR_UNIGRAM):
		os.mkdir(DIR_UNIGRAM)

	if not os.path.isdir(DIR_TOKEN):
		os.mkdir(DIR_TOKEN)

	unigram_list = []
	token_list = []

	for eid in eids:
		lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
		
		unigram_list = []
		token_list = []
		
		print 'preparing data for EID-%d'%(eid)
		pbar = progbar.start(len(lines))
	
		for i, line in enumerate(lines):
			text, emo = blogger.extract(line)
			text = zhprocessor.simplify(text)

			unigrams = zhtokenizer.unigramize(text)	
			tokens = zhtokenizer.tokenize(text)
			
			unigram_list.append(unigrams)
			token_list.append(tokens)
		
			pbar.update(i + 1)
		pbar.finish()

		cPickle.dump(unigram_list, open(DIR_UNIGRAM + '%d.pkl'%(eid), 'w'))
		cPickle.dump(token_list, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))
Example #27
def build(seqs, N):
	tf = {}
	df = {}

	pbar = progbar.start(len(seqs))
	l = 0

	for seq in seqs:
		for t in seq:
			if tf.has_key(t):
				tf[t] += 1
			else:	
				tf[t] = 1.
		
		for t in set(seq):
			if df.has_key(t):
				df[t] += 1
			else:
				df[t] = 1
			
		l += 1
		pbar.update(l)

	pbar.finish()

	tf = sorted(tf.items(), key = lambda k: -k[1])

	n_seq = len(seqs)
	
	nums = set('0123456789')
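	# keep the N most frequent tokens that are not single digits:
	# Widx maps token -> index, Widf maps token -> smoothed inverse document frequency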
	Widx = {}
	Widf = {}
	idx = 0
	for t, f in tf:
		if t not in nums:
			Widf[t] = np.log((1. + n_seq) / df[t])
			Widx[t] = idx
			idx += 1
			if idx == N:
				break
	
	return Widx, Widf
Example #28
def prepare_tokenize(ifname, ofname):
	import zhtokenizer
	from utils import zhprocessor, progbar

	lines = open(ifname, 'r').readlines()

	pbar = progbar.start(len(lines))
	l = 0

	seqs = []
	for line in lines:
		line = line.decode('utf8')
		line = zhprocessor.simplify(line)
		tokens = zhtokenizer.tokenize(line)
		seqs.append(tokens)

		l += 1
		pbar.update(l)
	pbar.finish()

	cPickle.dump(seqs, open(ofname, 'w'))
Example #29
def main():
	dlist = []
	x_set = {}
	m_set = {}

	for i in range(90):
		fname = 'data/dataset/text/%d.txt'%(i)
		lines = open(fname, 'r').read().split('\n')
		dlist.append(lines)		

		pbar = progbar.start(len(lines))
		for j, line in enumerate(lines):
			l = zhp.simplify(line.decode('utf8'))
			res = zht.segment(l, True)	
			for w in res:
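				# tally how often each word appears with POS flag 'x' and with flag 'm'
				# ('x' and 'm' typically mark symbols and numerals in ICTCLAS-style tagsets)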
				if w.flag == 'x':
					if x_set.has_key(w.word):
						x_set[w.word] += 1
					else:
						x_set[w.word] = 1
				elif w.flag == 'm':
					if m_set.has_key(w.word):
						m_set[w.word] += 1
					else:
						m_set[w.word] = 1
			pbar.update(j + 1)
		pbar.finish()

	fobj = open('output/set_x.txt', 'w')
	x_set = sorted(x_set.items(), key = lambda k: -k[1])
	for k, v in x_set:
		fobj.write('%s (%d)\n'%(k, v))
	fobj.close()

	fobj = open('output/set_m.txt', 'w')
	m_set = sorted(m_set.items(), key = lambda k: -k[1])
	for k, v in m_set:
		fobj.write('%s (%d)\n'%(k, v))
	fobj.close()