Example #1
0
def sample(per_class = 500):
	'''
	split blogs from output/umtc.txt into two capped samples and write them out

	Every blog accepted by blogger.is_valid(blog.text) goes to the has_emo
	list; a blog that fails that check but is accepted by
	blogger.is_valid(blog.text, check_emo = False) goes to the no_emo list.
	Each list holds at most per_class blogs; blogs arriving after their list
	is full are dropped. The lists are written, one repr(blog) per line, to
	output/umtc_yes_emo.txt and output/umtc_no_emo.txt. Finally the first
	file is loaded back and its length printed.

	per_class: maximum number of blogs kept in each list (default 500)
	'''
	import sys

	blogs = commdatica.load('output/umtc.txt')

	has_emo = []
	no_emo = []

	# the progress bar counts appended blogs across both lists
	target = 2 * per_class
	i = 0
	pbar = progbar.start(target)

	for blog in blogs:
		if blogger.is_valid(blog.text):
			if len(has_emo) < per_class:
				has_emo.append(blog)
				i += 1

		elif blogger.is_valid(blog.text, check_emo = False):
			if len(no_emo) < per_class:
				no_emo.append(blog)
				i += 1

		pbar.update(i)

		# both lists are full: no later blog can be appended, so stop scanning
		if i >= target:
			break

	pbar.finish()

	for fname, picked in (('umtc_yes_emo.txt', has_emo), ('umtc_no_emo.txt', no_emo)):
		# start the status line now; 'OK' completes it once the file is closed
		sys.stdout.write('writing to %s .... ' % fname)
		sys.stdout.flush()
		# 'with' guarantees the handle is flushed and closed before reload below
		with open('output/' + fname, 'w') as fobj:
			fobj.write('\n'.join([repr(blog) for blog in picked]))
		print('OK')

	bs = commdatica.load('output/umtc_yes_emo.txt')
	print(len(bs))
Example #2
0
def sample(per_class=500):
    """
    Split blogs from output/umtc.txt into two capped samples and write them out.

    Every blog accepted by blogger.is_valid(blog.text) goes to the has_emo
    list; a blog that fails that check but is accepted by
    blogger.is_valid(blog.text, check_emo=False) goes to the no_emo list.
    Each list holds at most per_class blogs; blogs arriving after their list
    is full are dropped. The lists are written, one repr(blog) per line, to
    output/umtc_yes_emo.txt and output/umtc_no_emo.txt. Finally the first
    file is loaded back and its length printed.

    per_class: maximum number of blogs kept in each list (default 500)
    """
    import sys

    blogs = commdatica.load("output/umtc.txt")

    has_emo = []
    no_emo = []

    # the progress bar counts appended blogs across both lists
    target = 2 * per_class
    i = 0
    pbar = progbar.start(target)

    for blog in blogs:
        if blogger.is_valid(blog.text):
            if len(has_emo) < per_class:
                has_emo.append(blog)
                i += 1

        elif blogger.is_valid(blog.text, check_emo=False):
            if len(no_emo) < per_class:
                no_emo.append(blog)
                i += 1

        pbar.update(i)

        # both lists are full: no later blog can be appended, so stop scanning
        if i >= target:
            break

    pbar.finish()

    for fname, picked in (("umtc_yes_emo.txt", has_emo), ("umtc_no_emo.txt", no_emo)):
        # start the status line now; "OK" completes it once the file is closed
        sys.stdout.write("writing to %s .... " % fname)
        sys.stdout.flush()
        # 'with' guarantees the handle is flushed and closed before reload below
        with open("output/" + fname, "w") as fobj:
            fobj.write("\n".join([repr(blog) for blog in picked]))
        print("OK")

    bs = commdatica.load("output/umtc_yes_emo.txt")
    print(len(bs))
Example #3
0
def select():
	'''
	select candidate microblogs from the database into output/umtc.txt

	For every (u, n, m, v) entry of output/unmv.pkl with m <= 50 and v <= 100,
	fetches up to n microblogs of user u whose comments_count lies between
	thr_min and m + v, keeps those accepted by
	blogger.is_valid(text, check_emo = False), and retains at most the 100
	with the highest comments_count. Querying stops once 400000 or more rows
	have been collected; the last user's batch is kept whole, so the total
	may exceed that figure. The result is written as one
	repr(BlogInfo(u, m, t, c)) per line.
	'''
	unmv = pkl_load('output/unmv.pkl')

	import db
	con = db.connect()
	cur = con.cursor()

	thr_min = 5
	per_user_max = 100
	total_max = 400000

	# t for threshold_max
	unt = [(u, n, m + v) for u, n, m, v in unmv if m <= 50 and v <= 100]
	umtc = []
	try:
		for u, n, thr_max in unt:
			# NOTE(review): the query is assembled by string formatting; values
			# come from the local pickle. Consider driver-side parameters once
			# the db module's paramstyle is confirmed.
			cur.execute('select mid, text, comments_count from microblogs where user_id = %s and comments_count >= %d and comments_count <= %d limit %d'%(u, thr_min, thr_max, n))

			tmp_umtc = [(u, m, t, c) for m, t, c in cur if blogger.is_valid(t, check_emo = False)]

			# most-commented first; the slice is a no-op for short lists
			tmp_umtc.sort(key = lambda k: -k[3])
			umtc.extend(tmp_umtc[:per_user_max])

			if len(umtc) >= total_max:
				break
	finally:
		# release the cursor even if a query fails; the connection is left
		# as the original left it (db.connect() may share it -- confirm)
		cur.close()

	# 'with' closes the file even if a write or repr() raises part-way
	with open('output/umtc.txt', 'w') as fobj:
		for u, m, t, c in umtc:
			fobj.write(repr(BlogInfo(u, m, t, c)) + '\n')
Example #4
0
def sample(eid):
	'''
	sampling text from MySQL to data/dataset/text/EID.txt and data/dataset/muid/EID.txt

	Users are visited in order of decreasing number of mids. For each user,
	the text of every mid is fetched and kept when blogger.is_valid(text)
	accepts it, up to 100 texts per user. Sampling stops once at least 4000
	texts are collected, and both outputs are cut to exactly 4000 entries if
	more were gathered. Fewer than 4000 may be written when the data runs out.

	The text file holds one text per line; the muid file holds the matching
	'mid uid' pair on the same line number.

	eid: integer id used to load the mid/uid mapping and to name both files
	'''

	import blogger
	import db
	con = db.connect()
	cur = con.cursor()

	mid_uid = load_mid_uid(eid)
	umids = muid2umids(mid_uid)

	# users with the most mids come first
	uid_mids = sorted(umids.items(), key = lambda k: -len(k[1]))

	target = 4000
	per_user_max = 100

	texts = []
	str_mid_uid = []

	for uid, mids in uid_mids:
		c = 0
		for mid in mids:
			cur.execute('SELECT text FROM microblogs WHERE user_id=%s AND mid=%s LIMIT 1'%(uid, mid))
			row = cur.fetchone()
			if row is None:
				# no such microblog in the table; indexing None would raise
				continue

			text = row[0]
			if not blogger.is_valid(text):
				continue

			texts.append(text)
			str_mid_uid.append('%s %s'%(mid, uid))
			c += 1
			if c == per_user_max:
				break

		# len(texts) is the running total of accepted texts across users
		if len(texts) >= target:
			break

	# the last user may overshoot the target; trim both lists in step
	texts = texts[:target]
	str_mid_uid = str_mid_uid[:target]

	DIR_DATASET = 'data/dataset/'
	DIR_TEXT = DIR_DATASET + 'text/'
	DIR_MUIDS = DIR_DATASET + 'muid/'

	# 'with' makes sure both files are flushed and closed before returning
	with open(DIR_TEXT + '%d.txt'%(eid), 'w') as fobj:
		fobj.write('\n'.join(texts))
	with open(DIR_MUIDS + '%d.txt'%(eid), 'w') as fobj:
		fobj.write('\n'.join(str_mid_uid))
Example #5
0
def main():
	'''
	print how many blogs output/umtc.txt holds and what percentage of them
	is accepted by blogger.is_valid(blog.text, check_emo = False)
	'''
	blogs = commdatica.load('output/umtc.txt')
	total = len(blogs)
	print('%d in total'%(total))

	if total == 0:
		# nothing to measure; the percentage below would divide by zero
		return

	pbar = progbar.start(total)
	c = 0
	for i, blog in enumerate(blogs):
		if blogger.is_valid(blog.text, check_emo = False):
			c += 1

		pbar.update(i + 1)
	pbar.finish()

	print('%.2f%%'%(100. * c / total))