def portrayal_thread(user, task_id):
    """Build a full portrayal for one user and persist it.

    Crawls the user's complete timeline, runs the profiling pipeline,
    upserts the result into the Mongo ``typical`` collection and, for a
    brand-new user, kicks off relation crawling in a background thread.
    Finally stamps the TypicalCharacter task row as finished.

    :param user: dict with at least ``screen_name`` (crawler input).
    :param task_id: primary key of the TypicalCharacter row to stamp.
    """
    tweet_list = tweets_crawler.get_user_all_timeline_return(
        screen_name=user['screen_name'])
    user['tweets'] = tweet_list
    out = user_profile.user_profile(user)

    db = MongoDB().connect()
    collect = db['typical']

    # Mongo keys the portrayal on the numeric user id stored under _id.
    out['_id'] = long(out['user_id'])
    del out['user_id']

    cur_user = collect.find_one({'_id': out['_id']})
    if cur_user is None:  # fixed: was `== None`; identity test is the idiom
        collect.insert_one(out)
        # Relation crawling is slow, so it runs on its own thread.
        th = threading.Thread(target=relation_thread, args=(out, ))
        th.start()
    else:
        # Known user: replace the stored portrayal with the fresh one.
        collect.delete_one({'_id': out['_id']})
        collect.insert_one(out)

    # Mark the task finished; needs an app context because this runs
    # outside a request.
    with app.app_context():
        TypicalCharacter.query.filter(
            TypicalCharacter.id == task_id).update({
                'finished_at': time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(time.time()))
            })
def classify_career(): db = MongoDB().connect() users = db['typical'].find() n = 0 err_dict = {} for user in users: tweets = user['tweets'] text = user['description'] for tweet in tweets: text += ' ' + tweet['text'] text = preprocess.preprocess(text) res = classify.exe_classify(text) if err_dict.has_key(res[0]): err_dict[res[0]] += 1 else: err_dict[res[0]] = 1 if res[0] == user['category']: n += 1 print err_dict print n
def task_detail(task_id):
    """Render the detail page for a single crawl task.

    Joins the Task row with its owning Admin user, flattens both into a
    display dict, then fills in live progress counters: basic-info row
    counts from the per-task MySQL table and tweet counts from the
    per-task MongoDB collection, depending on the task's search_type.
    """
    task = Task.query.filter(Task.id == task_id).first()
    user = Admin.query.filter(Admin.userid == task.userid).first()
    res = {
        'id': task.id,
        'task_name': task.task_name,
        'user_name': user.username,
        'created_at': task.created_at,
        'finished_at': task.finished_at,
        'screen_name': task.search_name,
        'remark': task.remark,
        'thread_num': task.thread_num,
        'deepth': task.deepth,
        'extension': str(task.extension),
        'tweet_num': task.tweet_num,
        'basicinfo_num': task.basicinfo_num,
        'type': 'File Upload' if task.is_file else 'Breadth First Extension',
        'search_type': task.search_type,
        'file_content': task.file_content,
        # Progress counters, filled below when the task type collects them.
        'basicinfo_num_finished': '',
        'tweet_num_finished': ''
    }
    # search_type appears to be a string of flag digits; '4' => basic info
    # was crawled into a per-task MySQL table — TODO confirm flag meanings.
    if '4' in task.search_type:
        # Table name is built from the internal integer task id, not from
        # user input, so the interpolation is not directly injectable.
        sql = "select count(*) from task_%s" % task.id
        bsc = db.session.execute(sql).first()
        res['basicinfo_num_finished'] = bsc[0]
    # '1' (or an empty search_type) => tweets were crawled into Mongo.
    if '1' in task.search_type or task.search_type == '':
        md = MongoDB().connect()
        collect = md["task_%s" % task.id]
        # NOTE(review): cursor.count() is removed in PyMongo 4;
        # count_documents({}) is the modern equivalent.
        res['tweet_num_finished'] = collect.find().count()
    return render_template('task_detail.html', task=res)
def modify_category():
    """Update a typical user's category in both MongoDB and Neo4j.

    Reads user_id and category from the POSTed form; rejects unknown
    labels.  Returns JSON status 1 on success, 0 on bad input.
    """
    uid = request.form['user_id']
    new_cat = request.form['category']
    valid = ("Entertainment", "Agriculture", "Sports", "Religion",
             "Military", "Politics", "Education", "Technology", "Economy")
    # Guard clause: missing id or a label outside the fixed taxonomy.
    if not uid or new_cat not in valid:
        return jsonify({'status': 0})
    mongo_id = long(uid)
    MongoDB().connect()['typical'].update_one(
        {'_id': mongo_id}, {"$set": {"category": new_cat}})
    # Mirror the change onto the Neo4j node so graph queries stay in sync.
    node = selector.select("Typical", user_id=mongo_id).first()
    node['category'] = new_cat
    graph.push(node)
    return jsonify({'status': 1})
def test_insert():
    """Inserting a document returns its ObjectId and stores the payload."""
    store = MongoDB('test')
    collection_name = 'test'
    inserted_id = store.insert(collection_name, data={'test': 123})
    assert inserted_id
    assert isinstance(inserted_id, ObjectId)
    fetched = store.query(collection_name).find_one({'_id': inserted_id})
    assert fetched['test'] == 123
def calc_sentiment_score():
    """Recompute sentiment series for one sample user and persist them.

    Currently limited to the 'EP_Agriculture' account (debug query).
    """
    # Build the word -> score lexicon; `with` closes the file handle that
    # the original open(...) leaked.  NOTE(review): sentiment_dict is not
    # referenced below — kept only because building it (and any parse
    # error it raises) was part of the original behavior.
    with open("portrayal/sentiment_classify/data/sentiment_words1.txt") as f:
        sentiment_dict = {}
        for line in f:
            word, score = line.split('\t')
            sentiment_dict[word] = int(score)

    db = MongoDB().connect()
    users = db['typical'].find({'screen_name': 'EP_Agriculture'}).limit(1)
    for user in users:
        (final_sentiment, psy_with_time1, psy_with_time2,
         psy_with_count1, psy_with_count2) = \
            sentiment_classify.exe_sentiment_classify(user['tweets'])
        db['typical'].update({'_id': user['_id']}, {"$set": {
            "psy": final_sentiment,
            "psy_with_time1": psy_with_time1,
            "psy_with_time2": psy_with_time2,
            "psy_with_count1": psy_with_count1,
            "psy_with_count2": psy_with_count2}})
def test_update():
    """update() rewrites a stored document and reports ok in its result."""
    store = MongoDB('test')
    collection_name = 'test'
    store.insert(collection_name, data={'test': 123})
    original = store.query(collection_name).find_one({'test': 123})
    result = store.update(collection_name, c=original, data={'test': 999})
    assert result['ok'] == True
    refreshed = store.query(collection_name).find_one({'_id': original['_id']})
    assert refreshed['test'] == 999
def calc_sentiment(): db = MongoDB().connect() users = db['typical'].find() for user in users: try: final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(user['tweets']) except Exception as e: print user['_id'] print e continue db['typical'].update({'_id': user['_id']}, {"$set": {"psy": final_sentiment, "psy_with_time1": psy_with_time1, "psy_with_time2": psy_with_time2, "psy_with_count1": psy_with_count1, "psy_with_count2": psy_with_count2}})
def download_user_xml(user_id):
    """Serve the generated XML portrayal of a user as a file download."""
    user = MongoDB().connect()['typical'].find_one({'_id': long(user_id)})
    xml_path = generate_user_xml(user)
    resp = make_response(send_file(xml_path))
    disposition = "attachment; filename=%s.xml" % user['screen_name']
    resp.headers["Content-Disposition"] = disposition
    return resp
def update_attr():
    """Sync profile attributes from Mongo 'typical' docs onto Neo4j nodes.

    Reads a fixed set of profile fields for every typical user and copies
    them onto the matching Typical node (matched by user_id).
    """
    # Single source of truth for the synced fields; replaces the original
    # 18 copy-pasted projection entries + assignments.
    fields = ('name', 'category', 'followers_count', 'location',
              'utc_offset', 'statuses_count', 'description',
              'friends_count', 'psy', 'verified', 'lang',
              'favourites_count', 'screen_name', 'influence_score',
              'created_at', 'time_zone', 'protected', 'activity')
    graph = Neo4j().connect()
    mongo = MongoDB().connect()
    projection = dict((f, 1) for f in fields)
    for item in mongo['typical'].find({}, projection):
        node = graph.find_one("Typical",
                              property_key="user_id",
                              property_value=item['_id'])
        for f in fields:
            node[f] = item[f]
        graph.push(node)
def extract_interset(): db = MongoDB().connect() users = db['typical'].find() for u in users: text = '' for item in u['tweets']: text += item['text'] + ' ' try: tags = interest_extract.extract_tags(text, u['description']) except Exception as e: print u['_id'] print e continue db['typical'].update({'_id': u['_id']}, {"$set": {"interest_tags": tags}})
def download_interest_tags(user_id):
    """Serve the user's interest-tag word-cloud image as a download."""
    collection = MongoDB().connect()['typical']
    user = collection.find_one({'_id': long(user_id)},
                               {'interest_tags': 1, 'screen_name': 1})
    cloud_path = tag_cloud.generate_tag_cloud(user['interest_tags'],
                                              user['_id'])
    resp = make_response(send_file(cloud_path))
    resp.headers["Content-Disposition"] = (
        "attachment; filename=%s_interest_tags.png" % user['screen_name'])
    return resp
def typical_character_newdelete():
    """Delete a typical user from every store: MySQL row, Mongo docs, Neo4j.

    Reads user_id from the POSTed form.  Returns JSON status 0 when the id
    is missing, 1 after deletion.
    """
    user_id = request.form['user_id']
    if not user_id:
        return jsonify({'status': 0})
    # SQLAlchemy bulk delete of the task row(s) for this user.
    TypicalCharacter.query.filter(TypicalCharacter.user_id == user_id).delete()
    db = MongoDB().connect()
    collect = db['typical']
    collect.delete_one({'_id': long(user_id)})
    collect = db['relation']
    collect.delete_one({'_id': long(user_id)})
    # NOTE(review): DETACH DELETE already drops attached relationships, so
    # deleting r alongside a looks redundant; also a node with no
    # :following edge at all will not match this pattern and would be left
    # behind — confirm intent.
    cql = '''MATCH (a:Typical{user_id:%d})-[r:following]-(b) DETACH delete a, r''' % long(
        user_id)
    graph.run(cql)
    return jsonify({'status': 1})
def typical_data_statistics():
    """Render category and influence-score distributions of typical users.

    Buckets influence_score into width-10 bins; bins 0..140 are pre-seeded
    so the template always sees them, higher bins are created on demand.
    """
    db = MongoDB().connect()
    collect = db['typical']
    users = collect.find({}, {'category': 1, '_id': 0, 'influence_score': 1})
    category = {
        'Politics': 0,
        'Religion': 0,
        'Military': 0,
        'Economy': 0,
        'Technology': 0,
        'Education': 0,
        'Agriculture': 0,
        'Entertainment': 0,
        'Sports': 0
    }
    influence = {}
    for i in range(15):
        influence[str(i * 10)] = 0
    for item in users:
        category[item['category']] += 1
        bucket = str(int(item['influence_score'] / 10) * 10)
        # dict.get replaces the removed-in-Py3 dict.has_key and collapses
        # the original three-way branch (pre-seeded vs lazy bucket) into
        # one statement with identical results.
        influence[bucket] = influence.get(bucket, 0) + 1
    return render_template('portrayal/typical_data_statistics.html',
                           category=category,
                           influence=influence)
def create_relation():
    """Create Typical nodes and 'following' edges from Mongo relation data.

    One node per document in 'typical'; then, for each user with a crawled
    friend list in 'relation', an edge to every other typical user that
    appears in that list.
    """
    graph = Neo4j().connect()
    mongo = MongoDB().connect()
    # The graph is assumed to have been emptied beforehand (see the
    # manual delete_all step that used to live here).
    for item in mongo['typical'].find({}, {'_id': 1}):
        graph.create(Node("Typical", user_id=item['_id']))
    user_list = [doc['_id'] for doc in mongo['relation'].find({}, {'_id': 1})]
    # Wire up follow edges between every pair of typical users.
    for uid in user_list:
        friend_ids = set(mongo['relation'].find_one({'_id': uid})['friends'])
        source = graph.find_one("Typical",
                                property_key="user_id",
                                property_value=uid)
        for other_id in user_list:
            if other_id == uid:
                continue
            if other_id in friend_ids:
                target = graph.find_one("Typical",
                                        property_key="user_id",
                                        property_value=other_id)
                graph.create(Relationship(source, 'following', target))
def test_query():
    """query() hands back the underlying collection accessor."""
    store = MongoDB('test')
    name = 'test'
    result = store.query(name)
    assert result
    assert result == store.collection[name]
def update_user_category():
    """Re-score ambiguous users' categories using their follow graph.

    For each typical user whose top category score is NOT clearly dominant
    (i.e. not more than double the runner-up and within 50 points of it),
    distribute a score bonus among the user's top-4 categories in
    proportion to how many graph neighbours (followees and followers) peak
    in each of those categories.  Adjusted score vectors are written back
    to Mongo at the end; `count` (printed) tallies users whose best score
    still disagrees with their stored category.
    """
    db = MongoDB().connect()
    users = db['typical'].find({}, {'_id': 1, 'screen_name': 1, 'category': 1, 'category_score': 1})
    count = 0
    category_name = ['Politics', 'Religion', 'Military', 'Economy', 'Technology', 'Education', 'Agriculture', 'Entertainment', 'Sports']
    users_temp = []
    for item in users:
        # Scores sorted best-first: sorted_list[0] is the current winner.
        sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True)
        # Clear winner: skip graph adjustment, just count disagreements.
        if sorted_list[0][1] > 2 * sorted_list[1][1] or sorted_list[0][1] - sorted_list[1][1] > 50:
            if sorted_list[0][0] != item['category']:
                count += 1
            continue
        # Bonus pool spread across the top-4 candidate categories.
        score_differ = (2 * sorted_list[0][1] - sorted_list[1][1] - sorted_list[-1][1]) / 2
        relation_dict = {
            sorted_list[0][0]: 0,
            sorted_list[1][0]: 0,
            sorted_list[2][0]: 0,
            sorted_list[3][0]: 0
        }
        # for name in category_name:
        #     relation_dict[name] = 0
        # Count followees whose own best category is one of the top-4.
        cql = '''MATCH(a{user_id:%s})-[:following]->(f) return distinct f.user_id as user_id''' % (item['_id'])
        res = graph.data(cql)
        for f in res:
            user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1})
            category_score = user['category_score']
            max_category = max(category_score, key = lambda x: category_score[x])
            if max_category in relation_dict:
                relation_dict[max_category] += 1
        # Same tally over followers.
        cql = '''MATCH(a{user_id:%s})<-[:following]-(f) return distinct f.user_id as user_id''' % (item['_id'])
        res = graph.data(cql)
        for f in res:
            user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1})
            category_score = user['category_score']
            max_category = max(category_score, key = lambda x: category_score[x])
            if max_category in relation_dict:
                relation_dict[max_category] += 1
        relation_total = 0
        for ri in relation_dict:
            relation_total += relation_dict[ri]
        # Too few neighbours to be informative: leave scores untouched.
        if relation_total < 10:
            if sorted_list[0][0] != item['category']:
                count += 1
            continue
        # Redistribute the bonus proportionally.  NOTE(review): under
        # Python 2 `score_differ * n / total` floor-divides when both
        # operands are ints — confirm that is intended.
        for ri in relation_dict:
            item['category_score'][ri] += round(score_differ * relation_dict[ri] / relation_total, 2)
        users_temp.append({'_id':item['_id'], "category_score": item['category_score']})
        s1 = sorted_list[0][0]
        # Re-rank after adjustment to see whether the winner changed.
        sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True)
        # if sorted_list[0][0] == item['category'] and s1 != item['category']:
        #     print item['screen_name']
        if sorted_list[0][0] != item['category']:
            count += 1
    print count
    # Persist only the users whose scores were actually adjusted.
    for item in users_temp:
        db['typical'].update({'_id': item['_id']}, {"$set": {"category_score": item['category_score']}})
# print user['screen_name'] # # break # print count # for tt in user['tweets']: # print tt['text'] # extract_interset() # calc_sentiment() # calc_sentiment_score() # sentiment_dict_test() # try: # words = word_tokenize("What a beautiful sunday . happy") # print nltk.pos_tag(words) # except Exception as e: # print e db = MongoDB().connect() users = db['typical'].find() count = 1 data_set = { 'retweet_favorite_rate': [], 'fans_retweet_rate': [], 'fans_favorite_rate': [] } for user in users: tweets = user['tweets'] fans = user['followers_count'] # if fans > 2000000: # continue count += 1
def typical_category_statistics():
    """Render per-category classifier quality metrics for typical users.

    Treats the stored `category` as ground truth and the argmax of
    `category_score` as the prediction.  Computes per-category counts,
    error distribution, recall, precision, F-score and accuracy, plus a
    "second-chance" error count (argmax after zeroing the wrong winner),
    and weighted averages of all metrics.
    """
    db = MongoDB().connect()
    collect = db['typical']
    # NOTE(review): cursor.count() is removed in PyMongo 4;
    # count_documents({}) is the modern equivalent.
    total_count = collect.find({}, {}).count()
    users = collect.find({}, {'category': 1, '_id': 0, 'category_score': 1})
    category_name = [
        'Politics', 'Religion', 'Military', 'Economy', 'Technology',
        'Education', 'Agriculture', 'Entertainment', 'Sports'
    ]
    # Per-category stats skeleton, including a zeroed confusion row.
    category = {}
    for item in category_name:
        category[item] = {
            'count': 0,
            'error_count': 0,
            'sub_error_count': 0,
            'error_classified_count': 0,
            'error_distribution': {}
        }
        for name in category_name:
            category[item]['error_distribution'][name] = 0
    # A category cannot be confused with itself; drop the diagonal.
    for item in category:
        del category[item]['error_distribution'][item]
    error_count = 0
    sub_error_count = 0
    for item in users:
        category[item['category']]['count'] += 1
        category_score = item['category_score']
        max_category = max(category_score, key=category_score.get)
        if max_category != item['category']:
            # Misclassification: record it in both directions.
            category[item['category']]['error_distribution'][max_category] += 1
            category[max_category]['error_classified_count'] += 1
            error_count += 1
            category[item['category']]['error_count'] += 1
            # Second chance: zero the wrong winner and re-take the argmax.
            category_score[max_category] = 0
            max_category = max(category_score, key=category_score.get)
            if max_category != item['category']:
                sub_error_count += 1
                category[item['category']]['sub_error_count'] += 1
    # Derive the quality metrics.  NOTE(review): divisions assume every
    # category has count > 0 and recall+precision > 0 — a sparse database
    # would raise ZeroDivisionError here.
    for item in category:
        correct_count = category[item]['count'] - category[item]['error_count']
        category[item]['correct_count'] = correct_count
        category[item]['recall'] = round(
            correct_count * 1.0 / category[item]['count'], 4)
        category[item]['precision'] = round(
            correct_count * 1.0 /
            (correct_count + category[item]['error_classified_count']), 4)
        category[item]['f_score'] = round(
            2 * category[item]['recall'] * category[item]['precision'] /
            (category[item]['recall'] + category[item]['precision']), 4)
        category[item]['accuracy'] = round(
            (correct_count + total_count - category[item]['count'] -
             category[item]['error_classified_count']) * 1.0 / total_count, 4)
        # The category this one is most often confused with.
        category[item]['easy_wrong_category'] = max(
            category[item]['error_distribution'],
            key=category[item]['error_distribution'].get)
    # Count-weighted averages across categories.
    average_recall = 0
    average_precision = 0
    average_fscore = 0
    average_accuracy = 0
    for item in category:
        average_recall += category[item]['recall'] * category[item][
            'count'] / total_count
        average_precision += category[item]['precision'] * category[item][
            'count'] / total_count
        average_fscore += category[item]['f_score'] * category[item][
            'count'] / total_count
        average_accuracy += category[item]['accuracy'] * category[item][
            'count'] / total_count
    return render_template('portrayal/typical_category_statistics.html',
        category = category, total_count = total_count, error_count = error_count, \
        average_accuracy = round(average_accuracy, 4), sub_error_count = sub_error_count,
        average_recall = round(average_recall, 4),
        average_precision = round(average_precision, 4),
        average_fscore = round(average_fscore, 4))
def relation_thread(user):
    """Crawl a new typical user's friend list and wire them into the graph.

    Runs in a background thread (see portrayal_thread).  Creates the Neo4j
    node, pages through the user's friend ids, stores them in the Mongo
    'relation' collection, links follow edges both ways against every
    other known typical user, and finally — if the user's category scores
    are ambiguous — re-scores the category from graph neighbours.

    :param user: full portrayal dict as stored in the 'typical' collection.
    """
    user_id = user['_id']
    db = MongoDB().connect()
    collect = db['relation']
    user_id = long(user_id)
    # Already crawled: nothing to do.
    typical_user = collect.find_one({'_id': user_id})
    if typical_user:
        return
    user_node = Node("Typical",
                     user_id=user_id,
                     name=user['name'],
                     category=user['category'],
                     followers_count=user['followers_count'],
                     location=user['location'],
                     utc_offset=user['utc_offset'],
                     statuses_count=user['statuses_count'],
                     description=user['description'],
                     friends_count=user['friends_count'],
                     psy=user['psy'],
                     verified=user['verified'],
                     lang=user['lang'],
                     favourites_count=user['favourites_count'],
                     screen_name=user['screen_name'],
                     influence_score=user['influence_score'],
                     created_at=user['created_at'],
                     time_zone=user['time_zone'],
                     protected=user['protected'],
                     activity=user['activity'])
    graph.create(user_node)
    # Page through the friend-id API; cursor 0 marks the last page,
    # a falsy result aborts (rate limit / protected account?).
    cursor = -1
    friends = []
    while cursor != 0:
        out = relation_crawler.get_friendids_paged_sleep(user_id=user_id,
                                                         cursor=cursor,
                                                         count=5000)
        if not out:
            break
        friends = friends + out[2]
        cursor = out[0]
    collect.insert_one({'_id': user_id, 'friends': friends})
    # Cross-link against every other typical user already crawled.
    tus = collect.find()
    friends = set(friends)
    for item in tus:
        if item['_id'] == user_id:
            continue
        friend_node = selector.select("Typical", user_id=item['_id']).first()
        # friend_node1 = graph.find_one("Typical",
        #                               property_key = "user_id",
        #                               property_value = item['_id'])
        if item['_id'] in friends:
            following = Relationship(user_node, 'following', friend_node)
            graph.create(following)
        if user_id in set(item['friends']):
            following = Relationship(friend_node, 'following', user_node)
            graph.create(following)
    # --- graph-based category rescoring (same scheme as
    # update_user_category) ---
    sorted_list = sorted(user['category_score'].iteritems(),
                         key=lambda asd: asd[1],
                         reverse=True)
    # Clear winner: no adjustment needed.
    if sorted_list[0][1] > 2 * sorted_list[1][1] or sorted_list[0][
            1] - sorted_list[1][1] > 50:
        return
    # Bonus pool distributed among categories by neighbour votes.
    score_differ = (2 * sorted_list[0][1] - sorted_list[1][1] -
                    sorted_list[-1][1]) / 2
    category_name = [
        'Politics', 'Religion', 'Military', 'Economy', 'Technology',
        'Education', 'Agriculture', 'Entertainment', 'Sports'
    ]
    relation_dict = {}
    for name in category_name:
        relation_dict[name] = 0
    # Tally the categories of followees...
    cql = '''MATCH(a{user_id:%s})-[:following]->(f) return distinct f.user_id as user_id, f.category as category''' % (
        user['_id'])
    res = graph.data(cql)
    for f in res:
        relation_dict[f['category']] += 1
    # ...and of followers.
    cql = '''MATCH(a{user_id:%s})<-[:following]-(f) return distinct f.user_id as user_id, f.category as category''' % (
        user['_id'])
    res = graph.data(cql)
    for f in res:
        relation_dict[f['category']] += 1
    relation_total = 0
    for ri in relation_dict:
        relation_total += relation_dict[ri]
    # Too few neighbours to be informative.
    if relation_total < 10:
        return
    # NOTE(review): under Python 2 this floor-divides when both operands
    # are ints — confirm intended.
    for ri in relation_dict:
        user['category_score'][ri] += round(
            score_differ * relation_dict[ri] / relation_total, 2)
    category_score = user['category_score']
    max_category = max(category_score, key=lambda x: category_score[x])
    # Persist the adjusted scores and new winning category in Mongo...
    db['typical'].update(
        {'_id': user['_id']},
        {"$set": {
            "category_score": category_score,
            "category": max_category
        }})
    # ...and mirror the category onto the Neo4j node.
    node = selector.select("Typical", user_id=long(user['_id'])).first()
    node['category'] = max_category
    graph.push(node)
def typical_character_list_detail():
    """DataTables ajax endpoint: page through typical users with search.

    Parses the DataTables aoData payload for search text and paging
    window, queries Mongo accordingly and returns the rows plus the
    filtered total as JSON.
    """
    data = json.loads(request.form['aoData'])
    # Defaults guard against a payload that omits a key — the original
    # left these unbound and crashed with NameError further down.
    s_search = ''
    data_length = 10
    data_start = 0
    for item in data:
        if item['name'] == 'sSearch':
            s_search = item['value'].strip()
        if item['name'] == 'iDisplayLength':
            data_length = item['value']
        if item['name'] == 'iDisplayStart':
            data_start = item['value']
    data_length = int(data_length)
    db = MongoDB().connect()
    collect = db['typical']
    field = {
        'screen_name': 1,
        'name': 1,
        'friends_count': 1,
        'followers_count': 1,
        'statuses_count': 1,
        'influence_score': 1,
        'category': 1
    }
    if s_search == '':
        users = collect.find({}, field).skip(data_start).limit(data_length)
        count = collect.find({}, {'_id': 1}).count()
    else:
        # NOTE(review): s_search is interpolated unescaped, so regex
        # metacharacters in user input alter the query — consider
        # re.escape(s_search) if literal matching is intended.
        pattern = re.compile(".*" + s_search + ".*")
        # One clause per searchable field (the original listed "name"
        # twice; the duplicate was redundant in an $or).
        query = {
            "$or": [{"name": pattern},
                    {"category": pattern},
                    {"interest_tags": pattern},
                    {"screen_name": pattern},
                    {"description": pattern},
                    {"location": pattern}]
        }
        users = collect.find(query, field).skip(data_start).limit(data_length)
        count = collect.find(query, {'_id': 1}).count()
    res = []
    for u in users:
        res.append({
            "_id": str(u['_id']),
            "screen_name": u['screen_name'],
            "name": u['name'],
            "friends_count": u['friends_count'],
            "followers_count": u['followers_count'],
            "statuses_count": u['statuses_count'],
            "category": u['category'],
            "influence_score": round(u['influence_score'], 3)
        })
    return jsonify({'aaData': res, 'iTotalDisplayRecords': count})
def typical_character_detail(user_id):
    """Render the portrayal detail page for one typical user.

    Loads the user document (without tweets), massages several fields into
    template-friendly strings (comma-joined series, linkified hashtags,
    reformatted dates), pulls tweet count / first / last timestamps via an
    aggregation, fetches profile images, and lists up to 10 same-category
    users as "related".
    """
    mdb = MongoDB().connect()
    collect = mdb['typical']
    user = collect.find_one({'_id': long(user_id)}, {'tweets': 0})
    # Follower/friend ratio; falls back to the raw follower count when the
    # user follows nobody (avoids division by zero).
    user['ratio'] = user[
        'followers_count'] if not user['friends_count'] else round(
            user['followers_count'] * 1.0 / user['friends_count'], 2)
    # Twitter's created_at carries a '+0000 ' zone chunk that strptime's
    # default format cannot parse; strip it first.
    user['created_at'] = time.strftime(
        '%Y-%m-%d %H:%M:%S',
        time.strptime(user['created_at'].replace('+0000 ', '')))
    user['crawler_date'] = str(user['crawler_date']).split(" ")[0]
    user['interest_tags'] = user['interest_tags'].replace(',', ', ')
    # Turn #hashtags into links to twitter.com.
    user['interest_tags'] = re.sub(
        r'#(\w+)',
        "<a href='https://www.twitter.com/hashtag/\g<1>' target='_blank'>#\g<1></a>",
        user['interest_tags'])
    # Flatten the numeric series into comma-separated strings for the
    # charting code in the template.
    s = ''
    for item in user['activity_list']:
        s += "," + str(item)
    user['activity_list'] = s[1:]
    s = ''
    for item in user['psy_with_time1']:
        s += "," + str(item)
    user['psy_with_time1'] = s[1:]
    s = ''
    for item in user['psy_with_time2']:
        s += "," + str(item)
    user['psy_with_time2'] = s[1:]
    s = ''
    for item in user['psy_with_count1']:
        s += "," + str(item)
    user['psy_with_count1'] = s[1:]
    s = ''
    for item in user['psy_with_count2']:
        s += "," + str(item)
    user['psy_with_count2'] = s[1:]
    # Category score keys/values as two parallel comma-joined strings.
    s = ''
    s1 = ''
    for item in user['category_score']:
        s += "," + item
        s1 += "," + str(user['category_score'][item])
    user['category_score_keys'] = s[1:]
    user['category_score_values'] = s1[1:]
    # Aggregate tweet count plus first/last tweet timestamps without
    # loading the whole tweets array ($slice 0,1 = first element,
    # $slice -1 = last element).
    user_tweets = collect.aggregate([{"$match": {'_id': long(user_id)}}, \
        {"$project": {"length": {"$size": "$tweets"}, "first": {"$slice": ["$tweets.created_at", 0, 1]},\
        "last": {"$slice": ["$tweets.created_at", -1]}}}])
    for item in user_tweets:
        user['tweets_count'] = item['length']
        user['tweets_start_time'] = time.strftime(
            '%Y-%m-%d %H:%M:%S',
            time.strptime(item['first'][0].replace('+0000 ', '')))
        user['tweets_end_time'] = time.strftime(
            '%Y-%m-%d %H:%M:%S',
            time.strptime(item['last'][0].replace('+0000 ', '')))
        break
    # Cache the avatar locally for the template (side effect only).
    get_image(user['profile_image_url'], user['screen_name'])
    # Up to 10 other users in the same category, shown as "related".
    related_users = collect.find(
        {
            'category': user['category'],
            '_id': {
                "$ne": user['_id']
            }
        }, {
            'screen_name': 1,
            'name': 1,
            'profile_image_url': 1
        }).limit(10)
    ru_arr = []
    for ru in related_users:
        ru_arr.append({
            '_id': str(ru['_id']),
            'screen_name': ru['screen_name'],
            'name': ru['name']
        })
        get_image(ru['profile_image_url'], ru['screen_name'])
    return render_template('portrayal/typical_character_detail.html',
                           user=user,
                           related_users=ru_arr)