def transform_net_data(dbname, colname, newdbname, newcolname):
    # Transform network data
    dbo = dbt.db_connect_no_auth(dbname)
    neto = dbo[colname]
    dbn = dbt.db_connect_no_auth(newdbname)
    netn = dbn[newcolname]
    netn.create_index([('user', pymongo.ASCENDING),
                       ('follower', pymongo.ASCENDING),
                       ('type', pymongo.ASCENDING)],
                      unique=True)
    for status in neto.find({'scraped_times': 1}):
        netn.insert(status)

def transform_data(dbname, colname, newdbname, newcolname, timeend):
    # Transform tweets posted no later than a given date point
    dbo = dbt.db_connect_no_auth(dbname)
    timeo = dbo[colname]
    dbn = dbt.db_connect_no_auth(newdbname)
    timen = dbn[newcolname]
    timen.create_index([('user.id', pymongo.ASCENDING),
                        ('id', pymongo.DESCENDING)])
    timen.create_index([('id', pymongo.ASCENDING)], unique=True)
    for status in timeo.find():
        ts = datetime.strptime(status['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        if ts <= timeend:
            timen.insert(status)

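# A minimal usage sketch for the two transform helpers above. The source
# database 'fed' and collections 'net'/'timeline' appear elsewhere in this
# code base; the snapshot database name 'fed_s' and the cutoff date are
# illustrative assumptions.
def example_transform_snapshot():
    cutoff = datetime(2016, 6, 1)  # hypothetical cutoff: keep tweets on/before this date
    transform_net_data('fed', 'net', 'fed_s', 'net')
    transform_data('fed', 'timeline', 'fed_s', 'timeline', cutoff)
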
def check_change(time_index):
    """Count how many users have changed their profiles, e.g., increased or
    decreased their follower numbers."""
    db = dbt.db_connect_no_auth('monitor')
    changedb = db['changes']
    changedb.create_index([('dataset', pymongo.ASCENDING),
                           ('statis_index', pymongo.DESCENDING)],
                          unique=True)
    datasets = ['ded', 'drd', 'dyg']
    check_keys = ['description', 'friends_count', 'followers_count', 'statuses_count']
    for dataset in datasets:
        dbs = dbt.db_connect_no_auth(dataset)
        sample_user = dbs['com']
        sample_time = dbs['timeline']
        sample_net = dbs['net']
        # Record to store
        changes = {'dataset': dataset, 'statis_index': time_index}
        # Check profile changes: 'description', 'friends_count', 'followers_count', 'statuses_count'
        for user in sample_user.find({'timeline_scraped_times': time_index,
                                      'timeline_count': {'$gt': 0}}):
            last_tweet = sample_time.find({'user.id': user['id']},
                                          {'id': 1, 'user': 1, 'created_at': 1}) \
                .sort([('id', -1)]).limit(1)[0]  # sort: 1 = ascending, -1 = descending
            if last_tweet:
                userc = last_tweet['user']
                for key in check_keys:
                    if user[key] != userc[key]:
                        value = changes.get(key, 0)
                        value += 1
                        changes[key] = value
                        # Update the newest profile values in the user database
                        sample_user.update_one({'id': user['id']},
                                               {'$set': {key: userc[key]}},
                                               upsert=False)
                        if 'count' in key:
                            if user[key] < userc[key]:
                                value = changes.get(key + '_inc', 0)
                                value += 1
                                changes[key + '_inc'] = value
                            elif user[key] > userc[key]:
                                value = changes.get(key + '_dec', 0)
                                value += 1
                                changes[key + '_dec'] = value
        # Check following changes among users
        count = sample_net.count({'scraped_times': time_index}) \
            - sample_net.count({'scraped_times': time_index - 1})
        changes['net_changes'] = count
        changes['statis_at'] = datetime.datetime.now().strftime('%a %b %d %H:%M:%S +0000 %Y')
        try:
            changedb.insert(changes)
        except pymongo.errors.DuplicateKeyError:
            pass

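# For reference, a change-statistics document written by check_change() has
# roughly this shape (the numbers below are made-up examples, not real data):
#
#     {'dataset': 'ded', 'statis_index': 3,
#      'description': 4, 'followers_count': 12,
#      'followers_count_inc': 9, 'followers_count_dec': 3,
#      'net_changes': 57, 'statis_at': 'Mon Jul 25 10:00:00 +0000 2016'}
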
def active_user_list(dbname, comname, timename):
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]
    com = db[comname]
    date = []
    pred_users = pickle.load(open('data/ed-rel.pick', 'r'))
    for uid in pred_users:
        user = com.find_one({'id': int(uid)})
        if user['level'] != 1:
            last_tweet = time.find({'user.id': int(uid)},
                                   {'id': 1, 'user': 1, 'created_at': 1}) \
                .sort([('id', -1)]).limit(1)[0]  # sort: 1 = ascending, -1 = descending
            datev = last_tweet['created_at']
            if isinstance(datev, basestring):
                datev = datetime.strptime(datev, '%a %b %d %H:%M:%S +0000 %Y')
            date.append(datetime(datev.year, datev.month, datev.day))
    print len(date)
    df = pd.DataFrame({'PredictED_nonED': date}, index=date)
    df.groupby([df.PredictED_nonED.dt.year, df.PredictED_nonED.dt.month]).count().plot(kind='bar')
    plt.xlabel('(Year, Month)')
    plt.ylabel('Count')
    plt.show()

def re_snowball_friends(olddbname, oldcomname, newdbname, newcomname):
    newdb = dbt.db_connect_no_auth(newdbname)
    newcom = newdb[newcomname]
    newnet = newdb['net']
    newcom.create_index('id', unique=True)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)], unique=False)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)], unique=False)
    newnet.create_index([('user', pymongo.ASCENDING),
                         ('follower', pymongo.ASCENDING)], unique=True)
    '''Retrieve ED core users'''
    ed_users = iot.get_values_one_field(olddbname, oldcomname, 'id', {'level': 1})
    # Look up profiles in batches of 100 (the Twitter users/lookup limit)
    list_size = len(ed_users)
    length = int(math.ceil(list_size / 100.0))
    for index in xrange(length):
        index_begin = index * 100
        index_end = min(list_size, index_begin + 100)
        lookup.lookup_user_list(ed_users[index_begin:index_end], newcom, 1, 'N')
    level = 1
    while True:
        # Each call of snowball_following and snowball_follower only processes up to 200 users
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(newcom, newnet, level, 'N')
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followers of seeds for sample db', level
        follower_flag = follower.snowball_follower(newcom, newnet, level, 'N')
        if following_flag == False and follower_flag == False:
            break
        else:
            continue

def process_tweet(dbname, comname, timename, label, filename):
    db = dbt.db_connect_no_auth(dbname)
    times = db[timename]
    user_list = iot.get_values_one_field(dbname, comname, 'id',
                                         {'timeline_count': {'$gt': 0}, 'lang': 'en'})
    target_users = []
    for user in user_list:
        context = ''
        for time in times.find({'user.id': user}).sort([('id', 1)]):
            if 'retweeted_status' in time:
                continue
            elif 'quoted_status' in time:
                continue
            else:
                text = process(time['text'])
                if text:
                    context += text + ' '
                else:
                    continue
        if len(context.split()) > 50:
            target_users.append(user)
            print '__label__' + label + ' , ' + context
    pickle.dump(target_users, open('data/' + filename + '.pick', 'w'))

def core_ed():
    idset = set()
    db = dbt.db_connect_no_auth('fed')
    com = db['com']
    for user in com.find({'level': 1}):
        idset.add(user['id_str'])
    return idset

def timeline_time(dbname, colname, timename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    timeline = db[timename]
    posts = {}
    dates = {}
    biolist = ['text_anal.gw.value',
               'text_anal.cw.value',
               # 'text_anal.edword_count.value',
               'text_anal.h.value',
               'text_anal.a.value',
               'text_anal.lw.value',
               'text_anal.hw.value']
    for user in com.find({'$and': [
            # {biolist[0]: {'$exists': True}},
            {biolist[1]: {'$exists': True}},
            {biolist[2]: {'$exists': True}},
            # {biolist[3]: {'$exists': True}},
            # {biolist[4]: {'$exists': True}},
            # {biolist[5]: {'$exists': True}}
    ]}):
        uid, timeline_count = user['id'], user['timeline_count']
        posts[uid] = timeline_count
        for tw in timeline.find({'user.id': uid}):
            ts = datetime.strptime(tw['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            datelist = dates.get(uid, [])
            datelist.append(ts)
            dates[uid] = datelist
    return posts, dates

def ed_user(dbname, colname):
    user_list = []
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    for user in com.find({'level': 1}, ['id']):
        user_list.append(str(user['id']))
    return user_list

def lifetime(dbname, comname, timename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    time = db[timename]
    during = []
    for user in com.find({'timeline_count': {'$gt': 0}}):
        newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True) \
            .sort([('id', -1)]).limit(1)[0]
        last = datetime.strptime(newtweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        account = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        print user['id'], last, account, (last.date() - account.date()).days + 1
        during.append((last.date() - account.date()).days + 1)
    pt.plot_config()
    plt.figure(1)
    plt.subplot(211)
    pt.sns.distplot(during)
    print np.mean(during), np.std(during)
    plt.axvline(np.mean(during), linestyle='--', color='k', label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()
    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0, 2700)
    plt.show()

def create_time(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    created_time = {}
    # The disabled filter below restricted users to those with text_anal
    # fields and a status; currently all users are included.
    # biolist = ['text_anal.gw.value', 'text_anal.cw.value',
    #            'text_anal.h.value', 'text_anal.a.value',
    #            'text_anal.lw.value', 'text_anal.hw.value']
    # for user in com.find({'$and': [{biolist[1]: {'$exists': True}},
    #                                {biolist[2]: {'$exists': True}},
    #                                {'status': {'$exists': True}}]}):
    for user in com.find({}):
        ts = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        created_time[user['id']] = ts
    # print max(created_time.values()), min(created_time.values())
    return created_time

def timeline(dbname, timename):
    db = dbt.db_connect_no_auth(dbname)
    timeline = db[timename]
    dates = []
    for status in timeline.find(no_cursor_timeout=True):
        dates.append(status['created_at'])
    return dates

def load_behavior_network(db_name, collection='None', btype='communication'):
    '''Tweet: 0; Retweet: 1; Reply: 2; Direct mention: 3; Undirected mention: 4'''
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    DG = DiGraph()
    if collection == 'None':
        cols = db_name  # db_name is already a collection object
    else:
        db = dbt.db_connect_no_auth(db_name)
        cols = db[collection]
    for row in cols.find({'type': {'$in': btype_dic[btype]}}):
        # if btype == 'retweet':
        #     n2 = row['id0']
        #     n1 = row['id1']
        # else:
        n1 = row['id0']
        n2 = row['id1']
        if n1 != n2:
            weightv = 1
            if DG.has_node(n1) and DG.has_node(n2) and DG.has_edge(n1, n2):
                DG[n1][n2]['weight'] += weightv
            else:
                DG.add_edge(n1, n2, weight=weightv)
    return DG

def image_main_color(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    poi = db[colname]
    color_list = {}
    index = 0
    for user in poi.find({'profile_banner_url': {'$exists': True},
                          'liwc_anal.result.WC': {'$exists': True}},
                         ['id', 'profile_banner_url']):
        uid = user['id']
        url = user['profile_banner_url']
        index += 1
        if index % 100 == 0:
            print 'Have processed users:', index
        try:
            main_colors = ic.main_colors(url)
            color_list[uid] = main_colors
        except urllib2.HTTPError:
            continue
        # if len(color_list) > 10000:
        #     break
    return color_list

def read_document(dbname, colname, timecol, uset=None):
    db = dbt.db_connect_no_auth(dbname)
    col = db[colname]
    timelines = db[timecol]
    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for retweets
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mentions
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for URLs
    documents = list()
    ids = list()
    for user in col.find({'timeline_count': {'$gt': 0}}, ['id']):
        uid = user['id']
        textmass = ''
        for tweet in timelines.find({'user.id': uid}):
            text = tweet['text'].encode('utf8')
            # Strip RT markers, @mentions, #hashtags and URLs
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            if not (text.endswith('.') or text.endswith('?') or text.endswith('!')):
                text += '.'
            textmass = textmass + ' ' + text.lower()
        words = textmass.split()
        # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
        if len(words) > 50:
            ids.append(uid)
            documents.append(textmass)
    pickle.dump(ids, open('data/doc_ids.pick', 'w'))
    return documents

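# A self-contained sketch of the cleaning step in read_document() above, run
# on a made-up tweet (the sample text is illustrative only). The four regexes
# strip the retweet marker, @mentions, #hashtags and URLs before word counting.
def example_clean_tweet():
    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')
    ugrex = re.compile(r'(https?://[^\s]+)')
    text = 'RT @someone: feeling better today #recovery https://t.co/xyz'
    for grex in (rtgrex, mgrex, hgrex, ugrex):
        text = grex.sub('', text)
    print(text.strip())  # -> feeling better today
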
def data_4_opinionfinder(dbname, comname, timename, outpath, filter={}):
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]
    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for retweets
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mentions
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for URLs
    users = io.get_values_one_field(dbname, comname, 'id_str', filter)
    userlist = list()
    for user in users:
        documents = list()
        for tweet in time.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # Strip RT markers, @mentions, #hashtags and URLs
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            if not (text.endswith('.') or text.endswith('?') or text.endswith('!')):
                text += '.'
            words = text.split()
            if len(words) > 0:
                documents.append(' '.join(words))
        if len(documents) > 0:
            with open(outpath + '/' + user + '.data', 'w') as fo:
                for document in documents:
                    fo.write(document + '\t\n')
            userlist.append(user)
    with open(outpath + '.doclist', 'w') as fo:
        for user in userlist:
            fo.write('database/' + outpath + '/' + user + '.data\n')

def process_db(dbname, poicol, timecol, bnetcol, level):
    # Connect db and collections
    db = dbutil.db_connect_no_auth(dbname)
    sample_poi = db[poicol]
    print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '\t' + 'Connected POI collection'
    sample_time = db[timecol]
    print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '\t' + 'Connected timeline collection'
    sample_network = db[bnetcol]
    sample_network.create_index([('id0', pymongo.ASCENDING),
                                 ('id1', pymongo.ASCENDING),
                                 ('type', pymongo.ASCENDING),
                                 ('statusid', pymongo.ASCENDING)],
                                unique=True)
    # Mark every POI as not yet analysed
    sample_poi.update_many({'net_anal.tnmined': True},
                           {'$set': {'net_anal.tnmined': False}},
                           upsert=False)
    print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '\t' + 'Connected network collection'
    network_mining(sample_poi, sample_time, sample_network, level)

def extract_behavior_subnetwork(db_name, comname, bnetname, sbnetname, index=0):
    db = dbt.db_connect_no_auth(db_name)
    if index != 0:
        comname = comname + '_t' + str(index)
        bnetname = bnetname + '_t' + str(index)
        sbnetname = sbnetname + '_t' + str(index)
    poi = db[comname]
    net = db[bnetname]
    tem = db[sbnetname]  # subset of the behavior network
    tem.create_index([('id0', pymongo.ASCENDING),
                      ('id1', pymongo.ASCENDING),
                      ('type', pymongo.ASCENDING),
                      ('statusid', pymongo.ASCENDING)],
                     unique=True)
    userl1 = set([])
    for user in poi.find({}, ['id']):
        userl1.add(user['id'])
    for user in userl1:
        for rel in net.find({'id0': user}):
            # follower = rel['id1']
            # if follower in userl1:
            try:
                tem.insert(rel)
            except pymongo.errors.DuplicateKeyError:
                pass

def getuid(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    sample_user = db[colname]
    uids = list()
    for user in sample_user.find({'level': 1}, ['id']):
        uids.append(user['id'])
    return uids

def target_set(dbname, comname):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    uset = set()
    for user in com.find({}, ['id']):
        uset.add(user['id'])
    return uset

def bio_statis(dbname, colname):
    db = dbutil.db_connect_no_auth(dbname)
    bio = db[colname]
    biolist = ['results.gw.value', 'results.cw.value', 'results.edword_count.value',
               'results.h.value', 'results.a.value', 'results.lw.value', 'results.hw.value']
    for name in biolist:
        user_count = {}
        for rec in bio.find({name: {'$exists': True}}):
            count = user_count.get(rec['uid'], 0)
            count += 1
            user_count[rec['uid']] = count
        change_count = 0
        for user in user_count.keys():
            if user_count[user] > 1:
                change_count += 1
        percent = float(len(user_count)) / 61580  # 61580: presumably the total user count
        change_per = float(change_count) / len(user_count)
        print ('%s, %.2f, %.2f' % (name, percent, change_per))
    count = bio.count({'$or': [{biolist[0]: {'$exists': True}},
                               {biolist[1]: {'$exists': True}},
                               # {biolist[2]: {'$exists': True}},
                               {biolist[3]: {'$exists': True}},
                               # {biolist[4]: {'$exists': True}},
                               {biolist[5]: {'$exists': True}},
                               {biolist[6]: {'$exists': True}}]})
    print ('Have anyone, %.2f' % (float(count) / 61580))

def ed_user(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    userlist = []
    for user in com.find():
        userlist.append(user['id_str'])
    return userlist

def get_users(dbname, colname, filter):
    user_set = set()
    db = dbt.db_connect_no_auth(dbname)
    cols = db[colname]
    for user in cols.find(filter, ['id']):
        user_set.add(user['id'])
    return user_set

def read_document(dbname, colname, timecol, uset=None):
    db = dbt.db_connect_no_auth(dbname)
    col = db[colname]
    timelines = db[timecol]
    for user in col.find({'timeline_count': {'$gt': 0}},
                         ['id', 'description'], no_cursor_timeout=True):
        uid = user['id']
        text = process(user['description'])
        if text:
            print str(uid) + '\t' + ' '.join(text.split())
        else:
            # Fall back to the five most recent tweets
            textmass = ''
            for tweet in timelines.find({'user.id': uid}, no_cursor_timeout=True) \
                    .sort([('id', -1)]).limit(5):
                text = process(tweet['text'])
                if text:
                    textmass += text + ' '
                else:
                    continue
            tokens = textmass.split()
            if len(tokens) >= 3:
                # topk = topKFrequent(tokens, 300)
                # words = [token for token in tokens if token in topk]
                print str(uid) + '\t' + ' '.join(tokens)
            else:
                continue

def bio_change(dbname, colname, timename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    time = db[timename]
    filter = {'liwc_anal.result.i': {'$exists': True},
              'new_liwc_anal.result.i': {'$exists': True}}
    cw, gw, all = 0, 0, 0
    for user in com.find(filter):
        newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True) \
            .sort([('id', -1)]).limit(1)[0]
        oldtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True) \
            .sort([('id', 1)]).limit(1)[0]
        newdes = newtweet['user']['description']
        olddes = oldtweet['user']['description']
        if newdes != olddes:
            all += 1
            newbio = des_miner.process_text(newdes)
            oldbio = des_miner.process_text(olddes)
            if 'cw' in newbio and 'cw' in oldbio:
                if newbio['cw']['value'] != oldbio['cw']['value']:
                    cw += 1
            if 'gw' in newbio and 'gw' in oldbio:
                if newbio['gw']['value'] != oldbio['gw']['value']:
                    gw += 1
    print cw, gw, all

def extract_behavior_subnetwork(db_name, comname, bnetname, sbnetname, index=0):
    db = dbt.db_connect_no_auth(db_name)
    if index != 0:
        comname = comname + '_t' + str(index)
        bnetname = bnetname + '_t' + str(index)
        sbnetname = sbnetname + '_t' + str(index)
    poi = db[comname]
    net = db[bnetname]
    tem = db[sbnetname]  # subset of the behavior network
    tem.create_index([('id0', pymongo.ASCENDING),
                      ('id1', pymongo.ASCENDING),
                      ('type', pymongo.ASCENDING),
                      ('statusid', pymongo.ASCENDING)],
                     unique=True)
    userl1 = set([])
    for user in poi.find({}, ['id']):
        userl1.add(user['id'])
    for user in userl1:
        for rel in net.find({'id0': user}):
            follower = rel['id1']
            if follower in userl1:
                try:
                    tem.insert(rel)
                except pymongo.errors.DuplicateKeyError:
                    pass

def word2vec_tweets(dbname, colname, timecol):
    # Load the word2vec model trained on tweets and represent each user as
    # the mean vector of the in-vocabulary words in their tweets
    model = gensim.models.Word2Vec.load('word2vec/fed_w2v.model')
    db = dbt.db_connect_no_auth(dbname)
    col = db[colname]
    timelines = db[timecol]
    for user in col.find({'timeline_count': {'$gt': 0}}, ['id'], no_cursor_timeout=True):
        uid = user['id']
        user_vec = []
        for tweet in timelines.find({'user.id': uid}, no_cursor_timeout=True):
            text = tweet['text'].encode('utf8')
            text = text.strip().lower()
            # Strip 'RT @', @mentions and URLs; hashtags are kept
            text = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", text)
            words = tokenizer.tokenize(text)
            for word in words:
                if word in model:
                    user_vec.append(model[word])
        if len(user_vec) > 0:
            vector = np.array(user_vec).mean(axis=0)
            col.update_one({'id': uid},
                           {'$set': {'w2v.mined': True,
                                     'w2v.result': vector.tolist()}},
                           upsert=False)

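# A self-contained sketch (with toy vectors, not the real model) of the
# averaging in word2vec_tweets(): a user is represented by the element-wise
# mean of the vectors of all in-vocabulary words in their tweets.
def example_mean_vector():
    toy_model = {'food': np.array([1.0, 0.0]),
                 'diet': np.array([0.0, 1.0])}
    words = ['food', 'diet', 'oov_word']  # 'oov_word' is skipped
    user_vec = [toy_model[w] for w in words if w in toy_model]
    print(np.array(user_vec).mean(axis=0))  # -> [0.5 0.5]
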
def read_tweets(dbname, timecol):
    '''Read tweets, excluding retweets'''
    db = dbt.db_connect_no_auth(dbname)
    timelines = db[timecol]
    for tweet in timelines.find({'retweeted_status': {'$exists': False}},
                                no_cursor_timeout=True):
        hashtags = tweet['entities']['hashtags']
        hash_set = set()
        for hash in hashtags:
            hash_set.add(hash['text'].encode('utf-8').lower()
                         .replace('_', '').replace('-', ''))
        text = tweet['text'].encode('utf8')
        uid = tweet['user']['id']
        text = text.strip().lower()
        # Strip 'RT @', @mentions and URLs; hashtags are kept
        text = re.sub(r"(?:(rt\ ?@)|@|https?://)\S+", "", text)
        words = tokenizer.tokenize(text)
        if len(words) > 3:
            print('%d\t%d\t%s\t%s' % (uid, tweet['id'], ' '.join(words),
                                      ' '.join(list(hash_set))))

def load_behavior_network(db_name, collection='None', btype='communication'):
    '''Tweet: 0; Retweet: 1; Reply: 2; Direct mention: 3; Undirected mention: 4'''
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3], 'communication': [2, 3]}
    DG = DiGraph()
    if collection == 'None':
        cols = db_name  # db_name is already a collection object
    else:
        db = dbt.db_connect_no_auth(db_name)
        cols = db[collection]
    for row in cols.find({'type': {'$in': btype_dic[btype]}}):
        if btype == 'retweet':
            # swap endpoints for retweet edges
            n2 = row['id0']
            n1 = row['id1']
        else:
            n1 = row['id0']
            n2 = row['id1']
        if n1 != n2:
            weightv = 1
            if DG.has_node(n1) and DG.has_node(n2) and DG.has_edge(n1, n2):
                DG[n1][n2]['weight'] += weightv
            else:
                DG.add_edge(n1, n2, weight=weightv)
    return DG

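# A minimal usage sketch for load_behavior_network(). The database name 'fed'
# and collection 'sbnet' are taken from elsewhere in this code base; that they
# hold a mined behavior network here is an assumption.
def example_build_graph():
    DG = load_behavior_network('fed', 'sbnet', btype='retweet')
    print('%d nodes, %d edges' % (DG.number_of_nodes(), DG.number_of_edges()))
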
def user_hashtag_profile(dbname, hash_com):
    '''
    Map the hashtags that a user has used to communities of the hashtag network.
    Get the <community: proportion> vector as each user's hashtag profile.
    :param dbname:
    :param hash_com:
    :return:
    '''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    db = dbt.db_connect_no_auth(dbname)
    com_length = len(set(hash_com.values()))
    times = db['timeline']
    user_hash_profile = {}
    for uid in ed_users:
        counter = {}
        for tweet in times.find({'user.id': uid,
                                 '$where': 'this.entities.hashtags.length>0'}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            for hash in hashtags:
                hash_set.add(hash['text'].encode('utf-8').lower()
                             .replace('_', '').replace('-', ''))
            for hash in list(hash_set):
                v = counter.get(hash, 0)
                counter[hash] = v + 1
        vector = [0.0] * com_length
        for hash in counter:
            if hash in hash_com:
                comid = hash_com[hash]
                vector[comid] += counter[hash]
        if sum(vector) == 0:
            user_hash_profile[uid] = np.array(vector)
        else:
            user_hash_profile[uid] = np.array(vector) / sum(vector)
    pickle.dump(user_hash_profile, open('data/user-hash-profile.pick', 'w'))

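# A self-contained sketch of the community-proportion computation in
# user_hashtag_profile(), with a made-up hashtag-to-community map and a
# made-up per-user hashtag counter.
def example_hashtag_profile():
    hash_com = {'recovery': 0, 'fitspo': 1, 'thinspo': 1}  # hypothetical communities
    counter = {'recovery': 2, 'thinspo': 1, 'unknowntag': 5}  # unmapped tags are ignored
    vector = [0.0] * len(set(hash_com.values()))
    for tag, freq in counter.items():
        if tag in hash_com:
            vector[hash_com[tag]] += freq
    total = sum(vector)
    print([v / total for v in vector])  # -> [0.666..., 0.333...]
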
def network_snowball(dbname, mode='N'):
    db = dbt.db_connect_no_auth(dbname)
    ed_poi = db['ccom']
    ed_net = db['cnet']
    stream_users = db['poi']
    ed_poi.create_index('id', unique=True)
    ed_poi.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)], unique=False)
    ed_poi.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)], unique=False)
    ed_net.create_index([('user', pymongo.ASCENDING),
                         ('follower', pymongo.ASCENDING),
                         ('type', pymongo.ASCENDING)], unique=True)
    # Move all available seed users from the stream into the POI collection
    while True:
        ed_seed = profiles_check.seed_all_profile(stream_users)
        length = len(ed_seed)
        if length == 0:
            print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'no seed users, finished!'
            break
        else:
            print 'seed users: ', length
            lookup.trans_seed_to_poi(ed_seed, ed_poi)
            continue
    statis = ''
    level = 1
    while level < 3:
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(ed_poi, ed_net, level, mode)
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followers of seeds for sample db', level
        follower_flag = follower.snowball_follower(ed_poi, ed_net, level, mode)
        if following_flag == False and follower_flag == False:
            return statis
        else:
            level += 1
            continue

def print_tweets(dbname, timeline):
    db = dbt.db_connect_no_auth(dbname)
    time = db[timeline]
    for tweet in time.find():
        try:
            print tweet['text']
        except UnicodeEncodeError:
            pass

def verify_core_user(dbname, colname, usetlist):
    """Verify the users in the largest K-core"""
    db = dbt.db_connect_no_auth(dbname)
    com = db[colname]
    for uid in usetlist:
        user = com.find_one({'id': int(uid)})
        if user['level'] != 1:
            print user['screen_name'].encode('utf-8')

def timeline(dbname, colname):
    db = dbt.db_connect_no_auth(dbname)
    timeline = db[colname]
    tlist = []
    for status in timeline.find():
        ts = datetime.strptime(status['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        tlist.append(ts)
    return tlist

def trim_user(dbname, timename):
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]
    # Reduce each tweet's embedded user object to just the user id
    for tweet in time.find({'user.screen_name': {'$exists': True}}, no_cursor_timeout=True):
        user = tweet['user']
        time.update_one({'id': tweet['id']},
                        {'$set': {'user': {'id': user['id']}}},
                        upsert=False)

def get_period(dbname, timename, newtimename):
    db = dbt.db_connect_no_auth(dbname)
    timeline = db[timename]
    newtimeline = db[newtimename]
    start = datetime(2013, 7, 25)
    end = datetime(2013, 7, 29)
    for status in timeline.find({'created_at_date': {'$gte': start, '$lt': end}},
                                no_cursor_timeout=True):
        newtimeline.insert(status)

def beh_stat(dbname, comname, colname, filename=None):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    timeline = db[colname]
    tweet_all, retweet_all, dmention_all, udmention_all, reply_all, \
        hashtag_all, url_all, quota_all, count_sum_all = 0, 0, 0, 0, 0, 0, 0, 0, 0
    user_stats = {}
    for user in com.find({}, ['id'], no_cursor_timeout=True):
        tweet, retweet, dmention, udmention, reply, hashtag, url, quota, count_sum = \
            0, 0, 0, 0, 0, 0, 0, 0, 0
        for status in timeline.find({'user.id': user['id']}, no_cursor_timeout=True):
            count_sum += 1
            count_sum_all += 1
            if 'retweeted_status' in status:
                retweet += 1
                retweet_all += 1
            else:
                tweet += 1
                tweet_all += 1
            if len(status['entities']['user_mentions']) > 0:
                udmention_list = []
                replyf, udmentionf, dmentionf = False, False, False
                # Collect user mentions carried over from the retweeted status
                if ('retweeted_status' in status) and \
                        len(status['retweeted_status']['entities']['user_mentions']) > 0:
                    for udmention_item in status['retweeted_status']['entities']['user_mentions']:
                        udmention_list.append(udmention_item['id'])
                for mention in status['entities']['user_mentions']:
                    if ('in_reply_to_user_id' in status) and \
                            (mention['id'] == status['in_reply_to_user_id']):
                        # reply
                        replyf = True
                    elif mention['id'] in udmention_list:
                        # mention inherited from retweeted content; undirected mention
                        udmentionf = True
                    else:
                        # original mention; directed mention
                        dmentionf = True
                if replyf:
                    reply += 1
                    reply_all += 1
                if udmentionf:
                    udmention += 1
                    udmention_all += 1
                if dmentionf:
                    dmention += 1
                    dmention_all += 1
            if len(status['entities']['hashtags']) > 0:
                hashtag += 1
                hashtag_all += 1
            if len(status['entities']['urls']) > 0:
                url += 1
                url_all += 1
            if 'quoted_status' in status:
                quota += 1
                quota_all += 1
        user_stats[user['id']] = (tweet, retweet, dmention, udmention, reply,
                                  hashtag, url, quota, count_sum)
    # The key -1 holds the global totals across all users
    user_stats[-1] = (tweet_all, retweet_all, dmention_all, udmention_all, reply_all,
                      hashtag_all, url_all, quota_all, count_sum_all)
    if filename:
        pickle.dump(user_stats, open('data/' + filename + '.pick', 'w'))
    return user_stats

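# A self-contained sketch of the mention classification inside beh_stat(),
# run on a made-up status dict shaped like a Twitter API tweet: a mention of
# the replied-to user counts as a reply, a mention copied over from the
# retweeted status is 'undirected', and any other mention is 'directed'.
def example_classify_mentions():
    status = {
        'in_reply_to_user_id': 1,
        'retweeted_status': {'entities': {'user_mentions': [{'id': 2}]}},
        'entities': {'user_mentions': [{'id': 1}, {'id': 2}, {'id': 3}]},
    }
    inherited = [m['id'] for m in
                 status['retweeted_status']['entities']['user_mentions']]
    for mention in status['entities']['user_mentions']:
        if mention['id'] == status.get('in_reply_to_user_id'):
            kind = 'reply'
        elif mention['id'] in inherited:
            kind = 'undirected mention'
        else:
            kind = 'directed mention'
        print('%s: %s' % (mention['id'], kind))  # -> 1: reply, 2: undirected, 3: directed
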
def transform():
    db = dbt.db_connect_no_auth('rd')
    cols = db['com']
    db = dbt.db_connect_no_auth('drd')
    cold = db['com']
    cold.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in cols.find({'level': 3},
                          ['id', 'screen_name', 'description', 'friends_count',
                           'followers_count', 'statuses_count']):
        cold.insert({'id': user['id'],
                     'screen_name': user['screen_name'],
                     'description': user['description'],
                     'friends_count': user['friends_count'],
                     'followers_count': user['followers_count'],
                     'statuses_count': user['statuses_count']})

def states_change(dbname1, dbname2, comname1, comname2):
    db1 = dbt.db_connect_no_auth(dbname1)
    db2 = dbt.db_connect_no_auth(dbname2)
    com1 = db1[comname1]
    com2 = db2[comname2]
    count = 0
    index = 0
    for user1 in com1.find({'level': 1}):
        index += 1
        user1_ed = profiles_check.check_ed(user1)
        user2 = com2.find_one({'id': user1['id']})
        if user2:
            user2_ed = profiles_check.check_ed(user2)
            if user1_ed != user2_ed:
                print user1['id']
                count += 1
    print count
    print index

def netstatis(dbname, behavior_name, g, userlist, comname):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    g = g.as_undirected(combine_edges=dict(weight='sum'))
    gnode = g.vs['name']
    target_nodes = list(set(userlist).intersection(gnode))
    '''Remove nodes with tailed strength (currently disabled)'''
    # strengths = np.array(g.strength(target_nodes, mode='OUT', loops=False, weights='weight'))
    # maxv, minv = np.percentile(strengths, 97.5), np.percentile(strengths, 2.5)
    # index = np.logical_and(strengths >= minv, strengths <= maxv)
    # target_nodes = np.asarray(target_nodes, dtype=str)[index]
    degreess = g.degree(target_nodes, mode='OUT', loops=False)
    divs = np.array(g.diversity(target_nodes, 'weight')) * np.log(degreess)
    '''Store in DB'''
    for i in xrange(len(target_nodes)):
        node = target_nodes[i]
        user = com.find_one({'id': int(node)})
        data = user.get('behavior', {})
        diver = divs[i]
        if not np.isfinite(diver):
            diver = 0.0
        data[behavior_name + '_div'] = diver
        com.update_one({'id': int(node)}, {'$set': {'behavior': data}}, upsert=False)
    divs[~np.isfinite(divs)] = 0.0
    return divs

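# A small numeric sketch of the score stored by netstatis(): igraph's
# per-node diversity (the normalized entropy of the weights of a node's
# incident edges) is scaled by log(degree), and non-finite results (e.g.,
# from degree 0 or 1) are zeroed. The numbers below are made up.
def example_diversity_score():
    diversity = np.array([0.9, 1.0, np.nan])  # pretend igraph output
    degrees = np.array([4.0, 1.0, 0.0])
    divs = diversity * np.log(degrees)  # log(1) -> 0, log(0) -> -inf
    divs[~np.isfinite(divs)] = 0.0
    print(divs)  # -> [1.24766...  0.  0.]
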
def test_timline():
    db = dbt.db_connect_no_auth('rd')
    cols = db['com']
    for user in cols.find({'timeline_count': {'$lt': 3200}},
                          ['id', 'timeline_count', 'statuses_count']):
        if (user['statuses_count'] - user['timeline_count']) > 100:
            print user['id']
            cols.update({'id': user['id']},
                        {'$set': {'timeline_count': 0, 'timeline_scraped_times': 0}},
                        upsert=False)

def get_retweeted_tweet(db_name):
    db = dbt.db_connect_no_auth(db_name)
    bnet = db['sbnet']
    timeline = db['timeline']
    for net in bnet.find({}):
        sid = net['statusid']
        orig = timeline.find_one({'id': sid}, ['retweeted_status'])
        oid = orig['retweeted_status']['id']
        bnet.update({'statusid': sid}, {'$set': {'ostatusid': oid}})