def timeline_sampling(dbname, mode='N'):
    """Grow the ``tcom`` user collection level by level from seed users.

    The function connects to ``dbname``, makes sure the indexes used below
    exist, passes the seed users returned by
    ``profiles_check.seed_all_profile`` to ``lookup.trans_seed_to_poi`` and
    then repeats, for ``level = 1, 2, ...``:

    1. ``timelines.monitor_timeline(poi, timel, 1)``
    2. ``timeline_network_miner.network_mining(poi, timel, bnet, level)``
    3. for every user stored with the current ``level``, collect the distinct
       ``id1`` values of that user's ``retweet`` / ``reply-to`` /
       ``dmentioned`` records in ``bnet`` and hand them, in batches of 100,
       to ``lookup.lookup_user_list`` tagged with ``level + 1``.

    The loop ends once ``tcom`` holds more than 4000 documents, or when the
    current level contains no users at all. The second condition is new: an
    empty level can never produce users for the next one, so the previous
    code kept crawling forever without making progress.

    Args:
        dbname: database name passed to ``dbt.db_connect_no_auth``.
        mode: forwarded unchanged to ``lookup.trans_seed_to_poi`` and
            ``lookup.lookup_user_list``.
    """
    db = dbt.db_connect_no_auth(dbname)
    poi = db['tcom']
    timel = db['times']
    bnet = db['bnet']
    stream_users = db['poi']

    poi.create_index("id", unique=True)
    poi.create_index("level", unique=False)
    timel.create_index([('user.id', pymongo.ASCENDING),
                        ('id', pymongo.DESCENDING)], unique=False)
    timel.create_index([('id', pymongo.ASCENDING)], unique=True)
    bnet.create_index([("id0", pymongo.ASCENDING),
                       ("id1", pymongo.ASCENDING),
                       ("relationship", pymongo.ASCENDING),
                       ("statusid", pymongo.ASCENDING)], unique=True)

    # NOTE: print(...) with a single parenthesised expression is also a valid
    # Python 2 print statement and emits exactly the same text.
    ed_seed = profiles_check.seed_all_profile(stream_users, 5)
    length = len(ed_seed)
    if length == 0:
        # Expansion still runs afterwards, exactly as before.
        print(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
              + ' ' + 'no seed users, finished!')
    else:
        print('seed users: ' + ' ' + str(length))
        lookup.trans_seed_to_poi(ed_seed, poi, mode)

    batch_size = 100   # ids handed to lookup.lookup_user_list per call
    max_users = 4000   # stop once tcom holds more documents than this
    interactions = ['retweet', 'reply-to', 'dmentioned']

    level = 1
    while True:
        timelines.monitor_timeline(poi, timel, 1)
        timeline_network_miner.network_mining(poi, timel, bnet, level)

        frontier_size = 0
        for user in poi.find({'level': level}):
            frontier_size += 1
            # De-duplicate the ids this user interacted with.
            neiblist = list({
                relate['id1']
                for relate in bnet.find({
                    'id0': user['id'],
                    'relationship': {'$in': interactions},
                })
            })
            # Slicing clamps at the end of the list, so the last batch is
            # simply shorter.
            for index_begin in range(0, len(neiblist), batch_size):
                lookup.lookup_user_list(
                    neiblist[index_begin:index_begin + batch_size],
                    poi, level + 1, mode)

        # An empty level cannot feed the next one: stop instead of looping
        # forever.
        if frontier_size == 0 or poi.count() > max_users:
            break
        level += 1
def monitor_timeline(time_index):
    """Crawl timelines for the ``sed``, ``srd`` and ``syg`` databases in turn.

    For each database the ``com`` and ``timeline`` collections are opened,
    the indexes used here are ensured, and both collections are handed to
    ``timelines.monitor_timeline``. Timestamped progress lines are printed
    before and after each crawl and around the whole run.

    Args:
        time_index: forwarded unchanged to ``timelines.monitor_timeline``.
    """
    def stamp():
        # Current local time in the format used by every progress line.
        return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # NOTE: print(...) with a single parenthesised expression is also a valid
    # Python 2 print statement and emits exactly the same text.
    print(stamp() + "\t" + "Start to crawl timelines")

    for name in ("sed", "srd", "syg"):
        database = dbt.db_connect_no_auth(name)
        users = database["com"]
        tweets = database["timeline"]

        users.create_index(
            [("timeline_scraped_times", pymongo.ASCENDING)], unique=False)
        tweets.create_index(
            [("user.id", pymongo.ASCENDING), ("id", pymongo.DESCENDING)],
            unique=False)
        tweets.create_index([("id", pymongo.ASCENDING)], unique=True)

        print(stamp() + "\t" + "Start to crawl timeline")
        timelines.monitor_timeline(users, tweets, time_index)
        print(stamp() + " " + "Finish a crawl")

    print(stamp() + "\t" + "Finish timlines crawl")
def monitor_timeline(time_index):
    """Crawl timelines for the first of the ``ded``/``drd``/``dyg`` databases.

    Three database names are listed, but the ``[:1]`` slice restricts the run
    to the first one. For that database the ``com`` and ``timeline``
    collections are opened, the indexes used here are ensured, and both
    collections are handed to ``timelines.monitor_timeline``. Timestamped
    progress lines are printed around each step.

    Args:
        time_index: forwarded unchanged to ``timelines.monitor_timeline``.
    """
    def stamp():
        # Current local time in the format used by every progress line.
        return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    candidates = ['ded', 'drd', 'dyg']

    # NOTE: print(...) with a single parenthesised expression is also a valid
    # Python 2 print statement and emits exactly the same text.
    print(stamp() + "\t" + 'Start to crawl timelines')

    # Only the first listed database is processed.
    for name in candidates[:1]:
        database = dbt.db_connect_no_auth(name)
        users = database['com']
        tweets = database['timeline']

        users.create_index(
            [('timeline_scraped_times', pymongo.ASCENDING)], unique=False)
        tweets.create_index(
            [('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)],
            unique=False)
        tweets.create_index([('id', pymongo.ASCENDING)], unique=True)

        print(stamp() + "\t" + 'Start to crawl timeline')
        timelines.monitor_timeline(users, tweets, time_index)
        print(stamp() + ' ' + 'Finish a crawl')

    print(stamp() + "\t" + 'Finish timlines crawl')
def timeline_sampling(dbname, mode='N'):
    """Grow the ``tcom`` user collection level by level from seed users.

    The function connects to ``dbname``, makes sure the indexes used below
    exist, passes the seed users returned by
    ``profiles_check.seed_all_profile`` to ``lookup.trans_seed_to_poi`` and
    then repeats, for ``level = 1, 2, ...``:

    1. ``timelines.monitor_timeline(poi, timel, 1)``
    2. ``timeline_network_miner.network_mining(poi, timel, bnet, level)``
    3. for every user stored with the current ``level``, collect the distinct
       ``id1`` values of that user's ``retweet`` / ``reply-to`` /
       ``dmentioned`` records in ``bnet`` and hand them, in batches of 100,
       to ``lookup.lookup_user_list`` tagged with ``level + 1``.

    The loop ends once ``tcom`` holds more than 4000 documents, or when the
    current level contains no users at all. The second condition is new: an
    empty level can never produce users for the next one, so the previous
    code kept crawling forever without making progress.

    Args:
        dbname: database name passed to ``dbt.db_connect_no_auth``.
        mode: forwarded unchanged to ``lookup.trans_seed_to_poi`` and
            ``lookup.lookup_user_list``.
    """
    db = dbt.db_connect_no_auth(dbname)
    poi = db['tcom']
    timel = db['times']
    bnet = db['bnet']
    stream_users = db['poi']

    poi.create_index("id", unique=True)
    poi.create_index("level", unique=False)
    timel.create_index([('user.id', pymongo.ASCENDING),
                        ('id', pymongo.DESCENDING)], unique=False)
    timel.create_index([('id', pymongo.ASCENDING)], unique=True)
    bnet.create_index([("id0", pymongo.ASCENDING),
                       ("id1", pymongo.ASCENDING),
                       ("relationship", pymongo.ASCENDING),
                       ("statusid", pymongo.ASCENDING)], unique=True)

    # NOTE: print(...) with a single parenthesised expression is also a valid
    # Python 2 print statement and emits exactly the same text.
    ed_seed = profiles_check.seed_all_profile(stream_users, 5)
    length = len(ed_seed)
    if length == 0:
        # Expansion still runs afterwards, exactly as before.
        print(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
              + ' ' + 'no seed users, finished!')
    else:
        print('seed users: ' + ' ' + str(length))
        lookup.trans_seed_to_poi(ed_seed, poi, mode)

    batch_size = 100   # ids handed to lookup.lookup_user_list per call
    max_users = 4000   # stop once tcom holds more documents than this
    interactions = ['retweet', 'reply-to', 'dmentioned']

    level = 1
    while True:
        timelines.monitor_timeline(poi, timel, 1)
        timeline_network_miner.network_mining(poi, timel, bnet, level)

        frontier_size = 0
        for user in poi.find({'level': level}):
            frontier_size += 1
            # De-duplicate the ids this user interacted with.
            neiblist = list({
                relate['id1']
                for relate in bnet.find({
                    'id0': user['id'],
                    'relationship': {'$in': interactions},
                })
            })
            # Slicing clamps at the end of the list, so the last batch is
            # simply shorter.
            for index_begin in range(0, len(neiblist), batch_size):
                lookup.lookup_user_list(
                    neiblist[index_begin:index_begin + batch_size],
                    poi, level + 1, mode)

        # An empty level cannot feed the next one: stop instead of looping
        # forever.
        if frontier_size == 0 or poi.count() > max_users:
            break
        level += 1
def monitor_timeline(time_index):
    """Crawl timelines for the first of the ``ded``/``drd``/``dyg`` databases.

    Three database names are listed, but the ``[:1]`` slice restricts the run
    to the first one. For that database the ``com`` and ``timeline``
    collections are opened, the indexes used here are ensured, and both
    collections are handed to ``timelines.monitor_timeline``. Timestamped
    progress lines are printed around each step.

    Args:
        time_index: forwarded unchanged to ``timelines.monitor_timeline``.
    """
    time_format = "%Y-%m-%d-%H-%M-%S"

    def log(separator, message):
        # Prefix the message with the current local time.
        now = datetime.datetime.now().strftime(time_format)
        # NOTE: print(...) with a single parenthesised expression is also a
        # valid Python 2 print statement and emits exactly the same text.
        print(now + separator + message)

    names = ['ded', 'drd', 'dyg']

    log("\t", 'Start to crawl timelines')

    # Only the first listed database is processed.
    for db_name in names[:1]:
        connection = dbt.db_connect_no_auth(db_name)
        user_coll = connection['com']
        timeline_coll = connection['timeline']

        user_coll.create_index(
            [('timeline_scraped_times', pymongo.ASCENDING)], unique=False)
        timeline_coll.create_index(
            [('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)],
            unique=False)
        timeline_coll.create_index([('id', pymongo.ASCENDING)], unique=True)

        log("\t", 'Start to crawl timeline')
        timelines.monitor_timeline(user_coll, timeline_coll, time_index)
        log(' ', 'Finish a crawl')

    log("\t", 'Finish timlines crawl')