Example #1
0
def timeline_sampling(dbname, mode='N'):
    """Expand the ``tcom`` user collection level by level from seed users.

    Steps, in order:

    1. Connect to ``dbname`` and ensure the indexes on the ``tcom``,
       ``times`` and ``bnet`` collections.
    2. Ask ``profiles_check.seed_all_profile`` for seeds taken from the
       ``poi`` collection and, if there are any, hand them to
       ``lookup.trans_seed_to_poi``.
    3. Repeat per level: run ``timelines.monitor_timeline`` and
       ``timeline_network_miner.network_mining``, then for every ``tcom``
       user at the current level collect the distinct ``id1`` values of its
       ``bnet`` records (relationships retweet / reply-to / dmentioned) and
       pass them to ``lookup.lookup_user_list`` in batches of at most 100,
       tagged with ``level + 1``.

    The loop stops only once ``tcom`` holds more than 4000 documents.

    :param dbname: database name given to ``dbt.db_connect_no_auth``.
    :param mode: forwarded unchanged to the ``lookup`` helpers; its meaning
        is not visible from this function.
    """
    database = dbt.db_connect_no_auth(dbname)
    users = database['tcom']
    tweets = database['times']
    edges = database['bnet']
    seed_source = database['poi']

    users.create_index("id", unique=True)
    users.create_index("level", unique=False)
    per_user_newest_first = [('user.id', pymongo.ASCENDING),
                             ('id', pymongo.DESCENDING)]
    tweets.create_index(per_user_newest_first, unique=False)
    tweets.create_index([('id', pymongo.ASCENDING)], unique=True)
    edge_identity = [("id0", pymongo.ASCENDING),
                     ("id1", pymongo.ASCENDING),
                     ("relationship", pymongo.ASCENDING),
                     ("statusid", pymongo.ASCENDING)]
    edges.create_index(edge_identity, unique=True)

    seeds = profiles_check.seed_all_profile(seed_source, 5)
    seed_count = len(seeds)
    if seed_count:
        # Explicit separator reproduces the space that a two-item print
        # statement inserts between its items.
        print('seed users: ' + ' ' + str(seed_count))
        lookup.trans_seed_to_poi(seeds, users, mode)
    else:
        # Sampling below still runs when no seeds were found.
        stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        print(stamp + ' ' + 'no seed users, finished!')

    batch_size = 100
    level = 1
    while True:
        timelines.monitor_timeline(users, tweets, 1)
        timeline_network_miner.network_mining(users, tweets, edges, level)
        for member in users.find({'level': level}):
            query = {
                'id0': member['id'],
                'relationship': {
                    '$in': ['retweet', 'reply-to', 'dmentioned']
                },
            }
            distinct_ids = set()
            for edge in edges.find(query):
                distinct_ids.add(edge['id1'])
            neighbours = list(distinct_ids)
            # Stepping by batch_size yields ceil(len / 100) slices; the
            # final slice is simply shorter when the list does not divide.
            for start in range(0, len(neighbours), batch_size):
                batch = neighbours[start:start + batch_size]
                lookup.lookup_user_list(batch, users, level + 1, mode)
        if users.count() > 4000:
            break
        level += 1
Example #2
0
def monitor_timeline(time_index):
    """Run the timeline monitor over the ``sed``, ``srd`` and ``syg`` databases.

    For each database this ensures the indexes on its ``com`` and
    ``timeline`` collections and then calls ``timelines.monitor_timeline``
    on that pair.  Timestamped progress lines are printed before and after
    the whole run and around each database.

    :param time_index: forwarded unchanged to ``timelines.monitor_timeline``;
        its meaning is not visible from this function.
    """
    def now_stamp():
        # Current local time in the layout used by every progress line.
        return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    print(now_stamp() + "\t" + "Start to crawl timelines")
    for name in ("sed", "srd", "syg"):
        database = dbt.db_connect_no_auth(name)
        users = database["com"]
        tweets = database["timeline"]

        users.create_index(
            [("timeline_scraped_times", pymongo.ASCENDING)], unique=False)
        tweets.create_index(
            [("user.id", pymongo.ASCENDING), ("id", pymongo.DESCENDING)],
            unique=False)
        tweets.create_index([("id", pymongo.ASCENDING)], unique=True)

        print(now_stamp() + "\t" + "Start to crawl timeline")
        timelines.monitor_timeline(users, tweets, time_index)
        # This line is space-separated (not tab-separated) in the output.
        print(now_stamp() + " " + "Finish a crawl")

    print(now_stamp() + "\t" + "Finish timlines crawl")
Example #3
0
def monitor_timeline(time_index):
    """Run the timeline monitor over the first configured database.

    Three database names are listed (``ded``, ``drd``, ``dyg``) but only
    the first one is processed because the loop iterates ``datasets[:1]``.
    For that database this ensures the indexes on its ``com`` and
    ``timeline`` collections and calls ``timelines.monitor_timeline`` on
    the pair, printing timestamped progress lines along the way.

    :param time_index: forwarded unchanged to ``timelines.monitor_timeline``;
        its meaning is not visible from this function.
    """
    def now_stamp():
        # Current local time in the layout used by every progress line.
        return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    datasets = ['ded', 'drd', 'dyg']
    print(now_stamp() + "\t" + 'Start to crawl timelines')
    # NOTE(review): the slice restricts the run to 'ded' only - confirm
    # whether skipping the other two databases is intentional.
    for name in datasets[:1]:
        database = dbt.db_connect_no_auth(name)
        users = database['com']
        tweets = database['timeline']

        scrape_count_key = [('timeline_scraped_times', pymongo.ASCENDING)]
        users.create_index(scrape_count_key, unique=False)
        per_user_newest_first = [('user.id', pymongo.ASCENDING),
                                 ('id', pymongo.DESCENDING)]
        tweets.create_index(per_user_newest_first, unique=False)
        tweets.create_index([('id', pymongo.ASCENDING)], unique=True)

        print(now_stamp() + "\t" + 'Start to crawl timeline')
        timelines.monitor_timeline(users, tweets, time_index)
        # This line is space-separated (not tab-separated) in the output.
        print(now_stamp() + ' ' + 'Finish a crawl')

    print(now_stamp() + "\t" + 'Finish timlines crawl')
Example #4
0
def timeline_sampling(dbname, mode='N'):
    """Sample users outward from seeds, one level per loop pass.

    After connecting to ``dbname`` and ensuring the indexes on ``tcom``,
    ``times`` and ``bnet``, seeds obtained from
    ``profiles_check.seed_all_profile`` (over the ``poi`` collection) are
    passed to ``lookup.trans_seed_to_poi`` when any exist.  Each loop pass
    then calls ``timelines.monitor_timeline`` and
    ``timeline_network_miner.network_mining`` and, for every ``tcom`` user
    at the current level, sends the distinct ``id1`` values of its
    retweet / reply-to / dmentioned ``bnet`` records to
    ``lookup.lookup_user_list`` in slices of at most 100 ids with level
    ``level + 1``.  Passes repeat until ``tcom`` contains more than 4000
    documents.

    :param dbname: database name given to ``dbt.db_connect_no_auth``.
    :param mode: forwarded unchanged to the ``lookup`` helpers; its meaning
        is not visible from this function.
    """
    conn = dbt.db_connect_no_auth(dbname)
    sampled = conn['tcom']
    statuses = conn['times']
    relations = conn['bnet']
    candidates = conn['poi']

    ascending = pymongo.ASCENDING
    sampled.create_index("id", unique=True)
    sampled.create_index("level", unique=False)
    statuses.create_index(
        [('user.id', ascending), ('id', pymongo.DESCENDING)],
        unique=False)
    statuses.create_index([('id', ascending)], unique=True)
    relations.create_index(
        [("id0", ascending),
         ("id1", ascending),
         ("relationship", ascending),
         ("statusid", ascending)],
        unique=True)

    seed_profiles = profiles_check.seed_all_profile(candidates, 5)
    how_many = len(seed_profiles)
    if how_many == 0:
        # Only reports the empty seed set; the sampling loop still runs.
        when = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        print(' '.join([when, 'no seed users, finished!']))
    else:
        # Joining with a space reproduces the separator that a two-item
        # print statement inserts between its items.
        print(' '.join(['seed users: ', str(how_many)]))
        lookup.trans_seed_to_poi(seed_profiles, sampled, mode)

    chunk = 100
    level = 1
    reached_target = False
    while not reached_target:
        timelines.monitor_timeline(sampled, statuses, 1)
        timeline_network_miner.network_mining(sampled, statuses, relations, level)
        for person in sampled.find({'level': level}):
            linked = relations.find({
                'id0': person['id'],
                'relationship': {'$in': ['retweet', 'reply-to', 'dmentioned']},
            })
            targets = list(set(record['id1'] for record in linked))
            # One call per slice of up to `chunk` ids; no call when empty.
            for offset in range(0, len(targets), chunk):
                lookup.lookup_user_list(targets[offset:offset + chunk],
                                        sampled, level + 1, mode)
        reached_target = sampled.count() > 4000
        if not reached_target:
            level += 1
Example #5
0
def monitor_timeline(time_index):
    """Run the timeline monitor for the first database in the dataset list.

    The list names ``ded``, ``drd`` and ``dyg``, but iteration is limited
    to ``datasets[:1]``, so only the first is handled.  For it, indexes on
    the ``com`` and ``timeline`` collections are ensured and
    ``timelines.monitor_timeline`` is called with both collections.
    Timestamped progress messages are printed throughout.

    :param time_index: forwarded unchanged to ``timelines.monitor_timeline``;
        its meaning is not visible from this function.
    """
    stamp_format = "%Y-%m-%d-%H-%M-%S"
    datasets = ['ded', 'drd', 'dyg']

    started = datetime.datetime.now().strftime(stamp_format)
    print(started + "\t" + 'Start to crawl timelines')

    # NOTE(review): only the first entry is processed because of the slice;
    # confirm that leaving out the remaining databases is deliberate.
    for db_name in datasets[:1]:
        handle = dbt.db_connect_no_auth(db_name)
        accounts = handle['com']
        posts = handle['timeline']

        accounts.create_index(
            [('timeline_scraped_times', pymongo.ASCENDING)],
            unique=False)
        posts.create_index(
            [('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)],
            unique=False)
        posts.create_index([('id', pymongo.ASCENDING)], unique=True)

        before = datetime.datetime.now().strftime(stamp_format)
        print(before + "\t" + 'Start to crawl timeline')
        timelines.monitor_timeline(accounts, posts, time_index)
        after = datetime.datetime.now().strftime(stamp_format)
        # This message is space-separated (not tab-separated) in the output.
        print(after + ' ' + 'Finish a crawl')

    ended = datetime.datetime.now().strftime(stamp_format)
    print(ended + "\t" + 'Finish timlines crawl')