def main():
    """Compute and save behavior statistics for every user, batch by batch.

    Loads the user id list from ``tables/users.txt`` and the topic dict
    from ``load_topic()``, then walks the user list in fixed-size batches.
    For each batch the result of ``get_behavior_statics`` is passed to
    ``save_behavior_statics`` together with the running topic/comment
    counters; the pair it returns becomes the new counters.  The final
    counter values are logged once all batches have been handled.
    """
    log.info('Loading user ids...')
    uid_list = load_user_list('tables/users.txt')  # load users
    log.info('Loading user id, done.')

    # Load the topics of all groups.
    topic_dict = load_topic()

    # A lot of content has to be stored, so only a fixed number of users
    # (100) is processed at a time.
    batch_size = 100
    total_users = len(uid_list)

    topic_count = 0
    comment_count = 0
    for start in range(0, total_users, batch_size):
        # Clamp the upper bound so the final, possibly shorter, batch is
        # sliced and reported with its real end instead of start + 100.
        end = min(start + batch_size, total_users)
        puid_list = uid_list[start:end]

        behavior = get_behavior_statics(puid_list, topic_dict)
        # save to file
        topic_count, comment_count = save_behavior_statics(
            behavior, topic_count, comment_count)
        log.info('Processing uid range: [%d, %d)' % (start, end))

    log.info('Total number of topics used: %d' % topic_count)
    log.info('Total number of comments used: %d' % comment_count)
    log.info('Thank God. It\'s done.')
def filter_user(interest_info):
    """Placeholder that currently does nothing and returns None.

    Note: later on, some users may be filtered out based on the
    interest information.
    """
    pass


if __name__ == '__main__':
    # The group id is the first (and only required) command-line argument.
    if len(sys.argv) < 2:
        print('Group ID not provided.')
        sys.exit(1)

    group_id = sys.argv[1]
    log.info('Prepare user interest for group: %s' % group_id)

    print('Loading users...')
    path = 'social/' + group_id + '/all-users-' + group_id
    uid_list = load_user_list(path)
    total_user = len(uid_list)
    # The same message goes to both the log and stdout.
    users_loaded_msg = 'Number of users loaded: %d' % total_user
    log.info(users_loaded_msg)
    print(users_loaded_msg)

    print('Loading model and dict...')
    model_path = ('ldamodels/' + group_id
                  + '/title-comment-' + group_id + '.ldamodel')
    dict_path = ('ldamodels/' + group_id
                 + '/dict-title-comment-' + group_id + '.dict')

    log.info('Loading LDA model...')
    ldamodel = models.ldamodel.LdaModel.load(model_path)  # load model
    log.info('Loading dict...')
    dictionary = corpora.dictionary.Dictionary.load(dict_path)  # load dict

    print('Gen user interest...')
    topic_path = 'tables/' + group_id + '/TopicInfo-raw-all-' + group_id
    comment_path = 'tables/' + group_id + '/CommentInfo-raw-all-' + group_id