def update_targets(week): target_week_data = "data/" + ("week_%s/" % week) + "target_users.pkl" invalid_week_data = "data/" + ("week_%s/" % week) + "invalid_users.pkl" update_info = {} invalid_users = [] with open("data/target_users.pkl") as orig: targets = cPickle.load(orig) # for index, t in enumerate(targets, start=1): # print "--- get user=%s(%d:%d) ---" % (t, index, len(targets)) # result = get_user_info(t) # if result is None: # print "--- user=%s get invalid info ---" % t # invalid_users.append(t) # update_info[t] = result from utils import iter_pool_do info_iter = iter_pool_do(get_user_info, targets.keys()) index = 1 for name, info in info_iter: print "--- get user=%s(%d:%d) ---" % (name, index, len(targets)) if info is None: print "--- user=%s get invalid info ---" % name invalid_users.append(name) update_info[name] = info index += 1 print "---- update to file %s %d targets---" % ( target_week_data, len(update_info)) save(target_week_data, update_info) print "---- update to file %s %d invalids---" % ( invalid_week_data, len(invalid_users)) save(invalid_week_data, invalid_users) return update_info
def get_week_range_history(start_week, end_week):
    """Collect listening history for every user between the two weeks.

    The heavy lifting happens inside dispatch_one_user; this function just
    loads the user list from the history db and drains the worker pool.
    """
    cursor, conn = prepare_history_db(start_week, end_week)
    pending_users = restore_from_db(cursor)
    History.total_user = len(pending_users)
    indexed_users = [(pos, user)
                     for pos, user in enumerate(pending_users, start=1)]
    # iter_pool_do is lazy: iterate it to completion so every task runs.
    for _done in iter_pool_do(dispatch_one_user, indexed_users, cap=4):
        pass
def get_friends_history(filename):
    """Collect history for the user ranges stored in the given db file.

    Work is delegated to dispatch_one_user via a worker pool of 10.
    """
    cursor, conn = prepare_history_db(filename)
    user_ranges = restore_from_db(cursor)
    History.total_user = len(user_ranges)
    numbered = []
    for pos, item in enumerate(user_ranges, start=1):
        numbered.append((pos, item))
    # drain the lazy pool iterator so all dispatches actually execute
    for _done in iter_pool_do(dispatch_one_user, numbered, cap=10):
        pass
def scheduling_scrape(week):
    """Scrape friend-listener data for the given week, resumably.

    Progress lives in the week's sqlite db (meta_info table), so an
    interrupted run can pick up where it left off on the next invocation.
    """
    db = create_friend_listeners_table(week)
    count = db.execute("select count(*) from meta_info;").fetchone()[0]
    if count == 0:
        # first run for this week: seed the progress table
        initialize_friend_listeners_table(week, db)
    progress = load_progress(db, week)
    log_file = open("data/week_%s/friends.log" % week, "a")
    total = len(progress)
    FriendHistory.setup(db, total, log_file)
    gen = iter_pool_do(dispatch_one_user, progress, cap=10)
    # iter_pool_do is lazy; drain it so every user is processed
    for g in gen:
        pass
    # fix: the log handle used to leak. By this point the pool is fully
    # drained, so no worker should still be writing to it.
    log_file.close()
def get_playcount_and_love():
    # For every (track, target, friend-of-target) triple, ask last.fm-style
    # API (via friend_like) for the friend's playcount/loved status of the
    # track and store hits in sqlite. The whole scan is checkpointed to
    # save_for_friend.pkl so an interrupted run resumes where it stopped.
    CONN = sqlite3.connect('data/friends_listened.db')
    CURSOR = CONN.cursor()
    CURSOR.executescript("""
    create table if not exists playcount_and_love (
        target, friendname, track, artist, playcount, loved, timestamp
    );
    """)
    targets = get_targets()
    tracks = get_tracks()
    save_file = 'save_for_friend.pkl'
    error_file = open('error_file.txt', 'a')
    if os.path.exists(save_file):
        # resume: restore loop offsets and the set of friends already probed
        # for the current track
        obj = pickle.load(open(save_file))
        last_index1 = obj['index1']
        next_index2 = obj['index2']
        already_fetched = obj['already']
    else:
        last_index1 = 0
        next_index2 = 0
        already_fetched = set()
    # index1/index2 are 1-based display counters; the slices skip work that a
    # previous run already checkpointed.
    for index1, t in enumerate(tracks[last_index1:], start=1+last_index1):
        track = t[0]
        artist = t[1]
        for index2, target in enumerate(targets[next_index2:],
                                        start=1+next_index2):
            friends = get_target_friends(target)
            # skip friends already probed for this track (possibly via
            # another target's friend list)
            filted_friends = [f for f in friends if f not in already_fetched]
            func = functools.partial(friend_like, track, artist)
            generator = iter_pool_do(func, filted_friends, cap=10)
            index3 = 1
            for friend, result in generator:
                print "--- [%s(%d:%d) %s(%d:%d) %s(%d:%d)] ---" % (
                    track, index1, len(tracks), target, index2, len(targets),
                    friend, index3, len(filted_friends))
                if result:
                    # insert in to db
                    playcount, loved = result
                    if playcount:
                        print "--- get valid record! ---"
                        CURSOR.execute(
                            "insert into playcount_and_love values (?, ?, ?, ?, ?, ?, ?)",
                            (
                                target, friend, track, artist,
                                playcount, loved, int(time.time())
                            )
                        )
                    else:
                        # zero playcount: remember the friend so we never ask
                        # again for this track
                        already_fetched.add(friend)
                else:
                    # fetch failed: log the triple for a later retry pass
                    print >> error_file, "(%s) (%s) (%s)" % (
                        track.encode('utf-8'),
                        target.encode('utf-8'),
                        friend.encode('utf-8'))
                    error_file.flush()
                index3 += 1
            # Checkpoint after each target. 'index1' is stored as index1-1
            # (the slice offset of the CURRENT track) so a resumed run
            # re-enters this same track; 'index2' skips the targets already
            # finished for it. NOTE(review): the exact nesting of this
            # checkpoint block was reconstructed from a collapsed source —
            # confirm it sits at the per-target level.
            save_obj = {
                'index1': index1-1,
                'index2': index2,
                'already': already_fetched
            }
            save(save_file, save_obj)
            CONN.commit()
        # prepare for next track: the already-probed set and the target
        # offset only apply within one track
        already_fetched = set()
        next_index2 = 0