def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    # embed every user id that appears in the complete set
    complete_user_id_set = set()
    with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name), 'r') as fin:
        for line in fin:
            tid, root_uid, _ = line.rstrip().split(',', 2)
            complete_user_id_set.add(root_uid)
    embed_uid_dict = {'u{0}'.format(embed): uid
                      for embed, uid in enumerate(sorted(complete_user_id_set))}
    num_user_complete = len(embed_uid_dict)
    print('{0} users appear in the complete set'.format(num_user_complete))

    with open('../networks/{0}_embed_user.txt'.format(app_name), 'w') as fout:
        for uid in sorted(embed_uid_dict.keys()):
            fout.write('{0},{1}\n'.format(uid, embed_uid_dict[uid]))
    print('>>> Finish embedding users')
    timer.stop()

    # embed every hashtag that appears in the complete set
    complete_hashtag_id_set = set()
    with open('../data/{0}_out/complete_hashtag_{0}.txt'.format(app_name), 'r', encoding='utf-8') as fin:
        for line in fin:
            tid, *hashtags = line.rstrip().lower().split(',')
            complete_hashtag_id_set.update(hashtags)
    embed_hid_dict = {'h{0}'.format(embed): hashtag
                      for embed, hashtag in enumerate(sorted(complete_hashtag_id_set))}
    num_hashtag_complete = len(embed_hid_dict)
    print('{0} hashtags appear in the complete set'.format(num_hashtag_complete))

    with open('../networks/{0}_embed_hashtag.txt'.format(app_name), 'w', encoding='utf-8') as fout:
        for hid in sorted(embed_hid_dict.keys()):
            fout.write('{0},{1}\n'.format(hid, embed_hid_dict[hid]))
    print('>>> Finish embedding hashtags')
    timer.stop()
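# The Timer helper used throughout these scripts is not shown in this excerpt.
# A minimal sketch of a class compatible with how it is called here
# (timer = Timer(); timer.start(); timer.stop()) -- an assumption, not the
# repository's actual implementation:
import time


class Timer:
    def __init__(self):
        self._start = None

    def start(self):
        self._start = time.time()

    def stop(self):
        elapsed = time.time() - self._start
        print('>>> Elapsed time: {0:.2f} seconds'.format(elapsed))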
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    start_idx = 21000
    end_idx = 25500

    # parse rate limit messages: each line carries a "track" value and a timestamp
    timestamp_list = []
    track_list = []
    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            track = rate_json['limit']['track']
            track_list.append(track)
            timestamp = datetime.utcfromtimestamp(int(rate_json['limit']['timestamp_ms'][:-3]))
            timestamp_list.append(timestamp)

    axes[0].scatter(timestamp_list[start_idx: end_idx], track_list[start_idx: end_idx], c='k', s=0.4)
    axes[0].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[0].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[0].set_xticks(axes[0].get_xticks()[::2])
    axes[0].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[0].set_ylabel('value', fontsize=16)
    axes[0].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    print('start timestamp', timestamp_list[start_idx])
    print('end timestamp', timestamp_list[end_idx])

    # split the rate limit messages into monotonically increasing segments
    split_track_lst, split_ts_lst = map_ratemsg(track_list[start_idx: end_idx],
                                                timestamp_list[start_idx: end_idx])
    total_miss = 0
    for track_lst, ts_lst, color in zip(split_track_lst, split_ts_lst, cc4):
        axes[1].scatter(ts_lst, track_lst, c=color, s=0.4)
        # the track value is cumulative within a segment, so last minus first gives the missing volume
        total_miss += (track_lst[-1] - track_lst[0])
    print('{0} tweets are missing'.format(total_miss))

    axes[1].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[1].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[1].set_xticks(axes[1].get_xticks()[::2])
    axes[1].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_coloring.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
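# concise_fmt is used above as a matplotlib FuncFormatter; a plausible sketch
# that abbreviates large tick values (assumed -- the repository's own version
# is not shown in this excerpt):
def concise_fmt(x, pos=None):
    if abs(x) >= 1_000_000:
        return '{0:.0f}M'.format(x / 1_000_000)
    if abs(x) >= 1_000:
        return '{0:.0f}K'.format(x / 1_000)
    return '{0:.0f}'.format(x)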
def main():
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(12, 4), sharex='col')
    gs = axes[0, 0].get_gridspec()
    # merge the first column into one tall axis that only holds the panel title
    for ax in axes[:, 0]:
        ax.remove()
    ax_left = fig.add_subplot(gs[:, 0])
    ax_left.set_axis_off()
    ax_left.spines['top'].set_visible(False)
    ax_left.spines['right'].set_visible(False)
    ax_left.spines['bottom'].set_visible(False)
    ax_left.spines['left'].set_visible(False)
    ax_left.set_title('(a)', fontsize=12)

    axes = axes[:, 1:].ravel()
    video_title_list = ['Hello', 'Someone like you', 'Rolling in the deep',
                        'Skyfall', 'Set fire to the rain', 'Hometown glory']

    # == == == == == == Part 1: Load data == == == == == == #
    fig_idx = 0
    with open(os.path.join(data_prefix, 'teaser.json'), 'r') as fin:
        for line in fin:
            video_json = json.loads(line.rstrip())
            daily_view = video_json['insights']['dailyView']
            end_date = datetime.strptime(video_json['insights']['endDate'], '%Y-%m-%d')
            start_date = end_date - timedelta(days=len(daily_view))
            date_axis = [start_date + timedelta(days=t) for t in range(len(daily_view))]

            # plot daily view series
            axes[fig_idx].plot_date(date_axis, daily_view, 'k-')
            axes[fig_idx].axvline(x=datetime(2015, 10, 23), color=ColorPalette.TOMATO,
                                  linestyle='--', lw=1.5, zorder=30)
            axes[fig_idx].text(0.3, 0.95, video_title_list[fig_idx], size=10,
                               transform=axes[fig_idx].transAxes, ha='center', va='bottom')
            axes[fig_idx].tick_params(axis='both', which='major', labelsize=10)
            axes[fig_idx].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
            axes[fig_idx].xaxis.set_major_formatter(mdates.DateFormatter("'%y"))
            fig_idx += 1

    axes[2].set_ylabel('daily views', fontsize=11)
    axes[0].set_title('(b)', fontsize=12)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/intro_teaser.pdf', bbox_inches='tight')
    plt.show()
def main():
    timer = Timer()
    timer.start()

    consumer_key = conf.twitter_consumer_key
    consumer_secret = conf.twitter_consumer_secret
    access_token = conf.twitter_access_token
    access_token_secret = conf.twitter_access_secret
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = API(auth)

    num_media = 0
    num_found_twitter = 0
    num_found_youtube = 0
    num_fail = 0

    app_name = 'mbfc'
    with open('data/{0}/{0}_ratings_v3.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v2.csv'.format(app_name), 'r') as fin:
            fout.write('{0},{1},{2},{3},{4}\n'.format(fin.readline().rstrip(),
                                                      'TWHandle', 'TWSim', 'YTId', 'YTUser'))
            for line in fin:
                num_media += 1
                head, website_url = line.rstrip().rsplit(',', 1)
                try:
                    tw_handle, tw_sim, yt_id, yt_user = crawl_social_media_from_url(website_url, api)
                    fout.write('{0},{1},{2},{3},{4}\n'.format(line.rstrip(), tw_handle, tw_sim, yt_id, yt_user))
                    print('crawled accounts: {0:>10} | {1:>10} | {2:>10}'.format(tw_handle, yt_id, yt_user))
                    if tw_handle != '':
                        num_found_twitter += 1
                    if yt_id != '' or yt_user != '':
                        num_found_youtube += 1
                    if isinstance(tw_sim, float) and tw_sim < 0.5:
                        print('+++ Twitter handle to be reviewed: {0} {1:.4f}'.format(tw_handle, tw_sim))
                except Exception:
                    num_fail += 1
                    continue

    print('>>> {0}/{2} twitter handles are found; {1}/{2} youtube ids'
          .format(num_found_twitter, num_found_youtube, num_media))
    print('in total, {0} media websites not accessible'.format(num_fail))
    timer.stop()
def main():
    # == == == == == == == == Part 1: Set up experiment parameters == == == == == == == == #
    total_start_time = time.time()

    data_prefix = '../data/'
    forecast_filepath = 'vevo_forecast_data_60k.tsv'
    recsys_dirpath = 'recsys'
    snapshot_dirpath = 'network_pickle'
    if not os.path.exists(os.path.join(data_prefix, snapshot_dirpath)):
        os.mkdir(os.path.join(data_prefix, snapshot_dirpath))

    # == == == == == == Part 2: Load vevo en videos 61k dataset == == == == == == #
    vid_embed_dict = {}
    vid_view_dict = {}
    with open(os.path.join(data_prefix, forecast_filepath), 'r') as fin:
        for line in fin:
            embed, vid, ts_view, total_view = line.rstrip().split('\t')
            vid_embed_dict[vid] = int(embed)
            ts_view = np.array(intify(ts_view.split(',')))
            vid_view_dict[vid] = ts_view
    vevo_en_vid_list = list(sorted(vid_embed_dict.keys()))
    num_videos = len(vevo_en_vid_list)

    for t in range(T):
        timer = Timer()
        timer.start()

        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        recsys_filepath = 'recsys_{0}.json'.format(target_date_str)
        snapshot_filepath = 'network_{0}.p'.format(target_date_str)

        network_mat = {embed: [] for embed in range(num_videos)}
        with open(os.path.join(data_prefix, recsys_dirpath, recsys_filepath), 'r') as fin:
            for line in fin:
                network_json = json.loads(line.rstrip())
                source = network_json['vid']
                targets = network_json['relevant_list'][: MAX_POSITION]
                for position, target in enumerate(targets):
                    if target in vevo_en_vid_list:
                        # add embedding of incoming video and position of target video on source video
                        network_mat[vid_embed_dict[target]].append(
                            (vid_embed_dict[source], position, vid_view_dict[source][t]))

        with open(os.path.join(data_prefix, snapshot_dirpath, snapshot_filepath), 'wb') as fout:
            pickle.dump(network_mat, fout)

        print('>>> Finish dumping date {0}'.format(target_date_str))
        timer.stop()

    print('>>> Network structure has been dumped!')
    print('>>> Total elapsed time: {0}\n'.format(str(timedelta(seconds=time.time() - total_start_time))[:-3]))
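# obj2str and intify are small helpers assumed to behave as follows
# (hypothetical sketches; the repository's own versions are not shown here).
from datetime import datetime


def obj2str(dt_obj):
    # datetime -> 'YYYY-MM-DD', matching the recsys_{date}.json naming above
    return dt_obj.strftime('%Y-%m-%d')


def intify(str_list):
    # list of numeric strings -> list of ints
    return [int(x) for x in str_list]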
def main():
    timer = Timer()
    timer.start()

    n = 63  # change to 10,000 for faster computing
    num_sim = 10000

    with open('./justify_persistent_link.log', 'w') as fout:
        for p in np.arange(0, 1.01, 0.01):
            fout.write('p_form: {0:.2f}, p_persistent_link: {1:.4f}\n'.format(p, simulate_for_prob(p, n, num_sim)))
            print('>>> Finish simulating at prob {0:.2f}'.format(p))

    timer.stop()
def main():
    # == == == == == == Part 1: Set up experiment parameters == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    vevo_en_videos_path = 'vevo_en_videos_60k.json'
    vevo_forecast_filepath = 'vevo_forecast_data_60k.tsv'
    vevo_embed_filepath = 'vevo_en_embeds_60k.txt'

    # == == == == == == Part 2: Load Vevo en forecast data == == == == == == #
    vevo_en_vid_list = []
    vid_title_dict = {}
    vid_forecast_view_dict = {}

    with open(os.path.join(data_prefix, vevo_en_videos_path), 'r') as fin:
        for line in fin:
            video_json = json.loads(line.rstrip())
            vid = video_json['id']
            vevo_en_vid_list.append(vid)
            title = (video_json['snippet']['title'].encode('ascii', 'ignore')).decode('utf-8')
            vid_title_dict[vid] = title
            daily_view = video_json['insights']['dailyView']
            forecast_view = daily_view[-T:]
            vid_forecast_view_dict[vid] = forecast_view

    vevo_en_vid_list = sorted(vevo_en_vid_list)
    num_videos = len(vevo_en_vid_list)

    with open(os.path.join(data_prefix, vevo_forecast_filepath), 'w') as fout:
        for embed in range(num_videos):
            vid = vevo_en_vid_list[embed]
            forecast_view = vid_forecast_view_dict[vid]
            fout.write('{0}\t{1}\t{2}\t{3}\n'.format(embed, vid, strify(forecast_view, delimiter=','),
                                                     np.sum(forecast_view)))

    with open(os.path.join(data_prefix, vevo_embed_filepath), 'w', encoding='utf-8') as fout:
        for embed in range(num_videos):
            vid = vevo_en_vid_list[embed]
            fout.write('{0},{1},{2}\n'.format(embed, vid, vid_title_dict[vid]))

    timer.stop()
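# strify is assumed to join a numeric sequence into a delimited string, the
# inverse of the intify helper used elsewhere (hypothetical sketch, not the
# repository's actual implementation):
def strify(values, delimiter=','):
    return delimiter.join(str(int(x)) for x in values)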
def main():
    timer = Timer()
    timer.start()

    input_filepath = 'data/mbfc/to_crawl_users.csv'
    output_filepath = 'data/mbfc/active_user_subscription.json.bz2'

    # resume from previous runs: skip channels whose subscriptions were already crawled
    visited_channel_set = set()
    if os.path.exists(output_filepath):
        with bz2.BZ2File(output_filepath, 'r') as fin:
            for line in fin:
                line = line.decode('utf-8')
                channel_id = json.loads(line.rstrip())['channel_id']
                visited_channel_set.add(channel_id)
        print('visited {0} channels in the past, continue...'.format(len(visited_channel_set)))
    num_user = len(visited_channel_set)

    with bz2.open(output_filepath, 'at') as fout:
        with open(input_filepath, 'r') as fin:
            for line in fin:
                user_id = line.rstrip().split(',')[0]
                if user_id not in visited_channel_set:
                    num_request = 0
                    found = False
                    print('get description and subscriptions for user {0}'.format(user_id))
                    # retry up to 5 times before giving up on this user
                    while num_request < 5:
                        try:
                            profile_json = get_subscriptions_from_channel(user_id, target='subscription')
                            found = True
                        except Exception:
                            num_request += 1
                        if found:
                            fout.write('{0}\n'.format(json.dumps(profile_json)))
                            num_user += 1
                            print('{0} subscriptions are obtained for user {1}: {2}\n'
                                  .format(len(profile_json['subscriptions']), num_user, user_id))
                            time.sleep(1)
                            break

    timer.stop()
def main():
    app_name = 'covid'
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube' or app_name == 'covid':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'all']
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    archive_dir = '../data/{0}_out'.format(app_name)

    timer = Timer()
    timer.start()

    est_num_tweet = 0
    for suffix in target_suffix:
        num_tweet = 0
        num_ratemsg = 0
        track_list = []
        with bz2.BZ2File(os.path.join(archive_dir, '{0}_{1}/ts_{0}_{1}.bz2'.format(app_name, suffix)), mode='r') as fin:
            for line in fin:
                line = line.decode('utf-8')
                if 'ratemsg' in line:
                    num_ratemsg += 1
                    track_list.append(int(line.rstrip().split(',')[2]))
                else:
                    num_tweet += 1

        num_miss = count_track(track_list)
        subcrawler_sampling_rate = num_tweet / (num_tweet + num_miss)
        print('>>> subcrawler {0}_{1: <3}, {2: >9d} retrieved tweets, {3: >7d} rate limit track, '
              'indicating {4: >9d} missing tweets, yielding {5: >6.2f}% sampling rate'
              .format(app_name, suffix, num_tweet, num_ratemsg, num_miss, 100 * subcrawler_sampling_rate))
        if suffix == 'all':
            est_num_tweet = num_tweet + num_miss

    gt_num_tweet = 0
    with bz2.BZ2File(os.path.join(archive_dir, 'complete_ts_{0}.bz2'.format(app_name)), mode='r') as fin:
        for _ in fin:
            gt_num_tweet += 1
    gt_sampling_rate = gt_num_tweet / est_num_tweet
    print('>>> complete_set {0} , {1: >9d} retrieved tweets, {2: >7} rate limit track, '
          'estimating {3: >9d} total tweets, yielding {4: >6.2f}% sampling rate'
          .format(app_name, gt_num_tweet, 'NaN', est_num_tweet, 100 * gt_sampling_rate))
    timer.stop()
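# count_track estimates the number of missing tweets from the "track" values
# carried by Twitter rate limit messages. A minimal sketch under the
# assumption that the track value is cumulative per connection and restarts
# when a new connection begins; the repository's actual helper (including its
# start_with_rate and subcrawler options used elsewhere) is not shown here:
def count_track_sketch(track_list):
    num_miss = 0
    prev = 0
    for track in track_list:
        if track >= prev:
            num_miss += track - prev
        else:
            # counter reset: a new connection began, count from zero again
            num_miss += track
        prev = track
    return num_miss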
def main():
    timer = Timer()
    timer.start()

    app_name = 'mbfc'
    input_filepath = 'data/{0}/{0}_ratings.csv'.format(app_name)
    output_filepath = 'data/{0}/MBFC_featured_channels.json'.format(app_name)

    with open(output_filepath, 'w') as fout:
        with open(input_filepath, 'r') as fin:
            fin.readline()
            for line in fin:
                _, channel_id, _, _, is_political = line.rstrip().rsplit(',', 4)
                if is_political == 'Y':
                    num_request = 0
                    found = False
                    print('get featured channels for channel {0}'.format(channel_id))
                    # retry up to 5 times before giving up on this channel
                    while num_request < 5:
                        try:
                            profile_json = get_subscriptions_from_channel(channel_id, target='featured')
                            found = True
                        except Exception:
                            num_request += 1
                        if found:
                            if len(profile_json['featured_channels']) > 0:
                                fout.write('{0}\n'.format(json.dumps(profile_json)))
                                print(json.dumps(profile_json))
                                print('{0} featured channels are obtained for channel {1}\n'
                                      .format(len(profile_json['featured_channels']), channel_id))
                            time.sleep(1)
                            break

    timer.stop()
def main():
    app_name = 'cyberbullying'
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'all']
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    archive_dir = '../data/{0}_out'.format(app_name)

    timer = Timer()
    timer.start()

    print('>>> Merging user profile')
    fout = open(os.path.join(archive_dir, 'complete_user_profile_{0}.txt'.format(app_name)), 'w')
    visited_user_id_str = set()
    num_users = 0
    for suffix in target_suffix:
        with bz2.BZ2File(os.path.join(archive_dir, '{0}_{1}_user.txt.bz2'.format(app_name, suffix)), mode='r') as fin:
            for line in fin:
                line = line.decode('utf8')
                user_id_str, _ = line.rstrip().split(',', 1)
                if user_id_str not in visited_user_id_str:
                    fout.write(line)
                    visited_user_id_str.add(user_id_str)
                    num_users += 1
    print('>>> We retrieve profiles for {0} users'.format(num_users))
    fout.close()

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    input_filepath = 'data/mbfc/to_crawl_vid.txt'
    output_filepath = 'data/mbfc/MBFC_video_metadata.json.bz2'

    # resume from previous runs: skip videos whose metadata was already crawled
    visited_video_set = set()
    if os.path.exists(output_filepath):
        with bz2.BZ2File(output_filepath, mode='r') as fin:
            for line in fin:
                video_json = json.loads(line.rstrip())
                if 'vid' in video_json:
                    visited_video_set.add(video_json['vid'])
        print('visited {0} videos in the past, continue...'.format(len(visited_video_set)))
    num_video = len(visited_video_set)

    total_num_request = 0
    with bz2.open(output_filepath, 'at') as fout:
        with open(input_filepath, 'r') as fin:
            for line in fin:
                video_id = line.rstrip()
                if video_id not in visited_video_set:
                    try:
                        video_metadata = get_video_metadata(video_id)
                        total_num_request += video_metadata.pop('num_request', 0)
                        if len(video_metadata) > 0:
                            visited_video_set.add(video_id)
                            fout.write('{0}\n'.format(json.dumps(video_metadata)))
                            num_video += 1
                        else:
                            print('xxx error, failed in crawling metadata for video {0}'.format(video_id))
                    except Exception as e:
                        print(str(e))
                        break
                    print('>>> so far crawled {0} videos, {1} requests are sent'.format(num_video, total_num_request))

    print('>>> reach file end!')
    timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_name = 'mbfc'
    current_date = datetime.now().strftime('%Y-%m-%d')
    input_filepath = 'data/{0}/{0}_ratings.csv'.format(app_name)
    output_filepath = 'data/{0}/{0}_video_ids_{1}.json'.format(app_name, current_date)

    # resume from previous runs: skip channels whose uploads were already crawled
    visited_channel_set = set()
    if os.path.exists(output_filepath):
        with open(output_filepath, 'r') as fin:
            for line in fin:
                visited_channel_set.add(json.loads(line.rstrip())['channel_id'])
        print('visited {0} channels in the past, continue...'.format(len(visited_channel_set)))
    idx_media = len(visited_channel_set)

    with open(output_filepath, 'a') as fout:
        with open(input_filepath, 'r') as fin:
            fin.readline()
            for line in fin:
                channel_id = line.rstrip().split(',')[-1]
                if channel_id != '' and channel_id not in visited_channel_set:
                    print('get videos for media {0}'.format(channel_id))
                    # the uploads playlist id is the channel id with its "UC" prefix replaced by "UU"
                    upload_playlist = 'UU' + channel_id[2:]
                    channel_video_ids = []  # avoid an unbound name if all requests fail
                    num_fail = 0
                    while num_fail < 5:
                        try:
                            channel_video_ids = get_videos_from_playlist(upload_playlist)
                            break
                        except Exception:
                            num_fail += 1
                    fout.write('{0}\n'.format(json.dumps({'channel_id': channel_id,
                                                          'playlist': channel_video_ids})))
                    visited_channel_set.add(channel_id)
                    idx_media += 1
                    print('{0} video ids are obtained for media {1}: {2}\n'
                          .format(len(channel_video_ids), idx_media, channel_id))

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    youtube_key = conf.youtube_key
    parts = 'id'
    yt_crawler = YTCrawler()
    yt_crawler.set_key(youtube_key)

    app_name = 'mbfc'
    with open('data/{0}/{0}_ratings_v6.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v5.csv'.format(app_name), 'r') as fin:
            fout.write(fin.readline())
            for line in fin:
                title, tail = line.rstrip().split(',', 1)
                middle, yt_id, yt_user = tail.rsplit(',', 2)
                if yt_id == '' and yt_user == '':
                    fout.write(line)
                elif yt_id != '':
                    # verify the recorded channel id
                    yt_id = yt_crawler.check_channel_id(yt_id, parts)
                    if yt_id == '':
                        print('--- Channel id crawler failed on title {0}'.format(title))
                    fout.write('{0},{1},{2},{3}\n'.format(title, middle, yt_id, yt_user))
                else:
                    # resolve the legacy username to a channel id
                    yt_id = yt_crawler.get_channel_id(yt_user, parts)
                    if yt_id == '':
                        print('--- Channel id crawler failed on title {0}'.format(title))
                    fout.write('{0},{1},{2},{3}\n'.format(title, middle, yt_id, yt_user))

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_names = ['cyberbullying', 'youtube']

    # data for plot subfig (a)
    showcase_segment_idx = 0
    showcase_complete_tid_list = []
    showcase_retrieved_tid_list = []
    showcase_ratemsg_list = []
    showcase_track_list = []

    # data for plot subfig (b)
    mape_dict = {app_name: [] for app_name in app_names}

    rate_silence_length = 10000
    disconnect_silence_length = 180000
    print('>>> We silence {0} seconds around rate limit messages'.format(rate_silence_length // 1000))
    print('>>> We silence {0} seconds proceeding disconnect messages\n'.format(disconnect_silence_length // 1000))

    for app_name in app_names:
        print('>>> Computing on app {0}'.format(app_name))
        archive_dir = './{0}_out'.format(app_name)
        sample_input_path = os.path.join(archive_dir, 'ts_{0}_all.txt'.format(app_name))
        complete_input_path = os.path.join(archive_dir, 'complete_ts_{0}.txt'.format(app_name))

        # == == == == == == Part 1: Initially select segments in the complete set == == == == == == #
        # segments that silence 10s around rate limit messages and 180s proceeding disconnect messages in complete set
        init_segment_list = []
        init_start_ts = 0
        with open(complete_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                # if it is a disconnect msg
                if 'disconnect' in split_line[1]:
                    disconnect_ts = int(split_line[0])
                    # disconnect message, remove the proceeding [disconnect_silence_length]
                    init_end_ts = disconnect_ts - disconnect_silence_length
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts, init_end_ts - init_start_ts))
                    init_start_ts = disconnect_ts
                # elif it is a rate limit msg
                elif 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    # rate limit message, remove the surrounding [rate_silence_length]
                    init_end_ts = ratemsg_ts - rate_silence_length // 2
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts, init_end_ts - init_start_ts))
                    init_start_ts = ratemsg_ts + rate_silence_length // 2

        print('>>> Initially, we identify {0} segments in complete set without rate limit message'
              .format(len(init_segment_list)))
        # print(init_segment_list[: 10])

        # == == == == == == Part 2: Segments are bounded by 2 rate limit messages in the sample set == == == == == == #
        bounded_segment_list = []
        current_segment_idx = 0
        current_start_ts = 0
        current_ratemsg_list = []
        current_track_list = []
        last_ratemsg_ts = 0
        look_for_end = False
        found_showcase = False
        with open(sample_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                if 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    track = int(split_line[2])
                    if not look_for_end or (look_for_end and current_start_ts == last_ratemsg_ts
                                            and init_segment_list[current_segment_idx][1] < ratemsg_ts):
                        # fast forward, skip some really short segments
                        while ratemsg_ts >= init_segment_list[current_segment_idx][1]:
                            current_segment_idx += 1
                            if current_segment_idx == len(init_segment_list):
                                break
                        if ratemsg_ts >= init_segment_list[current_segment_idx][0]:
                            current_start_ts = ratemsg_ts
                            current_ratemsg_list = [ratemsg_ts]
                            current_track_list = [track]
                            look_for_end = True
                        else:
                            look_for_end = False
                    elif look_for_end:
                        if current_start_ts < last_ratemsg_ts <= init_segment_list[current_segment_idx][1] < ratemsg_ts:
                            current_num_miss = count_track(current_track_list, start_with_rate=True, subcrawler=False)
                            bounded_segment_list.append((current_start_ts, last_ratemsg_ts,
                                                         last_ratemsg_ts - current_start_ts, current_num_miss))
                            # find the first example segment that is around 11 sec long
                            if app_name == 'cyberbullying' and not found_showcase \
                                    and 10000 <= last_ratemsg_ts - current_start_ts <= 12000:
                                showcase_segment_idx = len(bounded_segment_list) - 1
                                showcase_ratemsg_list = copy.deepcopy(current_ratemsg_list)
                                showcase_track_list = copy.deepcopy(current_track_list)
                                found_showcase = True
                            current_segment_idx += 1
                            if current_segment_idx == len(init_segment_list):
                                break
                            if ratemsg_ts >= init_segment_list[current_segment_idx][0]:
                                current_start_ts = ratemsg_ts
                                current_ratemsg_list = [ratemsg_ts]
                                current_track_list = [track]
                                look_for_end = True
                            else:
                                look_for_end = False
                        else:
                            current_ratemsg_list.append(ratemsg_ts)
                            current_track_list.append(track)
                    last_ratemsg_ts = ratemsg_ts
                if current_segment_idx == len(init_segment_list):
                    break

        print('>>> We further bound {0} segments with 2 rate limit messages'.format(len(bounded_segment_list)))
        # print(bounded_segment_list[-10:])

        # == == == == == == Part 3: Add sample and complete volume in each segment == == == == == == #
        for input_path, tid_list in zip([sample_input_path, complete_input_path],
                                        [showcase_retrieved_tid_list, showcase_complete_tid_list]):
            current_segment_idx = 0
            current_segment_cnt = 0
            with open(input_path, 'r') as fin:
                for line in fin:
                    split_line = line.rstrip().split(',')
                    if len(split_line) == 2:
                        msg_ts = int(split_line[0])
                        if bounded_segment_list[current_segment_idx][0] < msg_ts <= bounded_segment_list[current_segment_idx][1]:
                            current_segment_cnt += 1
                            if app_name == 'cyberbullying' and current_segment_idx == showcase_segment_idx:
                                tweet_id = split_line[1]
                                tid_list.append(tweet_id)
                        elif msg_ts > bounded_segment_list[current_segment_idx][1]:
                            bounded_segment_list[current_segment_idx] = (*bounded_segment_list[current_segment_idx],
                                                                         current_segment_cnt)
                            current_segment_idx += 1
                            current_segment_cnt = 0
                            if current_segment_idx == len(bounded_segment_list):
                                break
        # print(bounded_segment_list[-10:])

        length_tracker = 0
        mape_list = []
        for segment in bounded_segment_list:
            length_tracker += segment[2]
            mape_list.append(mape(segment[-1], segment[-2] + segment[-3]))
        mape_dict[app_name] = copy.deepcopy(mape_list)

        print('MAPE: {0:.5f} +- {1:.5f}, median: {2:.5f}'.format(np.mean(mape_list), np.std(mape_list),
                                                                 np.median(mape_list)))
        print('total tracked days bounded: {0:.2f} out of 14'.format(length_tracker / 1000 / 60 / 60 / 24))
        if app_name == 'cyberbullying':
            print('complete tweets: {0}, retrieved tweets: {1}, estimated missing: {2}'
                  .format(len(showcase_complete_tid_list), len(showcase_retrieved_tid_list),
                          count_track(showcase_track_list, start_with_rate=True, subcrawler=False)))
            print('ratemsg timestamp', showcase_ratemsg_list)
            print('ratemsg track', showcase_track_list)
        print()

    timer.stop()

    # == == == == == == Part 5: Plot a showcase segment that is roughly 10s == == == == == == #
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    green = cc4[1]
    red = cc4[3]

    fig, axes = plt.subplots(1, 4, figsize=(12, 1.6))
    ax2 = axes[-1]
    gs = axes[1].get_gridspec()
    for ax in axes[:-1]:
        ax.remove()
    ax1 = fig.add_subplot(gs[:-1])

    # add a timeline
    ax1.axhline(0, linewidth=2, color='k')

    observed_tweet_ts_list = sorted([melt_snowflake(tid)[0] for tid in showcase_retrieved_tid_list])
    showcase_missing_tid_set = set(showcase_complete_tid_list).difference(set(showcase_retrieved_tid_list))
    missing_tweet_ts_list = sorted([melt_snowflake(tid)[0] for tid in showcase_missing_tid_set])

    ax1.scatter(observed_tweet_ts_list, [1] * len(observed_tweet_ts_list),
                marker='o', facecolors='none', edgecolors=blue, lw=1, s=20)
    ax1.scatter(missing_tweet_ts_list, [0.5] * len(missing_tweet_ts_list), marker='x', c='k', lw=1, s=20)

    # stats for missing tweets, cut by rate limit msg timestamp_ms
    complete_track_list = []
    i, j, curr_cnt = 0, 1, 0
    while i < len(missing_tweet_ts_list) and j < len(showcase_ratemsg_list):
        if missing_tweet_ts_list[i] <= showcase_ratemsg_list[j]:
            curr_cnt += 1
            i += 1
        else:
            complete_track_list.append(curr_cnt)
            curr_cnt = 0
            j += 1
    complete_track_list.append(curr_cnt)
    # print(complete_track_list)

    for idx, ts in enumerate(showcase_ratemsg_list):
        ax1.axvline(ts, ymin=0, ymax=1.1, linewidth=1, color='k')
    for idx, ts in enumerate(showcase_ratemsg_list[1:]):
        ax1.text(ts - 50, 0.42, '/{0:>3}'.format(complete_track_list[idx]),
                 color='k', ha='right', va='top', size=10)
        ax1.text(ts - 470, 0.42, str(showcase_track_list[idx + 1] - showcase_track_list[idx]),
                 color=green, ha='right', va='top', size=10)

    ax1.xaxis.set_major_formatter(FuncFormatter(to_datetime))
    ax1.set_xlim(left=showcase_ratemsg_list[0] - 200, right=showcase_ratemsg_list[-1] + 200)
    ax1.set_yticks([0.5, 1.0])
    ax1.set_ylim(top=1.2, bottom=0)

    num_missing_by_counting = len(showcase_complete_tid_list) - len(showcase_retrieved_tid_list)
    num_missing_by_estimating = count_track(showcase_track_list, start_with_rate=True, subcrawler=False)
    num_observed_tweets = len(showcase_retrieved_tid_list)

    ax1.tick_params(axis='x', which='major', labelsize=10)
    ax1.tick_params(axis='y', which='both', length=0)
    ax1.set_yticklabels(['missing tweets\n{0}/{1}'.format(num_missing_by_estimating, num_missing_by_counting),
                         'collected tweets\n{0}'.format(num_observed_tweets)], fontsize=10)

    # remove borders
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)
    ax1.set_title('(a)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    bplot = ax2.boxplot([mape_dict['cyberbullying'], mape_dict['youtube']],
                        labels=['Cyberbullying', 'YouTube'], widths=0.5,
                        showfliers=False, showmeans=False, patch_artist=True)
    for patch, color in zip(bplot['boxes'], [blue, red]):
        patch.set_facecolor(color)
    for median in bplot['medians']:
        median.set(color='k', linewidth=1)

    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_ylabel('MAPE', fontsize=10)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.set_title('(b)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    plt.tight_layout(rect=[0, 0.03, 1, 1])
    plt.savefig('../images/validate_ratemsg.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
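# mape above compares the complete tweet count in a segment against the
# retrieved count plus the estimated missing count. A standard single-pair
# MAPE sketch (assumed; the repository's helper may differ in signature or
# scaling, e.g. multiplying by 100):
def mape_sketch(true_value, pred_value):
    return abs(true_value - pred_value) / true_value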
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    sample_cascade_size = {}
    sample_inter_arrival_time = []
    sample_cascade_influence = {}
    sample_cascade_influence_10m = defaultdict(int)
    sample_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/sample_retweet_{0}.txt'.format(app_name), 'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            influences = [int(x.split('-')[1]) for x in cascades]
            sample_cascade_size[root_tweet] = len(retweets)
            sample_cascade_influence[root_tweet] = sum(influences)
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            retweet_timestamp_list = [root_timestamp]
            for i in range(len(retweets)):
                retweet_time = melt_snowflake(retweets[i])[0] / 1000
                relative_retweet_time = retweet_time - root_timestamp
                retweet_timestamp_list.append(retweet_time)
                if relative_retweet_time < 10 * 60:
                    sample_cascade_influence_10m[root_tweet] += influences[i]
                if relative_retweet_time < 60 * 60:
                    sample_cascade_influence_1h[root_tweet] += influences[i]
            for i in range(len(retweet_timestamp_list) - 1):
                sample_inter_arrival_time.append(retweet_timestamp_list[i + 1] - retweet_timestamp_list[i])

    complete_cascade_size = {}
    complete_inter_arrival_time = []
    complete_cascade_influence = {}
    complete_cascade_influence_10m = defaultdict(int)
    complete_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/complete_retweet_{0}.txt'.format(app_name), 'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            complete_cascade_size[root_tweet] = len(retweets)
            if len(retweets) >= 50:
                influences = [int(x.split('-')[1]) for x in cascades]
                complete_cascade_influence[root_tweet] = sum(influences)
                root_timestamp = melt_snowflake(root_tweet)[0] / 1000
                retweet_timestamp_list = [root_timestamp]
                for i in range(len(retweets)):
                    retweet_time = melt_snowflake(retweets[i])[0] / 1000
                    relative_retweet_time = retweet_time - root_timestamp
                    retweet_timestamp_list.append(retweet_time)
                    if relative_retweet_time < 10 * 60:
                        complete_cascade_influence_10m[root_tweet] += influences[i]
                    if relative_retweet_time < 60 * 60:
                        complete_cascade_influence_1h[root_tweet] += influences[i]
                for i in range(len(retweet_timestamp_list) - 1):
                    complete_inter_arrival_time.append(retweet_timestamp_list[i + 1] - retweet_timestamp_list[i])

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))
    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))
    print('complete #cascades (≥50 retweets)', sum([1 for x in list(complete_cascade_size.values()) if x >= 50]))
    print('sample #cascades (≥50 retweets)', sum([1 for x in list(sample_cascade_size.values()) if x >= 50]))

    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet in sample_cascade_size:
        if sample_cascade_size[root_tweet] == complete_cascade_size[root_tweet]:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(complete_cascade_size[root_tweet])
            if complete_cascade_size[root_tweet] >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set', num_complete_cascades_in_sample)
    print('number of complete cascades (>50 retweets) in the sample set', num_complete_cascades_in_sample_50)
    print('max: {0}, mean: {1}'.format(max(complete_cascades_in_sample_size_list),
                                       np.mean(complete_cascades_in_sample_size_list)))

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time, ax=axes[0], color=blue, ls='-', label='sample')
    plot_ccdf(complete_inter_arrival_time, ax=axes[0], color='k', ls='-', label='complete')
    axes[0].plot([sample_median, sample_median], [0, 1], color=blue, ls='--', lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1], color='k', ls='--', lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper right')
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] / complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(sample_cascade_influence_10m[root_tweet]
                                          / complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(sample_cascade_influence_1h[root_tweet]
                                         / complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    print('influence_list 0.25', percentileofscore(influence_list, 0.25))
    print('influence_list 0.25', percentileofscore(influence_list_1h, 0.25))
    print('influence_list 0.25', percentileofscore(influence_list_10m, 0.25))
    print('influence_list 0.75', percentileofscore(influence_list, 0.75))
    print('influence_list 0.75', percentileofscore(influence_list_1h, 0.75))
    print('influence_list 0.75', percentileofscore(influence_list_10m, 0.75))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)
    # axes[1].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper right')
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
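# melt_snowflake decomposes a Twitter snowflake id; the scripts above only use
# its first element, the millisecond timestamp. A sketch based on the public
# snowflake layout (41-bit timestamp offset by the Twitter epoch); the
# repository's helper may return additional or differently named fields:
TWITTER_EPOCH_MS = 1288834974657


def melt_snowflake_sketch(tweet_id):
    tweet_id = int(tweet_id)
    timestamp_ms = (tweet_id >> 22) + TWITTER_EPOCH_MS
    data_center_id = (tweet_id >> 17) & 0x1F
    worker_id = (tweet_id >> 12) & 0x1F
    sequence = tweet_id & 0xFFF
    return timestamp_ms, data_center_id, worker_id, sequence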
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/recsys'

    num_relevant_by_rank = np.zeros((NUM_REL,))
    num_recommended_by_rank = np.zeros((NUM_REC,))
    # aggregate by rank1, rank2-5, rank6-10, rank11-15
    dense_relevant_in_recommended_mat = np.zeros((NUM_REL, 4))
    # aggregate by rank1, rank2-5, rank6-10, rank11-15, rank16-30, rank31-50
    dense_recommended_from_relevant_mat = np.zeros((NUM_REC, 6))
    relevant_in_recommended_arr = np.zeros((NUM_REL,))
    recommended_from_relevant_arr = np.zeros((NUM_REC,))

    # == == == == == == Part 2: Load both relevant list and recommended list == == == == == == #
    for subdir, _, files in os.walk(data_prefix):
        for f in files:
            with open(os.path.join(subdir, f), 'r') as fin:
                for line in fin:
                    network_json = json.loads(line.rstrip())
                    recommended_list = network_json['recommended_list'][: NUM_REC]
                    relevant_list = network_json['relevant_list'][: NUM_REL]
                    num_relevant_by_rank += np.pad(np.ones(len(relevant_list)),
                                                   (0, NUM_REL - len(relevant_list)), 'constant')
                    num_recommended_by_rank += np.pad(np.ones(len(recommended_list)),
                                                      (0, NUM_REC - len(recommended_list)), 'constant')

                    for rel_rank, vid in enumerate(relevant_list):
                        if vid in recommended_list:
                            relevant_in_recommended_arr[rel_rank] += 1
                            position_on_recommended = recommended_list.index(vid)
                            dense_relevant_in_recommended_mat[rel_rank, switch(position_on_recommended)] += 1

                    for rec_rank, vid in enumerate(recommended_list):
                        if vid in relevant_list:
                            recommended_from_relevant_arr[rec_rank] += 1
                            position_on_relevant = relevant_list.index(vid)
                            dense_recommended_from_relevant_mat[rec_rank, switch(position_on_relevant)] += 1

    # == == == == == == Part 3: Plot probabilities in each position == == == == == == #
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes = axes.ravel()
    color_cycle_6 = ColorPalette.CC6

    stackedBarPlot(ax=axes[0],
                   data=dense_relevant_in_recommended_mat / num_relevant_by_rank.reshape(-1, 1),
                   cols=color_cycle_6,
                   edgeCols=['#000000'] * 4,
                   xlabel='position $x$ on relevant list',
                   ylabel='prob. of displaying on recommended list',
                   scale=False,
                   endGaps=True)
    axes[0].legend([plt.Rectangle((0, 0), 1, 1, fc=color_cycle_6[x], alpha=0.8, ec='k') for x in range(4)],
                   ['position 1', 'position 2-5', 'position 6-10', 'position 11-15'],
                   fontsize=10, frameon=False, loc='upper right', fancybox=False, shadow=True, ncol=1)
    axes[0].set_title('(a)', fontsize=12)

    stackedBarPlot(ax=axes[1],
                   data=dense_recommended_from_relevant_mat / num_recommended_by_rank.reshape(-1, 1),
                   cols=ColorPalette.CC6,
                   edgeCols=['#000000'] * 6,
                   xlabel='position $x$ on recommended list',
                   ylabel='prob. of originating from relevant list',
                   scale=False,
                   endGaps=True)
    axes[1].legend([plt.Rectangle((0, 0), 1, 1, fc=color_cycle_6[x], alpha=0.8, ec='k') for x in range(6)],
                   ['position 1', 'position 2-5', 'position 6-10', 'position 11-15', 'position 16-30', 'position 31-50'],
                   fontsize=10, frameon=False, loc='upper right', fancybox=False, shadow=True, ncol=2)
    axes[1].set_title('(b)', fontsize=12)

    for ax in axes:
        ax.set_ylim(top=1)
        ax.set_ylim(bottom=0)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/data_rel2rec.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
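# switch above maps a 0-based list position to the aggregated rank buckets
# named in the comments (rank 1, 2-5, 6-10, 11-15, 16-30, 31-50). A sketch
# consistent with that usage (assumed; the repository's helper is not shown):
def switch_sketch(position):
    rank = position + 1
    if rank == 1:
        return 0
    elif rank <= 5:
        return 1
    elif rank <= 10:
        return 2
    elif rank <= 15:
        return 3
    elif rank <= 30:
        return 4
    else:
        return 5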
def main():
    timer = Timer()
    timer.start()

    n_cluster = 6

    for date_type in ['sample', 'complete']:
        uid_hid_stats = pickle.load(open('./{0}_uid_hid_stats.p'.format(date_type), 'rb'))
        hid_uid_stats = pickle.load(open('./{0}_hid_uid_stats.p'.format(date_type), 'rb'))
        num_users = len(uid_hid_stats)
        num_hashtags = len(hid_uid_stats)
        print('in {0} set, {1} users, {2} hashtags'.format(date_type, num_users, num_hashtags))

        all_graph = {uid: [x[0] for x in lst[1:]] for uid, lst in uid_hid_stats.items()}
        rev_all_graph = {hid: [x[0] for x in lst[1:]] for hid, lst in hid_uid_stats.items()}
        all_graph.update(rev_all_graph)

        all_bipartites = tarjan(all_graph)
        all_bipartites = sorted(all_bipartites, key=lambda x: len(x), reverse=True)
        print('number of bipartites: {0}'.format(len(all_bipartites)))

        largest_bipartite = all_bipartites[0]
        largest_bipartite_users = [x for x in largest_bipartite if x.startswith('u')]
        largest_bipartite_hashtags = [x for x in largest_bipartite if x.startswith('h')]
        largest_bipartite_num_users = len(largest_bipartite_users)
        largest_bipartite_num_hashtags = len(largest_bipartite_hashtags)
        print('components of largest bipartite: {0} users; {1} hashtags'
              .format(largest_bipartite_num_users, largest_bipartite_num_hashtags))

        # B = nx.Graph()
        # # Add edges only between nodes of opposite node sets
        # bipartite_edges = []
        # for uid in largest_bipartite_users:
        #     for hid, cnt in uid_hid_stats[uid]:
        #         bipartite_edges.append((uid, hid, {'weight': cnt}))
        # B.add_edges_from(bipartite_edges)

        # re-embed
        new_user_embed = {uid: embed for embed, uid in enumerate(sorted(largest_bipartite_users))}
        new_embed_user = {v: k for k, v in new_user_embed.items()}
        new_hashtag_embed = {hid: embed for embed, hid in enumerate(sorted(largest_bipartite_hashtags))}
        new_embed_hashtag = {v: k for k, v in new_hashtag_embed.items()}

        bipartite_edges = {}
        for uid in largest_bipartite_users:
            bipartite_edges[new_user_embed[uid]] = []
            for hid, _ in uid_hid_stats[uid][1:]:
                bipartite_edges[new_user_embed[uid]].append(new_hashtag_embed[hid])

        row, col = [], []
        for key, item in bipartite_edges.items():
            row += [key] * len(item)
            col += item
        biadjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)))
        print('built the biadjacency')

        bispectral = BiSpectralClustering(n_clusters=n_cluster)
        print('running BiSpectralClustering...')
        bispectral.fit(biadjacency)
        print('completed BiSpectralClustering...')
        row_labels = bispectral.row_labels_
        col_labels = bispectral.col_labels_

        clusters = [[] for _ in range(n_cluster)]
        for user_idx, label in enumerate(row_labels):
            clusters[label].append(new_embed_user[user_idx])
        for hashtag_idx, label in enumerate(col_labels):
            clusters[label].append(new_embed_hashtag[hashtag_idx])

        for i in range(n_cluster):
            print('cluster {0}, size: {1}, num_user: {2}, num_hashtag: {3}'
                  .format(i, len(clusters[i]),
                          len([x for x in clusters[i] if x.startswith('u')]),
                          len([x for x in clusters[i] if x.startswith('h')])))
            with open('./{0}_cluster{1}.txt'.format(date_type, i), 'w') as fout:
                fout.write(','.join(clusters[i]))
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Build views percentile partition == == == == == == #
    day_views = list(embed_avg_view_dict.values())
    median_value = np.median(day_views)
    # the top 1st quantile is 75th percentile and above
    first_quantile_value = np.percentile(day_views, 75)
    third_quantile_value = np.percentile(day_views, 25)

    embed_percentile_dict = {}
    for embed in np.arange(num_videos):
        if embed_avg_view_dict[embed] >= first_quantile_value:
            embed_percentile_dict[embed] = 0
        elif embed_avg_view_dict[embed] >= median_value:
            embed_percentile_dict[embed] = 1
        elif embed_avg_view_dict[embed] >= third_quantile_value:
            embed_percentile_dict[embed] = 2
        else:
            embed_percentile_dict[embed] = 3

    # == == == == == == Part 4: Load dynamic network snapshot == == == == == == #
    edge_weight_mat = np.zeros((4, 4), dtype=np.float32)
    for t in range(T):
        filename = 'network_{0}.p'.format((datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < NUM_REL:
                        edge_weight_mat[(embed_percentile_dict[embed_src],
                                         embed_percentile_dict[embed_tar])] += 1 / T
        print('>>> Finish loading day {0}...'.format(t + 1))
    # np.int is removed in recent NumPy versions; the builtin int is equivalent here
    edge_weight_mat = edge_weight_mat.astype(int)
    print('>>> Network structure has been loaded!')

    # == == == == == == Part 5: Plot graph by network2tikz == == == == == == #
    # Network
    # -------
    # every possible pair, including self loop
    network_structure = []
    num_partitions = 4
    for pair in itertools.product(np.arange(num_partitions), repeat=2):
        network_structure.append(pair)

    net = igraph.Graph(network_structure, directed=True)

    # Network attributes
    # ------------------

    # Network dicts
    # -------------
    layout = {0: (0, 0), 1: (1, 0), 2: (2, 0), 3: (3, 0)}

    # Visual style dict
    # -----------------
    visual_style = {}

    # node styles
    # -----------
    visual_style['vertex_size'] = 0.9
    visual_style['vertex_color'] = ColorPalette.CCRGB4
    visual_style['vertex_opacity'] = 0.6
    visual_style['vertex_label'] = ['top 25\%', '(25\% 50\%', '(50\% 75\%', 'bottom 25\%']
    visual_style['vertex_label_distance'] = 0
    visual_style['vertex_label_size'] = [5, 4, 4, 4]

    # edge styles
    # -----------
    edge_width = list(np.ravel(edge_weight_mat))
    visual_style['edge_width'] = scaler(edge_width)
    visual_style['edge_curved'] = 0.7
    edge_label = ['{{{:,}}}'.format(x) for x in edge_width]
    visual_style['edge_label'] = edge_label
    visual_style['edge_label_size'] = 4.5
    visual_style['edge_loop_shape'] = 60
    visual_style['edge_loop_size'] = 1
    visual_style['edge_loop_position'] = [180, 0, 0, 0]
    visual_style['edge_arrow_size'] = 0.01
    visual_style['edge_arrow_width'] = [0.03, 0.01, 0.01, 0.01,
                                        0.02, 0.01, 0.01, 0.01,
                                        0.02, 0.01, 0.01, 0.01,
                                        0.02, 0.01, 0.01, 0.01]

    # general options
    # ---------------
    visual_style['layout'] = layout
    visual_style['canvas'] = (10, 3.5)
    visual_style['margin'] = 1.5

    # Create pdf figure of the network
    plot(net, '../images/measure_how_videos_connect.pdf', **visual_style)
    print('>>> Generated pdf file ../images/measure_how_videos_connect.pdf')

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE
    bar_text_style = ColorPalette.BARTEXTSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_train_view_dict = {embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
                                 for embed in embed_view_dict.keys()}

    net_ratio_list = []
    src_to_tar_view_ratio = []
    link_weights_record = []

    naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list, arnet_smape_list = [[] for _ in range(5)]
    # np.float is removed in recent NumPy versions; the builtin float is equivalent here
    naive_daily_smape_mat, snaive_daily_smape_mat, ar_daily_smape_mat, rnn_daily_smape_mat, arnet_daily_smape_mat = \
        [np.empty((0, NUM_OUTPUT), float) for _ in range(5)]

    with open('./forecast_tracker_all.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']
            true_value = result_json['true_value']
            naive_pred = result_json['naive_pred']
            snaive_pred = result_json['snaive_pred']
            ar_pred = result_json['ar_pred']
            rnn_pred = result_json['rnn_pred']
            arnet_pred = result_json['arnet_pred']

            naive_smape, naive_daily_smape_arr = smape(true_value, naive_pred)
            naive_smape_list.append(naive_smape)
            naive_daily_smape_mat = np.vstack((naive_daily_smape_mat, naive_daily_smape_arr))

            snaive_smape, snaive_daily_smape_arr = smape(true_value, snaive_pred)
            snaive_smape_list.append(snaive_smape)
            snaive_daily_smape_mat = np.vstack((snaive_daily_smape_mat, snaive_daily_smape_arr))

            ar_smape, ar_daily_smape_arr = smape(true_value, ar_pred)
            ar_smape_list.append(ar_smape)
            ar_daily_smape_mat = np.vstack((ar_daily_smape_mat, ar_daily_smape_arr))

            rnn_smape, rnn_daily_smape_arr = smape(true_value, rnn_pred)
            rnn_smape_list.append(rnn_smape)
            rnn_daily_smape_mat = np.vstack((rnn_daily_smape_mat, rnn_daily_smape_arr))

            arnet_smape, arnet_daily_smape_arr = smape(true_value, arnet_pred)
            arnet_smape_list.append(arnet_smape)
            arnet_daily_smape_mat = np.vstack((arnet_daily_smape_mat, arnet_daily_smape_arr))

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            for edge_inx, src_embed in enumerate(incoming_embeds):
                view_ratio = np.log10(embed_avg_train_view_dict[src_embed] / embed_avg_train_view_dict[tar_embed])
                src_to_tar_view_ratio.append(view_ratio)
                link_weights_record.append(link_weights[edge_inx])

    fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(12, 4))
    axes = axes.ravel()

    # == == == == == == Part 1: Plot performance comparison == == == == == == #
    smape_mat = [naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list, arnet_smape_list]
    axes[0].boxplot(smape_mat, showfliers=False, meanline=True, showmeans=True, widths=0.7)
    means = [np.mean(x) for x in smape_mat]
    pos = range(len(means))
    for tick, label in zip(pos, axes[1].get_xticklabels()):
        axes[0].text(pos[tick] + 1, means[tick] + 0.3, '{0:.3f}'.format(means[tick]), **bar_text_style)
    axes[0].set_xticklabels(['Naive', 'SN', 'AR', 'RNN', 'ARNet'], fontsize=label_fs)
    axes[0].set_ylabel('SMAPE', fontsize=label_fs)
    axes[0].tick_params(**tick_style)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot performance with forecast horizon extends == == == == == == #
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT), np.mean(naive_daily_smape_mat, axis=0),
                 label='Naive', c='k', mfc='none', marker='D', markersize=4)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT), np.mean(snaive_daily_smape_mat, axis=0),
                 label='SN', c=color_cycle_4[0], mfc='none', marker='*', markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT), np.mean(ar_daily_smape_mat, axis=0),
                 label='AR', c=color_cycle_4[1], mfc='none', marker='s', markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT), np.mean(rnn_daily_smape_mat, axis=0),
                 label='RNN', c=color_cycle_4[2], mfc='none', marker='^', markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT), np.mean(arnet_daily_smape_mat, axis=0),
                 label='ARNet', c=color_cycle_4[3], marker='o', markersize=5)
    axes[1].set_xlabel('forecast horizon', fontsize=label_fs)
    axes[1].set_ylabel('SMAPE', fontsize=label_fs)
    axes[1].set_ylim([6, 23])
    axes[1].tick_params(**tick_style)
    axes[1].legend(frameon=False)
    axes[1].set_title('(b)', fontsize=title_fs)

    # == == == == == == Part 3: Plot link strength vs. view ratio from src to tar == == == == == == #
    bin_axis = np.arange(-2, 1.9, 0.1)
    bin_records = [[] for _ in range(len(bin_axis))]
    for x, y in zip(src_to_tar_view_ratio, link_weights_record):
        if x >= -2:
            bin_records[int(np.floor((x + 2) * 10))].append(y)

    for t in np.arange(5, 50, 5):
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 50 - t) for x in bin_records],
                             [np.percentile(x, 55 - t) for x in bin_records],
                             facecolor=cornflower_blue, alpha=(100 - 2 * t) / 100, lw=0)
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 45 + t) for x in bin_records],
                             [np.percentile(x, 50 + t) for x in bin_records],
                             facecolor=cornflower_blue, alpha=(100 - 2 * t) / 100, lw=0)
    for t in [10, 30, 70, 90]:
        axes[2].plot(bin_axis, [np.percentile(x, t) for x in bin_records],
                     color=cornflower_blue, alpha=(100 - 2 * t) / 100, lw=1, zorder=15)
    median_line = [np.percentile(x, 50) for x in bin_records]
    axes[2].plot(bin_axis, median_line, color='k', alpha=0.5, zorder=20, lw=1.5)
    axes[2].xaxis.set_major_formatter(FuncFormatter(lambda x, _: r'$10^{{{0:.0f}}}%$'.format(x)))

    peak1_idx = int(np.argmax(median_line))
    peak2_idx = 10 + int(np.argmax(median_line[10:]))
    peak1 = (bin_axis[peak1_idx], median_line[peak1_idx])
    peak2 = (bin_axis[peak2_idx], median_line[peak2_idx])
    axes[2].scatter(peak1[0], peak1[1], s=15, c=tomato, edgecolors='k', zorder=30)
    axes[2].text(peak1[0] + 0.08, peak1[1] + 0.01, '({0:.2f}, {1:.2f})'.format(10 ** peak1[0], peak1[1]),
                 ha='left', va='center')
    axes[2].scatter(peak2[0], peak2[1], s=15, c=tomato, edgecolors='k', zorder=30)
    axes[2].text(peak2[0], peak2[1] + 0.02, '({0:.2f}, {1:.2f})'.format(10 ** peak2[0], peak2[1]),
                 ha='center', va='bottom')

    axes[2].set_xlim((-2.05, 2.02))
    axes[2].set_ylim((-0.02, 1.01))
    axes[2].set_xlabel('views ratio from video ' + r'$u$' + ' to video ' + r'$v$', fontsize=label_fs)
    axes[2].set_ylabel('estimated link strength ' + r'$\beta_{u, v}$', fontsize=label_fs)
    axes[2].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/model_prediction_results.pdf', bbox_inches='tight')
    plt.show()
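# smape above returns both an overall value and a per-day array, given how its
# outputs are unpacked. A sketch of symmetric MAPE with that return shape
# (assumed; the 200 scaling follows the common definition, not necessarily the
# repository's exact one):
import numpy as np


def smape_sketch(true_series, pred_series):
    true_arr = np.asarray(true_series, dtype=float)
    pred_arr = np.asarray(pred_series, dtype=float)
    denominator = np.abs(true_arr) + np.abs(pred_arr)
    daily = np.where(denominator == 0, 0.0, 200 * np.abs(true_arr - pred_arr) / denominator)
    return float(np.mean(daily)), daily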
def main(): timer = Timer() timer.start() cornflower_blue = ColorPalette.BLUE tomato = ColorPalette.TOMATO color_cycle_4 = ColorPalette.CC4 label_fs = ColorPalette.LABELFS title_fs = ColorPalette.TITLEFS tick_style = ColorPalette.TICKSTYLE data_loader = DataLoader() data_loader.load_video_views() embed_view_dict = data_loader.embed_view_dict embed_avg_train_view_dict = { embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT]) for embed in embed_view_dict.keys() } data_loader.load_embed_content_dict() embed_cid_dict = data_loader.embed_cid_dict embed_genre_dict = data_loader.embed_genre_dict cid_artist_dict = {} cid_tag_dict = {} with open('../data/artist_details.json', 'r') as fin: for line in fin: artist_json = json.loads(line.rstrip()) cid_artist_dict[ artist_json['channel_id']] = artist_json['artist_name'] cid_tag_dict[artist_json['channel_id']] = artist_json['tag-dict'] cid_views_dict = defaultdict(int) cid_views_wo_network_dict = defaultdict(int) arnet_smape_list = [] net_ratio_list = [] same_artist_net_ratio_list = [] same_genre_net_ratio_list = [] total_views = 0 network_explained_views = 0 with open('./embed_prediction.json', 'r') as fin: for line in fin: result_json = json.loads(line.rstrip()) tar_embed = result_json['embed'] avg_train_views = embed_avg_train_view_dict[tar_embed] true_value = result_json['true_value'] arnet_pred = result_json['arnet_pred'] arnet_smape_list.append(smape(true_value, arnet_pred)[0]) incoming_embeds = result_json['incoming_embeds'] link_weights = result_json['link_weights'] same_artist_contributed_views = 0 same_genre_contributed_views = 0 for edge_inx, src_embed in enumerate(incoming_embeds): if embed_cid_dict[tar_embed] == embed_cid_dict[src_embed]: same_artist_contributed_views += link_weights[ edge_inx] * embed_avg_train_view_dict[src_embed] if is_same_genre(embed_genre_dict[tar_embed], embed_genre_dict[src_embed]): same_genre_contributed_views += link_weights[ edge_inx] * embed_avg_train_view_dict[src_embed] # analyse network contribution arnet_net_ratio = result_json['net_ratio'] net_ratio_list.append(arnet_net_ratio) # rounding issue can make the value slightly larger than 1 same_artist_net_ratio_list.append( min(same_artist_contributed_views / avg_train_views, 1)) same_genre_net_ratio_list.append( min(same_genre_contributed_views / avg_train_views, 1)) cid_views_dict[embed_cid_dict[tar_embed]] += avg_train_views cid_views_wo_network_dict[embed_cid_dict[ tar_embed]] += avg_train_views * (1 - arnet_net_ratio) total_views += avg_train_views network_explained_views += avg_train_views * arnet_net_ratio print( '\nFor an average video in our dataset, we estimate {0:.1f}% of the views come from the network.' .format(100 * np.mean(net_ratio_list))) print( 'In particular, {0:.1f}% ({1:.1f}%) of the views come from the same artist.' .format( 100 * np.mean(same_artist_net_ratio_list), 100 * np.mean(same_artist_net_ratio_list) / np.mean(net_ratio_list))) print( 'In total, our model estimates that the recommendation network contributes {0:.1f}% of popularity in the Vevo network.' 
.format(100 * network_explained_views / total_views)) print('total views for 13K: {0:.1f}M'.format(total_views / 1000000)) print('explained views for 13K: {0:.1f}M'.format(network_explained_views / 1000000)) print('total views for 60K: {0:.1f}M'.format( np.sum(list(embed_avg_train_view_dict.values())) / 1000000)) print('Gini coef with network: {0:.4f}'.format( gini(list(cid_views_dict.values())))) print('Gini coef without network: {0:.4f}\n'.format( gini(list(cid_views_wo_network_dict.values())))) fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4.2)) gs = axes[0, 0].get_gridspec() for ax in axes[:, 1]: ax.remove() ax_mid = fig.add_subplot(gs[:, 1]) for ax in axes[:, 2]: ax.remove() ax_right = fig.add_subplot(gs[:, 2]) axes = [axes[0, 0], axes[1, 0], ax_mid, ax_right] # == == == == == == Part 1: Plot SMAPE vs. traffic composition == == == == == == # num_bin = 10 sorted_same_artist_tuple_list = sorted( [(x, y) for x, y in zip(same_artist_net_ratio_list, arnet_smape_list)], key=lambda x: x[0]) same_artist_split_values = [ np.percentile(same_artist_net_ratio_list, x) for x in np.arange(10, 101, 10) ] same_artist_bins = [[] for _ in range(num_bin)] for same_artist_net_ratio, arnet_smape in sorted_same_artist_tuple_list: slice_idx = int( np.floor( percentileofscore(same_artist_net_ratio_list, same_artist_net_ratio) / 10)) if slice_idx >= num_bin: slice_idx = num_bin - 1 same_artist_bins[slice_idx].append(arnet_smape) sorted_same_genre_tuple_list = sorted( [(x, y) for x, y in zip(same_genre_net_ratio_list, arnet_smape_list)], key=lambda x: x[0]) same_genre_split_values = [ np.percentile(same_genre_net_ratio_list, x) for x in np.arange(10, 101, 10) ] same_genre_bins = [[] for _ in range(num_bin)] for same_genre_net_ratio, arnet_smape in sorted_same_genre_tuple_list: slice_idx = int( np.floor( percentileofscore(same_genre_net_ratio_list, same_genre_net_ratio) / 10)) if slice_idx >= num_bin: slice_idx = num_bin - 1 same_genre_bins[slice_idx].append(arnet_smape) axes[0].plot(range(1, 11, 1), [np.mean(x) for x in same_artist_bins], color=cornflower_blue, label='same artist', mfc='none', marker='o', markersize=4) axes[1].plot(range(1, 11, 1), [np.mean(x) for x in same_genre_bins], color=tomato, label='same genre', mfc='none', marker='o', markersize=4) for ax in [axes[0], axes[1]]: ax.set_xlim([0.5, 10.5]) ax.set_ylim([7, 10.5]) ax.set_ylabel('SMAPE', fontsize=label_fs) ax.xaxis.set_ticks(np.arange(1, 10, 2)) ax.tick_params(**tick_style) ax.legend(frameon=False) axes[0].xaxis.set_major_formatter( FuncFormatter(lambda x, _: '({0:.3f})'.format(same_artist_split_values[ int(x) - 1]))) axes[1].xaxis.set_major_formatter( FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format( 10 * x, same_genre_split_values[int(x) - 1]))) # axes[0].xaxis.set_major_formatter( # FuncFormatter(lambda x, _: '({0:.3f})'.format(10 * x))) # axes[1].xaxis.set_major_formatter( # FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(10 * x, 10 * x))) axes[1].set_xlabel('$\eta_v$ percentile', fontsize=label_fs) axes[0].set_title('(a)', fontsize=title_fs) # == == == == == == Part 2: Plot who can utilize the network better? 
== == == == == == # artist_views_list = list(cid_views_dict.values()) wo_network_artist_views_list = list(cid_views_wo_network_dict.values()) cid_list = sorted(cid_views_dict.keys()) artist_true_percentile = [ percentileofscore(artist_views_list, cid_views_dict[cid]) for cid in cid_list ] wo_network_artist_percentile = [ percentileofscore(wo_network_artist_views_list, cid_views_wo_network_dict[cid]) for cid in cid_list ] percentile_change = np.array([ artist_true_percentile[i] - wo_network_artist_percentile[i] for i in range(len(cid_list)) ]) num_popularity_loss = sum(percentile_change < 0) num_popularity_equal = sum(percentile_change == 0) num_popularity_gain = sum(percentile_change > 0) print('{0} ({1:.2f}%) artists lose popularity with network'.format( num_popularity_loss, num_popularity_loss / len(cid_list) * 100)) print('{0} ({1:.2f}%) artists with no popularity change'.format( num_popularity_equal, num_popularity_equal / len(cid_list) * 100)) print('{0} ({1:.2f}%) artists gain popularity with network\n'.format( num_popularity_gain, num_popularity_gain / len(cid_list) * 100)) artist_percentile_mat = [[] for _ in range(10)] artist_cid_mat = [[] for _ in range(10)] for idx, percentile_value in enumerate(wo_network_artist_percentile): bin_idx = min(int(np.floor(percentile_value / 10)), 9) artist_percentile_mat[bin_idx].append(artist_true_percentile[idx] - percentile_value) artist_cid_mat[bin_idx].append(cid_list[idx]) red_circle = dict(markerfacecolor=tomato, marker='o', markersize=4) axes[2].boxplot(artist_percentile_mat, showfliers=True, widths=0.5, flierprops=red_circle) axes[2].axhline(y=0, color=cornflower_blue, linestyle='--', lw=1, zorder=0) axes[2].set_xlabel('artist popularity percentile without network', fontsize=label_fs) axes[2].set_ylabel('percentile change with network', fontsize=label_fs) axes[2].tick_params(**tick_style) axes[2].set_xticks(axes[2].get_xticks()[::2]) axes[2].xaxis.set_major_formatter( FuncFormatter(lambda x, _: '{0:.0f}%'.format(10 * x))) axes[2].yaxis.set_major_formatter( FuncFormatter(lambda x, _: '{0:.0f}%'.format(x))) axes[2].set_title('(b)', fontsize=12) # find outliers whis = 1.5 top_outliers_list = [] bottom_outliers_list = [] for box_idx, box in enumerate(artist_percentile_mat): q1 = np.percentile(box, 25) q3 = np.percentile(box, 75) iq = q3 - q1 hi_val = q3 + whis * iq lo_val = q1 - whis * iq for idx, val in enumerate(box): if val > hi_val: top_outliers_list.append((artist_cid_mat[box_idx][idx], val)) elif val < lo_val: bottom_outliers_list.append( (artist_cid_mat[box_idx][idx], val)) sorted_top_outliers_list = sorted( [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int( cid_views_dict[x[0]]), x[1]) for x in top_outliers_list], key=lambda t: t[2], reverse=True) for t in sorted_top_outliers_list: print(t) print('-------------------') sorted_bottom_outliers_list = sorted( [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int( cid_views_dict[x[0]]), x[1]) for x in bottom_outliers_list], key=lambda t: t[2], reverse=True) for t in sorted_bottom_outliers_list: print(t) indie_xaxis, indie_yaxis = [], [] rap_xaxis, rap_yaxis = [], [] other_xaxis, other_yaxis = [], [] lose_xaxis, lose_yaxis = [], [] for top_outlier, _ in top_outliers_list: if 'indie' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'alternative' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'new wave' in ','.join(cid_tag_dict[top_outlier].keys()): indie_xaxis.append(cid_views_dict[top_outlier]) indie_yaxis.append((cid_views_dict[top_outlier] - cid_views_wo_network_dict[top_outlier]) / 
cid_views_dict[top_outlier]) elif 'rap' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'hip hop' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'rhythm and blues' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'reggae' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'punk' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'funk' in ','.join(cid_tag_dict[top_outlier].keys()) or \ 'r&b' in ','.join(cid_tag_dict[top_outlier].keys()): rap_xaxis.append(cid_views_dict[top_outlier]) rap_yaxis.append((cid_views_dict[top_outlier] - cid_views_wo_network_dict[top_outlier]) / cid_views_dict[top_outlier]) else: other_xaxis.append(cid_views_dict[top_outlier]) other_yaxis.append((cid_views_dict[top_outlier] - cid_views_wo_network_dict[top_outlier]) / cid_views_dict[top_outlier]) for bottom_outlier, _ in bottom_outliers_list: lose_xaxis.append(cid_views_dict[bottom_outlier]) lose_yaxis.append((cid_views_dict[bottom_outlier] - cid_views_wo_network_dict[bottom_outlier]) / cid_views_dict[bottom_outlier]) axes[3].scatter(indie_xaxis, indie_yaxis, marker='^', facecolors='none', edgecolors=color_cycle_4[0], s=20, label='Indie: {0}'.format(len(indie_xaxis))) axes[3].scatter(rap_xaxis, rap_yaxis, marker='o', facecolors='none', edgecolors=color_cycle_4[1], s=20, label='Hip hop: {0}'.format(len(rap_xaxis))) axes[3].scatter(other_xaxis, other_yaxis, marker='s', facecolors='none', edgecolors=color_cycle_4[2], s=20, label='Other: {0}'.format(len(other_xaxis))) # axes[3].scatter(lose_xaxis, lose_yaxis, marker='x', color=color_cycle_4[3], s=20, label='artists lose popularity: {0}'.format(len(bad_xaxis))) axes[3].set_ylim((-0.02, 1.02)) axes[3].set_xscale('log') axes[3].set_xlabel('artist average daily views', fontsize=label_fs) axes[3].set_ylabel('network contribution ratio ' + '$\eta_v$', fontsize=label_fs) axes[3].tick_params(**tick_style) axes[3].legend(frameon=False, loc='lower left') axes[3].set_title('(c)', fontsize=title_fs) hide_spines(axes) timer.stop() plt.tight_layout(w_pad=0.2) plt.savefig('../images/model_prediction_analysis.pdf', bbox_inches='tight') plt.show()
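The network-contribution analysis above calls smape and gini, which live in the project's shared utils package and are not reproduced here. The following is a minimal sketch of both, assuming smape returns a (mean, per-step) pair so that the smape(...)[0] indexing above works, and that gini uses the standard sorted-rank formulation; the project's actual implementations may differ.

import numpy as np

def smape(y_true, y_pred):
    # Symmetric mean absolute percentage error, in percent (range 0-200).
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    denom = np.abs(y_true) + np.abs(y_pred)
    denom[denom == 0] = 1  # treat 0/0 terms as zero error
    per_step = 200 * np.abs(y_pred - y_true) / denom
    return float(np.mean(per_step)), per_step.tolist()

def gini(values):
    # Gini coefficient of non-negative values via the sorted-rank formula:
    # 0 means views are spread evenly across artists, values near 1 mean
    # views are concentrated on a few artists.
    x = np.sort(np.asarray(values, dtype=float))
    n = len(x)
    ranks = np.arange(1, n + 1)
    return float(2 * np.sum(ranks * x) / (n * np.sum(x)) - (n + 1) / n)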
# PROJECT_DIR = os.getcwd() # Must execute from project dir PKG_DIR = os.path.dirname(__file__) PROJECT_DIR = os.path.dirname(PKG_DIR) release_dir = os.path.join(PROJECT_DIR, 'release', option_output_dir) os.chdir(PROJECT_DIR) # Add Paths PATHS.append(os.path.join(PROJECT_DIR, 'bin')) [sys.path.append(p) for p in PATHS] # Additional Paths from Options if option_directory: sys.path.append(option_directory) logger.debug(sys.path) logger.debug(arguments) logger.debug(ASSEMBLIES) if arguments['make']: timer = Timer() if not option_all: ASSEMBLIES = [option_assembly_name] for assembly_name in ASSEMBLIES: assembly_dict = make(release_dir, assembly_name, overwrite=option_overwrite, quiet=option_all) if option_json: dump_json_log(assembly_dict) logger.info('Done: {} seconds'.format(timer.stop()))
# PROJECT_DIR = os.getcwd()  # Must execute from project dir
PKG_DIR = os.path.dirname(__file__)
PROJECT_DIR = os.path.dirname(PKG_DIR)
release_dir = os.path.join(PROJECT_DIR, 'release', option_output_dir)
os.chdir(PROJECT_DIR)

# Add Paths
PATHS.append(os.path.join(PROJECT_DIR, 'bin'))
for p in PATHS:
    sys.path.append(p)

# Additional Paths from Options
if option_directory:
    sys.path.append(option_directory)

logger.debug(sys.path)
logger.debug(arguments)
logger.debug(ASSEMBLIES)

if arguments['make']:
    timer = Timer()
    if not option_all:
        ASSEMBLIES = [option_assembly_name]
    for assembly_name in ASSEMBLIES:
        assembly_dict = make(release_dir, assembly_name,
                             overwrite=option_overwrite, quiet=option_all)
        if option_json:
            dump_json_log(assembly_dict)
    logger.info('Done: {} seconds'.format(timer.stop()))
def main(): timer = Timer() timer.start() consumer_key = conf.twitter_consumer_key consumer_secret = conf.twitter_consumer_secret access_token = conf.twitter_access_token access_token_secret = conf.twitter_access_secret auth = OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = API(auth) app_name = 'mbfc' num_hit_twitter = 0 num_fail_twitter = 0 num_request = 0 # searching Twitter Search API for tweet handles if we cannot find it on webpage with open('data/{0}/{0}_ratings_v4.csv'.format(app_name), 'w') as fout: with open('data/{0}/{0}_ratings_v3.csv'.format(app_name), 'r') as fin: fout.write(fin.readline()) for line in fin: title, tail = line.rstrip().split(',', 1) middle, website_url, tw_handle, tw_sim, yt_id, yt_user = tail.rsplit( ',', 5) if tw_handle == '': print('===============') print('media title', title) print('website url', website_url) # get the first 10 Twitter users returned_users = api.search_users(title, count=10) num_request += 1 to_write = True for user in returned_users: user_json = user._json screen_name = user_json['screen_name'].lower() if match_website_on_twitter_page( user_json, website_url): selected_tw_handle = screen_name num_hit_twitter += 1 tw_similarity = SequenceMatcher( None, tldextract.extract( get_domain(website_url)).domain, selected_tw_handle).ratio() fout.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( title, middle, website_url, selected_tw_handle, tw_similarity, yt_id, yt_user)) to_write = False print('find twitter handle:', selected_tw_handle) print('success index {0}/{2}: {1}'.format( num_hit_twitter, title, num_request)) break if to_write: num_fail_twitter += 1 fout.write(line) print('xxx failed to find for this media {0}, {1}!'. format(title, website_url)) print('fail index {0}/{2}: {1}'.format( num_fail_twitter, title, num_request)) else: fout.write(line) print('number of requests sent: {0}'.format(num_request)) timer.stop()
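The search loop above defers to match_website_on_twitter_page (defined elsewhere in the repository) to decide whether a candidate Twitter account actually belongs to the media outlet. The sketch below is a hypothetical reading of that check, assuming it compares the registered domain of the URL on the candidate's profile against the outlet's website; the real helper may inspect additional profile fields.

import tldextract

def match_website_on_twitter_page(user_json, website_url):
    # A candidate account matches when the registered domain of any URL on
    # its profile equals the registered domain of the outlet's website.
    target_domain = tldextract.extract(website_url).registered_domain.lower()
    candidate_urls = []
    for url_obj in user_json.get('entities', {}).get('url', {}).get('urls', []):
        if url_obj.get('expanded_url'):
            candidate_urls.append(url_obj['expanded_url'])
    if user_json.get('url'):
        candidate_urls.append(user_json['url'])
    return any(tldextract.extract(u).registered_domain.lower() == target_domain
               for u in candidate_urls)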
def main(): timer = Timer() timer.start() app_name = 'mbfc' num_hit_youtube = 0 num_fail_youtube = 0 num_search = 0 with open('data/{0}/{0}_ratings_v5.csv'.format(app_name), 'w') as fout: with open('data/{0}/{0}_ratings_v4.csv'.format(app_name), 'r') as fin: fout.write(fin.readline()) for line in fin: title, tail = line.rstrip().split(',', 1) middle, website_url, tw_handle, tw_sim, yt_id, yt_user = tail.rsplit( ',', 5) if yt_id != '' or yt_user != '': fout.write(line) else: # searching YouTube search bar for youtube channels if we cannot find it on webpage print('===============') num_search += 1 num_search_results = 0 search_results = [] for _ in range(5): if num_search_results == 0: print( 'sent a request to YouTube search bar with title "{0}"...' .format(title)) search_request = get_search_request(title) print(search_request) try: search_response = requests.get( search_request, headers={ 'User-Agent': random.choice(USER_AGENT_LIST) }) except Exception as e: print(str(e)) search_response = requests.get(search_request) time.sleep(1) if search_response: html = search_response.text try: initial_data = json.loads( find_value( html, 'window["ytInitialData"] = ', 0, '\n').rstrip(';')) # print(json.dumps(initial_data)) except: continue # get the first 10 YouTube channels search_results = list( search_dict(initial_data, 'channelRenderer'))[:10] num_search_results = len(search_results) print('find {0} search results'.format( num_search_results)) found_match = False for search_result in search_results: channel_title = search_result['title']['simpleText'] channel_id = search_result['navigationEndpoint'][ 'browseEndpoint']['browseId'] print(channel_title, channel_id) if channel_title != '': print( YOUTUBE_CHANNEL_ABOUT.format( channel_id=channel_id)) channel_response = requests.get( YOUTUBE_CHANNEL_ABOUT.format( channel_id=channel_id)) time.sleep(1) if match_links_on_youtube_page( channel_response, website_url, tw_handle): found_match = True yt_id = channel_id break if found_match: num_hit_youtube += 1 print('success index {0}/{4}: {1}, {2}, {3}'.format( num_hit_youtube, title, yt_id, '', num_search)) fout.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( title, middle, website_url, tw_handle, tw_sim, yt_id, '')) else: num_fail_youtube += 1 print('fail index {0}/{2}: {1}'.format( num_fail_youtube, title, num_search)) fout.write(line) timer.stop()
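Parsing window["ytInitialData"] above relies on two small helpers, find_value and search_dict, imported from elsewhere in the project. Minimal sketches consistent with how they are called (substring slicing, and a breadth-first walk of the nested JSON); treat them as illustrative rather than the project's exact code.

def find_value(html, key, num_chars=0, separator='"'):
    # Return the text between `key` (skipping `num_chars` extra characters)
    # and the next occurrence of `separator`.
    start = html.find(key) + len(key) + num_chars
    end = html.find(separator, start)
    return html[start:end]

def search_dict(partial, search_key):
    # Walk a nested dict/list (e.g. ytInitialData) breadth-first and yield
    # every value stored under `search_key`, in document order.
    stack = [partial]
    while stack:
        current = stack.pop(0)
        if isinstance(current, dict):
            for key, value in current.items():
                if key == search_key:
                    yield value
                else:
                    stack.append(value)
        elif isinstance(current, list):
            stack.extend(current)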
Usage: python duration_predictor.py -i ./ -o ./output -f re
Time: ~1M
"""

import os, sys, pickle, argparse
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
from utils.helper import Timer, write_dict_to_pickle
from utils.converter import to_watch_percentage

if __name__ == '__main__':
    # == == == == == == == == Part 1: Set up experiment parameters == == == == == == == == #
    timer = Timer()
    timer.start()

    test_vids = []
    test_duration = []
    true_engagement = []
    guess_engagement = []

    # == == == == == == == == Part 2: Load dataset == == == == == == == == #
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help='input file dir of formatted dataset', required=True)
    parser.add_argument('-o', '--output', help='output file dir of predictor result', required=True)
    parser.add_argument('-f', '--function', help='choose prediction target', required=True)
    args = parser.parse_args()
    input_dir = args.input
def main(): timer = Timer() timer.start() cornflower_blue = ColorPalette.BLUE tomato = ColorPalette.TOMATO color_cycle_4 = ColorPalette.CC4 label_fs = ColorPalette.LABELFS title_fs = ColorPalette.TITLEFS tick_style = ColorPalette.TICKSTYLE bar_text_style = ColorPalette.BARTEXTSTYLE data_loader = DataLoader() data_loader.load_embed_content_dict() embed_cid_dict = data_loader.embed_cid_dict embed_genre_dict = data_loader.embed_genre_dict fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4)) gs = axes[0, 0].get_gridspec() for ax in axes[:, 0]: ax.remove() ax_left = fig.add_subplot(gs[:, 0]) for ax in axes[:, 1]: ax.remove() ax_mid = fig.add_subplot(gs[:, 1]) axes = [ax_left, ax_mid, axes[0, 2], axes[1, 2]] # == == == == == == Part 1: Plot the probability of forming a persistent link == == == == == == # p_form_list = [] p_persistent_list = [] with open('./justify_persistent_link.log', 'r') as fin: for line in fin: _, p_form, _, p_persistent = re.split(',|:', line) p_form = float(p_form.strip()) p_persistent = float(p_persistent.strip()) p_form_list.append(p_form) p_persistent_list.append(p_persistent) axes[0].plot(p_form_list, p_persistent_list, color=cornflower_blue) for p_form in [0.5, 0.7, 0.8, 0.9]: p_persistent = p_persistent_list[int(p_form * 100)] axes[0].scatter(p_form, p_persistent, s=15, c=tomato, edgecolors='k', zorder=30) axes[0].text(p_form - 0.01, p_persistent, '({0:.2f}, {1:.2f})'.format(p_form, p_persistent), ha='right', va='bottom') axes[0].set_xlabel('prob. of forming a link', fontsize=label_fs) axes[0].set_ylabel('prob. of being persistent link', fontsize=label_fs) axes[0].tick_params(**tick_style) axes[0].set_title('(a)', fontsize=title_fs) # == == == == == == Part 2: Plot the portion of persistent links that pass statistics test == == == == == == # log_files_list = [ './random_pearsonr.log', './ephemeral_pearsonr.log', './persistent_pearsonr.log', './reciprocal_pearsonr.log' ] link_cnt_list = [] sign_ratio_list = [] same_artist_list = [] sign_ratio_same_artist_list = [] same_genre_list = [] sign_ratio_same_genre_list = [] for log_file in log_files_list: cnt = 0 same_artist_cnt = 0 same_genre_cnt = 0 sign_cnt = 0 sign_cnt_same_artist = 0 sign_cnt_same_genre = 0 with open(log_file, 'r') as fin: for line in fin: src_embed, tar_embed, r, p = line.rstrip().split(',') src_embed = int(src_embed) tar_embed = int(tar_embed) r = float(r) p = float(p) if p < 0.05: sign_cnt += 1 cnt += 1 if embed_cid_dict[src_embed] == embed_cid_dict[tar_embed]: same_artist_cnt += 1 if p < 0.05: sign_cnt_same_artist += 1 if is_same_genre(embed_genre_dict[src_embed], embed_genre_dict[tar_embed]): same_genre_cnt += 1 if p < 0.05: sign_cnt_same_genre += 1 sign_ratio_list.append(sign_cnt / cnt) same_artist_list.append(same_artist_cnt / cnt) sign_ratio_same_artist_list.append(sign_cnt_same_artist / cnt) same_genre_list.append(same_genre_cnt / cnt) sign_ratio_same_genre_list.append(sign_cnt_same_genre / cnt) link_cnt_list.append(cnt) print( '#links: {0}, #sign links: {1}, #sign same artist: {2}, #sign same genre: {3}' .format(cnt, sign_cnt, sign_cnt_same_artist, sign_cnt_same_genre)) ind = np.arange(len(log_files_list)) axes[1].bar(ind, sign_ratio_list, 0.6, edgecolor=['k'] * 4, color=color_cycle_4, lw=1.5, alpha=0.6) axes[1].set_ylim([0, axes[0].get_ylim()[1]]) axes[1].set_ylabel('percentage of links with p<0.05', fontsize=label_fs) axes[1].set_xticklabels( ('', 'random' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[0]), 'ephemeral' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[1]), 'persistent' + 
r'$^{-}$' + '\n({0:,})'.format(link_cnt_list[2]), 'reciprocal' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[3]))) for tick in ind: axes[1].text(tick, sign_ratio_list[tick] + 0.01, '{0:.3f}'.format(sign_ratio_list[tick]), **bar_text_style) axes[1].tick_params(**tick_style) axes[1].set_title('(b)', fontsize=title_fs) # == == == == == == Part 3: Plot the percentage of significant persistent links belong to the same artist or contain the same genre == == == == == == # axes[2].bar(ind, np.array(same_artist_list) - np.array(sign_ratio_same_artist_list), 0.6, bottom=sign_ratio_same_artist_list, edgecolor=color_cycle_4, color=['w'] * 4, hatch='//', lw=1.5, alpha=0.6) axes[2].bar(ind, sign_ratio_same_artist_list, 0.6, edgecolor=['k'] * 4, color=color_cycle_4, lw=1.5, alpha=0.6) axes[2].set_ylim([0, axes[0].get_ylim()[1]]) axes[2].set_ylabel('same artist', fontsize=label_fs) axes[2].text(0, same_artist_list[0] + 0.01, '{0:.3f}'.format(same_artist_list[0]), **bar_text_style) for tick in ind[1:]: axes[2].text(tick, same_artist_list[tick] + 0.01, '{0:.3f}'.format(same_artist_list[tick]), **bar_text_style) axes[2].text(tick, sign_ratio_same_artist_list[tick] + 0.01, '{0:.3f}'.format(sign_ratio_same_artist_list[tick]), **bar_text_style) axes[2].tick_params(**tick_style) axes[2].get_xaxis().set_visible(False) axes[2].set_title('(c)', fontsize=title_fs) axes[3].bar(ind, np.array(same_genre_list) - np.array(sign_ratio_same_genre_list), 0.6, bottom=sign_ratio_same_genre_list, edgecolor=color_cycle_4, color=['w'] * 4, hatch='//', lw=1.5, alpha=0.6) axes[3].bar(ind, sign_ratio_same_genre_list, 0.6, edgecolor=['k'] * 4, color=color_cycle_4, lw=1.5, alpha=0.6) axes[3].set_ylim([0, axes[0].get_ylim()[1]]) axes[3].set_ylabel('same genre', fontsize=label_fs) for tick in ind: axes[3].text(tick, same_genre_list[tick] + 0.01, '{0:.3f}'.format(same_genre_list[tick]), **bar_text_style) axes[3].text(tick, sign_ratio_same_genre_list[tick] + 0.01, '{0:.3f}'.format(sign_ratio_same_genre_list[tick]), **bar_text_style) axes[3].tick_params(**tick_style) axes[3].set_xticklabels( ('', 'random' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[0]), 'ephemeral' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[1]), 'persistent' + r'$^{-}$' + '\n({0:,})'.format(link_cnt_list[2]), 'reciprocal' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[3]))) hide_spines(axes) timer.stop() plt.tight_layout() plt.savefig('../images/model_persistent_links.pdf', bbox_inches='tight') if not platform.system() == 'Linux': plt.show()
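Both this script and the network-contribution analysis earlier rely on is_same_genre from the shared utils. A one-line sketch, under the assumption that "containing the same genre" means the two videos' genre lists overlap:

def is_same_genre(genres_a, genres_b):
    # Two videos are treated as sharing a genre if their genre lists intersect.
    return len(set(genres_a) & set(genres_b)) > 0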
def main(): timer = Timer() timer.start() cc4 = ColorPalette.CC4 blue = cc4[0] app_name = 'cyberbullying' rho = 0.5272 entity = 'user' fig, axes = plt.subplots(1, 2, figsize=(10, 3.3)) print('for entity: {0}'.format(entity)) sample_entity_freq_dict = defaultdict(int) with open('../data/{1}_out/{0}_{1}_all.txt'.format(entity, app_name), 'r') as sample_datefile: for line in sample_datefile: sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1 complete_entity_freq_dict = defaultdict(int) with open('../data/{1}_out/complete_{0}_{1}.txt'.format(entity, app_name), 'r') as complete_datefile: for line in complete_datefile: complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1 complete_to_sample_freq_dict = defaultdict(list) sample_to_complete_freq_dict = defaultdict(list) for item, complete_vol in complete_entity_freq_dict.items(): if item in sample_entity_freq_dict: complete_to_sample_freq_dict[complete_vol].append(sample_entity_freq_dict[item]) else: complete_to_sample_freq_dict[complete_vol].append(0) for item, sample_vol in sample_entity_freq_dict.items(): sample_to_complete_freq_dict[sample_vol].append(complete_entity_freq_dict[item]) for item in set(complete_entity_freq_dict.keys()) - set(sample_entity_freq_dict.keys()): sample_to_complete_freq_dict[0].append(complete_entity_freq_dict[item]) ax1_x_axis = range(1, 101) ax1_y_axis = [] empirical_mean_list = [] expected_mean_list = [] for num_sample in ax1_x_axis: # compute sample to complete empirical_cnt_dist = sample_to_complete_freq_dict[num_sample] neg_binomial_cnt_dist = [] for x in range(num_sample, max(30, 3 * num_sample + 1)): neg_binomial_cnt_dist.extend([x] * int(negative_binomial(x, num_sample, rho) * len(empirical_cnt_dist))) ks_test = stats.ks_2samp(empirical_cnt_dist, neg_binomial_cnt_dist) empirical_mean = sum(empirical_cnt_dist) / len(empirical_cnt_dist) empirical_mean_list.append(empirical_mean) expected_mean = sum(neg_binomial_cnt_dist) / len(neg_binomial_cnt_dist) expected_mean_list.append(expected_mean) print('num_sample: {0}, number of Bernoulli trials: {1}, d_statistic: {2:.4f}, p: {3:.4f}, expected mean: {4:.2f}, empirical mean: {5:.2f}' .format(num_sample, len(empirical_cnt_dist), ks_test[0], ks_test[1], expected_mean, empirical_mean)) ax1_y_axis.append(ks_test[0]) axes[0].plot(ax1_x_axis, ax1_y_axis, c='k', lw=1.5, ls='-') axes[0].set_xlabel(r'sample frequency $n_s$', fontsize=16) axes[0].set_ylabel('D-statistic', fontsize=16) axes[0].set_xlim([-2, 102]) axes[0].set_xticks([0, 25, 50, 75, 100]) axes[0].set_ylim([0, 0.17]) axes[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{0:.2f}'.format(x))) axes[0].tick_params(axis='both', which='major', labelsize=16) axes[0].set_title('(a)', fontsize=18, pad=-3*72, y=1.0001) # show an example num_sample = np.argmin(ax1_y_axis) + 1 axes[0].scatter(num_sample, ax1_y_axis[num_sample - 1], s=40, c=blue, zorder=30) axes[0].set_yticks([0, ax1_y_axis[num_sample - 1], 0.05, 0.1, 0.15]) axes[0].plot([axes[0].get_xlim()[0], num_sample], [ax1_y_axis[num_sample - 1], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1) axes[0].plot([num_sample, num_sample], [axes[0].get_ylim()[0], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1) # plot sample to complete ax2_x_axis = range(num_sample, max(30, 3 * num_sample + 1)) num_items = len(sample_to_complete_freq_dict[num_sample]) sample_to_complete_cnt = Counter(sample_to_complete_freq_dict[num_sample]) ax2_y_axis = [sample_to_complete_cnt[x] / num_items for x in ax2_x_axis] ax2_neg_binomial_axis = [negative_binomial(x, 
num_sample, rho) for x in ax2_x_axis] axes[1].plot(ax2_x_axis, ax2_y_axis, c=blue, lw=1.5, ls='-', marker='o', zorder=20, label='empirical') axes[1].plot(ax2_x_axis, ax2_neg_binomial_axis, c='k', lw=1.5, ls='-', marker='x', zorder=10, label='negative binomial') axes[1].set_xlabel(r'complete frequency $n_c$', fontsize=16) axes[1].set_ylabel(r'Pr($n_c$|$n_s$={0})'.format(num_sample), fontsize=16) axes[1].set_xticks([num_sample, 2 * num_sample, 3 * num_sample]) axes[1].set_ylim([-0.005, 0.15]) axes[1].set_yticks([0, 0.05, 0.1]) axes[1].tick_params(axis='both', which='major', labelsize=16) axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper left') axes[1].set_title('(b)', fontsize=18, pad=-3*72, y=1.0001) axes[1].plot([empirical_mean_list[num_sample - 1], empirical_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color=blue, ls='--', lw=1) axes[1].plot([expected_mean_list[num_sample - 1], expected_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color='k', ls='--', lw=1) hide_spines(axes) timer.stop() plt.tight_layout(rect=[0, 0.05, 1, 1]) plt.savefig('../images/entity_negative_binomial.pdf', bbox_inches='tight') if not platform.system() == 'Linux': plt.show()
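negative_binomial encodes the key modelling assumption in the panel above: if every tweet is retained independently with probability rho, then the complete frequency n_c of an entity observed n_s times in the sample follows a negative binomial distribution. A sketch of the pmf as it is presumably implemented (the project's helper may be vectorised differently):

from scipy.special import comb

def negative_binomial(x, k, rho):
    # Pr(n_c = x | n_s = k) under Bernoulli sampling with rate rho:
    #   C(x-1, k-1) * rho^k * (1-rho)^(x-k)   for x >= k, else 0.
    if x < k:
        return 0.0
    return comb(x - 1, k - 1) * rho ** k * (1 - rho) ** (x - k)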
def main(): timer = Timer() timer.start() cc4 = ColorPalette.CC4 blue = cc4[0] fig, axes = plt.subplots(1, 2, figsize=(10, 3.3)) timestamp_list = [] sec_count_dict = defaultdict(int) ms_list = [] with open('rate_limit_2015-09-08.txt', 'r') as fin: for line in fin: rate_json = json.loads(line.rstrip()) ms_list.append(int(rate_json['limit']['timestamp_ms'][-3:])) timestamp = datetime.utcfromtimestamp( (int(rate_json['limit']['timestamp_ms']) - 666) // 1000) timestamp_list.append(timestamp) sec_count_dict[timestamp] += 1 print('{0:.2f}% rate limit messages come from millisecond 700 to 1000'. format(len([x for x in ms_list if x >= 700]) / len(ms_list) * 100)) sns.distplot(ms_list, bins=200, color=blue, ax=axes[0], kde_kws={ 'shade': False, 'linewidth': 1.5, 'color': 'k' }) axes[0].set_xticks([0, 250, 500, 750, 1000]) axes[0].set_xlim([-50, 1050]) axes[0].set_xlabel('millisecond', fontsize=16) axes[0].set_ylabel('density', fontsize=16) axes[0].tick_params(axis='both', which='major', labelsize=16) axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001) sec_count_stats = Counter(sec_count_dict.values()) x_axis = sorted(sec_count_stats.keys()) axes[1].bar(x_axis, [sec_count_stats[x] for x in x_axis], facecolor=blue, edgecolor='k', width=0.7) axes[1].set_xticks([1, 2, 3, 4]) axes[1].set_xlim([0, 5]) axes[1].set_xlabel('#rate limit messages per second', fontsize=16) axes[1].set_ylabel('frequency', fontsize=16) axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt)) axes[1].tick_params(axis='both', which='major', labelsize=16) axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001) hide_spines(axes) timer.stop() plt.tight_layout(rect=[0, 0.05, 1, 1]) plt.savefig('../images/SI_ratemsg_dist.pdf', bbox_inches='tight') if not platform.system() == 'Linux': plt.show()
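concise_fmt and hide_spines are small plotting utilities used by most figures in this document but defined elsewhere. Rough stand-ins are shown below, assuming concise_fmt abbreviates tick values into K/M units and hide_spines removes the top and right spines; the real helpers may format differently.

import numpy as np

def concise_fmt(x, pos=None):
    # Abbreviate large tick values, e.g. 1200000 -> '1.2M'.
    if abs(x) >= 1e6:
        return '{0:.1f}M'.format(x / 1e6)
    if abs(x) >= 1e3:
        return '{0:.0f}K'.format(x / 1e3)
    return '{0:.0f}'.format(x)

def hide_spines(axes):
    # Hide the top and right spines of every axis in a list/array of axes.
    for ax in np.ravel(axes):
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)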
def main(): timer = Timer() timer.start() n_cluster = 6 complete_cluster_size_list = [] for i in range(n_cluster): with open('{0}_cluster{1}.txt'.format('complete', i), 'r') as fin: complete_nodes = set(fin.readline().split(',')) num_entities = len(complete_nodes) num_users = len([x for x in complete_nodes if x.startswith('u')]) num_hashtags = num_entities - num_users complete_cluster_size_list.append((num_entities, num_users, num_hashtags)) complete_sorted_by_size = sorted(enumerate(complete_cluster_size_list), key=lambda x: x[1][0], reverse=True) complete_sorted_by_size_copy = [] complete_sorted_by_size_copy.append(complete_sorted_by_size[0]) complete_sorted_by_size_copy.append(complete_sorted_by_size[4]) complete_sorted_by_size_copy.append(complete_sorted_by_size[5]) complete_sorted_by_size_copy.append(complete_sorted_by_size[3]) complete_sorted_by_size_copy.append(complete_sorted_by_size[2]) complete_sorted_by_size_copy.append(complete_sorted_by_size[1]) complete_sorted_by_size = complete_sorted_by_size_copy print(complete_sorted_by_size) sample_cluster_size_list = [] for i in range(n_cluster): with open('{0}_cluster{1}.txt'.format('sample', i), 'r') as fin: sample_nodes = set(fin.readline().split(',')) num_entities = len(sample_nodes) num_users = len([x for x in sample_nodes if x.startswith('u')]) num_hashtags = num_entities - num_users sample_cluster_size_list.append((num_entities, num_users, num_hashtags)) sample_sorted_by_size = sorted(enumerate(sample_cluster_size_list), key=lambda x: x[1][0], reverse=True) sample_sorted_by_size_copy = [] sample_sorted_by_size_copy.append(sample_sorted_by_size[0]) sample_sorted_by_size_copy.append(sample_sorted_by_size[1]) sample_sorted_by_size_copy.append(sample_sorted_by_size[4]) sample_sorted_by_size_copy.append(sample_sorted_by_size[2]) sample_sorted_by_size_copy.append(sample_sorted_by_size[5]) sample_sorted_by_size_copy.append(sample_sorted_by_size[3]) sample_sorted_by_size = sample_sorted_by_size_copy print(sample_sorted_by_size) complete_clusters_list = [] for i, _ in complete_sorted_by_size: with open('{0}_cluster{1}.txt'.format('complete', i), 'r') as fin: complete_nodes = set(fin.readline().split(',')) complete_clusters_list.append(complete_nodes) sample_clusters_list = [] for i, _ in sample_sorted_by_size: with open('{0}_cluster{1}.txt'.format('sample', i), 'r') as fin: sample_nodes = set(fin.readline().split(',')) sample_clusters_list.append(sample_nodes) col_labels = ['SC1', 'SC2', 'SC3', 'SC4', 'SC5', 'SC6', 'Missing', 'Total'] row_labels = ['CC1', 'CC2', 'CC3', 'CC4', 'CC5', 'CC6', 'Total'] n_row = len(row_labels) n_col = len(col_labels) confusion_mat = np.zeros(shape=(n_row, n_col)) confusion_mat_rate = np.zeros(shape=(n_row, n_col)) confusion_mat_annot = [[[] for _ in range(n_col)] for _ in range(n_row)] for i in range(n_row - 1): cnt0 = cnt = complete_sorted_by_size[i][1][0] print('from complete cluster {0}'.format(i + 1), cnt0) for j in range(n_col - 2): tmp = len(complete_clusters_list[i].intersection(sample_clusters_list[j])) print('>>> to sample cluster {0}: '.format(j + 1), tmp, tmp/cnt0) confusion_mat[i, j] = tmp confusion_mat_rate[i, j] = tmp / cnt0 cnt -= tmp if tmp > 0: confusion_mat_annot[i][j] = '{0}\n{1:.1f}%'.format(concise_fmt(tmp, None), 100*tmp/cnt0) else: confusion_mat_annot[i][j] = '{0}'.format(concise_fmt(tmp, None)) print('>>> to missing: ', cnt/cnt0) confusion_mat[i, -2] = cnt if cnt > 0: confusion_mat_annot[i][-2] = '{0}\n{1:.1f}%'.format(concise_fmt(cnt, None), 100*cnt / cnt0) else: 
confusion_mat_annot[i][-2] = '{0}'.format(concise_fmt(cnt, None)) confusion_mat_rate[i, -2] = cnt / cnt0 confusion_mat[i, -1] = cnt0 # confusion_mat_rate[i, -1] = 0 for j in range(n_row): # confusion_mat_annot[j][-1] = '{0}\n{1:.1f}%'.format(concise_fmt(confusion_mat[j, -1]), 100*confusion_mat[j, -1]/sum(confusion_mat[:-1, -1])) confusion_mat_annot[j][-1] = '{0}'.format(concise_fmt(confusion_mat[j, -1], None)) # confusion_mat_annot[-1][j] = '{0}\n{1:.1f}%'.format(concise_fmt(sum(confusion_mat[:-1, j])), 100*sum(confusion_mat[:-1, j]) / sum(confusion_mat[:-1, -1])) confusion_mat_annot[-1][j] = '{0}'.format(concise_fmt(sum(confusion_mat[:-1, j]), None)) # confusion_mat_rate[-1, j] = 0 # confusion_mat_annot[-1][-1] = '{0}\n{1:.0f}%'.format(concise_fmt(sum(confusion_mat[:-1, -1])), 100) confusion_mat_annot[-1][-1] = '{0}'.format(concise_fmt(sum(confusion_mat[:-1, -1]), None)) confusion_mat_annot = np.array(confusion_mat_annot) fig, ax1 = plt.subplots(1, 1) sns.heatmap(confusion_mat_rate, annot=confusion_mat_annot, cmap=ccmap, fmt='s', ax=ax1, cbar_kws={'label': 'ratio from complete clusters to sample clusters', 'shrink': .6}) ax1.set_title('clusters in sample set', loc='right') ax1.set_title('clusters in complete set', loc='left') ax1.set_xticklabels(col_labels, ha='center') ax1.set_yticklabels(row_labels, rotation=90, va='center') ax1.xaxis.tick_top() ax1.hlines(y=0, xmin=n_col-1, xmax=n_col) ax1.hlines(y=n_row-1, xmin=0, xmax=n_col-1) ax1.hlines(y=n_row, xmin=0, xmax=n_col) ax1.vlines(x=0, ymin=n_row-1, ymax=n_row) ax1.vlines(x=n_col-1, ymin=0, ymax=n_row-1) ax1.vlines(x=n_col, ymin=0, ymax=n_row) cbar_ax = fig.axes[-1] cbar_ax.set_frame_on(True) timer.stop() plt.tight_layout(rect=[0.04, 0, 1, 1]) plt.savefig('../images/measure_bipartite_cluster_flow.pdf', bbox_inches='tight') if not platform.system() == 'Linux': plt.show()
def main(): timer = Timer() timer.start() app_name = 'cyberbullying' hours_in_day = 24 minutes_in_hour = 60 seconds_in_minute = 60 ms_in_second = 1000 num_bins = 100 width = ms_in_second // num_bins num_top = 500 fig, axes = plt.subplots(1, 2, figsize=(7.2, 4.8), gridspec_kw={'width_ratios': [2.75, 3]}) axes = axes.ravel() confusion_sampling_rate = np.load( '../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name)) confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate) load_external_data = True if not load_external_data: sample_entity_stats = defaultdict(int) with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin: for line in fin: split_line = line.rstrip().split(',') sample_entity_stats[split_line[1]] += 1 # == == == == == == Part 2: Plot entity rank == == == == == == # print('>>> found top {0} users in sample set...'.format(num_top)) sample_top = [ kv[0] for kv in sorted(sample_entity_stats.items(), key=lambda x: x[1], reverse=True)[:num_top] ] # == == == == == == Part 1: Find tweets appearing in complete set == == == == == == # complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)] complete_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)] complete_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)] complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)] complete_entity_stats = defaultdict(int) with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name), 'r') as fin: for line in fin: split_line = line.rstrip().split(',') user_id = split_line[1] if user_id in sample_top: complete_entity_stats[user_id] += 1 user_idx = sample_top.index(user_id) tweet_id = split_line[0] timestamp_ms = melt_snowflake(tweet_id)[0] dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000) hour = dt_obj.hour minute = dt_obj.minute second = dt_obj.second millisec = timestamp_ms % 1000 ms_idx = (millisec - 7) // width if millisec >= 7 else ( 1000 + millisec - 7) // width complete_post_lists_hour[user_idx][hour] += 1 complete_post_lists_min[user_idx][minute] += 1 complete_post_lists_sec[user_idx][second] += 1 complete_post_lists_10ms[user_idx][ms_idx] += 1 write_to_file('./complete_post_lists_hour.txt', sample_top, complete_post_lists_hour) write_to_file('./complete_post_lists_min.txt', sample_top, complete_post_lists_min) write_to_file('./complete_post_lists_sec.txt', sample_top, complete_post_lists_sec) write_to_file('./complete_post_lists_10ms.txt', sample_top, complete_post_lists_10ms) print('>>> finish dumping complete lists...') timer.stop() # == == == == == == Part 2: Find appearing tweets in sample set == == == == == == # sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)] sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)] sample_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)] sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)] estimated_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)] estimated_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)] estimated_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)] estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)] hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3)) minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3)) secondly_conversion = np.mean(confusion_sampling_rate, axis=(3)) with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin: for line in fin: split_line = 
line.rstrip().split(',') user_id = split_line[1] if user_id in sample_top: user_idx = sample_top.index(user_id) tweet_id = split_line[0] timestamp_ms = melt_snowflake(tweet_id)[0] dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000) hour = dt_obj.hour minute = dt_obj.minute second = dt_obj.second millisec = timestamp_ms % 1000 ms_idx = (millisec - 7) // width if millisec >= 7 else ( 1000 + millisec - 7) // width sample_post_lists_hour[user_idx][hour] += 1 sample_post_lists_min[user_idx][minute] += 1 sample_post_lists_sec[user_idx][second] += 1 sample_post_lists_10ms[user_idx][ms_idx] += 1 estimated_post_lists_hour[user_idx][ hour] += 1 / hourly_conversion[hour] estimated_post_lists_min[user_idx][ minute] += 1 / minutey_conversion[hour, minute] estimated_post_lists_sec[user_idx][ second] += 1 / secondly_conversion[hour, minute, second] estimated_post_lists_10ms[user_idx][ ms_idx] += 1 / confusion_sampling_rate[hour, minute, second, ms_idx] write_to_file('./sample_post_lists_hour.txt', sample_top, sample_post_lists_hour) write_to_file('./sample_post_lists_min.txt', sample_top, sample_post_lists_min) write_to_file('./sample_post_lists_sec.txt', sample_top, sample_post_lists_sec) write_to_file('./sample_post_lists_10ms.txt', sample_top, sample_post_lists_10ms) write_to_file('./estimated_post_lists_hour.txt', sample_top, estimated_post_lists_hour) write_to_file('./estimated_post_lists_min.txt', sample_top, estimated_post_lists_min) write_to_file('./estimated_post_lists_sec.txt', sample_top, estimated_post_lists_sec) write_to_file('./estimated_post_lists_10ms.txt', sample_top, estimated_post_lists_10ms) print('>>> finish dumping sample and estimated lists...') timer.stop() else: sample_top = [] complete_post_lists_hour = [] with open('./complete_post_lists_hour.txt', 'r') as fin: for line in fin: user_id, total, records = line.rstrip().split('\t') sample_top.append(user_id) records = list(map(int, records.split(','))) complete_post_lists_hour.append(records) sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt', dtype=0) sample_post_lists_min = read_from_file('./sample_post_lists_min.txt', dtype=0) sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt', dtype=0) sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt', dtype=0) estimated_post_lists_hour = read_from_file( './estimated_post_lists_hour.txt', dtype=1) estimated_post_lists_min = read_from_file( './estimated_post_lists_min.txt', dtype=1) estimated_post_lists_sec = read_from_file( './estimated_post_lists_sec.txt', dtype=1) estimated_post_lists_10ms = read_from_file( './estimated_post_lists_10ms.txt', dtype=1) # == == == == == == Part 3: Find the best estimation by comparing JS distance == == == == == == # ret = {} num_estimate_list = [] num_sample_list = [] num_complete_list = [] sample_entity_stats = { user_id: sum(sample_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top) } complete_entity_stats = { user_id: sum(complete_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top) } min_mat = np.array([], dtype=np.int64).reshape(0, 60) sec_mat = np.array([], dtype=np.int64).reshape(0, 60) for user_idx, user_id in enumerate(sample_top): num_sample = sample_entity_stats[user_id] num_complete = complete_entity_stats[user_id] hour_entropy = entropy(sample_post_lists_hour[user_idx], base=hours_in_day) min_entropy = entropy(sample_post_lists_min[user_idx], base=minutes_in_hour) sec_entropy = entropy(sample_post_lists_sec[user_idx], base=seconds_in_minute) 
ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins) min_mat = np.vstack( (min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1, -1))) sec_mat = np.vstack( (sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1, -1))) min_entropy, min_entropy_idx = min( (min_entropy, min_entropy_idx) for (min_entropy_idx, min_entropy ) in enumerate([hour_entropy, min_entropy, sec_entropy])) if ms10_entropy < 0.87: min_entropy_idx = 3 else: min_entropy_idx = 2 # # if they are all very large # if min_entropy >= msly_entropy_benchmark: # min_entropy_idx = 2 num_estimate = sum([ estimated_post_lists_hour[user_idx], estimated_post_lists_min[user_idx], estimated_post_lists_sec[user_idx], estimated_post_lists_10ms[user_idx] ][min_entropy_idx]) num_estimate_list.append(num_estimate) num_sample_list.append(num_sample) num_complete_list.append(num_complete) ret[user_id] = (num_sample, num_complete, num_estimate, min_entropy_idx) np.savetxt('min_sample.npy', min_mat, delimiter=',') np.savetxt('sec_sample.npy', sec_mat, delimiter=',') rank_by_sample = [ k for k, v in sorted( ret.items(), key=lambda item: item[1][0], reverse=True) ] rank_by_complete = [ k for k, v in sorted( ret.items(), key=lambda item: item[1][1], reverse=True) ] rank_by_estimated = [ k for k, v in sorted( ret.items(), key=lambda item: item[1][2], reverse=True) ] for user_idx, user_id in enumerate(sample_top): print(user_id, ret[user_id][:-1], (rank_by_sample.index(user_id) + 1, rank_by_complete.index(user_id) + 1, rank_by_estimated.index(user_id) + 1)) print( ret[user_id][0] / ret[user_id][1], mape(ret[user_id][1], ret[user_id][2])[0], rank_by_sample.index(user_id) - rank_by_complete.index(user_id), rank_by_estimated.index(user_id) - rank_by_complete.index(user_id)) print(np.sum(np.array(sample_post_lists_min[user_idx]) > 0), np.sum(np.array(sample_post_lists_sec[user_idx]) > 0), np.sum(np.array(sample_post_lists_10ms[user_idx]) > 0)) observed_top100 = rank_by_sample[:100] complete_rank_for_observed_top100 = [ rank_by_complete.index(uid) + 1 for uid in observed_top100 ] user_sampling_rates_for_observed_top100 = [ sample_entity_stats[uid] / complete_entity_stats[uid] for uid in observed_top100 ] print('kendall tau for observed', kendalltau(range(1, 101), complete_rank_for_observed_top100)) estimated_top100 = rank_by_estimated[:100] complete_rank_for_estimated_top100 = [ rank_by_complete.index(uid) + 1 for uid in estimated_top100 ] user_sampling_rates_for_estimated_top100 = [ sample_entity_stats[uid] / complete_entity_stats[uid] for uid in estimated_top100 ] print('kendall tau for estimated', kendalltau(range(1, 101), complete_rank_for_estimated_top100)) axes[0].scatter(range(1, 101), complete_rank_for_observed_top100, s=30, c=user_sampling_rates_for_observed_top100, edgecolors='gray', vmin=0.2, vmax=0.9, cmap=cm, zorder=50) axes[0].set_xlabel('observed rank in sample set', fontsize=13) axes[0].set_ylabel('rank in complete set', fontsize=13) axes[0].text(0.04, 0.9, r"kendall's $\tau$: {0:.4f}".format( kendalltau(range(1, 101), complete_rank_for_observed_top100)[0]), ha='left', va='top', size=12, transform=axes[0].transAxes) axes[0].plot([0, 100], [100, 100], color='gray', ls='--', lw=1) axes[0].plot([100, 100], [0, 100], color='gray', ls='--', lw=1) axes[0].plot([0, 100], [0, 100], color='gray', ls='--', lw=1) axes[0].set_title('(a)', fontsize=13) sc = axes[1].scatter(range(1, 101), complete_rank_for_estimated_top100, s=30, c=user_sampling_rates_for_estimated_top100, edgecolors='gray', vmin=0.2, vmax=0.9, cmap=cm, 
zorder=50) axes[1].set_xlabel('estimated rank in sample set', fontsize=13) axes[1].plot([0, 100], [100, 100], color='gray', ls='--', lw=1) axes[1].plot([100, 100], [0, 100], color='gray', ls='--', lw=1) axes[1].plot([0, 100], [0, 100], color='gray', ls='--', lw=1) axes[1].text(0.04, 0.9, r"kendall's $\tau$: {0:.4f}".format( kendalltau(range(1, 101), complete_rank_for_estimated_top100)[0]), ha='left', va='top', size=12, transform=axes[1].transAxes) axes[1].set_ylim(axes[0].get_ylim()) axes[1].set_title('(b)', fontsize=13) cb = plt.colorbar(sc, fraction=0.055) cb.set_label(label='user sampling rate', size=13) cb.ax.tick_params(labelsize=11) for ax in axes[:2]: ax.set_xlim([-4, 104]) ax.set_ylim(bottom=-4) ax.set_xticks([0, 50, 100]) ax.set_yticks([0, 50, 100]) ax.tick_params(axis='both', which='major', labelsize=11) timer.stop() plt.tight_layout() plt.savefig('../images/top_entity_rank.pdf', bbox_inches='tight') if not platform.system() == 'Linux': plt.show()
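The ranking experiment above decodes tweet IDs with melt_snowflake and scores estimates with mape, both imported from the project's utilities. The sketches below state the assumptions: a snowflake ID stores a millisecond timestamp in its top bits, offset by the Twitter epoch 1288834974657, and mape is a mean absolute percentage error wrapped in a tuple so that the mape(...)[0] call pattern works; the actual helpers may differ in scaling or return type.

import numpy as np

TWITTER_EPOCH_MS = 1288834974657  # 2010-11-04, start of the snowflake epoch

def melt_snowflake(tweet_id):
    # Split a snowflake tweet ID into (timestamp_ms, machine_id, sequence);
    # only the millisecond timestamp is used in the script above.
    tweet_id = int(tweet_id)
    timestamp_ms = (tweet_id >> 22) + TWITTER_EPOCH_MS
    machine_id = (tweet_id >> 12) & 0x3FF
    sequence = tweet_id & 0xFFF
    return timestamp_ms, machine_id, sequence

def mape(y_true, y_pred):
    # Mean absolute percentage error as a fraction (the real helper may
    # scale to percent), returned as a one-element tuple.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    return (float(np.mean(np.abs(y_pred - y_true) / y_true)),)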