def get_user_profile_watch_videos_from_file(self):
    """
    Read the watch-history file of the current User Profile.

    The file is looked up at
    ``<Config.USER_PROFILES_WATCH_VIDEOS_BASE_DIR>/<self.USER_PROFILE>_watch_history_videos.txt``.

    :return: whatever ``Utils.read_file`` returns for that file (presumably the
             YouTube Video IDs the User Profile will watch to build its watch
             history -- confirm against ``Utils.read_file``)
    """
    watch_history_filename = '{0}/{1}_watch_history_videos.txt'.format(
        Config.USER_PROFILES_WATCH_VIDEOS_BASE_DIR,
        self.USER_PROFILE,
    )
    return Utils.read_file(filename=watch_history_filename)
def get_video_label(self, video_id):
    """
    Return the classification label of the given YouTube Video stored in MongoDB.

    The video is looked up by its ``id`` in ``self.audit_framework_videos_col``.
    When the video is missing or carries no ``classification`` entry the process
    is terminated, since the callers cannot continue without a label.

    :param video_id: YouTube Video Id
    :return: the value of ``classification.classification_category`` of the video
    :raises SystemExit: with a non-zero status when the video is not classified
    """
    video_details = self.audit_framework_videos_col.find_one({'id': video_id}, {'classification': 1})

    # find_one() returns None for an unknown video; treat that like "not classified"
    if video_details and Utils.key_exists(video_details, 'classification'):
        return video_details['classification']['classification_category']

    print('[VIDEO: {}] Video not classified yet. Exiting...'.format(video_id))
    # This is a failure, so report it with a non-zero exit status
    sys.exit(1)
def get_user_proxy_server(self):
    """
    Look up the proxy server configured for the current User Profile.

    The User Profiles are read from the JSON file ``Config.USER_PROFILES_INFO_FILENAME``
    and matched on their ``nickname`` against ``self.USER_PROFILE``.

    :return: the ``proxy`` value of the first matching User Profile entry
    :raises SystemExit: with status ``errno.ECANCELED`` when no entry matches
    """
    all_profiles = Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME)

    # Lazily pick the first entry whose nickname matches this User Profile
    matching_profile = next(
        (profile for profile in all_profiles if profile['nickname'] == self.USER_PROFILE),
        None,
    )
    if matching_profile is not None:
        return matching_profile['proxy']

    print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
    sys.exit(errno.ECANCELED)
def get_video_duration(self, video_id):
    """
    Return the duration, in seconds, of the given YouTube Video.

    :param video_id: YouTube Video Id
    :return: the result of ``Utils.convert_youtube_video_duration_to_seconds`` applied
             to the ``contentDetails.duration`` field of the downloaded metadata
    """
    # Fetch the metadata only; recommended videos are not needed here
    metadata = self.YOUTUBE_DOWNLOADER.download_video_metadata(
        video_id=video_id,
        retrieve_recommended_videos=False,
    )
    raw_duration = metadata['contentDetails']['duration']

    # Convert the raw duration value to seconds
    return Utils.convert_youtube_video_duration_to_seconds(video_duration=raw_duration)
def analyze_audit_experiments(self):
    """
    Incrementally analyze the repetitions of the YouTube Search Audit experiments.

    For every User Profile in ``self.USER_PROFILES`` and every Search Term in
    ``self.CONSIDERED_SEARCH_TERMS`` the stored experiment is read from MongoDB and,
    for each N in ``1..Config.AUDIT_SEARCH_RESULTS_THRESHOLD``, the top N crawled
    videos of every repetition are summarized (total / unique videos seen and
    pseudoscience videos found, plus the corresponding percentages). The list of
    per-N summaries is stored in the ``experiment_analysis`` field of the record.

    Records that already have an ``experiment_analysis`` are skipped. The whole
    analysis stops at the first User Profile / Search Term without a stored experiment.

    :return: None
    """
    # Labels are cached for the duration of this call: the same video appears in
    # every top-N prefix, and each label lookup is a MongoDB query
    video_labels = dict()
    max_top_videos = Config.AUDIT_SEARCH_RESULTS_THRESHOLD

    # Iterate each User Profile and calculate its plot values
    for USER_PROFILE in self.USER_PROFILES:
        print('\n--- Analyzing results for USER PROFILE: {}\n'.format(USER_PROFILE))

        # Iterate through the search terms for each User Profile
        for SEARCH_TERM in self.CONSIDERED_SEARCH_TERMS:
            print('\n--- [{}] Analyzing results for SEARCH TERM {}'.format(USER_PROFILE, SEARCH_TERM))

            # The same filter is used to read the experiment and to store its analysis
            experiment_filter = {
                '$and': [
                    {'user_profile_type': USER_PROFILE},
                    {'search_term': SEARCH_TERM}
                ]
            }
            curr_user_search_term_exp_details = self.audit_framework_youtube_search_col.find_one(
                experiment_filter,
                {'experiment_details': 1, 'experiment_analysis': 1}
            )
            if not curr_user_search_term_exp_details:
                print('[{}] YouTube Search Experiment for SEARCH TERM {} has a problem'.format(USER_PROFILE, SEARCH_TERM))
                return None
            if Utils.key_exists(curr_user_search_term_exp_details, 'experiment_analysis'):
                print('[{}] Incremental analysis for the current YouTube Search Experiment for SEARCH TERM {} already performed'.format(USER_PROFILE, SEARCH_TERM))
                continue

            curr_search_term_experiment_analysis = list()
            with tqdm(total=max_top_videos) as progress_bar:
                for n_top_search_results_videos in range(1, max_top_videos + 1):
                    pseudoscience_videos_found = list()
                    all_videos_seen = list()

                    # Iterate the top N videos of each Experiment Repetition
                    for experiment_repetition in curr_user_search_term_exp_details['experiment_details']:
                        for video_id in experiment_repetition['CRAWLED_VIDEOS'][:n_top_search_results_videos]:
                            all_videos_seen.append(video_id)

                            # Get Video Label (only queried once per video)
                            if video_id not in video_labels:
                                video_labels[video_id] = self.get_video_label(video_id=video_id)
                            if video_labels[video_id] == 'pseudoscience':
                                pseudoscience_videos_found.append(video_id)

                    # Calculate analysis results for the current number of search results
                    total_seen = len(all_videos_seen)
                    total_unique_seen = len(set(all_videos_seen))
                    total_pseudoscience = len(pseudoscience_videos_found)
                    total_unique_pseudoscience = len(set(pseudoscience_videos_found))

                    search_term_experiment_analysis = {
                        'total_videos_seen': total_seen,
                        'total_unique_videos_seen': total_unique_seen,
                        'pseudoscience_videos_found': pseudoscience_videos_found,
                        'total_pseudoscience_videos_found': total_pseudoscience,
                        'total_unique_pseudoscience_videos_found': total_unique_pseudoscience,
                        # Report 0% instead of dividing by zero when nothing was crawled
                        'average_pseudoscience_videos_total':
                            (total_pseudoscience / total_seen) * 100 if total_seen else 0.0,
                        'average_pseudoscience_videos_unique':
                            (total_unique_pseudoscience / total_unique_seen) * 100 if total_unique_seen else 0.0,
                    }

                    # Add to the list with all the results
                    curr_search_term_experiment_analysis.append(search_term_experiment_analysis)
                    progress_bar.update(1)

            # Insert YouTube Search Audit Analysis results into MongoDB
            self.audit_framework_youtube_search_col.update_one(
                experiment_filter,
                {'$set': {'experiment_analysis': curr_search_term_experiment_analysis}}
            )
    return
def analyze_audit_experiments(self):
    """
    Incrementally analyze the repetitions of the YouTube Homepage Audit experiments.

    For every User Profile in ``self.USER_PROFILES`` the stored experiment with
    ``Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS`` repetitions is read from MongoDB and,
    for each N in ``1..Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD``, the top N crawled
    videos of every repetition are summarized (total / unique videos seen and
    pseudoscience videos found, plus the corresponding percentages). The list of
    per-N summaries is stored in the ``experiment_analysis`` field of the record.

    Records that already have an ``experiment_analysis`` are skipped. The whole
    analysis stops at the first User Profile without a stored experiment.

    :return: None
    """
    # Labels are cached for the duration of this call: the same video appears in
    # every top-N prefix, and each label lookup is a MongoDB query
    video_labels = dict()
    max_top_videos = Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD

    # Iterate each User Profile and calculate its plot values
    for USER_PROFILE in self.USER_PROFILES:
        print('\n--- Analyzing results for USER PROFILE: {}'.format(USER_PROFILE))

        # The same filter is used to read the experiment and to store its analysis,
        # so the analysis always lands on a record matching what was analyzed
        experiment_filter = {
            'user_profile_type': USER_PROFILE,
            'total_repetitions': Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS
        }
        experiment_details = self.audit_framework_youtube_homepage_col.find_one(
            experiment_filter,
            {'experiment_details': 1, 'experiment_analysis': 1}
        )

        # Ensure that the results exist and the analysis has not already been performed
        if not experiment_details:
            print('\n[{}] YouTube Homepage Audit Experiment results cannot be found in MongoDB'.format(USER_PROFILE))
            return None
        if Utils.key_exists(experiment_details, 'experiment_analysis'):
            print('\n[{}] Incremental analysis for the current Homepage Experiment already performed'.format(USER_PROFILE))
            continue

        incremental_homepage_experiment_analysis = list()
        with tqdm(total=max_top_videos) as progress_bar:
            for n_top_homepage_videos in range(1, max_top_videos + 1):
                pseudoscience_videos_found = list()
                all_videos_seen = list()

                # Iterate the top N videos of each Experiment Repetition
                for experiment_repetition in experiment_details['experiment_details']:
                    for video_id in experiment_repetition['CRAWLED_VIDEOS'][:n_top_homepage_videos]:
                        all_videos_seen.append(video_id)

                        # Get Video Label (only queried once per video)
                        if video_id not in video_labels:
                            video_labels[video_id] = self.get_video_label(video_id=video_id)
                        if video_labels[video_id] == 'pseudoscience':
                            pseudoscience_videos_found.append(video_id)

                # Calculate analysis results for the current number of homepage videos
                total_seen = len(all_videos_seen)
                total_unique_seen = len(set(all_videos_seen))
                total_pseudoscience = len(pseudoscience_videos_found)
                total_unique_pseudoscience = len(set(pseudoscience_videos_found))

                homepage_experiment_analysis = {
                    'total_videos_seen': total_seen,
                    'total_unique_videos_seen': total_unique_seen,
                    'pseudoscience_videos_found': pseudoscience_videos_found,
                    'total_pseudoscience_videos_found': total_pseudoscience,
                    'total_unique_pseudoscience_videos_found': total_unique_pseudoscience,
                    # Report 0% instead of dividing by zero when nothing was crawled
                    'average_pseudoscience_videos_total':
                        (total_pseudoscience / total_seen) * 100 if total_seen else 0.0,
                    'average_pseudoscience_videos_unique':
                        (total_unique_pseudoscience / total_unique_seen) * 100 if total_unique_seen else 0.0,
                }

                # Add to the list with all the results
                incremental_homepage_experiment_analysis.append(homepage_experiment_analysis)
                progress_bar.update(1)

        # Insert YouTube Homepage Audit Analysis results into MongoDB
        self.audit_framework_youtube_homepage_col.update_one(
            experiment_filter,
            {'$set': {'experiment_analysis': incremental_homepage_experiment_analysis}}
        )
    return
def crawl_watch_youtube_video(self, video_id, hop_number):
    """
    Load the page of the given YouTube Video, store its metadata together with its
    top recommended videos in MongoDB and, when requested, watch the video.

    The video is watched only when ``Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO`` is set
    and ``hop_number`` is lower than 5. In that case the method sleeps for
    ``Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO_PERCENTAGE`` percent of the video
    duration, minus the time already spent crawling the page.

    :param video_id: YouTube Video Id
    :param hop_number: number of the current hop of the random walk
    :return: the metadata of the video (including ``relatedVideos`` and ``updatedAt``),
             or None when the video is a livestream or its metadata cannot be retrieved
    :raises SystemExit: with status 1 when a personalized User Profile is not authenticated
    """
    # Find whether we should watch the video or not
    watch_curr_video = bool(Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO and hop_number < 5)

    # Load YouTube Video Page
    self.driver.get('https://www.youtube.com/watch?v={}&autoplay=1'.format(video_id))
    time.sleep(3)

    # Check if user is Authenticated before proceeding
    if self.USER_PROFILE != 'NO_PERSONALIZATION' and not self.is_user_authenticated():
        # Raise SystemExit directly: the exit() builtin is only injected by the
        # site module and is not guaranteed to exist
        raise SystemExit(1)

    # Check if LiveStream
    try:
        livestream_info = self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string'))).text
    except TimeoutException:
        # Element not found in time; continue as if the video is not a livestream
        pass
    else:
        if 'started streaming' in livestream_info.lower():
            print('[VIDEO: {}] is a LIVESTREAM. Skipping and choosing another video...'.format(video_id))
            return None

    # Start by Watching the Video: click the first of the two candidate buttons
    # that becomes clickable (best effort, continue if neither does)
    if watch_curr_video:
        play_button_xpaths = (
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[4]/button',
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[5]/button',
        )
        for play_button_xpath in play_button_xpaths:
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, play_button_xpath))).click()
                break
            except TimeoutException:
                continue

    # Keep the time needed to crawl the video details
    video_crawl_started = time.time()
    self.driver.execute_script("window.scrollTo(0, 800)")

    #
    # DOWNLOAD VIDEO METADATA
    #
    # Check if Video already exists in MongoDB
    video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
    video_exists = bool(video_metadata)
    if not video_exists:
        # Get Video Metadata using YouTube Data API
        video_metadata = self.get_video_metadata(video_id=video_id, retrieve_recommended_videos=False)
        if video_metadata is None:
            return None

        # Add Video Annotation information
        video_metadata['classification'] = dict()
        video_metadata['classification']['classification_category'] = None

    #
    # GET RELATED VIDEOS (no matter if the video exists or not)
    #
    print('--- [VIDEO: {}] GETTING TOP {} RECOMMENDED VIDEOS...'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD))
    related_videos_list = list()
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was removed in Selenium 4
    related_videos_items = self.driver.find_elements(By.XPATH, '//*[@id="thumbnail"]')
    for related_video_item in related_videos_items:
        try:
            # Keep only the value of the "v" parameter of the thumbnail link
            related_video_id = related_video_item.get_attribute('href').split('v=')[1].split('&')[0]
        except (AttributeError, IndexError):
            # Thumbnail without a link, or a link that does not point to a video
            continue
        related_videos_list.append(related_video_id)
        if Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD == len(related_videos_list):
            break
    print('--- [VIDEO: {0}] TOP {1} RECOMMENDED VIDEOS: {2}'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD, related_videos_list))
    video_metadata['relatedVideos'] = related_videos_list
    video_metadata['updatedAt'] = str(dt.now())

    # STORE VIDEO INFORMATION IN MongoDB
    if not video_exists:
        # Insert Video Details in MongoDB
        self.audit_framework_videos_col.insert_one(video_metadata)
    else:
        # Update Video Details in MongoDB
        self.audit_framework_videos_col.replace_one({'id': video_id}, video_metadata, upsert=True)

    # WATCH VIDEO
    if watch_curr_video:
        # Calculate Video Crawl Duration
        video_crawl_duration_sec = time.time() - video_crawl_started

        # Read Video Duration
        video_duration_sec = Utils.convert_youtube_video_duration_to_seconds(video_duration=video_metadata['contentDetails']['duration'])

        # Calculate the final watch time: the configured percentage of the video,
        # minus the time already spent on the page while crawling
        final_video_duration_sec = int((video_duration_sec * Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO_PERCENTAGE) / 100)
        # Never go below zero: time.sleep() raises ValueError for negative values,
        # which happens when crawling took longer than the intended watch time
        final_video_duration_sec = max(0, final_video_duration_sec - video_crawl_duration_sec)

        print('[{0}] - Sleeping for {1} seconds to watch the full VIDEO: {2}'.format(dt.now().strftime("%d-%m-%Y %H:%M:%S"), final_video_duration_sec, video_id))
        time.sleep(final_video_duration_sec)
    return video_metadata
def analyze_audit_experiments(self, random_walks_starting_hop=0):
    """
    Analyze the random walks of the YouTube Video Recommendations audit experiments.

    For every User Profile in ``self.USER_PROFILES`` and every Search Term in
    ``self.CONSIDERED_SEARCH_TERMS`` the stored random walks are read from MongoDB.
    The pseudoscience videos found at each hop (``0..Config.AUDIT_RANDOM_WALKS_MAX_HOPS``)
    are collected and, for each hop from ``random_walks_starting_hop`` onwards, the
    percentage of unique pseudoscience videos over all unique videos seen from the
    starting hop up to that hop is calculated. Hops before the starting hop keep a
    percentage of 0.0. The results are stored in the ``random_walks_analysis`` field.

    Records that already have a ``random_walks_analysis`` are skipped. The whole
    analysis stops at the first User Profile / Search Term without stored random walks.

    :param random_walks_starting_hop: first hop considered for the percentages
    :return: None
    """
    total_hops = Config.AUDIT_RANDOM_WALKS_MAX_HOPS + 1

    # Iterate each User Profile and calculate its plot values
    for USER_PROFILE in self.USER_PROFILES:
        print('\n--- Analyzing Random Walks for USER PROFILE: {}'.format(USER_PROFILE))

        # The context manager closes the progress bar on every exit path
        with tqdm(total=len(self.CONSIDERED_SEARCH_TERMS)) as progress_bar:
            for SEARCH_TERM in self.CONSIDERED_SEARCH_TERMS:
                total_pseudoscience_videos_found = 0

                # Get Random Walk details for the current User Profile and Search Term
                random_walks_filter = {
                    '$and': [
                        {'user_profile_type': USER_PROFILE},
                        {'seed_search_term_topic': SEARCH_TERM}
                    ]
                }
                curr_random_walk_details = self.audit_framework_youtube_video_recommendations.find_one(
                    random_walks_filter,
                    {'random_walks_details': 1, 'random_walks_analysis': 1}
                )

                # Ensure that Random Walks have been performed and not yet analyzed
                if not curr_random_walk_details:
                    print('--- [{}] Personalized Random Walks for SEARCH TERM {} NOT PERFORMED'.format(USER_PROFILE, SEARCH_TERM))
                    return None
                if Utils.key_exists(curr_random_walk_details, 'random_walks_analysis'):
                    print('--- [{}] Analysis of Personalized Random Walks for SEARCH TERM {} ALREADY PERFORMED'.format(USER_PROFILE, SEARCH_TERM))
                    progress_bar.update(1)
                    continue

                hops_pseudoscience_videos_found = [list() for _ in range(total_hops)]
                hops_all_videos_found = [list() for _ in range(total_hops)]

                # Collect, per hop, all the videos and the pseudoscience videos of each Random Walk
                for random_walk in curr_random_walk_details['random_walks_details']:
                    for hop_cntr in range(total_hops):
                        hop_video_id = random_walk['hop_{}'.format(hop_cntr)]['video_id']
                        hops_all_videos_found[hop_cntr].append(hop_video_id)

                        # Get video label
                        if self.get_video_label(video_id=hop_video_id) == 'pseudoscience':
                            hops_pseudoscience_videos_found[hop_cntr].append(hop_video_id)
                            total_pseudoscience_videos_found += 1

                # Calculate the percentage of unique PSEUDOSCIENCE videos our Random Walker
                # has found up to each Hop, over all unique videos seen so far in the Walk.
                # range() accepts positional arguments only, and the unique videos are
                # accumulated across hops instead of being rebuilt for every hop.
                hops_pseudoscience_videos_found_perc = [0.0] * total_hops
                unique_videos_so_far = set()
                unique_pseudoscience_so_far = set()
                for hop_cntr in range(random_walks_starting_hop, total_hops):
                    unique_videos_so_far.update(hops_all_videos_found[hop_cntr])
                    unique_pseudoscience_so_far.update(hops_pseudoscience_videos_found[hop_cntr])

                    # Leave 0.0 when no videos were seen, instead of dividing by zero
                    if unique_videos_so_far:
                        hops_pseudoscience_videos_found_perc[hop_cntr] = (
                            len(unique_pseudoscience_so_far) / float(len(unique_videos_so_far))
                        ) * 100

                # Insert YouTube Video Recommendations Audit (Random Walks) Analysis results into MongoDB
                random_walks_analysis_results = {
                    'total_pseudoscience_videos_found': total_pseudoscience_videos_found,
                    'hops_pseudoscience_videos_found': hops_pseudoscience_videos_found,
                    'hops_pseudoscience_videos_found_perc': hops_pseudoscience_videos_found_perc,
                }
                self.audit_framework_youtube_video_recommendations.update_one(
                    random_walks_filter,
                    {'$set': {'random_walks_analysis': random_walks_analysis_results}}
                )
                progress_bar.update(1)
    return