def search():
    """Google-search for video links and wrap each hit in a pytube YouTube object.

    Relies on module-level state: ``query``, ``limit``, ``video_candidates``
    and ``global_advanced_settings``. Returns a list of dicts, each holding
    at least a 'link' key and, on success, a 'pytube_result' key. Items whose
    URL pytube cannot parse are flagged with 'delete_this_item'.
    """
    item_list = list()
    # todo: limit > 0
    while True:
        try:
            # Cooldown between searches to avoid tripping Google's rate limiting.
            time.sleep(global_advanced_settings.getint('search_cooldown'))
            print('searching for: "' + query + '"')
            for url in google_search(query, stop=limit):
                if len(item_list) >= limit:
                    break
                new_item = {'link': url}
                # BUGFIX: the original 'break' only exited the candidate scan
                # loop, so already-known links were processed again anyway.
                # Skip links that already exist among the video candidates.
                if any(existing_candidate['link'] == new_item['link']
                       for existing_candidate in video_candidates):
                    continue
                while True:
                    try:
                        new_item['pytube_result'] = YouTube(new_item['link'])
                        item_list.append(new_item)
                        break
                    except KeyError:
                        print('Pytube failed to initialize (KeyError). trying again in 2 seconds.')
                        time.sleep(2)
                    except URLError:
                        print('Pytube failed to initialize (URLError). trying again in 2 seconds.')
                        time.sleep(2)
                    except exceptions.RegexMatchError:
                        # Not a recognizable YouTube URL; mark for later removal.
                        new_item['delete_this_item'] = True
                        break
            break
        except HTTPError as e:
            if e.code == 503:
                print('------------------------------------------------------------------------------------')
                print('It seems that your IP-address have been flagged by google for unusual activity. ')
                print('They usually put down the flag after some time so try again tomorrow.')
                print('If this is a reoccurring issue, increase the search cooldown under advanced settings')
                print('------------------------------------------------------------------------------------')
                sys.exit()
            else:
                print('Failed to retrieve search results, trying again in 2 seconds: ' + e.msg)
                time.sleep(2)
                continue
        except URLError as e:
            # BUGFIX: URLError has no 'msg' attribute (that is HTTPError-only),
            # so this handler itself raised AttributeError. Use 'reason'.
            print('Failed to retrieve search results, trying again in 2 seconds: ' + str(e.reason))
            time.sleep(2)
            continue
    return item_list
def search(query):
    """Google-search genius.com for *query* and return candidate lyric pages.

    Each element of the returned list is a dict with 'description' (the
    URL slug rejoined with spaces, minus the trailing 'lyrics' word),
    'url' and 'query' keys.
    """
    hits = list(google_search(query + ' site:genius.com', stop=1))

    lyric_links = []
    for candidate in hits:
        # Keep only top-level song pages, e.g. https://genius.com/Artist-title-lyrics
        if candidate.count('/') != 3:
            continue
        path_segment = candidate.split('/')[3]
        if path_segment in ['artists-index', 'discussions']:
            continue
        if path_segment.split('?')[0] in ['songs']:
            continue
        if candidate.count('-') == 0:
            continue
        if 'lyrics' not in candidate:
            continue
        lyric_links.append(candidate)

    results = []
    for candidate in lyric_links:
        slug = candidate.split('/')[-1]
        # Drop the trailing 'lyrics' word and rejoin the slug with spaces.
        description = ' '.join(slug.split('-')[:-1])
        results.append({
            'description': description,
            'url': candidate,
            'query': query
        })
    return results
def search():
    """Google-search for video links and wrap each hit in a pytube YouTube object.

    Relies on module-level state: ``query``, ``limit``, ``video_candidates``
    and ``global_advanced_settings``. Returns a list of dicts, each holding
    at least a 'link' key and, on success, a 'pytube_result' key. Items whose
    URL pytube cannot parse are flagged with 'delete_this_item'.
    """
    item_list = list()
    # todo: limit > 0
    while True:
        try:
            # Cooldown between searches to avoid tripping Google's rate limiting.
            time.sleep(global_advanced_settings.getint('search_cooldown'))
            print('searching for: "' + query + '"')
            for url in google_search(query, stop=limit):
                if len(item_list) >= limit:
                    break
                new_item = {'link': url}
                # BUGFIX: the original 'break' only exited the candidate scan
                # loop, so already-known links were processed again anyway.
                # Skip links that already exist among the video candidates.
                if any(existing_candidate['link'] == new_item['link']
                       for existing_candidate in video_candidates):
                    continue
                while True:
                    try:
                        new_item['pytube_result'] = YouTube(new_item['link'])
                        item_list.append(new_item)
                        break
                    except KeyError:
                        print('Pytube failed to initialize (KeyError). trying again in 2 seconds.')
                        time.sleep(2)
                    except URLError:
                        print('Pytube failed to initialize (URLError). trying again in 2 seconds.')
                        time.sleep(2)
                    except exceptions.RegexMatchError:
                        # Not a recognizable YouTube URL; mark for later removal.
                        new_item['delete_this_item'] = True
                        break
            break
        except HTTPError as e:
            if e.code == 503:
                print('------------------------------------------------------------------------------------')
                print('It seems that your IP-address have been flagged by google for unusual activity. ')
                print('They usually put down the flag after some time so try again tomorrow.')
                print('If this is a reoccurring issue, increase the search cooldown under advanced settings')
                print('------------------------------------------------------------------------------------')
                sys.exit()
            else:
                print('Failed to retrieve search results, trying again in 2 seconds: ' + e.msg)
                time.sleep(2)
                continue
        except URLError as e:
            # BUGFIX: URLError has no 'msg' attribute (that is HTTPError-only),
            # so this handler itself raised AttributeError. Use 'reason'.
            print('Failed to retrieve search results, trying again in 2 seconds: ' + str(e.reason))
            time.sleep(2)
            continue
    return item_list
# Make all emails to lower case and return as list return list({email.casefold() for email in emails}) # Google Search search_queries = ["webbyrå stockholm", "webbyrå göteborg"] # Search queries num_search_limit = 100 # Number of results to retrieve from Google search per search query urls_to_scrape = [] for search_query in search_queries: url_index = 0 for url_result in google_search(search_query, tld="se", num=num_search_limit, stop=num_search_limit, pause=2): # Extract the base url parts = urlsplit(url_result) base_url = "{0.scheme}://{0.netloc}".format(parts) # Check if the url has already been handled. If it has but the new ranking is higher (lower number) delete and replace run_continue = False list_len = len(urls_to_scrape) for i in range(0, list_len): if urls_to_scrape[i]['base_url'] == base_url: if urls_to_scrape[i][ 'search_ranking'] > url_index: # If the new ranking is higher (lower number) - delete the old one del urls_to_scrape[i]
def get_video_to_download(movie, search_suffix, filter_arguments):
    """Search Google (restricted to youtube.com) for *movie*, score the
    resulting videos and return the best candidate item dict (or None).

    movie            -- movie title; brackets and separators are stripped first
    search_suffix    -- extra words appended to the search query
    filter_arguments -- dict with 'video_name_must_contain',
                        'video_name_must_not_contain' and (as used below)
                        'video_name_tag_bonuses' keys
    """

    def scan_response(response):
        # Enrich each search hit with pytube metadata (title, rating, view
        # count, best stream resolution) and track the best resolution seen
        # across all hits in response['max_video_resolution'].
        response['max_video_resolution'] = 0
        for result in response['items']:
            result['delete_this_item'] = False
            video = None
            # Up to 5 attempts; after the third, retry WITHOUT the guards so
            # a persistent failure finally propagates instead of looping.
            for try_count in range(5):
                if try_count > 2:
                    time.sleep(1)
                    video = YouTube(result['link'])
                else:
                    try:
                        video = YouTube(result['link'])
                        break
                    except KeyError:
                        print('Pytube failed to initialize (KeyError). trying again in 10 seconds.')
                        time.sleep(9)
                    except URLError:
                        print('Pytube failed to initialize (URLError). trying again in 10 seconds.')
                        time.sleep(9)
                    except exceptions.RegexMatchError:
                        # Link is not a parseable YouTube URL; drop it later.
                        result['delete_this_item'] = True
                        break
            if result['delete_this_item']:
                continue
            result['youtube_object'] = video
            result['title'] = video.title
            result['avg_rating'] = float(video.player_config_args['avg_rating'])
            result['view_count'] = int(video.player_config_args['view_count'])
            if result['view_count'] < 60:
                # Floor of 60 keeps the (view_count / 60) term in
                # score_response from dropping below 1.
                result['view_count'] = 60
            result['video_resolution'] = 0
            for stream in video.streams.filter(type='video').all():
                try:
                    resolution = int(stream.resolution.replace('p', ''))
                except AttributeError:
                    # Stream without a resolution string.
                    resolution = 0
                if resolution > response['max_video_resolution']:
                    response['max_video_resolution'] = resolution
                if resolution > result['video_resolution']:
                    result['video_resolution'] = resolution
            try:
                # An 'ad_preroll' entry is taken to mean the video shows ads.
                if 'ad_preroll' in video.player_config_args:
                    result['adds_info'] = 'have adds'
                else:
                    result['adds_info'] = 'No adds'
            except ValueError:
                result['adds_info'] = 'No adds'
        return response

    def filter_response(response, arguments):
        # Keep only items whose title satisfies the must-contain and
        # must-not-contain word lists (case-insensitive), and drop items
        # flagged for deletion by scan_response.
        items = list()
        for result in response['items']:
            append_video = True
            if result['delete_this_item']:
                continue
            for word in arguments['video_name_must_contain']:
                if word.lower() not in result['title'].lower():
                    append_video = False
            for word in arguments['video_name_must_not_contain']:
                if word.lower() in result['title'].lower():
                    append_video = False
            if append_video:
                items.append(result)
        response.pop('items')
        response['items'] = items
        return response

    def score_response(response, scoring_arguments):
        # Compute 'true_rating': the average rating damped by view count so
        # barely-viewed videos cannot dominate the ranking.
        for result in response['items']:
            result['true_rating'] = result['avg_rating'] * (1 - 1 / ((result['view_count'] / 60) ** 0.5))
            if result['video_resolution'] < 700:
                # Penalize sub-720p videos.
                result['true_rating'] *= 0.90
                result['view_count'] *= 0.5
            # NOTE(review): the bonus keys are used directly as multipliers,
            # so they are presumably numeric -- confirm against the caller's
            # configuration.
            for bonus in scoring_arguments['video_name_tag_bonuses']:
                for word in scoring_arguments['video_name_tag_bonuses'][bonus]:
                    if word in result['title'].lower():
                        result['true_rating'] *= bonus
                        result['view_count'] *= bonus
                        break
        return response

    # search for movie: strip brackets, then turn separators into spaces.
    search = movie.replace('(', '').replace(')', '').replace('[', '').replace(']', '') + ' ' + search_suffix
    # NOTE(review): the last two replaces read as no-ops (space -> space);
    # presumably they were meant to collapse double spaces -- confirm.
    search = search.replace('.', ' ').replace('_', ' ').replace('-', ' ').replace(' ', ' ').replace(' ', ' ')
    search = str('site:youtube.com ' + search)
    item_list = list()
    # Up to 5 attempts; after the third, run WITHOUT the URLError guard so a
    # persistent network failure finally propagates.
    for attempt in range(5):
        if attempt > 2:
            for url in google_search(search, stop=10):
                item = {'link': url}
                item_list.append(item)
            break
        else:
            try:
                for url in google_search(search, stop=10):
                    item = {'link': url}
                    item_list.append(item)
                break
            except URLError:
                print('Failed to retrieve search results, trying again in 10 seconds')
                time.sleep(10)
                continue
    # Drop the last three search results.
    # NOTE(review): assumes at least 3 hits were returned (IndexError
    # otherwise); presumably the trailing hits are known noise -- confirm.
    item_list.pop()
    item_list.pop()
    item_list.pop()
    search_response = {'items': item_list}
    search_response = scan_response(search_response)
    search_response = filter_response(search_response, filter_arguments)
    # NOTE(review): filter_arguments doubles as the scoring configuration here.
    search_response = score_response(search_response, filter_arguments)

    # select video: find the top score, then among items scoring within 95%
    # of it pick the one with the highest view count.
    selected_movie = None
    top_score = 0
    top_view_count = 0
    for item in search_response['items']:
        print('-----------------------------------------------------------------')
        print(item['title'])
        print(item['adds_info'])
        print(item['video_resolution'])
        print(item['link'])
        print(item['true_rating'])
        print(item['view_count'])
        if item['true_rating'] > top_score:
            top_score = item['true_rating']
    for item in search_response['items']:
        if item['true_rating'] > top_score * 0.95:
            if item['view_count'] > top_view_count:
                top_view_count = item['view_count']
                selected_movie = item
    return selected_movie
def discover_google_plagiat(query):
    """Run a Google search for *query* (search is lazy; nothing is returned)."""
    # Stop at 20 links so the search does not take too long.
    result_links = google_search(query, stop=20)
def get_google_urls(self):
    """Return the Google search results for this instance's query.

    The number of results is capped by ``self.google_search_max_results``.
    """
    return google_search(self.query, stop=self.google_search_max_results)