def spotify(songids_file, spotify_token, request_rate=0.25, out_file=None, num_processes=1):
    '''Fetch Spotify audio features for every song in songids_file that has a spotify_id.'''
    songids_df = pd.read_json(songids_file)
    # keep only rows with a non-empty spotify id
    songids_df = songids_df.loc[~songids_df.spotify_id.isna()]
    relevant_spotify_ids = songids_df.spotify_id
    # the spotify audio features API accepts up to 100 ids per request
    num_groups = math.ceil(len(relevant_spotify_ids) / 100)
    groups = np.array_split(relevant_spotify_ids, num_groups)
    api_urls = [_construct_spotify_audio_features_url(g) for g in groups]
    headers = utils.create_auth_headers(bearer_token=spotify_token)
    api_s = scraper.APIScraper(request_rate, headers=headers)
    api_results = utils.run_multi_scraper(api_s, api_urls, num_processes)
    songids_df = songids_df.assign(audio_features=lambda x: None)
    features = []
    for res in api_results:
        features.extend(res['audio_features'])
    songids_df.audio_features = features
    if out_file is not None:
        songids_df.to_json(out_file, orient='records')
    else:
        print(songids_df.to_json(orient='records'))
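
# Usage sketch for spotify() - illustrative only; the file paths and token value are
# hypothetical placeholders, not values shipped with this project. songids_file is
# expected to hold records with a `spotify_id` column (presumably the output of
# extern_song_ids() below); rows with an empty spotify_id are dropped before the
# requests are batched 100 ids at a time.
#
#   spotify('songids.json', spotify_token='YOUR_SPOTIFY_BEARER_TOKEN',
#           request_rate=0.25, out_file='audio_features.json', num_processes=4)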
def song_blogs(tm_out_file, request_rate=0.25, out_file=None, num_processes=1):
    tm_df = pd.read_json(tm_out_file)
    item_ids = set()
    for week in tm_df.songs:
        for song in week:
            # each song also carries: artist, title, loved_count, posted_count, time, week
            item_ids.add(song['itemid'])
    print(f'{len(item_ids)} unique songs provided.')
    item_ids = list(item_ids)  # fix an ordering for the ids
    song_blogs_urls = [_construct_song_blogs_url(item_id) for item_id in item_ids]
    s = scraper.APIScraper(request_rate)
    blogs = utils.run_multi_scraper(s, song_blogs_urls, num_processes)
    result_df = pd.DataFrame({'itemid': item_ids, 'blogs': blogs})
    if out_file is not None:
        result_df.to_json(out_file, orient='records')
    else:
        print(result_df.to_json(orient='records'))
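
# Usage sketch for song_blogs() - illustrative; the paths are hypothetical placeholders.
# tm_out_file is presumably the records file written by time_machine() below, since each
# record's `songs` list is walked here to collect the unique item ids.
#
#   song_blogs('time_machine.json', out_file='song_blogs.json', num_processes=4)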
def spotify_genre(songids_file, spotify_token, request_rate=0.25, out_file=None, num_processes=1):
    # TODO: unlikely that the sequential calls to get artist ids and get genres will finish in
    # under an hour, so use a token with longer expiration, or refresh your token between tasks.
    songids_df = pd.read_json(songids_file)
    # get non-empty spotify ids
    songids_df = songids_df.loc[~songids_df.spotify_id.isna()]
    relevant_spotify_ids = songids_df.spotify_id
    # get spotify track urls
    track_api_urls = [_construct_spotify_track_url(id) for id in relevant_spotify_ids]
    headers = utils.create_auth_headers(bearer_token=spotify_token)
    api_s = scraper.APIScraper(request_rate, headers=headers)
    api_results = utils.run_multi_scraper(api_s, track_api_urls, num_processes)
    # get spotify artist urls - just take first artist into account
    artist_ids = [res['artists'][0]['id'] if 'artists' in res else '' for res in api_results]
    # artist_ids = songids_df.artist_id
    artist_api_urls = [_construct_spotify_artist_url(id) for id in artist_ids]
    api_s = scraper.APIScraper(request_rate, headers=headers)
    api_results = utils.run_multi_scraper(api_s, artist_api_urls, num_processes)
    genres = [res['genres'] if len(res) else [] for res in api_results]
    songids_df = songids_df.assign(artist_id=lambda x: None)
    songids_df = songids_df.assign(spotify_genres=lambda x: None)
    songids_df.artist_id = artist_ids
    songids_df.spotify_genres = genres
    if out_file is not None:
        songids_df.to_json(out_file, orient='records')
    else:
        print(songids_df.to_json(orient='records'))
def time_machine(api_key, start_date=None, end_date=None, days_from_start=None,
                 request_rate=0.25, out_file=None, num_processes=1):
    '''Scrape the weekly Time Machine popular charts, walking backwards from start_date to end_date.'''
    date_format = '%b-%d-%Y'  # e.g. May-27-2018
    # resolve the date defaults at call time rather than in the signature, where
    # datetime.now() would only be evaluated once at import
    if start_date is None:
        start_date = datetime.now()
    if end_date is None:
        end_date = datetime.now() - timedelta(days=14)
    if not isinstance(start_date, datetime):
        start_date = datetime.strptime(start_date, date_format)
    if isinstance(days_from_start, int):
        end_date = start_date - timedelta(days_from_start)
    elif not isinstance(end_date, datetime):
        end_date = datetime.strptime(end_date, date_format)
    print(f'start_date: {start_date.strftime(date_format)}; end_date: {end_date.strftime(date_format)}')
    # collect one Monday-anchored week at a time until end_date is reached
    weeks = []
    curr_date = start_date
    while curr_date > end_date:
        if curr_date == start_date and curr_date.weekday() != 0:
            delta = curr_date.weekday()
        else:
            delta = 7
        last_monday = curr_date - timedelta(days=delta)
        weeks.append(last_monday.strftime(date_format))
        curr_date = last_monday
    time_machine_urls = [_construct_time_machine_url(wk) for wk in weeks]
    key_param = f'?key={api_key}' if api_key is not None else None
    s = scraper.APIScraper(request_rate, query_params=key_param)
    songs = utils.run_multi_scraper(s, time_machine_urls, num_processes)
    result_df = pd.DataFrame({'popular_week': weeks, 'songs': songs})
    if out_file is not None:
        result_df.to_json(out_file, orient='records')
    else:
        print(result_df.to_json(orient='records'))
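
# Usage sketch for time_machine() - illustrative; the key and output path are hypothetical
# placeholders. Dates may be passed as datetime objects or as strings in the '%b-%d-%Y'
# format used above (e.g. 'May-27-2018'); days_from_start, if given as an int, overrides
# end_date.
#
#   time_machine('YOUR_API_KEY', start_date='May-27-2018', days_from_start=28,
#                out_file='time_machine.json')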
def genius(songids_file, genius_token, request_rate=0.25, out_file=None, num_processes=1):
    songids_df = pd.read_json(songids_file)
    # get genius ids that have a non-empty corresponding spotify id
    songids_df = songids_df.loc[~songids_df.spotify_id.isna()]
    relevant_genius_ids = songids_df.genius_id
    # get genius song urls
    song_api_urls = [_construct_genius_song_api_url(id) for id in relevant_genius_ids]
    headers = utils.create_auth_headers(bearer_token=genius_token)
    api_s = scraper.APIScraper(request_rate, headers=headers)
    api_results = utils.run_multi_scraper(api_s, song_api_urls, num_processes)
    song_urls = [res['response']['song']['url'] if 'response' in res else '' for res in api_results]
    # scrape the page_data meta tag from each song page
    dom_s = scraper.DOMScraper(request_rate)
    dom_results = utils.run_multi_scraper(dom_s, song_urls, num_processes,
                                          ['meta[itemprop="page_data"]'], 'content')
    songids_df = songids_df.assign(genres=lambda x: None, desc=lambda x: None)
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    genres = []
    descs = []
    regex = r'tag:([^,]+)'
    for content in dom_results:
        if content is None:
            genres.append(None)
            descs.append(None)
            continue
        page_data = json.loads(content)
        sections = page_data['chartbeat']['sections']
        try:
            matches = re.finditer(regex, sections)
            song_genres = [match.group(1) for match in matches]
        except Exception as e:
            print('[WARNING] error getting song genres - setting to None')
            print(e)
            song_genres = None
        genres.append(song_genres)
        try:
            desc = h.handle(page_data['song']['description']['html'])
        except Exception as e:
            print('[WARNING] error getting song desc - setting to None')
            print(e)
            desc = None
        descs.append(desc)
    songids_df.genres = genres
    songids_df.desc = descs  # TODO: desc == '\n\n?\n\n'
    if out_file is not None:
        songids_df.to_json(out_file, orient='records')
    else:
        print(songids_df.to_json(orient='records'))
def extern_song_ids(hypem_songlist_file, spotify_token=None, genius_token=None,
                    request_rate=0.25, out_file=None, num_processes=1):
    songlist_df = pd.read_json(hypem_songlist_file)
    search_queries = []
    for song in songlist_df.itertuples():
        search_queries.append(_get_song_query(song.title, song.artist))
    if spotify_token is not None:
        spotify_search_urls = [_construct_spotify_search_url(query) for query in search_queries]
    if genius_token is not None:
        genius_search_urls = [_construct_genius_search_url(query) for query in search_queries]
    if spotify_token is not None and genius_token is not None:
        # run the spotify and genius searches in parallel
        parallel_params = zip([spotify_search_urls, genius_search_urls],
                              [spotify_token, genius_token])
        pool = Pool(processes=2)
        # TODO: num_processes > 1 raises "AssertionError: daemonic processes are not
        # allowed to have children"
        print('Executing Spotify and Genius scrapes in parallel')
        async_results = [
            pool.apply_async(
                utils.run_multi_scraper,
                (scraper.APIScraper(request_rate,
                                    headers=utils.create_auth_headers(bearer_token=bearer_token)),
                 urls, num_processes))
            for urls, bearer_token in parallel_params
        ]
        pool_results = []
        for res in async_results:
            pool_results.append(res.get())  # `get` is a blocking call
        pool.close()
        spotify_res = pool_results[0]
        genius_res = pool_results[1]
    elif spotify_token is not None:
        headers = utils.create_auth_headers(bearer_token=spotify_token)
        s = scraper.APIScraper(request_rate, headers=headers)
        spotify_res = utils.run_multi_scraper(s, spotify_search_urls, num_processes)
    elif genius_token is not None:
        headers = utils.create_auth_headers(bearer_token=genius_token)
        s = scraper.APIScraper(request_rate, headers=headers)
        genius_res = utils.run_multi_scraper(s, genius_search_urls, num_processes)
    else:
        print('[ERROR] At least one token must be provided: spotify_token or genius_token')
        sys.exit()
    if spotify_token is not None:
        spotify_ids = []
        for res in spotify_res:
            if 'tracks' not in res:
                spotify_ids.append(None)
                continue
            track_items = res['tracks']['items']
            if not len(track_items):
                spotify_ids.append(None)
                continue
            # assumption: take the top search result
            spotify_ids.append(track_items[0]['id'])
    else:
        spotify_ids = [None] * len(songlist_df.itemid)
    if genius_token is not None:
        genius_ids = []
        for res in genius_res:
            if 'response' not in res:
                genius_ids.append(None)
                continue
            hits = res['response']['hits']
            if not len(hits):
                genius_ids.append(None)
                continue
            # assumption: take the top search result
            genius_ids.append(hits[0]['result']['id'])
    else:
        genius_ids = [0] * len(songlist_df.itemid)
    result_df = pd.DataFrame({
        'itemid': songlist_df.itemid,
        'spotify_id': spotify_ids,
        'genius_id': genius_ids
    })
    result_df.genius_id = result_df.genius_id.fillna(0).astype(int)
    if out_file is not None:
        result_df.to_json(out_file, orient='records')
    else:
        print(result_df.to_json(orient='records'))
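
# Usage sketch for extern_song_ids() - illustrative; the paths and tokens are hypothetical
# placeholders. Either token may be omitted, in which case that provider is skipped; when
# both are supplied the two searches run in a two-worker process pool. The output records
# ({itemid, spotify_id, genius_id}) match the songids_file shape expected by spotify(),
# spotify_genre() and genius() above.
#
#   extern_song_ids('hypem_songlist.json',
#                   spotify_token='YOUR_SPOTIFY_BEARER_TOKEN',
#                   genius_token='YOUR_GENIUS_BEARER_TOKEN',
#                   out_file='songids.json')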
def review(bloglist_file, api_key, request_rate=0.25, out_file=None, num_processes=1):
    '''Scrape reviews (article-like pages) from the URLs provided, using the Mercury Web Parser
    (https://mercury.postlight.com/web-parser/).
    '''
    bloglist_df = pd.read_json(bloglist_file)
    bloglist_df.columns = ['url']
    parser_urls = [_construct_mercury_parser_url(url) for url in bloglist_df.url]
    headers = utils.create_auth_headers(api_key=api_key)
    s = scraper.APIScraper(request_rate, headers=headers,
                           res_callback=_handle_article_response)
    results = utils.run_multi_scraper(s, parser_urls, num_processes)
    DetectorFactory.seed = 0  # enforce consistent language detection
    # assumption: no next_page_url handling (assuming 1 page per article)
    updated_results = []
    for i, res in enumerate(results):
        if not res:
            # empty response - substitute a placeholder record
            res = {
                'title': None, 'author': None, 'date_published': None, 'dek': None,
                'lead_image_url': None, 'content': '', 'next_page_url': None,
                'url': None, 'domain': None, 'excerpt': None, 'word_count': 0,
                'direction': None, 'total_pages': None, 'rendered_pages': None
            }
        res['orig_url'] = bloglist_df.loc[i, 'url']
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        try:
            res['content'] = h.handle(res['content'])
        except Exception:
            print('[WARNING] html2text content handling threw an exception - setting content to empty')
            res['content'] = ''
        res['word_count'] = len(res['content'].split())
        try:
            res['lang'] = detect(res['content'])
        except Exception:
            # print('[WARNING] langdetect threw an exception - setting lang to None')
            res['lang'] = None
        updated_results.append(res)
    print('total word count:', sum(res['word_count'] for res in updated_results))
    updated_results_df = pd.DataFrame.from_records(updated_results)
    if out_file is not None:
        updated_results_df.to_json(out_file, orient='records')
    else:
        print(updated_results_df.to_json(orient='records'))
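
# Usage sketch for review() - illustrative; the paths and key are hypothetical placeholders.
# bloglist_file appears to hold a single column of blog-post URLs (it is renamed to 'url'
# above); each URL is fetched through the Mercury parser and the parsed article content is
# flattened to plain text with html2text before language detection.
#
#   review('bloglist.json', api_key='YOUR_MERCURY_PARSER_KEY',
#          out_file='reviews.json', num_processes=4)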