content_txt = content.text score_idx = content_txt.find(score_str) score_str_len = len(score_str) beg_idx = score_idx + score_str_len end_idx = beg_idx + 2 score = content_txt[beg_idx:end_idx] return score if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab data music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.albumTitle'] album_titles_contents = select_soup(soup, css_selectors) album_titles_lst = list( grab_contents_key(album_titles_contents, 'text').values()) album_titles = album_titles_lst[0] album_title_links = grab_contents_key(album_titles_contents, 'a') album_title_hrefs = grab_contents_key(album_title_links, 'href') final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles) store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
while attribute.find('Other') == -1: values[attribute] = value points_misc_idx += 1 # The value is always the last item present, surrounded by (), and the # 1+ items before that are the attributes to which those points belong. split_text = sum_points_misc_lst[points_misc_idx].split() attribute = ' '.join(split_text[:-1]) value = split_text[-1].replace('(', '').replace(')', '') values[attribute] = value points_misc_idx += 1 return values, points_misc_idx if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', '.summaryPointsMisc'] desired_contents = select_soup(soup, css_selectors) desired_contents_text = grab_contents_key(desired_contents, "text") desired_contents_renamed = rename_keys(desired_contents_text) final_lst = parse_contents(desired_contents_renamed) store_in_mongo(final_lst, 'music', 'music_lists')
content_txt = content.text score_idx = content_txt.find(score_str) score_str_len = len(score_str) beg_idx = score_idx + score_str_len end_idx = beg_idx + 2 score = content_txt[beg_idx:end_idx] return score if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab data music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.albumTitle'] album_titles_contents = select_soup(soup, css_selectors) album_titles_lst = list(grab_contents_key(album_titles_contents, 'text').values()) album_titles = album_titles_lst[0] album_title_links = grab_contents_key(album_titles_contents, 'a') album_title_hrefs = grab_contents_key(album_title_links, 'href') final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles) store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
rating_txt: str Text that potentially holds the rating. idx: int Holds the rating if the text does not. Return: int """ if len(rating_txt) >= 1: rating = int(rating_txt[0].replace('.', '')) else: rating = idx return rating if __name__ == '__main__': lists_url = 'http://www.albumoftheyear.org/lists.php' soup = get_html(lists_url) critics_content = select_soup(soup, '.criticListBlockTitle') critics_names = grab_contents_key(critics_content, "text") critics_links = grab_contents_key(critics_content, 'a') critics_hrefs = grab_contents_key(critics_links, 'href') raw_output = grab_critics_info(critics_names, critics_hrefs) formatted_output = [{"Album Title": k, "Critics Scores": v} for \ k, v in raw_output.items()] store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
---- rating_txt: str Text that potentially holds the rating. idx: int Holds the rating if the text does not. Return: int """ if len(rating_txt) >= 1: rating = int(rating_txt[0].replace('.', '')) else: rating = idx return rating if __name__ == '__main__': lists_url = 'http://www.albumoftheyear.org/lists.php' soup = get_html(lists_url) critics_content = select_soup(soup, '.criticListBlockTitle') critics_names = grab_contents_key(critics_content, "text") critics_links = grab_contents_key(critics_content, 'a') critics_hrefs = grab_contents_key(critics_links, 'href') raw_output = grab_critics_info(critics_names, critics_hrefs) formatted_output = [{"Album Title": k, "Critics Scores": v} for \ k, v in raw_output.items()] store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
content_txt = content.text score_idx = content_txt.find(score_str) score_str_len = len(score_str) beg_idx = score_idx + score_str_len end_idx = beg_idx + 2 score = content_txt[beg_idx:end_idx] return score if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print e raise Exception('<Usage> Input a year to grab data music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.albumTitle'] album_titles_contents = select_soup(soup, css_selectors) album_titles = grab_contents_key(album_titles_contents, 'text').values()[0] album_title_links = grab_contents_key(album_titles_contents, 'a') album_title_hrefs = grab_contents_key(album_title_links, 'href') final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles) store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")