def get_volume_info():
    """Yield ``(rs_id, volume)`` pairs scraped from the top-100 volume table.

    Fetches ``VOLUME_URL``, isolates the HTML table with id
    ``top100_table`` and parses one entry per row, skipping the header row.

    Yields:
        tuple[int, int]: the numeric id extracted from the link in the
        first column, and the volume parsed from the sixth column via
        ``rs_str_to_int``.

    Raises:
        AttributeError: if the table or a row's id link is not found
        (``re.search`` returning ``None``).
    """
    html = read_url(VOLUME_URL)
    table = re.search(r'<table id="top100_table"(.+)</table>', html,
                      re.DOTALL).group(0)
    soup = BeautifulSoup(table)
    # Hoisted out of the loop: the same pattern is applied to every row.
    id_pattern = re.compile(r'id=(\d+)')
    # [1:] skips the header row; each remaining <tr> is one entry.
    for raw_entry in soup.findAll('tr')[1:]:
        # Query the already-parsed row element directly instead of
        # round-tripping it through str() and a fresh BeautifulSoup parse.
        columns = raw_entry.findAll('td')
        # Named 'rs_id' to avoid shadowing the builtin 'id'.
        rs_id = int(id_pattern.search(str(columns[0])).group(1))
        volume = rs_str_to_int(columns[5].contents[0])
        yield rs_id, volume
def get_front_volume_info():
    """Yield ``(rs_id, volume)`` pairs scraped from the front page's top-5 box.

    Fetches ``FRONT_VOLUME_URL``, isolates the ``top5_left`` div's first
    table and parses one entry per row, skipping the header row.

    Yields:
        tuple[int, int]: the numeric id extracted from the link in the
        first column, and the volume parsed from the fourth column via
        ``rs_str_to_int``.

    Raises:
        AttributeError: if the div or a row's id link is not found
        (``re.search`` returning ``None``).
    """
    html = read_url(FRONT_VOLUME_URL)
    top5_left = re.search(r'<div class="top5_left(.+)', html,
                          re.DOTALL).group(0)
    soup = BeautifulSoup(top5_left)
    table = soup.find('table')
    # Hoisted out of the loop: the same pattern is applied to every row.
    id_pattern = re.compile(r'id=(\d+)')
    # [1:] skips the header row; each remaining <tr> is one entry.
    for raw_entry in table.findAll('tr')[1:]:
        # Query the already-parsed row element directly instead of
        # round-tripping it through str() and a fresh BeautifulSoup parse.
        columns = raw_entry.findAll('td')
        # Named 'rs_id' to avoid shadowing the builtin 'id'.
        rs_id = int(id_pattern.search(str(columns[0])).group(1))
        # [:-1] drops the cell's trailing character before the numeric
        # conversion -- presumably a unit/percent suffix; same slice as
        # the original.
        volume = rs_str_to_int(columns[3].contents[0][:-1])
        yield rs_id, volume
def loop_and_parse_indexes():
    """Walk every index page (letters a-z plus 'Other') and parse each one.

    For each letter, pages are fetched in ascending order until either the
    server reports no results ('did not return' appears in the response)
    or the 'Next ><br>' marker appears in the page.

    Side effects: network reads via ``read_url`` and whatever
    ``parse_index`` records; progress is logged at INFO/DEBUG level.
    """
    logging.info('Looping through index pages.')
    letters = list('abcdefghijklmnopqrstuvwxyz')
    letters.append('Other')
    for letter in letters:
        page = 1
        while True:
            content = read_url(get_index_url(letter, page))
            if 'did not return' in content:
                break
            # Lazy %-style arguments: the message is only formatted when
            # DEBUG logging is actually enabled.
            logging.debug("Parsing page %i of letter '%s'", page, letter)
            parse_index(content)
            # NOTE(review): breaking when 'Next ><br>' is PRESENT looks
            # inverted at first glance; presumably the site renders this
            # plain-text marker only on the last page -- confirm against
            # the live HTML before changing.
            if 'Next ><br>' in content:
                break
            page += 1
def get_detail_info_from_id(rs_id):
    """Fetch the detail page for *rs_id* and return its parsed info.

    Thin convenience wrapper: builds the detail URL, downloads it, and
    delegates parsing to ``get_detail_info``.
    """
    return get_detail_info(read_url(get_detail_url(rs_id)))