class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        # Despite its name this acts as a setter: it stores a fresh WebCrawler for the given URL.
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)
        return self.crawler.get_soup()
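# A minimal usage sketch for this Builder, assuming get_soup() returns a BeautifulSoup
# object as the rest of the code suggests; the URL below is purely illustrative.
if __name__ == '__main__':
    builder = Builder()
    soup = builder.get_data('https://example.com/some-page')
    print(soup.title)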
from pprint import pprint

# WebCrawler, PL (page-list parser) and PD (page-detail parser) are assumed to come
# from the project's own modules.
class Builder:
    def __init__(self):
        pass

    def set_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.set_crawler(url)
        manga_list = PL(url, self.crawler.get_soup())
        result = manga_list.get_data()
        # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        for k, v in result.items():
            result[k]['page'] = self.get_page_data(result[k]['link'])
        pprint(result)

    def get_page_data(self, _url):
        self.set_crawler(_url)
        manga_page = PD(_url, self.crawler.get_soup())
        return manga_page.get_data()

    def insert_process(self, args):
        # Flatten each entry: top-level fields are copied as-is, the nested 'page'
        # dict is merged in, and its 'details' sub-dict is flattened as well.
        result = {}
        for num, entry in args.items():
            for keys, values in entry.items():
                if keys != 'page':
                    result[keys] = values
                else:
                    for key, value in entry['page'].items():
                        if key == 'details':
                            for k, v in value.items():
                                result[k] = v
                        else:
                            result[key] = value
        # TODO: insert into the DB (import from create_table).
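# The TODO above refers to a create_table module that is not shown here, so the
# following is only a sketch of what the DB insert step could look like using the
# standard-library sqlite3 module; the 'manga' table and its columns are assumptions,
# not the project's actual schema.
import sqlite3

def insert_record(db_path, record):
    """Insert one flattened record (as produced by insert_process) into SQLite."""
    conn = sqlite3.connect(db_path)
    try:
        columns = ', '.join(record.keys())
        placeholders = ', '.join('?' for _ in record)
        conn.execute(
            f'INSERT INTO manga ({columns}) VALUES ({placeholders})',
            tuple(record.values())
        )
        conn.commit()
    finally:
        conn.close()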
class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)
        # manga_list = PL(url, self.crawler.get_soup())
        # result = manga_list.get_data()
        #
        # # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        # for k, v in result.items():
        #     result[k]['page'] = self.get_page_data(result[k]['link'])
        titles = self.crawler.get_soup().find_all("a", href=True)
        for title in titles:
            print(title)
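# For reference, a self-contained illustration of what find_all("a", href=True) returns
# when the soup comes from BeautifulSoup (which get_soup() appears to wrap): only anchors
# that actually carry an href attribute are matched, and each tag exposes it via tag['href'].
# The HTML snippet is made up for the example.
from bs4 import BeautifulSoup

html = '<a href="/manga?id=1">Title A</a><a name="no-href">skipped</a>'
demo_soup = BeautifulSoup(html, 'html.parser')
for tag in demo_soup.find_all('a', href=True):
    print(tag['href'], tag.getText())   # -> /manga?id=1 Title A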
# Requires `import re` at module level plus the project's WebCrawler class.
def get_all_pages(self):
    self.page_soup = []
    # Map each delay category (as displayed on the Japanese status page) to the regex
    # used to separate the line name from the delay label.
    regex = {
        '遅れ(10分未満)': r'遅れ\(\S+分\S+\)',  # delayed (under 10 minutes)
        '遅れ(30分以上)': r'遅れ\(\S+分\S+\)',  # delayed (30 minutes or more)
        '遅れ(10〜30分)': r'遅れ\(\S+分\)',     # delayed (10-30 minutes)
        '止まっている': '止まっている',          # stopped
        '順調': '順調',                          # running normally
        'その他': 'その他',                      # other
        '運転再開': '運転再開'                   # service resumed
    }
    for page_number in range(0, 900, 30):  # 581
        list_page = WebCrawler(self.url + str(page_number))
        list_page_soup = list_page.get_soup()
        tables = list_page_soup.find_all('div', {'class': 'div_table'})
        for table in tables:
            spans = table.find_all('span')
            train = {}
            for span_counter, span in enumerate(spans):
                span = span.getText()
                if span_counter == 0:
                    # First span: line name plus delay category.
                    for key, reg in regex.items():
                        if key in span:
                            train['line'] = re.sub(reg, '', span).replace(' ', '')
                            train['delay'] = re.findall(reg, span)[0]
                elif span_counter == 1:
                    # Second span: departure time and "start → end" stations.
                    train['start_time'] = re.findall(r'\d\d:\d\d', span)[0]
                    tmp = re.findall(r'(\S+) → (\S+)', span)
                    if not tmp:
                        tmp = [(re.findall(r'(\S+) →', span)[0], '')]
                    train['start_station'], train['end_station'] = tmp[0]
                elif span_counter == 2:
                    # Third span: free-text status.
                    train['status'] = span
                elif span_counter == 3:
                    # Fourth span: follow the detail link and scrape its detail table,
                    # translating the Japanese row headers to English keys.
                    a = table.parent.parent.find("a", href=True)
                    a = re.findall(r'id=\d+', a['href'])[0]
                    detail_page_soup = WebCrawler('https://mb.jorudan.co.jp/os/live.cgi?' + a).get_soup()
                    detail_table = detail_page_soup.find('table', {'class': 'detail_table'})
                    trs = detail_table.find_all('tr')
                    english_trad = {
                        '時刻': 'timesOfDay',  # time of day
                        '区間': 'section',     # section
                        '詳細': 'details'      # details
                    }
                    for tr in trs:
                        tds = tr.find_all('td')
                        train[english_trad[tds[0].getText()]] = tds[1].getText().strip()
            self.page_soup.append(train)
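# A standalone illustration of the first-span parsing above: the matching delay regex is
# stripped out to leave the line name, and re.findall recovers the delay label itself.
# The sample string is made up for the example; real spans come from the Jorudan page.
import re

sample = '山手線 遅れ(10分未満)'
pattern = r'遅れ\(\S+分\S+\)'
line_name = re.sub(pattern, '', sample).replace(' ', '')
delay = re.findall(pattern, sample)[0]
print(line_name, delay)   # -> 山手線 遅れ(10分未満)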