def parse(self, reg, range_postfix):
    """Collect matches of *reg* across a range of pages and return them as URLs.

    Args:
        reg: Pattern handed unchanged to ``findall`` on each fetched page.
        range_postfix: Iterable of exactly two values ``(start, end)``; pages
            ``start`` up to but not including ``end`` are visited.

    Returns:
        Whatever ``self.url.concat_site_with_bodies`` produces for the
        accumulated matches (presumably absolute URLs -- confirm against
        that helper).
    """
    first, stop = range_postfix
    matches = []
    # One fetch per page index; the progress bar is labelled with self.name.
    for page_index in tqdm(range(first, stop), desc=f'{self.name}'):
        page = Text.create_by_url(self.url.link(page_index))
        matches += page.findall(reg)
    return self.url.concat_site_with_bodies(matches)
def __init__(self, site, url):
    """Remember *url* and *site*, and build the text object for *url*.

    Args:
        site: Stored as ``self.site`` without modification.
        url: Stored as ``self.url`` and passed to ``Text.create_by_url``.

    NOTE(review): ``Text.create_by_url`` is called eagerly here; presumably
    it fetches the page, so constructing an instance may do I/O -- confirm.
    """
    self.url, self.site = url, site
    self.text = Text.create_by_url(url)
def urls(self):
    """Extract the media-preview links from this object's page.

    Fetches ``self.url.url`` through ``Text.create_by_url``, captures the
    ``href`` of every ``media-preview_img-wrap`` anchor nested inside a
    ``content-list_item`` block, and returns the result of
    ``self.url.concat_site_with_bodies`` applied to those captures.
    """
    # Adjacent raw-string pieces are joined by the parser into one pattern;
    # the split is purely for readability.
    item_link_pattern = (
        r'<div class="content-list_item" data-reactid=".*?">'
        r'<div class="content-list_item-info" data-reactid=".*?">'
        r'<div class="media-preview" data-reactid=".*?">'
        r'<a class="media-preview_img-wrap" href="(.*?)"'
    )
    page = Text.create_by_url(self.url.url)
    hrefs = page.findall(item_link_pattern)
    return self.url.concat_site_with_bodies(hrefs)