def crawl_cosmetic(self, scrape_choices, nrpages):
    cosmetic = CosmeticsCrawler('Cosmetics', nrpages)
    sub_sites = {}
    if len(scrape_choices) == 0:
        # No explicit choices given: default to crawling both category groups (assumption).
        scrape_choices = ['product', 'market']
    # for site in ['http://www.cosmeticsdesign.com/', 'http://www.cosmeticsdesign-europe.com/', 'http://www.cosmeticsdesign-asia.com/']:
    for site_url in ['http://www.cosmeticsdesign.com/']:
        site_url = site_url.rstrip('/')  # avoid a double slash when appending the category paths
        for scrape_choice in list(scrape_choices):
            if scrape_choice == 'product':
                sub_sites['Skin-care'] = site_url + '/Product-Categories/Skin-Care'
                sub_sites['Hair-care'] = site_url + '/Product-Categories/Hair-Care'
            if scrape_choice == 'market':
                sub_sites['Market-Trends'] = site_url + '/Market-Trends'
                sub_sites['Brand-Innovation'] = site_url + '/Brand-Innovation'
    print(len(sub_sites))
    for sub_site, sub_site_url in sub_sites.items():
        links = cosmetic.get_pagination_links(sub_site_url)
        for link in links:
            bs = Crawler.read_page(link)
            cosmetic.pages.add(link)
            data = cosmetic.scrape_page_map(sub_site, link, bs)
            cosmetic.bulk_data.append(data)
    bulk(models.client, actions=cosmetic.bulk_data, stats_only=True)
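# Hypothetical sketch (not part of the original source): Crawler.read_page is assumed
# to fetch a URL and return a BeautifulSoup document, since the crawl methods call
# .find()/.findAll() on its result. The actual implementation may differ.
import requests
from bs4 import BeautifulSoup

class Crawler:
    @staticmethod
    def read_page(url):
        # Fetch the page and parse it into a soup object that the scrapers can query.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')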
def get_pagination_links(self, sub_site):
    include_url = urlparse(sub_site).scheme + "://" + urlparse(sub_site).netloc
    links = set()
    url = sub_site
    page_nr = 0
    page_size = 10
    link_count = 0
    while url is not None and link_count < self.nrpages:
        # bs = batch_crawl.read_page(url)
        print(url)
        bs = Crawler.read_page(url)
        box_1_tag = bs.find("div", class_="box_1")
        for link_tag in box_1_tag.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
            if link_tag.attrs['href'] is not None:
                if link_tag.attrs['href'] not in links:
                    if link_tag.attrs['href'].startswith('/'):
                        link = include_url + link_tag.attrs['href']
                    else:
                        link = link_tag.attrs['href']
                    links.add(link)
                    link_count = link_count + 1
        result_count_tag = bs.find("span", class_="result_count")
        if result_count_tag is not None:
            result_count_list = result_count_tag.text.split()
            result_count = int(float(result_count_list[4]))
        else:
            result_count = page_size
        navigation_tag = bs.find(id="navigation")
        if navigation_tag is not None:
            next_tag = navigation_tag.find("span", class_="next")
            if next_tag is not None:
                next_url = include_url + next_tag.find("a").attrs['href']
            else:
                next_url = None
        else:
            page_nr = page_nr + 1
            if page_nr * page_size > result_count:
                next_url = None
            else:
                next_url = sub_site + '/(offset)/{}'.format(page_nr)
        url = next_url
    return links
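# Illustrative usage only (the crawler name, page budget, and listing URL are assumptions):
# collect up to nrpages article links from one paginated listing and print them.
# cosmetic = CosmeticsCrawler('Cosmetics', 20)
# for link in cosmetic.get_pagination_links('http://www.cosmeticsdesign.com/Product-Categories/Skin-Care'):
#     print(link)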
def main():
    headers = {'Accept-Language': 'ja'}
    with Crawler() as crawler:
        html = crawler.get_html('https://beta.atcoder.jp/contests/archive', headers)
        page_count = len(html.cssselect('ul.pagination li'))
        print(page_count)
        for i in range(page_count):
            html = crawler.get_html(
                f'https://beta.atcoder.jp/contests/archive?page={i+1}', headers)
            table = html.cssselect('div.table-responsive table')[0]
            contests = table.cssselect('tr td:nth-child(2) a')
            for contest in contests:
                link = contest.get('href')
                match = re.fullmatch('^/contests/(.+)$', link)
                if match:
                    print(match[1])
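# Hypothetical sketch (assumption, not from the original source): the Crawler used by the
# AtCoder scripts appears to be a different class, a context manager whose get_html returns
# an lxml element, because main() calls cssselect() on the result. A minimal version using
# requests + lxml could look like this.
import requests
import lxml.html

class Crawler:
    def __enter__(self):
        self.session = requests.Session()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.session.close()

    def get_html(self, url, headers=None):
        # Download the page and parse it into an lxml document root.
        response = self.session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return lxml.html.fromstring(response.text)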
def crawl_contagious(self, scrape_choices, nrpages):
    contag = ContagiousCrawler('Contagious', nrpages)
    sub_sites = {}
    site_url = self.hv.CONTAGIOUS_SITE_URL
    for scrape_choice in scrape_choices:
        if scrape_choice == 'newsviews':
            sub_sites['blogs'] = site_url + 'blogs/news-and-views'
    for sub_site, sub_site_url in sub_sites.items():
        if sub_site_url == 'https://www.contagious.com/blogs/news-and-views':
            links = contag.get_pagination_links_blog(sub_site_url)
            for link in links:
                bs = Crawler.read_page(link)
                contag.pages.add(link)
                data = contag.scrape_page_map(sub_site, link, bs)
                contag.bulk_data.append(data)
    bulk(models.client, actions=contag.bulk_data, stats_only=True)
def main():
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler('../log/crawler.log')
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    contest_list = ['abc002']
    with Crawler() as crawler:
        scraper = Scraper(crawler)
        for contest in contest_list:
            db_session = Session()
            try:
                # scraper.crawl_results(contest, db_session)
                scraper.crawl_contest_by_id(contest, db_session)
                db_session.commit()
            except Exception:
                db_session.rollback()
                raise
            finally:
                db_session.close()
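# Hypothetical sketch (assumption): Session in main() is treated as a SQLAlchemy session
# factory (it supports commit/rollback/close). One common way to provide it; the connection
# string is a placeholder only.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///atcoder.db')  # placeholder database URL
Session = sessionmaker(bind=engine)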
def get_pagination_links_pulications(self, sub_site):
    include_url = urlparse(sub_site).scheme + "://" + urlparse(sub_site).netloc
    links = set()
    url = sub_site
    page_nr = 0
    page_size = 10
    link_count = 0
    # while url != None and link_count < self.nrpages:
    bs = Crawler.read_page(url)
    blog_posts_tag = bs.find("section", id="pub-list")  # "div", class_="blog-posts")
    # blog_posts_tag = bs.find("div", class_="blog-posts")
    for link_tag in blog_posts_tag.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
        if link_tag.attrs['href'] is not None:
            if link_tag.attrs['href'] not in links:
                if link_tag.attrs['href'].startswith('/'):
                    link = include_url + link_tag.attrs['href']
                else:
                    link = link_tag.attrs['href']
                links.add(link)
                link_count = link_count + 1
    navigation_tag = bs.find("div", id="pagination-parts")  # bs.find("nav", class_="nav-below")
    if navigation_tag is not None:
        next_tag = navigation_tag.find("span", class_="nav-next")
        if next_tag is not None:
            next_url = next_tag.parent.attrs['href']
        else:
            next_url = None
    else:
        next_url = None
    url = next_url
    return links
def get_pagination_links_blog(self, sub_site):
    include_url = urlparse(sub_site).scheme + "://" + urlparse(sub_site).netloc
    links = set()
    url = sub_site
    page_nr = 0
    page_size = 10
    link_count = 0
    while url is not None and link_count < self.nrpages:
        bs = Crawler.read_page(url)
        # blog_posts_tag = bs.find("section", id_="pub-list")  # "div", class_="blog-posts")
        blog_posts_tag = bs.find("div", id="page-container")
        for link_tag in blog_posts_tag.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
            if link_tag.attrs['href'] is not None:
                if link_tag.attrs['href'] not in links:
                    if link_tag.attrs['href'].startswith('/'):
                        link = include_url + link_tag.attrs['href']
                    else:
                        link = link_tag.attrs['href']
                    links.add(link)
                    link_count = link_count + 1
        navigation_tag = bs.find("div", id="pagination-parts")  # bs.find("nav", class_="nav-below")
        next_url = None
        if navigation_tag is not None:
            next_tag = navigation_tag.find("span", class_="next")
            if next_tag is not None:
                # Extract the href of the "next" anchor from the span markup.
                # next_url = next_tag.parent.attrs['href']
                hrefs = re.findall('<a href="?\'?([^"\'>]*)', str(next_tag))
                if hrefs:
                    next_url = hrefs[0]
        url = include_url + next_url if next_url is not None else None
    return links
def crawl_apf(self, scrape_choices, nrpages):
    apf = AFPCrawler('APF', nrpages)
    sub_sites = {}
    site_url = self.hv.APF_SITE_URL
    for scrape_choice in scrape_choices:
        if scrape_choice == 'blog':
            sub_sites['blog'] = site_url + 'blog'
        if scrape_choice == 'publications':
            sub_sites['publications'] = site_url + 'publications'
    for sub_site, sub_site_url in sub_sites.items():
        if sub_site_url == 'https://apf.org/blog':
            links = apf.get_pagination_links_blog(sub_site_url)
        if sub_site_url == 'https://apf.org/publications':
            # links = apf.get_pagination_links_pulications(sub_site_url)
            links = ('https://apf.org/publications', )
        for link in links:
            bs = Crawler.read_page(link)
            apf.pages.add(link)
            data = apf.scrape_page_map(sub_site, link, bs)
            apf.bulk_data.append(data)
    bulk(models.client, actions=apf.bulk_data, stats_only=True)
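# Hypothetical sketch (assumption, not from the original source): scrape_page_map has to
# produce one action dict per page that elasticsearch.helpers.bulk() can index through
# models.client. The index attribute and the field names ('title', 'text') are illustrative
# placeholders only.
def scrape_page_map(self, sub_site, link, bs):
    # Pull a few basic fields out of the parsed page and wrap them in a bulk action.
    title_tag = bs.find('title')
    return {
        '_index': self.index_name,  # assumed attribute, e.g. 'apf' or 'cosmetics'
        '_source': {
            'sub_site': sub_site,
            'url': link,
            'title': title_tag.text if title_tag is not None else '',
            'text': bs.get_text(separator=' ', strip=True),
        },
    }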