Example #1
    def crawl_cosmetic(self, scrape_choices, nrpages):
        cosmetic = CosmeticsCrawler('Cosmetics', nrpages)
        sub_sites = {}
        if len(scrape_choices) == 0:
            # nothing selected: assume every category should be crawled
            scrape_choices = ['product', 'market']

    #   for site in ['http://www.cosmeticsdesign.com/', 'http://www.cosmeticsdesign-europe.com/', 'http://www.cosmeticsdesign-asia.com/']:
        for site_url in ['http://www.cosmeticsdesign.com/']:
            for scrape_choice in list(scrape_choices):
                if scrape_choice == 'product':
                    sub_sites['Skin-care'] = site_url + '/Product-Categories/Skin-Care'
                    sub_sites['Hair-care'] = site_url + '/Product-Categories/Hair-Care'
                if scrape_choice == 'market':
                    sub_sites['Market-Trends'] = site_url + '/Market-Trends'
                    sub_sites['Brand-Innovation'] = site_url + '/Brand-Innovation'
        print(len(sub_sites))
        for sub_site, sub_site_url in sub_sites.items():
            links = cosmetic.get_pagination_links(sub_site_url)
            for link in links:
                bs = Crawler.read_page(link)
                cosmetic.pages.add(link)
                data = cosmetic.scrape_page_map(sub_site, link, bs)
                cosmetic.bulk_data.append(data)

        bulk(models.client, actions=cosmetic.bulk_data, stats_only=True)
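The bulk call above hands cosmetic.bulk_data straight to elasticsearch.helpers.bulk, so scrape_page_map is expected to return one action dictionary per page. A minimal sketch of that action shape (index name and document fields are assumptions, not taken from the project above):

# Hypothetical sketch of the action format consumed by elasticsearch.helpers.bulk.
# The index name and document fields below are assumptions for illustration only.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch()

actions = [
    {
        "_index": "cosmetics",  # assumed index name
        "_source": {
            "sub_site": "Skin-care",
            "url": "http://www.cosmeticsdesign.com/Product-Categories/Skin-Care",
            "text": "scraped article body ...",
        },
    },
]

# stats_only=True makes bulk() return only success/error counts, as in the example
bulk(client, actions=actions, stats_only=True)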
Example #2
    def get_pagination_links(self, sub_site):
        # only follow links on the same scheme + host as the sub site
        include_url = urlparse(sub_site).scheme + "://" + urlparse(sub_site).netloc
        links = set()
        url = sub_site
        page_nr = 0
        page_size = 10
        link_count = 0
        while url is not None and link_count < self.nrpages:
            # bs = batch_crawl.read_page(url)
            print(url)
            bs = Crawler.read_page(url)
            box_1_tag = bs.find("div", class_="box_1")
            # collect links that are either relative or point at the same host
            for link_tag in box_1_tag.findAll(
                    "a", href=re.compile("^(/|.*" + include_url + ")")):
                if link_tag.attrs['href'] is not None:
                    if link_tag.attrs['href'] not in links:
                        if link_tag.attrs['href'].startswith('/'):
                            link = include_url + link_tag.attrs['href']
                        else:
                            link = link_tag.attrs['href']
                        links.add(link)
                        link_count += 1
            # total result count, used as a fallback when there is no "next" widget
            result_count_tag = bs.find("span", class_="result_count")
            if result_count_tag is not None:
                result_count_list = result_count_tag.text.split()
                result_count = int(float(result_count_list[4]))
            else:
                result_count = page_size
            navigation_tag = bs.find(id="navigation")
            if navigation_tag is not None:
                next_tag = navigation_tag.find("span", class_="next")
                if next_tag is not None:
                    next_url = include_url + next_tag.find("a").attrs['href']
                else:
                    next_url = None
            else:
                # no navigation widget: fall back to offset-based pagination
                page_nr = page_nr + 1
                if page_nr * page_size > result_count:
                    next_url = None
                else:
                    next_url = sub_site + '/(offset)/{}'.format(page_nr)
            url = next_url
        return links
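Most of these snippets call Crawler.read_page(url) and then navigate the result with BeautifulSoup's find/findAll. That helper is not shown on this page; a minimal sketch of what it might look like, assuming requests for HTTP and bs4 for parsing (both assumptions about the underlying project):

# Hypothetical sketch of the Crawler.read_page helper the examples rely on.
# The requests + BeautifulSoup combination is an assumption, not confirmed by the source.
import requests
from bs4 import BeautifulSoup


class Crawler:

    @staticmethod
    def read_page(url):
        # fetch the page and return a parsed BeautifulSoup tree
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")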
Example #3
def main():
    headers = {'Accept-Language': 'ja'}
    with Crawler() as crawler:
        html = crawler.get_html('https://beta.atcoder.jp/contests/archive',
                                headers)
        page_count = len(html.cssselect('ul.pagination li'))
        print(page_count)

        for i in range(page_count):
            html = crawler.get_html(
                f'https://beta.atcoder.jp/contests/archive?page={i+1}',
                headers)
            table = html.cssselect('div.table-responsive table')[0]
            contests = table.cssselect('tr td:nth-child(2) a')
            for contest in contests:
                link = contest.get('href')
                match = re.fullmatch('^/contests/(.+)$', link)
                if match:
                    print(match[1])
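Example #3 drives a different Crawler: get_html is used inside a context manager and the result answers cssselect queries, which is lxml's API. A minimal sketch of such a helper, assuming requests for HTTP and lxml.html for parsing (neither confirmed by the source):

# Hypothetical sketch of the context-managed Crawler assumed by Example #3.
# Library choices (requests + lxml.html) are assumptions for illustration.
import requests
import lxml.html


class Crawler:

    def __enter__(self):
        self.session = requests.Session()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.session.close()

    def get_html(self, url, headers=None):
        # fetch the page and return an lxml document that supports cssselect
        response = self.session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return lxml.html.fromstring(response.text)

Note that the cssselect package must be installed for the .cssselect() calls in Example #3 to work on lxml elements.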
Example #4
    def crawl_contagious(self, scrape_choices, nrpages):
        contag = ContagiousCrawler('Contagious', nrpages)
        sub_sites = {}
        site_url = self.hv.CONTAGIOUS_SITE_URL
        for scrape_choice in scrape_choices:
            if scrape_choice == 'newsviews':
                sub_sites['blogs'] = site_url + 'blogs/news-and-views'

        for sub_site, sub_site_url in sub_sites.items():
            # links would otherwise be undefined for any other sub-site URL
            links = []
            if sub_site_url == 'https://www.contagious.com/blogs/news-and-views':
                links = contag.get_pagination_links_blog(sub_site_url)

            for link in links:
                bs = Crawler.read_page(link)
                contag.pages.add(link)
                data = contag.scrape_page_map(sub_site, link, bs)
                contag.bulk_data.append(data)
        bulk(models.client, actions=contag.bulk_data, stats_only=True)
Example #5
def main():
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler('../log/crawler.log')
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    contest_list = ['abc002']
    with Crawler() as crawler:
        scraper = Scraper(crawler)
        for contest in contest_list:
            db_session = Session()
            try:
                # scraper.crawl_results(contest, db_session)
                scraper.crawl_contest_by_id(contest, db_session)
                db_session.commit()
            except Exception:
                db_session.rollback()
                raise
            finally:
                db_session.close()
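Example #5 assumes a Session factory bound to the project's database; its setup is not part of the snippet. A typical SQLAlchemy factory, with the connection URL as a placeholder assumption:

# Hypothetical sketch of the Session factory used by Example #5.
# The connection string is a placeholder; the real project configures it elsewhere.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///atcoder.db')  # assumed database URL
Session = sessionmaker(bind=engine)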
Example #6
    def get_pagination_links_pulications(self, sub_site):
        include_url = urlparse(sub_site).scheme + "://" + urlparse(
            sub_site).netloc
        links = set()
        url = sub_site
        page_nr = 0
        page_size = 10
        link_count = 0

        # while url != None and link_count < self.nrpages:
        bs = Crawler.read_page(url)
        print(bs)
        print(type(bs))
        blog_posts_tag = bs.find("section",
                                 id="pub-list")  # "div", class_="blog-posts")
        # blog_posts_tag = bs.find("div", class_="blog-posts")

        for link_tag in blog_posts_tag.findAll(
                "a", href=re.compile("^(/|.*" + include_url + ")")):
            print(link_tag)
            if link_tag.attrs['href'] is not None:
                if link_tag.attrs['href'] not in links:
                    if link_tag.attrs['href'].startswith('/'):
                        link = include_url + link_tag.attrs['href']
                    else:
                        link = link_tag.attrs['href']
                    links.add(link)
                    link_count = link_count + 1

        navigation_tag = bs.find(
            "div", id="pagination-parts")  # bs.find("nav", class_="nav-below")
        if navigation_tag is not None:
            next_tag = navigation_tag.find("span", class_="nav-next")
            if next_tag is not None:
                next_url = next_tag.parent.attrs['href']
            else:
                next_url = None
        else:
            # without this branch next_url would be undefined below
            next_url = None
        url = next_url

        return links
Example #7
    def get_pagination_links_blog(self, sub_site):
        include_url = urlparse(sub_site).scheme + "://" + urlparse(sub_site).netloc
        links = set()
        url = sub_site
        page_nr = 0
        page_size = 10
        link_count = 0
        while url != None and link_count < self.nrpages:
            bs = Crawler.read_page(url)

            # blog_posts_tag = bs.find("section", id_ = "pub-list") # "div", class_="blog-posts")
            blog_posts_tag = bs.find("div", id="page-container")

            for link_tag in blog_posts_tag.findAll("a", href=re.compile("^(/|.*" + include_url + ")")):
                if link_tag.attrs['href'] is not None:
                    if link_tag.attrs['href'] not in links:
                        if link_tag.attrs['href'].startswith('/'):
                            link = include_url + link_tag.attrs['href']
                        else:
                            link = link_tag.attrs['href']
                        links.add(link)
                        link_count = link_count + 1
            navigation_tag = bs.find("div", id="pagination-parts") # bs.find("nav", class_="nav-below")
            if navigation_tag != None:
                next_tag = navigation_tag.find("span", class_="next")
                if next_tag != None:
                    print(type(next_tag))
                    # print(next_tag.parent.attrs['href'])
                    next_tag = str(next_tag)
                    print(next_tag)
                    next_url = re.findall('<a href="?\'?([^"\'>]*)', next_tag)
                    # next_url = next_tag.parent.attrs['href']
                    next_url=next_url[0]
                    print(next_url)
                else:
                    next_url = None
            url = include_url + next_url
        return links
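Example #7 stringifies the "next" span and recovers the href with a regular expression. When the anchor is nested inside that span, the same value can be read from the parsed tree directly; a small alternative sketch under that assumption:

# Hypothetical alternative to the regex extraction in Example #7, assuming the
# "next" span wraps an <a> tag (the commented-out parent lookup hints the markup
# may differ, so treat this as a sketch rather than a drop-in replacement).
def next_href(next_tag):
    anchor = next_tag.find("a")
    if anchor is None:
        return None
    return anchor.attrs.get('href')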
Example #8
    def crawl_apf(self, scrape_choices, nrpages):
        apf = AFPCrawler('APF', nrpages)
        sub_sites = {}
        site_url = self.hv.APF_SITE_URL
        for scrape_choice in scrape_choices:
            if scrape_choice == 'blog':
                sub_sites['blog'] = site_url + 'blog'
            if scrape_choice == 'publications':
                sub_sites['publications'] = site_url + 'publications'

        for sub_site, sub_site_url in sub_sites.items():
            # links would otherwise be undefined for any other sub-site URL
            links = ()
            if sub_site_url == 'https://apf.org/blog':
                links = apf.get_pagination_links_blog(sub_site_url)
            if sub_site_url == 'https://apf.org/publications':
                # links = apf.get_pagination_links_pulications(sub_site_url)
                links = ('https://apf.org/publications', )
            for link in links:
                bs = Crawler.read_page(link)
                apf.pages.add(link)
                data = apf.scrape_page_map(sub_site, link, bs)
                apf.bulk_data.append(data)
        bulk(models.client, actions=apf.bulk_data, stats_only=True)