Example #1
def crawl(self):
    if len(self._pages) > 0:
        # Crawl outward from the links found on the first loaded page.
        crawler = Crawler(self._pages[0].get_links())
        for result in crawler.begin_crawl():
            # Each crawl result carries the fetched URL and its raw HTML.
            page = Page(site=self._site)
            page.set_url(result["url"])
            page.set_html(result["data"])
            page.parse_page(self._robot)
            print(f'{page.get_url()} has {len(page.get_links())} links')
            self.add_page(page)

import re
import time
import logging
import threading
import urllib2

# NOTE: Page and Product are project-specific helpers; these module paths are
# assumptions and may need to be adjusted to the actual project layout.
from page import Page
from product import Product


class Scraper:
    ROOT_DOMAIN = 'http://www.amazon.com'
    HTTP_STATUS_OK = 200
    THREAD_NUM = 5

    def __init__(self, url):
        self.url = None
        self.set_url(url)
        self.page = Page()
        self.product = Product()
        self.page_links = []
        self.output_filename = "out_"
        self.threads = []

    def set_url(self, url):
        self.url = url
        return self

    def set_output_file(self, keyword):
        self.output_filename = keyword.replace(" ", "_").lower()

    def get_page_links(self):
        return self.page_links

    def get_url(self):
        return self.url

    def build_page_links(self, max_page_num, pagination_format):
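        # Starting from a pagination link such as
        # "/s?ie=UTF8&page=2&rh=i%3Aaps%2Ck%3ABrown%20LED", rewrite the "page="
        # query parameter to enumerate every results page up to max_page_num and
        # store one {"page": ..., "url": ...} dict per page in self.page_links.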

        if pagination_format is not None:
            query_string = pagination_format.split("&")
            # Pull the current page number out of the "page=" query parameter.
            match = re.search(r"page=(\d+)", pagination_format)

            page_num = int(match.group(1))

            page_dict = {
                "page": str(page_num),
                "url": self.ROOT_DOMAIN + pagination_format
            }

            self.page_links.append(page_dict)

            while page_num < int(max_page_num):
                page_num += 1

                # Rewrite the "page=" field (assumed to be the second
                # query-string component, as in the original link) and
                # reassemble the URL path.
                query_string[1] = 'page=' + str(page_num)
                pack_query_string = '&'.join(query_string)

                page_dict = {
                    "page": str(page_num),
                    "url": self.ROOT_DOMAIN + pack_query_string
                }

                self.page_links.append(page_dict)

    def visit_page(self, product_list_url, page_num):
        print "Page-{}, {}".format(page_num, product_list_url)

        response = urllib2.urlopen(product_list_url)

        if response.code == self.HTTP_STATUS_OK:
            # Hand the downloaded HTML off to the Page helper, which does the
            # actual parsing and writes results under the per-page file prefix.
            page = Page()
            page.set_data(response.read())
            page.set_url(product_list_url)
            page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
            page.scrap()

            print "{} - Completed".format(product_list_url)

    def scrap_all_products(self):

        # Visit every collected page link in its own worker thread, pausing
        # briefly every few pages so requests are not all fired at once.
        count = 0
        for link in self.page_links:
            page_num = link["page"]
            page_url = link["url"]

            # Sleep a few seconds after every fourth page to rate-limit requests.
            if count % 4 == 0:
                time.sleep(5)

            t = threading.Thread(target=self.visit_page, args=(page_url, page_num,))
            self.threads.append(t)
            print "Starting Thread for url: " + page_url
            count += 1
            t.start()

    def main(self):
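        # Fetch the first search-results page, scrape it to discover the
        # pagination link format and the highest page number, then build the
        # full list of page URLs and scrape each one in a worker thread.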
        try:
            opener = urllib2.build_opener()
            # opener.addheaders = [
            #     ('User-agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36'),
            #     ('Referer', 'http://www.amazon.com/?field-keywords=LED+Lights'),
            #     ('Host', 'www.amazon.com'),
            #     ('Content-Type', 'application/x-www-form-urlencoded'),
            #     ('X-Requested-With', 'XMLHttpRequest'),
            #     ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
            # ]
            response = opener.open(self.get_url())

            if response.code == self.HTTP_STATUS_OK:
                html = response.read()

                self.page.set_data(html)
                self.page.set_url(self.get_url())
                self.page.scrap()

                pagination_link_format = self.page.get_pagination_link()
                max_pagination_number = self.page.get_max_pagination_num()
                logging.info("Max pagination number found : " + str(max_pagination_number))

                self.build_page_links(max_pagination_number, pagination_link_format)

                self.scrap_all_products()
            else:
                print "Request failed with HTTP status {}".format(response.code)

        except urllib2.HTTPError as e:
            print e.code
            print e.reason
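
A minimal usage sketch (not part of the original listing; the search URL and keyword below are assumptions for illustration only):

# Hypothetical driver: the search URL and keyword are placeholders.
if __name__ == "__main__":
    scraper = Scraper("http://www.amazon.com/s?ie=UTF8&field-keywords=LED+Lights")
    scraper.set_output_file("LED Lights")  # output files get the prefix "led_lights-page-N-"
    scraper.main()                         # fetch page 1, build pagination links, scrape every page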