import logging
import re
import threading
import time
import urllib2

# Page and Product live elsewhere in this project; the module paths below
# are assumptions inferred from the class names used in this file.
from page import Page
from product import Product
class Scraper:
    ROOT_DOMAIN = 'http://www.amazon.com'
    HTTP_STATUS_OK = 200
    THREAD_NUM = 5

    def __init__(self, url):
        self.url = None
        self.set_url(url)
        self.page = Page()
        self.product = Product()
        self.page_links = []
        self.output_filename = "out_"
        self.threads = []

    def set_url(self, url):
        self.url = url
        return self

    def set_output_file(self, keyword):
        self.output_filename = keyword.replace(" ", "_").lower()

    def get_page_links(self):
        return self.page_links

    def get_url(self):
        return self.url

    def build_page_links(self, max_page_num, pagination_format):
        # Build one link per result page by rewriting the "page" query
        # parameter of the pagination URL found on the first results page.
        if pagination_format is not None:
            query_string = pagination_format.split("&")
            match = re.search(r"page=(\d+)", pagination_format)
            page_num = int(match.group(1))
            page_dict = {
                "page": str(page_num),
                "url": self.ROOT_DOMAIN + pagination_format
            }
            self.page_links.append(page_dict)
            while page_num < int(max_page_num):
                page_num += 1
                # Assumes "page" is the second query parameter, as in the
                # pagination links Amazon emits for search results.
                query_string[1] = 'page=' + str(page_num)
                pack_query_string = '&'.join(query_string)
                page_dict = {
                    "page": str(page_num),
                    "url": self.ROOT_DOMAIN + pack_query_string
                }
                self.page_links.append(page_dict)
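    # A hypothetical illustration of what build_page_links produces: given
    # max_page_num="3" and the pagination format string
    # "/s?ie=UTF8&page=2&rh=i%3Aaps%2Ck%3ABrown%20LED", self.page_links
    # would end up as
    #   [{"page": "2", "url": "http://www.amazon.com/s?ie=UTF8&page=2&rh=i%3Aaps%2Ck%3ABrown%20LED"},
    #    {"page": "3", "url": "http://www.amazon.com/s?ie=UTF8&page=3&rh=i%3Aaps%2Ck%3ABrown%20LED"}]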
    def visit_page(self, product_list_url, page_num):
        print "Page-{}, {}".format(page_num, product_list_url)
        response = urllib2.urlopen(product_list_url)
        # Compare status codes with ==, not "is": identity comparison on
        # integers only works by accident for small cached values.
        if response.code == self.HTTP_STATUS_OK:
            page = Page()
            page.set_data(response.read())
            page.set_url(product_list_url)
            page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
            page.scrap()
            print "{} - Completed".format(product_list_url)

    def scrap_all_products(self):
        # Spawn one thread per result page, sleeping five seconds before
        # every fourth thread start so the requests are staggered.
        count = 0
        for link in self.page_links:
            page_num = link["page"]
            page_url = link["url"]
            if (count % 4) == 0:
                time.sleep(5)
            t = threading.Thread(target=self.visit_page, args=(page_url, page_num,))
            self.threads.append(t)
            print "Starting Thread for url: " + page_url
            count += 1
            t.start()

    def main(self):
        try:
            opener = urllib2.build_opener()
            # Browser-like headers can be attached here if the default
            # urllib2 User-Agent gets blocked, e.g.:
            # opener.addheaders = [
            #     ('User-agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
            #                    '(KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36'),
            #     ('Referer', 'http://www.amazon.com/?field-keywords=LED+Lights'),
            # ]
            response = opener.open(self.get_url())
            if response.code == self.HTTP_STATUS_OK:
                html = response.read()
                self.page.set_data(html)
                self.page.set_url(self.get_url())
                self.page.scrap()
                pagination_link_format = self.page.get_pagination_link()
                max_pagination_number = self.page.get_max_pagination_num()
                logging.info("Max pagination number found: %s", max_pagination_number)
                self.build_page_links(max_pagination_number, pagination_link_format)
                self.scrap_all_products()
            else:
                print "Unexpected HTTP status: {}".format(response.code)
        except urllib2.HTTPError as e:
            print e.code
            print e.reason
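# A minimal usage sketch, assuming this file is run directly. The search URL
# and keyword below are placeholders; Scraper only needs a search-results URL
# whose pagination links contain a "page" query parameter.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    s = Scraper("http://www.amazon.com/s?ie=UTF8&field-keywords=LED+Lights")
    s.set_output_file("LED Lights")  # output files become led_lights-page-N-...
    s.main()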