Example #1
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type

        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button

        self.storage_table = storage_table
        pass
Example #2
 def __init__(self, entrance_url, product_type, storage_table):
     AEbayGraber.__init__(self)
     logging.basicConfig(
         level=logging.INFO,
         format=
         '%(asctime)s %(module)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
         datefmt='%Y %b %d %H:%M:%S',
         filename='./app.Log',
         filemode='w')
     self.db = DBMysql()
     self.entrance_url = entrance_url
     self.product_type = product_type
     self.storage_table = storage_table
     pass
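
The constructor above configures file logging via logging.basicConfig before any crawling starts. As a minimal standalone sketch of what that configuration produces (the module name and the sample log line below are illustrative, not taken from the project):

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(module)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%Y %b %d %H:%M:%S',
    filename='./app.Log',
    filemode='w')  # 'w' truncates app.Log on every run

logging.info("starting crawl for %s", "phone")
# ./app.Log then contains a line of the form:
# 2016 Jan 01 12:00:00 demo demo.py[line:11] INFO starting crawl for phone
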
Example #3
class GrabEbuyerPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type

        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button

        self.storage_table = storage_table
        pass

    def handle_one_page(self, driver):
        """重载父类方法,实现具体的爬虫操作"""

        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Inittial Page:", url
        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword) # ebay
        driver = self.submit_initial_url(driver, "//*[@id='search-box']",
                                         "search-button", keyword)

        i = 0
        while i < 500:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                    break
            except:
                print "Exception: could not read response_html.status_code"
                break

            # Parse only the content section of the document to speed things up
            html_part_id_value = "main-content"
            only_content_tags = SoupStrainer("section", id=html_part_id_value)
            html_part_content = BeautifulSoup(
                response_html.text,
                "html.parser",
                parse_only=only_content_tags).prettify()

            # Extract all of the links we need
            soup = BeautifulSoup(html_part_content,
                                 "html.parser",
                                 from_encoding="utf-8")
            links = soup.find_all('a',
                                  class_="view-product",
                                  href=re.compile(self.product_type, re.I))
            for link in links:
                # This site builds new URLs differently: join the relative href to the domain
                url_parse = urlparse(url)
                domain = url_parse.scheme + "://" + url_parse.netloc
                new_url = domain + link['href']
                self.handle_result_url(new_url, keyword, i)
                time.sleep(10)

            i = i + 1

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            # nextPage = "a.gspr.next"
            try:
                # driver.find_element_by_css_selector(nextPage).click() #ebay
                driver.find_element_by_xpath(
                    "//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a"
                ).click()
            except:
                break
            print driver.current_url
            time.sleep(20)

        driver.quit()
        self.db.close()

    # Also pass the keyword (the product type) through
    def handle_result_url(self, item_url, keyword, i):
        print "Handle", i, "th URL:", item_url

        response_html = self.get_htmlcontent(item_url)
        # if response_html is not None:
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(
                    item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_reconrd = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" is not item_title:
                    self.db.insertOneData(self.storage_table, new_reconrd)
            except:
                pass
        else:
            print "Handle", i, "th URL:", item_url, "Failed!..............................................."
Example #4
class GrabSearsPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type

        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button

        self.storage_table = storage_table
        pass

    def run(self):
        # Launch the browser driver (originally PhantomJS, here Chrome)
        driver = self.get_driver("chrome")
        self.handle_one_page(driver)

    def handle_one_page(self, driver):
        """重载父类方法,实现具体的爬虫操作"""

        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Inittial Page:", url
        # # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword) # ebay
        # #driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)
        #
        # driver.find_element_by_xpath("//*[@id='keyword']").clear()
        # driver.find_element_by_xpath("//*[@id='keyword']").send_keys(keyword)
        #
        # # Get the button element and click it
        # # elem = driver.find_element_by_id("search-button") # this form is used for ebay / ebuyer
        # # driver.find_element_by_xpath("//*[@id='goBtn']").click()
        # driver.find_element_by_id("goBtn").click()
        #
        # time.sleep(20)  # pause briefly so the page has time to load
        # print "Get Crawler Home Page:", driver.current_url

        inputBoxXpath = "//*[@id='keyword']"

        driver.find_element_by_xpath(inputBoxXpath).clear()
        # elem = driver.find_element_by_id("keyword").send_keys(keyword)
        driver.find_element_by_xpath(inputBoxXpath).send_keys(keyword)
        # elem = driver.find_element_by_xpath(inputBoxXpath)
        # elem.clear()
        # elem = driver.find_element_by_xpath(inputBoxXpath)
        # elem.send_keys(keyword)

        # Get the button element and click it
        # searchButton = 'search-button'

        # elem = driver.find_element_by_css_selector('button.btn.btn-default')
        # elem = driver.find_element_by_css_selector('input.btn.btn-prim gh-spr')
        elem = driver.find_element_by_id("goBtn")
        #elem = driver.find_element_by_css_selector('# goBtn')
        # goBtn

        # elem = driver.find_element_by_xpath("//*[@id='goBtn']")
        elem.click()

        time.sleep(10)  # pause briefly so the page has time to load
        print "Get Crawler Home Page:", driver.current_url

        i = 0
        while i < 21:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            # if response_html is not None:
            try:
                if response_html.status_code != 200:
                    print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                    break
            except:
                print "Exception: could not read response_html.status_code"
                break

            # Parse only the content section of the document to speed things up
            # html_part_id_value =  "content"
            # # only_content_tags = SoupStrainer("ul", id=html_part_id_value)
            # only_content_tags = SoupStrainer("div", id=html_part_id_value)
            # html_part_content = BeautifulSoup(response_html.text, "html.parser", parse_only=only_content_tags).prettify()
            #
            # # Extract all of the links we need
            # soup = BeautifulSoup(html_part_content, "html.parser", from_encoding="utf-8")
            # # links = soup.find_all('a', href=re.compile("phone", re.I))

            soup = BeautifulSoup(response_html.text,
                                 "html.parser",
                                 from_encoding="utf-8")
            #links = soup.find_all('a', href=re.compile(r"phone", re.I))
            links = soup.find_all('a')
            for link in links:
                href = link.get('href')
                if not href:  # skip anchors that have no href attribute
                    continue
                url_parse = urlparse(url)
                domain = url_parse.scheme + "://" + url_parse.netloc
                new_url = domain + href
                self.handle_result_url(new_url, keyword, i)
                print "Fetch a phone url :", new_url
                time.sleep(10)

            i = i + 1

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            try:
                # nextPage = "a.gspr.next"
                # driver.find_element_by_css_selector(nextPage).click() #ebay

                # Click the link to the next page
                driver.find_element_by_xpath(
                    "//*[@id='pagination']/div[1]/div[2]/a/span").click()
            except:
                break
            print driver.current_url
            time.sleep(20)

        driver.quit()
        self.db.close()

    # Also pass the keyword (the product type) through
    def handle_result_url(self, item_url, keyword, i):
        print "Handle", i, "Page's URL:", item_url

        response_html = self.get_htmlcontent(item_url)
        # if response_html is not None:
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(
                    item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_reconrd = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" is not item_title:
                    self.db.insertOneData(self.storage_table, new_reconrd)
            except:
                pass
        else:
            print "Handle", i, "Page's URL:", item_url, "Failed!..............................................."
Example #5
class EbayGraber(AEbayGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AEbayGraber.__init__(self)
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s %(module)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%Y %b %d %H:%M:%S',
            filename='./app.Log',
            filemode='w')
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        self.storage_table = storage_table
        pass

    def get_soup_responser(self,
                           html_text_content,
                           html_parser="html.parser",
                           from_encoding="utf-8"):
        """Build and return a soup object from the page content."""
        return BeautifulSoup(html_text_content,
                             html_parser,
                             from_encoding=from_encoding)

    def get_strainer_soup_responser(self,
                                    html_text_content,
                                    strainer_symbol,
                                    html_parser="html.parser",
                                    from_encoding="utf-8"):
        """Build and return a soup object from only the part of the page selected by a SoupStrainer."""
        only_content_tags = SoupStrainer(
            id=strainer_symbol
        )  # todo: make the SoupStrainer construction selectable among several strategies (enum, parameter, or switch)
        only_content = BeautifulSoup(html_text_content,
                                     html_parser,
                                     parse_only=only_content_tags).prettify()
        return BeautifulSoup(only_content,
                             html_parser,
                             from_encoding=from_encoding)

    def web_page_paser(self, driver):
        """解析所有有效的网页内容"""
        keyword = self.product_type

        driver.get(self.entrance_url)
        time.sleep(10)

        print "Inittial Page:", self.entrance_url

        driver = self.submit_initial_url(driver, "//input[@type='text']",
                                         "gh-btn", keyword)

        i = 0
        while i < 500:
            request_responser = self.get_request_responser(driver.current_url)
            if request_responser is None:
                break

            soup = self.get_strainer_soup_responser(request_responser.text,
                                                    "CenterPanel")
            links = soup.find_all('a', href=re.compile(keyword))
            print links
            for link in links:
                new_url = link['href']
                self.handle_one_url(new_url, keyword, i)
                time.sleep(10)

            i = i + 1

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            try:
                driver.find_element_by_css_selector("a.gspr.next").click()
                print "Geting a new page,url=%s", driver.current_url
                time.sleep(20)
            except:
                print "Error:Geting next page Failed", request_responser.status_code
                break

        driver.quit()
        self.db.close()

    # Also pass the keyword (the product type) through
    def handle_one_url(self, url, keyword, i):
        logging.info("Handle the" + str(i) + "th URL" + url)

        response_html = self.get_request_responser(url)
        try:
            if response_html.status_code != 200:
                pass
        except:
            print "LOG:Exception:EbayGraber %s handle_one_url could not read response_html.status_code! url=%s" % (self.product_type, url)
            pass

        if response_html.text is not None:
            new_reconrd = self.get_db_product_db_object(
                url, response_html.text)
            if new_reconrd is not None:
                self.db.insertOneData(self.storage_table, new_reconrd)
        else:
            print "LOG:Warning:EbayGraber %s handle_one_url response_html.text is None! url=%s" % self.product_type, url

    def get_db_product_db_object(self, url, response_html_text):
        """解析抽取到的商品链接,判断是否是所需处理的网页,返回一个可以入库的字典"""
        soup = BeautifulSoup(response_html_text, "html.parser")
        try:
            item_title = ""
            if soup.title is not None:
                if self.product_type not in soup.title.string or "camera | eBay" in soup.title.string:
                    logging.warn("This is not a " + self.product_type +
                                 " product url! url=" + url)
                    return None
                item_title = soup.title.string.replace('\"', ' ')

            item_content = ""
            if response_html_text is not None:
                item_content = response_html_text.replace('\"', ' ')

            new_record = {
                "id": str(uuid.uuid4()),
                "domain_name":
                urlparse(url).scheme + "://www." + urlparse(url).netloc,
                "keyword": self.product_type,
                "url": url,
                "title": item_title,
                "doc": self.db.escape_string(unicode(item_content))
            }
            return new_record
        except:
            pass
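
As a small standalone illustration of the SoupStrainer pattern that get_strainer_soup_responser uses (the HTML below is invented for the example):

from bs4 import BeautifulSoup, SoupStrainer

html = """
<html><body>
  <div id="Header"><a href="/home">home</a></div>
  <div id="CenterPanel">
    <a href="/itm/phone-1">phone 1</a>
    <a href="/itm/phone-2">phone 2</a>
  </div>
</body></html>
"""

# Parse only the tag whose id matches (plus its children), then re-parse the pretty-printed result.
only_center = SoupStrainer(id="CenterPanel")
center_html = BeautifulSoup(html, "html.parser", parse_only=only_center).prettify()
soup = BeautifulSoup(center_html, "html.parser")

print [a["href"] for a in soup.find_all("a")]
# prints only the two CenterPanel hrefs; the header link was never parsed
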
Example #6
class GrabOverStockPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type

        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button

        self.storage_table = storage_table
        pass

    def handle_one_page(self, driver):
        """重载父类方法,实现具体的爬虫操作"""

        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)

        time.sleep(120)
        try:
            driver.find_element_by_id("cboxClose").click()
            print "Click the pop - up windows"
        except:
            pass
        time.sleep(60)

        print "Inittial Page:", url
        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword) # ebay
        #driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)

        driver.find_element_by_xpath("//*[@id='search-input']").clear()
        driver.find_element_by_xpath("//*[@id='search-input']").send_keys(
            keyword)

        # Get the button element and click it
        # elem = driver.find_element_by_id(submit_key) # this form is used for ebay / ebuyer
        elem = driver.find_element_by_xpath(
            '//*[@id="search-form"]/fieldset[2]/label/i')
        elem.click()

        time.sleep(60)
        # A variant of paging: repeatedly drag the scrollbar to the bottom of the page
        i = 0
        while i < 200:  # 2000
            js = "var q=document.body.scrollTop=" + str(10000 * i)
            print i, js
            driver.execute_script(js)
            time.sleep(3)
            i = i + 1

        # Fetch the HTML document of the current page
        print "Get Crawler Home Page:", driver.current_url
        response_html = self.get_htmlcontent(driver.current_url)
        try:
            if response_html.status_code != 200:
                print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                pass
        except:
            print "Exception: could not read response_html.status_code"
            pass

        # Parse only the content section of the document to speed things up
        html_part_id_value = "result-products"
        only_content_tags = SoupStrainer(id=html_part_id_value)
        html_part_content = BeautifulSoup(
            response_html.text, "html.parser",
            parse_only=only_content_tags).prettify()

        # Extract all of the links we need
        soup = BeautifulSoup(html_part_content,
                             "html.parser",
                             from_encoding="utf-8")
        # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))

        if keyword is "cellphone":
            url_judger = "phone"
        elif keyword is "tv":
            url_judger = "tv"
        elif keyword is "digit camera":
            url_judger = "camera"
        elif keyword is "bike":
            url_judger = "bike"
        elif keyword is "labtop":
            url_judger = "labtop"
        elif keyword is "mice":
            url_judger = "mice"
        elif keyword is "Webcams":
            url_judger = "Webcams"
        elif keyword is "shaver":
            url_judger = "shaver"
        elif keyword is "flashlight":
            url_judger = "flashlight"
        elif keyword is "watch":
            url_judger = "watch"

        links = soup.find_all('a', href=re.compile(
            url_judger,
            re.I))  # note: the pattern used here is not necessarily equal to the keyword
        print len(links)
        for link in links:
            new_url = link['href']
            self.handle_result_url(new_url, keyword, i)
            time.sleep(10)

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            # try:
            #     # nextPage = "a.gspr.next"
            #     # driver.find_element_by_css_selector(nextPage).click() #ebay
            #
            #     # driver.find_element_by_xpath("//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a").click()  # Cdicount
            #
            #     nextPage = "a.jsNxtPage.pgNext"
            #     driver.find_element_by_css_selector(nextPage).click()
            # except:
            #     break
            # print driver.current_url
            time.sleep(20)

        driver.quit()
        self.db.close()

    # Also pass the keyword (the product type) through
    def handle_result_url(self, item_url, keyword, i):
        print "Handle", i, "Page's URL:", item_url

        response_html = self.get_htmlcontent(item_url)
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(
                    item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_reconrd = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" is not item_title:
                    self.db.insertOneData(self.storage_table, new_reconrd)
            except:
                pass
        else:
            print "Handle", i, "Page's URL:", item_url, "Failed!..............................................."
Example #7
class GrabCdiscountPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type

        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button

        self.storage_table = storage_table
        pass

    def handle_one_page(self, driver):
        """重载父类方法,实现具体的爬虫操作"""

        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Inittial Page:", url
        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword) # ebay
        #driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)

        driver.find_element_by_xpath("//input[@type='search']").clear()
        driver.find_element_by_xpath("//input[@type='search']").send_keys(
            keyword)

        # Get the button element and click it
        # elem = driver.find_element_by_id(submit_key) # this form is used for ebay / ebuyer
        elem = driver.find_element_by_xpath(
            '//*[@id="hFull"]/div[2]/div[1]/button')
        elem.click()

        time.sleep(20)  # pause briefly so the page has time to load
        print "Get Crawler Home Page:", driver.current_url

        i = 0
        while i < 500:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                    break
            except:
                print "Exception: could not read response_html.status_code"
                break

            # Parse only the content section of the document to speed things up
            html_part_id_value = "lpBloc"
            # only_content_tags = SoupStrainer("ul", id=html_part_id_value)
            only_content_tags = SoupStrainer(id=html_part_id_value)
            html_part_content = BeautifulSoup(
                response_html.text,
                "html.parser",
                parse_only=only_content_tags).prettify()

            # Extract all of the links we need
            soup = BeautifulSoup(html_part_content,
                                 "html.parser",
                                 from_encoding="utf-8")
            # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))
            links = soup.find_all('a',
                                  class_='jsQs')  # , href=re.compile("Phone")
            for link in links:
                new_url = link['href']
                self.handle_result_url(new_url, keyword, i)
                time.sleep(10)

            i = i + 1

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            try:
                # nextPage = "a.gspr.next"
                # driver.find_element_by_css_selector(nextPage).click() #ebay

                # driver.find_element_by_xpath("//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a").click()  # Cdicount

                nextPage = "a.jsNxtPage.pgNext"
                driver.find_element_by_css_selector(nextPage).click()
                print driver.current_url
                time.sleep(20)
            except:
                print "Exception:Get Next page Fail", response_html.status_code
                break

        driver.quit()
        self.db.close()

    # Also pass the keyword (the product type) through
    def handle_result_url(self, item_url, keyword, i):
        print "Handle", i, "Page's URL:", item_url

        response_html = self.get_htmlcontent(item_url)
        try:
            if response_html.status_code != 200:
                print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                pass
        except:
            print "Exception: could not read response_html.status_code"
            pass

        try:
            item_domain = urlparse(item_url).scheme + "://www." + urlparse(
                item_url).netloc
            item_content = response_html.text.replace('\"', ' ')
            html_content = BeautifulSoup(response_html.text, "html.parser")
            item_title = html_content.title.string.replace('\"', ' ')
            new_reconrd = {
                "id": str(uuid.uuid4()),
                "domain_name": item_domain,
                "keyword": keyword,
                "url": item_url,
                "title": item_title,
                "doc": self.db.escape_string(unicode(item_content))
            }
            if item_title != "| eBay":
                self.db.insertOneData(self.storage_table, new_reconrd)
        except:
            print "Handle", i, "Page's URL:", item_url, "Failed!..............................................."
Example #8

def GetNewKeyWord():
    """Read the product keywords, one per line, from Bikes.txt."""
    productArray = []
    with open("Bikes.txt") as keyword_file:  # 'with' closes the file automatically
        for line in keyword_file:
            line = line.strip()  # drop the trailing newline
            if line:
                productArray.append(line)
    return productArray
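
A plausible, purely illustrative way to feed these keywords into one of the grabbers above; the entrance URL and table name are placeholders:

for keyword in GetNewKeyWord():
    grabber = GrabSearsPhone("http://www.sears.com", keyword, "product_info")  # placeholders
    grabber.run()
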


if __name__ == '__main__':

    db = DBMysql()

    # sql = 'select * from keyword_info'
    # results = db.query(sql, "all")
    # for r in results:
    #    item = keyword_Info_item()
    #    item.keyword =r[0]
    #    item.name = r[1]
    #    keyword_info_lists.append(item)

    # for i in keyword_info_lists:
    #     print i.keyword, i.name

    # productArray = GetNewKeyWord()
    # productArrayLen = len(productArray)
    # for i in range(productArrayLen):