Example #1
0
 def get_tasks(self):
     db = MysqlHelper()
     products = db.get_product_tasks()
     tasks = [
         product['url'] for product in products
         if product['url'] is not None
     ]
     return tasks
Example #2
0
class TaobaoItemSpider(object):
    driver = None
    driver_option = None
    mysql_db = None

    def __init__(self):
        self.lang = "zh"
        self.db = MysqlHelper()

    def init_chrome_driver(self):
        """ init chrome dirver """
        options = webdriver.ChromeOptions()
        options.add_argument('--lang=' + self.lang)
        options.add_argument("--disable-notifications")
        self.driver = webdriver.Chrome(CHROME_DRIVER_PATH,
                                       chrome_options=options)
        pass

    def start(self):
        self.init_chrome_driver()

        tasks = self.get_tasks()

        for task in tasks:
            self.crawl_taobao_item(task)
            print("crawl the page finished %s" % (task['url'], ))

        self.close()

    def close(self):
        """close driver"""
        self.driver.quit()
        self.db.db.close()
        logger.info('Goodbye, The tasks is finished.')
        pass

    def get_tasks(self):
        """ get tasks from db"""
        tasks = self.db.get_product_tasks()
        return tasks

    def crawl_taobao_item(self, task):
        """
            1. crawl taobao item page
            2. get item info form page
            3. save info to database
        """
        url = task['url']
        print("begin to crawl page: %s" % url)

        try:
            self.driver.get(url)
            print("success get query success %s" % url)
        except Exception as e:
            print("failed when get query %s" % url)
        else:
            try:
                time.sleep(30)
                product = TaobaoItem()
                product['sn'] = task['sn']
                product['sizes'] = []
                product['colors'] = []
                product['images'] = []
                product['prices'] = []
                product['choices'] = dict()

                # get properties
                properties_xpath = "//ul[@id='J_AttrUL']/li"
                _arrs = self.driver.find_elements_by_xpath(properties_xpath)
                properties = dict()
                for pt in _arrs:
                    pt = pt.get_attribute("innerText")
                    _arr = pt.split(":")
                    properties[_arr[0].strip()] = _arr[1].strip()

                product['properties'] = properties

                # get images from html doc

                images_xpath = "//ul[@id='J_UlThumb']/li//img"
                _arrs = self.driver.find_elements_by_xpath(images_xpath)
                images = set()
                for i in _arrs:
                    src = "https:" + i.get_attribute("src").replace(
                        "_60x60q90.jpg", "").strip()
                    images.add(src)

                body = self.driver.find_element_by_tag_name(
                    "body").get_attribute("innerHTML")
                r = re.compile(r"(.*)TShop.Setup\((.*?)\}\)", re.S)

                matchObj = re.match(r, body, 0)
                if matchObj and len(matchObj.group()) > 1:
                    obj_json = matchObj.group(2)
                    obj_json = obj_json.strip()[:-2].strip()
                    obj_json = json.loads(obj_json)

                    # get images
                    if 'propertyPics' in obj_json.keys():
                        js_images = obj_json['propertyPics']

                        for k, v in js_images.items():
                            for i in v:
                                images.add("https:" + i)
                    product['images'] = list(images)

                    # get colors
                    sku_list = []
                    if 'valItemInfo' in obj_json.keys():
                        if 'skuList' in obj_json['valItemInfo'].keys():
                            sku_list = obj_json['valItemInfo']['skuList']
                            for sku in sku_list:
                                product['colors'].append(sku['names'].strip())

                    # get prices
                    if 'valItemInfo' in obj_json.keys():
                        if 'skuMap' in obj_json['valItemInfo'].keys():
                            sku_maps = obj_json['valItemInfo']['skuMap']
                            prices = set()
                            for k, v in sku_maps.items():
                                prices.add(v['price'])
                            product['prices'] = list(prices)

                            # get choices
                            product['choices'] = self.format_sku(
                                sku_list, sku_maps)
                print(product)
                self.db.update_product(product)

            except Exception as e:
                print(e)
                raise e

    def format_sku(self, sku_list, sku_maps):
        new_sku_list = dict()
        for sku in sku_list:
            new_sku_list[sku['pvs']] = sku["names"].strip()

        for k, v in sku_maps.items():
            if k.strip(";") in new_sku_list.keys():
                sku_maps[k]['color'] = new_sku_list[k.strip(";")]

        return sku_maps