def main():
    """Crawl the list page of every tmall shop, then shut the driver down."""
    db = MysqlHelper()
    for task in db.get_tmall_shops():
        crawl_tmall_list(task)
    # quit() (not close()) ends the whole WebDriver session; close() only
    # closes the current window.  Matches TaobaoItemSpider.close() elsewhere
    # in this project.
    driver.quit()
def main():
    """Export every tmall product row into products_tmall.csv."""
    file_path = os.path.join(DATA_PATH, 'products_tmall.csv')
    write_head(file_path)
    db = MysqlHelper()
    for shop in db.get_tmall_shops():
        for product in db.get_products_by_shop(shop['id']):
            line = format_line()
            line['1-sn'] = product['sn']
            line['2-title'] = product['title']
            line['3-price'] = product['price']
            line['4-prices'] = list_to_string(format_data(product['prices']))
            line['5-colors'] = list_to_string(format_data(product['colors']))
            line['6-sizes'] = list_to_string(format_data(product['sizes']))
            line['7-url'] = product['url']
            line['8-shop_name'] = product['name']
            line['9-shop_url'] = product['task_url']
            # properties is a JSON blob; each key becomes its own CSV column
            properties = json.loads(format_data(product['properties']))
            for key, value in properties.items():
                line[key] = value.replace("\"", "")
            print(line)
            row = line.values()
            print(row)
            write_list_to_csv([row], file_path)
def get_tasks(self):
    """Return the URL of every product task that has one (non-null)."""
    helper = MysqlHelper()
    urls = []
    for product in helper.get_product_tasks():
        if product['url'] is not None:
            urls.append(product['url'])
    return urls
def t_get_product():
    """Smoke test: decode each product's images blob and print the 2nd sku."""
    helper = MysqlHelper()
    for product in helper.get_products():
        # strip backslashes and normalise quotes so the blob parses as JSON
        raw = product['images'].replace("\\", "").replace("\'", "\"")
        decoded = json.loads(raw)
        print(decoded[1]['sku'])
def insert_task():
    """Load task URLs from task.csv and insert each into the task table."""
    lines = read_csv(os.path.join(DATA_PATH, "task.csv"))
    db = MysqlHelper()
    for line in lines:
        # the URL is everything before the first '@' separator
        url = line.split("@")[0].strip()
        print(url)
        db.add_task_url(url)
def update_task():
    """Sync shop name/wid from shop_infos.csv into the task table.

    Row N of the CSV (1-based) updates the task with id N — the mapping is
    purely positional.
    """
    csv_file = os.path.join(DATA_PATH, "shop_infos.csv")
    lines = read_csv(csv_file)
    db = MysqlHelper()
    # enumerate(start=1) replaces the manual counter, which shadowed the
    # builtin id()
    for task_id, line in enumerate(lines, start=1):
        name, wid = line.split(",")
        db.update_task(name, wid, task_id)
def main():
    """Download product images for every tmall shop (shop 26 is skipped)."""
    db = MysqlHelper()
    for shop in db.get_tmall_shops():
        # Skip BEFORE creating the image directory: the original called
        # mk_image_dir first, which left an empty directory behind for the
        # skipped shop.
        if shop['id'] == 26:
            continue
        image_path = mk_image_dir(shop['id'])
        for product in db.get_products_by_shop(shop['id']):
            images = format_data(product['images'])
            save_images(product['sn'], images, image_path)
def get_property_keys():
    """Collect the union of all property-JSON keys across every product."""
    db = MysqlHelper()
    all_keys = set()
    for product in db.get_products():
        try:
            properties = json.loads(format_data(product['properties']))
        except Exception as e:
            # unparseable blob: log which row is broken and keep going
            print(product['id'])
            print(product['properties'])
        else:
            all_keys.update(properties.keys())
    return all_keys
def get_tasks(self):
    """Build asynSearch crawl-task dicts from the stored shop rows.

    A task_url like ``https://host/path`` splits on '/' into
    ``['https:', '', host, path]`` — four elements — so both the host and
    the path segment must exist before a task can be built.
    """
    db = MysqlHelper()
    tasks = []
    for shop in db.get_shops():
        wid = shop['wid']
        name = shop['name']
        task_url = shop['task_url']
        _arr = task_url.split("/")
        # BUG FIX: the original checked len(_arr) >= 3 but then indexed
        # _arr[3], raising IndexError for URLs without a path segment.
        if len(_arr) < 4:
            continue
        site = _arr[2]
        path = _arr[3]
        url = "https://{}/i/asynSearch.htm".format(site)
        params = {
            "_ksTS": "replace_time_141",  # will replace later
            "callback": "jsonp142",
            "mid": "w-{}-0".format(wid),
            "wid": wid,
            "path": "/{}".format(path),
            "search": "y",
            "pageNo": 1
        }
        f = furl(url)
        f.args = params
        task = dict()
        task['id'] = str(shop['id'])
        task['name'] = name
        task['site'] = site
        task['path'] = path
        task['task_url'] = task_url
        task['url'] = f.url
        tasks.append(task)
        # break  # only get one for test
    return tasks
def crawl_tmall_list(task):
    """Crawl page 2 of one tmall shop list and upsert every product found."""
    list_url = task['task_url'] + "?pageNo=2"
    shop_id = task['id']
    try:
        driver.get(list_url)
        print("success get query success %s" % list_url)
    except Exception as e:
        print("failed when get query %s" % list_url)
        raise
    else:
        db = MysqlHelper()
        time.sleep(10)  # wait for the JS-rendered list to settle
        for item in driver.find_elements_by_xpath(
                "//dl[contains(@class,'item')]"):
            product = TaobaoItem()
            product['thumb'] = item.find_element_by_xpath(
                "./dt//img").get_attribute('src')
            product['url'] = item.find_element_by_xpath(
                "./dt/a").get_attribute('href')
            product['title'] = item.find_element_by_xpath(
                "./dd[@class='detail']/a").get_attribute('innerText').strip()
            product['price'] = item.find_element_by_xpath(
                ".//span[@class='c-price']").get_attribute(
                    'innerText').strip()
            # the sn is the id= query parameter; skip items without one
            try:
                sn = product['url'].split("?id=")[1]
            except Exception as e:
                continue
            else:
                product['sn'] = sn.split("&")[0] if "&" in sn else sn
                product['sid'] = shop_id
                db.upsert_products_from_list(product)
        db.db.close()
def __init__(self):
    """Open a DB helper and default the crawl language to Chinese."""
    self.db = MysqlHelper()
    self.lang = "zh"
class TaobaoItemSpider(object):
    """Crawl taobao item detail pages and save the parsed data to MySQL."""

    driver = None
    driver_option = None
    mysql_db = None

    def __init__(self):
        self.lang = "zh"
        self.db = MysqlHelper()

    def init_chrome_driver(self):
        """Init chrome driver with the configured language, no notifications."""
        options = webdriver.ChromeOptions()
        options.add_argument('--lang=' + self.lang)
        options.add_argument("--disable-notifications")
        self.driver = webdriver.Chrome(CHROME_DRIVER_PATH,
                                       chrome_options=options)

    def start(self):
        """Entry point: crawl every task, then release all resources."""
        self.init_chrome_driver()
        tasks = self.get_tasks()
        for task in tasks:
            self.crawl_taobao_item(task)
            print("crawl the page finished %s" % (task['url'], ))
        self.close()

    def close(self):
        """Close driver and DB connection."""
        self.driver.quit()
        self.db.db.close()
        logger.info('Goodbye, The tasks is finished.')

    def get_tasks(self):
        """Get tasks from db."""
        return self.db.get_product_tasks()

    def crawl_taobao_item(self, task):
        """
        1. crawl taobao item page
        2. get item info from page
        3. save info to database
        """
        url = task['url']
        print("begin to crawl page: %s" % url)
        try:
            self.driver.get(url)
            print("success get query success %s" % url)
        except Exception as e:
            print("failed when get query %s" % url)
        else:
            try:
                time.sleep(30)  # let the JS-rendered page settle
                product = TaobaoItem()
                product['sn'] = task['sn']
                product['sizes'] = []
                product['colors'] = []
                product['images'] = []
                product['prices'] = []
                product['choices'] = dict()

                # -- properties ("brand: x" lines) ------------------------
                properties_xpath = "//ul[@id='J_AttrUL']/li"
                _arrs = self.driver.find_elements_by_xpath(properties_xpath)
                properties = dict()
                for pt in _arrs:
                    text = pt.get_attribute("innerText")
                    # FIX: split on the FIRST colon only, so values that
                    # themselves contain ':' stay intact; lines with no
                    # colon are skipped instead of raising IndexError.
                    name, sep, value = text.partition(":")
                    if sep:
                        properties[name.strip()] = value.strip()
                product['properties'] = properties

                # -- thumbnail images from the html doc -------------------
                images_xpath = "//ul[@id='J_UlThumb']/li//img"
                _arrs = self.driver.find_elements_by_xpath(images_xpath)
                images = set()
                for i in _arrs:
                    src = "https:" + i.get_attribute("src").replace(
                        "_60x60q90.jpg", "").strip()
                    images.add(src)

                # -- sku data embedded in the TShop.Setup({...}) blob -----
                body = self.driver.find_element_by_tag_name(
                    "body").get_attribute("innerHTML")
                r = re.compile(r"(.*)TShop.Setup\((.*?)\}\)", re.S)
                matchObj = re.match(r, body, 0)
                # when the pattern matches, both capture groups exist, so
                # the old `len(matchObj.group()) > 1` check was redundant
                if matchObj:
                    obj_json = matchObj.group(2)
                    obj_json = obj_json.strip()[:-2].strip()
                    obj_json = json.loads(obj_json)

                    # extra per-sku images
                    if 'propertyPics' in obj_json.keys():
                        js_images = obj_json['propertyPics']
                        for k, v in js_images.items():
                            for i in v:
                                images.add("https:" + i)
                    product['images'] = list(images)

                    # BUG FIX: sku_maps was only bound inside the 'skuMap'
                    # branch, so format_sku() could hit a NameError (and the
                    # whole product was lost) when the page had a skuList
                    # but no skuMap.  Default both to empty.
                    sku_list = []
                    sku_maps = dict()

                    # colors
                    if 'valItemInfo' in obj_json.keys():
                        if 'skuList' in obj_json['valItemInfo'].keys():
                            sku_list = obj_json['valItemInfo']['skuList']
                            for sku in sku_list:
                                product['colors'].append(
                                    sku['names'].strip())

                    # prices
                    if 'valItemInfo' in obj_json.keys():
                        if 'skuMap' in obj_json['valItemInfo'].keys():
                            sku_maps = obj_json['valItemInfo']['skuMap']
                            prices = set()
                            for k, v in sku_maps.items():
                                prices.add(v['price'])
                            product['prices'] = list(prices)

                    # choices (sku map annotated with color names)
                    product['choices'] = self.format_sku(sku_list, sku_maps)

                print(product)
                self.db.update_product(product)
            except Exception as e:
                print(e)
                raise  # bare raise keeps the original traceback

    def format_sku(self, sku_list, sku_maps):
        """Annotate each sku_maps entry with its human-readable name.

        Maps each sku's 'pvs' key to its stripped 'names' value, then adds
        that value under 'color' to the matching sku_maps entry (keys are
        matched after stripping surrounding ';').

        NOTE: mutates and returns sku_maps.
        """
        pvs_to_name = dict()
        for sku in sku_list:
            pvs_to_name[sku['pvs']] = sku["names"].strip()
        for k in sku_maps:
            key = k.strip(";")
            if key in pvs_to_name:
                sku_maps[k]['color'] = pvs_to_name[key]
        return sku_maps