def iterate_product_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()
    db_params = DBParams()
    db_params.host = "172.16.8.149"
    db_params.port = "3306"
    db_params.user = "******"
    db_params.passwd = "123456"
    db_params.db = "test"
    conn = get_param_conn(db_params)
    if conn is None:
        print("No such database")
        return False
    cur = conn.cursor()
    pudao = ProductUrlsDao(conn, cur)
    pdao = ProductDao(conn, cur)
    pddao = ProductDescDao(conn, cur)
    pidao = ProductImagesDao(conn, cur)
    psdao = ProductSkuDao(conn, cur)
    product_urls = pudao.get_need_spider_urls()
    for url_item in product_urls:
        # url_item is a DB row; judging by the indices used here, url_item[0]
        # is the record id, url_item[2] the brand, and url_item[3] the url.
        page_response = opener.visit_url(url_item[3], None)
        page_html = page_response.read()
        result = analysis_page_to_product(page_html, url_item, pdao, pidao,
                                          pddao, psdao, pudao, opener,
                                          url_item[2])
        # Mark the url as crawled (or failed) according to the parse result
        pudao.sign_spider(url_item[0], result)
    cur.close()
    conn.close()
def get_spider_product_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()
    db_params = DBParams()
    db_params.host = "172.16.8.149"
    db_params.port = "3306"
    db_params.user = "******"
    db_params.passwd = "123456"
    db_params.db = "test"
    conn = get_param_conn(db_params)
    if conn is None:
        print("No such database")
        return False
    cur = conn.cursor()
    pudao = ProductUrlsDao(conn, cur)
    brand_url_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\brand_urls_20171010.txt",
        "r")
    brand_urls = brand_url_file.readlines()
    brand_url_file.close()
    for url_str in brand_urls:
        # Each input line is "brand_name|listing_url"
        url_item_arr = url_str.split("|")
        brand_name = url_item_arr[0].strip()
        url = url_item_arr[1].strip()
        page_response = opener.visit_url(url, None)
        page_html = page_response.read()
        save_product_urls(brand_name, page_html, pudao)
        # test_html = open("C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\test.html", "w")
        # test_html.write(page_html)
        # Read the total style count from the listing page, then page through
        # the remaining results 36 items at a time
        styles_num = int(
            re.findall(
                r'<span data-bind="text: formatedNumber" class="total-results">([0-9,]*)</span>',
                page_html)[0].replace(",", ""))
        # Divide by 36.0, not 36: under Python 2, integer division would
        # floor before math.ceil runs and drop the last partial page
        pages_num = int(math.ceil(styles_num / 36.0))
        if pages_num > 1:
            for i in range(1, pages_num):
                page_url = "".join([url, "&pge=", str(i), "&pgesize=36"])
                page_response = opener.visit_url(page_url, None)
                page_html = page_response.read()
                save_product_urls(brand_name, page_html, pudao)
    cur.close()
    conn.close()
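# Worked example of the paging arithmetic above; the style count here is
# hypothetical, chosen only to illustrate the calculation.
def _demo_page_count(styles_num=100):
    # 100 styles at 36 per page -> math.ceil(100 / 36.0) == 3 pages. The base
    # url already returned the first page, so the remaining requests use
    # pge=1 and pge=2 (ASOS's pge parameter is zero-indexed, as the loop in
    # get_spider_product_urls implies).
    pages_num = int(math.ceil(styles_num / 36.0))
    return range(1, pages_num)  # -> [1, 2]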
def get_spider_brand_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    # opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()
    brand_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\asos_brands_men.txt",
        "r")
    brand_url_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\brand_urls_men.txt",
        "a")
    brands = brand_file.readlines()
    page_response = opener.visit_url(
        "http://www.asos.com/men/a-to-z-of-brands/cat/?cid=1361", None)
    page_html = page_response.read().lower()
    # test_html = open("C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\test.html", "w")
    # test_html.write(page_html)
    for brand in brands:
        brand_str = brand.strip().lower()
        # Match anchors whose link text contains the brand name and capture
        # the href, as demonstrated after this function
        r_str = "".join(
            ['href="(http://www.asos.com/[^"]*)"[^<]*', brand_str, "[^<]*<"])
        match_strs = re.findall(r_str, page_html)
        # Join all matched urls with commas; write an empty field when the
        # brand was not found on the page
        matched = ",".join(
            str(match_item) for match_item in match_strs) if match_strs else ""
        brand_url_file.write("".join([brand.strip(), "|", matched, "\n"]))
    brand_file.close()
    brand_url_file.close()
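# A small demonstration of the brand-matching pattern above, run against a
# hypothetical anchor tag (the HTML snippet is invented for illustration):
def _demo_brand_match():
    sample = '<a href="http://www.asos.com/adidas/cat/?cid=123">adidas</a>'.lower()
    r_str = "".join(
        ['href="(http://www.asos.com/[^"]*)"[^<]*', "adidas", "[^<]*<"])
    return re.findall(r_str, sample)  # -> ['http://www.asos.com/adidas/cat/?cid=123']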
class BaseUrlSpider(object):
    def __init__(self):
        # Initialize the database connection
        db_params = DBParams()
        db_params.host = "172.16.8.147"
        db_params.port = "3306"
        db_params.user = "******"
        db_params.passwd = "123456"
        db_params.db = "spider2"
        conn = get_param_conn(db_params)
        if conn is None:
            # __init__ must not return a value (the original "return False"
            # would raise a TypeError), so fail fast instead
            raise RuntimeError("No such database")
        cur = conn.cursor()
        self.PUDAO = ProductUrlDao(conn, cur)
        # Subclasses are expected to set these before calling init_opener
        self.HOSTURL = None
        self.COOKIEFILE = None
        self.LOGFILE = None

    # Shared logging helper
    def log_info(self, info):
        file = open(self.LOGFILE, "a")
        file.write("".join([info, "\n"]))
        file.close()

    # Initialize the network opener
    def init_opener(self):
        self.OPENER = NetOpener(self.HOSTURL, self.COOKIEFILE)
        self.OPENER.init_cookie()
        self.OPENER.load_cookie()
        self.OPENER.load_opener()

    # Visit a url; note this retries forever on failure -- see the bounded
    # variant sketched after BaseProductSpider below
    def do_visit(self, url):
        try:
            rsp = self.OPENER.visit_url(url, None)
            return rsp.read()
        except Exception:
            return self.do_visit(url)

    # Save a product url unless it has been seen before
    def save_url(self, pu):
        if self.PUDAO.is_exists_product_url(pu.source_code):
            print "".join([str(pu.source_code), " already exists!"])
        else:
            self.PUDAO.save(pu)
            print "".join([str(pu.source_code), " saved!"])
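# A minimal sketch of how a concrete spider might subclass BaseUrlSpider; the
# class name, host url, and file paths below are hypothetical.
class AsosUrlSpider(BaseUrlSpider):
    def __init__(self):
        BaseUrlSpider.__init__(self)
        self.HOSTURL = "http://www.asos.com"
        self.COOKIEFILE = "asos_cookie.txt"
        self.LOGFILE = "./log/asos_url_log.txt"
        self.init_opener()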
def init_opener():
    global OPENER
    OPENER = NetOpener("http://www.asos.com/women/", "asos_cookie.txt")
    OPENER.init_cookie()
    OPENER.load_cookie()
    OPENER.load_opener()
class GGTranslater(object):
    def __init__(self):
        # Initialize the network opener
        self.opener = NetOpener("https://translate.google.cn/", "gg_cookie.txt")
        self.opener.init_cookie()
        self.opener.load_cookie()
        self.opener.load_opener()
        # Compile Google's request-token (tk) algorithm, lifted from the
        # translate page's JavaScript
        self.gg_tk_js = execjs.compile("""
            var b = function (a, b) {
                for (var d = 0; d < b.length - 2; d += 3) {
                    var c = b.charAt(d + 2),
                        c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c),
                        c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
                    a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
                }
                return a
            }
            var tk = function (a, TKK) {
                for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
                    var c = a.charCodeAt(f);
                    128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
                }
                a = h;
                for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
                a = b(a, "+-3^+b+-f");
                a ^= Number(e[1]) || 0;
                0 > a && (a = (a & 2147483647) + 2147483648);
                a %= 1E6;
                return a.toString() + "." + (a ^ h)
            }
        """)
        self.tkk = ""
        self.jsondecoder = JSONDecoder()

    # Fetch the TKK seed that parameterizes the token generator
    def init_tkk(self):
        page_html = self.opener.visit_url("https://translate.google.cn/", None)
        page_str = page_html.read()
        tkk_str = re.findall(r'TKK=(eval\(\'[^\']*\'\))', page_str)[0]
        tkk_mtd = "".join(["var tkk = function () { return ", tkk_str, "}"])
        tkk_js = execjs.compile(tkk_mtd)
        self.tkk = tkk_js.call("tkk")

    # Generate the request token for a source string
    def get_tk(self, source):
        return self.gg_tk_js.call("tk", source, self.tkk)

    # Translate English to Chinese; the public entry point
    def en_to_zh(self, source):
        self.init_tkk()
        tk = self.get_tk(source)
        params = {"tk": tk, "q": source}
        # Translation endpoint
        url = "https://translate.google.cn/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex" \
              "&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&otf=1&ssel=0&tsel=0&kc=5" \
              "&" + urllib.urlencode(params)
        # Fetch and decode the translation result
        rsp = self.opener.visit_url(url, None).read()
        rjson = self.jsondecoder.decode(rsp)
        results = rjson[0]
        rarr = []
        # Collect the translated segments
        for ritem in results:
            ritem_str = ritem[0]
            if ritem_str and ritem_str != "":
                rarr.append(ritem_str.strip().encode("utf-8"))
        return "\n".join(rarr)
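if __name__ == "__main__":
    # Usage sketch: translate a hypothetical product description. Whether the
    # endpoint still responds as it did when this was written is an
    # assumption.
    translater = GGTranslater()
    print translater.en_to_zh("Slim fit cotton shirt")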
class BaseProductSpider(object):
    def __init__(self):
        # Initialize the database connection
        db_params = DBParams()
        db_params.host = "172.16.8.147"
        db_params.port = "3306"
        db_params.user = "******"
        db_params.passwd = "123456"
        db_params.db = "spider2"
        conn = get_param_conn(db_params)
        if conn is None:
            # __init__ must not return a value (the original "return False"
            # would raise a TypeError), so fail fast instead
            raise RuntimeError("No such database")
        cur = conn.cursor()
        self.PDAO = ProductDao(conn, cur)
        self.PDDAO = ProductDescDao(conn, cur)
        self.PIDAO = ProductImagesDao(conn, cur)
        self.PSDAO = ProductSkuDao(conn, cur)
        self.OPENER = None
        # HOSTURL was missing from the original __init__ even though
        # init_opener reads it; subclasses must set it
        self.HOSTURL = None
        self.COOKIEFILE = None
        self.LOGFILE = None

    # Initialize the network opener
    def init_opener(self):
        self.OPENER = NetOpener(self.HOSTURL, self.COOKIEFILE)
        self.OPENER.init_cookie()
        self.OPENER.load_cookie()
        self.OPENER.load_opener()

    # Shared logging helper
    def log_info(self, info):
        file = open(self.LOGFILE, "a")
        file.write("".join([info, "\n"]))
        file.close()

    # Shared url visit; retries forever on failure
    def do_visit(self, url):
        try:
            rsp = self.OPENER.visit_url(url, None)
            return rsp.read()
        except Exception:
            return self.do_visit(url)

    # Fetch a url and parse the response as JSON; also retries forever
    def get_json(self, url):
        try:
            rsp = self.do_visit(url)
            return json.loads(rsp)
        except Exception:
            return self.get_json(url)

    # Save a product unless one with the same resource_code already exists.
    # (Call self.PDAO.update_product_info instead if updates are needed.)
    def save_product(self, product):
        exists_id = self.PDAO.get_id_by_code(product.resource_code)
        if exists_id is not None:
            product.spider_product_id = exists_id
            print "".join(
                [str(exists_id), ":", str(product.resource_code), " exists!"])
        else:
            # Save the product record
            product.spider_product_id = self.PDAO.save(product)
            print "".join([str(product.spider_product_id)])

    def save_product_desc(self, product):
        # Skip descriptions that already exist. (Call
        # self.PDDAO.update_product_desc to refresh an existing one.)
        if self.PDDAO.is_exists_product_desc(product.spider_product_id,
                                             product.language_id):
            print "".join([
                str(product.spider_product_id), ":", product.language_id,
                " desc already exists!"
            ])
        else:
            self.PDDAO.save(product)

    def save_product_images(self, pimg):
        # Skip image records that already exist for this product
        if self.PIDAO.is_exists_product_images(pimg.spid):
            print "".join([str(pimg.spid), " images exist!"])
        else:
            self.PIDAO.save(pimg)

    def save_product_skus(self, psku):
        # Skip size records that already exist for this product
        if self.PSDAO.get_id_by_spid_size(psku) is not None:
            print "".join(
                [str(psku.spid), ":", str(psku.size), " size already exists!"])
        else:
            self.PSDAO.save(psku)
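# The do_visit/get_json methods above retry without bound; a persistent
# failure recurses until the stack overflows. A bounded variant along these
# lines is safer (the function name and retry count are assumptions, not
# part of the original design):
def do_visit_bounded(spider, url, retries=3):
    # spider is any BaseProductSpider/BaseUrlSpider instance with an OPENER
    for attempt in range(retries):
        try:
            return spider.OPENER.visit_url(url, None).read()
        except Exception:
            spider.log_info("".join(["visit failed: ", url]))
    return None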
# -*- coding: UTF-8 -*-
import sys
sys.path.append("..")
import multiprocessing
import time
import os
from db.dbconnecter import get_param_conn
from db.daos import ProductDao, ProductImagesDao
from db.models import DBParams
from netvisiter.net_openner import NetOpener

# Enable a different OPENER for each site whose images are being downloaded
# OPENER = NetOpener("https://www.net-a-porter.com/it/zh/", "netaporter_cookie.txt")
OPENER = NetOpener("https://www.farfetch.cn", "farfetch_cookie.txt")
# OPENER = NetOpener("https://www.reiss.com", "reiss_cookie.txt")
# OPENER = NetOpener("https://www.tedbaker.com", "tedbaker_cookie.txt")
# OPENER = NetOpener("http://www.jackwills.com", "jackwills_cookie.txt")
OPENER.init_cookie()
OPENER.load_cookie()
OPENER.load_opener()


# Logging helper
def log_info(info):
    file = open("./log/image_log.txt", "a")
    file.write("".join([info, "\n"]))
    file.close()


# Image download implementation: on error it calls itself to retry, gives up
# after three consecutive failures, and records the failure in the log
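# A sketch of the download function the comment above describes: retry on
# error by calling itself, give up after three consecutive failures and
# record the url in the log. The name, signature, and save logic are
# assumptions; the original body is not shown in this section.
def down_image(url, save_path, err_count=0):
    if err_count >= 3:
        log_info("".join(["download failed: ", url]))
        return False
    try:
        rsp = OPENER.visit_url(url, None)
        data = rsp.read()
        image_file = open(save_path, "wb")
        image_file.write(data)
        image_file.close()
        return True
    except Exception:
        return down_image(url, save_path, err_count + 1)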