Code example #1
def iterate_product_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()

    db_params = DBParams()
    db_params.host = "172.16.8.149"
    db_params.port = "3306"
    db_params.user = "******"
    db_params.passwd = "123456"
    db_params.db = "test"
    conn = get_param_conn(db_params)
    if conn is None:
        print("没有此数据库")
        return False
    cur = conn.cursor()
    pudao = ProductUrlsDao(conn, cur)
    pdao = ProductDao(conn, cur)
    pddao = ProductDescDao(conn, cur)
    pidao = ProductImagesDao(conn, cur)
    psdao = ProductSkuDao(conn, cur)

    product_urls = pudao.get_need_spider_urls()
    for url_item in product_urls:
        page_response = opener.visit_url(url_item[3], None)
        page_html = page_response.read()
        result = analysis_page_to_product(page_html, url_item, pdao, pidao,
                                          pddao, psdao, pudao, opener,
                                          url_item[2])
        pudao.sign_spider(url_item[0], result)
    cur.close()
    conn.close()
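
Every example constructs a NetOpener (imported in code example #10 from netvisiter.net_openner) and calls init_cookie, load_cookie, load_opener, and visit_url on it, but the class itself is not shown. A minimal sketch of the interface these calls imply, assuming a plain urllib2/cookielib implementation (the method bodies are reconstructions, not the repo's actual code):

import cookielib
import urllib2


class NetOpener(object):
    def __init__(self, host_url, cookie_file):
        self.host_url = host_url
        self.cookie_file = cookie_file
        self.cookiejar = None
        self.opener = None

    def init_cookie(self):
        # Visit the host once so the site sets its cookies, then save them
        self.cookiejar = cookielib.LWPCookieJar(self.cookie_file)
        opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookiejar))
        opener.open(self.host_url)
        self.cookiejar.save(ignore_discard=True, ignore_expires=True)

    def load_cookie(self):
        # Reload previously saved cookies from disk
        self.cookiejar = cookielib.LWPCookieJar(self.cookie_file)
        self.cookiejar.load(ignore_discard=True, ignore_expires=True)

    def load_opener(self):
        # Build the cookie-aware opener that visit_url() uses
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookiejar))

    def visit_url(self, url, data):
        # Returns a file-like response; callers read() the body
        request = urllib2.Request(url, data)
        request.add_header("User-Agent", "Mozilla/5.0")
        return self.opener.open(request)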
Code example #2
def get_spider_product_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()

    db_params = DBParams()
    db_params.host = "172.16.8.149"
    db_params.port = "3306"
    db_params.user = "******"
    db_params.passwd = "123456"
    db_params.db = "test"
    conn = get_param_conn(db_params)
    if conn is None:
        print("没有此数据库")
        return False
    cur = conn.cursor()
    pudao = ProductUrlsDao(conn, cur)

    brand_url_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\brand_urls_20171010.txt",
        "r")
    brand_urls = brand_url_file.readlines()
    for url_str in brand_urls:
        url_item_arr = url_str.split("|")
        brand_name = url_item_arr[0].strip()
        url = url_item_arr[1].strip()
        page_response = opener.visit_url(url, None)
        page_html = page_response.read()
        save_product_urls(brand_name, page_html, pudao)
        # test_html = open("C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\test.html", "w")
        # test_html.write(page_html)
        styles_num = int(
            re.findall(
                r'<span data-bind="text: formatedNumber" class="total-results">([0-9,]*)</span>',
                page_html)[0].replace(",", ""))
        # Float division so ceil() counts the final partial page too
        # (integer division here silently dropped the last page in Python 2)
        pages_num = int(math.ceil(styles_num / 36.0))
        if pages_num > 1:
            for i in range(1, pages_num):
                page_url = "".join([url, "&pge=", str(i), "&pgesize=36"])
                page_response = opener.visit_url(page_url, None)
                page_html = page_response.read()
                save_product_urls(brand_name, page_html, pudao)

    brand_url_file.close()
    cur.close()
    conn.close()
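
The pagination math above assumes 36 products per page: styles_num is parsed from the total-results span, and the initial request already returned page 0, so the loop only fetches pge=1 through pages_num-1. A small worked example (the URL is a made-up placeholder):

import math

styles_num = 100  # value parsed from the total-results span
pages_num = int(math.ceil(styles_num / 36.0))  # ceil(100/36) -> 3
url = "http://www.asos.com/example-brand/cat/?cid=0"  # hypothetical brand URL
# Page 0 came from the initial visit; only pge=1 and pge=2 remain
for i in range(1, pages_num):
    print "".join([url, "&pge=", str(i), "&pgesize=36"])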
Code example #3
def get_spider_brand_urls():
    opener = NetOpener("http://www.asos.com", "asos_cookie.txt")
    # opener.init_cookie()
    opener.load_cookie()
    opener.load_opener()

    brand_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\asos_brands_men.txt",
        "r")
    brand_url_file = open(
        "C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\brand_urls_men.txt",
        "a")
    brands = brand_file.readlines()

    page_response = opener.visit_url(
        "http://www.asos.com/men/a-to-z-of-brands/cat/?cid=1361", None)
    page_html = page_response.read().lower()
    # test_html = open("C:\\Users\\Administrator\\Desktop\\spidertemp\\asos\\test.html", "w")
    # test_html.write(page_html)

    for brand in brands:
        brand_str = brand.strip().lower()
        r_str = "".join(
            ['href="(http://www.asos.com/[^"]*)"[^<]*', brand_str, "[^<]*<"])
        match_strs = re.findall(r_str, page_html)
        match_strs = (",".join(str(match_item) for match_item in match_strs)
                      if match_strs else "")
        brand_url_file.write("".join([brand.strip(), "|", match_strs, "\n"]))

    brand_file.close()
    brand_url_file.close()
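
The pattern built in r_str looks for an anchor whose href points at asos.com and whose link text contains the brand name (the page HTML was lower-cased first). A self-contained illustration with a made-up HTML fragment:

import re

page_html = ('<a href="http://www.asos.com/men/adidas/cat/?cid=1">'
             'shop adidas</a>').lower()  # hypothetical page fragment
brand_str = "adidas"
r_str = "".join(
    ['href="(http://www.asos.com/[^"]*)"[^<]*', brand_str, "[^<]*<"])
print re.findall(r_str, page_html)
# -> ['http://www.asos.com/men/adidas/cat/?cid=1']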
Code example #4
File: translate.py  Project: hhzgit/ffspider
 def __init__(self):
     # Initialize the network access helper
     self.opener = NetOpener("https://translate.google.cn/",
                             "gg_cookie.txt")
     self.opener.init_cookie()
     self.opener.load_cookie()
     self.opener.load_opener()
     # Load the JS routine that generates Google's request token (tk)
     self.gg_tk_js = execjs.compile("""
         var b = function (a, b) {
             for (var d = 0; d < b.length - 2; d += 3) {
                 var c = b.charAt(d + 2),
                     c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c),
                     c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
                 a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
             }
             return a
         }
         
         var tk =  function (a,TKK) {
             //console.log(a,TKK);
             for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
                 var c = a.charCodeAt(f);
                 128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
             }
             a = h;
             for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
             a = b(a, "+-3^+b+-f");
             a ^= Number(e[1]) || 0;
             0 > a && (a = (a & 2147483647) + 2147483648);
             a %= 1E6;
             return a.toString() + "." + (a ^ h)
         }
     """)
     self.tkk = ""
     self.jsondecoder = JSONDecoder()
Code example #5
File: base_url_spider.py  Project: hhzgit/ffspider
class BaseUrlSpider(object):
    def __init__(self):
        # Initialize the database configuration
        db_params = DBParams()
        db_params.host = "172.16.8.147"
        db_params.port = "3306"
        db_params.user = "******"
        db_params.passwd = "123456"
        db_params.db = "spider2"
        conn = get_param_conn(db_params)
        if conn is None:
            print("Database not found")
            # __init__ can't return a value; bail out with a bare return
            return
        cur = conn.cursor()
        self.PUDAO = ProductUrlDao(conn, cur)

        self.HOSTURL = None
        self.COOKIEFILE = None
        self.LOGFILE = None

    # Shared logging helper
    def log_info(self, info):
        log_file = open(self.LOGFILE, "a")
        log_file.write("".join([info, "\n"]))
        log_file.close()

    # Initialize the network access helper
    def init_opener(self):
        self.OPENER = NetOpener(self.HOSTURL, self.COOKIEFILE)
        self.OPENER.init_cookie()
        self.OPENER.load_cookie()
        self.OPENER.load_opener()

    # Visit a URL and return the response body
    def do_visit(self, url):
        try:
            rsp = self.OPENER.visit_url(url, None)
            return rsp.read()
        except Exception:
            # Retry on failure (note: this recursion is unbounded)
            return self.do_visit(url)

    # Save a product URL
    def save_url(self, pu):
        if self.PUDAO.is_exists_product_url(pu.source_code):
            print "".join([str(pu.source_code), " is exists!"])
        else:
            self.PUDAO.save(pu)
            print "".join([str(pu.source_code), " saved!"])
Code example #6
def init_opener():
    global OPENER
    OPENER = NetOpener("http://www.asos.com/women/", "asos_cookie.txt")
    OPENER.init_cookie()
    OPENER.load_cookie()
    OPENER.load_opener()
Code example #7
File: base_url_spider.py  Project: hhzgit/ffspider
 def init_opener(self):
     self.OPENER = NetOpener(self.HOSTURL, self.COOKIEFILE)
     self.OPENER.init_cookie()
     self.OPENER.load_cookie()
     self.OPENER.load_opener()
Code example #8
File: translate.py  Project: hhzgit/ffspider
class GGTranslater(object):
    def __init__(self):
        # Initialize the network access helper
        self.opener = NetOpener("https://translate.google.cn/",
                                "gg_cookie.txt")
        self.opener.init_cookie()
        self.opener.load_cookie()
        self.opener.load_opener()
        # Load the JS routine that generates Google's request token (tk)
        self.gg_tk_js = execjs.compile("""
            var b = function (a, b) {
                for (var d = 0; d < b.length - 2; d += 3) {
                    var c = b.charAt(d + 2),
                        c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c),
                        c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
                    a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
                }
                return a
            }
            
            var tk =  function (a,TKK) {
                //console.log(a,TKK);
                for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
                    var c = a.charCodeAt(f);
                    128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
                }
                a = h;
                for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
                a = b(a, "+-3^+b+-f");
                a ^= Number(e[1]) || 0;
                0 > a && (a = (a & 2147483647) + 2147483648);
                a %= 1E6;
                return a.toString() + "." + (a ^ h)
            }
        """)
        self.tkk = ""
        self.jsondecoder = JSONDecoder()

    # Fetch the token seed parameter TKK from the translate homepage
    def init_tkk(self):
        page_html = self.opener.visit_url("https://translate.google.cn/", None)
        page_str = page_html.read()
        tkk_str = re.findall(r'TKK=(eval\(\'[^\']*\'\))', page_str)[0]
        tkk_mtd = "".join(["var tkk = function () { return ", tkk_str, "}"])
        tkk_js = execjs.compile(tkk_mtd)
        self.tkk = tkk_js.call("tkk")

    # Generate the request token (tk) for a source string
    def get_tk(self, source):
        return self.gg_tk_js.call("tk", source, self.tkk)

    # Translate English to Chinese; the public entry point
    def en_to_zh(self, source):
        self.init_tkk()
        tk = self.get_tk(source)
        params = {"tk": tk, "q": source}
        # Translation API URL
        url = "https://translate.google.cn/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex" \
              "&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&otf=1&ssel=0&tsel=0&kc=5" \
              "&" + urllib.urlencode(params)
        # Fetch and decode the translation result
        rsp = self.opener.visit_url(url, None).read()
        rjson = self.jsondecoder.decode(rsp)
        results = rjson[0]
        rarr = []
        # Collect the translated segments
        for ritem in results:
            ritem_str = ritem[0]
            if ritem_str and ritem_str != "":
                rarr.append(ritem_str.strip().encode("utf-8"))
        return "\n".join(rarr)
Code example #9
class BaseProductSpider(object):
    def __init__(self):
        # Initialize database parameters
        db_params = DBParams()
        db_params.host = "172.16.8.147"
        db_params.port = "3306"
        db_params.user = "******"
        db_params.passwd = "123456"
        db_params.db = "spider2"
        conn = get_param_conn(db_params)
        if conn is None:
            print("Database not found")
            # __init__ can't return a value; bail out with a bare return
            return
        cur = conn.cursor()
        self.PDAO = ProductDao(conn, cur)
        self.PDDAO = ProductDescDao(conn, cur)
        self.PIDAO = ProductImagesDao(conn, cur)
        self.PSDAO = ProductSkuDao(conn, cur)

        # HOSTURL must be set by a subclass before init_opener() is called
        self.HOSTURL = None
        self.OPENER = None
        self.COOKIEFILE = None
        self.LOGFILE = None

    # Initialize the network access helper
    def init_opener(self):
        self.OPENER = NetOpener(self.HOSTURL, self.COOKIEFILE)
        self.OPENER.init_cookie()
        self.OPENER.load_cookie()
        self.OPENER.load_opener()

    # Shared logging helper
    def log_info(self, info):
        log_file = open(self.LOGFILE, "a")
        log_file.write("".join([info, "\n"]))
        log_file.close()

    # Shared URL access method
    def do_visit(self, url):
        try:
            rsp = self.OPENER.visit_url(url, None)
            return rsp.read()
        except Exception:
            # Retry on failure (note: this recursion is unbounded)
            return self.do_visit(url)

    # Shared helper for requests that return JSON
    def get_json(self, url):
        try:
            rsp = self.do_visit(url)
            return json.loads(rsp)
        except Exception:
            # Retry on failure (also retries if the response isn't valid JSON)
            return self.get_json(url)

    # Shared product save method
    def save_product(self, product):
        exists_id = self.PDAO.get_id_by_code(product.resource_code)
        # If the product already exists (same resource_code), skip it.
        # (To update its info instead, call self.PDAO.update_product_info.)
        if exists_id is not None:
            product.spider_product_id = exists_id
            print "".join(
                [str(exists_id), ":",
                 str(product.resource_code), " exists!"])
        else:
            # Save the product info
            product.spider_product_id = self.PDAO.save(product)
            print "".join([str(product.spider_product_id)])

    def save_product_desc(self, product):
        # If the product description already exists, skip it.
        # (To update it instead, call self.PDDAO.update_product_desc.)
        if self.PDDAO.is_exists_product_desc(product.spider_product_id,
                                             product.language_id):
            print "".join([
                str(product.spider_product_id), ":", product.language_id,
                " desc is exists!"
            ])
        else:
            # Save the product description
            self.PDDAO.save(product)

    def save_product_images(self, pimg):
        # If image info for this product already exists, skip it
        if self.PIDAO.is_exists_product_images(pimg.spid):
            print "".join([str(pimg.spid), " images exists!"])
        else:
            # Save the product image info
            self.PIDAO.save(pimg)

    def save_product_skus(self, psku):
        # If size info for this product already exists, skip it
        if self.PSDAO.get_id_by_spid_size(psku) is not None:
            print "".join(
                [str(psku.spid), ":",
                 str(psku.size), " size is exists!"])
        else:
            # Save the product SKU/size info
            self.PSDAO.save(psku)
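
BaseProductSpider leaves HOSTURL, COOKIEFILE, and LOGFILE for subclasses to fill in before init_opener is called. A hypothetical subclass sketch (the site values are placeholders, not from the repo):

class AsosProductSpider(BaseProductSpider):
    def __init__(self):
        BaseProductSpider.__init__(self)
        # Placeholder connection settings for this hypothetical subclass
        self.HOSTURL = "http://www.asos.com"
        self.COOKIEFILE = "asos_cookie.txt"
        self.LOGFILE = "./log/asos_log.txt"
        self.init_opener()

    def run(self, url):
        page_html = self.do_visit(url)
        # ...parse page_html into product/desc/image/sku objects here,
        # then persist them with the save_* helpers above...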
Code example #10
# -*- coding: UTF-8 -*-
import sys

sys.path.append("..")
import multiprocessing
import time
import os
from db.dbconnecter import get_param_conn
from db.daos import ProductDao, ProductImagesDao
from db.models import DBParams
from netvisiter.net_openner import NetOpener

# Use a different OPENER depending on which site's images are being downloaded
# OPENER = NetOpener("https://www.net-a-porter.com/it/zh/", "netaporter_cookie.txt")
OPENER = NetOpener("https://www.farfetch.cn", "farfetch_cookie.txt")
# OPENER = NetOpener("https://www.reiss.com", "reiss_cookie.txt")
# OPENER = NetOpener("https://www.tedbaker.com", "tedbaker_cookie.txt")
# OPENER = NetOpener("http://www.jackwills.com", "jackwills_cookie.txt")
OPENER.init_cookie()
OPENER.load_cookie()
OPENER.load_opener()


# Logging helper
def log_info(info):
    log_file = open("./log/image_log.txt", "a")
    log_file.write("".join([info, "\n"]))
    log_file.close()


# Image download implementation: on error it calls itself to retry; after 3
# consecutive failures it gives up and writes the failure to the log
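
The function body is truncated here; based on the comment above, a sketch of the described behavior (retry on error, give up and log after 3 consecutive failures; the function and parameter names are assumptions):

def download_image(image_url, save_path, err_count=0):
    if err_count >= 3:
        # Three consecutive failures: give up and record the URL
        log_info("".join(["download failed: ", image_url]))
        return False
    try:
        rsp = OPENER.visit_url(image_url, None)
        image_file = open(save_path, "wb")
        image_file.write(rsp.read())
        image_file.close()
        return True
    except Exception:
        # Retry by calling itself with an incremented error count
        return download_image(image_url, save_path, err_count + 1)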