Example #1
 def __init__(self):
     self.encoding = "utf-8"
     self.mallUrl = 'http://www.smzdm.com/mall'
     self.hide_malls = [
         'ebay', 'dell', 'microsoftstore', 'newegg', 'amazon_jp', 'xiji',
         'sfht', 'mi', 'amazon_de', 'joesnewbalanceoutlet',
         'sierratradingpost', 'amazon_fr', 'kaola', 'myhabit',
         'nikestore_cn', 'ehaier', 'midea', 'jd_hk', 'royyoungchemist_cn',
         'amcal_cn', 'bubugao', 'supuy', 'muyingzhijia', 'daling', 'sasa',
         'amazon_es', '6pm', 'finishline', 'wiggle', 'jimmyjazz'
     ]
     self.dict_country = {
         '美国': 227,
         '日本': 109,
         '英国': 226,
         '德国': 82,
         '澳大利亚': 13,
         '西班牙': 198,
         '香港': 97,
         '法国': 74
     }
     self.imgSaveRoot = 'E:\\wiki_img'
     self.file_tool = File_Tool()
     self.db = SMZDM_Mysql()
     self.myTool = HTML_Tool()
Example #2
 def __init__(self):
     # Attribute types do not need to be declared explicitly
     self.encoding = "utf-8"
     self.homeUrl = 'http://wiki.smzdm.com/'
     self.db = SMZDM_Mysql()
     self.myTool = HTML_Tool()
     self.categories = []
     self.thread_num = 10
Example #3
    def __init__(self):
        # Attribute types do not need to be declared explicitly
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/youxuan/'
        self.tagUrl = 'http://wiki.smzdm.com/t'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

        self.old_tags = {}
Example #4
    def __init__(self):
        self.encoding = "utf-8"
        self.homeUrl = 'http://pinpai.smzdm.com/'
        self.imgSaveRoot = 'E:\\wiki_img'
        self.file_tool = File_Tool()
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

        self.countries = {}
        self.categories = {}
Example #5
    def __init__(self):
        # Attribute types do not need to be declared explicitly
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()
        self.categories = []
        self.thread_num = 10

        self.page_item_size = 20
        self.wiki_items = []

        # Only crawl featured (youxuan) wiki items
        self.is_excellent = False
Example #6
class Tag_Spider:
    # Declare the relevant attributes; this serves as the constructor
    def __init__(self):
        # Attribute types do not need to be declared explicitly
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/youxuan/'
        self.tagUrl = 'http://wiki.smzdm.com/t'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

        self.old_tags = {}

    # Define methods
    def test_print(self):
        print 'hello world'

    def get_tags_hot(self):
        list_tag = self.db.get_tags()
        for tag in list_tag:
            self.old_tags[tag[1]] = tag[0]

    def spider_start(self):
        print u'已经启动Tag 爬虫,咔嚓咔嚓'
        self.db.init_db()
        #self.get_tags_hot()
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Send HTTP/1.0 requests; adding this works around the problem
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            # Handle the hot tags
            #self.splider_hot_tags(headers)

            # Handle ordinary tags, polling the urls
            self.splider_all_tags(headers)

            # Afterwards, switch back to HTTP/1.1
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as e:
            self.db.close_db()
            print Exception, ":", e
            return ''

        self.db.close_db()
        print u'Tag 爬虫服务运行结束.....'
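
A minimal sketch of how one of these spider classes would typically be driven, assuming the SMZDM_Mysql and HTML_Tool helpers used above are importable from their own modules (this entry point is not part of the original example):

# Hypothetical usage sketch (Python 2); not part of the original example.
if __name__ == '__main__':
    spider = Tag_Spider()
    spider.spider_start()
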
class Wiki_Splider:
    def __init__(self):
        # Attribute types do not need to be declared explicitly
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()
        self.categories = []
        self.thread_num = 10

        self.page_item_size = 20
        self.wiki_items = []

        # Only crawl featured (youxuan) wiki items
        self.is_excellent = False

    def prepare_categories(self):
        _categories = self.db.get_categories(2)
        for category in _categories:
            self.categories.append(category)

    def spider_start(self):
        print u'已经启动Wiki 爬虫,咔嚓咔嚓'
        self.db.init_db()
        self.prepare_categories()
        self.db.close_db()
        print u'共处理category数:' + str(len(self.categories))
        try:
            # Send HTTP/1.0 requests; adding this works around the problem
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

            # Process the product wiki lists
            self.splide_wiki()

            # Afterwards, switch back to HTTP/1.1
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as e:
            print Exception, ":", e
            return ''

        #self.db.close_db()
        print u'Wiki 爬虫服务运行结束.....'
    def splide_catenode_by_cate2(self, cates):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # One db connection per thread
        thread_db = SMZDM_Mysql()
        thread_db.init_db()
        node_results = []
        for cate in cates:
            current_cate_uri = cate[2]
            current_cate_id = cate[0]
            page_url = self.homeUrl + current_cate_uri
            print(u'开始处理页面:%s' % page_url)
            # Load the page
            req = urllib2.Request(page_url, headers=headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')

            dom_node_a = soup.select(
                'ul[class="search_result_ul"] > li[class="current"] > ul > li > a '
            )

            if dom_node_a:
                print(u'%s ,存在 %s 个四级分类' % (current_cate_uri, len(dom_node_a)))
                for item in dom_node_a:
                    node = {}
                    _href = item['href'].replace('/you', '').replace('/', '')
                    node['uri'] = _href
                    _text = self.myTool.Replace_Char(item.get_text().replace(
                        "\n", "").encode(self.encoding))
                    _pos = _text.find('(')
                    node['name'] = _text[:_pos]
                    node['parent_id'] = current_cate_id
                    node['level'] = 3
                    node_results.append(node)

        #print json.dumps(node_results,ensure_ascii=False)
        #print node_results
        self.insert_db(node_results, thread_db)
        thread_db.close_db()
    def splide_wikiurl_by_cates(self, cates):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # One db connection per thread
        thread_db = SMZDM_Mysql()
        thread_db.init_db()
        #wiki_results = []
        for cate in cates:
            current_cate_uri = cate[2]
            current_cate_id = cate[0]
            page_url = self.homeUrl + current_cate_uri
            if self.is_excellent:
                page_url = self.homeUrl + current_cate_uri + '/you'
            print(u'开始处理页面:%s' % page_url)
            # Load the page
            req = urllib2.Request(page_url, headers=headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')

            # Compute the number of pages
            item_num = 0
            dom_item_numb_div = soup.select(
                'div[class*="right_wrap"] > div[class*="right_top_title"] > div[class="total_pro"]'
            )
            if dom_item_numb_div:
                item_numb_text = dom_item_numb_div[0].get_text()
                #print item_numb_text
                if self.is_excellent:
                    item_num = int(
                        item_numb_text.replace('共', '').replace('条优选产品', ''))
                else:
                    item_num = int(
                        item_numb_text.replace('共', '').replace('条产品', ''))

            if not item_num:
                continue

            page_numb = (item_num + self.page_item_size -
                         1) / self.page_item_size
            print(u'%s,共%s页,%s条记录' % (current_cate_uri, page_numb, item_num))
            # Collect the urls of the wiki_items
            page_urls = []
            # The current page, i.e. the first page
            page_urls.extend(self.splide_wiki_list_item_url(soup))
            # The following pages
            for i in range(2, page_numb + 1):
                #print u'开始处理第%s页'%i
                other_page_url = page_url + '/p' + str(i)
                other_req = urllib2.Request(other_page_url, headers=headers)
                other_page = urllib2.urlopen(other_req).read().decode(
                    self.encoding)
                other_soup = BeautifulSoup(other_page, 'lxml')
                page_urls.extend(self.splide_wiki_list_item_url(other_soup))

            # All wiki_items of the current cate
            wiki_items = []
            for item in page_urls:
                wiki_u = {}
                wiki_u['url'] = item
                wiki_u['cate'] = current_cate_id
                wiki_u['cate_uri'] = current_cate_uri
                wiki_items.append(wiki_u)

            print(u'%s,开始入库,共计%s条记录' % (current_cate_uri, len(wiki_items)))
            self.insert_db_2(wiki_items, thread_db)
            #wiki_results.extend(wiki_items)
        thread_db.close_db()
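
splide_wiki is called from spider_start above, but its body is not included in this excerpt. Given self.thread_num and the "one db connection per thread" comment, a plausible, purely assumed sketch is a driver that partitions self.categories across worker threads:

    # Hypothetical sketch of the splide_wiki driver referenced in spider_start;
    # this is an assumption, not the original implementation.
    def splide_wiki(self):
        import threading
        # Ceiling division: number of categories handled per thread.
        chunk = max(1, (len(self.categories) + self.thread_num - 1) / self.thread_num)
        threads = []
        for i in range(0, len(self.categories), chunk):
            t = threading.Thread(target=self.splide_wikiurl_by_cates,
                                 args=(self.categories[i:i + chunk],))
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
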
 def __init__(self):
     self.encoding = "utf-8"
     self.categoryUrl = 'http://wiki.smzdm.com/youxuan'
     self.myTool = HTML_Tool()
     self.db = SMZDM_Mysql()
class Categories_Spider:
    # Declare the relevant attributes
    def __init__(self):
        self.encoding = "utf-8"
        self.categoryUrl = 'http://wiki.smzdm.com/youxuan'
        self.myTool = HTML_Tool()
        self.db = SMZDM_Mysql()

    def test_print(self):
        print 'hello world'

    def spider_start(self):
        print u'已经启动Categories 爬虫,咔嚓咔嚓'
        self.db.init_db()
        # Read the raw page content and decode it from gbk
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # Handle the categories
        try:
            self.get_categoris(headers)
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_categoris call: " + ex.__str__())
            return ''

        self.db.close_db()
        print u'Categories 爬虫服务运行结束.....'

    # ------------------------- category handling
    def get_categoris(self, _headers):
        req = urllib2.Request(self.categoryUrl, headers=_headers)
        myPage = urllib2.urlopen(req).read().decode(self.encoding)
        soup = BeautifulSoup(myPage, 'lxml')

        li_list = soup.select('ul[id="left-category"] > li')
        # {name:name,parent_id:parent_id,level:level,uri:uri}
        for item in li_list:
            # Find level-1 categories: li div[class=li_item] > h2 > a
            node_1 = item.select('div[class="li_item"] > h2 > a')
            if node_1:
                #print node_1[0].contents[0].string
                #print node_1[0].get_text()
                category = self.hand_category_a(node_1[0])
                category['parent_id'] = -1
                category['level'] = 0
                self.db.insert_category(category)
                category['id'] = self.db.get_conn().insert_id()
                print json.dumps(category, ensure_ascii=False)

                # Handle level-2 and level-3 categories
                dl_dt_a = item.select('dl[class="sub_category"] > dt > a')
                dl_dd = item.select('dl[class="sub_category"] > dd')
                sub_for_index = 0
                if len(dl_dt_a) == len(dl_dd):
                    for sub_2 in dl_dt_a:
                        subcate_2 = self.hand_category_a(sub_2)
                        subcate_2['parent_id'] = category['id']
                        subcate_2['level'] = 1
                        self.db.insert_category(subcate_2)
                        subcate_2['id'] = self.db.get_conn().insert_id()
                        print "   " + json.dumps(subcate_2, ensure_ascii=False)
                        # Level-3 categories
                        dl_dd_a = dl_dd[sub_for_index].find_all('a')
                        sub_for_index += 1
                        for sub_3 in dl_dd_a:
                            subcate_3 = self.hand_category_a(sub_3)
                            subcate_3['parent_id'] = subcate_2['id']
                            subcate_3['level'] = 2
                            self.db.insert_category(subcate_3)
                            subcate_3['id'] = self.db.get_conn().insert_id()
                            print "      " + json.dumps(subcate_3,
                                                        ensure_ascii=False)
                # Remember to commit
                self.db.conn.commit()

    def hand_category_a(self, a):
        category = {}
        category['name'] = self.myTool.Replace_Char(a.get_text().replace(
            "\n", "").encode(self.encoding))
        href = a['href']
        pos = href[0:len(href) - 1].rfind('/')
        category['uri'] = href[1:pos]
        return category
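
The dictionaries handed to insert_category follow the {name, parent_id, level, uri} shape noted in the comment inside get_categoris; a hypothetical level-1 entry, with made-up values, would look like:

# Hypothetical example of a category dict (all values are made up).
subcate_2 = {
    'name': 'some-subcategory',      # cleaned text of the <a> tag
    'uri': 'some-subcategory-uri',   # slice of the href between the '/' characters
    'parent_id': 1,                  # insert_id() of the level-0 parent
    'level': 1,
}
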
Example #12
class Mall_Spider:
    # Declare the relevant attributes
    def __init__(self):
        self.encoding = "utf-8"
        self.mallUrl = 'http://www.smzdm.com/mall'
        self.hide_malls = [
            'ebay', 'dell', 'microsoftstore', 'newegg', 'amazon_jp', 'xiji',
            'sfht', 'mi', 'amazon_de', 'joesnewbalanceoutlet',
            'sierratradingpost', 'amazon_fr', 'kaola', 'myhabit',
            'nikestore_cn', 'ehaier', 'midea', 'jd_hk', 'royyoungchemist_cn',
            'amcal_cn', 'bubugao', 'supuy', 'muyingzhijia', 'daling', 'sasa',
            'amazon_es', '6pm', 'finishline', 'wiggle', 'jimmyjazz'
        ]
        self.dict_country = {
            '美国': 227,
            '日本': 109,
            '英国': 226,
            '德国': 82,
            '澳大利亚': 13,
            '西班牙': 198,
            '香港': 97,
            '法国': 74
        }
        self.imgSaveRoot = 'E:\\wiki_img'
        self.file_tool = File_Tool()
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

    def test_print(self):
        print 'hello world'

    def spider_start(self):
        print u'已经启动Mall 爬虫,咔嚓咔嚓'
        self.db.init_db()

        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Handle the malls
            self.get_malls(headers)

            # Handle the hidden malls
            self.get_malls_hide(headers)
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_malls | get_malls_hide call: " +
                  ex.__str__())
            return ''

        self.db.close_db()
        print u'Mall 爬虫服务运行结束.....'

    # ------------------------- mall handling
    def get_malls_hide(self, _headers):
        print u'已经启动隐藏商城爬虫,咔嚓咔嚓'
        # Send HTTP/1.0 requests; adding this works around the problem
        httplib.HTTPConnection._http_vsn = 10
        httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
        malls = []
        category = '综合商城'
        for uri in self.hide_malls:
            url = self.mallUrl + '/' + uri
            detail_req = urllib2.Request(url, headers=_headers)
            detail_page = ''
            mall = {}
            mall['uri'] = uri
            try:
                detail_page = urllib2.urlopen(detail_req).read().decode(
                    self.encoding)
            except httplib.IncompleteRead as e:
                detail_page = e.partial
            except Exception as RESTex:
                print("Exception occurred get mall detail page call: " +
                      RESTex.__str__())
                continue
            detail = self.get_mall_details(detail_page)
            if detail:
                mall['name'] = detail['name']
                mall['url'] = detail['url']
                mall['country'] = detail['country']
                mall['excerpt'] = detail['excerpt']
                mall_image = detail['mall_image']
                mall['category'] = category
                mall['recommend'] = 5
                mall['summary'] = ''
                # save image to local
                if detail['mall_image']:
                    origin_image = detail['mall_image'].replace(
                        '_g320.jpg', '')
                    pos = origin_image.rfind('/')
                    mall_pic_name = origin_image[pos + 1:]
                    self.file_tool.saveImg(self.imgSaveRoot, 'mall',
                                           mall_pic_name, detail['mall_image'])
                    mall['pic_url'] = '/mall/' + mall_pic_name

                malls.append(mall)
class Brand_Spider:
    # Declare the relevant attributes
    def __init__(self):
        self.encoding = "utf-8"
        self.homeUrl = 'http://pinpai.smzdm.com/'
        self.imgSaveRoot = 'E:\\wiki_img'
        self.file_tool = File_Tool()
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

        self.countries = {}
        self.categories = {}

    def test_print(self):
        print 'hello world'

    def prepare_countries(self):
        _countries = self.db.get_country()
        for country in _countries:
            self.countries[country[2]] = country[0]

    def prepare_categories(self):
        _categories = self.db.get_big_categories()
        for category in _categories:
            self.categories[category[2]] = category[0]

    def spider_start(self):
        print u'已经启动Brand 爬虫,咔嚓咔嚓'
        self.db.init_db()
        # Preparation
        self.prepare_countries()
        self.prepare_categories()

        # Processing logic:
        # 1. Request the page for each category uri and parse the brand count & the first page of brands
        # 2. If the brand count is > 1000, request the second page of brands; all later brands are ignored
        # 3. For each brand, open its detail page and read the brand description and region
        # 4. Save the brand image and insert the brand record into the database

        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Send HTTP/1.0 requests; adding this works around the problem
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            # Handle the brands
            self.get_brands(headers)

            # Afterwards, switch back to HTTP/1.1
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_brand call: " + ex.__str__())
            return ''

        self.db.close_db()
        print u'Brand 爬虫服务运行结束.....'

    def get_brands(self, _headers):
        print u'已经启动get_brands....'
        i = 0
        for cate_uri in self.categories:
            # i +=1
            # if i>2:
            #     break
            brand_cate_url = self.homeUrl + cate_uri

            req = urllib2.Request(brand_cate_url, headers=_headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')

            splide_page = 1
            dom_brand_cate_numbs_span = soup.select(
                'div[class="brand-classify"] > ul > li > a[class="selected"] > span'
            )
            if dom_brand_cate_numbs_span:
                numbs_str = dom_brand_cate_numbs_span[0].get_text()
                if numbs_str:
                    numbs = numbs_str.replace('(', '').replace(')', '')
                    splide_page = int(numbs) / 1000 + 1
                    splide_page = 3 if splide_page >= 3 else splide_page

            brands = []
            # Get all brands on the current page
            dom_brands_li = soup.select('ul[class*="brands"] > li')

            # If a second page needs to be crawled
            if splide_page >= 2:
                req2 = urllib2.Request(brand_cate_url + '/p2',
                                       headers=_headers)
                myPage2 = urllib2.urlopen(req2).read().decode(self.encoding)
                soup2 = BeautifulSoup(myPage2, 'lxml')
                dom_brands_li_page2 = soup2.select('ul[class*="brands"] > li')
                dom_brands_li = dom_brands_li + dom_brands_li_page2

            if splide_page == 3:
                req3 = urllib2.Request(brand_cate_url + '/p3',
                                       headers=_headers)
                myPage3 = urllib2.urlopen(req3).read().decode(self.encoding)
                soup3 = BeautifulSoup(myPage3, 'lxml')
                dom_brands_li_page3 = soup3.select('ul[class*="brands"] > li')
                dom_brands_li = dom_brands_li + dom_brands_li_page3

            print(u"%s需要爬取%d页数,共计%d个品牌" %
                  (cate_uri, splide_page, len(dom_brands_li)))

            j = 0
            for brand_li in dom_brands_li:
                # if j>10:
                #     break
                # j +=1
                brand = {}
                brand['category'] = self.categories[cate_uri]
                detail_brand_a = brand_li.find('a')
                brand_detail_url = detail_brand_a['href']

                dom_brand_name_div = brand_li.find('div', class_='brands-name')
                if dom_brand_name_div:
                    brand['name'] = self.myTool.Replace_Char(
                        dom_brand_name_div.get_text().replace("\n", "").encode(
                            self.encoding))
                    if not brand['name']:
                        continue

                # Image handling
                dom_brand_img = brand_li.find('img')
                if dom_brand_img:
                    brand_image = dom_brand_img['src']
                    _default_pos = brand_image.find('brand_default')
                    if _default_pos > 1:
                        brand['pic_url'] = '/brand/brand_default.jpg'
                    else:
                        # save image to local
                        origin_image = brand_image.replace('_d200.jpg', '')
                        pos = origin_image.rfind('/')
                        brand_pic_name = origin_image[pos + 1:]
                        sub_dir = str(brand['category']) + '/'
                        self.file_tool.saveImg(self.imgSaveRoot,
                                               'brand/' + sub_dir,
                                               brand_pic_name, brand_image)
                        brand['pic_url'] = '/brand/' + sub_dir + brand_pic_name

                # Process the brand detail page
                detail_req = urllib2.Request(brand_detail_url,
                                             headers=_headers)
                detail_page = ''
                try:
                    detail_page = urllib2.urlopen(detail_req).read().decode(
                        self.encoding)
                except httplib.IncompleteRead as e:
                    print("Exception occurred httplib.IncompleteRead")
                    detail_page = e.partial
                except Exception as ex:
                    print("Exception occurred get brand detail page call: " +
                          ex.__str__())
                    continue
                if detail_page:
                    detail = self.get_brand_detail(detail_page)
                    if detail:
                        brand['hot_tag'] = detail['hot_tag']
                        brand['country'] = detail['country']
                        brand['desc'] = detail['desc']

                brands.append(brand)
            # Insert into the db in batches, per category
            self.save_brands(brands)
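
Both Mall_Spider and Brand_Spider derive the local image file name by stripping the size suffix from the image url and keeping everything after the last '/'. A worked example of that logic with a made-up url (real smzdm image urls may differ):

# Hypothetical walk-through of the file-name logic used above; the url is made up.
brand_image = 'http://example.com/brand/sample.jpg_d200.jpg'
origin_image = brand_image.replace('_d200.jpg', '')  # 'http://example.com/brand/sample.jpg'
pos = origin_image.rfind('/')
brand_pic_name = origin_image[pos + 1:]               # 'sample.jpg'
pic_url = '/brand/' + '42/' + brand_pic_name          # with a made-up category id of 42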