# -*- coding: utf-8 -*-
import httplib
import json
import urllib2

from bs4 import BeautifulSoup

# Project-local helpers. The module paths below are assumptions; adjust them
# to the actual layout of the project.
from smzdm_db import SMZDM_Mysql
from tools import File_Tool, HTML_Tool


class Tag_Spider:
    # Declare the instance attributes; this acts as the constructor.
    def __init__(self):
        # Attribute types do not need to be declared explicitly.
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/youxuan/'
        self.tagUrl = 'http://wiki.smzdm.com/t'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()
        self.old_tags = {}

    # Methods.
    def test_print(self):
        print 'hello world'

    def get_tags_hot(self):
        list_tag = self.db.get_tags()
        for tag in list_tag:
            self.old_tags[tag[1]] = tag[0]

    def spider_start(self):
        print u'Tag spider started, crunch crunch'
        self.db.init_db()
        #self.get_tags_hot()
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Send HTTP/1.0 requests; adding this fixes truncated responses.
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            # Handle the hot tags.
            #self.splider_hot_tags(headers)
            # Handle the regular tags by polling their URLs.
            self.splider_all_tags(headers)
            # Afterwards, switch back to HTTP/1.1.
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as e:
            self.db.close_db()
            print Exception, ":", e
            return ''
        self.db.close_db()
        print u'Tag spider finished.....'
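
# The HTTP/1.0 switch used in spider_start above (and repeated in the spiders
# below) works around truncated/IncompleteRead responses and then restores
# HTTP/1.1. A small illustrative sketch of the same idea as a context manager;
# this helper is not part of the original code:
import contextlib


@contextlib.contextmanager
def force_http_1_0():
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    try:
        yield
    finally:
        httplib.HTTPConnection._http_vsn = 11
        httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'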

class Wiki_Splider:
    def __init__(self):
        # Attribute types do not need to be declared explicitly.
        self.encoding = "utf-8"
        self.homeUrl = 'http://wiki.smzdm.com/'
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()
        self.categories = []
        self.thread_num = 10
        self.page_item_size = 20
        self.wiki_items = []
        # Only crawl the "youxuan" (featured) wiki entries.
        self.is_excellent = False

    def prepare_categories(self):
        _categories = self.db.get_categories(2)
        for category in _categories:
            self.categories.append(category)

    def spider_start(self):
        print u'Wiki spider started, crunch crunch'
        self.db.init_db()
        self.prepare_categories()
        self.db.close_db()
        print u'Number of categories to process: ' + str(len(self.categories))
        try:
            # Send HTTP/1.0 requests; adding this fixes truncated responses.
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            # Crawl the wiki item lists.
            self.splide_wiki()
            # Afterwards, switch back to HTTP/1.1.
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as e:
            print Exception, ":", e
            return ''
        #self.db.close_db()
        print u'Wiki spider finished.....'
    def splide_catenode_by_cate2(self, cates):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # One DB connection per thread.
        thread_db = SMZDM_Mysql()
        thread_db.init_db()
        node_results = []
        for cate in cates:
            current_cate_uri = cate[2]
            current_cate_id = cate[0]
            page_url = self.homeUrl + current_cate_uri
            print(u'Processing page: %s' % page_url)
            # Load the page.
            req = urllib2.Request(page_url, headers=headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')
            dom_node_a = soup.select(
                'ul[class="search_result_ul"] > li[class="current"] > ul > li > a '
            )
            if dom_node_a:
                print(u'%s has %s level-4 sub-categories' %
                      (current_cate_uri, len(dom_node_a)))
                for item in dom_node_a:
                    node = {}
                    _href = item['href'].replace('/you', '').replace('/', '')
                    node['uri'] = _href
                    _text = self.myTool.Replace_Char(item.get_text().replace(
                        "\n", "").encode(self.encoding))
                    _pos = _text.find('(')
                    node['name'] = _text[:_pos]
                    node['parent_id'] = current_cate_id
                    node['level'] = 3
                    node_results.append(node)
        #print json.dumps(node_results, ensure_ascii=False)
        #print node_results
        self.insert_db(node_results, thread_db)
        thread_db.close_db()
    def splide_wikiurl_by_cates(self, cates):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # One DB connection per thread.
        thread_db = SMZDM_Mysql()
        thread_db.init_db()
        #wiki_results = []
        for cate in cates:
            current_cate_uri = cate[2]
            current_cate_id = cate[0]
            page_url = self.homeUrl + current_cate_uri
            if self.is_excellent:
                page_url = self.homeUrl + current_cate_uri + '/you'
            print(u'Processing page: %s' % page_url)
            # Load the page.
            req = urllib2.Request(page_url, headers=headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')
            # Work out the number of pages.
            item_num = 0
            dom_item_numb_div = soup.select(
                'div[class*="right_wrap"] > div[class*="right_top_title"] > div[class="total_pro"]'
            )
            if dom_item_numb_div:
                item_numb_text = dom_item_numb_div[0].get_text()
                #print item_numb_text
                if self.is_excellent:
                    item_num = int(
                        item_numb_text.replace('共', '').replace('条优选产品', ''))
                else:
                    item_num = int(
                        item_numb_text.replace('共', '').replace('条产品', ''))
            if not item_num:
                continue
            page_numb = (item_num + self.page_item_size - 1) / self.page_item_size
            print(u'%s: %s pages, %s items' %
                  (current_cate_uri, page_numb, item_num))
            # URLs of the wiki items in this category.
            page_urls = []
            # Current page, i.e. page 1.
            page_urls.extend(self.splide_wiki_list_item_url(soup))
            # Remaining pages.
            for i in range(2, page_numb + 1):
                #print u'Processing page %s' % i
                other_page_url = page_url + '/p' + str(i)
                other_req = urllib2.Request(other_page_url, headers=headers)
                other_page = urllib2.urlopen(other_req).read().decode(
                    self.encoding)
                other_soup = BeautifulSoup(other_page, 'lxml')
                page_urls.extend(self.splide_wiki_list_item_url(other_soup))
            # All wiki items of the current category.
            wiki_items = []
            for item in page_urls:
                wiki_u = {}
                wiki_u['url'] = item
                wiki_u['cate'] = current_cate_id
                wiki_u['cate_uri'] = current_cate_uri
                wiki_items.append(wiki_u)
            print(u'%s: inserting %s records into the DB' %
                  (current_cate_uri, len(wiki_items)))
            self.insert_db_2(wiki_items, thread_db)
            #wiki_results.extend(wiki_items)
        thread_db.close_db()
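    # splide_wiki_list_item_url is called above but its definition is not part
    # of this section. A minimal sketch of what such a helper might look like,
    # assuming every wiki item on a list page is reachable through an <a> tag
    # inside the result list; the CSS selector below is an assumption, not the
    # original implementation:
    def splide_wiki_list_item_url(self, soup):
        urls = []
        for a in soup.select('ul[class*="search_result_ul"] li a'):
            href = a.get('href')
            if href and href not in urls:
                urls.append(href)
        return urls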

class Categories_Spider:
    # Declare the instance attributes.
    def __init__(self):
        self.encoding = "utf-8"
        self.categoryUrl = 'http://wiki.smzdm.com/youxuan'
        self.myTool = HTML_Tool()
        self.db = SMZDM_Mysql()

    def test_print(self):
        print 'hello world'

    def spider_start(self):
        print u'Categories spider started, crunch crunch'
        self.db.init_db()
        # Read the raw page content and decode it.
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # Crawl the categories.
        try:
            self.get_categoris(headers)
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_categoris call: " + ex.__str__())
            return ''
        self.db.close_db()
        print u'Categories spider finished.....'

    # ------------------------- category handling
    def get_categoris(self, _headers):
        req = urllib2.Request(self.categoryUrl, headers=_headers)
        myPage = urllib2.urlopen(req).read().decode(self.encoding)
        soup = BeautifulSoup(myPage, 'lxml')
        li_list = soup.select('ul[id="left-category"] > li')
        # {name: name, parent_id: parent_id, level: level, uri: uri}
        for item in li_list:
            # Find the level-1 category: li div[class=li_item] > h2 > a
            node_1 = item.select('div[class="li_item"] > h2 > a')
            if node_1:
                #print node_1[0].contents[0].string
                #print node_1[0].get_text()
                category = self.hand_category_a(node_1[0])
                category['parent_id'] = -1
                category['level'] = 0
                self.db.insert_category(category)
                category['id'] = self.db.get_conn().insert_id()
                print json.dumps(category, ensure_ascii=False)
                # Handle the level-2 and level-3 categories.
                dl_dt_a = item.select('dl[class="sub_category"] > dt > a')
                dl_dd = item.select('dl[class="sub_category"] > dd')
                sub_for_index = 0
                if len(dl_dt_a) == len(dl_dd):
                    for sub_2 in dl_dt_a:
                        subcate_2 = self.hand_category_a(sub_2)
                        subcate_2['parent_id'] = category['id']
                        subcate_2['level'] = 1
                        self.db.insert_category(subcate_2)
                        subcate_2['id'] = self.db.get_conn().insert_id()
                        print " " + json.dumps(subcate_2, ensure_ascii=False)
                        # Level-3 categories.
                        dl_dd_a = dl_dd[sub_for_index].find_all('a')
                        sub_for_index += 1
                        for sub_3 in dl_dd_a:
                            subcate_3 = self.hand_category_a(sub_3)
                            subcate_3['parent_id'] = subcate_2['id']
                            subcate_3['level'] = 2
                            self.db.insert_category(subcate_3)
                            subcate_3['id'] = self.db.get_conn().insert_id()
                            print " " + json.dumps(subcate_3, ensure_ascii=False)
        # Remember to commit.
        self.db.conn.commit()

    def hand_category_a(self, a):
        category = {}
        category['name'] = self.myTool.Replace_Char(a.get_text().replace(
            "\n", "").encode(self.encoding))
        href = a['href']
        pos = href[0:len(href) - 1].rfind('/')
        category['uri'] = href[1:pos]
        return category
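
# HTML_Tool.Replace_Char is used throughout this module to clean the text
# scraped from the pages, but HTML_Tool itself is defined elsewhere in the
# project. A minimal stand-in sketch, assuming the helper only needs to strip
# leftover tags/entities and collapse whitespace (the real implementation may
# do more); the class name is deliberately different so it does not shadow the
# real import:
import re


class HTML_Tool_Sketch(object):
    _tag_re = re.compile(r'<[^>]+>')         # residual HTML tags
    _entity_re = re.compile(r'&[a-zA-Z]+;')  # simple HTML entities
    _space_re = re.compile(r'\s+')

    def Replace_Char(self, text):
        text = self._tag_re.sub('', text)
        text = self._entity_re.sub(' ', text)
        return self._space_re.sub(' ', text).strip()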

class Mall_Spider:
    # Declare the instance attributes.
    def __init__(self):
        self.encoding = "utf-8"
        self.mallUrl = 'http://www.smzdm.com/mall'
        self.hide_malls = [
            'ebay', 'dell', 'microsoftstore', 'newegg', 'amazon_jp', 'xiji',
            'sfht', 'mi', 'amazon_de', 'joesnewbalanceoutlet',
            'sierratradingpost', 'amazon_fr', 'kaola', 'myhabit',
            'nikestore_cn', 'ehaier', 'midea', 'jd_hk', 'royyoungchemist_cn',
            'amcal_cn', 'bubugao', 'supuy', 'muyingzhijia', 'daling', 'sasa',
            'amazon_es', '6pm', 'finishline', 'wiggle', 'jimmyjazz'
        ]
        # Country names as they appear on the site, mapped to country ids.
        self.dict_country = {
            '美国': 227,
            '日本': 109,
            '英国': 226,
            '德国': 82,
            '澳大利亚': 13,
            '西班牙': 198,
            '香港': 97,
            '法国': 74
        }
        self.imgSaveRoot = 'E:\\wiki_img'
        self.file_tool = File_Tool()
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()

    def test_print(self):
        print 'hello world'

    def spider_start(self):
        print u'Mall spider started, crunch crunch'
        self.db.init_db()
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Crawl the regular malls.
            self.get_malls(headers)
            # Crawl the hidden malls.
            self.get_malls_hide(headers)
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_malls | get_malls_hide call: " +
                  ex.__str__())
            return ''
        self.db.close_db()
        print u'Mall spider finished.....'

    # ------------------------- mall handling
    def get_malls_hide(self, _headers):
        print u'Hidden-mall spider started, crunch crunch'
        # Send HTTP/1.0 requests; adding this fixes truncated responses.
        httplib.HTTPConnection._http_vsn = 10
        httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
        malls = []
        category = '综合商城'  # "general mall" category label stored in the DB
        for uri in self.hide_malls:
            url = self.mallUrl + '/' + uri
            detail_req = urllib2.Request(url, headers=_headers)
            detail_page = ''
            mall = {}
            mall['uri'] = uri
            try:
                detail_page = urllib2.urlopen(detail_req).read().decode(
                    self.encoding)
            except httplib.IncompleteRead as e:
                detail_page = e.partial
            except Exception as RESTex:
                print("Exception occurred get mall detail page call: " +
                      RESTex.__str__())
                continue
            detail = self.get_mall_details(detail_page)
            if detail:
                mall['name'] = detail['name']
                mall['url'] = detail['url']
                mall['country'] = detail['country']
                mall['excerpt'] = detail['excerpt']
                mall_image = detail['mall_image']
                mall['category'] = category
                mall['recommend'] = 5
                mall['summary'] = ''
                # Save the mall image to local disk.
                if detail['mall_image']:
                    origin_image = detail['mall_image'].replace('_g320.jpg', '')
                    pos = origin_image.rfind('/')
                    mall_pic_name = origin_image[pos + 1:]
                    self.file_tool.saveImg(self.imgSaveRoot, 'mall',
                                           mall_pic_name, detail['mall_image'])
                    mall['pic_url'] = '/mall/' + mall_pic_name
                malls.append(mall)
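
# File_Tool.saveImg is used by Mall_Spider above (and by Brand_Spider below)
# to mirror images to local disk, but File_Tool is defined elsewhere in the
# project. A minimal sketch of the assumed behaviour: ensure <root>/<sub_dir>
# exists, download the image URL and write it under the given file name. The
# real helper may differ; the class name is deliberately different so it does
# not shadow the real import:
import os


class File_Tool_Sketch(object):
    def saveImg(self, root, sub_dir, file_name, img_url):
        target_dir = os.path.join(root, sub_dir)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        data = urllib2.urlopen(img_url).read()
        with open(os.path.join(target_dir, file_name), 'wb') as f:
            f.write(data)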

class Brand_Spider:
    # Declare the instance attributes.
    def __init__(self):
        self.encoding = "utf-8"
        self.homeUrl = 'http://pinpai.smzdm.com/'
        self.imgSaveRoot = 'E:\\wiki_img'
        self.file_tool = File_Tool()
        self.db = SMZDM_Mysql()
        self.myTool = HTML_Tool()
        self.countries = {}
        self.categories = {}

    def test_print(self):
        print 'hello world'

    def prepare_countries(self):
        _countries = self.db.get_country()
        for country in _countries:
            self.countries[country[2]] = country[0]

    def prepare_categories(self):
        _categories = self.db.get_big_categories()
        for category in _categories:
            self.categories[category[2]] = category[0]

    def spider_start(self):
        print u'Brand spider started, crunch crunch'
        self.db.init_db()
        # Preparation.
        self.prepare_countries()
        self.prepare_categories()
        # Processing logic:
        # 1. Request the page for each category uri and parse the brand count
        #    plus the first page of brands.
        # 2. If there are more than 1000 brands, request the second page;
        #    brands beyond that are ignored.
        # 3. For each brand, open the detail page and read the brand
        #    description and country/region.
        # 4. Save the brand image and insert the brand into the DB.
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
        headers = {'User-Agent': user_agent}
        try:
            # Send HTTP/1.0 requests; adding this fixes truncated responses.
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            # Crawl the brands.
            self.get_brands(headers)
            # Afterwards, switch back to HTTP/1.1.
            httplib.HTTPConnection._http_vsn = 11
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        except Exception as ex:
            self.db.close_db()
            print("Exception occurred get_brand call: " + ex.__str__())
            return ''
        self.db.close_db()
        print u'Brand spider finished.....'

    def get_brands(self, _headers):
        print u'get_brands started....'
        i = 0
        for cate_uri in self.categories:
            # i += 1
            # if i > 2:
            #     break
            brand_cate_url = self.homeUrl + cate_uri
            req = urllib2.Request(brand_cate_url, headers=_headers)
            myPage = urllib2.urlopen(req).read().decode(self.encoding)
            soup = BeautifulSoup(myPage, 'lxml')
            splide_page = 1
            dom_brand_cate_numbs_span = soup.select(
                'div[class="brand-classify"] > ul > li > a[class="selected"] > span'
            )
            if dom_brand_cate_numbs_span:
                numbs_str = dom_brand_cate_numbs_span[0].get_text()
                if numbs_str:
                    numbs = numbs_str.replace('(', '').replace(')', '')
                    splide_page = int(numbs) / 1000 + 1
                    splide_page = 3 if splide_page >= 3 else splide_page
            brands = []
            # Collect all brands on the current page.
            dom_brands_li = soup.select('ul[class*="brands"] > li')
            # Fetch page 2 if needed.
            if splide_page == 2:
                req2 = urllib2.Request(brand_cate_url + '/p2', headers=_headers)
                myPage2 = urllib2.urlopen(req2).read().decode(self.encoding)
                soup2 = BeautifulSoup(myPage2, 'lxml')
                dom_brands_li_page2 = soup2.select('ul[class*="brands"] > li')
                dom_brands_li = dom_brands_li + dom_brands_li_page2
            if splide_page == 3:
                req3 = urllib2.Request(brand_cate_url + '/p3', headers=_headers)
                myPage3 = urllib2.urlopen(req3).read().decode(self.encoding)
                soup3 = BeautifulSoup(myPage3, 'lxml')
                dom_brands_li_page3 = soup3.select('ul[class*="brands"] > li')
                dom_brands_li = dom_brands_li + dom_brands_li_page3
            print(u"%s: %d pages to crawl, %d brands in total" %
                  (cate_uri, splide_page, len(dom_brands_li)))
            j = 0
            for brand_li in dom_brands_li:
                # if j > 10:
                #     break
                # j += 1
                brand = {}
                brand['category'] = self.categories[cate_uri]
                detail_brand_a = brand_li.find('a')
                brand_detail_url = detail_brand_a['href']
                dom_brand_name_div = brand_li.find('div', class_='brands-name')
                if dom_brand_name_div:
                    brand['name'] = self.myTool.Replace_Char(
                        dom_brand_name_div.get_text().replace("\n", "").encode(
                            self.encoding))
                # Skip brands without a parsable name.
                if not brand.get('name'):
                    continue
                # Image handling.
                dom_brand_img = brand_li.find('img')
                if dom_brand_img:
                    brand_image = dom_brand_img['src']
                    _default_pos = brand_image.find('brand_default')
                    if _default_pos > 1:
                        brand['pic_url'] = '/brand/brand_default.jpg'
                    else:
                        # Save the image to local disk.
                        origin_image = brand_image.replace('_d200.jpg', '')
                        pos = origin_image.rfind('/')
                        brand_pic_name = origin_image[pos + 1:]
                        sub_dir = str(brand['category']) + '/'
                        self.file_tool.saveImg(self.imgSaveRoot,
                                               'brand/' + sub_dir,
                                               brand_pic_name, brand_image)
                        brand['pic_url'] = '/brand/' + sub_dir + brand_pic_name
                # Open the brand detail page.
                detail_req = urllib2.Request(brand_detail_url, headers=_headers)
                detail_page = ''
                try:
                    detail_page = urllib2.urlopen(detail_req).read().decode(
                        self.encoding)
                except httplib.IncompleteRead as e:
                    print("Exception occurred httplib.IncompleteRead")
                    detail_page = e.partial
                except Exception as ex:
                    print("Exception occurred get brand detail page call: " +
                          ex.__str__())
                    continue
                if detail_page:
                    detail = self.get_brand_detail(detail_page)
                    if detail:
                        brand['hot_tag'] = detail['hot_tag']
                        brand['country'] = detail['country']
                        brand['desc'] = detail['desc']
                brands.append(brand)
            # Insert into the DB in batches, one batch per category.
            self.save_brands(brands)
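
# Usage sketch (not part of the original source): every spider above exposes a
# spider_start() entry point, so the module could be driven from a __main__
# guard like this. The run order below is an assumption.
if __name__ == '__main__':
    Categories_Spider().spider_start()
    Tag_Spider().spider_start()
    Wiki_Splider().spider_start()
    Mall_Spider().spider_start()
    Brand_Spider().spider_start()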