class BenDiSouWangSpider(CrawlSpider):
    """Crawl bendiso.com company listings and emit one item per contact page.

    The crawl rules follow company links and "next page" links from the
    listing, and hand every "联系方式" (contact) page to ``parse_items``.
    Field normalisation is delegated to the project's ``CleanWords`` helper
    (defined outside this block).
    """

    name = "bendiso"
    allowed_domains = ['www.bendiso.com', 'bendiso.com']
    start_urls = ['http://www.bendiso.com/gongsi/']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        # Company entries on a listing page.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='main']//ul//li//div[@class='picture']//a")),
            follow=True),
        # Listing pagination.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='page']//a[@title='下一页']")),
            follow=True),
        # Contact tab of a company site: the only page that is parsed.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='nav']//a[contains(text(),'联系方式')]")),
            callback='parse_items',
            follow=True),
    )

    def parse_items(self, response):
        """Build a ``BenDiSouWangItem`` from a company contact page.

        Yields nothing when the page carries no company name.
        """
        item = BenDiSouWangItem()
        # The address paragraph feeds both ``company_address`` and the
        # province/city split, so it is queried once.
        raw_address = response.xpath(
            "//div[@class='corp_boxinfo']//p[contains(text(),'地址:')]/text()"
        ).extract_first()

        item["company_Name"] = response.xpath(
            "//div[@id='header']//h1//a/text()").extract_first()
        item["company_address"] = raw_address
        item["linkman"] = response.xpath(
            "//p[contains(text(),'联系人:')]/text()").extract_first()
        item["telephone"] = response.xpath(
            "//p[contains(text(),'电话:')]/text()").extract_first()
        item["phone"] = response.xpath(
            "//p[contains(text(),'手机:')]/text()").extract_first()
        item["contact_Fax"] = response.xpath(
            "//p[contains(text(),'传真:')]/text()").extract_first()
        item["contact_QQ"] = response.xpath(
            "//p[contains(text(),'QQ:')]//img[@alt='点击这里给我发消息']/../@href"
        ).extract_first()
        item["E_Mail"] = response.xpath(
            "//p[contains(text(),'邮箱:')]/a/text()").extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath(
                "//ul[@class='product_boxli']//li//div[@class='info']//a/@title"
            ).getall())

        if not item["company_Name"]:
            # Without a name there is nothing to key the record on.
            return
        item["company_Name"] = self._clean_company_name(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = self._clean_kind(item["kind"]) if item["kind"] else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = item["linkman"].replace('未填写', '') if item["linkman"] else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        # Each remaining field is normalised by its CleanWords helper, or
        # defaulted to '' when the page did not provide it.
        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
            ("contact_QQ", self.cw.search_QQ),
            ("company_address", self.cw.search_address),
        )
        for field, cleaner in cleaners:
            item[field] = cleaner(item[field]) if item[field] else ''

        # The address text is expected to read "地址:<province> <city> ...".
        region = raw_address.replace("地址:", "") if raw_address else ''
        item["province"], item["city_name"] = self._split_region(region, ' ')

        yield item

    @staticmethod
    def _clean_company_name(raw):
        """Cut the name at the first separator found, then strip whitespace and label."""
        # Separators are tried in priority order; the full-width parenthesis
        # restores the branch that was an exact duplicate of the ASCII one.
        for sep in ("(", "（", "_", "-"):
            if sep in raw:
                raw = raw.split(sep)[0]
                break
        # Stripping is applied on every path so the md5 id is stable.
        return re.sub(r'\s|公司名称:', '', raw)

    @staticmethod
    def _clean_kind(raw):
        """Turn the product/keyword text into a '|'-delimited string."""
        text = raw.replace(" ", '|')
        # Longer labels come first: in a regex alternation the leftmost
        # matching alternative wins, so '主营' must be tried last.
        text = re.sub(r'\s|主营业务:|主营项目:|主营', '', text)
        for delimiter in ('-', '、', ',', '，', ';'):
            text = text.replace(delimiter, '|')
        return text.replace('.', '')

    @staticmethod
    def _split_region(text, sep):
        """Return ``(province, city)`` from ``text`` split on ``sep``, or ``('', '')``."""
        if not text or sep not in text:
            return '', ''
        parts = text.split(sep)
        return parts[0], parts[1]

    def get_md5(self, value):
        """Return the hex MD5 of ``value`` (UTF-8 encoded), or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class QiMaoWangSpider(CrawlSpider):
    """Crawl the jzjxqm.com company directory and emit one item per company.

    Flow: category index (``parse``) -> company list pages
    (``parse_company_list``) -> each company's ``contact/`` page
    (``parse_company_contact``). Phone, fax and e-mail are published as
    images on this site, so they are downloaded and passed to
    ``recognition_image`` (defined outside this block).
    """

    name = 'jzjxqm'
    allowed_domains = ['www.jzjxqm.com']
    start_urls = ['http://www.jzjxqm.com/qiye/']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    # Region cell looks like "[广东/潮州市]"; compiled once instead of per row.
    _REGION_RE = re.compile(r'\[(.*?)\/(.*?)\]', re.S)

    def parse(self, response):
        """Request every category page linked from the catalog table."""
        links = response.xpath(
            "//div[@class='m']//div[@class='left_box']//div[@class='catalog']//td//a"
        )
        for link in links:
            kind_href = link.xpath("./@href").extract_first()
            if kind_href:
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        """Request the contact page of every listed company, then the next list page."""
        rows = response.xpath(
            "//div[@class='left_box']//div[@class='list']//table//tr")
        for tr in rows:
            item = QiMaoWangspiderItem()
            item["company_Name"] = tr.xpath(
                ".//li//a/strong/text()").extract_first()
            company_href = tr.xpath(".//li/a/@href").extract_first()
            item["kind"] = tr.xpath(
                ".//li[contains(text(),'主营:')]/text()").extract_first()
            city_infos = tr.xpath(
                ".//td[@class='f_orange']/text()").extract_first()

            # Both fields default to '' (previously a missing cell stored
            # None in "province").
            region = self._REGION_RE.search(city_infos) if city_infos else None
            item["province"] = region.group(1) if region else ''
            item["city_name"] = region.group(2) if region else ''

            if company_href:
                yield scrapy.Request(url=company_href + "contact/",
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        """Complete the item carried in ``response.meta["item"]`` from the contact page."""
        # Headers sent with the image downloads; Referer is the contact page.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Cookie": "bdshare_firstime=1564976283795; BAIDU_SSP_lcr=https://www.baidu.com/link?url=6kZ0hzYVwwzEyL9fwlHs-4qX3qJG3iRU1NoWkSz4Thu&wd=&eqid=ae0f4b8300245afa000000065d50c031; Hm_lvt_c7894de9c1e0658a1d1ab0f838038a41=1565573377; UM_distinctid=16c8371ec762f7-08654ebee1a5d2-5a13331d-1fa400-16c8371ec77453; CNZZDATA4515303=cnzz_eid%3D439230478-1565569508-%26ntime%3D1565569508; Hm_lpvt_c7894de9c1e0658a1d1ab0f838038a41=1565573634",
            "Referer": response.url,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = response.meta["item"]
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称:')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司地址:')]/following-sibling::td/text()"
            ).extract())
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人:')]/following-sibling::td/text()"
        ).extract_first()
        # The next four values are image URLs, resolved to text below.
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\s|公司名称:', '',
                                          item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = self._clean_kind(item["kind"]) if item["kind"] else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))
        item["linkman"] = self.cw.search_linkman(item["linkman"] or '')

        # Image-backed fields: download + OCR first, then normalise.
        image_fields = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
        )
        for field, cleaner in image_fields:
            if item[field]:
                item[field] = cleaner(self.requests_href(item[field], headers))
            else:
                item[field] = ''

        item["contact_QQ"] = (self.cw.search_QQ(item["contact_QQ"])
                              if item["contact_QQ"] else '')
        item["company_address"] = (
            self.cw.search_address(item["company_address"])
            if item["company_address"] else '')
        yield item

    @staticmethod
    def _clean_kind(raw):
        """Strip labels/whitespace from the "主营" text and delimit it with '|'."""
        text = re.sub(r'\s|主营业务:|主营', '', raw)
        for delimiter in ('-', '、', ',', '，'):
            text = text.replace(delimiter, '|')
        return text.replace('.', '')

    def get_md5(self, value):
        """Return the hex MD5 of ``value`` (UTF-8 encoded), or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        """Download the image at ``url`` and return the text recognised in it.

        Returns '' on any network, HTTP, file or recognition failure; the
        lookup is best effort and must not abort the item.

        NOTE(review): this is a blocking ``requests`` call made from a Scrapy
        callback, and TLS verification is disabled (``verify=False``).
        """
        import os
        import tempfile

        try:
            res = requests.get(url=url, headers=headers, timeout=20,
                               verify=False)
        except requests.RequestException as exc:
            self.logger.warning("image download failed for %s: %s", url, exc)
            return ''
        if res.status_code != requests.codes.ok or not res.content:
            return ''

        # A private temp file replaces the hard-coded absolute path that every
        # request used to overwrite.
        fd, img_path = tempfile.mkstemp(suffix=".png")
        try:
            with os.fdopen(fd, "wb") as fp:
                fp.write(res.content)
            return recognition_image(img_path) or ''
        except Exception as exc:
            self.logger.warning("image recognition failed for %s: %s", url, exc)
            return ''
        finally:
            try:
                os.remove(img_path)
            except OSError:
                pass
class ZhongGuoHuaDongHuaGongWangSpider(CrawlSpider):
    """Crawl the nbchem.com company directory and emit one item per company.

    Flow: home-page category links (``parse``) -> paginated company lists
    (``parse_company_list``) -> company contact page
    (``parse_company_contact``), which comes in two HTML layouts.
    """

    name = "nbchem"
    allowed_domains = ['www.nbchem.com', 'nbchem.com']
    start_urls = ['http://www.nbchem.com']
    cw = CleanWords()
    # Kept for backward compatibility only. Pagination state now travels in
    # request meta, because a single class-wide counter was shared by all
    # categories.
    page_count = 0
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    # Field -> XPath for pages whose URL contains "contact.aspx".
    _CONTACT_ASPX_XPATHS = {
        "company_Name": "//td[contains(text(),'公司名称:')]/following-sibling::td/strong/text()",
        "company_address": "//td[contains(text(),'地址:')]/following-sibling::td/text()",
        "linkman": "//td[contains(text(),'联系人:')]/following-sibling::td/text()",
        "telephone": "//td[contains(text(),'电话:')]/following-sibling::td/text()",
        "phone": "//td[contains(text(),'手机:')]/following-sibling::td/text()",
        "contact_Fax": "//td[contains(text(),'传真:')]/following-sibling::td/text()",
        "contact_QQ": "//td[contains(text(),'QQ:')]/following-sibling::td/a/text()",
        "E_Mail": "//td[contains(text(),'Email:')]/following-sibling::td/a/text()",
    }
    # Field -> XPath for every other contact page layout.
    _DEFAULT_XPATHS = {
        "company_Name": "//div[@class='ptitle']/span/text()",
        "company_address": "//td[contains(text(),'地 址:')]/following-sibling::td/span/text()",
        "linkman": "//td[contains(text(),'联 系 人:')]/following-sibling::td/span/text()",
        "telephone": "//td[contains(text(),'电 话:')]/following-sibling::td/span/text()",
        "phone": "//td[contains(text(),'移动电话:')]/following-sibling::td/span/text()",
        "contact_Fax": "//td[contains(text(),'传 真:')]/following-sibling::td/span/text()",
        "contact_QQ": "//td[contains(text(),'QQ:')]/following-sibling::td/span/text()",
        "E_Mail": "//td[contains(text(),'电子邮件:')]/following-sibling::td/span/text()",
    }

    def parse(self, response):
        """Request the first list page of every category on the home page."""
        links = response.xpath(
            "//div[@id='p2']//table[@align='center']//td[@valign='top']//div//a")
        for link in links:
            kind_href = link.xpath("./@href").get()
            if not kind_href:
                continue
            kind_href = "http://www.nbchem.com" + kind_href
            # Category id is embedded in the file name, e.g. "list2-306.html".
            kind_num = kind_href.split("list2-")[-1].split(".html")[0]
            yield scrapy.Request(
                url=kind_href,
                callback=self.parse_company_list,
                meta={"info": kind_num},
                dont_filter=True
            )

    def parse_company_list(self, response):
        """Request each company's contact page, then the category's next list page.

        ``response.meta["info"]`` is the category id and ``meta["page"]`` the
        index of the page being parsed (absent, i.e. 0, on the first page).
        """
        if "没有公司" in response.text:
            return

        cells = response.xpath(
            "//div[@class='page']//table[3]//td[@valign='top']//div//table[@width='100%']//td")
        for td in cells:
            item = ZhongGuoHuaDongHuaGongWangItem()
            item["kind"] = "".join(td.xpath(".//div[2]//text()").getall()).strip()
            contact_href = td.xpath(".//div[4]/a/@href").get()
            if contact_href and "dtcon-" in contact_href:
                # e.g. http://www.nbchem.com/bc/dtcon-15691.html
                yield scrapy.Request(
                    url="http://www.nbchem.com/bc/" + contact_href,
                    callback=self.parse_company_contact,
                    meta={"item": item},
                    dont_filter=True
                )

        kind_num = response.meta.get("info")
        if not kind_num:
            return
        total_page = response.xpath(
            "//div[@class='nbchem']//span[@id='ctl00_ContentPlaceHolder1_PageNav1_labPage']/text()").get()
        if not total_page:
            return
        try:
            # The label is read as ".../<total>"; only the total is needed.
            total_page_num = int(total_page.split('/')[-1])
        except ValueError:
            self.logger.warning("unparsable page label %r on %s",
                                total_page, response.url)
            return

        current_page = response.meta.get("page", 0)
        if current_page < total_page_num:
            next_page = current_page + 1
            # e.g. http://www.nbchem.com/bc/list2-435-0-0-0-1.html
            next_page_url = "http://www.nbchem.com/bc/list2-{}-0-0-0-{}.html".format(
                kind_num, next_page)
            self.logger.debug("next list page: %s", next_page_url)
            # The category id and page index are forwarded so the following
            # page can keep paginating.
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse_company_list,
                meta={"info": kind_num, "page": next_page}
            )

    def parse_company_contact(self, response):
        """Fill and yield the item from ``response.meta["item"]``.

        Yields nothing when the page carries no company name.
        """
        item = response.meta["item"]
        if "contact.aspx" in response.url:
            xpaths = self._CONTACT_ASPX_XPATHS
        else:
            xpaths = self._DEFAULT_XPATHS
        for field, query in xpaths.items():
            item[field] = response.xpath(query).extract_first()
        item["Source"] = response.url
        city_infos = response.xpath(
            "//dt[contains(text(),'所在地区:')]/following-sibling::dd/text()").get()

        if not item["company_Name"]:
            return
        item["company_Name"] = self._clean_company_name(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = self._clean_kind(item["kind"]) if item["kind"] else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = item["linkman"].replace('未填写', '') if item["linkman"] else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
            ("contact_QQ", self.cw.search_QQ),
            ("company_address", self.cw.search_address),
        )
        for field, cleaner in cleaners:
            item[field] = cleaner(item[field]) if item[field] else ''

        item["province"], item["city_name"] = self._split_region(city_infos, '/')
        yield item

    @staticmethod
    def _clean_company_name(raw):
        """Cut the name at the first separator found, then strip whitespace and label."""
        # The full-width parenthesis restores the branch that was an exact
        # duplicate of the ASCII one.
        for sep in ("(", "（", "_", "-"):
            if sep in raw:
                raw = raw.split(sep)[0]
                break
        return re.sub(r'\s|公司名称:', '', raw)

    @staticmethod
    def _clean_kind(raw):
        """Turn the product/keyword text into a '|'-delimited string."""
        text = raw.replace(" ", '|')
        # Longer labels first: the leftmost matching alternative wins.
        text = re.sub(r'\s|主营业务:|主营项目:|主营', '', text)
        for delimiter in ('-', '、', ',', '，', ';'):
            text = text.replace(delimiter, '|')
        return text.replace('.', '')

    @staticmethod
    def _split_region(text, sep):
        """Return ``(province, city)`` from ``text`` split on ``sep``, or ``('', '')``."""
        if not text or sep not in text:
            return '', ''
        parts = text.split(sep)
        return parts[0], parts[1]

    def get_md5(self, value):
        """Return the hex MD5 of ``value`` (UTF-8 encoded), or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class QiDuoWangSpider(CrawlSpider):
    """Crawl the qiduowang.com company directory and emit one item per contact page.

    The crawl rules follow category, company and "next page" links and hand
    every "联系方式" (contact) page to ``parse_items``. Phone, fax and e-mail
    are published as images, so they are downloaded and passed to
    ``recognition_image`` (defined outside this block).
    """

    name = "qdw"
    allowed_domains = ['www.qiduowang.com']
    start_urls = ['http://www.qiduowang.com/company/']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        # Category links.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='left_box']//table//tr//a")),
            follow=True),
        # Company entries on a list page.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='list']//td[@align='left']//li//a")),
            follow=True),
        # List pagination.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
            follow=True),
        # Contact tab of a company site: the only page that is parsed.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//span[contains(text(),'联系方式')]/..")),
            callback='parse_items',
            follow=True),
    )

    def parse_items(self, response):
        """Build and yield a ``QiDuoWangItem`` from a company contact page."""
        # Headers sent with the image downloads; Referer is the contact page.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "safedog-flow-item=; __51cke__=; __tins__915545=%7B%22sid%22%3A%201567480167459%2C%20%22vd%22%3A%201%2C%20%22expires%22%3A%201567481967459%7D; __51laig__=9",
            "Host": "www.qiduowang.com",
            "Referer": response.url,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = QiDuoWangItem()
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称:')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = response.xpath(
            "//td[contains(text(),'公司地址:')]/following-sibling::td/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人:')]/following-sibling::td/text()"
        ).extract_first()
        # The next four values are image URLs, resolved to text below.
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件:')]/following-sibling::td/img/@src"
        ).extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath("//div[@class='head']//h4/text()").getall())
        city_infos = response.xpath(
            "//td[contains(text(),'所在地区:')]/following-sibling::td/text()"
        ).get()

        # A page without a name is still emitted, with an empty company_id.
        if item["company_Name"]:
            item["company_Name"] = self._clean_company_name(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = self._clean_kind(item["kind"]) if item["kind"] else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))
        item["linkman"] = self.cw.search_linkman(item["linkman"] or '')

        # Image-backed fields: download + OCR first, then normalise.
        image_fields = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
        )
        for field, cleaner in image_fields:
            if item[field]:
                item[field] = cleaner(self.requests_href(item[field], headers))
            else:
                item[field] = ''

        item["contact_QQ"] = (self.cw.search_QQ(item["contact_QQ"])
                              if item["contact_QQ"] else '')
        item["company_address"] = (
            self.cw.search_address(item["company_address"])
            if item["company_address"] else '')

        item["province"], item["city_name"] = self._split_region(city_infos, '/')
        yield item

    @staticmethod
    def _clean_company_name(raw):
        """Cut the name at the first separator found, then strip whitespace and label."""
        # The full-width parenthesis restores the branch that was an exact
        # duplicate of the ASCII one.
        for sep in ("(", "（", "_", "-"):
            if sep in raw:
                raw = raw.split(sep)[0]
                break
        return re.sub(r'\s|公司名称:', '', raw)

    @staticmethod
    def _clean_kind(raw):
        """Turn the product/keyword text into a '|'-delimited string."""
        text = raw.replace(" ", '|')
        # Longer labels first: the leftmost matching alternative wins.
        text = re.sub(r'\s|主营业务:|主营项目:|主营', '', text)
        for delimiter in ('-', '、', ',', '，', ';'):
            text = text.replace(delimiter, '|')
        return text.replace('.', '')

    @staticmethod
    def _split_region(text, sep):
        """Return ``(province, city)`` from ``text`` split on ``sep``, or ``('', '')``."""
        if not text or sep not in text:
            return '', ''
        parts = text.split(sep)
        return parts[0], parts[1]

    def get_md5(self, value):
        """Return the hex MD5 of ``value`` (UTF-8 encoded), or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        """Download the image at ``url`` and return the text recognised in it.

        Returns '' on any network, HTTP, file or recognition failure; the
        lookup is best effort and must not abort the item.

        NOTE(review): this is a blocking ``requests`` call made from a Scrapy
        callback, and TLS verification is disabled (``verify=False``).
        """
        import os
        import tempfile

        try:
            res = requests.get(url=url, headers=headers, timeout=10,
                               verify=False)
        except requests.RequestException as exc:
            self.logger.warning("image download failed for %s: %s", url, exc)
            return ''
        if res.status_code != requests.codes.ok or not res.content:
            return ''

        # A private temp file replaces the hard-coded absolute path that every
        # request used to overwrite.
        fd, img_path = tempfile.mkstemp(suffix=".png")
        try:
            with os.fdopen(fd, "wb") as fp:
                fp.write(res.content)
            return recognition_image(img_path) or ''
        except Exception as exc:
            self.logger.warning("image recognition failed for %s: %s", url, exc)
            return ''
        finally:
            try:
                os.remove(img_path)
            except OSError:
                pass
class WuJinShangJiWangSpider(CrawlSpider):
    """Crawl the chinawj.com.cn company directory and emit one item per contact page.

    The crawl rules follow category, company and "next page" links and hand
    every "联系方式" (contact) page to ``parse_items``, which reads most
    fields out of the raw HTML with regular expressions.
    """

    name = "chinawj"
    allowed_domains = ['www.chinawj.com.cn', 'chinawj.com.cn']
    start_urls = ['http://www.chinawj.com.cn/qiye/']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        # Product-type (category) links.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='proTypeList']//li//a")),
            follow=True),
        # First link of each company entry on a list page.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='leftbox']//div[@class='pr0']//div[@class='pr2']//li[1]//a")),
            follow=True),
        # List pagination.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//p[@id='page']//a[contains(text(),'下一页')]")),
            follow=True),
        # Contact tab of a company site: the only page that is parsed.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//a[contains(text(),'联系方式')]")),
            callback='parse_items',
            follow=True),
    )

    # Compiled once per class instead of on every response.
    _RE_LINKMAN = re.compile(r'联系人:(.*?) <', re.S)
    _RE_ADDRESS = re.compile(r'>\s*地址:(.*?)<', re.S)
    _RE_TELEPHONE = re.compile(r'>\s*电话:(.*?)<', re.S)
    _RE_PHONE = re.compile(r'>\s*手机:(.*?)<', re.S)
    _RE_FAX = re.compile(r'>\s*传真:(.*?)<', re.S)
    _RE_EMAIL = re.compile(r'>\s*邮箱:(.*?)<', re.S)
    _RE_REGION = re.compile(r'<li>所在地区:(.*?)</li>', re.S)

    def parse_items(self, response):
        """Build a ``WuJinShangJiWangItem`` from a company contact page.

        Yields nothing when the page carries no company name.
        """
        item = WuJinShangJiWangItem()
        html = response.text

        item["company_Name"] = response.xpath(
            "//div[@class='head']//h1/text()").extract_first()
        # Address, contact person and e-mail concatenate every match;
        # telephone, mobile and fax keep only the first one.
        item["company_address"] = "".join(self._RE_ADDRESS.findall(html))
        item["linkman"] = "".join(self._RE_LINKMAN.findall(html))
        item["telephone"] = self._first_match(self._RE_TELEPHONE, html)
        item["phone"] = self._first_match(self._RE_PHONE, html)
        item["contact_Fax"] = self._first_match(self._RE_FAX, html)
        item["contact_QQ"] = response.xpath(
            "//img[@alt='跟我QQ洽谈']/../@href").get()
        item["E_Mail"] = "".join(self._RE_EMAIL.findall(html))
        item["Source"] = response.url
        item["kind"] = "|".join(
            response.xpath("//div[@class='head']//h4//a//text()").extract())
        city_infos = ",".join(self._RE_REGION.findall(html))

        if not item["company_Name"]:
            return
        item["company_Name"] = self._clean_company_name(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = self._clean_kind(item["kind"]) if item["kind"] else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = item["linkman"].replace('未填写', '') if item["linkman"] else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
            ("contact_QQ", self.cw.search_QQ),
            ("company_address", self.cw.search_address),
        )
        for field, cleaner in cleaners:
            item[field] = cleaner(item[field]) if item[field] else ''

        item["province"], item["city_name"] = self._split_region(city_infos, '/')
        yield item

    @staticmethod
    def _first_match(pattern, text):
        """Return the first captured group of ``pattern`` in ``text``, or ''."""
        found = pattern.search(text)
        return found.group(1) if found else ''

    @staticmethod
    def _clean_company_name(raw):
        """Cut the name at the first separator found, then strip whitespace and label."""
        # The full-width parenthesis restores the branch that was an exact
        # duplicate of the ASCII one.
        for sep in ("(", "（", "_", "-"):
            if sep in raw:
                raw = raw.split(sep)[0]
                break
        return re.sub(r'\s|公司名称:', '', raw)

    @staticmethod
    def _clean_kind(raw):
        """Turn the product/keyword text into a '|'-delimited string."""
        text = raw.replace(" ", '|')
        # Longer labels first: the leftmost matching alternative wins.
        text = re.sub(r'\s|主营业务:|主营项目:|主营', '', text)
        for delimiter in ('-', '、', ',', '，', ';'):
            text = text.replace(delimiter, '|')
        return text.replace('.', '')

    @staticmethod
    def _split_region(text, sep):
        """Return ``(province, city)`` from ``text`` split on ``sep``, or ``('', '')``."""
        if not text or sep not in text:
            return '', ''
        parts = text.split(sep)
        return parts[0], parts[1]

    def get_md5(self, value):
        """Return the hex MD5 of ``value`` (UTF-8 encoded), or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ShuKongJiChuanShiChangWangSpider(CrawlSpider):
    """Crawl company contact details from skjcsc.com.

    The rules hand every link found in the ``newproductlist`` table to
    ``parse_items`` and keep following the "下一页" (next page) link.
    """

    name = "skjcsc"
    allowed_domains = ['www.skjcsc.com', 'skjcsc.com']
    start_urls = ['http://www.skjcsc.com/enterprisefront/enterpriseFrontAction.action']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    rules = (
        # The original XPath started with a malformed "///"; "//" is the
        # well-formed spelling of the same descendant search.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='newproductlist']//tr//div[@id='newp-name']//a")),
            callback="parse_items", follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'下一页')]")),
            follow=True),
    )

    def parse_items(self, response):
        """Build one item from a company page.

        Yields a single ``ShuKongJiChuanShiChangWangItem``; yields nothing
        when the company-name cell is missing or empty.
        """
        def cell(xpath):
            # First text node matched by *xpath*, or None.
            return response.xpath(xpath).extract_first()

        item = ShuKongJiChuanShiChangWangItem()
        item["company_Name"] = cell("//td[contains(text(),'公司名称:')]/following-sibling::td/text()")
        item["company_address"] = cell("//td[contains(text(),'详细地址:')]/following-sibling::td/text()")
        item["linkman"] = cell("//td[contains(text(),'联 系 人:')]/following-sibling::td/text()")
        item["telephone"] = cell("//td[contains(text(),'电 话:')]/following-sibling::td/text()")
        item["phone"] = cell("//td[contains(text(),'手 机:')]/following-sibling::td/text()")
        item["contact_Fax"] = cell("//td[contains(text(),'传 真:')]/following-sibling::td/text()")
        # The QQ number is taken from the local part of any "<digits>@qq.com"
        # address on the page; the dot is escaped so it only matches a dot.
        item["contact_QQ"] = "".join(re.findall(r'(\d+)@qq\.com', response.text))
        item["E_Mail"] = cell("//td[contains(text(),'邮 箱:')]/following-sibling::td/text()")
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath("//div[@class='box_bg']//ul//li//a//text()").getall())
        # Province/city are derived from the raw (uncleaned) address cell.
        city_infos = item["company_address"]

        if not item["company_Name"]:
            return
        item["company_Name"] = self.cw.search_company(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            kind = kind.replace(" ", '|')
            # Longest alternatives first: with "主营" ahead of "主营项目:" the
            # longer label could never be removed completely.
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营项目:|主营', '', kind)
            # Both the half-width and the full-width comma act as separators
            # (the original replaced the half-width comma twice).
            for separator in ('-', '、', ',', ',', ';'):
                kind = kind.replace(separator, '|')
            kind = kind.replace('.', '').strip()
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        # NOTE(review): search_linkman is applied twice to a non-empty name,
        # exactly as before; confirm whether the second pass is needed.
        linkman = self.cw.search_linkman(item["linkman"]) if item["linkman"] else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
            ("contact_QQ", self.cw.search_QQ),
        )
        for field, clean in cleaners:
            item[field] = clean(item[field]) if item[field] else ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"].replace("联系地址:", ""))
        else:
            item["company_address"] = ''

        # "province/city" is only recognised when the address contains a "/".
        province, city_name = '', ''
        if city_infos and '/' in city_infos:
            parts = city_infos.split('/')
            province, city_name = parts[0], parts[1]
        item["province"] = province
        item["city_name"] = city_name

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class RouDianGongChengSpider(CrawlSpider):
    """Crawl company contact details from rdzjw.com.

    Flow: ``parse`` (category links) -> ``parse_company_list`` (one request
    per listed company plus the next list page) -> ``parse_company_contact``
    (fills and yields the item). Phone numbers and the e-mail address are
    published as images, which are downloaded and passed to
    ``recognition_image``.
    """

    name = 'rdgc'
    allowed_domains = ['rdzjw.com', 'www.rdzjw.com']
    start_urls = ['http://www.rdzjw.com/company/']
    cw = CleanWords()
    # Scratch file each downloaded contact image is written to before it is
    # handed to recognition_image (overwritten on every call).
    ocr_image_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img2\image.png"
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        """Request the company list behind every category link."""
        for link in response.xpath(
                "//div[@class='m']//div[@class='left_box']//td//a"):
            kind_name = link.xpath("./text()").extract_first()
            kind_href = link.xpath("./@href").extract_first()
            if kind_name is None:
                kind_name = link.xpath("./strong/text()").extract_first()
            if kind_href:
                self.logger.debug("category %s -> %s", kind_name, kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        """Start one item per table row and request its contact page."""
        rows = response.xpath(
            "//div[@class='m']//div[@class='list']//table//tr")
        for row in rows:
            item = RouDianGongChengspiderItem()
            item["company_Name"] = row.xpath(
                ".//td[@align='left']//li/a/strong/text()").extract_first()
            company_href = row.xpath(
                ".//td[@align='left']//li/a/@href").extract_first()
            item["kind"] = row.xpath(
                ".//td[@align='left']//li[contains(text(),'主营:')]/text()"
            ).extract_first()
            # The XPath must be relative (".//"): an absolute "//" evaluated
            # on a row selector still searches the whole document, so every
            # row would receive the first row's region. A missing cell is
            # treated as an empty region instead of raising.
            region = row.xpath(
                ".//td[@align='left']/following-sibling::td/text()"
            ).extract_first() or ''
            region = region.replace("[", '').replace("]", '')
            if "/" in region:
                # e.g. "广东/潮州市"
                parts = region.split('/')
                item["province"] = parts[0]
                item["city_name"] = parts[-1]
            else:
                item["province"] = region
                item["city_name"] = ''
            if company_href:
                yield scrapy.Request(url=company_href + "contact/",
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        """Fill the item carried in ``response.meta`` from the contact page."""
        # Headers used when downloading the contact images.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "URnU_6599_saltkey=ak6gp656; URnU_6599_lastvisit=1565058750; BAIDU_SSP_lcr=https://www.baidu.com/link?url=cb-lCyEOCCLgFwHPL2dAanIBMLf8DvDyItiG2Ov4tBa&wd=&eqid=9f23e6d6000403b3000000065d48f4ac; UM_distinctid=16c64fc50e79b-0e9495f2b9b774-5a13331d-1fa400-16c64fc50e868; CNZZDATA1254919241=1514860467-1565058564-null%7C1565058564; URnU_6599_lastact=1565062639%09api.php%09js",
            "Host": "www.rdzjw.com",
            "Referer": response.url,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }

        def cell(label, tail):
            # Everything matched under the cell that follows *label*.
            return "".join(response.xpath(
                "//td[contains(text(),'" + label + "')]/following-sibling::td/" + tail
            ).extract())

        item = response.meta["item"]
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称:')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = cell("公司地址:", "text()")
        item["linkman"] = cell("联 系 人:", "text()")
        item["telephone"] = cell("公司电话:", "img/@src")
        item["phone"] = cell("手机号码:", "img/@src")
        item["contact_Fax"] = cell("公司传真:", "text()")
        item["contact_QQ"] = "".join(
            response.xpath("//img[@title='点击QQ交谈/留言']/../@href").extract())
        item["E_Mail"] = cell("电子邮件:", "img/@src")
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '',
                item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)
            # Half-width and full-width comma both separate keywords (the
            # original replaced the half-width comma twice).
            for separator in ('-', '、', ',', ','):
                kind = kind.replace(separator, '|')
            kind = kind.replace('.', '').strip()
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        item["linkman"] = self.cw.search_linkman(item["linkman"] or '')

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(
                self.requests_href(item["phone"], headers))
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(
                self.requests_href(item["telephone"], headers))
        else:
            item["telephone"] = ''

        fax = item["contact_Fax"]
        if fax:
            # The fax cell is scraped as text, not as an image URL, so it is
            # only sent through OCR when it really is an http(s) URL;
            # passing plain text to requests.get would raise.
            candidate = fax.strip()
            if candidate.lower().startswith(("http://", "https://")):
                fax = self.requests_href(candidate, headers)
            item["contact_Fax"] = self.cw.search_contact_Fax(fax)
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            email = self.requests_href(item["E_Mail"], headers)
            if email:
                # Substitutions applied to the recognised text before it is
                # stored. NOTE(review): replacing every "e" with "@" looks
                # aggressive - confirm against real OCR output.
                email = email.replace("e", "@").replace(
                    "8126", "@126").replace("8163", "@163").strip()
            item["E_Mail"] = email
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        """Download the image at *url* and return the text recognised in it.

        Returns '' when the download fails, the response is not OK, the body
        is empty, or recognition fails or finds nothing.
        """
        try:
            res = requests.get(url=url, headers=headers, timeout=20,
                               verify=False)
        except requests.RequestException:
            # A bad URL or a network error yields an empty field instead of
            # aborting the whole item.
            return ''
        if res.status_code != requests.codes.ok or not res.content:
            return ''
        with open(self.ocr_image_path, "wb") as fp:
            fp.write(res.content)
        try:
            recognised = recognition_image(self.ocr_image_path)
        except Exception:
            # Best effort, as before: a recognition failure means "no value".
            return ''
        return recognised if recognised else ''
class GouLianZiYuanWangSpider(CrawlSpider):
    """Crawl company contact details from shangji.ibicn.com.

    The rules walk the company index and its pagination, open each company
    site, and hand the page behind its "联系我们" (contact us) link to
    ``parse_items``.
    """

    name = "ibicn"
    # allowed_domains = ['www.53info.com','qineng1688.53info.com']
    start_urls = ['https://shangji.ibicn.com/gongsi/s_i_d_t_l_k/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # Original author's note: "do not verify SSL certificates".
        # NOTE(review): confirm this handler mapping has that effect.
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
    }
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='pull-left right']//li//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='main']//li[@class='item ']//div[@class='pull-left left text-ellipsis']//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='pages']//a[contains(text(),'下一页')]")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='nav']//a[contains(text(),'联系我们')]")),
            callback='parse_items', follow=False),
    )

    @staticmethod
    def _clean_kind(raw):
        """Turn a free-text business description into '|'-separated keywords."""
        text = raw.replace(' ', '|')
        text = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', text)
        # Half-width and full-width comma both separate keywords (the
        # original replaced the half-width comma twice).
        for separator in ('-', '、', ',', ','):
            text = text.replace(separator, '|')
        return text.replace('.', '').strip()

    def parse_items(self, response):
        """Build and yield one item from a company's contact page."""
        page = response.text

        def joined(pattern):
            # All captures of *pattern* in the page, concatenated ('' if none).
            return "".join(re.findall(pattern, page, re.S))

        item = GouLianZiYuanWangItem()
        # Raw <span> markup of the contact box, used as a fallback source
        # for the phone numbers.
        contact_infos = ",".join(
            response.xpath(
                "//div[@class='contact']//div[@class='item']//span").getall())
        item["company_Name"] = response.xpath(
            "//div[@class='copany_name']/a/@title").extract_first()
        item["kind"] = joined(r'主要经营(.*?)\。')
        item["company_address"] = joined(r'>地址:(.*?)<')
        item["linkman"] = joined(r'<span>联系人:</span> <b>(.*?)\s*</b>')
        item["telephone"] = joined(r'<span>电 话:</span><b>(.*?)\s*</b>')
        item["phone"] = joined(r'<span>手 机:</span><b>(.*?)\s*</b>')
        # The fax field is seeded with the raw telephone text.
        item["contact_Fax"] = item["telephone"]
        item["contact_QQ"] = ''
        item["E_Mail"] = ''
        item["Source"] = response.url
        item["province"] = ''
        item["city_name"] = ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '',
                item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if not kind:
            # Fallback: "是一家 … 的 <高新/技术/企业/公司…>。". The original
            # used character classes ("[一家]", "[高新|技术|企业|公司]"),
            # which match one single character each, so the sentence
            # practically never matched.
            kind = joined(r'是一家(.*?)的(?:高新|技术|企业|公司)+。')
        kind = self._clean_kind(kind) if kind else ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        item["linkman"] = self.cw.search_linkman(item["linkman"] or '')

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            try:
                item["phone"] = self.cw.search_phone_num(contact_infos)
            except Exception:
                item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            try:
                item["telephone"] = self.cw.search_telephone_num(contact_infos)
            except Exception:
                item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        # E_Mail and contact_QQ are not scraped on this site; both stay ''.

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class BaFangZiYuanWangSpider(CrawlSpider):
    """Crawl company contact details from b2b168.com.

    The rules walk industry, category, company-list and pagination links and
    hand the page behind each "联系方式" (contact details) link to
    ``parse_items``.
    """

    name = "b2b168"
    allowed_domains = ['b2b168.com', 'www.b2b168.com']
    start_urls = ['https://www.b2b168.com/page-company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        },
        # Original author's note: "do not verify SSL certificates".
        # NOTE(review): confirm this handler mapping has that effect.
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
    }
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='map']//ul[contains(@class,'c-hangye')]//li//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='mach_list clearfix']//dd//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='list-right']//ul[@class='list']//li//div[1]//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='pages']//a[contains(text(),'下页')]")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'联系方式')]")),
            callback='parse_items', follow=False),
    )

    def parse_items(self, response):
        """Yield one item for a contact page answered with HTTP 200.

        Any error while building the item is logged and the page is skipped
        (the original swallowed such errors silently).
        """
        if response.status != 200:
            return
        try:
            item = self._build_item(response)
        except Exception:
            self.logger.exception("failed to parse %s", response.url)
            return
        yield item

    def _build_item(self, response):
        """Extract every field from the page text and return the item."""
        page = response.text

        def joined(pattern, sep=""):
            # All captures of *pattern* in the page, joined ('' if none).
            return sep.join(re.findall(pattern, page, re.S))

        # ShowMap("divMap", <name>, <address>, "<province> <city>") is the
        # source of name, address and region; evaluated once.
        show_map = re.findall(
            r'ShowMap\("divMap","(.*?)","(.*?)", "(.*?)"\);', page, re.S)
        company_name, address, region = show_map[0] if show_map else ('', '', '')

        item = BaFangZiYuanWangspiderItem()
        item["company_Name"] = company_name
        item["company_address"] = address
        item["contact_QQ"] = ''
        item["E_Mail"] = ''
        item["Source"] = response.url

        region_parts = region.split(' ')
        item["province"] = region_parts[0]
        # A region without a space used to raise IndexError, which dropped
        # the whole item; it now just leaves the city empty.
        item["city_name"] = region_parts[1] if len(region_parts) > 1 else ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '',
                item["company_Name"]).replace(' ', '').strip()
        # Uses the method below: this class previously called a bare
        # get_md5() although every sibling spider defines it as a method.
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = joined(r'<span>主营:</span>(.*?)</p>', ",")
        if kind:
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)
            # Half-width and full-width comma both separate keywords (the
            # original replaced the half-width comma twice).
            for separator in ('-', '、', ',', ','):
                kind = kind.replace(separator, '|')
            kind = kind.replace('.', '').strip()
        else:
            kind = joined(r'主要经营(.*?)<br />')
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        # Each contact field has a primary layout and a <dt>/<dd> fallback.
        linkman = (joined(r'联 系 人: <a class=b2>(.*?)</a>')
                   or joined(r'<dt>联系人:</dt><dd>(.*?)</dd>'))
        item["linkman"] = self.cw.search_linkman(linkman)

        phone = (joined(r'移动电话: (.*?)<br />')
                 or joined(r'<dt>移动电话:</dt><dd>(.*?)</dd>'))
        item["phone"] = self.cw.search_phone_num(phone)

        telephone = (joined(r'电 话: (.*?)<br />')
                     or joined(r'<dt>固定电话:</dt><dd>(.*?)</dd>'))
        item["telephone"] = self.cw.search_telephone_num(telephone)

        fax = (joined(r'传 真: (.*?)<br />')
               or joined(r'<dt>传真号码:</dt><dd>(.*?)</dd>'))
        item["contact_Fax"] = self.cw.search_contact_Fax(fax)

        # E_Mail is never scraped here, so contact_QQ (which the original
        # derived from E_Mail) also stays ''.

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        return item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class BaiChuangHuangYeWangSpider(CrawlSpider):
    """Crawl company contact details from ayijx.com.

    The rules walk the area index, the company lists and their pagination,
    and hand the page behind each "联系我们" (contact us) link to
    ``parse_items``.
    """

    name = "ayijx"
    allowed_domains = ['www.ayijx.com']
    start_urls = ['http://www.ayijx.com/area/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='listsum']//dl//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='dqmqsumxdtb margintop']//li//div[@class='dqmqlefts']//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='navbox']//a[contains(text(),'联系我们')]")),
            callback='parse_items', follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='fanye']//p//a")),
            follow=True),
    )

    @staticmethod
    def _clean_kind(raw):
        """Turn a free-text product list into '|'-separated keywords."""
        text = raw.replace(' ', '|')
        # Longest alternatives first: with "主营" ahead of "主营项目:" the
        # longer label could never be removed completely.
        text = re.sub(r'\n|\s|\r|\t|主营业务:|主营项目:|主营', '', text)
        # Half-width and full-width comma both separate keywords (the
        # original replaced the half-width comma twice).
        for separator in ('-', '、', ',', ',', ';'):
            text = text.replace(separator, '|')
        return text.replace('.', '').strip()

    def parse_items(self, response):
        """Yield one item built from a company's contact page.

        Any error while building the item is logged and the page is skipped
        (the original swallowed such errors silently).
        """
        try:
            item = self._build_item(response)
        except Exception:
            self.logger.exception("failed to parse %s", response.url)
            return
        yield item

    def _build_item(self, response):
        """Extract every field from the page and return the item."""
        page = response.text

        def joined(pattern):
            # All captures of *pattern* in the page, concatenated ('' if none).
            return "".join(re.findall(pattern, page, re.S))

        item = BaiChuangHuangYeWangItem()
        item["company_Name"] = joined(r'<title>联系我们_(.*?)</title>')
        item["kind"] = joined(r'>\s*主营产品:(.*?)<')
        item["company_address"] = joined(r'>地 址:(.*?)<')
        item["linkman"] = joined(r'>联 系 人:(.*?)<')
        item["telephone"] = joined(r'> 电 话:(.*?)<')
        item["phone"] = joined(r'>手 机: (.*?)<')
        item["contact_Fax"] = joined(r'>传 真:(.*?)<')
        item["contact_QQ"] = joined(r'> Q Q:(.*?)<')
        item["E_Mail"] = joined(r'>邮 箱:(.*?)<')
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        name = item["company_Name"]
        if name:
            # Keep what precedes the first separator found. The original
            # tested the half-width "(" twice, leaving the second branch
            # unreachable; the full-width "(" is assumed to be the intent.
            for separator in ("(", "(", "_", "-"):
                if separator in name:
                    name = name.split(separator)[0]
                    break
            else:
                name = re.sub(r'\n|\s|\r|\t|公司名称:', '',
                              name).replace(' ', '').strip()
            item["company_Name"] = name
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if not kind:
            # Fallback: link texts of the "hotico" block.
            kind = "|".join(
                response.xpath(
                    "//div[@class='hotico']//ul//li//a//text()").getall())
        item["kind"] = self.cw.rinse_keywords(
            self.cw.replace_ss(self._clean_kind(kind)))

        linkman = item["linkman"] or joined(r'<p>联系人:(.*?)</p>')
        if '<' in linkman:
            # Drop trailing markup captured together with the name.
            linkman = linkman.split('<')[0]
        item["linkman"] = self.cw.search_linkman(linkman)

        telephone = item["telephone"] or joined(r'<p>电话: (.*?)</p>')
        item["telephone"] = self.cw.search_telephone_num(telephone)

        # Fallback uses the mobile ("手机") paragraph; the original reused
        # the landline ("电话") pattern here by mistake.
        phone = item["phone"] or joined(r'<p>手机: (.*?)</p>')
        item["phone"] = self.cw.search_phone_num(phone)

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        qq = item["contact_QQ"]
        if qq:
            qq = qq.replace("Q Q:", '')
        else:
            qq = joined(r'<p>Q Q: (.*?)</p>')
        item["contact_QQ"] = self.cw.search_QQ(qq)

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        return item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ZhongGuoHuaGongSheBeiWangSpider(CrawlSpider):
    """Scrape company contact records from the www.ccen.net company directory.

    No ``rules`` are defined; every request is scheduled explicitly:
    ``parse`` (category links) -> ``parse_company_list`` (one item per
    listing row, plus pagination) -> ``parse_company_detail`` (company home
    page) -> ``parse_company_contact`` (only when the home page links to a
    separate contact page).
    """

    name = "ccen_v1"
    allowed_domains = ['www.ccen.net', 'ccen.net']
    start_urls = ['http://www.ccen.net/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    # --- listing-row patterns (compiled once instead of once per row) ---
    _RE_LINKMAN = re.compile(r'>联系人:</span>(.*?) <', re.S)
    _RE_TELEPHONE = re.compile(r'>电话:</span>(.*?) <', re.S)
    _RE_MOBILE = re.compile(r'>手机:</span>(.*?) <', re.S)
    _RE_LIST_KIND = re.compile(r'>主营:</span>(.*?)</td>', re.S)
    _RE_MORE_CONTACT = re.compile(
        r'<a href="(.*?)" target="_blank">更多联系方式\>></a>', re.S)
    # Region text looks like "[湖北省-武汉]": province before the dash,
    # city after it.
    _RE_PROVINCE = re.compile(r'\[(.*?)-.*?\]', re.S)
    _RE_CITY = re.compile(r'\[.*?-(.*?)\]', re.S)

    # --- company home page patterns ---
    _RE_ADDRESS = re.compile(r'>\s*详细地址:(.*?)<', re.S)
    _RE_EMAIL = re.compile(r'>\s*电子邮件:(.*?)<', re.S)
    _RE_FAX = re.compile(r'>\s*传真:(.*?)<', re.S)
    _RE_DETAIL_KIND = re.compile(r'<p>主营产品: (.*?)<br />', re.S)

    # --- contact page pattern ---
    _RE_CONTACT_KIND = re.compile(r'>\s*主营产品: (.*?)<', re.S)

    # Longer labels come first: with the original order ("主营" before
    # "主营项目:") the short alternative always won and left "项目:" behind.
    _KIND_NOISE = re.compile(r'\s|主营业务:|主营项目:|主营')
    # One-pass equivalent of the chained .replace() calls.
    _KIND_SEPARATORS = str.maketrans(
        {'-': '|', '、': '|', ',': '|', ';': '|', '.': None})

    def parse(self, response):
        """Follow every category link on the directory start page."""
        anchors = response.xpath(
            "//table[@class='martop']//table[@class='ccen_blueborder']//table[@style='margin-bottom:5px']//tr//a"
        )
        for anchor in anchors:
            href = anchor.xpath("./@href").get()
            if not href:
                continue
            yield scrapy.Request(
                url="http://www.ccen.net/company/" + href,
                callback=self.parse_company_list,
                dont_filter=True,
            )

    def parse_company_list(self, response):
        """Create one item per listing row and follow the pagination link.

        Each row supplies the company name, contact person, phone numbers
        and region; the item is handed to ``parse_company_detail`` through
        ``meta``. Rows without a "more contact info" link are skipped.
        """
        rows = response.xpath(
            "//table[@style='margin-top:10px; border-bottom:1px solid #CCCCCC; padding:5px;']//td[@valign='top']"
        )
        for row in rows:
            item = ZhongGuoHuaGongSheBeiWangItem()
            item["company_Name"] = row.xpath(
                ".//table[1]//a[@class='blue f14']/@title").get()
            region = row.xpath(
                ".//table[1]//a[@class='blue f14']/../text()").get() or ''
            linkinfos = "".join(row.xpath(".//table[3]").getall())

            # Scoped to the row: matching against the whole page gave every
            # item the concatenated "kind" of all companies on the page.
            item["kind"] = "".join(self._RE_LIST_KIND.findall(row.get()))
            item["linkman"] = "".join(self._RE_LINKMAN.findall(linkinfos))
            item["telephone"] = "".join(
                self._RE_TELEPHONE.findall(linkinfos))
            item["phone"] = "".join(self._RE_MOBILE.findall(linkinfos))

            # The patterns themselves require "[", "-" and "]", so no
            # separate membership check is needed (the original
            # `"[" and "-" and "]" in city_infos` only tested for "]").
            item["province"] = "".join(self._RE_PROVINCE.findall(region))
            item["city_name"] = "".join(self._RE_CITY.findall(region))

            more = self._RE_MORE_CONTACT.search(linkinfos)
            company_href = more.group(1) if more else ''
            if company_href:
                yield scrapy.Request(
                    url=company_href,
                    callback=self.parse_company_detail,
                    meta={"item": item},
                    dont_filter=True,
                )

        next_page_url = response.xpath(
            "//table[@class='membertable_page'][1]//a[contains(text(),'下一页')]/@href"
        ).get()
        if next_page_url:
            yield scrapy.Request(
                url="http://www.ccen.net" + next_page_url,
                callback=self.parse_company_list,
            )

    def parse_company_detail(self, response):
        """Complete the item from the company home page.

        When the page links to a dedicated contact page, that page is
        requested instead and the item travels along in ``meta``.
        Otherwise address, fax, e-mail, QQ and product keywords are read
        from this page and the cleaned item is yielded.
        """
        item = response.meta["item"]
        contact_href = response.xpath(
            "//font[contains(text(),'联系我们')]/../../@href").get()
        if contact_href:
            yield scrapy.Request(
                # urljoin copes with absolute, root-relative and relative
                # hrefs; plain concatenation only worked for one of them.
                url=response.urljoin(contact_href),
                callback=self.parse_company_contact,
                # Required: parse_company_contact reads meta["item"] and
                # failed with KeyError when this was missing.
                meta={"item": item},
                dont_filter=True,
            )
            return

        text = response.text
        item["company_address"] = "".join(self._RE_ADDRESS.findall(text))
        item["contact_Fax"] = "".join(self._RE_FAX.findall(text))
        item["contact_QQ"] = response.xpath(
            "//a[contains(@title,'点击QQ图标在线联系')]/@href").get()
        item["E_Mail"] = "".join(self._RE_EMAIL.findall(text))
        item["Source"] = response.url
        item["kind"] = ",".join(self._RE_DETAIL_KIND.findall(text))

        if self._finalize_item(item):
            yield item

    def parse_company_contact(self, response):
        """Complete the item from the dedicated contact page.

        Every contact field, including the company name, is re-read from
        the contact table; province and city keep the values set by
        ``parse_company_list``.
        """
        item = response.meta["item"]
        cell = "//td[contains(text(),'%s')]/following-sibling::td[2]/text()"

        item["company_Name"] = response.xpath(cell % '公司名称:').get()
        item["company_address"] = response.xpath(cell % '详细地址:').get()
        item["linkman"] = response.xpath(cell % '联 系 人:').get()
        item["telephone"] = response.xpath(cell % '公司电话:').get()
        item["phone"] = response.xpath(cell % '手 机:').get()
        item["contact_Fax"] = response.xpath(cell % '传 真:').get()
        item["contact_QQ"] = response.xpath(
            "//a[contains(@title,'点击QQ图标在线联系')]/@href").get()
        item["E_Mail"] = response.xpath(cell % '电子邮件:').get()
        item["Source"] = response.url
        item["kind"] = ",".join(
            self._RE_CONTACT_KIND.findall(response.text))

        if self._finalize_item(item):
            yield item

    def _finalize_item(self, item):
        """Normalise all scraped fields of ``item`` in place.

        Shared by ``parse_company_detail`` and ``parse_company_contact``.

        Returns:
            False when the item has no company name and must be dropped,
            True when it is ready to be yielded.
        """
        name = item["company_Name"]
        if not name:
            return False

        # Cut the name at the first separator kind that occurs, checked in
        # this priority order. Whitespace and the field label are stripped
        # only when no separator is present (behaviour kept from the
        # original code).
        # NOTE(review): the original tested "(" twice in a row; the second
        # test may have been a full-width parenthesis before the file was
        # normalised - confirm.
        for separator in ("(", "_", "-"):
            if separator in name:
                name = name.split(separator)[0]
                break
        else:
            name = re.sub(r'\s|公司名称:', '', name)
        item["company_Name"] = name
        item["company_id"] = self.get_md5(name)

        kind = item["kind"]
        if kind:
            # Spaces separate keywords, so they become "|" before the
            # remaining whitespace and labels are removed.
            kind = self._KIND_NOISE.sub('', kind.replace(" ", '|'))
            kind = kind.translate(self._KIND_SEPARATORS)
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = (item["linkman"] or '').replace('未填写', '')
        item["linkman"] = self.cw.search_linkman(linkman)

        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
        )
        for field, clean in cleaners:
            item[field] = clean(item[field]) if item[field] else ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            # Fall back to a QQ number embedded in the (already cleaned)
            # e-mail address.
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except Exception:
                item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''
        return True

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class CebnDianZiShangWuWangSpider(CrawlSpider):
    """Scrape company contact records from www.cebn.cn.

    The crawl rules walk the category navigation, the company listings and
    their pagination, and hand each company's contact page to
    ``parse_items``. Phone, fax and e-mail are published as images on that
    page; ``requests_href`` downloads each image and passes it to
    ``recognition_image`` to turn it into text.
    """

    name = "cebn"
    allowed_domains = ['www.cebn.cn', 'cebn.cn']
    start_urls = ['http://www.cebn.cn/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # "Host": "www.kusoba.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    rules = (
        # Category navigation.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='fl J-mainNav']//ul//li//a")),
            follow=True),
        # Company links on a listing page.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='list-content']//div[@class='proname']//a")),
            follow=True),
        # "Next page" link of a listing.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
            follow=True),
        # "Contact" tab of a company site: the page that gets scraped.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//span[contains(text(),'联系方式')]/..")),
            callback='parse_items',
            follow=False),
    )

    # Longer labels come first: with the original order ("主营" before
    # "主营产品:") the short alternative always won and left "产品:" behind.
    _KIND_NOISE = re.compile(r'\s|主营业务:|主营产品:|主营')
    # One-pass equivalent of the chained .replace() calls.
    _KIND_SEPARATORS = str.maketrans(
        {'-': '|', '、': '|', ',': '|', '.': None})

    def parse_items(self, response):
        """Yield one cleaned item for a company contact page.

        Extraction stays best-effort: any error drops the page, but it is
        now logged instead of being swallowed silently.
        """
        if not response.text:
            return
        # Headers for the image downloads made by requests_href.
        # NOTE(review): the Cookie below is a session value captured from a
        # browser and is probably stale - confirm it is still needed.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "UM_distinctid=16cb2979f862a5-0dfda36ee0202d-5a13331d-1fa400-16cb2979f8725e; Hm_lvt_aa1b57052b9004f48376724837cc9b69=1566364377; yunsuo_session_verify=ded7c3ded7b4429e61379b82e2e37d8e; Hm_lpvt_aa1b57052b9004f48376724837cc9b69=1566365005",
            # "Host": "fshjbxg.cn.cebn.cn",
            "Referer": response.url,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        try:
            item = self._extract_item(response, headers)
        except Exception:
            self.logger.exception(
                "cebn: failed to parse contact page %s", response.url)
            return
        yield item

    def _extract_item(self, response, headers):
        """Build and clean the item for one contact page."""
        cell = "//td[contains(text(),'%s')]/following-sibling::td"

        item = CebnDianZiShangWuWangItem()
        item["company_Name"] = response.xpath(
            cell % '公司名称:' + "/text()").get()
        item["kind"] = response.xpath(
            "//div[@class='head']//h4/text()").get()
        item["company_address"] = "".join(
            response.xpath(cell % '公司地址:' + "/text()").getall())
        item["linkman"] = response.xpath(
            cell % '联 系 人:' + "/text()").get()
        # The next four fields hold the URL of an image showing the value.
        item["telephone"] = response.xpath(
            cell % '公司电话:' + "/img/@src").get()
        item["phone"] = response.xpath(
            cell % '手机号码:' + "/img/@src").get()
        item["contact_Fax"] = response.xpath(
            cell % '公司传真:' + "/img/@src").get()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").get()
        item["E_Mail"] = response.xpath(
            cell % '电子邮件:' + "/img/@src").get()
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\s|公司名称:|全称:', '', item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            # Spaces separate keywords, so they become "|" before the
            # remaining whitespace and labels are removed.
            kind = self._KIND_NOISE.sub('', kind.replace(" ", "|"))
            kind = kind.translate(self._KIND_SEPARATORS)
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = item["linkman"]
        if linkman:
            # Keep only the text before a parenthesised remark.
            linkman = linkman.split("(")[0].replace(
                '法定代表人:', '').replace('暂未公布', '')
        else:
            linkman = ''
        item["linkman"] = self.cw.search_linkman(linkman)

        # Image-backed fields: download, recognise, then clean.
        image_fields = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
        )
        for field, clean in image_fields:
            src = item[field]
            item[field] = clean(self.requests_href(src, headers)) if src else ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''
        return item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        """Download the image at ``url`` and return the text recognised in it.

        The image is written to a temporary ``.png`` file that is handed to
        ``recognition_image`` and deleted afterwards. The original wrote to
        a hard-coded path on the author's ``F:`` drive, which fails on any
        other machine.

        Args:
            url: Address of the image to download.
            headers: HTTP headers sent with the download request.

        Returns:
            The recognised text, or '' when the download fails, the
            response is empty or not 200, or recognition fails or yields
            nothing.
        """
        import os
        import tempfile

        # NOTE(review): requests.get is a blocking call inside a Scrapy
        # callback and stalls the crawler while it runs.
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # from the original - confirm it is still required.
        try:
            res = requests.get(
                url=url, headers=headers, timeout=10, verify=False)
        except requests.RequestException as exc:
            # One unreachable image must not discard the whole item.
            self.logger.warning(
                "cebn: image download failed for %s: %s", url, exc)
            return ''

        if res.status_code != requests.codes.ok:
            return ''
        img = res.content
        if not img:
            return ''

        # delete=False so the file can be reopened by path after closing
        # (required on Windows); it is removed explicitly below.
        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        try:
            with tmp:
                tmp.write(img)
            try:
                return recognition_image(tmp.name) or ''
            except Exception:
                return ''
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                pass
class YiLingLingYiSanWuShangWuWangSpider(CrawlSpider):
    """Scrape company contact records from www.100135.com.

    The crawl rules walk the category list, the company listings and their
    pagination, and hand each company's contact page to ``parse_items``.
    """

    name = "yllsw"
    allowed_domains = ['100135.com', 'www.100135.com']
    start_urls = ['http://www.100135.com/company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # Original note: "do not verify SSL certificates".
        # NOTE(review): this only maps https to the plain HTTP download
        # handler - confirm it has the intended effect.
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    rules = (
        # Category titles.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='left sort']//div[@class='SortTitle']//a")),
            follow=True),
        # Company links on a listing page.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='main_left']//dd//div[@class='info_title']//a")),
            follow=True),
        # Pager link whose text contains ">".
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='pager']//a[contains(text(),'>')]")),
            follow=True),
        # "Contact" entry of a company site: the page that gets scraped.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='leftmenu']//a[contains(text(),'联系方式')]")),
            callback='parse_items',
            follow=False),
    )

    _KIND_NOISE = re.compile(r'\s|主营业务:|主营')
    # One-pass equivalent of the chained .replace() calls.
    _KIND_SEPARATORS = str.maketrans(
        {'-': '|', '、': '|', ',': '|', ';': '|', '.': None})

    def parse_items(self, response):
        """Yield one cleaned item for a company contact page.

        The item is yielded even when no company name is found; in that
        case ``company_id`` is ''.
        """
        cell = "//td[contains(text(),'%s')]/following-sibling::td/text()"

        item = YiLingLingYiSanWuShangWuWangItem()
        item["company_Name"] = response.xpath(cell % '公司名称:').get()
        item["kind"] = "".join(
            response.xpath("//p[contains(text(),'主营产品:')]//text()").getall())
        item["company_address"] = "".join(
            response.xpath(cell % '地址:').getall())
        item["linkman"] = response.xpath(cell % '联系人:').get()
        item["telephone"] = response.xpath(cell % '固定电话:').get()
        item["phone"] = response.xpath(cell % '手机:').get()
        item["contact_Fax"] = response.xpath(cell % '传真:').get()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").get()
        item["E_Mail"] = response.xpath(cell % 'E-mail:').get()
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\s|公司名称:', '', item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])

        # Keep only what follows the last "主营产品:" label. These string
        # operations cannot raise, so the original try/except was dropped.
        kind = item["kind"].split('主营产品:')[-1] if item["kind"] else ''
        if kind:
            # Spaces separate keywords, so they become "|" before the
            # remaining whitespace and labels are removed.
            kind = self._KIND_NOISE.sub('', kind.replace(' ', '|'))
            kind = kind.translate(self._KIND_SEPARATORS)
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        item["linkman"] = self.cw.search_linkman(item["linkman"] or '')

        cleaners = (
            ("phone", self.cw.search_phone_num),
            ("telephone", self.cw.search_telephone_num),
            ("contact_Fax", self.cw.search_contact_Fax),
            ("E_Mail", self.cw.search_email),
        )
        for field, clean in cleaners:
            item[field] = clean(item[field]) if item[field] else ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            # Fall back to a QQ number embedded in the (already cleaned)
            # e-mail address.
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except Exception:
                item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class TongZhuangQiYeWangSpider(CrawlSpider):
    """Scrape company contact records from www.61kids.com.cn.

    The crawl rules follow the company links of the listing and its
    "next page" link; every company page is parsed by ``parse_items``.
    """

    name = "61kids"
    allowed_domains = ['www.61kids.com.cn']
    start_urls = ['http://www.61kids.com.cn/dressunion/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Host": "www.61kids.com.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        # Company links of the listing: parsed and followed further.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='zy1 t10']//ul//li//a")),
            callback="parse_items",
            follow=True),
        # "Next page" link.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'下一页')]")),
            follow=True),
    )

    # Only the patterns that are actually used are kept; the original
    # defined three of them twice and three more that were never read.
    _RE_KIND = re.compile(r'>主营:(.*?)\s*<', re.S)
    _RE_AREA = re.compile(r'>\s*公司所在地:(.*?) (.*?) <br/>', re.S)
    _RE_ADDRESS = re.compile(r'址:(.*?)<', re.S)
    # NOTE(review): this is a landline shape (leading 0 + area code) even
    # though it feeds the mobile "phone" field - kept from the original.
    _RE_PHONE = re.compile(r'\(?0\d{2,3}[)-]?\d{7,8}', re.S)
    _RE_FAX = re.compile(r'真:(.*?)<', re.S)
    _RE_EMAIL_LABELLED = re.compile(r'箱:(.*?)<')
    # Matches the address itself. The original r'(.*?)@163.com' with re.S
    # captured everything from the start of the page up to the "@" and
    # never included the address.
    _RE_EMAIL_163 = re.compile(r'[A-Za-z0-9._+-]+@163\.com')
    _RE_QQ = re.compile(r'(\d+)@qq.com', re.S)

    # Longer labels come first: with the original order ("主营" before
    # "主营项目:") the short alternative always won and left "项目:" behind.
    _KIND_NOISE = re.compile(r'\s|主营业务:|主营项目:|主营')
    # One-pass equivalent of the chained .replace() calls.
    _KIND_SEPARATORS = str.maketrans(
        {'-': '|', '、': '|', ',': '|', ';': '|', '.': None})

    def parse_items(self, response):
        """Yield one cleaned item for a company page.

        Pages without a company name yield nothing. Several fields have a
        fallback source that is used when the primary lookup finds nothing.
        """
        name = response.xpath("//div[@class='lb2_12']/a/span/text()").get()
        if not name:
            return
        text = response.text

        item = TongZhuangQiYeWangItem()
        item["company_Name"] = self.cw.search_company(name)
        item["company_id"] = self.get_md5(item["company_Name"])
        item["Source"] = response.url

        # Region: first "公司所在地:" occurrence, province then city.
        areas = self._RE_AREA.findall(text)
        item["province"] = areas[0][0] if areas else ''
        item["city_name"] = areas[0][1] if areas else ''

        # Keywords: labelled paragraph first, raw-HTML pattern as fallback.
        kind = response.xpath(
            "//span[contains(text(),'主 营:')]/following-sibling::p/text()"
        ).get()
        if not kind:
            kind = "".join(self._RE_KIND.findall(text))
        # Spaces separate keywords, so they become "|" before the remaining
        # whitespace and labels are removed.
        kind = self._KIND_NOISE.sub('', kind.replace(" ", '|'))
        kind = kind.translate(self._KIND_SEPARATORS)
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        # Contact person: list entry first, bold text of the "fd_zw"
        # paragraph as fallback.
        linkman = "".join(
            response.xpath("//li[contains(text(),'联系人:')]/text()").getall())
        if not linkman:
            linkman = re.sub(r'\s', '', "".join(
                response.xpath("//p[@class='fd_zw']//b//text()").getall()))
        item["linkman"] = self.cw.search_linkman(linkman)

        phone = "".join(self._RE_PHONE.findall(text))
        item["phone"] = self.cw.search_phone_num(phone) if phone else ''

        telephone = "".join(
            response.xpath("//span[@class='brand_phone']//text()").getall())
        item["telephone"] = (
            self.cw.search_telephone_num(telephone) if telephone else '')

        fax = "".join(self._RE_FAX.findall(text))
        item["contact_Fax"] = self.cw.search_contact_Fax(fax) if fax else ''

        # E-mail: first 163.com address on the page, otherwise whatever
        # follows the "箱:" label.
        found = self._RE_EMAIL_163.search(text)
        email = found.group(0) if found else "".join(
            self._RE_EMAIL_LABELLED.findall(text))
        item["E_Mail"] = self.cw.search_email(email)

        qq = "".join(self._RE_QQ.findall(text))
        item["contact_QQ"] = self.cw.search_QQ(qq) if qq else ''

        # Address: labelled paragraph first, raw-HTML pattern as fallback.
        address = response.xpath(
            "//span[contains(text(),'地 址:')]/following-sibling::p/text()"
        ).get()
        if address:
            address = address.replace("联系地址:", "")
        else:
            address = "".join(self._RE_ADDRESS.findall(text))
        item["company_address"] = self.cw.search_address(address)

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ShiPinDaiLiWangSpider(CrawlSpider):
    """Crawl spdl.com company listings and yield one item per contact page.

    The crawl rules follow category links and "next page" links, and hand
    every "联系方式" (contact) link to :meth:`parse_items`.
    """

    name = "spdl"
    allowed_domains = ['spdl.com', 'www.spdl.com']
    start_urls = ['http://www.spdl.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//ul[@class='clearfix key-choice tac']//li//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='main-list']//li[@class='clearfix']//a[contains(text(),'联系方式')]")),
            callback='parse_items', follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='newpage']//a[contains(text(),'下一页')]")),
            follow=True),
    )

    # Patterns are compiled once instead of on every response.
    _LINKMAN_RE = re.compile(
        r'<div class="mobile-kf">\s*<span>联系人:(.*?)</span>', re.S)
    _AREA_RE = re.compile(
        r";\s*map.Address = '(.*?),(.*?),.*?,,';", re.S)
    _KEYWORDS_RE = re.compile(r'<p>食品代理网-专业的(.*?)服务平台。</p>', re.S)
    # Text the site shows in place of contact details; stripped from the
    # fields of the "member" layout before they are cleaned.
    _HIDDEN_NOTICE = '请您点击留言,留言后将显示联系方式!'

    def parse_items(self, response):
        """Build one company item from a contact page.

        Two page layouts are supported. The layout is chosen by looking for
        the markers "浏览量:" or "展位会员" in the page source. Nothing is
        yielded when no company name can be extracted.

        The original test was ``"浏览量:" or "展位会员" in response.text``,
        which is always true, so the second layout could never be parsed.
        """
        item = ShiPinDaiLiWangItem()
        page = response.text
        member_layout = "浏览量:" in page or "展位会员" in page
        if member_layout:
            self._read_member_layout(item, response)
            noise = (self._HIDDEN_NOTICE,)
        else:
            self._read_plain_layout(item, response)
            noise = ()

        if not item["company_Name"]:
            return
        item["company_Name"] = self._clean_company_name(item["company_Name"])
        item["company_id"] = self.get_md5(item["company_Name"])
        item["kind"] = self._clean_kind(item["kind"])
        self._clean_contacts(item, noise)

        address = item["company_address"]
        if member_layout:
            # This layout always passes the address through search_address,
            # even when it is empty.
            if address:
                address = address.replace(self._HIDDEN_NOTICE, '')\
                    .replace('地 址:', '')
            else:
                address = ''
            item["company_address"] = self.cw.search_address(address)
        elif address:
            item["company_address"] = self.cw.search_address(address)
        else:
            item["company_address"] = ''
        yield item

    @staticmethod
    def _joined_text(response, query):
        """Return every text node selected by *query*, concatenated."""
        return "".join(response.xpath(query).getall())

    def _read_member_layout(self, item, response):
        """Fill *item* with the raw field values of the "member" layout."""
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称:')]//a/text()").get()
        item["kind"] = ",".join(
            response.xpath("//td[@align='center']//span//a/text()").getall())
        item["linkman"] = "".join(self._LINKMAN_RE.findall(response.text))
        item["company_address"] = self._joined_text(
            response, "//li[contains(text(),'地 址:')]//text()")
        item["telephone"] = self._joined_text(
            response, "//li[@class='haoma']//span[@class='linkstr']//text()")
        item["phone"] = self._joined_text(
            response, "//li[contains(text(),'手 机: ')]//span/a//text()")
        item["contact_Fax"] = self._joined_text(
            response, "//li[contains(text(),'传 真: ')]//span//text()")
        item["contact_QQ"] = self._joined_text(
            response, "//li[contains(text(),'Q:')]//span//text()")
        item["E_Mail"] = self._joined_text(
            response, "//li[contains(text(),'邮 箱: ')]//span//text()")
        item["Source"] = response.url
        item["province"] = ''
        item["city_name"] = ''

    def _read_plain_layout(self, item, response):
        """Fill *item* with the raw field values of the other layout."""
        item["company_Name"] = response.xpath(
            "//p[contains(text(),'公司名称:')]//a/text()").extract_first()
        item["company_address"] = response.xpath(
            "//div[@id='contactleft']//p[contains(text(),'地址:')]/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//div[@class='zs-phone fr']//span/h4/text()").extract_first()
        item["telephone"] = response.xpath(
            "//div[@class='zs-phone fr']//span/p/text()").extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码:')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真:')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//p[contains(text(),'邮箱:')]/text()").extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(self._KEYWORDS_RE.findall(response.text))
        # The first "map.Address" match carries (province, city).
        areas = self._AREA_RE.findall(response.text)
        item["province"], item["city_name"] = areas[0] if areas else ('', '')

    @staticmethod
    def _clean_company_name(raw):
        """Cut *raw* at the first separator found, else strip whitespace.

        Separators are tried in the original order. The source listed the
        ASCII "(" twice, which made the second test unreachable; the
        full-width "(" is assumed to be the intended second alternative.
        """
        for separator in ("(", "(", "_", "-"):
            if separator in raw:
                return raw.split(separator)[0]
        return re.sub(r'\n|\s|\r|\t|公司名称:', '', raw)\
            .replace(' ', '').strip()

    def _clean_kind(self, raw):
        """Normalise the business-scope text into a "|"-separated string."""
        if raw:
            raw = raw.replace(" ", '|')
            # Longer labels come before the bare "主营": with the original
            # order ("主营" first) "主营项目:" could never match in full.
            # The full-width comma replaces a duplicated ASCII-comma call
            # (assumed intent).
            raw = re.sub(r'\n|\s|\r|\t|主营业务:|主营项目:|主营', '', raw)\
                .replace('-', '|').replace('、', '|').replace(',', '|')\
                .replace(',', '|').replace(';', '|').replace('.', '').strip()
        else:
            raw = ''
        return self.cw.rinse_keywords(self.cw.replace_ss(raw))

    def _clean_contacts(self, item, noise=()):
        """Clean the contact fields of *item* in place.

        Every string in *noise* is removed from a field before the field is
        handed to the matching ``CleanWords`` helper.
        """
        def scrub(value):
            for token in noise:
                value = value.replace(token, '')
            return value

        linkman = item["linkman"]
        item["linkman"] = self.cw.search_linkman(
            scrub(linkman.replace('联 系 人:', '')) if linkman else '')

        for field, cleaner in (
                ("phone", self.cw.search_phone_num),
                ("telephone", self.cw.search_telephone_num),
                ("contact_Fax", self.cw.search_contact_Fax),
                ("E_Mail", self.cw.search_email)):
            raw = item[field]
            item[field] = cleaner(scrub(raw)) if raw else ''

        qq = item["contact_QQ"]
        if qq:
            item["contact_QQ"] = self.cw.search_QQ(scrub(qq))
        else:
            # Fall back to a QQ number taken from the e-mail address. The
            # original called search_email here, which copied the e-mail
            # into the QQ field; search_QQ matches YiShangWangSpider.
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except Exception:
                item["contact_QQ"] = ''

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class YiShangWangSpider(CrawlSpider):
    """Crawl esw.com.cn member pages and yield one item per contact page.

    The rules follow company links from the listing and the "next page"
    link, and pass every "联系我们" (contact us) page to :meth:`parse_items`.
    """

    name = "esw"
    allowed_domains = ['www.esw.com.cn']
    start_urls = ['http://www.esw.com.cn/company/default.aspx?page=1']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r"\/member\/.*?\.html",
            restrict_xpaths=(
                "//div[@class='nt_left']//div[@class='yllist']//ul//li//span//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='AspNetPager1']//a[contains(text(),'下一页')]")),
            follow=True),
        # The original expression started with "///", which is not valid
        # XPath, so this rule (the only one with a callback) failed instead
        # of extracting contact links.
        Rule(LinkExtractor(
            allow=r"\/member\/contact\d+\.html",
            restrict_xpaths=("//a[contains(text(),'联系我们')]")),
            callback='parse_items', follow=True),
    )

    # Fallback patterns, applied to the raw HTML when an XPath finds nothing.
    _NAME_RE = re.compile(r'<META name=keywords content=(.*?),.*?>', re.S)
    _LINKMAN_RE = re.compile(
        r'<p>\s*联系人:<span id="Span3">(.*?)</span></p>', re.S)
    _MOBILE_RE = re.compile(
        r'<p>\s*手机:<span id="Span2">(.*?)</span></p>', re.S)
    _TEL_RE = re.compile(
        r'<p>\s*电话:<span id="x_tel">(.*?)</span></p>', re.S)
    _FAX_RE = re.compile(
        r'<p>\s*传真:<span id="x_fax">(.*?)</span></p>', re.S)
    _ADDRESS_RE = re.compile(
        r'</p>\s*地址:<span id="x_address">(.*?)<span>', re.S)
    _KIND_RE = re.compile(r'<META name=keywords content=.*?,(.*?)>', re.S)

    @staticmethod
    def _joined(pattern, text):
        """Return all matches of *pattern* in *text* concatenated ('' if none)."""
        return "".join(pattern.findall(text))

    def parse_items(self, response):
        """Build one company item from a contact page.

        Each field is read with an XPath first; when that yields nothing the
        matching regular expression is tried against the page source.
        """
        page = response.text
        # Every labelled value sits in the <div> following its label <div>.
        sibling = "//div[contains(text(),'%s')]/following-sibling::div/text()"

        item = YiShangWangItem()
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        name = response.xpath(sibling % '公司全称:').get() \
            or self._joined(self._NAME_RE, page)
        item["company_Name"] = re.sub(
            r'\n|\s|\r|\t|公司名称:|企 业 名 称:', '', name)\
            .replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = response.xpath(sibling % '主营业务: ').get()
        if kind:
            kind = kind.replace(' ', '|')
        else:
            kind = self._joined(self._KIND_RE, page)
        # The full-width comma replaces a duplicated ASCII-comma call
        # (assumed intent).
        kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)\
            .replace('-', '|').replace('、', '|').replace(',', '|')\
            .replace(',', '|').replace(';', '|').replace('.', '').strip()
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        # One line holds both the contact name and the mobile number,
        # e.g. '唐总 13533003050(手机) '.
        contact_line = response.xpath(sibling % '联系人').get()
        if contact_line:
            linkman = contact_line.split(' ')[0]
        else:
            linkman = self._joined(self._LINKMAN_RE, page)
        item["linkman"] = self.cw.search_linkman(linkman)

        if contact_line and "手机" in contact_line:
            compact = re.sub(r'\s|\n|\r|\t', '', contact_line)
            digits = "".join(re.findall(r'(\d+)', compact))
            item["phone"] = self.cw.search_phone_num(digits)
        else:
            item["phone"] = self._joined(self._MOBILE_RE, page)

        # One line holds both phone and fax,
        # e.g. '020-81633545 传真:020-81633545'.
        phone_line = response.xpath(sibling % '话:').get()
        if phone_line:
            try:
                item["telephone"] = self.cw.search_telephone_num(
                    phone_line.split(' ')[0])
            except Exception:
                item["telephone"] = ''
        else:
            item["telephone"] = self._joined(self._TEL_RE, page)

        if phone_line and "传真:" in phone_line:
            try:
                item["contact_Fax"] = self.cw.search_contact_Fax(
                    phone_line.split('传真:')[-1])
            except Exception:
                item["contact_Fax"] = ''
        else:
            item["contact_Fax"] = self._joined(self._FAX_RE, page)

        email = response.xpath(
            "//div[contains(text(),'邮')]/following-sibling::div/a/text()"
        ).get()
        item["E_Mail"] = self.cw.search_email(email) if email else ''
        # The QQ number is derived from the cleaned e-mail address.
        if item["E_Mail"]:
            item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
        else:
            item["contact_QQ"] = ''

        address = response.xpath(sibling % '址:').get()
        if address:
            item["company_address"] = self.cw.search_address(address)
        else:
            item["company_address"] = self._joined(self._ADDRESS_RE, page)
        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class KuSoBaSpider(CrawlSpider):
    """Crawl kusoba.com and yield one item per company detail page.

    The rules follow category and pagination links, and pass every
    ``/company/<digits>.html`` page to :meth:`parse_items`.
    """

    name = "ksb"
    allowed_domains = ['www.kusoba.com']
    start_urls = ['http://www.kusoba.com/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # "Host": "www.kusoba.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r"/.*/",
            restrict_xpaths=(
                "//div[@class='beij_center']//div[@class='wenzlieb']//a")),
            follow=True),
        # The dot before "html" is escaped so it only matches a literal ".".
        Rule(LinkExtractor(
            allow=r"/company/\d+\.html",
            restrict_xpaths=(
                "//div[@class='beij_center']//div[@class='nianmf']//a")),
            callback='parse_items', follow=False),
        Rule(LinkExtractor(
            allow=r"/.*/p.*/",
            restrict_xpaths=("//div[@class='fanye']//a")),
            follow=True),
    )

    _TEL_RE = re.compile(
        r'<li><p>电<em></em>话:</p><span>(.*?)</span></li>', re.S)
    _MOBILE_RE = re.compile(
        r'<li><p>移动电话:</p><span>(.*?)</span></li>', re.S)
    _FAX_RE = re.compile(
        r'<li><p>传<em></em>真:</p><span>(.*?)</span></li>', re.S)

    def parse_items(self, response):
        """Build one company item from a company detail page.

        Name, scope, address and legal representative are read from labelled
        ``<div>`` text; phone numbers and fax come from the page source.
        E-mail and QQ are not extracted here and stay empty.
        """
        def div_text(label):
            # Text of every <div> whose own text contains *label*; the label
            # is still part of the result and is removed by the caller.
            return "".join(response.xpath(
                "//div[contains(text(),'%s')]/text()" % label).extract())

        page = response.text
        item = KuSoBaspiderItem()
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""
        item["contact_QQ"] = ''
        item["E_Mail"] = ''

        name = div_text('全称:')
        if name:
            name = re.sub(r'\n|\s|\r|\t|公司名称:|全称:', '', name)\
                .replace(' ', '').strip()
        item["company_Name"] = name
        item["company_id"] = self.get_md5(name)

        kind = div_text('主营产品:')
        if kind:
            # "主营产品:" precedes the bare "主营": in the original order the
            # short alternative matched first and left "产品:" in the value.
            # The full-width comma replaces a duplicated ASCII-comma call
            # (assumed intent).
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营产品:|主营', '', kind)\
                .replace('-', '|').replace('、', '|').replace(',', '|')\
                .replace(',', '|').replace('.', '').strip()
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = div_text('法定代表人:')
        if linkman:
            # Drop anything from the first "(" on, then the label and the
            # "暂未公布" placeholder.
            linkman = linkman.split("(")[0]\
                .replace('法定代表人:', '').replace('暂未公布', '')
        item["linkman"] = self.cw.search_linkman(linkman)

        mobile = "".join(self._MOBILE_RE.findall(page))
        item["phone"] = self.cw.search_phone_num(mobile) if mobile else ''

        telephone = "".join(self._TEL_RE.findall(page))
        item["telephone"] = (
            self.cw.search_telephone_num(telephone) if telephone else '')

        fax = "".join(self._FAX_RE.findall(page))
        item["contact_Fax"] = self.cw.search_contact_Fax(fax) if fax else ''

        address = div_text('注册地址:')
        if address:
            item["company_address"] = self.cw.search_address(
                address.replace('注册地址:', ''))
        else:
            item["company_address"] = ''
        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ZiZhuMaoYiWangSpider(CrawlSpider):
    """Crawl cn.diytrade.com (自助贸易网) and yield one item per company.

    The crawl is a chain of callbacks: category list -> "company info" link
    of each category -> paginated company list -> company page.
    """

    name = 'diytrade'
    allowed_domains = ['cn.diytrade.com', 'diytrade.com']
    start_urls = ['https://cn.diytrade.com/china/main.html']
    cw = CleanWords()
    custom_settings = {
        # NOTE(review): the original comment said the minimum delay is 2s,
        # but the configured value is 0.3 - confirm which one is intended.
        'DOWNLOAD_DELAY': 0.3,
        # 'AUTOTHROTTLE_ENABLED': True,  # enable auto-throttling
        # 'AUTOTHROTTLE_DEBUG': True,  # auto-throttling debug output
        # 'AUTOTHROTTLE_MAX_DELAY': 10,  # maximum download delay
        'DOWNLOAD_TIMEOUT': 5,  # download timeout
        'CONCURRENT_REQUESTS_PER_DOMAIN': 5,  # cap concurrent requests to the site
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
            # 'BigB2BSpider.middlewares.ProcessAllExceptionMiddleware': 120,
        }
    }

    _QQ_UIN_RE = re.compile(r'uin=(.*?)&', re.S)
    # Matches e.g. '广东省深圳市' -> ('广东', '深圳').
    _REGION_RE = re.compile(r'(.*?)省(.*?)市', re.S)

    def parse(self, response):
        """Request the listing page of every product category."""
        hrefs = response.xpath(
            "//div[@class='prodCatListDIV']//ul[@class='prodCatList']//li//a/@href"
        ).extract()
        for href in hrefs:
            if not href:
                continue
            # urljoin leaves absolute links (http or https) untouched. The
            # original only recognised "http://", so an "https://" link got
            # the site root prepended and became invalid.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_kind_list,
                dont_filter=True
            )

    def parse_kind_list(self, response):
        """Follow the "» 公司信息" (company info) link of a category page."""
        href = response.xpath(
            "//a[contains(text(),'» 公司信息')]/@href").extract_first()
        if href:
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_company_list,
                dont_filter=True
            )

    def parse_company_list(self, response):
        """Request every company on a list page, then the next list page.

        The company name is taken from the list entry and travels to
        :meth:`parse_company_contact` inside ``meta["item"]``.
        """
        entries = response.xpath(
            "//form[@name='itemForm']//ul[@class='comItems']//li")
        for entry in entries:
            item = ZiZhuMaoYiWangspiderItem()
            item["company_Name"] = entry.xpath(
                ".//div[@class='col3']/h3/a/text()").extract_first()
            href = entry.xpath(
                ".//div[@class='col3']/h3/a/@href").extract_first()
            if href:
                yield scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.parse_company_contact,
                    meta={"item": item},
                    dont_filter=True
                )

        next_href = response.xpath(
            "//div[@class='clearfix pageNavList']//a[contains(text(),'下一页')]/@href"
        ).extract_first()
        if next_href:
            yield scrapy.Request(
                url=response.urljoin(next_href),
                callback=self.parse_company_list
            )

    def parse_company_contact(self, response):
        """Complete the item from ``meta["item"]`` with the company page data."""
        def cell(label, suffix="text()"):
            # Selector for the <td> that follows the <th> holding *label*.
            return response.xpath(
                "//th[contains(text(),'%s')]/following-sibling::td/%s"
                % (label, suffix))

        item = response.meta["item"]
        item["Source"] = response.url

        region = cell('国家/地区︰', "h3/text()").extract_first()
        found = self._REGION_RE.search(region) if region else None
        if found:
            item["province"], item["city_name"] = found.groups()
        else:
            item["province"] = ''
            item["city_name"] = ''

        name = item["company_Name"]
        if name:
            # NOTE(review): as written, "(个人账号)" is a regex group that
            # removes "个人账号" but keeps literal parentheses - confirm
            # whether literal (possibly full-width) brackets were intended.
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:|(个人账号)', '', name)\
                .replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = cell('主营行业︰', "h3/text()").extract_first()
        if kind:
            # The full-width comma replaces a duplicated ASCII-comma call
            # (assumed intent).
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)\
                .replace(' ', '|').replace('-', '|').replace('、', '|')\
                .replace(',', '|').replace(',', '|').replace('.', '').strip()
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        item["linkman"] = self.cw.search_linkman(
            "".join(cell('联系人︰').extract()))
        item["phone"] = self.cw.search_phone_num(
            "".join(cell('手机︰').extract()))
        item["telephone"] = self.cw.search_telephone_num(
            "".join(cell('电话︰').extract()))
        item["contact_Fax"] = self.cw.search_contact_Fax(
            cell('传真︰').extract_first() or '')

        # The QQ number is the "uin" query parameter of the chat link.
        chat_href = "".join(response.xpath(
            "//img[@title='点击这里给我发消息']/../@href").extract())
        item["contact_QQ"] = self.cw.search_QQ(
            "".join(self._QQ_UIN_RE.findall(chat_href)))

        email = "".join(cell('公司邮箱︰').extract())
        if email:
            item["E_Mail"] = self.cw.search_email(email)
        elif item["contact_QQ"]:
            # No e-mail on the page: derive one from the QQ number.
            item["E_Mail"] = item["contact_QQ"] + "@qq.com"
        else:
            item["E_Mail"] = ''

        address = cell('地址︰').extract_first()
        if address:
            # NOTE(review): the second replace cannot match after the first
            # one removed every ","; kept exactly as in the original.
            address = address.replace(",", "").replace(',', '|').strip()
        else:
            address = ''
        item["company_address"] = self.cw.search_address(address)
        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ShangLuWang(CrawlSpider):
    """Crawl the b2b6.com directory and yield one item per company page.

    The crawl is a chain of callbacks: front page -> category page ->
    paginated company list -> company page.
    """

    name = 'shl'
    allowed_domains = ['www.b2b6.com']
    start_urls = ['http://www.b2b6.com']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    # Default file used by requests_href to store a downloaded image.
    IMAGE_FILE_PATH = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img\image.png"

    _NAME_RE = re.compile(r'<b>公司名称: </b> (.*?)<br />', re.S)
    _TEL_RE = re.compile(r'<b>联系电话:</b> (.*?)<br />', re.S)
    _ADDRESS_RE = re.compile(r'<b>公司地址:</b> (.*?)<br />', re.S)

    def _follow_all(self, response, hrefs, callback):
        """Yield an unfiltered request to *callback* for each non-empty href."""
        for href in hrefs:
            if href:
                # urljoin also copes with absolute links, which plain
                # string concatenation with the site root would corrupt.
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=callback,
                                     dont_filter=True)

    def parse(self, response):
        """Request every top-level category linked from the front page."""
        hrefs = response.xpath(
            "//div[@id='dMain']//div[@class='mt']//a/@href").extract()
        return self._follow_all(response, hrefs, self.parse_kind_list)

    def parse_kind_list(self, response):
        """Request every sub-category listed on a category page."""
        hrefs = response.xpath(
            "//div[@id='dMain']//div[@id='dCatalogueBox']//ul//li//a/@href"
        ).extract()
        return self._follow_all(response, hrefs, self.parse_company_list)

    def parse_company_list(self, response):
        """Request the company page of each list box, then the next page.

        Name and address found on the list page are stored in the item that
        travels to :meth:`parse_company_contact` inside ``meta["item"]``.
        """
        boxes = response.xpath("//div[@id='dMain']//div[@id='dMainBox']")
        for box in boxes:
            item = ShangLuWangspiderItem()
            item["company_Name"] = box.xpath(".//a/text()").extract_first()
            item["company_address"] = box.xpath(
                ".//span[@class='addr']/text()").extract_first()
            item["province"] = ''
            item["city_name"] = ''
            item["kind"] = ''
            href = box.xpath(".//a/@href").extract_first()
            if href:
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_href = response.xpath(
            "//div[@id='dMain']//div[@id='dMainBox']//b[contains(text(),'下一页')]/../@href"
        ).extract_first()
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        """Complete the item from ``meta["item"]`` with the company page data.

        Name, address and telephone are re-read from the page source and
        replace the list-page values. The category comes from the third
        breadcrumb link. Contact person, mobile, fax, e-mail and QQ are not
        extracted from this page.
        """
        item = response.meta["item"]
        page = response.text
        item["Source"] = response.url

        name = "".join(self._NAME_RE.findall(page)) if page else ''
        if name:
            name = re.sub(r'\n|\s|\r|\t|公司名称:', '', name)\
                .replace(' ', '').strip()
        item["company_Name"] = name
        item["company_id"] = self.get_md5(name)

        # The original cleaned item["kind"] and then overwrote the result
        # with the breadcrumb text, so only the breadcrumb path is kept.
        # A missing breadcrumb is passed on as '' rather than None.
        breadcrumb = response.xpath(
            "//div[@id='dNavBox']/div/a[3]/text()").extract_first()
        kind = self.cw.rinse_keywords(self.cw.replace_ss(breadcrumb or ''))
        item["kind"] = kind.split('</')[0] if kind else ''

        item["linkman"] = self.cw.search_linkman('')
        item["phone"] = ''

        telephone = "".join(self._TEL_RE.findall(page)) if page else ''
        item["telephone"] = (
            self.cw.search_telephone_num(telephone) if telephone else '')

        item["contact_Fax"] = ''
        item["E_Mail"] = ''
        item["contact_QQ"] = ''

        address = "".join(self._ADDRESS_RE.findall(page)) if page else ''
        item["company_address"] = (
            self.cw.search_address(address) if address else '')
        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of *value*, or '' for a falsy value."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers, image_path=None):
        """Download *url*, save the body to a file and run image recognition.

        :param url: address of the image to download.
        :param headers: HTTP headers sent with the request.
        :param image_path: file the image is written to; defaults to
            ``IMAGE_FILE_PATH``.
        :return: the text returned by ``recognition_image``, or '' when the
            status is not OK, the body is empty, or recognition fails.
        """
        if image_path is None:
            image_path = self.IMAGE_FILE_PATH
        # NOTE(review): verify=False disables TLS certificate checks.
        res = requests.get(url=url, headers=headers, timeout=20, verify=False)
        res.encoding = "utf-8"
        if res.status_code != requests.codes.ok:
            return ''
        img = res.content
        # The with-block closes the file; the explicit close() was redundant.
        with open(image_path, "wb") as fp:
            fp.write(img)
        if not img:
            return ''
        try:
            return recognition_image(image_path) or ''
        except Exception:
            return ''
class QuanQiuTieYiWangSpider(CrawlSpider):
    """Spider "qqtyw": crawls the tybaba.com company directory.

    The rules follow category links, company links and the "»" pager; pages
    reached through the "联系方式" (contact) menu entry are handed to
    :meth:`parse_items`, which emits one item per page.
    """

    name = "qqtyw"
    allowed_domains = ['tybaba.com', 'www.tybaba.com']
    start_urls = ['http://www.tybaba.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302,
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        },
    }

    rules = (
        # Category links in the sub-navigation.
        Rule(
            LinkExtractor(
                allow=r".*",
                restrict_xpaths=(
                    "//div[@class='nav-sub']//li[@class='mod_cate']//dl//dt//a"),
            ),
            follow=True,
        ),
        # First link of every entry in the company listing table.
        Rule(
            LinkExtractor(
                allow=r".*",
                restrict_xpaths=(
                    "//div[@class='listnew']//td[@align='left']//li[1]//a"),
            ),
            follow=True,
        ),
        # "»" link of the pager.
        Rule(
            LinkExtractor(
                allow=r".*",
                restrict_xpaths=(
                    "//div[@class='pages']//a[contains(text(),'»')]"),
            ),
            follow=True,
        ),
        # Contact page of a company: the only rule with a callback.
        Rule(
            LinkExtractor(
                allow=r".*",
                restrict_xpaths=(
                    "//div[@id='menu']//span[contains(text(),'联系方式')]/.."),
            ),
            callback='parse_items',
            follow=False,
        ),
    )

    def parse_items(self, response):
        """Build one ``QuanQiuTieYiWangspiderItem`` from a contact page.

        Contact fields are read from the table cell following each label
        cell, ``kind`` from the ``keywords`` meta tag.  Every field is then
        passed through the matching ``CleanWords`` helper, or set to ``''``
        when nothing was extracted.
        """
        xp = response.xpath
        cleaner = self.cw
        keywords_re = re.compile(
            r'<meta name="keywords" content="(.*?)"/>', re.S)

        record = QuanQiuTieYiWangspiderItem()
        record["company_Name"] = xp(
            "//td[contains(text(),'公司名称:')]/following-sibling::td/text()"
        ).get()
        record["company_address"] = "".join(xp(
            "//td[contains(text(),'公司地址:')]/following-sibling::td/text()"
        ).getall())
        record["linkman"] = xp(
            "//td[contains(text(),'联 系 人:')]/following-sibling::td/text()"
        ).get()
        record["telephone"] = xp(
            "//td[contains(text(),'公司电话:')]/following-sibling::td/text()"
        ).get()
        record["phone"] = xp(
            "//td[contains(text(),'手机号码:')]/following-sibling::td/text()"
        ).get()
        record["contact_Fax"] = xp(
            "//td[contains(text(),'公司传真:')]/following-sibling::td/text()"
        ).get()
        record["contact_QQ"] = xp(
            "//td[contains(text(),'即时通讯:')]/following-sibling::td/a/@href"
        ).get()
        record["E_Mail"] = xp(
            "//td[contains(text(),'电子邮件:')]/following-sibling::td/text()"
        ).get()
        # Joining an empty match list already gives '', so no guard needed.
        record["kind"] = "".join(keywords_re.findall(response.text))
        record["Source"] = response.url
        record["province"] = ''
        record["city_name"] = ''

        company = record["company_Name"]
        if company:
            company = re.sub(r'\n|\s|\r|\t|公司名称:', '', company)
            record["company_Name"] = company.replace(' ', '').strip()
        record["company_id"] = self.get_md5(record["company_Name"])

        keywords = record["kind"]
        if keywords:
            keywords = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', keywords)
            # Normalise the separators to '|' and drop full stops.
            for old, new in (('-', '|'), ('、', '|'), (',', '|'), ('.', '')):
                keywords = keywords.replace(old, new)
            keywords = keywords.strip()
        else:
            keywords = ''
        record["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(keywords))

        # The contact-name helper is always called, with '' when empty.
        record["linkman"] = cleaner.search_linkman(record["linkman"] or '')

        # The remaining helpers only run on non-empty values.
        for field, scrub in (
                ("phone", cleaner.search_phone_num),
                ("telephone", cleaner.search_telephone_num),
                ("contact_Fax", cleaner.search_contact_Fax),
                ("E_Mail", cleaner.search_email),
                ("contact_QQ", cleaner.search_QQ),
                ("company_address", cleaner.search_address),
        ):
            raw = record[field]
            record[field] = scrub(raw) if raw else ''

        yield record

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if not value:
            return ''
        return md5(value.encode()).hexdigest()
class QuanQiuFangZhiWangSpider(CrawlSpider):
    """Spider "tnc": crawls the tnc.com.cn company directory.

    The rules follow area links, company links and the "next page" link;
    pages reached through a "联系方式" (contact) link are handed to
    :meth:`parse_items`.
    """

    name = "tnc"
    allowed_domains = ['tnc.com.cn', 'www.tnc.com.cn']
    start_urls = ['https://www.tnc.com.cn/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302,
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
        },
    }

    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='area-list']//li//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='result-list-company']//p[@class='tit']//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[@class='page-next']")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'联系方式')]")),
            callback='parse_items', follow=False),
    )

    # Patterns applied to the raw page HTML.
    _NAME_RE = re.compile(r'<div class="jbxx_zt">(.*?)</div>', re.S)
    _INTRO_RE = re.compile(r'<p class="indouce">(.*?)</p>', re.S)
    _DESCRIPTION_RE = re.compile(
        r'<meta name="description" content="(.*?)"/>', re.S)
    # Patterns applied to the description meta content, e.g.
    # "舟山达利针织有限公司负责人:Jill,手机:,座机:-0580-8805716,
    #  传真:-0580-8805500,详细地址:定海区盐仓".
    _MOBILE_RE = re.compile(r'手机:(.*?),', re.S)
    _LANDLINE_RE = re.compile(r'座机:(.*?),', re.S)
    _FAX_RE = re.compile(r'传真:(.*?),', re.S)
    _ADDRESS_RE = re.compile(r'详细地址:(.*)', re.S)
    _CONTACT_RE = re.compile(r'负责人:(.*?),', re.S)

    def parse_items(self, response):
        """Yield one item for a contact page, or nothing on failure.

        An empty response body yields nothing.  Parsing stays best-effort:
        any exception raised while building the item is logged with its
        traceback and the page is skipped, instead of being swallowed
        silently.
        """
        if not response.text:
            return
        try:
            item = self._build_item(response)
        except Exception:
            self.logger.exception(
                "Failed to parse contact page %s", response.url)
            return
        yield item

    def _build_item(self, response):
        """Extract and clean every field of a ``QuanQiuFangZhiWangspiderItem``."""
        html = response.text
        cleaner = self.cw
        # Most contact fields are embedded in the description meta tag.
        content = "".join(self._DESCRIPTION_RE.findall(html))
        intro = self._INTRO_RE.search(html)

        item = QuanQiuFangZhiWangspiderItem()
        item["company_Name"] = "".join(self._NAME_RE.findall(html))
        item["kind"] = intro.group(1) if intro else ''
        item["company_address"] = "".join(self._ADDRESS_RE.findall(content))
        item["linkman"] = "".join(self._CONTACT_RE.findall(content))
        item["telephone"] = "".join(self._LANDLINE_RE.findall(content))
        item["phone"] = "".join(self._MOBILE_RE.findall(content))
        item["contact_Fax"] = "".join(self._FAX_RE.findall(content))
        item["contact_QQ"] = ""
        item["E_Mail"] = ""
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:|全称:', '',
                item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            # Longer labels come first: regex alternation takes the first
            # alternative that matches, so a leading "主营" would leave
            # "产品:" behind.
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营产品:|主营', '', kind)
            for old, new in (('-', '|'), ('、', '|'), (',', '|'), ('.', '')):
                kind = kind.replace(old, new)
            kind = kind.strip()
        else:
            kind = ''
        item["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(kind))

        linkman = item["linkman"]
        if linkman:
            # Keep only the part before any "(" and drop placeholder text.
            linkman = linkman.split("(")[0].replace(
                '法定代表人:', '').replace('暂未公布', '')
        item["linkman"] = cleaner.search_linkman(linkman or '')

        for field, scrub in (
                ("phone", cleaner.search_phone_num),
                ("telephone", cleaner.search_telephone_num),
                ("contact_Fax", cleaner.search_contact_Fax),
                ("E_Mail", cleaner.search_email),
                ("contact_QQ", cleaner.search_QQ),
        ):
            raw = item[field]
            item[field] = scrub(raw) if raw else ''

        address = item["company_address"]
        if address:
            # The greedy address pattern can run past the closing quote of
            # the meta attribute; cut at the first double quote.
            item["company_address"] = cleaner.search_address(
                address.split('"')[0])
        else:
            item["company_address"] = ''

        return item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ZaoWaiXinXiWangSpider(CrawlSpider):
    """Spider "zaow": crawls company pages on www.zaowai.com.

    Directory links and the "next page" link are followed; links to
    ``/com/...`` company pages are parsed by :meth:`parse_items`.
    """

    name = 'zaow'
    allowed_domains = ['www.zaowai.com']
    start_urls = ['http://www.zaowai.com/page/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
        },
    }

    rules = (
        # Directory entries.
        Rule(
            LinkExtractor(
                allow=r".*",
                restrict_xpaths=("//div[@class='dirlist whitebg']//li//div//a"),
            ),
            follow=True,
        ),
        # Company pages: parsed and followed.
        Rule(
            LinkExtractor(
                allow=r"http\:\/\/www\.zaowai\.com\/com\/.*\/",
                restrict_xpaths=(
                    "//div[@id='listcolumn']//li//div[@class='company']//a"),
            ),
            callback='parse_items',
            follow=True,
        ),
        # Pager.
        # NOTE(review): the allow pattern only matches /guangdong/ pages, so
        # other listings are presumably never paginated; confirm intent.
        Rule(
            LinkExtractor(
                allow=r"http\:\/\/www\.zaowai\.com\/guangdong\/pn\d+\/",
                restrict_xpaths=("//a[contains(text(),'下一页»')]"),
            ),
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Build one ``ZaoWaiXinXiWangItem`` from a company page.

        Fields are read with XPath, then passed through the matching
        ``CleanWords`` helper, or set to ``''`` when nothing was extracted.
        ``phone``, ``province`` and ``city_name`` are always empty.
        """
        xp = response.xpath
        cleaner = self.cw

        def joined(query):
            # Concatenate every text node matched by ``query``.
            return "".join(xp(query).getall())

        record = ZaoWaiXinXiWangItem()
        record["company_Name"] = xp(
            "//div[@class='companyname']/h1/text()").get()
        record["kind"] = xp("//div[@class='shop-keyword']/text()").get()
        record["company_address"] = joined(
            "//li[contains(text(),'公司地址:')]/text()")
        record["linkman"] = joined("//li[contains(text(),'人:')]/text()")
        record["telephone"] = joined(
            "//li[contains(text(),'联系电话:')]/text()")
        record["phone"] = ""
        record["contact_Fax"] = joined(
            "//li[contains(text(),'公司传真:')]/text()")
        record["contact_QQ"] = joined("//img[@alt='联系QQ']/../@href")
        record["E_Mail"] = joined("//li[contains(text(),'电子邮箱:')]/text()")
        record["Source"] = response.url
        record["province"] = ''
        record["city_name"] = ''

        company = record["company_Name"]
        if company:
            company = re.sub(r'\n|\s|\r|\t|公司名称:', '', company)
            record["company_Name"] = company.replace(' ', '').strip()
        record["company_id"] = self.get_md5(record["company_Name"])

        keywords = record["kind"]
        if keywords:
            # Spaces become separators before the whitespace is stripped.
            keywords = keywords.replace(' ', '|')
            keywords = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', keywords)
            for old, new in (('-', '|'), ('、', '|'), (',', '|'),
                             (';', '|'), ('.', '')):
                keywords = keywords.replace(old, new)
            keywords = keywords.strip()
        else:
            keywords = ''
        record["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(keywords))

        contact = record["linkman"]
        if contact:
            contact = re.sub(r'\s|\n|\r|\t', '', contact)
            contact = contact.replace("联系人:", "")
        else:
            contact = ''
        record["linkman"] = cleaner.search_linkman(contact)

        for field, scrub in (
                ("phone", cleaner.search_phone_num),
                ("telephone", cleaner.search_telephone_num),
                ("contact_Fax", cleaner.search_contact_Fax),
                ("E_Mail", cleaner.search_email),
                ("contact_QQ", cleaner.search_QQ),
                ("company_address", cleaner.search_address),
        ):
            raw = record[field]
            record[field] = scrub(raw) if raw else ''

        yield record

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if not value:
            return ''
        return md5(value.encode()).hexdigest()
class QiLingWangSpider(CrawlSpider):
    """Spider "qlw": crawls the 707070.cn company directory.

    The rules follow area links, company links and the "下一页" (next page)
    link; pages reached through the "联系我们" (contact us) navigation entry
    are handed to :meth:`parse_items`.
    """

    name = "qlw"
    allowed_domains = ['www.707070.cn', '707070.cn']
    start_urls = ['http://www.707070.cn/city/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302,
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
        },
    }

    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='hot']//dl[@class='area']//h4//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='content list']//li[@class='listbox-item']//h2//a"
            )),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页')]")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='nav']//a[contains(text(),'联系我们')]")),
            callback='parse_items', follow=False),
    )

    # First 11-digit number starting with 1 anywhere in the page source.
    _MOBILE_RE = re.compile(r'(1\d{10})', re.S)
    # Opening parenthesis, half- or full-width.
    _PAREN_RE = re.compile(r'[((]')

    def parse_items(self, response):
        """Build one ``QiLingWangItem`` from a company contact page.

        Fields are read with XPath, except ``phone`` which is the first
        match of ``_MOBILE_RE`` in the raw HTML.  Every field is then passed
        through the matching ``CleanWords`` helper, or set to ``''`` when
        nothing was extracted.  ``E_Mail``, ``province`` and ``city_name``
        are always empty.
        """
        xp = response.xpath
        cleaner = self.cw
        mobile = self._MOBILE_RE.search(response.text)

        item = QiLingWangItem()
        item["company_Name"] = xp("//div[@class='comname']/text()").get()
        item["kind"] = xp("//p[contains(text(),'主营:')]/text()").get()
        item["company_address"] = xp(
            "//div[@class='side']//li[contains(text(),'地址:')]/text()").get()
        item["linkman"] = xp(
            "//div[@class='main']//li[contains(text(),'联系人:')]/text()"
        ).get()
        item["telephone"] = xp(
            "//div[@class='main']//li[contains(text(),'电话:')]/text()").get()
        item["phone"] = mobile.group(1) if mobile else ''
        item["contact_Fax"] = xp(
            "//div[@class='main']//li[contains(text(),'传真:')]/text()").get()
        item["contact_QQ"] = xp(
            "//div[@class='main']//li[contains(text(),'Q Q:')]/text()").get()
        item["E_Mail"] = ''
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        name = item["company_Name"]
        if name:
            # Drop any parenthesised suffix, then always strip whitespace
            # and the label, so company_id is hashed from the cleaned name
            # whether or not the raw name had a parenthesis.
            name = self._PAREN_RE.split(name, maxsplit=1)[0]
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '', name).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            # Spaces become separators before the whitespace is stripped.
            kind = kind.replace(' ', '|')
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)
            for old, new in (('-', '|'), ('、', '|'), (',', '|'),
                             (';', '|'), ('.', '')):
                kind = kind.replace(old, new)
            kind = kind.strip()
        else:
            kind = ''
        item["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(kind))

        item["linkman"] = cleaner.search_linkman(item["linkman"] or '')

        for field, scrub in (
                ("phone", cleaner.search_phone_num),
                ("telephone", cleaner.search_telephone_num),
                ("contact_Fax", cleaner.search_contact_Fax),
                ("E_Mail", cleaner.search_email),
        ):
            raw = item[field]
            item[field] = scrub(raw) if raw else ''

        # The QQ helper is always called, with '' when nothing was found.
        qq = item["contact_QQ"]
        item["contact_QQ"] = cleaner.search_QQ(
            qq.replace("Q Q:", '') if qq else '')

        address = item["company_address"]
        item["company_address"] = (
            cleaner.search_address(address) if address else '')

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class QiYe39Spider(CrawlSpider):
    """Spider "qy39": crawls the qy39.com company directory.

    No crawl rules are defined; requests are chained by hand:
    :meth:`parse` (categories) -> :meth:`parse_company_list` (listing and
    pager) -> :meth:`parse_company_contact` (one item per company).
    """

    name = 'qy39'
    allowed_domains = ['qy39.com', 'www.qy39.com']
    start_urls = ['http://www.qy39.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
        },
    }

    def parse(self, response):
        """Request the company listing of every category link."""
        for anchor in response.xpath(
                "//div[@class='categoryList']//ul//li//a"):
            kind_href = anchor.xpath("./@href").get()
            if kind_href:
                yield scrapy.Request(
                    url="http://www.qy39.com" + kind_href,
                    callback=self.parse_company_list,
                    dont_filter=True)

    def parse_company_list(self, response):
        """Request the "/qyjs/" page of each listed company, then the next page.

        The company name shown in the listing travels with the request in
        ``meta["item"]``.
        """
        for li in response.xpath("//div[@class='listMain02']//ul//li"):
            company_href = li.xpath(".//a[@class='proName02']/@href").get()
            if not company_href:
                continue
            item = QiYe39spiderItem()
            item["company_Name"] = li.xpath(
                ".//a[@class='proName02']/text()").get()
            yield scrapy.Request(
                url=company_href + "/qyjs/",
                callback=self.parse_company_contact,
                meta={"item": item},
                dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='listPage']//a[contains(text(),'下一页')]/@href"
        ).get()
        if next_page_url:
            # NOTE(review): every "." is removed from the href before it is
            # prefixed with the host; presumably the hrefs look like
            # "./path". Verify against the live markup.
            next_page_url = next_page_url.replace(".", "")
            yield scrapy.Request(
                url="http://www.qy39.com" + next_page_url,
                callback=self.parse_company_list)

    def parse_company_contact(self, response):
        """Complete the item carried in ``response.meta`` and yield it.

        The name found on this page replaces the one from the listing; when
        this page has none, the listing name is kept.  ``kind``,
        ``contact_QQ``, ``province`` and ``city_name`` are not scraped and
        start out empty.
        """
        xp = response.xpath
        cleaner = self.cw

        def joined(query):
            # Concatenate every text node matched by ``query``.
            return "".join(xp(query).getall())

        item = response.meta["item"]
        detail_name = xp(
            "//th[contains(text(),'公司名称')]/following-sibling::td//font/text()"
        ).get()
        if detail_name:
            item["company_Name"] = detail_name
        item["kind"] = ""
        item["company_address"] = joined(
            "//th[contains(text(),'企业地址')]/following-sibling::td/text()")
        item["linkman"] = joined("//span[contains(text(),'联系人:')]/..//text()")
        item["telephone"] = joined("//span[contains(text(),'电话:')]/..//text()")
        item["phone"] = joined("//span[contains(text(),'手机:')]/..//text()")
        item["E_Mail"] = joined(
            "//th[contains(text(),'电子邮件')]/following-sibling::td/text()")
        item["contact_Fax"] = joined(
            "//th[contains(text(),'传真号码')]/following-sibling::td/text()")
        item["contact_QQ"] = ""
        item["province"] = ''
        item["city_name"] = ''
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '',
                item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营', '', kind)
            for old, new in (('-', '|'), ('、', '|'), (',', '|'), ('.', '')):
                kind = kind.replace(old, new)
            kind = kind.strip()
        else:
            kind = ''
        item["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(kind))

        # In this spider these helpers are called even on empty values.
        item["linkman"] = cleaner.search_linkman(item["linkman"] or '')
        item["phone"] = cleaner.search_phone_num(item["phone"] or '')
        item["telephone"] = cleaner.search_telephone_num(
            item["telephone"] or '')
        item["contact_Fax"] = cleaner.search_contact_Fax(
            item["contact_Fax"] or '')
        # The e-mail helper only runs on a non-empty value.
        item["E_Mail"] = (
            cleaner.search_email(item["E_Mail"]) if item["E_Mail"] else '')
        item["contact_QQ"] = cleaner.search_QQ(item["contact_QQ"] or '')
        item["company_address"] = cleaner.search_address(
            item["company_address"] or '')

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ShangQuWang(CrawlSpider):
    """Spider "salqu": crawls the www.salqu.com company directory.

    No crawl rules are defined; requests are chained by hand:
    :meth:`parse` (regions) -> :meth:`parse_company_list` (listing and
    pager) -> :meth:`parse_company_contact` (one item per company).

    Phone, fax and e-mail are published as images on the contact page; they
    are downloaded and passed to ``recognition_image`` by
    :meth:`requests_href`.
    """

    name = 'salqu'
    allowed_domains = ['www.salqu.com']
    start_urls = ['http://www.salqu.com/company/']
    cw = CleanWords()

    # File the downloaded contact image is written to before recognition.
    # The default is the path the spider has always used; override it on
    # machines where that directory does not exist.
    ocr_image_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img\image.png"

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "PHPSESSID=gluoteaio8b0brt17o9msoqa14",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
        },
    }

    def parse(self, response):
        """Request the company listing of every region link."""
        for anchor in response.xpath(
                "//div[@class='col-xs-12']//dd[@class='place']//a"):
            kind_href = anchor.xpath("./@href").get()
            if kind_href:
                yield scrapy.Request(
                    url=kind_href,
                    callback=self.parse_company_list,
                    dont_filter=True)

    def parse_company_list(self, response):
        """Request the "contact/" page of each listed company, then the next page.

        Name, main products and region ("province/city") are read from the
        listing and travel with the request in ``meta["item"]``.
        """
        for div in response.xpath(
                "//div[@id='list']//div[@class='company-wrap']"):
            item = ShangQuWangspiderItem()
            item["company_Name"] = div.xpath(
                ".//span[@class='company-name']/a/text()").get()
            company_href = div.xpath(
                ".//span[@class='company-name']/a/@href").get()
            item["kind"] = div.xpath(
                ".//span[contains(text(),'主营产品:')]/text()").get()
            city_infos = div.xpath(
                ".//p[contains(text(),'所在地区')]/following-sibling::p/text()"
            ).get()

            # e.g. "广东/潮州市": province before the slash, city after it.
            region = city_infos.split("/") if city_infos else ['']
            item["province"] = region[0]
            item["city_name"] = region[1] if len(region) > 1 else ''

            if company_href:
                yield scrapy.Request(
                    url=company_href + "contact/",
                    callback=self.parse_company_contact,
                    meta={"item": item},
                    dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).get()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        """Complete the item carried in ``response.meta`` and yield it.

        Telephone, mobile, fax and e-mail are image URLs on this page; each
        non-empty one is resolved through :meth:`requests_href` before the
        usual ``CleanWords`` helper is applied.
        """
        xp = response.xpath
        cleaner = self.cw

        # Same headers as the spider's defaults, plus this page as referer.
        headers = dict(self.custom_settings['DEFAULT_REQUEST_HEADERS'])
        headers["Referer"] = response.url

        def joined(query):
            # Concatenate every value matched by ``query``.
            return "".join(xp(query).getall())

        def read_image(src):
            # Resolve a possibly relative image src, then run recognition.
            return self.requests_href(response.urljoin(src), headers)

        item = response.meta["item"]
        item["company_Name"] = xp(
            "//td[contains(text(),'公司名称:')]/following-sibling::td/text()"
        ).get()
        item["company_address"] = joined(
            "//td[contains(text(),'公司地址:')]/following-sibling::td/a/text()")
        item["linkman"] = joined(
            "//td[contains(text(),'联 系 人:')]/following-sibling::td/text()")
        item["telephone"] = joined(
            "//td[contains(text(),'公司电话:')]/following-sibling::td/img/@src")
        item["phone"] = joined(
            "//td[contains(text(),'手机号码:')]/following-sibling::td/img/@src")
        item["E_Mail"] = joined(
            "//td[contains(text(),'电子邮件:')]/following-sibling::td/img/@src")
        item["contact_Fax"] = joined(
            "//td[contains(text(),'公司传真:')]/following-sibling::td/img/@src")
        item["contact_QQ"] = joined("//img[@title='点击QQ交谈/留言']/../@href")
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(
                r'\n|\s|\r|\t|公司名称:', '',
                item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        kind = item["kind"]
        if kind:
            kind = re.sub(r'\n|\s|\r|\t|主营业务:|主营产品:|主营', '', kind)
            for old, new in (('-', '|'), ('、', '|'), (',', '|'), ('.', '')):
                kind = kind.replace(old, new)
            kind = kind.strip()
        else:
            kind = ''
        item["kind"] = cleaner.rinse_keywords(cleaner.replace_ss(kind))

        item["linkman"] = cleaner.search_linkman(item["linkman"] or '')

        for field, scrub in (
                ("phone", cleaner.search_phone_num),
                ("telephone", cleaner.search_telephone_num),
                ("contact_Fax", cleaner.search_contact_Fax),
        ):
            src = item[field]
            item[field] = scrub(read_image(src)) if src else ''

        if item["E_Mail"]:
            item["E_Mail"] = read_image(item["E_Mail"])
            if item["E_Mail"]:
                # NOTE(review): these substitutions look like corrections
                # for misread characters in the recognised text, but
                # replacing every "e" also alters addresses that genuinely
                # contain that letter; confirm against real output.
                item["E_Mail"] = item["E_Mail"].replace("e", "@").replace(
                    "8126", "@163").replace("8163", "@163").strip()
        else:
            item["E_Mail"] = ''

        for field, scrub in (
                ("contact_QQ", cleaner.search_QQ),
                ("company_address", cleaner.search_address),
        ):
            raw = item[field]
            item[field] = scrub(raw) if raw else ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or ``''`` if it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        """Download the image at ``url`` and return its recognised text.

        Returns ``''`` when the download fails, the status is not 200, the
        body is empty, or recognition fails or finds nothing.

        NOTE: this is a blocking ``requests`` call made from inside a spider
        callback, and TLS verification is disabled (``verify=False``); both
        are kept from the original implementation.
        """
        try:
            res = requests.get(url=url, headers=headers, timeout=20,
                               verify=False)
        except requests.RequestException:
            # A failed image download only costs this one field.
            self.logger.warning(
                "Could not download contact image %s", url, exc_info=True)
            return ''

        if res.status_code != requests.codes.ok or not res.content:
            return ''

        with open(self.ocr_image_path, "wb") as fp:
            fp.write(res.content)

        try:
            return recognition_image(self.ocr_image_path) or ''
        except Exception:
            # Recognition is best-effort: log the failure, keep the item.
            self.logger.warning(
                "Image recognition failed for %s", url, exc_info=True)
            return ''
class WuJiuShangWuWangSpider(CrawlSpider):
    """Spider ``sw59``: company names, keywords and region from www.59b2b.com.

    ``parse`` requests every category link found on the start page;
    ``parse_company_list`` emits one item per table row of a listing page and
    then follows the "next page" link.  Contact fields are not available on
    the listing pages and are emitted as empty strings.

    NOTE(review): the class derives from ``CrawlSpider`` but defines no
    ``rules`` and overrides ``parse`` directly -- confirm this is intended for
    the Scrapy version in use.
    """

    name = 'sw59'
    allowed_domains = ['www.59b2b.com']
    start_urls = ['http://www.59b2b.com/company/']
    cw = CleanWords()
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    # "[province/city]" marker inside the region cell; the original inline
    # example for the captured part was "广东/潮州市".
    _REGION_RE = re.compile(r'\[(.*?)/(.*?)\]', re.S)
    # Characters that separate keywords.  The original chained two identical
    # ',' replacements; the second was presumably the full-width comma, so
    # both forms are accepted here.
    _KIND_SEP_RE = re.compile('[-、,\uff0c]')

    def parse(self, response):
        """Yield one listing request per category link on the start page."""
        for anchor in response.xpath("//div[@class='left_box']//tr//a"):
            kind_href = anchor.xpath("./@href").extract_first()
            if not kind_href:
                continue
            kind_name = anchor.xpath("./text()").extract_first()
            self.logger.debug("category %s -> %s", kind_name, kind_href)
            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and fixes relative ones.
                url=response.urljoin(kind_href),
                callback=self.parse_company_list,
                dont_filter=True,
            )

    def parse_company_list(self, response):
        """Yield an item per listing row, then the request for the next page."""
        rows = response.xpath(
            "//div[@class='left_box']//div[@class='list']//table//tr")
        for tr in rows:
            item = WuJiuShangWuWangspiderItem()
            company_name = tr.xpath(".//li//a/strong/text()").extract_first()
            kind = tr.xpath(
                ".//li[contains(text(),'主营:')]/text()").extract_first()
            city_infos = tr.xpath(
                ".//td[@class='f_orange']/text()").extract_first()

            # First "[a/b]" occurrence, or empty strings when absent.
            region = self._REGION_RE.search(city_infos) if city_infos else None
            item["province"] = region.group(1) if region else ''
            item["city_name"] = region.group(2) if region else ''

            # Strip all whitespace and the "公司名称:" label; a missing name
            # becomes '' like every other missing field.
            if company_name:
                company_name = re.sub(r'\s|公司名称:', '', company_name)
            else:
                company_name = ''
            item["company_Name"] = company_name
            item["company_id"] = self.get_md5(company_name)

            if kind:
                kind = re.sub(r'\s|主营业务:|主营', '', kind)
                kind = self._KIND_SEP_RE.sub('|', kind).replace('.', '')
            else:
                kind = ''
            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

            # Not present on listing pages.
            item["linkman"] = ''
            item["phone"] = ''
            item["telephone"] = ''
            item["contact_Fax"] = ''
            item["contact_QQ"] = ''
            item["E_Mail"] = ''
            item["company_address"] = ''
            item["Source"] = response.url
            yield item

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse_company_list,
            )

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' when it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ZhongGuoJiChuangWangSpider(CrawlSpider):
    """Spider ``machine35``: company contact data from machine35.com.

    The rules follow category links, "下一页" (next page) links and
    "联系方式" (contact) links; pages reached through a contact link are
    handed to ``parse_items``, which yields one item per page.
    """

    name = "machine35"
    # 'achine35.com' in the original was a typo for the bare domain.
    allowed_domains = ['www.machine35.com', 'machine35.com',
                       'search.machine35.com', 'vip.machine35.com']
    start_urls = ['http://www.machine35.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='content']//dl//dd//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'联系方式')]")),
            callback="parse_items", follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'下一页')]")),
            follow=True),
    )

    # QQ number embedded in the wpa.qq.com chat link of the raw HTML.
    _QQ_RE = re.compile(
        r"href='http:\/\/wpa.qq.com\/msgrd\?v=3&uin=(\d+)&site=machine35.com&menu=yes'",
        re.S)
    _PROVINCE_RE = re.compile(r'(.*?)省', re.S)
    _CITY_RE = re.compile(r'省(.*?)市', re.S)
    # Keyword separators.  The original chained two identical ',' replacements;
    # the second was presumably the full-width comma, so both are accepted.
    _KIND_SEP_RE = re.compile('[-、,;\uff0c]')

    def parse_items(self, response):
        """Build and yield one company item from a contact page."""
        item = ZhongGuoJiChuangWangItem()

        company_name = "".join(
            response.xpath("//dt[@class='maintitle']//text()").extract())
        kind = response.xpath("//dd[@class='subtitle']/text()").get()
        address = response.xpath(
            "//td[contains(text(),'地 址:')]/following-sibling::td/text()"
        ).extract_first()
        # The original XPaths for linkman and fax started with '///', which is
        # not a standard abbreviation; '//' is the intended form.
        linkman = response.xpath(
            "//span[@class='blue']/text()").extract_first()
        telephone = response.xpath(
            "//td[contains(text(),'电 话:')]/following-sibling::td/text()"
        ).extract_first()
        phone = response.xpath(
            "//td[contains(text(),'手 机:')]/following-sibling::td/text()"
        ).extract_first()
        fax = response.xpath(
            "//td[contains(text(),'传 真:')]/following-sibling::td/text()"
        ).extract_first()
        email = response.xpath(
            "//td[contains(text(),'邮 箱:')]/following-sibling::td/text()"
        ).extract_first()
        # All matches are concatenated, as before ('' when there are none).
        qq = "".join(self._QQ_RE.findall(response.text))

        item["Source"] = response.url

        company_name = self.cw.search_company(company_name) if company_name else ''
        item["company_Name"] = company_name
        item["company_id"] = self.get_md5(company_name)

        if kind:
            kind = kind.replace(" ", '|')
            # Longer labels must precede the bare "主营"; otherwise "主营"
            # matches first and leaves "项目:" behind in the keywords.
            kind = re.sub(r'\s|主营业务:|主营项目:|主营|供应商', '', kind)
            kind = self._KIND_SEP_RE.sub('|', kind).replace('.', '')
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = self.cw.search_linkman(linkman) if linkman else ''
        # NOTE(review): search_linkman is applied a second time here exactly as
        # in the original; drop one call if the helper is idempotent.
        item["linkman"] = self.cw.search_linkman(linkman)

        item["phone"] = self.cw.search_phone_num(phone) if phone else ''
        item["telephone"] = (
            self.cw.search_telephone_num(telephone) if telephone else '')
        item["contact_Fax"] = self.cw.search_contact_Fax(fax) if fax else ''
        item["E_Mail"] = self.cw.search_email(email) if email else ''
        item["contact_QQ"] = self.cw.search_QQ(qq) if qq else ''

        if address:
            address = self.cw.search_address(address.replace("联系地址:", ""))
        else:
            address = ''
        item["company_address"] = address

        # The original guard `'市' and '省' in address` only ever tested '省'.
        # Both patterns already yield '' when '省' is absent, so the regexes
        # alone reproduce the effective behaviour without the misleading test.
        if address:
            item["province"] = "".join(self._PROVINCE_RE.findall(address))
            item["city_name"] = "".join(self._CITY_RE.findall(address))
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' when it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class WuYouJiaoYiWangSpider(CrawlSpider):
    """Spider ``ec51``: company cards from the ec51.com company directory.

    ``parse`` emits one item per company card on a listing page and then
    follows the "下一页" (next page) link back into itself.

    NOTE(review): the class derives from ``CrawlSpider`` but defines no
    ``rules`` and overrides ``parse`` directly -- confirm this is intended for
    the Scrapy version in use.
    """

    name = "ec51"
    allowed_domains = ['www.ec51.com', 'ec51.com']
    start_urls = ['https://www.ec51.com/site/company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # Do not verify SSL certificates (disabled):
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    # Keyword separators.  The original chained two identical ',' replacements;
    # the second was presumably the full-width comma, so both are accepted.
    _KIND_SEP_RE = re.compile('[-、,;\uff0c]')

    def parse(self, response):
        """Yield an item per company card, then the next listing page."""
        cards = response.xpath(
            "//div[@class='flex-item product-wrap']//div[@class='product flex']")
        for div in cards:
            item = WuYouJiaoYiWangItem()
            company_name = div.xpath(".//a[@class='title ea']/text()").get()
            kind = div.xpath(".//p[contains(text(),'经营范围:')]/text()").get()
            address = div.xpath(".//p[contains(text(),'公司地址:')]/text()").get()
            # One contact line feeds both the mobile and landline cleaners.
            contact = div.xpath(".//p[contains(text(),'联系方式:')]/text()").get()

            item["Source"] = response.url
            item["province"] = ""
            item["city_name"] = ""

            if company_name:
                company_name = re.sub(r'\s|公司名称:', '', company_name)
            else:
                company_name = ''
            item["company_Name"] = company_name
            item["company_id"] = self.get_md5(company_name)

            if not kind:
                # Fall back to this card's tags.  The original queried the
                # whole page here, which mixed in the tags of every company.
                kind = ",".join(div.xpath(
                    ".//p[contains(text(),'公司标签:')]//a/text()").getall())
            if kind:
                kind = kind.replace(' ', '|')
                kind = re.sub(r'\s|主营业务:|主营产品|经营范围:', '', kind)
                kind = self._KIND_SEP_RE.sub('|', kind).replace('.', '')
            else:
                kind = ''
            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

            # Contact person, fax and e-mail are not shown on the cards.
            item["linkman"] = self.cw.search_linkman('')
            item["phone"] = self.cw.search_phone_num(contact) if contact else ''
            item["telephone"] = (
                self.cw.search_telephone_num(contact) if contact else '')
            item["contact_Fax"] = ''
            item["E_Mail"] = ''
            # The original derived the QQ number from the (always empty)
            # e-mail on a best-effort basis; that call is preserved.
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except Exception:
                item["contact_QQ"] = ''
            item["company_address"] = (
                self.cw.search_address(address) if address else '')
            yield item

        next_page_url = response.xpath(
            "//li[@class='next']//a[contains(text(),'下一页')]/@href").get()
        if next_page_url:
            yield scrapy.Request(
                # Resolves site-rooted, relative and absolute hrefs alike.
                url=response.urljoin(next_page_url),
                callback=self.parse,
            )

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' when it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class GongQiuXingXiWangSpider(CrawlSpider):
    """Spider ``cnexpnet``: company contact data from cnexpnet.net.

    The rules follow category links, company links and "下一页»" (next page)
    links; pages reached through the "联系方式" (contact) menu entry are handed
    to ``parse_items``, which yields at most one item per page.
    """

    name = "cnexpnet"
    allowed_domains = ['cnexpnet.net']
    start_urls = ['http://cnexpnet.net/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        # The original XPath began with '///'; '//' is the standard form.
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='list-cate']//td//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='m m2']//div[@class='list']//td[@align='left']//li//a")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
            follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//span[contains(text(),'联系方式')]/..")),
            callback='parse_items', follow=True),
    )

    # Separators after which a company name is cut, in priority order.  The
    # original tested "(" twice; the unreachable second test was presumably
    # the full-width parenthesis, which is therefore listed explicitly.
    _NAME_SEPARATORS = ('(', '\uff08', '_', '-')
    # Keyword separators; same reasoning for the duplicated ',' replacement.
    _KIND_SEP_RE = re.compile('[-、,;\uff0c]')

    def _cell(self, response, label):
        """Return the first text node of the table cell that follows ``label``."""
        return response.xpath(
            "//td[contains(text(),'%s')]/following-sibling::td/text()" % label
        ).extract_first()

    def parse_items(self, response):
        """Build and yield one company item; yield nothing without a name."""
        item = GongQiuXingXiWangItem()

        company_name = self._cell(response, '公司名称:')
        address = self._cell(response, '公司地址:')
        linkman = self._cell(response, '联 系 人:')
        telephone = self._cell(response, '公司电话:')
        phone = self._cell(response, '手机号码:')
        fax = self._cell(response, '公司传真:')
        email = self._cell(response, '电子邮件:')
        city_infos = self._cell(response, '所在地区:')
        qq = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        kind = ",".join(
            response.xpath("//div[@class='head']//h4/text()").getall())

        if not company_name:
            return

        # Cut at the first separator present (priority order); only names
        # without any separator get the whitespace/label clean-up, exactly
        # as in the original if/elif chain.
        for separator in self._NAME_SEPARATORS:
            if separator in company_name:
                company_name = company_name.split(separator)[0]
                break
        else:
            company_name = re.sub(r'\s|公司名称:', '', company_name)

        item["company_Name"] = company_name
        item["company_address"] = (
            self.cw.search_address(address) if address else '')
        item["Source"] = response.url
        item["company_id"] = self.get_md5(company_name)

        if kind:
            kind = kind.replace(" ", '|')
            # Longer labels must precede the bare "主营"; otherwise "主营"
            # matches first and leaves "项目:" behind in the keywords.
            kind = re.sub(r'\s|主营业务:|主营项目:|主营', '', kind)
            kind = self._KIND_SEP_RE.sub('|', kind).replace('.', '')
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        # '未填写' ("not filled in") is the placeholder removed from the field.
        linkman = linkman.replace('未填写', '') if linkman else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        item["phone"] = self.cw.search_phone_num(phone) if phone else ''
        item["telephone"] = (
            self.cw.search_telephone_num(telephone) if telephone else '')
        item["contact_Fax"] = self.cw.search_contact_Fax(fax) if fax else ''
        item["E_Mail"] = self.cw.search_email(email) if email else ''
        item["contact_QQ"] = self.cw.search_QQ(qq) if qq else ''

        # Region text of the form "province/city[/...]".
        if city_infos and '/' in city_infos:
            parts = city_infos.split('/')
            item["province"] = parts[0]
            item["city_name"] = parts[1]
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' when it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''
class ErWuBaWangSpider(CrawlSpider):
    """Spider ``ewb``: company contact data from 258.com.

    Request chain: ``parse`` (category links) -> ``parse_company_list``
    (company links plus pagination) -> ``parse_company_detail`` (locates the
    "联系方式" contact link) -> ``parse_company_contact`` (extracts the fields
    from the raw HTML with regular expressions and yields the item).

    NOTE(review): the class derives from ``CrawlSpider`` but defines no
    ``rules`` and overrides ``parse`` directly -- confirm this is intended for
    the Scrapy version in use.
    """

    name = "ewb"
    allowed_domains = ['258.com', 'www.258.com', 'shop.258.com']
    start_urls = ['http://www.258.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # Do not verify SSL certificates (disabled):
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    # Pagination is driven by JavaScript, e.g.
    # $.goToPage(4,this,'/Company/getList/cg/92/p/4')
    _NEXT_PAGE_RE = re.compile(r"\$\.goToPage\(\d+,this,'(.*?)'\)", re.S)

    # Field patterns applied to the raw HTML of the contact page.
    _RE_COMPANY = re.compile(r'<span class="cp-name">(.*?)</span>', re.S)
    # Trailing whitespace before the closing tag is tolerated via \s*, the
    # same way the address pattern already did.
    _RE_LINKMAN = re.compile(r'>联系人:(.*?)\s*<', re.S)
    _RE_QQ = re.compile(r'>QQ:(.*?)<', re.S)
    _RE_TELEPHONE = re.compile(r'>电话:(.*?)<', re.S)
    _RE_PHONE = re.compile(r'>手机:(.*?)<', re.S)
    _RE_FAX = re.compile(r'>传真:(.*?)<', re.S)
    _RE_EMAIL = re.compile(r'>邮箱:(.*?)<', re.S)
    _RE_ADDRESS = re.compile(r'>地址:(.*?)\s*<', re.S)
    _RE_KIND = re.compile(r'>主营产品:(.*?)<', re.S)
    # Keyword separators.  The original chained two identical ',' replacements;
    # the second was presumably the full-width comma, so both are accepted.
    _KIND_SEP_RE = re.compile('[-、,;\uff0c]')

    def parse(self, response):
        """Yield one listing request per category link on the start page."""
        links = response.xpath(
            "//li[@class='relative']//div[@class='ProductIndexRightNav']//li//a")
        for a in links:
            kind_href = a.xpath("./@href").get()
            if kind_href:
                yield scrapy.Request(url=response.urljoin(kind_href),
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        """Yield a detail request per company, then the next listing page."""
        rows = response.xpath(
            "//div[@class='ovh mt10 ']//div[@class='qyk_sublistleft iconboxAll ']")
        for div in rows:
            company_href = div.xpath(".//h3/a/@href").get()
            if company_href:
                yield scrapy.Request(url=response.urljoin(company_href),
                                     callback=self.parse_company_detail,
                                     dont_filter=True)

        next_onclick = response.xpath(
            "//a[contains(text(),'下一页')]/@onclick").get()
        if not next_onclick:
            # Last page: the original reached this via a swallowed TypeError.
            return
        match = self._NEXT_PAGE_RE.search(next_onclick)
        path = match.group(1) if match else ''
        if path:
            # The AJAX path contains 'getList/cg/'; the page URL does not.
            path = path.replace("getList/cg/", '')
            yield scrapy.Request(url=response.urljoin(path),
                                 callback=self.parse_company_list,
                                 dont_filter=True)

    def parse_company_detail(self, response):
        """Follow the "联系方式" (contact) link of a company page, if any."""
        contact_href = response.xpath(
            "//a[contains(text(),'联系方式')]/@href").get()
        if contact_href:
            yield scrapy.Request(url=response.urljoin(contact_href),
                                 callback=self.parse_company_contact,
                                 dont_filter=True)

    def parse_company_contact(self, response):
        """Yield the company item parsed from a contact page.

        Parsing stays best-effort: a failure is logged with its traceback and
        the page is skipped instead of aborting the crawl.
        """
        if not response.text:
            return
        try:
            item = self._build_contact_item(response)
        except Exception:
            self.logger.exception(
                "Failed to parse contact page %s", response.url)
            return
        yield item

    @staticmethod
    def _joined(pattern, text):
        """Return every capture of ``pattern`` in ``text`` concatenated ('' if none)."""
        return "".join(pattern.findall(text))

    def _build_contact_item(self, response):
        """Extract and clean all item fields from the contact page HTML."""
        text = response.text
        item = ErWuBaWangItem()

        company_name = self._joined(self._RE_COMPANY, text)
        kind = self._joined(self._RE_KIND, text)
        # Only the first address occurrence is used.
        address_match = self._RE_ADDRESS.search(text)
        address = address_match.group(1) if address_match else ''
        linkman = self._joined(self._RE_LINKMAN, text)
        telephone = self._joined(self._RE_TELEPHONE, text)
        phone = self._joined(self._RE_PHONE, text)
        fax = self._joined(self._RE_FAX, text)
        qq = self._joined(self._RE_QQ, text)
        email = self._joined(self._RE_EMAIL, text)

        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if company_name:
            company_name = re.sub(
                r'\s|公司名称:|企 业 名 称:', '', company_name)
        item["company_Name"] = company_name
        item["company_id"] = self.get_md5(company_name)

        if kind:
            if "主营产品" in kind:
                # Keep what follows the last label; spaces separate keywords.
                kind = kind.split('主营产品:')[-1].replace(' ', '|')
                kind = re.sub(r'\s|主营业务:|主营', '', kind)
            else:
                kind = re.sub(r'\s|主营业务:|主营产品:', '', kind)
            kind = self._KIND_SEP_RE.sub('|', kind).replace('.', '')
        else:
            kind = ''
        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kind))

        linkman = linkman.replace("联 系 人:", '') if linkman else ''
        item["linkman"] = self.cw.search_linkman(linkman)

        if phone:
            phone = self.cw.search_phone_num(phone.replace("联 系 电 话:", ''))
        else:
            phone = ''
        item["phone"] = phone

        if telephone:
            telephone = self.cw.search_telephone_num(
                telephone.replace("联 系 电 话:", ''))
        else:
            telephone = ''
        item["telephone"] = telephone

        if fax:
            fax = self.cw.search_contact_Fax(fax.replace("公 司 传 真:", ''))
        else:
            fax = ''
        item["contact_Fax"] = fax

        if email:
            email = self.cw.search_email(email.replace("电 子 邮 箱:", ''))
        else:
            email = ''
        item["E_Mail"] = email

        if qq:
            item["contact_QQ"] = self.cw.search_QQ(qq)
        else:
            # Best-effort fallback: try to derive the QQ number from the e-mail.
            try:
                item["contact_QQ"] = self.cw.search_QQ(email)
            except Exception:
                item["contact_QQ"] = ''

        if address:
            address = self.cw.search_address(
                address.replace('公 司 地 址:', ''))
        else:
            address = ''
        item["company_address"] = address

        return item

    def get_md5(self, value):
        """Return the hex MD5 digest of ``value``, or '' when it is falsy."""
        if value:
            return md5(value.encode()).hexdigest()
        return ''