def __init__(self):
    # Entry point: Hubei job-listing section of 80guakao.
    self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
    }
    self.f = FETCH()  # proxy/JS-capable fetcher used as a fallback
    self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
    # One Redis DB per crawl stage: categories, sub-categories, page urls, item urls.
    self.r0 = Redisclient(0)
    self.r1 = Redisclient(1)
    self.r2 = Redisclient(2)
    self.r3 = Redisclient(3)
    self.category_name_list = []
    self.sec_category_dict = {}
    # Headers for plain page requests (Cookie intentionally empty).
    self.headers_forpage = {
        "Host": "www.80guakao.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
        "Accept": "*/*",
        "Referer": "http://www.80guakao.com/shengfen/hb/",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "",
    }
class Scrapy78Pipeline:
    """Scrapy pipeline for 78guakao (Changsha): normalizes each item into the
    lead schema expected downstream and stores it in Mongo."""

    def open_spider(self, spider):
        # Runs once at spider start-up; a good place to open DB connections.
        # (Attributes set on `spider` here would be visible to the spider itself.)
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")

    def process_item(self, item, spider):
        # Build a fresh record: fixed business fields + scraped fields.
        record = {
            'companyCity': "长沙",
            'companyProvince': "湖南省",
            'code': 'BUS_YT_ZZ',
            'name': '资质',
            'busCode': '',
            'webUrl': '无',
            'orgId': '',
            'deptId': '',
            'centreId': '',
            'companyName': item["companyName"],
            'outName': item["outName"],
            'resourceRemark': item['resourceRemark'],
            'companyTel': str(item["companyTel"]),
            'ibossNum': None,
            'isDir': 0,
            'isShare': 0,
            'flag': 0,
        }
        # MD5 of the phone number is the primary key -> dedupe on phone.
        record["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(record)
        print(record)
        return item
class Guakao555Pipeline(object):
    """Scrapy pipeline for 555guakao (Hangzhou): enriches items in place with
    the fixed lead fields and stores them in Mongo."""

    def open_spider(self, spider):
        # Spider start-up hook: acquire the Mongo connection.
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "555guakao_dg")

    def process_item(self, item, spider):
        # Constant business fields expected by the downstream importer.
        item.update({
            'companyCity': '杭州',
            'code': 'BUS_YT_ZZ',
            'name': '资质',
            'busCode': '',
            'webUrl': '无',
            'orgId': '',
            'deptId': '',
            'centreId': '',
            'ibossNum': None,
            'isDir': 0,
            'isShare': 0,
        })
        # Phone-number hash doubles as the primary key (dedupe on phone).
        item["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(item)
        return item

    def close_spider(self, spider):
        # Spider shutdown hook; nothing to release at the moment.
        pass
def __init__(self):
    # Source collection of company names on the remote reptile DB.
    self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company', 'company_name')
    # Redis connection parameters come from the shared conf module.
    self.redis = REDIS(host=RedisHost, port=RedisPort, password=RedisPassword, db=RedisDB)
    self.item = {}  # scratch buffer for the record currently being built
def __init__(self):
    # Entry point: qd8 classifieds for Hangzhou.
    self.starturl = 'http://hangzhou.qd8.com.cn/'
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
    }
    self.s = FETCH()  # proxy/JS-capable fetcher
    self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao")
    # One Redis DB per crawl stage.
    self.r0 = Redisclient(0)
    self.r1 = Redisclient(1)
    self.r2 = Redisclient(2)
    self.r3 = Redisclient(3)
    self.item_dict = {}
    # Output collection for the kd8 crawl.
    self.db = MongoDB('mongodb://localhost', 'cuiworkdb', 'kd8')
def __init__(self, start_url, cookie, referer, companyCity, companyProvince, db):
    # Listing entry url plus the city/province labels stamped on every lead.
    self.start_url = start_url
    self.companyCity = companyCity
    self.companyProvince = companyProvince
    # Browser-like headers; cookie and referer are supplied per city by the caller.
    self.headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip,deflate,br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        "Cookie": cookie,
        "pragma": "no-cache",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "Referer": referer,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    }
    self.r0 = Redisclient(0)  # page-url frontier
    self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
    # Phone patterns: landline "area-number" form first, bare digit run as fallback.
    self.par = re.compile(r'\d+-\d+')
    self.par2 = re.compile(r'\d+')
# # } # # # # # # res = requests.post(url=text_url,headers=text_headers,data=pic_data,json=text_json_que) # # print(res.text) ####2021/1/8 爬取对应数据图片url import requests from time import sleep import json from Func.client import MongoDB from Func.fetchJX import FETCH s = FETCH() #改数据库 db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1731") #改url url = "http://wsgg.sbj.cnipa.gov.cn:9080/tmann/annInfoView/imageView.html?O56fzBVE=5cRVmWcgP6gC.1ulRszzI2_aOlP1jkNH0mxnPtgkE73P.1rSAtlAU1rHW64aHQoXm471Fzq7QOzRfVJiLnarbCbBAjmRHPnmNTUqx.Bfa6RWoAiipN6HKjl5E3Nb6Jp_LaGu5Dr0x1V4f2AsDjRza2LmDcNsd62msQ6SzqM646fK0XNFf.KzqSrexNQiIbLTdcX2wDPfCad.6G6Y4Pq28hw_OMDoIYVwZSvwH.emWD5UAVTbKi.mblyWCBYJOMZx5OMbUMWr05.V6JtgmG.usyr3_8OtVx8yHqisK54faJIdqZ5ofaDE4r6mjkZiGtqZZ96H_kqpPDS1WOjZMSlQGqQal8YnoPPDasrJ5lPkWyphiagHypaYQfoBfWUc3idLO" #改cookie pic_headers = { "Accept": "application/json, text/javascript, */*; q=0.01", "Accept-Encoding": "gzip,deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Content-Length": "52", "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "Cookie": "_gscu_1645064636=76464150vsisqf48; _gscu_2023327167=76464150m8szyi25; _trs_uv=k9wi5ba1_4030_8pj2; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1597140998; UM_distinctid=174ae765669480-09ef6ad0f222cb-4353761-1fa400-174ae76566aa07; goN9uW4i0iKzS=5db92.A0J2CMY23basgx2TZ.mTIJ7lkLr89FeTJ1C0aRMHE_2AokqW2_4RJ42AQplsUcWhHGBKqZ3JYJcp..cRA; __jsluid_h=b6457e19fe1b05edea1f19ada75c9f46; tmas_cookie=2272.7688.15400.0000; 018f9ebcc3834ce269=09b16dacaa2e3b985f2267dc76e5ae8f; arp_scroll_position=0; goN9uW4i0iKzT=53cCT8DqzzRLqqqm67z1U8qKZLtDgJCjEPobnFeBoFOQEkx_Gy09SZfAajrh2D40V2DJdi_T6Yxefkk.TyC5jkWKRfroyI0Ty8DNR0q2gea8MtkfUvIUVuyOffOLFIesbBvkJ4FVJn0c2XCNKuJKF5uWYYxN.9fe9K5lzUFILMY4E2DDUzrR2u3s2n5yMTLQ3QYDyAIEHcwh9210LUxxFmRFBxLVwWtAcBV_6cTdtf3pc22FM8A8bg8AGXagoEJRxfL.Lj2tq4BK8Li.zsiPB6R; JSESSIONID=0000P2s6vou1xkNAr0uRAjbsxE9:1bm112s99", "Host": 
"wsgg.sbj.cnipa.gov.cn:9080",
class get_json(object):
    """Moves company names from Mongo into a Redis queue and scrapes each
    company's basic registration info from xin.baidu.com into the details DB.

    Fixes applied:
    - removed a leftover debug ``exit()`` that terminated the process right
      after fetching the detail page, making all parsing below unreachable;
    - restored the extraction of ``company`` (the name shown on the search
      hit), which was commented out although it is used in the mismatch
      check at the bottom (it would have raised NameError).
    """

    def __init__(self):
        # Source collection of company names to look up.
        self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company', 'company_name')
        self.redis = REDIS(host=RedisHost, port=RedisPort, password=RedisPassword, db=RedisDB)
        self.item = {}  # reused buffer for the record currently being built

    # mongo -> redis
    def transfer(self):
        """Push every (_id, company_name) pair onto the Redis work queue."""
        dd = self.db.mongo_find({})
        for i in dd:
            item = {}
            item['_id'] = i['_id']
            item['company_name'] = i['company_name']
            b = self.redis.add('coampanylidt', json.dumps(item))
            print('存入成功', b, item)

    # Baidu company-credit basic info
    def get_companydetails(self, company_name):
        """Scrape the basic-registration table for *company_name*.

        Returns a human-readable status string. On success the parsed record
        is written to ``db1``; on failure the source row is flagged instead.
        """
        res1 = s.fetch('https://xin.baidu.com/s?q={}&t=0'.format(
            parse.quote(company_name)))
        href_list = re.findall(r'{"pid":"(\S+)","entName":', res1.text)
        if len(href_list) != 0:
            details_href = 'https://xin.baidu.com//detail//compinfo?pid=' + href_list[0]
            # Name shown on the search hit; compared against the query below
            # to detect inexact matches.
            company = res1.html.xpath('//a[@class="zx-list-item-url"]/@title')[0]
            print(details_href)
            res = s.fetch(details_href)
            # 统一社会信用代码 (unified social credit code)
            self.item['credit_code'] = res.html.xpath(
                '//td[contains(text(),"统一社会信用代码")]/following-sibling::td[1]/text()',
                first=True)
            # registration date
            self.item['register_time'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"成立日期")]/following-sibling::td[1]/text()',
                first=True)
            # registered capital
            self.item['register_money'] = res.html.xpath(
                '//td[contains(text(),"注册资本")]/following-sibling::td[1]/text()',
                first=True)
            # industry
            self.item['industry'] = res.html.xpath(
                '//td[contains(text(),"所属行业")]/following-sibling::td[1]/text()',
                first=True)
            # business state (active / deregistered)
            self.item['business_state'] = res.html.xpath(
                '//td[contains(text(),"经营状态")]/following-sibling::td[1]/text()',
                first=True)
            # organization code
            self.item['organization_code'] = res.html.xpath(
                '//td[contains(text(),"组织机构代码")]/following-sibling::td[1]/text()',
                first=True)
            # business registration number
            self.item['register_num'] = res.html.xpath(
                '//td[contains(text(),"工商注册号")]/following-sibling::td[1]/text()',
                first=True)
            # legal representative
            self.item['legal_man'] = res.html.xpath(
                '//td[contains(text(),"法定代表人")]/following-sibling::td[1]/text()',
                first=True)
            # registration authority
            self.item['regist_organ'] = res.html.xpath(
                '//td[contains(text(),"登记机关")]/following-sibling::td[1]/text()',
                first=True)
            # approval / annual-inspection date
            self.item['confirmtime'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"审核/年检日期")]/following-sibling::td[1]/text()',
                first=True)
            # business term
            self.item['business_timeout'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"营业期限")]/following-sibling::td[1]/text()',
                first=True)
            # NOTE(review): key says "address" but the cell scraped is 企业类型
            # (company type) — kept as-is because downstream consumers may
            # already rely on this key; confirm before renaming.
            self.item['register_address'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"企业类型")]/following-sibling::td[1]/text()',
                first=True)
            # registered address
            self.item['registerAddress'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"注册地址")]/following-sibling::td[1]/text()',
                first=True)
            # business scope (full text lives in the data-content attribute)
            self.item['business_scope'] = res.html.xpath(
                '//td[contains(text(),"经营范围")]/following-sibling::td[1]//@data-content',
                first=True)
            # former name, if any
            self.item['usedName'] = res.html.xpath(
                '//td[contains(text(),"曾用名")]/following-sibling::td[1]/text()',
                first=True)
            # provenance fields
            self.item['web_source'] = 'https://xin.baidu.com/'
            self.item['company_name'] = company_name
            self.item['company_url'] = details_href
            self.item['_id'] = hashlib.md5(
                (company_name).encode(encoding='utf-8')).hexdigest()
            self.item['web_update_time'] = time.strftime(
                "%Y-%m-%d", time.localtime(int(time.time())))
            if company_name != company and len(self.item) > 4:
                # Search hit does not match the queried name -> flag source row.
                self.db.mong_find_one_update({"_id": self.item['_id']},
                                             {"flag": "公司名有问题"})
                return '公司名有问题 --- %s' % self.item['company_name']
            else:
                db1.mongo_add(self.item)
                return '%s 插入成功 !!!!' % self.item['company_name']
        else:
            # No search result at all: flag the source row as unmatched.
            _id = hashlib.md5(
                (company_name).encode(encoding='utf-8')).hexdigest()
            self.db.mong_find_one_update({"_id": _id}, {"flag": "未找到匹配的公司名"})
            return '未找到匹配的公司名---%s' % company_name
#测试文件 #2021/1/11 下载图片,识别图片,存入mongo import requests from Func.client import MongoDB from Func.fetchJX import FETCH from PIL import Image as image_P import pytesseract import cv2 import openpyxl import os db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1726") s = FETCH() # 图片识别并写入excel模块 # 将“下载”文件夹中的图片按照数字顺序获取 # 将获取图片切割成小块图片进行图文转换 # 转换出的文字再进行辨识度提高处理 # 将图片数据转换成文字数据导入Mongo ###向excel中写入撤销和答辩种类 # 撤销复审决定书 rec_list1 = [ '发文菩暂菖= 撤锏复市决定书', '发文茎鲤= 撤销复市决定书', '发文莒鲤= 撒淌氯轲决定书', ] # 关于撤销连续三年未使用商标的决定 rec_list2 = [
# One-off maintenance script for incremental crawling:
# 1) backfill flag=1 on a table whose rows were already pushed (run once);
# 2) flip flag=0 -> flag=1 on a table that has just been pushed.
from Func.client import MongoDB

# Run exactly once: add the flag field to every document of the pushed table.
client = MongoDB('mongodb://localhost', 'cuiworkdb', '9guakao_zhengzhou')
client.add_field_for_all()

# Mark freshly pushed rows: change flag=0 to flag=1.
client = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_beijing')
client.change_flag()
# # return '程序错误' import hashlib import json import re import time from concurrent.futures.thread import ThreadPoolExecutor from urllib import parse from Func.Parse import FETCH from Func.client import MongoDB from Func.conf import * from Func.Redis import REDIS s = FETCH() db1 = MongoDB('172.16.74.249:27017', 'db_reptile_company', 'company_details') class get_json(object): def __init__(self): self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company', 'company_name') self.redis = REDIS(host=RedisHost, port=RedisPort, password=RedisPassword, db=RedisDB) self.item = {} # mongdb--redeis def transfer(self): dd = self.db.mongo_find({})
def open_spider(self, spider):
    # Runs once when the spider starts. Attributes could be attached to
    # `spider` here; we only need to open the Mongo connection.
    self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")
# Export incremental rows (flag=0) into a clues_resources table, from which
# the import API ingests them:
#   https://dqk.dgg188.cn/api/import/import_data?ip=10.2.1.122:17017&docName=<table>
from Func.client import MongoDB
import requests

src = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_hubei')
dst = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20210129-zhijiazhuang")

# Copy every not-yet-pushed row into the pushable collection.
for doc in src.find_many("flag", 0):
    dst.mongo_add(doc)
# Strip fields the import API does not accept.
dst.del_field()
# Dedupe strategy scratch file.
# Strategy 1 (in use): funnel every per-city table into the big BMD_sort
# table, relying on the phone-hash _id for deduplication.
# Strategy 2 (experimental, currently disabled): dedupe rows that share the
# same companyName.
from Func.client import MongoDB
from md5encode import is_phone

m1 = MongoDB('mongodb://localhost', 'cuiworkdb', "9guakao_zhengzhou")
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_sort")
m3 = MongoDB('10.2.1.121:17017', 'clues_resources', "test")
m4 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_20210205_push")
m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "test")
class Gkspider:
    """Crawler for 80guakao Hubei job listings.

    Pipeline stages, each persisting its frontier in its own Redis DB:
      r0 category urls -> r1 sub-category urls -> r2 page urls -> r3 item urls,
    then get_info() parses each detail page and writes the lead to Mongo.
    """

    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()  # proxy/JS-capable fetcher used as a fallback
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        # One Redis DB per crawl stage.
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        """Stage 1: collect top-level category links into r0."""
        resp = self.f.fetch(url=self.starturl, headers=self.headers, method='get')
        sleep(random.randint(0, 1))
        tree = etree.HTML(resp.text)
        anchors = tree.xpath('//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for anchor in anchors:
            name = anchor.xpath('./text()')[0]
            href = anchor.xpath('./@href')[0].replace('m.', 'www.')  # mobile -> desktop host
            if name != "不限":  # skip the "no filter" pseudo-category
                self.r0.save_category_url(name, href)
                self.category_name_list.append(name)

    def get_sec_category(self):
        """Stage 2: for every category, collect its sub-category links into r1."""
        for category_name in self.category_name_list:
            url = self.r0.get_category_url(category_name)
            resp = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            tree = etree.HTML(resp.text)
            for anchor in tree.xpath('//div[@class="content"]//div//a'):
                sec_name = anchor.xpath('./text()')[0]
                sec_href = anchor.xpath('./@href')[0].replace('m.', 'www.')
                if sec_name != '不限':
                    print(sec_name)
                    self.r1.save_one_dict(category_name, sec_name, sec_href)

    def get_all_page(self):
        """Stage 3: follow 下一页 links of each sub-category; store every page url in r2."""
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name, url in sec_category_list.items():
                resp = requests.get(url=url.decode(), headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                tree = etree.HTML(resp.text)
                self.r2.save_page_url(
                    category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = tree.xpath(
                            '//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'
                        )[0]
                    except:
                        break  # no "next page" link -> pagination exhausted
                    if not next_page:
                        break
                    self.r2.save_page_url(
                        category + ":" + sec_category_name.decode(), next_page)
                    nxt = self.f.fetch(url=next_page,
                                       headers=self.headers_forpage,
                                       method='get')
                    sleep(random.randint(0, 1))
                    tree = etree.HTML(nxt.text)

    def get_item_url(self):
        """Stage 4: pop page urls from r2 and collect detail-page urls into r3."""
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r2.get_page_url(
                            category + ":" + sec_category_name.decode())
                        resp = requests.get(url=url, headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        tree = etree.HTML(resp.text)
                    except Exception as e:
                        print('error:', e)
                        break  # queue exhausted or fetch failed
                    item_list = tree.xpath(
                        '/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'
                    )
                    for item_url in item_list:
                        if 'http' not in item_url:  # relative link -> absolutize
                            item_url = 'http://www.80guakao.com/' + item_url
                        self.r3.save_item_url(
                            category + ':' + sec_category_name.decode(), item_url)

    def get_info(self):
        """Stage 5: parse each detail page, resolve the phone number and store the lead."""
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r3.get_item_url(
                            category + ":" + sec_category_name.decode())
                        resp = requests.get(url=url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if resp.status_code != 200:
                            # Plain request refused -> retry through the fetcher.
                            resp = self.f.fetch(url=url.decode(),
                                                headers=self.headers_forpage,
                                                method='get')
                            sleep(random.randint(0, 1))
                        tree = etree.HTML(resp.text)
                    except:
                        break  # item-url queue exhausted
                    item = {}
                    try:
                        company_name = tree.xpath(
                            '//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()'
                        )[0]
                    except:
                        company_name = 'None'
                    contact_people = tree.xpath(
                        '//ul[@class="contacter"]//li//font/text()')[0]
                    # The phone is loaded lazily; the onclick handler's second
                    # quoted chunk is the url that returns the number.
                    onclick = tree.xpath(
                        '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick'
                    )[0]
                    par = re.compile("'.*?'")
                    phone_url = re.findall(par, onclick)[1].replace("'", "")
                    if type(phone_url) == str:
                        resp = requests.get(url=phone_url,
                                            headers=self.headers_forpage)
                    else:
                        resp = requests.get(url=phone_url.decode(),
                                            headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    tree = etree.HTML(resp.text)
                    phone = tree.xpath(
                        '//div[@class="number"]//span[@class="num"]/text()')[0]
                    # Fixed lead fields expected by the downstream importer.
                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''     # org id (string), filled downstream
                    item['deptId'] = ''    # department id (string)
                    item['centreId'] = ''  # centre id (string)
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item["resourceRemark"] = category + ":" + sec_category_name.decode()
                    item["companyTel"] = phone.strip()
                    if len(contact_people) == 11:
                        # The "contact" field is itself a mobile number -> prefer it.
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        """Ad-hoc single-page smoke test: prints the parsed contact fields."""
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        resp = requests.get(url=url, headers=self.headers_forpage)
        print(resp.text)
        tree = etree.HTML(resp.text)
        print(tree.xpath('//ul[@class="contacter"]//li//font/text()')[0])  # contact person
        onclick = tree.xpath(
            '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  # phone handler
        print(onclick)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, onclick)[1].replace("'", "")  # phone-number url
        resp = requests.get(url=phone_url, headers=self.headers_forpage)
        tree = etree.HTML(resp.text)
        phone = tree.xpath('//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)

    def run(self):
        """Execute all pipeline stages in order."""
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
import hashlib import json import time from urllib import parse from Func.Parse import FETCH from Func.Tyc3 import FETCH1 from lxml import etree from Func.client import MongoDB s = FETCH() s1 = FETCH1() db = MongoDB('172.16.74.249:27017', 'creditCode', 'KJ') class GetCode(object): def __init__(self): self.item = {} self.item1 = {} # 信用中国 self.item2 = {} # 天眼查 self.item3 = {} self.code = { '200': '请求成功', '203': '请求失败,please重试', '204': '所查询公司不精确或不存在' } def Mongo(self, Item): db.mongo_add(Item) # 百度企业信用基本信息
class JanzhuSpider():
    """Crawler for cbi360.net company listings for a single city.

    Walks the paginated listing starting at *start_url*, extracts
    (company name, phone) pairs from every page and stores the
    normalized leads in Mongo.
    """

    def __init__(self, start_url, cookie, referer, companyCity, companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        # Browser-like headers; cookie/referer are supplied per city by the caller.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": cookie,
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)  # visited-page frontier
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # Phone patterns: landline "area-number" form first, bare digit run as fallback.
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')

    def parse_next_page(self):
        """Follow 下一页 links from start_url, handing every listing page to parse_item."""
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        resp = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            tree = etree.HTML(resp.text)
            try:
                next_page = tree.xpath(
                    '//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href'
                )
                print(next_page)
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                # No next link (or page blocked): dump the page for diagnosis and stop.
                print(e)
                print(resp.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(tree)
            resp = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        """Extract a phone number from *target*: prefer the NNN-NNNN form,
        fall back to any digit run, else return ''."""
        try:
            return re.findall(self.par, target)[0]
        except:
            print(target)
            try:
                return re.findall(self.par2, target)[0]
            except:
                return ''

    def parse_item(self, res):
        """Pull (companyName, phone) pairs out of a parsed listing page and
        store every lead whose phone passes is_phone()."""
        sleep(1)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        for idx in range(len(companyName_list)):
            phone = self.re_phone(phone_list[idx])
            if not is_phone(phone):
                continue  # skip rows without a valid phone number
            item = {
                'companyCity': self.companyCity,
                'companyProvince': self.companyProvince,
                'code': 'BUS_YT_ZZ',
                'name': '资质',
                'busCode': '',
                'webUrl': '无',
                'orgId': '',
                'deptId': '',
                'centreId': '',
                'companyName': companyName_list[idx],
                'outName': '',
                'resourceRemark': '',
                'companyTel': phone,
                'ibossNum': None,
                'isDir': 0,
                'isShare': 0,
            }
            # Phone-number hash is the primary key (dedupe on phone).
            item["_id"] = md5encryption(item["companyTel"])
            item["flag"] = 0
            print(item)
            self.m.mongo_add(item)

    def run(self):
        self.parse_next_page()
#2021/1/11 下载图片,识别图片,存入mongo import requests from Func.client import MongoDB from Func.fetchJX import FETCH from PIL import Image as image_P import pytesseract import cv2 import openpyxl import os import sys db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1731") s = FETCH() # 图片识别并写入excel模块 # 将“下载”文件夹中的图片按照数字顺序获取 # 将获取图片切割成小块图片进行图文转换 # 转换出的文字再进行辨识度提高处理 # 将图片数据转换成文字数据导入Mongo ###向excel中写入撤销和答辩种类 # 撤销复审决定书 rec_list1 = [ '发文菩暂菖= 撤锏复市决定书', '发文茎鲤= 撤销复市决定书', '发文莒鲤= 撒淌氯轲决定书', ] # 关于撤销连续三年未使用商标的决定 rec_list2 = [
from Func.client import MongoDB

# Copy the local Shanghai jianzhutong table into the clues_resources
# collection of the same name — only that DB can be pushed downstream.
m1 = MongoDB('mongodb://localhost', 'cuiworkdb', "jianzhutong_shanghai")
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shanghai")

count = 0  # retained from an older round-robin split across several targets
for doc in m1.find_all():
    m2.mongo_add(doc)
# Dead-number check: dump every phone in BMD_sort to a text file that is
# handed to an external number-liveness service; the "-check" table below
# receives the rows whose numbers come back as live.
from Func.client import MongoDB

db = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_sort")
db2 = MongoDB('mongodb://localhost', 'cuiworkdb', 'BMD20210201-chengdu-check')

all_data = db.find_all()

# One phone number per line, ready for the checking service.
with open('BMD_sort', 'w') as f:
    for doc in all_data:
        f.write(str(doc["companyTel"]) + '\n')