import hashlib
import json
import os

import pymysql
from scrapy.utils.project import get_project_settings

from bloomfilter import Bloomfilter

settings = get_project_settings()


class SaveToMysqlPipeline(object):

    def open_spider(self, spider):
        # Connection parameters for pymysql.connect():
        # host: e.g. localhost, 127.0.0.1, 192.168.2.54
        # user: database user name, usually root
        # password: database password, e.g. 123456
        # database: name of the database to use (it must already exist)
        # port: MySQL port, 3306 by default
        # charset: MySQL character set, utf8
        # connection = pymysql.connect(host="localhost", user="******", password="******", database="jobs", port=3306, charset="utf8")
        # connection = pymysql.connect(
        #     host=settings['MYSQL_HOST'],
        #     user=settings['MYSQL_USER'],
        #     password=settings['MYSQL_PASSWORD'],
        #     database=settings['MYSQL_DATABASE'],
        #     port=settings['MYSQL_PORT'],
        #     charset=settings['MYSQL_CHARSET'],
        # )
        connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
        cursor = connection.cursor()
        # Reload the Bloom filter state if a previous run saved it,
        # otherwise start with an empty filter of one million bits.
        if os.path.exists("job.state"):
            self.bloom = Bloomfilter("job.state")
        else:
            self.bloom = Bloomfilter(1000000)
        # PRIMARY KEY: values must be unique
        # AUTO_INCREMENT: generated automatically
        # Wrap a column name in backticks if it contains several words
        # (e.g. `my name`) or collides with a SQL keyword.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `job` (
                job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
                job_name text COMMENT 'job title',
                job_money text COMMENT 'salary',
                max_money FLOAT COMMENT 'maximum salary',
                min_money FLOAT COMMENT 'minimum salary',
                job_date text COMMENT 'posting date',
                company_name text COMMENT 'company name',
                job_place text COMMENT 'work location',
                job_city text COMMENT 'city',
                job_area text COMMENT 'district',
                job_education text COMMENT 'required education',
                job_fuli text COMMENT 'benefits',
                job_from text COMMENT 'source site',
                job_type text COMMENT 'job type',
                job_detail_href text COMMENT 'detail page URL',
                job_state text COMMENT 'digest of the job record'
            )
        """)
        self.connection = connection
        self.cursor = cursor

    def process_item(self, item, spider):
        # Serialize the item to JSON, then digest it with MD5: a digest
        # algorithm maps data of any length to a fixed-length string, so
        # comparing digests tells us whether the record changed.
        job_state = json.dumps(dict(item))
        hl = hashlib.md5()
        hl.update(job_state.encode(encoding='utf-8'))
        job_state = hl.hexdigest()
        # If the URL is not in the Bloom filter, the record is new:
        # add it to the filter and insert it into the database.
        if not self.bloom.test(item['job_detail_href']):
            print("inserting new record ========================")
            self.cursor.execute(
                """
                INSERT INTO job (
                    job_name, job_money, max_money, min_money, job_date,
                    company_name, job_place, job_city, job_area,
                    job_education, job_fuli, job_from, job_type,
                    job_detail_href, job_state
                ) VALUES (
                    %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
                )
                """,
                (item['job_name'], item['job_money'], item['max_money'],
                 item['min_money'], item['job_date'], item['company_name'],
                 item['job_place'], item['job_city'], item['job_area'],
                 item['job_education'], item['job_fuli'], item['job_from'],
                 item['job_type'], item['job_detail_href'], job_state))
            self.bloom.add(item['job_detail_href'])
            self.bloom.save("job.state")
        else:
            # The URL was seen before: compare the stored digest with the
            # current one and update the row only if the record changed.
            # Note that job_state itself must be updated too, otherwise
            # the row would look "changed" on every later run.
            self.cursor.execute(
                """SELECT job_state FROM job WHERE job_detail_href=%s""",
                (item['job_detail_href'], ))
            result = self.cursor.fetchone()
            if result and result[0] != job_state:
                print("updating changed record =========================")
                self.cursor.execute(
                    """
                    UPDATE job SET
                        job_name=%s, job_money=%s, max_money=%s,
                        min_money=%s, job_date=%s, company_name=%s,
                        job_place=%s, job_city=%s, job_area=%s,
                        job_education=%s, job_fuli=%s, job_from=%s,
                        job_type=%s, job_state=%s
                    WHERE job_detail_href=%s
                    """,
                    (item['job_name'], item['job_money'], item['max_money'],
                     item['min_money'], item['job_date'], item['company_name'],
                     item['job_place'], item['job_city'], item['job_area'],
                     item['job_education'], item['job_fuli'], item['job_from'],
                     item['job_type'], job_state, item['job_detail_href']))
            else:
                print("record unchanged, nothing to update =========================")
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
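The pipeline above unpacks a single MYSQL_SETTINGS dict straight into pymysql.connect(), so the dict keys must match pymysql's keyword arguments. A minimal sketch of what that entry in settings.py could look like, using the example values from the comments above; the module path in ITEM_PIPELINES is a hypothetical project layout:

# settings.py (sketch -- adjust the values to your environment)
MYSQL_SETTINGS = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'database': 'jobs',
    'port': 3306,
    'charset': 'utf8',
}

# Register the pipeline; 'jobs.pipelines' is a hypothetical module path.
ITEM_PIPELINES = {
    'jobs.pipelines.SaveToMysqlPipeline': 300,
}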
from bloomfilter import Bloomfilter
import os

# Argument 1: either a number of bits (first run) or the path of a
# previously saved state file (later runs).
if os.path.exists("state.txt"):
    print("state file exists, loading saved state")
    bloom = Bloomfilter("state.txt")
else:
    print("state file missing, creating a filter of 100000 bits")
    bloom = Bloomfilter(100000)
# Equivalently, on the first run pass a number; on later runs pass the
# state file path:
# bloom = Bloomfilter("state.txt")

while True:
    key = input("enter a value: ")
    if bloom.test(key):         # membership test
        print("value already seen:", key)
    else:
        print("value not seen yet:", key)
        bloom.add(key)           # record it
        bloom.save("state.txt")  # persist the filter state

# Alternative 1: deduplicate with a plain list.
# Pros: the logic and the code are trivial.
# Cons: 1) a huge data set will not fit in memory;
#       2) the state only lives for the current run -- it is lost as
#          soon as the program stops.
result = ['zhangsan', 'lisi']
keyword = 'wangwu'
if keyword in result:
    print("value already seen:", keyword)
else:
    print("value not seen yet:", keyword)
    result.append(keyword)
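The bloomfilter module used above is a local helper whose source is not shown here. For readers who want to see what test/add/save might do internally, here is a minimal sketch of a Bloom filter with the same constructor convention (an int for a fresh filter, a path to reload saved state); it illustrates the technique and is not the actual module:

import hashlib

class SimpleBloomfilter(object):
    """Sketch of a Bloom filter: a bit array plus k hash positions per
    key. test() may return a false positive, never a false negative."""

    def __init__(self, source, num_hashes=5):
        self.num_hashes = num_hashes
        if isinstance(source, str):
            # A path: reload the bit array saved by a previous run.
            with open(source, 'rb') as f:
                self.bits = bytearray(f.read())
        else:
            # A number of bits: start with an all-zero array.
            self.bits = bytearray((source + 7) // 8)

    def _positions(self, key):
        # Derive num_hashes bit positions by salting the key and hashing.
        n = len(self.bits) * 8
        for salt in range(self.num_hashes):
            digest = hashlib.md5(("%d:%s" % (salt, key)).encode('utf-8')).hexdigest()
            yield int(digest, 16) % n

    def test(self, key):
        # True only if every position is set: "probably seen before".
        return all(self.bits[p // 8] & (1 << (p % 8)) for p in self._positions(key))

    def add(self, key):
        for p in self._positions(key):
            self.bits[p // 8] |= 1 << (p % 8)

    def save(self, path):
        with open(path, 'wb') as f:
            f.write(bytes(self.bits))

Compared with the list approach, a 100000-bit filter occupies a fixed 12.5 KB no matter how many keys it has absorbed, and save() lets the state survive restarts; the price is a small false-positive rate.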
import re

import requests
from lxml import etree

# This fragment assumes the surrounding script already defined:
#   keyword -- the search keyword
#   root    -- the parsed first result page (etree.HTML of its source)
#   bloom   -- a Bloomfilter set up as in the example above

# Read the total page count and keep only the digits.
all_page = root.xpath(
    "//div[@class='p_in']/span[@class='td']/text()")[0]
pattern = re.compile(r"\d+")  # regular expression for the page number
all_page = pattern.findall(all_page)[0]
all_page = int(all_page)

for page in range(1, all_page + 1):
    new_url = f"https://search.51job.com/list/000000,000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    content = requests.get(new_url)
    content.encoding = "gbk"
    content = content.text
    root = etree.HTML(content)
    job_infos = root.xpath("//div[@class='dw_table']/div[@class='el']")
    for job_info in job_infos:
        # The detail-page URL uniquely identifies a posting, so it is
        # the key used for Bloom-filter deduplication.
        job_key = job_info.xpath(
            "p[contains(@class,'t1')]/span/a/@href")[0]
        if bloom.test(job_key):      # membership test
            print("posting already seen:", job_key)
        else:
            print("new posting:", job_key)
            bloom.add(job_key)        # record it
            bloom.save("state2.txt")  # persist the filter state
            job_name = job_info.xpath(
                "p[contains(@class,'t1')]/span/a/@title")
            job_name = job_name[0]
            company = job_info.xpath("span[@class='t2']/a/@title")
            company = company[0]
            salary = job_info.xpath("span[@class='t4']/text()")
            if salary:
                salary = salary[0]
                pattern = re.compile(r"\d+\.?\d*")
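The fragment breaks off right after compiling the salary pattern \d+\.?\d*. Given the max_money and min_money FLOAT columns in the pipeline above, one plausible continuation is to pull the numbers out of strings like '1-1.5万/月' and scale them by the unit. A hedged sketch of that step; parse_salary is a helper invented here, and real listings use more unit variants than it handles:

import re

def parse_salary(text):
    # Hypothetical helper: turn a 51job salary string such as
    # '1-1.5万/月' or '6千/月' into (min_money, max_money) in yuan
    # per month. A single figure yields min == max.
    numbers = [float(n) for n in re.findall(r"\d+\.?\d*", text)]
    if not numbers:
        return None, None
    unit = 10000 if "万" in text else (1000 if "千" in text else 1)
    low, high = numbers[0] * unit, numbers[-1] * unit
    if "年" in text:  # normalize a yearly salary to a monthly figure
        low, high = low / 12, high / 12
    return low, high

# parse_salary("1-1.5万/月") -> (10000.0, 15000.0)
# parse_salary("6千/月")     -> (6000.0, 6000.0)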