# Example #1
class SaveToMysqlPipeline(object):
    """Scrapy item pipeline that persists job items into a MySQL `job` table.

    A Bloom filter keyed on ``job_detail_href`` decides whether an item is
    new (INSERT) or already known.  For known URLs an MD5 digest of the
    serialized item is compared against the stored ``job_state`` column and
    the row is UPDATEd only when the content actually changed.  Filter
    state is persisted to ``job.state`` so de-duplication survives restarts.
    """

    def open_spider(self, spider):
        """Open the DB connection, load the Bloom-filter state and ensure
        the ``job`` table exists.

        Connection parameters (host, user, password, database, port,
        charset) are read from the ``MYSQL_SETTINGS`` dict in the Scrapy
        settings object.
        """
        connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
        cursor = connection.cursor()

        # Reload the de-duplication state saved by a previous run;
        # otherwise start a fresh filter of 1,000,000 bits.
        if os.path.exists("job.state"):
            self.bloom = Bloomfilter("job.state")
        else:
            self.bloom = Bloomfilter(1000000)

        # Column names with several words or that clash with SQL keywords
        # must be quoted with backticks, e.g. `my name`.
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS `job` (
            job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
            job_name text COMMENT '工作名称', 
            job_money text COMMENT '工作薪资',
            max_money FLOAT COMMENT '最大薪资',
            min_money FLOAT COMMENT '最少薪资',
            job_date text COMMENT '工作发布时间',
            company_name text COMMENT '公司名称',
            job_place text COMMENT '工作地点',
            job_city text COMMENT '工作城市',
            job_area text COMMENT '工作地区',
            job_education text COMMENT '工作学历',
            job_fuli text COMMENT '公司福利',
            job_from text COMMENT '工作所属网站',
            job_type text COMMENT '工作类型',
            job_detail_href text COMMENT '详情地址',
            job_state text COMMENT '工作数据的加密信息'
        )
        """)
        self.connection = connection
        self.cursor = cursor

    def process_item(self, item, spider):
        """Insert a new job row, or update an existing one whose content
        fingerprint (MD5 of the JSON-serialized item) has changed.
        """
        # Fingerprint the whole item: serialize to JSON, then take the MD5
        # hex digest so content changes can be detected with one compare.
        job_state = json.dumps(dict(item))
        hl = hashlib.md5()
        hl.update(job_state.encode(encoding='utf-8'))
        job_state = hl.hexdigest()

        if not self.bloom.test(item['job_detail_href']):
            # URL not seen before: insert the row and remember the URL.
            print("添加数据========================")
            self.cursor.execute(
                """
            INSERT INTO job ( job_name, job_money, max_money, min_money, job_date, company_name, job_place, job_city, job_area, job_education, job_fuli, job_from, job_type, job_detail_href, job_state ) VALUES ( %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s )
            """, (item['job_name'], item['job_money'], item['max_money'],
                  item['min_money'], item['job_date'], item['company_name'],
                  item['job_place'], item['job_city'], item['job_area'],
                  item['job_education'], item['job_fuli'], item['job_from'],
                  item['job_type'], item['job_detail_href'], job_state))
            self.bloom.add(item['job_detail_href'])
            self.bloom.save("job.state")
        else:
            # URL (probably) seen before: update only when the stored
            # fingerprint differs from the freshly computed one.
            self.cursor.execute(
                """SELECT job_state from job WHERE  job_detail_href=%s""",
                (item['job_detail_href'], ))
            result = self.cursor.fetchone()
            if result and result[0] != job_state:
                print("更新数据=========================")
                # BUGFIX: the UPDATE must also refresh job_state.  The
                # original left the stored fingerprint stale, so a row
                # that changed once was re-updated on every later crawl.
                self.cursor.execute(
                    """
                UPDATE job set job_name=%s, job_money=%s, max_money=%s, min_money=%s, job_date=%s, company_name=%s, job_place=%s, job_city=%s, job_area=%s, job_education=%s, job_fuli=%s, job_from=%s, job_type=%s, job_state=%s WHERE job_detail_href=%s
                """,
                    (item['job_name'], item['job_money'], item['max_money'],
                     item['min_money'], item['job_date'], item['company_name'],
                     item['job_place'], item['job_city'], item['job_area'],
                     item['job_education'], item['job_fuli'], item['job_from'],
                     item['job_type'], job_state, item['job_detail_href']))
            else:
                print("不用更新数据=========================")
        self.connection.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider stops."""
        self.cursor.close()
        self.connection.close()
# Example #2
from bloomfilter import Bloomfilter
import os

# Bloomfilter(arg): pass a bit count on the very first run, or the path
# of a previously saved state file on later runs.
if os.path.exists("state.txt"):
    print("文件存在直接加载状态")
    bloom = Bloomfilter("state.txt")
else:
    print("文件不存在设置大小为100000")
    bloom = Bloomfilter(100000)

# BUGFIX: removed the unconditional `bloom = Bloomfilter(100000)` that
# followed here — it discarded the state just loaded from state.txt,
# defeating the whole point of persisting the filter.

while True:
    key = input("请输入数据")
    if bloom.test(key):  # membership test (may give false positives)
        print("数据存在", key)
    else:
        print("数据不存在!", key)
        bloom.add(key)  # remember the key
        bloom.save("state.txt")  # persist the filter state to disk



# Approach 1: de-duplicate with a plain list.
# Pros: the logic/code is trivially easy to follow.
# Cons: 1) a huge dataset blows up memory;
#       2) the state only exists for the current run — nothing survives
#          a restart.
result = ["zhangsan", "lisi"]
keyword = "wangwu"
if 'zhangsan' in result:
        all_page = root.xpath(
            "//div[@class='p_in']/span[@class='td']/text()")[0]
        pattern = re.compile(r"\d+")  #正则表达式
        all_page = pattern.findall(all_page)[0]
        all_page = int(all_page)
        for page in range(1, all_page + 1):
            new_url = f"https://search.51job.com/list/000000,000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
            content = requests.get(new_url)
            content.encoding = "gbk"
            content = content.text
            root = etree.HTML(content)
            job_infos = root.xpath("//div[@class='dw_table']/div[@class='el']")
            for job_info in job_infos:
                job_key = job_info.xpath(
                    "p[contains(@class,'t1')]/span/a/@href")[0]
                if bloom.test(job_key):  # 测试
                    print("数据存在", job_key)
                else:
                    print("数据不存在!", job_key)
                    bloom.add(job_key)  # 添加
                    bloom.save("state2.txt")  # 状态保存

                job_name = job_info.xpath(
                    "p[contains(@class,'t1')]/span/a/@title")
                job_name = job_name[0]
                company = job_info.xpath("span[@class='t2']/a/@title")
                company = company[0]
                salary = job_info.xpath("span[@class='t4']/text()")
                if salary:
                    salary = salary[0]
                    pattern = re.compile(r"\d+\.?\d*")