Exemple #1
0
class Splunk(object):
    """Toy event index: a bloom filter in front of a term -> event-id map."""

    def __init__(self):
        self.bf = Bloomfilter(64)
        self.terms = {}  # term -> set of IDs of the events containing it
        self.events = []  # an event's ID is its position in this list

    def add_event(self, event):
        """Store ``event`` and index it under each term ``segments`` yields."""
        # The next free list position doubles as the event's unique ID.
        new_id = len(self.events)
        self.events.append(event)

        # Register every term in the bloom filter and in the exact index.
        for token in segments(event):
            self.bf.add_value(token)
            self.terms.setdefault(token, set()).add(new_id)

    def search(self, term):
        """Yield every stored event indexed under ``term``, in ascending ID order."""
        # First the cheap probabilistic check (the original author notes this
        # is O(1) in Splunk and likely served from filesystem cache), then the
        # exact lookup (noted as probably O(log N) over the tsidx terms).
        if self.bf.might_contain(term) and term in self.terms:
            for idx in sorted(self.terms[term]):
                yield self.events[idx]
Exemple #2
0
 def __init__(self, dbpool):
     """Keep the DB pool, load or create the bloom filter, and schedule ``db_create``."""
     self.dbpool = dbpool
     # Reuse the saved "job.state" file when it exists; otherwise start a
     # fresh filter constructed with 1000000.
     self.bloom = Bloomfilter(
         "job.state" if os.path.exists("job.state") else 1000000)
     # Run db_create through the pool and route failures to db_create_err.
     # NOTE(review): runInteraction/addErrback look like Twisted adbapi - confirm.
     deferred = self.dbpool.runInteraction(self.db_create)
     deferred.addErrback(self.db_create_err)
Exemple #3
0
    def open_spider(self, spider):
        """Connect to MySQL, load the bloom filter and make sure the `job` table exists.

        Sets ``self.bloom``, ``self.connection`` and ``self.cursor``.
        """
        # settings['MYSQL_SETTINGS'] is unpacked straight into pymysql.connect().
        # Per the original notes it carries: host (e.g. localhost), user,
        # password, database (must already exist), port (MySQL default 3306)
        # and charset (e.g. utf8).
        db = pymysql.connect(**settings['MYSQL_SETTINGS'])
        db_cursor = db.cursor()

        # Resume from the saved filter state if present, else start fresh.
        self.bloom = Bloomfilter(
            "job.state" if os.path.exists("job.state") else 1000000)

        # PRIMARY KEY: values cannot repeat.  AUTO_INCREMENT: assigned by MySQL.
        # Column names that contain spaces or are SQL keywords should be
        # wrapped in backticks, e.g. `my name`.
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS `job` (
            job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
            job_name text COMMENT '工作名称', 
            job_money text COMMENT '工作薪资',
            max_money FLOAT COMMENT '最大薪资',
            min_money FLOAT COMMENT '最少薪资',
            job_date text COMMENT '工作发布时间',
            company_name text COMMENT '公司名称',
            job_place text COMMENT '工作地点',
            job_city text COMMENT '工作城市',
            job_area text COMMENT '工作地区',
            job_education text COMMENT '工作学历',
            job_fuli text COMMENT '公司福利',
            job_from text COMMENT '工作所属网站',
            job_type text COMMENT '工作类型',
            job_detail_href text COMMENT '详情地址',
            job_state text COMMENT '工作数据的加密信息'
        )
        """
        db_cursor.execute(create_table_sql)
        self.connection, self.cursor = db, db_cursor
Exemple #4
0
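#   3  Bloom-filter de-duplication (recommended)
# A bloom filter stores only state bits: 0 / 1
# Pros: low memory footprint, and the state can be persisted to disk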
from bloomfilter import Bloomfilter
import os
# Bloomfilter's first argument is either a size in bits (first run) or the
# path of a previously saved state file (later runs).
if os.path.exists("state.txt"):
    print("文件存在直接加载状态")
    bloom = Bloomfilter("state.txt")
else:
    print("文件不存在设置大小为100000")
    bloom = Bloomfilter(100000)

# The filter chosen above is used as-is.  Re-creating it unconditionally here
# would throw away the state loaded from "state.txt" and make persistence
# pointless.
while True:
    key = input("请输入数据")
    if bloom.test(key):  # membership test
        print("数据存在",key)
    else:
        print("数据不存在!",key)
        bloom.add(key)  # remember the new key
        bloom.save("state.txt")  # persist so the next run can reload it



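#   1   List-based de-duplication
# Pros: the logic and code are simple and easy to follow
# Cons: 1) with a huge amount of data the machine cannot cope (memory)
#       2) only valid for the current run; the program cannot be stopped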
Exemple #5
0
class SaveToMysqlPipeline(object):
    """Pipeline that stores job items in the MySQL `job` table.

    Items are de-duplicated by ``job_detail_href``: a bloom filter gives a
    fast "probably seen" answer and an MD5 fingerprint of the whole item
    (column ``job_state``) tells whether an already stored row has changed.

    NOTE(review): the open_spider/process_item/close_spider hooks look like a
    Scrapy item pipeline - confirm against the project settings.
    """

    # Item fields written by both INSERT and UPDATE, in placeholder order.
    _DATA_FIELDS = (
        'job_name', 'job_money', 'max_money', 'min_money', 'job_date',
        'company_name', 'job_place', 'job_city', 'job_area',
        'job_education', 'job_fuli', 'job_from', 'job_type',
    )

    def open_spider(self, spider):
        """Connect to MySQL, load the bloom filter and make sure the `job` table exists.

        Sets ``self.bloom``, ``self.connection`` and ``self.cursor``.
        """
        # settings['MYSQL_SETTINGS'] is unpacked straight into pymysql.connect().
        # Per the original notes it carries: host (e.g. localhost), user,
        # password, database (must already exist), port (MySQL default 3306)
        # and charset (e.g. utf8).
        connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
        cursor = connection.cursor()

        # Resume from the saved filter state if present, else start fresh.
        if os.path.exists("job.state"):
            self.bloom = Bloomfilter("job.state")
        else:
            self.bloom = Bloomfilter(1000000)

        # PRIMARY KEY: values cannot repeat.  AUTO_INCREMENT: assigned by MySQL.
        # Column names that contain spaces or are SQL keywords should be
        # wrapped in backticks, e.g. `my name`.
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS `job` (
            job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
            job_name text COMMENT '工作名称', 
            job_money text COMMENT '工作薪资',
            max_money FLOAT COMMENT '最大薪资',
            min_money FLOAT COMMENT '最少薪资',
            job_date text COMMENT '工作发布时间',
            company_name text COMMENT '公司名称',
            job_place text COMMENT '工作地点',
            job_city text COMMENT '工作城市',
            job_area text COMMENT '工作地区',
            job_education text COMMENT '工作学历',
            job_fuli text COMMENT '公司福利',
            job_from text COMMENT '工作所属网站',
            job_type text COMMENT '工作类型',
            job_detail_href text COMMENT '详情地址',
            job_state text COMMENT '工作数据的加密信息'
        )
        """)
        self.connection = connection
        self.cursor = cursor

    def process_item(self, item, spider):
        """Insert a new job, update a changed one, or leave an unchanged one alone.

        Args:
            item: mapping with the ``_DATA_FIELDS`` keys plus ``job_detail_href``.
            spider: unused; part of the pipeline hook signature.

        Returns:
            The same ``item`` object, unchanged.
        """
        # Fingerprint the whole item: JSON dump -> MD5 hex digest.  Used only
        # for change detection, not for security.
        job_state = hashlib.md5(
            json.dumps(dict(item)).encode(encoding='utf-8')).hexdigest()
        href = item['job_detail_href']

        if not self.bloom.test(href):
            # Definitely never seen: store it and remember the URL.
            print("添加数据========================")
            self._insert_job(item, job_state)
            self.bloom.add(href)
            self.bloom.save("job.state")
        else:
            self.cursor.execute(
                """SELECT job_state from job WHERE  job_detail_href=%s""",
                (href, ))
            result = self.cursor.fetchone()
            if not result:
                # The bloom filter said "seen" but no row exists: either a
                # false positive or a saved filter entry whose row never got
                # committed.  Insert instead of silently dropping the item.
                print("添加数据========================")
                self._insert_job(item, job_state)
            elif result[0] != job_state:
                print("更新数据=========================")
                self._update_job(item, job_state)
            else:
                print("不用更新数据=========================")
        self.connection.commit()
        return item

    def _insert_job(self, item, job_state):
        """Insert one row for ``item`` together with its fingerprint."""
        self.cursor.execute(
            """
            INSERT INTO job ( job_name, job_money, max_money, min_money, job_date, company_name, job_place, job_city, job_area, job_education, job_fuli, job_from, job_type, job_detail_href, job_state ) VALUES ( %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s )
            """,
            tuple(item[name] for name in self._DATA_FIELDS)
            + (item['job_detail_href'], job_state))

    def _update_job(self, item, job_state):
        """Overwrite the stored row for ``item``, including its fingerprint.

        ``job_state`` must be written too; otherwise the stored fingerprint
        stays stale and the row is rewritten on every later crawl.
        """
        self.cursor.execute(
            """
            UPDATE job set job_name=%s, job_money=%s, max_money=%s, min_money=%s, job_date=%s, company_name=%s, job_place=%s, job_city=%s, job_area=%s, job_education=%s, job_fuli=%s, job_from=%s, job_type=%s, job_state=%s WHERE job_detail_href=%s
            """,
            tuple(item[name] for name in self._DATA_FIELDS)
            + (job_state, item['job_detail_href']))

    def close_spider(self, spider):
        """Close the cursor and then the connection opened in ``open_spider``."""
        self.cursor.close()
        self.connection.close()
Exemple #6
0
 def __init__(self):
     """Start empty: no events, no indexed terms, and a new ``Bloomfilter(64)``."""
     self.events = []  # stored events
     self.terms = {}  # maps each term to a set of events
     self.bf = Bloomfilter(64)