def __init__(self, max_inmemory_size=1000, sparse_offset=300, segment_size=50,
             persist_segments=True, path=None, merge_threshold=3):
    """
    :param max_inmemory_size: maximum number of entries to hold in memory.
    :param sparse_offset: frequency of key offsets kept in memory. (E.g. if
        `sparse_offset=5`, one key offset is kept in memory for every 5 entries.)
    :param segment_size: maximum number of entries in a given segment.
    :param persist_segments: if set to False, cleans up segment files at the end;
        otherwise, retains the files on disk.
    :param merge_threshold: number of segments to keep intact before merging.
    :param path: absolute path to scan for pre-existing segments, and to store
        current segments. If none is provided, the default is sst_dir.
    """
    self._mem_table = MemTable(max_inmemory_size)
    self.max_inmemory_size = max_inmemory_size
    self._immutable_segments = []
    self._sparse_memory_index = SortedDict()
    self.sparse_offset = sparse_offset
    self._segment_size = segment_size
    self._bloom_filter = ScalableBloomFilter(
        mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.persist = persist_segments
    self._merge_threshold = merge_threshold
    self._base_path = None
    if path:
        self._base_path = path
        self._scan_path_for_segments(path)
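A minimal usage sketch of the constructor above; the class name `SSTableStore` and the path are invented for illustration, since the snippet does not show the enclosing class.

# Hypothetical usage sketch; `SSTableStore` stands in for whatever class
# defines the __init__ above, and the values are made up.
store = SSTableStore(
    max_inmemory_size=500,   # flush the memtable after 500 entries
    sparse_offset=100,       # keep one in-memory offset per 100 entries
    segment_size=50,         # cap each on-disk segment at 50 entries
    persist_segments=False,  # delete segment files when done
    path="/tmp/sst_dir",     # scan for and store segments here
)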
async def run(self) -> None:
    try:
        with open('/data/bloom-filter', 'rb') as f:
            log('debug', 'Using saved bloom-filter')
            self.filter = ScalableBloomFilter.fromfile(f)
    except FileNotFoundError:
        log('debug', 'Creating new bloom-filter')
        self.filter = ScalableBloomFilter(initial_capacity=100000)

    self.conn_pool = await retry(
        partial(asyncpg.create_pool,
                host='db',
                user='******',
                database='ipfs_crawler'),
        'database',
        gaierror, ConnectionRefusedError, asyncpg.CannotConnectNowError)

    # start consumers
    for _ in range(8):
        self.workers.append(asyncio.ensure_future(self.worker()))
    # start producer
    self.producer: Future = asyncio.ensure_future(self.read_logs())
    log('info', 'Started crawling')

    # If an exception is thrown in a background task, the crawler should not
    # ignore it and keep running; it should propagate the exception.
    await asyncio.gather(self.producer, *self.workers)
def __init__(self, **kwargs):
    super(AutohomeValueSpider, self).__init__(**kwargs)
    self.counts = 0
    self.carnum = 800000
    self.name = 'autohome_value'
    self.carid = list()
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_READ_COLLECTION']]
    num = (self.collection.count()) * 1.5
    self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.001)
    # filename = '../blm/' + settings['MONGODB_WRITE_COLLECTION'] + '.blm'
    # filename = settings["BLM_PATH"] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
    # filename = './test.blm'
    # self.fa = open(filename, "a")
    for i in self.collection.find():
        if "familyid" in i.keys():
            item = i["familyid"]
            item = md5(item.encode("utf8")).hexdigest()
            if not self.df.add(item):
                # self.fa.writelines(i["familyid"] + '\n')
                self.carid.append(i["familyid"])
    self.connection.close()
def __init__(self,
             endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    if os.path.exists(one_hop_bloom_file):
        with open(one_hop_bloom_file, 'rb') as bloom_file:
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None
    self.two_hop_bloom_file = two_hop_bloom_file

    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    self.two_hop_bloom_counter = 0
def __init__(self, settings):
    # mysql
    # self.conn = create_engine(
    #     f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8')
    # mongo
    self.connection = pymongo.MongoClient(
        settings['MONGODB_SERVER'],
        settings['MONGODB_PORT']
    )
    self.db = self.connection[settings['MONGODB_DB']]
    # count
    self.mongocounts = 0
    self.dropcounts = 0
    # mongo
    self.collection = self.db[settings['MONGODB_COLLECTION']]
    # bloomfilter
    try:
        num = (int(settings['CRAWL_NUM']) + self.collection.count()) * 1.5
    except Exception:
        num = settings['CRAWL_NUM']
    self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.001)
    self.settings = settings
def __init__(self, settings, idle_number, crawler):
    # mysql
    self.conn = create_engine(
        f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
    )
    # mongo
    # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
    # self.connection = pymongo.MongoClient(uri)
    # self.connection = pymongo.MongoClient(
    #     settings['MONGODB_SERVER'],
    #     settings['MONGODB_PORT']
    # )
    # db = self.connection[settings['MONGODB_DB']]
    # self.collection = db[settings['MONGODB_COLLECTION']]
    # count
    self.mongocounts = 0
    self.counts = 0
    self.CrawlCar_Num = 1000000
    self.settings = settings
    self.add_num = 0
    self.drop_num = 0
    # crawl time window
    self.start_date = time.strftime('%Y-%m-%d %X', time.localtime())
    self.end_date = time.strftime('%Y-%m-%d %X', time.localtime())
    self.scrapy_date = None
    # redis idle signal
    self.crawler = crawler
    self.idle_number = idle_number
    self.idle_list = []
    self.idle_count = 0
    # bloom file
    filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
    dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
    self.df_result = pd.DataFrame()
    self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
    # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
    # read
    if os.path.exists(dirname):
        if os.path.exists(filename):
            self.fa = open(filename, "a")
        else:
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
    else:
        os.makedirs(dirname)
        pathlib.Path(filename).touch()
        self.fa = open(filename, "a")
    with open(filename, "r") as fr:
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
def __init__(self, root_urls, capacity=0, black_patterns=(CONFIG_URL_FILTER_PATTERN,)):
    """
    Constructor. Uses a BloomFilter as the URL set if `capacity` is given,
    otherwise a plain set().
    """
    self._re_black_list = [re.compile(pattern, flags=re.IGNORECASE)
                           for pattern in black_patterns] if black_patterns else []

    # original white patterns = (r"^https?://",)
    # self._re_white_list = [re.compile(pattern, flags=re.IGNORECASE)
    #                        for pattern in white_patterns] if white_patterns else []
    self._re_white_list = []
    prefix = r"^https?://(www\.)?"
    # add the domain of each root URL to the white list
    for url in root_urls:
        # remove the http and www prefix first
        postfix = re.sub(prefix, '', url)
        # allow URLs in the form of api-west1.amazon.com
        pattern = prefix + r"([\w\-]+\.)*" + postfix
        p = re.compile(pattern, flags=re.IGNORECASE)
        self._re_white_list.append(p)

    # the bloom filter shares the same interface as set()
    if capacity:
        from pybloom_live import ScalableBloomFilter
        self._url_set = ScalableBloomFilter(capacity, error_rate=0.001)
    else:
        self._url_set = set()
    return
class Bloomfilter(object):
    logger = None

    def __init__(self, spidername="", *args, **kwargs):
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                                       error_rate=1e-6)
        self.setlogger(spidername)

    def setlogger(self, spidername=""):
        # re-bind the logger so a restored instance does not keep pointing at a stale logger
        self.logger = logging.getLogger(spidername + ".bloomfilter")

    def clearlogger(self):
        self.logger = None

    def md5(self, url):
        md5 = hashlib.md5()
        md5.update(url.encode("utf-8"))
        return md5.hexdigest()

    def check(self, url):
        url = self.md5(url)
        return url in self.sbf

    def add_in_sbf(self, url):
        try:
            self.sbf.add(self.md5(url))
        except Exception as e:
            self.logger.error("[%s] bloomfilter exception<<<<<<<<< [%s]" % (url, str(e)))
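A short usage sketch of the Bloomfilter wrapper above; the spider name and URL are made up.

# Usage sketch for the Bloomfilter wrapper (values are illustrative only).
bf = Bloomfilter(spidername="example_spider")
url = "https://example.com/item/1"
if not bf.check(url):      # not seen yet
    bf.add_in_sbf(url)     # remember its md5 fingerprint
assert bf.check(url)       # later checks report it as seen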
def __init__(self, keyList_queue, writer, contain):
    super(Spider_related, self).__init__()
    self.keyList_queue = keyList_queue
    self.writer = writer
    self.contain = contain
    # automatically scalable bloom filter
    self.bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
def create_from_file(self, filename):
    '''
    Build a filter from a file; as before, it is not persisted.
    '''
    what = ScalableBloomFilter(100000000, 0.001)
    t = temp_stupid(filename)
    for i in t.read():
        what.add(i)
    return what
def __init__(self, settings, idle_number, crawler):
    # mysql
    self.conn = create_engine(
        f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
    )
    # db = pymysql.connect(settings["MYSQL_SERVER"], settings["MYSQL_USER"], settings["MYSQL_PWD"], settings["MYSQL_DB"], charset='utf8', port=3306)
    # db = create_engine(f'mysql+pymysql://{"baogang"}:{"Baogang@2019"}@{"192.168.2.120"}:{"3306"}/{"baogang"}?charset=utf8')
    # mongo
    # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
    # self.connection = pymongo.MongoClient(uri)
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    self.db = self.connection[settings['MONGODB_DB']]
    self.collection = self.db[settings['MONGODB_COLLECTION']]
    # self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
    # count
    self.mongocounts = 0
    self.dropcounts = 0
    # mongo temporary collection
    self.collection_tmp = self.db[settings['MONGODB_COLLECTION'] + "_tmp"]
    # pandas
    self.df_end = pd.DataFrame()
    # redis idle signal
    self.crawler = crawler
    self.idle_number = idle_number
    self.idle_list = []
    self.idle_count = 0
    self.settings = settings
    self.CrawlCar_Num = 1000000
    # bloom file
    filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
    dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
    # pybloom
    self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
    if os.path.exists(dirname):
        if os.path.exists(filename):
            self.fa = open(filename, "a")
        else:
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
    else:
        os.makedirs(dirname)
        pathlib.Path(filename).touch()
        self.fa = open(filename, "a")
    with open(filename, "r") as fr:
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
def __init__(self, name=None):
    self.timers = TimerRegistry(callback=self._trigger_frame_handler)
    super().__init__(name=name)
    self.states.should_stop = False
    self.states.running = False
    self.loop = None  # asyncio.get_event_loop()
    self._spawn_on_start = set()
    self._seen_frames = ScalableBloomFilter(
        mode=ScalableBloomFilter.LARGE_SET_GROWTH, error_rate=0.001)
def generate_bloom(conf, capacity, cursor):
    b = ScalableBloomFilter(initial_capacity=capacity,
                            error_rate=conf.bloom_error_rate)
    while True:
        row = cursor.fetchone()
        if not row:
            break
        if row[0]:
            b.add(row[0].rstrip())
    return b
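A minimal sketch of how generate_bloom() could be driven by any DB-API cursor; the in-memory SQLite table and the SimpleNamespace config object are invented for illustration.

# Hypothetical driver for generate_bloom(); the sqlite3 table and the
# conf object are made up for this example.
import sqlite3
from types import SimpleNamespace

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urls (url TEXT)")
conn.executemany("INSERT INTO urls VALUES (?)",
                 [("https://example.com/a\n",), ("https://example.com/b\n",)])

cur = conn.execute("SELECT url FROM urls")
conf = SimpleNamespace(bloom_error_rate=0.001)
bloom = generate_bloom(conf, capacity=1000, cursor=cur)
print("https://example.com/a" in bloom)  # True: values are rstrip'd before adding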
def __init__(self, site_name):
    self.client = MongoClient('localhost', 27017)
    self.db = self.client.crawlSpider
    self.col_url = self.db[site_name + "_url"]
    self.col_content = self.db[site_name + "_content"]
    self.sbf = ScalableBloomFilter(initial_capacity=100)
    for item in self.col_url.find():
        self.sbf.add(item["url"])
    self.insert_url = []
    self.insert_content = []
class MyBloom:
    def __init__(self):
        self.sbf = ScalableBloomFilter(initial_capacity=100)

    def isExist(self, title):
        if title in self.sbf:
            return False
        else:
            self.sbf.add(title)
            return True
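A short usage sketch of MyBloom; note that, despite its name, isExist() returns True only when the title has not been seen before (and records it). The titles are made up.

# Usage sketch for MyBloom (titles are illustrative only).
bloom = MyBloom()
print(bloom.isExist("first article"))   # True: unseen, now recorded
print(bloom.isExist("first article"))   # False: already recorded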
def __init__(self):
    redis_db = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
    result = redis_db.smembers('spider:url')
    self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for item in result:
        self.sbf.add(item)
def __init__(self):
    # mail
    self.mailer = MailSender.from_settings(settings)
    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
    self.collectionwrong = db[settings['MONGODB_COLLECTION'] + "_wrongurllog"]
    # bloom file
    # filename = settings['BLM_PATH'] + settings['MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
    filename = 'blm/' + settings['MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
    # pybloom
    num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.1
    self.df = ScalableBloomFilter(initial_capacity=num, error_rate=0.01)
    # read
    isexists = os.path.exists(filename)
    self.fa = open(filename, "a")
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\r\n')
            self.df.add(line)
        fr.close()
    else:
        for i in self.collection.find():
            if "status" in i.keys():
                item = i["status"]
                item = md5(item).hexdigest()
                self.df.add(item)
                self.fa.writelines(item + '\r\n')
    # count
    self.mongocounts = 0
    self.sqlcounts = 0
    # mysql
    self.mysqlconnection = MySQLdb.connect(settings['MYSQLDB_SERVER'],
                                           settings['MYSQLDB_USER'],
                                           settings['MYSQLDB_PASS'],
                                           settings['MYSQLDB_DB'],
                                           port=settings['MYSQLDB_PORT'])
    self.dbc = self.mysqlconnection.cursor()
    self.mysqlconnection.set_character_set('utf8')
    self.dbc.execute('SET NAMES utf8;')
    self.dbc.execute('SET CHARACTER SET utf8;')
    self.dbc.execute('SET character_set_connection=utf8;')
    # self.table = settings['MONGODB_COLLECTION'] + '_' + time.strftime("%Y%W")
    self.table = settings['MONGODB_COLLECTION'] + '_online'
    self.items = []
    self.caritemlist = car_parse.Parse_conf(settings['MONGODB_COLLECTION'])
def get_updated(): db = pymysql.connect(host="192.168.2.97", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select title_url from spider_high_talent" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0] for i in db_data] from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001) for bl in data: bloom.add(bl) return bloom
def __init__(self, settings):
    # mysql
    self.conn = create_engine(
        f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
    )
    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    self.db = self.connection[settings['MONGODB_DB']]
    # mongo
    # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
    # self.connection = pymongo.MongoClient(uri)
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
    # date
    self.start_date = None
    self.end_date = None
    self.scrapy_date = f'{self.start_date} - {self.end_date}'
    # count
    self.mongocounts = 0
    self.counts = 0
    self.CrawlCar_Num = 1000000
    self.settings = settings
    # bloom file
    filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
    dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
    # pybloom
    self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
    if os.path.exists(dirname):
        if os.path.exists(filename):
            self.fa = open(filename, "a")
        else:
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
    else:
        os.makedirs(dirname)
        pathlib.Path(filename).touch()
        self.fa = open(filename, "a")
    with open(filename, "r") as fr:
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
def get_updated(): db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select product_license_num from spider_sh_company_medical_equipment_entrust_produce" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0] for i in db_data] from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001) for bl in data: bloom.add(bl) return bloom
class BloomDupeFilter():
    def __init__(self):
        self.fingerprints = ScalableBloomFilter(
            initial_capacity=2000000,
            error_rate=0.00001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, url):
        if url in self.fingerprints:
            return True
        else:
            self.fingerprints.add(url)
            return False
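A short usage sketch of BloomDupeFilter above; the URLs are invented.

# Usage sketch for BloomDupeFilter (URLs are illustrative only).
dupe = BloomDupeFilter()
print(dupe.request_seen("https://example.com/page/1"))  # False: first sighting
print(dupe.request_seen("https://example.com/page/1"))  # True: duplicate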
def get_updated(sum):
    sql = "select * from tyc_source_data"
    cursor.execute(sql)
    data = cursor.fetchall()
    if len(data) == 0:
        return sum
    data = [i[2] for i in data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    sum = [i for i in sum if i['园区id'] not in bloom]
    return sum
class PageDupeFilter(BaseDupeFilter):
    """
    PageDupeFilter

    Uses a bloom filter internally and loads already-processed page URLs
    from PostgreSQL.
    """

    def __init__(self):
        self.__pg_client = get_database()
        self.__filter = ScalableBloomFilter(initial_capacity=2 * 10e5)

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request: scrapy.Request) -> bool:
        host, path = divide_url(request.url)
        # fingerprint = sha1(host + path)
        fp_s = (host + path).encode()
        fp = hashlib.sha1(fp_s).hexdigest()
        if fp in self.__filter:
            return True
        self.__filter.add(fp)
        return False

    def open(self):
        size = 100
        sql = 'SELECT DISTINCT host || path FROM public.pages WHERE publish_date IS NOT NULL'
        with self.__pg_client.cursor() as cursor:
            cursor.execute(sql)
            b_continue = True
            while b_continue:
                result = cursor.fetchmany(size)
                for r in result:
                    fp_s = (r[0]).encode()
                    fp = hashlib.sha1(fp_s).hexdigest()
                    self.__filter.add(fp)
                if len(result) < size:
                    b_continue = False
        self.__pg_client.close()
        self.__pg_client = None

    def close(self, reason):
        pass

    def log(self, request, spider):
        pass
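A minimal sketch of the fingerprinting used by request_seen() above. The snippet does not show divide_url(), so a simple urlsplit-based stand-in is assumed here.

# Stand-in for divide_url() (not shown in the snippet): split a URL into
# host and path, then fingerprint it the same way request_seen() does.
import hashlib
from urllib.parse import urlsplit

def divide_url_stub(url: str):
    parts = urlsplit(url)
    return parts.netloc, parts.path

host, path = divide_url_stub("https://example.com/articles/42?ref=rss")
fp = hashlib.sha1((host + path).encode()).hexdigest()
print(fp)  # the query string is ignored, so /articles/42 dedupes across refs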
def __init__(self, settings):
    # mysql
    self.conn = create_engine(
        f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
    )
    # mongo
    # self.connection = pymongo.MongoClient(
    #     settings['MONGODB_SERVER'],
    #     settings['MONGODB_PORT']
    # )
    # self.db = self.connection[settings['MONGODB_DB']]
    # count
    self.mongocounts = 0
    self.dropcounts = 0
    # mongo
    # self.collection = self.db[settings['MONGODB_COLLECTION']]
    # print(settings['MONGODB_COLLECTION'])
    # print("*" * 100)
    # bloomfilter
    # num = (int(settings['CRAWL_NUM']) + self.collection.count()) * 1.5
    self.settings = settings
    self.CrawlCar_Num = 1000000
    # bloom file
    filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
    dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
    # pybloom
    self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
    # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
    # read
    if os.path.exists(dirname):
        if os.path.exists(filename):
            self.fa = open(filename, "a")
        else:
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
    else:
        os.makedirs(dirname)
        pathlib.Path(filename).touch()
        self.fa = open(filename, "a")
    with open(filename, "r") as fr:
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
def get_updated():
    db = pymysql.connect(host="192.168.2.99", user="******", password='******',
                         database="spider", port=3306)
    cursor = db.cursor()
    sql = "select record_id from spider_sh_ralated_GMP_license"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    # print(db_data)
    # exit()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
def fiktergenerator(self, mode):
    '''
    Build a filter from the values of a single database column.
    '''
    if 'token' in mode:
        mode = 'TOKEN'
    if 'id' in mode:
        mode = 'ID'
    cc = self.c.execute('SELECT ' + mode + ' FROM id2token')
    r = cc.fetchall()
    bloom = ScalableBloomFilter(100000000, 0.001)
    for i in r:
        bloom.add(i[0])
    return bloom
def updated():
    db = pymysql.connect(host="192.168.2.97", user="******", password='******',
                         database="spider", port=3306)
    cursor = db.cursor()
    sql = "select url from spider_2_company_revoke"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0].strip() for i in db_data]
    bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    for i in data:
        bloom.add(i)
    return bloom
def __init__(self, site):
    '''
    (Crawler, str) -> Crawler
    creates a Crawler with a given origin_url
    '''
    self.site = site
    self.filters = site.referringsitefilter_set.all()
    self.domain = urlparse(site.url).netloc
    # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
    # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
    self.ignore_filter = ScalableBloomFilter(
        initial_capacity=10000000,
        error_rate=0.00001)
    ignore_filter_dir = '../ignore_filter/'
    if not os.path.exists(ignore_filter_dir):
        os.makedirs(ignore_filter_dir)
        self.ignore_filter = ScalableBloomFilter(
            initial_capacity=10000000,
            error_rate=0.00001)
        try:
            f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
            f.write(self.ignore_filter)
        except IOError:
            f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
    else:
        if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
            f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
            try:
                for line in ignore_filter_file:
                    self.ignore_filter.add(line.decode('utf8').rstrip())
            except Exception as e:
                logging.info(str(e))
        ignore_filter_file.close()

    self.visited_count = 0

    tmpqueuetmp_dir = '../tmpqueue/tmp/'
    if not os.path.exists(tmpqueuetmp_dir):
        os.makedirs(tmpqueuetmp_dir)

    slugified_name = slugify(unicode(site.name))
    tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
    if not os.path.exists(tmpqueue_dir):
        os.makedirs(tmpqueue_dir)

    self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

    # Initial url
    if (self.site.is_shallow == False):
        self.to_visit.put(site.url)
    else:
        self.to_visit.put((site.url, str(0)))

    # Limit
    self.limit = common.get_config()["crawler"]["limit"]
    # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
    self.level = common.get_config()["crawler"]["level"]
    """
patterns.load('./pos-patterns')

# Load list of stopwords
print(colored('Loading stopwords...', 'cyan'))
stopwords = []
with open('./pos-stopwords') as f:
    stopwords = list(f.readlines())

# Initialise a crawling dataset connection
print(colored('Initialising wikipedia crawling collection...', 'cyan'))
crawl_collection = init_crawl_collection()

# Iterate through the crawling database
n = 0
print(colored('Iterating over crawling database...', 'cyan'))
bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for topic, sentence in iter_topic(crawl_collection, args['start']):
    # Clean topic string
    topic = topic.replace("'", '').replace('\n', '')

    # Check whether the number of processed topics exceeds the limit
    if topic not in bf:
        bf.add(topic)
        if len(bf) > args['limit']:
            print(colored('[Topics limit reached] ... BYE', 'cyan'))
            sys.exit(0)

    # Break the sentence into knowledge nodes
    pos = TextStructure.pos_tag(sentence)
    kb_nodes = patterns.capture(pos)
def next(self):
    '''
    (Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
        try:
            current_level = 0
            while(True):
                if (self.limit > 0 and self.visited_count > self.limit):
                    raise StopIteration('Limit reached: {:d}'.format(self.limit))
                # if(self.pages_visited > self.probabilistic_n):
                #     raise StopIteration
                # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                # row = self.cursor.fetchone()
                # if(row):
                #     row_id = row[0]
                #     current_url = row[1]
                #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                # else:
                #     raise StopIteration
                # if(self._should_skip()):
                #     logging.info(u"skipping {0} randomly".format(current_url))
                #     continue
                try:
                    if (self.site.is_shallow):
                        current = self.to_visit.get_nowait()
                        current_url = current[0]
                        current_level = current[1]
                        logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                    else:
                        current_url = self.to_visit.get_nowait()
                except Empty:
                    self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                    self.to_visit.put((self.site.url, str(0)))
                    self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                    ignore_filter_file.close()
                    os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                    logging.info("stopped iteration")
                    logging.info(u"{0}".format(self.site.url))
                    raise ZeroDivisionError

                logging.info(u"visiting {0}".format(current_url))
                self.visited_count += 1

                # use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()
                if (self.site.is_shallow):
                    if (int(current_level) > self.level):
                        continue

                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        parsed_as_list[5] = ''
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if(not parsed_url.netloc.endswith(self.domain)):
                        continue
                    # If the url has already been added to the ignore list, skip it
                    if (url in self.ignore_filter):
                        continue
                    # Ignores the subscribe links for many domains
                    if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        continue

                    # Append the url to the to_visit queue
                    if (self.site.is_shallow):
                        self.to_visit.put((url, str(int(current_level) + 1)))
                        logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))
                        # Append the url to visited to remove duplicates
                        self.ignore_filter.add(url)
                        ignore_filter_file.write(url.encode('utf8') + "\n")
                    else:
                        self.to_visit.put(url)
                        logging.info(u"added {0} to the to_visit".format(url))
                        # Append the url to visited to remove duplicates
                        self.ignore_filter.add(url)
                        ignore_filter_file.write(url.encode('utf8') + "\n")

                # Update the Queue
                self.to_visit.task_done()

                return article

        except StopIteration as e:
            raise e
        except ValueError as e:
            raise ValueError
        except Exception as e:
            raise e
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
            initial_capacity=10000000,
            error_rate=0.00001)
        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            try:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
                f.write(self.ignore_filter)
            except IOError:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()
        else:
            if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()
            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()

        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]

        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''
        # standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration
                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000,
                            error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1

                    # use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue

                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)
                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url has already been added to the ignore list, skip it
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                            continue

                        # Append the url to the to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))
                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))
                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or
                    (not filt.regex and filt.pattern in url)):
                return True
        return False