def test_data_path_inside_project(self):
    with inside_a_project() as proj_path:
        expected = os.path.join(proj_path, '.scrapy', 'somepath')
        self.assertEqual(os.path.realpath(expected),
                         os.path.realpath(data_path('somepath')))
        abspath = os.path.join(os.path.sep, 'absolute', 'path')
        self.assertEqual(abspath, data_path(abspath))
def test_data_path_outside_project(self):
    self.assertEqual(
        os.path.join('.scrapy', 'somepath'),
        data_path('somepath')
    )
    abspath = os.path.join(os.path.sep, 'absolute', 'path')
    self.assertEqual(abspath, data_path(abspath))
def test_data_path_inside_project(self):
    with inside_a_project() as proj_path:
        expected = os.path.join(proj_path, '.scrapy', 'somepath')
        self.assertEqual(
            os.path.realpath(expected),
            os.path.realpath(data_path('somepath'))
        )
        self.assertEqual('/absolute/path', data_path('/absolute/path'))
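# For reference, a minimal sketch of the behaviour the tests above exercise
# (an assumption for illustration, not the actual Scrapy implementation):
# relative paths resolve under the project's .scrapy data directory (plain
# './.scrapy' outside a project), absolute paths pass through unchanged, and
# createdir=True creates the resulting directory.
import os

def data_path_sketch(path, createdir=False):
    # hypothetical helper name; mirrors what data_path('somepath') is asserted
    # to return in the tests above
    if not os.path.isabs(path):
        path = os.path.join('.scrapy', path)
    if createdir:
        os.makedirs(path, exist_ok=True)
    return path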
class GoogleCloud(object):
    credentials_json_path = ''.join(
        (data_path('auth/', True), 'google-service-account.json'))

    @classmethod
    def from_crawler(cls, crawler):
        return cls(settings=crawler.settings)

    def __init__(self, settings):
        self.google_cloud_enabled = settings.getbool('GOOGLE_CLOUD_ENABLED')
        if self.google_cloud_enabled:
            credentials_json = settings.get(
                'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON')
            if credentials_json:
                if not os.path.isfile(self.credentials_json_path):
                    with open(self.credentials_json_path, 'w') as outfile:
                        outfile.write(credentials_json)
                os.environ[
                    'GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_json_path
                logger.info('Google Cloud extensions initialized successfully')
            else:
                settings.set('GOOGLE_CLOUD_ENABLED', False)
                raise NotConfigured(
                    'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON '
                    'is not set in settings')
        else:
            logger.info('GOOGLE_CLOUD_ENABLED is False')

    def close_spider(self, spider):
        if self.google_cloud_enabled \
                and os.path.isfile(self.credentials_json_path):
            os.remove(self.credentials_json_path)
def __init__(self, settings):
    self.redis_conn = redis.Redis(
        host=settings.get('REDIS_HOST'),
        port=settings.get('REDIS_PORT'))
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.persist = settings.get('SCHEDULER_PERSIST', True)
def from_crawler(cls, crawler):
    s = crawler.settings
    if not s.getbool('DELTAFETCH_ENABLED'):
        raise NotConfigured
    dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
    dbmodule = s.get('DELTAFETCH_DBM_MODULE', 'anydbm')
    return cls(dir, dbmodule)
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.dbmodule = import_module(
        settings['HTTPCACHE_DBM_MODULE']
    )  # dbm -- is there really such a module? Persisting data still comes down to I/O; open() is unavoidable.
    self.db = None
def __init__(self, settings):
    warn("The LevelDB storage backend is deprecated.",
         ScrapyDeprecationWarning, stacklevel=2)
    import leveldb
    self._leveldb = leveldb
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.db = None
def from_crawler(cls, crawler):
    s = crawler.settings
    if not s.getbool('DELTAFETCH_ENABLED'):
        raise NotConfigured
    dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
    reset = s.getbool('DELTAFETCH_RESET')
    o = cls(dir, reset, crawler.stats)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def from_crawler(cls, crawler):
    s = crawler.settings
    if not s.getbool('PERSISTENT_PAGE_CLUSTERING'):
        raise NotConfigured
    directory = data_path(s.get('CLUSTERING_DIR', 'clustering'))
    reset = s.getbool('CLUSTERING_RESET')
    o = cls(directory, reset, crawler.stats)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def __init__(self, filename, save_exec_time=False, *args, **kwargs):
    super(PersistDataManager, self).__init__(*args, **kwargs)
    if filename == '':
        raise PersistDataException(
            'Filename required to persist data on SH')
    self.file_path = data_path(filename + '.json')
    logger.info('Using persistent file: ' + self.file_path)
    self._load()
    if save_exec_time:
        self['spider_exec'] = str(datetime.today())
def from_settings(cls, settings, **kwargs):
    from .pybloom import ScalableBloomFilter
    p = settings.get("BLOOMFILTER_PATH", data_path("."))
    ic = settings.get("BLOOMFILTER_SIZE", 5000000)
    ert = settings.get("BLOOMFILTER_ERROR_RATE", 0.001)
    mode = settings.get("BLOOMFILTER_MODE",
                        ScalableBloomFilter.SMALL_SET_GROWTH)
    return cls(path=p, initial_capacity=ic, error_rate=ert, mode=mode)
def from_crawler(cls, crawler):
    settings = crawler.settings
    if not settings.getbool('DROP_DUPLICATES_ENABLED'):
        raise NotConfigured
    dir__ = data_path(settings.get('CACHE_DIR', 'cache'))
    dedup = cls(dir__, crawler.stats, settings)
    crawler.signals.connect(dedup.spider_opened,
                            signal=signals.spider_opened)
    crawler.signals.connect(dedup.spider_closed,
                            signal=signals.spider_closed)
    return dedup
def from_crawler(cls, crawler):
    s = crawler.settings
    if not s.getbool('CRAWL_ONCE_ENABLED', True):
        raise NotConfigured()
    path = data_path(s.get('CRAWL_ONCE_PATH', 'crawl_once'),
                     createdir=True)
    default = s.getbool('CRAWL_ONCE_DEFAULT', default=False)
    o = cls(path, crawler.stats, default)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
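# Illustrative only: a hypothetical settings.py fragment showing the options
# the from_crawler hook above reads; the values mirror the defaults in the
# s.get()/s.getbool() calls and are assumptions, not prescribed configuration.
CRAWL_ONCE_ENABLED = True        # middleware raises NotConfigured when False
CRAWL_ONCE_PATH = 'crawl_once'   # resolved via data_path(..., createdir=True)
CRAWL_ONCE_DEFAULT = False       # passed through to the middleware instance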
def __init__(self, settings):
    super(S3CacheStorage, self).__init__(settings)
    self.tmpcachedir = data_path(settings.get(
        'S3CACHE_TEMPDIR',
        os.path.join(tempfile.tempdir, '.s3cache'),
    ))
    self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
    self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
    self.bucket_name = settings['S3CACHE_BUCKET']
    if self.bucket_name is None:
        raise NotConfigured("S3CACHE_BUCKET must be specified")
    self._conn = None
def spider_opened(self):
    s = self.settings
    self.email = EmailNotification(s['EMAIL'], s['PASSWORD'])
    for address in self.addresses:
        self.email.add_address(address)
    self.path = data_path(s['FILE_NAME'])
    try:
        with open(self.path, 'r') as f:
            self.last_url = f.read()
    except Exception:
        self.last_url = 'URL_NULL'
def __init__(self, settings):
    super(S3CacheStorage, self).__init__(settings)
    self.tmpcachedir = data_path(
        settings.get(
            'S3CACHE_TEMPDIR',
            os.path.join(tempfile.tempdir, '.s3cache'),
        ))
    self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
    self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
    self.bucket_name = settings['S3CACHE_BUCKET']
    if self.bucket_name is None:
        raise NotConfigured("S3CACHE_BUCKET must be specified")
    self._conn = None
def delete_spidersfiles():
    dir_path = data_path('')
    logger.debug('Path of .scrapy dir == [%s]' % dir_path)
    display_list = os.listdir(dir_path)
    logger.debug('{%s}' % str(display_list))
    file_list = [
        f for f in os.listdir(dir_path) if f.endswith('Operator.json')
    ]
    for f in file_list:
        logger.info('Deleting file [{}]'.format(f))
        try:
            # join with dir_path so deletion also works when the current
            # working directory is not the .scrapy directory
            os.remove(os.path.join(dir_path, f))
        except OSError as e:
            logger.error('Error: %s - %s.' % (e.filename, e.strerror))
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.sqlite_database = settings['SQLITE_DATABASE']
    self.table = settings.get('SQLITE_REQUESTS_TABLE',
                              connection.SQLITE_REQUESTS_TABLE)
    self.database = settings.get('SQLITE_DATABASE',
                                 connection.SQLITE_DATABASE)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
    if self.use_gzip:
        self._loads = self._gzip_loads
        self._dumps = self._gzip_dumps
    else:
        self._loads = self._pickle_loads
        self._dumps = self._pickle_dumps
    self.conn = None
def parse(self, response):
    # get all proxies and their ports
    proxy_table = response.css('table#proxylisttable')
    proxy_table_rows = proxy_table.xpath('.//tr')[1:]  # skip the table header
    filename = 'proxies.txt'
    mydata_path = data_path(filename)
    # the with block closes the file, so no explicit close() is needed
    with open(mydata_path, 'w') as fout:
        for row in proxy_table_rows:
            ip = row.xpath('./td[1]/text()').extract_first()
            port = row.xpath('./td[2]/text()').extract_first()
            if ip and port:
                proxy = ':'.join([ip, port])
                fout.write(proxy)
                fout.write('\n')
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'])
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
    self._open = gzip.open if self.use_gzip else open
def __init__(self, settings=conf.settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'])
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
    self.dbs = {}
def _stats_location(self, spider):
    statsdir = data_path("stats", createdir=True)
    return os.path.join(statsdir, f"{spider.name}_stats_history")
def __init__(self, settings: Settings):
    super(SQLiteStorage, self).__init__(settings)
    self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
    self.database: str = settings["COOKIES_SQLITE_DATABASE"]
    self.conn: Connection = None
    self.cur: Cursor = None
def __init__(self, settings: Settings):
    super(InMemoryStorage, self).__init__()
    self.settings: Settings = settings
    self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
def __init__(self, settings=conf.settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'])
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'])
def test_data_path_inside_project(self):
    with inside_a_project() as proj_path:
        expected = os.path.join(proj_path, '.scrapy', 'somepath')
        self.assertEqual(expected, data_path('somepath'))
        self.assertEqual('/absolute/path', data_path('/absolute/path'))
def __init__(self, settings):
    self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
    self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
    self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
    self.db = None
def run_cleanup_cache(settings):
    days = int(
        settings.get('FEEDS_CONFIG', {}).get('feeds', {}).get(
            'cache_expires', 14))
    cleanup_cache(data_path(settings['HTTPCACHE_DIR']),
                  datetime.now() - timedelta(days=days))
def __init__(self, crawler):
    super(LocalStorageStatsHistoryCollector, self).__init__(crawler)
    statsdir = data_path("stats", createdir=True)
    self.stats_location = os.path.join(
        statsdir, "{}_stats_history".format(crawler.spider.name))
def __init__(self, settings: Settings):
    super(FilesystemCacheStorage, self).__init__(settings)
    self.cachedir = data_path(settings["HTTPCACHE_DIR"])
    self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
    self._open = gzip.open if self.use_gzip else open
def __init__(self, settings):
    super(FilesystemCacheStorage, self).__init__(settings)
    self.cachedir = data_path(self.httpcache_dir.to_value())
    self.expiration_secs = self.httpcache_expiration_secs.to_value()
def __init__(self, settings):
    import leveldb
    self._leveldb = leveldb
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.db = None
def test_data_path_outside_project(self):
    self.assertEqual('.scrapy/somepath', data_path('somepath'))
    self.assertEqual('/absolute/path', data_path('/absolute/path'))
def __init__(self, settings):
    self.cachedir = data_path(settings["HTTPCACHE_DIR"])
    self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'])
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    'RISJbot.spmiddlewares.refetchcontrol.RefetchControl': 800,
    # Note: Should be after RefetchControl, to ensure that the URLs stored
    # are the altered "canonical" ones.
    'RISJbot.spmiddlewares.equivalentdomains.EquivalentDomains': 900,
    'RISJbot.spmiddlewares.unwantedcontent.UnwantedContent': 950,
}

# Enable RefetchControl, 8 fetches total, every 3 hours, including a
# trawl of previously-fetched pages for completeness (TN, 2017-03-15)
REFETCHCONTROL_ENABLED = True
REFETCHCONTROL_MAXFETCHES = 8
REFETCHCONTROL_REFETCHSECS = 10800
REFETCHCONTROL_REFETCHFROMDB = True
REFETCHCONTROL_TRIMDB = True
REFETCHCONTROL_RQCALLBACK = 'spider.parse_page'
REFETCHCONTROL_DIR = data_path('RefetchControl', createdir=True)

# Enable UnwantedContent, stripping figures etc. (TN, 2017-02-27)
UNWANTEDCONTENT_ENABLED = True
UNWANTEDCONTENT_XPATHS = [
    '//figure',
    '//script',
    '//style',
    '//form',
]

# Enable Fake404, dropping responses that are actually "page not found",
# but come with an improper HTTP 200 success code. Lookin' at you, foxnews.com.
FAKE404_ENABLED = True

# List of ( url regex, matching xpath ) tuples
FAKE404_DETECTIONSIGS = [
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
    self.db = None
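# Illustrative only: the HTTP-cache settings read by the DBM storage __init__
# above; the values shown are assumed examples, not project defaults.
HTTPCACHE_DIR = 'httpcache'          # resolved under .scrapy by data_path()
HTTPCACHE_EXPIRATION_SECS = 0        # 0 means cached responses never expire
HTTPCACHE_DBM_MODULE = 'dbm'         # module name passed to import_module()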
def test_data_path_outside_project(self):
    self.assertEqual(os.path.join('.scrapy', 'somepath'),
                     data_path('somepath'))
    abspath = os.path.join(os.path.sep, 'absolute', 'path')
    self.assertEqual(abspath, data_path(abspath))
def test_data_path_outside_project(self):
    self.assertEqual('.scrapy/somepath', data_path('somepath'))
    self.assertEqual('/absolute/path', data_path('/absolute/path'))
def __init__(self, settings):
    self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
    self.httpcache_expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
    self.db = None
def __init__(self, settings):
    self.cachedir = data_path(self.httpcache_dir.to_value())
    self.expiration_secs = self.httpcache_expiration_secs.to_value()
    self.dbmodule = __import__(self.httpcache_dbm_module.to_value())
    self.dbs = {}