def __init__(self, spec: CrawlSpecification, settings: dict):
    super().__init__(spec)
    self.log = shared.simple_logger("RemoteCrawlFinalizer",
                                    file_path=os.path.join(self.crawl_specification.logs,
                                                           self.crawl_specification.name,
                                                           "scrapy.log"))
def __init__(self, logname="scrapy_wrapper", spec=None, **kwargs):
    super().__init__(**kwargs)
    # guard against the default spec=None before touching its attributes
    if spec and spec.logs:
        self.logger = shared.simple_logger(loger_name="linkextractor",
                                           file_path=os.path.join(spec.logs, logname + ".log"))
    else:
        # fall back to console-only logging, mirroring the crawlspider setup
        self.logger = shared.simple_logger(loger_name="linkextractor")
def __init__(self):
    super().__init__()
    # set up an individual logger for every spider
    if self.crawl_specification.logs:
        self.s_log = shared.simple_logger(loger_name="crawlspider",
                                          file_path=os.path.join(self.crawl_specification.logs,
                                                                 self.name + ".log"))
    else:
        self.s_log = shared.simple_logger(loger_name="crawlspider")
    # hand this spider to its parser
    self.parser.spider = self
    # forward the spider's log handlers to scrapy's underlying spider logger
    for hand in self.s_log.handlers:
        self.logger.logger.addHandler(hand)
    self.s_log.info("[__init__] - Crawlspider logger setup finished.")
def run_crawl(call_parameter, worker_flag=False):
    """Run a crawl for the given parameter, either a path to a specification file or a raw json string."""
    global MLOG

    # seed langdetect for deterministic language detection
    DetectorFactory.seed = 0

    if os.path.exists(call_parameter):
        crawl_specification = load_settings(call_parameter)
    else:
        # assume the parameter is the json specification itself
        crawl_specification = CrawlSpecification()
        crawl_specification.deserialize(call_parameter)

    if not crawl_specification:
        MLOG.error("Crawl settings could not be loaded. Exiting scrapy_wrapper.")
        sys.exit(1)

    scrapy_settings = GenericScrapySettings()
    if crawl_specification.logs:
        os.makedirs(crawl_specification.logs, exist_ok=True)
        # reset the master log for the wrapper to include file logging
        MLOG = shared.simple_logger(loger_name="scrapy_wrapper",
                                    file_path=os.path.join(crawl_specification.logs, "scrapy_wrapper.log"),
                                    file_level=log_level)
        # specifically assign a log file for scrapy
        scrapy_settings.set("LOG_FILE", os.path.join(crawl_specification.logs, "scrapy.log"))

    scrapy_settings.set("ITEM_PIPELINES", crawl_specification.pipelines)

    MLOG.info("Initiating scrapy crawler process")
    process = CrawlerProcess(settings=scrapy_settings)

    start_urls = list(set(crawl_specification.urls))
    allowed_domains = list(map(lambda x: urlparse(x).netloc, start_urls))
    for url in start_urls:
        name = shared.url2filename(url)
        MLOG.info("Creating spider {0}".format(name))
        process.crawl(create_spider(crawl_specification, url, name))

    try:
        process.start()
    except Exception as exc:
        MLOG.exception("{0}: {1}".format(type(exc).__name__, exc))

    # every spider finished, finalize crawl
    for finalizer_path in crawl_specification.finalizers:
        finalizer = shared.get_class(finalizer_path)
        if finalizer:
            # TODO: pass the collected language statistics from the parser
            finalizer(crawl_specification,
                      crawl_specification.finalizers[finalizer_path]).finalize_crawl()

    if worker_flag:
        return True
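# Illustrative only: the shape of a crawl specification json as run_crawl
# consumes it. The field names follow the attributes accessed above (name,
# urls, logs, pipelines, finalizers); the concrete module paths and class
# names in the values are assumptions, not part of this codebase.
EXAMPLE_SPEC_JSON = """
{
    "name": "my_crawl",
    "urls": ["https://example.com"],
    "logs": "result_logs",
    "pipelines": {"some_package.pipelines.SomeItemPipeline": 300},
    "finalizers": {"some_package.finalizers.SomeCrawlFinalizer": {}}
}
"""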
def test_finalize_paragraphs():
    """Correct finalization for a crawl with csv data."""
    logger = shared.simple_logger("RemoteCrawlFinalizer",
                                  file_path=os.path.join("result_logs", "my_crawl", "scrapy.log"))
    finalize_paragraphs("my_crawl", "result_data", "result_logs", logger)
    # placeholder assertion, the expected outcome is not checked yet
    assert True == False
LANGSTATS = pandas.DataFrame(columns=ACCEPTED_LANG)
LANGSTATS.index.name = "url"

DEBUG = False
if len(sys.argv) >= 3 and sys.argv[2] == "DEBUG":
    DEBUG = True
log_level = logging.DEBUG if DEBUG else logging.INFO

VERSION = "0.4.2"

# Prepare logging, before reading the specification only log to console
MLOG = shared.simple_logger(loger_name="scrapy_wrapper")
MLOG.info("Running scrapy_wrapper on version {}".format(VERSION))


def load_settings(settings_path) -> CrawlSpecification:
    """
    Loads the json string in settings_path and deserializes it to a CrawlSpecification object.
    Determines the required behaviour with respect to CrawlSpecification.mode.

    :param settings_path: file path to the json crawl specification file
    :return: parsed and semantically updated CrawlSpecification object
    """
    try:
        with open(settings_path, "r") as settings_file:
            json_str = settings_file.read()
        settings = CrawlSpecification()
        settings.deserialize(json_str)
        return settings
    except Exception as exc:
        # on any io or parse error, report and signal failure to the caller
        MLOG.exception("{0}: {1}".format(type(exc).__name__, exc))
        return None
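# A minimal sketch of a command-line entry point, assuming sys.argv[1] carries
# the crawl specification (a file path or a raw json string) and sys.argv[2]
# optionally carries the DEBUG flag handled at module level above. The guard
# itself is an assumption; the original entry point is not part of this excerpt.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        MLOG.error("Usage: scrapy_wrapper.py <crawl specification> [DEBUG]")
        sys.exit(1)
    run_crawl(sys.argv[1])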