def run(self, args, opts):
    """Collect URLs missing from the crawl output and write them to CSV.

    args: [jira_id, tracking_filename, data_file, final_method_name].
    Writes 'missing_url.csv' next to the tracking file when anything
    is missing; otherwise reports that nothing was captured.
    """
    if len(args) != 4:
        raise UsageError()

    jira_id, tracking_filename, data_file, final_method_name = args

    tracking_path = os.path.join(
        self.settings.get('REQUEST_TRACKING_PATH'), jira_id, tracking_filename)
    if not FileUtils.isExist(tracking_path):
        print("tracking path is not exists:{0}".format(tracking_path))
        return

    data_path = os.path.join(
        self.settings.get('STORAGE_DIR'), jira_id, data_file)
    if not FileUtils.isExist(data_path):
        print("data_path path is not exists:{0}".format(data_path))
        return

    final_df = self.__collectMissingUrl(data_path, tracking_path,
                                        final_method_name)
    if final_df.empty:
        print("No Missing URL Captured")
        return

    out_path = os.path.join(
        self.settings.get('REQUEST_TRACKING_PATH'), jira_id, 'missing_url.csv')
    final_df.to_csv(out_path, sep='|', encoding='utf-8', index=False)
def spider_closed(self, spider):
    """Remove the spider's Selenium download folder when the crawl ends."""
    logger.info("started to Clean the Download Folder in spider %s",
                spider.name)
    base_dir = self.settings.get('SELENIUM_DOWNLOAD_PATH', '/tmp')
    download_path = os.path.join(base_dir, spider.name)
    if FileUtils.isExist(download_path):
        FileUtils.deletePath(download_path)
def run(self, args, opts):
    """Delete the pause/resume job directory for a single spider.

    args: [spidername]. No-op (silent) when the directory does not exist.
    """
    if len(args) != 1:
        raise UsageError()

    spidername = args[0]
    if spidername not in self.crawler_process.spider_loader.list():
        print("Spider not available: {}".format(spidername))
        return

    jobDir = os.path.join(self.settings.get('JOB_DIR_PAUSE_RESUME'),
                          spidername)
    if not FileUtils.isExist(jobDir):
        return
    FileUtils.deletePath(jobDir)
    print("Job Directory is deleted- {}".format(jobDir))
def run(self, args, opts):
    """Summarise a request-tracking file: count responses per HTTP status.

    args: [jira_id, tracking_filename]. Writes 'analysis.csv' next to the
    tracking file (pipe-separated) and prints the per-status counts.
    """
    if len(args) != 2:
        raise UsageError()
    jira_id = args[0]
    tracking_filename = args[1]
    tracking_path = os.path.join(
        self.settings.get('REQUEST_TRACKING_PATH'), jira_id, tracking_filename)
    if not FileUtils.isExist(tracking_path):
        print("tracking path is not exists:{0}".format(tracking_path))
        return

    class AnalysisTracking:
        """Join request rows with their responses and count statuses."""

        def __init__(self, tracking_path, out_path):
            self.tracking_path = tracking_path
            self.out_path = out_path

        def analysis(self):
            df = pd.read_csv(self.tracking_path, sep='|')
            # FIX: assign back instead of inplace=True on a column
            # selection — the inplace form is deprecated under pandas
            # Copy-on-Write and may silently act on a temporary.
            df["status"] = df["status"].fillna(0.0)
            request = df.loc[df['type'] == 'request'] \
                .reset_index(drop=True)[['unique_id']]
            response = df.loc[df['type'] == 'response'] \
                .reset_index(drop=True)[['unique_id', 'status']]
            # Left-join so requests that never received a response survive
            # with NaN status, which the next fill maps to 0.0.
            df = pd.merge(request, response, how='left', on=['unique_id'])
            df["status"] = df["status"].fillna(0.0)
            res = df.groupby(['status']).size().reset_index(name='counts')
            res.to_csv(self.out_path, sep='|', encoding='utf-8', index=False)
            print(res)

    out_path = os.path.join(
        self.settings.get('REQUEST_TRACKING_PATH'), jira_id, 'analysis.csv')
    class_analysis = AnalysisTracking(tracking_path, out_path)
    class_analysis.analysis()
def getSeleniumDriver(self, drivertype=None, executable_path=None,
                      run_headless=False, load_images=True,
                      proxy_string=None, **kwargs):
    """Build a Selenium driver with a per-spider download directory.

    Creates SELENIUM_DOWNLOAD_PATH/<spider name> if missing and passes it
    to the driver via kwargs['download_path'].

    Raises:
        SeleniumExtensionsException: when the underlying driver cannot
            be constructed; the cause is chained.
    """
    download_path = os.path.join(
        self.settings.get('SELENIUM_DOWNLOAD_PATH', '/tmp'), self.name)
    if not FileUtils.isExist(download_path):
        FileUtils.createDir(download_path)
    kwargs['download_path'] = download_path
    self.driverObj = Driver(drivertype)
    driver = None
    try:
        driver = self.driverObj.getDriver(executable_path, run_headless,
                                          load_images, proxy_string,
                                          **kwargs)
    except Exception as e:
        # BUG FIX: the original passed the literal "...-{}" plus e as a
        # second constructor argument, so the placeholder was never
        # filled. Format the message and chain the original cause.
        raise SeleniumExtensionsException(
            "problem to get selenium driver-{}".format(e)) from e
    return driver
def spider_opened(self, spider):
    """Ensure a temp working folder exists and publish it via settings."""
    needs_new_folder = not self.path or not FileUtils.isExist(self.path)
    if needs_new_folder:
        self.path = FileUtils.createTempFolder()
    self._settings.overrides['TEMP_FILE_PATH'] = self.path
def spider_closed(self, spider):
    """Drop the temp folder created for this crawl, if any."""
    if not self.path:
        return
    if FileUtils.isExist(self.path):
        FileUtils.deletePath(self.path)
def __checkAppendMode(self, name):
    """Enable append mode when a prior requests.seen file has content."""
    if not self.job_dir:
        return
    seenPath = os.path.join(self.job_dir, name, 'requests.seen')
    # > 10 bytes: a trivially small seen-file is treated as no prior state.
    if FileUtils.isExist(seenPath) and os.stat(seenPath).st_size > 10:
        self.appendMode = True
def _genspider(self, jiraid, module, name, domain, requirement_path, url,
               template_name, template_file, opts):
    """Generate a spider module from templates under a per-JIRA directory.

    Renders the spider file (and, when configured, an items.py) from
    templates, creates __init__.py, and copies the requirement document
    alongside. On failure, cleans up only a directory this call created.
    """
    headers = self.__headers(requirement_path)
    val_header = headers.get('top_header')
    # These columns are rendered separately in the template; drop them from
    # the generic header map. FIX: pop with a default so a requirement
    # sheet missing one of them does not abort generation with KeyError.
    for k in ['sourceName', 'url', 'ingestion_timestamp']:
        val_header.pop(k, None)
    tvars = {
        'project_name': self.settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
        'module': module,
        'jiraid': jiraid,
        'name': name,
        'start_url': url,
        'username': Utils.getUsername(),
        'datetime': Utils.getCurrentDateTimeStr(),
        'domain': domain,
        'val_header': val_header,
        # Rendered verbatim into the template as a call expression.
        'ingestion_timestamp': 'Utils.getingestion_timestamp()',
        'default_val': {'sourceName': name, 'url': url},
        'null_header': None,
        'feed_expo': None,
        'top_header': None,
        'classname': '%sSpider' % ''.join(
            s.capitalize() for s in name.split('_')),
    }
    # FIX: pre-bind so the except-branch cleanup cannot hit a NameError
    # when the failure happens before spiders_dir is assigned.
    spiders_dir = None
    created_dir = False
    try:
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = os.path.join(
                abspath(dirname(spiders_module.__file__)), jiraid)
            if os.path.exists(spiders_dir):
                print("Spider %r jiraID already exists in module:" % jiraid)
                return
            os.mkdir(spiders_dir)
            created_dir = True
        else:
            spiders_module = None
            spiders_dir = "."
        if opts.custom:
            import pprint
            pp = pprint.PrettyPrinter(indent=25, width=250)
            tvars['null_header'] = headers.get('null_header')
            tvars['feed_expo'] = pp.pformat(headers.get('feed_expo'))
            tvars['top_header'] = pp.pformat(headers.get('top_header'))
        spider_file = "%s.py" % join(spiders_dir, name)
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        if self.settings['CUSTOM_TEMPLATES_DIR']:
            _template_file = join(self.settings['CUSTOM_TEMPLATES_DIR'],
                                  'items.py.tmpl')
            item_file = "%s.py" % join(spiders_dir, 'items')
            shutil.copyfile(_template_file, item_file)
            render_templatefile(item_file, **tvars)
        # Touch an empty __init__.py; `with` closes the handle
        # deterministically (the original leaked the close to GC timing).
        __init_file = "%s.py" % join(spiders_dir, '__init__')
        with open(__init_file, 'a'):
            pass
        # copy the requirement document in spider folder
        shutil.copyfile(requirement_path,
                        join(spiders_dir, os.path.basename(requirement_path)))
        print("Created spider %r using template %r " % (name, template_name),
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print("in module:\n %s.%s" % (spiders_module.__name__, module))
    except Exception as e:
        # FIX: clean up only a directory this call actually created. The
        # original deleted spiders_dir unconditionally, which could wipe
        # the current working directory when spiders_dir == ".".
        if created_dir and spiders_dir:
            FileUtils.deletePath(spiders_dir)
        print(e)