    def start_project(self):
        """ Starts crawl for the current project, crawling its URL """

        if objects.eventmgr.raise_event('before_start_project',
                                        objects.queuemgr.baseurl, None) == False:
            return

        # Crawls through a site using http/ftp/https protocols
        if objects.config.project:
            info('*** Log Started ***\n')
            if not objects.config.resuming:
                info('Starting project', objects.config.project, '...')
            else:
                info('Re-starting project', objects.config.project, '...')

        # Write the project file
        # if not objects.config.fromprojfile:
        #     projector = utils.HarvestManProjectManager()
        #     projector.write_project()

        # Write the project database record
        HarvestManDbManager.add_project_record()

        if not objects.config.resuming:
            info('Starting download of url', objects.config.url, '...')

        # Reset objects keeping project-specific states now
        # Reset and re-initialize datamgr
        objects.datamgr.clean_up()
        objects.datamgr.initialize()
        objects.rulesmgr.reset()

        # Read the project cache file, if any
        if objects.config.pagecache:
            objects.datamgr.read_project_cache()

        if not objects.config.resuming:
            # Configure tracker manager for this project
            if objects.queuemgr.configure():
                # Start the project
                objects.queuemgr.crawl()
        else:
            objects.queuemgr.restart()

        objects.eventmgr.raise_event('post_start_project',
                                     objects.queuemgr.baseurl, None)
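# start_project() above aborts the crawl if any 'before_start_project'
# handler returns False. The sketch below illustrates that veto pattern;
# the EventManager class and its method names are illustrative
# assumptions, not HarvestMan's actual eventmgr API.

class EventManager(object):
    """ Toy event manager supporting veto-able events """

    def __init__(self):
        # Maps event name -> list of handler callables
        self.handlers = {}

    def bind(self, event, handler):
        self.handlers.setdefault(event, []).append(handler)

    def raise_event(self, event, url, document=None, **kwargs):
        # Return False as soon as any handler vetoes the event;
        # otherwise return None (no veto).
        for handler in self.handlers.get(event, []):
            if handler(event, url, document, **kwargs) == False:
                return False


def skip_example_host(event, url, document, **kwargs):
    # Example handler: veto crawls of a particular (hypothetical) host.
    if url and 'example.org' in str(url):
        return False


eventmgr = EventManager()
eventmgr.bind('before_start_project', skip_example_host)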
    def print_project_info(self, statsd):
        """ Print project information """

        nlinks = statsd['links']
        nservers = statsd['extservers'] + 1
        nfiles = statsd['files']
        ndirs = statsd['extdirs'] + 1
        numfailed = statsd['failed']
        nretried = statsd['retries']
        fatal = statsd['fatal']
        fetchtime = statsd['fetchtime']
        nfilesincache = statsd['filesincache']
        nfilesinrepos = statsd['filesinrepos']
        nbroken = statsd['broken']

        # Bug fix, download time to be calculated
        # precisely...
        dnldtime = fetchtime

        strings = [('link', nlinks), ('server', nservers),
                   ('file', nfiles), ('file', nfilesinrepos),
                   ('directory', ndirs), ('link', numfailed),
                   ('link', fatal), ('link', nretried),
                   ('file', nfilesincache), ('link', nbroken)]

        fns = map(plural, strings)
        info(' ')

        bytes = self.bytes
        savedbytes = self.savedbytes

        ratespec = 'KB/sec'
        if bytes and dnldtime:
            bps = float(bytes / dnldtime) / 1024.0
            if bps < 1.0:
                bps *= 1000.0
                ratespec = 'bytes/sec'
            bps = '%.2f' % bps
        else:
            bps = '0.0'

        fetchtime = float((math.modf(fetchtime * 100.0)[1]) / 100.0)

        if self._cfg.simulate:
            info("HarvestMan crawl simulation of", self._cfg.project,
                 "completed in", fetchtime, "seconds.")
        else:
            info('HarvestMan mirror', self._cfg.project,
                 'completed in', fetchtime, 'seconds.')

        if nlinks:
            info(nlinks, fns[0], 'scanned in', nservers, fns[1], '.')
        else:
            info('No links parsed.')

        if nfiles:
            info(nfiles, fns[2], 'written.')
        else:
            info('No file written.')

        if nfilesinrepos:
            info(nfilesinrepos, fns[3], wasOrWere(nfilesinrepos),
                 'already up-to-date in the repository for this project and',
                 wasOrWere(nfilesinrepos), 'not updated.')

        if nfilesincache:
            info(nfilesincache, fns[8], wasOrWere(nfilesincache),
                 'updated from the project cache.')

        if nbroken:
            info(nbroken, fns[9], wasOrWere(nbroken), 'broken.')

        if fatal:
            info(fatal, fns[5], 'had fatal errors and failed to download.')

        if bytes:
            info(bytes, ' bytes received at the rate of', bps, ratespec, '.')

        if savedbytes:
            info(savedbytes, ' bytes were written to disk.\n')

        info('*** Log Completed ***\n')

        # Get current time stamp
        s = time.localtime()
        tz = (time.tzname)[0]
        format = '%b %d %Y ' + tz + ' %H:%M:%S'
        tstamp = time.strftime(format, s)

        if not self._cfg.simulate:
            # Write statistics to the crawl database
            HarvestManDbManager.add_stats_record(statsd)
            logconsole('Done.')
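# print_project_info() relies on two small helpers, plural() and
# wasOrWere(), from HarvestMan's common utilities. The sketches below
# only mirror how they are called above (plural() takes a ('word', count)
# tuple, wasOrWere() takes a count) and are assumptions; the real
# implementations may differ, for instance for irregular plurals such
# as 'directory'.

def plural(word_count):
    """ Given a ('word', count) tuple, return the word with a trailing
    's' appended when the count is not exactly one """

    word, count = word_count
    if count == 1:
        return word
    return word + 's'


def wasOrWere(count):
    """ Return the verb form ('was' or 'were') matching the count """

    if count == 1:
        return 'was'
    return 'were'

# With these helpers, fns = map(plural, strings) yields pluralized nouns
# in the same order as the (word, count) pairs in strings.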
                logconsole('Creating "sessions" sub-directory in %s...' % harvestman_dir)
                os.makedirs(harvestman_sessions_dir)
                logconsole('Done.')
            except (OSError, IOError), e:
                logconsole(e)

        if not os.path.isdir(harvestman_db_dir):
            try:
                logconsole('Creating "db" sub-directory in %s...' % harvestman_dir)
                os.makedirs(harvestman_db_dir)
                logconsole('Done.')
            except (OSError, IOError), e:
                logconsole(e)

        try:
            HarvestManDbManager.create_user_database()
        except Exception, e:
            logconsole(e)

    def init(self):
        """ Initialize the crawler by creating and registering the common
        objects and creating the user folders """

        if objects.config.USER_AGENT == '':
            objects.config.USER_AGENT = self.__class__.USER_AGENT

        self.register_common_objects()
        self.create_user_directories()

        # Calculate bandwidth and set max file size
        # bw = self.calculate_bandwidth()
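# A minimal sketch of how the directory names used in
# create_user_directories() could be set up, assuming a '.harvestman'
# folder under the user's home directory. These path names are an
# assumption for illustration; the exact locations HarvestMan uses may
# differ.

import os

harvestman_dir = os.path.join(os.path.expanduser('~'), '.harvestman')
harvestman_sessions_dir = os.path.join(harvestman_dir, 'sessions')
harvestman_db_dir = os.path.join(harvestman_dir, 'db')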