Example no. 1
    def start_project(self):
        """ Starts crawl for the current project, crawling its URL  """

        if objects.eventmgr.raise_event('before_start_project',
                                        objects.queuemgr.baseurl,
                                        None) is False:
            return

        # crawls through a site using http/ftp/https protocols
        if objects.config.project:
            info('*** Log Started ***\n')
            if not objects.config.resuming:
                info('Starting project', objects.config.project, '...')
            else:
                info('Re-starting project', objects.config.project, '...')

            # Write the project file
            # if not objects.config.fromprojfile:
            #    projector = utils.HarvestManProjectManager()
            #    projector.write_project()

            # Write the project database record
            HarvestManDbManager.add_project_record()

        if not objects.config.resuming:
            info('Starting download of URL', objects.config.url, '...')

        # Reset objects keeping project-specific states now
        # Reset and re-initialize datamgr
        objects.datamgr.clean_up()
        objects.datamgr.initialize()
        objects.rulesmgr.reset()

        # Read the project cache file, if any
        if objects.config.pagecache:
            objects.datamgr.read_project_cache()

        if not objects.config.resuming:
            # Configure tracker manager for this project
            if objects.queuemgr.configure():
                # start the project
                objects.queuemgr.crawl()
        else:
            objects.queuemgr.restart()

        objects.eventmgr.raise_event('post_start_project',
                                     objects.queuemgr.baseurl, None)
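
The before_start_project / post_start_project calls above follow a veto-style event pattern: any handler bound to the "before" event can return False to abort the whole operation. Below is a minimal, self-contained sketch of that idea; the EventManager class and its bind method are hypothetical stand-ins, not HarvestMan's actual implementation.

# Minimal sketch of a veto-style event manager; an assumed stand-in
# for objects.eventmgr, not HarvestMan's real implementation.
class EventManager(object):

    def __init__(self):
        self.handlers = {}

    def bind(self, event, handler):
        # Register a callback for the named event (hypothetical API)
        self.handlers.setdefault(event, []).append(handler)

    def raise_event(self, event, *args):
        # Call every handler bound to the event; a handler returning
        # False vetoes the action, which the caller treats as "abort".
        for handler in self.handlers.get(event, []):
            if handler(*args) is False:
                return False
        return True

def refuse_ftp(url, extra):
    # Example handler: veto the crawl when the base URL uses ftp://
    return not str(url).startswith('ftp://')

eventmgr = EventManager()
eventmgr.bind('before_start_project', refuse_ftp)

# Mirrors the guard at the top of start_project: a False return aborts.
if eventmgr.raise_event('before_start_project', 'ftp://example.com/', None) is False:
    print('crawl vetoed by a before_start_project handler')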
Example no. 2
    def print_project_info(self, statsd):
        """ Print project information """

        nlinks = statsd['links']
        nservers = statsd['extservers'] + 1
        nfiles = statsd['files']
        ndirs = statsd['extdirs'] + 1
        numfailed = statsd['failed']
        nretried = statsd['retries']
        fatal = statsd['fatal']
        fetchtime = statsd['fetchtime']
        nfilesincache = statsd['filesincache']
        nfilesinrepos = statsd['filesinrepos']
        nbroken = statsd['broken']

        # Bug fix, download time to be calculated
        # precisely...

        dnldtime = fetchtime

        strings = [('link', nlinks), ('server', nservers), ('file', nfiles),
                   ('file', nfilesinrepos), ('directory', ndirs),
                   ('link', numfailed), ('link', fatal), ('link', nretried),
                   ('file', nfilesincache), ('link', nbroken)]

        fns = list(map(plural, strings))
        info(' ')

        bytes = self.bytes
        savedbytes = self.savedbytes

        ratespec = 'KB/sec'
        if bytes and dnldtime:
            bps = float(bytes / dnldtime) / 1024.0
            if bps < 1.0:
                bps *= 1024.0  # re-express sub-1 KB/sec rates in bytes/sec
                ratespec = 'bytes/sec'
            bps = '%.2f' % bps
        else:
            bps = '0.0'

        fetchtime = float((math.modf(fetchtime * 100.0)[1]) / 100.0)

        if self._cfg.simulate:
            info("HarvestMan crawl simulation of", self._cfg.project,
                 "completed in", fetchtime, "seconds.")
        else:
            info('HarvestMan mirror', self._cfg.project, 'completed in',
                 fetchtime, 'seconds.')

        if nlinks:
            info(nlinks, fns[0], 'scanned in', nservers, fns[1], '.')
        else:
            info('No links parsed.')
        if nfiles:
            info(nfiles, fns[2], 'written.')
        else:
            info('No files written.')

        if nfilesinrepos:
            info(nfilesinrepos, fns[3], wasOrWere(nfilesinrepos),
                 'already up-to-date in the repository for this project and',
                 wasOrWere(nfilesinrepos), 'not updated.')
        if nfilesincache:
            info(nfilesincache, fns[8], wasOrWere(nfilesincache),
                 'updated from the project cache.')

        if nbroken:
            info(nbroken, fns[9], wasOrWere(nbroken), 'broken.')
        if fatal:
            info(fatal, fns[5], 'had fatal errors and failed to download.')
        if bytes:
            info(bytes, 'bytes received at the rate of', bps, ratespec, '.')
        if savedbytes:
            info(savedbytes, 'bytes were written to disk.\n')

        info('*** Log Completed ***\n')

        # Get the current timestamp (computed here but not used further
        # in this method)
        s = time.localtime()
        tz = time.tzname[0]

        fmt = '%b %d %Y ' + tz + ' %H:%M:%S'
        tstamp = time.strftime(fmt, s)

        if not self._cfg.simulate:
            # Write statistics to the crawl database
            HarvestManDbManager.add_stats_record(statsd)
            logconsole('Done.')
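
print_project_info relies on two small report helpers, plural and wasOrWere, that are not shown in these excerpts. The following is a plausible reconstruction based only on how they are called above; the real HarvestMan utilities may differ in detail.

# Hypothetical reconstructions of the report helpers; only their call
# sites above are known, so treat these as assumptions.

def plural(word_count):
    # Takes a (word, count) pair and pluralizes the word when count != 1,
    # e.g. ('link', 2) -> 'links', ('directory', 3) -> 'directories'
    word, count = word_count
    if count == 1:
        return word
    if word.endswith('y'):
        return word[:-1] + 'ies'
    return word + 's'

def wasOrWere(count):
    # Picks the verb form agreeing with the count
    return 'was' if count == 1 else 'were'

strings = [('link', 5), ('server', 1), ('file', 0)]
# list(...) matters on Python 3, where map() returns a lazy iterator
# that cannot be indexed the way fns[0], fns[1], ... are used above.
fns = list(map(plural, strings))
print(fns)              # ['links', 'server', 'files']
print(wasOrWere(1))     # 'was'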
Example no. 3
                logconsole('Creating "sessions" sub-directory in %s...' % harvestman_dir)
                os.makedirs(harvestman_sessions_dir)                        
                logconsole('Done.')
            except (OSError, IOError) as e:
                logconsole(e)

        if not os.path.isdir(harvestman_db_dir):
            try:
                logconsole('Creating "db" sub-directory in %s...' % harvestman_dir)
                os.makedirs(harvestman_db_dir)                        
                logconsole('Done.')
            except (OSError, IOError) as e:
                logconsole(e)

            try:
                HarvestManDbManager.create_user_database()
            except Exception as e:
                logconsole(e)

    def init(self):
        """ Initialize the crawler by creating and registering common
        objects and creating the user folders """

        if objects.config.USER_AGENT == '':
            objects.config.USER_AGENT = self.__class__.USER_AGENT
            
        self.register_common_objects()
        self.create_user_directories()

        # Calculate bandwidth and set max file size
        # bw = self.calculate_bandwidth()
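
The directory bootstrap above repeats one pattern per folder: check with os.path.isdir, create with os.makedirs, log progress, and swallow filesystem errors so the crawler can continue. A compact, self-contained sketch of the same pattern follows; the directory names are illustrative, and print stands in for logconsole.

import os

# Self-contained sketch of the create-if-missing pattern used by
# create_user_directories; paths and names here are illustrative only.
harvestman_dir = os.path.join(os.path.expanduser('~'), '.harvestman')

for sub in ('sessions', 'db'):
    subdir = os.path.join(harvestman_dir, sub)
    if not os.path.isdir(subdir):
        try:
            print('Creating "%s" sub-directory in %s...' % (sub, harvestman_dir))
            os.makedirs(subdir)  # also creates missing parent directories
            print('Done.')
        except (OSError, IOError) as e:
            # Report and continue, as the original code does: a failure
            # here is not fatal to the crawl
            print(e)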