Ejemplo n.º 1
0
    def start_project(self):
        """Begin (or resume) crawling the URL of the current project.

        The 'before_start_project' event is raised first; if its result
        compares equal to False the method returns without doing anything
        else. Otherwise the log banner is written and the project database
        record is added (only when a project name is configured), the data
        and rules managers are reset, the page cache is read if enabled,
        and the queue manager either starts a fresh crawl or restarts a
        resumed one. The 'post_start_project' event is raised at the end.
        """

        outcome = objects.eventmgr.raise_event(
            'before_start_project', objects.queuemgr.baseurl, None)
        # Equality (not identity/truthiness) on purpose: a None result
        # does not stop the project from starting.
        if outcome == False:  # noqa: E712
            return

        # Crawls through a site using http/ftp/https protocols
        if objects.config.project:
            info('*** Log Started ***\n')
            banner = ('Re-starting project' if objects.config.resuming
                      else 'Starting project')
            info(banner, objects.config.project, '...')

            # NOTE: writing the project file at this point is disabled;
            # only the project database record is written.
            HarvestManDbManager.add_project_record()

        # The download message is only logged for a fresh (non-resumed) run
        if not objects.config.resuming:
            info('Starting download of url', objects.config.url, '...')

        # Drop project-specific state left in the managers, then
        # re-initialize the data manager for this run
        objects.datamgr.clean_up()
        objects.datamgr.initialize()
        objects.rulesmgr.reset()

        # Load the project cache file when page caching is switched on
        if objects.config.pagecache:
            objects.datamgr.read_project_cache()

        if objects.config.resuming:
            objects.queuemgr.restart()
        elif objects.queuemgr.configure():
            # Queue manager configured successfully for this project,
            # so the crawl can start
            objects.queuemgr.crawl()

        objects.eventmgr.raise_event(
            'post_start_project', objects.queuemgr.baseurl, None)
Ejemplo n.º 2
0
    def start_project(self):
        """Start the crawl for the current project, or restart a resumed one.

        Steps, in order:
          1. Raise 'before_start_project'; return immediately when the
             result compares equal to False.
          2. If a project name is configured, log the start banner and
             add the project database record.
          3. Reset the data manager and rules manager.
          4. Read the project cache if page caching is enabled.
          5. Configure the queue manager and crawl, or restart it when
             resuming.
          6. Raise 'post_start_project'.
        """

        event_result = objects.eventmgr.raise_event(
            'before_start_project',
            objects.queuemgr.baseurl,
            None,
        )
        # Kept as an equality comparison: only a False-equal result
        # aborts, a None result falls through and the project starts.
        if event_result == False:  # noqa: E712
            return

        # Crawls through a site using http/ftp/https protocols
        if objects.config.project:
            info('*** Log Started ***\n')
            if objects.config.resuming:
                info('Re-starting project', objects.config.project, '...')
            else:
                info('Starting project', objects.config.project, '...')

            # NOTE: the project file is not written here (that step is
            # disabled); the project database record is still added.
            HarvestManDbManager.add_project_record()

        # Nothing is logged about the download when resuming
        if not objects.config.resuming:
            info('Starting download of url', objects.config.url, '...')

        # Reset objects holding project-specific state; the data manager
        # is cleaned up and then initialized again
        objects.datamgr.clean_up()
        objects.datamgr.initialize()
        objects.rulesmgr.reset()

        # Read the project cache file, if page caching is enabled
        if objects.config.pagecache:
            objects.datamgr.read_project_cache()

        if objects.config.resuming:
            objects.queuemgr.restart()
        else:
            # Fresh run: crawl only if the queue manager reports a
            # successful configuration
            if objects.queuemgr.configure():
                objects.queuemgr.crawl()

        objects.eventmgr.raise_event(
            'post_start_project',
            objects.queuemgr.baseurl,
            None,
        )