Example no. 1
    def __init__(self, occ, city, state, numthreads=8, js=True):
        self.occ, self.city, self.state = occ, city, state
        self.scraper = YPScraper(occ, city, state)
        self.companies = []
        self.js = js
        self.to_process_queue = UniqueQueue()
        self.results_queue = Queue()
        self.threads = [DomainScrapeThread(self.to_process_queue, self.results_queue)
                        for _ in range(numthreads)]
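
The constructor above wires a pool of DomainScrapeThread workers to two shared
queues: to_process_queue carries (index, url) pairs and results_queue receives
(index, url, page) tuples (see _scrape_comp_call in Example no. 2). The worker
class itself does not appear in these examples; the following is only a minimal
sketch of what such a consumer thread might look like, assuming plain
urllib-based fetching (the real DomainScrapeThread presumably drives a
JavaScript-capable browser when Manager's js flag is set).

import threading
from queue import Empty
from urllib.request import urlopen

class DomainScrapeThread(threading.Thread):
    """Hypothetical worker: pulls (idx, url) pairs off the shared input queue,
    fetches each page, and pushes (idx, url, page) onto the results queue."""

    def __init__(self, to_process_queue, results_queue):
        super().__init__()
        self.to_process_queue = to_process_queue
        self.results_queue = results_queue

    def run(self):
        while True:
            try:
                idx, url = self.to_process_queue.get_nowait()
            except Empty:
                return  # queue drained, let the thread finish
            page = urlopen(url).read().decode("utf-8", errors="replace")
            self.results_queue.put((idx, url, page))

    def teardown(self):
        # placeholder: the real class would release any browser/driver
        # resources it holds here
        pass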
Example no. 2
import logging
import shelve
from pprint import pprint
from queue import Queue

# YPScraper, DomainScrapeThread, UniqueQueue and pipeline come from
# project-local modules that are not shown in this example.


class Manager(object):
    """
        Manages the creation and delegation of DomainScrapeThreads for companies,
        as well as the initial YP scraping. This is the only interface a user
        should need to use directly.
        A previously saved instance can be restored with load_state(load_file).
    """
    def __init__(self, occ, city, state, numthreads=8, js=True):
        self.occ, self.city, self.state = occ, city, state
        self.scraper = YPScraper(occ, city, state)
        self.companies = []
        self.js = js
        self.to_process_queue = UniqueQueue()
        self.results_queue = Queue()
        self.threads = [DomainScrapeThread(self.to_process_queue, self.results_queue)
                        for _ in range(numthreads)]
        
    def __repr__(self):
        return "<Manager Object [%s, %s, %s]>" % (self.occ, self.city, self.state)
        
    def __str__(self):
        s = ''
        for c in self.companies:
            s += str(c)
        return s
    
    def __len__(self):
        return len(self.companies)
        
    def scrape_companies(self):
        """
            proxy call to ensure all threads get properly torn down to avoid
            extra running processes
        """
        self._scrape_comp_call()
        for i, t in enumerate(self.threads):
            self.threads[i] = None
            t.teardown()
        
    def _scrape_comp_call(self):
        """
            main 'manager' thread.. moniters all other threads and if one is 
            inactive, assigns a new one with the next url in tpProcess. 
            returns when all threads are inactive and toProcess is empty
        """
        self._init_with_yp()
        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()
        for idx, url, page in self.results_queue.queue:
            self.companies[idx].add_page(url, page)

    def _init_with_yp(self):
        """
            runs the YP search and populates self.urls with the base urls to process
            if they exist
        """
        self.companies = self.scraper.scrape()
        for i, c in enumerate(self.companies):
            if c.url:
                self.to_process_queue.put((i, c.url))
        
    def view_all_pages(self):
        """
            debugging method
        """
        for c in self.companies:
            if c.url:
                pprint(c.name)
                input()
                for p in c.pages:
                    pprint(p)
                    input()
                    
    def save_state(self, dump_file='application/tmp/state.txt'):
        """
            :type dump_file the file to save this instance to
            dump the entire manager instance to a pickle file
        """
        logging.info("dumping manager (self) to "+dump_file)
        d = shelve.open(dump_file)
        d['manager'] = self
        d.close()
            
    @staticmethod
    def load_state(load_file='application/tmp/state.txt'):
        """
            return a Manager instance from a previously saved execution
            :param load_file: the file to load from
        """
        logging.info("retrieving manager from "+load_file)
        d = shelve.open(load_file)
        manager = d['manager']
        d.close()
        return manager

    def define_expected(self):
        """
            open the base url for each company and accept user defined outcome
            used to build a training set for a scikit-learn pipeline
        """
        pass

    def fit_pipeline(self):
        """
            fit the scikit-learn pipeline (not defined in this example) on the
            scraped companies
        """
        pipeline.fit(self.companies)
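
Taken together, a typical session with the Manager class from Example no. 2
might look like the sketch below; the module path, occupation, city and state
are placeholder values, and YPScraper, DomainScrapeThread and UniqueQueue must
be importable for it to actually run.

from application.manager import Manager   # hypothetical module path

manager = Manager("plumber", "Denver", "CO", numthreads=4)
manager.scrape_companies()                 # YP search, then threaded domain scraping
print(len(manager))                        # number of companies found
manager.save_state("application/tmp/state.txt")

# later, possibly in a fresh process
restored = Manager.load_state("application/tmp/state.txt")
restored.view_all_pages()                  # interactive debugging walk-through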