def __init__(self, to_proc_queue, res_queue, js=True):
    """Worker thread that pulls scrape jobs and publishes results.

    Args:
        to_proc_queue: queue of work items still to be processed.
        res_queue: queue that finished results are placed onto.
        js: whether the fetcher backend should be JavaScript-capable;
            forwarded to getFetcher().
    """
    Thread.__init__(self)
    # Store the flag: scrape() builds its own fetcher from self.js, so it
    # must be saved here or scrape() raises AttributeError.
    self.js = js
    self.fetcher = getFetcher(js=js)
    self.to_proc_queue = to_proc_queue
    self.res_queue = res_queue
def scrape(self):
    """Walk every page of search results and collect the parsed companies.

    Requests successive result pages until a page yields no companies,
    then returns the accumulated list of company objects.
    """
    url = "http://www.yellowpages.com/search?search_terms={0}&geo_location_terms={1} {2}&page={3}"
    fetcher = getFetcher(js=self.js)
    companies = []
    page = 1
    while True:
        markup = fetcher.get(url.format(self.occ, self.city, self.state, page))
        parsed = self._parse_yp_response(bs(markup, 'html.parser'))
        # An empty page means we have run past the last page of results.
        if not parsed:
            break
        companies.extend(parsed)
        page += 1
    fetcher.teardown()
    return companies