def parse(self):
    raw_entities = []
    for placemark in self.xpath(self.root, './/ns:Placemark'):
        raw_entities.append(self.placemark_to_entity(placemark))
    entities = utils.parallelize(self.augment_entity, [(e,) for e in raw_entities])
    name = tostring(self.xpath(self.root, 'ns:Document/ns:name')[0])
    # TODO: Parse the latlngs into a Bounds object for the trip plan.
    # Right now this is happening in the javascript as a hack.
    return data.TripPlan(name=name, entities=entities)
def scrape_entities_from_page_source(url, page_source):
    if scrape_logic.is_url_handleable(url):
        return scrape_entities_from_url(url, page_source)
    else:
        urls = extract_urls_from_page_source(url, page_source)
        handleable_urls = set(u for u in urls if scrape_logic.is_url_handleable(u, allow_expansion=False))
        entity_lists = utils.parallelize(scrape_entities_from_url,
            [(u, None, True, None, False) for u in handleable_urls])
        return utils.flatten(entity_lists)
def augment_trip_plan(raw_trip_plan):
    location_latlng = raw_trip_plan.location_latlng.to_json_obj() if raw_trip_plan.location_latlng else None
    entities = utils.parallelize(
        utils.retryable(augment_entity, retries=3),
        [(e, location_latlng) for e in raw_trip_plan.entities])
    trip_plan = raw_trip_plan.copy()
    for i, entity in enumerate(entities):
        # If there's an RPC error, some of these may come back as None.
        # So as a fallback, make sure we at least save the incoming entity.
        # TODO: Return an error message here so the user can be notified
        # that not all entities were saved.
        if not entity:
            entities[i] = raw_trip_plan.entities[i]
    trip_plan.entities = entities
    return trip_plan
def build_scrapers(url, client_page_source=None, force_fetch_page=False,
        allow_expansion=True, for_guide=False):
    page_source_tree = html_parsing.parse_tree_from_string(client_page_source) if client_page_source else None
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)
    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3), [(req,) for req in reqs])
            for url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(url, tree, for_guide)
                scraped_pages.append(scraper)
            break
    return scraped_pages
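# augment_trip_plan() and build_scrapers() above wrap their callables in
# utils.retryable(fn, retries), which is not shown in this listing. A minimal
# sketch of such a wrapper, assuming it retries the call and yields None on
# persistent failure (matching the None-handling in those callers), might look
# like this; the name retryable_sketch is hypothetical, not the project's API.
def retryable_sketch(fn, retries=3):
    def wrapper(*args, **kwargs):
        for attempt in range(retries):
            try:
                return fn(*args, **kwargs)
            except Exception:
                # Give up quietly after the last attempt; callers treat a
                # None result as "this item failed" and fall back.
                if attempt == retries - 1:
                    return None
    return wrapper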
def stopreadingCollaborator(self):
    print("=== Stopping (readers) %s collaborators ===" % self.__collab_type)
    utils.parallelize(utils.stopreading_collab, self.__collab_type, self.__addresses)
def startCollaborator(self):
    print("=== Starting %s collaborators ===" % self.__collab_type)
    utils.parallelize(utils.start_collab, self.__collab_type, self.__addresses)
def createCollaborator(self):
    print("=== Creating %s collaborators ===" % self.__collab_type)
    utils.parallelize(utils.create_collab, self.__collab_type, self.__addresses)
def make_photo_urls(photo_objs):
    return utils.parallelize(resolve_photo_url,
        [(obj['photo_reference'], obj['width'], obj['height']) for obj in photo_objs])
def prepare():
    # TODO: equally distribute larger languages
    parallelize(_prepare, LANGUAGES, n_workers=4)
def reencode_dir(input_dir, num_pools):
    raw_movie_files = glob('%s/*.mp4' % input_dir)
    mkdir_p('%s/%s' % (input_dir, DONE_DIR))
    mkdir_p('%s/%s' % (input_dir, OUTPUT_DIR))
    parallelize(reencode_single_file, raw_movie_files, num_pools=num_pools)
def refresh(self):
    try:
        if self.ready:
            status, dlq, ulq, shd = r = utils.parallelize(
                self.client.get_status,
                self.client.show_dl,
                self.client.show_ul,
                self.client.show_shared
                )
            for i in r:
                if isinstance(i, BaseException):
                    raise i

            # Queue merge (one file can be in two queues)
            downloads = {}
            for queue in (dlq, ulq, shd):
                is_downloading = queue is dlq
                for download in queue.itervalues():
                    dhash = download.partfile_hash.encode("hex")
                    if dhash in downloads:
                        downloads[dhash].update(download)
                        downloads[dhash]["is_downloading"] |= is_downloading
                    else:
                        downloads[dhash] = download
                        downloads[dhash]["is_downloading"] = is_downloading

            downloads_changed = downloads != self._data  # frozen_cmp(downloads, self._data) != 0

            self._status.update(status)
            self._data = downloads

            if downloads_changed:
                # Download updates
                for dhash, download in downloads.iteritems():
                    if dhash in self._downloads:
                        self.outdated_downloads.add(self._downloads[dhash])
                    else:
                        self._downloads[dhash] = Download(self, download, None)
                        self.emit("download_new", self._downloads[dhash])

                # Removing deleted downloads
                unowned = not self.manager is self
                for dhash in frozenset(downloads).symmetric_difference(self._downloads):
                    if self._downloads[dhash].finished:
                        self.outdated_downloads.add(self._downloads[dhash])
                    else:
                        self.emit("download_remove", self._downloads[dhash])
                        if unowned:
                            self.manager.remove(self._downloads[dhash])
                    del self._downloads[dhash]

            self._status_cache = ("",)
        else:
            self._status_cache = ("backend not ready",)
    except ec.ConnectionFailedError as e:
        # The aMule daemon is prone to failing.
        logger.exception(e)
        self._sync_numfails += 1
        if not self._sync_restarting_daemon and (
                self._connecting_to_kad or
                self._sync_numfails > self._sync_max_numfails):
            # If the connection failed while connecting to Kad, or the sync
            # failed more than _sync_max_numfails times, restart the daemon.
            self.ready = False  # Revert ready state
            self._sync_restarting_daemon = True
            logger.debug("Max ConnectionFailedError count reached, restarting daemon.")
            self.start_daemon()
            self._sync_restarting_daemon = False
    except BaseException as e:
        # Unexpected errors shouldn't ever happen
        logger.exception(e)
    else:
        # Once the daemon starts to respond it shouldn't fail again,
        # so we lower the fail tolerance.
        if not self._sync_worked_once:
            self._sync_worked_once = True
            self._sync_max_numfails = 1
        self._sync_numfails = 0
    BackendBase.refresh(self)
def process_file(fname, nprocesses=mp.cpu_count()):
    """Process a file using multiprocessing."""
    with open(fname, 'r', encoding='iso-8859-1') as text:
        strands = utils.spagettify(text, nprocesses)
        result = utils.parallelize(build_vocab, strands)
    return result
def scrape_entities_from_url(url, page_source=None, force_fetch_page=False,
        max_results=None, allow_expansion=True, for_guide=False):
    scrapers = scrape_logic.build_scrapers(url, page_source, force_fetch_page, for_guide=for_guide)
    scrapers = scrapers[:max_results] if max_results else scrapers
    return utils.parallelize(entity_from_scraper, [(scr, url) for scr in scrapers])
def get_raw_entities(self):
    path = urlparse.urlparse(self.url).path
    links = self.root.xpath(".//div[@class='content']//a/@href")
    entity_links = [urlparse.urljoin(self.url, l.strip()) for l in links if l.startswith(path)]
    return utils.parallelize(self.scrape_entity_page, [(l,) for l in entity_links])
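# None of the snippets above include utils.parallelize itself, and its calling
# convention varies between them (a list of argument tuples, bare arguments,
# a set of callables, n_workers/num_pools keywords). The sketch below covers
# only the list-of-argument-tuples form used by parse(), make_photo_urls(),
# get_raw_entities(), etc.; it is an assumption about the helper's behavior,
# not the project's actual implementation.
from multiprocessing.pool import ThreadPool

def parallelize_sketch(fn, args_tuples, num_workers=10):
    # Run fn(*args) for each tuple on a thread pool, preserving input order.
    # Failures map to None so callers can detect and skip missing results,
    # as build_scrapers() and augment_trip_plan() do above.
    def safe_call(args):
        try:
            return fn(*args)
        except Exception:
            return None
    if not args_tuples:
        return []
    pool = ThreadPool(min(num_workers, len(args_tuples)))
    try:
        return pool.map(safe_call, args_tuples)
    finally:
        pool.close()
        pool.join()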