import json
import os.path as osp
from os.path import abspath as ap  # assumed: 'ap' aliases os.path.abspath
from queue import Queue, Empty


def download_metadata(self, n_threads=5):
    web_request_queue = Queue()
    json_serialization_queue = Queue()
    urls = self.paginated_media_urls
    if len(urls) > 1:
        # Producer/consumer: a pool of worker threads fetches the paginated
        # URLs while a single writer thread serializes the results to disk.
        for url in urls:
            web_request_queue.put(url)
        pool_size = min(len(urls), n_threads)
        web_pool = [
            ThreadMetadataRequest(
                web_request_queue, json_serialization_queue, self.session)
            for _ in range(pool_size)
        ]
        json_serializer = ThreadJSONWriter(
            json_serialization_queue, self.metadata_filepath)
        for thread in web_pool:
            thread.daemon = True  # setDaemon() is deprecated
            thread.start()
        json_serializer.start()
        # Block until every URL has been fetched and every chunk written.
        web_request_queue.join()
        json_serialization_queue.join()
    else:
        # Single page: fetch once and merge into the cached metadata file.
        json_response = self._grab_json(urls[0])
        media_entries = json_response['media']
        media_dict = list_of_dicts_to_dict(media_entries, promote_to_key='_id')
        exists = osp.isfile(self.metadata_filepath)
        filemode = 'r+' if exists else 'w'  # 'r+w' is not a valid mode
        with open(self.metadata_filepath, filemode) as f:
            try:
                cached_meta = load_json(f) if exists else {}
            except ValueError:  # empty or corrupt cache file
                cached_meta = {}
            cached_meta.update(media_dict)
            f.seek(0)  # reading left the cursor at EOF; rewind before writing
            dump_json(cached_meta, f)
            f.truncate()  # drop stale trailing bytes if the new JSON is shorter
        self._metadata = cached_meta
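# For context, a minimal sketch of the worker class used above. The real
# ThreadMetadataRequest is not shown in this section, so its body, the use
# of a requests.Session, and the stop-request pattern (modeled on
# ThreadJSONWriter below) are assumptions, not the actual implementation.
import threading


class ThreadMetadataRequest(threading.Thread):
    """Pull a URL from qi, fetch its JSON, push a keyed media dict to qo."""

    def __init__(self, qi, qo, session):
        super().__init__()
        self.qi = qi            # queue of paginated media URLs
        self.qo = qo            # queue consumed by ThreadJSONWriter
        self.session = session  # assumed to be a requests.Session
        self.stoprequest = threading.Event()

    def run(self):
        while not self.stoprequest.is_set():
            try:
                url = self.qi.get(True, 0.5)
            except Empty:
                continue
            media = self.session.get(url).json()['media']
            # Promote each entry's '_id' to a key, mirroring the
            # single-page branch of download_metadata().
            self.qo.put(list_of_dicts_to_dict(media, promote_to_key='_id'))
            self.qi.task_done()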
def run(self):
    filemode = 'r+' if self.file_exists else 'w'  # 'r+w' is not a valid mode
    with open(ap(self.filename), filemode) as f:
        try:
            metadata_dict = json.load(f) if self.file_exists else {}
        except ValueError:  # empty or corrupt file
            metadata_dict = {}
        while not self.stoprequest.is_set():
            try:
                # Poll with a timeout so the stop request is re-checked
                # roughly twice a second instead of blocking forever.
                json_chunk = self.qi.get(True, 0.5)
            except Empty:
                continue
            metadata_dict.update(json_chunk)
            # Flush after every chunk, before task_done(), so that once the
            # caller's queue.join() returns the data is already on disk.
            f.seek(0)
            dump_json(metadata_dict, f)
            f.truncate()
            self.qi.task_done()
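# run() relies on attributes (qi, filename, file_exists, stoprequest) that
# are set up outside this method. A sketch of the assumed constructor and a
# join() override that triggers the stop request; the actual class may wire
# these differently.


class ThreadJSONWriter(threading.Thread):
    def __init__(self, qi, filename):
        super().__init__()
        self.qi = qi  # queue of metadata dict chunks from the workers
        self.filename = filename
        self.file_exists = osp.isfile(ap(filename))
        self.stoprequest = threading.Event()

    def join(self, timeout=None):
        # Signal run() to exit its polling loop, then wait for the thread.
        self.stoprequest.set()
        super().join(timeout)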