def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.
    They are extracted to the app's `DATASETS_PATH` value.
    """
    # Get the desired dataset urls.
    dataset_urls = [dataset_url for dataset_url in get_dataset_urls()
                    if any(setname in dataset_url for setname in DESIRED_DATASETS)]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(DATASETS_PATH,
                                   os.path.basename(dataset_url)[:-4])  # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warning('File exists, not re-downloading and extracting. '
                           'You can force by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the file in 100KB chunks to keep memory usage flat.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb+') as dc_file, bz2.BZ2File(filepath, 'rb') as bz_file:
            for data in iter(lambda: bz_file.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up the compressed download.
        os.remove(filepath)

    logger.info('Downloading and extraction complete.')
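A minimal invocation sketch, assuming `DESIRED_DATASETS` and `DATASETS_PATH` are configured elsewhere in the app (as the docstring implies); the entry point shown is hypothetical:

if __name__ == '__main__':
    # Hypothetical entry point: re-fetch and re-extract everything,
    # overwriting any previously extracted datasets.
    download(force=True)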
def test_ignores_existing_download(self):
    """
    Download should skip if the file already
    is fully downloaded and is not expired.
    """
    # Set the remote file to not be expired.
    mock_resp = self.mock_response()
    mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'

    tmpfile = self.mock_file(complete=True)
    self.mock_open.return_value = tmpfile

    gullet.download(self.url, self.save_path)

    # Should have opened the existing file in append mode,
    # but not actually written anything.
    self.assertEqual(tmpfile.tell(), self.content_length)
    self.mock_open.assert_called_once_with(tmpfile.name, 'ab')
def test_restarts_expired_download(self):
    """
    Download should restart if the remote file
    is newer than the existing file.
    """
    # Set the remote file to be expired.
    mock_resp = self.mock_response()
    mock_resp.headers['Last-Modified'] = 'Wed, 05 Sep 2013 08:53:26 GMT'

    tmpfile = self.mock_file()
    self.mock_open.return_value = tmpfile

    gullet.download(self.url, self.save_path)

    # Should have written to a new file.
    self.assertEqual(tmpfile.tell(), self.content_length)
    self.mock_open.assert_called_once_with(tmpfile.name, 'wb')
def test_download_continues(self):
    """
    Download should continue if incomplete,
    the file has not expired, and the server
    supports 'Accept-Ranges'.
    """
    # Set the remote file to not be expired.
    mock_resp = self.mock_response(partial=True)
    mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'

    tmpfile = self.mock_file()
    self.mock_open.return_value = tmpfile

    gullet.download(self.url, self.save_path)

    # Should have appended to the existing file.
    self.assertEqual(tmpfile.tell(), self.content_length)
    self.mock_open.assert_called_once_with(tmpfile.name, 'ab')
def test_restarts_unsupported_download(self):
    """
    Download should restart if the server
    does not support 'Accept-Ranges'.
    """
    # Set the remote file to not be expired, and
    # remove the Accept-Ranges header.
    mock_resp = self.mock_response()
    mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'
    mock_resp.headers.pop('Accept-Ranges', None)

    tmpfile = self.mock_file()
    self.mock_open.return_value = tmpfile

    gullet.download(self.url, self.save_path)

    # Should have written to a new file.
    self.assertEqual(tmpfile.tell(), self.content_length)
    self.mock_open.assert_called_with(tmpfile.name, 'wb')
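Taken together, these tests pin down `gullet.download`'s contract: skip a file that is complete and unexpired, resume a partial file when the server advertises 'Accept-Ranges', and restart from scratch when the remote copy is newer or range requests are unsupported. A minimal sketch of that logic using requests; this illustrates the tested behavior and is not the actual gullet source:

import os
import requests
from email.utils import parsedate_to_datetime

def download(url, save_path):
    """Sketch of a resumable download honoring Last-Modified and Accept-Ranges."""
    resp = requests.head(url)
    filename = os.path.join(save_path, os.path.basename(url))
    total = int(resp.headers.get('Content-Length', 0))
    existing = os.path.getsize(filename) if os.path.exists(filename) else 0

    # The local copy is expired if the remote copy is newer.
    expired = False
    last_modified = resp.headers.get('Last-Modified')
    if last_modified and os.path.exists(filename):
        remote_mtime = parsedate_to_datetime(last_modified).timestamp()
        expired = remote_mtime > os.path.getmtime(filename)

    if existing == total and total and not expired:
        # Fully downloaded and fresh: open for append, write nothing.
        open(filename, 'ab').close()
        return filename

    if existing and not expired and 'Accept-Ranges' in resp.headers:
        # Resume from where the partial file left off.
        headers = {'Range': 'bytes={0}-'.format(existing)}
        mode = 'ab'
    else:
        # Restart: expired, ranges unsupported, or nothing downloaded yet.
        headers = {}
        mode = 'wb'

    r = requests.get(url, headers=headers, stream=True)
    with open(filename, mode) as f:
        for chunk in r.iter_content(chunk_size=100 * 1024):
            f.write(chunk)
    return filename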
def extract_image(entry_data, filename=None, save_dir='.'):
    """
    Extracts and saves a representative image for the entry.
    """
    image_url = ''
    if entry_data.top_image:
        remote_image_url = entry_data.top_image.src
        image_url = download(remote_image_url, save_dir, filename=filename)
    return image_url
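`extract_image` delegates to a `download` helper that fetches the remote image into `save_dir` and returns the path it was saved to. The helper's signature is taken from the call above; the body below is a guessed sketch using requests, not the project's actual implementation:

import os
import requests

def download(url, save_dir, filename=None):
    """Fetch a remote image and save it under save_dir; return the saved path."""
    if filename is None:
        filename = os.path.basename(url)
    save_path = os.path.join(save_dir, filename)
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    with open(save_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return save_path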
def download(self, url):
    """
    Downloads a file from the specified URL to
    replace this Digester's current file.

    Args:
        | url (str) -- the url of the file to download
    """
    # Get save directory for the download.
    save_path = os.path.dirname(self.file)

    # Download!
    saved_filepath = gullet.download(url, save_path)

    # Rename the downloaded file to match the Digester's file.
    os.rename(saved_filepath, self.file)
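Usage might look like the following; the `Digester` constructor shown is hypothetical, assumed only to set `self.file`:

digester = Digester('data/pages.xml')  # hypothetical constructor
digester.download('http://dumps.example.org/pages.xml')
# digester.file now points at the freshly downloaded copy.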