Example #1
def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.

    They are extracted to the app's `DATASETS_PATH` value.
    """

    # Get the desired dataset urls.
    dataset_urls = [
        dataset_url for dataset_url in get_dataset_urls()
        if any(setname in dataset_url for setname in DESIRED_DATASETS)
    ]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(
            DATASETS_PATH,
            os.path.basename(dataset_url)[:-4])  # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warning(
                'File exists; not re-downloading or extracting. '
                'You can force this by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the files.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb+') as dc_file, bz2.BZ2File(filepath, 'rb') as bz_file:
            for data in iter(lambda: bz_file.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up.
        os.remove(filepath)
    logger.info('Downloading and extraction complete.')
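
A brief usage sketch of the function above. The module it lives in is not shown in the snippet, so the import is assumed to already be in scope:

# Hypothetical usage -- assumes `download` is already imported into scope.
download()            # skips any dataset already extracted to DATASETS_PATH
download(force=True)  # re-downloads and re-extracts everything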
Example #2
    def test_ignores_existing_download(self):
        """
        Download should skip if the file is already
        fully downloaded and has not expired.
        """
        # Set remote file to not be expired.
        mock_resp = self.mock_response()
        mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'

        tmpfile = self.mock_file(complete=True)
        self.mock_open.return_value = tmpfile
        gullet.download(self.url, self.save_path)

        # Should have opened new file to append to,
        # but not actually write anything.
        self.assertEqual(tmpfile.tell(), self.content_length)
        self.mock_open.assert_called_once_with(tmpfile.name, 'ab')
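
The fixture helpers used here and in the following tests (`mock_response`, `mock_file`, `self.mock_open`, `self.content_length`) are not part of the excerpt. A hypothetical sketch of what such a setUp might look like with `unittest.mock`; the patch target and default values are assumptions:

import io
import unittest
from unittest import mock

class NamedBytesIO(io.BytesIO):
    # BytesIO with a `name` attribute, like a real file object.
    name = '/tmp/example.dat'

class DownloadTest(unittest.TestCase):
    def setUp(self):
        self.url = 'http://example.com/example.dat'
        self.save_path = '/tmp/'
        self.content_length = 1024

        # Patch the `open` used inside the gullet module.
        patcher = mock.patch('gullet.open', create=True)
        self.mock_open = patcher.start()
        self.addCleanup(patcher.stop)

    def mock_response(self, partial=False):
        # Stand-in for the HTTP response gullet inspects; `partial`
        # would mark a 206 Partial Content response in the real fixture.
        # In the real fixture this would also be wired into whatever
        # HTTP call gullet makes (e.g. via another mock.patch).
        resp = mock.MagicMock()
        resp.status_code = 206 if partial else 200
        resp.headers = {
            'Content-Length': str(self.content_length),
            'Accept-Ranges': 'bytes',
        }
        return resp

    def mock_file(self, complete=False):
        # An in-memory "file" whose tell() tracks bytes written.
        f = NamedBytesIO()
        if complete:
            f.write(b'\x00' * self.content_length)
        return f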
Example #3
    def test_restarts_expired_download(self):
        """
        Download should restart if the remote
        file is newer than the existing file.
        """

        # Set remote file to be expired.
        mock_resp = self.mock_response()
        mock_resp.headers['Last-Modified'] = 'Wed, 05 Sep 2013 08:53:26 GMT'

        tmpfile = self.mock_file()
        self.mock_open.return_value = tmpfile
        gullet.download(self.url, self.save_path)

        # Should have written to new file.
        self.assertEqual(tmpfile.tell(), self.content_length)
        self.mock_open.assert_called_once_with(tmpfile.name, 'wb')
Example #4
    def test_download_continues(self):
        """
        Download should continue if incomplete,
        the file has not expired,
        and the server supports 'Accept-Ranges'.
        """
        # Set remote file to not be expired.
        mock_resp = self.mock_response(partial=True)
        mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'

        tmpfile = self.mock_file()
        self.mock_open.return_value = tmpfile
        gullet.download(self.url, self.save_path)

        # Should have appended to existing file.
        self.assertEqual(tmpfile.tell(), self.content_length)
        self.mock_open.assert_called_once_with(tmpfile.name, 'ab')
Example #5
    def test_restarts_unsupported_download(self):
        """
        Download should restart if the server does not
        support 'Accept-Ranges'.
        """
        # Set remote file to not be expired, and
        # remove the Accept-Ranges header.
        mock_resp = self.mock_response()
        mock_resp.headers['Last-Modified'] = 'Wed, 01 Sep 2013 08:53:26 GMT'
        mock_resp.headers.pop('Accept-Ranges', None)

        tmpfile = self.mock_file()
        self.mock_open.return_value = tmpfile
        gullet.download(self.url, self.save_path)

        # Should have written to new file.
        self.assertEqual(tmpfile.tell(), self.content_length)
        self.mock_open.assert_called_with(tmpfile.name, 'wb')
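
Taken together, these four tests pin down the resume contract for `gullet.download`: compare the remote `Last-Modified` header against the local copy, append (`'ab'`) when the local file is fresh but incomplete and the server advertises `Accept-Ranges`, and restart (`'wb'`) when the file has expired or ranges are unsupported. A minimal sketch of a downloader satisfying that contract; the `requests` calls and helper names are assumptions, not the actual `gullet` implementation:

import os
import email.utils

import requests  # assumption: the real implementation may use urllib instead

CHUNK = 100 * 1024

def download_sketch(url, save_path):
    # Where the file will be saved locally.
    filepath = os.path.join(save_path, os.path.basename(url))

    # Inspect the remote file without downloading it.
    head = requests.head(url)
    total = int(head.headers.get('Content-Length', 0))
    remote_mtime = email.utils.parsedate_to_datetime(
        head.headers['Last-Modified']).timestamp()

    exists = os.path.exists(filepath)
    existing = os.path.getsize(filepath) if exists else 0
    expired = exists and remote_mtime > os.path.getmtime(filepath)
    resumable = 'Accept-Ranges' in head.headers

    if exists and not expired and existing >= total:
        # Complete and fresh: open for append but write nothing
        # (mirrors test_ignores_existing_download).
        open(filepath, 'ab').close()
        return filepath

    if exists and not expired and resumable:
        # Fresh but incomplete, and the server supports ranges: resume.
        mode, headers = 'ab', {'Range': 'bytes={0}-'.format(existing)}
    else:
        # Expired, or partial content unsupported: start over.
        mode, headers = 'wb', {}

    resp = requests.get(url, headers=headers, stream=True)
    with open(filepath, mode) as f:
        for chunk in resp.iter_content(CHUNK):
            f.write(chunk)
    return filepath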
Example #6
def extract_image(entry_data, filename=None, save_dir='.'):
    """
    Extracts and saves a representative
    image for the entry.
    """
    image_url = ''
    if entry_data.top_image:
        remote_image_url = entry_data.top_image.src
        image_url = download(remote_image_url, save_dir, filename=filename)
    return image_url
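
A brief, hypothetical usage sketch; the real `entry_data` object would come from whatever article parser produces `top_image`, so a stand-in is used here:

from types import SimpleNamespace

# Stand-in for a parsed entry; the attribute shape matches what
# extract_image expects, but the real object comes from the parser.
entry_data = SimpleNamespace(
    top_image=SimpleNamespace(src='http://example.com/images/lead.jpg'))

local_path = extract_image(entry_data, filename='lead', save_dir='/tmp/images')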
Example #7
    def download(self, url):
        """
        Downloads a file from the specified URL to replace
        this Digester's current file.

        Args:
            | url (str) -- the URL of the file to download
        """

        # Get save directory for download.
        save_path = os.path.dirname(self.file)

        # Download!
        saved_filepath = gullet.download(url, save_path)

        # Rename downloaded file to match Digester's file.
        os.rename(saved_filepath, self.file)
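
A brief usage sketch; the `Digester` constructor is not shown in the excerpt, so its signature here is an assumption:

# Hypothetical usage -- the constructor argument is an assumption.
digester = Digester('data/pages.xml')
digester.download('http://example.com/dumps/pages.xml')
# The downloaded file now replaces the previous copy at digester.file.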