Example #1
0
 def test_write_to_cache_without_date(self):
     """_write_to_cache without a date should stamp the file with now()."""
     file_path = self.test_cache_path + "aa/bb/aabb-testfile"
     data = "test-string"
     self.mixin._write_to_cache(file_path, data)
     expected_modified_date = localize_datetime(datetime.datetime.now())
     file_exists = os.path.exists("%s-%i" % (file_path, datetime_to_unixstamp(expected_modified_date)))
     # FIX: assertTrue's second argument is a failure *message*, not an
     # expected value -- assertTrue(file_exists, True) only obscured intent.
     # NOTE(review): now() is sampled after the write; this can race across
     # a second boundary if datetime_to_unixstamp truncates -- TODO confirm.
     self.assertTrue(file_exists)
Example #2
0
    def fetch(self, url, modified_date):
        """Return the body of ``url``, served from the local cache when the
        cached copy is still current.

        A fresh download happens when no valid cached version exists or when
        ``modified_date`` is newer than the newest cached version; freshly
        downloaded bytes are written back to the cache before returning.
        """
        modified_date = localize_datetime(str_to_datetime(modified_date))
        cache_base = self.base_path(get_sha1_hash(url))

        try:
            cached_file, newest_stamp = self._latest_version(cache_base)
            cached_path = '%s-%s' % (cached_file, newest_stamp)
            self._check_path(cached_path)
        except OSError:
            # Nothing cached (or the cache entry is invalid): download it,
            # store it, and hand it back.
            body = self._download_file(url)
            self._write_to_cache(cache_base, body, modified_date)
            return body

        stale = modified_date and modified_date > str_to_datetime(newest_stamp)
        if stale:
            # The source copy is newer than the cache: refresh it.
            body = self._download_file(url)
            self._write_to_cache(cache_base, body, modified_date)
            return body

        # todo force_old_files
        with open(cached_path, 'rb') as f:
            return f.read()
Example #3
0
    def fetch(self, url, path, modified_date):
        """Fetch ``url``, caching the payload locally under a key derived
        from ``path``.

        Returns a ``(content_type, content_length, media_file)`` tuple.
        Raises ItemAlreadyProcessed when the cached copy is up to date and
        'force_old_files' is not set in the source definition.
        """
        # FIX: guard against a missing modification date; unconditionally
        # calling str_to_datetime(modified_date) would raise before the
        # cache was ever consulted.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        url_hash = base64.urlsafe_b64encode(path)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # FIX: read() leaves the file position at EOF; rewind so the
            # caller receives a readable file object.
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file
        else:
            if self.source_definition.get('force_old_files'):
                with open(latest_version_path, 'rb') as f:
                    # Seek to EOF to measure the file, then rewind to read it.
                    f.seek(0, 2)
                    content_length = f.tell()
                    f.seek(0, 0)
                    return None, content_length, f.read()

        raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                                   "Set 'force_old_files' in source_definition "
                                   "to download old files from cache." %
                                   (url, latest_version))
 def test_write_to_cache_without_date(self):
     """_write_to_cache without a date should stamp the file with now()."""
     file_path = self.test_cache_path + "aa/bb/aabb-testfile"
     data = "test-string"
     self.mixin._write_to_cache(file_path, data)
     expected_modified_date = localize_datetime(datetime.datetime.now())
     file_exists = os.path.exists("%s-%i" % (file_path, datetime_to_unixstamp(expected_modified_date)))
     # FIX: assertTrue's second argument is a failure *message*, not an
     # expected value -- assertTrue(file_exists, True) only obscured intent.
     # NOTE(review): now() is sampled after the write; this can race across
     # a second boundary if datetime_to_unixstamp truncates -- TODO confirm.
     self.assertTrue(file_exists)
Example #5
0
    def fetch(self, url, path, modified_date):
        """Serve ``url`` from the local cache, downloading it only when no
        valid cached copy exists or the source reports a newer modification
        date.

        Returns a ``(content_type, content_length, media_file)`` tuple.
        Raises ItemAlreadyProcessed for an up-to-date cache entry unless
        'force_old_files' is enabled in the source definition.
        """
        modified_date = (localize_datetime(str_to_datetime(modified_date))
                         if modified_date else None)

        cache_base = self.base_path(base64.urlsafe_b64encode(path))

        try:
            cached_file, newest_stamp = self._latest_version(cache_base)
            cached_path = '%s-%s' % (cached_file, newest_stamp)
            self._check_path(cached_path)
        except OSError:
            # No (valid) cached copy yet: download, cache, and hand it back.
            content_type, content_length, media_file = self.download_url(url)
            payload = media_file.read()
            # read() consumed the stream; rewind so the caller can read it too.
            media_file.seek(0, 0)
            self._write_to_cache(cache_base, payload, modified_date)
            return content_type, content_length, media_file

        if modified_date and modified_date > str_to_datetime(newest_stamp):
            # The source copy is newer than the cache: refresh it.
            content_type, content_length, media_file = self.download_url(url)
            payload = media_file.read()
            media_file.seek(0, 0)
            self._write_to_cache(cache_base, payload, modified_date)
            return content_type, content_length, media_file

        if self.source_definition.get('force_old_files'):
            with open(cached_path, 'rb') as f:
                f.seek(0, 2)            # jump to EOF to measure the file...
                content_length = f.tell()
                f.seek(0, 0)            # ...then rewind before reading it
                return None, content_length, f.read()

        raise ItemAlreadyProcessed(
            "Item %s has already been processed on %s. "
            "Set 'force_old_files' in source_definition "
            "to download old files from cache." % (url, newest_stamp))
Example #6
0
    def fetch(self, url, path, modified_date):
        """Fetch a resource url and save it to a path in GCS. The resource will
        only be downloaded from the source when the file has been modified,
        otherwise the file will be downloaded from cache if 'force_old_files'
        has been set.

        Returns a ``(content_type, content_length, media_file)`` tuple.
        Raises ItemAlreadyProcessed when the blob is up to date and
        'force_old_files' is not set.
        """

        bucket = self.get_bucket()
        blob = bucket.get_blob(path)
        if not blob:
            blob = bucket.blob(path)

            # File does not exist
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # FIX: read() leaves the file position at EOF; rewind so the
            # returned media_file is still readable by the caller (the
            # cache-download branch below already does this).
            media_file.seek(0, 0)
            self.compressed_upload(blob, data, content_type)
            return content_type, content_length, media_file

        modified_date = localize_datetime(str_to_datetime(modified_date))
        if modified_date > blob.updated:
            # Upload newer file
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            media_file.seek(0, 0)
            self.compressed_upload(blob, data, content_type)
            return content_type, content_length, media_file
        elif self.source_definition.get('force_old_files'):
            # Download up-to-date file
            media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)
            blob.download_to_file(media_file)
            media_file.seek(0, 0)
            return blob.content_type, blob.size, media_file

        raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                                   "Set 'force_old_files' in source_definition "
                                   "to download old files from cache." %
                                   (url, blob.updated.strftime("%c")))
Example #7
0
class LocalCachingMixin(HttpRequestMixin):
    """Caches downloaded resources on the local filesystem.

    Each resource is stored under a sharded cache path with the unix
    timestamp of its modification date appended to the file name, so
    several versions of the same resource can coexist.
    """

    def base_path(self, file_name):
        """Build the cache path for ``file_name``.

        The first four characters of the name form two levels of
        subdirectories, which keeps individual directories small.
        """
        first_dir = file_name[0:2]
        second_dir = file_name[2:4]

        return os.path.join(
            DATA_DIR_PATH,
            'cache',
            self.source_definition['index_name'],
            first_dir,
            second_dir,
            file_name,
        )

    @staticmethod
    def _latest_version(file_path):
        """Return ``(file_path, latest_timestamp)`` for the newest cached
        version of ``file_path``.

        Raises OSError when no cached version exists at all.
        """
        version_paths = glob.glob('%s-*' % file_path)

        if not version_paths:
            # Nothing cached yet; fetch() treats this like a missing file.
            raise OSError

        versions = [
            os.path.basename(version_path).rpartition("-")[2]
            for version_path in version_paths
        ]
        # max() replaces sorted(..., reverse=True)[0] -- same result, O(n).
        # NOTE(review): versions are compared as strings; this is stable
        # only while all timestamps have the same digit count.
        latest_version = max(versions)

        return file_path, latest_version,

    @staticmethod
    def _check_path(path):
        """Sanity-check a cached file on disk.

        Raises InvalidFile if the file is smaller than two bytes.  (FIX:
        the old comment claimed OSError was raised here; fetch() relies on
        this being caught by its ``except OSError`` handler, so InvalidFile
        is presumably an OSError subclass -- TODO confirm.)
        """
        file_bytes = os.path.getsize(path)

        if file_bytes < 2:
            raise InvalidFile

    def fetch(self, url, path, modified_date):
        """Return ``(content_type, content_length, media_file)`` for ``url``,
        downloading it only when no valid cached copy exists or the source
        reports a newer ``modified_date``.

        Raises ItemAlreadyProcessed when the cache is current and
        'force_old_files' is not set in the source definition.
        """
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        url_hash = base64.urlsafe_b64encode(path)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # read() iterates over the file to the end, so we have to seek to
            # the beginning to use it again!
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file
        else:
            if self.source_definition.get('force_old_files'):
                with open(latest_version_path, 'rb') as f:
                    # Seek to EOF to measure the file, then rewind to read it.
                    f.seek(0, 2)
                    content_length = f.tell()
                    f.seek(0, 0)
                    return None, content_length, f.read()

        raise ItemAlreadyProcessed(
            "Item %s has already been processed on %s. "
            "Set 'force_old_files' in source_definition "
            "to download old files from cache." % (url, latest_version))

    @staticmethod
    def _write_to_cache(file_path, data, modified_date=None):
        """Write ``data`` next to ``file_path`` with a unix-timestamp suffix.

        ``modified_date`` defaults to the current time; all intermediate
        directories are created on demand.
        """
        try:
            # Create all subdirectories
            os.makedirs(os.path.dirname(file_path))
        except OSError as e:  # FIX: was Python-2-only ``except OSError, e``
            # Reraise if error is not 'File exists'
            if e.errno != errno.EEXIST:
                # Bare raise preserves the original traceback (was ``raise e``).
                raise

        if not modified_date:
            modified_date = datetime.now()

        modified_date = datetime_to_unixstamp(localize_datetime(modified_date))

        with open('%s-%s' % (file_path, modified_date), 'w') as f:
            f.write(data)
Example #8
0
class HTTPCachingMixin(HttpRequestMixin):
    """Caches HTTP response bodies on the local filesystem, keyed by the
    SHA1 hash of the url with a modification-timestamp suffix."""

    source_definition = None

    def base_path(self, file_name):
        """Build the cache path for ``file_name``.

        The first four characters of the name form two levels of
        subdirectories, which keeps individual directories small.
        """
        first_dir = file_name[0:2]
        second_dir = file_name[2:4]

        return os.path.join(
            DATA_DIR_PATH,
            'cache',
            self.source_definition['index_name'],
            first_dir,
            second_dir,
            file_name,
        )

    @staticmethod
    def _latest_version(file_path):
        """Return ``(file_path, latest_timestamp)`` for the newest cached
        version of ``file_path``.

        Raises OSError when no cached version exists at all.
        """
        version_paths = glob.glob('%s-*' % file_path)

        if not version_paths:
            # Nothing cached yet; fetch() treats this like a missing file.
            raise OSError

        versions = [
            os.path.basename(version_path).rpartition("-")[2]
            for version_path in version_paths
        ]
        # max() replaces sorted(..., reverse=True)[0] -- same result, O(n).
        # NOTE(review): versions are compared as strings; this is stable
        # only while all timestamps have the same digit count.
        latest_version = max(versions)

        return file_path, latest_version,

    @staticmethod
    def _check_path(path):
        """Sanity-check a cached file on disk.

        Raises InvalidFile if the file is smaller than two bytes.  (FIX:
        the old comment claimed OSError was raised here; fetch() relies on
        this being caught by its ``except OSError`` handler, so InvalidFile
        is presumably an OSError subclass -- TODO confirm.)
        """
        file_bytes = os.path.getsize(path)

        if file_bytes < 2:
            raise InvalidFile

    def fetch(self, url, modified_date):
        """Return the body of ``url`` as bytes, preferring the cached copy
        when it is at least as new as ``modified_date``."""
        # FIX: guard against a missing modification date, consistent with
        # LocalCachingMixin.fetch; str_to_datetime(None) would raise before
        # the cache was ever consulted.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        url_hash = get_sha1_hash(url)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data
        else:
            # todo force_old_files
            with open(latest_version_path, 'rb') as f:
                return f.read()

    def _download_file(self, url):
        """GET ``url`` and return the raw response body, raising on any
        non-success HTTP status."""
        resp = self.http_session.get(url)
        resp.raise_for_status()
        return resp.content

    @staticmethod
    def _write_to_cache(file_path, data, modified_date=None):
        """Write ``data`` next to ``file_path`` with a unix-timestamp suffix.

        ``modified_date`` defaults to the current time; all intermediate
        directories are created on demand.
        """
        try:
            # Create all subdirectories
            os.makedirs(os.path.dirname(file_path))
        except OSError as e:  # FIX: was Python-2-only ``except OSError, e``
            # Reraise if error is not 'File exists'
            if e.errno != errno.EEXIST:
                # Bare raise preserves the original traceback (was ``raise e``).
                raise

        if not modified_date:
            modified_date = datetime.now()

        modified_date = datetime_to_unixstamp(localize_datetime(modified_date))

        with open('%s-%s' % (file_path, modified_date), 'w') as f:
            f.write(data)