def download_report(
    base_dir: str, bucket: gcp_storage.bucket.Bucket, name: str
) -> bool:
    path = f"{name}.json"
    archive_path = f"{name}.json.zstd"
    full_archive_path = os.path.join(base_dir, archive_path)
    full_path = os.path.join(base_dir, path)

    blob = bucket.blob(archive_path)
    if not blob.exists():
        logger.debug("No report found on GCP", path=archive_path)
        return False

    if os.path.exists(full_path):
        logger.info("Report already available", path=full_path)
        return True

    os.makedirs(os.path.dirname(full_archive_path), exist_ok=True)
    blob.download_to_filename(full_archive_path)
    logger.info("Downloaded report archive", path=full_archive_path)

    with open(full_path, "wb") as output:
        with open(full_archive_path, "rb") as archive:
            dctx = zstandard.ZstdDecompressor()
            reader = dctx.stream_reader(archive)
            while True:
                chunk = reader.read(16384)
                if not chunk:
                    break
                output.write(chunk)

    os.unlink(full_archive_path)
    return True
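# Hedged usage sketch, not part of the original module: shows one way
# download_report might be called. The project id, bucket name, local
# directory, and report name below are all hypothetical placeholders.
def example_download_single_report() -> bool:
    from google.cloud import storage as gcp_storage

    client = gcp_storage.Client(project="my-project")  # assumed project id
    bucket = client.bucket("coverage-reports")         # assumed bucket name
    # The helper returns False when the blob is missing and True once the
    # decompressed JSON report is on disk.
    return download_report("/tmp/reports", bucket, "myrepo/abc123/linux:all")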
def check_new_files(bucket: storage.bucket.Bucket) -> np.ndarray:
    # Retrieve the local list of files previously loaded into the database
    # and compare it to the files currently in the bucket.
    loaded_files = pd.read_csv('loaded_files.csv')['file name']
    bucket_files = [filename.name for filename in list(bucket.list_blobs())]
    new_files = np.setdiff1d(bucket_files, loaded_files.tolist(), assume_unique=False)
    return new_files
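# Hedged usage sketch, not part of the original module: the bucket name is a
# hypothetical placeholder, and the 'loaded_files.csv' layout (a "file name"
# column) is the same assumption check_new_files already makes.
def example_report_new_files() -> None:
    from google.cloud import storage

    client = storage.Client()
    bucket = client.bucket("my-data-bucket")  # assumed bucket name
    new_files = check_new_files(bucket)
    print(f"{len(new_files)} bucket files have not been loaded yet")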
def _upload_file(self, source_file_path: str, bucket: storage.bucket.Bucket,
                 destination_file_path: str) -> None:
    """Uploads file to Cloud Storage with Retry logic.

    The Retry decorator will retry transient API errors. The following
    errors are some examples of transient errors:
    1. google.api_core.exceptions.InternalServerError
    2. google.api_core.exceptions.TooManyRequests
    3. google.api_core.exceptions.ServiceUnavailable

    Args:
        source_file_path: Path to the file to be uploaded.
            e.g. - /tmp/file.txt
        bucket: Cloud Storage bucket to which the file should be uploaded.
        destination_file_path: Path of the destination blob/object within
            the Cloud Storage bucket. If the Cloud Storage URL is
            'gs://bucket1/dir1/file1.txt', then the destination_file_path
            would be 'dir1/file1.txt'.
    """
    destination_blob = bucket.blob(destination_file_path)
    destination_blob.upload_from_filename(source_file_path)
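# The docstring above refers to a Retry decorator; a minimal sketch of how
# such a decorator could be applied with google.api_core is shown below.
# The predicate, backoff values, and deadline are assumptions, not values
# taken from the original code.
from google.api_core import exceptions, retry
from google.cloud import storage

_TRANSIENT_ERRORS = retry.if_exception_type(
    exceptions.InternalServerError,   # HTTP 500
    exceptions.TooManyRequests,       # HTTP 429
    exceptions.ServiceUnavailable,    # HTTP 503
)


@retry.Retry(predicate=_TRANSIENT_ERRORS, initial=1.0, maximum=60.0, deadline=300.0)
def upload_with_retry(source_file_path: str, bucket: storage.bucket.Bucket,
                      destination_file_path: str) -> None:
    # Each attempt that fails with a matching transient error is retried
    # with exponential backoff until the deadline expires.
    bucket.blob(destination_file_path).upload_from_filename(source_file_path)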
def list_reports(
    bucket: gcp_storage.bucket.Bucket, repository: str, until: Optional[timedelta] = None
) -> Iterator[Tuple[str, str, str]]:
    REGEX_BLOB = re.compile(
        r"^{}/(\w+)/([\w\-]+):([\w\-]+)\.json\.zstd$".format(repository)
    )
    now = datetime.utcnow().replace(tzinfo=pytz.UTC)

    for blob in bucket.list_blobs(prefix=repository):
        # Skip blobs older than the requested time window.
        if isinstance(until, timedelta) and (now - blob.time_created) >= until:
            logger.debug(f"Skipping old blob {blob}")
            continue

        # Get changeset from blob name
        match = REGEX_BLOB.match(blob.name)
        if match is None:
            logger.warning("Invalid blob found {}".format(blob.name))
            continue
        changeset = match.group(1)
        platform = match.group(2)
        suite = match.group(3)

        # Build report instance and ingest it
        yield changeset, platform, suite
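# Hedged sketch, not part of the original module: reconstructs a report name
# from the fields yielded by list_reports and hands it to download_report
# above. The repository name and local directory are hypothetical; the name
# layout mirrors REGEX_BLOB ("<repository>/<changeset>/<platform>:<suite>").
def example_download_all_reports() -> None:
    from datetime import timedelta

    from google.cloud import storage as gcp_storage

    bucket = gcp_storage.Client().bucket("coverage-reports")  # assumed bucket name
    for changeset, platform, suite in list_reports(bucket, "myrepo", until=timedelta(days=30)):
        name = f"myrepo/{changeset}/{platform}:{suite}"
        download_report("/tmp/reports", bucket, name)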
def download_manual_routeviews(bucket: storage.bucket.Bucket) -> None:
    first_date = datetime.date(2018, 7, 27)  # Date of earliest data
    last_date = datetime.date.today()
    datelist = [
        first_date + datetime.timedelta(days=x)
        for x in range(0, (last_date - first_date).days + 1)
    ]

    for date in datelist:
        print("checking date {}".format(date))
        year, month, day = date.year, date.month, date.day
        path = f"http://data.caida.org/datasets/routing/routeviews-prefix2as/{year}/{month:02}/"

        # possible times are 0000 to 2200 in intervals of 200
        times = [
            "0000", "0200", "0400", "0600", "0800", "1000",
            "1200", "1400", "1600", "1800", "2000", "2200"
        ]

        for time in times:
            try:
                filename = f"routeviews-rv2-{year}{month:02}{day:02}-{time}.pfx2as.gz"
                url = path + filename
                cloud_filepath = "caida/routeviews/" + filename

                # This call will fail for most urls,
                # since we don't know which timestamp is correct.
                # In that case we just move on to our next guess.
                f = httpio.open(url)

                print(f"mirroring {url} to gs://{bucket.name}/{cloud_filepath}")
                blob = bucket.blob(cloud_filepath)
                blob.upload_from_file(f)
            except requests.exceptions.HTTPError as ex:
                if ex.response.status_code != 404:
                    raise ex
def download_days_routeview(bucket: storage.bucket.Bucket, date: datetime.date) -> None:
    """Download a single date's missing routeview files.

    Args:
        bucket: GCS storage bucket to write routeviews to.
        date: the date to download a file for.
    """
    print(f"checking date {date}")
    year, month, day = date.year, date.month, date.day
    path = f"http://data.caida.org/datasets/routing/routeviews-prefix2as/{year}/{month:02}/"

    # possible times are 0000 to 2200 in intervals of 200
    times = [
        "0000", "0200", "0400", "0600", "0800", "1000",
        "1200", "1400", "1600", "1800", "2000", "2200"
    ]

    for time in times:
        try:
            filename = f"routeviews-rv2-{year}{month:02}{day:02}-{time}.pfx2as.gz"
            url = path + filename
            cloud_filepath = "caida/routeviews/" + filename

            # This call will fail for most urls,
            # since we don't know which timestamp is correct.
            # In that case we just move on to our next guess.
            content = urllib.request.urlopen(url).read()

            print(f"mirroring {url} to gs://{bucket.name}/{cloud_filepath}")
            blob = bucket.blob(cloud_filepath)
            blob.upload_from_string(content)
        except urllib.error.HTTPError as ex:
            # urlopen raises urllib.error.HTTPError (not requests' HTTPError);
            # anything other than a 404 "wrong timestamp guess" is re-raised.
            if ex.code != 404:
                raise ex
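# Hedged sketch, not part of the original module: backfills a short date range
# by calling the single-day helper above. The start date and range length are
# arbitrary placeholders; download_manual_routeviews covers the full history.
def example_backfill_week(bucket: storage.bucket.Bucket) -> None:
    start = datetime.date(2021, 1, 1)  # assumed start date
    for offset in range(7):
        download_days_routeview(bucket, start + datetime.timedelta(days=offset))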
def __init__(self, bucket: storage.bucket.Bucket, remote_file_path: str):
    self.blob = bucket.blob(remote_file_path)