Example #1
def download_report(
    base_dir: str, bucket: gcp_storage.bucket.Bucket, name: str
) -> bool:
    path = f"{name}.json"
    archive_path = f"{name}.json.zstd"
    full_archive_path = os.path.join(base_dir, archive_path)
    full_path = os.path.join(base_dir, path)

    blob = bucket.blob(archive_path)
    if not blob.exists():
        logger.debug("No report found on GCP", path=archive_path)
        return False

    if os.path.exists(full_path):
        logger.info("Report already available", path=full_path)
        return True

    os.makedirs(os.path.dirname(full_archive_path), exist_ok=True)
    blob.download_to_filename(full_archive_path)
    logger.info("Downloaded report archive", path=full_archive_path)

    # Stream-decompress the zstd archive into the final JSON report
    with open(full_path, "wb") as output:
        with open(full_archive_path, "rb") as archive:
            dctx = zstandard.ZstdDecompressor()
            reader = dctx.stream_reader(archive)
            while True:
                chunk = reader.read(16384)
                if not chunk:
                    break
                output.write(chunk)

    os.unlink(full_archive_path)
    return True
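
The explicit 16 KiB read loop above can also be written with zstandard's copy_stream helper. A minimal equivalent sketch, assuming the same full_archive_path and full_path values as in download_report:

import zstandard

with open(full_archive_path, "rb") as archive, open(full_path, "wb") as output:
    # copy_stream decompresses the archive stream directly into the output file.
    zstandard.ZstdDecompressor().copy_stream(archive, output)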
Example #2
import numpy as np
import pandas as pd
from google.cloud import storage


def check_new_files(bucket: storage.bucket.Bucket) -> np.ndarray:
    # Compare the local list of files already loaded into the database
    # against the files currently present in the bucket.
    loaded_files = pd.read_csv('loaded_files.csv')['file name']
    bucket_files = [blob.name for blob in bucket.list_blobs()]
    new_files = np.setdiff1d(bucket_files, loaded_files.tolist(), assume_unique=False)

    return new_files
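
A minimal usage sketch for check_new_files, assuming default GCS credentials; the bucket name below is hypothetical:

from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-data-bucket")  # hypothetical bucket name
for name in check_new_files(bucket):
    print(f"not yet loaded: {name}")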
Example #3
  def _upload_file(self, source_file_path: str, bucket: storage.bucket.Bucket,
                   destination_file_path: str) -> None:
    """Uploads file to Cloud Storage with Retry logic.

    The Retry decorator will retry transient API errors. The following are
    some examples of transient errors:
    1. google.api_core.exceptions.InternalServerError
    2. google.api_core.exceptions.TooManyRequests
    3. google.api_core.exceptions.ServiceUnavailable

    Args:
      source_file_path: Path to the file to be uploaded, e.g. /tmp/file.txt.
      bucket: Cloud Storage bucket to which the file should be uploaded.
      destination_file_path: Path of the destination blob/object within the
        Cloud Storage bucket. If the Cloud Storage URL is
        'gs://bucket1/dir1/file1.txt', then the destination_file_path would be
        'dir1/file1.txt'.
    """
    destination_blob = bucket.blob(destination_file_path)
    destination_blob.upload_from_filename(source_file_path)
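
The snippet above relies on a Retry decorator that is not shown. Below is a minimal sketch of how such retry logic could be attached with google.api_core's Retry helper; the predicate is an assumption, not the original project's configuration:

from google.api_core import exceptions, retry
from google.cloud import storage

# Assumed set of transient errors, mirroring the examples in the docstring.
_TRANSIENT_ERRORS = (
    exceptions.InternalServerError,
    exceptions.TooManyRequests,
    exceptions.ServiceUnavailable,
)

@retry.Retry(predicate=retry.if_exception_type(*_TRANSIENT_ERRORS))
def upload_file(source_file_path: str, bucket: storage.bucket.Bucket,
                destination_file_path: str) -> None:
    # Retried automatically on the transient errors listed above.
    bucket.blob(destination_file_path).upload_from_filename(source_file_path)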
Example #4
def list_reports(
    bucket: gcp_storage.bucket.Bucket, repository: str, until: Optional[timedelta] = None
) -> Iterator[Tuple[str, str, str]]:
    REGEX_BLOB = re.compile(
        r"^{}/(\w+)/([\w\-]+):([\w\-]+)\.json\.zstd$".format(repository)
    )
    now = datetime.utcnow().replace(tzinfo=pytz.UTC)
    for blob in bucket.list_blobs(prefix=repository):
        if isinstance(until, timedelta) and (now - blob.time_created) >= until:
            logger.debug(f"Skipping old blob {blob}")
            continue

        # Get changeset from blob name
        match = REGEX_BLOB.match(blob.name)
        if match is None:
            logger.warn("Invalid blob found {}".format(blob.name))
            continue
        changeset = match.group(1)
        platform = match.group(2)
        suite = match.group(3)

        # Yield the report identifiers extracted from the blob name
        yield changeset, platform, suite
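
A minimal usage sketch for list_reports, assuming a bucket handle named bucket is already in scope; until acts as an age cutoff, so blobs older than the given timedelta are skipped. The repository name below is hypothetical:

from datetime import timedelta

for changeset, platform, suite in list_reports(bucket, "mozilla-central", until=timedelta(days=30)):
    print(changeset, platform, suite)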
Example #5
def download_manual_routeviews(bucket: storage.bucket.Bucket) -> None:
    first_date = datetime.date(2018, 7, 27)  # Date of earliest data
    last_date = datetime.date.today()
    datelist = [
        first_date + datetime.timedelta(days=x)
        for x in range(0, (last_date - first_date).days + 1)
    ]

    for date in datelist:
        print("checking date {}".format(date))
        year, month, day = date.year, date.month, date.day

        path = f"http://data.caida.org/datasets/routing/routeviews-prefix2as/{year}/{month:02}/"
        # possible times are 0000 to 2200 in intervals of 200
        times = [
            "0000", "0200", "0400", "0600", "0800", "1000", "1200", "1400",
            "1600", "1800", "2000", "2200"
        ]
        for time in times:
            try:
                filename = f"routeviews-rv2-{year}{month:02}{day:02}-{time}.pfx2as.gz"
                url = path + filename
                cloud_filepath = "caida/routeviews/" + filename

                # This call will fail for most urls,
                # since we don't know which timestamp is correct.
                # In that case we just move on to our next guess.
                f = httpio.open(url)

                print(
                    f"mirroring {url} to gs://{bucket.name}/{cloud_filepath}")

                blob = bucket.blob(cloud_filepath)
                blob.upload_from_file(f)
            except requests.exceptions.HTTPError as ex:
                if ex.response.status_code != 404:
                    raise ex


def download_days_routeview(bucket: storage.bucket.Bucket,
                            date: datetime.date) -> None:
    """Download a single date's missing routeview files.

    Args:
      bucket: GCS storage bucket to write routeviews to.
      date: the date to download a file for.
    """

    print(f"checking date {date}")
    year, month, day = date.year, date.month, date.day

    path = f"http://data.caida.org/datasets/routing/routeviews-prefix2as/{year}/{month:02}/"
    # possible times are 0000 to 2200 in intervals of 200
    times = [
        "0000", "0200", "0400", "0600", "0800", "1000", "1200", "1400", "1600",
        "1800", "2000", "2200"
    ]
    for time in times:
        try:
            filename = f"routeviews-rv2-{year}{month:02}{day:02}-{time}.pfx2as.gz"
            url = path + filename
            cloud_filepath = "caida/routeviews/" + filename

            # This call will fail for most urls,
            # since we don't know which timestamp is correct.
            # In that case we just move on to our next guess.
            content = urllib.request.urlopen(url).read()

            print(f"mirroring {url} to gs://{bucket.name}/{cloud_filepath}")

            blob = bucket.blob(cloud_filepath)
            blob.upload_from_string(content)
        except urllib.error.HTTPError as ex:
            # urllib.request.urlopen raises urllib.error.HTTPError,
            # so only a genuine 404 (file not published) is ignored.
            if ex.code != 404:
                raise
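
A minimal usage sketch for download_days_routeview, assuming default GCS credentials; the bucket name below is hypothetical:

import datetime

from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-mirror-bucket")  # hypothetical bucket name
download_days_routeview(bucket, datetime.date.today() - datetime.timedelta(days=1))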
  def __init__(self, bucket: storage.bucket.Bucket, remote_file_path: str):
    self.blob = bucket.blob(remote_file_path)