Example 1
def GetCachedDataset():
    """Load the latest dataset with cached data."""
    local_path = CachedFilePath(DATASET_PKL_FILE)
    if os.path.exists(local_path) or DownloadFromCloudStorage(local_path):
        return pd.read_pickle(local_path)
    else:
        return None
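
CachedFilePath, DownloadFromCloudStorage and DATASET_PKL_FILE are module-level helpers that are not shown here. A minimal calling sketch, assuming those names are in scope and that pd is the usual pandas alias:

df = GetCachedDataset()
if df is None:
    # Neither a local copy nor a downloadable cloud copy was available.
    print('No cached dataset available.')
else:
    print(df.head())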
Example 2
def GetWithCache(filename, frame_maker, expires_after):
  """Get a data frame from cache or, if necessary, create and cache it.

  Args:
    filename: The name of a file for the cached copy of the data frame;
      it will be stored in CACHE_DIR.
    frame_maker: A function that takes no arguments and returns a data frame;
      it is only called to create the data frame if the cached copy does not
      exist or is too old.
    expires_after: A datetime.timedelta object; the cached copy will not be
      used if it was created longer than this time ago.

  Returns:
    The cached data frame, or a freshly created one if the cache was missing
    or expired.
  """
  filepath = os.path.join(CACHE_DIR, filename)
  try:
    timestamp = os.path.getmtime(filepath)
    last_modified = datetime.datetime.utcfromtimestamp(timestamp)
    expired = datetime.datetime.utcnow() > last_modified + expires_after
  except OSError:  # If the file does not exist.
    expired = True

  if expired:
    df = frame_maker()
    if not os.path.exists(CACHE_DIR):
      os.makedirs(CACHE_DIR)
    df.to_pickle(filepath)
  else:
    df = pandas.read_pickle(filepath)
  return df
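
A usage sketch for GetWithCache, assuming pandas, datetime and CACHE_DIR are defined at module level as the snippet implies; load_raw_results is a hypothetical stand-in for an expensive frame builder:

def load_raw_results():
  # Hypothetical: build the data frame from some expensive source.
  return pandas.DataFrame({'value': [1, 2, 3]})

df = GetWithCache(
    'raw_results.pkl', load_raw_results,
    expires_after=datetime.timedelta(hours=12))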
Example 3
def AggregateAndUploadResults(state):
  """Aggregate results collected and upload them to cloud storage."""
  cached_results = CachedFilePath(DATASET_PKL_FILE)
  dfs = []

  keep_revisions = set(item['revision'] for item in state)
  if os.path.exists(cached_results):
    # To speed things up, we take the cache computed from previous results.
    df = pd.read_pickle(cached_results)
    # Drop possible old data from revisions no longer in recent state.
    df = df[df['revision'].isin(keep_revisions)]
    dfs.append(df)
    known_revisions = set(df['revision'])
  else:
    known_revisions = set()

  found_new = False
  for item in state:
    if item['revision'] in known_revisions or _SkipProcessing(item):
      # Revision is already in cache, jobs are not ready, or all have failed.
      continue
    if not found_new:
      logging.info('Processing data from new results:')
      found_new = True
    logging.info('- %s (%s)', item['timestamp'][:10], item['revision'])
    dfs.append(GetRevisionResults(item))

  if not found_new:
    logging.info('No new data found.')
    return

  # Otherwise update our cache and upload.
  df = pd.concat(dfs, ignore_index=True)
  df.to_pickle(cached_results)

  # Drop revisions with no results and mark the latest result for each metric,
  # both with and without the patch, as a 'reference'. This makes it possible
  # to build score cards comparing the most recent results in Data Studio
  # dashboards.
  df = df[df['count'] > 0].copy()
  latest_result = df.groupby(
      ['label', 'benchmark', 'name'])['timestamp'].transform('max')
  df['reference'] = df['timestamp'] == latest_result

  dataset_file = CachedFilePath(DATASET_CSV_FILE)
  df.to_csv(dataset_file, index=False)
  UploadToCloudStorage(dataset_file)
  logging.info('Total %d rows of data uploaded.', len(df.index))
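
The helpers used above (_SkipProcessing, GetRevisionResults, CachedFilePath, UploadToCloudStorage) live elsewhere in the module. From this snippet alone, each item in state needs at least 'revision' and 'timestamp' keys, and GetRevisionResults(item) must return a frame with 'revision', 'timestamp', 'label', 'benchmark', 'name' and 'count' columns, since those are the columns read later. A hypothetical call could look like:

state = [
    # Hypothetical entries; real ones come from previously collected job state.
    {'revision': 'abc123', 'timestamp': '2019-07-01T12:00:00'},
    {'revision': 'def456', 'timestamp': '2019-07-02T12:00:00'},
]
AggregateAndUploadResults(state)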