Example 1
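    # This is a method on a downloader resource object (note `self.url` and
    # `self.info_path`). It also assumes `import json` at module level and a
    # TFDS-style atomic-write helper, e.g.
    # `from tensorflow_datasets.core.utils import py_utils`.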
    def write_info_file(self, dataset_name, original_fname):
        """Write the INFO file next to local file.

    Although the method is synchronized, there is still a risk two processes
    running at the same time overlap here. Risk accepted, since potentially lost
    data (`dataset_name`) is only for human consumption.

    Args:
      dataset_name: data used to dl the file.
      original_fname: name of file as downloaded.
    """
        info = self._get_info() or {}
        urls = set(info.get('urls', []) + [self.url])
        dataset_names = info.get('dataset_names', [])
        if dataset_name:
            dataset_names.append(dataset_name)
        if ('original_fname' in info
                and info['original_fname'] != original_fname):
            raise AssertionError(
                '`original_fname` "%s" stored in %s does NOT match "%s".' %
                (info['original_fname'], self.info_path, original_fname))
        info = dict(urls=list(urls),
                    dataset_names=list(set(dataset_names)),
                    original_fname=original_fname)
        with py_utils.atomic_write(self.info_path, 'w') as info_f:
            json.dump(info, info_f, sort_keys=True)
        self._info = info
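
The durability guarantee here comes from `py_utils.atomic_write`. As an
illustrative stand-in (a minimal sketch of the write-then-rename pattern, not
the TFDS implementation):

import contextlib
import os
import tempfile


@contextlib.contextmanager
def atomic_write(path, mode):
    """Write to a temp file in the target directory, then rename into place."""
    # The temp file lives in the same directory so the final os.replace()
    # stays on one filesystem and is therefore atomic: concurrent readers
    # see either the old INFO file or the new one, never a partial write.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, mode) as f:
            yield f
        os.replace(tmp_path, path)
    except Exception:
        os.remove(tmp_path)
        raise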
Example 2
import json

# Module paths below follow the tensorflow_datasets layout; adjust if your
# checkout differs.
from tensorflow_datasets.core.download import checksums as checksums_lib
from tensorflow_datasets.core.utils import py_utils  # provides atomic_write


def write_info_file(
    resource: 'Resource',
    path: str,
    dataset_name: str,
    original_fname: str,
    url_info: checksums_lib.UrlInfo,
) -> None:
    """Write the INFO file next to local file.

  Although the method is synchronized, there is still a risk two processes
  running at the same time overlap here. Risk accepted, since potentially lost
  data (`dataset_name`) is only for human consumption.

  Args:
    resource: resource for which to write the INFO file.
    path: path of downloaded file.
    dataset_name: data used to dl the file.
    original_fname: name of file as downloaded.
    url_info: checksums/size info of the url
  """
    url_info_dict = url_info.asdict()
    info_path = _get_info_path(path)
    info = _read_info(info_path) or {}
    urls = set(info.get('urls', []) + [resource.url])
    dataset_names = info.get('dataset_names', [])
    if dataset_name:
        dataset_names.append(dataset_name)
    if info.get('original_fname', original_fname) != original_fname:
        raise ValueError(
            '`original_fname` "{}" stored in {} does NOT match "{}".'.format(
                info['original_fname'], info_path, original_fname))
    if info.get('url_info', url_info_dict) != url_info_dict:
        raise ValueError(
            'File info {} contains a different checksum than the downloaded '
            'one: Stored: {}; Expected: {}'.format(info_path, info['url_info'],
                                                   url_info_dict))
    info = dict(
        urls=list(urls),
        dataset_names=list(set(dataset_names)),
        original_fname=original_fname,
        url_info=url_info_dict,
    )
    with py_utils.atomic_write(info_path, 'w') as info_f:
        json.dump(info, info_f, sort_keys=True)
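
For orientation, a hypothetical call site follows. The `Resource` and
`UrlInfo` constructor arguments are assumptions about tensorflow_datasets
internals rather than verbatim API, and the resulting path assumes
`_get_info_path` appends an `.INFO` suffix next to the downloaded file, as
the docstring's "next to local file" suggests.

from tensorflow_datasets.core.download import resource as resource_lib

# Hypothetical values; the field names are assumptions, not verbatim TFDS API.
url_info = checksums_lib.UrlInfo(
    size=1024,                       # size of the download, in bytes
    checksum='<sha256-hex-digest>',  # placeholder checksum
)
write_info_file(
    resource=resource_lib.Resource(url='https://example.com/data.zip'),
    path='/tmp/downloads/data.zip',
    dataset_name='my_dataset',
    original_fname='data.zip',
    url_info=url_info,
)
# Expected result: /tmp/downloads/data.zip.INFO containing a sorted JSON
# record with keys "dataset_names", "original_fname", "url_info", "urls".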