Example #1
 def _check_dir(cls, dir: PurePath):
     """Create the directory (and any missing parents) if it does not already exist."""
     # Note: exists() and mkdir() are only defined on concrete pathlib.Path
     # objects, so callers must pass a Path rather than a plain PurePath.
     if not dir.exists():
         dir.mkdir(parents=True)
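For comparison, a minimal sketch of the same idea (the `ensure_dir` name is illustrative, not from the original code): passing both `parents=True` and `exist_ok=True` to `Path.mkdir` creates any missing parent directories and does not raise if the directory already exists, so the separate `exists()` check can be dropped.

from pathlib import Path

def ensure_dir(directory: Path) -> None:
    """Create the directory tree in one call; no prior exists() check needed."""
    directory.mkdir(parents=True, exist_ok=True)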
Example #2
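# Module-level dependencies assumed by this excerpt (not shown here):
# json, a configured `logger`, a date parse() helper (e.g. dateutil.parser.parse),
# pathlib's Path/PurePath, and the project-local `constants` and `queries` modules.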
def export_space_files(space_id: str,
                       folder: PurePath,
                       auth_token: str,
                       fetch_after_timestamp: int = 0) -> int:
    """Export spaces, starting with the newest and going back only to the
    fetch_after_timestamp. This order is implemented since paging from oldest
    to newest doesn't seem to be implemented for the beta resource API.
    The downside to this approach is that if file name collisions occurs,
    the names will be non-deterministic and depend on when the files are created
    relative to prior executions of this method. For instance, myfile.txt and
    myfile 1.txt will refer to two files with title myfile.txt. Because we
    start from newest to oldest, myfile.txt will normally be the newest file,
    and myfile 1.txt will be older. This may already be counter-intuitive, but
    additionally, if the task was already run and an earlier file was downloaded
    as myfile.txt, it's possible the newer file will be named myfile 1.txt"""

    if fetch_after_timestamp:
        logger.info("Exporting files only back to %s ms",
                    fetch_after_timestamp)

    file_graphqlitem_by_id = {}
    file_entries_file_path = folder / constants.FILES_META_FOLDER / constants.FILE_ENTRIES_FILE_NAME
    if file_entries_file_path.exists():
        with open(file_entries_file_path,
                  "r",
                  encoding=constants.FILE_ENCODING) as f:
            file_graphqlitem_by_id = json.load(f)
    file_path_by_id = {}
    file_paths_file_path = folder / constants.FILES_META_FOLDER / constants.FILE_PATHS_FILE_NAME
    if file_paths_file_path.exists():
        with open(file_paths_file_path, "r",
                  encoding=constants.FILE_ENCODING) as f:
            file_path_by_id = json.load(f)

    downloaded = 0
    already_downloaded = 0
    duplicates = 0

    try:
        previous_page_ids = set()
        next_page_time_in_milliseconds = None

        while True:
            space_files_page = queries.space_files.execute(
                auth_token,
                spaceid=space_id,
                timestamp=next_page_time_in_milliseconds)
            if space_files_page:
                logger.debug("Fetched page with %s files for space %s",
                             len(space_files_page), space_id)
            elif len(previous_page_ids) == 0:
                logger.debug("No files found for space %s", space_id)
                break
            else:
                logger.error(
                    "Fetched page with no files for space %s, but expected this page to contain at least one file.",
                    space_id)
                break

            folder.mkdir(exist_ok=True, parents=True)

            found_file = False
            page_ids = set()
            for file in space_files_page:
                file_created_ms = int(
                    parse(file["created"]).timestamp() * 1000)
                if file_created_ms >= fetch_after_timestamp:
                    file_graphqlitem_by_id[file["id"]] = file
                    if file["id"] in previous_page_ids:
                        logger.debug(
                            "skipping file with id %s since it was in the last page",
                            file["id"])
                    else:
                        found_file = True
                        page_ids.add(file["id"])
                        if next_page_time_in_milliseconds:
                            next_page_time_in_milliseconds = min(
                                next_page_time_in_milliseconds,
                                file_created_ms)
                        else:
                            next_page_time_in_milliseconds = file_created_ms
                        if file["id"] in file_path_by_id and Path(
                                file_path_by_id[file["id"]]).exists():
                            logger.debug(
                                "file %s is already downloaded to %s, skipping download",
                                file["id"], file_path_by_id[file["id"]])
                            already_downloaded += 1
                        else:
                            file_path, new_file = queries.download(
                                file["id"], file["title"], folder, auth_token)
                            file_path_by_id[file["id"]] = str(file_path)
                            if new_file:
                                downloaded += 1
                            else:
                                duplicates += 1
                else:
                    logger.debug(
                        "ignoring file %s since it is before the requested resume point %s",
                        file["id"], fetch_after_timestamp)

            previous_page_ids = page_ids
            if not found_file:
                break

    finally:
        # if we have some metadata, write it
        if len(file_graphqlitem_by_id) > 0:
            file_entries_file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(file_entries_file_path,
                      "w+",
                      encoding=constants.FILE_ENCODING) as f:
                json.dump(file_graphqlitem_by_id, f)
        if len(file_path_by_id) > 0:
            file_paths_file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(file_paths_file_path,
                      "w+",
                      encoding=constants.FILE_ENCODING) as f:
                json.dump(file_path_by_id, f)

    logger.info(
        "Downloaded %s files; skipped %s files already downloaded according to "
        "the meta files; %s downloaded files were duplicates of existing files",
        downloaded, already_downloaded, duplicates)
    return downloaded
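A minimal usage sketch for the function above, assuming export_space_files is importable from the surrounding module; the space id, token, and folder values are placeholders, and fetch_after_timestamp is a Unix timestamp in milliseconds, matching the file_created_ms comparison inside the loop.

import time
from pathlib import Path

# from <module> import export_space_files  # module name depends on the project layout

space_id = "MY_SPACE_ID"        # placeholder space identifier
auth_token = "BEARER_TOKEN"     # placeholder token obtained elsewhere
folder = Path("export") / space_id

# Only fetch files created in roughly the last 24 hours (milliseconds).
one_day_ago_ms = int((time.time() - 24 * 60 * 60) * 1000)

count = export_space_files(space_id, folder, auth_token,
                           fetch_after_timestamp=one_day_ago_ms)
print(f"Downloaded {count} new files")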
Example #3
 def mkdirs(self, path: PurePath, exist_ok: bool = True) -> None:
     # dir_paths() is expected to yield concrete pathlib.Path objects, since
     # PurePath has no mkdir(); a distinct loop name avoids shadowing `path`.
     for dir_path in self.dir_paths(path):
         dir_path.mkdir(exist_ok=exist_ok, parents=True)