Example #1
0
    def _upload_resource(self):
        """
        Upload resources to the target and perform a fixity check on the resulting hashes.

        Expects the caller to have populated instance state beforehand
        (self.action, self.resource_main_dir, self.file_hashes,
        self.process_info_obj, self.function_process, ...). Progress and
        final status are written to the job's process_info file throughout.

        Returns
        -------
        bool
            True on success; False if the target raised a
            PresQTResponseException during structure validation or upload
            (in which case the process_info file is marked 'failed' and its
            expiration is shortened to one hour).
        """
        action = 'resource_upload'
        # This doesn't happen during an upload, so it won't be an error. If there is an error during
        # transfer this will be overwritten.
        self.keyword_enhancement_successful = True
        # Write the process id to the process_info file
        self.process_info_obj[
            'function_process_id'] = self.function_process.pid
        update_or_create_process_info(self.process_info_obj, self.action,
                                      self.ticket_number)

        # Data directory in the bag
        self.data_directory = '{}/data'.format(self.resource_main_dir)

        # If we are uploading (not transferring) then create the initial metadata based on the
        # zipped bag provided.
        if self.action == 'resource_upload':
            update_process_info_message(self.process_info_path, self.action,
                                        "Creating PRESQT_FTS_METADATA...")
            # One FTS metadata entry per file found in the bag's data directory.
            self.new_fts_metadata_files = []
            for path, subdirs, files in os.walk(self.data_directory):
                for name in files:
                    self.new_fts_metadata_files.append({
                        'destinationHashes': {},
                        # Slicing off the data_directory prefix leaves the
                        # path relative to the bag's data directory.
                        'destinationPath':
                        os.path.join(path, name)[len(self.data_directory):],
                        'failedFixityInfo': [],
                        'title':
                        name,
                        'sourceHashes': {
                            self.hash_algorithm:
                            self.file_hashes[os.path.join(path, name)]
                        },
                        'sourcePath':
                        os.path.join(path, name)[len(self.data_directory):],
                        'extra': {}
                    })

            destination_target_data = get_target_data(
                self.destination_target_name)
            self.details = "PresQT Upload to {}".format(
                destination_target_data['readable_name'])
            # Action metadata for a plain upload: the source is the user's
            # local machine, so source target/username are fixed.
            self.action_metadata = {
                'id': str(uuid4()),
                'details': self.details,
                'actionDateTime': str(timezone.now()),
                'actionType': self.action,
                'sourceTargetName': 'Local Machine',
                'sourceUsername': None,
                'destinationTargetName': self.destination_target_name,
                'destinationUsername': None,
                'keywords': {},
                'files': {
                    'created': self.new_fts_metadata_files,
                    'updated': [],
                    'ignored': []
                }
            }

        # If the target destination's storage hierarchy has a finite depth then zip the resources
        # to be uploaded along with their metadata.
        # Also, create metadata files for the new zip file to be uploaded.
        if self.infinite_depth is False:
            try:
                structure_validation(self)
                finite_depth_upload_helper(self)
            except PresQTResponseException as e:
                # Catch any errors that happen within the target fetch.
                # Update the server process_info file appropriately.
                self.process_info_obj['status_code'] = e.status_code
                self.process_info_obj['status'] = 'failed'
                if self.action == 'resource_transfer_in':
                    self.process_info_obj['upload_status'] = 'failed'
                self.process_info_obj['message'] = e.data
                # Update the expiration from 5 hours to 1 hour from now. We can delete this faster because
                # it's an incomplete/failed directory.
                self.process_info_obj['expiration'] = str(timezone.now() +
                                                          relativedelta(
                                                              hours=1))
                update_or_create_process_info(self.process_info_obj,
                                              self.action, self.ticket_number)
                return False

        # Fetch the proper function to call
        func = FunctionRouter.get_function(self.destination_target_name,
                                           action)

        # Upload the resources. func_dict has the following format:
        #   {
        #        'resources_ignored': resources_ignored,
        #        'resources_updated': resources_updated,
        #        'action_metadata': action_metadata,
        #        'file_metadata_list': file_metadata_list,
        #        'project_id': title
        #    }
        try:
            structure_validation(self)
            self.func_dict = func(self.destination_token,
                                  self.destination_resource_id,
                                  self.data_directory, self.hash_algorithm,
                                  self.file_duplicate_action,
                                  self.process_info_path, self.action)
        except PresQTResponseException as e:
            # Catch any errors that happen within the target fetch.
            # Update the server process_info file appropriately.
            self.process_info_obj['status_code'] = e.status_code
            self.process_info_obj['status'] = 'failed'
            if self.action == 'resource_transfer_in':
                self.process_info_obj['upload_status'] = 'failed'
            self.process_info_obj['message'] = e.data
            # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
            # because it's an incomplete/failed directory.
            self.process_info_obj['expiration'] = str(timezone.now() +
                                                      relativedelta(hours=1))
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)
            return False

        # Re-read the process_info file so we have the latest progress data
        # written by the target's upload function.
        self.process_info_obj = read_file(self.process_info_path,
                                          True)[self.action]

        # Check if fixity has failed on any files during a transfer. If so, update the
        # process_info_data file.
        self.upload_fixity = True
        self.upload_failed_fixity = []

        for resource in self.func_dict['file_metadata_list']:
            resource['failed_fixity_info'] = []
            # Ignored files never reached the destination, so a hash mismatch
            # on them is not a fixity failure.
            if resource['destinationHash'] != self.file_hashes[resource['actionRootPath']] \
                    and resource['actionRootPath'] not in self.func_dict['resources_ignored']:
                self.upload_fixity = False
                self.upload_failed_fixity.append(
                    resource['actionRootPath'][len(self.data_directory):])
                resource['failed_fixity_info'].append({
                    'NewGeneratedHash':
                    self.file_hashes[resource['actionRootPath']],
                    'algorithmUsed':
                    self.hash_algorithm,
                    'reasonFixityFailed':
                    "Either the destination did not provide a hash "
                    "or fixity failed during upload."
                })

        # Strip the server created directory prefix of the file paths for ignored and updated files
        resources_ignored = [
            file[len(self.data_directory):]
            for file in self.func_dict['resources_ignored']
        ]
        self.process_info_obj['resources_ignored'] = resources_ignored
        resources_updated = [
            file[len(self.data_directory):]
            for file in self.func_dict['resources_updated']
        ]
        self.process_info_obj['resources_updated'] = resources_updated

        if self.action == 'resource_transfer_in':
            self.keyword_enhancement_successful = True
            if not self.destination_resource_id:
                # A new project was created during upload; use its id.
                self.destination_resource_id = self.func_dict['project_id']
            if self.supports_keywords:
                self.keyword_enhancement_successful, self.destination_initial_keywords = update_targets_keywords(
                    self, self.func_dict['project_id'])

                # Add the destination initial keywords to all keywords for accurate metadata list
                self.all_keywords = self.all_keywords + self.destination_initial_keywords

        self.metadata_validation = create_upload_metadata(
            self, self.func_dict['file_metadata_list'],
            self.func_dict['action_metadata'], self.func_dict['project_id'],
            resources_ignored, resources_updated)
        # Increment process_info one last time
        increment_process_info(self.process_info_path, self.action, 'upload')

        # Validate the final metadata
        upload_message = get_action_message(self, 'Upload', self.upload_fixity,
                                            self.metadata_validation,
                                            self.action_metadata)
        self.process_info_obj['message'] = upload_message

        if self.action == 'resource_upload':
            # Update server process file
            self.process_info_obj['status_code'] = '200'
            self.process_info_obj['status'] = 'finished'
            self.process_info_obj['hash_algorithm'] = self.hash_algorithm
            self.process_info_obj['failed_fixity'] = self.upload_failed_fixity
            self.process_info_obj['upload_status'] = upload_message
            self.process_info_obj['link_to_resource'] = self.func_dict[
                "project_link"]
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)

            # Notify the user by email if they supplied an address.
            if self.email:
                context = {
                    "upload_url": self.func_dict["project_link"],
                    "upload_message": upload_message,
                    "failed_fixity": self.upload_failed_fixity
                }
                email_blaster(self.email, "PresQT Upload Complete", context,
                              "emails/upload_email.html")

        return True
Example #2
0
def figshare_upload_resource(token, resource_id, resource_main_dir,
                             hash_algorithm, file_duplicate_action,
                             process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not available through API

    Raises
    ------
    PresQTResponseException
        If the token is invalid (401), the project/article cannot be found
        (400), or an upload into an existing file is attempted (400).

    FigShare's Upload Process
        1. Initiate new file upload (POST) within the article. Send file size, md5, and name but no file contents yet.
        2. Send a GET request to the 'Uploader Service' to determine that the status is "Pending" and how many parts to split the upload into.
        3. Split the file into the correct number of parts and upload each using a PUT request.
        4. Send a POST request to complete the upload.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        # Normalize any validation failure to a single 401 message.
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # First os.walk() entry: (dirpath, dirnames, filenames) of resource_main_dir.
    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)
    # Update process info file
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to FigShare...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': username}

    # Upload a new project
    if not resource_id:
        # Assumes the bag has a single top-level directory whose name is the
        # project title -- TODO confirm callers guarantee this.
        project_title = os_path[1][0]
        # Create a new project with the name being the top level directory's name.
        project_name, project_id = create_project(project_title, headers,
                                                  token)
        # Create article, for now we'll name it the same as the project
        article_id = create_article(project_title, headers, project_id)
    else:
        # Upload to an existing project. Ids look like "project" or
        # "project:article"; anything deeper is a file and can't be a target.
        split_id = str(resource_id).split(":")
        project_id = split_id[0]

        try:
            project_title = requests.get(
                "https://api.figshare.com/v2/account/projects/{}".format(
                    project_id),
                headers=headers).json()['title']
        except KeyError:
            # No 'title' key means the API returned an error payload.
            raise PresQTResponseException(
                "Project with id, {}, could not be found by the requesting user."
                .format(project_id), status.HTTP_400_BAD_REQUEST)

        if len(split_id) == 1:
            # We only have a project and we need to make a new article id
            # Check to see if an article with this name already exists
            articles = requests.get(
                "https://api.figshare.com/v2/account/projects/{}/articles".
                format(project_id),
                headers=headers).json()
            article_titles = [article['title'] for article in articles]
            new_title = get_duplicate_title(project_title, article_titles,
                                            "(PresQT*)")
            article_id = create_article(new_title, headers, resource_id)
        elif len(split_id) == 2:
            article_id = split_id[1]
        else:
            # Can't upload to file
            raise PresQTResponseException(
                "Can not upload into an existing file.",
                status.HTTP_400_BAD_REQUEST)

    # Get the article title
    try:
        article_title = requests.get(
            "https://api.figshare.com/v2/account/articles/{}".format(
                article_id),
            headers=headers).json()['title']
    except KeyError:
        raise PresQTResponseException(
            "Article with id, {}, could not be found by the requesting user.".
            format(article_id), status.HTTP_400_BAD_REQUEST)

    # Get md5, size and name of zip file to be uploaded
    for path, subdirs, files in os.walk(resource_main_dir):
        for name in files:
            # Use a context manager so the file handle is always closed;
            # the original code leaked one open handle per uploaded file.
            with open(os.path.join(path, name), 'rb') as file_info:
                zip_hash = hash_generator(file_info.read(), 'md5')

                figshare_file_upload_process(file_info,
                                             headers,
                                             name,
                                             article_id,
                                             file_type='zip',
                                             path=path)

            file_metadata_list.append({
                'actionRootPath':
                os.path.join(path, name),
                'destinationPath':
                '/{}/{}/{}'.format(project_title, article_title, name),
                'title':
                name,
                'destinationHash':
                zip_hash
            })
            increment_process_info(process_info_path, action, 'upload')

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": "{}:{}".format(project_id, article_id),
        "project_link": "https://figshare.com/account/home#/projects"
    }
Example #3
0
def download_directory(header, path_to_resource, repo_data, process_info_path,
                       action):
    """
    Go through a repo's tree and download all files inside of a given resource directory path.

    Parameters
    ----------
    header: dict
        API header expected by GitHub
    path_to_resource: str
        The path to the requested directory
    repo_data: dict
        Repository data gathered in the repo GET request
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    A list of dictionaries for each file being downloaded
    """
    repo_name = repo_data['name']
    # Strip {/sha} off the end
    trees_url = '{}/master?recursive=1'.format(repo_data['trees_url'][:-6])
    contents = requests.get(trees_url, headers=header).json()

    # Only blobs (files) under the requested path count toward progress.
    number_of_files = len([
        file for file in contents['tree']
        if file['path'].startswith(path_to_resource) and file['type'] == 'blob'
    ])
    # Add the total number of repository to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, number_of_files, action, 'download')
    update_process_info_message(process_info_path, action,
                                'Downloading files from GitHub...')

    files = []
    for resource in contents['tree']:
        if resource['path'].startswith(
                path_to_resource) and resource['type'] == 'blob':
            # Strip the requested directory's parents off the directory path
            path_to_strip = path_to_resource.rpartition('/')[0]
            if path_to_strip:
                directory_path = '{}'.format(
                    resource['path'].partition(path_to_strip)[2])
            else:
                directory_path = '/{}'.format(resource['path'])

            # Bug fix: pass the auth header here as well (the tree fetch above
            # already does); without it, blobs in private repos fail and
            # unauthenticated rate limits apply.
            file_data = requests.get(resource['url'], headers=header).json()

            files.append({
                'file':
                base64.b64decode(file_data['content']),
                'hashes': {},
                # Bug fix: rpartition('/')[0] is the PARENT DIRECTORY path;
                # the file's title is the segment after the final '/', i.e.
                # rpartition('/')[2], matching 'title' usage elsewhere.
                'title':
                resource['path'].rpartition('/')[2],
                'path':
                directory_path,
                'source_path':
                '/{}/{}'.format(repo_name, resource['path']),
                'extra_metadata': {}
            })
            # Increment the number of files done in the process info file.
            increment_process_info(process_info_path, action, 'download')
    return files
Example #4
0
    def _download_resource(self):
        """
        Downloads the resources from the target, performs a fixity check,
        zips them up in BagIt format.

        Expects the caller to have populated instance state (self.action,
        self.source_target_name, self.source_token, self.ticket_path, ...).
        Progress and final status are written to the job's process_info file.

        Returns
        -------
        bool
            True on success; False if the target's download function raised
            a PresQTResponseException (process_info is then marked 'failed'
            and its expiration shortened to one hour).
        """
        action = 'resource_download'

        # Write the process id to the process_info file
        self.process_info_obj[
            'function_process_id'] = self.function_process.pid
        update_or_create_process_info(self.process_info_obj, self.action,
                                      self.ticket_number)

        # Fetch the proper function to call
        func = FunctionRouter.get_function(self.source_target_name, action)

        # Fetch the resources. func_dict is in the format:
        #   {
        #       'resources': files,
        #       'empty_containers': empty_containers,
        #       'action_metadata': action_metadata
        #   }
        try:
            func_dict = func(self.source_token, self.source_resource_id,
                             self.process_info_path, self.action)
            # If the resource is being transferred, has only one file, and that file is the
            # PresQT metadata then raise an error.
            if self.action == 'resource_transfer_in' and \
                    len(func_dict['resources']) == 1 \
                    and func_dict['resources'][0]['title'] == 'PRESQT_FTS_METADATA.json':
                raise PresQTResponseException(
                    'PresQT Error: PresQT FTS metadata cannot not be transferred by itself.',
                    status.HTTP_400_BAD_REQUEST)
        except PresQTResponseException as e:
            # TODO: Functionalize this error section
            # Catch any errors that happen within the target fetch.
            # Update the server process_info file appropriately.
            self.process_info_obj['status_code'] = e.status_code
            self.process_info_obj['status'] = 'failed'
            if self.action == 'resource_transfer_in':
                self.process_info_obj['download_status'] = 'failed'
            self.process_info_obj['message'] = e.data
            # Update the expiration from 5 hours to 1 hour from now. We can delete this faster because
            # it's an incomplete/failed directory.
            self.process_info_obj['expiration'] = str(timezone.now() +
                                                      relativedelta(hours=1))
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)

            return False

        # Get the latest contents of the job's process_info.json file
        self.process_info_obj = read_file(self.process_info_path,
                                          True)[self.action]

        # The directory all files should be saved in.
        self.resource_main_dir = os.path.join(self.ticket_path,
                                              self.base_directory_name)
        update_process_info_message(
            self.process_info_path, self.action,
            'Performing fixity checks and gathering metadata...')

        self.extra_metadata = func_dict['extra_metadata']
        # For each resource, perform fixity check, gather metadata, and save it to disk.
        fixity_info = []
        self.download_fixity = True
        self.download_failed_fixity = []
        self.source_fts_metadata_actions = []
        self.new_fts_metadata_files = []
        self.all_keywords = []
        self.initial_keywords = []
        self.manual_keywords = []
        self.enhanced_keywords = []
        for resource in func_dict['resources']:
            # Perform the fixity check and add extra info to the returned fixity object.
            # Note: This method of calling the function needs to stay this way for test Mock
            fixity_obj, self.download_fixity = download_fixity_checker.download_fixity_checker(
                resource)
            fixity_info.append(fixity_obj)

            if not fixity_obj['fixity']:
                self.download_failed_fixity.append(resource['path'])

            # Create metadata for this resource or validate the metadata file
            if resource['title'] == 'PRESQT_FTS_METADATA.json':
                is_valid = validate_metadata(self, resource)
                if not is_valid:
                    # Keep the invalid file on disk but rename it so it is
                    # not mistaken for valid FTS metadata downstream.
                    resource['path'] = resource['path'].replace(
                        'PRESQT_FTS_METADATA.json',
                        'INVALID_PRESQT_FTS_METADATA.json')
                    create_download_metadata(self, resource, fixity_obj)
                    write_file(
                        '{}{}'.format(self.resource_main_dir,
                                      resource['path']), resource['file'])
            else:
                create_download_metadata(self, resource, fixity_obj)
                write_file(
                    '{}{}'.format(self.resource_main_dir, resource['path']),
                    resource['file'])

        # Enhance the source keywords
        self.keyword_dict = {}
        if self.action == 'resource_transfer_in':
            if self.supports_keywords:
                if self.keyword_action == 'automatic':
                    self.keyword_dict = automatic_keywords(self)
                elif self.keyword_action == 'manual':
                    self.keyword_dict = manual_keywords(self)
        self.keyword_enhancement_successful = True

        # Create PresQT action metadata
        update_process_info_message(self.process_info_path, self.action,
                                    "Creating PRESQT_FTS_METADATA...")
        self.source_username = func_dict['action_metadata']['sourceUsername']
        if self.action == 'resource_transfer_in':
            source_target_data = get_target_data(self.source_target_name)
            destination_target_data = get_target_data(
                self.destination_target_name)
            self.details = "PresQT Transfer from {} to {}".format(
                source_target_data['readable_name'],
                destination_target_data['readable_name'])
        else:
            source_target_data = get_target_data(self.source_target_name)
            self.details = "PresQT Download from {}".format(
                source_target_data['readable_name'])

        self.action_metadata = {
            'id': str(uuid4()),
            'details': self.details,
            'actionDateTime': str(timezone.now()),
            'actionType': self.action,
            'sourceTargetName': self.source_target_name,
            'sourceUsername': self.source_username,
            'destinationTargetName': 'Local Machine',
            'destinationUsername': None,
            'keywords': self.keyword_dict,
            'files': {
                'created': self.new_fts_metadata_files,
                'updated': [],
                'ignored': []
            }
        }

        # TODO: Move this up to make it occur after we loop through func_dict['resources'] and write
        # resources
        # Write empty containers to disk
        for container_path in func_dict['empty_containers']:
            # Make sure the container_path has a '/' and the beginning and end
            if container_path[-1] != '/':
                container_path += '/'
            if container_path[0] != '/':
                container_path = '/' + container_path
            os.makedirs(
                os.path.dirname('{}{}'.format(self.resource_main_dir,
                                              container_path)))

        # If we are transferring the downloaded resource then bag it for the resource_upload method
        if self.action == 'resource_transfer_in':
            self.action_metadata[
                'destinationTargetName'] = self.destination_target_name

            # Make a BagIt 'bag' of the resources.
            bagit.make_bag(self.resource_main_dir,
                           checksums=['md5', 'sha1', 'sha256', 'sha512'])
            self.process_info_obj['download_status'] = get_action_message(
                self, 'Download', self.download_fixity, True,
                self.action_metadata)
            return True
        # If we are only downloading the resource then create metadata, bag, zip,
        # and update the server process file.
        else:
            # Create Metadata file
            final_fts_metadata_data = create_fts_metadata(
                self.all_keywords, self.action_metadata,
                self.source_fts_metadata_actions, self.extra_metadata)

            # Validate the final metadata
            metadata_validation = schema_validator(
                'presqt/json_schemas/metadata_schema.json',
                final_fts_metadata_data)
            self.process_info_obj['message'] = get_action_message(
                self, 'Download', self.download_fixity, metadata_validation,
                self.action_metadata)

            # Make a BagIt 'bag' of the resources.
            bagit.make_bag(self.resource_main_dir,
                           checksums=['md5', 'sha1', 'sha256', 'sha512'])

            # Write metadata file.
            write_file(
                os.path.join(self.resource_main_dir,
                             'PRESQT_FTS_METADATA.json'),
                final_fts_metadata_data, True)

            # Add the fixity file to the disk directory
            write_file(
                os.path.join(self.resource_main_dir, 'fixity_info.json'),
                fixity_info, True)

            # Zip the BagIt 'bag' to send forward.
            zip_directory(self.resource_main_dir,
                          "{}.zip".format(self.resource_main_dir),
                          self.ticket_path)

            # Everything was a success so update the server metadata file.
            self.process_info_obj['status_code'] = '200'
            self.process_info_obj['status'] = 'finished'
            self.process_info_obj['zip_name'] = '{}.zip'.format(
                self.base_directory_name)
            self.process_info_obj[
                'failed_fixity'] = self.download_failed_fixity
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)
            if self.email:
                # Build link to retrieve the download
                download_reverse = reverse('job_status',
                                           kwargs={
                                               "action": "download",
                                               "response_format": "zip"
                                           })
                download_url = self.request.build_absolute_uri(
                    download_reverse)
                final_download_url = "{}?ticket_number={}".format(
                    download_url, self.ticket_number)
                context = {
                    "download_url": final_download_url,
                    "download_message": self.process_info_obj['message'],
                    "failed_fixity": self.process_info_obj['failed_fixity']
                }
                email_blaster(self.email, "PresQT Download Complete", context,
                              "emails/download_email.html")

        return True
Example #5
0
def zenodo_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found under ``resource_main_dir`` to Zenodo.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. If falsy, a new project is created.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path : str
        Path to the process info file that keeps track of the action's progress.
    action : str
        The action being performed.

    Returns
    -------
    dict
        Upload summary produced by ``zenodo_upload_loop``; expected keys are
        'resources_ignored', 'resources_updated', 'action_metadata',
        'file_metadata_list', 'project_id', and 'project_link'.
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTValidationError:
        raise PresQTValidationError("Token is invalid. Response returned a 401 status code.",
                                    status.HTTP_401_UNAUTHORIZED)

    os_path = next(os.walk(resource_main_dir))

    # Seed the process-info file so the caller can poll upload progress.
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action, "Uploading files to Zenodo...")

    # The destination username is not populated for Zenodo uploads.
    action_metadata = {"destinationUsername": None}

    # Since Zenodo is a finite depth target, the checks for path validity have already been done.
    if resource_id:
        # Uploading into an existing deposition: fetch it to reuse its title.
        deposition = requests.get(
            "https://zenodo.org/api/deposit/depositions/{}".format(resource_id),
            params=auth_parameter).json()
        try:
            final_title = deposition['title']
        except KeyError:
            # No 'title' key means the deposition lookup failed for this user.
            raise PresQTResponseException(
                "Can't find the resource with id {}, on Zenodo".format(resource_id),
                status.HTTP_404_NOT_FOUND)
    else:
        # Creating a new project named after the bag's top-level directory,
        # de-duplicating the title against the user's existing depositions.
        project_title = os_path[1][0]
        depositions = requests.get("https://zenodo.org/api/deposit/depositions",
                                   params=auth_parameter).json()
        existing_titles = [entry['title'] for entry in depositions]
        final_title = get_duplicate_title(project_title, existing_titles, ' (PresQT*)')
        resource_id = zenodo_upload_helper(auth_parameter, final_title)

    post_url = "https://zenodo.org/api/deposit/depositions/{}/files".format(resource_id)
    return zenodo_upload_loop(action_metadata, resource_id, resource_main_dir,
                              post_url, auth_parameter, final_title, file_duplicate_action,
                              process_info_path, action)
Example #6
0
def zenodo_download_resource(token: str, resource_id: str, process_info_path: str, action: str) -> dict:
    """
    Fetch the requested resource from Zenodo along with its hash information.

    Parameters
    ----------
    token : str
        User's Zenodo token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'metadata': {
                                'sourcePath': '/full/path/at/source.jpg',
                                'title': 'file_title',
                                'sourceHashes': {'hash_algorithm': 'the_hash'},
                                'extra': {'any': 'extra'}
                             }
                         }
        'empty_containers: List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/]
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }
        'extra_metadata': Dictionary of additional project-level metadata gathered by
                          extra_metadata_helper (empty when a single file is downloaded).
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            'Token is invalid. Response returned a 401 status code.',
            status.HTTP_401_UNAUTHORIZED)
    files = []
    empty_containers = []
    extra_metadata = {}
    # base_url doubles as a "found" flag: it stays None if the file id below
    # cannot be matched to either a public record or one of the user's depositions.
    base_url = None

    # If the resource_id is longer than 7 characters, the resource is an individual file
    # (heuristic: deposition/record ids are shorter — TODO confirm this assumption holds).
    if len(resource_id) > 7:
        # First we need to check if the file id given belongs to a public published record.
        zenodo_file = requests.get(
            'https://zenodo.org/api/files/{}'.format(resource_id),
            params=auth_parameter)
        if zenodo_file.status_code != 200:
            # If not, we need to loop through their depositions to look for the file.
            zenodo_projects = requests.get(
                'https://zenodo.org/api/deposit/depositions',
                params=auth_parameter).json()
            # Nested for/else search: the inner `else` runs only when the inner
            # loop finishes without `break` (file not in this deposition), in
            # which case we continue to the next deposition. A match breaks out
            # of both loops with base_url/file_url/is_record set.
            for entry in zenodo_projects:
                project_files = requests.get(entry['links']['self'],
                                             params=auth_parameter).json()
                for file in project_files['files']:
                    if file['id'] == resource_id:
                        base_url = entry['links']['self']
                        file_url = file['links']['self']
                        # Found in a private deposition, not a public record.
                        is_record = False
                        break
                else:
                    # If the file wasn't found we want to continue the loop.
                    continue
                break
        else:
            # The file belongs to a public published record.
            is_record = True
            base_url = 'https://zenodo.org/api/files/{}'.format(resource_id)
            file_url = 'https://zenodo.org/api/files/{}'.format(resource_id)

        if base_url is None:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".
                format(resource_id), status.HTTP_404_NOT_FOUND)

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of projects to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        files, action_metadata = zenodo_download_helper(
            is_record, base_url, auth_parameter, files, file_url)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    # Otherwise, it's a full project
    else:
        # Try the public-record endpoint first; fall back to the user's
        # private deposition endpoint if the record isn't public.
        base_url = 'https://zenodo.org/api/records/{}'.format(resource_id)
        zenodo_record = requests.get(base_url, params=auth_parameter)
        is_record = True
        if zenodo_record.status_code != 200:
            base_url = 'https://zenodo.org/api/deposit/depositions/{}'.format(
                resource_id)
            is_record = False
        try:
            files, action_metadata = zenodo_download_helper(
                is_record, base_url, auth_parameter, files)
        except PresQTResponseException:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".
                format(resource_id), status.HTTP_404_NOT_FOUND)

        extra_metadata = extra_metadata_helper(base_url, is_record,
                                               auth_parameter)
        file_urls = [file['file'] for file in files]

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Add the total number of projects to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action,
                            'download')

        # Download all file contents concurrently on a fresh event loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, auth_parameter, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #7
0
def github_upload_resource(token, resource_id, resource_main_dir,
                           hash_algorithm, file_duplicate_action,
                           process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not available through API
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)
    os_path = next(os.walk(resource_main_dir))
    # Seed the process-info file with the total file count so progress can be tracked.
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to GitHub...")

    # Upload a new repository
    if not resource_id:
        # Create a new repository with the name being the top level directory's name.
        # Note: GitHub doesn't allow spaces, parentheses, or colons in repo names.
        repo_title = os_path[1][0].replace(' ', '_').replace("(", "-").replace(
            ")", "-").replace(":", "-")
        repo_name, repo_id, repo_url = create_repository(repo_title, token)
        resources_ignored = []
        resources_updated = []
        action_metadata = {"destinationUsername": username}
        file_metadata_list = []
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                # Empty directories are reported as ignored.
                resources_ignored.append(path)
            for name in files:
                # Extract and encode the file bytes in the way expected by GitHub.
                # Fix: context manager ensures the file handle is closed.
                with open(os.path.join(path, name), 'rb') as file_obj:
                    file_bytes = file_obj.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                # A relative path to the file is what is added to the GitHub PUT address
                path_to_add = os.path.join(path.partition('/data/')[2], name)
                path_to_add_to_url = path_to_add.partition('/')[2].replace(
                    ' ', '_')
                finished_path = '/' + repo_name + '/' + path_to_add_to_url
                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": finished_path,
                    "title": name,
                    "destinationHash": None
                })
                put_url = "https://api.github.com/repos/{}/{}/contents/{}".format(
                    username, repo_name, path_to_add_to_url)
                data = {
                    "message": "PresQT Upload",
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                file_response = requests.put(put_url,
                                             headers=header,
                                             data=json.dumps(data))
                if file_response.status_code != 201:
                    raise PresQTResponseException(
                        "Github returned the following error: '{}'".format(
                            str(file_response.json()['message'])),
                        status.HTTP_400_BAD_REQUEST)

                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')
    else:
        # Upload to an existing repository
        if ':' not in resource_id:
            repo_id = resource_id
            path_to_upload_to = ''
        # Upload to an existing directory
        else:
            partitioned_id = resource_id.partition(':')
            repo_id = partitioned_id[0]
            path_to_upload_to = '/{}'.format(partitioned_id[2]).replace(
                '%2F', '/').replace('%2E', '.')

        # Get initial repo data for the resource requested
        repo_url = 'https://api.github.com/repositories/{}'.format(repo_id)
        response = requests.get(repo_url, headers=header)

        if response.status_code != 200:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.
                format(resource_id), status.HTTP_404_NOT_FOUND)
        repo_data = response.json()
        repo_name = repo_data['name']
        repo_url = repo_data['svn_url']

        # Get all repo resources so we can check if any files already exist.
        # Try the 'master' branch first; a 'message' key signals an API error,
        # in which case fall back to 'main'.
        repo_resources = requests.get('{}/master?recursive=1'.format(
            repo_data['trees_url'][:-6]),
                                      headers=header).json()
        if 'message' in repo_resources:
            repo_resources = requests.get('{}/main?recursive=1'.format(
                repo_data['trees_url'][:-6]),
                                          headers=header).json()
        current_file_paths = ['/' + resource['path']
                              for resource in repo_resources['tree']
                              if resource['type'] == 'blob']

        # Check if the provided path to upload to is actually a path to an existing file
        if path_to_upload_to in current_file_paths:
            raise PresQTResponseException(
                'The Resource provided, {}, is not a container'.format(
                    resource_id), status.HTTP_400_BAD_REQUEST)

        resources_ignored = []
        resources_updated = []
        file_metadata_list = []
        action_metadata = {"destinationUsername": username}

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Fix: reset sha per file. Previously it was initialized once
                # before the loop, so a new (non-duplicate) file processed after
                # an updated duplicate was PUT with the previous file's stale sha.
                sha = None
                path_to_file = os.path.join('/',
                                            path.partition('/data/')[2],
                                            name).replace(' ', '_')

                # Check if the file already exists in this repository
                full_file_path = '{}{}'.format(path_to_upload_to, path_to_file)
                if full_file_path in current_file_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(os.path.join(path, name))
                        continue
                    else:
                        resources_updated.append(os.path.join(path, name))
                        # GitHub requires the existing blob's sha to update a file in place.
                        sha_url = 'https://api.github.com/repos/{}/contents{}'.format(
                            repo_data['full_name'], full_file_path)
                        sha_response = requests.get(sha_url, headers=header)
                        sha = sha_response.json()['sha']

                # Extract and encode the file bytes in the way expected by GitHub.
                # Fix: context manager ensures the file handle is closed.
                with open(os.path.join(path, name), 'rb') as file_obj:
                    file_bytes = file_obj.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                # A relative path to the file is what is added to the GitHub PUT address
                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": '/{}{}{}'.format(repo_name,
                                                        path_to_upload_to,
                                                        path_to_file),
                    "title": name,
                    "destinationHash": None
                })
                put_url = 'https://api.github.com/repos/{}/contents{}{}'.format(
                    repo_data['full_name'], path_to_upload_to, path_to_file)

                data = {
                    "message": "PresQT Upload",
                    "sha": sha,
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                upload_response = requests.put(put_url,
                                               headers=header,
                                               data=json.dumps(data))

                if upload_response.status_code not in [200, 201]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)
                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': repo_id,
        "project_link": repo_url
    }
Example #8
0
def osf_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                        file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to OSF.

    Parameters
    ----------
    token : str
        User's OSF token.
    resource_id : str
        ID of the resource requested. If falsy, a brand-new project is created.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
    process_info_path : str
        Path to the process info file that keeps track of the action's progress.
    action : str
        The action being performed.

    Returns
    -------
    dict
        'resources_ignored': paths of files skipped during upload.
        'resources_updated': paths of files overwritten during upload.
        'action_metadata': {'destinationUsername': <uploading user's full name>}.
        'file_metadata_list': one dict per uploaded file with 'actionRootPath',
            'destinationPath', 'title', and 'destinationHash' (reduced to the
            value for ``hash_algorithm``).
        'project_id': ID of the parent project (needed for metadata upload).
        'project_link': URL of the project on osf.io.
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Look up the uploading user's display name for the action metadata.
    user_response = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={
            'Authorization': 'Bearer {}'.format(token)
        })
    contributor_name = user_response.json()['data']['attributes']['full_name']
    action_metadata = {"destinationUsername": contributor_name}

    # These collections are mutated in place by create_directory().
    hashes = {}
    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    # Seed the process-info file so upload progress can be tracked.
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to OSF...")

    if resource_id:
        # Uploading into an existing container.
        resource = get_osf_resource(resource_id, osf_instance)
        kind = resource.kind_name

        # Files cannot receive uploads — only containers can.
        if kind == 'file':
            raise PresQTResponseException(
                "The Resource provided, {}, is not a container".format(
                    resource_id), status.HTTP_400_BAD_REQUEST)

        if kind == 'project':
            project = resource
            project_id = project.id
            project.storage('osfstorage').create_directory(
                resource_main_dir, file_duplicate_action, hashes,
                resources_ignored, resources_updated, file_metadata_list,
                process_info_path, action)
        else:
            # Folder or Storage container.
            resource.create_directory(resource_main_dir, file_duplicate_action,
                                      hashes, resources_ignored,
                                      resources_updated, file_metadata_list,
                                      process_info_path, action)
            # Resolve the owning project for the later metadata work.
            if kind == 'storage':
                project_id = resource.node
            else:
                project_id = resource.parent_project_id
            project = osf_instance.project(project_id)
    else:
        # Uploading a brand-new project.
        walk_root = next(os.walk(resource_main_dir))
        top_level_name = walk_root[1][0]

        # The actual data to upload lives under the top-level directory.
        data_to_upload_path = '{}/{}'.format(walk_root[0], top_level_name)

        # New project is named after the top-level directory.
        project = osf_instance.create_project(top_level_name)
        project_id = project.id

        # Upload resources into OSFStorage for the new project.
        project.storage('osfstorage').create_directory(
            data_to_upload_path, file_duplicate_action, hashes,
            resources_ignored, resources_updated, file_metadata_list,
            process_info_path, action)

    for metadata in file_metadata_list:
        # Only send forward the hash we need based on the hash_algorithm provided.
        metadata['destinationHash'] = metadata['destinationHash'][hash_algorithm]
        # Prepend the project title to each resource's metadata destinationPath.
        metadata['destinationPath'] = '/{}/{}'.format(
            project.title, metadata['destinationPath'])

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        "project_link": "https://osf.io/{}".format(project_id)
    }
Example #9
0
def osf_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from OSF along with its hash information.

    Parameters
    ----------
    token : str
        User's OSF token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path: '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers: List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/]
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                                'sourceUsername': '******',
                              }
        'extra_metadata': Dictionary of project-level metadata gathered by
                          extra_metadata_helper (only populated for project resources).
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)
    # Get contributor name for the action metadata.
    contributor_name = requests.get('https://api.osf.io/v2/users/me/',
                                    headers={'Authorization': 'Bearer {}'.format(token)}).json()[
                                        'data']['attributes']['full_name']
    action_metadata = {"sourceUsername": contributor_name}
    # Get the resource
    resource = get_osf_resource(resource_id, osf_instance)

    # Get all files for the provided resources.
    # The 'path' value will be the path that the file is eventually saved in. The root of the
    # path should be the resource.
    files = []
    empty_containers = []
    extra_metadata = {}

    if resource.kind_name == 'file':
        update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
        # Add the total number of projects to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        project = osf_instance.project(resource.parent_project_id)
        files.append({
            "file": resource.download(),
            "hashes": resource.hashes,
            "title": resource.title,
            # If the file is the only resource we are downloading then we don't need it's full path
            "path": '/{}'.format(resource.title),
            "source_path": '/{}/{}{}'.format(project.title, resource.provider, resource.materialized_path),
            "extra_metadata": osf_download_metadata(resource)
        })
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
    else:
        if resource.kind_name == 'project':
            extra_metadata = extra_metadata_helper(resource_id, {'Authorization': 'Bearer {}'.format(token)})
            resource.get_all_files('', files, empty_containers)
            project = resource
        elif resource.kind_name == 'storage':
            resource.get_all_files('/{}'.format(resource.title), files, empty_containers)
            project = osf_instance.project(resource.node)
        else:
            resource.get_all_files('', files, empty_containers)
            project = osf_instance.project(resource.parent_project_id)
            # File Path needs to start at the folder and strip everything before it.
            # Example: If the resource is 'Docs2' and the starting path is
            # '/Project/Storage/Docs1/Docs2/file.jpeg' then the final path
            # needs to be '/Docs2/file.jpeg'
            # Hoisted out of the loop: the prefix depends only on the resource,
            # so computing it once avoids redundant work per file.
            path_to_strip = resource.materialized_path[:-(len(resource.title) + 2)]
            for file in files:
                file['path'] = file['file'].materialized_path[len(path_to_strip):]

        file_urls = [file['file'].download_url for file in files]

        update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
        # Add the total number of projects to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action, 'download')

        # Asynchronously make all download requests
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(async_main(file_urls, token, process_info_path, action))

        # Go through the file dictionaries and replace the file class with the binary_content
        for file in files:
            file['source_path'] = '/{}/{}{}'.format(project.title,
                                                    file['file'].provider,
                                                    file['file'].materialized_path)
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'].download_url)['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #10
0
def figshare_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from FigShare along with its hash information.

    Parameters
    ----------
    token : str
        User's FigShare token
    resource_id : str
        ID of the resource requested. One of 'project', 'project:article',
        or 'project:article:file'.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': 'some_username',
                              }
        'extra_metadata': Dictionary of extra project-level metadata; only
                          populated for whole-project downloads.

    Raises
    ------
    PresQTResponseException
        401 if the token is invalid; 404 if the project, article, or file
        cannot be found by the requesting user.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Ids come in as 'project', 'project:article', or 'project:article:file'.
    split_id = str(resource_id).split(":")
    extra_metadata = {}
    # Initialize the remaining return pieces up front so a malformed
    # resource_id (more than three segments) produces a well-formed payload
    # instead of a NameError at the final return.
    empty_containers = []
    action_metadata = {"sourceUsername": username}

    # But first we need to see whether it is a public project, or a private project.
    project_url = "https://api.figshare.com/v2/account/projects/{}".format(
        split_id[0])
    response = requests.get(project_url, headers=headers)
    if response.status_code != 200:
        # Looking for a private project was unsuccessful, try a public project.
        project_url = "https://api.figshare.com/v2/projects/{}".format(
            split_id[0])
        response = requests.get(project_url, headers=headers)
        if response.status_code != 200:
            # Project id is invalid
            raise PresQTResponseException(
                "The resource could not be found by the requesting user.",
                status.HTTP_404_NOT_FOUND)
    data = response.json()
    project_name = data['title']

    # Flags to be used for file checks.
    file_urls = None
    files = None

    if len(split_id) == 1:
        # Download the contents of the project and build the list of file urls to download.
        articles_url = project_url + "/articles"
        files, empty_containers, action_metadata = download_project(
            username, articles_url, headers, project_name, [])
        file_urls = [file['file'] for file in files]
        extra_metadata = extra_metadata_helper(project_url, headers)

    elif len(split_id) == 2 or len(split_id) == 3:
        # We have an article or a file so we need to get the article url
        article_url = "https://api.figshare.com/v2/account/projects/{}/articles/{}".format(
            split_id[0], split_id[1])
        response = requests.get(article_url, headers=headers)

        if response.status_code != 200:
            # Let's see if this is a public article....
            article_url = "https://api.figshare.com/v2/articles/{}".format(
                split_id[1])
            response = requests.get(article_url, headers=headers)

            if response.status_code != 200:
                # We couldn't find the article.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)
        if len(split_id) == 2:
            # Download the contents of the article and build the list of file urls to download.
            files, empty_containers, action_metadata = download_article(
                username, article_url, headers, project_name, [])
            file_urls = [file['file'] for file in files]

        elif len(split_id) == 3:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from FigShare...')
            # Add the total number of files to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path, 1, action, 'download')

            # Single file download: find the matching file id on the article.
            data = response.json()
            for file in data['files']:
                if str(file['id']) == split_id[2]:
                    files = [{
                        "file": requests.get(file['download_url'],
                                             headers=headers).content,
                        "hashes": {"md5": file['computed_md5']},
                        "title": file['name'],
                        "path": "/{}".format(file['name']),
                        "source_path": "/{}/{}/{}".format(
                            project_name, data['title'], file['name']),
                        "extra_metadata": {"size": file['size']}
                    }]
                    # Increment the number of files done in the process info file.
                    increment_process_info(process_info_path, action,
                                           'download')
            if not files:
                # We could not find the file.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)
    if file_urls:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from FigShare...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action,
                            'download')

        # Start the async calls for project or article downloads
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            download_data = loop.run_until_complete(
                async_main(file_urls, headers, process_info_path, action))
        finally:
            # Always release the event loop, even if a download errored out.
            loop.close()

        # Go through the file dictionaries and replace the file url with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #11
0
def curate_nd_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from CurateND along with its hash information.

    Parameters
    ----------
    token : str
        User's CurateND token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': 'some_username',
                              }
        'extra_metadata': Dictionary of extra item-level metadata; only
                          populated for non-empty container downloads.

    Raises
    ------
    PresQTValidationError
        If the token is invalid (401).
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTValidationError(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Get the resource
    resource = get_curate_nd_resource(resource_id, curate_instance)
    action_metadata = {"sourceUsername": resource.extra['depositor']}
    extra_metadata = {}

    # Get all the files for the provided resources.
    files = []
    empty_containers = []
    if resource.kind_name == 'file':
        # 'isPartOf' may be a single url or a list of urls; use the first one.
        title_url = resource.extra['isPartOf']
        if isinstance(title_url, list):
            title_url = title_url[0]
        # Get the title of the Project to add to sourcePath
        project_title = requests.get(title_url,
                                     headers={
                                         'X-Api-Token': '{}'.format(token)
                                     }).json()['title']

        # This is so we aren't missing the few extra keys that are pulled out for the PresQT payload
        resource.extra.update({
            "id": resource.id,
            "date_submitted": resource.date_submitted
        })

        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # Add the total number of items to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        binary_file, curate_hash = resource.download()

        files.append({
            'file': binary_file,
            'hashes': {'md5': curate_hash},
            'title': resource.title,
            # If the file is the only resource we are downloading then we don't need it's full path.
            'path': '/{}'.format(resource.title),
            'source_path': '/{}/{}'.format(project_title, resource.title),
            'extra_metadata': resource.extra
        })

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    else:
        if not resource.extra['containedFiles']:
            # Nothing to download; record the empty container so it can be recreated.
            empty_containers.append('{}'.format(resource.title))
        else:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from CurateND...')
            # Add the total number of items to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path,
                                len(resource.extra['containedFiles']), action,
                                'download')

            # Lookup tables keyed by download url so async results can be
            # matched back to their titles/hashes/extra metadata.
            title_helper = {}
            hash_helper = {}
            file_urls = []
            project_title = resource.title
            file_metadata = []
            extra_metadata = extra_metadata_helper(resource)

            for file in resource.extra['containedFiles']:
                download_url = file['downloadUrl']
                contained_file = get_curate_nd_resource(
                    file['id'], curate_instance)
                file_metadata.append({
                    "title": contained_file.title,
                    "extra": contained_file.extra
                })

                title_helper[download_url] = contained_file.title
                hash_helper[download_url] = contained_file.md5
                # NOTE(review): this overwrites the entry above for the same
                # url, so the file's 'label' wins over the resource title —
                # preserved as-is; confirm this is intentional.
                title_helper[file['downloadUrl']] = file['label']
                file_urls.append(file['downloadUrl'])

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                download_data = loop.run_until_complete(
                    async_main(file_urls, token, process_info_path, action))
            finally:
                # Always release the event loop, even if a download errored out.
                loop.close()

            for file in download_data:
                title = title_helper[file['url']]
                # Renamed from 'hash' to avoid shadowing the builtin.
                md5_hash = hash_helper[file['url']]
                files.append({
                    'file': file['binary_content'],
                    'hashes': {'md5': md5_hash},
                    'title': title,
                    "source_path": '/{}/{}'.format(project_title, title),
                    'path': '/{}/{}'.format(resource.title, title),
                    'extra_metadata': get_dictionary_from_list(
                        file_metadata, 'title', title)['extra']
                })

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #12
0
def gitlab_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from GitLab along with its hash information.

    Parameters
    ----------
    token : str
        User's GitLab token
    resource_id : str
        ID of the resource requested. A bare project id, 'project:dir+path'
        for a directory, or 'project:encoded%2Efile%2Epath' for a file.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': 'some_username',
                              }
        'extra_metadata': Dictionary of extra project metadata; only populated
                          for whole-project downloads.

    Raises
    ------
    PresQTResponseException
        401 if the token is invalid; 404 if the resource does not exist for this user.
    """
    try:
        header, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get the user's GitLab username for action metadata
    username = requests.get("https://gitlab.com/api/v4/user", headers=header).json()['username']

    partitioned_id = resource_id.partition(':')
    if ':' in resource_id:
        project_id = partitioned_id[0]
    else:
        project_id = resource_id

    project_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(
            'The resource with id, {}, does not exist for this user.'.format(resource_id),
            status.HTTP_404_NOT_FOUND)

    project_name = response.json()['name']
    extra_metadata = {}
    if ':' not in resource_id:
        # This is for a project
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1".format(
            resource_id)
        data = gitlab_paginated_data(header, user_id, all_files_url)
        is_project = True
        # Get extra metadata
        extra_metadata = extra_metadata_helper(response.json(), header)

    elif ':' in resource_id and '%2E' not in resource_id:
        # This is for a directory
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?path={}&recursive=1".format(
            partitioned_id[0], partitioned_id[2].replace('+', ' '))
        data = gitlab_paginated_data(header, user_id, all_files_url)
        if not data:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)
        is_project = False

    else:
        update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        # This is a single file; its content comes back base64-encoded.
        data = requests.get('https://gitlab.com/api/v4/projects/{}/repository/files/{}?ref=master'.format(
            project_id, partitioned_id[2].replace('+', ' ')), headers=header).json()
        if 'message' in data.keys():
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.format(resource_id),
                status.HTTP_404_NOT_FOUND)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
        return {
            'resources': [{
                'file': base64.b64decode(data['content']),
                'hashes': {'sha256': data['content_sha256']},
                'title': data['file_name'],
                'path': '/{}'.format(data['file_name']),
                'source_path': data['file_path'],
                'extra_metadata': {}}],
            'empty_containers': [],
            'action_metadata': {'sourceUsername': username},
            'extra_metadata': extra_metadata
        }

    files, empty_containers, action_metadata = download_content(
        username, project_name, project_id, data, [], is_project)
    file_urls = [file['file'] for file in files]

    update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
    # Add the total number of files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        download_data = loop.run_until_complete(
            async_main(file_urls, header, process_info_path, action))
    finally:
        # Always release the event loop, even if a download errored out.
        loop.close()

    # Go through the file dictionaries and replace the file path with the binary_content
    # and replace the hashes with the correct file hashes. Look each url up
    # once instead of twice (get_dictionary_from_list scans the whole list).
    for file in files:
        downloaded = get_dictionary_from_list(download_data, 'url', file['file'])
        file['hashes'] = downloaded['hashes']
        file['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #13
0
def gitlab_upload_resource(token, resource_id, resource_main_dir, hash_algorithm, file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. Empty/None to create a new project;
        'project_id' or 'project_id:encoded%2Fpath' to upload into an
        existing project.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
    file_duplicate_action : str
        The action to take when a duplicate file is found ('ignore' or update).
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': 'some_username'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not available through API

    Raises
    ------
    PresQTResponseException
        401 for an invalid token, 400 for a failed project creation/upload or
        a resource_id that points at a file, 404 for a missing project.
    """
    base_url = "https://gitlab.com/api/v4/"

    try:
        headers, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)
    username = requests.get("https://gitlab.com/api/v4/user", headers=headers).json()['username']
    action_metadata = {"destinationUsername": username}

    os_path = next(os.walk(resource_main_dir))
    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to GitLab...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    #*** CREATE NEW PROJECT ***#
    # Create a new project with the name being the top level directory's name.
    # Check if a project with this name exists for this user
    if not resource_id:
        project_title = os_path[1][0]
        titles = [data['name'] for data in gitlab_paginated_data(headers, user_id)]
        title = get_duplicate_title(project_title, titles,
                                    '-PresQT*-').replace('(', '-').replace(')', '-')
        response = requests.post('{}projects?name={}&visibility=public'.format(
            base_url, title), headers=headers)
        if response.status_code == 201:
            project_id = response.json()['id']
            project_name = response.json()['name']
            web_url = response.json()['web_url']
        else:
            raise PresQTResponseException(
                "Response has status code {} while creating project {}.".format(
                    response.status_code, project_title), status.HTTP_400_BAD_REQUEST)

        #*** UPLOAD FILES ***#
        # Upload files to project's repository
        base_repo_path = "{}projects/{}/repository/files/".format(base_url, project_id)
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/{}/'.format(
                    project_title))[2], name)

                # Extract and encode the file bytes in the way expected by GitLab.
                # Use a context manager so the file handle is not leaked.
                with open(os.path.join(path, name), 'rb') as file_obj:
                    file_bytes = file_obj.read()
                encoded_file = base64.b64encode(file_bytes)

                # A relative path to the file is what is added to the GitLab POST address
                encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}

                requests.post("{}{}".format(
                    base_repo_path, encoded_file_path), headers=headers, data=request_data)

                # Get the file hash
                file_json = requests.get("{}{}?ref=master".format(base_repo_path, encoded_file_path),
                                         headers=headers)
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    # This ensures that the title is up to date if there are duplicates
                    "destinationPath": os.path.join(project_name, path.partition(
                        '/data/')[2].partition('/')[2], name),
                    "title": name,
                    "destinationHash": file_json.json()['content_sha256']
                })
    else:
        if ':' not in resource_id:
            project_id = resource_id
            base_repo_url = "{}projects/{}/repository/files/".format(base_url, project_id)
            string_path_to_resource = ''
        else:
            partitioned_id = resource_id.partition(':')
            project_id = partitioned_id[0]
            base_repo_url = "{}projects/{}/repository/files/{}".format(
                base_url, project_id, partitioned_id[2])
            string_path_to_resource = partitioned_id[2].replace('%2F', '/').replace('%2E', '.')

        # Check if the resource_id belongs to a file
        tree_url = 'https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1'.format(
            project_id)
        file_data = gitlab_paginated_data(headers, None, tree_url)
        for data in file_data:
            if data['path'] == string_path_to_resource:
                if data['type'] == 'blob':
                    raise PresQTResponseException("Resource with id, {}, belongs to a file.".format(
                        resource_id), status.HTTP_400_BAD_REQUEST)

        # Get project data
        project = requests.get('{}projects/{}'.format(base_url, project_id), headers=headers)
        if project.status_code != 200:
            raise PresQTResponseException("Project with id, {}, could not be found.".format(
                project_id), status.HTTP_404_NOT_FOUND)
        project_name = project.json()['name']
        web_url = project.json()['web_url']

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/')[2], name)

                # A relative path to the file is what is added to the GitLab POST address
                if base_repo_url == "{}projects/{}/repository/files/".format(base_url, project_id):
                    encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')
                else:
                    encoded_file_path = '%2F{}'.format(
                        relative_file_path.replace('/', '%2F').replace('.', '%2E'))
                full_encoded_url = '{}{}'.format(base_repo_url, encoded_file_path)

                ignore_file = False
                upload_request = requests.post
                file_bytes = None
                # Check if this file exists already
                for file in file_data:
                    if os.path.join(string_path_to_resource, relative_file_path) == file['path']:
                        if file_duplicate_action == 'ignore':
                            resources_ignored.append(os.path.join(path, name))
                            ignore_file = True
                            break
                        else:
                            file_url = '{}?ref=master'.format(full_encoded_url)
                            file_response = requests.get(file_url, headers=headers)
                            # Context manager avoids leaking the file handle.
                            with open(os.path.join(path, name), 'rb') as file_obj:
                                file_bytes = file_obj.read()
                            if hash_generator(file_bytes, 'sha256') == file_response.json()['content_sha256']:
                                resources_ignored.append(os.path.join(path, name))
                                ignore_file = True
                            else:
                                resources_updated.append(os.path.join(path, name))
                                # Duplicate with different content: PUT updates it.
                                upload_request = requests.put

                            # Break out of this for loop and attempt to upload this duplicate
                            break
                # If we find a file to ignore then move onto the next file in the os.walk
                if ignore_file:
                    continue

                # Extract and encode the file bytes in the way expected by GitLab.
                if not file_bytes:
                    with open(os.path.join(path, name), 'rb') as file_obj:
                        file_bytes = file_obj.read()
                encoded_file = base64.b64encode(file_bytes)

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}

                response = upload_request("{}".format(full_encoded_url),
                                          headers=headers, data=request_data)
                if response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(full_encoded_url),
                                         headers=headers).json()
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": os.path.join(path, name),
                    "destinationPath": os.path.join(project_name, path.partition('/data/')[2], name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']
                })

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        'project_link': web_url
    }