Example #1
0
async def async_get(url, session, header, process_info_path, action):
    """
    Fetch a single URL with aiohttp and decode the base64 payload it returns.

    Intended to be scheduled alongside other GET coroutines so many files can
    be fetched concurrently.

    Parameters
    ----------
    url: str
        URL to call
    session: ClientSession object
        aiohttp ClientSession used to perform the request
    header:
        Request headers forwarded to ``session.get`` (presumably a dict)
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the keys 'url' (the requested URL), 'binary_content' (the
    base64-decoded 'content' field of the JSON response) and 'hashes'
    ({'sha256': <the response's 'content_sha256' field>}).

    Raises
    ------
    AssertionError
        If the response status is not 200.
    """
    async with session.get(url, headers=header) as response:
        assert response.status == 200
        payload = await response.json()

        # One more file is done: record it in the process info file.
        increment_process_info(process_info_path, action, 'download')

        decoded_content = base64.b64decode(payload['content'])
        sha256_hash = payload['content_sha256']

    return {
        'url': url,
        'binary_content': decoded_content,
        'hashes': {'sha256': sha256_hash},
    }
Example #2
0
def download_file(repo_data, resource_data, process_info_path, action):
    """
    Package a single requested file into the download resource format.

    Parameters
    ----------
    repo_data: dict
        Repository data gathered in the repo GET request (only 'name' is read)
    resource_data: dict
        Resource data gathered in the resource GET request; 'content' (base64),
        'name' and 'path' are read
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    A one-element list holding the dictionary that describes the file.
    """
    repo_name = repo_data['name']

    # The single file counts as done: bump the counter in the process info file.
    increment_process_info(process_info_path, action, 'download')

    file_entry = {}
    file_entry['file'] = base64.b64decode(resource_data['content'])
    file_entry['hashes'] = {}
    file_entry['title'] = resource_data['name']
    file_entry['path'] = '/{}'.format(resource_data['name'])
    file_entry['source_path'] = '/{}/{}'.format(repo_name,
                                                resource_data['path'])
    file_entry['extra_metadata'] = {}
    return [file_entry]
Example #3
0
async def async_get(url, session, params, process_info_path, action):
    """
    Fetch a single URL with aiohttp and hand back the raw response body.

    Intended to be scheduled alongside other GET coroutines so many files can
    be fetched concurrently.

    Parameters
    ----------
    url: str
        URL to call
    session: ClientSession object
        aiohttp ClientSession used to perform the request
    params:
        Query parameters forwarded to ``session.get(url, params=...)``
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the keys 'url' (the requested URL) and 'binary_content'
    (the unparsed bytes of the response body).

    Raises
    ------
    AssertionError
        If the response status is not 200.
    """
    async with session.get(url, params=params) as response:
        assert response.status == 200
        raw_body = await response.read()

        # One more file is done: record it in the process info file.
        increment_process_info(process_info_path, action, 'download')

    result = {'url': url}
    result['binary_content'] = raw_body
    return result
Example #4
0
File: main.py Project: ndlib/presqt
    def get_resources(self, process_info_path, url=None):
        """
        Collect every resource the user has access to into one flat list.

        Calls are batched asynchronously level by level: projects first, then
        each project's storages, then the resources found in each storage.

        Parameters
        ----------
        process_info_path: str
            Path to the process info file that keeps track of the progress
        url: str
            Optional URL forwarded to ``self.projects``

        Returns
        -------
        The list of resources that the helper methods populated.
        """
        collected = []

        # Projects and subprojects go into the list first.
        all_projects, top_level_projects = self.projects(url)
        self.iter_project_hierarchy(all_projects, top_level_projects,
                                    collected)

        # Then the storages; this also yields the links needed to fetch the
        # initial resources of every storage.
        storage_links = self.iter_project_storages(all_projects, collected)
        storages = run_urls_async_with_pagination(self, storage_links)

        # Record how many storages there are so progress can be reported.
        update_process_info(process_info_path, len(storages),
                            'resource_collection', 'fetch')

        for storage in storages:
            # Every storage counts as processed, even if it is skipped below.
            increment_process_info(process_info_path, 'resource_collection',
                                   'fetch')

            # TODO: the first check avoids private file errors; look into it.
            if not storage or not storage['data']:
                continue

            # The container id is "<parent project id>:<storage provider>",
            # both taken from the first resource in the storage.
            project_id = storage['data'][0]['relationships']['node']['data'][
                'id']
            provider = storage['data'][0]['attributes']['provider']
            container_id = '{}:{}'.format(project_id, provider)

            # Add the storage's resources (helper may descend further).
            self.iter_resources_objects(storage, collected, container_id)

        return collected
Example #5
0
def figshare_upload_resource(token, resource_id, resource_main_dir,
                             hash_algorithm, file_duplicate_action,
                             process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. Falsy means "create a new project";
        otherwise it is split on ':' into '<project_id>[:<article_id>]'.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity.
        Not read by this function; md5 is always used for the uploaded files.
    file_duplicate_action : str
        The action to take when a duplicate file is found.
        Not read by this function.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Always empty for this target.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
         Always empty for this target.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": {'hash_algorithm': 'the_hash'}
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
                      Formatted here as '<project_id>:<article_id>'.
        'project_link': The link to either the resource or the home page of the user if not available through API

    Raises
    ------
    PresQTResponseException
        401 if the token fails validation; 400 if the project or article cannot
        be found, or if resource_id points at a file (more than two ':' parts).

    FigShare's Upload Process
        1. Initiate new file upload (POST) within the article. Send file size, md5, and name but no file contents yet.
        2. Send a GET request to the 'Uploader Service' to determine that the status is "Pending" and how many parts to split the upload into.
        3. Split the file into the correct number of parts and upload each using a PUT request.
        4. Send a POST request to complete the upload.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # First os.walk() entry: (dirpath, dirnames, filenames) of the top level.
    os_path = next(os.walk(resource_main_dir))
    total_files = upload_total_files(resource_main_dir)
    # Update process info file
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to FigShare...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': username}

    # Upload a new project
    if not resource_id:
        # The first top level directory's name becomes the project title.
        project_title = os_path[1][0]
        # Create a new project with the name being the top level directory's name.
        project_name, project_id = create_project(project_title, headers,
                                                  token)
        # Create article, for now we'll name it the same as the project
        article_id = create_article(project_title, headers, project_id)
    else:
        # Upload to an existing project
        split_id = str(resource_id).split(":")
        project_id = split_id[0]

        # A response without a 'title' key is treated as "project not found".
        try:
            project_title = requests.get(
                "https://api.figshare.com/v2/account/projects/{}".format(
                    project_id),
                headers=headers).json()['title']
        except KeyError:
            raise PresQTResponseException(
                "Project with id, {}, could not be found by the requesting user."
                .format(project_id), status.HTTP_400_BAD_REQUEST)

        if len(split_id) == 1:
            # We only have a project and we need to make a new article id
            # Check to see if an article with this name already exists
            articles = requests.get(
                "https://api.figshare.com/v2/account/projects/{}/articles".
                format(project_id),
                headers=headers).json()
            article_titles = [article['title'] for article in articles]
            new_title = get_duplicate_title(project_title, article_titles,
                                            "(PresQT*)")
            article_id = create_article(new_title, headers, resource_id)
        elif len(split_id) == 2:
            article_id = split_id[1]
        else:
            # Can't upload to file
            raise PresQTResponseException(
                "Can not upload into an existing file.",
                status.HTTP_400_BAD_REQUEST)

    # Get the article title
    # A response without a 'title' key is treated as "article not found".
    try:
        article_title = requests.get(
            "https://api.figshare.com/v2/account/articles/{}".format(
                article_id),
            headers=headers).json()['title']
    except KeyError:
        raise PresQTResponseException(
            "Article with id, {}, could not be found by the requesting user.".
            format(article_id), status.HTTP_400_BAD_REQUEST)

    # Get md5, size and name of zip file to be uploaded
    for path, _subdirs, files in os.walk(resource_main_dir):
        for name in files:
            file_path = os.path.join(path, name)
            # Use a context manager so the handle is closed once the upload
            # helper returns, instead of leaking one descriptor per file.
            with open(file_path, 'rb') as file_info:
                zip_hash = hash_generator(file_info.read(), 'md5')

                # NOTE(review): file_info has been read to EOF at this point;
                # presumably the helper seeks or re-reads via `path` - confirm.
                figshare_file_upload_process(file_info,
                                             headers,
                                             name,
                                             article_id,
                                             file_type='zip',
                                             path=path)

            file_metadata_list.append({
                'actionRootPath':
                file_path,
                'destinationPath':
                '/{}/{}/{}'.format(project_title, article_title, name),
                'title':
                name,
                'destinationHash':
                zip_hash
            })
            increment_process_info(process_info_path, action, 'upload')

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": "{}:{}".format(project_id, article_id),
        "project_link": "https://figshare.com/account/home#/projects"
    }
Example #6
0
    def _upload_resource(self):
        """
        Upload resources to the target and perform a fixity check on the resulting hashes.

        Steps, in order:
            1. Record the worker process id in the process info file.
            2. For a plain upload (not a transfer), build the initial FTS metadata from the
               files found in the bag's data directory.
            3. For finite-depth targets, validate the structure and run the finite depth helper.
            4. Look up and call the target's upload function.
            5. Compare the hashes returned by the target with self.file_hashes (fixity).
            6. For transfers, update keywords on the target when it supports them.
            7. Write the upload metadata and, for a plain upload, finalize the process info
               file and optionally send an email.

        Returns
        -------
        True if the upload ran to completion, False if a PresQTResponseException was raised
        during structure validation, the finite depth helper, or the target's upload function
        (the process info file is marked 'failed' in that case).
        """
        # NOTE: this local is only used for the FunctionRouter lookup below. Everything else
        # uses self.action, which can also be 'resource_transfer_in'.
        action = 'resource_upload'
        # This doesn't happen during an upload, so it won't be an error. If there is an error during
        # transfer this will be overwritten.
        self.keyword_enhancement_successful = True
        # Write the process id to the process_info file
        self.process_info_obj[
            'function_process_id'] = self.function_process.pid
        update_or_create_process_info(self.process_info_obj, self.action,
                                      self.ticket_number)

        # Data directory in the bag
        self.data_directory = '{}/data'.format(self.resource_main_dir)

        # If we are uploading (not transferring) then create the initial metadata based on the
        # zipped bag provided.
        if self.action == 'resource_upload':
            update_process_info_message(self.process_info_path, self.action,
                                        "Creating PRESQT_FTS_METADATA...")
            self.new_fts_metadata_files = []
            for path, subdirs, files in os.walk(self.data_directory):
                for name in files:
                    # Paths are stored relative to the data directory (prefix sliced off).
                    # The source and destination paths are identical at this stage.
                    self.new_fts_metadata_files.append({
                        'destinationHashes': {},
                        'destinationPath':
                        os.path.join(path, name)[len(self.data_directory):],
                        'failedFixityInfo': [],
                        'title':
                        name,
                        'sourceHashes': {
                            self.hash_algorithm:
                            self.file_hashes[os.path.join(path, name)]
                        },
                        'sourcePath':
                        os.path.join(path, name)[len(self.data_directory):],
                        'extra': {}
                    })

            destination_target_data = get_target_data(
                self.destination_target_name)
            self.details = "PresQT Upload to {}".format(
                destination_target_data['readable_name'])
            # Usernames are left as None here; the source is always 'Local Machine' for a
            # plain upload.
            self.action_metadata = {
                'id': str(uuid4()),
                'details': self.details,
                'actionDateTime': str(timezone.now()),
                'actionType': self.action,
                'sourceTargetName': 'Local Machine',
                'sourceUsername': None,
                'destinationTargetName': self.destination_target_name,
                'destinationUsername': None,
                'keywords': {},
                'files': {
                    'created': self.new_fts_metadata_files,
                    'updated': [],
                    'ignored': []
                }
            }

        # If the target destination's storage hierarchy has a finite depth then zip the resources
        # to be uploaded along with their metadata.
        # Also, create metadata files for the new zip file to be uploaded.
        if self.infinite_depth is False:
            try:
                structure_validation(self)
                finite_depth_upload_helper(self)
            except PresQTResponseException as e:
                # Catch any errors that happen within the target fetch.
                # Update the server process_info file appropriately.
                self.process_info_obj['status_code'] = e.status_code
                self.process_info_obj['status'] = 'failed'
                if self.action == 'resource_transfer_in':
                    self.process_info_obj['upload_status'] = 'failed'
                self.process_info_obj['message'] = e.data
                # Update the expiration from 5 hours to 1 hour from now. We can delete this faster because
                # it's an incomplete/failed directory.
                self.process_info_obj['expiration'] = str(timezone.now() +
                                                          relativedelta(
                                                              hours=1))
                update_or_create_process_info(self.process_info_obj,
                                              self.action, self.ticket_number)
                return False

        # Fetch the proper function to call
        # (looked up with the local `action`, i.e. always 'resource_upload')
        func = FunctionRouter.get_function(self.destination_target_name,
                                           action)

        # Upload the resources. func_dict has the following format:
        #   {
        #        'resources_ignored': resources_ignored,
        #        'resources_updated': resources_updated,
        #        'action_metadata': action_metadata,
        #        'file_metadata_list': file_metadata_list,
        #        'project_id': title
        #    }
        try:
            # NOTE: for finite-depth targets this is the second structure_validation call.
            structure_validation(self)
            self.func_dict = func(self.destination_token,
                                  self.destination_resource_id,
                                  self.data_directory, self.hash_algorithm,
                                  self.file_duplicate_action,
                                  self.process_info_path, self.action)
        except PresQTResponseException as e:
            # Catch any errors that happen within the target fetch.
            # Update the server process_info file appropriately.
            self.process_info_obj['status_code'] = e.status_code
            self.process_info_obj['status'] = 'failed'
            if self.action == 'resource_transfer_in':
                self.process_info_obj['upload_status'] = 'failed'
            self.process_info_obj['message'] = e.data
            # Update the expiration from 5 hours to 1 hour from now. We can delete this faster
            # because it's an incomplete/failed directory.
            self.process_info_obj['expiration'] = str(timezone.now() +
                                                      relativedelta(hours=1))
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)
            return False

        # Reload this action's entry from the process info file on disk.
        # Presumably the target function updated progress counters there during the upload,
        # so the in-memory copy would be stale - TODO confirm.
        self.process_info_obj = read_file(self.process_info_path,
                                          True)[self.action]

        # Check if fixity has failed on any files during a transfer. If so, update the
        # process_info_data file.
        self.upload_fixity = True
        self.upload_failed_fixity = []

        for resource in self.func_dict['file_metadata_list']:
            resource['failed_fixity_info'] = []
            # Fixity fails when the hash reported by the target differs from the locally
            # known hash, unless the file is listed as ignored.
            if resource['destinationHash'] != self.file_hashes[resource['actionRootPath']] \
                    and resource['actionRootPath'] not in self.func_dict['resources_ignored']:
                self.upload_fixity = False
                self.upload_failed_fixity.append(
                    resource['actionRootPath'][len(self.data_directory):])
                resource['failed_fixity_info'].append({
                    'NewGeneratedHash':
                    self.file_hashes[resource['actionRootPath']],
                    'algorithmUsed':
                    self.hash_algorithm,
                    'reasonFixityFailed':
                    "Either the destination did not provide a hash "
                    "or fixity failed during upload."
                })

        # Strip the server created directory prefix of the file paths for ignored and updated files
        resources_ignored = [
            file[len(self.data_directory):]
            for file in self.func_dict['resources_ignored']
        ]
        self.process_info_obj['resources_ignored'] = resources_ignored
        resources_updated = [
            file[len(self.data_directory):]
            for file in self.func_dict['resources_updated']
        ]
        self.process_info_obj['resources_updated'] = resources_updated

        if self.action == 'resource_transfer_in':
            self.keyword_enhancement_successful = True
            # When a new project was created, remember its id as the destination resource.
            if not self.destination_resource_id:
                self.destination_resource_id = self.func_dict['project_id']
            if self.supports_keywords:
                self.keyword_enhancement_successful, self.destination_initial_keywords = update_targets_keywords(
                    self, self.func_dict['project_id'])

                # Add the destination initial keywords to all keywords for accurate metadata list
                self.all_keywords = self.all_keywords + self.destination_initial_keywords

        self.metadata_validation = create_upload_metadata(
            self, self.func_dict['file_metadata_list'],
            self.func_dict['action_metadata'], self.func_dict['project_id'],
            resources_ignored, resources_updated)
        # Increment process_info one last time
        increment_process_info(self.process_info_path, self.action, 'upload')

        # Validate the final metadata
        upload_message = get_action_message(self, 'Upload', self.upload_fixity,
                                            self.metadata_validation,
                                            self.action_metadata)
        self.process_info_obj['message'] = upload_message

        # Only a plain upload finalizes the process info here; nothing more is written for a
        # transfer in this method.
        if self.action == 'resource_upload':
            # Update server process file
            # NOTE: the success code is stored as the string '200', while the failure paths
            # above store e.status_code unchanged.
            self.process_info_obj['status_code'] = '200'
            self.process_info_obj['status'] = 'finished'
            self.process_info_obj['hash_algorithm'] = self.hash_algorithm
            self.process_info_obj['failed_fixity'] = self.upload_failed_fixity
            self.process_info_obj['upload_status'] = upload_message
            self.process_info_obj['link_to_resource'] = self.func_dict[
                "project_link"]
            update_or_create_process_info(self.process_info_obj, self.action,
                                          self.ticket_number)

            # Notify the user by email when an address was provided.
            if self.email:
                context = {
                    "upload_url": self.func_dict["project_link"],
                    "upload_message": upload_message,
                    "failed_fixity": self.upload_failed_fixity
                }
                email_blaster(self.email, "PresQT Upload Complete", context,
                              "emails/upload_email.html")

        return True
Example #7
0
def download_directory(header, path_to_resource, repo_data, process_info_path,
                       action):
    """
    Go through a repo's tree and download all files inside of a given resource directory path.

    The recursive tree of the 'master' branch is fetched once; every blob whose
    path starts with path_to_resource is then downloaded individually.

    Parameters
    ----------
    header: dict
        API header expected by GitHub. Sent with the tree request and with
        every blob request.
    path_to_resource: str
        The path to the requested directory
    repo_data: dict
        Repository data gathered in the repo GET request ('name' and
        'trees_url' are read)
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    A list of dictionaries for each file being downloaded
    """
    repo_name = repo_data['name']
    # Strip {/sha} off the end
    trees_url = '{}/master?recursive=1'.format(repo_data['trees_url'][:-6])
    contents = requests.get(trees_url, headers=header).json()

    # Select the files to download once; the same list drives both the
    # progress total and the download loop.
    # NOTE(review): startswith() also matches sibling paths that share the
    # prefix (e.g. 'docs' matches 'docs2/x') - confirm whether that is intended.
    blobs = [
        entry for entry in contents['tree']
        if entry['path'].startswith(path_to_resource)
        and entry['type'] == 'blob'
    ]

    # Add the total number of files to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(blobs), action, 'download')
    update_process_info_message(process_info_path, action,
                                'Downloading files from GitHub...')

    # Parents of the requested directory, stripped from every file path below.
    # Loop-invariant, so computed once.
    path_to_strip = path_to_resource.rpartition('/')[0]

    files = []
    for resource in blobs:
        if path_to_strip:
            directory_path = resource['path'].partition(path_to_strip)[2]
        else:
            directory_path = '/{}'.format(resource['path'])

        # Send the same auth header as the tree request; without it the blob
        # request is unauthenticated.
        file_data = requests.get(resource['url'], headers=header).json()

        files.append({
            'file':
            base64.b64decode(file_data['content']),
            'hashes': {},
            # NOTE(review): rpartition('/')[0] is the parent directory of the
            # file (empty for a top level file), not the file name - confirm
            # that this is the intended title.
            'title':
            resource['path'].rpartition('/')[0],
            'path':
            directory_path,
            'source_path':
            '/{}/{}'.format(repo_name, resource['path']),
            'extra_metadata': {}
        })
        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
    return files
Example #8
0
def zenodo_download_resource(token, resource_id, process_info_path, action):
    """
    Download a Zenodo resource (a single file or a whole project) with its hash information.

    A resource_id longer than 7 characters is treated as a file id, anything
    else as a project (record or deposition) id.

    Parameters
    ----------
    token : str
        User's Zenodo token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'metadata': {
                                'sourcePath': '/full/path/at/source.jpg',
                                'title': 'file_title',
                                'sourceHashes': {'hash_algorithm': 'the_hash'},
                                'extra': {'any': 'extra'}
                             }
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written. Always empty for this target.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }
        'extra_metadata': Extra metadata gathered for a project download; an empty
                          dictionary for a single file download.

    Raises
    ------
    PresQTResponseException
        401 if the token fails validation, 404 if the resource cannot be found
        for this user.
    """
    try:
        auth_parameter = zenodo_validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            'Token is invalid. Response returned a 401 status code.',
            status.HTTP_401_UNAUTHORIZED)

    files = []
    empty_containers = []
    extra_metadata = {}
    base_url = None

    if len(resource_id) > 7:
        # ---- Single file ----
        # Ask the public files endpoint first: a 200 means the file belongs to
        # a public published record.
        public_file_url = 'https://zenodo.org/api/files/{}'.format(resource_id)
        public_response = requests.get(public_file_url, params=auth_parameter)

        if public_response.status_code == 200:
            is_record = True
            base_url = public_file_url
            file_url = public_file_url
        else:
            # Otherwise search the user's own depositions for the file and
            # stop at the first match.
            depositions = requests.get(
                'https://zenodo.org/api/deposit/depositions',
                params=auth_parameter).json()
            for deposition in depositions:
                deposition_data = requests.get(deposition['links']['self'],
                                               params=auth_parameter).json()
                for candidate in deposition_data['files']:
                    if candidate['id'] == resource_id:
                        base_url = deposition['links']['self']
                        file_url = candidate['links']['self']
                        is_record = False
                        break
                if base_url is not None:
                    # Found it; no need to look at further depositions.
                    break

        if base_url is None:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".
                format(resource_id), status.HTTP_404_NOT_FOUND)

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Exactly one file will be downloaded; record that as the total so
        # progress can be tracked.
        update_process_info(process_info_path, 1, action, 'download')

        files, action_metadata = zenodo_download_helper(
            is_record, base_url, auth_parameter, files, file_url)

        # The single file is done.
        increment_process_info(process_info_path, action, 'download')

    else:
        # ---- Whole project ----
        # Try the records endpoint first and fall back to the deposition
        # endpoint when it does not answer with a 200.
        base_url = 'https://zenodo.org/api/records/{}'.format(resource_id)
        record_response = requests.get(base_url, params=auth_parameter)
        is_record = record_response.status_code == 200
        if not is_record:
            base_url = 'https://zenodo.org/api/deposit/depositions/{}'.format(
                resource_id)

        try:
            files, action_metadata = zenodo_download_helper(
                is_record, base_url, auth_parameter, files)
        except PresQTResponseException:
            raise PresQTResponseException(
                "The resource with id, {}, does not exist for this user.".
                format(resource_id), status.HTTP_404_NOT_FOUND)

        extra_metadata = extra_metadata_helper(base_url, is_record,
                                               auth_parameter)
        # At this point each entry's 'file' value is the URL to download from.
        file_urls = [file_entry['file'] for file_entry in files]

        update_process_info_message(process_info_path, action,
                                    'Downloading files from Zenodo...')
        # Record the number of files as the total so progress can be tracked.
        update_process_info(process_info_path, len(file_urls), action,
                            'download')

        # Download every file on a fresh event loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, auth_parameter, process_info_path, action))

        # Swap each URL for the binary content that was downloaded from it.
        for file_entry in files:
            downloaded = get_dictionary_from_list(download_data, 'url',
                                                  file_entry['file'])
            file_entry['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #9
0
def zenodo_upload_loop(action_metadata, resource_id, resource_main_dir, post_url, auth_parameter,
                       title, file_duplicate_action, process_info_path, action):
    """
    Loop through the files to be uploaded and return the dictionary.

    Parameters
    ----------
    action_metadata : dict
        The metadata for this PresQT action.
        NOTE(review): this argument is never read; it is replaced by a fresh
        dictionary below. Kept for interface compatibility.
    resource_id : str
        The id of the resource the upload is happening on
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    post_url : str
        The url to upload files to
    auth_parameter : dict
        Zenodo's authorization parameter
    title : str
        The title of the project created
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": value of the 'checksum' field in
                                                       Zenodo's upload response
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': Link to the user's Zenodo deposit listing page.

    Raises
    ------
    PresQTResponseException
        If Zenodo does not answer 204 when deleting a file that is being updated,
        or does not answer 201 when a file is uploaded.
    """
    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {'destinationUsername': None}

    # Get current files associated with the resource.
    project_url = "https://zenodo.org/api/deposit/depositions/{}".format(resource_id)
    current_file_list = requests.get(project_url, params=auth_parameter).json()['files']
    existing_titles = {entry['filename'] for entry in current_file_list}

    for path, subdirs, files in os.walk(resource_main_dir):
        # A directory with no children cannot be represented, so report it as ignored.
        if not subdirs and not files:
            resources_ignored.append(path)

        for name in files:
            file_path = os.path.join(path, name)
            # Spaces in the file name are replaced with underscores for Zenodo.
            formatted_name = name.replace(' ', '_')
            is_duplicate = formatted_name in existing_titles

            if is_duplicate and file_duplicate_action == 'ignore':
                resources_ignored.append(file_path)
                continue

            # Open the file before touching the remote copy (same order as before), and
            # guarantee the handle is closed once the upload request has finished.
            with open(file_path, "rb") as file_handle:
                if is_duplicate and file_duplicate_action == 'update':
                    # First we need to delete the old file
                    for entry in current_file_list:
                        if formatted_name == entry['filename']:
                            delete_response = requests.delete(
                                entry['links']['self'], params=auth_parameter)
                            if delete_response.status_code != 204:
                                raise PresQTResponseException(
                                    "Zenodo returned an error trying to update {}".format(name),
                                    status.HTTP_400_BAD_REQUEST)
                            # Add this resource to the updated list
                            resources_updated.append(file_path)

                # Make the upload request....
                response = requests.post(post_url, params=auth_parameter,
                                         data={'name': formatted_name},
                                         files={'file': file_handle})

            if response.status_code != 201:
                raise PresQTResponseException(
                    "Zenodo returned an error trying to upload {}".format(name),
                    status.HTTP_400_BAD_REQUEST)
            # Increment process info file
            increment_process_info(process_info_path, action, 'upload')

            file_metadata_list.append({
                'actionRootPath': file_path,
                'destinationPath': '/{}/{}'.format(title, formatted_name),
                'title': formatted_name,
                'destinationHash': response.json()['checksum']})

    return {
        "resources_ignored": resources_ignored,
        "resources_updated": resources_updated,
        "action_metadata": action_metadata,
        "file_metadata_list": file_metadata_list,
        "project_id": resource_id,
        "project_link": "https://zenodo.org/deposit?page=1&size=20"
    }
Example #10
0
    def create_directory(self, directory_path, file_duplicate_action,
                         file_hashes, resources_ignored, resources_updated,
                         file_metadata_list, process_info_path, action):
        """
        Recreate the files and folders found directly under directory_path on this
        container, recursing into each sub-folder.

        Parameters
        ----------
        directory_path : str
            Local directory whose contents should be created.
        file_duplicate_action : str
            Flag forwarded to create_file for handling files that already exist.
        file_hashes : dict
            Accumulator mapping local file path to the created file's hashes.
        resources_ignored : list
            Accumulator of local paths whose file action was 'ignored'.
        resources_updated : list
            Accumulator of local paths whose file action was 'updated'.
        file_metadata_list: list
            Accumulator of per-file metadata dictionaries.
        process_info_path: str
            Path to the process info file that keeps track of the action's progress
        action: str
            The action being performed

        Returns
        -------
        Nothing is returned explicitly; file_hashes, resources_ignored,
        resources_updated and file_metadata_list are filled in place.
        """
        # Only the top level is read here; deeper levels are reached through recursion.
        root, child_folders, child_files = next(os.walk(directory_path))

        for child_file in child_files:
            local_path = '{}/{}'.format(root, child_file)
            contents = read_file(local_path)
            outcome, remote_file = self.create_file(
                child_file, contents, file_duplicate_action)

            destination = '{}{}'.format(remote_file.provider,
                                        remote_file.materialized_path)
            file_metadata_list.append({
                "actionRootPath": local_path,
                "destinationPath": destination,
                "title": remote_file.title,
                "destinationHash": remote_file.hashes
            })
            # One more file has been handled.
            increment_process_info(process_info_path, action, 'upload')

            file_hashes[local_path] = remote_file.hashes
            if outcome == 'ignored':
                resources_ignored.append(local_path)
            elif outcome == 'updated':
                resources_updated.append(local_path)

        for child_folder in child_folders:
            remote_folder = self.create_folder(child_folder)
            nested_path = '{}/{}'.format(root, child_folder)
            remote_folder.create_directory(
                nested_path, file_duplicate_action, file_hashes,
                resources_ignored, resources_updated, file_metadata_list,
                process_info_path, action)
Example #11
0
def github_upload_resource(token, resource_id, resource_main_dir,
                           hash_algorithm, file_duplicate_action,
                           process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested. A falsy value creates a new repository;
        '<repo_id>' uploads to the root of an existing repository and
        '<repo_id>:<path>' uploads into a directory of that repository.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity. Not read by this function.
    file_duplicate_action : str
        The action to take when a duplicate file is found
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Must be in the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": None
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not available through API

    Raises
    ------
    PresQTResponseException
        If the token is invalid, the repository cannot be found, the target path is
        an existing file rather than a container, or GitHub rejects an upload.
    """
    try:
        header, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)
    os_path = next(os.walk(resource_main_dir))
    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to GitHub...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []
    action_metadata = {"destinationUsername": username}

    # Upload a new repository
    if not resource_id:
        # Create a new repository with the name being the top level directory's name.
        # Note: GitHub doesn't allow spaces, parentheses or colons in repo names.
        repo_title = os_path[1][0].replace(' ', '_').replace("(", "-").replace(
            ")", "-").replace(":", "-")
        repo_name, repo_id, repo_url = create_repository(repo_title, token)
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                # Extract and encode the file bytes in the way expected by GitHub.
                # The context manager closes the handle as soon as the bytes are read.
                with open(local_path, 'rb') as source_file:
                    file_bytes = source_file.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                # A relative path to the file is what is added to the GitHub PUT address
                path_to_add = os.path.join(path.partition('/data/')[2], name)
                path_to_add_to_url = path_to_add.partition('/')[2].replace(
                    ' ', '_')
                finished_path = '/' + repo_name + '/' + path_to_add_to_url
                file_metadata_list.append({
                    "actionRootPath": local_path,
                    "destinationPath": finished_path,
                    "title": name,
                    "destinationHash": None
                })
                put_url = "https://api.github.com/repos/{}/{}/contents/{}".format(
                    username, repo_name, path_to_add_to_url)
                data = {
                    "message": "PresQT Upload",
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                file_response = requests.put(put_url,
                                             headers=header,
                                             data=json.dumps(data))
                if file_response.status_code != 201:
                    raise PresQTResponseException(
                        "Github returned the following error: '{}'".format(
                            str(file_response.json()['message'])),
                        status.HTTP_400_BAD_REQUEST)

                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')
    else:
        # Upload to an existing repository
        if ':' not in resource_id:
            repo_id = resource_id
            path_to_upload_to = ''
        # Upload to an existing directory
        else:
            partitioned_id = resource_id.partition(':')
            repo_id = partitioned_id[0]
            path_to_upload_to = '/{}'.format(partitioned_id[2]).replace(
                '%2F', '/').replace('%2E', '.')

        # Get initial repo data for the resource requested
        repo_url = 'https://api.github.com/repositories/{}'.format(repo_id)
        response = requests.get(repo_url, headers=header)

        if response.status_code != 200:
            raise PresQTResponseException(
                'The resource with id, {}, does not exist for this user.'.
                format(resource_id), status.HTTP_404_NOT_FOUND)
        repo_data = response.json()
        repo_name = repo_data['name']
        repo_url = repo_data['svn_url']

        # Get all repo resources so we can check if any files already exist.
        # The last six characters of 'trees_url' are sliced off before the branch is appended.
        trees_base_url = repo_data['trees_url'][:-6]
        repo_resources = requests.get(
            '{}/master?recursive=1'.format(trees_base_url),
            headers=header).json()
        if 'message' in repo_resources:
            # The 'master' lookup returned an error payload; try 'main' instead.
            repo_resources = requests.get(
                '{}/main?recursive=1'.format(trees_base_url),
                headers=header).json()
        current_file_paths = {
            '/' + resource['path']
            for resource in repo_resources['tree']
            if resource['type'] == 'blob'
        }

        # Check if the provided path to upload to is actually a path to an existing file
        if path_to_upload_to in current_file_paths:
            raise PresQTResponseException(
                'The Resource provided, {}, is not a container'.format(
                    resource_id), status.HTTP_400_BAD_REQUEST)

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                path_to_file = os.path.join('/',
                                            path.partition('/data/')[2],
                                            name).replace(' ', '_')

                # The sha must describe THIS file only. Reset it for every file so the
                # blob sha of a previously updated file is never sent with a later one.
                sha = None

                # Check if the file already exists in this repository
                full_file_path = '{}{}'.format(path_to_upload_to, path_to_file)
                if full_file_path in current_file_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(local_path)
                        continue
                    resources_updated.append(local_path)
                    # Get the sha of the blob being replaced
                    sha_url = 'https://api.github.com/repos/{}/contents{}'.format(
                        repo_data['full_name'], full_file_path)
                    sha_response = requests.get(sha_url, headers=header)
                    sha = sha_response.json()['sha']

                # Extract and encode the file bytes in the way expected by GitHub.
                with open(local_path, 'rb') as source_file:
                    file_bytes = source_file.read()
                encoded_file = base64.b64encode(file_bytes).decode('utf-8')
                file_metadata_list.append({
                    "actionRootPath": local_path,
                    "destinationPath": '/{}{}{}'.format(repo_name,
                                                        path_to_upload_to,
                                                        path_to_file),
                    "title": name,
                    "destinationHash": None
                })
                # A relative path to the file is what is added to the GitHub PUT address
                put_url = 'https://api.github.com/repos/{}/contents{}{}'.format(
                    repo_data['full_name'], path_to_upload_to, path_to_file)

                data = {
                    "message": "PresQT Upload",
                    "sha": sha,
                    "committer": {
                        "name": "PresQT",
                        "email": "N/A"
                    },
                    "content": encoded_file
                }

                upload_response = requests.put(put_url,
                                               headers=header,
                                               data=json.dumps(data))

                if upload_response.status_code not in [200, 201]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)
                # Increment the file counter
                increment_process_info(process_info_path, action, 'upload')

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': repo_id,
        "project_link": repo_url
    }
Example #12
0
def osf_download_resource(token, resource_id, process_info_path, action):
    """
    Download a resource from OSF together with the hash information of its files.

    Parameters
    ----------
    token : str
        User's OSF token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionaries, one per downloaded file, in the format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths of empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata, in the format:
                              {
                                'sourceUsername': '******',
                              }
        'extra_metadata': Dictionary of extra metadata; only populated for projects.
    """
    try:
        osf_instance = OSF(token)
    except PresQTInvalidTokenError:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # The full name of the token's owner is recorded as the source username.
    user_response = requests.get(
        'https://api.osf.io/v2/users/me/',
        headers={'Authorization': 'Bearer {}'.format(token)})
    user_payload = user_response.json()
    action_metadata = {"sourceUsername": user_payload['data']['attributes']['full_name']}

    resource = get_osf_resource(resource_id, osf_instance)

    # 'path' is where each file ends up being saved; its root is the requested resource.
    files = []
    empty_containers = []
    extra_metadata = {}
    kind = resource.kind_name

    # A single file is downloaded directly, without the asynchronous machinery.
    if kind == 'file':
        update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
        # Exactly one file is expected, so the progress total is 1.
        update_process_info(process_info_path, 1, action, 'download')

        project = osf_instance.project(resource.parent_project_id)
        single_file = {
            "file": resource.download(),
            "hashes": resource.hashes,
            "title": resource.title,
            # A lone file does not need its full path, only its own name.
            "path": '/{}'.format(resource.title),
            "source_path": '/{}/{}{}'.format(project.title, resource.provider,
                                             resource.materialized_path),
            "extra_metadata": osf_download_metadata(resource)
        }
        files.append(single_file)
        # Mark the file as done in the process info file.
        increment_process_info(process_info_path, action, 'download')
        return {
            'resources': files,
            'empty_containers': empty_containers,
            'action_metadata': action_metadata,
            'extra_metadata': extra_metadata
        }

    # Containers: collect every file they hold and work out the owning project.
    if kind == 'project':
        extra_metadata = extra_metadata_helper(
            resource_id, {'Authorization': 'Bearer {}'.format(token)})
        resource.get_all_files('', files, empty_containers)
        project = resource
    elif kind == 'storage':
        resource.get_all_files('/{}'.format(resource.title), files, empty_containers)
        project = osf_instance.project(resource.node)
    else:
        resource.get_all_files('', files, empty_containers)
        project = osf_instance.project(resource.parent_project_id)
        for entry in files:
            # The saved path must begin at the requested folder, so everything before
            # it is cut off. Example: for resource 'Docs2' and a file at
            # '/Project/Storage/Docs1/Docs2/file.jpeg' the result is '/Docs2/file.jpeg'.
            prefix = resource.materialized_path[:-(len(resource.title) + 2)]
            entry['path'] = entry['file'].materialized_path[len(prefix):]

    download_urls = [entry['file'].download_url for entry in files]

    update_process_info_message(process_info_path, action, 'Downloading files from OSF...')
    # The number of files to fetch is the total used for progress tracking.
    update_process_info(process_info_path, len(download_urls), action, 'download')

    # Run all download requests asynchronously on a fresh event loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    download_data = loop.run_until_complete(
        async_main(download_urls, token, process_info_path, action))

    # Record each file's source path, then swap the file object for its binary content.
    for entry in files:
        file_object = entry['file']
        entry['source_path'] = '/{}/{}{}'.format(project.title,
                                                 file_object.provider,
                                                 file_object.materialized_path)
        downloaded = get_dictionary_from_list(
            download_data, 'url', file_object.download_url)
        entry['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #13
0
def figshare_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from FigShare along with its hash information.

    Parameters
    ----------
    token : str
        User's FigShare token
    resource_id : str
        ID of the resource requested. One of '<project>', '<project>:<article>'
        or '<project>:<article>:<file>'.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }
        'extra_metadata': Dictionary of extra metadata; only populated for whole projects.

    Raises
    ------
    PresQTResponseException
        401 if the token is invalid; 404 if the project, article or file cannot be
        found, or if resource_id has more than three ':'-separated parts.
    """
    try:
        headers, username = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)
    split_id = str(resource_id).split(":")

    # Ids deeper than project:article:file match no branch below. Reject them explicitly
    # instead of failing later with an UnboundLocalError on the return values.
    if len(split_id) > 3:
        raise PresQTResponseException(
            "The resource could not be found by the requesting user.",
            status.HTTP_404_NOT_FOUND)

    extra_metadata = {}

    # But first we need to see whether it is a public project, or a private project.
    project_url = "https://api.figshare.com/v2/account/projects/{}".format(
        split_id[0])
    response = requests.get(project_url, headers=headers)
    if response.status_code != 200:
        # Looking for a private project was unsuccessful, try a public project.
        project_url = "https://api.figshare.com/v2/projects/{}".format(
            split_id[0])
        response = requests.get(project_url, headers=headers)
        if response.status_code != 200:
            # Project id is invalid
            raise PresQTResponseException(
                "The resource could not be found by the requesting user.",
                status.HTTP_404_NOT_FOUND)
    project_name = response.json()['title']

    # Flags to be used for file checks.
    file_urls = None
    files = None

    if len(split_id) == 1:
        # Download the contents of the project and build the list of file urls to download.
        articles_url = project_url + "/articles"
        files, empty_containers, action_metadata = download_project(
            username, articles_url, headers, project_name, [])
        file_urls = [file['file'] for file in files]
        extra_metadata = extra_metadata_helper(project_url, headers)

    else:
        # We have an article or a file so we need to get the article url
        article_url = "https://api.figshare.com/v2/account/projects/{}/articles/{}".format(
            split_id[0], split_id[1])
        response = requests.get(article_url, headers=headers)

        if response.status_code != 200:
            # Let's see if this is a public article....
            article_url = "https://api.figshare.com/v2/articles/{}".format(
                split_id[1])
            response = requests.get(article_url, headers=headers)

            if response.status_code != 200:
                # We couldn't find the article.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)

        if len(split_id) == 2:
            # Download the contents of the article and build the list of file urls to download.
            files, empty_containers, action_metadata = download_article(
                username, article_url, headers, project_name, [])
            file_urls = [file['file'] for file in files]

        else:
            update_process_info_message(process_info_path, action,
                                        'Downloading files from FigShare...')
            # Add the total number of files to the process info file.
            # This is necessary to keep track of the progress of the request.
            update_process_info(process_info_path, 1, action, 'download')

            # Single file download.
            article_data = response.json()
            for file in article_data['files']:
                if str(file['id']) == split_id[2]:
                    files = [{
                        "file": requests.get(file['download_url'],
                                             headers=headers).content,
                        "hashes": {
                            "md5": file['computed_md5']
                        },
                        "title": file['name'],
                        "path": "/{}".format(file['name']),
                        "source_path": "/{}/{}/{}".format(
                            project_name, article_data['title'], file['name']),
                        "extra_metadata": {
                            "size": file['size']
                        }
                    }]
                    # Increment the number of files done in the process info file.
                    increment_process_info(process_info_path, action,
                                           'download')
                    # The requested file was found; no need to look further.
                    break

            if not files:
                # We could not find the file.
                raise PresQTResponseException(
                    "The resource could not be found by the requesting user.",
                    status.HTTP_404_NOT_FOUND)
            empty_containers = []
            action_metadata = {"sourceUsername": username}

    if file_urls:
        update_process_info_message(process_info_path, action,
                                    'Downloading files from FigShare...')
        # Add the total number of files to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(file_urls), action,
                            'download')

        # Start the async calls for project or article downloads
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        download_data = loop.run_until_complete(
            async_main(file_urls, headers, process_info_path, action))

        # Go through the file dictionaries and replace the file path with the binary_content
        for file in files:
            file['file'] = get_dictionary_from_list(
                download_data, 'url', file['file'])['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #14
0
def curate_nd_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from CurateND along with its hash information.

    A resource whose ``kind_name`` is 'file' is downloaded directly. Any other
    resource is treated as a container: every entry in its 'containedFiles'
    is downloaded asynchronously, or the container is reported as empty when
    it holds no files.

    Parameters
    ----------
    token : str
        User's CurateND token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }
        'extra_metadata': Result of extra_metadata_helper() for a non-empty container,
                          otherwise an empty dictionary.

    Raises
    ------
    PresQTValidationError
        If building the CurateND client raises PresQTInvalidTokenError.
    """
    try:
        curate_instance = CurateND(token)
    except PresQTInvalidTokenError:
        raise PresQTValidationError(
            "Token is invalid. Response returned a 401 status code.",
            status.HTTP_401_UNAUTHORIZED)

    # Get the resource
    resource = get_curate_nd_resource(resource_id, curate_instance)
    action_metadata = {"sourceUsername": resource.extra['depositor']}
    extra_metadata = {}

    # Get all the files for the provided resources.
    files = []
    empty_containers = []
    if resource.kind_name == 'file':
        # 'isPartOf' may be a single URL or a list of URLs; use the first one.
        title_url = resource.extra['isPartOf']
        if isinstance(title_url, list):
            title_url = title_url[0]
        # Get the title of the Project to add to sourcePath
        project_title = requests.get(
            title_url,
            headers={'X-Api-Token': '{}'.format(token)}).json()['title']

        # This is so we aren't missing the few extra keys that are pulled out for the
        # PresQT payload
        resource.extra.update({
            "id": resource.id,
            "date_submitted": resource.date_submitted
        })

        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # Add the total number of items to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        binary_file, curate_hash = resource.download()

        files.append({
            'file': binary_file,
            'hashes': {'md5': curate_hash},
            'title': resource.title,
            # If the file is the only resource we are downloading then we don't need its
            # full path.
            'path': '/{}'.format(resource.title),
            'source_path': '/{}/{}'.format(project_title, resource.title),
            'extra_metadata': resource.extra
        })

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')

    elif not resource.extra['containedFiles']:
        empty_containers.append('{}'.format(resource.title))

    else:
        contained_files = resource.extra['containedFiles']

        update_process_info_message(process_info_path, action,
                                    'Downloading files from CurateND...')
        # Add the total number of items to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, len(contained_files), action,
                            'download')

        project_title = resource.title
        extra_metadata = extra_metadata_helper(resource)

        # Everything known about a contained file is keyed by its download URL because
        # that is the only identifier the async download results carry back.
        title_helper = {}
        hash_helper = {}
        metadata_helper = {}
        file_urls = []
        for contained in contained_files:
            download_url = contained['downloadUrl']
            contained_file = get_curate_nd_resource(contained['id'],
                                                    curate_instance)

            # The container's label is the title that ends up in the payload.
            title_helper[download_url] = contained['label']
            hash_helper[download_url] = contained_file.md5
            metadata_helper[download_url] = contained_file.extra
            file_urls.append(download_url)

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            download_data = loop.run_until_complete(
                async_main(file_urls, token, process_info_path, action))
        finally:
            # Release the loop's resources once the downloads have finished.
            loop.close()

        for downloaded in download_data:
            url = downloaded['url']
            title = title_helper[url]
            files.append({
                'file': downloaded['binary_content'],
                'hashes': {'md5': hash_helper[url]},
                'title': title,
                'source_path': '/{}/{}'.format(project_title, title),
                'path': '/{}/{}'.format(resource.title, title),
                'extra_metadata': metadata_helper[url]
            })

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #15
0
def gitlab_download_resource(token, resource_id, process_info_path, action):
    """
    Fetch the requested resource from GitLab along with its hash information.

    The shape of ``resource_id`` selects what is downloaded:
      * no ':' in the id              -> the whole project
      * ':' but no '%2E' in the id    -> a directory inside the project
      * ':' and '%2E' in the id       -> a single file

    Parameters
    ----------
    token : str
        User's GitLab token
    resource_id : str
        ID of the resource requested
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources': List of dictionary objects that each hold a file and its information.
                     Dictionary must be in the following format:
                         {
                            'file': binary_file,
                            'hashes': {'hash_algorithm': 'the_hash'},
                            'title': 'file.jpg',
                            'path': '/path/to/file',
                            'source_path': '/full/path/to/file',
                            'extra_metadata': {'any': 'extra'}
                         }
        'empty_containers': List of string paths representing empty containers that must be
                            written.
                              Example: ['empty/folder/to/write/', 'another/empty/folder/']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                              {
                              'sourceUsername': '******',
                              }
        'extra_metadata': Result of extra_metadata_helper() for a project download,
                          otherwise an empty dictionary.

    Raises
    ------
    PresQTResponseException
        401 if the token fails validation; 404 if the project, directory or file
        cannot be found for this user.
    """
    try:
        header, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)

    # Get the user's GitLab username for action metadata
    username = requests.get("https://gitlab.com/api/v4/user", headers=header).json()['username']

    # Without a ':' the separator is empty and project_id is the whole resource_id.
    project_id, separator, resource_path = resource_id.partition(':')
    not_found_message = 'The resource with id, {}, does not exist for this user.'.format(
        resource_id)

    project_url = 'https://gitlab.com/api/v4/projects/{}'.format(project_id)

    response = requests.get(project_url, headers=header)
    if response.status_code != 200:
        raise PresQTResponseException(not_found_message, status.HTTP_404_NOT_FOUND)

    project_json = response.json()
    project_name = project_json['name']
    extra_metadata = {}
    if not separator:
        # This is for a project
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1".format(
            project_id)
        data = gitlab_paginated_data(header, user_id, all_files_url)
        is_project = True
        # Get extra metadata
        extra_metadata = extra_metadata_helper(project_json, header)

    elif '%2E' not in resource_id:
        # This is for a directory
        all_files_url = "https://gitlab.com/api/v4/projects/{}/repository/tree?path={}&recursive=1".format(
            project_id, resource_path.replace('+', ' '))
        data = gitlab_paginated_data(header, user_id, all_files_url)
        if not data:
            raise PresQTResponseException(not_found_message, status.HTTP_404_NOT_FOUND)
        is_project = False

    else:
        update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
        # Add the total number of projects to the process info file.
        # This is necessary to keep track of the progress of the request.
        update_process_info(process_info_path, 1, action, 'download')

        # This is a single file
        file_url = 'https://gitlab.com/api/v4/projects/{}/repository/files/{}?ref=master'.format(
            project_id, resource_path.replace('+', ' '))
        data = requests.get(file_url, headers=header).json()
        # A 'message' key in the payload is treated as "file not found".
        if 'message' in data:
            raise PresQTResponseException(not_found_message, status.HTTP_404_NOT_FOUND)

        # Increment the number of files done in the process info file.
        increment_process_info(process_info_path, action, 'download')
        return {
            'resources': [{
                'file': base64.b64decode(data['content']),
                'hashes': {'sha256': data['content_sha256']},
                'title': data['file_name'],
                'path': '/{}'.format(data['file_name']),
                'source_path': data['file_path'],
                'extra_metadata': {}}],
            'empty_containers': [],
            'action_metadata': {'sourceUsername': username},
            'extra_metadata': extra_metadata
        }

    # At this point each entry's 'file' value is the URL to download it from.
    files, empty_containers, action_metadata = download_content(
        username, project_name, project_id, data, [], is_project)
    file_urls = [file['file'] for file in files]

    update_process_info_message(process_info_path, action, 'Downloading files from GitLab...')
    # Add the total number of projects to the process info file.
    # This is necessary to keep track of the progress of the request.
    update_process_info(process_info_path, len(file_urls), action, 'download')

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        download_data = loop.run_until_complete(
            async_main(file_urls, header, process_info_path, action))
    finally:
        # Release the loop's resources once the downloads have finished.
        loop.close()

    # Go through the file dictionaries and replace the file path with the binary_content
    # and replace the hashes with the correct file hashes
    for file in files:
        # Look the download up once; it supplies both the hashes and the content.
        downloaded = get_dictionary_from_list(download_data, 'url', file['file'])
        file['hashes'] = downloaded['hashes']
        file['file'] = downloaded['binary_content']

    return {
        'resources': files,
        'empty_containers': empty_containers,
        'action_metadata': action_metadata,
        'extra_metadata': extra_metadata
    }
Example #16
0
def gitlab_upload_resource(token, resource_id, resource_main_dir, hash_algorithm,
                           file_duplicate_action, process_info_path, action):
    """
    Upload the files found in the resource_main_dir to the target.

    When ``resource_id`` is empty a new public project is created, named after the
    top level directory, and every file is uploaded into it. Otherwise the files are
    uploaded into the existing project (or directory within it) identified by
    ``resource_id``, honouring ``file_duplicate_action`` for files that already exist.

    Parameters
    ----------
    token : str
        User's token.
    resource_id : str
        ID of the resource requested.
    resource_main_dir : str
        Path to the main directory for the resources to be uploaded.
    hash_algorithm : str
        Hash algorithm we are using to check for fixity. Not read by this function;
        comparisons here always use GitLab's 'content_sha256'.
    file_duplicate_action : str
        The action to take when a duplicate file is found. 'ignore' skips the file;
        any other value updates it when its sha256 differs from GitLab's copy.
    process_info_path: str
        Path to the process info file that keeps track of the action's progress
    action: str
        The action being performed

    Returns
    -------
    Dictionary with the following keys: values
        'resources_ignored' : Array of string file paths of files that were ignored when
        uploading the resource. Path should have the same base as resource_main_dir.
        Empty directories found while walking resource_main_dir are listed here too.
                                Example:
                                    ['path/to/ignored/file.pg', 'another/ignored/file.jpg']

        'resources_updated' : Array of string file paths of files that were updated when
         uploading the resource. Path should have the same base as resource_main_dir.
                                 Example:
                                    ['path/to/updated/file.jpg']
        'action_metadata': Dictionary containing action metadata. Must be in the following format:
                            {
                                'destinationUsername': '******'
                            }
        'file_metadata_list': List of dictionaries for each file that contains metadata
                              and hash info. Each entry has the following format:
                                {
                                    "actionRootPath": '/path/on/disk',
                                    "destinationPath": '/path/on/target/destination',
                                    "title": 'file_title',
                                    "destinationHash": 'content_sha256 value reported by GitLab'
                                }
        'project_id': ID of the parent project for this upload. Needed for metadata upload.
        'project_link': The link to either the resource or the home page of the user if not available through API

    Raises
    ------
    PresQTResponseException
        401 if the token fails validation; 400 if the project cannot be created, if
        resource_id points at a file, or if a file upload is rejected; 404 if the
        target project cannot be found.
    """
    base_url = "https://gitlab.com/api/v4/"

    try:
        headers, user_id = validation_check(token)
    except PresQTResponseException:
        raise PresQTResponseException("Token is invalid. Response returned a 401 status code.",
                                      status.HTTP_401_UNAUTHORIZED)
    username = requests.get("https://gitlab.com/api/v4/user", headers=headers).json()['username']
    action_metadata = {"destinationUsername": username}

    os_path = next(os.walk(resource_main_dir))
    # Get total amount of files
    total_files = upload_total_files(resource_main_dir)
    update_process_info(process_info_path, total_files, action, 'upload')
    update_process_info_message(process_info_path, action,
                                "Uploading files to GitLab...")

    resources_ignored = []
    resources_updated = []
    file_metadata_list = []

    #*** CREATE NEW PROJECT ***#
    # Create a new project with the name being the top level directory's name.
    # Check if a project with this name exists for this user
    if not resource_id:
        project_title = os_path[1][0]
        titles = [data['name'] for data in gitlab_paginated_data(headers, user_id)]
        title = get_duplicate_title(project_title, titles,
                                    '-PresQT*-').replace('(', '-').replace(')', '-')
        # Let requests build the query string so characters such as '&' or '#' in the
        # title are escaped instead of corrupting the request.
        response = requests.post('{}projects'.format(base_url),
                                 params={'name': title, 'visibility': 'public'},
                                 headers=headers)
        if response.status_code == 201:
            project_json = response.json()
            project_id = project_json['id']
            project_name = project_json['name']
            web_url = project_json['web_url']
        else:
            raise PresQTResponseException(
                "Response has status code {} while creating project {}.".format(
                    response.status_code, project_title), status.HTTP_400_BAD_REQUEST)

        #*** UPLOAD FILES ***#
        # Upload files to project's repository
        base_repo_path = "{}projects/{}/repository/files/".format(base_url, project_id)
        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/{}/'.format(
                    project_title))[2], name)

                # Extract and encode the file bytes in the way expected by GitLab.
                with open(local_path, 'rb') as local_file:
                    encoded_file = base64.b64encode(local_file.read())

                # A relative path to the file is what is added to the GitLab POST address
                encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')
                file_url = "{}{}".format(base_repo_path, encoded_file_path)

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}

                upload_response = requests.post(file_url, headers=headers, data=request_data)
                # Fail loudly here, as the existing-project path does, rather than with a
                # KeyError when the hash of a file that was never stored is requested.
                if upload_response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(
                            upload_response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(file_url),
                                         headers=headers).json()
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": local_path,
                    # This ensures that the title is up to date if there are duplicates
                    "destinationPath": os.path.join(project_name, path.partition(
                        '/data/')[2].partition('/')[2], name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']
                })
    else:
        project_files_url = "{}projects/{}/repository/files/".format(
            base_url, resource_id.partition(':')[0])
        if ':' not in resource_id:
            project_id = resource_id
            base_repo_url = project_files_url
            string_path_to_resource = ''
        else:
            partitioned_id = resource_id.partition(':')
            project_id = partitioned_id[0]
            base_repo_url = "{}projects/{}/repository/files/{}".format(
                base_url, project_id, partitioned_id[2])
            string_path_to_resource = partitioned_id[2].replace('%2F', '/').replace('%2E', '.')

        # Files uploaded into a directory need a '%2F' between the directory and the file.
        uploading_to_project_root = base_repo_url == project_files_url

        # Check if the resource_id belongs to a file
        tree_url = 'https://gitlab.com/api/v4/projects/{}/repository/tree?recursive=1'.format(
            project_id)
        file_data = gitlab_paginated_data(headers, None, tree_url)
        for data in file_data:
            if data['path'] == string_path_to_resource and data['type'] == 'blob':
                raise PresQTResponseException("Resource with id, {}, belongs to a file.".format(
                    resource_id), status.HTTP_400_BAD_REQUEST)
        # Every path already in the repository, for constant-time duplicate checks.
        existing_paths = {data['path'] for data in file_data}

        # Get project data
        project = requests.get('{}projects/{}'.format(base_url, project_id), headers=headers)
        if project.status_code != 200:
            raise PresQTResponseException("Project with id, {}, could not be found.".format(
                project_id), status.HTTP_404_NOT_FOUND)
        project_json = project.json()
        project_name = project_json['name']
        web_url = project_json['web_url']

        for path, subdirs, files in os.walk(resource_main_dir):
            if not subdirs and not files:
                resources_ignored.append(path)
            for name in files:
                local_path = os.path.join(path, name)
                # Strip server directories from file path
                relative_file_path = os.path.join(path.partition('/data/')[2], name)

                # A relative path to the file is what is added to the GitLab POST address
                encoded_file_path = relative_file_path.replace('/', '%2F').replace('.', '%2E')
                if not uploading_to_project_root:
                    encoded_file_path = '%2F{}'.format(encoded_file_path)
                full_encoded_url = '{}{}'.format(base_repo_url, encoded_file_path)

                upload_request = requests.post
                file_bytes = None
                # Check if this file exists already
                if os.path.join(string_path_to_resource, relative_file_path) in existing_paths:
                    if file_duplicate_action == 'ignore':
                        resources_ignored.append(local_path)
                        continue

                    file_response = requests.get('{}?ref=master'.format(full_encoded_url),
                                                 headers=headers)
                    with open(local_path, 'rb') as local_file:
                        file_bytes = local_file.read()
                    if hash_generator(file_bytes, 'sha256') == file_response.json()['content_sha256']:
                        # Identical content is already stored; nothing to upload.
                        resources_ignored.append(local_path)
                        continue

                    # Content differs, so overwrite the existing file.
                    resources_updated.append(local_path)
                    upload_request = requests.put

                # Extract and encode the file bytes in the way expected by GitLab.
                if file_bytes is None:
                    with open(local_path, 'rb') as local_file:
                        file_bytes = local_file.read()
                encoded_file = base64.b64encode(file_bytes)

                request_data = {"branch": "master",
                                "commit_message": "PresQT Upload",
                                "encoding": "base64",
                                "content": encoded_file}

                response = upload_request(full_encoded_url,
                                          headers=headers, data=request_data)
                if response.status_code not in [201, 200]:
                    raise PresQTResponseException(
                        'Upload failed with a status code of {}'.format(response.status_code),
                        status.HTTP_400_BAD_REQUEST)

                # Get the file hash
                file_json = requests.get("{}?ref=master".format(full_encoded_url),
                                         headers=headers).json()
                # Increment files finished
                increment_process_info(process_info_path, action, 'upload')

                file_metadata_list.append({
                    "actionRootPath": local_path,
                    "destinationPath": os.path.join(project_name, path.partition('/data/')[2], name),
                    "title": name,
                    "destinationHash": file_json['content_sha256']
                })

    return {
        'resources_ignored': resources_ignored,
        'resources_updated': resources_updated,
        'action_metadata': action_metadata,
        'file_metadata_list': file_metadata_list,
        'project_id': project_id,
        'project_link': web_url
    }