Ejemplo n.º 1
0
    def test_dangling_link_to_existing_db_node(self, temp_dir):
        """A dangling link that references a Node that is not included in the archive should `not` be importable"""
        struct = orm.StructureData()
        struct.store()
        struct_uuid = struct.uuid

        calc = orm.CalculationNode()
        calc.add_incoming(struct, LinkType.INPUT_CALC, 'input')
        calc.store()
        calc.seal()
        calc_uuid = calc.uuid

        filename = os.path.join(temp_dir, 'export.aiida')
        export([struct], filename=filename, file_format='tar.gz')

        unpack = SandboxFolder()
        with tarfile.open(filename, 'r:gz', format=tarfile.PAX_FORMAT) as tar:
            tar.extractall(unpack.abspath)

        with open(unpack.get_abs_path('data.json'), 'r',
                  encoding='utf8') as fhandle:
            data = json.load(fhandle)
        data['links_uuid'].append({
            'output': calc.uuid,
            'input': struct.uuid,
            'label': 'input',
            'type': LinkType.INPUT_CALC.value
        })

        with open(unpack.get_abs_path('data.json'), 'wb') as fhandle:
            json.dump(data, fhandle)

        with tarfile.open(filename, 'w:gz', format=tarfile.PAX_FORMAT) as tar:
            tar.add(unpack.abspath, arcname='')

        # Make sure the CalculationNode is still in the database
        builder = orm.QueryBuilder().append(orm.CalculationNode,
                                            project='uuid')
        self.assertEqual(
            builder.count(),
            1,
            msg=
            f'There should be a single CalculationNode, instead {builder.count()} has been found'
        )
        self.assertEqual(builder.all()[0][0], calc_uuid)

        with self.assertRaises(DanglingLinkError):
            import_data(filename)

        # Using the flag `ignore_unknown_nodes` should import it without problems
        import_data(filename, ignore_unknown_nodes=True)
        builder = orm.QueryBuilder().append(orm.StructureData, project='uuid')
        self.assertEqual(
            builder.count(),
            1,
            msg=
            f'There should be a single StructureData, instead {builder.count()} has been found'
        )
        self.assertEqual(builder.all()[0][0], struct_uuid)
Ejemplo n.º 2
0
    def test_3(self):
        """
        Test importing of nodes, that have links to unknown nodes.
        """
        import json
        import tarfile
        import os
        import shutil
        import tempfile

        from aiida.orm import DataFactory
        from aiida.orm.importexport import export
        from aiida.common.folders import SandboxFolder

        # Creating a folder for the import/export files
        temp_folder = tempfile.mkdtemp()
        try:
            StructureData = DataFactory('structure')
            sd = StructureData()
            sd.store()

            filename = os.path.join(temp_folder, "export.tar.gz")
            export([sd.dbnode], outfile=filename, silent=True)

            unpack = SandboxFolder()
            with tarfile.open(filename, "r:gz",
                              format=tarfile.PAX_FORMAT) as tar:
                tar.extractall(unpack.abspath)

            with open(unpack.get_abs_path('data.json'), 'r') as f:
                metadata = json.load(f)
            metadata['links_uuid'].append({
                'output': sd.uuid,
                'input': 'non-existing-uuid',
                'label': 'parent'
            })
            with open(unpack.get_abs_path('data.json'), 'w') as f:
                json.dump(metadata, f)

            with tarfile.open(filename, "w:gz",
                              format=tarfile.PAX_FORMAT) as tar:
                tar.add(unpack.abspath, arcname="")

            self.clean_db()

            with self.assertRaises(ValueError):
                import_data(filename, silent=True)

            import_data(filename, ignore_unknown_nodes=True, silent=True)
        finally:
            # Deleting the created temporary folder
            shutil.rmtree(temp_folder, ignore_errors=True)
Ejemplo n.º 3
0
    def test_links_to_unknown_nodes(self, temp_dir):
        """Test importing of nodes, that have links to unknown nodes."""
        node_label = 'Test structure data'
        struct = orm.StructureData()
        struct.label = str(node_label)
        struct.store()
        struct_uuid = struct.uuid

        filename = os.path.join(temp_dir, 'export.aiida')
        export([struct], filename=filename, file_format='tar.gz')

        unpack = SandboxFolder()
        with tarfile.open(filename, 'r:gz', format=tarfile.PAX_FORMAT) as tar:
            tar.extractall(unpack.abspath)

        with open(unpack.get_abs_path('data.json'), 'r',
                  encoding='utf8') as fhandle:
            data = json.load(fhandle)
        data['links_uuid'].append({
            'output': struct.uuid,
            # note: this uuid is supposed to not be in the DB:
            'input': get_new_uuid(),
            'label': 'parent',
            'type': LinkType.CREATE.value
        })

        with open(unpack.get_abs_path('data.json'), 'wb') as fhandle:
            json.dump(data, fhandle)

        with tarfile.open(filename, 'w:gz', format=tarfile.PAX_FORMAT) as tar:
            tar.add(unpack.abspath, arcname='')

        self.clean_db()

        with self.assertRaises(DanglingLinkError):
            import_data(filename)

        import_data(filename, ignore_unknown_nodes=True)
        self.assertEqual(orm.load_node(struct_uuid).label, node_label)
Ejemplo n.º 4
0
def upload_calculation(node: CalcJobNode,
                       transport: Transport,
                       calc_info: CalcInfo,
                       folder: SandboxFolder,
                       inputs: Optional[MappingType[str, Any]] = None,
                       dry_run: bool = False) -> None:
    """Upload a `CalcJob` instance

    :param node: the `CalcJobNode`.
    :param transport: an already opened transport to use to submit the calculation.
    :param calc_info: the calculation info datastructure returned by `CalcJob.presubmit`
    :param folder: temporary local file system folder containing the inputs written by `CalcJob.prepare_for_submission`
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements

    # If the calculation already has a `remote_folder`, simply return. The upload was apparently already completed
    # before, which can happen if the daemon is restarted and it shuts down after uploading but before getting the
    # chance to perform the state transition. Upon reloading this calculation, it will re-attempt the upload.
    link_label = 'remote_folder'
    if node.get_outgoing(RemoteData, link_label_filter=link_label).first():
        EXEC_LOGGER.warning(
            f'CalcJobNode<{node.pk}> already has a `{link_label}` output: skipping upload'
        )
        return calc_info

    computer = node.computer

    codes_info = calc_info.codes_info
    input_codes = [
        load_node(_.code_uuid, sub_classes=(Code, )) for _ in codes_info
    ]

    logger_extra = get_dblogger_extra(node)
    transport.set_logger_extra(logger_extra)
    logger = LoggerAdapter(logger=EXEC_LOGGER, extra=logger_extra)

    if not dry_run and node.has_cached_links():
        raise ValueError(
            'Cannot submit calculation {} because it has cached input links! If you just want to test the '
            'submission, set `metadata.dry_run` to True in the inputs.'.format(
                node.pk))

    # If we are performing a dry-run, the working directory should actually be a local folder that should already exist
    if dry_run:
        workdir = transport.getcwd()
    else:
        remote_user = transport.whoami()
        remote_working_directory = computer.get_workdir().format(
            username=remote_user)
        if not remote_working_directory.strip():
            raise exceptions.ConfigurationError(
                "[submission of calculation {}] No remote_working_directory configured for computer '{}'"
                .format(node.pk, computer.label))

        # If it already exists, no exception is raised
        try:
            transport.chdir(remote_working_directory)
        except IOError:
            logger.debug(
                '[submission of calculation {}] Unable to chdir in {}, trying to create it'
                .format(node.pk, remote_working_directory))
            try:
                transport.makedirs(remote_working_directory)
                transport.chdir(remote_working_directory)
            except EnvironmentError as exc:
                raise exceptions.ConfigurationError(
                    '[submission of calculation {}] '
                    'Unable to create the remote directory {} on '
                    "computer '{}': {}".format(node.pk,
                                               remote_working_directory,
                                               computer.label, exc))
        # Store remotely with sharding (here is where we choose
        # the folder structure of remote jobs; then I store this
        # in the calculation properties using _set_remote_dir
        # and I do not have to know the logic, but I just need to
        # read the absolute path from the calculation properties.
        transport.mkdir(calc_info.uuid[:2], ignore_existing=True)
        transport.chdir(calc_info.uuid[:2])
        transport.mkdir(calc_info.uuid[2:4], ignore_existing=True)
        transport.chdir(calc_info.uuid[2:4])

        try:
            # The final directory may already exist, most likely because this function was already executed once, but
            # failed and as a result was rescheduled by the eninge. In this case it would be fine to delete the folder
            # and create it from scratch, except that we cannot be sure that this the actual case. Therefore, to err on
            # the safe side, we move the folder to the lost+found directory before recreating the folder from scratch
            transport.mkdir(calc_info.uuid[4:])
        except OSError:
            # Move the existing directory to lost+found, log a warning and create a clean directory anyway
            path_existing = os.path.join(transport.getcwd(),
                                         calc_info.uuid[4:])
            path_lost_found = os.path.join(remote_working_directory,
                                           REMOTE_WORK_DIRECTORY_LOST_FOUND)
            path_target = os.path.join(path_lost_found, calc_info.uuid)
            logger.warning(
                f'tried to create path {path_existing} but it already exists, moving the entire folder to {path_target}'
            )

            # Make sure the lost+found directory exists, then copy the existing folder there and delete the original
            transport.mkdir(path_lost_found, ignore_existing=True)
            transport.copytree(path_existing, path_target)
            transport.rmtree(path_existing)

            # Now we can create a clean folder for this calculation
            transport.mkdir(calc_info.uuid[4:])
        finally:
            transport.chdir(calc_info.uuid[4:])

        # I store the workdir of the calculation for later file retrieval
        workdir = transport.getcwd()
        node.set_remote_workdir(workdir)

    # I first create the code files, so that the code can put
    # default files to be overwritten by the plugin itself.
    # Still, beware! The code file itself could be overwritten...
    # But I checked for this earlier.
    for code in input_codes:
        if code.is_local():
            # Note: this will possibly overwrite files
            for filename in code.list_object_names():
                # Note, once #2579 is implemented, use the `node.open` method instead of the named temporary file in
                # combination with the new `Transport.put_object_from_filelike`
                # Since the content of the node could potentially be binary, we read the raw bytes and pass them on
                with NamedTemporaryFile(mode='wb+') as handle:
                    handle.write(code.get_object_content(filename, mode='rb'))
                    handle.flush()
                    transport.put(handle.name, filename)
            transport.chmod(code.get_local_executable(), 0o755)  # rwxr-xr-x

    # local_copy_list is a list of tuples, each with (uuid, dest_rel_path)
    # NOTE: validation of these lists are done inside calculation.presubmit()
    local_copy_list = calc_info.local_copy_list or []
    remote_copy_list = calc_info.remote_copy_list or []
    remote_symlink_list = calc_info.remote_symlink_list or []
    provenance_exclude_list = calc_info.provenance_exclude_list or []

    for uuid, filename, target in local_copy_list:
        logger.debug(
            f'[submission of calculation {node.uuid}] copying local file/folder to {target}'
        )

        try:
            data_node = load_node(uuid=uuid)
        except exceptions.NotExistent:
            data_node = _find_data_node(inputs, uuid) if inputs else None

        if data_node is None:
            logger.warning(
                f'failed to load Node<{uuid}> specified in the `local_copy_list`'
            )
        else:
            dirname = os.path.dirname(target)
            if dirname:
                os.makedirs(os.path.join(folder.abspath, dirname),
                            exist_ok=True)
            with folder.open(target, 'wb') as handle:
                with data_node.open(filename, 'rb') as source:
                    shutil.copyfileobj(source, handle)
            provenance_exclude_list.append(target)

    # In a dry_run, the working directory is the raw input folder, which will already contain these resources
    if not dry_run:
        for filename in folder.get_content_list():
            logger.debug(
                f'[submission of calculation {node.pk}] copying file/folder {filename}...'
            )
            transport.put(folder.get_abs_path(filename), filename)

        for (remote_computer_uuid, remote_abs_path,
             dest_rel_path) in remote_copy_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'
                    .format(node.pk, dest_rel_path, computer.label))
                try:
                    transport.copy(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to copy remote resource from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path,
                                           dest_rel_path))
                    raise
            else:
                raise NotImplementedError(
                    '[submission of calculation {}] Remote copy between two different machines is '
                    'not implemented yet'.format(node.pk))

        for (remote_computer_uuid, remote_abs_path,
             dest_rel_path) in remote_symlink_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'
                    .format(node.pk, dest_rel_path, computer.label))
                try:
                    transport.symlink(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to create remote symlink from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path,
                                           dest_rel_path))
                    raise
            else:
                raise IOError(
                    f'It is not possible to create a symlink between two different machines for calculation {node.pk}'
                )
    else:

        if remote_copy_list:
            with open(os.path.join(workdir, '_aiida_remote_copy_list.txt'),
                      'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_copy_list:
                    handle.write(
                        'would have copied {} to {} in working directory on remote {}'
                        .format(remote_abs_path, dest_rel_path,
                                computer.label))

        if remote_symlink_list:
            with open(os.path.join(workdir, '_aiida_remote_symlink_list.txt'),
                      'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_symlink_list:
                    handle.write(
                        'would have created symlinks from {} to {} in working directory on remote {}'
                        .format(remote_abs_path, dest_rel_path,
                                computer.label))

    # Loop recursively over content of the sandbox folder copying all that are not in `provenance_exclude_list`. Note
    # that directories are not created explicitly. The `node.put_object_from_filelike` call will create intermediate
    # directories for nested files automatically when needed. This means though that empty folders in the sandbox or
    # folders that would be empty when considering the `provenance_exclude_list` will *not* be copied to the repo. The
    # advantage of this explicit copying instead of deleting the files from `provenance_exclude_list` from the sandbox
    # first before moving the entire remaining content to the node's repository, is that in this way we are guaranteed
    # not to accidentally move files to the repository that should not go there at all cost. Note that all entries in
    # the provenance exclude list are normalized first, just as the paths that are in the sandbox folder, otherwise the
    # direct equality test may fail, e.g.: './path/file.txt' != 'path/file.txt' even though they reference the same file
    provenance_exclude_list = [
        os.path.normpath(entry) for entry in provenance_exclude_list
    ]

    for root, _, filenames in os.walk(folder.abspath):
        for filename in filenames:
            filepath = os.path.join(root, filename)
            relpath = os.path.normpath(
                os.path.relpath(filepath, folder.abspath))
            if relpath not in provenance_exclude_list:
                with open(filepath, 'rb') as handle:
                    node._repository.put_object_from_filelike(handle,
                                                              relpath,
                                                              'wb',
                                                              force=True)  # pylint: disable=protected-access

    if not dry_run:
        # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
        # chance of making this method idempotent. That is to say, if a runner gets interrupted during this action, it
        # will simply retry the upload, unless we got here and managed to link it up, in which case we move to the next
        # task. Because in that case, the check for the existence of this link at the top of this function will exit
        # early from this command.
        remotedata = RemoteData(computer=computer, remote_path=workdir)
        remotedata.add_incoming(node,
                                link_type=LinkType.CREATE,
                                link_label='remote_folder')
        remotedata.store()