Example 1
    def migrate_storage(self, storage):
        """Migrate storage."""
        if str(storage[u'_id']) not in self.storage_index:
            self.unreferenced_storages.append(storage[u'_id'])
            return 1

        data_id = self.storage_index[str(storage[u'_id'])]['id']
        data_path = self.storage_index[str(storage[u'_id'])]['path']
        data = Data.objects.get(pk=data_id)

        new = Storage()
        new.name = 'data_{}_storage'.format(data_id)
        new.data = data
        new.json = storage[u'json']
        new.contributor = self.get_contributor(storage[u'author_id'])
        # XXX: Django will change this on create
        new.created = storage[u'date_created']
        # XXX: Django will change this on save
        new.modified = storage[u'date_modified']
        new.save()

        dict_dot(data.output, data_path, new.pk)
        data.save()

        self.id_mapping['storage'][str(storage[u'_id'])] = new.pk
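
This example, and most of the ones below, uses the dict_dot helper to read or write a value at a dotted path inside a nested dictionary (here, Data.output). The real implementation lives in resolwe; purely as an illustration of the behavior these snippets rely on, a minimal sketch might look like this (the name dict_dot_sketch and the None sentinel are assumptions):

    def dict_dot_sketch(d, path, value=None):
        """Get, or set if ``value`` is given, the item at dotted ``path`` in ``d``."""
        keys = path.split('.') if path else []
        if value is None:
            # getter: walk down the keys and return the leaf
            for key in keys:
                d = d[key]
            return d
        # setter: create intermediate dicts as needed, then assign the leaf
        for key in keys[:-1]:
            d = d.setdefault(key, {})
        d[keys[-1]] = value

With such a helper, dict_dot(data.output, data_path, new.pk) stores the new Storage's primary key at data_path inside data.output.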
Example 2
    def assertFiles(self, obj, field_path, fn_list, **kwargs):  # pylint: disable=invalid-name
        """Compare a process's output file to the given correct file.

        :param obj: object which includes the files to compare
        :type obj: ~resolwe.flow.models.Data

        :param str field_path: path to
            :class:`~resolwe.flow.models.Data` object's field with the
            list of file names

        :param list fn_list: list of file names (and relative paths) of
            files to compare against. Paths should be relative to the
            ``tests/files`` directory of a Django application.

        :param str compression: if not ``None``, files will be
            uncompressed with the appropriate compression library
            before comparison.
            Currently supported compression formats are *gzip* and
            *zip*.

        :param filter: Function for filtering the contents of output
            files. It is used in :obj:`itertools.filterfalse` function
            and takes one parameter, a line of the output file. If it
            returns ``True``, the line is excluded from comparison of
            the two files.
        :type filter: ~types.FunctionType

        """
        field = dict_dot(obj.output, field_path)

        if len(field) != len(fn_list):
            self.fail(msg="Lengths of list:basic:file field and files list are not equal.")

        for fn_tested, fn_correct in zip(field, fn_list):
            self._assert_file(obj, fn_tested['file'], fn_correct, **kwargs)
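
A hedged usage sketch: in a process test, the call compares each entry of a list:basic:file output field against a reference file. The process name, field name, and file names below are made up, and run_process is assumed to return the finished Data object:

    data = self.run_process('my-process', {'src': 'input.txt'})
    self.assertFiles(data, 'output_files', ['expected_1.txt', 'expected_2.txt'])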
Example 3
    def assertFiles(self, obj, field_path, fn_list, **kwargs):  # pylint: disable=invalid-name
        """Compare list of processes' output files to the given correct files

        :param obj: Data object which includes files that we want to
            compare.
        :type obj: :obj:`resolwe.flow.models.Data`

        :param str field_path: Path to list of file names in Data object.

        :param list fn_list: List of file names (and relative paths) of
            files to compare against. Names/paths are relative to the
            ``tests/files`` folder of a Django application.

        :param compression: If not None, files will be uncompressed with
            the appropriate compression library before comparison.
            Currently supported compression formats are "gzip" and
            "zip".
        :type compression: :obj:`str`

        :param filter: Function for filtering the contents of output
            files. It is used in :obj:`itertools.filterfalse` function
            and takes one parameter, a line of the output file. If it
            returns `True`, the line is excluded from comparison of the
            two files.
        :type filter: :obj:`function`

        """
        field = dict_dot(obj.output, field_path)

        if len(field) != len(fn_list):
            self.fail(msg="Lengths of list:basic:file field and files list are not equal.")

        for fn_tested, fn_correct in zip(field, fn_list):
            self._assert_file(obj, fn_tested['file'], fn_correct, **kwargs)
Example 4
    def assertFiles(self, obj, field_path, fn, compression=None, filter=lambda _: False):  # pylint: disable=invalid-name
        """Compare output file of a processor to the given correct file.

        :param obj: Data object which includes file that we want to
            compare.
        :type obj: :obj:`resolwe.flow.models.Data`

        :param field_path: Path to file name in Data object.
        :type field_path: :obj:`str`

        :param fn: File name (and relative path) of the file to compare
            against. Name/path is relative to the ``tests/files``
            folder of a Django application.
        :type fn: :obj:`str`

        :param compression: If not None, files will be uncompressed with
            the appropriate compression library before comparison.
            Currently supported compression formats are "gzip" and
            "zip".
        :type compression: :obj:`str`

        :param filter: Function for filtering the contents of output files. It
            is used in :obj:`itertools.filterfalse` function and takes one
            parameter, a line of the output file. If it returns `True`, the
            line is excluded from comparison of the two files.
        :type filter: :obj:`function`

        """
        open_kwargs = {}
        if compression is None:
            open_fn = open
            # by default, open() will open files as text and return str
            # objects, but we need bytes objects
            open_kwargs['mode'] = 'rb'
        elif compression == 'gzip':
            open_fn = gzip.open
        elif compression == 'zip':
            # ZipFile.open needs a ZipFile instance and a member name, so
            # wrap it; assume the archive holds a single member to compare
            def open_fn(fn):
                zip_file = zipfile.ZipFile(fn)
                return zip_file.open(zip_file.namelist()[0])
        else:
            raise ValueError("Unsupported compression format.")

        field = dict_dot(obj.output, field_path)
        output = os.path.join(settings.FLOW_EXECUTOR['DATA_PATH'], str(obj.pk), field['file'])
        with open_fn(output, **open_kwargs) as output_file:
            output_contents = b"".join([line for line in filterfalse(filter, output_file)])
        output_hash = hashlib.sha256(output_contents).hexdigest()

        wanted = os.path.join(self.files_path, fn)

        if not os.path.isfile(wanted):
            shutil.copyfile(output, wanted)
            self.fail(msg="Output file {} missing so it was created.".format(fn))

        with open_fn(wanted, **open_kwargs) as wanted_file:
            wanted_contents = b"".join([line for line in filterfalse(filter, wanted_file)])
        wanted_hash = hashlib.sha256(wanted_contents).hexdigest()
        self.assertEqual(wanted_hash, output_hash,
                         msg="File contents hash mismatch: {} != {}".format(
                             wanted_hash, output_hash) + self._debug_info(obj))
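
This variant takes a single file name plus optional compression and filter arguments. A sketch of a call that compares a gzipped output while skipping its non-deterministic header lines (field and file names are hypothetical):

    self.assertFiles(
        data, 'alignment', 'expected.sam.gz',
        compression='gzip',
        # files are read as bytes, so the filter receives bytes lines;
        # lines for which it returns True are excluded from the hashes
        filter=lambda line: line.startswith(b'@'),
    )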
Example 5
    def migrate_storage(self, storage):
        """Migrate storage."""
        if str(storage["_id"]) not in self.storage_index:
            self.unreferenced_storages.append(storage["_id"])
            return 1

        data_id = self.storage_index[str(storage["_id"])]["id"]
        data_path = self.storage_index[str(storage["_id"])]["path"]
        data = Data.objects.get(pk=data_id)

        new = Storage()
        new.name = "data_{}_storage".format(data_id)
        new.data = data
        new.json = storage["json"]
        new.contributor = self.get_contributor(storage["author_id"])
        # XXX: Django will change this on create
        new.created = storage["date_created"]
        # XXX: Django will change this on save
        new.modified = storage["date_modified"]
        new.save()

        dict_dot(data.output, data_path, new.pk)
        data.save()

        self.id_mapping["storage"][str(storage["_id"])] = new.pk
Example 6
    def assertFileExists(self, obj, field_path):  # pylint: disable=invalid-name
        """Ensure a file in the given object's field exists.

        :param obj: object that includes the file for which to check if
            it exists
        :type obj: ~resolwe.flow.models.Data

        :param str field_path: path to
            :class:`~resolwe.flow.models.Data` object's field with the
            file name/path
        """
        field = dict_dot(obj.output, field_path)
        output = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(obj.pk), field['file'])

        if not os.path.isfile(output):
            self.fail(msg="File {} does not exist.".format(field_path))
Example 7
    def assertFileExists(self, obj, field_path):  # pylint: disable=invalid-name
        """Compare output file of a processor to the given correct file.

        :param obj: Data object which includes file that we want to
            compare.
        :type obj: :obj:`resolwe.flow.models.Data`

        :param field_path: Path to file name in Data object.
        :type field_path: :obj:`str`

        """
        field = dict_dot(obj.output, field_path)
        output = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(obj.pk), field['file'])

        if not os.path.isfile(output):
            self.fail(msg="File {} does not exist.".format(field_path))
Example 8
    def assertJSON(self, obj, storage, field_path, file_name):  # pylint: disable=invalid-name
        """Compare JSON in Storage object to the given correct JSON.

        :param obj: object to which the
            :class:`~resolwe.flow.models.Storage` object belongs
        :type obj: ~resolwe.flow.models.Data

        :param storage: object or id which contains JSON to compare
        :type storage: :class:`~resolwe.flow.models.Storage` or
            :class:`str`

        :param str field_path: path to JSON subset in the
            :class:`~resolwe.flow.models.Storage`'s object to compare
            against. If it is empty, the entire object will be
            compared.

        :param str file_name: file name (and relative path) of the file
            with the correct JSON to compare against. Path should be
            relative to the ``tests/files`` directory of a Django
            application.

            .. note::

                The given JSON file should be compressed with *gzip* and
                have the ``.gz`` extension.

        """
        self.assertEqual(os.path.splitext(file_name)[1], '.gz', msg='File extension must be .gz')

        if not isinstance(storage, Storage):
            storage = Storage.objects.get(pk=storage)

        storage_obj = dict_dot(storage.json, field_path)

        file_path = os.path.join(self.files_path, file_name)
        if not os.path.isfile(file_path):
            with gzip.open(file_path, mode='wt') as f:
                json.dump(storage_obj, f)

            self.fail(msg="Output file {} missing so it was created.".format(file_name))

        with gzip.open(file_path, mode='rt') as f:
            file_obj = json.load(f)

        self.assertEqual(storage_obj, file_obj,
                         msg="Storage {} field '{}' does not match file {}".format(
                             storage.id, field_path, file_name) + self._debug_info(obj))
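
A usage sketch, assuming the process wrote its JSON into a Storage object whose id is stored in the Data object's output (the field and file names are hypothetical):

    self.assertJSON(data, data.output['summary'], 'genes', 'expected_genes.json.gz')

Note that the reference file must be gzipped, per the extension check above.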
Example 9
    def assertFields(self, obj, path, value):  # pylint: disable=invalid-name
        """Compare Data object's field to given value.

        :param obj: Data object with field to compare
        :type obj: :obj:`resolwe.flow.models.Data`

        :param path: Path to field in Data object.
        :type path: :obj:`str`

        :param value: Desired value.
        :type value: :obj:`str`

        """
        field = dict_dot(obj.output, path)
        self.assertEqual(field, value,
                         msg="Field 'output.{}' mismatch: {} != {}".format(path, field, value) +
                         self._debug_info(obj))
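
For scalar output fields the call is a one-liner (the field name and value are hypothetical):

    self.assertFields(data, 'species', 'Homo sapiens')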
Example 10
    def assertFields(self, obj, path, value):  # pylint: disable=invalid-name
        """Compare object's field to the given value.

        :param obj: object with the field to compare
        :type obj: ~resolwe.flow.models.Data

        :param str path: path to
            :class:`~resolwe.flow.models.Data` object's field

        :param str value: desired value of
            :class:`~resolwe.flow.models.Data` object's field

        """
        field = dict_dot(obj.output, path)
        self.assertEqual(field, value,
                         msg="Field 'output.{}' mismatch: {} != {}".format(path, field, value) +
                         self._debug_info(obj))
Example 11
    def assertJSON(self, obj, storage, field_path, file_name):  # pylint: disable=invalid-name
        """Compare JSON in Storage object to the given correct output.

        :param obj: Data object to which the Storage object belongs.
        :type obj: :obj:`resolwe.flow.models.Data`

        :param storage: Storage (or storage id) which contains JSON to
            compare.
        :type storage: :obj:`resolwe.flow.models.Storage` or :obj:`str`

        :param field_path: Path to JSON subset to compare in Storage
            object. If it is empty, entire Storage object will be
            compared.
        :type field_path: :obj:`str`

        :param file_name: File name (and relative path) of the file to
            compare against. Name/path is relative to the
            ``tests/files`` folder of a Django application.
        :type file_name: :obj:`str`

        """
        self.assertEqual(os.path.splitext(file_name)[1], '.gz', msg='File extension must be .gz')

        if not isinstance(storage, Storage):
            storage = Storage.objects.get(pk=storage)

        storage_obj = dict_dot(storage.json, field_path)

        file_path = os.path.join(self.files_path, file_name)
        if not os.path.isfile(file_path):
            with gzip.open(file_path, 'wt') as f:  # text mode so json.dump can write str
                json.dump(storage_obj, f)

            self.fail(msg="Output file {} missing so it was created.".format(file_name))

        with gzip.open(file_path, 'rt') as f:  # text mode so json.load reads str
            file_obj = json.load(f)

        self.assertEqual(storage_obj, file_obj,
                         msg="Storage {} field '{}' does not match file {}".format(
                             storage.id, field_path, file_name) + self._debug_info(obj))
Example 12
    def create(self, request, *args, **kwargs):
        """Create a resource."""
        collections = request.data.get('collections', [])

        # check that user has permissions on all collections that Data
        # object will be added to
        for collection_id in collections:
            try:
                collection = Collection.objects.get(pk=collection_id)
            except Collection.DoesNotExist:
                return Response({'collections': ['Invalid pk "{}" - object does not exist.'.format(collection_id)]},
                                status=status.HTTP_400_BAD_REQUEST)

            if not request.user.has_perm('add_collection', obj=collection):
                if request.user.is_authenticated():
                    raise exceptions.PermissionDenied
                else:
                    raise exceptions.NotFound

        # translate the process's slug to its id
        process_slug = request.data.get('process', None)
        process_query = Process.objects.filter(slug=process_slug).order_by('version')
        if not process_query.exists():
            # XXX: security - is it ok to reveal which processes (don't) exist?
            return Response({'process': ['Invalid process slug "{}" - object does not exist.'.format(process_slug)]},
                            status=status.HTTP_400_BAD_REQUEST)
        process = process_query.last()
        request.data['process'] = process.pk

        # check that user has permission on the process
        if not request.user.has_perm('view_process', obj=process):
            if request.user.is_authenticated():
                raise exceptions.PermissionDenied
            else:
                raise exceptions.NotFound

        # perform "get_or_create" if requested - return existing object
        # if found
        if kwargs.pop('get_or_create', False):
            process_input = request.data.get('input', {})

            # use default values if they are not given
            for field_schema, fields, path in iterate_schema(process_input, process.input_schema):
                if 'default' in field_schema and field_schema['name'] not in fields:
                    dict_dot(process_input, path, field_schema['default'])

            checksum = get_data_checksum(process_input, process.slug, process.version)
            data_qs = Data.objects.filter(
                checksum=checksum,
                process__persistence__in=[Process.PERSISTENCE_CACHED, Process.PERSISTENCE_TEMP],
            )
            data_qs = get_objects_for_user(request.user, 'view_data', data_qs)
            if data_qs.exists():
                data = data_qs.order_by('created').last()
                serializer = self.get_serializer(data)
                return Response(serializer.data)

        # create the objects
        resp = super(ResolweCreateDataModelMixin, self).create(request, *args, **kwargs)

        # run manager
        manager.communicate()

        return resp
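
The get_or_create branch deduplicates Data objects by checksumming the process inputs together with the process slug and version, so a repeated request with identical inputs returns the cached object. The actual get_data_checksum implementation is not shown here; a hypothetical equivalent could hash a canonical JSON serialization:

    import hashlib
    import json

    def data_checksum_sketch(process_input, process_slug, process_version):
        """Hypothetical content checksum: equal inputs and process give an equal digest."""
        canonical = json.dumps(
            {'input': process_input, 'slug': process_slug, 'version': process_version},
            sort_keys=True,  # stable key order so the digest is deterministic
        )
        return hashlib.sha256(canonical.encode('utf-8')).hexdigest()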
Example 13
    def run(self, data_id, script, verbosity=1):
        """Execute the script and save results."""
        if verbosity >= 1:
            print('RUN: {} {}'.format(data_id, script))

        self.data_id = data_id

        dir_mode = settings.FLOW_EXECUTOR.get('DATA_DIR_MODE', 0o755)

        output_path = os.path.join(settings.FLOW_EXECUTOR['DATA_PATH'], str(data_id))

        os.mkdir(output_path)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        os.chdir(output_path)

        log_file = open('stdout.txt', 'w+')
        json_file = open('jsonout.txt', 'w+')

        proc_pid = self.start()

        self.update_data_status(
            status=Data.STATUS_PROCESSING,
            started=now(),
            process_pid=proc_pid
        )

        # Run processor and handle intermediate results
        self.run_script(script)
        spawn_processors = []
        output = {}
        process_error, process_warning, process_info = [], [], []
        process_progress, process_rc = 0, 0

        # read processor output
        try:
            stdout = self.get_stdout()
            while True:
                line = stdout.readline()
                if not line:
                    break

                try:
                    if line.strip().startswith('run'):
                        # Save processor and spawn if no errors
                        log_file.write(line)
                        log_file.flush()

                        for obj in iterjson(line[3:].strip()):
                            spawn_processors.append(obj)
                    else:
                        # If the line is JSON, save the updates to the database
                        updates = {}
                        for obj in iterjson(line):
                            for key, val in six.iteritems(obj):
                                if key.startswith('proc.'):
                                    if key == 'proc.error':
                                        process_error.append(val)
                                        if not process_rc:
                                            process_rc = 1
                                            updates['process_rc'] = process_rc
                                        updates['process_error'] = process_error
                                        updates['status'] = Data.STATUS_ERROR
                                    elif key == 'proc.warning':
                                        process_warning.append(val)
                                        updates['process_warning'] = process_warning
                                    elif key == 'proc.info':
                                        process_info.append(val)
                                        updates['process_info'] = process_info
                                    elif key == 'proc.rc':
                                        process_rc = int(val)
                                        updates['process_rc'] = process_rc
                                        if process_rc != 0:
                                            updates['status'] = Data.STATUS_ERROR
                                    elif key == 'proc.progress':
                                        process_progress = int(float(val) * 100)
                                        updates['process_progress'] = process_progress
                                else:
                                    dict_dot(output, key, val)
                                    updates['output'] = output

                        if updates:
                            updates['modified'] = now()
                            self.update_data_status(**updates)

                        if process_rc > 0:
                            log_file.close()
                            json_file.close()
                            os.chdir(CWD)
                            return

                        # Debug output
                        # Not referenced in Data object
                        json_file.write(line)
                        json_file.flush()

                except ValueError:
                    # Ignore if not JSON
                    log_file.write(line)
                    log_file.flush()

        except MemoryError as ex:
            logger.error(__("Out of memory: {}", ex))

        except IOError as ex:
            # TODO: if ex.errno == 28: no more free space
            raise ex
        finally:
            # Store results
            log_file.close()
            json_file.close()
            os.chdir(CWD)

        return_code = self.end()

        if process_rc < return_code:
            process_rc = return_code

        if process_rc == 0:
            self.update_data_status(
                status=Data.STATUS_DONE,
                process_progress=100,
                finished=now()
            )
        else:
            self.update_data_status(
                status=Data.STATUS_ERROR,
                process_progress=100,
                process_rc=process_rc,
                finished=now()
            )

        # try:
        #     # Cleanup after processor
        #     data_purge(data_ids=[data_id], delete=True, verbosity=0)
        # except:  # pylint: disable=bare-except
        #     logger.error(__("Purge error:\n\n{}", traceback.format_exc()))

        # if not update_data(data):  # Data was deleted
        #     # Restore original directory
        #     os.chdir(settings.PROJECT_ROOT)
        #     return

        if spawn_processors and Data.objects.get(pk=self.data_id).status == Data.STATUS_DONE:
            # Spawn processors
            for d in spawn_processors:
                d['contributor'] = Data.objects.get(pk=self.data_id).contributor
                d['process'] = Process.objects.get(slug=d['process'])
                Data.objects.create(**d)
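
The executor treats every line the script prints to stdout as a message: objects with proc.* keys update the Data object's status fields, any other keys are merged into its output via dict_dot, and lines starting with run queue processes to spawn. A process script could therefore report results like this (a hypothetical illustration of the protocol, not an actual resolwe process):

    import json

    # each print() becomes one stdout line parsed by the executor
    print(json.dumps({'proc.progress': 0.5}))      # scaled to process_progress = 50
    print(json.dumps({'stats': {'reads': 1200}}))  # merged into output via dict_dot
    print(json.dumps({'proc.rc': 0}))              # return code; non-zero marks an error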
Example 14
    def run(self, data_id, script, verbosity=1):
        """Execute the script and save results."""
        if verbosity >= 1:
            print('RUN: {} {}'.format(data_id, script))

        self.data_id = data_id

        data_dir = settings.FLOW_EXECUTOR['DATA_DIR']
        dir_mode = getattr(settings, 'FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)

        output_path = os.path.join(data_dir, str(data_id))

        os.mkdir(output_path)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        os.chdir(output_path)

        log_file = open('stdout.txt', 'w+')
        json_file = open('jsonout.txt', 'w+')

        proc_pid = self.start()

        self.update_data_status(
            status=Data.STATUS_PROCESSING,
            started=now(),
            process_pid=proc_pid
        )

        # Run processor and handle intermediate results
        self.run_script(script)
        spawn_processors = []
        output = {}
        process_error, process_warning, process_info = [], [], []
        process_progress, process_rc = 0, 0

        # read processor output
        try:
            stdout = self.get_stdout()
            while True:
                line = stdout.readline()
                if not line:
                    break

                try:
                    if line.strip().startswith('run'):
                        # Save processor and spawn if no errors
                        log_file.write(line)
                        log_file.flush()

                        for obj in iterjson(line[3:].strip()):
                            spawn_processors.append(obj)
                    elif line.strip().startswith('export'):
                        file_name = line[6:].strip()

                        export_folder = settings.FLOW_EXECUTOR['UPLOAD_DIR']
                        unique_name = 'export_{}'.format(uuid.uuid4().hex)
                        export_path = os.path.join(export_folder, unique_name)

                        EXPORTED_FILES_MAPPER[file_name] = unique_name

                        shutil.move(file_name, export_path)
                    else:
                        # If the line is JSON, save the updates to the database
                        updates = {}
                        for obj in iterjson(line):
                            for key, val in six.iteritems(obj):
                                if key.startswith('proc.'):
                                    if key == 'proc.error':
                                        process_error.append(val)
                                        if not process_rc:
                                            process_rc = 1
                                            updates['process_rc'] = process_rc
                                        updates['process_error'] = process_error
                                        updates['status'] = Data.STATUS_ERROR
                                    elif key == 'proc.warning':
                                        process_warning.append(val)
                                        updates['process_warning'] = process_warning
                                    elif key == 'proc.info':
                                        process_info.append(val)
                                        updates['process_info'] = process_info
                                    elif key == 'proc.rc':
                                        process_rc = int(val)
                                        updates['process_rc'] = process_rc
                                        if process_rc != 0:
                                            updates['status'] = Data.STATUS_ERROR
                                    elif key == 'proc.progress':
                                        process_progress = int(float(val) * 100)
                                        updates['process_progress'] = process_progress
                                else:
                                    dict_dot(output, key, val)
                                    updates['output'] = output

                        if updates:
                            updates['modified'] = now()
                            self.update_data_status(**updates)

                        if process_rc > 0:
                            log_file.close()
                            json_file.close()
                            os.chdir(CWD)
                            return

                        # Debug output
                        # Not referenced in Data object
                        json_file.write(line)
                        json_file.flush()

                except ValueError:
                    # Ignore if not JSON
                    log_file.write(line)
                    log_file.flush()

        except MemoryError as ex:
            logger.error(__("Out of memory: {}", ex))

        except IOError as ex:
            # TODO: if ex.errno == 28: no more free space
            raise ex
        finally:
            # Store results
            log_file.close()
            json_file.close()
            os.chdir(CWD)

        return_code = self.end()

        if process_rc < return_code:
            process_rc = return_code

        if spawn_processors and process_rc == 0:
            parent_data = Data.objects.get(pk=self.data_id)

            # Spawn processors
            for d in spawn_processors:
                d['contributor'] = parent_data.contributor
                d['process'] = Process.objects.filter(slug=d['process']).order_by('version').last()

                for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                    type_ = field_schema['type']
                    name = field_schema['name']
                    value = fields[name]

                    if type_ == 'basic:file:':
                        fields[name] = hydrate_spawned_files(value, data_id)
                    elif type_ == 'list:basic:file:':
                        fields[name] = [hydrate_spawned_files(fn, data_id) for fn in value]

                with transaction.atomic():
                    d = Data.objects.create(**d)
                    for collection in parent_data.collection_set.all():
                        collection.data.add(d)

        if process_rc == 0:
            self.update_data_status(
                status=Data.STATUS_DONE,
                process_progress=100,
                finished=now()
            )
        else:
            self.update_data_status(
                status=Data.STATUS_ERROR,
                process_progress=100,
                process_rc=process_rc,
                finished=now()
            )

        try:
            # Cleanup after processor
            if data_id != 'no_data_id':
                data_purge(data_ids=[data_id], delete=True, verbosity=verbosity)
        except:  # pylint: disable=bare-except
            logger.error(__("Purge error:\n\n{}", traceback.format_exc()))