Example #1
    def _get_result_dataset_dir(base_dir, input_dataset_dir,
                                   parameter_setting, hide_parameters):
        """ Determines the name of the result directory

        Determines the name of the result directory based on the
        input_dataset_dir, the node_chain_name and the parameter setting.
        """
        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join([pySPACE.configuration.storage,
                                                input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter values since
        # Weka ignores them
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        # Determine the result_directory name
        # The separator between key and value was changed from ":" to "#"
        # because of problems on Windows and with Windows file servers.
        parameter_str = "}{".join(("%s#%s" % (key, value))
                                        for key, value in parameter_setting.iteritems()
                                            if key not in hide_parameters)

        result_name = "{%s}" % input_name

        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        # and create the directory if necessary
        result_dir = base_dir
        result_dir += os.sep + result_name
        create_directory(result_dir)

        return result_dir
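
In isolation, the naming scheme above composes the result directory from the input name and the non-hidden key/value pairs. The following sketch uses a made-up base directory, input name, and parameter setting (none of these are pySPACE values); it only reproduces the string composition:

# Hypothetical illustration of the "{input}{key#value}..." naming scheme;
# base_dir, input_name and parameter_setting are made up for this sketch.
import os

base_dir = "/tmp/results"
input_name = "my_dataset"
parameter_setting = {"lower_cutoff": 0.4, "upper_cutoff": 4.0}
hide_parameters = ["__PREPARE_OPERATION__"]

parameter_str = "}{".join("%s#%s" % (key, value)
                          for key, value in sorted(parameter_setting.items())
                          if key not in hide_parameters)
result_name = "{%s}" % input_name
if parameter_str != "":
    result_name += "{%s}" % parameter_str

result_dir = base_dir + os.sep + result_name
print(result_dir)
# /tmp/results/{my_dataset}{lower_cutoff#0.4}{upper_cutoff#4.0}   (on POSIX)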
Example #2
    def consolidate(self):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            try:
                author = pwd.getpwuid(os.getuid())[4]
            except:
                author = "unknown"
                self._log("Author could not be resolved.",level=logging.WARNING)
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author" : author, "date" : date})
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results easier
            input_meta_path = os.sep.join([pySPACE.configuration.storage,
                                          meta_data["input_collection_name"]])
            input_meta = BaseDataset.load_meta_data(input_meta_path)
            BaseDataset.store_meta_data(dataset_path,input_meta,
                                        file_name="input_metadata.yaml")
        # Check if some results consist of several runs
        # and update the meta data in this case
        # TODO: This is not a clean solution
        for dataset_dir in glob.glob(os.sep.join([self.result_directory,
                                                     "*"])):
            if not os.path.isdir(dataset_dir): continue
            # There can be either run dirs, persistency dirs, or both of them.
            # Check which of them there are more of. If both exist, their
            # numbers are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "data_run*"])))
            nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "persistency_run*"])))
            nr_runs = max(nr_run_dirs, nr_per_dirs)

            if nr_runs > 1:
                collection_meta = BaseDataset.load_meta_data(dataset_dir)
                collection_meta["runs"] = nr_runs
                BaseDataset.store_meta_data(dataset_dir,collection_meta)
        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged into one csv table.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)

            if not(self.compression == False):
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to keep the result directory easily browsable.
                import zipfile
                cwd=os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we log the error, try 64-bit mode, and otherwise skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
                    
                    if not self.compression == "delete":                        
                        save_file=zipfile.ZipFile(self.result_directory+'/result_folders.zip',mode="w",compression=self.compression)
                        # we want to have the zipped file relative to the result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path=os.path.relpath(node[0],self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,data))
                        save_file.close()
                    # To still have an easy access to the history of the processing,
                    # we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except:
                    self._log("Result files could not be compressed with 32 bit mode, switching to 64 bit mode.", level=logging.CRITICAL)
                    # nearly identical code; the only difference is the 64-bit mode
                    try:
                        pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
                        save_file=zipfile.ZipFile(self.result_directory+'/result_folders.zip',mode="w",compression=self.compression, allowZip64=True)
                        # we want to have the zipped file relative to the result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path=os.path.relpath(node[0],self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,data))
                        save_file.close()
                        # To still have an easy access to the history of the processing,
                        # we keep one folder.
                        pathlist.pop()
                        for path in pathlist:
                            shutil.rmtree(path)
                    except:
                        self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL)
                os.chdir(cwd)
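
The compression step above stores archive entries relative to the result directory by changing the working directory first. A minimal sketch of the same idea, using zipfile's arcname argument instead of os.chdir and assuming made-up paths, could look like this:

# Minimal sketch (assumed example paths): zip result folders so that the stored
# paths are relative to the base directory; allowZip64 covers large archives.
import os
import zipfile

base_dir = "/tmp/results"
if not os.path.isdir(base_dir):
    os.makedirs(base_dir)
folders = [os.path.join(base_dir, name)
           for name in ("{set1}{param#1}", "{set1}{param#2}")]

archive = zipfile.ZipFile(os.path.join(base_dir, "result_folders.zip"),
                          mode="w", compression=zipfile.ZIP_DEFLATED,
                          allowZip64=True)
for folder in folders:
    for dirpath, dirnames, filenames in os.walk(folder):
        rel_path = os.path.relpath(dirpath, base_dir)
        archive.write(dirpath, arcname=rel_path)              # directory entry
        for filename in filenames:
            archive.write(os.path.join(dirpath, filename),
                          arcname=os.path.join(rel_path, filename))
archive.close()

Passing arcname avoids the os.getcwd()/os.chdir() bookkeeping of the original code while producing the same relative paths inside the archive.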
Example #3
    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """ A factory method that creates the processes which form an operation
        based on the information given in the operation specification, *operation_spec*.

        In debug mode this is done serially. In the default mode, currently up
        to 4 processes are created in parallel and can be executed immediately,
        so process generation and execution happen in parallel. This kind of
        process creation is done independently of the backend.

        For huge parameter spaces this is necessary!
        Otherwise numerous processes would be created and the corresponding data
        loaded, but the computation could not really be spread across different
        processors, because process creation would block one processor and its
        memory without doing anything else until all processes are created.

        .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
                  for empty data, when no input_path is given.
        """
        assert(operation_spec["type"] == "node_chain")

        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        ## Use node_chain parameter if no templates are given ##
        if not operation_spec.has_key("templates"):
            if operation_spec.has_key("node_chain"):
                operation_spec["templates"]=[operation_spec.pop("node_chain")]
            else:
                warnings.warn("Specify parameter 'templates' or 'node_chain' in your operation spec!")
                operation_spec["templates"]=[operation_spec.pop("flow")]
        elif operation_spec.has_key("node_chain"):
            operation_spec.pop("node_chain")
            warnings.warn("node_chain parameter is ignored. Templates are used.")
        elif type(operation_spec["templates"][0])==str: # load files in templates
            operation_spec["template_files"]=copy.deepcopy(operation_spec["templates"])
            for i in range(len(operation_spec["templates"])):
                rel_node_chain_file = operation_spec["templates"][i]
                abs_node_chain_file = open(os.sep.join([pySPACE.configuration.spec_dir,
                                                     "node_chains",
                                                     rel_node_chain_file]), 'r')
                node_chain = yaml.load(abs_node_chain_file)
                abs_node_chain_file.close()
                operation_spec["templates"][i] = node_chain


        storage = pySPACE.configuration.storage
        if not input_paths:
            raise Exception("No input datasets found in input_path %s in %s!"
                            % (operation_spec["input_path"],storage))

        # Get relative path
        rel_input_paths = [name[len(storage):]
                                for name in  input_paths]

        # Determine approximate number of runs
        if "runs" in operation_spec:
            runs = operation_spec["runs"]
        else:
            runs = []
            for dataset_dir in rel_input_paths:
                abs_collection_path = \
                        pySPACE.configuration.storage + os.sep \
                            + dataset_dir
                collection_runs = \
                        BaseDataset.load_meta_data(abs_collection_path).get('runs',1)
                runs.append(collection_runs)
            runs = max(runs)

        # Determine splits
        dataset_dir = rel_input_paths[0]
        abs_collection_path = \
                pySPACE.configuration.storage + os.sep + dataset_dir

        splits = BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)

        # Determine how many processes will be created
        number_processes = len(operation_spec["templates"]) * \
                           len(parameter_settings) * len(rel_input_paths) * \
                           runs * splits

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, rel_input_paths)
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another process so that already created processes can be executed
            # in parallel. A queue with a limited size is used to guarantee that
            # not too many objects are created at once (because this costs
            # memory). However, the actual number of 4 is arbitrary and might
            # be changed according to the system at hand.
            processes = processing.Queue(4)
            create_process = \
                    processing.Process(target=cls._createProcesses,
                                       args=(processes, result_directory,
                                             operation_spec, parameter_settings,
                                             rel_input_paths))
            create_process.start()
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)
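
The non-debug branch fills a bounded queue from a separate producer process while the consumer can already execute items; a False sentinel signals that creation is finished. A stripped-down sketch of that pattern with the standard multiprocessing module (pySPACE uses the older processing package, and the strings below merely stand in for real process objects):

# Minimal sketch of the bounded-queue producer/consumer pattern used above.
# The queue items are placeholder strings, not real pySPACE processes.
import multiprocessing


def _create_processes(queue):
    for i in range(10):
        queue.put("process-%d" % i)   # blocks while 4 items are already queued
    queue.put(False)                  # sentinel: creation is finished


if __name__ == "__main__":
    queue = multiprocessing.Queue(4)  # bounded to limit memory consumption
    producer = multiprocessing.Process(target=_create_processes, args=(queue,))
    producer.start()
    while True:
        item = queue.get()
        if item is False:             # sentinel received, stop executing
            break
        print("executing %s" % item)
    producer.join()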
Example #4
    def _createProcesses(cls, processes, result_directory, operation_spec,
                         parameter_settings, input_collections):

        storage_format = operation_spec["storage_format"] if "storage_format" \
                         in operation_spec else None

        # Determine whether the node_chain should be stored after data processing
        store_node_chain = operation_spec["store_node_chain"] \
                         if "store_node_chain" in operation_spec else False

        # Determine whether certain parameters should not be remembered
        hide_parameters = [] if "hide_parameters" not in operation_spec \
                                else list(operation_spec["hide_parameters"])
        hide_parameters.append("__INPUT_COLLECTION__")
        hide_parameters.append("__INPUT_DATASET__")
        hide_parameters.append("__RESULT_DIRECTORY__")
        hide_parameters.append("__OUTPUT_BUNDLE__")

        # Create all combinations of collections, runs and splits
        collection_run_split_combinations = []
        for input_dataset_dir in input_collections:
            # Determine number of runs to be conducted for this collection
            abs_collection_path = \
                pySPACE.configuration.storage + os.sep \
                    + input_dataset_dir
            collection_runs = \
                BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)
                # D.get(k[,d]) -> D[k] if k in D, else d.

            if "runs" not in operation_spec:
                requested_runs  = collection_runs
            else:
                requested_runs = operation_spec["runs"]

            assert collection_runs == requested_runs \
                        or collection_runs ==  1, \
                    "Requested %s runs but input collection %s provides "\
                    "data for %s runs." % (requested_runs, input_dataset_dir,
                                           collection_runs)

            for run in range(max(requested_runs, collection_runs)):
                collection_splits = \
                    BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)
                for split in range(collection_splits):
                    collection_run_split_combinations.append((input_dataset_dir, run, split))

        # Shuffle the order of dataset-run-split combinations. This should help
        # to avoid that all processes work on the same data at the same time,
        # which can cause problems due to locking etc.
        random.shuffle(collection_run_split_combinations)

        # For all templates
        for node_chain_spec in operation_spec["templates"]:
            # For all possible parameter instantiations of this template
            for parameter_setting in parameter_settings:
                # For all input collections-run combinations
                for input_dataset_dir, run, split in collection_run_split_combinations:
                    # We are going to change the parameter_setting and don't want to
                    # interfere with later runs so we work on a copy
                    parameter_setting_cp = copy.deepcopy(parameter_setting)

                    # Add input and output path to parameter
                    # setting
                    parameter_setting_cp["__INPUT_DATASET__"] = \
                            input_dataset_dir.split(os.sep)[-2]
                    parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                            result_directory
                    if len(operation_spec["templates"])>1:
                        index = operation_spec["templates"].index(node_chain_spec)
                        parameter_setting_cp["__Template__"]=\
                            operation_spec["template_files"][index]

                    # Load the input meta data
                    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                                               input_dataset_dir])
                    dataset_md = BaseDataset.load_meta_data(dataset_dir)
                    # Add the input parameters meta data to the given parameter setting
                    if "parameter_setting" in dataset_md:
                        dataset_md["parameter_setting"].update(parameter_setting_cp)
                        all_parameters = dataset_md["parameter_setting"]
                    else:
                        all_parameters = parameter_setting_cp

                    def check_constraint(constraint, parameters):
                        for key, value in parameters.iteritems():
                            constraint = constraint.replace(key, str(value))
                        return eval(constraint)

                    if not all(check_constraint(constraint_def,
                                                all_parameters) for \
                               constraint_def in \
                               operation_spec.get('old_parameter_constraints',[])):
                        continue

                    # Determine directory in which the result of this
                    # process should be written
                    result_dataset_directory = \
                        NodeChainOperation._get_result_dataset_dir(result_directory,
                                                                 input_dataset_dir,
                                                                 parameter_setting_cp,
                                                                 hide_parameters)
                    # Create the respective process and put it to the
                    # executing-queue of processes
                    process = NodeChainProcess(
                        node_chain_spec=node_chain_spec,
                        parameter_setting=parameter_setting_cp,
                        rel_dataset_dir=input_dataset_dir,
                        run=run,
                        split=split,
                        storage_format=storage_format,
                        result_dataset_directory=result_dataset_directory,
                        store_node_chain=store_node_chain)

                    processes.put(process)

        # give executing process the sign that creation is now finished
        processes.put(False)
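
The old_parameter_constraints handling above substitutes parameter names into a constraint string and evaluates the result. In isolation, with made-up parameter names, the check behaves like this:

# Hypothetical illustration of the constraint check used above: parameter names
# in the constraint string are replaced by their values before eval().
def check_constraint(constraint, parameters):
    for key, value in parameters.items():
        constraint = constraint.replace(key, str(value))
    return eval(constraint)

all_parameters = {"__LOWER__": 0.4, "__UPPER__": 4.0}   # assumed parameter names
constraints = ["__LOWER__ < __UPPER__", "__UPPER__ <= 2.0"]

print([check_constraint(c, all_parameters) for c in constraints])    # [True, False]
print(all(check_constraint(c, all_parameters) for c in constraints)) # False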
Example #5
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()

        for dataset_dir1 in self.input_datasets:
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]

                # Check if the input data is split
                splitted = len(
                    glob.glob(os.sep.join([dataset_dir1, "data_run0", "*"
                                           ]))) > 1

                # Check that all constraints are fulfilled for this pair of
                # input datasets
                if not all(
                        eval(
                            constraint_template % {
                                'dataset_name1': dataset_name1,
                                'dataset_name2': dataset_name2
                            })
                        for constraint_template in self.dataset_constraints):
                    continue

                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Copy the data
                        os.symlink(
                            dataset_dir1,
                            os.sep.join([self.result_directory,
                                         dataset_name1]))
                    continue

                # Determine names of the original data sets the input
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
                                                   base_dataset2)
                target_dataset_name = dataset_name1.replace(
                    base_dataset1, mixed_base_dataset)

                target_dataset_dir = os.sep.join(
                    [self.result_directory, target_dataset_name])

                create_directory(os.sep.join([target_dataset_dir,
                                              "data_run0"]))

                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(
                            os.sep.join(
                                [dataset_dir1, "data_run0", "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets,
                        #       we could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(
                            dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(
                            dataset_dir1, dataset_dir2)

                        source_test_file_name = source_test_file_name.replace(
                            "train.", "test.")
                        target_test_file_name = target_train_file_name.replace(
                            "train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(
                            os.sep.join(
                                [dataset_dir1, "data_run0", "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace(
                            "test.", "train.")
                        target_train_file_name = target_train_file_name.replace(
                            dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(
                            dataset_dir1, dataset_dir2)

                        target_test_file_name = target_train_file_name.replace(
                            "train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                output_dataset_meta['author'] = get_author()
                BaseDataset.store_meta_data(target_dataset_dir,
                                            output_dataset_meta)

        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
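
The train/test pairing above is derived purely from file names: swapping the dataset directory selects the counterpart file from the second dataset, and swapping the "train."/"test." markers selects the matching split. A small sketch with assumed paths (no files are touched):

# Hypothetical illustration of the file-name pairing used above.
import os

dataset_dir1 = os.sep.join(["storage", "{set_A}"])
dataset_dir2 = os.sep.join(["storage", "{set_B}"])
source_train_file_name = os.sep.join([dataset_dir1, "data_run0",
                                      "features_sp0_train.arff"])

source_test_file_name = source_train_file_name.replace(dataset_dir1, dataset_dir2)
source_test_file_name = source_test_file_name.replace("train.", "test.")
print(source_test_file_name)
# storage/{set_B}/data_run0/features_sp0_test.arff   (with "/" as os.sep)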
Example #6
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()

        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check whether the input data is split
            # (we expect only a single test file in the source directory)
            source_files = glob.glob(
                os.sep.join(
                    [source_test_collection_path, "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert (not splitted)
            source_file_name = str(source_files[-1])

            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0

            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix = "test"

            # We create the collection Rest_vs_Collection
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    test_base_collection_name + "_vs_" + self.name_pattern)
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    self.name_pattern + "_vs_" + test_base_collection_name)
                key = "test"

            target_collection_path = os.sep.join(
                [self.result_directory, target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}

            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection
                source_test_file_path = os.sep.join([
                    source_test_collection_path, "data_run0",
                    "features_sp0" + train_set_name_suffix + ".arff"
                ])
                target_test_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])

            else:
                file_ending = source_file_name.split(".")[-1]
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path

            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue

                # Check that all constraints are fulfilled for this pair of
                # input collections
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue

                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__",
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                if remaining_params != []:
                    for k, v in remaining_params:
                        target_collection_path += "{%s#%s}" % (k, str(v))
                        target_collection_params[k] = v

                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path,
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                else:
                    source_train_file_path = source_train_collection_path

                source_train_pathes.append(source_train_file_path)

            if "arff" == file_ending:
                target_train_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])
            else:
                target_train_file_path = target_collection_path

            if len(source_train_pathes) == 0:
                continue

            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))

            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path,
                                     target_test_file_path,
                                     source_test_collection_name,
                                     target_collection_name)

                self._merge_arff_files(target_train_file_path,
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(
                    source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path, input_meta)
            else:
                self._copy_file(source_test_collection_path,
                                target_collection_path, train_set_name_suffix)

                self._merge_files(target_train_file_path, source_train_pathes,
                                  train_set_name_suffix,
                                  target_collection_params)

        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()
Example #7
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()
        
        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check whether the input data is split
            # (we expect only a single test file in the source directory)
            source_files = glob.glob(os.sep.join([source_test_collection_path,
                                                  "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert(not splitted)
            source_file_name = str(source_files[-1])
            
            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0
            
            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix =  "test"
            
            # We create the collection Rest_vs_Collection
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         test_base_collection_name + "_vs_Rest")
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         "Rest_vs_" + test_base_collection_name)
                key = "test"
                
            target_collection_path = os.sep.join([self.result_directory,
                                                  target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}
            
            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection
                source_test_file_path = os.sep.join([source_test_collection_path,
                                        "data_run0","features_sp0" +
                                        train_set_name_suffix + ".arff"])
                target_test_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            
            elif source_file_name.endswith("pickle"):
                file_ending = "pickle"
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in " \
                                                               "MergeOperation")
            
            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue
                
                # Check that all constraints are fulfilled for this pair of
                # input collections
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue
                
                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__", 
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                if remaining_params != []:
                    for k, v in remaining_params:
                        target_collection_path += "{%s#%s}" % (k, str(v))
                        target_collection_params[k] = v
                   
                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path, 
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                elif "pickle" == file_ending:
                    source_train_file_path = source_train_collection_path

                else:
                    raise NotImplementedError("File type not supported in " \
                                                              "MergeOperation!")     
                    
                source_train_pathes.append(source_train_file_path)
            
            if "arff" == file_ending:
                target_train_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            elif "pickle" == file_ending:
                target_train_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                                              "MergeOperation!")     
            
            if len(source_train_pathes) == 0:
                continue
            
            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))
            
            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path, 
                                     target_test_file_path,
                                     source_test_collection_name, 
                                     target_collection_name)
                                
                self._merge_arff_files(target_train_file_path, 
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path,input_meta)
            elif "pickle" == file_ending:
                self._copy_pickle_file(source_test_collection_path,
                                       target_collection_path,
                                       train_set_name_suffix)

                self._merge_pickle_files(target_train_file_path, 
                                         source_train_pathes, 
                                         train_set_name_suffix,
                                         target_collection_params)
            else:
                raise NotImplementedError("File type not supported in merge_operation")
            
        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()
Example #8
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()
        
        for dataset_dir1 in self.input_datasets:                
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]
                
                # Check if the input data is split
                splitted = len(glob.glob(os.sep.join([dataset_dir1, "data_run0",
                                                      "*"]))) > 1
                
                # Check that all constraints are fulfilled for this pair of
                # input datasets
                if not all(eval(constraint_template % {'dataset_name1': dataset_name1,
                                                       'dataset_name2': dataset_name2})
                                    for constraint_template in self.dataset_constraints):
                    continue
                
                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Copy the data 
                        os.symlink(dataset_dir1,
                                   os.sep.join([self.result_directory, 
                                                dataset_name1]))
                    continue
             
                # Determine names of the original data sets the input 
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]
                
                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1, 
                                                      base_dataset2)
                target_dataset_name = dataset_name1.replace(base_dataset1,
                                                                  mixed_base_dataset)
                
                target_dataset_dir = os.sep.join([self.result_directory,
                                                     target_dataset_name])
                
                create_directory(os.sep.join([target_dataset_dir, "data_run0"]))
                
                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                       "data_run0",
                                                                       "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets,
                        #       we could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        source_test_file_name =  source_test_file_name.replace("train.",
                                                                                "test.")
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and 
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace("test.",
                                                                                "train.")
                        target_train_file_name = target_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                try:
                    output_dataset_meta['author'] = pwd.getpwuid(os.getuid())[4]
                except :
                    self._log("Author could not be resolved.",level=logging.WARNING)
                    output_dataset_meta['author'] = "unknown"
                BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)
        
        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
Example #9
    def consolidate(self):
        """
        Consolidates the results obtained by the single WEKA filter
        processes into a consistent summary of datasets that is stored on
        the file system.
        
        .. todo:: Some of the contents of this method should go into the
                  :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
        """

        # Iterate over all collections and store the collection meta data etc.
        for entries in os.listdir(self.result_directory):
            fullpath = os.path.join(self.result_directory, entries)
            # For each collection        
            if os.path.isdir(fullpath):
                if entries.startswith("{"):
                    # Extract the parameters from the collection name in order to
                    # adjust the relation name
                    if self.num_parameters > 0:
                        parameter_strings = entries.strip("}{").split("}{")[-self.num_parameters:]
                        parameter_postfix = "{" + "}{".join(parameter_strings) + "}"
                    else:
                        parameter_strings = ""
                        parameter_postfix = ""
                    # Postprocessing of the arff files of this collection
                    for train_arff_file in glob.glob(fullpath + os.sep + "data_run*" 
                                           + os.sep + "*train.arff"):
                        # Adjust the relation name of the train file
                        content = open(train_arff_file, 'r').readlines()             
                        # We strip everything after the last "}"
                        endindex = content[0].rfind("}")
                        content[0] = content[0][:endindex+1]
                        content[0] += parameter_postfix + "'"
                        open(train_arff_file, 'w').writelines(content)
                        # Use relation name of train data for test data
                        test_arff_file = train_arff_file.replace("train.arff", "test.arff") 
                        test_content = open(test_arff_file, 'r').readlines()
                        test_content[0] = content[0] + "\n"
                        open(test_arff_file, 'w').writelines(test_content)
                    
                        # Check which features are contained in the arff file
                        feature_names = []
                        for line in content:
                            if line.startswith("@attribute"):
                                attribute = line.split()[1]
                                if attribute != "class":
                                    feature_names.append(attribute)
                    # Store the collection meta data etc.
                    if self.num_parameters > 0:
                        input_collection_name = \
                            "{" + "}{".join(entries.strip("}{").split("}{")[:-self.num_parameters]) + "}"
                    else:
                        input_collection_name = entries
                        
                    input_collection_path = os.path.join(self.operation_spec["input_path"],
                                                     input_collection_name)

                    input_collection_meta = BaseDataset.load_meta_data(
                                            pySPACE.configuration.storage
                                            + os.sep
                                            + input_collection_path)
                    # Store the input collection
                    BaseDataset.store_meta_data(fullpath, input_collection_meta,
                                                file_name="input_metadata.yaml")
                    # Adjust collection metadata for the new collection
                    input_collection_meta["feature_names"] = feature_names
                    input_collection_meta["num_features"] = len(feature_names)
                    input_collection_meta["author"] = get_author()
                    input_collection_meta["date"] = time.strftime("%Y%m%d")
                    input_collection_meta["input_collection_name"] = input_collection_name
                    # Write the collection meta information into the folder
                    BaseDataset.store_meta_data(fullpath,input_collection_meta)
                    # Store the command_template
                    command_template_file = open(os.path.join(fullpath,
                                                          "command_template"), 'w')
                    command_template_file.write(self.command_template)
                    command_template_file.close()
                else:
                    # training and test arff files need the same relation name,
                    # otherwise Weka cannot relate them to each other; the collection
                    # name and the parameters in {}{} notation must be the relation
                    # name for further processing
                    self._log("WARNING: Collection name doesn't begin with '{'. Further processing may fail!", level=logging.WARNING)
        # Write the specification of this operation
        # to the result directory in order to make later
        # analysis of results easier
        source_operation_file = open(os.path.join(self.result_directory,
                                                  "source_operation.yaml"), 'w')
        yaml.dump(self.operation_spec, source_operation_file)
        source_operation_file.close()
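
A minimal, self-contained sketch of the ARFF handling above: rewrite the
@relation line of a train file, reuse the resulting relation name for the
matching test file, and collect the declared feature names. File contents,
paths and the parameter_postfix value are made up for illustration only.

    import os
    import tempfile

    # Hypothetical ARFF pair (illustration only)
    tmp_dir = tempfile.mkdtemp()
    train_arff_file = os.path.join(tmp_dir, "example_train.arff")
    test_arff_file = os.path.join(tmp_dir, "example_test.arff")
    with open(train_arff_file, "w") as f:
        f.write("@relation '{input_collection}'\n"
                "@attribute feat_1 numeric\n"
                "@attribute class {Standard,Target}\n"
                "@data\n1.0,Standard\n")
    with open(test_arff_file, "w") as f:
        f.write("@relation '{something_else}'\n"
                "@attribute feat_1 numeric\n"
                "@attribute class {Standard,Target}\n"
                "@data\n2.0,Target\n")

    parameter_postfix = "{param#1}"  # assumed {key#value} postfix

    # Rewrite the train relation name: keep everything up to the last "}"
    # and append the parameter postfix plus the closing quote
    content = open(train_arff_file, "r").readlines()
    endindex = content[0].rfind("}")
    relation = content[0][:endindex + 1] + parameter_postfix + "'"
    content[0] = relation + "\n"
    open(train_arff_file, "w").writelines(content)

    # Use the train relation name for the test file as well
    test_content = open(test_arff_file, "r").readlines()
    test_content[0] = relation + "\n"
    open(test_arff_file, "w").writelines(test_content)

    # Collect the feature names declared in the ARFF header
    feature_names = [line.split()[1] for line in content
                     if line.startswith("@attribute")
                     and line.split()[1] != "class"]
    print(feature_names)  # ['feat_1']
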
Example #10
0
    def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting, hide_parameters):
        """ Determines the name of the result directory

        Determines the name of the result directory based on the
        input_dataset_dir, the node_chain_name and the parameter setting.
        """
        # Determine the result_directory name
        # The string between key and value was changed from ":" to "#"
        # because of problems on Windows and with Windows file servers
        def _get_result_dir_name(parameter_setting, hide_parameters, method=None):
            """ internal function to create result dir name in different ways"""
            if not method:
                parameter_str = "}{".join(
                    ("%s#%s" % (key, value))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                )
            elif method == "hash":
                parameter_str = "}{".join(
                    ("%s#%s" % (key, hash(str(value).replace(" ", ""))))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                )

            parameter_str = parameter_str.replace("'", "")
            parameter_str = parameter_str.replace(" ", "")
            parameter_str = parameter_str.replace("[", "")
            parameter_str = parameter_str.replace("]", "")
            parameter_str = parameter_str.replace(os.sep, "")
            result_name = "{%s}" % input_name

            if parameter_str != "":
                result_name += "{%s}" % (parameter_str)

            # Determine the path where this result will be stored
            # and create the directory if necessary
            result_dir = base_dir
            result_dir += os.sep + result_name
            # If the filename is too long
            # (longer than allowed, including optional offsets for pySPACE
            #  result csv naming conventions),
            # create an MD5 hash of the result name and use that instead
            import platform

            CURRENTOS = platform.system()
            if CURRENTOS == "Windows":
                # the maximum length for a filename on Windows is 255
                if len(result_dir) > 255 - 32:
                    result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
                    result_dir = base_dir
                    result_dir += os.sep + result_name
                return result_dir
            else:
                if len(result_dir) > os.pathconf(os.curdir, "PC_NAME_MAX") - 32:
                    result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
                    result_dir = base_dir
                    result_dir += os.sep + result_name
                return result_dir

        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join([pySPACE.configuration.storage, input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
        try:
            create_directory(result_dir)
        except OSError as e:
            if e.errno == 36:
                # filename is too long
                result_dir = _get_result_dir_name(parameter_setting, hide_parameters, "hash")
            create_directory(result_dir)

        return result_dir
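
A minimal sketch of the fallback used above: when the assembled result
directory name exceeds the file system limit, the basename is replaced by an
MD5 hash (the 32-character margin matches the length of the hex digest). The
function name, base directory and limit below are illustrative assumptions.

    import hashlib
    import os

    def shorten_result_dir(base_dir, result_name, max_len=255):
        """ Replace an overlong result name by its MD5 hash (sketch only) """
        result_dir = base_dir + os.sep + result_name
        # keep a margin of 32 characters for suffixes added later on
        if len(result_dir) > max_len - 32:
            digest = hashlib.md5(result_name.encode("utf-8")).hexdigest()
            result_dir = base_dir + os.sep + "{" + digest + "}"
        return result_dir

    # Usage example with a deliberately long parameter string
    long_name = "{input}{" + "}{".join(
        "p%d#%d" % (i, i) for i in range(60)) + "}"
    print(shorten_result_dir("/tmp/results", long_name))
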
Example #11
0
    def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting,
                                hide_parameters):
        """ Determines the name of the result directory

        Determines the name of the result directory based on the
        input_dataset_dir, the node_chain_name and the parameter setting.
        """

        # Determine the result_directory name
        # The string between key and value was changed from ":" to "#"
        # because of problems on Windows and with Windows file servers
        def _get_result_dir_name(parameter_setting,
                                 hide_parameters,
                                 method=None):
            """ internal function to create result dir name in different ways"""
            if not method:
                parameter_str = "}{".join(
                    ("%s#%s" % (key, value))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters)
            elif method == "hash":
                parameter_str = "}{".join(
                    ("%s#%s" % (key, hash(str(value).replace(' ', ''))))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters)

            parameter_str = parameter_str.replace("'", "")
            parameter_str = parameter_str.replace(" ", "")
            parameter_str = parameter_str.replace("[", "")
            parameter_str = parameter_str.replace("]", "")
            parameter_str = parameter_str.replace(os.sep, "")
            result_name = "{%s}" % input_name

            if parameter_str != "":
                result_name += "{%s}" % (parameter_str)

            # Determine the path where this result will be stored
            # and create the directory if necessary
            result_dir = base_dir
            result_dir += os.sep + result_name
            # If the filename is too long
            # (longer than allowed, including optional offsets for pySPACE
            #  result csv naming conventions),
            # create an MD5 hash of the result name and use that instead
            import platform
            CURRENTOS = platform.system()
            if CURRENTOS == "Windows":
                # the maximum length for a filename on Windows is 255
                if len(result_dir) > 255 - 32:
                    result_name = "{" + hashlib.md5(
                        result_name).hexdigest() + "}"
                    result_dir = base_dir
                    result_dir += os.sep + result_name
                return result_dir
            else:
                if len(result_dir) > os.pathconf(os.curdir,
                                                 'PC_NAME_MAX') - 32:
                    result_name = "{" + hashlib.md5(
                        result_name).hexdigest() + "}"
                    result_dir = base_dir
                    result_dir += os.sep + result_name
                return result_dir

        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join(
            [pySPACE.configuration.storage, input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
        try:
            create_directory(result_dir)
        except OSError as e:
            if e.errno == 36:
                # filename is too long
                result_dir = _get_result_dir_name(parameter_setting,
                                                  hide_parameters, "hash")
            create_directory(result_dir)

        return result_dir
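
The method="hash" branch above shortens each parameter value individually,
before the whole-name MD5 fallback ever triggers. A standalone sketch of the
two naming variants follows; the parameter names and values are made up, and
note that Python's built-in hash() is not stable across interpreter runs, so
the hashed variant trades reproducible directory names for shortness.

    import os

    hide_parameters = ["__INPUT_DATASET__"]       # assumed hidden keys
    parameter_setting = {"classifier": "SVM",     # illustrative values only
                         "complexities": [0.01, 0.1, 1.0],
                         "__INPUT_DATASET__": "ignored"}

    # Plain {key#value} chain, stripped of characters that break file names
    parameter_str = "}{".join("%s#%s" % (key, value)
                              for key, value in sorted(parameter_setting.items())
                              if key not in hide_parameters)
    for char in ("'", " ", "[", "]", os.sep):
        parameter_str = parameter_str.replace(char, "")
    print("{input_name}{%s}" % parameter_str)

    # "hash" variant: replace each value by hash(str(value)) to shorten it
    hashed_str = "}{".join("%s#%s" % (key, hash(str(value).replace(" ", "")))
                           for key, value in sorted(parameter_setting.items())
                           if key not in hide_parameters)
    print("{input_name}{%s}" % hashed_str)
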
Example #12
0
    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([
            self.result_directory,
            "{*",
        ])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check whichever there are more of. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(
                    glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(
                    glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results easier
                # THA: Strip the leading "/" from the input collection name, because otherwise it will be treated
                # as an absolute path
                input_collection_name = meta_data["input_collection_name"][1:] if \
                    meta_data["input_collection_name"][0] == os.sep else meta_data["input_collection_name"]
                input_meta_path = os.path.join(pySPACE.configuration.storage,
                                               input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(
                        dataset_path,
                        input_meta,
                        file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    self._log("Error copying the input_metadata.yaml: {error}".
                              format(error=e.message),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error updating the metadata: {error!s}".format(error=e))
                raise e

        # If we don't create a feature vector or time series collection,
        # we have evaluated our classification using a classification performance sink.
        # The resulting files should be merged into one csv table.
        pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
        if len(pathlist) > 0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(
                    dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error merging the result collection: {error!s}".format(
                        error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder easier to browse.
                import zipfile
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we want to log it, try 64 bit mode,
                # and then skip the zipping.
                try:
                    pathlist = glob.glob(
                        os.path.join(self.result_directory, "{*}"))

                    if not self.compression == "delete":
                        save_file = zipfile.ZipFile(
                            self.result_directory + '/result_folders.zip',
                            mode="w",
                            compression=self.compression)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(
                                    node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(
                                        os.path.join(rel_path, data))
                        save_file.close()
                    # To still have an easy access to the history of the
                    # processing, we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except Exception as e:
                    self._log("Result files could not be compressed with 32" +
                              " bit mode, switching to 64 bit mode",
                              level=logging.CRITICAL)
                    # nearly identical code; the only difference is the 64 bit mode
                    try:
                        pathlist = glob.glob(
                            os.path.join(self.result_directory, "{*}"))
                        save_file = zipfile.ZipFile(
                            self.result_directory + '/result_folders.zip',
                            mode="w",
                            compression=self.compression,
                            allowZip64=True)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(
                                    node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(
                                        os.path.join(rel_path, data))
                        save_file.close()
                        # To still have an easy access to the history of the
                        # processing, we keep one folder.
                        pathlist.pop()
                        for path in pathlist:
                            shutil.rmtree(path)
                    except:
                        self._log(
                            "64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                            level=logging.CRITICAL)
                os.chdir(cwd)
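
A self-contained sketch of the zip-and-prune step above: walk every {...}
result folder, store each entry relative to the result directory, and enable
allowZip64 up front instead of retrying after a failure. Using the arcname
argument avoids changing the working directory; the function and its
parameters are assumptions for illustration, not pySPACE API.

    import glob
    import os
    import shutil
    import zipfile

    def zip_result_folders(result_directory, compression=zipfile.ZIP_DEFLATED):
        """ Zip all {...} result folders relative to result_directory and
        remove all but one of them afterwards (illustrative sketch) """
        pathlist = glob.glob(os.path.join(result_directory, "{*}"))
        zip_path = os.path.join(result_directory, "result_folders.zip")
        save_file = zipfile.ZipFile(zip_path, mode="w",
                                    compression=compression, allowZip64=True)
        for path in pathlist:
            for dirpath, _, filenames in os.walk(path):
                rel_path = os.path.relpath(dirpath, result_directory)
                save_file.write(dirpath, arcname=rel_path)
                for name in filenames:
                    save_file.write(os.path.join(dirpath, name),
                                    arcname=os.path.join(rel_path, name))
        save_file.close()
        # keep one folder for easy access to the processing history
        for path in pathlist[:-1]:
            shutil.rmtree(path)
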
Example #13
0
    def _createProcesses(cls, processes, result_directory, operation_spec,
                         parameter_settings, input_collections):
        try:
            storage_format = operation_spec["storage_format"] if "storage_format" \
                in operation_spec else None

            # Determine whether the node_chain should be stored after data processing
            store_node_chain = operation_spec["store_node_chain"] \
                             if "store_node_chain" in operation_spec else False

            # Determine whether certain parameters should not be remembered
            hide_parameters = [] if "hide_parameters" not in operation_spec \
                                    else list(operation_spec["hide_parameters"])
            hide_parameters.append("__INPUT_COLLECTION__")
            hide_parameters.append("__INPUT_DATASET__")
            hide_parameters.append("__RESULT_DIRECTORY__")
            hide_parameters.append("__OUTPUT_BUNDLE__")
            operation_spec["hide_parameters"] = hide_parameters

            # Create all combinations of collections, runs and splits
            collection_run_split_combinations = []
            for input_dataset_dir in input_collections:
                # Determine number of runs to be conducted for this collection
                abs_collection_path = \
                    pySPACE.configuration.storage + os.sep \
                        + input_dataset_dir
                collection_runs = \
                    BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)
                # D.get(k[,d]) -> D[k] if k in D, else d.

                if "runs" not in operation_spec:
                    requested_runs = collection_runs
                else:
                    requested_runs = operation_spec["runs"]

                assert collection_runs == requested_runs \
                    or collection_runs == 1, \
                    "Requested %s runs but input collection %s provides "\
                    "data for %s runs." % (requested_runs, input_dataset_dir,
                                           collection_runs)

                for run in range(max(requested_runs, collection_runs)):
                    collection_splits = BaseDataset.load_meta_data(
                        abs_collection_path).get('splits', 1)
                    for split in range(collection_splits):
                        collection_run_split_combinations.append(
                            (input_dataset_dir, run, split))

            # Shuffle the order of dataset-run-split combinations. This should
            # help to avoid all processes working on the same data, which can
            # cause problems due to locking etc.
            random.shuffle(collection_run_split_combinations)

            # For all templates
            for node_chain_spec in operation_spec["templates"]:
                # For all possible parameter instantiations of this template
                for parameter_setting in parameter_settings:
                    # For all input collections-run combinations
                    for input_dataset_dir, run, split in \
                            collection_run_split_combinations:
                        # We are going to change the parameter_setting and don't
                        # want to interfere with later runs so we work on a copy
                        parameter_setting_cp = copy.deepcopy(parameter_setting)

                        # Add input and output path to parameter
                        # setting
                        parameter_setting_cp["__INPUT_DATASET__"] = \
                                input_dataset_dir.split(os.sep)[-2]
                        parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                                result_directory
                        if len(operation_spec["templates"]) > 1:
                            index = operation_spec["templates"].index(
                                node_chain_spec)
                            parameter_setting_cp["__Template__"]=\
                                operation_spec["template_files"][index]

                        # Load the input meta data
                        dataset_dir = os.sep.join(
                            [pySPACE.configuration.storage, input_dataset_dir])
                        dataset_md = BaseDataset.load_meta_data(dataset_dir)
                        # Add the input parameter's meta data
                        # to the given parameter setting
                        if "parameter_setting" in dataset_md:
                            dataset_md["parameter_setting"].update(
                                parameter_setting_cp)
                            all_parameters = dataset_md["parameter_setting"]
                        else:
                            all_parameters = parameter_setting_cp

                        def check_constraint(constraint, parameters):
                            for key, value in parameters.iteritems():
                                constraint = constraint.replace(
                                    key, str(value))
                            return eval(constraint)

                        if not all(
                                check_constraint(constraint_def,
                                                 all_parameters)
                                for constraint_def in operation_spec.get(
                                    'old_parameter_constraints', [])):
                            continue

                        # Determine directory in which the result of this
                        # process should be written
                        result_dataset_directory = \
                            NodeChainOperation._get_result_dataset_dir(
                                result_directory,
                                input_dataset_dir,
                                parameter_setting_cp,
                                hide_parameters)

                        # Create the respective process and put it to the
                        # executing-queue of processes
                        process = NodeChainProcess(
                            node_chain_spec=node_chain_spec,
                            parameter_setting=parameter_setting_cp,
                            rel_dataset_dir=input_dataset_dir,
                            run=run,
                            split=split,
                            storage_format=storage_format,
                            result_dataset_directory=result_dataset_directory,
                            store_node_chain=store_node_chain,
                            hide_parameters=hide_parameters)

                        processes.put(process)
        finally:
            # give executing process the sign that creation is now finished
            processes.put(False)
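
The check_constraint helper above evaluates each old_parameter_constraints
entry by textually substituting parameter names with their values and then
calling eval. A small standalone sketch with made-up parameters and
constraint strings:

    def check_constraint(constraint, parameters):
        """ Replace parameter names by their values and evaluate the result """
        for key, value in parameters.items():
            constraint = constraint.replace(key, str(value))
        return eval(constraint)

    # Usage example with hypothetical parameters and constraints
    all_parameters = {"__LOWER__": 2, "__UPPER__": 8}
    constraints = ["__LOWER__ < __UPPER__", "__UPPER__ <= 10"]
    if all(check_constraint(c, all_parameters) for c in constraints):
        print("parameter setting accepted")
    else:
        print("parameter setting skipped")

Because the substitution is purely textual, parameter names must not be
substrings of each other, and each constraint string has to form a valid
Python expression after substitution.
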
Example #14
0
    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """ A factory method that creates the processes which form an operation
        based on the  information given in the operation specification, *operation_spec*.

        In debug mode this is done serially. In the default mode,
        at the moment 4 processes are created in parallel and can be
        executed immediately, so the generation of processes and their
        execution run in parallel. This kind of process creation is done
        independently of the backend.

        For huge parameter spaces this is necessary!
        Otherwise numerous processes would be created and the corresponding
        data loaded, but the computation could not really be spread over
        different processors, because process creation would block one
        processor and its memory while nothing else is done until all
        processes are created.

        .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
                  for empty data, when no input_path is given.
        """
        assert (operation_spec["type"] == "node_chain")

        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        ## Use node_chain parameter if no templates are given ##
        if "templates" not in operation_spec:
            if "node_chain" in operation_spec:
                operation_spec["templates"] = [
                    operation_spec.pop("node_chain")
                ]


#                    extract_key_str(operation_spec["base_file"],
#                                    keyword="node_chain")]
#                operation_spec.pop("node_chain")
            else:
                warnings.warn(
                    "Specify parameter 'templates' or 'node_chain' in your operation spec!"
                )
        elif "node_chain" in operation_spec:
            operation_spec.pop("node_chain")
            warnings.warn(
                "node_chain parameter is ignored. Templates are used.")
        # load files in templates as dictionaries
        elif type(operation_spec["templates"][0]) == str:
            operation_spec["template_files"] = \
                copy.deepcopy(operation_spec["templates"])
            for i in range(len(operation_spec["templates"])):
                rel_node_chain_file = operation_spec["templates"][i]
                abs_node_chain_file_name = os.sep.join([
                    pySPACE.configuration.spec_dir, "node_chains",
                    rel_node_chain_file
                ])
                with open(abs_node_chain_file_name, "r") as read_file:
                    node_chain = read_file.read()
                    #node_chain = yaml.load(read_file)
                operation_spec["templates"][i] = node_chain

        storage = pySPACE.configuration.storage
        if not input_paths:
            raise Exception("No input datasets found in input_path %s in %s!" %
                            (operation_spec["input_path"], storage))

        # Get relative path
        rel_input_paths = [name[len(storage):] for name in input_paths]

        # Determine approximate number of runs
        if "runs" in operation_spec:
            runs = operation_spec["runs"]
        else:
            runs = []
            for dataset_dir in rel_input_paths:
                abs_collection_path = \
                        pySPACE.configuration.storage + os.sep \
                            + dataset_dir
                collection_runs = \
                        BaseDataset.load_meta_data(abs_collection_path).get('runs',1)
                runs.append(collection_runs)
            runs = max(runs)

        # Determine splits
        dataset_dir = rel_input_paths[0]
        abs_collection_path = \
                pySPACE.configuration.storage + os.sep + dataset_dir

        splits = BaseDataset.load_meta_data(abs_collection_path).get(
            'splits', 1)

        # Determine how many processes will be created
        number_processes = len(operation_spec["templates"]) * \
                           len(parameter_settings) * len(rel_input_paths) * \
                           runs * splits

        if debug:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, rel_input_paths)
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a helper method in another
            # process so that already created processes can be executed in
            # parallel. Therefore a queue is used whose size is limited to
            # guarantee that not too many objects are created (because this
            # costs memory). However, the actual number of 4 is arbitrary and
            # might be changed according to the system at hand.
            processes = processing.Queue(4)
            create_process = \
                    processing.Process(target=cls._createProcesses,
                                       args=(processes, result_directory,
                                             operation_spec, parameter_settings,
                                             rel_input_paths))
            create_process.start()
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)
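
A minimal sketch of the bounded-queue pattern used in create: a producer
process fills a size-limited queue while the consumer pulls and executes
items, so only a handful of work items exist at any time. It uses the
standard multiprocessing module in place of the older processing module, and
plain strings stand in for NodeChainProcess objects.

    import multiprocessing

    def _create_work_items(queue, n_items):
        """ Producer: create work items one by one; put() blocks while full """
        for i in range(n_items):
            queue.put("work item %d" % i)
        # signal the consumer that creation is finished
        queue.put(False)

    if __name__ == "__main__":
        # bounded queue: at most 4 unconsumed items exist at any time
        processes = multiprocessing.Queue(4)
        creator = multiprocessing.Process(target=_create_work_items,
                                          args=(processes, 10))
        creator.start()
        while True:
            item = processes.get()
            if item is False:
                break
            print("executing %s" % item)  # the real code would run the process
        creator.join()
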
Example #15
0
    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check whichever there are more of. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results easier
                # THA: Strip the leading "/" from the input collection name, because otherwise it will be treated
                # as an absolute path
                input_collection_name = meta_data["input_dataset_name"][1:] if \
                    meta_data["input_dataset_name"][0] == os.sep else meta_data["input_dataset_name"]
                input_meta_path = os.path.join(pySPACE.configuration.storage, input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    self._log("Error copying the input_metadata.yaml: {error}".format(error=e.message),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error updating the metadata: {error!s}".format(error=e))
                raise e

        # If we don't create a feature vector or time series collection,
        # we have evaluated our classification using a classification performance sink.
        # The resulting files should be merged into one csv table.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error merging the result collection: {error!s}".format(
                    error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder easier to browse.
                import zipfile
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we want to log it, try 64 bit mode,
                # and then skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))

                    if not self.compression == "delete":
                        save_file = zipfile.ZipFile(
                            self.result_directory+'/result_folders.zip',
                            mode="w", compression=self.compression)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path=os.path.relpath(node[0],
                                                         self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,
                                                                 data))
                        save_file.close()
                    # To still have an easy access to the history of the
                    # processing, we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except Exception as e:
                    self._log("Result files could not be compressed with 32"+
                              " bit mode, switching to 64 bit mode",
                              level=logging.CRITICAL)
                    # nearly identical code; the only difference is the 64 bit mode
                    try:
                        pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
                        save_file=zipfile.ZipFile(
                            self.result_directory+'/result_folders.zip',
                            mode="w", compression=self.compression,
                            allowZip64=True)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(node[0],
                                                         self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,data))
                        save_file.close()
                        # To still have an easy access to the history of the
                        # processing, we keep one folder.
                        pathlist.pop()
                        for path in pathlist:
                            shutil.rmtree(path)
                    except:
                        self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL)
                os.chdir(cwd)
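
A short sketch of the run-count bookkeeping in consolidate: count the
data_run* and persistency_run* sub-directories of a result collection and
record their maximum as the number of runs. The dataset path below is
hypothetical.

    import glob
    import os

    def determine_number_of_runs(dataset_path):
        """ Derive the 'runs' meta data entry from the run directories """
        nr_run_dirs = len(glob.glob(os.path.join(dataset_path, "data_run*")))
        nr_per_dirs = len(glob.glob(os.path.join(dataset_path,
                                                 "persistency_run*")))
        # if both kinds exist their numbers should match; take the maximum
        return max(nr_run_dirs, nr_per_dirs)

    # Usage example (hypothetical collection path)
    meta_data = {"author": "unknown"}
    nr_runs = determine_number_of_runs("/tmp/results/{example_collection}")
    if nr_runs > 1:
        meta_data["runs"] = nr_runs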