Example #1
    def _merge_pickle_files(self, target_collection_path,
                            source_collection_pathes, train_set_name_suffix,
                            target_collection_params):
        """ Merge all collections in source_collection_pathes and store them \
            in the target collection"""

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name
        try:
            target_collection.meta_data.pop("node_chain_file_name")
        except KeyError:
            pass
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author": author,
            "date": date,
            "dataset_directory": target_collection_path,
            "train_test": True,
            "parameter_setting": target_collection_params,
            "input_collection_name":
                source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)
                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing
            for key in target_collection.data.keys():
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                key = (key[0], key[1], "train")
                target_collection.data[key] = value

        target_collection.store(target_collection_path)
Example #2
 def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                               train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes and store them \
         in the target collection"""
     
     # load a first collection, in which the data of all other collections 
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     author = get_author()
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name
     try:
         target_collection.meta_data.pop("node_chain_file_name")
     except KeyError:
         pass
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author, 
             "date" : date, 
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })
   
     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 data = source_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 target_data = target_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)
                 
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if not self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in target_collection.data.keys():
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             key = (key[0],key[1],"train")
             target_collection.data[key] = value
                 
     target_collection.store(target_collection_path)
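
The two listings above pool the per-(run, split, train/test) data lists of several datasets into the first one and, if only test data was given, relabel the "test" keys to "train". The following is a minimal standalone sketch of that pattern using plain dicts instead of pySPACE's BaseDataset objects; the key layout (run, split, split_name) mirrors the examples, everything else is illustrative.

# Sketch only: datasets are modeled as dicts mapping (run, split, "train"/"test")
# keys to lists of samples; no pySPACE API is used here.
def merge_datasets(datasets, suffix="test", relabel_to_train=True):
    # start from a copy of the first dataset, in which everything is assembled
    target = {key: list(values) for key, values in datasets[0].items()}
    for source in datasets[1:]:
        for (run, split, split_name), values in source.items():
            if split_name == suffix:
                # actual data is stored in a list that has to be extended
                target.setdefault((run, split, split_name), []).extend(values)
    if relabel_to_train and suffix == "test":
        # exchange the "test" in the key tuple for "train" before storing
        target = {(run, split, "train"): values
                  for (run, split, split_name), values in target.items()
                  if split_name == "test"}
    return target

if __name__ == "__main__":
    a = {(0, 0, "test"): [1, 2]}
    b = {(0, 0, "test"): [3]}
    print(merge_datasets([a, b]))  # {(0, 0, 'train'): [1, 2, 3]}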
Example #3
    def store(self, result_dir, s_format = "None"):
        if not s_format == "None":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "no data stored"})

        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
Example #4
    def store(self, result_dir, s_format="None"):
        if not s_format == "None":
            self._log("The format %s is not supported!" % s_format,
                      level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type": "only output of individual nodes stored",
            "storage_format": s_format,
            "author": author,
            "data_pattern": "no data stored"
        })

        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
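
These `store` variants only update and persist the dataset's meta data via `BaseDataset.store_meta_data`. As an independent sketch (not the pySPACE implementation), meta data like the dictionary above could be serialized to a YAML file in the result directory; the file name "metadata.yaml" and the use of PyYAML are assumptions here.

# Independent sketch of writing a meta data dictionary as YAML.
import os
import yaml

def write_meta_data(result_dir, meta_data, file_name="metadata.yaml"):
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    with open(os.path.join(result_dir, file_name), "w") as meta_file:
        yaml.safe_dump(meta_data, meta_file, default_flow_style=False)

write_meta_data("/tmp/example_result", {
    "type": "only output of individual nodes stored",
    "storage_format": "None",
    "author": "unknown",
    "data_pattern": "no data stored",
})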
Example #5
    def store(self, result_dir, s_format="pickle"):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        **Parameters**
        
          :result_dir:
              The directory in which the collection will be stored.
              
          :name:
              The prefix of the file names in which the individual data sets are 
              stored. The actual file names are determined by appending suffixes
              that encode run, split, train/test information. 
              
              (*optional, default: "time_series"*)
              
          :s_format:
              The format in which the actual data sets should be stored.
              
              Possible formats are 'pickle', 'text', 'csv' and 'mat' (matlab)
              format. If s_format is a list, the second element further 
              specifies additional options for storing.
              
              - pickle:
                  Standard Python format
                  
              - text:
                  In the text format, all time series objects are concatenated 
                  to a single large table containing only integer values.
                  
              - csv:
                  For the csv format, comma separated values are written using
                  either the default or a specified Python format string.

              - mat:
                  Scipy's savemat function is used for storing. The data is
                  stored as a 3-dimensional array. Meta data information,
                  like sampling frequency and channel names, is saved as well.
                  As an additional parameter the orientation of the data arrays
                  can be given as 'channelXtime' or 'timeXchannel'.

              .. note:: For the text and MATLAB format, markers can be added
                        by using a Marker_To_Mux node beforehand.
              
              (*optional, default: "pickle"*)

        .. todo:: Put marker to the right time point and also write marker channel.
        
        .. todo:: Shouldn't be 'text' and 'csv' format part of the stream data
                  set?!
        """
        name = "time_series"
        # for some storage procedures we need further specifications
        s_type = None
        if type(s_format) == list:
            # file format is first position
            f_format = s_format[0]
            if len(s_format) > 1:
                s_type = s_format[1]
        else:
            f_format = s_format
        if f_format == "text" and s_type is None:
            s_type = "%i"
        elif f_format == "csv" and s_type == "real":
            s_type = "%.18e"
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "time_series",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep 
                                               + name + "_sp_tt." + f_format})

        # Iterate through splits and runs in this dataset
        for key, time_series in self.data.iteritems():
            # load data, if necessary 
            # (due to the lazy loading, the data might be not loaded already)
            if isinstance(time_series, basestring):
                time_series = self.get_data(key[0], key[1], key[2])
            if self.sort_string is not None:
                time_series.sort(key=eval(self.sort_string))
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
            
            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if f_format in ["pickle", "cpickle", "cPickle"]:
                result_file = open(os.path.join(result_path,
                                                name+key_str+".pickle"), "w")
                cPickle.dump(time_series, result_file, cPickle.HIGHEST_PROTOCOL)
                result_file.close()
            elif f_format in ["text","csv"]:
                self.update_meta_data({
                    "type": "stream",
                    "marker_column": "marker"})
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".csv"), "w")
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(time_series[0][0].channel_names)
                if f_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, key) in time_series:
                    if f_format == "text":
                        numpy.savetxt(result_file, data, delimiter=",", fmt=s_type)
                        if key is not None:
                            result_file.write(str(key))
                            result_file.flush()
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        first_line = True
                        marker = ""
                        if key is not None:
                            marker = str(key)
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            marker = str(data.marker_name)
                        for line in data:
                            l = list(line)
                            l.append(marker)
                            csvwriter.writerow(list(l))
                            if first_line:
                                first_line = False
                                marker = ""
                        result_file.flush()
                result_file.close()
            elif f_format in ["matlab", "mat", "MATLAB"]:
                # todo: handle all the other attributes of ts objects!
                import scipy.io
                result_file_name = os.path.join(result_path, 
                                                name + key_str + ".mat")
                # extract a first time series object to get meta data 
                ts1 = time_series[0][0]
                
                # collect all important information in the collection_object
                dataset_dict = {
                    "sampling_frequency": ts1.sampling_frequency,
                    "channel_names": ts1.channel_names}
                
                # we have to extract the data and labels separatly
                if 'channelXtime' in s_format:
                    dataset_dict["data"] = [data.T for data, _ in time_series] 
                else:
                    dataset_dict["data"] = [data for data, _ in time_series]
                dataset_dict["labels"] = [label for _, label in time_series]
                # construct numpy 3d array (e.g., channelXtimeXtrials)
                dataset_dict["data"] = numpy.rollaxis(numpy.array(
                    dataset_dict["data"]), 0, 3)
                
                scipy.io.savemat(result_file_name, mdict=dataset_dict)
            elif f_format in ["bp_eeg"]:

                result_file = open(os.path.join(result_path,
                                                name + key_str + ".eeg"),"a+")
                result_file_mrk = open(os.path.join(result_path,
                                                name + key_str + ".vmrk"),"w")

                result_file_mrk.write("Brain Vision Data Exchange Marker File, "
                                      "Version 1.0\n")
                result_file_mrk.write("; Data stored by pySPACE\n")
                result_file_mrk.write("[Common Infos]\n")
                result_file_mrk.write("Codepage=UTF-8\n")
                result_file_mrk.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_file_mrk.write("\n[Marker Infos]\n")

                markerno = 1
                datapoint = 1
                sf = None
                channel_names = None

                for t in time_series:
                    if sf is None:
                        sf = t[0].sampling_frequency
                    if channel_names is None:
                        channel_names = t[0].get_channel_names()
                    for mrk in t[0].marker_name.keys():
                        for tm in t[0].marker_name[mrk]:
                            result_file_mrk.write(str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                (markerno, mrk, datapoint+(tm*sf/1000.0))))
                            markerno += 1
                    data_ = t[0].astype(numpy.int16)
                    data_.tofile(result_file)
                    datapoint += data_.shape[0]

                result_hdr = open(os.path.join(result_path,
                                                name + key_str + ".vhdr"),"w")

                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                      str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" % len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" % (1000000/sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [unicode("0.1,%sV" % unicode(u"\u03BC")),
                   unicode("0.5,%sV" % unicode(u"\u03BC")),
                   unicode("10,%sV" % unicode(u"\u03BC")),
                   unicode("152.6,%sV" % unicode(u"\u03BC"))]
                for i in range(len(channel_names)):
                    result_hdr.write(unicode("Ch%d=%s,,%s\n" %
                        (i+1,channel_names[i],
                        unicode(resolutions_str[0]))).encode('utf-8'))
                result_file.close()
            else:
                NotImplementedError("Using unavailable storage format:%s!"
                                    % f_format)
        self.update_meta_data({
            "channel_names": copy.deepcopy(time_series[0][0].channel_names),
            "sampling_frequency": time_series[0][0].sampling_frequency
        })
        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
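
The "mat" branch above stacks the trial arrays and rolls the trial axis to the end before handing the result to scipy.io.savemat. A small standalone sketch with synthetic data (made-up channel names, sampling frequency and output path) shows the resulting array layout:

# Sketch of the "mat" branch: 5 trials of shape (time, channels) become a
# time x channels x trials array after numpy.rollaxis.
import numpy
import scipy.io

trials = [numpy.random.rand(100, 3) for _ in range(5)]  # 5 trials, 100 samples, 3 channels
dataset_dict = {
    "sampling_frequency": 25.0,
    "channel_names": ["C3", "Cz", "C4"],
    "labels": ["Target", "Standard", "Target", "Standard", "Target"],
    "data": numpy.rollaxis(numpy.array(trials), 0, 3),  # shape (100, 3, 5)
}
scipy.io.savemat("/tmp/time_series_example.mat", mdict=dataset_dict)
print(dataset_dict["data"].shape)  # (100, 3, 5)

With the 'channelXtime' option each trial is transposed first, so the stored array would instead be channels x time x trials.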
Example #6
    def store(self, result_dir, s_format=["pickle", "real"]):
        """ store the collection in *result_dir*"""

        name = "predictions"
        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type": "prediction_vector",
            "storage_format": s_format,
            "author": author,
            "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0]
        })

        if not s_format in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = "pickle"

        for key, prediction_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(
                    os.path.join(result_path, name + key_str + ".pickle"), "w")
                cPickle.dump(prediction_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)

            elif s_format == "csv":  # Write as Comma Separated Value
                result_file = open(
                    os.path.join(result_path, name + key_str + ".csv"), "w")
                if self.meta_data["num_predictors"] == 1:
                    result_file.write(
                        "Predicted Label, Prediction Score, True Label \n")
                    for pv in prediction_vectors:
                        result_file.write(
                            "%s, %s, %s\n" %
                            (pv[0].label[0], pv[0].prediction[0], pv[1]))
                else:
                    # we begin by dealing with the header of the csv file
                    base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                    base_result = "%(label)s, %(score)s,"
                    header = ""
                    for i in range(self.meta_data["num_predictors"]):
                        header += base_header % dict(index=i + 1)
                    header += "True Label\n"
                    result_file.write(header)

                    # and now we can write each of the prediction vectors in turn

                    for pv in prediction_vectors:
                        result = ""
                        for i in range(self.meta_data["num_predictors"]):
                            result += base_result % dict(
                                label=pv[0].label[i],
                                score=pv[0].prediction[i])

                        result += str(pv[1]) + "\n"
                        result_file.write(result)

        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
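
The csv branch above builds the header and rows by string concatenation. As a sketch, the same single-predictor rows could also be produced with the csv module; here prediction_vectors is modeled as a plain list of (predicted_label, score, true_label) tuples rather than pySPACE PredictionVector objects.

# Sketch of the csv output using the csv module.
import csv

prediction_vectors = [("Target", 0.93, "Target"), ("Standard", -0.41, "Standard")]

with open("/tmp/predictions_example.csv", "w") as result_file:
    writer = csv.writer(result_file)
    writer.writerow(["Predicted Label", "Prediction Score", "True Label"])
    for predicted_label, score, true_label in prediction_vectors:
        writer.writerow([predicted_label, score, true_label])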
Example #7
    def store(self, result_dir, s_format=["pickle", "real"]):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        The method expects the following parameters:
          * *result_dir* The directory in which the collection will be stored
          * *name* The prefix of the file names in which the individual \
                   data sets are stored. The actual file names are determined \
                   by appending suffixes that encode run, split, train/test \
                   information. Defaults to "features".
          * *s_format* A list with information about the format in which the
                    actual data sets should be stored. The first entry specifies
                    the file format. If it is "arff" the second entry specifies the
                    attribute format. 
                    
                    Examples: ["arff", "real"], ["arff", "{0,1}"]
                    
                    .. todo:: Someone could implement the format ["fasta"] for sax features
                    
                    To store the data in comma separated values, use ["csv", "real"].
                    
                    (*optional, default: ["pickle", "real"]*)

        .. todo:: Adapt storing of csv file to external library instead of
                  doing it manually.

        """
        name = "features"
        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type": "feature_vector",
            "storage_format": s_format,
            "author": author,
            "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0]
        })

        if type(s_format) == list:
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "real"

        if not s_format in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = "pickle"

        # Iterate through splits and runs in this dataset
        for key, feature_vectors in self.data.iteritems():
            # test if dataset has already been loaded.
            # Otherwise replace with iterator to loaded version.
            if isinstance(feature_vectors, basestring):
                feature_vectors = self.get_data(key[0], key[1], key[2])

            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(
                    os.path.join(result_path, name + key_str + ".pickle"), "w")

                cPickle.dump(feature_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
            elif s_format == "arff":  # Write as ARFF
                result_file = open(
                    os.path.join(result_path, name + key_str + ".arff"), "w")
                # Create the arff file header
                relation_name = result_dir.split(os.sep)[-1]
                result_file.write('@relation "%s"\n' % relation_name)
                # Write the type of all features
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("@attribute %s %s\n" %
                                      (feature_name, s_type))
                classString = "" + ",".join(
                    sorted(self.meta_data["classes_names"])) + ""

                result_file.write("@attribute class {%s}\n" % classString)

                result_file.write("@data\n")
                # Write all given training data into the ARFF file
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.string_):
                    feature_format = "%s,"
                elif numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                else:
                    feature_format = "%s,"
                for features, class_name in feature_vectors:
                    for feature in features[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            elif s_format == "csv":  # Write as Comma Separated Value
                result_file = open(
                    os.path.join(result_path, name + key_str + ".csv"), "w")
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("%s," % (feature_name))
                result_file.write("\n")
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                else:
                    feature_format = "%s,"
                for features, class_name in feature_vectors:
                    f = features.view(numpy.ndarray)
                    for feature in f[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            result_file.close()

        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
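
For reference, the ARFF layout written by the branch above looks like the output of this small standalone sketch; the relation name, feature names and class labels are made up for illustration.

# Sketch of the generated ARFF file for two feature vectors.
feature_names = ["feat_1", "feat_2"]
classes = ["Standard", "Target"]
vectors = [([0.5, 1.25], "Target"), ([0.1, -0.75], "Standard")]

with open("/tmp/features_example.arff", "w") as arff:
    arff.write('@relation "features_example"\n')
    for feature_name in feature_names:
        arff.write("@attribute %s real\n" % feature_name)
    arff.write("@attribute class {%s}\n" % ",".join(sorted(classes)))
    arff.write("@data\n")
    for features, class_name in vectors:
        arff.write(",".join("%f" % f for f in features) + ",%s\n" % class_name)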
Example #8
 def _merge_files(self, target_collection_path, source_collection_pathes,
                  train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes and store them \
         in the target collection
         
     **Parameters**
     
         :target_collection_path:
             Path of the dataset, in which the data of all other datasets
             is assembled.
             
         :source_collection_pathes:
             Paths of the datasets to be merged.
             
         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if datasets are merged for
             training or testing.
             
         :target_collection_params:
             Dictionary with all the parameters of the target dataset.
             
     """
     
     # load a first collection, in which the data of all other collections 
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     author = get_author()
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name
     try:
         target_collection.meta_data.pop("node_chain_file_name")
     except KeyError:
         pass
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author, 
             "date" : date, 
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })
   
     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 target_data = target_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 
                 if self.set_flag:
                     for ts, l in target_data:
                         if ts.specs is None:
                             ts.specs = {"new_set": False}
                         elif "new_set" in ts.specs:
                             break
                         else:
                             ts.specs["new_set"] = False

                 data = source_collection.get_data(run, split,
                                                   train_set_name_suffix)

                 if self.set_flag:
                     for i, (ts, l) in enumerate(data):
                         # flag the first element of the concatenated data list
                         if ts.specs is None:
                             ts.specs = {"new_set": i == 0}
                         else:
                             ts.specs["new_set"] = (i == 0)
                 
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)
                 
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if not self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in target_collection.data.keys():
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             key = (key[0],key[1],"train")
             target_collection.data[key] = value
     # we store the data in the same format as before
     target_collection.store(target_collection_path, 
         target_collection.meta_data["storage_format"])
Example #9
    def store(self, result_dir, s_format = "BrainVision"):
        self.merged = False
        # scale is used to scale up the EEG sample values. The data samples are
        # converted to int16 when saving, so scaling is necessary to maintain
        # the resolution.
        scale = 10.0
        # Keep the original file name; this depends on the AnalyserSinkNode, see its documentation.
        if 'eeg_src_file_name' in self.meta_data and self.meta_data['eeg_src_file_name'] is not None:
            name = self.meta_data['eeg_src_file_name']
        # or use the default name of this collection
        else:
            name = "Analyzer"
        if not s_format == "BrainVision":
            self._log("The format %s is not supported!" % s_format,
                      level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "Multiplexed"})
        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
        #self._log("EEG data file %s" % self.collection.data_file)
        slices = []
        slices.append(0)
        channel_names = []
        

        
        for key, time_series in self.data.iteritems():
            # Sort the Times-Series Array
            def cmp_start(a, b):
                return cmp(a[0].start_time, b[0].start_time)

            time_series.sort(cmp_start)
            # Check for overlapping Windows and remove them if existent
            i = 0
            while i < len(time_series):
                ts = time_series[i]
                #print ts[0].start_time, ts[0].end_time
                #print len(time_series)
                if ts[0].start_time >= slices[-1]:
                    slices.append(ts[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!", UserWarning)
                i = i+1
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
            
            key_str = "_sp%s_%s" % key[1:]
            # Keep original name
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_eeg = open(os.path.join(result_path, name + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + ".vmrk"), "w")
            # or use default name from this collection
            else:
                result_file_eeg = open(os.path.join(result_path, name + key_str + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"), "w")
        
            # Write Marker header
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_mrk.write(header_mrk % (name))
            else:
                result_file_mrk.write(header_mrk % (name + key_str))
        
            result_file_ms = 0
        
            # Data for padding
            padding = None
        
            count_mrk = 2
            num_ch = 0
            sampling_int = 0
            
            for ts in time_series:
                ts0 = ts[0] * scale 
                ts0 = ts0.astype(numpy.int16)
                
                if padding is None:
                    padding = numpy.zeros(len(ts[0].channel_names), dtype='int16')
                    num_ch = len(ts[0].channel_names)
                    channel_names = ts[0].channel_names
                    sampling_int = 1000000/ts[0].sampling_frequency
                    #print "writing %d channels.." % len(ts[0].channel_names)
                # Write Padding (zeros)
                while result_file_ms < ts[0].start_time - sampling_int/1000.0:
                    result_file_eeg.write(padding.tostring())
                    result_file_ms += ts[0]._samples_to_ms(1)
                # Write window
                ts0.tofile(result_file_eeg)
                result_file_ms += ts[0].end_time - (ts[0].start_time - sampling_int/1000.0)
                # Write Marker
                markers = []
                
                if(len(ts[0].marker_name) > 0):
                    mk_keys = ts[0].marker_name.keys()
                    mk_values = ts[0].marker_name.values()
                    for mk in range(len(mk_keys)):
                        for mv in range(len(mk_values[mk])):
                            markers.append((mk_keys[mk], mk_values[mk][mv]))
                    markers = sorted(markers, key=lambda tup: tup[1])
                    
                    for i in range(len(markers)):
                        if 'R' in markers[i][0]: 
                            event_type = 'Response' 
                        elif 'S' in markers[i][0]:
                            event_type = 'Stimulus'
                        else:
                            event_type = 'Label'
                            
                        result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, markers[i][0], (ts[0].start_time + markers[i][1])*ts[0].sampling_frequency/1000.0))                                                     
                        count_mrk += 1

            # WRITE HEADERFILE
            # Keep original name
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_hdr = open(os.path.join(result_path, name + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name), (name), num_ch, sampling_int))
            # or use default name from this collection
            else:
                result_file_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name + key_str), (name + key_str), num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale))

            result_file_hdr.close()
            result_file_eeg.close()
            result_file_mrk.close()
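
The example writes the raw samples as multiplexed int16 values to the .eeg file and a matching header to the .vhdr file. A standalone sketch of just that part, using synthetic data and only header fields that also appear in the examples (channel names, sampling frequency and file paths are made up):

# Sketch of writing multiplexed int16 EEG data plus a minimal header.
import numpy

sampling_frequency = 250.0
channel_names = ["C3", "Cz", "C4"]
samples = (numpy.random.rand(1000, len(channel_names)) * 100).astype(numpy.int16)

with open("/tmp/example.eeg", "wb") as eeg_file:
    samples.tofile(eeg_file)  # row-major, i.e. one multiplexed sample point per row

with open("/tmp/example.vhdr", "w") as hdr:
    hdr.write("Brain Vision Data Exchange Header File Version 1.0\n")
    hdr.write("[Common Infos]\n")
    hdr.write("DataFile=example.eeg\n")
    hdr.write("DataFormat=BINARY\n")
    hdr.write("DataOrientation=MULTIPLEXED\n")
    hdr.write("NumberOfChannels=%d\n" % len(channel_names))
    hdr.write("SamplingInterval=%d\n" % (1000000 / sampling_frequency))
    hdr.write("[Binary Infos]\nBinaryFormat=INT_16\n")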
Example #10
    def consolidate(self):
        """
        Consolidates the results obtained by the single WEKA filter
        processes into a consistent summary of datasets that is stored on
        the file system.
        
        .. todo:: Some of the contents of this method should go into the
                  :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
        """

        # Iterate over all collections and store the collection meta data etc.
        for entries in os.listdir(self.result_directory):
            fullpath = os.path.join(self.result_directory, entries)
            # For each collection        
            if os.path.isdir(fullpath):
                if entries.startswith("{"):
                    # Extract the parameters from the collection name in order to
                    # adjust the relation name
                    if self.num_parameters > 0:
                        parameter_strings = entries.strip("}{").split("}{")[-self.num_parameters:]
                        parameter_postfix = "{" + "}{".join(parameter_strings) + "}"
                    else:
                        parameter_strings = ""
                        parameter_postfix = ""
                    # Postprocessing of the arff files of this collection
                    for train_arff_file in glob.glob(fullpath + os.sep + "data_run*" 
                                           + os.sep + "*train.arff"):
                        # Adjust the relation name of the train file
                        content = open(train_arff_file, 'r').readlines()             
                        # We strip everything after the last "}"
                        endindex = content[0].rfind("}")
                        content[0] = content[0][:endindex+1]
                        content[0] += parameter_postfix + "'"
                        open(train_arff_file, 'w').writelines(content)
                        # Use relation name of train data for test data
                        test_arff_file = train_arff_file.replace("train.arff", "test.arff") 
                        test_content = open(test_arff_file, 'r').readlines()
                        test_content[0] = content[0] + "\n"
                        open(test_arff_file, 'w').writelines(test_content)
                    
                        # Check which features are contained in the arff file
                        feature_names = []
                        for line in content:
                            if line.startswith("@attribute"):
                                attribute = line.split()[1]
                                if attribute is not "class":
                                    feature_names.append(attribute)
                    # Store the collection meta data etc.
                    if self.num_parameters > 0:
                        input_collection_name = \
                            "{" + "}{".join(entries.strip("}{").split("}{")[:-self.num_parameters]) + "}"
                    else:
                        input_collection_name = entries
                        
                    input_collection_path = os.path.join(self.operation_spec["input_path"],
                                                     input_collection_name)

                    input_collection_meta = BaseDataset.load_meta_data(
                                            pySPACE.configuration.storage
                                            + os.sep
                                            + input_collection_path)
                    # Store the input collection
                    BaseDataset.store_meta_data(fullpath, input_collection_meta,
                                                file_name="input_metadata.yaml")
                    # Adjust collection metadata for the new collection
                    input_collection_meta["feature_names"] = feature_names
                    input_collection_meta["num_features"] = len(feature_names)
                    input_collection_meta["author"] = get_author()
                    input_collection_meta["date"] = time.strftime("%Y%m%d")
                    input_collection_meta["input_collection_name"] = input_collection_name
                    # Write the collection meta information into the folder
                    BaseDataset.store_meta_data(fullpath,input_collection_meta)
                    # Store the command_template
                    command_template_file = open(os.path.join(fullpath,
                                                          "command_template"), 'w')
                    command_template_file.write(self.command_template)
                    command_template_file.close()
                else:
                    # training and test arff need the same relation name,
                    # otherwise Weka can't relate them to each other; the
                    # collection name and the parameters in {}{} notation
                    # must be the relation name for further processing
                    self._log("WARNING: Collection name doesn't begin with "
                              "'{'. Further processing may fail!",
                              level=logging.WARNING)
        # Write the specification of this operation
        # to the result directory in order to make later
        # analysis of the results easier
        source_operation_file = open(os.path.join(self.result_directory,
                                                  "source_operation.yaml"), 'w')
        yaml.dump(self.operation_spec, source_operation_file)
        source_operation_file.close()
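
The relation-name adjustment above truncates the @relation line of each train arff file after the last "}", appends the parameter postfix, and reuses the resulting line for the matching test file. The same steps, factored into a small helper for illustration only (it assumes existing arff files whose first line contains a "}" and follows the quoting convention used above):

# Sketch of the relation-name rewrite performed in the loop above.
def adjust_relation(train_arff_file, test_arff_file, parameter_postfix):
    with open(train_arff_file) as f:
        content = f.readlines()
    # strip everything after the last "}" and append the postfix
    endindex = content[0].rfind("}")
    content[0] = content[0][:endindex + 1] + parameter_postfix + "'"
    with open(train_arff_file, "w") as f:
        f.writelines(content)
    # use the relation name of the train data for the test data
    with open(test_arff_file) as f:
        test_content = f.readlines()
    test_content[0] = content[0] + "\n"
    with open(test_arff_file, "w") as f:
        f.writelines(test_content)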
Example #11
    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([
            self.result_directory,
            "{*",
        ])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check of whichever there are more. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(
                    glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(
                    glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results more easy
                # THA: Strip the leading os.sep from the input collection name, because otherwise
                # it would be treated as an absolute path
                input_collection_name = meta_data["input_collection_name"][1:] if \
                    meta_data["input_collection_name"][0] == os.sep else meta_data["input_collection_name"]
                input_meta_path = os.path.join(pySPACE.configuration.storage,
                                               input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(
                        dataset_path,
                        input_meta,
                        file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    self._log("Error copying the input_metadata.yaml: {error}".
                              format(error=e.message),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error updating the metadata: {error!s}".format(error=e))
                raise e

        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
        if len(pathlist) > 0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(
                    dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error merging the result collection: {error!s}".format(
                        error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders,
                # so we zip them to keep the result directory easier to browse.
                import zipfile
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In that case we log it, try 64 bit mode,
                # and only then skip the zipping.
                try:
                    pathlist = glob.glob(
                        os.path.join(self.result_directory, "{*}"))

                    if not self.compression == "delete":
                        save_file = zipfile.ZipFile(
                            self.result_directory + '/result_folders.zip',
                            mode="w",
                            compression=self.compression)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(
                                    node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(
                                        os.path.join(rel_path, data))
                        save_file.close()
                    # To still have an easy access to the history of the
                    # processing, we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except Exception:
                    self._log("Result files could not be compressed with 32" +
                              " bit mode, switching to 64 bit mode",
                              level=logging.CRITICAL)
                    # nearly identical code; the only difference is 64 bit mode
                    try:
                        pathlist = glob.glob(
                            os.path.join(self.result_directory, "{*}"))
                        save_file = zipfile.ZipFile(
                            self.result_directory + '/result_folders.zip',
                            mode="w",
                            compression=self.compression,
                            allowZip64=True)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(
                                    node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(
                                        os.path.join(rel_path, data))
                        save_file.close()
                        # To still have an easy access to the history of the
                        # processing, we keep one folder.
                        pathlist.pop()
                        for path in pathlist:
                            shutil.rmtree(path)
                    except:
                        self._log(
                            "64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                            level=logging.CRITICAL)
                os.chdir(cwd)
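
The compression step above walks every result folder and adds its contents to one zip archive, with paths stored relative to the result directory; allowZip64 lifts the 32 bit size limit for large result sets. A standalone sketch of that pattern (folder layout and archive name are illustrative):

# Sketch of zipping result folders relative to the result directory.
import os
import zipfile

def zip_folders(result_directory, folders, archive_name="result_folders.zip"):
    archive_path = os.path.join(result_directory, archive_name)
    with zipfile.ZipFile(archive_path, mode="w",
                         compression=zipfile.ZIP_DEFLATED,
                         allowZip64=True) as archive:
        for folder in folders:
            for dir_path, _, file_names in os.walk(folder):
                # store entries relative to the result directory
                rel_path = os.path.relpath(dir_path, result_directory)
                archive.write(dir_path, rel_path)
                for file_name in file_names:
                    archive.write(os.path.join(dir_path, file_name),
                                  os.path.join(rel_path, file_name))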
Example #12
    def store(self, result_dir, s_format=["pickle", "real"]):
        """ store the collection in *result_dir*"""

        name = "predictions"
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "prediction_vector",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep
                                                 + name + "_sp_tt." + s_format[0]})

        if not s_format in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = "pickle"

        for key, prediction_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".pickle"),
                                   "w")
                cPickle.dump(prediction_vectors, result_file, cPickle.HIGHEST_PROTOCOL)

            elif s_format == "csv": # Write as Comma Separated Value
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".csv"),"w")
                if self.meta_data["num_predictors"] == 1:
                    result_file.write("Predicted Label, Prediction Score, True Label \n")
                    for pv in prediction_vectors:
                        result_file.write("%s, %s, %s\n" % (pv[0].label[0], pv[0].prediction[0], pv[1]))
                else:
                    # we begin by dealing with the header of the csv file
                    base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                    base_result = "%(label)s, %(score)s,"
                    header = ""
                    for i in range(self.meta_data["num_predictors"]):
                        header+= base_header % dict(index=i+1)
                    header += "True Label\n"
                    result_file.write(header)

                    # and now we can write each of the prediction vectors in turn

                    for pv in prediction_vectors:
                        result = ""
                        for i in range(self.meta_data["num_predictors"]):
                            result += base_result % dict(label=pv[0].label[i],
                                                         score=pv[0].prediction[i])

                        result += str(pv[1]) + "\n"
                        result_file.write(result)

        #Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
Example #13
    def store(self, result_dir, s_format = "bp_eeg"):
        self.merged = False
        # scale is used to scale up the EEG sample values. The data samples are
        # converted to int16 when saving, so scaling is necessary to maintain
        # the resolution.
        scale = 10.0
        # Keep the original file name; this depends on the AnalyserSinkNode, see its documentation.
        if 'eeg_src_file_name' in self.meta_data and self.meta_data['eeg_src_file_name'] is not None:
            name = self.meta_data['eeg_src_file_name']
        # or use the default name of this collection
        else:
            name = "Analyzer"
        if not s_format == "bp_eeg":
            self._log("The format %s is not supported!" % s_format,
                      level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "Multiplexed"})
        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
        #self._log("EEG data file %s" % self.collection.data_file)
        slices = []
        slices.append(0)
        channel_names = []
        

        
        for key, time_series in self.data.iteritems():
            # Sort the Times-Series Array
            def cmp_start(a, b):
                return cmp(a[0].start_time, b[0].start_time)

            time_series.sort(cmp_start)
            # Check for overlapping Windows and remove them if existent
            i = 0
            while i < len(time_series):
                ts = time_series[i]
                #print ts[0].start_time, ts[0].end_time
                #print len(time_series)
                if ts[0].start_time >= slices[-1]:
                    slices.append(ts[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!", UserWarning)
                i = i+1
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
            
            key_str = "_sp%s_%s" % key[1:]
            # Keep original name
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_eeg = open(os.path.join(result_path, name + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + ".vmrk"), "w")
            # or use default name from this collection
            else:
                result_file_eeg = open(os.path.join(result_path, name + key_str + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"), "w")
        
            # Write Marker header
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_mrk.write(header_mrk % (name))
            else:
                result_file_mrk.write(header_mrk % (name + key_str))
        
            result_file_ms = 0
        
            # Data for padding
            padding = None
        
            count_mrk = 2
            num_ch = 0
            sampling_int = 0
            
            for ts in time_series:
                ts0 = ts[0] * scale 
                ts0 = ts0.astype(numpy.int16)
                
                if padding == None:
                    padding = numpy.zeros(len(ts[0].channel_names), dtype='int16')
                    num_ch = len(ts[0].channel_names)
                    channel_names = ts[0].channel_names
                    sampling_int = 1000000/ts[0].sampling_frequency
                    #print "writing %d channels.." % len(ts[0].channel_names)
                # Write Padding (zeros)
                while result_file_ms < ts[0].start_time - sampling_int/1000.0:
                    result_file_eeg.write(padding.tostring())
                    result_file_ms += ts[0]._samples_to_ms(1)
                # Write window
                ts0.tofile(result_file_eeg)
                result_file_ms += ts[0].end_time - (ts[0].start_time - sampling_int/1000.0)
                # Write Marker
                markers = []
                
                if(len(ts[0].marker_name) > 0):
                    mk_keys = ts[0].marker_name.keys()
                    mk_values = ts[0].marker_name.values()
                    for mk in range(len(mk_keys)):
                        for mv in range(len(mk_values[mk])):
                            markers.append((mk_keys[mk], mk_values[mk][mv]))
                    markers = sorted(markers, key=lambda tup: tup[1])
                    
                    for i in range(len(markers)):
                        if 'R' in markers[i][0]: 
                            event_type = 'Response' 
                        elif 'S' in markers[i][0]:
                            event_type = 'Stimulus'
                        else:
                            event_type = 'Label'
                            
                        result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, markers[i][0], (ts[0].start_time + markers[i][1])*ts[0].sampling_frequency/1000.0))                                                     
                        count_mrk += 1

            # WRITE HEADERFILE
            # Keep original name
            if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None):
                result_file_hdr = open(os.path.join(result_path, name + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name), (name), num_ch, sampling_int))
            # or use default name from this collection
            else:
                result_file_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name + key_str), (name + key_str), num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale))

            result_file_hdr.close()
            result_file_eeg.close()
            result_file_mrk.close()
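
The store method above writes a BrainVision-style triplet (.eeg/.vmrk/.vhdr) with int16 samples scaled by a factor of 10. A minimal read-back sketch, assuming such a file exists; the function name is invented and not part of pySPACE:

import numpy

def read_eeg_samples(eeg_path, num_channels, scale=10.0):
    """Return a (num_samples, num_channels) float array from a raw .eeg file."""
    raw = numpy.fromfile(eeg_path, dtype=numpy.int16)
    raw = raw.reshape((-1, num_channels))  # samples are stored multiplexed
    return raw / scale                     # undo the scale factor used on write
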
Example #14
0
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()

        for dataset_dir1 in self.input_datasets:
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]

                # Check if the input data is split
                splitted = len(
                    glob.glob(os.sep.join([dataset_dir1, "data_run0", "*"
                                           ]))) > 1

                # Check that all constraints are fulfilled for this pair of
                # input datasets
                if not all(
                        eval(
                            constraint_template % {
                                'dataset_name1': dataset_name1,
                                'dataset_name2': dataset_name2
                            })
                        for constraint_template in self.dataset_constraints):
                    continue

                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Copy the data
                        os.symlink(
                            dataset_dir1,
                            os.sep.join([self.result_directory,
                                         dataset_name1]))
                    continue

                # Determine names of the original data sets the input
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
                                                   base_dataset2)
                target_dataset_name = dataset_name1.replace(
                    base_dataset1, mixed_base_dataset)

                target_dataset_dir = os.sep.join(
                    [self.result_directory, target_dataset_name])

                create_directory(os.sep.join([target_dataset_dir,
                                              "data_run0"]))

                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(
                            os.sep.join(
                                [dataset_dir1, "data_run0", "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(
                            dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(
                            dataset_dir1, dataset_dir2)

                        source_test_file_name = source_test_file_name.replace(
                            "train.", "test.")
                        target_test_file_name = target_train_file_name.replace(
                            "train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(
                            os.sep.join(
                                [dataset_dir1, "data_run0", "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace(
                            "test.", "train.")
                        target_train_file_name = target_train_file_name.replace(
                            dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(
                            dataset_dir1, dataset_dir2)

                        target_test_file_name = target_train_file_name.replace(
                            "train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                output_dataset_meta['author'] = get_author()
                BaseDataset.store_meta_data(target_dataset_dir,
                                            output_dataset_meta)

        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
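
The dataset_constraints used above are format strings that are filled with the two dataset names and then passed to eval. A standalone sketch of this mechanism with an invented constraint; only the substitution/eval pattern mirrors the code:

dataset_constraints = [
    "'%(dataset_name1)s'.split('_')[0] == '%(dataset_name2)s'.split('_')[0]",
]
names = {'dataset_name1': 'SubjectA_Session1', 'dataset_name2': 'SubjectA_Session2'}
assert all(eval(template % names) for template in dataset_constraints)
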
Example #15
0
    def store(self, result_dir, s_format = ["pickle", "real"]):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        The method expects the following parameters:
          * *result_dir* The directory in which the collection will be stored
          * *name* The prefix of the file names in which the individual \
                   data sets are stored. The actual file names are determined \
                   by appending suffixes that encode run, split, train/test \
                   information. Defaults to "features".
          * *format* A list with information about the format in which the 
                    actual data sets should be stored. The first entry specifies
                    the file format. If it is "arff" the second entry specifies the
                    attribute format. 
                    
                    Examples: ["arff", "real"], ["arff", "{0,1}"]
                    
                    .. todo:: Someone could implement the format ["fasta"] for sax features
                    
                    To store the data in comma separated values, use ["csv", "real"].
                    
                    (*optional, default: ["pickle", "real"]*)

        .. todo:: Adapt storing of csv file to external library instead of
                  doing it manually.

        """
        name = "features"
        # Normalize s_format: it can be a plain string or a [format, type] list
        if type(s_format) == list:
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "real"

        if s_format not in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = "pickle"

        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "feature_vector",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep
                                                 + name + "_sp_tt." + s_format})
        
        # Iterate through splits and runs in this dataset
        for key, feature_vectors in self.data.iteritems():
            # test if dataset has already been loaded.
            # Otherwise replace with iterator to loaded version.
            if isinstance(feature_vectors, basestring):
                feature_vectors = self.get_data(key[0], key[1], key[2])

            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
                
            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".pickle"),
                                   "w")
         
                cPickle.dump(feature_vectors, result_file, cPickle.HIGHEST_PROTOCOL)
            elif s_format == "arff": # Write as ARFF
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".arff"),"w")
                # Create the arff file header
                relation_name = result_dir.split(os.sep)[-1]
                result_file.write('@relation "%s"\n' % relation_name)
                # Write the type of all features
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("@attribute %s %s\n" % (feature_name,  s_type))
                classString = ",".join(sorted(self.meta_data["classes_names"]))

                result_file.write("@attribute class {%s}\n" % classString)
                
                result_file.write("@data\n")
                # Write all given training data into the ARFF file
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.string_):
                    feature_format = "%s,"
                elif numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                else:
                    # fall back to string formatting for unexpected dtypes
                    feature_format = "%s,"
                for features, class_name in feature_vectors:
                    for feature in features[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            elif s_format == "csv": # Write as Comma Separated Value
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".csv"),"w")
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("%s," % (feature_name))
                result_file.write("\n")
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                else:
                    feature_format = "%s,"
                for features, class_name in feature_vectors:
                    f = features.view(numpy.ndarray)
                    for feature in f[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            result_file.close()

        #Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
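
The ARFF and csv branches above choose a printf-style cell format from the dtype of the first feature vector. A self-contained sketch of that dispatch (the helper name is invented):

import numpy

def format_for(dtype):
    """Pick a cell format matching the feature dtype, as in the branches above."""
    if numpy.issubdtype(dtype, numpy.floating):
        return "%f,"
    if numpy.issubdtype(dtype, numpy.integer):
        return "%d,"
    return "%s,"  # fall back to strings for anything else

assert format_for(numpy.dtype("float64")) == "%f,"
assert format_for(numpy.dtype("int16")) == "%d,"
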
Example #16
0
    def _merge_files(self, target_collection_path, source_collection_pathes,
                     train_set_name_suffix, target_collection_params):
        """ Merge all collections in source_collection_pathes and store them \
            in the target collection
            
        **Parameters**
        
            :target_collection_path:
                Path of the dataset, in which the data of all other datasets
                is assembled.
                
            :source_collection_pathes:
                Paths of the datasets to be merged.
                
            :train_set_name_suffix:
                Either 'train' or 'test'. Specifies if datasets are merged for
                training or testing.
                
            :target_collection_params:
                Dictionary with all the parameters of the target dataset.
                
        """

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name
        try:
            target_collection.meta_data.pop("node_chain_file_name")
        except:
            pass
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author":
            author,
            "date":
            date,
            "dataset_directory":
            target_collection_path,
            "train_test":
            True,
            "parameter_setting":
            target_collection_params,
            "input_dataset_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)

                    if self.set_flag:
                        for ts, l in target_data:
                            if ts.specs == None:
                                ts.specs = {"new_set": False}
                            elif ts.specs.has_key("new_set"):
                                break
                            else:
                                ts.specs["new_set"] = False

                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)

                    if self.set_flag:
                        for i, (ts, l) in enumerate(data):
                            # flag first element of the concatenated data list
                            if ts.specs == None:
                                ts.specs = {"new_set": i == 0}
                            else:
                                ts.specs["new_set"] = (i == 0)

                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing
            for key in target_collection.data.keys():
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                key = (key[0], key[1], "train")
                target_collection.data[key] = value
        # we store the data in the same format as before
        target_collection.store(target_collection_path,
                                target_collection.meta_data["storage_format"])
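
At the end of _merge_files every (run, split, "test") entry is re-registered under "train" so that a pure test collection can serve as training data. A standalone illustration using a plain dict in place of the dataset's data mapping:

data = {(0, 0, "test"): ["a"], (1, 0, "test"): ["b"]}
for key in list(data.keys()):
    value = data.pop(key)
    data[(key[0], key[1], "train")] = value
assert sorted(data.keys()) == [(0, 0, "train"), (1, 0, "train")]
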
Example #17
0
    def consolidate(self):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*"])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            author = get_author()
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author": author, "date": date})
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results more easy
            input_meta_path = os.sep.join([pySPACE.configuration.storage, meta_data["input_collection_name"]])
            input_meta = BaseDataset.load_meta_data(input_meta_path)
            BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml")
        # Check if some results consist of several runs
        # and update the meta data in this case
        # TODO: This is not a clean solution
        for dataset_dir in glob.glob(os.sep.join([self.result_directory, "*"])):
            if not os.path.isdir(dataset_dir):
                continue
            # There can be either run dirs, persistency dirs, or both of them.
            # Check of whichever there are more. If both exist, their numbers
            # are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir, "data_run*"])))
            nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir, "persistency_run*"])))
            nr_runs = max(nr_run_dirs, nr_per_dirs)

            if nr_runs > 1:
                collection_meta = BaseDataset.load_meta_data(dataset_dir)
                collection_meta["runs"] = nr_runs
                BaseDataset.store_meta_data(dataset_dir, collection_meta)
        # If we do not create a feature vector or time series collection,
        # the classification was evaluated with a classification performance sink.
        # The resulting files should be merged into a single csv table.
        pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
        if len(pathlist) > 0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder easier to browse.
                import zipfile

                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we log the error, try 64 bit mode,
                # and then skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory, "{*}"))

                    if not self.compression == "delete":
                        save_file = zipfile.ZipFile(
                            self.result_directory + "/result_folders.zip", mode="w", compression=self.compression
                        )
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path, data))
                        save_file.close()
                    # To still have an easy access to the history of the
                    # processing, we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except Exception, e:
                    self._log(
                        "Result files could not be compressed with 32" + " bit mode, switching to 64 bit mode",
                        level=logging.CRITICAL,
                    )
                    # nearly identical code; the only difference is the 64 bit mode
                    try:
                        pathlist = glob.glob(os.path.join(self.result_directory, "{*}"))
                        save_file = zipfile.ZipFile(
                            self.result_directory + "/result_folders.zip",
                            mode="w",
                            compression=self.compression,
                            allowZip64=True,
                        )
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path, data))
                        save_file.close()
                        # To still have an easy access to the history of the
                        # processing, we keep one folder.
                        pathlist.pop()
                        for path in pathlist:
                            shutil.rmtree(path)
                    except:
                        self._log(
                            "64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                            level=logging.CRITICAL,
                        )
                os.chdir(cwd)
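
The compression step above zips every result folder relative to the result directory and retries with allowZip64 when the archive gets too large. A condensed sketch of that relative-path zipping pattern with 64 bit support enabled from the start (function and parameter names are invented):

import os
import zipfile

def zip_folders_relative(folders, archive_path, root):
    """Write each folder into archive_path with paths stored relative to root."""
    save_file = zipfile.ZipFile(archive_path, mode="w",
                                compression=zipfile.ZIP_DEFLATED, allowZip64=True)
    for folder in folders:
        for dirpath, dirnames, filenames in os.walk(folder):
            rel_path = os.path.relpath(dirpath, root)
            save_file.write(dirpath, rel_path)  # directory entry
            for file_name in filenames:
                save_file.write(os.path.join(dirpath, file_name),
                                os.path.join(rel_path, file_name))
    save_file.close()
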
Example #18
0
    def store(self, result_dir, s_format="pickle"):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        **Parameters**
        
          :result_dir:
              The directory in which the collection will be stored.
              
          :name:
              The prefix of the file names in which the individual data sets are 
              stored. The actual file names are determined by appending suffixes
              that encode run, split, train/test information. 
              
              (*optional, default: "time_series"*)
              
          :format:
              The format in which the actual data sets should be stored.
              
              Possible formats are *pickle*, *text*, *csv* and *MATLAB* (.mat)
              format.

              In the MATLAB and text format, all time series objects are
              concatenated to a single large table containing only integer
              values.
              For the csv format, values are comma separated by default or
              formatted with a specified Python format string.
              
              The MATLAB format is a struct that contains the data, the
              sampling frequency and the channel names.
              
              .. note:: For the text and MATLAB format, markers can be added
                        by using a Marker_To_Mux node beforehand
              
              (*optional, default: "pickle"*)

        .. todo:: Put marker to the right time point and also write marker channel.
        """
        name = "time_series"
        if type(s_format) == list:
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "%.18e"
        if s_format in ["text", "matlab"]:
            s_type = "%i"
        if s_format == "csv" and s_type == "real":
            s_type = "%.18e"
        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type":
            "time_series",
            "storage_format":
            s_format,
            "author":
            author,
            "data_pattern":
            "data_run" + os.sep + name + "_sp_tt." + s_format
        })

        # Iterate through splits and runs in this dataset
        for key, time_series in self.data.iteritems():
            # load data, if necessary
            # (due to the  lazy loading, the data might be not loaded already)
            if isinstance(time_series, basestring):
                time_series = self.get_data(key[0], key[1], key[2])
            if self.sort_string is not None:
                time_series.sort(key=eval(self.sort_string))
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format in ["pickle", "cpickle", "cPickle"]:
                result_file = open(
                    os.path.join(result_path, name + key_str + ".pickle"), "w")
                cPickle.dump(time_series, result_file,
                             cPickle.HIGHEST_PROTOCOL)
            elif s_format in ["text", "csv"]:
                self.update_meta_data({
                    "type": "stream",
                    "marker_column": "marker"
                })
                result_file = open(
                    os.path.join(result_path, name + key_str + ".csv"), "w")
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(time_series[0][0].channel_names)
                if s_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, key) in time_series:
                    if s_format == "text":
                        numpy.savetxt(result_file,
                                      data,
                                      delimiter=",",
                                      fmt=s_type)
                        if not key is None:
                            result_file.write(str(key))
                            result_file.flush()
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        first_line = True
                        marker = ""
                        if not key is None:
                            marker = str(key)
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            marker = str(data.marker_name)
                        for line in data:
                            l = list(line)
                            l.append(marker)
                            csvwriter.writerow(list(l))
                            if first_line:
                                first_line = False
                                marker = ""
                        result_file.flush()
            elif s_format in ["mat"]:
                result_file = open(
                    os.path.join(result_path, name + key_str + ".mat"), "w")
                # extract a first time series object to get meta data
                merged_time_series = time_series.pop(0)[0]
                # collect all important information in the collection_object
                collection_object = {
                    "sampling_frequency":
                    merged_time_series.sampling_frequency,
                    "channel_names": merged_time_series.channel_names
                }

                # merge all data
                for (data, key) in time_series:
                    merged_time_series = numpy.vstack(
                        (merged_time_series, data))
                collection_object["data"] = merged_time_series
                mdict = dict()
                mdict[name + key_str] = collection_object
                import scipy.io
                scipy.io.savemat(result_file, mdict=mdict)
            elif s_format in ["eeg"]:

                result_file = open(
                    os.path.join(result_path, name + key_str + ".eeg"), "a+")
                result_file_mrk = open(
                    os.path.join(result_path, name + key_str + ".vmrk"), "w")

                result_file_mrk.write(
                    "Brain Vision Data Exchange Marker File, "
                    "Version 1.0\n")
                result_file_mrk.write("; Data stored by pySPACE\n")
                result_file_mrk.write("[Common Infos]\n")
                result_file_mrk.write("Codepage=UTF-8\n")
                result_file_mrk.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_file_mrk.write("\n[Marker Infos]\n")

                markerno = 1
                datapoint = 1
                sf = None
                channel_names = None

                for t in time_series:
                    if sf is None:
                        sf = t[0].sampling_frequency
                    if channel_names is None:
                        channel_names = t[0].get_channel_names()
                    for mrk in t[0].marker_name.keys():
                        for tm in t[0].marker_name[mrk]:
                            result_file_mrk.write(
                                str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                    (markerno, mrk, datapoint +
                                     (tm * sf / 1000.0))))
                            markerno += 1
                    data_ = t[0].astype(numpy.int16)
                    data_.tofile(result_file)
                    datapoint += data_.shape[0]

                result_hdr = open(
                    os.path.join(result_path, name + key_str + ".vhdr"), "w")

                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                 str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                 str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" % len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" % (1000000 / sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [
                    unicode("0.1,%sV" % unicode(u"\u03BC")),
                    unicode("0.5,%sV" % unicode(u"\u03BC")),
                    unicode("10,%sV" % unicode(u"\u03BC")),
                    unicode("152.6,%sV" % unicode(u"\u03BC"))
                ]
                for i in range(len(channel_names)):
                    result_hdr.write(
                        unicode("Ch%d=%s,,%s\n" %
                                (i + 1, channel_names[i],
                                 unicode(resolutions_str[0]))).encode('utf-8'))
            else:
                raise NotImplementedError("Using unavailable storage format: %s!" %
                                          s_format)
            result_file.close()
        self.update_meta_data({
            "channel_names":
            copy.deepcopy(time_series[0][0].channel_names),
            "sampling_frequency":
            time_series[0][0].sampling_frequency
        })
        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
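
In the eeg branch above, marker positions are written as sample indices computed from the running datapoint counter and the marker time in milliseconds. A standalone sketch of that arithmetic (the helper name is invented):

def marker_sample_index(datapoint, marker_time_ms, sampling_frequency):
    """Convert a marker time in ms to an absolute sample index."""
    return int(datapoint + marker_time_ms * sampling_frequency / 1000.0)

assert marker_sample_index(1, 500, 1000.0) == 501  # 500 ms at 1 kHz
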
Example #19
0
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()
        
        for dataset_dir1 in self.input_datasets:                
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]
                
                # Check if the input data is split
                splitted = len(glob.glob(os.sep.join([dataset_dir1, "data_run0",
                                                      "*"]))) > 1
                
                # Check that all constraints are fulfilled for this pair of
                # input datasets
                if not all(eval(constraint_template % {'dataset_name1': dataset_name1,
                                                       'dataset_name2': dataset_name2})
                                    for constraint_template in self.dataset_constraints):
                    continue
                
                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Copy the data 
                        os.symlink(dataset_dir1,
                                   os.sep.join([self.result_directory, 
                                                dataset_name1]))
                    continue
             
                # Determine names of the original data sets the input 
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]
                
                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1, 
                                                      base_dataset2)
                target_dataset_name = dataset_name1.replace(base_dataset1,
                                                                  mixed_base_dataset)
                
                target_dataset_dir = os.sep.join([self.result_directory,
                                                     target_dataset_name])
                
                create_directory(os.sep.join([target_dataset_dir, "data_run0"]))
                
                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                       "data_run0",
                                                                       "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        source_test_file_name =  source_test_file_name.replace("train.",
                                                                                "test.")
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and 
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace("test.",
                                                                                "train.")
                        target_train_file_name = target_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                output_dataset_meta['author'] = get_author()
                BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)
        
        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
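
As in Example #14, the target dataset name mixes the two base dataset names while keeping the rest of the original name. A standalone sketch of that string manipulation (the dataset names are invented):

dataset_name1 = "{SubjectA}{flow_spec}"
dataset_name2 = "{SubjectB}{flow_spec}"
base_dataset1 = dataset_name1.strip("}{").split("}{")[0]   # "SubjectA"
base_dataset2 = dataset_name2.strip("}{").split("}{")[0]   # "SubjectB"
mixed_base_dataset = "%s_vs_%s" % (base_dataset1, base_dataset2)
target_dataset_name = dataset_name1.replace(base_dataset1, mixed_base_dataset)
assert target_dataset_name == "{SubjectA_vs_SubjectB}{flow_spec}"
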