Beispiel #1
0
    def __init__(self, basename, input_dir, verbose=False, replace_missing=True, filter_features=False,
                 only_info=False):
        '''Build the manager for dataset ``basename``.

        The path ``<input_dir>/<basename>_public.info`` is always passed to
        ``getInfo``. Unless ``only_info`` is set, the feature types, the
        training data and labels, the validation and test data and the test
        solution are loaded too, and the splits are stored in ``self.data``.

        Args:
            basename: Dataset name; prefix of every file that is read.
            input_dir: Dataset location. When ``basename`` does not occur in
                this string, ``"/" + basename + "/"`` is appended to it.
            verbose: Forwarded to the loaders and to the feature filter.
            replace_missing: Forwarded to ``loadData`` for the ``.data`` files.
            filter_features: When true, keep only the columns selected by
                ``data_converter.tp_filter`` (at most 1000 are requested).
            only_info: When true, return right after ``getInfo``;
                ``feat_type``, ``data`` and ``feat_idx`` are then not set.
        '''
        # Turn this to True to save data as pickle (inefficient).
        self.use_pickle = False
        self.basename = basename

        # Plain substring test on the directory string.
        self.input_dir = input_dir if basename in input_dir else input_dir + "/" + basename + "/"

        if self.use_pickle:
            # Reuse the first existing scratch directory, else create "tmp".
            for candidate in ("tmp", "../tmp"):
                if os.path.exists(candidate):
                    self.tmp_dir = candidate
                    break
            else:
                os.makedirs("tmp")
                self.tmp_dir = "tmp"

        info_file = os.path.join(self.input_dir, basename + '_public.info')
        self.info = {}
        self.getInfo(info_file)

        # Nothing else to do when only the dataset info was requested.
        if only_info:
            return

        def dataset_file(suffix):
            # Full path of one of the dataset's files.
            return os.path.join(self.input_dir, basename + suffix)

        self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
        self.data = {}
        train_x = self.loadData(dataset_file('_train.data'), verbose=verbose,
                                replace_missing=replace_missing)
        train_y = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
        valid_x = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                                replace_missing=replace_missing)
        test_x = self.loadData(dataset_file('_test.data'), verbose=verbose,
                               replace_missing=replace_missing)
        # NOTE(review): the test solution is read with loadData, whereas the
        # train solution uses loadLabel -- confirm this is intended.
        test_y = self.loadData(dataset_file('_test.solution'), verbose=verbose)

        # Feature selection would normally be a pipeline step; it is done
        # here as preprocessing for efficiency.
        selected = []
        if filter_features:  # ad hoc feature selection, for the example
            wanted = min(train_x.shape[1], 1000)
            selected = data_converter.tp_filter(train_x, train_y, feat_num=wanted, verbose=verbose)
            train_x = train_x[:, selected]
            if valid_x is not None:
                valid_x = valid_x[:, selected]
            if test_x is not None:
                test_x = test_x[:, selected]
        self.feat_idx = np.array(selected).ravel()

        self.data['X_train'] = train_x
        self.data['Y_train'] = train_y
        # Optional splits are stored only when they are not None.
        for key, split in (('X_valid', valid_x), ('X_test', test_x), ('Y_test', test_y)):
            if split is not None:
                self.data[key] = split
 def __init__(self, basename="", input_dir="", verbose=False, replace_missing=True, filter_features=False, max_samples=float('inf')):
     '''Load dataset ``basename`` and store its splits in ``self.data``.

     Args:
         basename: Dataset name; prefix of every file that is read.
         input_dir: Used as is when ``basename`` occurs in it or when it
             already contains ``<basename>_train.data``; otherwise
             ``basename`` is joined onto it.
         verbose: Forwarded to the loaders and to the feature filter.
         replace_missing: Forwarded to ``loadData``.
         filter_features: When true, keep only the columns selected by
             ``data_converter.tp_filter`` (at most 1000 are requested).
         max_samples: Upper bound on the number of training rows kept.
             May be a float (the default is infinity).

     When the module-level ``GOD_VIEW`` flag is true, the validation and
     test solutions are loaded as well; a failure to load them is reported
     with a printed message and otherwise ignored.
     '''
     self.use_pickle = False # Turn this to true to save data as pickle (inefficient)
     self.basename = basename
     if basename in input_dir or os.path.isfile(os.path.join(input_dir, basename + '_train.data')):
         self.input_dir = input_dir
     else:
         self.input_dir = os.path.join(input_dir, basename)
     if self.use_pickle:
         if os.path.exists("tmp"):
             self.tmp_dir = "tmp"
         elif os.path.exists("../tmp"):
             self.tmp_dir = "../tmp"
         else:
             os.makedirs("tmp")
             self.tmp_dir = "tmp"
     info_file = os.path.join(self.input_dir, basename + '_public.info')
     self.info = {}
     self.getInfo(info_file)
     self.feat_type = self.loadType(os.path.join(self.input_dir, basename + '_feat.type'), verbose=verbose)
     self.data = {}
     Xtr = self.loadData(os.path.join(self.input_dir, basename + '_train.data'), verbose=verbose, replace_missing=replace_missing)
     Ytr = self.loadLabel(os.path.join(self.input_dir, basename + '_train.solution'), verbose=verbose)
     # min() hands back max_samples itself when it is the smaller value, so
     # a finite float cap (e.g. 1e3) would reach the slice below; int()
     # keeps the slice bound integral.
     max_samples = int(min(Xtr.shape[0], max_samples))
     Xtr = Xtr[0:max_samples]
     Ytr = Ytr[0:max_samples]
     Xva = self.loadData(os.path.join(self.input_dir, basename + '_valid.data'), verbose=verbose, replace_missing=replace_missing)
     Xte = self.loadData(os.path.join(self.input_dir, basename + '_test.data'), verbose=verbose, replace_missing=replace_missing)
     # Normally, feature selection should be done as part of a pipeline.
     # However, here we do it as a preprocessing for efficiency reason
     idx = []
     if filter_features: # ad hoc feature selection, for the example...
         fn = min(Xtr.shape[1], 1000)
         idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
         Xtr = Xtr[:, idx]
         # Do not index a split that is None.
         if Xva is not None:
             Xva = Xva[:, idx]
         if Xte is not None:
             Xte = Xte[:, idx]
     self.feat_idx = np.array(idx).ravel()
     self.data['X_train'] = Xtr
     self.data['Y_train'] = Ytr
     self.data['X_valid'] = Xva
     self.data['X_test'] = Xte
     if GOD_VIEW:
         try:
             Yva = self.loadLabel(os.path.join(self.input_dir, basename + '_valid.solution'), verbose=verbose)
             Yte = self.loadLabel(os.path.join(self.input_dir, basename + '_test.solution'), verbose=verbose)
         except Exception:
             # Best effort only; Exception (not a bare except) so that
             # KeyboardInterrupt and SystemExit still propagate.
             print("Sadly you are not really the god so can't load solutions for validation and test.")
         else:
             # Both solutions loaded: store them together, as before.
             self.data['Y_valid'] = Yva
             self.data['Y_test'] = Yte
    def __init__(self, basename, input_dir, verbose=False, replace_missing=True, filter_features=False,
                 only_info=False):
        '''Build the manager for dataset ``basename``.

        The path ``<input_dir>/<basename>_public.info`` is always passed to
        ``getInfo``. Unless ``only_info`` is set, the feature types, the
        training data and labels and the validation and test data are loaded
        too, and the splits are stored in ``self.data``.

        Args:
            basename: Dataset name; prefix of every file that is read.
            input_dir: Dataset location. When ``basename`` does not occur in
                this string, ``"/" + basename + "/"`` is appended to it.
            verbose: Forwarded to the loaders and to the feature filter.
            replace_missing: Forwarded to ``loadData``.
            filter_features: When true, keep only the columns selected by
                ``data_converter.tp_filter`` (at most 1000 are requested).
            only_info: When true, return right after ``getInfo``;
                ``feat_type``, ``data`` and ``feat_idx`` are then not set.
        '''
        # Turn this to True to save data as pickle (inefficient).
        self.use_pickle = False
        self.basename = basename

        # Plain substring test on the directory string.
        if basename not in input_dir:
            self.input_dir = input_dir + "/" + basename + "/"
        else:
            self.input_dir = input_dir

        if self.use_pickle:
            # Reuse the first existing scratch directory, else create "tmp".
            for candidate in ("tmp", "../tmp"):
                if os.path.exists(candidate):
                    self.tmp_dir = candidate
                    break
            else:
                os.makedirs("tmp")
                self.tmp_dir = "tmp"

        info_file = os.path.join(self.input_dir, basename + '_public.info')
        self.info = {}
        self.getInfo(info_file)

        # Nothing else to do when only the dataset info was requested.
        if only_info:
            return

        def dataset_file(suffix):
            # Full path of one of the dataset's files.
            return os.path.join(self.input_dir, basename + suffix)

        self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
        self.data = {}
        train_x = self.loadData(dataset_file('_train.data'), verbose=verbose,
                                replace_missing=replace_missing)
        train_y = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
        valid_x = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                                replace_missing=replace_missing)
        test_x = self.loadData(dataset_file('_test.data'), verbose=verbose,
                               replace_missing=replace_missing)

        # Feature selection would normally be a pipeline step; it is done
        # here as preprocessing for efficiency.
        chosen = []
        if filter_features:  # ad hoc feature selection, for the example
            wanted = min(train_x.shape[1], 1000)
            chosen = data_converter.tp_filter(train_x, train_y, feat_num=wanted, verbose=verbose)

            def keep_columns(matrix):
                # A split that is None is passed through unchanged.
                return matrix if matrix is None else matrix[:, chosen]

            train_x = train_x[:, chosen]
            valid_x = keep_columns(valid_x)
            test_x = keep_columns(test_x)
        self.feat_idx = np.array(chosen).ravel()

        self.data['X_train'] = train_x
        self.data['Y_train'] = train_y
        # Validation/test data are stored only when they are not None.
        for key, split in (('X_valid', valid_x), ('X_test', test_x)):
            if split is not None:
                self.data[key] = split
Beispiel #4
0
 def __init__(self, basename="", input_dir="", verbose=False, replace_missing=True, filter_features=False,
              max_samples=float('inf')):
     '''Load dataset ``basename`` and store its splits in ``self.data``.

     Args:
         basename: Dataset name; prefix of every file that is read.
         input_dir: Used as is when ``basename`` occurs in it or when it
             already contains ``<basename>_train.data``; otherwise
             ``basename`` is joined onto it.
         verbose: Forwarded to the loaders and to the feature filter.
         replace_missing: Forwarded to ``loadData``.
         filter_features: When true, keep only the columns selected by
             ``data_converter.tp_filter`` (at most 1000 are requested).
         max_samples: Upper bound on the number of training rows kept.
             May be a float (the default is infinity).
     '''
     self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
     self.basename = basename
     if basename in input_dir or os.path.isfile(os.path.join(input_dir, basename + '_train.data')):
         self.input_dir = input_dir
     else:
         self.input_dir = os.path.join(input_dir, basename)
     if self.use_pickle:
         if os.path.exists("tmp"):
             self.tmp_dir = "tmp"
         elif os.path.exists("../tmp"):
             self.tmp_dir = "../tmp"
         else:
             os.makedirs("tmp")
             self.tmp_dir = "tmp"
     info_file = os.path.join(self.input_dir, basename + '_public.info')
     self.info = {}
     self.getInfo(info_file)
     self.feat_type = self.loadType(os.path.join(self.input_dir, basename + '_feat.type'), verbose=verbose)
     self.data = {}
     Xtr = self.loadData(os.path.join(self.input_dir, basename + '_train.data'), verbose=verbose,
                         replace_missing=replace_missing)
     Ytr = self.loadLabel(os.path.join(self.input_dir, basename + '_train.solution'), verbose=verbose)
     # min() hands back max_samples itself when it is the smaller value, so
     # a finite float cap (e.g. 1e3) would reach the slice below; int()
     # keeps the slice bound integral.
     max_samples = int(min(Xtr.shape[0], max_samples))
     Xtr = Xtr[0:max_samples]
     Ytr = Ytr[0:max_samples]
     Xva = self.loadData(os.path.join(self.input_dir, basename + '_valid.data'), verbose=verbose,
                         replace_missing=replace_missing)
     Xte = self.loadData(os.path.join(self.input_dir, basename + '_test.data'), verbose=verbose,
                         replace_missing=replace_missing)
     # Normally, feature selection should be done as part of a pipeline.
     # However, here we do it as a preprocessing for efficiency reason
     idx = []
     if filter_features:  # ad hoc feature selection, for the example...
         fn = min(Xtr.shape[1], 1000)
         idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
         Xtr = Xtr[:, idx]
         # Do not index a split that is None.
         if Xva is not None:
             Xva = Xva[:, idx]
         if Xte is not None:
             Xte = Xte[:, idx]
     self.feat_idx = np.array(idx).ravel()
     self.data['X_train'] = Xtr
     self.data['Y_train'] = Ytr
     self.data['X_valid'] = Xva
     self.data['X_test'] = Xte
Beispiel #5
0
    def __init__(self,
                 basename="",
                 input_dir="",
                 verbose=False,
                 replace_missing=True,
                 filter_features=False,
                 max_samples=float('inf')):
        '''Load dataset ``basename`` and store its splits in ``self.data``.

        Args:
            basename: Dataset name; prefix of every file that is read.
            input_dir: Used as is when ``basename`` occurs in it; otherwise
                ``basename`` is joined onto it.
            verbose: Forwarded to the loaders and to the feature filter.
            replace_missing: Forwarded to ``loadData``.
            filter_features: When true, ``data_converter.tp_filter`` is run
                and, if the training data has at least 100 columns, all
                splits are projected onto 100 PCA components fitted on the
                training data.
            max_samples: Upper bound on the number of training rows kept.
                May be a float (the default is infinity).
        '''
        self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
        self.basename = basename
        if basename in input_dir:
            self.input_dir = input_dir
        else:
            self.input_dir = os.path.join(input_dir, basename)
        if self.use_pickle:
            if os.path.exists("tmp"):
                self.tmp_dir = "tmp"
            elif os.path.exists("../tmp"):
                self.tmp_dir = "../tmp"
            else:
                os.makedirs("tmp")
                self.tmp_dir = "tmp"
        info_file = os.path.join(self.input_dir, basename + '_public.info')
        self.info = {}
        self.getInfo(info_file)
        self.feat_type = self.loadType(os.path.join(self.input_dir,
                                                    basename + '_feat.type'),
                                       verbose=verbose)
        self.data = {}
        Xtr = self.loadData(os.path.join(self.input_dir,
                                         basename + '_train.data'),
                            verbose=verbose,
                            replace_missing=replace_missing)
        Ytr = self.loadLabel(os.path.join(self.input_dir,
                                          basename + '_train.solution'),
                             verbose=verbose)
        # min() hands back max_samples itself when it is the smaller value,
        # so a finite float cap (e.g. 1e3) would reach the slice below;
        # int() keeps the slice bound integral.
        max_samples = int(min(Xtr.shape[0], max_samples))
        Xtr = Xtr[0:max_samples]
        Ytr = Ytr[0:max_samples]
        Xva = self.loadData(os.path.join(self.input_dir,
                                         basename + '_valid.data'),
                            verbose=verbose,
                            replace_missing=replace_missing)
        Xte = self.loadData(os.path.join(self.input_dir,
                                         basename + '_test.data'),
                            verbose=verbose,
                            replace_missing=replace_missing)
        # Normally, feature selection should be done as part of a pipeline.
        # However, here we do it as a preprocessing for efficiency reason
        idx = []
        if filter_features:  # ad hoc feature selection, for the example...
            fn = min(Xtr.shape[1], 100)
            # NOTE(review): the indices returned by tp_filter are only kept
            # in self.feat_idx; they are never applied to the data here --
            # confirm this is intended.
            idx = data_converter.tp_filter(Xtr,
                                           Ytr,
                                           feat_num=fn,
                                           verbose=verbose)

            # fn == 100 means the training data has at least 100 columns;
            # only then is the PCA projection applied.
            if fn == 100:
                pca = PCA(n_components=int(fn))
                Xtr = pca.fit_transform(Xtr)
                # Do not transform a split that is None.
                if Xva is not None:
                    Xva = pca.transform(Xva)
                if Xte is not None:
                    Xte = pca.transform(Xte)
        self.feat_idx = np.array(idx).ravel()
        self.data['X_train'] = Xtr
        self.data['Y_train'] = Ytr
        self.data['X_valid'] = Xva
        self.data['X_test'] = Xte