Python replace_missingの例、data_converter.replace_missing Pythonの例

コード例 #1

0

ファイルを表示

ファイル: data_manager.py プロジェクト: madclam/VISION_project

   def loadData (self, filename, verbose=True, replace_missing=True):
       ''' Get the data from a text file in one of 3 formats: dense, sparse, sparse_binary.

       Args:
           filename: path of the data file to read.
           verbose: when true, progress messages are printed.
           replace_missing: when true, a dense matrix containing NaN is
               passed through data_converter.replace_missing.

       Returns:
           The loaded data.  When self.use_pickle is set and a cache file
           for this filename exists in self.tmp_dir, the unpickled cache
           content is returned instead of re-reading the text file.
       '''
       if verbose:  print("========= Reading " + filename)
       start = time.time()
       # Only build the cache path when caching is enabled, so tmp_dir is
       # never touched otherwise.
       pickle_path = None
       if self.use_pickle:
           pickle_path = os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")
       if pickle_path is not None and os.path.exists (pickle_path):
           # Pickle streams are binary: text mode ("r") makes pickle.load fail on Python 3.
           # NOTE(review): unpickling runs arbitrary code; tmp_dir must be trusted.
           with open (pickle_path, "rb") as pickle_file:
               vprint (verbose, "Loading pickle file : " + pickle_path)
               return pickle.load(pickle_file)
       if 'format' not in self.info:
           self.getFormatData(filename)
       if 'feat_num' not in self.info:
           self.getNbrFeatures(filename)

       # One reader per supported storage format.
       data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse}

       data = data_func[self.info['format']](filename, self.info['feat_num'])

       # IMPORTANT (original author's note): when we replace missing values we
       # double the number of variables.
       # NOTE(review): confirm against data_converter.replace_missing.

       # np.isnan(...).any() is used instead of np.any(map(np.isnan, data)):
       # on Python 3 the map iterator is wrapped in a 0-d object array that is
       # always truthy, so the old test fired even without any NaN.
       if self.info['format']=='dense' and replace_missing and np.isnan(data).any():
           vprint (verbose, "Replace missing values by 0 (slow, sorry)")
           data = data_converter.replace_missing(data)
       if pickle_path is not None:
           with open (pickle_path, "wb") as pickle_file:
               vprint (verbose, "Saving pickle file : " + pickle_path)
               p = pickle.Pickler(pickle_file)
               p.fast = True
               p.dump(data)
       end = time.time()
       if verbose:  print( "[+] Success in %5.2f sec" % (end - start))
       return data

コード例 #2

0

ファイルを表示

ファイル: data_manager.py プロジェクト: atanna/neptune

	def loadData (self, filename, verbose=True, replace_missing=True):
		''' Get the data from a text file in one of 3 formats: dense, sparse, sparse_binary.

		Args:
			filename: path of the data file to read.
			verbose: when true, progress messages are printed.
			replace_missing: when true, a dense matrix containing NaN is
				passed through data_converter.replace_missing.

		Returns:
			The loaded data.  When self.use_pickle is set and a cache file
			for this filename exists in self.tmp_dir, the unpickled cache
			content is returned instead of re-reading the text file.
		'''
		if verbose:  print("========= Reading " + filename)
		start = time.time()
		# Only build the cache path when caching is enabled, so tmp_dir is
		# never touched otherwise.
		pickle_path = None
		if self.use_pickle:
			pickle_path = os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")
		if pickle_path is not None and os.path.exists (pickle_path):
			# Pickle streams are binary: text mode ("r") makes pickle.load fail on Python 3.
			# NOTE(review): unpickling runs arbitrary code; tmp_dir must be trusted.
			with open (pickle_path, "rb") as pickle_file:
				vprint (verbose, "Loading pickle file : " + pickle_path)
				return pickle.load(pickle_file)
		if 'format' not in self.info:
			self.getFormatData(filename)
		if 'feat_num' not in self.info:
			self.getNbrFeatures(filename)

		# One reader per supported storage format.
		data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse}

		data = data_func[self.info['format']](filename, self.info['feat_num'])

		# IMPORTANT (original author's note): when we replace missing values we
		# double the number of variables.
		# NOTE(review): confirm against data_converter.replace_missing.

		# np.isnan(...).any() is used instead of np.any(map(np.isnan, data)):
		# on Python 3 the map iterator is wrapped in a 0-d object array that is
		# always truthy, so the old test fired even without any NaN.
		if self.info['format']=='dense' and replace_missing and np.isnan(data).any():
			vprint (verbose, "Replace missing values by 0 (slow, sorry)")
			data = data_converter.replace_missing(data)
		if pickle_path is not None:
			with open (pickle_path, "wb") as pickle_file:
				vprint (verbose, "Saving pickle file : " + pickle_path)
				p = pickle.Pickler(pickle_file)
				p.fast = True
				p.dump(data)
		end = time.time()
		if verbose:  print( "[+] Success in %5.2f sec" % (end - start))
		return data

コード例 #3

0

ファイルを表示

    def fit(self, F, y, datainfo, timeinfo):
        '''
        Train self.clf on the numerical features of one batch.

        Args:
            F: mapping of feature groups; only F['numerical'] is used.
            y: label array for the batch (its first axis is the sample axis).
            datainfo: not used by this method.
            timeinfo: sequence whose first two entries are the start times
                used for the "overall" and "dataset" elapsed-time logs.

        The batch is passed through data_converter.replace_missing, stored on
        self.DataX / self.DataY, split 75/25 with train_test_split, and the
        classifier is fitted with the held-out part as eval_set.
        '''
        elapsed_overall = time.time() - timeinfo[0]
        elapsed_dataset = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % elapsed_overall)
        logging.info("[***] Dataset time spent %5.2f sec" % elapsed_dataset)

        # Numerical block only, with missing values handled by the converter.
        features = data_converter.replace_missing(F['numerical'])

        self.num_train_samples = features.shape[0]
        self.num_feat = features.shape[1]
        label_rows = y.shape[0]

        self.DataX = features
        self.DataY = y

        logging.info("The whole available data is: ")
        x_dims = "Real-FIT: dim(X)= [{:d}, {:d}]".format(
            self.DataX.shape[0], self.DataX.shape[1])
        logging.info(x_dims)
        y_dims = "Real-FIT: dim(y)= [{:d}, {:d}]".format(
            self.DataY.shape[0], self.num_labels)
        logging.info(y_dims)

        # Hold out a quarter of the batch for early stopping.
        split = train_test_split(features, y, test_size=.25,
                                 random_state=SEED)
        X_trn, X_val, y_trn, y_val = split
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        if label_rows != self.num_train_samples:
            logging.info("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

コード例 #4

0

ファイルを表示

    def predict(self, F, datainfo, timeinfo):
        '''
        Predict labels for one batch from its numerical features.

        Args:
            F: mapping of feature groups with keys 'numerical', 'CAT' and
                'MV'; only the numerical block is fed to the model.
            datainfo: not used by this method.
            timeinfo: sequence whose first two entries are the start times
                used for the "overall" and "dataset" elapsed-time prints.

        Returns:
            The transposed output of self.clf.predict on the numerical block
            after data_converter.replace_missing.

        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often
        expect a discriminant value (if the area under the ROC curve is the
        metric) rather than the class labels themselves.
        '''
        # get the raw categorical multivalued variables in case you want to process them, in this baseline we simply ignore them
        MV = F['MV']
        CAT = F['CAT']

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # only get numerical variables
        X = F['numerical']

        # hand missing values to the converter
        X = data_converter.replace_missing(X)
        num_test_samples = X.shape[0]
        # A 1-D X has no second axis: count it as a single feature instead of
        # leaving num_feat unbound (which crashed on the print below).
        num_feat = X.shape[1] if X.ndim > 1 else 1
        print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                       num_feat))
        if (self.num_feat != num_feat):
            print(
                "ARRGH: number of features in X does not match training data!")
        print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                       self.num_labels))
        y = self.clf.predict(X)
        print("Y", y)
        y = np.transpose(y)
        return y

コード例 #5

0

ファイルを表示

    def predict(self, F, data_info, time_info):
        '''
        Preprocess one batch in place and return the model's prediction.

        Args:
            F: mapping of feature groups ('numerical', 'CAT' and, when
                self.use_mv is set, 'MV'); its entries are overwritten with
                the preprocessed versions.
            data_info: forwarded unchanged to self.transferPredict.
            time_info: forwarded unchanged to self.transferPredict.

        Returns:
            Whatever self.transferPredict(0, data_info, time_info) returns;
            the same value is kept on self.y_pred.

        Timing for the read and preprocess phases of this batch is recorded
        in self.module_time under the incremented self.batch_num.
        '''
        self.batch_num += 1
        batch = self.batch_num
        self.predict_start_time = time.time()
        # Time elapsed between the end of the last fit and this call.
        read_time = self.predict_start_time - self.fit_end_time
        self.module_time['read_data'][batch] = read_time

        preprocess_began = time.time()

        imputer = self.missing_value_preprocess
        for group, kind in (('numerical', 'ndarray'), ('CAT', 'dataframe')):
            F[group] = imputer.transform(F[group], group, input_type=kind)
        if self.use_mv:
            F['MV'] = imputer.transform(F['MV'], 'MV',
                                        input_type='dataframe')

        cleaned = data_converter.replace_missing(F['numerical'])
        F['numerical'] = cleaned.astype('float32')
        F['CAT'] = F['CAT'].fillna('-1')

        preprocess_ended = time.time()
        self.module_time['preprocess'][batch] = (preprocess_ended -
                                                 preprocess_began)

        # Keep the preprocessed batch on the instance for later steps.
        self.F = F

        self.y_pred = self.transferPredict(0, data_info, time_info)

        self.predict_end_time = time.time()
        return self.y_pred

コード例 #6

0

ファイルを表示

    def predict(self, F, datainfo, timeinfo):
        '''
        Predict positive-class scores for one batch from its numerical features.

        Args:
            F: mapping of feature groups; only F['numerical'] is used.
            datainfo: not used by this method.
            timeinfo: sequence whose first two entries are the start times
                used for the "overall" and "dataset" elapsed-time logs.

        Returns:
            Column 1 of self.clf.predict_proba on the numerical block after
            data_converter.replace_missing (passed through np.transpose).

        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often
        expect a discriminant value (if the area under the ROC curve is the
        metric) rather than the class labels themselves.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # only get numerical variables
        X = F['numerical']

        # hand missing values to the converter
        X = data_converter.replace_missing(X)

        num_test_samples = X.shape[0]
        # A 1-D X has no second axis: count it as a single feature instead of
        # leaving num_feat unbound (which crashed on the log call below).
        num_feat = X.shape[1] if X.ndim > 1 else 1
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        if (self.num_feat != num_feat):
            logging.info(
                "ARRGH: number of features in X does not match training data!")
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

コード例 #7

0

ファイルを表示

    def fit(self, F, y, datainfo, timeinfo):
        '''
        Train (or warm-start) self.clf on a random subsample of one batch.

        Args:
            F: mapping of feature groups with keys 'numerical', 'CAT' and
                'MV'; only the numerical block is used for training.
            y: label array; 1-D or 2-D, first axis is the sample axis.
            datainfo: not used by this method.
            timeinfo: sequence whose first two entries are the start times
                used for the "overall" and "dataset" elapsed-time prints.

        About 90% of the rows are dropped at random before fitting.  When the
        model was already trained, n_estimators is increased by one and
        warm_start is enabled before fitting on the new batch.
        If fit is called multiple times on incremental data (train, test1,
        test2, etc.) past data is NOT available for re-training.
        '''
        # get the raw categorical and categorical multivalued variables in case you want to process them, in this baseline we simply ignore them
        MV = F['MV']
        CAT = F['CAT']

        # only get numerical variables
        X = F['numerical']

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # hand missing values to the converter
        X = data_converter.replace_missing(X)
        self.num_train_samples = X.shape[0]
        if X.ndim > 1: self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]
        if y.ndim > 1: self.num_labels = y.shape[1]

        # subsample the data for efficient processing
        removeperc = 0.9
        if removeperc > 0:
            rem_samples = int(num_train_samples * removeperc)
            # Sorted row indices that are KEPT (the rest is discarded).
            keep = sorted(
                random.sample(range(num_train_samples),
                              num_train_samples - rem_samples))
            num_train_samples = num_train_samples - rem_samples

            # Index on the first axis only, so that 1-D label vectors work
            # too; for 2-D arrays this equals the former [keep, :].
            X = X[keep]
            y = y[keep]
            self.num_train_samples = X.shape[0]

        if self.is_trained:
            # Grow the ensemble by one estimator and keep the existing ones.
            self.clf.set_params(n_estimators=self.clf.n_estimators + 1,
                                warm_start=True)
        self.DataX = X
        self.DataY = y
        print("The whole available data is: ")
        print(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                      self.DataX.shape[1]))
        print(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                      self.num_labels))
        self.clf.fit(self.DataX, np.ravel(self.DataY))

        if (self.num_train_samples != num_train_samples):
            print("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

コード例 #8

0

ファイルを表示

    def fit(self, F, y, data_info, time_info):
        '''
        Record one labelled batch and the time spent on it.

        Args:
            F: mapping of feature groups; preprocessed in place and stored on
                self.F only on the first call (while self.is_trained is false).
            y: label array; it is flattened with ravel().
            data_info: mapping read for 'loaded_feat_types' and 'time_budget'.
            time_info: sequence whose second entry is used as the start time
                of the first batch.

        On the first call the method also derives self.budget_score, creates
        self.time_manager and decides self.use_mv.  On every call the pair
        [self.F, y] is appended to self.data_memory, which is capped at
        self.batch_window_size entries, and per-module timings are printed.
        '''
        first_call = not self.is_trained

        # time budget score
        if first_call:
            type_weights = [1., 1., 2., 4.]
            feat_weight = 0.0
            for idx in range(4):
                feat_weight += (type_weights[idx] *
                                data_info['loaded_feat_types'][idx])
            self.budget_score = float(data_info['time_budget']) / (
                y.shape[0] * 10. / 1e6) / feat_weight
            self.time_manager = TimeManager(self.budget_score,
                                            self.batch_window_size)
            print('budget score: %.2f' % (self.budget_score))

            # Multi-valued features are only used when present and when the
            # budget score is high enough.
            self.use_mv = False
            has_mv = data_info['loaded_feat_types'][3] != 0
            if has_mv and self.budget_score > 3.5:
                self.use_mv = True

        # read data
        if first_call:
            elapsed_reading = time.time() - time_info[1]
            self.module_time['read_data'][self.batch_num] = elapsed_reading

        y = y.ravel()

        # preprocessing
        if first_call:
            preprocess_began = time.time()

            imputer = self.missing_value_preprocess
            F['numerical'] = imputer.fit_transform(F['numerical'],
                                                   'numerical',
                                                   input_type='ndarray')
            F['CAT'] = imputer.fit_transform(F['CAT'],
                                             'CAT',
                                             input_type='dataframe')
            if self.use_mv:
                F['MV'] = imputer.fit_transform(F['MV'],
                                                'MV',
                                                input_type='dataframe')

            cleaned = data_converter.replace_missing(F['numerical'])
            F['numerical'] = cleaned.astype('float32')
            F['CAT'] = F['CAT'].fillna('-1')

            preprocess_ended = time.time()
            self.module_time['preprocess'][self.batch_num] = (
                preprocess_ended - preprocess_began)

            self.F = F

        # store current batch of data, evicting the oldest one when full
        if len(self.data_memory) == self.batch_window_size:
            del self.data_memory[0]
        self.data_memory.append([self.F, y])

        self.batch_end_time = time.time()
        if first_call:
            self.batch_start_time = time_info[1]
        else:
            self.batch_start_time = self.next_batch_start_time

        batch_total = self.batch_end_time - self.batch_start_time
        self.overall_time[self.batch_num] = batch_total
        print('overall time spent on batch %d: %.2f seconds' %
              (self.batch_num, self.overall_time[self.batch_num]))
        for name in module:
            spent = self.module_time[name][self.batch_num]
            share = spent / self.overall_time[self.batch_num]
            print('%s: %.2f seconds, %.2f%%' % (name, spent, share * 100.))
        if not first_call:
            print('time spent ratio: %.2f%%' % (self.time_spent_ratio))

        self.fit_end_time = time.time()
        self.next_batch_start_time = time.time()

コード例 #9

0

ファイルを表示

    def loadDataMV(self, filename, verbose=True, replace_missing=True):
        ''' Load a whitespace-separated text file, split by feature type.

        Columns are grouped with self.feat_type into time, numerical,
        categorical and multi-valued categorical features.  Time and numerical
        columns are stacked (in that order) into one float64 matrix;
        categorical and multi-valued columns are kept as DataFrames read with
        dtype=object.

        Args:
            filename: path of the data file to read.
            verbose: when true, progress messages are printed.
            replace_missing: when true and self.info['format'] is 'dense',
                a matrix containing NaN is passed through
                data_converter.replace_missing.

        Returns:
            dict with keys 'numerical' (the matrix), 'CAT' and 'MV'
            (DataFrames, or [] when that type is absent or failed to load).
            The same dict shape is returned when the matrix comes from the
            pickle cache.

        Side effects:
            Sets self.info['loaded_feat_types'] to [ntime, nnum, ncat, nmvc]
            and overwrites the entries of self.feat_type with their column
            index rendered as a string.
        '''
        if verbose: print("========= Reading " + filename)
        ntime = nnum = ncat = nmvc = 0
        start = time.time()
        # find the type of features for the data set
        dictfeats = self.feat_type
        usetime = np.array(np.where(self.feat_type == 'Time'))[0]
        usenum = np.array(np.where(self.feat_type == 'Numerical'))[0]
        usecat = np.array(np.where(self.feat_type == 'Categorical'))[0]
        usemulticat = np.array(np.where(self.feat_type == 'Multi-value'))[0]
        if verbose:
            print("=== Detected %d Numerical Features" % len(usenum))
            print("=== Detected %d Categorical Features" % len(usecat))
            print("=== Detected %d Multi-valued Categorical Features" %
                  len(usemulticat))
            print("=== Detected %d Time Features" % len(usetime))
        # artificial headers for features
        # NOTE(review): dictfeats aliases self.feat_type, so this loop replaces
        # the stored feature types with index strings; the reads below rely on
        # those unique names.  A second call on the same instance would then
        # detect no typed features -- confirm whether callers reload feat_type.
        for i in range(len(dictfeats)):
            dictfeats[i] = str(i)

        def read_columns(columns, **extra):
            # Read only the given column indices, named by their index strings.
            # sep=r'\s+' is the documented equivalent of the deprecated
            # delim_whitespace=True.
            return pd.read_csv(filename,
                               header=None,
                               names=list(self.feat_type[columns]),
                               usecols=list(columns),
                               sep=r'\s+',
                               na_values='NaN',
                               **extra)

        # read the first column to identify the total number of samples
        df = pd.read_csv(filename,
                         header=None,
                         names=list(dictfeats),
                         sep=r'\s+',
                         usecols=[0],
                         parse_dates=True,
                         na_values='NaN')

        n_samples = len(df.index)

        if verbose: print("=== %d Samples will be loaded " % len(df.index))
        # Placeholder first column so blocks can be concatenated; removed below.
        concadat = np.zeros((n_samples, 1))
        del df

        # Check the available types of features.
        # len(...) > 0 replaces `array != []`: that comparison broadcasts to an
        # empty (falsy) result for a single-column type, silently skipping it,
        # and raises on recent NumPy for longer index arrays.
        if verbose: print("========================")
        if len(usetime) > 0:
            if verbose:
                print("=== Processing %d Time features " % len(usetime))
            try:
                ddt = np.array(read_columns(usetime, parse_dates=True))
                ntime = ddt.shape[1]
                concadat = np.concatenate((concadat, ddt), axis=1)
                del ddt
            except Exception:
                print("Failed to load time variables")

        if len(usenum) > 0:
            if verbose:
                print("=== Processing %d Numerical features " % len(usenum))
            try:
                dd = np.array(read_columns(usenum))
                nnum = dd.shape[1]
                concadat = np.concatenate((concadat, dd), axis=1)
                del dd
            except Exception:
                print("Failed to load numerical variables")

        CAT = []
        if len(usecat) > 0:
            if verbose:
                print("=== Processing %d Categorical features " % len(usecat))
            try:
                # Kept as raw strings; any encoding is left to the caller.
                CAT = read_columns(usecat, dtype=object)
                ncat = CAT.shape[1]
            except Exception:
                print("Failed to load Categorical variables")
                CAT = []

        MV = []
        if len(usemulticat) > 0:
            if verbose:
                print("=== Processing %d Multi Valued Categorical features " %
                      len(usemulticat))
            try:
                MV = read_columns(usemulticat, dtype=object)
                nmvc = MV.shape[1]
            except Exception:
                print("Failed to load Multi-Valued Categorical variables")
                MV = []

        concadat = np.delete(concadat, 0, 1)
        self.info['loaded_feat_types'] = [ntime, nnum, ncat, nmvc]

        # Only build the cache path when caching is enabled, so tmp_dir is
        # never touched otherwise.
        pickle_path = None
        if self.use_pickle:
            pickle_path = os.path.join(self.tmp_dir,
                                       os.path.basename(filename) + ".pickle")

        dataX = None
        if pickle_path is not None and os.path.exists(pickle_path):
            # Pickle streams are binary: "r" makes pickle.load fail on Python 3.
            # NOTE(review): unpickling runs arbitrary code; tmp_dir must be trusted.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                # The cache only holds the matrix; it is wrapped in the usual
                # dict below instead of being returned bare.
                dataX = pickle.load(pickle_file)

        if dataX is None:
            if 'format' not in self.info:
                self.getFormatData(filename)
                print("not in self")
            if 'feat_num' not in self.info:
                self.getNbrFeatures(filename)

            dataX = concadat.astype(np.float64).copy(order='C')

            # IMPORTANT (original author's note): when we replace missing
            # values we double the number of variables.
            # NOTE(review): confirm against data_converter.replace_missing.
            # np.isnan(...).any() replaces np.any(map(np.isnan, dataX)), which
            # is always truthy on Python 3.
            if (self.info['format'] == 'dense' and replace_missing
                    and np.isnan(dataX).any()):
                vprint(verbose, "Replace missing values by 0 (slow, sorry)")
                dataX = data_converter.replace_missing(dataX)
            if pickle_path is not None:
                with open(pickle_path, "wb") as pickle_file:
                    vprint(verbose, "Saving pickle file : " + pickle_path)
                    p = pickle.Pickler(pickle_file)
                    p.fast = True
                    p.dump(dataX)

        end = time.time()
        if verbose:
            print("Loaded %d Samples and %d Features" %
                  (dataX.shape[0], dataX.shape[1]))
            print("[+] Success in %5.2f sec" % (end - start))

        data = {}
        data['numerical'] = dataX
        data['MV'] = MV
        data['CAT'] = CAT
        return data

コード例 #10

0

ファイルを表示

    def fit(self, F, y, datainfo, timeinfo):
        '''
        Train self.clf on a random subsample of one batch, online after the first.

        Args:
            F: mapping of feature groups with keys 'numerical', 'CAT' and
                'MV'; only the numerical block is used for training.
            y: label array; rows are indexed as y[rows, :], so it must be 2-D.
            datainfo: not used by this method.
            timeinfo: sequence whose first two entries are the start times
                used for the "overall" and "dataset" elapsed-time prints.

        About 60% of the rows are dropped at random.  On the first call the
        classifier is fitted on the remaining rows at once.  On later calls
        each remaining row is first predicted, its accuracy is fed to
        self.adwin2, and the row is then learned with partial_fit; when
        adwin2 reports a change the classifier is replaced by a fresh clone
        before learning that row.
        Past data is NOT available for re-training.
        '''
        # get the raw categorical and categorical multivalued variables in case you want to process them, in this baseline we simply ignore them
        MV = F['MV']
        CAT = F['CAT']

        # only get numerical variables
        X = F['numerical']

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # hand missing values to the converter
        X = data_converter.replace_missing(X)
        self.num_train_samples = X.shape[0]
        if X.ndim > 1: self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]
        if y.ndim > 1: self.num_labels = y.shape[1]

        # subsample the data for efficient processing
        removeperc = 0.6
        if removeperc > 0:
            rem_samples = int(num_train_samples * removeperc)
            # Sorted row indices that are KEPT (the rest is discarded).
            keep = sorted(
                random.sample(range(num_train_samples),
                              num_train_samples - rem_samples))
            num_train_samples = num_train_samples - rem_samples

            X = X[keep, :]
            y = y[keep, :]
            self.num_train_samples = X.shape[0]

        self.DataX = X
        self.DataY = y
        print("The whole available data is: ")
        print(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                      self.DataX.shape[1]))
        print(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                      self.num_labels))
        print("np.unique(self.DataY)", np.unique(self.DataY,
                                                 return_counts=True))
        if not self.is_trained:
            self.clf.fit(self.DataX, np.ravel(self.DataY))
        else:
            # Test-then-train, one row at a time.
            for i in range(self.num_train_samples):
                row_x = self.DataX[i, :].reshape(1, -1)
                row_y = self.DataY[i, :]
                print("dim(X)", self.DataX[i, :].shape)
                predictY = self.clf.predict(row_x)
                print("y,predictY:", row_y, predictY)
                print("accuracy score", accuracy_score(predictY, row_y))
                changedetected = self.adwin2.insertInput(
                    accuracy_score(predictY, row_y.reshape(1, -1)))
                print("Change Detected:", changedetected)
                if changedetected:
                    # Start over with an unfitted copy; the first partial_fit
                    # on it is given the class list explicitly.
                    self.clf = clone(self.clf)
                    self.clf.partial_fit(row_x,
                                         row_y.reshape(1, -1).ravel(),
                                         classes=np.unique(self.DataY))
                else:
                    self.clf.partial_fit(row_x,
                                         row_y.reshape(1, -1).ravel())

        if (self.num_train_samples != num_train_samples):
            print("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True