def pyunit_deep_copy():
    """Check that frames produced by ``h2o.deep_copy`` are independent copies.

    The prostate data set is imported twice and each import is deep-copied.
    Missing values are then injected into the first original and into the
    second copy. In both pairs the NA counts of original and copy must differ
    afterwards, showing that a change to one side does not reach the other.
    """
    mutated_original = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    untouched_original = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    untouched_copy = h2o.deep_copy(mutated_original, "copy")
    mutated_copy = h2o.deep_copy(untouched_original, "copy2")

    # Alter exactly one member of each (original, copy) pair.
    mutated_original.insert_missing_values()
    mutated_copy.insert_missing_values()

    # Dump all four frames, each preceded by its caption.
    report = (
        ("Original Frame with inserted missing values:", mutated_original),
        ("Duplicate Frame with no inserted missing values", untouched_copy),
        ("Original Frame with no inserted missing values:", untouched_original),
        ("Duplicate Frame with inserted missing values", mutated_copy),
    )
    for caption, frame in report:
        print(caption)
        print(frame)

    print("Number of frames in session after deep_copy")
    print(h2o.ls())

    original_na = mutated_original.nacnt()
    copy_na = untouched_copy.nacnt()
    assert original_na != copy_na, \
        "Inserted NA's into the original frame but the original seems to match the duplicates NA count!"

    original_na = untouched_original.nacnt()
    copy_na = mutated_copy.nacnt()
    assert original_na != copy_na, \
        "Inserted NA's into the duplicate frame but the original seems to match the originals NA count!"
def run_two_step_model(self, model=None, rass=False):
    """Run the iterative two-step labelling procedure.

    :param model: estimator to train at every step. When ``None`` a random
        forest (``ntrees=100``, ``col_sample_rate_per_tree=0.9``,
        ``model_id='two_step_model'``) is created.
    :param rass: when false, ``self.first_step()`` builds the initial ``ys``
        labels; when true, ``ys`` is initialised as a copy of the ``target``
        column of ``self.df``.
    :return: None. Sets ``self.model``, updates ``self.df`` and finally calls
        ``self.change_target()``.
    """
    if model is None:
        model = h2o.estimators.random_forest.H2ORandomForestEstimator(
            col_sample_rate_per_tree=0.9, ntrees=100,
            model_id='two_step_model')
    self.model = model

    if not rass:
        self.first_step()
    else:
        self.df['ys'] = self.df['target']

    for step in range(10):
        log(f'Step {step}. {time.ctime()}')
        df2 = self.second_step()
        # ``==`` on H2OFrames is element-wise and returns a frame, so it has
        # to be reduced with ``.all()``. Testing the frame's truthiness
        # directly is true for any non-empty frame and would end the loop
        # after the very first step.
        # NOTE(review): assumes both 'ys' columns are comparable (same factor
        # levels) - confirm against second_step().
        if (df2['ys'] == self.df['ys']).all():
            log('Finished')
            break
        # No intermediate deep copy: the previous code created one and
        # immediately overwrote the reference with df2.
        self.df = df2

    self.change_target()
def analyze_improvements(self, df_eval, target_class, base_line_scoring):
    """Score single-variable modifications of ``df_eval`` against a baseline.

    For every entry of ``self.actionable_q`` that carries an "actionable"
    key, a deep copy of ``df_eval`` is made, the entry's variable is replaced
    by the value proposed by ``self._get_next_val`` and the copy is re-scored
    with ``self._get_evaluation``.

    :param df_eval: frame to evaluate; it is deep-copied, never modified.
    :param target_class: class label; the prediction column read is
        ``"p" + str(target_class)``.
    :param base_line_scoring: baseline probability subtracted from each new
        prediction to obtain ``delta``.
    :return: pandas DataFrame of the modifications with ``delta > 0``, sorted
        by ``delta`` descending. Columns: varname, p<target_class>,
        target_class, meta, new_val_itm, init_val_itm, delta. An empty frame
        with these columns is returned when nothing was collected.
    """
    prob_col = "p" + str(target_class)
    result_columns = ["varname", prob_col, "target_class", "meta",
                      "new_val_itm", "init_val_itm", "delta"]

    improvement_results = []
    for q in self.actionable_q:
        try:
            # Only actionable entries can be modified; skipping the others
            # here avoids creating a frame copy that is never used.
            if "actionable" not in q:
                continue

            varname = q["varname"]
            df_mod = h2o.deep_copy(df_eval, "df_" + str(uuid.uuid1()))
            (flag_re_run, new_val, new_val_itm,
             init_val_itm, action_meta) = self._get_next_val(q, df_mod[varname])
            if not flag_re_run:
                continue

            # Predict with the slightly modified feature vector.
            df_mod[varname] = new_val
            df_prediction = self._get_evaluation(df_mod)
            score = float(df_prediction[prob_col])

            improvement_results.append({
                "varname": varname,
                prob_col: score,
                "target_class": str(target_class),
                "meta": action_meta,
                "new_val_itm": new_val_itm,
                "init_val_itm": init_val_itm,
                "delta": score - base_line_scoring,
            })
        except Exception as e:
            # Best effort: one failing variable must not abort the analysis.
            print(e)

    df_improvements = pd.DataFrame(improvement_results, columns=result_columns)
    if df_improvements.empty:
        # Without this guard an empty result has no "delta" column and the
        # sort below raises KeyError.
        return df_improvements

    df_improvements = df_improvements.sort_values(by="delta", ascending=False)
    return df_improvements[df_improvements["delta"] > 0]
def h2odeep_copy():
    """
    Python API test: h2o.deep_copy(data, xid)

    Imports the benign data set, deep-copies it and checks that both objects
    are H2OFrames with equal NA counts. Missing values are then inserted into
    the copy only, after which the NA counts must no longer be equal.
    """
    failure_msg = "h2o.deep_copy() command is not working."

    source_frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    duplicate = h2o.deep_copy(source_frame, "new_frame")

    for frame in (source_frame, duplicate):
        assert_is_type(frame, H2OFrame)

    # Straight after the copy both frames must report the same NA counts.
    assert source_frame.nacnt() == duplicate.nacnt(), failure_msg

    # Randomly add missing values to the copy with high probability.
    duplicate.insert_missing_values(fraction=0.9)
    assert source_frame.nacnt() != duplicate.nacnt(), failure_msg
def h2odeep_copy():
    """
    Python API test: h2o.deep_copy(data, xid)

    Imports the benign data set, deep-copies it and checks that both objects
    are H2OFrames with equal NA counts. Missing values are then inserted into
    the copy only, after which the NA counts must differ.

    Any failure surfaces as an AssertionError. Unexpected exceptions are
    reported together with their original cause instead of being hidden
    behind a generic message.
    """
    try:
        new_name = "new_frame"
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
        training_copy = h2o.deep_copy(training_data, new_name)
        assert_is_type(training_data, H2OFrame)
        assert_is_type(training_copy, H2OFrame)
        assert training_data.nacnt() == training_copy.nacnt(), "h2o.deep_copy() command is not working."
        # Randomly add missing values to the copy with high probability.
        training_copy.insert_missing_values(fraction=0.9)
        assert training_data.nacnt() != training_copy.nacnt(), "h2o.deep_copy() command is not working."
    except AssertionError:
        # Already the failure type expected from this test; keep its traceback.
        raise
    except Exception as e:
        # Raise explicitly rather than ``assert False``: asserts are stripped
        # under ``python -O``, which would silently swallow every error here.
        raise AssertionError(
            "h2o.deep_copy() command is not working: {0!r}".format(e))
def second_step(self):
    """Retrain on the current ``ys`` labels and relabel rows with ``ys < 0``.

    Works on a deep copy of ``self.df`` (key ``'df2'``). ``self.model`` is
    trained on ``self.features`` with ``ys`` as a factor response, and its
    ``p1`` prediction is stored in the copy. Rows with ``ys < 0`` whose
    ``p1`` is above the maximum ``p1`` of the ``ys > 0`` rows get ``ys = 1``.
    Rows still ``ys < 0`` whose ``p1`` is below the minimum get ``ys = 0``.

    :return: the relabelled copy, with ``ys`` converted back to a factor.
    """
    work = h2o.deep_copy(self.df, 'df2')
    work['ys'] = work['ys'].asfactor()
    self.model.train(x=list(self.features), y='ys', training_frame=work)

    work['p1'] = self.model.predict(work)['p1']
    work['ys'] = work['ys'].asnumeric()

    # Probability range observed on the rows with ys > 0.
    labelled_p1 = work[work['ys'] > 0, 'p1']
    max_prob = labelled_p1.max()
    min_prob = labelled_p1.min()

    above_range = (work['ys'] < 0) & (work['p1'] > max_prob)
    log('New positives: {0}.'.format(work[above_range].shape))
    work[above_range, 'ys'] = 1

    # Built after the update above so it sees the new positives.
    below_range = (work['ys'] < 0) & (work['p1'] < min_prob)
    log('New negatives: {0}.'.format(work[below_range].shape))
    work[below_range, 'ys'] = 0

    work['ys'] = work['ys'].asfactor()
    return work
def probabilistic_labels2weights(df,prob_label_col='prob_bogus',label_var='is_bogus'):
    """
    Turn a frame with probabilistic labels into a weighted, hard-labelled one.

    Every observation of @df is duplicated into two rows:
    (label_var=1, weight=prob) and (label_var=0, weight=1-prob), where prob is
    the value found in @prob_label_col (expected in 0-1). All rows labelled 1
    come first, followed by all rows labelled 0.

    The input frame is left untouched; the work is done on copies.

    :param df: pandas DataFrame or H2OFrame holding @prob_label_col.
    :param prob_label_col: name of the column with the label probability.
    :param label_var: name of the hard-label column to create.
    :return: frame of the same kind as @df (pandas in, pandas out; H2OFrame
        in, H2OFrame out) with twice the rows plus the columns @label_var and
        'weight'.
    :raises ValueError: if @df is neither a pandas DataFrame nor an H2OFrame.
    """
    is_pandas = isinstance(df, pd.DataFrame)
    if is_pandas:
        positives = df.copy()
        negatives = df.copy()
    elif isinstance(df, h2o.H2OFrame):
        import uuid
        # Unique keys so repeated or concurrent calls never overwrite each
        # other's intermediate frames on the H2O cluster.
        positives = h2o.deep_copy(df, 'prob_labels_pos_' + uuid.uuid4().hex)
        negatives = h2o.deep_copy(df, 'prob_labels_neg_' + uuid.uuid4().hex)
    else:
        raise ValueError('not a data frame')

    positives[label_var] = 1
    positives['weight'] = positives[prob_label_col]
    negatives[label_var] = 0
    negatives['weight'] = 1 - negatives[prob_label_col]

    if is_pandas:
        return pd.concat([positives, negatives], axis=0)
    return positives.concat(negatives, axis=0)
def first_step(self):
    """Train on ``target`` and derive the initial ``ys`` labels.

    Works on a deep copy of ``self.df`` (key ``'df_'``). ``self.model`` is
    trained on ``self.features`` with ``target`` as response. ``ys`` is set
    to ``target * 2 - 1`` and the model's ``p1`` prediction is appended.
    Rows with ``ys < 0`` whose ``p1`` is above the maximum ``p1`` of the
    ``ys > 0`` rows get ``ys = 1``. Rows still ``ys < 0`` whose ``p1`` is
    below the minimum get ``ys = 0``.

    :return: None. The resulting frame is stored in ``self.df``.
    """
    frame = h2o.deep_copy(self.df, 'df_')
    self.model.train(x=list(self.features), y='target', training_frame=frame)
    prediction = self.model.predict(frame)

    frame['target'] = frame['target'].asnumeric()
    frame['ys'] = frame['target'] * 2 - 1
    frame = frame.cbind(prediction['p1'])

    # Probability range observed on the rows with ys > 0.
    labelled_p1 = frame[frame['ys'] > 0, 'p1']
    max_prob = labelled_p1.max()
    min_prob = labelled_p1.min()

    above_range = (frame['ys'] < 0) & (frame['p1'] > max_prob)
    log('New positives: {0}.'.format(frame[above_range].shape))
    frame[above_range, 'ys'] = 1

    # Built after the update above so it sees the new positives.
    below_range = (frame['ys'] < 0) & (frame['p1'] < min_prob)
    log('New negatives: {0}.'.format(frame[below_range].shape))
    frame[below_range, 'ys'] = 0

    frame['ys'] = frame['ys'].asfactor()
    self.df = frame
def prepare_data(self, target: str = None, to_target: str = None,
                 n_sample: int = 10, hidden_size: int = 5,
                 process: bool = True, rass=False):
    """
    Prepare data for model.

    Builds the working frame from ``self.data`` and stores it in ``self.df``,
    together with a deep copy in ``self.orig_df`` (key ``'orig_df'``). Also
    sets ``self.hidden_size`` and ``self.features``, and, when
    ``hidden_size`` is non-zero, ``self.orig_target``.

    :param target: name of a column that must exist in ``self.data``. It is
        only checked for existence; all later code reads the column named
        ``'target'``. Ignored when ``rass`` is true.
    :param to_target: used only when ``target`` is None and ``rass`` is
        false. A ``'target'`` column is created as 0 and set to 1 wherever
        this column differs from the literal backslash-N marker.
    :param n_sample: number of leading rows kept from the ``'0'`` class
        (``'-1'`` class when ``rass`` is true).
    :param hidden_size: number of randomly chosen ``'1'`` rows relabelled to
        ``'0'`` (``'-1'`` when ``rass`` is true). 0 disables relabelling.
    :param process: when true the assembled frame is passed through
        ``process_df``.
    :param rass: when true the existing ``'target'`` column is given the
        levels ``['0', '1', '-1']`` and ``'-1'`` plays the role that ``'0'``
        has otherwise.
    :raises ValueError: when ``rass`` is false and either ``target`` is not
        a column of ``self.data`` or neither ``target`` nor ``to_target`` is
        given.
    :return: None
    """
    self.hidden_size = hidden_size
    if not rass:
        if target is not None:
            if target in self.data.columns:
                pass
            else:
                raise ValueError(f'{target} column not found!')
        elif to_target is not None:
            # Derive a binary target: 1 wherever to_target is not the
            # backslash-N marker, 0 elsewhere.
            self.data['target'] = 0
            self.data[self.data[to_target] != '\\N', 'target'] = 1
        else:
            raise ValueError('Target column not defined!')
        self.data['target'] = self.data['target'].asfactor()
        if hidden_size == 0:
            # sample data from negative class
            print('Model will be trained without validation.')
            df = self.data[self.data['target'] == '0'][:n_sample, :].rbind(
                self.data[self.data['target'] == '1'])
        else:
            # random sampling of positive, the rest is validation.
            # df_ is built only to remember the labels before any are hidden.
            df_ = self.data[self.data['target'] == '0'][:n_sample, :].rbind(
                self.data[self.data['target'] == '1'])
            self.orig_target = df_['target']
            print('Doing random sampling')
            target_1_len = self.data[self.data['target'] == '1'].shape[0]
            # Pick hidden_size distinct row positions among the '1' rows.
            rand_ind = np.random.choice(range(target_1_len), hidden_size,
                                        replace=False)
            data_1 = self.data[self.data['target'] == '1']
            # Relabel the picked positives as '0'.
            data_1[list(rand_ind), 'target'] = '0'
            df = self.data[self.data['target'] == '0'][:n_sample, :].rbind(
                data_1)
    else:
        self.data['target'] = self.data['target'].set_levels(
            ['0', '1', '-1'])
        if hidden_size == 0:
            # sample data from negative class
            print('Model will be trained without validation.')
            df = self.data[
                self.data['target'] == '-1'][:n_sample, :].rbind(
                    self.data[self.data['target'] == '0']).rbind(
                        self.data[self.data['target'] == '1'])
        else:
            # random sampling of positive, the rest is validation.
            # df_ is built only to remember the labels before any are hidden.
            df_ = self.data[
                self.data['target'] == '-1'][:n_sample, :].rbind(
                    self.data[self.data['target'] == '0']).rbind(
                        self.data[self.data['target'] == '1'])
            self.orig_target = df_['target']
            print('Doing random sampling')
            target_1_len = self.data[self.data['target'] == '1'].shape[0]
            # Pick hidden_size distinct row positions among the '1' rows.
            rand_ind = np.random.choice(range(target_1_len), hidden_size,
                                        replace=False)
            data_1 = self.data[self.data['target'] == '1']
            # Relabel the picked positives as '-1'.
            data_1[list(rand_ind), 'target'] = '-1'
            df = self.data[self.data['target'] == '-1'][:n_sample, :].rbind(data_1).rbind(
                self.data[self.data['target'] == '0'])
    # NOTE(review): presumably the first three columns and the last one are
    # not features (ids / target) - confirm against the layout of self.data.
    self.features = df.columns[3:-1]
    if process:
        df = process_df(df)
    self.df = df
    self.orig_df = h2o.deep_copy(self.df, 'orig_df')
h2o.connection.H2OConnection.post("GarbageCollect") h2o.connection.H2OConnection.post("GarbageCollect") h2o.connection.H2OConnection.post("GarbageCollect") #loop through post_cols start = False for c_i, post_col in enumerate(post_cols): if post_col == settings.start_icd or settings.start_icd == "": start = True if start: print utils.time() + 'Work on ' + post_col # calculate node statistics node = post_col.replace(settings.post_prefix, "") idx_col = settings.index_prefix + node if post_col == "POST_DEATH": possible_incidents = h2o.deep_copy(matrix, "poss_inc") else: pre_col = settings.pre_prefix + node pre_col_vector = matrix[pre_col] print "...Anteil possible incidents: " + str( pre_col_vector.mean() ) #needed for tricking h2o to not lazy calc this vector -> MEM issues possible_incidents = matrix[pre_col_vector == 0] h2o.remove(pre_col_vector) pre_col_vector = None print "...Possible incidents: " + str( possible_incidents.nrow ) #needed for tricking h2o to not lazy calc this vector -> MEM issues post_col_vector = possible_incidents[post_col] print "...Anteil real incidents: " + str(post_col_vector.mean()) real_incidents = possible_incidents[post_col_vector > 0]