def test_save_and_load(self):
    """Round-trip a fitted LabelModel through save()/load().

    Verifies that saving then re-loading the model does not raise.
    The temporary directory (and the file inside it) is always
    removed, even if save/load fails.
    """
    import os  # os.path.join; local import keeps this test self-contained

    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    dir_path = tempfile.mkdtemp()
    try:
        # os.path.join (not string concatenation) keeps the file inside
        # dir_path; the original `dir_path + "label_model"` produced a
        # *sibling* path that rmtree(dir_path) never cleaned up.
        save_path = os.path.join(dir_path, "label_model")
        label_model.save(save_path)
        label_model.load(save_path)
    finally:
        shutil.rmtree(dir_path)
def test_save_and_load(self):
    """Save a fitted LabelModel, load it into a fresh instance, and
    check the loaded model reproduces the original predictions.

    Cleanup is guaranteed by try/finally, and the saved file lives
    inside the temp directory so rmtree removes it too.
    """
    import os  # os.path.join; local import keeps this test self-contained

    L = np.array([[0, -1, 0], [0, 1, 1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    original_preds = label_model.predict(L)
    dir_path = tempfile.mkdtemp()
    try:
        # Place the file inside dir_path; the original concatenated
        # strings (`dir_path + "label_model.pkl"`), creating a sibling
        # path that rmtree(dir_path) never deleted.
        save_path = os.path.join(dir_path, "label_model.pkl")
        label_model.save(save_path)
        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
    finally:
        shutil.rmtree(dir_path)
    np.testing.assert_array_equal(loaded_preds, original_preds)
def load(self, dir_name):
    """Restore this object's labeling functions and label model from *dir_name*.

    Reads 'model_lfs.pkl' (pickled LFs) and 'label_model.pkl' from the
    directory and assigns the results to ``self.lfs`` and
    ``self.label_model``.
    """
    lfs_path = os.path.join(dir_name, 'model_lfs.pkl')
    # NOTE(review): pickle.load on an arbitrary file executes code on
    # untrusted input — confirm dir_name only ever points at trusted,
    # locally-produced artifacts.
    with open(lfs_path, "rb") as fh:
        restored_lfs = pickle.load(fh)
    # NOTE(review): LabelModel.load is invoked unbound here; snorkel's
    # LabelModel defines load as an *instance* method, in which case this
    # call would mis-bind the path as `self` — confirm this LabelModel
    # exposes load as a classmethod/alternate constructor.
    restored_model = LabelModel.load(
        os.path.join(dir_name, 'label_model.pkl'))
    self.lfs = restored_lfs
    self.label_model = restored_model
def train_f_on_d_U(self, datafeeder, num_epochs, loss_type):
    """Train the feature model f on labeled (d) plus unlabeled (U) data.

    Selects the TF train/loss ops matching ``loss_type``, optionally
    builds a snorkel LabelModel or MajorityLabelVoter depending on
    ``self.config.mode``, then runs the epoch/batch training loop with
    per-epoch evaluation, tensorboard reporting, early stopping and
    checkpointing.

    Args:
        datafeeder: batch source; provides get_batches_per_epoch(),
            get_batch_size() and get_f_d_U_next_batch().
        num_epochs: number of passes over the f_d_U data.
        loss_type: one of 'pure-likelihood', 'implication', 'pr_loss',
            'gcross', 'gcross_snorkel', 'learn2reweight',
            'label_snorkel', 'pure_snorkel'.

    Raises:
        ValueError: if ``loss_type`` is not one of the values above.
    """
    sess = self.hls.sess
    # f_d_U is a name defined outside this view (presumably a
    # module-level constant identifying this training phase).
    total_batch = datafeeder.get_batches_per_epoch(f_d_U)
    batch_size = datafeeder.get_batch_size(f_d_U)
    # Dispatch: pick the optimizer op and the loss tensor for the
    # requested training objective.
    if loss_type == 'pure-likelihood':
        train_op = self.hls.f_d_U_pure_likelihood_op
        loss_op = self.hls.f_d_U_pure_likelihood_loss
    elif loss_type == 'implication':
        train_op = self.hls.f_d_U_implication_op
        loss_op = self.hls.f_d_U_implication_loss
    elif loss_type == 'pr_loss':
        train_op = self.hls.pr_train_op
        loss_op = self.hls.pr_loss
    elif loss_type == 'gcross':
        train_op = self.hls.gcross_train_op
        loss_op = self.hls.gcross_loss
    elif loss_type == 'gcross_snorkel':
        train_op = self.hls.snork_gcross_train_op
        loss_op = self.hls.snork_gcross_loss
    elif loss_type == 'learn2reweight':
        train_op = self.hls.l2r_train_op
        loss_op = self.hls.l2r_loss
    elif loss_type == 'label_snorkel':
        train_op = self.hls.label_snorkel_train_op
        loss_op = self.hls.label_snorkel_loss
    elif loss_type == 'pure_snorkel':
        train_op = self.hls.pure_snorkel_train_op
        loss_op = self.hls.pure_snorkel_loss
    else:
        raise ValueError('Invalid loss type %s' % loss_type)
    best_saver_f_d_U = self.hls.best_savers.get_best_saver(f_d_U)
    metrics_dict = {}  #{'config': self.config}
    # Snorkel-based modes need a previously trained LabelModel from disk.
    if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
        label_model = LabelModel(cardinality=self.hls.num_classes,
                                 verbose=True)
        if os.path.isfile(
                os.path.join(self.config.data_dir, "saved_label_model")):
            # NOTE(review): snorkel's LabelModel.load(source) restores
            # state in-place and returns None; if that is the LabelModel
            # in use, this assignment rebinds label_model to None and
            # predict_proba below would fail — confirm this load()
            # returns the model.
            label_model = label_model.load(
                os.path.join(self.config.data_dir, "saved_label_model"))
        else:
            print("LABEL MODEL NOT SAVED")
            exit()
    # Majority-vote modes need no training, just the voter object.
    if 'gcross' in self.config.mode or 'learn2reweight' in self.config.mode:
        majority_model = MajorityLabelVoter(
            cardinality=self.hls.num_classes)
    with sess.as_default():
        print("Optimization started for f_d_U with %s loss!" % loss_type)
        print("Batch size: %d!" % batch_size)
        print("Batches per epoch : %d!" % total_batch)
        print("Number of epochs: %d!" % num_epochs)
        # Training cycle
        iteration = 0  # NOTE(review): never read after this — dead local
        global_step = 0
        patience = 0  # epochs since the primary metric last improved
        for epoch in range(num_epochs):
            avg_epoch_cost = 0.
            for i in range(total_batch):
                batch_x, batch_l, batch_m, batch_L, batch_d, batch_r =\
                    datafeeder.get_f_d_U_next_batch()
                feed_dict = {
                    self.hls.f_d_U_adam_lr: self.config.f_d_U_adam_lr,
                    self.hls.f_d_U_x: batch_x,
                    self.hls.f_d_U_l: batch_l,
                    self.hls.f_d_U_m: batch_m,
                    self.hls.f_d_U_L: batch_L,
                    self.hls.f_d_U_d: batch_d,
                    self.hls.f_d_U_r: batch_r
                }
                # Convert (l, m) into the snorkel label-matrix layout.
                batch_lsnork = conv_l_to_lsnork(batch_l, batch_m)
                if 'label_snorkel' == self.config.mode or 'pure_snorkel' == self.config.mode or 'gcross_snorkel' == self.config.mode:
                    batch_snork_L = label_model.predict_proba(
                        L=batch_lsnork)  #snorkel probs
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L
                if 'gcross' == self.config.mode or 'learn2reweight' == self.config.mode:
                    batch_snork_L = majority_model.predict(
                        L=batch_lsnork)  #majority votes
                    batch_snork_L = np.eye(
                        self.hls.num_classes)[batch_snork_L]  #one-hot rep
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L
                merge_dict_a_into_b(self.hls.dropout_train_dict, feed_dict)
                # Run optimization op (backprop) and cost op (to get
                # loss value). num_d is fetched but unused here.
                _, cost, num_d, f_d_U_global_step = sess.run(
                    [
                        train_op, loss_op, self.hls.f_d_U_num_d,
                        self.hls.f_d_U_global_step
                    ],
                    feed_dict=feed_dict)
                # Fractional epoch according to the global step counter.
                global_epoch = f_d_U_global_step / total_batch
                # This assertion is valid only if true U labels are
                # available but not being used such as for synthetic data.
                assert np.all(batch_L <= self.hls.num_classes)
                avg_epoch_cost += cost / total_batch
                # Running mean of the cost over the batches seen so far
                # in this epoch.
                cost1 = (avg_epoch_cost * total_batch) / (i + 1)
                global_step += 1
            # Compute and report metrics, update checkpoints after each
            # epoch.
            print("\n========== epoch : {} ============\n".format(epoch))
            print("cost: {}\n".format(cost1))
            print("patience: {}\n".format(patience))
            precision, recall, f1_score, support = self.hls.test.test_f(
                datafeeder)
            self.compute_f_d_metrics(metrics_dict, precision, recall,
                                     f1_score, support, global_epoch,
                                     f_d_U_global_step)
            print("\nmetrics_dict: ", metrics_dict)
            print()
            self.report_f_d_perfs_to_tensorboard(cost1, metrics_dict,
                                                 global_step)
            did_improve = self.maybe_save_metrics_dict(f_d_U, metrics_dict)
            if did_improve:
                patience = 0  #reset patience if primary metric improved
            else:
                patience += 1
                if patience > self.config.early_stopping_p:
                    print("bye! stopping early!......")
                    break
            # Save checkpoint
            print()
            self.hls.mru_saver.save(global_step)
            print()
            best_saver_f_d_U.save_if_best(
                metrics_dict[self.config.f_d_primary_metric])
            print()
            global_step += 1
        print("Optimization Finished for f_d_U!")
df = df.loc[mask] Y_data = df.bm25_relevant.values print(df.shape) lfs = [ lf.has_type_diap_medd_or_bhvr, lf.is_doctor_reply, lf.has_votes, lf.enity_overlap_jacc, lf.same_author, lf.number_relations_total, lf.entity_types ] applier = PandasLFApplier(lfs) L_data = applier.apply(df=df) label_model = LabelModel(cardinality=2, verbose=True) label_model.load("trained_model_ehf.lbm") valid_probabilities = label_model.predict_proba(L=L_data) if 'predicted_prob' in df: del df['predicted_prob'] df['predicted_prob'] = valid_probabilities[:, 1] PROBABILITY_CUTOFF = 0.5 df['predicted_label'] = df['predicted_prob'] >= PROBABILITY_CUTOFF df_out = df[df['predicted_label'] == int(RELEVANT)][[ 'query_id', 'document_id' ]] with open(qrels_path, 'a+', encoding='utf8') as output_file: for index, row in df_out.iterrows():