def read_labeled_text_data(file_path, encoding, save=False, save_path=None, flatten=False, make_ft=True):
    """Read one labeled text file and build CRF training sequences.

    Args:
        file_path: path of the labeled input file.
        encoding: text encoding passed to codecs.open.
        save: when truthy, pickle growing prefixes of the parsed data.
        save_path: path prefix for the '<N>.pkl' dump files (required if save).
        flatten: forwarded to the pre.* helpers.
        make_ft: when truthy, also build feature sequences (X); otherwise X is [].

    Returns:
        (orisents, nesents, sents, X, y) as produced by the pre.* helpers.
    """
    # 'with' guarantees the handle is closed even if parsing raises;
    # the original bare f.close() was skipped on error.
    with codecs.open(file_path, 'r', encoding=encoding) as f:
        orisents, nesents, sents = pre.read_labeled_file(f, flatten=flatten)

    X = []
    if make_ft:
        X = [pre.sent2features(s, flatten=flatten) for s in sents]
    y = [pre.sent2labels(s, flatten=flatten) for s in sents]

    if save:
        # Dump nested prefixes so partial datasets of several sizes are available.
        save_points = [10000, 50000, 200000, 1000000]
        for sp in save_points:
            st.write_log('writing : ' + str(sp) + '.pkl',
                         open=True, close=True, std_print=False)
            with open(save_path + str(sp) + '.pkl', 'wb') as output:
                pickle.dump(sents[:sp], output, pickle.HIGHEST_PROTOCOL)
                pickle.dump(X[:sp], output, pickle.HIGHEST_PROTOCOL)
                pickle.dump(y[:sp], output, pickle.HIGHEST_PROTOCOL)
    return orisents, nesents, sents, X, y
def out_act_in_golden(self, boot_iter):
    """Write the buffered active-learning sentences to '<iter>.out', then block
    until a human-corrected '<iter>.in' file appears; read it back as golden data.

    Returns (X_golden, y_golden, yprob_golden) and clears the active buffer.
    """
    self.boot_iter = boot_iter
    if os.path.exists(self.active_dir) is False:
        os.makedirs(self.active_dir)
    # Dump the sentences awaiting manual annotation for the annotator to edit.
    utils.write_result_from_ft(self.X_bf_active, self.y_bf_active,
                               self.active_dir + str(boot_iter) + '.out')
    # Interactive wait loop: poll for the corrected file each time the user
    # presses enter (raw_input blocks until then).
    while True:
        print('ready?')
        ready = raw_input()
        active_in_fn = self.active_dir + str(boot_iter) + '.in'
        if st.ACTIVE_DEBUG == True:
            # Debug mode short-circuits the human step by reading back our own output.
            active_in_fn = self.active_dir + str(boot_iter) + '.out'
        if os.path.exists(active_in_fn):
            # NOTE(review): this unpacks 3 values, but read_labeled_text_data
            # (see this file) returns 5 — confirm which utils function is meant.
            act_sents, X_golden, y_golden = utils.read_labeled_text_data(
                active_in_fn, encoding=st.ENCODING, flatten=st.FLATTEN)
            st.write_log(str(len(act_sents)) + ' sents are read from active data.',
                         open=True, close=True, std_print=True)
            # Drop sentences whose labels are all 'O'.
            y_golden, _, _ = utils._remove_all_o(y_golden, [], [])
            break
        print(active_in_fn)
    # All-ones probabilities: golden (human) labels are fully trusted.
    yprob_golden = utils.generate_all(y_golden, 1)
    if st.ACTIVE_DEBUG == True:
        # Sanity check in debug mode: re-read features must match what we wrote.
        idx = 0
        for x_b, x_a in zip(self.X_bf_active, X_golden):
            idx += 1
            for x_ft_B, x_ft_A in zip(x_b, x_a):
                if tuple(x_ft_A) != tuple(x_ft_B):
                    print('error!!!')
    # Reset the buffer for the next bootstrapping iteration.
    self.X_bf_active = []
    self.y_bf_active = []
    self.active_put_count = 0
    return X_golden, y_golden, yprob_golden
def _add_n_train_CRF(self, X, y, yprob=None, clear=False):
    """Feed non-empty sequences into the CRF trainer, optionally persist the
    added data, and train model '<model_name><iter>.crfsuite'.

    Args:
        X: feature sequences.
        y: label sequences; sequences with no labels are skipped.
        yprob: optional per-sequence probabilities, kept aligned with the
            retained sequences.
        clear: when truthy and iter > 0, delete the previous iteration's model
            file before training.
    """
    this_model_name = self.model_name + str(self.iter)
    X_added = []
    y_added = []
    yprob_added = []
    # Single loop replaces the original duplicated yprob / no-yprob branches;
    # `is not None` instead of `!= None` (identity check for the sentinel).
    for i, (xseq, yseq) in enumerate(zip(X, y)):
        if len(yseq) == 0:
            continue  # nothing to learn from an empty label sequence
        X_added.append(xseq)
        y_added.append(yseq)
        if yprob is not None:
            yprob_added.append(yprob[i])
        self.trainer.append(xseq, yseq)

    if st.DICTIONARY is True:
        self.X_total += X_added  # added for dic
        self.y_total += y_added
    #st.write_log('added : ' + str(len(y_added)) + '/' + str(len(y)), open=True,close=True, std_print=False)

    if self.save_path is not None:
        st.write_log('writing added data file', open=True, close=True, std_print=False)
        # Avoid clobbering an existing pickle: fall back to an 'R'-suffixed path.
        if os.path.isfile(self.save_path + str(self.iter) + '.pkl'):
            save_path = self.save_path + 'R'
        else:
            save_path = self.save_path
        with open(save_path + str(self.iter) + '.pkl', 'wb') as output:
            pickle.dump(X_added, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(y_added, output, pickle.HIGHEST_PROTOCOL)
            if yprob is not None:
                pickle.dump(yprob_added, output, pickle.HIGHEST_PROTOCOL)

    if clear and self.iter > 0:
        # Drop the previous iteration's model file to save disk space.
        os.remove(self.model_path + self.model_name + str(self.iter - 1) + '.crfsuite')

    self.trainer.set_params({
        'c1': 0.0,  # coefficient for L1 penalty
        'c2': 0.0,  # coefficient for L2 penalty
        'max_iterations': st.CRF_ITER,  # stop earlier ****
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    self.trainer.params()
    self.trainer.train(self.model_path + this_model_name + '.crfsuite')  # model save
    # NOTE(review): bare attribute access, has no effect — kept from original.
    self.trainer.logparser.last_iteration
    self.iter += 1
def eval_prediction(self, ypred, tag_conf_table=False, log=True):
    """Evaluate a prediction and return its F1 score.

    Args:
        ypred: predicted label sequences, scored by self.eval which returns
            (correct, predicted, answer) counts.
        tag_conf_table: when truthy, also draw a confusion table per tag in st.TAG.
        log: when truthy, route the report through st.write_log; otherwise print it.

    Returns:
        F1 score as a float; 0.0 when there are no predictions/answers
        (previously this raised ZeroDivisionError).
    """
    out_str = 'evaluation\n'
    cor, pred, ans = self.eval(ypred)
    if tag_conf_table:
        for tag in st.TAG:
            self._draw_confusion_table(ypred, tag)
    # Guard the three divisions: empty prediction or answer sets yield 0.0
    # instead of crashing the evaluation run.
    precision = float(cor) / pred if pred else 0.0
    recall = float(cor) / ans if ans else 0.0
    if precision + recall > 0.0:
        f1score = 2.0 * (precision * recall) / (precision + recall)
    else:
        f1score = 0.0
    out_str += 'NEs : ' + str(pred) + ', precision : ' + str(
        precision) + ', recall : ' + str(recall) + ', f1score : ' + str(
            f1score) + '\n'
    if log:
        st.write_log(out_str, open=True, close=True, std_print=True)
    else:
        print(out_str)
    return f1score
def put_act_n_get_remain(self, X_auto, y_auto, y_mar_prob):
    """Route uncertain predictions into the active-learning buffer.

    Splits the auto-labeled data via self._select_in_range, saves and buffers
    the selected (uncertain) part, and returns the remaining part as
    (X_remain, y_remain, yprob_remain).
    """
    (sel_X, sel_y, _sel_prob,
     rem_X, rem_y, rem_prob) = self._select_in_range(X_auto, y_auto, y_mar_prob)

    st.write_log(str(len(sel_X)) + ' sents are added to active data.',
                 open=True, close=True, std_print=True)

    # Persist the selected sentences, then append them to the in-memory buffer
    # consumed later by out_act_in_golden.
    self._save(sel_X, sel_y)
    self.X_bf_active.extend(sel_X)
    self.y_bf_active.extend(sel_y)
    self.active_put_count += 1

    return rem_X, rem_y, rem_prob
def write_ML_data(self, orisents, nesents, X_auto, y_auto, y_mar_prob):
    """Split auto-labeled data into 'active' and 'good' portions and write both.

    Returns the pair of file paths written:
    (<active_dir><iter>_active.txt, <good_dir><iter>_good.txt).
    """
    (ori_sel, ne_sel, X_sel, y_sel, prob_sel,
     ori_rem, ne_rem, X_rem, y_rem, prob_rem) = self._select_in_range(
         orisents, nesents, X_auto, y_auto, y_mar_prob)
    # ori_rem, ne_rem, X_rem, y_rem, prob_rem: remaining data with all-'O'
    # sequences removed by _select_in_range.

    st.write_log(str(len(X_sel)) + ':' + str(len(X_rem)) + ' = active : good',
                 open=True, close=True, std_print=True)

    # Hoist the two output paths that were repeated throughout the original.
    active_path = self.active_dir + str(self.boot_iter) + '_active' + '.txt'
    good_path = self.good_dir + str(self.boot_iter) + '_good' + '.txt'

    if st.PROB_OUT == True:
        # Include per-sequence probabilities in the output files.
        print('writing : ' + active_path)
        utils.write_result_from_ft(ori_sel, ne_sel, X_sel, y_sel,
                                   active_path, yprob=prob_sel)
        print('writing : ' + good_path)
        utils.write_result_from_ft(ori_rem, ne_rem, X_rem, y_rem,
                                   good_path, yprob=prob_rem)
    else:
        print('writing : ' + active_path)
        utils.write_result_from_ft(ori_sel, ne_sel, X_sel, y_sel, active_path)
        print('writing : ' + good_path)
        utils.write_result_from_ft(ori_rem, ne_rem, X_rem, y_rem, good_path)

    return active_path, good_path
def main():
    """Run one bootstrapping iteration: train the base CRF, self-label the
    unlabeled slice with a bagging ensemble, and emit active/good data files.

    NOTE(review): indentation reconstructed from a whitespace-collapsed source;
    confirm nesting of the trailing SAVE_MODEL / NON_ACTIVE cleanup against VCS.
    """
    summary = '=========================== summary ============================\n'
    if not os.path.exists(st.MODEL_DIR + MODEL_NUMBER):
        os.makedirs(st.MODEL_DIR + MODEL_NUMBER)
    #st.print_setting()
    print_name()

    # ---- Load every data split (test / train / active / good / unlabeled) ----
    st.write_log('Reading files\n', open=True, close=True)
    _,_,test_sents, X_test, y_test = \
        utils.read_labeled_text_data_dir(st.TEST_DIR, encoding=st.ENCODING)##20170912
    _,_,train_sents, X_train, y_train = \
        utils.read_labeled_text_data_dir(st.TRAIN_DIR, encoding=st.ENCODING)##20170912
    _,_,act_sents, X_act, y_act = \
        utils.read_labeled_text_data_dir(st.ACT_DIR, encoding=st.ENCODING)
    _,_,good_sents, X_good, y_good = \
        utils.read_labeled_text_data_dir(st.GOOD_ML_DATA_DIR, encoding=st.ENCODING)
    # Unlabeled pool: either a whole directory or one explicit file.
    if len(FULL_IN_FILENAME) == 0:
        unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled, \
            y_unlabeled = utils.read_labeled_text_data_dir(st.UNLABELED_DIR, encoding=st.ENCODING)
    else:
        unlabeled_orisents, unlabeled_nesents, unlabeled_sents, X_unlabeled, \
            y_unlabeled = utils.read_labeled_text_data(FULL_IN_FILENAME, encoding=st.ENCODING)
    if len(unlabeled_orisents) != len(unlabeled_sents):
        # Warning only; execution continues even on mismatch.
        print 'error! check the unlabeled input file!'

    # Base CRF trains on labeled + previously-accepted active/good data.
    X_basiccrf = X_train + X_act + X_good
    y_basiccrf = y_train + y_act + y_good
    # NOTE(review): tester is only bound when there is test data, but the
    # st.ALL_TEST branches below use it unconditionally — NameError risk.
    if len(test_sents) > 1:
        tester = myTagger(X_test=X_test, y_test=y_test, test_sents=test_sents)
    basic_CRF = BasicModel(BASE_MODEL_PATH, BASE_LINE_NAME)
    bagging_model = BaggingModel(BAGGING_MODEL_PATH, BAGGING_MODEL_NAME,
                                 num_of_comp_mds=st.NUM_BAGGING_MODEL,
                                 boot_sample_size=st.BOOTSTRAP_SAMPLE_SIZE / st.NUM_BAGGING_MODEL,
                                 X_labeled=X_train, y_labeled=y_train)

    st.write_log('Training Basic CRF', close=True, std_print=True)
    # Slice out this iteration's window of the unlabeled pool.
    unlabeled_orisents_now = \
        unlabeled_orisents[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    unlabeled_nesents_now = \
        unlabeled_nesents[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    X_unlabeled_now = \
        X_unlabeled[BOOT_ITER*st.BOOTSTRAP_SAMPLE_SIZE : (BOOT_ITER+1)*st.BOOTSTRAP_SAMPLE_SIZE]
    # Drop sentences whose labels are all 'O' before training.
    y_basiccrf, _, _ = utils._remove_all_o(y_basiccrf, [], [])
    if st.SELF_ITER_N == 1:
        basic_CRF._add_n_train_CRF(X_basiccrf, y_basiccrf)
    else:
        basic_CRF.add_n_train_CRF(X_basiccrf, y_basiccrf, clear_past_model=True,
                                  add_total=True, write_added=True)
    if st.ALL_TEST is True:
        tester.eval_prediction(
            basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])

    if GENERATE_FULL is True:
        # Optional one-shot generation over the full unlabeled pool.
        anm, a, m = generate(act_sents, y_act, unlabeled_orisents, unlabeled_nesents,
                             unlabeled_sents, X_unlabeled, basic_CRF)
        summary += '=generated full data\n' + '=' + anm + '\n' + '=' + a + '\n' + '=' + m + '\n'
    #y_pred_u_full, y_mar_p_u_full = basic_CRF.make_prediction(X_unlabeled, remove_all_o= False, min_conf= -1.0, link_pos=None)

    if NON_ACTIVE == False:
        # ---- Self-training loop: base CRF labels, bagging ensemble re-scores ----
        y_pred_u = \
            basic_CRF.make_prediction(X_unlabeled_now, remove_all_o=True,
                                      min_conf=st.FIXED_MIN_SEQ_PROB, link_pos=None)[0]
        print 'Training Bagging CRF'
        bagging_model.set_selflabeled_data_n_train(X_unlabeled_now, y_pred_u)
        if st.ALL_TEST is True:
            y_pred_test, _ = bagging_model.make_prediction(tester.X_test, remove_all_o=False)
            tester.eval_prediction(y_pred_test)
        X_basiccrf = X_unlabeled_now
        # Final iteration keeps all-'O' sequences (remove_all_o=False) so the
        # marginal probabilities cover every sentence written out below.
        if st.SELF_ITER_N == 1:
            y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                X_basiccrf, remove_all_o=False, min_conf=st.FIXED_MIN_MARGINAL_PROB,
                link_pos=None, replace_o=False, mul_ne_cnt=False)
        elif st.SELF_ITER_N > 1:
            y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                X_basiccrf, remove_all_o=True, min_conf=st.FIXED_MIN_MARGINAL_PROB,
                link_pos=None, replace_o=False, mul_ne_cnt=False)
        for boot_sub_iter in range(1, st.SELF_ITER_N):
            st.write_log('sub-iter : ' + str(boot_sub_iter) + '/' + str(st.SELF_ITER_N - 1),
                         open=True, std_print=True)
            print 'sub : Training Basic CRF'
            basic_CRF.temp_add_n_train_CRF(X_basiccrf, y_basiccrf, boot_sub_iter)
            if st.ALL_TEST is True:
                tester.eval_prediction(
                    basic_CRF.make_prediction(tester.X_test, remove_all_o=False)[0])
            y_pred_u = basic_CRF.make_prediction(
                X_unlabeled_now, remove_all_o=True,
                min_conf=st.FIXED_MIN_SEQ_PROB, link_pos=None)[0]
            print 'sub : Training Bagging CRF'
            bagging_model.set_selflabeled_data_n_train(X_unlabeled_now, y_pred_u)
            if st.ALL_TEST is True:
                y_pred_test, _ = bagging_model.make_prediction(
                    tester.X_test, remove_all_o=False)
                tester.eval_prediction(y_pred_test)
            X_basiccrf = X_unlabeled_now
            if boot_sub_iter < st.SELF_ITER_N - 1:
                y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                    X_basiccrf, remove_all_o=True, min_conf=st.FIXED_MIN_MARGINAL_PROB,
                    link_pos=None, replace_o=False, mul_ne_cnt=False)  #remove all o false
            elif boot_sub_iter == st.SELF_ITER_N - 1:
                y_basiccrf, y_basiccrf_mar_p = bagging_model.make_prediction(
                    X_basiccrf, remove_all_o=False, min_conf=st.FIXED_MIN_MARGINAL_PROB,
                    link_pos=None, replace_o=False, mul_ne_cnt=False)  #remove all o false
        #print out file & remove model dir
        # Split final predictions by confidence into 'active' (human review)
        # and 'good' (auto-accepted) files.
        active_assistant = Active_Assistant(st.active_min_prob, st.active_max_prob,
                                            st.GOOD_ML_DATA_DIR_OUT, st.ACT_DIR_OUT,
                                            BOOT_ITER)
        a, g = active_assistant.write_ML_data(unlabeled_orisents_now,
                                              unlabeled_nesents_now,
                                              X_basiccrf, y_basiccrf,
                                              y_basiccrf_mar_p)
        summary += '=generated bootstrapping data\n' + '=' + a + '\n' + '=' + g + '\n'

    # ---- Cleanup: drop throwaway model files ----
    if SAVE_MODEL == False:
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/')
    else:
        # Keep the base models; the bagging ensemble is always discarded.
        shutil.rmtree(st.MODEL_DIR + MODEL_NUMBER + '/bagging_model/')
        if NON_ACTIVE != True:
            if st.SELF_ITER_N > 1:
                basic_CRF.remove_latest_model()
            summary += '=generated model\n' + '=' + basic_CRF.get_first_model_name() + '\n'
    summary += '================================================================\n'
    print summary