class Step1(object):
    """Pattern-based repair of a test spreadsheet.

    Patterns are learned from a training spreadsheet, then used to pick
    the best candidate phrase for each cell of the test spreadsheet.
    Expected call order: get_test_str_list() -> training() -> recover();
    the repaired values accumulate in ``test_repair_data``.
    """

    def __init__(self, excel_name_training, excel_name_test, dict_file="zh.dic"):
        """Load both spreadsheets and prepare the segmentation helper.

        :param excel_name_training: path of the Excel file patterns are learned from
        :param excel_name_test: path of the Excel file to be repaired
        :param dict_file: segmentation dictionary file name
        """
        self.excel_name_training = excel_name_training
        self.excel_name_test = excel_name_test
        self.dict_file = dict_file
        self.header_training, self.train_data = ExcelHelper.read_excel(
            self.excel_name_training)
        self.header_test, self.test_data = ExcelHelper.read_excel(
            self.excel_name_test)
        self.row_number_training, self.column_number_training = self.train_data.shape
        self.row_number_test, self.column_number_test = self.test_data.shape
        # generate dictionary for training data
        self.segment = SegmentHelper(self.excel_name_training, self.dict_file)
        # One slot per test row, filled by get_test_str_list().
        # Fixed: use None (not 0) as the "not yet segmented" placeholder.
        self.test_str_list = [None for i in range(self.row_number_test)]
        self.test_repair_data = copy.deepcopy(self.test_data)
        self.train = None  # pattern model, built lazily by training()

    def get_test_str_list(self):
        """Segment each test row into phrases and cache the result.

        Each row's cells are joined into one space-separated string and
        passed through the segmenter; the phrase list is stored at
        ``test_str_list[row]``.
        """
        for row in range(self.row_number_test):
            temp_str = " ".join(
                [unicode(item) for item in self.test_data[row, :]])
            self.test_str_list[row] = self.segment.segment(unicode(temp_str))

    def training(self, save_result=True, save_file="pattern_relationship.dat",
                 recover_file=None):
        """Build (or load) the pattern-correlation model, at most once.

        :param save_result: when True, persist a freshly built model
        :param save_file: file the model is saved to
        :param recover_file: when given, load a previously saved model
            from this file instead of rebuilding it
        """
        if self.train is not None:
            return  # already trained/loaded
        if recover_file is not None:
            self.train = PatternCorrelationHelper.build_from_file(recover_file)
            return
        self.train = PatternCorrelationHelper(self.excel_name_training)
        self.train.build_big_pattern()
        self.train.build_pattern_relationship()
        if save_result:
            self.train.save(save_file)

    def recover(self):
        """Repair every row of the test data in place."""
        for row in range(self.row_number_test):
            self.recover_row(row)

    def recover_row(self, row):
        """Repair a single row of the test data.

        Step 1 collects, per column, every segmented phrase whose big
        pattern matches that column.  Step 2 repeatedly lets each column
        with exactly one candidate (a "judge") remove its phrase from
        other columns' candidate lists, until a fixed point is reached.
        Step 3 writes the surviving candidate (or '') into
        ``test_repair_data``.
        """
        # Candidate phrases for each column of this row.
        recover_list = [[] for i in range(self.column_number_test)]
        # segmented text -> big pattern (first word length, last word length)
        big_pattern_dict = {
            text: (PatternHelper.find_first_word_length(text),
                   PatternHelper.find_last_word_length(text))
            for text in self.test_str_list[row]
        }
        # Small pattern of each judge column (a judge is a cell with
        # exactly one candidate).
        small_pattern_list = [[] for i in range(self.column_number_test)]
        # step 1. collect every phrase whose big pattern matches the column
        for i in range(self.column_number_test):
            for key, value in big_pattern_dict.items():
                if self.train.match_big_pattern(value, i):
                    recover_list[i].append(key)
        # step 2. iterate until no further change; zero-candidate cells
        # are ignored.
        while True:
            old_recover_list = copy.deepcopy(recover_list)
            for column in range(self.column_number_test):
                if len(recover_list[column]) == 1 and len(
                        small_pattern_list[column]) == 0:
                    Step1.column_choose_decided(recover_list, column)
            # NOTE: voting among 2+ candidates (see score_column_candidate)
            # is currently disabled.
            if old_recover_list == recover_list:
                break  # fixed point reached
        # step 3. write the repaired value back ([row, column] indexing,
        # consistent with the rest of the file).
        for column in range(self.column_number_test):
            self.test_repair_data[row, column] = recover_list[column][
                0] if len(recover_list[column]) > 0 else ''

    def score_column_candidate(self, column, recover_list, small_pattern_list):
        """Score each candidate of ``column`` by how many judge columns
        vote for it.  (Used by the currently disabled voting step of
        recover_row.)

        :returns: dict mapping candidate -> vote count
        """
        score_dict = {}
        for candidate in recover_list[column]:
            candidate_small_pattern = self.train.get_small_pattern(
                candidate, column)
            for j in range(self.column_number_test):
                # A column with exactly one candidate can act as a judge.
                if len(recover_list[j]) == 1 and self.train.vote_for_column(
                        column, candidate_small_pattern, j,
                        small_pattern_list[j]):
                    DictHelper.increase_dic_key(score_dict, candidate)
        return score_dict

    @staticmethod
    def column_choose_decided(recover_list, column):
        """Remove a decided column's single candidate from every other
        column that still has more than one candidate.

        Mutates ``recover_list`` in place.
        """
        chosen = recover_list[column][0]  # hoisted loop invariant
        for i in range(len(recover_list)):
            if i != column and chosen in recover_list[i] and len(
                    recover_list[i]) > 1:
                recover_list[i].remove(chosen)
class Recover(object):
    """Detect and repair erroneous cells in a single Excel sheet.

    A shallow decision-tree model is trained per column on features of
    that column's cells; cells labelled as errors are then replaced with
    the best candidate from the row's error knowledge base.
    """

    def __init__(self, excel_name, dict_file="wu.dic"):
        """Load the spreadsheet and prepare working state.

        :param excel_name: path of the Excel file to repair
        :param dict_file: segmentation dictionary file name
        """
        self.excel_name = excel_name
        self.dict_file = dict_file
        self.header, self.raw_data = ExcelHelper.read_excel(self.excel_name)
        self.row_number, self.column_number = self.raw_data.shape
        # label[column][row]: 0 good cell, 1 error cell, 2 repaired cell
        self.label = [[] for i in range(self.column_number)]
        self.model_list = []  # store model information, one model per column
        # Per-row pool of erroneous cell texts plus their segments.
        self.error_base = [[] for i in range(self.row_number)]
        self.repair_data = copy.deepcopy(self.raw_data)
        self.has_label = False
        self.segment = SegmentHelper(self.excel_name, self.dict_file)

    def mark_error(self, excel_output):
        """Write the raw data to ``excel_output`` with error cells masked."""
        self._mark_label()
        ExcelHelper.write_excel(excel_output, self.raw_data,
                                header=self.header, mask_array=self.label)

    def repair_excel(self, excel_output):
        """Full pipeline: label -> collect errors -> train -> repair, then
        write the repaired sheet (red = still erroneous, yellow = repaired)."""
        self._mark_label()
        self._collect_error_knowledge_base()
        self._training_model()
        self._repair()
        ExcelHelper.write_excel(excel_output, self.repair_data, "repair",
                                self.header, self.label,
                                {1: 'red', 2: 'yellow'})

    def _mark_label(self):
        """Compute a label for every cell, column by column (idempotent)."""
        if self.has_label:
            return
        for i in range(self.column_number):
            column_list = self.raw_data[:, i]
            column_features = FeatureExtractor(column_list).generate_features()
            self.label[i].extend(LabelHelper(column_features).get_label())
        self.has_label = True

    def _collect_error_knowledge_base(self):
        """Gather, per row, every non-empty erroneous cell and its
        segmented phrases as repair candidates."""
        for i in range(self.row_number):
            for j in range(self.column_number):
                if self.label[j][i] != 0 and TextHelper.get_data_length(
                        self.raw_data[i, j]) > 0:
                    self.error_base[i].append(self.raw_data[i, j])
                    self.error_base[i].extend(self.segment.segment(
                        TextHelper.to_string(self.raw_data[i, j])))
        print(self.error_base)

    def _training_model(self):
        """Fit one shallow decision tree per column on its own features."""
        for i in range(self.column_number):
            model = tree.DecisionTreeRegressor(max_depth=4)
            column_list = self.raw_data[:, i]
            column_features = FeatureExtractor(column_list).generate_features()
            model = model.fit(column_features, self.label[i])
            self.model_list.append(model)

    def _repair(self):
        """Replace each error cell with its best candidate, if one exists."""
        for i in range(self.row_number):
            for j in range(self.column_number):
                if self.label[j][i] != 0:
                    best_candidate = self._get_recover_data(i, j)
                    if best_candidate is not None:
                        self.repair_data[i, j] = best_candidate
                        print("(" + str(i) + "," + str(j) + ") choose " +
                              str(best_candidate))
                        # 0 means good cell, 1 means error cell, 2 means repaired cell
                        self.label[j][i] = 2
                        # remove the data from knowledge base
                        self.error_base[i].remove(best_candidate)
                    else:
                        print("Can not find suitable error test for (%i, %i)"
                              % (i, j))

    def _get_recover_data(self, row, column):
        """Return the candidate phrase with the lowest predicted error
        probability for ``column``, or None when no candidate's
        probability is below 1.

        Bug fix: the running minimum was previously assigned to an unused
        ``max_probability`` variable, so ``min_error_probability`` never
        tightened and the method returned the *last* candidate with
        probability < 1 instead of the minimum-probability one.
        """
        candidates = None
        min_error_probability = 1
        for phrase in self.error_base[row]:
            element_features = FeatureExtractor([phrase]).generate_features()
            tmp_probability = self.model_list[column].predict(
                element_features)[0]
            print("Get probability %s for %s in (%i, %i)"
                  % (tmp_probability, phrase, row, column))
            if tmp_probability < min_error_probability:
                min_error_probability = tmp_probability  # was: max_probability
                candidates = phrase
        return candidates