Example #1
 def __init__(self, excel_name, dict_file="wu.dic"):
     self.excel_name = excel_name
     self.dict_file = dict_file
     self.header, self.raw_data = ExcelHelper.read_excel(self.excel_name)
     self.row_number, self.column_number = self.raw_data.shape
     self.label = [[] for i in range(self.column_number)]
     self.model_list = []      # store model information
     self.error_base = [[] for i in range(self.row_number)]
     self.repair_data = copy.deepcopy(self.raw_data)
     self.has_label = False
     self.segment = SegmentHelper(self.excel_name, self.dict_file)
Example #2
 def get_combine_company_dict(store_data_file):
     company_dict = {}
     for tab in range(2):
         header, raw_data = ExcelHelper.read_excel('../resource/us_list_company2.xlsx', tab)
         row, column = raw_data.shape
         for i in range(row):
             company_name = SegmentHelper.normalize(str(raw_data[i][0]).strip())
             if len(company_name) > 0:
                 DictHelper.increase_dic_key(company_dict, raw_data[i][0])
     df = pd.read_csv('../resource/us_list_company_1.csv')
     name_serial = df['Name']
     for i in range(df.shape[0]):
         company_name = SegmentHelper.normalize(name_serial[i])
         if len(company_name) > 0:
             DictHelper.increase_dic_key(company_dict, name_serial[i])
     StoreHelper.store_data(company_dict, store_data_file)
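
The snippet above only counts how often each raw company name occurs; the project-specific helpers (ExcelHelper, SegmentHelper, DictHelper, StoreHelper) are not shown on this page. Below is a minimal, self-contained sketch of the same counting pattern, with hypothetical stand-ins for increase_dic_key and normalize (assumptions, not the project's real implementations):

# Hypothetical stand-ins for DictHelper.increase_dic_key and SegmentHelper.normalize,
# shown only to illustrate the counting pattern used above.
def increase_dic_key(counter, key):
    """Increment the count stored under key, creating the entry on first use."""
    counter[key] = counter.get(key, 0) + 1

def normalize(name):
    """Naive normalization: lower-case and collapse whitespace."""
    return " ".join(str(name).lower().split())

company_dict = {}
for raw_name in ["Apple Inc.", "  ", "Alphabet Inc.", "Apple Inc."]:
    if len(normalize(raw_name)) > 0:      # skip names that normalize to nothing
        increase_dic_key(company_dict, raw_name)
print(company_dict)   # {'Apple Inc.': 2, 'Alphabet Inc.': 1}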
Example #3
 def _remove_conjunction_segment(self, probability_dict):
     phase_list = []
     sentence_list = []
     word_list = SegmentHelper.segment_text(self.raw_position)
     word_group = []
     for word in word_list:
         if word in stopwords.words('english'):
             if len(word_group) > 0:
                 sentence_list.append(' '.join(word_group))
                 word_group = []
         else:
             word_group.append(word)
     if len(word_group) > 0:
         sentence_list.append(' '.join(word_group))
     for sentence in sentence_list:
         phase_list.extend(
             SegmentHelper.phase_segment(probability_dict, sentence, 0.05))
     return phase_list
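
The core idea here is grouping consecutive non-stopword tokens into candidate phrases, splitting whenever an English stopword appears. The following self-contained sketch shows just that splitting step with plain NLTK (SegmentHelper.segment_text and phase_segment are project helpers and are not reproduced); it also builds the stopword set once instead of on every token:

from nltk.corpus import stopwords   # requires: nltk.download('stopwords')

def split_on_stopwords(tokens):
    """Group consecutive non-stopword tokens into candidate phrases."""
    stop = set(stopwords.words('english'))   # build the set once, not per token
    groups, current = [], []
    for word in tokens:
        if word.lower() in stop:
            if current:
                groups.append(' '.join(current))
                current = []
        else:
            current.append(word)
    if current:
        groups.append(' '.join(current))
    return groups

print(split_on_stopwords("experience with machine learning and data mining".split()))
# ['experience', 'machine learning', 'data mining']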
Example #4
 def get_discipline_dict(excel_file, dict_file):
     probability_dict = {}
     header, raw_data = ExcelHelper.read_excel(excel_file)
     row_number, column_number = raw_data.shape
     print(raw_data.shape)
     if column_number != 2:
         print("Attention! Excel file more than two column, please have a check! Use the first two column as dict")
     for i in range(row_number):
         value = raw_data[i][0]
         key_list = raw_data[i][1].split('|')
         for key in key_list:
             key = SegmentHelper.normalize(key)
             if len(key.strip()) == 0:  # skip keys that normalize to an empty string
                 continue
             probability_dict[key] = value
         probability_dict[SegmentHelper.normalize(value)] = value
     StoreHelper.store_data(probability_dict, dict_file)
     print(probability_dict)
     print("Generated dict with %i entries and stored it to data file %s!" % (len(probability_dict), dict_file))
Example #5
 def __init__(self,
              excel_name_training,
              excel_name_test,
              dict_file="zh.dic"):
     self.excel_name_training = excel_name_training
     self.excel_name_test = excel_name_test
     self.dict_file = dict_file
     self.header_training, self.train_data = ExcelHelper.read_excel(
         self.excel_name_training)
     self.header_test, self.test_data = ExcelHelper.read_excel(
         self.excel_name_test)
     self.row_number_training, self.column_number_training = self.train_data.shape
     self.row_number_test, self.column_number_test = self.test_data.shape
     self.segment = SegmentHelper(
         self.excel_name_training,
         self.dict_file)  # generate dictionary for training data
     self.test_str_list = [0 for i in range(self.row_number_test)]
     self.test_repair_data = copy.deepcopy(self.test_data)
     self.train = None
Example #6
class Step1(object):
    def __init__(self,
                 excel_name_training,
                 excel_name_test,
                 dict_file="zh.dic"):
        self.excel_name_training = excel_name_training
        self.excel_name_test = excel_name_test
        self.dict_file = dict_file
        self.header_training, self.train_data = ExcelHelper.read_excel(
            self.excel_name_training)
        self.header_test, self.test_data = ExcelHelper.read_excel(
            self.excel_name_test)
        self.row_number_training, self.column_number_training = self.train_data.shape
        self.row_number_test, self.column_number_test = self.test_data.shape
        self.segment = SegmentHelper(
            self.excel_name_training,
            self.dict_file)  # generate dictionary for training data
        self.test_str_list = [0 for i in range(self.row_number_test)]
        self.test_repair_data = copy.deepcopy(self.test_data)
        self.train = None

    # Split each row of the test data into phrases and store the result in self.test_str_list
    def get_test_str_list(self):
        for row in range(self.row_number_test):
            temp_str = " ".join(
                [unicode(item) for item in self.test_data[row, :]])
            self.test_str_list[row] = self.segment.segment(unicode(temp_str))

    def training(self,
                 save_result=True,
                 save_file="pattern_relationship.dat",
                 recover_file=None):
        if self.train is not None:
            return
        if recover_file is not None:
            self.train = PatternCorrelationHelper.build_from_file(recover_file)
            return
        self.train = PatternCorrelationHelper(self.excel_name_training)
        self.train.build_big_pattern()
        self.train.build_pattern_relationship()
        if save_result:
            self.train.save(save_file)

    def recover(self):
        for row in range(self.row_number_test):
            self.recover_row(row)

    def recover_row(self, row):
        # store all candidates for each column
        recover_list = [[] for i in range(self.column_number_test)]
        # Store relationship { segmented text --> big pattern }
        big_pattern_dict = {
            text: (PatternHelper.find_first_word_length(text),
                   PatternHelper.find_last_word_length(text))
            for text in self.test_str_list[row]
        }
        # store each judge's small patterns (a "judge" is a cell with exactly one candidate)
        small_pattern_list = [[] for i in range(self.column_number_test)]

        # Step 1. Collect every element whose big pattern matches the column
        for i in range(self.column_number_test):
            for key, value in big_pattern_dict.items():
                if self.train.match_big_pattern(value, i):
                    recover_list[i].append(key)

        # Step 2. Narrow down columns that have more than one candidate:
        # a cell with exactly one candidate acts as a judge and votes for the other cells;
        # cells with zero candidates are ignored.
        while True:
            old_recover_list = copy.deepcopy(recover_list)

            # let each newly decided cell act as a judge and prune other cells' candidates
            for column in range(self.column_number_test):
                if len(recover_list[column]) == 1 and len(
                        small_pattern_list[column]) == 0:
                    Step1.column_choose_decided(recover_list, column)

            # voting among columns with two or more candidates (currently disabled)
            # for column in range(self.column_number_test):
            #     if len(recover_list[column]) > 1:
            #         score_dict = self.score_column_candidate(column, recover_list, small_pattern_list)
            #         if len(score_dict) == 0:
            #             continue
            #         max_score = max(score_dict.values())
            #         recover_list[column] = []
            #         for candidate, score in score_dict.items():
            #             if score == max_score:
            #                 recover_list[column].append(candidate)

            # stop once a full pass produces no further change
            if old_recover_list == recover_list:
                break
        # Step 3. Recover the data
        for column in range(self.column_number_test):
            self.test_repair_data[row][column] = recover_list[column][
                0] if len(recover_list[column]) > 0 else ''

    def score_column_candidate(self, column, recover_list, small_pattern_list):
        score_dict = {}
        for candidate in recover_list[column]:
            candidate_small_pattern = self.train.get_small_pattern(
                candidate, column)
            for j in range(self.column_number_test):
                if len(recover_list[j]) == 1 and self.train.vote_for_column(
                        column, candidate_small_pattern, j,
                        small_pattern_list[j]):  # can be a judge
                    DictHelper.increase_dic_key(score_dict, candidate)
        return score_dict

    @staticmethod
    def column_choose_decided(recover_list, column):
        for i in range(len(recover_list)):
            if i != column and recover_list[column][0] in recover_list[
                    i] and len(recover_list[i]) > 1:
                recover_list[i].remove(recover_list[column][0])
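
Step 2 of recover_row is a fixed-point loop: every column that ends up with exactly one candidate acts as a "judge" and removes its value from the other columns, and the loop stops once a full pass produces no change. A simplified, self-contained sketch of that elimination idea, without the project's pattern helpers:

import copy

def eliminate_with_judges(candidates):
    """candidates: one list of candidate values per column.
    Columns with a single candidate remove that value from other multi-candidate
    columns; repeat until a full pass changes nothing (a fixed point)."""
    while True:
        before = copy.deepcopy(candidates)
        for column, options in enumerate(candidates):
            if len(options) == 1:                          # this column is a "judge"
                decided = options[0]
                for other, other_options in enumerate(candidates):
                    if other != column and len(other_options) > 1 and decided in other_options:
                        other_options.remove(decided)
        if candidates == before:                           # no further change
            return candidates

print(eliminate_with_judges([['A'], ['A', 'B'], ['A', 'B', 'C']]))
# [['A'], ['B'], ['C']]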
Example #7
 def generate_word_list(self):
     words_list = []
     for line in self.raw_position.splitlines():
         words_list.extend(
             SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
     return words_list
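
generate_word_list tokenizes each line of the raw text and lemmatizes the tokens through SegmentHelper. Assuming SegmentHelper wraps standard NLTK tooling (an assumption, since its code is not shown here), an equivalent standalone sketch looks like this:

from nltk import word_tokenize           # requires: nltk.download('punkt')
from nltk.stem import WordNetLemmatizer  # requires: nltk.download('wordnet')

def generate_word_list(raw_position):
    """Tokenize each line and lemmatize the tokens (noun lemmas by default)."""
    lemmatizer = WordNetLemmatizer()
    words_list = []
    for line in raw_position.splitlines():
        words_list.extend(lemmatizer.lemmatize(token) for token in word_tokenize(line))
    return words_list

print(generate_word_list("designed scalable pipelines\nmanaged relational databases"))
# ['designed', 'scalable', 'pipeline', 'managed', 'relational', 'database']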
Example #8
class Recover(object):
    def __init__(self, excel_name, dict_file="wu.dic"):
        self.excel_name = excel_name
        self.dict_file = dict_file
        self.header, self.raw_data = ExcelHelper.read_excel(self.excel_name)
        self.row_number, self.column_number = self.raw_data.shape
        self.label = [[] for i in range(self.column_number)]
        self.model_list = []      # store model information
        self.error_base = [[] for i in range(self.row_number)]
        self.repair_data = copy.deepcopy(self.raw_data)
        self.has_label = False
        self.segment = SegmentHelper(self.excel_name, self.dict_file)

    def mark_error(self, excel_output):
        self._mark_label()
        ExcelHelper.write_excel(excel_output, self.raw_data, header=self.header, mask_array=self.label)

    def repair_excel(self, excel_output):
        self._mark_label()
        self._collect_error_knowledge_base()
        self._training_model()
        self._repair()
        ExcelHelper.write_excel(excel_output, self.repair_data, "repair", self.header, self.label, {1: 'red',
                                                                                                    2: 'yellow'})

    def _mark_label(self):
        if self.has_label:
            return
        for i in range(self.column_number):
            column_list = self.raw_data[:, i]
            column_features = FeatureExtractor(column_list).generate_features()
            self.label[i].extend(LabelHelper(column_features).get_label())
        self.has_label = True

    def _collect_error_knowledge_base(self):
        for i in range(self.row_number):
            for j in range(self.column_number):
                if self.label[j][i] != 0 and TextHelper.get_data_length(self.raw_data[i, j]) > 0:
                    self.error_base[i].append(self.raw_data[i, j])
                    self.error_base[i].extend(self.segment.segment(TextHelper.to_string(self.raw_data[i, j])))
        print(self.error_base)

    def _training_model(self):
        for i in range(self.column_number):
            model = tree.DecisionTreeRegressor(max_depth=4)
            column_list = self.raw_data[:, i]
            column_features = FeatureExtractor(column_list).generate_features()
            model = model.fit(column_features, self.label[i])
            self.model_list.append(model)

    def _repair(self):
        for i in range(self.row_number):
            for j in range(self.column_number):
                if self.label[j][i] != 0:
                    best_candidate = self._get_recover_data(i, j)
                    if best_candidate is not None:
                        self.repair_data[i, j] = best_candidate
                        print("(" + str(i) + "," + str(j) + ") choose " + str(best_candidate))
                        self.label[j][i] = 2  # 0 means good cell, 1 means error cell, 2 means repaired cell
                        self.error_base[i].remove(best_candidate)  # remove the data from knowledge base
                    else:
                        print("Can not find suitable error test for (%i, %i)" % (i, j))

    def _get_recover_data(self, row, column):
        candidates = None
        min_error_probability = 1
        for phrase in self.error_base[row]:
            element_features = FeatureExtractor([phrase]).generate_features()
            tmp_probability = self.model_list[column].predict(element_features)[0]
            print("Get probability %s for %s in (%i, %i)" %(tmp_probability, phrase, row, column))
            if tmp_probability < min_error_probability:
                max_probability = tmp_probability
                candidates = phrase
        return candidates
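
Repair in this class works by training one DecisionTreeRegressor per column on feature vectors labeled 0 (good) or 1 (error), then, for a flagged cell, choosing the knowledge-base phrase with the lowest predicted error score. The project's FeatureExtractor and LabelHelper are not shown on this page, so the sketch below uses a toy hand-rolled featurizer purely for illustration:

import numpy as np
from sklearn import tree

# Toy features [length, digit_count] with labels 0 = good cell, 1 = error cell.
train_features = np.array([[4, 0], [2, 0], [6, 0], [5, 2], [3, 3], [1, 4]])
train_labels = np.array([0, 0, 0, 1, 1, 1])
model = tree.DecisionTreeRegressor(max_depth=4, random_state=0).fit(train_features, train_labels)

def featurize(phrase):
    """Hypothetical stand-in for FeatureExtractor: [length, digit count]."""
    return [len(phrase), sum(ch.isdigit() for ch in phrase)]

def best_candidate(candidates):
    """Pick the candidate with the lowest predicted error score, mirroring _get_recover_data."""
    best, best_score = None, 1
    for phrase in candidates:
        score = model.predict(np.array([featurize(phrase)]))[0]
        if score < best_score:
            best_score, best = score, phrase
    return best

print(best_candidate(["ab12", "alpha"]))   # alpha (no digits -> lower predicted error)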