Exemple #1
0
    def _find(self, column, threshold):
        # 记录所有可能出现的feature,以供后边统计字典中该feature出现的次数
        # find all the feature , prepare for calculating the count that the feature appears
        column_list = self.raw_data[:, column]
        pre_dict = {}
        end_dict = {}
        start = time.clock()
        for i in range(self.row_number):
            for j in range(i + 1, self.row_number):
                pre_pattern = PatternHelper.find_pre_common_str(
                    column_list[i], column_list[j])
                end_pattern = PatternHelper.find_end_common_str(
                    column_list[i], column_list[j])
                if pre_pattern != '':
                    DictHelper.increase_dic_key(pre_dict, pre_pattern)
                    self.cell_pre_patterns[i][column].append(pre_pattern)
                    self.cell_pre_patterns[j][column].append(pre_pattern)
                if end_pattern != '':
                    DictHelper.increase_dic_key(end_dict, end_pattern)
                    self.cell_end_patterns[i][column].append(end_pattern)
                    self.cell_end_patterns[j][column].append(end_pattern)
        print("find1 : {0}".format(time.clock() - start))

        pre_list = [
            key for key, value in pre_dict.items() if value > threshold
        ]
        end_list = [
            key for key, value in end_dict.items() if value > threshold
        ]
        return pre_list, end_list
 def score_column_candidate(self, column, recover_list, small_pattern_list):
     """Score each candidate value of ``column`` by votes from other columns.

     A column ``j`` acts as a judge only when ``recover_list[j]`` holds
     exactly one entry. Each judge for which ``self.train.vote_for_column``
     returns a truthy value adds one hit for the candidate via
     ``DictHelper.increase_dic_key``.

     Args:
         column: Index of the column whose candidates are scored.
         recover_list: Per-column sequences of candidate values.
         small_pattern_list: Per-column small patterns handed to the vote.

     Returns:
         The dict filled by ``DictHelper.increase_dic_key``, keyed by
         candidate. Candidates that received no vote are absent.
     """
     votes = {}
     for option in recover_list[column]:
         option_pattern = self.train.get_small_pattern(option, column)
         for judge in range(self.column_number_test):
             # Only an unambiguous column (single candidate) can be a judge.
             if len(recover_list[judge]) != 1:
                 continue
             approved = self.train.vote_for_column(
                 column, option_pattern, judge, small_pattern_list[judge])
             if approved:
                 DictHelper.increase_dic_key(votes, option)
     return votes
 def merge_dict():
     """Flatten every stored profile into a single key-count dict and save it.

     Loads the profile list from ``../resource/convert_profile.dat``. For
     each profile, every key found under any of its features is passed to
     ``DictHelper.increase_dic_key`` on one merged dict. The list of merged
     dicts is then written to ``../resource/merged_profile.dat`` and
     ``../resource/merged_profile.txt``.
     """
     profiles = StoreHelper.load_data('../resource/convert_profile.dat', [])
     merged_list = []
     for profile in profiles:
         key_counts = {}
         # The feature names are dropped; only the keys under them are kept.
         for feature_keys in profile.values():
             for key in feature_keys:
                 DictHelper.increase_dic_key(key_counts, key)
         merged_list.append(key_counts)
     StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
     StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
 def generate_probability_dict(file_content_list):
     """Compute a co-occurrence ratio for every adjacent word pair.

     Each line of each text is segmented with
     ``SegmentHelper.segment_text``. Single words and adjacent word pairs
     are counted, and every pair ``"a b"`` is scored with
     ``max(count(a b) / count(b), count(a b) / count(a))``.

     Lines that segment to no words are skipped; previously they raised
     ``IndexError`` on ``word_list[-1]``.

     Args:
         file_content_list: Iterable of text contents (strings).

     Returns:
         Dict mapping ``"word_a word_b"`` to the ratio described above.
     """
     # Count single words and adjacent two-word sequences.
     single_word_dict = {}
     two_word_dict = {}
     for file_content in file_content_list:
         for line in file_content.splitlines():
             word_list = SegmentHelper.segment_text(line)
             if not word_list:
                 # Blank line or nothing left after segmentation.
                 continue
             for word in word_list:
                 DictHelper.increase_dic_key(single_word_dict, word)
             for first, second in zip(word_list, word_list[1:]):
                 DictHelper.increase_dic_key(
                     two_word_dict, "%s %s" % (first, second))
     # Compute the two-word probability.
     prob_a_b_dict = {}
     for words, count in two_word_dict.items():
         # NOTE(review): assumes segmented words never contain a space;
         # otherwise this unpacking raises ValueError - confirm.
         word_a, word_b = words.split(' ')
         ratio_to_b = count * 1.0 / single_word_dict[word_b]
         ratio_to_a = count * 1.0 / single_word_dict[word_a]
         prob_a_b_dict[words] = max(ratio_to_b, ratio_to_a)
     return prob_a_b_dict
Exemple #5
0
 def get_combine_company_dict(store_data_file):
     """Count company names from two spreadsheet tabs and one CSV, then store.

     Reads tabs 0 and 1 of ``../resource/us_list_company2.xlsx`` (first
     column) and the ``Name`` column of
     ``../resource/us_list_company_1.csv``. A name is counted only when its
     normalized form is non-empty, but the dict is keyed by the raw,
     un-normalized value.

     Args:
         store_data_file: Destination passed to ``StoreHelper.store_data``.
     """
     company_dict = {}

     # Spreadsheet source: first column of the first two tabs.
     for tab_index in (0, 1):
         _header, sheet = ExcelHelper.read_excel(
             '../resource/us_list_company2.xlsx', tab_index)
         row_count, _column_count = sheet.shape
         for row_index in range(row_count):
             raw_name = sheet[row_index][0]
             normalized = SegmentHelper.normalize(str(raw_name).strip())
             if len(normalized) == 0:
                 continue
             DictHelper.increase_dic_key(company_dict, raw_name)

     # CSV source: the 'Name' column.
     frame = pd.read_csv('../resource/us_list_company_1.csv')
     names = frame['Name']
     for row_index in range(frame.shape[0]):
         raw_name = names[row_index]
         normalized = SegmentHelper.normalize(raw_name)
         if len(normalized) == 0:
             continue
         DictHelper.increase_dic_key(company_dict, raw_name)

     StoreHelper.store_data(company_dict, store_data_file)
Exemple #6
0
 def _add_and_remove(self, words_dict):
     for words, count in words_dict.items():
         if words in self.phrase_dict:
             if self.phrase_dict[words] < count:
                 DictHelper.increase_dic_key(
                     self.phrase_dict, words,
                     count - self.phrase_dict[words])
                 self._count_down_single_word(
                     words, count - self.phrase_dict[words])
             elif self.phrase_dict[words] > count:
                 print(
                     "Warning: phrase match times little than origin split: %s"
                     % words)
         else:
             DictHelper.increase_dic_key(self.phrase_dict, words, count)
             self._count_down_single_word(words, count)
Exemple #7
0
 def get_dict_pattern(context, _dict, convert=True):
     """Count how often each key of ``_dict`` occurs in ``context``.

     A key shaped like ``"head ... tail"`` (second space-separated token is
     ``...``) matches ``head``, then up to five space-separated words, then
     ``tail``. Any other key is stripped and matched as a whole word or
     phrase between word boundaries.

     Args:
         context: Text to search.
         _dict: Mapping whose keys are the patterns to look for.
         convert: When ``True`` and the value stored under a key is not an
             ``int``, the match count is recorded under that value instead
             of under the key.

     Returns:
         The dict filled by ``DictHelper.increase_dic_key`` with the match
         counts. Entries with no match are absent.
     """
     match_result = {}
     # The value is taken from items() rather than looked up again with the
     # stripped key, which raised KeyError for keys with outer whitespace.
     for key, value in _dict.items():
         key_split = key.split(' ')
         if len(key_split) >= 3 and key_split[1] == '...':
             result_key = key
             pattern = (re.escape(key_split[0]) + r'( \w+){0,5} ' +
                        re.escape(' '.join(key_split[2:])))
         else:
             result_key = key.strip()
             pattern = r'\b' + re.escape(result_key) + r'\b'
         match_times = len(re.findall(pattern, context))
         if match_times > 0:
             # Exact type check kept on purpose: bool values still count as
             # "not int" and are converted, as before.
             if convert is True and type(value) is not int:
                 DictHelper.increase_dic_key(match_result, value,
                                             match_times)
             else:
                 DictHelper.increase_dic_key(match_result, result_key,
                                             match_times)
     return match_result
Exemple #8
0
 def convert(self, skill_dict, discipline_dict, education_dict,
             responsibility_dict, year_convert_file):
     """Extract the phrases of every category and register them.

     The five ``_get_*_words`` helpers are called first (working year,
     skill, discipline, education, responsibility). Each resulting dict is
     then passed to ``self._add_and_remove`` in that same order. Finally
     every entry of ``self.new_words_list`` is passed to
     ``DictHelper.increase_dic_key`` on ``self.phrase_dict``.

     Args:
         skill_dict: Input for ``self._get_skill_words``.
         discipline_dict: Input for ``self._get_discipline_words``.
         education_dict: Input for ``self._get_education_words``.
         responsibility_dict: Input for ``self._get_responsibility_words``.
         year_convert_file: Input for ``self._get_working_year_words``.

     Returns:
         Dict with the keys ``education``, ``major``, ``skills``,
         ``working-year`` and ``responsibility``, each mapped to the
         ``.keys()`` of the matching phrase dict.
     """
     working_years = self._get_working_year_words(year_convert_file)
     skills = self._get_skill_words(skill_dict)
     majors = self._get_discipline_words(discipline_dict)
     degrees = self._get_education_words(education_dict)
     duties = self._get_responsibility_words(responsibility_dict)

     # Register the categories in the same order they were extracted.
     for phase_dict in (working_years, skills, majors, degrees, duties):
         self._add_and_remove(phase_dict)

     for new_word in self.new_words_list:
         DictHelper.increase_dic_key(self.phrase_dict, new_word)

     return {
         "education": degrees.keys(),
         "major": majors.keys(),
         "skills": skills.keys(),
         "working-year": working_years.keys(),
         "responsibility": duties.keys()
     }
Exemple #9
0
    def _get_full_relation(self, column1, column2, row):
        # for pre pattern
        cell1_patterns = self.cell_pre_patterns[row][column1]
        cell2_patterns = self.cell_pre_patterns[row][column2]
        for pattern1 in cell1_patterns:
            for pattern2 in cell2_patterns:
                DictHelper.increase_dic_key(
                    self.pre_pattern_relation[column1][column2],
                    pattern1 + "|" + pattern2)
                DictHelper.increase_dic_key(
                    self.pre_pattern_relation[column2][column1],
                    pattern2 + "|" + pattern1)

        # for end pattern
        cell1_patterns = self.cell_end_patterns[row][column1]
        cell2_patterns = self.cell_end_patterns[row][column2]
        for pattern1 in cell1_patterns:
            for pattern2 in cell2_patterns:
                DictHelper.increase_dic_key(
                    self.end_pattern_relation[column1][column2],
                    pattern1 + "|" + pattern2)
                DictHelper.increase_dic_key(
                    self.end_pattern_relation[column2][column1],
                    pattern2 + "|" + pattern1)
Exemple #10
0
 def _collect_words_dict(self):
     result_dict = {}
     for _dict in self.blob_dict_list:
         for key in _dict.keys():
             DictHelper.increase_dic_key(result_dict, key)
     return result_dict