def _get_working_year_words(self, year_convert_file=None):
    """Extract years-of-experience phrases from the raw position text.

    Scans ``self.raw_position`` for year patterns. When none are found, a
    default requirement token ``"[0]"`` is substituted (and also recorded in
    ``self.new_words_list`` as a newly introduced word). When a convert file
    is supplied, each found year phrase is mapped through the loaded
    conversion dict; phrases absent from the dict are dropped.

    :param year_convert_file: optional path to a stored mapping used to
        normalize raw year phrases; ``None`` skips normalization.
    :return: frequency dict of the resulting year phrases, as produced by
        ``DictHelper.dict_from_count_list``.
    """
    year_list = TextHelper.get_years_pattern(self.raw_position)
    if not year_list:  # idiomatic emptiness check instead of len(...) == 0
        default_year_requirement = "[0]"
        self.new_words_list.append(default_year_requirement)
        year_list = [default_year_requirement]
    elif year_convert_file is not None:
        year_convert_dict = StoreHelper.load_data(year_convert_file, {})
        # Keep only phrases the conversion table knows about.
        year_list = [
            year_convert_dict[item]
            for item in year_list
            if item in year_convert_dict
        ]
    return DictHelper.dict_from_count_list(year_list)
def print_label(label, index_list, cluster_number=None):
    """Print a clustering result in one of two report formats.

    With ``cluster_number is None``: prints the raw labels, their frequency
    dict, the max/min label value, then a ``label -> [position indices]``
    mapping, which is also persisted via ``StoreHelper`` to
    ``position_tag.dat`` / ``position_tag.txt``.

    With a ``cluster_number``: prints, for each cluster id ``0..n-1``, the
    positions (as stringified indices into ``label``) assigned to it.

    :param label: sequence of cluster labels, one per position.
    :param index_list: sequence of position indices, parallel to ``label``.
    :param cluster_number: total number of clusters for the compact report,
        or ``None`` for the detailed report with persistence.
    """
    if cluster_number is None:
        label_dict = DictHelper.dict_from_count_list(label)
        print("\t".join([str(i) for i in label]))
        print(label_dict)
        # max/min over the dict iterate its keys, i.e. the label values.
        print("max cluster number: %i" % max(label_dict))
        print("min cluster number: %i" % min(label_dict))
        position_tag = {}
        # zip instead of range(len(...)) indexing over the parallel lists.
        for tag, index in zip(label, index_list):
            DictHelper.append_dic_key(position_tag, tag, int(index))
        for key, value in position_tag.items():
            print("%s: %s" % (key, value))
        StoreHelper.store_data(position_tag, 'position_tag.dat')
        StoreHelper.save_file(position_tag, 'position_tag.txt')
    else:
        length = len(label)
        clusters = [
            [str(j) for j in range(length) if label[j] == i]
            for i in range(cluster_number)
        ]
        # enumerate instead of indexing clusters by range(len(...)).
        for i, cluster in enumerate(clusters):
            print("Cluster %i has %i position, position: %s"
                  % (i, len(cluster), str(cluster)))
def get_frequency_dict(content):
    """Build a token-frequency dict for *content*.

    Each line is segmented and lemmatized via ``SegmentHelper``; the tokens
    from every line are pooled and counted with
    ``DictHelper.dict_from_count_list``.
    """
    tokens = [
        token
        for line in content.splitlines()
        for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
    ]
    return DictHelper.dict_from_count_list(tokens)
def convert_2(self, probability_dict):
    """Combine conjunction-filtered phrases with working-year phrases.

    Calls ``_get_working_year_words`` first (it may record new words on
    the instance), then merges its result into the phrases obtained from
    ``_remove_conjunction_segment`` and counts everything.

    NOTE(review): ``extend`` over the year-words return value iterates it;
    if that helper returns a dict, only its keys are appended — confirm
    this is the intended behavior.
    """
    year_words = self._get_working_year_words()
    phrases = self._remove_conjunction_segment(probability_dict)
    phrases.extend(year_words)
    return DictHelper.dict_from_count_list(phrases)
def __init__(self, raw_position, word_list=None):
    """Initialize from a raw position description and an optional word list.

    :param raw_position: raw job-position text; stored lower-cased.
    :param word_list: optional pre-extracted words; defaults to an empty
        list. (Bug fix: the original used a mutable default ``[]``, which
        is shared across all calls and was stored on ``self`` — mutations
        would leak between instances. ``None`` sentinel is backward
        compatible.)
    """
    self.raw_position = raw_position.lower()
    # Fresh list per instance instead of the shared default object.
    self.word_list = [] if word_list is None else word_list
    self.phrase_dict = DictHelper.dict_from_count_list(self.word_list)
    self.new_words_list = []