def _expand_similar_sentences(self, o2o_dict_sorted_in):
    """Expand a sentence mapping by substituting one-to-many word variants.

    For every key of ``o2o_dict_sorted_in`` and every pattern in
    ``self._similiar_class.re_o2m_similar_dict`` that matches it, the matched
    text is replaced by each word listed under the same key in
    ``self._similiar_class.o2m_similar_dict``. Every sentence produced this
    way is mapped to the value of the sentence it was derived from.

    Args:
        o2o_dict_sorted_in: mapping whose keys are the sentences to expand.

    Returns:
        The result of ``dict2sorted_dict`` applied to the input mapping merged
        with the generated entries; generated entries win on key clashes.
    """
    variants_by_word = self._similiar_class.o2m_similar_dict
    generated = {}

    for sentence, target in o2o_dict_sorted_in.items():
        for word_key, pattern in self._similiar_class.re_o2m_similar_dict.items():
            hit = pattern.search(sentence)
            if not hit:
                continue
            # Only the text of the first match is used, but str.replace swaps
            # every occurrence of that text in the sentence.
            found = hit.group()
            generated.update(
                (sentence.replace(found, variant), target)
                for variant in variants_by_word[word_key])

    combined = dict(o2o_dict_sorted_in)
    combined.update(generated)
    return dict2sorted_dict(combined)
def _preload(self):
    """Build the one-to-one and one-to-many synonym lookup tables.

    Sets ``_entity_o2o_dict``, ``_o2o_similar_dict``, ``re_o2o_similar_keys``,
    ``_o2m_similar_dict`` and ``_re_o2m_similar_dict`` on the instance, using
    the file at ``self._similiar_filepath`` and the dictionaries exposed by
    ``self._entity_class``.
    """
    # Correspondence between synonyms and standard words.
    file_o2m, file_o2o = readfile_line2dict(self._similiar_filepath)
    self._entity_o2o_dict = self._entity_class.entity_o2o_dict
    # Entity entries override file entries that share a key.
    self._o2o_similar_dict = dict2sorted_dict(
        {**file_o2o, **self._entity_o2o_dict})
    # NOTE(review): keys are joined into the pattern without re.escape, so a
    # key containing regex metacharacters is interpreted as a pattern --
    # confirm this is intended.
    alternation = '|'.join(self._o2o_similar_dict.keys())
    self.re_o2o_similar_keys = re.compile('(' + alternation + ')')

    # One-to-many correspondence: union the variants from both sources.
    merged = collections.OrderedDict()
    for source in (self._entity_class.entity_o2m_order_dict, file_o2m):
        for head, variants in source.items():
            if head in merged:
                merged[head].update(variants)
            else:
                merged[head] = set(variants)

    patterns = {}
    for head in merged:
        # Longest variants first; the sorted list replaces the set in place.
        longest_first = sorted(merged[head], key=len, reverse=True)
        merged[head] = longest_first
        patterns[head] = list2re(longest_first)

    self._o2m_similar_dict = merged
    self._re_o2m_similar_dict = patterns
def get_o2o_map_word_and_sentence(self):
    """Return the word-level and sentence-level one-to-one maps merged together.

    The sentence map is read from ``PathUtil().similiar_sentences_filepath``
    and layered over ``self._o2o_similar_dict``, so sentence entries win when
    a key appears in both.

    Returns:
        The result of ``dict2sorted_dict`` applied to the merged mapping.
    """
    _, sentence_o2o = readfile_line2dict(
        PathUtil().similiar_sentences_filepath)
    # Sentence-level and word-level synonym substitutions are handled
    # together in one table.
    combined = dict(self._o2o_similar_dict)
    combined.update(sentence_o2o)
    return dict2sorted_dict(combined)
def _preload(self):
    """Populate ``self._intent2sim2std_o2o_dict``.

    For every entry of ``self._field_o2o_needed`` the mapping returned by
    ``self._get_o2o_by_intent`` is collected (repeated entries accumulate),
    and each collected mapping is then passed through ``dict2sorted_dict``.
    The stored object is a ``collections.defaultdict(dict)``.
    """
    per_intent = collections.defaultdict(dict)

    for intent in self._field_o2o_needed:
        fetched = self._get_o2o_by_intent(intent)
        per_intent[intent].update(fetched)

    # Second pass: replace every collected mapping with its sorted form.
    for intent in list(per_intent):
        per_intent[intent] = dict2sorted_dict(per_intent[intent])

    self._intent2sim2std_o2o_dict = per_intent
def get_intent2entities(self):
    """Collect the words of each intent across all domains.

    Walks ``self._domain2intent2words_dict`` (domain -> intent -> words),
    concatenates the word lists that belong to the same intent, then orders
    the result with ``dict2sorted_dict`` and each word list with
    ``list2sorted_list``.

    Returns:
        The mapping produced by ``dict2sorted_dict``, with every value
        replaced by its ``list2sorted_list`` form.
    """
    gathered = collections.defaultdict(list)
    for intent2words in self._domain2intent2words_dict.values():
        for intent, words in intent2words.items():
            gathered[intent].extend(words)

    ordered = dict2sorted_dict(gathered)
    for intent, words in ordered.items():
        ordered[intent] = list2sorted_list(words)
    return ordered
def _load_abbreviation2std_map(self):
    """Load, per entity label, the one-to-many map with each key removed from its own group.

    For every label in ``self._abbreviations_entity`` the file at
    ``self._filepath_general.format(label)`` is read with
    ``readfile_line2dict``. In the one-to-many mapping it returns, each key is
    removed from its own value collection (presumably leaving only the
    alternative forms -- verify against ``readfile_line2dict``), and the
    mapping is then ordered with ``dict2sorted_dict``.

    Returns:
        A ``collections.defaultdict(dict)`` mapping each label to its
        processed, sorted one-to-many mapping.
    """
    results = collections.defaultdict(dict)
    for label_iter in self._abbreviations_entity:
        filepath = self._filepath_general.format(label_iter)
        line_o2m_dict, _ = readfile_line2dict(filepath)
        for head_word, variants in line_o2m_dict.items():
            # Guard the removal: a group that does not contain its own key
            # would otherwise raise (ValueError for a list, KeyError for a
            # set) and abort loading of every label.
            if head_word in variants:
                variants.remove(head_word)
        results[label_iter] = dict2sorted_dict(line_o2m_dict)
    return results
def _get_entities(self, domain2entity2paths_set_in):
    """Load the entity dictionaries of every domain and build cross-domain views.

    Args:
        domain2entity2paths_set_in: dict mapping a domain name to the value
            that is handed to ``read_data_from_paths_set`` for that domain.

    Returns:
        A 6-tuple ``(domain2intent2words_dict, domain2word2intent_dict,
        domain2entity_o2o_dict, domain2entity_o2m_dict,
        domain2intent2std2sim_dict, intent2sim2std_o2o_dict)``.
        The first two dictionaries additionally contain an aggregate entry
        under ``dcname.alldomain.value``; the last one is a
        ``defaultdict(dict)`` merged over all domains.

    Raises:
        AssertionError: if ``domain2entity2paths_set_in`` is not a dict.
    """
    domain2intent2words_dict = {}
    domain2word2intent_dict = {}
    domain2entity_o2o_dict = {}
    domain2entity_o2m_dict = {}
    domain2intent2std2sim_dict = {}
    intent2sim2std_o2o_dict = collections.defaultdict(dict)

    assert isinstance(domain2entity2paths_set_in, dict)

    for domain_iter, entity2paths_set_iter in domain2entity2paths_set_in.items():
        (classification2words_dict,
         word2classification_dict,
         entity_o2o_dict,
         entity_o2m_dict,
         classification2std2sim_dict,
         classification2sim2std_o2o_dict) = read_data_from_paths_set(
             entity2paths_set_iter)

        word2classification_dict_sorted = dict2sorted_dict(
            word2classification_dict)
        entity_o2m_dict_sorted = dict2sorted_dict(entity_o2m_dict)
        entity_o2o_dict_sorted = dict2sorted_dict(entity_o2o_dict)

        domain2intent2words_dict[domain_iter] = classification2words_dict
        domain2word2intent_dict[domain_iter] = word2classification_dict_sorted
        domain2entity_o2o_dict[domain_iter] = entity_o2o_dict_sorted
        domain2entity_o2m_dict[domain_iter] = entity_o2m_dict_sorted
        domain2intent2std2sim_dict[domain_iter] = classification2std2sim_dict

        # Intent-level synonym maps are merged across domains; on a key
        # clash the domain processed later wins.
        for intent_iter, sim2std_dict_iter in classification2sim2std_o2o_dict.items():
            intent2sim2std_o2o_dict[intent_iter].update(sim2std_dict_iter)

    # Aggregate view: all words of an intent, concatenated over every domain.
    alldomain_intent2words_dict = collections.defaultdict(list)
    for dict_iter in domain2intent2words_dict.values():
        for intent_iter, words_iter in dict_iter.items():
            alldomain_intent2words_dict[intent_iter].extend(words_iter)
    domain2intent2words_dict[dcname.alldomain.value] = alldomain_intent2words_dict

    # Aggregate view: word -> intent; a word seen in several domains keeps
    # the value from the domain processed last. The container stays a
    # defaultdict(list), as in the original.
    alldomain_word2intent_dict = collections.defaultdict(list)
    for dict_iter in domain2word2intent_dict.values():
        for word_iter, intent_iter in dict_iter.items():
            alldomain_word2intent_dict[word_iter] = intent_iter
    domain2word2intent_dict[dcname.alldomain.value] = alldomain_word2intent_dict

    # Parenthesized so that all six values belong to the return statement;
    # a bare line break after the fourth value would cut the tuple short.
    return (domain2intent2words_dict,
            domain2word2intent_dict,
            domain2entity_o2o_dict,
            domain2entity_o2m_dict,
            domain2intent2std2sim_dict,
            intent2sim2std_o2o_dict)
def entity_o2m_order_dict(self):
    """Return the one-to-many entity maps of all domains merged into one.

    Every per-domain mapping in ``self._domain2entity_o2m_dict`` is folded
    into a single dict (a domain visited later overrides earlier ones on key
    clashes), which is then ordered with ``dict2sorted_dict``.
    """
    merged = {}
    for per_domain in self._domain2entity_o2m_dict.values():
        merged.update(per_domain)
    return dict2sorted_dict(merged)