def check_dependency_rules(self, sentence, verbose=True):
    ## simple normalization
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = self.ini_processor.check_and_remove_ini(sentence, self.analyzer, verbose=False)
    ## check for complex dependency structure
    ob = base_structure(sentence, self.analyzer)
    if verbose:
        ob.print_dep_tree()
    r1_res = ob.loop_nodes(ob.dep_tree, self.rule1)
    if len(r1_res) == 0:
        return False
    ## split on ASCII comma, space, or fullwidth comma and check each clause
    eles = re.split(r',| |,', sentence)
    for e in eles:
        if not e:
            continue  ## skip empty fragments from consecutive delimiters
        ob_e = base_structure(e, self.analyzer)
        if verbose:
            ob_e.print_dep_tree()
        r2_res = ob_e.loop_nodes(ob_e.dep_tree, self.rule2)
        if len(r2_res) == 0:
            return False
    return True
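## Sketch only: the rule callables passed to loop_nodes (self.rule1, self.rule2)
## are assumed to be per-node predicates whose matches loop_nodes collects into a
## list. The node attributes below (postag, deprel) are hypothetical — the real
## interface is whatever base_structure's tree nodes expose.
def example_rule(node):
    ## e.g. match a verb that heads a subject-predicate (主谓) relation
    return node.postag.startswith('v') and node.deprel == '主谓关系'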
def check_and_remove_ini(self, input_sentence, analyzer, verbose=True):
    """Convenience wrapper: run all rule checks and, if any rule fires,
    strip the initial stop words from the sentence."""
    check = self.check_all_rules(input_sentence, analyzer)
    if verbose:
        print(check)
        print(self.rule2name[check])
    if check > 0:
        out_sent = self.remove_init_stop_words(input_sentence, verbose)
        if verbose:
            ## show the dependency tree and the filtered result
            res = base_structure(input_sentence, analyzer)
            res.print_dep_tree()
            print(out_sent)
        return out_sent
    else:
        return input_sentence
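## For orientation (behaviour inferred from usage here, not from the source of
## Processor): remove_init_stop_words strips a leading stop-word prefix loaded
## from init_stop_words.txt, e.g. a hypothetical entry "请问" ("may I ask") would
## turn "请问今天天气怎么样" into "今天天气怎么样".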
def check_all_rules(sentence, analyzer, rule_map):
    ## relies on the module-level `processor` created in this script
    check = processor._check_candidate(sentence)
    if not check:
        return 0
    res = base_structure(sentence, analyzer)
    for i in range(1, len(rule_map) + 1):
        f_l = res.loop_nodes(res.dep_tree, rule_map[i][0])
        ## the second element of each rule_map entry sets the match polarity
        if rule_map[i][1]:
            ## positive rule: fires when at least one node matches
            if len(f_l) > 0:
                return i
        else:
            ## negated rule: fires when no node matches
            if len(f_l) == 0:
                return i
    return 0
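## How the polarity flag in rule_map behaves, shown with a toy stand-in for
## loop_nodes (a plain list filter; NOT the project's real method or rules):
## a (rule, True) entry fires when at least one node matches, a (rule, False)
## entry fires when no node matches.
def _demo_loop_nodes(nodes, rule):
    return [n for n in nodes if rule(n)]

assert len(_demo_loop_nodes(['v', 'n'], lambda n: n == 'v')) > 0   ## True-flag rule fires
assert len(_demo_loop_nodes(['v', 'n'], lambda n: n == 'x')) == 0  ## False-flag rule fires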
import numpy as np
import pandas as pd

results_path = './data/dep_tree_out_1.xlsx'
keep_columns = ['ask']

## data_path is assumed to be defined earlier in the script
df = pd.read_excel(data_path, sheet_name='a_b_1')
df = df[keep_columns]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)  ## drop=True keeps the old index out of the export
#df = df.head(1000)
input_column_name = 'ask'
#intent_column_name = '意图'  ## '意图' = intent

#%%
## parse with the HanLP analyzer
print('parsing using han analyzer....')
analyzer = han_analyzer()
processor = Processor('../libs/init_stop_words.txt')
input_data = df[input_column_name].values

#%%
test_data = [processor.remove_init_stop_words(i) for i in input_data]
assert len(test_data) == len(input_data)
df['filtered_input'] = np.array(test_data)

#%%
msg_list = [
    base_structure(s, analyzer).print_dep_tree(print_out=False)
    for s in test_data
]
msg_list = ['\n'.join(m) for m in msg_list]

#%%
df['han_dep'] = df[input_column_name].apply(get_dep_output_han, args=(analyzer, ))
df['han_dep_tree'] = np.array(msg_list)

#%%
df.to_excel(results_path)
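#%%
## Sanity check (a sketch, not part of the original pipeline): read the workbook
## back and confirm every row survived the export.
check_df = pd.read_excel(results_path)
assert len(check_df) == len(df)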
rule_map = {
    1: (rule_1, True),
    2: (rule_2, True),
    3: (rule_3, True),
    4: (rule_4, False)
}
rule2name = {
    0: "其他",                    ## other / no rule matched
    1: "动词+主谓关系",           ## verb + subject-predicate relation
    2: "动词+并列动词 没有主语",  ## verb + coordinated verb, no subject
    3: "动词+名词",               ## verb + noun
    4: "层数=2"                   ## tree depth = 2
}

#%%
test = "你相信世界上有鬼吗"  ## "Do you believe there are ghosts in the world?"
r = base_structure(test, analyzer).print_dep_tree(print_out=True)
label = check_all_rules(test, analyzer, rule_map)
print(label, rule2name[label])

#%%
overall_results = []
for t in test_data:
    try:
        msg = base_structure(t, analyzer).print_dep_tree(print_out=False)
        msg = '\n'.join(msg)
        label = check_all_rules(t, analyzer, rule_map)
        name = rule2name[label]
        overall_results.append((t, msg, label, name))
    except Exception:
        ## report the offending sentence, then re-raise with the traceback intact
        print('Something went wrong with:', t)
        raise
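#%%
## Collect the labelled results for inspection, mirroring the pandas export
## pattern used above (this output path is hypothetical):
out_df = pd.DataFrame(overall_results, columns=['ask', 'dep_tree', 'label', 'label_name'])
out_df.to_excel('./data/rule_label_out_1.xlsx')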
# def check_filter_rules(self, input_sentence):
#     stop_words = self.init_stop_words

#%%
if __name__ == "__main__":
    processor = Processor(init_stop_words_path='init_stop_words.txt')
    analyzer = han_analyzer()

    ## test sentence: "What are the must-visit places when travelling?"
    test = "你觉得旅游必去的地方有哪些?"

    ## check whether it starts with stop words and which rule it matches
    #check = processor._check_candidate(test)
    check = processor.check_all_rules(test, analyzer)
    print(check)
    print(processor.rule2name[check])

    if check > 0:
        res = base_structure(test, analyzer)
        res.print_dep_tree()
        ## check if rule satisfied
        print(processor.remove_init_stop_words(test))

    ## one convenient call that does all of the above
    res = processor.check_and_remove_ini(test, analyzer, verbose=False)
    print(res)