def load_normal_data(self, feature_list, label_list, sen_list=None):
    """Append training samples built from the normal-sentence corpus.

    Sentences are read with ``loading_data.load_sentence()``.  For every
    sentence the following samples are appended, in this order:

    * label ``1``: the feature vector at the sentence's last character;
    * label ``1``: the feature vector at that same index inside the text
      obtained by joining the sentence and its successor with one space
      (only when a successor exists);
    * label ``0``: one feature vector for each character before the last
      one that ``Feature.is_spliter_candidate`` accepts.

    Args:
        feature_list: list extended in place with the feature vectors.
        label_list: list extended in place with the matching 0/1 labels.
        sen_list: optional list; when given it receives, for every sample,
            the text the feature vector was generated from.

    Returns:
        None.  Results are delivered through the list arguments.
    """
    sens = loading_data.load_sentence()
    num_sen = len(sens)
    # Single-argument print(...) emits the same text as the Python 2
    # print statement, and also parses under Python 3.
    print("Loading total %s normal sentence." % num_sen)

    def add_sample(text, position, label):
        # One sample = feature vector + label (+ source text on request).
        feature, _ = self.feature_model.gen_feature_vector(text, position)
        feature_list.append(feature)
        label_list.append(label)
        if sen_list is not None:
            sen_list.append(text)

    # Walk over *every* sentence: the former ``xrange(num_sen - 1)`` bound
    # was only needed for the look-ahead below, yet it also discarded all
    # samples of the last sentence.
    for i, sen in enumerate(sens):
        spliter_id = len(sen) - 1

        # Positive sample: the sentence on its own.
        add_sample(sen, spliter_id, 1)

        # Positive sample: same position, followed by the next sentence.
        # The merged text is what gets recorded in sen_list, since that is
        # what the feature vector was computed from.
        if i + 1 < num_sen:
            sen_merge = " ".join([sen, sens[i + 1]])
            add_sample(sen_merge, spliter_id, 1)

        # Negative samples: candidate characters before the final one.
        for idx, c in enumerate(sen[:-1]):
            if Feature.is_spliter_candidate(c):
                add_sample(sen, idx, 0)
def loading_none_spliter_rule(self, feature_list, label_list, sen_list=None):
    """Load the non-splitter rules as regexes or as label-0 samples.

    Every entry returned by ``loading_data.load_spliter_rules()`` is
    dispatched on its first character:

    * ``#`` -- the entry is skipped;
    * ``r`` -- the remainder is passed to
      ``feature_model.add_none_spliter_regrex(rule)`` (reported as a
      "soft regex");
    * ``h`` -- the remainder is passed to
      ``feature_model.add_none_spliter_regrex(rule, True)`` (reported as a
      "hard rule regex");
    * anything else -- the entry is used as literal text: each character
      accepted by ``Feature.is_spliter_candidate`` yields a feature vector
      generated with ``is_forced=True`` and labelled ``0``.

    Empty entries are ignored.

    Args:
        feature_list: list extended in place with the feature vectors.
        label_list: list extended in place with the matching labels.
        sen_list: optional list; when given it receives the rule text once
            per generated sample.

    Returns:
        None.  Results are delivered through the list arguments and the
        feature model.
    """
    rules = loading_data.load_spliter_rules()
    # Single-argument print(...) emits the same text as the Python 2
    # print statement, and also parses under Python 3.
    print("Loading rules.")
    for rule in rules:
        # An empty entry has no marker character; indexing it used to
        # raise IndexError, so skip it explicitly.
        if not rule:
            continue

        marker = rule[0]
        if marker == "#":
            continue
        if marker == "r":
            rule = rule[1:]
            print("Add a soft regex: %s" % rule)
            self.feature_model.add_none_spliter_regrex(rule)
            continue
        if marker == "h":
            rule = rule[1:]
            print("Add a hard rule regex: %s" % rule)
            self.feature_model.add_none_spliter_regrex(rule, True)
            continue

        # Literal rule text: every candidate character is a label-0 sample.
        for idx, c in enumerate(rule):
            if Feature.is_spliter_candidate(c):
                feature, _ = self.feature_model.gen_feature_vector(
                    rule, idx, is_forced=True)
                feature_list.append(feature)
                label_list.append(0)
                if sen_list is not None:
                    sen_list.append(rule)
def __split_par(self, par, is_debug=False):
    """Split the paragraph *par* into a list of sentences.

    Every character accepted by ``Feature.is_spliter_candidate`` is a
    candidate boundary.  ``self.classifier.predict`` labels the candidates'
    feature vectors, after which the hard-rule flag returned by
    ``gen_feature_vector`` overrides the prediction: a positive flag forces
    label ``0``, a negative flag forces label ``1``.  The paragraph is cut
    after every candidate whose final label is ``1``, and the end of the
    paragraph always closes the last sentence.

    Args:
        par: the paragraph text.
        is_debug: when true, print the feature vectors, the candidate
            positions and the final boundary positions.

    Returns:
        A list of sentences.  Pieces produced by slicing are stripped of
        surrounding whitespace.  ``[par]`` is returned untouched when the
        paragraph has no candidate at all, or when no candidate was
        accepted and the paragraph's last character is itself a candidate.
    """
    candidates = []            # character positions of the candidates
    features = []              # feature vector of each candidate
    forced_non_spliters = []   # slots in `candidates` forced to label 0
    forced_spliters = []       # slots in `candidates` forced to label 1

    for idx, c in enumerate(par):
        if not Feature.is_spliter_candidate(c):
            continue
        candidates.append(idx)
        feature, is_hard = self.feature_model.gen_feature_vector(par, idx)
        slot = len(candidates) - 1
        if is_hard > 0:
            forced_non_spliters.append(slot)
        elif is_hard < 0:
            forced_spliters.append(slot)
        # Single-argument print(...) emits the same text as the Python 2
        # print statement, and also parses under Python 3.
        if is_debug:
            print(feature)
        features.append(feature)

    if is_debug:
        print(candidates)
    if not candidates:
        return [par]

    labels = self.classifier.predict(features)
    # Hard rules take precedence over the classifier's prediction.
    for slot in forced_non_spliters:
        labels[slot] = 0
    for slot in forced_spliters:
        labels[slot] = 1

    end_pos = len(par) - 1
    # -1 is a virtual boundary placed just before the first character.
    boundaries = [-1]
    for pos, label in zip(candidates, labels):
        if label == 1:
            boundaries.append(pos)

    # Historical special case: nothing was accepted and the paragraph
    # already ends on a candidate -> the paragraph is returned untouched.
    whole_paragraph = len(boundaries) == 1 and candidates[-1] == end_pos

    # Close the last sentence at the end of the paragraph.  The test is on
    # the last *accepted* boundary: checking the last candidate instead
    # silently dropped the trailing text whenever the final character was
    # a candidate that ended up with label 0.
    if not whole_paragraph and boundaries[-1] != end_pos:
        boundaries.append(end_pos)

    if is_debug:
        print(boundaries)

    if len(boundaries) == 1:
        return [par]
    return [par[start + 1:stop + 1].strip()
            for start, stop in zip(boundaries, boundaries[1:])]