Beispiel #1
0
    def load_normal_data(self, feature_list, label_list, sen_list=None):
        """Append training samples built from the normal sentence corpus.

        For every sentence returned by ``loading_data.load_sentence()`` the
        following samples are added, in this order:

        * one positive sample (label 1) for the sentence's last character;
        * one positive sample (label 1) for the same position inside the
          sentence joined (with a single space) to the following sentence;
          skipped for the final sentence because it has no successor;
        * one negative sample (label 0) for every character before the last
          one that ``Feature.is_spliter_candidate`` accepts.

        Args:
            feature_list: list extended in place with feature vectors.
            label_list: list extended in place with the matching 0/1 labels.
            sen_list: optional list extended in place with the text each
                feature vector was generated from (kept parallel to
                ``feature_list``).
        """
        sens = loading_data.load_sentence()
        num_sen = len(sens)
        print("Loading total %s normal sentence." % num_sen)

        def add_sample(text, position, label):
            # Keep the three output lists parallel: one entry each per sample.
            feature, _ = self.feature_model.gen_feature_vector(text, position)
            feature_list.append(feature)
            label_list.append(label)
            if sen_list is not None:
                sen_list.append(text)

        # Iterate over *all* sentences; only the merged sample needs a
        # successor, so the last sentence still contributes its own samples.
        for i, sen in enumerate(sens):
            spliter_id = len(sen) - 1

            # Single sentence: its final character is a true splitter.
            add_sample(sen, spliter_id, 1)

            # Merged sentence: the same position is still a true splitter
            # when the next sentence follows it.
            if i + 1 < num_sen:
                sen_merge = " ".join([sen, sens[i + 1]])
                add_sample(sen_merge, spliter_id, 1)

            # Every earlier splitter candidate inside the sentence is a
            # negative example.
            for idx, c in enumerate(sen[:-1]):
                if Feature.is_spliter_candidate(c):
                    add_sample(sen, idx, 0)
Beispiel #2
0
    def loading_none_spliter_rule(self,
                                  feature_list,
                                  label_list,
                                  sen_list=None):
        """Load the non-splitter rules and register or sample each of them.

        Rules come from ``loading_data.load_spliter_rules()`` and are
        dispatched on their first character:

        * ``#`` - the rule is skipped;
        * ``r`` - the remainder is registered through
          ``feature_model.add_none_spliter_regrex(rule)`` (soft regex);
        * ``h`` - the remainder is registered through
          ``feature_model.add_none_spliter_regrex(rule, True)`` (hard rule);
        * anything else - the rule text itself is used as training data:
          every character accepted by ``Feature.is_spliter_candidate``
          yields a negative sample (label 0) generated with
          ``is_forced=True``.

        Empty rules are ignored.

        Args:
            feature_list: list extended in place with feature vectors.
            label_list: list extended in place with the matching labels.
            sen_list: optional list extended in place with the rule text of
                each generated sample.
        """
        rules = loading_data.load_spliter_rules()
        print("Loading rules.")
        for rule in rules:
            # Slicing never raises, so an empty rule yields an empty marker
            # instead of the IndexError that ``rule[0]`` would produce.
            marker = rule[:1]
            if not marker or marker == "#":
                continue

            if marker == "r":
                rule = rule[1:]
                print("Add a soft regex: %s" % rule)
                self.feature_model.add_none_spliter_regrex(rule)
                continue

            if marker == "h":
                rule = rule[1:]
                print("Add a hard rule regex: %s" % rule)
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue

            # Plain rule text: each splitter candidate in it is a position
            # that must NOT split.
            for idx, c in enumerate(rule):
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(
                        rule, idx, is_forced=True)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list is not None:
                        sen_list.append(rule)
Beispiel #3
0
    def __split_par(self, par, is_debug=False):
        """Split a paragraph into sentences at the predicted splitters.

        Every character accepted by ``Feature.is_spliter_candidate`` is a
        candidate. ``self.classifier.predict`` labels all candidates at once
        (1 = split after this character); hard rules reported by
        ``feature_model.gen_feature_vector`` then override the prediction:
        a positive ``is_hard`` forces label 0, a negative one forces label 1.

        Args:
            par: paragraph text to split.
            is_debug: when true, print each feature vector, the candidate
                positions and the final boundaries.

        Returns:
            A list of sentences. If the paragraph has no candidate at all the
            list holds ``par`` unchanged; otherwise each sentence is the
            stripped text between two consecutive boundaries, and the text
            after the last accepted splitter is always kept.
        """
        candidates = []        # character index of each candidate
        features = []          # feature vector of each candidate
        forced_non_split = []  # candidate slots a hard rule forbids
        forced_split = []      # candidate slots a hard rule enforces
        for idx, c in enumerate(par):
            if not Feature.is_spliter_candidate(c):
                continue
            candidates.append(idx)
            feature, is_hard = self.feature_model.gen_feature_vector(
                par, idx)
            if is_hard > 0:
                forced_non_split.append(len(candidates) - 1)
            elif is_hard < 0:
                forced_split.append(len(candidates) - 1)
            if is_debug:
                print(feature)
            features.append(feature)
        if is_debug:
            print(candidates)

        if not candidates:
            return [par]

        labels = self.classifier.predict(features)

        # Hard rules win over the classifier.
        for slot in forced_non_split:
            labels[slot] = 0
        for slot in forced_split:
            labels[slot] = 1

        # -1 is a virtual boundary before the first character.
        boundaries = [-1]
        for position, label in zip(candidates, labels):
            if label == 1:
                boundaries.append(position)

        # Close the final sentence at the end of the paragraph. Compare with
        # the last *accepted* boundary, not the last candidate: a rejected
        # candidate sitting at the very end must not cause the trailing text
        # to be dropped.
        last_position = len(par) - 1
        if boundaries[-1] != last_position:
            boundaries.append(last_position)

        if is_debug:
            print(boundaries)

        return [par[start + 1:end + 1].strip()
                for start, end in zip(boundaries, boundaries[1:])]