Example #1
    def load_integer_features(self, data):
        """Gives each POS tag in data a number."""
        integer_features = []
        pos_feature = np.array([])

        for txt in data.textfiles:
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Build arrays of integers with which we can fit the encoder
                # Standardize because f.get_pos_$x() doesn't have to be of length self.number_tags_per_feature/2
                standardized_pos_target = self.standardize_sub_pos_feature(f.get_pos_target())
                standardized_pos_source = self.standardize_sub_pos_feature(f.get_pos_source())
                # Concatenate the two plain POS tag arrays from target and source event
                pos_feature = np.concatenate((standardized_pos_target, standardized_pos_source))
                # Transform this array into the corresponding array of integers
                integer_feature = self.pos_tags_to_integers(pos_feature)

                integer_features.append(integer_feature)

        return integer_features
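A minimal sketch (not part of the original project) of how the integer features returned above could be used to fit scikit-learn's OneHotEncoder, assuming every integer_feature vector has the same length:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Hypothetical output of load_integer_features(): one fixed-length integer vector per relation.
integer_features = np.asarray([[3, 7, 1, 0],
                               [2, 7, 4, 0]])
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(integer_features)                             # learn the integer categories per column
one_hot = encoder.transform(integer_features).toarray()  # sparse matrix -> dense array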
Example #2
 def train(self):
     self.feature_model = Feature()
     feature_list = []
     label_list = []
     sen_list = []
     self.loading_none_spliter_rule(feature_list, label_list, sen_list)
     self.loading_forcing_spliter_rule()
     self.load_normal_data(feature_list, label_list, sen_list)
     self.classifier = LogisticRegression(verbose=False)
     print "Learning..."
     self.classifier.fit(feature_list, label_list)
     print "Saving..."
     utils.pickle_save(self, self.model_path)
     print "Done"
     print "Test..."
     #f = open("wrong.dat","w")
     predicted_labels = self.classifier.predict(feature_list)
     ll = len(predicted_labels)
     cc = 0
     for i in xrange(ll):
         if label_list[i] == 0 and predicted_labels[i] == 1:
             cc += 1
             #print sen_list[i]
             #f.write("%s\n"%sen_list[i])
     #f.close()
     print cc, ll, cc * 1.0 / ll
Example #3
    def load_integer_features(self, data):
        """Gives each POS tag in data a number."""
        integer_features = []
        pos_feature = np.array([])

        for txt in data.textfiles:
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Build arrays of integers with which we can fit the encoder
                # Standardize because f.get_pos_$x() doesn't have to be of length self.number_tags_per_feature/2
                standardized_pos_target = self.standardize_sub_pos_feature(
                    f.get_pos_target())
                standardized_pos_source = self.standardize_sub_pos_feature(
                    f.get_pos_source())
                # Concatenate the two plain POS tag arrays from target and source event
                pos_feature = np.concatenate(
                    (standardized_pos_target, standardized_pos_source))
                # Transform this array into the corresponding array of integers
                integer_feature = self.pos_tags_to_integers(pos_feature)

                integer_features.append(integer_feature)

        return integer_features
Example #4
 def __init__(self, filepath, resultpath):
     self.filepath = filepath
     self.resultpath = resultpath
     self.patient_id = list()  # pat_id
     self.feature = Feature(filepath,
                            resultpath)  # feature_info and other statistics
     self.patient_info = list()  # list of Patient()
Example #5
 def make_feature(self, file=None):
     features_list = []
     label_list = []
     self.feature_model = Feature()
     if file is None:
         return features_list, label_list
     else:
         features_list, label_list = self.feature_model.gen_feature_matrix(
             file)
     return features_list, label_list
Example #6
 def split_paragraph(self, par):
     sens = []
     try:
         new_par = self.regex_rule.fn_normalize_special_mark(par)
         paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, mark3, mark4 = \
             self.regex_rule.run_regex_predict(new_par)
         features, _ = self.make_feature(paragraph)
         if not features:
             sens.append(par)
             return sens
         labels = self.classifier.predict(features)
         idx = 0
         pos_start = 0
         pos_end = 0
         for c in paragraph:
             if Feature.is_splitter_candidate(c):
                 if idx < len(labels) and labels[idx] == 1:
                     sens.append(paragraph[pos_start:pos_end + 1].strip())
                     pos_start = pos_end + 1
                 idx += 1
             pos_end += 1
         if pos_start < len(paragraph):
             sens.append(paragraph[pos_start:].strip())
         paragraph = '\n'.join(sens)
         paragraph = self.regex_rule.restore_info(paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, \
                                            mark3, mark4)
         # paragraph = self.regex_rule.normalize_special_mark.sub(u' \g<special_mark> ', paragraph)
         # paragraph = self.regex_rule.normalize_space.sub(u' ', paragraph)
         sens = paragraph.split('\n')
         return sens
     except Exception as e:
         print(traceback.format_exc())
         sens.append(par)
         return sens
Example #7
 def split_paragraph(self, par):
     sens = []
     try:
         paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, mark3, mark4 = \
             self.regex_rule.run_regex_predict(par)
         features, _ = self.make_feature(paragraph)
         if not features:
             sens.append(par)
             return sens
         labels = self.classifier.predict(features)
         idx = 0
         pos_start = 0
         pos_end = 0
         for c in paragraph:
             if Feature.is_splitter_candidate(c):
                 if idx < len(labels) and labels[idx] == 1:
                     sens.append(paragraph[pos_start:pos_end + 1].strip())
                     pos_start = pos_end + 1
                 idx += 1
             pos_end += 1
         if pos_start < len(paragraph):
             sens.append(paragraph[pos_start:].strip())
         paragraph = '\n'.join(sens)
         paragraph = self.regex_rule.restore_info(paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, \
                                            mark3, mark4)
         sens = paragraph.split('\n')
         return sens
     except:
         sens.append(par)
         return sens
Example #8
    def loading_none_spliter_rule(self,
                                  feature_list,
                                  label_list,
                                  sen_list=None):
        rules = loading_data.load_spliter_rules()
        print "Loading rules."
        for rule in rules:
            if rule[0] == "#":
                continue
            if rule[0] == "r":
                rule = rule[1:]
                print "Add a soft regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule)
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard rule regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue

            idx = 0
            for c in rule:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(
                        rule, idx, is_forced=True)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list != None:
                        sen_list.append(rule)
                idx += 1
Example #9
    def __split_par(self, par, is_debug=False):
        list_sens = []

        list_features = []
        list_candidates = []
        list_hard_rule_none_spliter_idx = []
        list_hard_rule_forcing_spliter_idx = []
        idx = 0
        for c in par:
            if Feature.is_spliter_candidate(c):
                list_candidates.append(idx)
                feature, is_hard = self.feature_model.gen_feature_vector(
                    par, idx)
                if is_hard > 0:
                    list_hard_rule_none_spliter_idx.append(
                        len(list_candidates) - 1)
                elif is_hard < 0:
                    list_hard_rule_forcing_spliter_idx.append(
                        len(list_candidates) - 1)
                if is_debug:
                    print feature
                list_features.append(feature)
            idx += 1
        if is_debug:
            print list_candidates

        if len(list_candidates) == 0:
            list_sens.append(par)
            return list_sens

        #print list_features
        #list_features = np.array(list_features)
        #print "Shape: ",list_features.shape

        labels = self.classifier.predict(list_features)

        for l in list_hard_rule_none_spliter_idx:
            labels[l] = 0
        for l in list_hard_rule_forcing_spliter_idx:
            labels[l] = 1

        list_true_spliters = [-1]
        for i in xrange(len(labels)):
            if labels[i] == 1:
                list_true_spliters.append(list_candidates[i])

        if list_candidates[-1] != len(par) - 1:
            list_true_spliters.append(len(par) - 1)

        if is_debug:
            print list_true_spliters
        if len(list_true_spliters) > 1:
            for i in xrange(len(list_true_spliters) - 1):
                list_sens.append(par[list_true_spliters[i] +
                                     1:list_true_spliters[i + 1] + 1].strip())

        else:
            list_sens.append(par)

        return list_sens
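The hard-rule override at the end of __split_par can be shown in isolation; the lists below are assumed values for illustration, not output of the original feature model:

# Assumed classifier predictions and rule hits, for illustration only.
labels = [1, 1, 0, 1]                     # what the classifier predicted per candidate
hard_rule_none_spliter_idx = [1]          # a "never split here" rule matched candidate 1
hard_rule_forcing_spliter_idx = [2]       # an "always split here" rule matched candidate 2

for i in hard_rule_none_spliter_idx:
    labels[i] = 0                         # hard rules override the classifier
for i in hard_rule_forcing_spliter_idx:
    labels[i] = 1

print(labels)  # [1, 0, 1, 1]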
Example #10
    def load_normal_data(self, feature_list, label_list, sen_list=None):
        sens = loading_data.load_sentence()
        num_sen = len(sens)
        print "Loading total %s normal sentence." % num_sen
        for i in xrange(num_sen - 1):
            sen = sens[i]
            spliter_id = len(sen) - 1

            #for single sentence
            feature, _ = self.feature_model.gen_feature_vector(sen, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list != None:
                sen_list.append(sen)

            #for merge sentence
            sen_merge = " ".join([sens[i], sens[i + 1]])
            feature, _ = self.feature_model.gen_feature_vector(
                sen_merge, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list != None:
                sen_list.append(sen)
            idx = 0
            for c in sen[:-1]:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(
                        sen, idx)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list != None:
                        sen_list.append(sen)
                idx += 1
Example #11
    def load_stems(self, data):
        """Returns all word stems used in the parsed XML data."""
        # Get all word stems
        stems = np.array([])
        for txt in data.textfiles:
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                stems = np.append(stems, [f.get_stem_target()])
                stems = np.append(stems, [f.get_stem_source()])

        stems = np.unique(stems)
        return stems
Example #12
    def load_pos_tags(self, data):
        """Loads all POS tags used in the pos_surrounding area around an event."""
        pos_tags = np.array([])

        for txt in data.textfiles:
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Collect all pos tags from the data
                pos_tags = np.concatenate((pos_tags, f.get_pos_target()))
                pos_tags = np.concatenate((pos_tags, f.get_pos_source()))

        pos_tags = np.unique(pos_tags)

        # Append a blank tag which will be used for filling up features which don't have enough elements
        pos_tags = np.append(pos_tags, 'BL')
        return pos_tags
Example #13
    def load_pos_tags(self, data):
        """Loads all POS tags used in the pos_surrounding area around an event."""
        pos_tags = np.array([])

        for txt in data.textfiles:
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Collect all pos tags from the data
                pos_tags = np.concatenate((pos_tags, f.get_pos_target()))
                pos_tags = np.concatenate((pos_tags, f.get_pos_source()))

        pos_tags = np.unique(pos_tags)

        # Append a blank tag which will be used for filling up features which don't have enough elements
        pos_tags = np.append(pos_tags, 'BL')
        return pos_tags
Example #14
 def __init__(self, K, filepath, resultpath):
    self.K = K
    self.feature = Feature(filepath, resultpath)
    self.pat_edu = dict()
    # p-value: first, median, last
    self.p_value = list() # feature name, p-value tuple (first, median, last, diff)
    self.mean = dict() # feature name : list(cluster1_mean, cluster1_std, ...)}
                       # mean of the last state for temporal features
    self.mean_first = dict() # feature name : list(cluster1_mean, cluster1_std, ...)
                             # mean of the first state for temporal features
    self.mean_median = dict()
    self.mean_total = dict() # feature name : (total mean, total std) 
    self.mean_follow_up = dict() # feature name : mean follow-up time in this feature 
Example #15
    mode = "train" if data_loader.is_train else "valid"
    print(
        f"epoch {epoch_idx:02} {mode} score > {score:.4} ({int(timer() - epoch_start)}s)"
    )

    total_loss /= len(data_loader.dataset)
    return score, total_loss


if __name__ == "__main__":
    config = get_config()

    # Vocab load
    sp = spm.SentencePieceProcessor()
    feature = Feature()

    if config.mode == "train":
        data_dir = os.path.join(nsml.DATASET_PATH, "train", "train_data")
        build_vocab(os.path.join(data_dir, config.train_file_name))
        sp.load('vocab.model')

        feature.init_idf(os.path.join(data_dir, config.train_file_name))

    # random seed
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.random.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(config.seed)
Example #16
class DataIO(object):
    def __init__(self, filepath, resultpath):
        self.filepath = filepath
        self.resultpath = resultpath
        self.patient_id = list()  # pat_id
        self.feature = Feature(filepath,
                               resultpath)  # feature_info and other statistics
        self.patient_info = list()  # list of Patient()

    def load_patient_id(self):
        f = codecs.open(self.filepath + 'patient_id.csv', 'r', 'utf-8')
        reader = csv.reader(f)
        line_ctr = 0
        for row in reader:
            # table title
            if line_ctr < 1:
                table_ttl = dict(zip(row, range(len(row))))
                line_ctr += 1
                continue

            pid = row[table_ttl['PATNO']]
            self.patient_id.append(pid)
            line_ctr += 1
        f.close()

    def load_demographics(self):
        f = codecs.open(self.filepath + 'demographics/' + 'patient_demo.csv',
                        'r', 'utf-8')
        reader = csv.reader(f)
        line_ctr = 0
        for row in reader:
            # table title
            if line_ctr < 1:
                table_ttl = dict(zip(row, range(len(row))))
                line_ctr += 1
                continue
            if len(row) == 0:
                continue
            pval = Patient()
            pval.id = row[table_ttl['ID']]
            pval.age = row[table_ttl['AGE']]
            pval.gender = row[table_ttl['GENDER']]
            pval.edu_year = row[table_ttl['EDUCATION YEAR']]
            pval.duration = row[table_ttl['DURATION(MONTH)']]
            pval.diagnosis = row[table_ttl['DIAGNOSIS']]
            self.patient_info.append(pval)
            line_ctr += 1
        f.close()

    def load_feature(self, ftype=None, fname=None, featname=None):
        self.feature.load_feature(ftype, fname, featname)

    def read_data(self):
        self.load_patient_id()
        self.load_demographics()
        self.load_feature('Motor', 'MDS UPDRS PartI')
        self.load_feature('Motor', 'MDS UPDRS PartII')
        self.load_feature('Motor', 'MDS UPDRS PartIII')
        self.load_feature('Motor', 'MDS UPDRS PartIV')

        self.load_feature('Non-Motor', 'BJLO')
        self.load_feature('Non-Motor', 'ESS')
        self.load_feature('Non-Motor', 'GDS')
        self.load_feature('Non-Motor', 'HVLT')
        self.load_feature('Non-Motor', 'LNS')
        self.load_feature('Non-Motor', 'MoCA')
        self.load_feature('Non-Motor', 'QUIP')
        self.load_feature('Non-Motor', 'RBD')
        self.load_feature('Non-Motor', 'SCOPA-AUT')
        self.load_feature('Non-Motor', 'SF')
        self.load_feature('Non-Motor', 'STAI')
        self.load_feature('Non-Motor', 'SDM')
        self.load_feature('Non-Motor', 'MCI')

        self.load_feature('Biospecimen', 'DNA')
        self.load_feature('Biospecimen', 'CSF', 'Total tau')
        self.load_feature('Biospecimen', 'CSF', 'Abeta 42')
        self.load_feature('Biospecimen', 'CSF', 'p-Tau181P')
        self.load_feature('Biospecimen', 'CSF', 'CSF Alpha-synuclein')

        self.load_feature('Image', 'DaTScan SBR')
        self.load_feature('Image', 'MRI')
        self.load_feature('Medication', 'MED USE')

        return self.feature.get_feature_name()
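A hedged usage sketch; the directory names below are placeholders, and the CSV layout ('patient_id.csv', 'demographics/patient_demo.csv', the feature tables) must already exist under filepath:

io = DataIO(filepath="data/", resultpath="results/")   # placeholder paths
feature_names = io.read_data()  # loads patient ids, demographics and every feature table
print("%d patients, %d features" % (len(io.patient_id), len(feature_names)))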
Example #17
class SentenceSpliter():
    def __init__(self,
                 path="models/model.dump",
                 is_training=False,
                 new_rule_path=None):
        self.classifier = None
        self.feature_model = None
        self.multi_newline_regex = re.compile("\n+")
        self.c_dir = os.path.abspath(os.path.dirname(__file__))
        self.model_path = "%s/%s" % (self.c_dir, path)
        if not is_training:
            if os.path.exists(self.model_path) and not is_training:
                #print "Loading model..."
                model = utils.pickle_load(self.model_path)
                self.classifier = model.classifier
                self.feature_model = model.feature_model
                if new_rule_path != None:
                    self.load_custom_hard_rule(new_rule_path)
            else:
                print "Unalbe to load the spliter model. %s" % path
                exit(-1)

    def load_normal_data(self, feature_list, label_list, sen_list=None):
        sens = loading_data.load_sentence()
        num_sen = len(sens)
        print "Loading total %s normal sentence." % num_sen
        for i in xrange(num_sen - 1):
            sen = sens[i]
            spliter_id = len(sen) - 1

            #for single sentence
            feature, _ = self.feature_model.gen_feature_vector(sen, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list != None:
                sen_list.append(sen)

            #for merge sentence
            sen_merge = " ".join([sens[i], sens[i + 1]])
            feature, _ = self.feature_model.gen_feature_vector(
                sen_merge, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list != None:
                sen_list.append(sen)
            idx = 0
            for c in sen[:-1]:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(
                        sen, idx)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list != None:
                        sen_list.append(sen)
                idx += 1

    def load_custom_hard_rule(self, path):
        rules = loading_data.load_spliter_rules(path)
        for rule in rules:
            if rule[0] == "#":
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard rule regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue

    def loading_forcing_spliter_rule(self):
        rules = loading_data.load_spliter_rules(
            loading_data.raw_forcing_spliter_path)
        for rule in rules:
            if rule[0] == "#":
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard forcing rule regex: %s" % rule
                self.feature_model.add_forcing_splitter_regrex(rule)

    def loading_none_spliter_rule(self,
                                  feature_list,
                                  label_list,
                                  sen_list=None):
        rules = loading_data.load_spliter_rules()
        print "Loading rules."
        for rule in rules:
            if rule[0] == "#":
                continue
            if rule[0] == "r":
                rule = rule[1:]
                print "Add a soft regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule)
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard rule regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue

            idx = 0
            for c in rule:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(
                        rule, idx, is_forced=True)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list != None:
                        sen_list.append(rule)
                idx += 1
        #print Feature.NONE_SPLITER_DICT

    def train(self):
        self.feature_model = Feature()
        feature_list = []
        label_list = []
        sen_list = []
        self.loading_none_spliter_rule(feature_list, label_list, sen_list)
        self.loading_forcing_spliter_rule()
        self.load_normal_data(feature_list, label_list, sen_list)
        self.classifier = LogisticRegression(verbose=False)
        print "Learning..."
        self.classifier.fit(feature_list, label_list)
        print "Saving..."
        utils.pickle_save(self, self.model_path)
        print "Done"
        print "Test..."
        #f = open("wrong.dat","w")
        predicted_labels = self.classifier.predict(feature_list)
        ll = len(predicted_labels)
        cc = 0
        for i in xrange(ll):
            if label_list[i] == 0 and predicted_labels[i] == 1:
                cc += 1
                #print sen_list[i]
                #f.write("%s\n"%sen_list[i])
        #f.close()
        print cc, ll, cc * 1.0 / ll

    def __split_par(self, par, is_debug=False):
        list_sens = []

        list_features = []
        list_candidates = []
        list_hard_rule_none_spliter_idx = []
        list_hard_rule_forcing_spliter_idx = []
        idx = 0
        for c in par:
            if Feature.is_spliter_candidate(c):
                list_candidates.append(idx)
                feature, is_hard = self.feature_model.gen_feature_vector(
                    par, idx)
                if is_hard > 0:
                    list_hard_rule_none_spliter_idx.append(
                        len(list_candidates) - 1)
                elif is_hard < 0:
                    list_hard_rule_forcing_spliter_idx.append(
                        len(list_candidates) - 1)
                if is_debug:
                    print feature
                list_features.append(feature)
            idx += 1
        if is_debug:
            print list_candidates

        if len(list_candidates) == 0:
            list_sens.append(par)
            return list_sens

        #print list_features
        #list_features = np.array(list_features)
        #print "Shape: ",list_features.shape

        labels = self.classifier.predict(list_features)

        for l in list_hard_rule_none_spliter_idx:
            labels[l] = 0
        for l in list_hard_rule_forcing_spliter_idx:
            labels[l] = 1

        list_true_spliters = [-1]
        for i in xrange(len(labels)):
            if labels[i] == 1:
                list_true_spliters.append(list_candidates[i])

        if list_candidates[-1] != len(par) - 1:
            list_true_spliters.append(len(par) - 1)

        if is_debug:
            print list_true_spliters
        if len(list_true_spliters) > 1:
            for i in xrange(len(list_true_spliters) - 1):
                list_sens.append(par[list_true_spliters[i] +
                                     1:list_true_spliters[i + 1] + 1].strip())

        else:
            list_sens.append(par)

        return list_sens

    def split(self, doc, is_debug=False):
        doc = doc.replace("\r", "")
        doc = self.multi_newline_regex.sub("\n", doc)
        paragraphs = doc.split("\n")
        sens = []
        for par in paragraphs:
            if len(par) < 1:
                continue
            par_sens = self.__split_par(par, is_debug)
            for sen in par_sens:
                sens.append(sen)

        return sens
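A usage sketch under stated assumptions: the training data referenced by loading_data is available, and the code runs under Python 2 (it uses print statements and xrange). The sample text is made up:

# Train once; train() fits the classifier and pickles the whole object to model_path.
spliter = SentenceSpliter(is_training=True)
spliter.train()

# Later: reload the pickled model and split a document into sentences.
spliter = SentenceSpliter(path="models/model.dump")
for sen in spliter.split(u"Cau dau tien. Cau thu hai."):
    print(sen)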
Example #18
def parse_Features(data, new=False, annotations="union", features=["pos", "stem", "aspect", "tense", "distance", "similarity", "polarity", "modality"], distance=False):
    """Extracts the features out of the dataset and returns a list of features with the corresponding classes.

    Args:
        data (list): The parsed data from fables-100-temporal-dependency.xml.
        new (bool): If True, Pos() and Stem() are recomputed; otherwise they are loaded from a file.
        annotations (str): Use all relations ("union") or only the relations shared by the annotators ("intersected").
        features (list): Determines which features are activated. Possible values: "pos", "stem", "aspect", "tense", "distance", "similarity", "polarity", "modality".
        distance (bool): If True, parse_Features() also returns distance information for the data (needed for evaluation).

    """
    # Only compute pos and stem if new flag is set
    if "pos" in features or "stem" in features:
        if new or not os.path.isfile("save.p"):
            # Recompute POS and stem data only if forced or the cached pickle is missing
            pos = Pos(data, 6, annotations)
            stem = Stem(data, annotations)
            pickle.dump((pos, stem), open("save.p", "wb"))
        else:
            pos, stem = pickle.load(open("save.p", "rb"))

    if distance:
        distance_diff = []

    X = []
    y = np.array([], dtype=int)

    for txt in data.textfiles:
        # Union or intersected relations?
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)

            feature = []

            # Make polarity feature
            if "polarity" in features:
                feature = np.concatenate((feature, [f.get_polarity()]))

            # Make distance feature
            if "distance" in features:
                feature = np.concatenate((feature, f.get_distance()))

            # Make POS feature
            if "pos" in features:
                pos_feature = pos.transform(f.get_pos_target(), f.get_pos_source())
                pos_feature = pos_feature.toarray()[0]
                feature = np.concatenate((feature, pos_feature))

            # Make Stem feature
            if "stem" in features:
                stem_feature = stem.transform(f.get_stem_source(), f.get_stem_target())
                stem_feature = stem_feature[0]
                feature = np.concatenate((feature, stem_feature))

            # Make similarity feature
            if "similarity" in features:
                feature = np.concatenate((feature, [f.get_similarity_of_words()]))

            # Make modality feature
            if "modality" in features:
                feature = np.concatenate((feature, [f.get_modality()]))

            # Make aspect feature
            if "aspect" in features:
                feature = np.concatenate((feature, f.get_aspect()))

            # Make tense feature
            if "tense" in features:
                feature = np.concatenate((feature, f.get_tense()))

            # Append feature to X
            X.append(feature)
            y = np.append(y, [f.get_class()])

            # Append distance information if needed
            if distance:
                distance_diff.append(f.get_distance_diff())

    if distance:
        return (X, y, distance_diff)
    else:
        return (X, y)
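As a sketch only: parse_XML and the dataset path appear elsewhere in this listing, while the LinearSVC classifier is an assumption chosen purely for illustration:

from sklearn.svm import LinearSVC

data = parse_XML("fables-100-temporal-dependency.xml",
                 "McIntyreLapata09Resources/fables")
X, y = parse_Features(data, annotations="union",
                      features=["pos", "stem", "aspect", "tense"])
clf = LinearSVC()
clf.fit(X, y)
print(clf.score(X, y))  # training accuracy, only as a sanity check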
Example #19
def get_sentences(number, class_id, annotations="intersected"):
    """Returns number sentences which have the relation type class_id.

    Useful if you need to get an overview over sentences with a certain temporal relation.

    """
    data = parse_XML("fables-100-temporal-dependency.xml", "McIntyreLapata09Resources/fables")

    i=0
    go_to_next_textfile = False

    for txt in data.textfiles:
        go_to_next_textfile = False

        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)

            if f.get_class() == class_id and go_to_next_textfile == False:
                # Stop if number relations are reached
                if i >= number:
                    break
                i += 1

                if rel.target.sentence == rel.source.sentence:
                    print "---------------"
                    print "Source event: " +rel.source.content
                    print "Target event: " +rel.target.content
                    print rel.target.sentence
                    print
                    print "Source Surrounding: " + rel.source.surrounding
                    print "Target Surrounding: " + rel.target.surrounding
                else:
                    print "---------------"
                    print "Source event: " +rel.source.content
                    print "Whole sentence " +rel.source.sentence
                    print "Surrounding" + rel.source.surrounding
                    print
                    print "Target event: " +rel.target.content
                    print "Whole sentence: " + rel.target.sentence
                    print "Surrounding: " + rel.target.surrounding

                tense_source = f.get_tense_source()
                tense_target = f.get_tense_target()
                if tense_source == 0:
                    print "Estimated tense for source event: None"
                elif tense_source == 1:
                    print "Estimated tense for source event: Present"
                elif tense_source == 2:
                    print "Estimated tense for source event: Past"
                elif tense_source == 3:
                    print "Estimated tense for source event: Future"

                if tense_target == 0:
                    print "Estimated tense for target event: None"
                elif tense_target == 1:
                    print "Estimated tense for target event: Present"
                elif tense_target == 2:
                    print "Estimated tense for target event: Past"
                elif tense_target == 3:
                    print "Estimated tense for target event: Future"

                aspect_source = f.get_aspect_source()
                aspect_target = f.get_aspect_target()
                if aspect_source == 0:
                    print "Estimated aspect for source event: None"
                elif aspect_source == 1:
                    print "Estimated aspect for source event: Progressive"
                elif aspect_source == 2:
                    print "Estimated aspect for source event: Perfect"
                elif aspect_source == 3:
                    print "Estimated aspect for source event: Perfect Progressive"

                if aspect_target == 0:
                    print "Estimated aspect for target event: None"
                elif aspect_target == 1:
                    print "Estimated aspect for target event: Progressive"
                elif aspect_target == 2:
                    print "Estimated aspect for target event: Perfect"
                elif aspect_target == 3:
                    print "Estimated aspect for target event: Perfect Progressive"

                print "Distance between events: " + str(f.get_distance())


                print "---------------"
                print

                # Get next sentence from the next text
                go_to_next_textfile = True
Example #20
def get_sentences(number, class_id, annotations="intersected"):
    """Returns number sentences which have the relation type class_id.

    Useful if you need to get an overview over sentences with a certain temporal relation.

    """
    data = parse_XML("fables-100-temporal-dependency.xml",
                     "McIntyreLapata09Resources/fables")

    i = 0
    go_to_next_textfile = False

    for txt in data.textfiles:
        go_to_next_textfile = False

        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)

            if f.get_class() == class_id and go_to_next_textfile == False:
                # Stop if number relations are reached
                if i >= number:
                    break
                i += 1

                if rel.target.sentence == rel.source.sentence:
                    print "---------------"
                    print "Source event: " + rel.source.content
                    print "Target event: " + rel.target.content
                    print rel.target.sentence
                    print
                    print "Source Surrounding: " + rel.source.surrounding
                    print "Target Surrounding: " + rel.target.surrounding
                else:
                    print "---------------"
                    print "Source event: " + rel.source.content
                    print "Whole sentence " + rel.source.sentence
                    print "Surrounding" + rel.source.surrounding
                    print
                    print "Target event: " + rel.target.content
                    print "Whole sentence: " + rel.target.sentence
                    print "Surrounding: " + rel.target.surrounding

                tense_source = f.get_tense_source()
                tense_target = f.get_tense_target()
                if tense_source == 0:
                    print "Estimated tense for source event: None"
                elif tense_source == 1:
                    print "Estimated tense for source event: Present"
                elif tense_source == 2:
                    print "Estimated tense for source event: Past"
                elif tense_source == 3:
                    print "Estimated tense for source event: Future"

                if tense_target == 0:
                    print "Estimated tense for target event: None"
                elif tense_target == 1:
                    print "Estimated tense for target event: Present"
                elif tense_target == 2:
                    print "Estimated tense for target event: Past"
                elif tense_target == 3:
                    print "Estimated tense for target event: Future"

                aspect_source = f.get_aspect_source()
                aspect_target = f.get_aspect_target()
                if aspect_source == 0:
                    print "Estimated aspect for source event: None"
                elif aspect_source == 1:
                    print "Estimated aspect for source event: Progressive"
                elif aspect_source == 2:
                    print "Estimated aspect for source event: Perfect"
                elif aspect_source == 3:
                    print "Estimated aspect for source event: Perfect Progressive"

                if aspect_target == 0:
                    print "Estimated aspect for target event: None"
                elif aspect_target == 1:
                    print "Estimated aspect for target event: Progressive"
                elif aspect_target == 2:
                    print "Estimated aspect for target event: Perfect"
                elif aspect_target == 3:
                    print "Estimated aspect for target event: Perfect Progressive"

                print "Distance between events: " + str(f.get_distance())

                print "---------------"
                print

                # Get next sentence from the next text
                go_to_next_textfile = True
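An assumed call matching the docstring: print up to five sentence pairs whose temporal relation has class id 3 (the class ids themselves come from Feature.get_class()):

get_sentences(5, 3, annotations="intersected")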
Example #21
class SentenceSpliter():
    def __init__(self, is_training=False):
        self.classifier = None
        self.feature_model = None
        self.regex_rule = Regex()
        if not is_training:
            self.classifier = utils.load(
                os.path.join('vnspliter/model', 'model.pkl'))
            if self.classifier is None:
                print "Unable to load model!"
                exit(-1)

    def make_feature(self, file=None):
        features_list = []
        label_list = []
        self.feature_model = Feature()
        if file is None:
            return features_list, label_list
        else:
            features_list, label_list = self.feature_model.gen_feature_matrix(
                file)
        return features_list, label_list

    def split_paragraph(self, par):
        sens = []
        try:
            new_par = self.regex_rule.fn_normalize_special_mark(par)
            paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, mark3, mark4 = \
                self.regex_rule.run_regex_predict(new_par)
            features, _ = self.make_feature(paragraph)
            if not features:
                sens.append(par)
                return sens
            labels = self.classifier.predict(features)
            idx = 0
            pos_start = 0
            pos_end = 0
            for c in paragraph:
                if Feature.is_splitter_candidate(c):
                    if idx < len(labels) and labels[idx] == 1:
                        sens.append(paragraph[pos_start:pos_end + 1].strip())
                        pos_start = pos_end + 1
                    idx += 1
                pos_end += 1
            if pos_start < len(paragraph):
                sens.append(paragraph[pos_start:].strip())
            paragraph = '\n'.join(sens)
            paragraph = self.regex_rule.restore_info(paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, \
                                               mark3, mark4)
            # paragraph = self.regex_rule.normalize_special_mark.sub(u' \g<special_mark> ', paragraph)
            # paragraph = self.regex_rule.normalize_space.sub(u' ', paragraph)
            sens = paragraph.split('\n')
            return sens
        except Exception as e:
            print(traceback.format_exc())
            sens.append(par)
            return sens

    def split(self, pars):
        sens = []
        try:
            pars = pars.replace(u'\r', u'\n')
            pars = re.compile(u'\n+').sub(u'\n', pars)
            pars = pars.split('\n')
            for par in pars:
                if par.strip():
                    s = self.split_paragraph(par)
                    sens += s
            return sens
        except:
            sens.append(pars)
            return sens
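A minimal sketch for this variant, which loads the pre-trained classifier pickled at 'vnspliter/model/model.pkl' instead of training; the input text is a placeholder:

spliter = SentenceSpliter()
for sen in spliter.split(u"Doan thu nhat co hai cau. Day la cau thu hai.\nDoan thu hai."):
    print(sen)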