Example no. 1
0
    def load_integer_features(self, data):
        """Map each POS tag in *data* to its integer code.

        Args:
            data: Parsed dataset exposing a ``textfiles`` iterable; each
                textfile yields ``relations`` after the relation set is
                computed below.

        Returns:
            list: One integer-encoded POS feature array per relation.
        """
        integer_features = []

        for txt in data.textfiles:
            # Choose which relation set to work on, per configuration.
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Standardize because f.get_pos_*() doesn't have to be of
                # length self.number_tags_per_feature / 2.
                standardized_pos_target = self.standardize_sub_pos_feature(f.get_pos_target())
                standardized_pos_source = self.standardize_sub_pos_feature(f.get_pos_source())
                # Concatenate the target and source POS tag arrays, then
                # transform the combined array into integers for the encoder.
                pos_feature = np.concatenate((standardized_pos_target, standardized_pos_source))
                integer_features.append(self.pos_tags_to_integers(pos_feature))

        return integer_features
Example no. 2
0
    def load_integer_features(self, data):
        """Assign an integer code to every POS tag found in *data*.

        Returns a list with one integer-encoded POS array per relation.
        """
        encoded = []

        for textfile in data.textfiles:
            # Work on the configured relation set.
            if self.annotations == "intersected":
                textfile.compute_intersection_relations()
            elif self.annotations == "union":
                textfile.compute_union_relations()

            for relation in textfile.relations:
                feat = Feature(relation)
                # The raw tag lists may differ from
                # self.number_tags_per_feature / 2 in length, so pad/trim
                # them to a fixed size before encoding.
                target_tags = self.standardize_sub_pos_feature(feat.get_pos_target())
                source_tags = self.standardize_sub_pos_feature(feat.get_pos_source())
                # Join target + source tags and translate them to integers.
                combined = np.concatenate((target_tags, source_tags))
                encoded.append(self.pos_tags_to_integers(combined))

        return encoded
Example no. 3
0
    def load_pos_tags(self, data):
        """Load all POS tags used in the pos_surrounding area around an event.

        Args:
            data: Parsed dataset exposing a ``textfiles`` iterable.

        Returns:
            numpy.ndarray: The unique POS tags found in *data*, with a
            trailing ``'BL'`` blank tag used to pad short features.
        """
        collected = []

        for txt in data.textfiles:
            # Choose which relation set to work on, per configuration.
            if self.annotations == "union":
                txt.compute_union_relations()
            elif self.annotations == "intersected":
                txt.compute_intersection_relations()

            for rel in txt.relations:
                f = Feature(rel)
                # Collect tag arrays from both events of the relation.
                collected.append(f.get_pos_target())
                collected.append(f.get_pos_source())

        # Concatenate once at the end instead of repeatedly inside the loop,
        # which was accidentally quadratic in the number of relations.
        if collected:
            pos_tags = np.unique(np.concatenate(collected))
        else:
            pos_tags = np.array([])

        # Append a blank tag which will be used for filling up features
        # which don't have enough elements.
        pos_tags = np.append(pos_tags, 'BL')
        return pos_tags
Example no. 4
0
    def load_pos_tags(self, data):
        """Gather every POS tag appearing in the pos_surrounding window of an event.

        Returns the unique tags plus a trailing 'BL' blank/filler tag.
        """
        tags = np.array([])

        for textfile in data.textfiles:
            # Work on the configured relation set.
            if textfile and self.annotations == "intersected":
                textfile.compute_intersection_relations()
            elif self.annotations == "union":
                textfile.compute_union_relations()

            for relation in textfile.relations:
                feat = Feature(relation)
                # Accumulate the tags of both events of this relation.
                tags = np.concatenate((tags, feat.get_pos_target(), feat.get_pos_source()))

        # Deduplicate, then append the blank tag 'BL' used to pad
        # features that don't have enough elements.
        return np.append(np.unique(tags), 'BL')
def parse_Features(data, new=False, annotations="union",
                   features=("pos", "stem", "aspect", "tense", "distance",
                             "similarity", "polarity", "modality"),
                   distance=False):
    """Extract the features from the dataset and return them with their classes.

    Args:
        data (list): The parsed data from fables-100-temporal-dependency.xml.
        new (bool): With new=True a new calculation of Pos() and Stem() is
            enforced. Otherwise they are loaded from the "save.p" cache file
            when it exists.
        annotations (str): Looking at all relations ("union") or at all
            relations in common between the annotators ("intersected").
        features (sequence): Determines which features should be activated.
            Possible values: "pos", "stem", "aspect", "tense", "distance",
            "similarity", "polarity", "modality".
        distance (bool): If True, parse_Features() also returns distance
            information for the data (needed for evaluation).

    Returns:
        tuple: (X, y), or (X, y, distance_diff) when distance is True.
    """
    # Only compute pos and stem if a requested feature needs them.
    if "pos" in features or "stem" in features:
        # Fixed: the existence check previously tested "set.p" while the
        # cache is written to and read from "save.p", so it was never reused.
        if new or not os.path.isfile("save.p"):
            pos = Pos(data, 6, annotations)
            stem = Stem(data, annotations)
            # Context managers close the file handles that were previously leaked.
            with open("save.p", "wb") as cache:
                pickle.dump((pos, stem), cache)
        else:
            # NOTE: pickle is only safe for trusted, locally written files.
            with open("save.p", "rb") as cache:
                pos, stem = pickle.load(cache)

    if distance:
        distance_diff = []

    X = []
    y = np.array([], dtype=int)

    for txt in data.textfiles:
        # Union or intersected relations?
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)

            feature = []

            # Make polarity feature
            if "polarity" in features:
                feature = np.concatenate((feature, [f.get_polarity()]))

            # Make distance feature
            if "distance" in features:
                feature = np.concatenate((feature, f.get_distance()))

            # Make POS feature (one-hot encoding of target and source tags)
            if "pos" in features:
                pos_feature = pos.transform(f.get_pos_target(), f.get_pos_source())
                feature = np.concatenate((feature, pos_feature.toarray()[0]))

            # Make Stem feature
            if "stem" in features:
                stem_feature = stem.transform(f.get_stem_source(), f.get_stem_target())
                feature = np.concatenate((feature, stem_feature[0]))

            # Make similarity feature
            if "similarity" in features:
                feature = np.concatenate((feature, [f.get_similarity_of_words()]))

            # Make modality feature
            if "modality" in features:
                feature = np.concatenate((feature, [f.get_modality()]))

            # Make aspect feature
            if "aspect" in features:
                feature = np.concatenate((feature, f.get_aspect()))

            # Make tense feature
            if "tense" in features:
                feature = np.concatenate((feature, f.get_tense()))

            # Append feature vector and class label in lockstep.
            X.append(feature)
            y = np.append(y, [f.get_class()])

            # Append distance information if needed
            if distance:
                distance_diff.append(f.get_distance_diff())

    if distance:
        return (X, y, distance_diff)
    return (X, y)