def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Build the bag-of-features representation of a text field by linking
        it to BabelNet.

        Each entity reported by the babel client contributes one feature:
        the key is the entity's 'babelSynsetID' and the value is its
        'globalScore'.

        Args:
            field_representation_name (str): Name of the field representation
            field_data: Text that will be linked to BabelNet

        Returns:
            feature_bag (FeaturesBagField): the collected features; empty when
                the client exposes no entities
        """
        text = check_not_tokenized(field_data)
        self.__babel_client.babelfy(text)

        synset_bag = FeaturesBagField(field_representation_name)
        try:
            if self.__babel_client.entities is None:
                return synset_bag
            for linked_entity in self.__babel_client.entities:
                synset_bag.append_feature(linked_entity['babelSynsetID'],
                                          linked_entity['globalScore'])
        except AttributeError:
            # Best-effort: an AttributeError while reading or walking the
            # entities just ends the collection, keeping what was gathered.
            pass

        return synset_bag
 def test_get_feature_dict(self):
     """The `value` of a FeaturesBagField must hold every appended feature."""
     expected_features = {
         'synsetID_1': 'global_score_1',
         'synsetID_2': 'global_score_2'
     }

     bag = FeaturesBagField('repr_name')
     for synset_id, global_score in expected_features.items():
         bag.append_feature(synset_id, global_score)

     self.assertEqual(bag.value, expected_features,
                      "Error in the features_dict")
Beispiel #3
0
 def test_load_serialize(self):
     """
     Serialize a Content into the current directory, then check that the
     object unpickled from the produced '001.xz' archive equals the original.
     """
     bag = FeaturesBagField("test")
     bag.append_feature("test_key", "test_value")

     field = ContentField("test_field", "0000")
     field.append(str(0), bag)

     original = Content("001")
     original.append("test_field", field)

     try:
         original.serialize(".")
     except FileNotFoundError:
         self.fail("Could not create file!")

     # The archive was written by this very test, so unpickling it is safe.
     with lzma.open('001.xz', 'r') as archive:
         restored = pickle.load(archive)
     self.assertEqual(original, restored)
Beispiel #4
0
    def produce_content(self, field_representation_name: str, content_id: str,
                        field_name: str):
        """
        Look up, in the pre-computed word - document matrix, the tf-idf value
        of every term with a non-zero entry in the row matching content_id.

        Args:
            field_representation_name (str): Name of the field representation
            content_id (str): Id of the content whose terms' tf-idf are extracted
            field_name (str): Name of the field to consider (not read here)

        Returns:
            (FeaturesBag): <term, tf-idf>
        """
        row = self.__matching[content_id]

        # Second element of nonzero() holds the column indices of the
        # non-zero entries in the selected row.
        nonzero_columns = self.__tfidf_matrix[row, :].nonzero()[1]

        term_weights = {
            self.__feature_names[column]: self.__tfidf_matrix[row, column]
            for column in nonzero_columns
        }

        return FeaturesBagField(field_representation_name, term_weights)
Beispiel #5
0
    def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Build a bag of features that counts the synsets found in the text.

        The text is disambiguated into (word, synset) pairs; pairs without a
        synset are discarded and the remaining synsets are counted.

        Args:
            field_representation_name (str): Name of the field representation
            field_data: Text to disambiguate

        Returns:
            (FeaturesBagField): <synset, number of occurrences in the text>
        """
        text = check_not_tokenized(field_data)

        synset_counts = Counter(
            sense for _, sense in disambiguate(text) if sense is not None)

        return FeaturesBagField(field_representation_name, synset_counts)
Beispiel #6
0
    def test_append_remove(self):
        """
        Appending a second field to a Content and then removing it must leave
        the same field list as a Content that only ever had the first field.
        """
        content_field_repr = FeaturesBagField("test")
        content_field_repr.append_feature("test_key", "test_value")
        content_field = ContentField("test_field", "0000")
        content_field.append(str(0), content_field_repr)
        content1 = Content("001")
        content1.append("test_field", content_field)

        content2 = Content("002")
        content2.append("test_field", content_field)
        content_field_repr = FeaturesBagField("test")
        content_field_repr.append_feature("test_key", "test_value")
        content_field2 = ContentField("test_field2", "0000")
        content_field2.append(str(0), content_field_repr)
        content2.append("test_field2", content_field2)
        content2.remove("test_field2")

        # assertTrue(a, b) would treat b as the failure message and only check
        # the truthiness of a; assertEqual actually compares the field lists.
        self.assertEqual(content1.get_field_list(), content2.get_field_list())
Beispiel #7
0
    def __decode_field_data(self, field: ContentField, field_name: str,
                            field_data: str):
        """
        Decode a serialized field value and append the matching
        representation to `field` under the key `field_name`.

        The text is parsed as JSON (retrying with single quotes turned into
        double quotes) and the representation is chosen from the result:

        - list that converts to a numeric array -> EmbeddingField
        - non-empty dict whose values are all int/float -> FeaturesBagField
        - JSON string -> StringField holding the decoded string
        - anything else (non-JSON text, non-numeric list, other dict,
          JSON number/boolean/null) -> StringField holding the original text

        Args:
            field (ContentField): field that receives the decoded representation
            field_name (str): name given to the representation and used as
                the key when appending it
            field_data (str): serialized value to decode
        """
        # Decode string into dict or list
        try:
            loaded = json.loads(field_data)
        except json.JSONDecodeError:
            try:
                # in case the dict is {'foo': 1} json expects {"foo": 1}
                loaded = json.loads(field_data.replace("'", '"'))
            except json.JSONDecodeError:
                # Not JSON at all: keep the ORIGINAL text. Storing the
                # quote-swapped copy would corrupt apostrophes in plain text.
                loaded = field_data

        result = None

        # if the decoded is a list, maybe it is an EmbeddingField repr
        if isinstance(loaded, list):
            try:
                arr = np.array(loaded)
            except ValueError:
                # ragged nested lists cannot form an array on recent NumPy
                arr = None
            # if the array has only numbers then we consider it as a dense
            # vector, else we fall back to the raw string below
            if arr is not None and issubclass(arr.dtype.type, np.number):
                result = EmbeddingField(field_name, arr)

        # if the decoded is a dict, maybe it is a FeaturesBagField
        elif isinstance(loaded, dict):
            # a non-empty dict whose values are all numbers is a bag of
            # features, else we fall back to the raw string below
            if loaded and all(isinstance(value, (float, int))
                              for value in loaded.values()):
                result = FeaturesBagField(field_name, loaded)

        # if the decoded is a string, then it is a StringField
        elif isinstance(loaded, str):
            result = StringField(field_name, loaded)

        # Everything that matched no representation above (including JSON
        # scalars such as 42, true or null, which used to be dropped
        # silently) is stored verbatim as a string.
        if result is None:
            result = StringField(field_name, field_data)

        field.append(field_name, result)
Beispiel #8
0
    def produce_content(self, field_representation_name: str, content_id: str,
                        field_name: str) -> FeaturesBagField:
        """
        Wrap the tf-idf data that the index returns for a field of a content.

        Args:
            field_representation_name (str): Name of the field representation
            content_id (str): Id of the content passed to the index
            field_name (str): Name of the field passed to the index

        Returns:
            (FeaturesBagField): bag built from the result of
                `get_tf_idf(field_name, content_id)` on the index
        """
        tf_idf = self.__index.get_tf_idf(field_name, content_id)
        return FeaturesBagField(field_representation_name, tf_idf)
 def test_append_get_feature(self):
     """A feature appended to the bag must be returned by get_feature."""
     bag = FeaturesBagField('repr_name')
     bag.append_feature('synsetID', 'global_score')

     stored_score = bag.get_feature('synsetID')
     self.assertEqual(stored_score, 'global_score',
                      "Error in the features_dict")