Example #1
    def test_check_tokenized(self):
        str_ = 'abcd efg'
        list_ = ['abcd', 'efg']
        check_tokenized(str_)
        check_tokenized(list_)
        check_not_tokenized(str_)
        check_not_tokenized(list_)
Example #2
    def test_check_tokenized(self):
        str_ = 'abcd efg'
        list_ = ['abcd', 'efg']
        x = check_tokenized(str_)
        self.assertEqual(x, ['abcd', 'efg'])
        y = check_tokenized(list_)
        self.assertEqual(y, ['abcd', 'efg'])
        z = check_not_tokenized(str_)
        self.assertEqual(z, 'abcd efg')
        s = check_not_tokenized(list_)
        self.assertEqual(s, 'abcd efg')
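The assertions above pin down the contract of the two helpers: check_tokenized always returns a token list, check_not_tokenized always returns a plain string, and input already in the target form passes through unchanged. A minimal sketch with that behavior (the bodies are inferred from the test, not taken from the library's implementation):

from typing import List, Union

def check_tokenized(text: Union[str, List[str]]) -> List[str]:
    # assumed behavior, inferred from the assertions above:
    # split a plain string on whitespace, pass a token list through unchanged
    if isinstance(text, str):
        return text.split()
    return text

def check_not_tokenized(text: Union[str, List[str]]) -> str:
    # assumed behavior: join a token list back into a single string,
    # pass a plain string through unchanged
    if isinstance(text, list):
        return ' '.join(text)
    return text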
Example #3
    def produce_content(
            self, field_name: str,
            preprocessor_list: List[InformationProcessor],
            source: RawInformationSource) -> List[FieldRepresentation]:
        """
        The content's raw data is decoded using the appropriate method (in case the data is not a string).
        Each decoded representation is added to a list, which is then returned.
        """
        representation_list: List[FieldRepresentation] = []

        for content_data in source:
            # If a preprocessor is specified, the field data must be imported
            # as a string, since preprocessors only operate on text
            if len(preprocessor_list) != 0:
                representation = SimpleField(
                    check_not_tokenized(
                        self.process_data(str(content_data[field_name]),
                                          preprocessor_list)))

            # If no preprocessor is specified, the data may be a complex
            # representation: decode which kind it is and import it accordingly
            else:
                representation = self.__decode_field_data(
                    str(content_data[field_name]))

            representation_list.append(representation)

        return representation_list
Example #4
    def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Produces the field content for this representation: a
        bag of features whose keys are BabelNet synset ids and
        whose values are the global scores of the synsets.

        Args:
            field_representation_name (str): Name of the field representation
            field_data: Text that will be linked to BabelNet

        Returns:
            feature_bag (FeaturesBagField)
        """
        field_data = check_not_tokenized(field_data)

        self.__babel_client.babelfy(field_data)
        feature_bag = FeaturesBagField(field_representation_name)
        try:
            # entities may be unavailable (e.g. BabelFy limit reached): skip in that case
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag.append_feature(entity['babelSynsetID'],
                                               entity['globalScore'])
        except AttributeError:
            pass

        return feature_bag
Example #5
    def dataset_refactor(self, information_source: RawInformationSource,
                         id_field_names: str):
        """
        Creates a corpus structure, a list of strings where each string is a document.
        Then calls TfidfVectorizer on this collection, obtaining the term-document
        tf-idf matrix; the corpus is then deleted

        Args:
            information_source (RawInformationSource): Source for the raw data
            id_field_names: names of the fields that compose the id
        """

        field_name = self.field_need_refactor
        preprocessor_list = self.processor_list

        for raw_content in information_source:
            processed_field_data = raw_content[field_name]
            for preprocessor in preprocessor_list:
                processed_field_data = preprocessor.process(
                    processed_field_data)

            processed_field_data = check_not_tokenized(processed_field_data)
            content_id = id_merger(raw_content, id_field_names)
            self.__matching[content_id] = len(self.__corpus)
            self.__corpus.append(processed_field_data)

        tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
        self.__tfidf_matrix = tf_vectorizer.fit_transform(self.__corpus)

        del self.__corpus

        self.__feature_names = tf_vectorizer.get_feature_names()
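Stripped of the source iteration and the id bookkeeping, the refactoring reduces to the standard scikit-learn flow: one string per document, fit a TfidfVectorizer, keep the sparse matrix and the vocabulary. A minimal standalone sketch of that flow (the two sample documents are made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the first document', 'the second document']    # one string per content
tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = tf_vectorizer.fit_transform(corpus)         # shape: (n_documents, n_terms)
feature_names = tf_vectorizer.get_feature_names_out()      # get_feature_names() on older scikit-learn, as above
print(tfidf_matrix.shape, list(feature_names))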
Example #6
    def produce_single_repr(
            self, field_data: Union[List[str], str]) -> EmbeddingField:
        """
        Produces a single representation with document granularity by combining the embedding vectors
        in order to create an embedding matrix that represents the document
        """
        doc_matrix = self.embedding_source.load(
            self.process_data_granularity(check_not_tokenized(field_data)))
        return EmbeddingField(self.combining_technique.combine(doc_matrix))
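The combining technique is referenced but not shown here; a common choice is a centroid, i.e. averaging the word vectors of the document matrix into a single document vector. A minimal sketch of such a technique (the Centroid class and its numpy implementation are assumptions; only the combine(doc_matrix) call appears in the example):

import numpy as np

class Centroid:
    """Assumed combining technique: average the rows of a (n_words, embedding_dim) matrix."""

    def combine(self, embedding_matrix: np.ndarray) -> np.ndarray:
        # one row per word in, a single vector for the whole document out
        return np.mean(embedding_matrix, axis=0)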
Example #7
    def produce_single_repr(
            self, field_data: Union[List[str], str]) -> FeaturesBagField:
        """
        Produces a bag of features whose key is a wordnet synset and whose value is the frequency of the synset in the
        field data text
        """

        field_data = check_not_tokenized(field_data)

        synsets = disambiguate(field_data)
        synsets = [synset for word, synset in synsets if synset is not None]

        return FeaturesBagField(Counter(synsets))
Example #8
    def get_properties(self,
                       raw_source: RawInformationSource) -> List[EntitiesProp]:
        """
        Produces a list of EntitiesProp objects, one for every raw content in the raw source.

        An EntitiesProp object is basically a dict where the keys are the linked entities (since there can be multiple
        entities in a field) and the values are the properties retrieved from BabelPy for that entity.
        EXAMPLE:
            properties_list = [EntitiesProp(), EntitiesProp(), ...]

            EntitiesProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...}, 'Nolan': {'babelSynsetID': ..., ...}, ...}

        """
        properties_list = []
        logger.info("Doing Entity Linking with BabelFy")
        for raw_content in progbar(raw_source,
                                   max_value=len(list(raw_source))):
            data_to_disambiguate = check_not_tokenized(
                raw_content[self.__field_to_link])

            self.__babel_client.babelfy(data_to_disambiguate)

            properties_content = {}
            try:
                if self.__babel_client.merged_entities is not None:

                    for entity in self.__babel_client.merged_entities:
                        properties_entity = {
                            'babelSynsetID': '',
                            'DBPediaURL': '',
                            'BabelNetURL': '',
                            'score': '',
                            'coherenceScore': '',
                            'globalScore': '',
                            'source': ''
                        }

                        for key in properties_entity:
                            if entity.get(key) is not None:
                                properties_entity[key] = entity[key]

                        properties_content[entity['text']] = properties_entity

                properties_list.append(EntitiesProp(properties_content))
            except AttributeError:
                raise AttributeError(
                    "BabelFy limit reached! Insert an api key or change it if you inserted one!"
                )

        return properties_list
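Following the docstring's EXAMPLE, a caller can walk the returned list and read the per-entity properties from each object's value dict. A short usage sketch (extractor and raw_source are placeholders, and it assumes EntitiesProp exposes its dict through the value attribute exactly as the EXAMPLE shows):

properties_list = extractor.get_properties(raw_source)    # placeholders for the objects above
for entities_prop in properties_list:
    for entity_text, props in entities_prop.value.items():
        print(entity_text, props['babelSynsetID'], props['globalScore'])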
Example #9
    def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Produces a bag of features whose key is a wordnet synset
        and whose value is the frequency of the synset in the
        field data text
        """

        field_data = check_not_tokenized(field_data)

        synsets = disambiguate(field_data)
        synsets = [synset for word, synset in synsets if synset is not None]

        return FeaturesBagField(field_representation_name, Counter(synsets))
Example #10
    def produce_content(self, field_name: str, pipeline_id, field_data, indexer: IndexInterface):
        """
        Saves the field data as a document field using the given indexer;
        the resulting index can be used by an index query recommender

        Args:
            indexer: Index in which new field will be created
            field_data: Data that will be stored in the index
            pipeline_id: Second part of the field name in the indexer index,
                complete field_name is field_name + pipeline_id
            field_name (str): First part of the field name in the indexer index,
                complete field_name is field_name + pipeline_id

        """
        field_data = check_not_tokenized(field_data)
        indexer.new_searching_field(field_name + pipeline_id, field_data)
Example #11
    def process(self, field_data) -> List[str]:
        field_data = check_not_tokenized(field_data)
        if self.strip_multiple_whitespaces:
            field_data = self.__strip_multiple_whitespaces_operation(
                field_data)
        if self.url_tagging:
            field_data = self.__url_tagging_operation(field_data)
        field_data = self.__tokenization_operation(field_data)
        if self.stopwords_removal:
            field_data = self.__stopwords_removal_operation(field_data)
        if self.lemmatization:
            field_data = self.__lemmatization_operation(field_data)
        if self.stemming:
            field_data = self.__stemming_operation(field_data)
        if self.named_entity_recognition:
            field_data = self.__named_entity_recognition_operation(field_data)
        return self.__compact_tokens(field_data)
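The pipeline above encodes a fixed ordering: string-level clean-up (whitespace stripping, URL tagging) first, then tokenization, then token-level steps (stopword removal, lemmatization, stemming, NER). A self-contained sketch of the same ordering using NLTK; the concrete calls here are assumptions for illustration, not the operations the library actually wraps:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# requires the 'punkt', 'stopwords' and 'wordnet' NLTK resources (nltk.download(...))
def simple_process(field_data: str) -> list:
    field_data = re.sub(r'\s+', ' ', field_data)                # strip multiple whitespaces
    field_data = re.sub(r'https?://\S+', '<URL>', field_data)   # tag URLs
    tokens = nltk.word_tokenize(field_data)                     # tokenization happens once, unconditionally
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t.lower() not in stop]       # stopwords removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]          # lemmatization
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(t) for t in tokens]                    # stemming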
Example #12
    def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
        """
        Produces a bag of features whose keys are BabelNet synset ids and whose values are the global scores of the synsets
        """
        field_data = check_not_tokenized(field_data)

        self.__babel_client.babelfy(field_data)
        feature_bag = {}
        try:
            # entities may be unavailable (e.g. BabelFy limit reached): skip in that case
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag[entity['babelSynsetID']] = entity['globalScore']
        except AttributeError:
            pass

        return FeaturesBagField(feature_bag)
Example #13
    def produce_content(self, field_name: str,
                        preprocessor_list: List[InformationProcessor],
                        source: RawInformationSource) -> List[SimpleField]:
        """
        The contents' raw data in the given field_name is extracted and stored in a SimpleField object.
        The SimpleField objects created are stored in a list which is then returned.
        No further operations are done on the data in order to keep it in the original form.
        Because of that, the preprocessor_list is ignored and not used by this technique
        """

        representation_list: List[SimpleField] = []

        for content_data in source:
            processed_data = self.process_data(content_data[field_name],
                                               preprocessor_list)
            representation_list.append(
                SimpleField(self.__dtype(check_not_tokenized(processed_data))))

        return representation_list
Example #14
    def dataset_refactor(self, information_source: RawInformationSource,
                         field_name: str,
                         preprocessor_list: List[InformationProcessor]) -> int:
        """
        Creates a corpus structure, a list of strings where each string is a document.
        Then calls TfidfVectorizer on this collection, obtaining the term-document tf-idf matrix; the corpus is then deleted
        """
        self.__corpus = []

        for raw_content in information_source:
            processed_field_data = self.process_data(raw_content[field_name],
                                                     preprocessor_list)

            processed_field_data = check_not_tokenized(processed_field_data)
            self.__corpus.append(processed_field_data)

        tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
        self.__tfidf_matrix = tf_vectorizer.fit_transform(self.__corpus)

        del self.__corpus

        self.__feature_names = tf_vectorizer.get_feature_names()

        return self.__tfidf_matrix.shape[0]
Example #15
    def process_data_granularity(self, doc_data: str) -> str:
        return check_not_tokenized(doc_data)
Example #16
    def process_data_granularity(
            self, field_data: Union[List[str], str]) -> List[str]:
        return [check_not_tokenized(field_data)]
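With the helper semantics established in Example #2, the two granularity variants differ only in shape: Example #15 returns the document as one plain string, while this variant wraps that same string in a one-element list. A minimal illustration using only the helper (the results follow from the assertions in Example #2):

field_data = ['abcd', 'efg']
check_not_tokenized(field_data)       # 'abcd efg'   -> one plain string (Example #15)
[check_not_tokenized(field_data)]     # ['abcd efg'] -> the same string wrapped in a list (Example #16)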