def test_check_tokenized(self):
    str_ = 'abcd efg'
    list_ = ['abcd', 'efg']

    check_tokenized(str_)
    check_tokenized(list_)
    check_not_tokenized(str_)
    check_not_tokenized(list_)
def test_check_tokenized(self):
    str_ = 'abcd efg'
    list_ = ['abcd', 'efg']

    x = check_tokenized(str_)
    self.assertEqual(x, ['abcd', 'efg'])

    y = check_tokenized(list_)
    self.assertEqual(y, ['abcd', 'efg'])

    z = check_not_tokenized(str_)
    self.assertEqual(z, 'abcd efg')

    s = check_not_tokenized(list_)
    self.assertEqual(s, 'abcd efg')
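The two tests above pin down the helpers' contract: check_tokenized always returns a token list and check_not_tokenized always returns a single string, whatever form the input takes. A minimal sketch of how such helpers could be implemented, purely to illustrate that contract (the real functions may differ, e.g. by validating their input):

from typing import List, Union


def check_tokenized(text: Union[List[str], str]) -> List[str]:
    # Illustrative sketch: tokenize a plain string on whitespace,
    # pass an already-tokenized list through unchanged.
    if isinstance(text, str):
        return text.split()
    return text


def check_not_tokenized(text: Union[List[str], str]) -> str:
    # Illustrative sketch: join a token list back into one
    # whitespace-separated string, pass a plain string through.
    if isinstance(text, list):
        return ' '.join(text)
    return text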
def produce_content(self, field_name: str,
                    preprocessor_list: List[InformationProcessor],
                    source: RawInformationSource) -> List[FieldRepresentation]:
    """
    The content's raw data is decoded using the appropriate method (in case the data
    is not a string). Each decoded representation is added to a list which is then returned
    """
    representation_list: List[FieldRepresentation] = []

    for content_data in source:
        # if a preprocessor is specified, the field data must be imported
        # as a string: there's no other option
        if len(preprocessor_list) != 0:
            representation = SimpleField(
                check_not_tokenized(
                    self.process_data(str(content_data[field_name]), preprocessor_list)))
        # if no preprocessor is specified, the field may hold a complex representation:
        # decode what kind of complex representation it is and import it accordingly
        else:
            representation = self.__decode_field_data(str(content_data[field_name]))

        representation_list.append(representation)

    return representation_list
def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
    """
    Produces the field content for this representation: a bag of features whose keys
    are BabelNet synset ids and whose values are the global scores of the synsets

    Args:
        field_representation_name (str): Name of the field representation
        field_data: Text that will be linked to BabelNet

    Returns:
        feature_bag (FeaturesBagField)
    """
    field_data = check_not_tokenized(field_data)
    self.__babel_client.babelfy(field_data)
    feature_bag = FeaturesBagField(field_representation_name)

    try:
        if self.__babel_client.entities is not None:
            for entity in self.__babel_client.entities:
                feature_bag.append_feature(entity['babelSynsetID'], entity['globalScore'])
    except AttributeError:
        pass

    return feature_bag
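For context, __babel_client appears to be a BabelPy client; a hedged setup sketch, assuming BabelPy's BabelfyClient and its 'lang' parameter (the api key is a placeholder):

from babelpy.babelfy import BabelfyClient

babel_client = BabelfyClient('YOUR_API_KEY', {'lang': 'EN'})  # assumed BabelPy usage
babel_client.babelfy('Christopher Nolan directed Inception')
# entities may be missing if the request failed, hence the AttributeError guard above
for entity in babel_client.entities:
    print(entity['babelSynsetID'], entity['globalScore'])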
def dataset_refactor(self, information_source: RawInformationSource, id_field_names: str):
    """
    Creates a corpus structure, a list of strings where each string is a document.
    Then calls TfidfVectorizer on this collection, obtaining the term-document
    tf-idf matrix; the corpus is then deleted

    Args:
        information_source (RawInformationSource): Source for the raw data
        id_field_names: names of the fields that compose the id
    """
    field_name = self.field_need_refactor
    preprocessor_list = self.processor_list

    for raw_content in information_source:
        processed_field_data = raw_content[field_name]
        for preprocessor in preprocessor_list:
            processed_field_data = preprocessor.process(processed_field_data)

        processed_field_data = check_not_tokenized(processed_field_data)
        content_id = id_merger(raw_content, id_field_names)
        self.__matching[content_id] = len(self.__corpus)
        self.__corpus.append(processed_field_data)

    tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
    self.__tfidf_matrix = tf_vectorizer.fit_transform(self.__corpus)
    del self.__corpus
    self.__feature_names = tf_vectorizer.get_feature_names()
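For a concrete sense of what dataset_refactor builds, here is the same TfidfVectorizer pattern on a toy corpus (note that get_feature_names(), used above, was removed in scikit-learn 1.2 in favor of get_feature_names_out()):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat', 'the dog sat', 'the cat ran']
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = vectorizer.fit_transform(corpus)  # sparse matrix, one row per document

print(tfidf_matrix.shape)                  # (3, 5): 3 documents, 5 distinct terms
print(vectorizer.get_feature_names_out())  # ['cat' 'dog' 'ran' 'sat' 'the']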
def produce_single_repr(self, field_data: Union[List[str], str]) -> EmbeddingField:
    """
    Produces a single representation with document granularity by combining the
    embedding vectors in order to create an embedding matrix that represents the document
    """
    doc_matrix = self.embedding_source.load(
        self.process_data_granularity(check_not_tokenized(field_data)))
    return EmbeddingField(self.combining_technique.combine(doc_matrix))
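The combining technique collapses the per-word embedding matrix into a single document vector; a minimal sketch of a centroid-style combiner (a hypothetical class, standing in for whatever combining_technique is configured):

import numpy as np


class Centroid:
    def combine(self, embedding_matrix: np.ndarray) -> np.ndarray:
        # Average the word vectors row-wise: (n_words, dim) -> (dim,)
        return embedding_matrix.mean(axis=0)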
def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are WordNet synsets and whose values are
    the frequencies of the synsets in the field data text
    """
    field_data = check_not_tokenized(field_data)
    synsets = disambiguate(field_data)
    synsets = [synset for word, synset in synsets if synset is not None]
    return FeaturesBagField(Counter(synsets))
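For context, disambiguate returns (word, synset) pairs with None for words it could not disambiguate, which is why the comprehension filters; a quick illustration, assuming disambiguate is pywsd's (which this call pattern matches):

from collections import Counter
from pywsd import disambiguate

pairs = disambiguate('the bank approved my loan')
# e.g. [('the', None), ('bank', Synset('bank.n.01')), ...]
synsets = [synset for word, synset in pairs if synset is not None]
print(Counter(synsets))  # synset -> frequency, as stored in the FeaturesBagField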
def get_properties(self, raw_source: RawInformationSource) -> List[EntitiesProp]:
    """
    Produces a list of EntitiesProp objects, one for every raw content in the raw source.

    An EntitiesProp object is basically a dict where the keys are the linked entities
    (since there can be multiple entities in a field) and the values are the properties
    retrieved from BabelPy for that entity.

    EXAMPLE:
        properties_list = [EntitiesProp(), EntitiesProp(), ...]
        EntitiesProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},
                               'Nolan': {'babelSynsetID': ..., ...},
                               ...}
    """
    properties_list = []
    logger.info("Doing Entity Linking with BabelFy")
    for raw_content in progbar(raw_source, max_value=len(list(raw_source))):
        data_to_disambiguate = check_not_tokenized(raw_content[self.__field_to_link])
        self.__babel_client.babelfy(data_to_disambiguate)

        properties_content = {}
        try:
            if self.__babel_client.merged_entities is not None:
                for entity in self.__babel_client.merged_entities:
                    properties_entity = {
                        'babelSynsetID': '',
                        'DBPediaURL': '',
                        'BabelNetURL': '',
                        'score': '',
                        'coherenceScore': '',
                        'globalScore': '',
                        'source': ''
                    }
                    for key in properties_entity:
                        if entity.get(key) is not None:
                            properties_entity[key] = entity[key]
                    properties_content[entity['text']] = properties_entity

            properties_list.append(EntitiesProp(properties_content))
        except AttributeError:
            raise AttributeError(
                "BabelFy limit reached! Insert an api key or change it if you inserted one!")

    return properties_list
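A caller would consume the result roughly like this (assuming EntitiesProp exposes its dict through a value attribute, as the docstring's EntitiesProp.value notation suggests; the linker instance is hypothetical):

properties_list = linker.get_properties(raw_source)  # hypothetical linker instance
for entities_prop in properties_list:
    for entity_text, props in entities_prop.value.items():
        print(entity_text, props['babelSynsetID'], props['globalScore'])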
def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are WordNet synsets and whose values are
    the frequencies of the synsets in the field data text
    """
    field_data = check_not_tokenized(field_data)
    synsets = disambiguate(field_data)
    synsets = [synset for word, synset in synsets if synset is not None]
    return FeaturesBagField(field_representation_name, Counter(synsets))
def produce_content(self, field_name: str, pipeline_id, field_data, indexer: IndexInterface):
    """
    Saves the field data as a document field using the given indexer; the resulting
    field can be used by an index query recommender

    Args:
        indexer: Index in which the new field will be created
        field_data: Data that will be stored in the index
        pipeline_id: Second part of the field name in the indexer index,
            the complete field name is field_name + pipeline_id
        field_name (str): First part of the field name in the indexer index,
            the complete field name is field_name + pipeline_id
    """
    field_data = check_not_tokenized(field_data)
    indexer.new_searching_field(field_name + pipeline_id, field_data)
def process(self, field_data) -> List[str]:
    field_data = check_not_tokenized(field_data)

    # string-level operations, applied before tokenization
    if self.strip_multiple_whitespaces:
        field_data = self.__strip_multiple_whitespaces_operation(field_data)
    if self.url_tagging:
        field_data = self.__url_tagging_operation(field_data)

    field_data = self.__tokenization_operation(field_data)

    # token-level operations, applied to the token list
    if self.stopwords_removal:
        field_data = self.__stopwords_removal_operation(field_data)
    if self.lemmatization:
        field_data = self.__lemmatization_operation(field_data)
    if self.stemming:
        field_data = self.__stemming_operation(field_data)
    if self.named_entity_recognition:
        field_data = self.__named_entity_recognition_operation(field_data)

    return self.__compact_tokens(field_data)
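A usage sketch of the pipeline above, assuming a concrete processor class with these flags (the class name and the output shown are hypothetical):

processor = NLTK(stopwords_removal=True, stemming=True)  # hypothetical concrete class
tokens = processor.process('The cats are running fast')
# e.g. ['cat', 'run', 'fast'] -- a token list, matching the List[str] return type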
def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are BabelNet synset ids and whose values
    are the global scores of the synsets
    """
    field_data = check_not_tokenized(field_data)
    self.__babel_client.babelfy(field_data)

    feature_bag = {}
    try:
        if self.__babel_client.entities is not None:
            for entity in self.__babel_client.entities:
                feature_bag[entity['babelSynsetID']] = entity['globalScore']
    except AttributeError:
        pass

    return FeaturesBagField(feature_bag)
def produce_content(self, field_name: str, preprocessor_list: List[InformationProcessor],
                    source: RawInformationSource) -> List[SimpleField]:
    """
    The contents' raw data in the given field_name is extracted and stored in a
    SimpleField object. The SimpleField objects created are stored in a list which
    is then returned. No further operations are done on the data in order to keep
    it in its original form. Because of that, the preprocessor_list is ignored and
    not used by this technique
    """
    representation_list: List[SimpleField] = []

    for content_data in source:
        processed_data = self.process_data(content_data[field_name], preprocessor_list)
        representation_list.append(
            SimpleField(self.__dtype(check_not_tokenized(processed_data))))

    return representation_list
def dataset_refactor(self, information_source: RawInformationSource, field_name: str,
                     preprocessor_list: List[InformationProcessor]) -> int:
    """
    Creates a corpus structure, a list of strings where each string is a document.
    Then calls TfidfVectorizer on this collection, obtaining the term-document
    tf-idf matrix; the corpus is then deleted
    """
    self.__corpus = []

    for raw_content in information_source:
        processed_field_data = self.process_data(raw_content[field_name], preprocessor_list)
        processed_field_data = check_not_tokenized(processed_field_data)
        self.__corpus.append(processed_field_data)

    tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
    self.__tfidf_matrix = tf_vectorizer.fit_transform(self.__corpus)
    del self.__corpus
    self.__feature_names = tf_vectorizer.get_feature_names()

    return self.__tfidf_matrix.shape[0]
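Once the matrix is built, the tf-idf weights of a single document can be read back by pairing the non-zero entries of its sparse row with the stored feature names; a hedged helper sketch (names are illustrative, not part of the class):

def weights_for_document(tfidf_matrix, feature_names, i):
    # Hypothetical helper: map row i of the sparse matrix to {term: weight}.
    row = tfidf_matrix.getrow(i).tocoo()
    return {feature_names[j]: weight for j, weight in zip(row.col, row.data)}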
def process_data_granularity(self, doc_data: str) -> str:
    return check_not_tokenized(doc_data)
def process_data_granularity(self, field_data: Union[List[str], str]) -> List[str]:
    return [check_not_tokenized(field_data)]
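Side by side, the two granularity hooks differ only in shape: the previous variant returns the whole field as a single string, while this one wraps that string in a one-element list so downstream code can iterate over units to embed. Roughly:

# document granularity (previous variant): 'abcd efg'       ->  'abcd efg'
# list-wrapped variant (this one):         ['abcd', 'efg']  ->  ['abcd efg']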