def translate_text(text_list):
    translate_analyzer = TranslationAnalyzer(
        model_name_or_path="Helsinki-NLP/opus-mt-hi-en", device="auto"
    )
    source_responses = [
        TextPayload(processed_text=text.processed_text, source_name="sample")
        for text in text_list
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses
    )

    return [
        TextPayload(
            processed_text=response.segmented_data["translated_text"],
            source_name="translator",
        )
        for response in analyzer_responses
    ]
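# Hedged usage sketch (not taken from the original source): how translate_text might be
# called on already-fetched payloads. The variable names and the Hindi sample string
# ("यह एक अच्छा उत्पाद है" ≈ "this is a good product") are illustrative assumptions.
hindi_payloads = [
    TextPayload(processed_text="यह एक अच्छा उत्पाद है", source_name="sample"),
]
english_payloads = translate_text(hindi_payloads)
for payload in english_payloads:
    print(payload.processed_text)  # translated English text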
def analyze_input(
    self,
    source_response_list: List[TextPayload],
    analyzer_config: Optional[BaseAnalyzerConfig] = None,
    **kwargs,
) -> List[TextPayload]:
    analyzer_output: List[TextPayload] = []

    for batch_responses in self.batchify(source_response_list, self.batch_size):
        texts = [
            source_response.processed_text[: self._max_length]
            for source_response in batch_responses
        ]
        batch_predictions = self._prediction_from_model(texts)

        for prediction, source_response in zip(batch_predictions, batch_responses):
            segmented_data = {"ner_data": prediction}
            if source_response.segmented_data:
                segmented_data = {
                    **segmented_data,
                    **source_response.segmented_data,
                }

            analyzer_output.append(
                TextPayload(
                    processed_text=source_response.processed_text,
                    meta=source_response.meta,
                    segmented_data=segmented_data,
                    source_name=source_response.source_name,
                )
            )

    return analyzer_output
def analyze_input(  # type: ignore[override]
    self,
    source_response_list: List[TextPayload],
    analyzer_config: Optional[DummyAnalyzerConfig] = None,
    **kwargs,
) -> List[TextPayload]:
    responses = []
    for source_response in source_response_list:
        segmented_data = {
            "dummy_data": None if not analyzer_config else analyzer_config.dummy_data
        }
        if source_response.segmented_data:
            segmented_data = {**segmented_data, **source_response.segmented_data}

        responses.append(
            TextPayload(
                processed_text=source_response.processed_text,
                meta=source_response.meta,
                source_name=source_response.source_name,
                segmented_data=segmented_data,
            )
        )
    return responses
def lookup(  # type: ignore[override]
    self, config: TrafilaturaCrawlerConfig, **kwargs
) -> List[TextPayload]:
    source_responses: List[TextPayload] = []

    final_urls = []
    if config.is_sitemap or config.is_feed:
        for url in config.urls:
            final_urls.extend(config.find_urls(url=url))
    else:
        final_urls = config.urls

    for url in final_urls:
        extracted_data = config.extract_url(url=url)
        if extracted_data is None:
            logger.warning(f"Unable to crawl {url}, hence skipping it")
            continue

        comments = (
            "" if "comments" not in extracted_data else extracted_data["comments"]
        )
        source_responses.append(
            TextPayload(
                processed_text=f"{extracted_data['text']}. {comments}",
                meta=extracted_data,
                source_name=self.NAME,
            )
        )

    return source_responses
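# Hedged usage sketch (assumption, not from the original source): a config for the
# Trafilatura lookup() above, setting only the fields that method reads (urls,
# is_sitemap, is_feed). The example URL and the `crawler` instance are hypothetical.
crawler_config = TrafilaturaCrawlerConfig(
    urls=["https://example.com/some-article"],
    is_sitemap=False,
    is_feed=False,
)
# payloads = crawler.lookup(config=crawler_config)  # `crawler` is an instance of the
# class that defines lookup() above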
def test_pii_analyzer_replace_original(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(
        analyze_only=False,
        return_decision_process=True,
        replace_original_text=True,
    )
    source_responses = [
        TextPayload(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config
    )

    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["pii_data"] is not None
        assert analyzer_response.segmented_data["pii_data"]["analyzer_result"] is not None
        assert analyzer_response.segmented_data["pii_data"]["anonymized_result"] is not None
        assert analyzer_response.segmented_data["pii_data"]["anonymized_text"] is not None

        for pii_info in PII_LIST:
            assert pii_info not in analyzer_response.segmented_data["pii_data"]["anonymized_text"]

        assert (
            analyzer_response.segmented_data["pii_data"]["anonymized_text"]
            == analyzer_response.processed_text
        )
        assert analyzer_response.segmented_data["pii_data"]["anonymized_text"] != text
def analyze_input(  # type: ignore[override]
    self,
    source_response_list: List[TextPayload],
    analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
    **kwargs,
) -> List[TextPayload]:
    analyzer_output: List[TextPayload] = []

    # Optionally split long documents into smaller chunks before classification
    if (
        analyzer_config is not None
        and analyzer_config.use_splitter_and_aggregator
        and analyzer_config.splitter_config
    ):
        source_response_list = self.splitter.preprocess_input(
            source_response_list,
            config=analyzer_config.splitter_config,
        )

    for batch_responses in self.batchify(source_response_list, self.batch_size):
        texts = [
            source_response.processed_text[: self._max_length]
            for source_response in batch_responses
        ]
        batch_predictions = self.prediction_from_model(
            texts=texts, analyzer_config=analyzer_config
        )

        for score_dict, source_response in zip(batch_predictions, batch_responses):
            segmented_data = {"classifier_data": score_dict}
            if source_response.segmented_data:
                segmented_data = {
                    **segmented_data,
                    **source_response.segmented_data,
                }

            analyzer_output.append(
                TextPayload(
                    processed_text=source_response.processed_text,
                    meta=source_response.meta,
                    segmented_data=segmented_data,
                    source_name=source_response.source_name,
                )
            )

    # Optionally aggregate chunk-level predictions back into one payload per document
    if (
        analyzer_config is not None
        and analyzer_config.use_splitter_and_aggregator
        and analyzer_config.aggregator_config
    ):
        analyzer_output = self.aggregator.postprocess_input(
            input_list=analyzer_output,
            config=analyzer_config.aggregator_config,
        )

    return analyzer_output
def test_remove_stop_words(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_STOP_WORDS)
    config = TextCleanerConfig(cleaning_functions=[RemoveStopWords(language="english")])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "In hello , obsei" == cleaner_response.processed_text
def test_decode_unicode(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_UNICODE)
    config = TextCleanerConfig(cleaning_functions=[DecodeUnicode()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "what is this ! ! !" == cleaner_response.processed_text
def test_lower_case(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_UPPER_CASE)
    config = TextCleanerConfig(cleaning_functions=[ToLowerCase()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "how is this possible ? ? ?" == cleaner_response.processed_text
def test_remove_date_time(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_DATE_TIME)
    config = TextCleanerConfig(cleaning_functions=[RemoveDateTime()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "Peter drinks likely likes to tea at every" == cleaner_response.processed_text
def test_remove_punctuation(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_PUNCTUATION)
    config = TextCleanerConfig(cleaning_functions=[RemovePunctuation()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert (
        "I had the worst experience ever with XYZ in Egypt Bad Cars asking to pay in cash"
        == cleaner_response.processed_text
    )
def test_remove_special_characters(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_SPECIAL_CHARACTERS)
    config = TextCleanerConfig(cleaning_functions=[RemoveSpecialChars()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "datascience shahrukh lalit developing obsei" == cleaner_response.processed_text
def test_regex(text_cleaner):
    request = TextPayload(processed_text="Obsei-is-a-lowcode-lib")
    config = TextCleanerConfig(
        cleaning_functions=[RegExSubstitute(pattern=r"-", substitute=" ")]
    )
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "Obsei is a lowcode lib" == cleaner_response.processed_text
def test_white_space_cleaner(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_WHITE_SPACES)
    config = TextCleanerConfig(cleaning_functions=[RemoveWhiteSpaceAndEmptyToken()])
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert (
        """If anyone is interested ... these are our hosts . I can ’ t recommend them enough , Abc & Pbc ."""
        == cleaner_response.processed_text
    )
def test_char_splits_without_paragraph_honor(doc, expected_lengths, stride, text_splitter):
    doc_splits = text_splitter.preprocess_input(
        input_list=[TextPayload(processed_text=doc)],
        config=TextSplitterConfig(
            max_split_length=512,
            split_stride=stride,
        ),
    )

    assert len(expected_lengths) == len(doc_splits)
    for text_payload, expected_length in zip(doc_splits, expected_lengths):
        assert "splitter" in text_payload.meta
        splitter_payload = text_payload.meta["splitter"]
        assert splitter_payload.chunk_length == expected_length
def test_spacy_lemmatizer(text_cleaner):
    request = TextPayload(
        processed_text="the bats saw the cats with best stripes hanging upside down by their feet"
    )
    config = TextCleanerConfig(
        disable_tokenization=True, cleaning_functions=[SpacyLemmatization()]
    )
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert (
        "the bat see the cat with good stripe hang upside down by their foot"
        == cleaner_response.processed_text
    )
def test_sentence_splits(doc, expected_lengths, stride, text_splitter):
    doc_splits = text_splitter.preprocess_input(
        input_list=[TextPayload(processed_text=doc)],
        config=TextSplitterConfig(
            max_split_length=512,
            split_stride=stride,
            enable_sentence_split=True,
        ),
    )

    assert len(expected_lengths) == len(doc_splits)
    for text_payload, expected_length in zip(doc_splits, expected_lengths):
        assert "splitter" in text_payload.meta
        splitter_payload = text_payload.meta["splitter"]
        assert splitter_payload.chunk_length == expected_length
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses,
    )

    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["translation_data"] is not None
        assert text == analyzer_response.segmented_data["translation_data"]["original_text"]
        assert text != analyzer_response.processed_text
def test_replace_domain_keywords(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_DOMAIN_WORDS)
    config = TextCleanerConfig(
        cleaning_functions=[
            ReplaceDomainKeywords(
                domain_keywords=[("ML", "machine learning"), ("DL", "deep learning")]
            )
        ]
    )
    cleaner_responses = text_cleaner.preprocess_input(config=config, input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert (
        "deep learning and machine learning are going to change the world and will not overfit"
        == cleaner_response.processed_text
    )
def test_vader_analyzer(vader_analyzer):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = vader_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data["classifier_data"]) == 2
        assert "positive" in analyzer_response.segmented_data["classifier_data"]
        assert "negative" in analyzer_response.segmented_data["classifier_data"]
def test_text_classification_analyzer(text_classification_analyzer, label_map, expected):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in BUY_SELL_TEXTS
    ]
    analyzer_responses = text_classification_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(label_map=label_map),
    )

    assert len(analyzer_responses) == len(BUY_SELL_TEXTS)

    for analyzer_response in analyzer_responses:
        assert analyzer_response.segmented_data["classifier_data"] is not None
        assert analyzer_response.segmented_data["classifier_data"].keys() <= set(expected)
def lookup(self, config: PandasSourceConfig, **kwargs) -> List[TextPayload]:  # type: ignore[override]
    df_to_records = config.dataframe.to_dict("records")
    source_responses: List[TextPayload] = [
        TextPayload(
            processed_text=config.separator.join(
                [record.get(text_column) for text_column in config.text_columns]
            ),
            meta={key: record[key] for key in config.include_columns}
            if config.include_columns is not None
            else record,
            source_name=self.NAME,
        )
        for record in df_to_records
    ]
    return source_responses
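# Hedged usage sketch (assumption, not from the original source): feeding a small
# DataFrame through the pandas lookup() above. Only config fields the method reads
# (dataframe, separator, text_columns, include_columns) are set; column names and the
# `pandas_source` instance are hypothetical.
import pandas as pd

df = pd.DataFrame(
    [
        {"title": "Great battery", "body": "Lasts two days", "rating": 5},
        {"title": "Poor screen", "body": "Scratches easily", "rating": 2},
    ]
)
pandas_config = PandasSourceConfig(
    dataframe=df,
    separator=". ",
    text_columns=["title", "body"],
    include_columns=["rating"],
)
# payloads = pandas_source.lookup(config=pandas_config)
# payloads[0].processed_text -> "Great battery. Lasts two days"
# payloads[0].meta           -> {"rating": 5}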
def execute(self, input_list: List[TextPayload], **kwargs) -> List[TextPayload]:
    if len(input_list) == 0:
        logger.warning("Can't aggregate empty list")
        return input_list

    if not input_list[0].is_contains_classification_payload():
        logger.warning(
            "ClassificationAverage supports Classification and Sentiment Analyzers only"
        )
        return input_list

    default_value = kwargs.get("default_value", self.default_value)
    source_name = input_list[0].source_name

    doc_text, document_length, meta = self._extract_merged_parameters(input_list)

    # Perform average based on chunk length
    scores: Dict[str, float] = {}
    for payload in input_list:
        if payload.segmented_data:
            for key, value in payload.segmented_data.get("classifier_data", {}).items():
                ratio = len(payload.processed_text) / document_length
                scores[key] = scores.get(key, default_value) + value * ratio

    return [
        TextPayload(
            processed_text=" ".join(doc_text),
            meta=meta,
            segmented_data={
                "aggregator_data": {
                    "avg_score": scores,
                    "aggregator_name": self.name,
                }
            },
            source_name=source_name,
        )
    ]
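# Worked example (illustrative numbers, not from the original source) of the
# length-weighted average computed above: two chunks of a 100-character document,
# 60 and 40 characters long, with "positive" scores 0.9 and 0.5. With default_value = 0.0:
#   scores["positive"] = 0.0 + 0.9 * (60 / 100) + 0.5 * (40 / 100) = 0.54 + 0.20 = 0.74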
def execute(self, input_list: List[TextPayload], **kwargs) -> List[TextPayload]:
    if len(input_list) == 0:
        logger.warning("Can't aggregate empty list")
        return input_list

    if not input_list[0].is_contains_classification_payload():
        logger.warning(
            "ClassificationMaxCategories supports Classification and Sentiment Analyzers only"
        )
        return input_list

    score_threshold = kwargs.get("score_threshold", self.score_threshold)
    source_name = input_list[0].source_name

    doc_text, _, meta = self._extract_merged_parameters(input_list)

    max_scores: Dict[str, float] = {}
    category_count: Dict[str, int] = {}
    for payload in input_list:
        if payload.segmented_data:
            for key, value in payload.segmented_data.get("classifier_data", {}).items():
                if value > score_threshold:
                    category_count[key] = category_count.get(key, 0) + 1
                    max_scores[key] = max(max_scores.get(key, 0.0), value)

    return [
        TextPayload(
            processed_text=" ".join(doc_text),
            meta=meta,
            segmented_data={
                "aggregator_data": {
                    "category_count": category_count,
                    "max_scores": max_scores,
                    "aggregator_name": self.name,
                }
            },
            source_name=source_name,
        )
    ]
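# Worked example (illustrative numbers, not from the original source) of the max/count
# aggregation above with score_threshold = 0.5: three chunks score "positive" as
# 0.9, 0.4 and 0.7.
#   category_count["positive"] = 2    # only 0.9 and 0.7 exceed the threshold
#   max_scores["positive"]     = 0.9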
def test_zero_shot_analyzer(zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [
        TextPayload(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(labels=labels),
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data["classifier_data"]) == len(labels)
        assert "positive" in analyzer_response.segmented_data["classifier_data"]
        assert "negative" in analyzer_response.segmented_data["classifier_data"]
def test_spacy_ner_analyzer(spacy_ner_analyzer):
    source_responses = [
        TextPayload(
            processed_text="My name is Sam and I live in Berlin, Germany.",
            source_name="sample",
        )
    ]
    analyzer_responses = spacy_ner_analyzer.analyze_input(
        source_response_list=source_responses,
    )

    assert len(analyzer_responses) == 1

    entities = analyzer_responses[0].segmented_data["ner_data"]
    matched_count = 0
    for entity in entities:
        if entity["word"] == "Sam" and entity["entity_group"] == "PERSON":
            matched_count = matched_count + 1
        elif entity["word"] == "Berlin" and entity["entity_group"] == "GPE":
            matched_count = matched_count + 1
        elif entity["word"] == "Germany" and entity["entity_group"] == "GPE":
            matched_count = matched_count + 1

    assert matched_count == 3
def analyze_input(
    self,
    source_response_list: List[TextPayload],
    analyzer_config: Optional[BaseAnalyzerConfig] = None,
    **kwargs,
) -> List[TextPayload]:
    analyzer_output: List[TextPayload] = []

    for batch_responses in self.batchify(source_response_list, self.batch_size):
        for source_response in batch_responses:
            classification_map = {}
            sentiment_value = self._get_sentiment_score_from_vader(
                source_response.processed_text
            )
            if sentiment_value < 0.0:
                classification_map["negative"] = -sentiment_value
                classification_map["positive"] = 1.0 - classification_map["negative"]
            else:
                classification_map["positive"] = sentiment_value
                classification_map["negative"] = 1.0 - classification_map["positive"]

            segmented_data = {"classifier_data": classification_map}
            if source_response.segmented_data:
                segmented_data = {
                    **segmented_data,
                    **source_response.segmented_data,
                }

            analyzer_output.append(
                TextPayload(
                    processed_text=source_response.processed_text,
                    meta=source_response.meta,
                    segmented_data=segmented_data,
                    source_name=source_response.source_name,
                )
            )

    return analyzer_output
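# Worked example (illustrative numbers, not from the original source) of the VADER
# score mapping above: a sentiment score of -0.4 yields
#   classification_map = {"negative": 0.4, "positive": 0.6}
# while a score of 0.8 yields {"positive": 0.8, "negative": 0.2}.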
def test_classification_analyzer_with_splitter_aggregator(aggregate_function, zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [
        TextPayload(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(
            labels=labels,
            use_splitter_and_aggregator=True,
            splitter_config=TextSplitterConfig(max_split_length=50),
            aggregator_config=InferenceAggregatorConfig(
                aggregate_function=aggregate_function
            ),
        ),
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert "aggregator_data" in analyzer_response.segmented_data
def analyze_input(
    self,
    source_response_list: List[TextPayload],
    analyzer_config: Optional[BaseAnalyzerConfig] = None,
    **kwargs,
) -> List[TextPayload]:
    analyzer_output: List[TextPayload] = []

    texts = [
        source_response.processed_text for source_response in source_response_list
    ]

    for batch_docs, batch_source_response in self._spacy_pipe_batchify(
        texts, self.batch_size, source_response_list
    ):
        for doc, source_response in zip(batch_docs, batch_source_response):
            ner_prediction = [
                {
                    "entity_group": ent.label_,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                }
                for ent in doc.ents
            ]

            segmented_data = {"ner_data": ner_prediction}
            if source_response.segmented_data:
                segmented_data = {
                    **segmented_data,
                    **source_response.segmented_data,
                }

            analyzer_output.append(
                TextPayload(
                    processed_text=source_response.processed_text,
                    meta=source_response.meta,
                    segmented_data=segmented_data,
                    source_name=source_response.source_name,
                )
            )

    return analyzer_output
def lookup(self, config: OSGoogleMapsReviewsConfig, **kwargs) -> List[TextPayload]:  # type: ignore[override]
    source_responses: List[TextPayload] = []

    # Get data from state
    identifier: str = kwargs.get("id", None)
    state: Optional[Dict[str, Any]] = (
        None
        if identifier is None or self.store is None
        else self.store.get_source_state(identifier)
    )
    update_state: bool = True if identifier else False
    state = state or dict()

    since_timestamp: Optional[int] = (
        config.since_timestamp or None
        if state is None
        else state.get("since_timestamp", None)
    )
    if since_timestamp is None and config.lookup_period is not None:
        if len(config.lookup_period) <= 5:
            since_time = convert_utc_time(config.lookup_period)
        else:
            since_time = datetime.strptime(config.lookup_period, DATETIME_STRING_PATTERN)
        since_timestamp = int(since_time.timestamp())

    last_reviews_since_time = since_timestamp

    params: Dict[str, Any] = {
        "query": config.queries,
        "reviewsLimit": config.number_of_reviews,
        "limit": config.number_of_places_per_query,
        "sort": config.sort,
        "start": since_timestamp,
        "cutoff": config.until_timestamp,
        "ignoreEmpty": config.ignore_empty_reviews,
        "coordinates": config.central_coordinates,
        "language": config.language,
        "region": config.country,
        "async": False,
    }

    response = requests.get(
        f"{OUTSCRAPPER_API_URL}/maps/reviews-v2",
        params=params,
        headers={
            "X-API-KEY": ""
            if config.api_key is None
            else config.api_key.get_secret_value(),
        },
    )

    queries_data = []
    if response.status_code == 200:
        queries_data = response.json().get("data", [])
    else:
        logger.warning(f"API call failed with error: {response.json()}")

    for query_data in queries_data:
        reviews = (
            [] if "reviews_data" not in query_data else query_data.pop("reviews_data")
        )

        for review in reviews:
            source_responses.append(
                TextPayload(
                    processed_text=review["review_text"],
                    meta={**review, **query_data},
                    source_name=self.NAME,
                )
            )

            review_time = review["review_timestamp"]
            if last_reviews_since_time is None or last_reviews_since_time < review_time:
                last_reviews_since_time = review_time

    state["since_timestamp"] = last_reviews_since_time

    if update_state and self.store is not None:
        self.store.update_source_state(workflow_id=identifier, state=state)

    return source_responses
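# Hedged usage sketch (assumption, not from the original source): a minimal config for
# the Outscraper Google Maps reviews lookup() above. Only fields the method reads are
# set; the query string, review limits, and lookup period are hypothetical, and
# api_key must hold a real Outscraper key.
maps_config = OSGoogleMapsReviewsConfig(
    api_key="<outscraper-api-key>",
    queries=["Eiffel Tower, Paris"],
    number_of_reviews=10,
    number_of_places_per_query=1,
    lookup_period="1d",
)
# payloads = source.lookup(config=maps_config)  # `source` is an instance of the class
# that defines lookup() above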