Code Example #1
    def convert(self,
                analyzer_response: TextPayload,
                base_payload: Optional[Dict[str, Any]] = None,
                **kwargs) -> dict:
        # base_payload is never None after defaulting, so a single merge suffices;
        # keys from analyzer_response.to_dict() win on collision
        base_payload = base_payload or dict()
        return {**base_payload, **analyzer_response.to_dict()}
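
For reference, the merge above gives precedence to keys coming from analyzer_response.to_dict(): because that dict is unpacked last, it overwrites any matching keys in base_payload. A minimal stand-alone sketch of that semantics, using made-up values:

sink_defaults = {"pipeline": "demo", "source_name": "sample"}             # hypothetical base_payload
analyzer_dict = {"processed_text": "hello", "source_name": "translator"}  # hypothetical to_dict() output

merged = {**sink_defaults, **analyzer_dict}

assert merged["source_name"] == "translator"  # later unpacking wins on collision
assert merged["pipeline"] == "demo"           # non-conflicting keys are preserved
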
Code Example #2
def translate_text(text_list):
    translate_analyzer = TranslationAnalyzer(
        model_name_or_path="Helsinki-NLP/opus-mt-hi-en", device="auto")
    source_responses = [
        TextPayload(processed_text=text.processed_text, source_name="sample")
        for text in text_list
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses)
    return [
        TextPayload(
            processed_text=response.segmented_data["translated_text"],
            source_name="translator",
        ) for response in analyzer_responses
    ]
Code Example #3
File: ner_analyzer.py Project: lalitpagaria/obsei
    def analyze_input(
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[BaseAnalyzerConfig] = None,
        **kwargs,
    ) -> List[TextPayload]:
        analyzer_output: List[TextPayload] = []

        for batch_responses in self.batchify(source_response_list,
                                             self.batch_size):
            texts = [
                source_response.processed_text[:self._max_length]
                for source_response in batch_responses
            ]
            batch_predictions = self._prediction_from_model(texts)

            for prediction, source_response in zip(batch_predictions,
                                                   batch_responses):
                segmented_data = {"ner_data": prediction}
                if source_response.segmented_data:
                    segmented_data = {
                        **segmented_data,
                        **source_response.segmented_data,
                    }

                analyzer_output.append(
                    TextPayload(
                        processed_text=source_response.processed_text,
                        meta=source_response.meta,
                        segmented_data=segmented_data,
                        source_name=source_response.source_name,
                    ))
        return analyzer_output
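
The self.batchify(...) call above walks the input list in fixed-size batches. The helper below is a hypothetical stand-in for that pattern, not obsei's implementation, just to make the control flow concrete:

from typing import Iterator, List, TypeVar

T = TypeVar("T")

def batchify(items: List[T], batch_size: int) -> Iterator[List[T]]:
    # Yield consecutive slices of at most batch_size elements each
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

assert list(batchify([1, 2, 3, 4, 5], batch_size=2)) == [[1, 2], [3, 4], [5]]
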
Code Example #4
File: dummy_analyzer.py Project: lalitpagaria/obsei
    def analyze_input(  # type: ignore[override]
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[DummyAnalyzerConfig] = None,
        **kwargs,
    ) -> List[TextPayload]:
        responses = []
        for source_response in source_response_list:

            segmented_data = {
                "dummy_data": None
                if not analyzer_config
                else analyzer_config.dummy_data
            }

            if source_response.segmented_data:
                segmented_data = {**segmented_data, **source_response.segmented_data}

            responses.append(
                TextPayload(
                    processed_text=source_response.processed_text,
                    meta=source_response.meta,
                    source_name=source_response.source_name,
                    segmented_data=segmented_data,
                )
            )

        return responses
Code Example #5
def test_pii_analyzer_replace_original(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(analyze_only=False,
                                                return_decision_process=True,
                                                replace_original_text=True)

    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config)
    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):

        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["pii_data"] is not None
        assert analyzer_response.segmented_data["pii_data"][
            "analyzer_result"] is not None
        assert analyzer_response.segmented_data["pii_data"][
            "anonymized_result"] is not None
        assert analyzer_response.segmented_data["pii_data"][
            "anonymized_text"] is not None

        for pii_info in PII_LIST:
            assert pii_info not in analyzer_response.segmented_data[
                "pii_data"]["anonymized_text"]

        assert (analyzer_response.segmented_data["pii_data"]["anonymized_text"]
                == analyzer_response.processed_text)
        assert analyzer_response.segmented_data["pii_data"][
            "anonymized_text"] != text
Code Example #6
    def lookup(  # type: ignore[override]
            self, config: TrafilaturaCrawlerConfig,
            **kwargs) -> List[TextPayload]:
        source_responses: List[TextPayload] = []

        final_urls = []
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            if extracted_data is None:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue
            comments = ("" if "comments" not in extracted_data else
                        extracted_data["comments"])
            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                ))

        return source_responses
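
A hedged sketch of driving this lookup. Only the config fields the method itself reads (urls, is_sitemap, is_feed) are set; the class names, module path, and example URL are assumptions and may differ in the installed obsei version:

# Class names and module path assumed; adjust to the installed obsei version
from obsei.source.website_crawler_source import (
    TrafilaturaCrawlerConfig,
    TrafilaturaCrawlerSource,
)

config = TrafilaturaCrawlerConfig(
    urls=["https://example.com/blog/post"],  # hypothetical URL
    is_sitemap=False,
    is_feed=False,
)
payloads = TrafilaturaCrawlerSource().lookup(config)
for payload in payloads:
    print(payload.source_name, len(payload.processed_text))
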
Code Example #7
File: pandas_sink.py Project: lalitpagaria/obsei
    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        base_payload = base_payload or {}
        merged_dict = {**base_payload, **analyzer_response.to_dict()}
        return flatten_dict(merged_dict)
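
flatten_dict collapses the nested merged payload into a single-level dict so it can become one tabular row. The helper below is a hypothetical illustration of that idea, not obsei's implementation (its exact key naming and separator may differ):

from typing import Any, Dict

def flatten(d: Dict[str, Any], parent_key: str = "", sep: str = "_") -> Dict[str, Any]:
    # Recursively collapse nested dicts into "parent_child" style keys
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten(value, new_key, sep))
        else:
            flat[new_key] = value
    return flat

assert flatten({"meta": {"title": "t"}, "processed_text": "x"}) == {
    "meta_title": "t",
    "processed_text": "x",
}
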
Code Example #8
def test_lower_case(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_UPPER_CASE)

    config = TextCleanerConfig(cleaning_functions=[ToLowerCase()])
    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]

    assert "how is this possible ? ? ?" == cleaner_response.processed_text
Code Example #9
def test_remove_date_time(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_DATE_TIME)

    config = TextCleanerConfig(cleaning_functions=[RemoveDateTime()])
    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert ("Peter drinks likely likes to tea at every" ==
            cleaner_response.processed_text)
Code Example #10
def test_decode_unicode(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_UNICODE)

    config = TextCleanerConfig(cleaning_functions=[DecodeUnicode()])

    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert "what is this ! ! !" == cleaner_response.processed_text
Code Example #11
    def analyze_input(  # type: ignore[override]
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
        **kwargs,
    ) -> List[TextPayload]:
        analyzer_output: List[TextPayload] = []

        if (
            analyzer_config is not None
            and analyzer_config.use_splitter_and_aggregator
            and analyzer_config.splitter_config
        ):
            source_response_list = self.splitter.preprocess_input(
                source_response_list,
                config=analyzer_config.splitter_config,
            )

        for batch_responses in self.batchify(source_response_list, self.batch_size):
            texts = [
                source_response.processed_text[: self._max_length]
                for source_response in batch_responses
            ]

            batch_predictions = self.prediction_from_model(texts=texts, analyzer_config=analyzer_config)

            for score_dict, source_response in zip(batch_predictions, batch_responses):
                segmented_data = {
                    "classifier_data": score_dict
                }

                if source_response.segmented_data:
                    segmented_data = {
                        **segmented_data,
                        **source_response.segmented_data,
                    }

                analyzer_output.append(
                    TextPayload(
                        processed_text=source_response.processed_text,
                        meta=source_response.meta,
                        segmented_data=segmented_data,
                        source_name=source_response.source_name,
                    )
                )

        if (
            analyzer_config is not None
            and analyzer_config.use_splitter_and_aggregator
            and analyzer_config.aggregator_config
        ):
            analyzer_output = self.aggregator.postprocess_input(
                input_list=analyzer_output,
                config=analyzer_config.aggregator_config,
            )

        return analyzer_output
Code Example #12
def test_remove_stop_words(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_STOP_WORDS)

    config = TextCleanerConfig(
        cleaning_functions=[RemoveStopWords(language="english")])
    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert "In hello , obsei" == cleaner_response.processed_text
Code Example #13
def test_regex(text_cleaner):
    request = TextPayload(processed_text="Obsei-is-a-lowcode-lib")

    config = TextCleanerConfig(
        cleaning_functions=[RegExSubstitute(pattern=r'-', substitute=" ")])

    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert ("Obsei is a lowcode lib" == cleaner_response.processed_text)
Code Example #14
def test_remove_special_characters(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_SPECIAL_CHARACTERS)

    config = TextCleanerConfig(cleaning_functions=[RemoveSpecialChars()])

    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert ("datascience shahrukh lalit developing obsei" ==
            cleaner_response.processed_text)
Code Example #15
def test_remove_punctuation(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_PUNCTUATION)

    config = TextCleanerConfig(cleaning_functions=[RemovePunctuation()])
    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert (
        "I had the worst experience ever with XYZ in Egypt Bad Cars asking to pay in cash"
        == cleaner_response.processed_text)
Code Example #16
def test_white_space_cleaner(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_WHITE_SPACES)

    config = TextCleanerConfig(
        cleaning_functions=[RemoveWhiteSpaceAndEmptyToken()])
    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert (
        """If anyone is interested ... these are our hosts . I can ’ t recommend them enough , Abc & Pbc ."""
        == cleaner_response.processed_text)
Code Example #17
def test_char_splits_without_paragraph_honor(doc, expected_lengths, stride, text_splitter):
    doc_splits = text_splitter.preprocess_input(
        input_list=[TextPayload(processed_text=doc)],
        config=TextSplitterConfig(
            max_split_length=512,
            split_stride=stride
        ),
    )

    assert len(expected_lengths) == len(doc_splits)
    for text_payload, expected_length in zip(doc_splits, expected_lengths):
        assert "splitter" in text_payload.meta
        splitter_payload = text_payload.meta["splitter"]
        assert splitter_payload.chunk_length == expected_length
Code Example #18
def test_replace_domain_keywords(text_cleaner):
    request = TextPayload(processed_text=TEXT_WITH_DOMAIN_WORDS)

    config = TextCleanerConfig(cleaning_functions=[
        ReplaceDomainKeywords(
            domain_keywords=[("ML", "machine learning"), ("DL",
                                                          "deep learning")])
    ])

    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert (
        "deep learning and machine learning are going to change the world and will not overfit"
        == cleaner_response.processed_text)
Code Example #19
def test_spacy_lemmatizer(text_cleaner):
    request = TextPayload(
        processed_text="the bats saw the cats with best stripes hanging upside down by their feet"
    )

    config = TextCleanerConfig(disable_tokenization=True,
                               cleaning_functions=[SpacyLemmatization()])

    cleaner_responses = text_cleaner.preprocess_input(config=config,
                                                      input_list=[request])
    cleaner_response = cleaner_responses[0]
    assert (
        'the bat see the cat with good stripe hang upside down by their foot'
        == cleaner_response.processed_text)
Code Example #20
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses, )
    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["translation_data"] is not None
        assert text == analyzer_response.segmented_data["translation_data"][
            "original_text"]
        assert text != analyzer_response.processed_text
Code Example #21
def test_sentence_splits(doc, expected_lengths, stride, text_splitter):
    doc_splits = text_splitter.preprocess_input(
        input_list=[TextPayload(processed_text=doc)],
        config=TextSplitterConfig(
            max_split_length=512,
            split_stride=stride,
            enable_sentence_split=True,
        ),
    )

    assert len(expected_lengths) == len(doc_splits)
    for text_payload, expected_length in zip(doc_splits, expected_lengths):
        assert "splitter" in text_payload.meta
        splitter_payload = text_payload.meta["splitter"]
        assert splitter_payload.chunk_length == expected_length
Code Example #22
File: test_analyzer.py Project: lalitpagaria/obsei
def test_vader_analyzer(vader_analyzer):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = vader_analyzer.analyze_input(
        source_response_list=source_responses)

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data["classifier_data"]) == 2
        assert "positive" in analyzer_response.segmented_data[
            "classifier_data"]
        assert "negative" in analyzer_response.segmented_data[
            "classifier_data"]
Code Example #23
    def lookup(self, config: PandasSourceConfig,
               **kwargs) -> List[TextPayload]:  # type: ignore[override]
        df_to_records = config.dataframe.to_dict("records")
        source_responses: List[TextPayload] = [
            TextPayload(
                processed_text=config.separator.join([
                    record.get(text_column)
                    for text_column in config.text_columns
                ]),
                meta={key: record[key]
                      for key in config.include_columns}
                if config.include_columns is not None else record,
                source_name=self.NAME,
            ) for record in df_to_records
        ]

        return source_responses
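
A hedged usage sketch for this source. Only config fields that the method reads (dataframe, separator, text_columns, include_columns) are set; the PandasSource class name, the module path, and keyword construction are assumptions:

import pandas as pd

# Class names and module path assumed; adjust to the installed obsei version
from obsei.source.pandas_source import PandasSource, PandasSourceConfig

df = pd.DataFrame({
    "title": ["Great product"],
    "body": ["Arrived on time and works well"],
    "rating": [5],
})
config = PandasSourceConfig(
    dataframe=df,
    separator=". ",
    text_columns=["title", "body"],    # joined with the separator into processed_text
    include_columns=["rating"],        # copied into payload.meta
)
payloads = PandasSource().lookup(config)
print(payloads[0].processed_text)  # "Great product. Arrived on time and works well"
print(payloads[0].meta)            # {"rating": 5}
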
Code Example #24
File: test_analyzer.py Project: lalitpagaria/obsei
def test_text_classification_analyzer(text_classification_analyzer, label_map,
                                      expected):
    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in BUY_SELL_TEXTS
    ]
    analyzer_responses = text_classification_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(label_map=label_map, ),
    )

    assert len(analyzer_responses) == len(BUY_SELL_TEXTS)

    for analyzer_response in analyzer_responses:
        assert analyzer_response.segmented_data["classifier_data"] is not None
        assert analyzer_response.segmented_data["classifier_data"].keys(
        ) <= set(expected)
Code Example #25
    def execute(self, input_list: List[TextPayload],
                **kwargs) -> List[TextPayload]:
        if len(input_list) == 0:
            logger.warning("Can't aggregate empty list")
            return input_list

        if not input_list[0].is_contains_classification_payload():
            logger.warning(
                "ClassificationAverage supports Classification and Sentiment Analyzers only"
            )
            return input_list

        score_threshold = kwargs.get("score_threshold", self.score_threshold)

        source_name = input_list[0].source_name

        doc_text, _, meta = self._extract_merged_parameters(input_list)

        max_scores: Dict[str, float] = {}
        category_count: Dict[str, int] = {}
        for payload in input_list:
            if payload.segmented_data:
                for key, value in payload.segmented_data.get(
                        "classifier_data", {}).items():
                    if value > score_threshold:
                        category_count[key] = category_count.get(key, 0) + 1
                        max_scores[key] = max(max_scores.get(key, 0.0), value)

        return [
            TextPayload(
                processed_text=" ".join(doc_text),
                meta=meta,
                segmented_data={
                    "aggregator_data": {
                        "category_count": category_count,
                        "max_scores": max_scores,
                        "aggregator_name": self.name,
                    }
                },
                source_name=source_name,
            )
        ]
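
A small worked example of the counting and max logic above (scores made up, threshold 0.5): each category is counted once per chunk whose score clears the threshold, and only its highest score is kept:

score_threshold = 0.5
chunk_scores = [{"positive": 0.9, "negative": 0.2}, {"positive": 0.6, "negative": 0.7}]  # hypothetical

max_scores, category_count = {}, {}
for scores in chunk_scores:
    for key, value in scores.items():
        if value > score_threshold:
            category_count[key] = category_count.get(key, 0) + 1
            max_scores[key] = max(max_scores.get(key, 0.0), value)

assert category_count == {"positive": 2, "negative": 1}
assert max_scores == {"positive": 0.9, "negative": 0.7}
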
Code Example #26
    def execute(self, input_list: List[TextPayload],
                **kwargs) -> List[TextPayload]:
        if len(input_list) == 0:
            logger.warning("Can't aggregate empty list")
            return input_list

        if not input_list[0].is_contains_classification_payload():
            logger.warning(
                "ClassificationAverage supports Classification and Sentiment Analyzers only"
            )
            return input_list

        default_value = kwargs.get("default_value", self.default_value)

        source_name = input_list[0].source_name

        doc_text, document_length, meta = self._extract_merged_parameters(
            input_list)

        # Perform average based on chunk length
        scores: Dict[str, float] = {}
        for payload in input_list:
            if payload.segmented_data:
                for key, value in payload.segmented_data.get(
                        "classifier_data", {}).items():
                    ratio = len(payload.processed_text) / document_length
                    scores[key] = scores.get(key,
                                             default_value) + value * ratio

        return [
            TextPayload(
                processed_text=" ".join(doc_text),
                meta=meta,
                segmented_data={
                    "aggregator_data": {
                        "avg_score": scores,
                        "aggregator_name": self.name,
                    }
                },
                source_name=source_name,
            )
        ]
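
The loop above weights each chunk's score by its share of the document's characters (len(chunk) / document_length). A minimal numeric sketch of that arithmetic with made-up scores; in the real aggregator document_length comes from _extract_merged_parameters, here it is simply the sum of chunk lengths:

chunks = [
    ("short chunk", {"positive": 0.9}),
    ("a considerably longer chunk of review text", {"positive": 0.5}),
]  # hypothetical chunk texts and classifier scores
document_length = sum(len(text) for text, _ in chunks)

avg_scores = {}
for text, scores in chunks:
    ratio = len(text) / document_length
    for key, value in scores.items():
        # Longer chunks contribute proportionally more to the average
        avg_scores[key] = avg_scores.get(key, 0.0) + value * ratio

print(round(avg_scores["positive"], 3))  # dominated by the longer, lower-scoring chunk
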
Code Example #27
File: test_analyzer.py Project: lalitpagaria/obsei
def test_zero_shot_analyzer(zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(labels=labels),
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(
            analyzer_response.segmented_data["classifier_data"]) == len(labels)
        assert "positive" in analyzer_response.segmented_data[
            "classifier_data"]
        assert "negative" in analyzer_response.segmented_data[
            "classifier_data"]
Code Example #28
File: test_analyzer.py Project: lalitpagaria/obsei
def test_spacy_ner_analyzer(spacy_ner_analyzer):
    source_responses = [
        TextPayload(
            processed_text="My name is Sam and I live in Berlin, Germany.",
            source_name="sample",
        )
    ]
    analyzer_responses = spacy_ner_analyzer.analyze_input(
        source_response_list=source_responses, )
    assert len(analyzer_responses) == 1

    entities = analyzer_responses[0].segmented_data["ner_data"]
    matched_count = 0
    for entity in entities:
        if entity["word"] == "Sam" and entity["entity_group"] == "PERSON":
            matched_count = matched_count + 1
        elif entity["word"] == "Berlin" and entity["entity_group"] == "GPE":
            matched_count = matched_count + 1
        elif entity["word"] == "Germany" and entity["entity_group"] == "GPE":
            matched_count = matched_count + 1

    assert matched_count == 3
Code Example #29
    def analyze_input(
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[BaseAnalyzerConfig] = None,
        **kwargs,
    ) -> List[TextPayload]:
        analyzer_output: List[TextPayload] = []

        for batch_responses in self.batchify(source_response_list,
                                             self.batch_size):
            for source_response in batch_responses:
                classification_map = {}
                sentiment_value = self._get_sentiment_score_from_vader(
                    source_response.processed_text)
                if sentiment_value < 0.0:
                    classification_map["negative"] = -sentiment_value
                    classification_map["positive"] = (
                        1.0 - classification_map["negative"])
                else:
                    classification_map["positive"] = sentiment_value
                    classification_map["negative"] = (
                        1.0 - classification_map["positive"])

                segmented_data = {"classifier_data": classification_map}
                if source_response.segmented_data:
                    segmented_data = {
                        **segmented_data,
                        **source_response.segmented_data,
                    }

                analyzer_output.append(
                    TextPayload(
                        processed_text=source_response.processed_text,
                        meta=source_response.meta,
                        segmented_data=segmented_data,
                        source_name=source_response.source_name,
                    ))

        return analyzer_output
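
A small worked example of the score-to-distribution mapping above; the score value is made up. A negative score becomes the "negative" probability mass and "positive" is its complement, so the two always sum to 1.0:

sentiment_value = -0.6  # hypothetical score returned by the VADER model

classification_map = {}
if sentiment_value < 0.0:
    classification_map["negative"] = -sentiment_value
    classification_map["positive"] = 1.0 - classification_map["negative"]
else:
    classification_map["positive"] = sentiment_value
    classification_map["negative"] = 1.0 - classification_map["positive"]

assert round(classification_map["negative"], 6) == 0.6
assert round(classification_map["positive"], 6) == 0.4
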
Code Example #30
File: test_analyzer.py Project: lalitpagaria/obsei
def test_classification_analyzer_with_splitter_aggregator(
        aggregate_function, zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [
        TextPayload(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(
            labels=labels,
            use_splitter_and_aggregator=True,
            splitter_config=TextSplitterConfig(max_split_length=50),
            aggregator_config=InferenceAggregatorConfig(
                aggregate_function=aggregate_function),
        ),
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert "aggregator_data" in analyzer_response.segmented_data