def translate_text(text_list):
    translate_analyzer = TranslationAnalyzer(
        model_name_or_path="Helsinki-NLP/opus-mt-hi-en", device="auto"
    )
    source_responses = [
        AnalyzerRequest(processed_text=text.processed_text, source_name="sample")
        for text in text_list
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses
    )

    return [
        AnalyzerRequest(
            processed_text=response.segmented_data["translated_text"],
            source_name="translator",
        )
        for response in analyzer_responses
    ]
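# Hedged usage sketch for translate_text above. The Hindi sample string is a
# hypothetical illustration; the AnalyzerRequest import path matches the one used
# in the PII example elsewhere in this collection.
from obsei.analyzer.base_analyzer import AnalyzerRequest

hindi_requests = [
    AnalyzerRequest(processed_text="आप कैसे हैं?", source_name="sample")  # hypothetical sample
]
translated_requests = translate_text(hindi_requests)
for translated in translated_requests:
    print(translated.processed_text)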
def test_pii_analyzer_replace_original(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(
        analyze_only=False,
        return_decision_process=True,
        replace_original_text=True,
    )
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config
    )

    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["analyzer_result"] is not None
        assert analyzer_response.segmented_data["anonymized_result"] is not None
        assert analyzer_response.segmented_data["anonymized_text"] is not None

        for pii_info in PII_LIST:
            assert pii_info not in analyzer_response.segmented_data["anonymized_text"]

        assert (
            analyzer_response.segmented_data["anonymized_text"]
            == analyzer_response.processed_text
        )
        assert analyzer_response.segmented_data["anonymized_text"] != text
def lookup(self, config: PlayStoreConfig, **kwargs) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Refer https://github.com/googleapis/google-api-python-client/blob/master/docs/start.md
    with build(
        serviceName="androidpublisher",
        version="v3",
        credentials=config.get_google_credentials(),
        developerKey=config.cred_info.developer_key.get_secret_value(),
    ) as service:
        reviews = service.reviews()

        pagination_token: Optional[str] = None

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[str, Any] = (
            None if id is None else self.store.get_source_state(id)
        )
        start_index: Optional[str] = config.start_index or (
            None if state is None else state.get("start_index", None)
        )
        update_state: bool = True if id else False
        state = state or dict()

        review_id = start_index
        while True:
            # Refer https://googleapis.github.io/google-api-python-client/docs/dyn/androidpublisher_v3.reviews.html#list
            responses = reviews.list(
                package_name=config.package_name,
                max_results=config.max_results,
                start_index=start_index,
                token=pagination_token,
            )

            if "reviews" in responses:
                # Iterate the reviews payload without rebinding the `reviews`
                # resource object that is reused for pagination
                for review in responses["reviews"]:
                    if "comments" not in review:
                        continue

                    review_id = review["reviewId"]
                    # Currently only one user comment is supported
                    text = review["comments"][0]["userComment"]["text"]

                    source_responses.append(
                        AnalyzerRequest(
                            processed_text=text, meta=review, source_name=self.NAME
                        )
                    )

            pagination_token = None
            if "tokenPagination" in responses:
                if "nextPageToken" in responses["tokenPagination"]:
                    pagination_token = responses["tokenPagination"]["nextPageToken"]

            if pagination_token is None:
                break

    if update_state:
        state["start_index"] = review_id
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
def _get_source_output(self, tweet: Dict[str, Any]):
    # The tweet URL is derived from the tweet id, not from its text
    tweet_url = TwitterSource.get_tweet_url(tweet["id"])
    processed_text = TwitterSource.clean_tweet_text(tweet["text"])
    tweet["tweet_url"] = tweet_url
    return AnalyzerRequest(
        processed_text=processed_text, meta=tweet, source_name=self.NAME
    )
def lookup(self, config: RedditScrapperConfig, **kwargs) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    update_state: bool = True if id else False
    state = state or dict()

    scrapper_stat: Dict[str, Any] = state.get(config.url_id, dict())
    lookup_period: str = scrapper_stat.get("since_time", config.lookup_period)
    lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
    if len(lookup_period) <= 5:
        since_time = convert_utc_time(lookup_period)
    else:
        since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

    last_since_time: datetime = since_time

    since_id: Optional[str] = scrapper_stat.get("since_id", None)
    last_index = since_id
    state[config.url_id] = scrapper_stat

    reddit_data: Optional[List[RedditContent]] = None
    try:
        reddit_data = config.get_readers().fetch_content(
            after=since_time, since_id=since_id
        )
    except RuntimeError as ex:
        logger.warning(ex.__cause__)

    reddit_data = reddit_data or []

    for reddit in reddit_data:
        source_responses.append(
            AnalyzerRequest(
                processed_text=f"{reddit.title}. {reddit.extracted_text}",
                meta=reddit.__dict__,
                source_name=self.NAME,
            )
        )

        if last_since_time is None or last_since_time < reddit.updated:
            last_since_time = reddit.updated
        if last_index is None:
            # Assuming list is sorted based on time
            last_index = reddit.id

    scrapper_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
    scrapper_stat["since_id"] = last_index

    if update_state:
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
def lookup(self, config: AppStoreScrapperConfig, **kwargs) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    update_state: bool = True if id else False
    state = state or dict()

    for scrapper in config.get_review_readers():
        country_stat: Dict[str, Any] = state.get(scrapper.country, dict())
        lookup_period: str = country_stat.get("since_time", config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        if len(lookup_period) <= 5:
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

        last_since_time: datetime = since_time

        since_id: Optional[int] = country_stat.get("since_id", None)
        last_index = since_id
        state[scrapper.country] = country_stat

        reviews = scrapper.fetch_reviews(after=since_time, since_id=since_id)
        reviews = reviews or []

        for review in reviews:
            source_responses.append(
                AnalyzerRequest(
                    processed_text=f"{review.title}. {review.content}",
                    meta=review.__dict__,
                    source_name=self.NAME,
                )
            )

            if review.date < since_time:
                break
            if last_since_time is None or last_since_time < review.date:
                last_since_time = review.date
            if last_index is None or last_index < review.id:
                last_index = review.id

        country_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        country_stat["since_id"] = last_index

    if update_state:
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert text != analyzer_response.segmented_data["translated_text"]
def test_vader_analyzer(vader_analyzer):
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = vader_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data) == 2
        assert "positive" in analyzer_response.segmented_data
        assert "negative" in analyzer_response.segmented_data
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == len(TEXTS)

    logger.info("Result:")
    for analyzer_response in analyzer_responses:
        # print(analyzer_response)
        logger.info(analyzer_response)
def test_zero_shot_analyzer(zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(labels=labels),
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data) == len(labels)
        assert "positive" in analyzer_response.segmented_data
        assert "negative" in analyzer_response.segmented_data
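# Hedged sketch of a pytest fixture that could back the zero-shot test above. The
# ZeroShotClassificationAnalyzer class name, its import path, and the model id are
# assumptions (mirroring the TranslationAnalyzer construction shown earlier), not
# taken from the snippets in this collection.
import pytest

from obsei.analyzer.classification_analyzer import (  # assumed import path
    ZeroShotClassificationAnalyzer,
)


@pytest.fixture(scope="module")
def zero_shot_analyzer():
    return ZeroShotClassificationAnalyzer(
        model_name_or_path="typeform/mobilebert-uncased-mnli",  # assumed model id
        device="auto",
    )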
def classify_texts(request: ClassifierRequest):
    global rate_limiter
    global analyzer

    with rate_limiter.run():
        analyzer_requests: List[AnalyzerRequest] = [
            AnalyzerRequest(processed_text=text, source_name="API")
            for text in request.texts
        ]
        analyzer_responses = analyzer.analyze_input(
            source_response_list=analyzer_requests,
            analyzer_config=request.analyzer_config,
        )

        response = []
        for analyzer_response in analyzer_responses:
            response.append(analyzer_response.segmented_data)

        return ClassifierResponse(data=response)
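# Hedged usage sketch for the classify_texts handler above: building the request
# object directly and invoking the handler in-process. The ClassifierRequest
# constructor keywords are inferred from the fields accessed above (texts,
# analyzer_config); the ClassificationAnalyzerConfig import path and the sample
# text are assumptions.
from obsei.analyzer.classification_analyzer import (  # assumed import path
    ClassificationAnalyzerConfig,
)

request = ClassifierRequest(
    texts=["The room was clean and the food was great."],  # hypothetical input
    analyzer_config=ClassificationAnalyzerConfig(labels=["positive", "negative"]),
)
response = classify_texts(request)
print(response.data)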
def test_pii_analyzer_analyze_only(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(
        analyze_only=True, return_decision_process=True
    )
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config
    )

    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["analyzer_result"] is not None
        assert analyzer_response.segmented_data["anonymized_result"] is None
        assert text == analyzer_response.processed_text
def test_ner_analyzer(ner_analyzer):
    source_responses = [
        AnalyzerRequest(
            processed_text="My name is Lalit and I live in Berlin, Germany.",
            source_name="sample",
        )
    ]
    analyzer_responses = ner_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == 1

    entities = analyzer_responses[0].segmented_data["data"]
    matched_count = 0
    for entity in entities:
        if entity["word"] == "Lalit" and entity["entity_group"] == "PER":
            matched_count += 1
        elif entity["word"] == "Berlin" and entity["entity_group"] == "LOC":
            matched_count += 1
        elif entity["word"] == "Germany" and entity["entity_group"] == "LOC":
            matched_count += 1

    assert matched_count == 3
def lookup(self, config: EmailConfig, **kwargs) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    update_state: bool = True if id else False
    state = state or dict()

    imap_client = config.get_client()

    for mailbox in config.mailboxes:
        need_more_lookup = True
        status, messages = imap_client.select(mailbox=mailbox, readonly=True)
        if status != "OK":
            logger.warning(f"Not able to connect with {mailbox}: {status}")
            continue

        mailbox_stat: Dict[str, Any] = state.get(mailbox, dict())
        lookup_period: str = mailbox_stat.get(
            "since_time", config.lookup_period or DEFAULT_LOOKUP_PERIOD
        )
        if len(lookup_period) <= 5:
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)
        if since_time.tzinfo is None:
            since_time = since_time.replace(tzinfo=pytz.utc)
        else:
            since_time = since_time.astimezone(pytz.utc)

        last_since_time: datetime = since_time

        since_id: Optional[int] = mailbox_stat.get("since_message_id", None)
        last_index = since_id
        state[mailbox] = mailbox_stat

        num_of_emails = int(messages[0])
        # Read in reverse order, meaning latest emails first.
        # Most of this code is borrowed from
        # https://www.thepythoncode.com/article/reading-emails-in-python and
        # modified to suit this source
        for index in range(num_of_emails, 0, -1):
            email_meta: Dict[str, Any] = dict()
            # fetch the email message by ID
            status, email_message = imap_client.fetch(str(index), "(RFC822)")
            email_content: str = ""
            for response in email_message:
                if isinstance(response, tuple):
                    # parse a bytes email into a message object
                    msg = email.message_from_bytes(response[1])

                    email_meta["subject"] = self._parse_email_header(msg, "Subject")
                    email_meta["from_address"] = self._parse_email_header(msg, "From")
                    email_meta["to_address"] = self._parse_email_header(msg, "To")

                    date_received_str = self._parse_email_header(msg, "Date")
                    try:
                        date_received = datetime.strptime(
                            date_received_str, "%a, %d %b %Y %H:%M:%S %Z"
                        )
                    except Exception:
                        try:
                            date_received = datetime.strptime(
                                date_received_str, "%a, %d %b %Y %H:%M:%S %z"
                            )
                        except Exception:
                            date_received = datetime.strptime(
                                date_received_str, "%a, %d %b %Y %H:%M:%S %z (%Z)"
                            )
                    if date_received.tzinfo is None:
                        date_received = date_received.replace(tzinfo=pytz.utc)
                    else:
                        date_received = date_received.astimezone(pytz.utc)
                    email_meta["date_received"] = date_received

                    email_meta["message_id"] = self._parse_email_header(
                        msg, "Message-ID"
                    )

                    part_id = 0
                    # if the email message is multipart
                    if msg.is_multipart():
                        # iterate over email parts
                        for part in msg.walk():
                            part_id_str = f"part_{part_id}"
                            # extract content type of email
                            content_type = part.get_content_type()
                            content_disposition = str(part.get("Content-Disposition"))
                            email_meta[part_id_str] = dict()
                            email_meta[part_id_str]["content_type"] = content_type
                            email_meta[part_id_str][
                                "content_disposition"
                            ] = content_disposition

                            if (
                                "attachment" not in content_disposition
                                and "text/" in content_type
                            ):
                                try:
                                    # get the email body
                                    email_body = part.get_payload(decode=True).decode()
                                    if content_type == "text/html":
                                        email_body = text_from_html(email_body)
                                    # append email body with existing
                                    email_meta[part_id_str]["email_body"] = email_body
                                    email_content = email_content + "\n" + email_body
                                except Exception:
                                    logger.error("Unable to parse email body")
                            elif "attachment" in content_disposition:
                                logger.warning(
                                    "Email attachment download is not supported"
                                )
                                # Download attachment is commented currently
                                # # download attachment
                                # filename = part.get_filename()
                                # if filename:
                                #     folder_name = self.clean(subject)
                                #     if not os.path.isdir(folder_name):
                                #         # make a folder for this email (named after the subject)
                                #         os.mkdir(folder_name)
                                #     filepath = os.path.join(folder_name, filename)
                                #     # download attachment and save it
                                #     open(filepath, "wb").write(part.get_payload(decode=True))

                            part_id = part_id + 1
                    else:
                        part_id_str = f"part_{part_id}"
                        email_meta[part_id_str] = dict()
                        # extract content type of email
                        content_type = msg.get_content_type()
                        email_meta[part_id_str]["content_type"] = content_type
                        # get the email body
                        email_body = msg.get_payload(decode=True).decode()
                        if content_type == "text/html":
                            email_body = text_from_html(email_body)
                        email_meta[part_id_str]["email_body"] = email_body
                        email_content = email_content + "\n" + email_body

                    # Stop once emails older than the lookup window or the last
                    # processed message are reached
                    if date_received < since_time:
                        need_more_lookup = False
                        break
                    if last_index and last_index == email_meta["message_id"]:
                        need_more_lookup = False
                        break

                    if last_since_time is None or last_since_time < date_received:
                        last_since_time = date_received
                    if last_index is None:
                        last_index = email_meta["message_id"]

                    source_responses.append(
                        AnalyzerRequest(
                            processed_text="\n".join(
                                [email_meta.get("subject"), email_content]
                            ),
                            meta=email_meta,
                            source_name=self.NAME,
                        )
                    )

            if not need_more_lookup:
                break

        mailbox_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        # Key must match the "since_message_id" read above, otherwise the stored
        # state is never picked up on the next lookup
        mailbox_stat["since_message_id"] = last_index

    if update_state:
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
import logging
import sys

from obsei.analyzer.base_analyzer import AnalyzerRequest
from obsei.analyzer.pii_analyzer import (
    PresidioEngineConfig,
    PresidioModelConfig,
    PresidioPIIAnalyzer,
    PresidioPIIAnalyzerConfig,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

analyzer_config = PresidioPIIAnalyzerConfig(
    analyze_only=False, return_decision_process=True
)
analyzer = PresidioPIIAnalyzer(
    engine_config=PresidioEngineConfig(
        nlp_engine_name="spacy",
        models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")],
    )
)

text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"

analyzer_results = analyzer.analyze_input(
    source_response_list=[AnalyzerRequest(processed_text=text_to_anonymize)],
    analyzer_config=analyzer_config,
)

for analyzer_result in analyzer_results:
    logging.info(analyzer_result.to_dict())
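# A hedged follow-up to the script above: re-running the same analyzer with
# analyze_only=True (as exercised in test_pii_analyzer_analyze_only) detects PII
# without anonymizing the text. Only names already used above appear here.
analyze_only_config = PresidioPIIAnalyzerConfig(
    analyze_only=True, return_decision_process=True
)
analyze_only_results = analyzer.analyze_input(
    source_response_list=[AnalyzerRequest(processed_text=text_to_anonymize)],
    analyzer_config=analyze_only_config,
)
for result in analyze_only_results:
    logging.info(result.to_dict())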
def lookup(self, config: RedditConfig, **kwargs) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    update_state: bool = True if id else False
    state = state or dict()

    subreddit_reference = config.get_reddit_client().subreddit(
        "+".join(config.subreddits)
    )

    post_stream = subreddit_reference.stream.submissions(pause_after=-1)
    for post in post_stream:
        if post is None:
            break

        post_data = vars(post)
        post_id = post_data["id"]
        if config.post_ids and post_id not in config.post_ids:
            continue

        post_stat: Dict[str, Any] = state.get(post_id, dict())
        lookup_period: str = post_stat.get("since_time", config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        if len(lookup_period) <= 5:
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

        last_since_time: datetime = since_time

        since_id: Optional[str] = post_stat.get("since_comment_id", None)
        last_index = since_id
        state[post_id] = post_stat

        post.comment_sort = "new"
        post.comments.replace_more(limit=None)
        # top_level_comments only
        first_comment = True
        for comment in post.comments:
            comment_data = vars(comment)
            if config.include_post_meta:
                comment_data[config.post_meta_field] = post_data

            comment_time = datetime.utcfromtimestamp(int(comment_data["created_utc"]))
            comment_id = comment_data["id"]

            if comment_time < since_time:
                break
            if last_index and last_index == comment_id:
                break
            if last_since_time is None or last_since_time < comment_time:
                last_since_time = comment_time
            if last_index is None or first_comment:
                last_index = comment_id
                first_comment = False

            text = "".join(text_from_html(comment_data["body_html"]))

            source_responses.append(
                AnalyzerRequest(
                    processed_text=text, meta=comment_data, source_name=self.NAME
                )
            )

        post_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        post_stat["since_comment_id"] = last_index

    if update_state:
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
def lookup(
    self, config: PlayStoreScrapperConfig, **kwargs
) -> List[AnalyzerRequest]:
    source_responses: List[AnalyzerRequest] = []

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    update_state: bool = True if id else False
    state = state or dict()

    for country in config.countries:
        country_stat: Dict[str, Any] = state.get(country, dict())
        lookup_period: str = country_stat.get("since_time", config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        if len(lookup_period) <= 5:
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

        last_since_time: datetime = since_time

        # since_id: Optional[str] = country_stat.get("since_id", None)
        # last_index = since_id
        # state[scrapper.country] = country_stat

        continuation_token: Optional[ContinuationToken] = None
        while True:
            store_reviews, continuation_token = reviews(
                app_id=config.package_name,
                lang=config.language,
                country=country,
                sort=Sort.NEWEST,
                filter_score_with=config.filter_score_with,
                continuation_token=continuation_token,
                count=config.max_count,
            )
            store_reviews = store_reviews or []

            for review in store_reviews:
                source_responses.append(
                    AnalyzerRequest(
                        processed_text=review["content"],
                        meta=review,
                        source_name=self.NAME,
                    )
                )

                if since_time > review["at"]:
                    break
                if last_since_time is None or last_since_time < review["at"]:
                    last_since_time = review["at"]
                # if last_index is None or last_index < review.id:
                #     last_index = review.id

            if (
                continuation_token is None
                or continuation_token.token is None
                or continuation_token.count <= len(source_responses)
            ):
                break

        country_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        # country_stat["since_id"] = last_index

    if update_state:
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
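# Hedged configuration sketch for the play store scrapper lookup above. The
# PlayStoreScrapperSource/PlayStoreScrapperConfig import path, class names, and the
# keyword-argument constructor are assumptions; the field names (countries,
# package_name, language, filter_score_with, max_count, lookup_period) are the ones
# read by lookup() above, and the package id is hypothetical.
from obsei.source.playstore_scrapper import (  # assumed import path
    PlayStoreScrapperConfig,
    PlayStoreScrapperSource,
)

config = PlayStoreScrapperConfig(
    countries=["us"],
    package_name="com.example.app",  # hypothetical package id
    language="en",
    filter_score_with=None,
    max_count=100,
    lookup_period="1h",
)
source = PlayStoreScrapperSource()
for request in source.lookup(config):
    print(request.processed_text)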