def clean_text(self, extracted_text: str):
    """Removes stop words and samples words out of a tweet to create a snippet.

    Args:
        extracted_text: A string denoting extracted text from image.

    Returns:
        A tuple containing a tweet snippet as well as Enum ResultStatus
        which gives out result status.
    """
    if not isinstance(extracted_text, str):
        raise TypeError('Extracted text must be type string')
    if not extracted_text:
        raise ValueError('Extracted text cannot be empty')
    try:
        # Strip punctuation before tokenizing.
        non_punc_tweet = extracted_text.translate(
            str.maketrans('', '', string.punctuation))
        word_tokens = nltk.tokenize.word_tokenize(non_punc_tweet)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    # Drop stop words, then sample a short window of the remaining words.
    filtered_sentence = [w for w in word_tokens if w not in stopwords]
    picked_words = filtered_sentence[2:min(len(filtered_sentence), 6)]
    tweet_snippet = " ".join(picked_words)
    if not tweet_snippet:
        return (tweet_snippet, ResultStatus.NO_RESULT)
    logger.debug(f'Tweet Snippet: {tweet_snippet}')
    return (tweet_snippet, ResultStatus.ALL_OKAY)
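# Illustrative only: a rough sketch of how clean_text might be invoked. TextProcessor is
# taken from the text_service usage elsewhere in this codebase, but whether clean_text
# lives on that class is an assumption; the sample text and the logged output are
# hypothetical, not recorded behaviour.
def _clean_text_usage_sketch():
    processor = text_service.TextProcessor()
    snippet, status = processor.clean_text(
        'To make room for more expression we will now count all emojis as equal')
    if status == ResultStatus.ALL_OKAY:
        # snippet is a short run of non-stopword tokens joined by spaces,
        # suitable as a search query for the Twint-based search below.
        logger.debug('Snippet for search: ' + snippet)
    return snippet, status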
def aggregate_tweets(self, user_id: str, date: datetime.datetime):
    """Aggregates tweets from a single day.

    Retrieves tweets pertaining to the given username and date using the
    Twitter Search API and aggregates them into a list.

    Returns:
        A tuple containing a list of dicts, each representing a Tweet Object,
        as well as Enum ResultStatus which gives out result status.
        Ref: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.
        For example: ::

            {
                "created_at": "Wed Oct 10 20:19:24 +0000 2018",
                "text": "To make room for more expression, we will now count all emojis as equal—including those with gender and skin t… https://t.co/MkGjXf9aXm"
            }
    """
    if not isinstance(user_id, str) or not isinstance(date, datetime.datetime):
        raise TypeError(
            'User ID must be type string and date must be type datetime.datetime'
        )
    if not user_id or not date:
        raise ValueError('User ID or Date cannot be empty')
    logger.info('Searching for tweet using Twitter API...')
    querystring = {
        app_config.TWEET_USERNAME_KEY: user_id,
        app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT
    }
    try:
        response, response_status = self._call_twitter_api(querystring)
        if response_status != ResultStatus.ALL_OKAY:
            return (None, response_status)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    # Keep only tweets whose creation timestamp falls on the requested day.
    same_day_tweets = list()
    for entry in response:
        tweet_date = date_parser.parse(entry[app_config.TWEET_DATE_KEY])
        if date_checker.format_for_date(tweet_date) == date_checker.format_for_date(
                date) and date_checker.valid_date(tweet_date):
            logger.debug('Tweet found...: ' +
                         str(entry[app_config.TWEET_TEXT_KEY]))
            same_day_tweets.append(entry)
    if not same_day_tweets:
        return (same_day_tweets, ResultStatus.NO_RESULT)
    return (same_day_tweets, ResultStatus.ALL_OKAY)
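# Illustrative only: a standalone sketch of the same-day comparison performed above.
# date_checker.format_for_date is a project helper whose exact output format is not
# shown here; this sketch assumes it reduces a datetime to a day-level value and uses
# plain date() equality to convey the idea.
def _same_day_sketch():
    import datetime
    from dateutil import parser as date_parser

    tweet_date = date_parser.parse('Wed Oct 10 20:19:24 +0000 2018')
    target_date = datetime.datetime(2018, 10, 10, tzinfo=datetime.timezone.utc)
    # True when both timestamps fall on the same calendar day.
    return tweet_date.date() == target_date.date()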
def save_to_disk(file_obj):
    """Saves an uploaded file via POST request to disk."""
    if not isinstance(file_obj, FileStorage):
        raise TypeError(
            'file obj must be type werkzeug.datastructures.FileStorage')
    if not file_obj:
        raise ValueError('file obj cannot be empty')
    filename = secure_filename(file_obj.filename)
    if file_obj and allowed_file(filename):
        saved_file_name = str(uuid.uuid1()) + '.' + \
            filename.rsplit('.', 1)[1].lower()
        saved_file_path = os.path.join(app_config.FILE_DIRECTORY,
                                       saved_file_name)
        logger.debug('Saving file to path: ' + saved_file_path)
        file_obj.save(saved_file_path)
        return saved_file_path
    return None
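# Illustrative only: a sketch of how save_to_disk would typically be called from a Flask
# upload handler. The form-field name 'data' and the shape of the handler are assumptions,
# not this project's actual API surface.
def _upload_sketch(request):
    file_obj = request.files.get('data')   # 'data' is an assumed form-field name
    if file_obj is None:
        return None
    saved_file_path = save_to_disk(file_obj)
    # saved_file_path is None when the file extension is not in the allowed set.
    return saved_file_path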
def _call_twitter_api(querystring):
    if not isinstance(querystring, dict):
        raise TypeError('Query String must be type dict')
    if not querystring:
        raise ValueError('Query String must be a valid dictionary')
    headers = {'Authorization': 'Bearer ' + app_config.TWITTER_ACCESSTOKEN}
    search_url = urlparser.urljoin(
        app_config.TWITTER_HOSTNAME + '/' + app_config.TWITTER_APIVER + '/',
        app_config.TWITTER_CONTEXT)
    r = requests.get(search_url, headers=headers, params=querystring)
    response = r.json()
    logger.debug('Status Code for Twitter API: ' + str(r.status_code))
    if r.status_code != 200:
        raise RuntimeError('Twitter API returned status: ' + str(r.status_code))
    if not response:
        return (response, ResultStatus.NO_RESULT)
    return (response, ResultStatus.ALL_OKAY)
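# Illustrative only: how the urljoin above composes the search URL. The hostname, API
# version and context values below are assumed stand-ins for the app_config settings,
# which are not shown in this file.
def _search_url_sketch():
    from urllib import parse as urlparser

    hostname = 'https://api.twitter.com'   # assumed stand-in for TWITTER_HOSTNAME
    api_ver = '1.1'                        # assumed stand-in for TWITTER_APIVER
    context = 'search/tweets.json'         # assumed stand-in for TWITTER_CONTEXT
    # -> 'https://api.twitter.com/1.1/search/tweets.json'
    return urlparser.urljoin(hostname + '/' + api_ver + '/', context)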
def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
    """Calculates a similarity matrix.

    Calculates a similarity matrix of the corpus containing the extracted
    tweet and tweets aggregated from the Twitter Search API, using a cosine
    similarity approach.

    Args:
        extracted_tweet: A string denoting extracted tweet from image.
        same_day_tweets: A list containing tweets of the target date.

    Returns:
        A tuple containing a similarity matrix, which is a numpy array,
        as well as Enum ResultStatus which gives out result status.
        For example: ::

            ([[1.        0.9258201]
              [0.9258201 1.       ]], ResultStatus.ALL_OKAY)
    """
    if not isinstance(extracted_tweet, str) or not isinstance(
            same_day_tweets, list):
        raise TypeError(
            'Extracted tweet must be type str and Same day tweets must be type list'
        )
    if not extracted_tweet or not same_day_tweets:
        raise ValueError(
            'Extracted tweet must be a valid string and same day tweets must be a valid list'
        )
    logger.info('Processing similarity of two tweets...')
    # Build the corpus with the extracted tweet first, followed by same-day tweets.
    corpus = list()
    corpus.append(extracted_tweet)
    corpus.extend(same_day_tweets)
    logger.debug('Corpus: ' + str(corpus))
    try:
        sparse_matrix = count_vectorizer.fit_transform(corpus)
        similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    logger.debug('Similarity Matrix: ' + str(similarity_matrix))
    return (similarity_matrix, ResultStatus.ALL_OKAY)
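# Illustrative only: a standalone sketch of the cosine-similarity step used in
# get_similarity. It assumes the module-level count_vectorizer is scikit-learn's
# CountVectorizer (inferred from the fit_transform/cosine_similarity usage, not
# confirmed by this file); the sample corpus is hypothetical.
def _similarity_sketch():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = [
        'we will now count all emojis as equal',
        'count all emojis as equal including those with gender and skin tones',
    ]
    sparse_matrix = CountVectorizer().fit_transform(corpus)
    # Returns a 2x2 symmetric matrix: the diagonal entries are 1.0 and the
    # off-diagonal entry is the cosine similarity between the two documents.
    return cosine_similarity(sparse_matrix, sparse_matrix)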
def search(self, user_id: str, tweet_snippet: str,
           date: datetime.datetime = None):
    """Searches for tweets.

    Retrieves tweets of the given username and date, as well as the tweet
    snippet, using Twint, and aggregates them into a list.

    Returns:
        A tuple containing a list of results, each result representing a
        tweet object, as well as ResultStatus.
        For example: ::

            ([<tweet_obj>], ResultStatus.ALL_OKAY)
    """
    if not isinstance(user_id, str) or not isinstance(tweet_snippet, str):
        raise TypeError(
            'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
        )
    if not user_id or not tweet_snippet:
        raise ValueError('User ID, Tweet or Date cannot be empty')
    results = list()
    twint_config = twint.Config()
    twint_config.Username = user_id
    if date:
        # Limit the search to a window starting on the target date.
        twint_config.Since = date_checker.format_for_date(date)
        twint_config.Until = date_checker.format_for_date(
            date + datetime.timedelta(days=2))
    else:
        twint_config.Search = tweet_snippet
    twint_config.Limit = app_config.TWEET_MAX_STORE
    twint_config.Store_object = True
    twint_config.Store_object_tweets_list = results
    try:
        twint.run.Search(twint_config)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if not results:
        return (results, ResultStatus.NO_RESULT)
    logger.debug(f'Search results: {results}\n')
    return (results, ResultStatus.ALL_OKAY)
def calculate_and_validate(entities: dict, tweet_text_list: list):
    """Calculates similarity matrix and validates tweet.

    Calculates a similarity matrix from the same-day tweet corpus using the
    text service and validates the tweet using the validator.

    Args:
        entities: represents dictionary of entities extracted from text
        tweet_text_list: list of strings representing same day tweets

    Returns:
        valid_tweet: Validity status of tweet
        match_index: Index of the matching tweet in tweet_text_list
        status: Enum ResultStatus representing result status
    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], tweet_text_list)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if processor_status != ResultStatus.ALL_OKAY:
        return (None, None, processor_status)
    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if validator_status == ResultStatus.MODULE_FAILURE:
        return (None, None, validator_status)
    logger.debug('Tweet Validity: ' + str(valid_tweet))
    if not valid_tweet:
        return (False, None, ResultStatus.NO_RESULT)
    # The corpus passed to get_similarity places the extracted tweet at index 0,
    # so shift the match index by one to map back into tweet_text_list.
    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
def extract_and_parse(file_path: str):
    """Preprocess text from image.

    Extracts text from image using the image service, parses entities from
    the text using the text service.

    Args:
        file_path: represents path of the image file.

    Returns:
        entities: Entities parsed from text such as tweet, user_id and date.
        status: Enum ResultStatus representing result status
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path must be a valid path')
    try:
        text_extractor = image_service.Extractor()
        extracted_text, extractor_status = text_extractor.get_text(file_path)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if extractor_status != ResultStatus.ALL_OKAY:
        return (None, extractor_status)
    logger.debug('Processed text: ' + extracted_text)
    try:
        entity_parser = text_service.DataParser()
        entities, parser_status = entity_parser.get_entities(extracted_text)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if parser_status != ResultStatus.ALL_OKAY:
        return (None, parser_status)
    logger.debug('Entities: ' + str(entities))
    return (entities, parser_status)
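# Illustrative only: how these helpers appear to compose into a verification flow.
# The actual controller wiring is not shown in this file, so the ordering and the
# source of same_day_tweet_texts (e.g. from aggregate_tweets() or search() above,
# reduced to their text fields) are assumptions drawn from the docstrings above.
def _pipeline_sketch(file_path: str, same_day_tweet_texts: list):
    entities, status = extract_and_parse(file_path)
    if status != ResultStatus.ALL_OKAY:
        return (None, status)
    valid_tweet, match_index, status = calculate_and_validate(
        entities, same_day_tweet_texts)
    return ((valid_tweet, match_index), status)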