Example #1
    def clean_text(self, extracted_text: str):
        """Removes stop words and samples words out of tweet
        to create a snippet.

        Attributes:
            extracted_text: A string denoting extracted text from image.

        Returns:
            A tuple contaning a tweet snippet
            as well as Enum ResultStatus which gives out result status.
        """
        if not isinstance(extracted_text, str):
            raise TypeError('Extracted text must be type string')
        if not extracted_text:
            raise ValueError('Extracted text cannot be empty')
        try:
            non_punc_tweet = extracted_text.translate(
                str.maketrans('', '', string.punctuation))
            word_tokens = nltk.tokenize.word_tokenize(non_punc_tweet)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        filtered_sentence = [w for w in word_tokens if w not in stopwords]
        picked_words = filtered_sentence[2:min(len(filtered_sentence), 6)]
        tweet_snippet = " ".join(picked_words)
        if not tweet_snippet:
            return (tweet_snippet, ResultStatus.NO_RESULT)
        logger.debug(f'Tweet Snippet: {tweet_snippet}')
        return (tweet_snippet, ResultStatus.ALL_OKAY)
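
A self-contained sketch of the same stop-word filtering and snippet slicing, assuming NLTK's English stop-word list stands in for the module-level stopwords and that the required NLTK data has been downloaded: ::

    import string

    import nltk
    from nltk.corpus import stopwords as nltk_stopwords

    # Assumes the required NLTK data is present, e.g. via
    # nltk.download('punkt') and nltk.download('stopwords').
    stopwords = set(nltk_stopwords.words('english'))

    text = "This is an example tweet, with punctuation and several stop words!"
    non_punc = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.tokenize.word_tokenize(non_punc)
    filtered = [w for w in tokens if w not in stopwords]
    # Words 2..5 of the filtered sentence form the snippet; slicing clamps
    # automatically when fewer words are available.
    print(" ".join(filtered[2:6]))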
Example #2
    def exec(self, file_path):
        """Executes controller flow

        Attributes:
            file_path: A string denoting a twitter username.

        Controller uses image service to extract text from
        image, passes text to text service to parse entities such
        as username, tweet as well as date, uses search service 
        to retrieve a tweet if available.

        Returns:
            search_results: list of tweet objects
            status: Enum ResultStatus representing result status

        """
        if not isinstance(file_path, str):
            raise TypeError('File path must be type str')
        if not file_path:
            raise ValueError('File path must be a valid string')
        entities, preprocess_status = common.extract_and_parse(file_path)
        if preprocess_status != ResultStatus.ALL_OKAY:
            return (None, ResultStatus.MODULE_FAILURE)

        try:
            text_processor = text_service.DataParser()
            tweet_snippet, text_processor_status = text_processor.clean_text(
                entities['tweet'])
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        if text_processor_status != ResultStatus.ALL_OKAY:
            return (None, text_processor_status)

        try:
            search_controller = search_service.TwintSearch()
            search_results, search_status = search_controller.search(
                entities['user_id'], tweet_snippet, entities['date'])
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        if search_status != ResultStatus.ALL_OKAY:
            return (None, search_status)
        tweet_text_list = [tweet_obj.tweet for tweet_obj in search_results]
        validity, match_index, validator_status = common.calculate_and_validate(
            entities=entities, tweet_text_list=tweet_text_list)
        if validator_status == ResultStatus.MODULE_FAILURE:
            return (None, ResultStatus.MODULE_FAILURE)
        if not validity:
            return (None, ResultStatus.NO_RESULT)
        return (search_results[match_index], ResultStatus.ALL_OKAY)
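
Every call above follows the same convention: helpers return a (value, ResultStatus) tuple rather than raising, and the controller short-circuits on any non-ALL_OKAY status. The Enum itself is not shown in these examples; a minimal reconstruction with the three members used throughout might look like this: ::

    import enum


    class ResultStatus(enum.Enum):
        # Hypothetical reconstruction: only the member names are known from
        # these examples; the underlying values are arbitrary.
        ALL_OKAY = enum.auto()
        NO_RESULT = enum.auto()
        MODULE_FAILURE = enum.auto()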
Example #3
    def aggregate_tweets(self, user_id: str, date: datetime.datetime):
        """Aggregates tweets from a single day.

        Retrieves tweets pertaining to the given username and date using the
        Twitter Search API and aggregates them into a list.

        Args:
            user_id: A string denoting a Twitter username.
            date: A datetime.datetime denoting the target date.

        Returns:
            A tuple containing a list of dicts, each representing a Tweet object,
            as well as an Enum ResultStatus indicating the result status.

            Ref: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.
            
            For example: ::

                {
                    "created_at": "Wed Oct 10 20:19:24 +0000 2018",
                    "text": "To make room for more expression, we will now count all emojis as equal—including those with gender‍‍‍ ‍‍and skin t… https://t.co/MkGjXf9aXm"
                }

        """
        if not isinstance(user_id, str) or not isinstance(
                date, datetime.datetime):
            raise TypeError(
                'User ID must be type string and date must be type datetime.datetime'
            )
        if not user_id or not date:
            raise ValueError('User ID or Date cannot be empty')
        logger.info('Searching for tweet using Twitter API...')
        querystring = {
            app_config.TWEET_USERNAME_KEY: user_id,
            app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT
        }
        try:
            response, response_status = self._call_twitter_api(querystring)
            if response_status != ResultStatus.ALL_OKAY:
                return (None, response_status)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        same_day_tweets = list()
        for entry in response:
            tweet_date = date_parser.parse(entry[app_config.TWEET_DATE_KEY])
            if date_checker.format_for_date(
                    tweet_date) == date_checker.format_for_date(
                        date) and date_checker.valid_date(tweet_date):
                logger.debug('Tweet found...: ' +
                             str(entry[app_config.TWEET_TEXT_KEY]))
                same_day_tweets.append(entry)
        if not same_day_tweets:
            return (same_day_tweets, ResultStatus.NO_RESULT)
        return (same_day_tweets, ResultStatus.ALL_OKAY)
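
The day-level comparison depends on dateutil parsing Twitter's created_at timestamp and on date_checker.format_for_date normalising both datetimes to the same granularity. date_checker is not shown here, so the sketch below substitutes a hypothetical strftime-based stand-in: ::

    import datetime

    from dateutil import parser as date_parser

    # Hypothetical stand-in for date_checker.format_for_date: reduce a
    # datetime to a day-level string so timestamps on the same day compare equal.
    def format_for_date(value: datetime.datetime) -> str:
        return value.strftime('%Y-%m-%d')

    created_at = "Wed Oct 10 20:19:24 +0000 2018"  # Twitter API created_at format
    tweet_date = date_parser.parse(created_at)
    target = datetime.datetime(2018, 10, 10, tzinfo=datetime.timezone.utc)

    print(format_for_date(tweet_date) == format_for_date(target))  # True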
Example #4
    def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
        """Calculates a similarity matrix.

        Calculates a similarity matrix of the corpus containing the
        extracted tweet and the tweets aggregated from the Twitter Search API
        using a cosine similarity approach.

        Args:
            extracted_tweet: A string denoting the tweet extracted from the image.
            same_day_tweets: A list containing tweets of the target date.

        Returns:
            A tuple containing a similarity matrix, which is a numpy array,
            as well as an Enum ResultStatus indicating the result status.
            For example: ::

                ([[1.        0.9258201]
                 [0.9258201 1.       ]], ResultStatus.ALL_OKAY)


        """
        if not isinstance(extracted_tweet, str) or not isinstance(
                same_day_tweets, list):
            raise TypeError(
                'Extracted tweet must be type str and Same day tweets must be type list'
            )
        if not extracted_tweet or not same_day_tweets:
            raise ValueError(
                'Extracted tweet must be a valid string and same day tweets must be a valid list'
            )
        logger.info('Processing similarity of two tweets...')
        corpus = [extracted_tweet] + same_day_tweets
        logger.debug('Corpus: ' + str(corpus))
        try:
            sparse_matrix = count_vectorizer.fit_transform(corpus)
            similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        logger.debug('Similarity Matrix: ' + str(similarity_matrix))
        return (similarity_matrix, ResultStatus.ALL_OKAY)
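
The module-level count_vectorizer is presumably scikit-learn's CountVectorizer, paired with sklearn.metrics.pairwise.cosine_similarity; a self-contained sketch under that assumption: ::

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = [
        "extracted tweet text from the image",        # row/column 0: extracted tweet
        "tweet text aggregated from the search api",  # rows/columns 1..n: same-day tweets
    ]
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)
    similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)
    # A symmetric len(corpus) x len(corpus) array with ones on the diagonal;
    # entry [0][i] is the similarity between the extracted tweet and tweet i.
    print(similarity_matrix)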
Example #5
    def search(self, user_id: str, tweet_snippet: str,
               date: datetime.datetime = None):
        """Searches for tweets

        Retrieves tweets of given username, date as well as tweet snippet using Twint.
        Aggregates tweets to a list.

        Returns:
            A tuple contaning a list of results, each result represents a tweet object
            as well as ResultStatus.
            For example: ::

                ([<tweet_obj>], ResultStatus.ALL_OKAY)

        """
        if not isinstance(user_id, str) or not isinstance(tweet_snippet, str):
            raise TypeError(
                'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
            )
        if not user_id or not tweet_snippet:
            raise ValueError('User ID or Tweet snippet cannot be empty')
        results = list()
        twint_config = twint.Config()
        twint_config.Username = user_id
        if date:
            twint_config.Since = date_checker.format_for_date(date)
            twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=2))
        else:
            twint_config.Search = tweet_snippet
        twint_config.Limit = app_config.TWEET_MAX_STORE
        twint_config.Store_object = True
        twint_config.Store_object_tweets_list = results
        try:
            twint.run.Search(twint_config)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        if not results:
            return (results, ResultStatus.NO_RESULT)
        logger.debug(f'Search results: {results}\n')
        return (results, ResultStatus.ALL_OKAY)
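
A hypothetical call site, assuming TwintSearch lives in the search_service module as in Example #2 (the username, snippet, date and the ResultStatus import are placeholders): ::

    import datetime

    import search_service  # assumed module path, as used in Example #2
    from result_status import ResultStatus  # hypothetical import path

    searcher = search_service.TwintSearch()
    results, status = searcher.search(
        user_id='jack',                            # placeholder username
        tweet_snippet='just setting up my twttr',  # placeholder snippet
        date=datetime.datetime(2006, 3, 21))
    if status == ResultStatus.ALL_OKAY:
        for tweet_obj in results:
            print(tweet_obj.tweet)  # Twint tweet objects expose the text as .tweet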
Example #6
    def exec(self, file_path: str):
        """Executes controller flow

        Controller uses image service to extract text from
        image, passes text to text service to parse entities such
        as username, tweet as well as date, uses search service 
        to retrieve same day tweets, text service to find similar tweet
        and finally verifying the tweet.

        Attributes:
            file_path: A string denoting a twitter username.

        Returns:
            valid_tweet: A tweet object
            status: Enum ResultStatus representing result status

        """
        if not isinstance(file_path, str):
            raise TypeError('File path must be type str')
        if not file_path:
            raise ValueError('File path must be a valid string')
        entities, preprocess_status = common.extract_and_parse(file_path)
        if preprocess_status != ResultStatus.ALL_OKAY:
            return (None, ResultStatus.MODULE_FAILURE)

        try:
            search_controller = search_service.TwitterAPISearch()
            same_day_tweets, search_status = search_controller.aggregate_tweets(
                entities['user_id'], entities['date'])
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        if search_status != ResultStatus.ALL_OKAY:
            return (None, search_status)
        validity, match_index, validator_status = common.calculate_and_validate(
            entities=entities, same_day_tweets=same_day_tweets)
        if validator_status == ResultStatus.MODULE_FAILURE:
            return (None, ResultStatus.MODULE_FAILURE)
        if not validity:
            return (None, ResultStatus.NO_RESULT)
        return (same_day_tweets[match_index], ResultStatus.ALL_OKAY)
Example #7
def calculate_and_validate(entities: dict, tweet_text_list: list):
    """Calculates similarity matrix and validates tweet

    Calculates a similarity matrix from same day tweet
    corpus using text service and validates tweet
    using validator

    Args:
        entities: represents dictionary of entities extracted from text
        tweet_text_list: list of strings representing same day tweets

    Returns:
        valid_tweet: Validity status of tweet
        status: Enum ResultStatus representing result status

    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], tweet_text_list)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if processor_status != ResultStatus.ALL_OKAY:
        return (None, None, processor_status)

    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if validator_status == ResultStatus.MODULE_FAILURE:
        return (None, None, validator_status)
    logger.debug('Tweet Validity: ' + str(valid_tweet))
    if not valid_tweet:
        return (False, None, ResultStatus.NO_RESULT)
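    # match_index appears to index rows of the similarity matrix, whose first
    # row is the extracted tweet itself (see get_similarity), so it is shifted
    # by one to line up with tweet_text_list.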
    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
Example #8
def extract_and_parse(file_path: str):
    """Preprocess text from image

    Extracts text from image using image service,
    parses entities from text using text service.

    Args:
        file_path: represents path of the image file.

    Returns:
        entities: Entities parsed from text such as tweet, user_id and date.
        status: Enum ResultStatus representing result status

    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path must be a valid path')
    try:
        text_extractor = image_service.Extractor()
        extracted_text, extractor_status = text_extractor.get_text(file_path)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if extractor_status != ResultStatus.ALL_OKAY:
        return (None, extractor_status)
    logger.debug('Processed text: ' + extracted_text)

    try:
        entity_parser = text_service.DataParser()
        entities, parser_status = entity_parser.get_entities(extracted_text)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if parser_status != ResultStatus.ALL_OKAY:
        return (None, parser_status)
    logger.debug('Entities: ' + str(entities))
    return (entities, parser_status)
Example #9
    def get_text(self, file_path: str):
        """Extracts text from an image.

        Rescales the image and runs Tesseract OCR on it.

        Args:
            file_path: A string denoting the path of the image file.

        Returns:
            A tuple containing the extracted text
            as well as an Enum ResultStatus indicating the result status.
        """
        if not isinstance(file_path, str):
            raise TypeError('File path must be type string')
        if not file_path:
            raise ValueError('File path cannot be empty')
        logger.info('Processing Image...')
        try:
            new_file_path = self.rescale(file_path)
            logger.info('Extracting text from rescaled image...')
            img = PIL.Image.open(new_file_path)
            text = pytesseract.image_to_string(image=img)
            try:
                # Clean up the temporary rescaled image; a failure here is non-fatal.
                os.remove(new_file_path)
            except Exception as e:
                logger.exception(e)
            if not text:
                return (None, ResultStatus.NO_RESULT)
            return (text, ResultStatus.ALL_OKAY)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
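
pytesseract is only a thin wrapper: the Tesseract OCR binary must be installed and discoverable on PATH for image_to_string to work. A hypothetical call site, assuming this is the Extractor from the image_service module used in Example #8 (the file path and the ResultStatus import are placeholders): ::

    import image_service  # assumed module path, as used in Example #8
    from result_status import ResultStatus  # hypothetical import path

    extractor = image_service.Extractor()
    text, status = extractor.get_text('/path/to/tweet_screenshot.png')  # placeholder path
    if status == ResultStatus.ALL_OKAY:
        print(text)
    elif status == ResultStatus.NO_RESULT:
        print('No text could be extracted from the image.')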