Ejemplo n.º 1
0
    def clean_text(self, extracted_text: str):
        """Builds a short tweet snippet from text extracted from an image.

        Punctuation is stripped, the remaining text is tokenized with NLTK,
        stop words are dropped, and the third through sixth remaining words
        are joined with single spaces to form the snippet.

        Args:
            extracted_text: A non-empty string holding the extracted text.

        Returns:
            A tuple ``(tweet_snippet, status)``:

            * ``(None, ResultStatus.MODULE_FAILURE)`` if stripping
              punctuation or tokenizing raises.
            * ``('', ResultStatus.NO_RESULT)`` if no words are left for the
              snippet.
            * ``(snippet, ResultStatus.ALL_OKAY)`` otherwise.

        Raises:
            TypeError: If ``extracted_text`` is not a string.
            ValueError: If ``extracted_text`` is empty.
        """
        if not isinstance(extracted_text, str):
            raise TypeError('Extracted text must be type string')
        if not extracted_text:
            raise ValueError('Extracted text cannot be empty')
        try:
            strip_table = str.maketrans('', '', string.punctuation)
            tokens = nltk.tokenize.word_tokenize(
                extracted_text.translate(strip_table))
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        meaningful_words = [
            token for token in tokens if token not in stopwords
        ]
        # Skip the first two meaningful words and keep at most the next four;
        # slicing already clamps to the end of the list.
        tweet_snippet = " ".join(meaningful_words[2:6])
        if tweet_snippet:
            logger.debug(f'Tweet Snippet: {tweet_snippet}')
            return (tweet_snippet, ResultStatus.ALL_OKAY)
        return (tweet_snippet, ResultStatus.NO_RESULT)
Ejemplo n.º 2
0
    def aggregate_tweets(self, user_id: str, date: datetime.datetime):
        """Collects the tweets a user posted on a given day.

        Fetches tweets for ``user_id`` through ``self._call_twitter_api`` and
        keeps the entries whose creation date formats to the same day as
        ``date`` (via ``date_checker.format_for_date``) and passes
        ``date_checker.valid_date``.

        Ref: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.

        Args:
            user_id: Non-empty string identifying the Twitter user.
            date: The day of interest.

        Returns:
            A tuple ``(tweets, status)``:

            * ``(None, status)`` when the API call reports a status other
              than ``ResultStatus.ALL_OKAY``.
            * ``(None, ResultStatus.MODULE_FAILURE)`` when the API call
              raises.
            * ``([], ResultStatus.NO_RESULT)`` when no entry matches the day.
            * ``(tweets, ResultStatus.ALL_OKAY)`` otherwise, where each
              element is a tweet entry as returned by the API.

        Raises:
            TypeError: If ``user_id`` is not a string or ``date`` is not a
                ``datetime.datetime``.
            ValueError: If ``user_id`` or ``date`` is falsy.
        """
        arguments_typed = isinstance(user_id, str) and isinstance(
            date, datetime.datetime)
        if not arguments_typed:
            raise TypeError(
                'User ID must be type string and date must be type datetime.datetime'
            )
        if not (user_id and date):
            raise ValueError('User ID or Date cannot be empty')
        logger.info('Searching for tweet using Twitter API...')
        querystring = {
            app_config.TWEET_USERNAME_KEY: user_id,
            app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT,
        }
        try:
            response, response_status = self._call_twitter_api(querystring)
            if response_status != ResultStatus.ALL_OKAY:
                return (None, response_status)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        same_day_tweets = []
        for entry in response:
            tweet_date = date_parser.parse(entry[app_config.TWEET_DATE_KEY])
            on_target_day = (date_checker.format_for_date(tweet_date) ==
                             date_checker.format_for_date(date))
            # Skip entries from other days or with a date the checker rejects.
            if not on_target_day or not date_checker.valid_date(tweet_date):
                continue
            logger.debug('Tweet found...: ' +
                         str(entry[app_config.TWEET_TEXT_KEY]))
            same_day_tweets.append(entry)
        if same_day_tweets:
            return (same_day_tweets, ResultStatus.ALL_OKAY)
        return (same_day_tweets, ResultStatus.NO_RESULT)
Ejemplo n.º 3
0
def save_to_disk(file_obj):
    """Stores a file uploaded through a POST request on disk.

    The upload is written into ``app_config.FILE_DIRECTORY`` under a freshly
    generated UUID name that keeps the lower-cased extension of the sanitized
    upload filename.

    Args:
        file_obj: The uploaded file, a ``werkzeug.datastructures.FileStorage``.

    Returns:
        The path the file was saved to, or ``None`` when ``allowed_file``
        rejects the sanitized filename.

    Raises:
        TypeError: If ``file_obj`` is not a ``FileStorage``.
        ValueError: If ``file_obj`` is falsy.
    """
    if not isinstance(file_obj, FileStorage):
        raise TypeError(
            'file obj must be type werkzeug.datastructures.FileStorage')
    if not file_obj:
        raise ValueError('file obj cannot be empty')
    filename = secure_filename(file_obj.filename)
    if not (file_obj and allowed_file(filename)):
        return None
    unique_stem = str(uuid.uuid1())
    extension = filename.rsplit('.', 1)[1].lower()
    saved_file_path = os.path.join(app_config.FILE_DIRECTORY,
                                   unique_stem + '.' + extension)
    logger.debug('Saving file to path: ' + saved_file_path)
    file_obj.save(saved_file_path)
    return saved_file_path
Ejemplo n.º 4
0
 def _call_twitter_api(querystring, timeout=10):
     """Sends a GET request to the Twitter API endpoint set in ``app_config``.

     The URL is built from ``app_config.TWITTER_HOSTNAME``,
     ``app_config.TWITTER_APIVER`` and ``app_config.TWITTER_CONTEXT``; the
     request is authorized with ``app_config.TWITTER_ACCESSTOKEN`` as a
     bearer token.

     Args:
         querystring: Non-empty dict of query parameters for the request.
         timeout: Seconds to wait for the server, passed to
             ``requests.get``. Defaults to 10.

     Returns:
         A tuple ``(response, status)`` where ``response`` is the decoded
         JSON body. ``status`` is ``ResultStatus.NO_RESULT`` when the body is
         falsy (e.g. an empty list) and ``ResultStatus.ALL_OKAY`` otherwise.

     Raises:
         TypeError: If ``querystring`` is not a dict.
         ValueError: If ``querystring`` is empty.
         RuntimeError: If the API answers with a status code other than 200.
     """
     if not isinstance(querystring, dict):
         raise TypeError('Query String must be type dict')
     if not querystring:
         raise ValueError('Query String must be a valid dictionary')
     headers = {'Authorization': 'Bearer ' + app_config.TWITTER_ACCESSTOKEN}
     search_url = urlparser.urljoin(
         app_config.TWITTER_HOSTNAME + '/' + app_config.TWITTER_APIVER +
         '/', app_config.TWITTER_CONTEXT)
     # Without a timeout, requests waits indefinitely on an unresponsive
     # server and would block the caller forever.
     r = requests.get(search_url,
                      headers=headers,
                      params=querystring,
                      timeout=timeout)
     logger.debug('Status Code for Twitter API: ' + str(r.status_code))
     # Check the status before decoding: an error response is not guaranteed
     # to carry a JSON body, and a decode error would hide the status code.
     if r.status_code != 200:
         raise RuntimeError('Twitter API returned status:' +
                            str(r.status_code))
     response = r.json()
     if not response:
         return (response, ResultStatus.NO_RESULT)
     return (response, ResultStatus.ALL_OKAY)
Ejemplo n.º 5
0
    def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
        """Computes a cosine-similarity matrix for a tweet corpus.

        The corpus consists of the extracted tweet followed by the same-day
        tweets. It is vectorized with the module's ``count_vectorizer`` and
        every document is compared with every other one using
        ``cosine_similarity``.

        Args:
            extracted_tweet: Non-empty string with the tweet text extracted
                from the image.
            same_day_tweets: Non-empty list with the tweets of the target
                date.

        Returns:
            A tuple ``(similarity_matrix, status)``. On success the matrix is
            a numpy array and the status is ``ResultStatus.ALL_OKAY``.
            For example: ::

                ([[1.        0.9258201]
                 [0.9258201 1.       ]], ResultStatus.ALL_OKAY)

            If vectorizing or comparing raises, the result is
            ``(None, ResultStatus.MODULE_FAILURE)``.

        Raises:
            TypeError: If ``extracted_tweet`` is not a string or
                ``same_day_tweets`` is not a list.
            ValueError: If either argument is empty.
        """
        arguments_typed = isinstance(extracted_tweet, str) and isinstance(
            same_day_tweets, list)
        if not arguments_typed:
            raise TypeError(
                'Extracted tweet must be type str and Same day tweets must be type list'
            )
        if not (extracted_tweet and same_day_tweets):
            raise ValueError(
                'Extracted tweet must be a valid string and same day tweets must be a valid list'
            )
        logger.info('Processing similarity of two tweets...')
        # The extracted tweet comes first, so it is row/column 0 of the matrix.
        corpus = [extracted_tweet, *same_day_tweets]
        logger.debug('Corpus: ' + str(corpus))
        try:
            term_counts = count_vectorizer.fit_transform(corpus)
            similarity_matrix = cosine_similarity(term_counts, term_counts)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        logger.debug('Similartiy Matrix: ' + str(similarity_matrix))
        return (similarity_matrix, ResultStatus.ALL_OKAY)
Ejemplo n.º 6
0
    def search(self, user_id: str, tweet_snippet: str,
               date: datetime.datetime = None):
        """Searches a user's tweets with Twint.

        When ``date`` is truthy, the search is limited to the window from
        that date up to two days later and the snippet is not used as a
        search term. Otherwise ``tweet_snippet`` is used as the search term.
        At most ``app_config.TWEET_MAX_STORE`` tweets are requested.

        Args:
            user_id: Non-empty string with the Twitter username to search.
            tweet_snippet: Non-empty string with words taken from the tweet.
            date: Optional day on which the tweet was posted. Its type is
                not validated here.

        Returns:
            A tuple containing a list of results, each result being a tweet
            object, and a ResultStatus.
            For example: ::

                ([<tweet_obj>], ResultStatus.ALL_OKAY)

            ``(None, ResultStatus.MODULE_FAILURE)`` is returned if Twint
            raises, and ``([], ResultStatus.NO_RESULT)`` if nothing is found.

        Raises:
            TypeError: If ``user_id`` or ``tweet_snippet`` is not a string.
            ValueError: If ``user_id`` or ``tweet_snippet`` is empty.
        """
        # NOTE: the snippet check used to read `not (tweet_snippet, str)`,
        # which tests a non-empty tuple and therefore never fired.
        if not isinstance(user_id, str) or not isinstance(tweet_snippet, str):
            raise TypeError(
                'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
            )
        if not user_id or not tweet_snippet:
            raise ValueError('User ID, Tweet or Date cannot be empty')
        results = []
        twint_config = twint.Config()
        twint_config.Username = user_id
        if date:
            twint_config.Since = date_checker.format_for_date(date)
            twint_config.Until = date_checker.format_for_date(
                date + datetime.timedelta(days=2))
        else:
            twint_config.Search = tweet_snippet
        twint_config.Limit = app_config.TWEET_MAX_STORE
        # Twint appends every tweet it finds to `results`.
        twint_config.Store_object = True
        twint_config.Store_object_tweets_list = results
        try:
            twint.run.Search(twint_config)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        if not results:
            return (results, ResultStatus.NO_RESULT)
        logger.debug(f'Search results: {results}\n')
        return (results, ResultStatus.ALL_OKAY)
Ejemplo n.º 7
0
def calculate_and_validate(entities: dict, tweet_text_list: list):
    """Scores the extracted tweet against same-day tweets and validates it.

    A similarity matrix is computed with ``text_service.TextProcessor`` from
    ``entities['tweet']`` and ``tweet_text_list``, then handed to
    ``validator.verify_validity``.

    Args:
        entities: Dictionary of entities extracted from text; its ``'tweet'``
            entry is the text being checked.
        tweet_text_list: List of strings representing same-day tweets.

    Returns:
        A tuple ``(valid_tweet, match_index, status)``:

        * ``(None, None, ResultStatus.MODULE_FAILURE)`` if either step
          raises.
        * ``(None, None, status)`` if the similarity step reports a status
          other than ``ResultStatus.ALL_OKAY`` or the validator reports
          ``ResultStatus.MODULE_FAILURE``.
        * ``(False, None, ResultStatus.NO_RESULT)`` if the tweet is not
          valid.
        * ``(valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)``
          otherwise.
    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], tweet_text_list)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    else:
        if processor_status != ResultStatus.ALL_OKAY:
            return (None, None, processor_status)

    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    else:
        if validator_status == ResultStatus.MODULE_FAILURE:
            return (None, None, validator_status)

    logger.debug('Tweet Validity: ' + str(valid_tweet))
    if valid_tweet:
        # NOTE(review): the index is shifted down by one, presumably because
        # the extracted tweet occupies position 0 of the compared corpus --
        # confirm against validator.verify_validity.
        return (valid_tweet, match_index-1, ResultStatus.ALL_OKAY)
    return (False, None, ResultStatus.NO_RESULT)
Ejemplo n.º 8
0
def extract_and_parse(file_path: str):
    """Extracts text from an image and parses entities out of it.

    The text is read from the image with ``image_service.Extractor`` and the
    entities are parsed from that text with ``text_service.DataParser``.

    Args:
        file_path: Non-empty string with the path of the image file.

    Returns:
        A tuple ``(entities, status)``:

        * ``(None, ResultStatus.MODULE_FAILURE)`` if either step raises.
        * ``(None, status)`` if either step reports a status other than
          ``ResultStatus.ALL_OKAY``.
        * ``(entities, status)`` with the parsed entities and the parser's
          status otherwise.

    Raises:
        TypeError: If ``file_path`` is not a string.
        ValueError: If ``file_path`` is empty.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path must be a valid path')

    # Step 1: pull the raw text out of the image.
    try:
        text_extractor = image_service.Extractor()
        extracted_text, extractor_status = text_extractor.get_text(file_path)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    else:
        if extractor_status != ResultStatus.ALL_OKAY:
            return (None, extractor_status)
    logger.debug('Processed text: ' + extracted_text)

    # Step 2: parse the entities from the extracted text.
    try:
        entity_parser = text_service.DataParser()
        entities, parser_status = entity_parser.get_entities(extracted_text)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    else:
        if parser_status != ResultStatus.ALL_OKAY:
            return (None, parser_status)
    logger.debug('Entities: ' + str(entities))
    return (entities, parser_status)