def get_entities(self, extracted_text: str):
    """Parses entities from extracted text.

    Parses the username (returned as ``user_id``), the tweet body and the
    date out of the extracted text. The tweet body is whatever lies
    between the end of the username match and the start of the date match
    (or the end of the text when no date is found), stripped of
    surrounding whitespace.

    Args:
        extracted_text: A string denoting extracted text from image.

    Returns:
        A tuple of ``(entities, status)`` where ``entities`` is a dict
        with the keys ``user_id``, ``tweet`` and ``date`` and ``status``
        is a ``ResultStatus``. When no username is found every value is
        ``None`` and the status is ``ResultStatus.NO_RESULT``. When no
        date is found ``date`` is ``None`` and the status is still
        ``ResultStatus.ALL_OKAY``. For example: ::

            ({
                "user_id": "elonmusk",
                "tweet": "Ms. Tree caught the Falcon fairing!!",
                "date": datetime.datetime(2019, 6, 8, 7, 29,
                                          tzinfo=datetime.timezone.utc)
            }, ResultStatus.ALL_OKAY)

    Raises:
        TypeError: If ``extracted_text`` is not a string.
        ValueError: If ``extracted_text`` is empty.
    """
    if not isinstance(extracted_text, str):
        raise TypeError('Extracted text must be type string')
    if not extracted_text:
        raise ValueError('Extracted text cannot be empty')

    logger.info('Parsing data out of extracted text...')
    username_match = re.search(USERNAME_REGEX, extracted_text)
    datetime_match = re.search(DATETIME_REGEX, extracted_text)

    if not username_match:
        # Same keys as the success results, so callers can read 'date'
        # regardless of the returned status.
        return ({
            'user_id': None,
            'tweet': None,
            'date': None,
        }, ResultStatus.NO_RESULT)

    # Drop the first character of the match (presumably the leading '@';
    # depends on USERNAME_REGEX).
    user_id = username_match.group()[1:]

    tweet_start_index = username_match.end()
    # Slice ends are exclusive: use len(text), not len(text) - 1, so the
    # final character of the tweet is kept when there is no date.
    if datetime_match:
        tweet_end_index = datetime_match.start()
    else:
        tweet_end_index = len(extracted_text)
    tweet = extracted_text[tweet_start_index:tweet_end_index].strip()

    if not datetime_match:
        return ({
            'user_id': user_id,
            'tweet': tweet,
            'date': None,
        }, ResultStatus.ALL_OKAY)

    # Hyphens are removed before handing the string to the date parser.
    date_str = datetime_match.group().replace('-', '')
    # The parsed value is tagged as UTC (tzinfo is replaced, not converted).
    processed_datetime = date_parser.parse(date_str).replace(
        tzinfo=datetime.timezone.utc)
    return ({
        'user_id': user_id,
        'tweet': tweet,
        'date': processed_datetime,
    }, ResultStatus.ALL_OKAY)
def aggregate_tweets(self, user_id: str, date: datetime.datetime):
    """Collects the tweets a user posted on a given day.

    Queries the Twitter API through ``self._call_twitter_api`` for the
    given user and keeps only the entries whose creation date formats to
    the same day as ``date`` (per ``date_checker.format_for_date``) and
    passes ``date_checker.valid_date``.

    Args:
        user_id: Username whose tweets are requested.
        date: Day the returned tweets must belong to.

    Returns:
        A tuple ``(tweets, status)``. ``tweets`` is a list of the matching
        API entries (tweet objects, see
        https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object)
        and ``status`` is ``ResultStatus.ALL_OKAY``, or
        ``ResultStatus.NO_RESULT`` when the list is empty. When the API
        call reports a non-OK status, ``(None, that_status)`` is returned;
        when it raises, ``(None, ResultStatus.MODULE_FAILURE)``.

    Raises:
        TypeError: If ``user_id`` is not a string or ``date`` is not a
            ``datetime.datetime``.
        ValueError: If ``user_id`` or ``date`` is falsy.
    """
    has_valid_types = isinstance(user_id, str) and isinstance(
        date, datetime.datetime)
    if not has_valid_types:
        raise TypeError(
            'User ID must be type string and date must be type datetime.datetime'
        )
    if not (user_id and date):
        raise ValueError('User ID or Date cannot be empty')

    logger.info('Searching for tweet using Twitter API...')
    query = {
        app_config.TWEET_USERNAME_KEY: user_id,
        app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT,
    }

    try:
        api_entries, api_status = self._call_twitter_api(query)
        if api_status != ResultStatus.ALL_OKAY:
            return (None, api_status)
    except Exception as error:
        logger.exception(error)
        return (None, ResultStatus.MODULE_FAILURE)

    matching_tweets = []
    for api_entry in api_entries:
        posted_at = date_parser.parse(api_entry[app_config.TWEET_DATE_KEY])
        on_target_day = (date_checker.format_for_date(posted_at) ==
                         date_checker.format_for_date(date))
        # valid_date is only consulted for entries on the target day.
        if not (on_target_day and date_checker.valid_date(posted_at)):
            continue
        logger.debug('Tweet found...: ' +
                     str(api_entry[app_config.TWEET_TEXT_KEY]))
        matching_tweets.append(api_entry)

    if matching_tweets:
        outcome = ResultStatus.ALL_OKAY
    else:
        outcome = ResultStatus.NO_RESULT
    return (matching_tweets, outcome)
def rescale(file_path):
    """Writes a rescaled, thresholded grayscale PNG copy of an image.

    Runs the external ``convert`` command (presumably ImageMagick -
    confirm against the deployment) on ``file_path`` and stores the result
    under ``app_config.FILE_DIRECTORY`` with a UUID-based file name.

    Args:
        file_path: Path of the image to process.

    Returns:
        The path of the newly written PNG file.

    Raises:
        TypeError: If ``file_path`` is not a string.
        ValueError: If ``file_path`` is empty.
        subprocess.CalledProcessError: If ``convert`` exits non-zero.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path cannot be empty')

    logger.info('Rescaling Image to 300 dpi...')

    output_directory = app_config.FILE_DIRECTORY
    output_name = str(uuid.uuid1()) + '.png'
    output_path = os.path.join(output_directory, output_name)

    convert_options = [
        '-resample', '300x300',
        '-alpha', 'off',
        '-colorspace', 'Gray',
        '-threshold', '75%',
        '-density', '300x300',
        '-units', 'PixelsPerCentimeter',
        '-blur', '1x65000',
        '-level', '50x100%',
    ]
    # check=True raises CalledProcessError on a non-zero exit status.
    subprocess.run(['convert', file_path, *convert_options, output_path],
                   check=True)
    return output_path
def save_from_url(image_url: str, timeout: float = 30.0):
    """Saves image given via url to disk.

    Downloads ``image_url`` as a stream and, when the last path segment of
    the URL is accepted by ``allowed_file``, writes the body to a file of
    that name under ``app_config.FILE_DIRECTORY``.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled server
            would block the caller indefinitely.

    Returns:
        The path the image was saved to, or ``None`` when the file name
        taken from the URL is empty or not allowed.

    Raises:
        TypeError: If ``image_url`` is not a string.
        ValueError: If ``image_url`` is empty.
        FileNotFoundError: If the server does not answer with status 200.
    """
    if not isinstance(image_url, str):
        raise TypeError('image_url must be a string')
    if not image_url:
        raise ValueError('image_url has to be a valid string')

    r = requests.get(image_url,
                     stream=True,
                     allow_redirects=True,
                     timeout=timeout)
    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so always close it explicitly.
    try:
        if r.status_code != 200:
            raise FileNotFoundError()

        # NOTE(review): the name is the raw last URL segment, so any query
        # string is still part of it - confirm allowed_file copes with that.
        filename = image_url.split("/")[-1]
        if not filename or not allowed_file(filename):
            return None

        saved_file_path = os.path.join(app_config.FILE_DIRECTORY, filename)
        logger.info('Saving file to path: ' + saved_file_path)
        # Have the raw stream undo any transfer content-encoding while copying.
        r.raw.decode_content = True
        with open(saved_file_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        r.close()

    logger.info('Image successfully downloaded')
    return saved_file_path
def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
    """Computes a cosine-similarity matrix for the extracted tweet.

    Builds a corpus with the extracted tweet first, followed by the tweets
    of the target day, vectorizes it with the module-level
    ``count_vectorizer`` and compares every document with every other one
    using ``cosine_similarity``.

    Args:
        extracted_tweet: A string denoting extracted tweet from image.
        same_day_tweets: A list containing tweets of the target date.

    Returns:
        A tuple containing the similarity matrix and a ``ResultStatus``.
        If vectorizing or comparing raises, the tuple is
        ``(None, ResultStatus.MODULE_FAILURE)``. For example: ::

            ([[1.        0.9258201]
              [0.9258201 1.       ]], ResultStatus.ALL_OKAY)

    Raises:
        TypeError: If ``extracted_tweet`` is not a string or
            ``same_day_tweets`` is not a list.
        ValueError: If either argument is empty.
    """
    types_ok = isinstance(extracted_tweet, str) and isinstance(
        same_day_tweets, list)
    if not types_ok:
        raise TypeError(
            'Extracted tweet must be type str and Same day tweets must be type list'
        )
    if not (extracted_tweet and same_day_tweets):
        raise ValueError(
            'Extracted tweet must be a valid string and same day tweets must be a valid list'
        )

    logger.info('Processing similarity of two tweets...')
    # The extracted tweet goes first, ahead of the candidate tweets.
    corpus = [extracted_tweet, *same_day_tweets]
    logger.debug('Corpus: ' + str(corpus))

    try:
        term_counts = count_vectorizer.fit_transform(corpus)
        similarity_matrix = cosine_similarity(term_counts, term_counts)
    except Exception as error:
        logger.exception(error)
        return (None, ResultStatus.MODULE_FAILURE)

    logger.debug('Similartiy Matrix: ' + str(similarity_matrix))
    return (similarity_matrix, ResultStatus.ALL_OKAY)
def get_text(self, file_path: str):
    """Extracts text from image.

    Rescales the image with ``self.rescale``, runs
    ``pytesseract.image_to_string`` on the rescaled copy and removes that
    temporary copy again, whether or not the extraction succeeded.

    Args:
        file_path: Path of the image to read text from.

    Returns:
        A tuple ``(text, status)``: the extracted text with
        ``ResultStatus.ALL_OKAY``, ``(None, ResultStatus.NO_RESULT)`` when
        no text was found, or ``(None, ResultStatus.MODULE_FAILURE)`` when
        rescaling, opening the image or the extraction raised.

    Raises:
        TypeError: If ``file_path`` is not a string.
        ValueError: If ``file_path`` is empty.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path cannot be empty')

    logger.info('Processing Image...')
    try:
        new_file_path = self.rescale(file_path)
        try:
            logger.info('Extracting text from rescaled image...')
            # Close the image handle before the file is deleted; an open
            # handle leaks and can block removal on some platforms.
            with PIL.Image.open(new_file_path) as img:
                text = pytesseract.image_to_string(image=img)
        finally:
            # Best-effort cleanup: a failed removal is logged but must not
            # change the result of the extraction.
            try:
                os.remove(new_file_path)
            except Exception as e:
                logger.exception(e)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)

    if not text:
        return (None, ResultStatus.NO_RESULT)
    return (text, ResultStatus.ALL_OKAY)