def __init__(self):
    """Set up logging, result accumulators and the Oxford API credentials."""
    self._logger = Logger().get_logger()
    # Populated by classify(): one result slot per token.
    self._results = None
    self._words_not_found = 0
    # Bug fix: _load_credentials() *returns* the (app_id, app_key) pair; the
    # original discarded that return value and left both attributes None.
    self._app_id, self._app_key = self._load_credentials()
def __init__(self, words):
    """Prepare a DynamoDB-backed lookup for the given words.

    :param words: Words to look up in / persist to DynamoDB
    :type words: :class: `list`
    """
    self._logger = Logger().get_logger()
    self._words = words
    # Both lists are populated later by check_storage().
    self._found, self._not_found = [], []
    # Resolve the AWS resource, table and key configuration up front so a
    # misconfigured environment fails fast at construction time.
    self._resource = connect_to_aws_resource('dynamodb')
    self._table = self._get_table()
    self._partition_key = self._get_partition_key()
def __init__(self, words):
    """Build the classifier around its two data access objects.

    :param words: Words to be classified
    :type words: :class: `list`
    """
    self._logger = Logger().get_logger()
    # DynamoDB acts as the cache; Oxford is only hit for cache misses.
    self._dynamo_dao = DynamoDAO(words)
    self._oxford_dao = OxfordDAO()
    self._classified = []
class WordClassifier:
    """API to hide implementation of where lexical data about words comes from."""

    def __init__(self, words):
        """
        :param words: Words to be classified
        :type words: :class: `list`
        """
        self._logger = Logger().get_logger()
        self._dynamo_dao = DynamoDAO(words)
        self._oxford_dao = OxfordDAO()
        self._classified = []

    def classify(self):
        """
        Return lexical data about the words supplied at construction.

        DynamoDB is consulted first as a cache; any words missing there are
        classified via the Oxford Dictionaries API and the results written
        back to DynamoDB for next time.

        :return: Lexical information about words
        :rtype: :class: `list`
        """
        self._logger.info("Looking up lexical data")
        self._dynamo_dao.check_storage()
        cached = self._dynamo_dao.found
        if cached:
            self._classified.extend(cached)
            self._logger.info(f"Word(s) found in DynamoDB: {len(cached)}")
        missing = self._dynamo_dao.not_found
        if missing:
            from_oxford = self._oxford_dao.classify(missing)
            self._classified.extend(from_oxford)
            # Persist the fresh classifications so future runs hit the cache.
            self._dynamo_dao.update_storage(from_oxford)
            self._logger.info(f"Word(s) not found in DynamoDB: {len(missing)}")
        return self._classified
"""Utilities for interacting with AWS via boto3""" # pylint: disable=invalid-name, logging-fstring-interpolation import os import boto3 from botocore.exceptions import ProfileNotFound, SSLError, ClientError, ConnectTimeoutError from bananas_as_a_service.app_logger import Logger from bananas_as_a_service.error_handler import GeneralError logger = Logger().get_logger() def connect_to_aws_resource(resource_name): """ Pass in the AWS service and return a boto3 resource. :param resource_name: Name of AWS service :type resource_name: :class: `str` :return: boto3 resource :rtype: :class: `boto3.resource` """ logger.info(f"Attempting connection to AWS resource: {resource_name}") try: resource = boto3.resource(resource_name, region_name=os.environ.get('AWS_REGION'), verify=True)
class OxfordDAO:
    """Data Access Object for making requests to the Oxford Dictionaries API."""

    # TODO: investigate data classes
    _BASE_URL = 'https://od-api.oxforddictionaries.com:443/api/v1/inflections/en/'
    # Maps our result keys to the corresponding keys in the Oxford response.
    _TO_PARSE = {
        'categories': 'lexicalCategory',
        'features': 'grammaticalFeatures',
        'inflection': 'inflectionOf',
    }
    _HTTP_OK = 200
    _HTTP_FORBIDDEN = 403

    def __init__(self):
        """Set up logging, result accumulators and the API credentials."""
        self._logger = Logger().get_logger()
        # One result slot per token; filled in by worker threads in classify().
        self._results = None
        self._words_not_found = 0
        # Bug fix: _load_credentials() *returns* the credentials; the original
        # discarded the return value, left these None, and re-fetched from
        # Parameter Store on every classify() call instead.
        self._app_id, self._app_key = self._load_credentials()

    def classify(self, tokens):
        """
        Request and parse lexical categories, grammatical features and
        inflections of words.

        As we are hitting an external API for every single word to classify,
        and calls to that API take about a second each, and each of these calls
        is I/O bound, and none of those calls can cause a race condition, let's
        go ahead and get all multi-threaded all up in this hizzle.

        Big shout out to these two MT-spirational peeps:
        https://www.shanelynn.ie/using-python-threading-for-multiple-results-queue/
        https://www.amazon.com/Core-Python-Applications-Programming-3rd/dp/0132678209

        :param tokens: Words to be classified
        :type tokens: :class: `list`
        :return: Lexical information about words
        :rtype: :class: `list`
        :raises GeneralError: if no word yields any lexical data
        """
        self._logger.info(f"Classifying tokens: {tokens}")
        # TODO: use `Queue` for batching to prevent error and to make event driven
        self._results = [{} for _ in tokens]
        threads = []
        for index, token in enumerate(tokens):
            if isinstance(token, int):
                # Numbers need no API round trip.
                self._results[index] = {token: {'categories': ['number']}}
            else:
                thread = Thread(target=self._request_from_api,
                                args=(token, index, self._app_id, self._app_key))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()
        if self._words_not_found:
            self._logger.info(
                f"Number of word(s) not found: {self._words_not_found}")
        # Bug fix: the pre-filled `{}` slots make `self._results` a truthy list
        # for any non-empty `tokens`, so the original `if not self._results`
        # check could never fire when every lookup failed. Filter first, then
        # decide whether anything matched.
        matched = [result for result in self._results if result]
        if not matched:
            raise GeneralError("Exiting due to no words matched")
        self._logger.info(
            f"Word(s) processed from OxfordDAO: {len(self._results)}")
        return matched

    @classmethod
    def _load_credentials(cls):
        """Return the (app_id, app_key) pair from SSM Parameter Store."""
        ssm_parameters = get_from_parameter_store(['app_id', 'app_key'])
        return ssm_parameters['app_id'], ssm_parameters['app_key']

    def _request_from_api(self, token, index, app_id, app_key):
        """Fetch and parse one token's lexical data; runs on a worker thread.

        Writes the outcome into `self._results[index]` (each thread owns a
        distinct slot, so the list writes are safe).
        """
        word = {}
        try:
            response = requests.get(f'{self._BASE_URL}{token.lower()}',
                                    headers={
                                        'app_id': app_id,
                                        'app_key': app_key
                                    })
            if response.status_code == self._HTTP_FORBIDDEN:
                raise GeneralError("Incorrect app credentials")
            if response.status_code != self._HTTP_OK:
                # NOTE(review): unsynchronised `+=` across threads can lose
                # increments; the value only feeds a log line, so this is
                # benign, but guard it with a Lock if it ever matters.
                self._words_not_found += 1
                raise RequestException
        except RequestException as exc:
            self._logger.error(
                f"Unable to get word: '{token}' from API due to: {exc}",
                exc_info=True)
            self._results[index] = {}
        else:
            word = self._categorise(response, word)
            self._results[index] = {token: word}
        return True

    def _categorise(self, response, word):
        """Extract the _TO_PARSE fields from the API response into `word`."""
        for key, value in self._TO_PARSE.items():
            parsed = [
                category.get(value) for category in dpath.values(
                    response.json(), '**/lexicalEntries/*')
            ]
            word.update({key: self._to_lower(parsed)})
        return word

    @classmethod
    def _to_lower(cls, parsed):
        """Lower-case parsed values in place: bare strings and the string
        values of any dicts nested one list level down."""
        for index, item in enumerate(parsed):
            if isinstance(item, str):
                parsed[index] = item.lower()
            elif isinstance(item, list):
                for feature in item:
                    for key, value in feature.items():
                        feature.update({key: value.lower()})
        return parsed
class DynamoDAO:
    """
    Initialise with a list of words. These can be queried against DynamoDB.

    Words that are 'found' and 'not found' are saved to instance attributes.
    Accessors of the 'found' can use these to save a costly external API
    lookup; while 'not found' attributes can be looked up elsewhere and then
    saved to DynamoDB.
    """

    def __init__(self, words):
        """
        :param words: Words to look up in / persist to DynamoDB
        :type words: :class: `list`
        """
        self._logger = Logger().get_logger()
        self._words = words
        self._found = []
        self._not_found = []
        self._resource = connect_to_aws_resource('dynamodb')
        self._table = self._get_table()
        self._partition_key = self._get_partition_key()

    @property
    def found(self):
        """Returns words that are found in DynamoDB."""
        return self._found

    @property
    def not_found(self):
        """Returns words that are not found in DynamoDB."""
        return self._not_found

    # TODO: multi-thread this lookup
    def check_storage(self):
        """
        Looks up DynamoDB for metadata about word.

        The DynamoDB schema is defined as a String Partition Key but users may
        enter stringy ints. We definitely don't want to store or lookup ints
        as PKs so we do some hackery on the lookup side. Numbers to words and
        vice versa. This is because the OxfordDAO is well equipped to handle
        both kinds of "numbers" and DynamoDB is just for storage and retrieval
        purposes. Keep the AI tamed to a single place, lest we unleash the
        Machine Apocalypse.
        """
        self._logger.info(f"Checking DynamoDB storage for: {self._words}")
        for word in self._words:
            # ints are looked up under their word form, e.g. 5 -> "five".
            is_number, word = self._is_a_number(word)
            response = self._query_item(self._partition_key, word)
            if response.get('Count'):
                word = dpath.get(response, 'Items/*/word')
                self._logger.info(f"Word found in DynamoDB: {word}")
                item = {word: dpath.get(response, 'Items/*')}
                if is_number:
                    # Re-key the item by its numeric form for downstream use.
                    value = item.get(word)
                    word = w2n.word_to_num(word)
                    item = {word: value}
                self._found.append(item)
            else:
                self._logger.info(f"Word not found in DynamoDB: {word}")
                if is_number:
                    word = w2n.word_to_num(word)
                self._not_found.append(word)

    # TODO: multi-thread this put
    def update_storage(self, oxford_classifications):
        """
        Stores Oxford provided classifications to DynamoDB.

        See docstring for check_storage(): why we do all this string to int
        and back again madness.

        :param oxford_classifications: Classifications keyed by word
        :type oxford_classifications: :class: list
        :raises GeneralError: on a DynamoDB client error during put
        """
        self._logger.info(
            f"Updating DynamoDB storage for: {oxford_classifications}")
        for classification in oxford_classifications:
            # Each classification is a single-key dict: {word: metadata}.
            word, *_ = list(classification)
            is_number, word = self._is_a_number(word)
            item = {self._partition_key: word}
            if is_number:
                word = w2n.word_to_num(word)
            item.update(classification.get(word))
            # TODO: move put_item to AWS for consistency
            try:
                self._table.put_item(Item=item)
            except ClientError as exc:
                # Chain the cause so the underlying boto3 error is not lost.
                raise GeneralError(
                    f"ClientError with DynamoDB put: {exc}") from exc
            else:
                if is_number:
                    word = num2words(word)
                self._logger.info(f"Updated DynamoDB with word: {word}")

    def _get_table(self):
        """Return the boto3 Table named by the TABLE_NAME env var."""
        try:
            table = self._resource.Table(os.environ['TABLE_NAME'])
        except KeyError as exc:
            raise GeneralError(
                "Missing DynamoDB table name environment variable") from exc
        else:
            return table

    @classmethod
    def _get_partition_key(cls):
        """Return the partition key name from the PARTITION_KEY env var."""
        try:
            partition_key = os.environ['PARTITION_KEY']
        except KeyError as exc:
            raise GeneralError(
                "Missing DynamoDB partition key environment variable") from exc
        else:
            return partition_key

    def _query_item(self, key_name, key_value):
        """Query the table for a single partition-key value."""
        try:
            response = self._table.query(
                KeyConditionExpression=Key(key_name).eq(key_value))
        except ClientError as exc:
            raise GeneralError(
                f"ClientError with DynamoDB query: {exc}") from exc
        else:
            return response

    @classmethod
    def _is_a_number(cls, word):
        """Return (True, word-as-words) for ints, else (False, word) unchanged."""
        return (True, num2words(word)) if isinstance(word, int) else (False, word)
def __init__(self):
    # Acquire the shared application logger; no other state is set up here.
    self._logger = Logger().get_logger()
class Banana:
    """Turns common phrases into new and fun sentences."""

    # Basic English ordering for the generated sentences.
    _ORDERING = ['number', 'adverb', 'adjective', 'noun']
    # Only use digits for 10 and up; below that, spell the number out.
    _NUMBERS_AS_WORDS = 10
    _FIRST_WORD = 0

    def __init__(self):
        self._logger = Logger().get_logger()

    def execute(self, data):
        """
        Entry point to parse your friend's phrases.

        Functional/procedural style of programming where we fetch what we
        want, pass it for processing and use the returned value for the next
        process; not a true "object".
        """
        self._logger.info(f"Executing Banana for data: {data}")
        tokens = self._tokenise(data)
        words_as_numbers = list(set(self._words_to_numbers(tokens)))
        classified = WordClassifier(words_as_numbers).classify()
        cleaned = self._clean(classified)
        ordered = self._order(cleaned)
        return self._make_some_sentences(ordered)

    @classmethod
    def _tokenise(cls, data):
        # Get rid of anything that isn't a word or space, then make them
        # uniformly lower case.
        all_your_token_are_belong_to_us = set()
        for phrase in data:
            youre_not_special = re.sub(r'([^\s\w]|_)+', '', str(phrase))
            all_your_token_are_belong_to_us.update(
                set(youre_not_special.lower().split(' ')))
        return list(all_your_token_are_belong_to_us)

    @classmethod
    def _words_to_numbers(cls, tokens):
        # Replace tokens like "five" with 5 in place; leave everything else.
        for index, token in enumerate(tokens):
            try:
                as_number = w2n.word_to_num(token)
            except ValueError:
                pass  # Ignore other words
            else:
                tokens[index] = as_number
        return tokens

    def _clean(self, classified):
        # This is a very simple sentence generator so throw away categories we
        # don't account for.
        for word in classified:
            for value in word.values():
                # Bug fix: the original called list.remove() while iterating
                # the same list, which skips the element following each
                # removal and so could leave unwanted categories behind.
                value['categories'] = [
                    category for category in value.get('categories')
                    if category in self._ORDERING
                ]
        return classified

    def _order(self, cleaned):
        # TODO: Word == (adjective or noun) && (plural) && (before a noun) ->
        # make singular e.g.:
        # - Five easy bananas minutes. (Weird-as-a-Service)
        # - Five easy banana minutes. (Totally sensible Driven Development)
        # Bucket each word under every category it belongs to.
        mapped = {}
        for words in cleaned:
            for key, value in words.items():
                for kind in self._ORDERING:
                    if kind in value.get('categories'):
                        mapped.setdefault(kind, []).append(key)
        remove_empty = list(
            filter(None, [
                mapped.get('number'),
                mapped.get('adverb'),
                mapped.get('adjective'),
                mapped.get('noun'),
            ]))
        # Order the words in the sentence using a basic English language syntax.
        cartesian_product = list(product(*remove_empty))
        sentences = [
            list(self._flat_tuple(item)) for item in cartesian_product
        ]
        duplicates_removed = OrderedSet(
            [tuple(OrderedSet(sentence)) for sentence in sentences])
        # Having no numbers at the start of otherwise valid sentences is
        # legit English.
        no_need_for_numbers = OrderedSet([
            tuple(self._get_rest(sentence)) for sentence in duplicates_removed
            if isinstance(self._get_first(sentence), int) and len(sentence) > 1
        ])
        return duplicates_removed.union(no_need_for_numbers)

    def _flat_tuple(self, nice_tuple):
        # Recursively flatten nested tuples/lists into one flat tuple.
        # Shout out to my man for inspiration on this one:
        # https://adammonsen.com/post/176/
        if not isinstance(nice_tuple, (tuple, list)):
            return nice_tuple,
        if not nice_tuple:
            return tuple(nice_tuple)
        return (self._flat_tuple(self._get_first(nice_tuple)) +
                self._flat_tuple(self._get_rest(nice_tuple)))

    @classmethod
    def _get_first(cls, collection):
        first, *_ = collection
        return first

    @classmethod
    def _get_rest(cls, collection):
        _, *rest = collection
        return rest

    def _make_some_sentences(self, ordered):
        # Render each ordered word tuple as a capitalised, full-stop sentence.
        ordered = list(map(list, ordered))
        for sentence in ordered:
            first_word = self._get_first(sentence)
            self._set_first(first_word, sentence)
        return [f"{' '.join(sentence)}." for sentence in ordered]

    def _set_first(self, first_word, sentence):
        # Only use digits for 10 and up, below is words e.g. five. Always
        # capitalise the sentence.
        if isinstance(first_word, int):
            sentence[self._FIRST_WORD] = (num2words(first_word).capitalize()
                                          if first_word < self._NUMBERS_AS_WORDS
                                          else str(first_word))
        else:
            sentence[self._FIRST_WORD] = str(first_word).capitalize()
"""Lambda handler for processing input, output and exceptions.""" # pylint: disable=invalid-name, logging-fstring-interpolation, broad-except import json from bananas_as_a_service.app_logger import Logger from bananas_as_a_service.banana import Banana logger = Logger.get_logger() HTTP_OK = 200 HTTP_INTERNAL_SERVER_ERROR = 500 def lambda_handler(event, context): """ Top-level Banana Handler. :param event: Details of HTTP request :type event: :class: `dict` :param context: Runtime information :type context: :class: `LambdaContext` """ logger.info(f"Beginning Lambda execution with context: {context}") body = json.loads(event.get('body')) logger.info(f"Lambda body: {body}") try: sentences = Banana().execute(body) except Exception as exc: