def __init__(self):
     # NOTE(review): this top-level def duplicates OxfordDAO.__init__ defined
     # later in this file — it looks like a scrape/paste artifact, not live code.
     self._logger = Logger().get_logger()
     self._results = None
     self._words_not_found = 0
     self._app_id = None
     self._app_key = None
     self._load_credentials()
 def __init__(self, words):
     # NOTE(review): this def duplicates DynamoDAO.__init__ defined later in
     # this file — it looks like a scrape/paste artifact, not live code.
     self._logger = Logger().get_logger()
     self._words = words
     self._found = []
     self._not_found = []
     self._resource = connect_to_aws_resource('dynamodb')
     self._table = self._get_table()
     self._partition_key = self._get_partition_key()
 def __init__(self, words):
     """
     :param words: Words to be classified
     :type words: :class: `list`
     """
     # NOTE(review): this def duplicates WordClassifier.__init__ defined just
     # below — it looks like a scrape/paste artifact, not live code.
     self._logger = Logger().get_logger()
     self._dynamo_dao = DynamoDAO(words)
     self._oxford_dao = OxfordDAO()
     self._classified = []
class WordClassifier:
    """API to hide implementation of where lexical data about words comes from."""

    def __init__(self, words):
        """
        :param words: Words to be classified
        :type words: :class: `list`
        """
        self._logger = Logger().get_logger()
        self._dynamo_dao = DynamoDAO(words)
        self._oxford_dao = OxfordDAO()
        self._classified = []

    def classify(self):
        """
        Returns lexical data about passed words.

        DynamoDB acts as the first-choice lookup; any words it does not hold are
        classified through the Oxford Dictionaries API and written back to DynamoDB.

        :return: Lexical information about words
        :rtype: :class: `list`
        """
        self._logger.info("Looking up lexical data")

        self._dynamo_dao.check_storage()

        cached = self._dynamo_dao.found
        if cached:
            self._classified.extend(cached)
            self._logger.info(f"Word(s) found in DynamoDB: {len(self._dynamo_dao.found)}")

        missing = self._dynamo_dao.not_found
        if missing:
            oxford_classifications = self._oxford_dao.classify(missing)
            self._classified.extend(oxford_classifications)
            # Persist fresh classifications so the next run hits the cache.
            self._dynamo_dao.update_storage(oxford_classifications)
            self._logger.info(f"Word(s) not found in DynamoDB: {len(self._dynamo_dao.not_found)}")

        return self._classified
"""Utilities for interacting with AWS via boto3"""

# pylint: disable=invalid-name, logging-fstring-interpolation

import os

import boto3

from botocore.exceptions import ProfileNotFound, SSLError, ClientError, ConnectTimeoutError

from bananas_as_a_service.app_logger import Logger
from bananas_as_a_service.error_handler import GeneralError

# Module-level logger shared by the helpers in this module.
logger = Logger().get_logger()


def connect_to_aws_resource(resource_name):
    """
    Pass in the AWS service and return a boto3 resource.

    :param resource_name: Name of AWS service
    :type resource_name: :class: `str`
    :return: boto3 resource
    :rtype: :class: `boto3.resource`
    """
    logger.info(f"Attempting connection to AWS resource: {resource_name}")

    # NOTE(review): the `except` clause(s) of this `try` (presumably handling
    # the botocore errors imported above) are missing from this chunk — the
    # definition appears truncated here.
    try:
        # Region comes from the environment; verify=True enforces SSL cert checks.
        resource = boto3.resource(resource_name,
                                  region_name=os.environ.get('AWS_REGION'),
                                  verify=True)
class OxfordDAO:
    """Data Access Object for making requests to the Oxford Dictionaries API."""

    # TODO: investigate data classes
    _BASE_URL = 'https://od-api.oxforddictionaries.com:443/api/v1/inflections/en/'
    # Our result keys mapped to the API's field names inside each lexical entry.
    _TO_PARSE = {
        'categories': 'lexicalCategory',
        'features': 'grammaticalFeatures',
        'inflection': 'inflectionOf',
    }
    _HTTP_OK = 200
    _HTTP_FORBIDDEN = 403

    def __init__(self):
        self._logger = Logger().get_logger()
        self._results = None
        self._words_not_found = 0
        # Fetch the API credentials once at construction. Previously the
        # fetched values were discarded here (leaving these None) and
        # classify() re-fetched from Parameter Store on every call.
        self._app_id, self._app_key = self._load_credentials()

    def classify(self, tokens):
        """
        Request and parse lexical categories, grammatical features and inflections of words.

        As we are hitting an external API for every single word to classify, and calls to that API
        take about a second each, and each of these calls is I/O bound, and none of those calls can
        cause a race condition, let's go ahead and get all multi-threaded all up in this hizzle.

        Big shout out to these two MT-spirational peeps:
        https://www.shanelynn.ie/using-python-threading-for-multiple-results-queue/
        https://www.amazon.com/Core-Python-Applications-Programming-3rd/dp/0132678209

        :param tokens: Words to be classified
        :type tokens: :class: `list`
        :return: Lexical information about words
        :rtype: :class: `list`
        :raises GeneralError: When no words at all could be matched
        """
        self._logger.info(f"Classifying tokens: {tokens}")
        # TODO: use `Queue` for batching to prevent error and to make event driven

        # Reset per-call state so repeated classify() calls don't carry over
        # the "not found" tally from earlier invocations.
        self._words_not_found = 0
        self._results = [{} for _ in tokens]
        threads = []
        for index, token in enumerate(tokens):
            if isinstance(token, int):
                # Integers never hit the API; classify them locally as numbers.
                self._results[index] = {token: {'categories': ['number']}}
            else:
                thread = Thread(target=self._request_from_api,
                                args=(token, index, self._app_id, self._app_key))
                thread.start()
                threads.append(thread)

        for thread in threads:
            thread.join()

        if self._words_not_found:
            self._logger.info(
                f"Number of word(s) not found: {self._words_not_found}")

        if not self._results:
            raise GeneralError("Exiting due to no words matched")

        self._logger.info(
            f"Word(s) processed from OxfordDAO: {len(self._results)}")
        # Drop the empty slots left behind by failed lookups.
        return [result for result in self._results if result]

    @classmethod
    def _load_credentials(cls):
        # Credentials live in AWS SSM Parameter Store.
        ssm_parameters = get_from_parameter_store(['app_id', 'app_key'])
        return ssm_parameters['app_id'], ssm_parameters['app_key']

    def _request_from_api(self, token, index, app_id, app_key):
        # Runs on a worker thread. Each thread writes only its own index of
        # self._results, so the list itself needs no locking.
        # NOTE(review): `self._words_not_found += 1` is not atomic across
        # threads; under contention an increment could theoretically be lost.
        word = {}
        try:
            response = requests.get(f'{self._BASE_URL}{token.lower()}',
                                    headers={
                                        'app_id': app_id,
                                        'app_key': app_key
                                    })
            if response.status_code == self._HTTP_FORBIDDEN:
                raise GeneralError("Incorrect app credentials")
            if response.status_code != self._HTTP_OK:
                self._words_not_found += 1
                raise RequestException
        except RequestException as exc:
            self._logger.error(
                f"Unable to get word: '{token}' from API due to: {exc}",
                exc_info=True)
            self._results[index] = {}
        else:
            word = self._categorise(response, word)
            self._results[index] = {token: word}
        return True

    def _categorise(self, response, word):
        # Extract each field of interest from every lexical entry in the response.
        for key, value in self._TO_PARSE.items():
            parsed = [
                category.get(value) for category in dpath.values(
                    response.json(), '**/lexicalEntries/*')
            ]
            word.update({key: self._to_lower(parsed)})
        return word

    @classmethod
    def _to_lower(cls, parsed):
        # Normalise strings — and string values nested one level deep in dicts —
        # to lower case, mutating `parsed` in place.
        for index, item in enumerate(parsed):
            if isinstance(item, str):
                parsed[index] = item.lower()
            elif isinstance(item, list):
                for feature in item:
                    for key, value in feature.items():
                        feature.update({key: value.lower()})
        return parsed
class DynamoDAO:
    """
    Initialise with a list of words. These can be queried against DynamoDB. Words that are 'found'
    and 'not found' are saved to instance attributes. Accessors of the 'found' can use these to save
    a costly external API lookup; while 'not found' attributes can be looked up elsewhere and
    then saved to DynamoDB.
    """
    def __init__(self, words):
        """
        :param words: Words to look up in DynamoDB
        :type words: :class: `list`
        """
        self._logger = Logger().get_logger()
        self._words = words
        self._found = []
        self._not_found = []
        self._resource = connect_to_aws_resource('dynamodb')
        self._table = self._get_table()
        self._partition_key = self._get_partition_key()

    @property
    def found(self):
        """Returns words that are found in DynamoDB."""
        return self._found

    @property
    def not_found(self):
        """Returns words that are not found in DynamoDB."""
        return self._not_found

    # TODO: multi-thread this lookup
    def check_storage(self):
        """
        Looks up DynamoDB for metadata about word.

        The DynamoDB schema is defined as a String Partition Key but users may enter stringy ints.
        We definitely don't want to store or lookup ints as PKs so we do some hackery on the lookup
        side. Numbers to words and vice versa. This is because the OxfordDAO is well equipped to
        handle both kinds of "numbers" and DynamoDB is just for storage and retrieval purposes. Keep
        the AI tamed to a single place, lest we unleash the Machine Apocalypse.
        """
        self._logger.info(f"Checking DynamoDB storage for: {self._words}")

        for word in self._words:
            # Ints are looked up by their spelled-out form ("five"), never as ints.
            is_number, word = self._is_a_number(word)

            response = self._query_item(self._partition_key, word)
            if response.get('Count'):
                word = dpath.get(response, 'Items/*/word')
                self._logger.info(f"Word found in DynamoDB: {word}")
                item = {word: dpath.get(response, 'Items/*')}

                if is_number:
                    # Re-key the item by the int so callers see the original token.
                    value = item.get(word)
                    word = w2n.word_to_num(word)
                    item = {word: value}

                self._found.append(item)
            else:
                self._logger.info(f"Word not found in DynamoDB: {word}")

                if is_number:
                    word = w2n.word_to_num(word)

                self._not_found.append(word)

    # TODO: multi-thread this put
    def update_storage(self, oxford_classifications):
        """
        Stores Oxford provided classifications to DynamoDB.

        See docstring for check_storage(): why we do all this string to int and back again madness.

        :param oxford_classifications: Classified words to persist
        :type oxford_classifications: :class: list
        :raises GeneralError: On a DynamoDB ClientError during put
        """
        self._logger.info(
            f"Updating DynamoDB storage for: {oxford_classifications}")

        for classification in oxford_classifications:
            # Each classification is a single-key dict: {word: metadata}.
            word, *_ = list(classification)
            is_number, word = self._is_a_number(word)
            item = {self._partition_key: word}

            if is_number:
                # The classification dict is keyed by the int form.
                word = w2n.word_to_num(word)

            item.update(classification.get(word))
            # TODO: move put_item to AWS for consistency
            try:
                self._table.put_item(Item=item)
            except ClientError as exc:
                # Chain the cause so the original boto3 traceback is preserved.
                raise GeneralError(f"ClientError with DynamoDB put: {exc}") from exc
            else:
                if is_number:
                    word = num2words(word)
                self._logger.info(f"Updated DynamoDB with word: {word}")

    def _get_table(self):
        # Table name comes from the environment so deployments stay configurable.
        try:
            table = self._resource.Table(os.environ['TABLE_NAME'])
        except KeyError as exc:
            raise GeneralError(
                "Missing DynamoDB table name environment variable") from exc
        else:
            return table

    @classmethod
    def _get_partition_key(cls):
        # Partition key name also comes from the environment.
        try:
            partition_key = os.environ['PARTITION_KEY']
        except KeyError as exc:
            raise GeneralError(
                "Missing DynamoDB partition key environment variable") from exc
        else:
            return partition_key

    def _query_item(self, key_name, key_value):
        try:
            response = self._table.query(
                KeyConditionExpression=Key(key_name).eq(key_value))
        except ClientError as exc:
            raise GeneralError(f"ClientError with DynamoDB query: {exc}") from exc
        else:
            return response

    @classmethod
    def _is_a_number(cls, word):
        # (True, "five") for ints; (False, word) untouched for everything else.
        return (True, num2words(word)) if isinstance(word, int) else (False,
                                                                      word)
Exemple #8
0
 def __init__(self):
     # NOTE(review): this def duplicates Banana.__init__ defined just below —
     # it looks like a scrape/paste artifact, not live code.
     self._logger = Logger().get_logger()
Exemple #9
0
class Banana:
    """Turns common phrases into new and fun sentences."""

    # Basic English word ordering used when assembling sentences.
    _ORDERING = ['number', 'adverb', 'adjective', 'noun']
    # Sentence-initial numbers below this threshold are spelled out ("Five").
    _NUMBERS_AS_WORDS = 10
    _FIRST_WORD = 0

    def __init__(self):
        self._logger = Logger().get_logger()

    def execute(self, data):
        """
        Entry point to parse your friend's phrases.

        Functional/procedural style of programming where we fetch what we want, pass it for
        processing and use the returned value for the next process; not a true "object".

        :param data: Phrases to turn into sentences
        :type data: :class: `list`
        :return: Generated sentences
        :rtype: :class: `list`
        """
        self._logger.info(f"Executing Banana for data: {data}")

        tokens = self._tokenise(data)
        words_as_numbers = list(set(self._words_to_numbers(tokens)))
        classified = WordClassifier(words_as_numbers).classify()
        cleaned = self._clean(classified)
        ordered = self._order(cleaned)
        return self._make_some_sentences(ordered)

    @classmethod
    def _tokenise(cls, data):
        # Get rid of anything that isn't a word or space, then make them uniformly lower case.
        all_your_token_are_belong_to_us = set()
        for phrase in data:
            youre_not_special = re.sub(r'([^\s\w]|_)+', '', str(phrase))
            all_your_token_are_belong_to_us.update(
                set(youre_not_special.lower().split(' ')))
        return list(all_your_token_are_belong_to_us)

    @classmethod
    def _words_to_numbers(cls, tokens):
        # Convert number words ("five") to ints in place; other words pass through.
        for index, token in enumerate(tokens):
            try:
                as_number = w2n.word_to_num(token)
            except ValueError:
                pass  # Ignore other words
            else:
                tokens[index] = as_number
        return tokens

    def _clean(self, classified):
        # This is a very simple sentence generator so throw away categories we don't account for.
        # BUG FIX: the previous version called list.remove() while iterating the
        # same list, which skips the element after each removal and so could
        # leave unsupported categories behind. Rebuild the list instead.
        for word in classified:
            for value in word.values():
                value['categories'] = [
                    category for category in value.get('categories')
                    if category in self._ORDERING
                ]
        return classified

    def _order(self, cleaned):
        # TODO: Word == (adjective or noun) && (plural) && (before a noun) -> make singular e.g.:
        #   - Five easy bananas minutes. (Weird-as-a-Service)
        #   - Five easy banana minutes. (Totally sensible Driven Development)
        # Group words by lexical kind.
        mapped = {}
        for words in cleaned:
            for key, value in words.items():
                for kind in self._ORDERING:
                    if kind in value.get('categories'):
                        mapped.setdefault(kind, []).append(key)

        remove_empty = list(
            filter(None, [
                mapped.get('number'),
                mapped.get('adverb'),
                mapped.get('adjective'),
                mapped.get('noun'),
            ]))

        # Order the words in the sentence using a basic English language syntax.
        cartesian_product = list(product(*remove_empty))
        sentences = [
            list(self._flat_tuple(item)) for item in cartesian_product
        ]

        duplicates_removed = OrderedSet(
            [tuple(OrderedSet(sentence)) for sentence in sentences])
        # Having no numbers at the start of otherwise valid sentences is legit English.
        no_need_for_numbers = OrderedSet([
            tuple(self._get_rest(sentence)) for sentence in duplicates_removed
            if isinstance(self._get_first(sentence), int) and len(sentence) > 1
        ])
        return duplicates_removed.union(no_need_for_numbers)

    def _flat_tuple(self, nice_tuple):
        # Recursively flatten nested tuples/lists into one flat tuple.
        # Shout out to my man for inspiration on this one: https://adammonsen.com/post/176/
        if not isinstance(nice_tuple, (tuple, list)):
            return nice_tuple,
        if not nice_tuple:
            return tuple(nice_tuple)
        return (self._flat_tuple(self._get_first(nice_tuple)) +
                self._flat_tuple(self._get_rest(nice_tuple)))

    @classmethod
    def _get_first(cls, collection):
        # Head of any iterable collection.
        first, *_ = collection
        return first

    @classmethod
    def _get_rest(cls, collection):
        # Tail (everything but the head) of any iterable collection.
        _, *rest = collection
        return rest

    def _make_some_sentences(self, ordered):
        # Render each ordered word tuple as a capitalised, full-stop sentence.
        ordered = list(map(list, ordered))
        for sentence in ordered:
            first_word = self._get_first(sentence)
            self._set_first(first_word, sentence)
        return [f"{' '.join(sentence)}." for sentence in ordered]

    def _set_first(self, first_word, sentence):
        # Only use digits for 10 and up, below is words e.g. five. Always capitalise the sentence.
        if isinstance(first_word, int):
            sentence[self._FIRST_WORD] = (num2words(first_word).capitalize() if
                                          first_word < self._NUMBERS_AS_WORDS
                                          else str(first_word))
        else:
            sentence[self._FIRST_WORD] = str(first_word).capitalize()
"""Lambda handler for processing input, output and exceptions."""

# pylint: disable=invalid-name, logging-fstring-interpolation, broad-except

import json

from bananas_as_a_service.app_logger import Logger
from bananas_as_a_service.banana import Banana

# NOTE(review): other modules in this file call Logger().get_logger() on an
# instance; here get_logger is called on the class itself — confirm it is
# usable both ways (e.g. a classmethod/staticmethod) or align the call sites.
logger = Logger.get_logger()

# HTTP status codes used in the Lambda's responses.
HTTP_OK = 200
HTTP_INTERNAL_SERVER_ERROR = 500


def lambda_handler(event, context):
    """
    Top-level Banana Handler.

    :param event: Details of HTTP request
    :type event: :class: `dict`
    :param context: Runtime information
    :type context: :class: `LambdaContext`
    """
    logger.info(f"Beginning Lambda execution with context: {context}")

    body = json.loads(event.get('body'))
    logger.info(f"Lambda body: {body}")
    try:
        sentences = Banana().execute(body)
    except Exception as exc: