Beispiel #1
0
 def upload_file(self, file, public_read=False):
     """
     Upload the generated index page to the S3 bucket held in self.bucket.

     The local path ('../output/index.html') and the object key
     ('index.html') are fixed inside this method; the ``file`` argument is
     accepted for interface compatibility but is not read.
     :param file: The file to be uploaded (currently unused by the body).
     :param public_read: when truthy, the object is uploaded with the
     'public-read' ACL; otherwise no ExtraArgs are sent.
     :return: True if the upload call returned normally, False if it
     raised S3UploadFailedError (which is reported through handle_error)
     """
     # Build the call arguments once; only the ACL differs between the
     # public and the private upload.
     upload_kwargs = {
         'Filename': '../output/index.html',
         'Key': 'index.html',
     }
     if public_read:
         upload_kwargs['ExtraArgs'] = {'ACL': 'public-read'}

     try:
         self.bucket.upload_file(**upload_kwargs)
     except exceptions.S3UploadFailedError as upload_err:
         handle_error(
             exc=exceptions.S3UploadFailedError,
             err=upload_err
         )
         return False
     else:
         return True
Beispiel #2
0
    def get_all_rows(self, table_name):
        """
        Retrieves all rows by scanning a DynamoDB table.

        The scan is repeated, passing each response's 'LastEvaluatedKey' as
        'ExclusiveStartKey', until a response no longer contains
        'LastEvaluatedKey'. Every page request is covered by the same error
        handling, so a failure on any page yields None.
        :param table_name: the name of the table to scan
        :return: returns all rows as a list of dicts if successful, returns
        None if a ClientError was raised by any scan call
        """
        table = self.dynamodb.Table(table_name)

        items = []
        scan_kwargs = {}  # empty for the first page, then carries the cursor

        try:
            while True:
                response = table.scan(**scan_kwargs)
                items.extend(response['Items'])
                if 'LastEvaluatedKey' not in response:
                    break
                scan_kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']
        except ClientError as e:
            error = e.response['Error']['Code']
            if error == 'ResourceNotFoundException':
                handle_error(
                    exc=error,
                    err=e,
                    msg='table_name "{t}" was not found'.format(t=table_name))
            else:
                handle_error(exc=error, err=e, msg='unknown error')
            # Partial results are discarded: callers get either every row
            # or None, never a silently truncated list.
            return None

        return items
Beispiel #3
0
def load_sites(dynamo_session):
    """
    Load Dynamo data and instantiate site objects (with scraping & api calls).

    Rows are read from the 'sites' table through
    dynamo_session.get_all_rows. A row for which Site() raises ValueError
    is reported through handle_error and left out of the result.
    :param dynamo_session: a session (connection) to DynamoDB
    :return: a list of site objects; an empty list when get_all_rows
    returned None or no rows.
    """
    items = dynamo_session.get_all_rows(
        table_name='sites'
    )
    # get_all_rows returns None when the scan failed; there is nothing to
    # build in that case, and iterating None would raise TypeError.
    if items is None:
        return []

    site_objects = []
    # Turn the DynamoDB rows about the sites into a list of Site objects.
    for item in items:
        try:
            site_obj = Site(item)
        except ValueError as err:
            handle_error(err=err)
            # Skip this row: without the continue, site_obj would be either
            # unbound (first row) or the previous row's Site (duplicate).
            continue
        site_objects.append(site_obj)
    return site_objects
Beispiel #4
0
    def batch_update_rows(self, table_name, items):
        """
        Write a list of objects to the DynamoDB table named table_name.

        Each object is converted to a dict with vars() and sent with
        put_item through the table's batch writer.
        :param table_name: the name of the DynamoDB table to be updated
        :param items: a list of object(s) whose attributes form the rows
        :return: True if the batch completed, False if a ClientError was
        raised (it is reported through handle_error)
        """
        target_table = self.dynamodb.Table(table_name)

        try:
            with target_table.batch_writer() as writer:
                for obj in items:
                    # vars() exposes the object's attribute dict as the row.
                    writer.put_item(Item=vars(obj))
        except ClientError as e:
            handle_error(exc=ClientError, err=e, msg="unknown error")
            return False
        else:
            return True
Beispiel #5
0
    def __init__(self, site_dict):
        """
        Instantiate a Site object.

        Copies every key of site_dict onto the instance, tidies self.url,
        makes sure self.data exists, then runs every entry of
        self.directives through the function registered for its 'type' and
        stores the results in self.data. The total run time is saved in
        self.elapsed_seconds.

        site_dict must supply at least 'url' and 'directives': both are
        read below without a fallback. Each entry of 'directives' must have
        a "parameters" key and a 'type' key whose value is one of the keys
        of directives_map ('moz', 'scrape_newest', 'twitter').
        :param site_dict: a dict with Site's existing config and data
        :raises ValueError: if site_dict could not be copied onto the
        instance
        """
        # Start of the overall timer; paired with time_end at the bottom.
        time_start = datetime.datetime.utcnow()
        # Copy site_dict keys to Site keys.
        try:
            self.__dict__.update(site_dict)
        except Exception as e:
            # Report the underlying error, then surface a ValueError to the
            # caller.
            handle_error(
                err=e,
                msg='could not copy JSON to Site keys -- invalid/missing JSON?'
            )
            raise ValueError('could not create Site with JSON: {j}'.format(
                j=site_dict
            ))
        # Set after the copy above, so any 'test_mode' key in site_dict is
        # overwritten.
        self.test_mode = False  # test_mode caches HTML; False means no cache

        # Clean and establish default self.url values if not already present.
        self.url = tidy_url(self.url)

        # Create Site's filename to be used for local caching if test_mode=True
        # (test_mode is hard-coded to False above, so this branch does not
        # run as written).
        if self.test_mode:
            self.url['filename'] = generate_filename(self.url)

        # Create data branch of Site if it doesn't already exist. len() is
        # used only as a probe: the attribute lookup raises AttributeError
        # when site_dict did not provide 'data'.
        try:
            len(self.data)
        except AttributeError:
            self.data = {}
        # Make sure all top-level keys within self.data are present
        self.data = setup_data_branch(
            data_dict=self.data,
            directives_dict=self.directives
        )

        # directives_map holds instructions of how to handle the different
        # types of directives: which function to call and which parameters
        # to pass to that function. This enables, below, to make a single loop
        # through self.directives without using any if statements.
        # The 'params' entries are placeholders that the loop overwrites
        # before each call.
        directives_map = {
            'moz': {
                'func': moz_search,
                'params_to_pass': {
                    'params': 'this will be replaced with params',
                }
            },
            'scrape_newest': {
                'func': scrape_newest,
                'params_to_pass': {
                    'url': self.url,
                    'params': 'this will be replaced with params',
                    'test_mode': self.test_mode
                }
            },
            'twitter': {
                'func': twitter_search,
                'params_to_pass': {
                    'params': 'this will be replaced with params',
                }
            }
        }

        # Follow all the directives (to scrape and ping apis) and save all
        # the collected data into the proper locations within self.data
        # TODO: handle errors here in case of incomplete/incorrect directives
        # (as written, a missing "parameters"/'type' key or a 'type' that is
        # not in directives_map raises KeyError).
        for directive in self.directives:
            # Per-directive timer, distinct from time_start above.
            start_time = datetime.datetime.utcnow()
            params = self.directives[directive]["parameters"]
            # d_type points to the top-level key within directives_map (e.g.
            # 'moz', 'twitter', 'scrape_newest')
            d_type = self.directives[directive]['type']
            # Save the relevant value from the "parameters" key in
            # self.directives to the relevant 'params' key in directives_map.
            directives_map[d_type]['params_to_pass']['params'] = params
            # func is the function to be called
            func = directives_map[d_type]['func']
            # params_to_pass are the parameters to pass to func
            params_to_pass = directives_map[d_type]['params_to_pass']
            # Call func, passing in params_to_pass and start_time, which is
            # used to calculate how long func took to run. response will be
            # a list of dict(s).
            response = func(**params_to_pass, start_time=start_time)
            # Unpack the list of dicts(s) returned in response and save them
            # to the relevant lists within self.data.
            self.data = unpack_and_save_list(
                list_of_dicts=response,
                data_dict=self.data,
                location=directive
            )

        # Remove any empty dict key/value pairs from self.data if they
        # exist. This is needed because DynamoDB can't save empty strings
        # as dict values.
        self.data = prune_empty_branches(self.data)
        time_end = datetime.datetime.utcnow()
        # Total construction time in seconds, converted to Decimal via its
        # string form.
        # NOTE(review): presumably Decimal is used because the value is later
        # written to DynamoDB -- confirm.
        self.elapsed_seconds = Decimal(
            str((time_end - time_start).total_seconds())
        )