Example #1
def upsert_metrics(stats: Dict):
    """Persist spider stats to the monitoring DB, dispatching on datatype."""
    with db_conn_orm(Settings()['MONITORING_DB']) as sesh:
        if 'spotcharter' in stats.get('datatype', ''):
            item = map_keys(stats.get('spider_attribute_stats'), sc_mapping())
            citem = map_keys(stats, common_mapping())
            item.update(
                citem,
                datatype='spotcharter',
            )
            spider_stats = SpotCharterStats(**item)
        elif 'portcall' in stats.get('datatype', ''):
            item = map_keys(stats.get('spider_attribute_stats'), cml_mapping())
            citem = map_keys(stats, common_mapping())
            item.update(
                citem,
                datatype='portcall',
            )
            spider_stats = CargoMovementStats(**item)
        elif 'cargo_movement' in stats.get('datatype', ''):
            item = map_keys(stats.get('spider_attribute_stats'), cml_mapping())
            citem = map_keys(stats, common_mapping())
            item.update(
                citem,
                datatype='cargo_movement',
            )
            spider_stats = CargoMovementStats(**item)
        else:
            logger.info('redshift monitoring not supported for datatype %s',
                        stats.get('datatype'))
            return

        sesh.add(spider_stats)
        sesh.commit()
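
A minimal sketch of the payload shape `upsert_metrics` expects, inferred only
from the key lookups above; the nested values and the common field are
hypothetical:

stats = {
    'datatype': 'spotcharter',
    # remapped via sc_mapping() into SpotCharterStats columns
    'spider_attribute_stats': {'items_scraped': 120},  # hypothetical counter
    # top-level keys are remapped via common_mapping()
    'spider_name': 'example_spider',  # hypothetical common field
}
upsert_metrics(stats)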
Example #2
def __init__(self):
    settings = Settings()
    validate_settings(
        'GMAIL_CLIENT_SECRET',
        'GMAIL_ACCESS_TOKEN',
        'GMAIL_CLIENT_ID',
        'GMAIL_REFRESH_TOKEN',
        'GMAIL_TOKEN_EXPIRY',
    )
    credentials = self._build_credentials(settings)
    self.session = credentials.authorize(Http())
    self._drive = self._build_service(self.session, 'drive', 'v3')
    self._sheets = self._build_service(self.session, 'sheets', 'v4')
Example #3
def __init__(self):
    settings = Settings()
    validate_settings('GOOGLE_DRIVE_DEFAULT_USER',
                      'GOOGLE_DRIVE_PRIVATE_KEY',
                      'GOOGLE_DRIVE_PRIVATE_KEY_ID')
    credentials = self._build_credentials(
        settings['GOOGLE_DRIVE_DEFAULT_USER'],
        self._json_credentials(settings))
    # the API only supports this HTTP lib, but it probably messes with how
    # Scrapy schedules the requests, making them sequential and therefore a
    # lot slower. Keep this in mind when we migrate a lot of this type of
    # scraper.
    self.session = credentials.authorize(Http())
    self._drive = self._build_service(self.session, 'drive', 'v3')
    self._sheets = self._build_service(self.session, 'sheets', 'v4')
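
For context, `_build_service` most likely wraps the standard
google-api-python-client discovery helper; a plausible sketch under that
assumption (not confirmed by the snippet):

from googleapiclient.discovery import build

def _build_service(self, http, name, version):
    # assumed implementation: delegate to the discovery helper, reusing
    # the authorized Http session built in __init__
    return build(name, version, http=http)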
Example #4
def connect(base):
    """API sugar for accessing table data by table name.

    For docs, see https://airtable.com/appY2FMxUDFExuYGK/api/docs

    Args:
        base (str): key into BASE_MAPPING identifying the Airtable base

    Returns:
        Airtable: client authenticated via the AIRTABLE_API_KEY setting

    """
    validate_settings('AIRTABLE_API_KEY')

    return airtable.Airtable(BASE_MAPPING[base], Settings()['AIRTABLE_API_KEY'])
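
Usage sketch; the base alias below is hypothetical and must exist as a key in
BASE_MAPPING:

base = connect('ship_agents')  # hypothetical BASE_MAPPING key
# `base` is an airtable.Airtable client for reading that base's tables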
Example #5
def select_channel():
    """Abstract how we decide which channel to target.

    Useful choices:

        crew: '#crew-process-reports'
        test: '#dev-alerts-stg'
        dev: '#dev-alerts-data'
        engineering: '#crew-data-sourcing'
        analysts: '#sgp-data'

    """
    # use channel defined in project settings
    # overriding channel in Scrapinghub spider settings will not work
    return Settings().get('SLACK_CHANNEL') or DEFAULT_CHANNEL
Example #6
def send(text, channel=None, attachments=None):
    sc = SlackClient(Settings().get('SLACK_TOKEN'))
    channel = channel or select_channel()

    res = sc.api_call(
        'chat.postMessage',
        channel=channel,
        text=text,
        # override username (otherwise inferred from token)
        username=PROJECT_USER,
        mrkdwn=True,
        attachments=attachments,
        icon_emoji=':mailchimp:',
    )

    return res
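
Usage sketch, combining `send` with one of the channels listed in
`select_channel` above; the message and attachment contents are hypothetical,
but the attachment keys follow Slack's documented schema:

res = send(
    'spider finished with 3 warnings',
    channel='#dev-alerts-data',
    attachments=[{'text': 'see job logs for details', 'color': 'warning'}],
)
assert res.get('ok'), res.get('error')  # slackclient returns a dict with `ok`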
Example #7
@contextmanager  # assumed decorator (`from contextlib import contextmanager`); the body yields
def connect_to_s3(config=None):
    """Wraps S3 conn with defaults and an error handler.

    Args:
        config (dict): AWS credentials defined like the standard ENV vars

    """
    settings = config or Settings()
    logger.debug('Connecting to S3 service')

    try:
        yield boto3.resource(
            's3',
            aws_access_key_id=settings['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=settings['AWS_SECRET_ACCESS_KEY'],
        )
    except ClientError as e:
        logger.error(f'Failed to connect to S3: {e}')
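
Usage sketch, assuming the function carries the `contextmanager` decorator as
noted above; the bucket name is hypothetical:

with connect_to_s3() as s3:
    bucket = s3.Bucket('kp-scrapers-data')  # hypothetical bucket
    for obj in bucket.objects.limit(5):
        print(obj.key)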
Example #8
    def _fork_tabula(self, body, **kwargs):
        """Wrapping of interactions with file system and tabula in a function.

        The key of each entry in the kwargs is the option for tabula and the value
        should be a list of strings for the values.

        If the kwargs is {'-p': ['all']}, the following will be executed:
            java ... -p all
        To set a command-line flag with no argument, set the value as empty list:
            {'-l': [], ...}

        Args:
            body (str): filestream
            **kwargs: The options for tabula.

        Returns:
            list[list[str]]: Result in list of lists from tabula.

        """
        # obtain tabula path
        validate_settings('TABULA_JAR_PATH')
        jar_file_path = Settings()['TABULA_JAR_PATH']
        if not os.path.isfile(jar_file_path):
            raise IOError(
                'Tabula.jar not found at `{}`, check `local_settings.py`'.format(
                    jar_file_path))
        self.logger.debug('Using tabula.jar at `{}`'.format(jar_file_path))
        filename = self.generate_filename(body)
        self.save_file(filename + '.pdf', body)
        tabula_subprocess = Popen(
            tabula_command(self.data_path, filename, jar_file_path, **kwargs),
            stdout=PIPE,
            stderr=PIPE,
        )
        signal.alarm(300)
        output, error = tabula_subprocess.communicate()

        if error:
            # font-related warnings from the java process are safe to ignore;
            # end-of-stream warnings may also be ignored, but verify output integrity
            self.logger.warning(
                'Tabula.jar has encountered warnings/exceptions while processing, '
                'please verify output integrity\n{}'.format(
                    error.decode('utf-8')))
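
The kwargs-to-CLI mapping described in the docstring can be illustrated with a
small standalone helper, an assumed equivalent of what `tabula_command` does
internally rather than the project's actual code:

def flatten_tabula_options(**kwargs):
    """Flatten tabula options into argv-style tokens."""
    args = []
    for option, values in kwargs.items():
        args.append(option)   # e.g. '-p'
        args.extend(values)   # e.g. ['all'], or [] for bare flags
    return args

flatten_tabula_options(**{'-p': ['all'], '-l': []})
# -> ['-p', 'all', '-l']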
Example #9
def initialize_api():
    validate_settings('DATADOG_APP_KEY', 'DATADOG_API_KEY')

    settings = Settings()
    initialize(api_key=settings['DATADOG_API_KEY'],
               app_key=settings['DATADOG_APP_KEY'])
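
Usage sketch: once `initialize_api` has run, the datadog client's module-level
API is ready, e.g. for submitting a custom metric (the metric name is
hypothetical):

from datadog import api

initialize_api()
api.Metric.send(metric='kp_scrapers.spider.items_scraped', points=42)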
Example #10
    - suggestions welcome

"""

from __future__ import absolute_import, unicode_literals

import click

from kp_scrapers.lib.services.gdrive import build_query, GDriveService
from kp_scrapers.lib.services.shub import global_settings as Settings, validate_settings


gdrive = GDriveService()

validate_settings('GOOGLE_DRIVE_BASE_FOLDER_ID')
BASE_FOLDER_ID = Settings()['GOOGLE_DRIVE_BASE_FOLDER_ID']

DEFAULT_FILE_FIELDS = 'parents, mimeType, id, name'  # noqa
PROCESS_TAG = 'processed'  # noqa
CPP_TEST = 'Reports/Ship Agents/Generic'  # noqa, to be removed once script is fully tested


@click.group()
def cli():
    pass
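
A hypothetical subcommand showing how commands attach to the `cli` group
above; the command name, options, and argument wiring are illustrative, not
the script's real interface:

@cli.command('tag')
@click.argument('path')
@click.option('--tags', default=PROCESS_TAG)
def tag_files(path, tags):
    # wire the CLI into set_files() below; remaining args are sketch defaults
    set_files(path, name=None, tags=tags, mime_types=None, do_add_tag=True)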


def set_files(path, name, tags, mime_types, do_add_tag):
    """Sets the tags of the specified file to the provided value

    Args:
Example #11
import logging

from dateutil.parser import parse as parse_date
from dateutil.relativedelta import relativedelta
import requests

from kp_scrapers.lib.services.shub import global_settings as Settings
from kp_scrapers.lib.utils import map_row_to_dict

logger = logging.getLogger(__name__)

# all 3 platforms share the same credentials
KP_API_BASE = Settings()['KP_API_BASE']
KP_API_CREDENTIALS = {
    'email': Settings()['KP_API_EMAIL'],
    'password': Settings()['KP_API_PASSWORD'],
}

# date format to use when POSTing requests
KP_API_DATE_PARAM_FORMAT = '%Y-%m-%d'

# store session state so we don't have to re-authenticate for each call
_SESSION = None
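
A sketch of how the `_SESSION` cache above might be used to authenticate
lazily; the login endpoint is assumed, not taken from the real API:

def _get_session():
    global _SESSION
    if _SESSION is None:
        _SESSION = requests.Session()
        # hypothetical login endpoint; credentials are shared across platforms
        _SESSION.post(f'{KP_API_BASE}/login', json=KP_API_CREDENTIALS)
    return _SESSION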


class KplerApiService(object):
    """Interface with Kpler API.

    Access data from Kpler API to supplement data sources lacking certain fields to compute
    attributes like lay_can_start and lay_can_end.