Example #1
    def _get_default_args_list(self):
        """ Return the base list of args for sherlock that remains unchanged
        across multiple invocations of this worker's process_msg.
        """
        args = [
            # First arg, does not matter
            "ingest_multiple_dates.py",

            # We do not want to record status in redshift
            "--skip-progress-in-redshift",

            # run parallelism in the worker instead
            "--serial-stepper",
        ]

        cur_base_dir = os.getcwd()
        if self._should_run_local:
            private_file = os.path.join(
                cur_base_dir,
                staticconf.read_string("run_local.private", "private.yaml")
            )
            args.extend(["-r"])  # For run-local in sherlock
        else:
            private_file = staticconf.read_string("run_service.private")

        if self._config_override_loc is not None:
            args.extend(["--config-override", self._config_override_loc])
        args.extend(["--private", private_file])
        args.extend(["--config", self._config_loc])

        return args
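
For orientation, a sketch of the list this helper returns when _should_run_local is False and a config override is set; the file names below are illustrative, not taken from the project's config:

# Hypothetical result of _get_default_args_list() (run_service case,
# config override present); paths are made up for illustration.
default_args = [
    "ingest_multiple_dates.py",
    "--skip-progress-in-redshift",
    "--serial-stepper",
    "--config-override", "config-env-dev.yaml",
    "--private", "private.yaml",
    "--config", "config.yaml",
]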
Example #2
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- True to use the run_local mrjob argument template

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(input_file, output_file, cores, extractions,
                           delimiter)
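
The mrjob_arg_template values read above are plain format strings with five positional fields; a minimal sketch with a made-up template (the project's real run_local / run_service templates are not shown in this listing):

# Hypothetical template; the real value lives in the project's YAML config.
template = "--input {0} --output {1} --num-cores {2} --extractions {3} --delimiter {4}"
print(template.format(
    "s3://search-bucket/2016/05/01/part-*.gz",  # input_file
    "s3://out-bucket/testuser/2016/05/01",      # output_file
    10,                                         # cores (capped at MAX_CORES)
    "schema/pipeline.yaml",                     # extractions
    "|",                                        # delimiter
))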
Example #3
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname),
            input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file,
                                              expected_out_file,
                                              cores,
                                              extractions,
                                              delimiter)
        output_under_test = create_emr_args(input_date, 10,
                                            input_prefix, dev)
        assert output_under_test == formatted_args
Example #4
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- True to use the run_local mrjob argument template

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(
        input_file, output_file, cores, extractions, delimiter
    )
Example #5
def test_setup_config_cluster(cluster, pool, scheduler, tag,
                              mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        cluster=cluster,
        pool=pool,
        scheduler=scheduler,
        signals_branch_or_tag=tag,
    )
    with mock.patch(
            'clusterman.config.load_cluster_pool_config',
            autospec=True,
    ) as mock_pool_load, mock.patch('clusterman.config._load_module_configs',
                                    ) as mock_load_module_configs:

        config.setup_config(args)

        assert mock_load_module_configs.call_args == mock.call(
            '/nail/etc/config.yaml')
        assert staticconf.read_string('aws.region') == 'us-test-3'
        if pool:
            assert mock_pool_load.call_args == mock.call(
                cluster, pool, scheduler, tag)
        else:
            assert mock_pool_load.call_count == 0
            if tag:
                assert staticconf.read_string(
                    'autoscale_signal.branch_or_tag') == tag
Example #6
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        723360691457945600, 723375789467553793, 723390890664824834,
        723405988221489154, 723421087703261186, 723436186644025344,
        723451541563138052, 723466386304057344, 723481486737985536,
        723497089410457600, 723511939465392128, 723528048931430400,
        723541884208091137, 723556981991202816, 723572081485615104,
        723587184276721665, 723602282374414338, 723617381017374720,
        723632480964759553, 723647581516124160, 723662932664524800,
        723678284538589184, 723693384272121857, 723709493939453952,
        723723076614164480
    ]

    for startid, endid in zip(big_ben_ids, big_ben_ids[1:]):
        for tweet in limit_handled(Cursor(api.search,
                q=' OR '.join('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'),
                since_id=str(startid),
                max_id=str(endid),
                lang="en").items(2500)):
            print(json.dumps(tweet._json))
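
Note that ' OR '.join(...) is applied to a plain string above, so it iterates over the individual characters and builds a query that ORs together single letters and digits:

# str iterates character by character, so join produces for example:
print(' OR '.join('abc123'))  # -> 'a OR b OR c OR 1 OR 2 OR 3'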
Example #7
    def _get_default_args_list(self):
        """ Return the base list of args for sherlock that remains unchanged
        across multiple invocations of this worker's process_msg.
        """
        args = [
            # First arg, does not matter
            "ingest_multiple_dates.py",

            # We do not want to record status in redshift
            "--skip-progress-in-redshift",

            # run parallelism in the worker instead
            "--serial-stepper",
        ]

        cur_base_dir = os.getcwd()
        if self._should_run_local:
            private_file = os.path.join(
                cur_base_dir,
                staticconf.read_string("run_local.private", "private.yaml"))
            args.extend(["-r"])  # For run-local in sherlock
        else:
            private_file = staticconf.read_string("run_service.private")

        if self._config_override_loc is not None:
            args.extend(["--config-override", self._config_override_loc])
        args.extend(["--private", private_file])
        args.extend(["--config", self._config_loc])

        return args
Example #8
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        723360691457945600, 723375789467553793, 723390890664824834,
        723405988221489154, 723421087703261186, 723436186644025344,
        723451541563138052, 723466386304057344, 723481486737985536,
        723497089410457600, 723511939465392128, 723528048931430400,
        723541884208091137, 723556981991202816, 723572081485615104,
        723587184276721665, 723602282374414338, 723617381017374720,
        723632480964759553, 723647581516124160, 723662932664524800,
        723678284538589184, 723693384272121857, 723709493939453952,
        723723076614164480
    ]

    for startid, endid in zip(big_ben_ids, big_ben_ids[1:]):
        for tweet in limit_handled(
                Cursor(
                    api.search,
                    q=' OR '.join(
                        'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                    ),
                    since_id=str(startid),
                    max_id=str(endid),
                    lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #9
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        727256357607464960, 727271714187522048, 727287317912817664,
        727302414039158785, 727317768509480960, 727332108876705794,
        727347714380419072, 727362055750176768, 727377660742123520,
        727393264060534784, 727407354162122753, 727422705876762624,
        727437555210293248, 727452651210809344, 727468761842876416,
        727483610119413760, 727498961741856768, 727513051440762881,
        727528910452305921, 727543248458149888, 727559107612422144,
        727574712830857221, 727588550133288961, 727603646221914113,
        727619000348348416
    ]

    for startid, endid in zip(big_ben_ids[13:], big_ben_ids[14:]):
        for tweet in limit_handled(
                Cursor(
                    api.search,
                    q=' OR '.join(
                        'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                    ),
                    since_id=str(startid),
                    max_id=str(endid),
                    lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #10
def init():
        # Read the config values in advance and cache them to avoid repeated processing
        global S3_BUCKET
        global S3_LOG_PREFIX
        global PATH_RE
        S3_BUCKET = staticconf.read_string('s3_bucket')
        S3_LOG_PREFIX = staticconf.read_string('s3_log_prefix')
        PATH_RE = re.compile(PATH_RE_PREFIX.format(S3_LOG_PREFIX))
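
A self-contained sketch of how such a cached prefix regex behaves; PATH_RE_PREFIX and the key layout below are hypothetical, not the project's actual values:

import re

# Hypothetical prefix pattern and S3 log prefix, for illustration only.
PATH_RE_PREFIX = r'^{0}/(?P<log_name>[^/]+)/(?P<log_version>[^/]+)/'
S3_LOG_PREFIX = 'logs'
PATH_RE = re.compile(PATH_RE_PREFIX.format(S3_LOG_PREFIX))

match = PATH_RE.match('logs/x/y/2016/05/01/part-00000.gz')
assert match is not None and match.group('log_name') == 'x'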
Example #11
 def __init__(self, cluster: str, pool: str) -> None:
     super().__init__(cluster, pool)
     kubernetes.config.load_kube_config(
         staticconf.read_string(f'clusters.{cluster}.kubeconfig_path'))
     self._core_api = kubernetes.client.CoreV1Api()
     self._safe_to_evict_annotation = staticconf.read_string(
         f'clusters.{cluster}.pod_safe_to_evict_annotation',
         default='cluster-autoscaler.kubernetes.io/safe-to-evict',
     )
Example #12
 def __init__(self, cluster: str, pool: Optional[str]) -> None:
     super().__init__(cluster, pool)
     self.kubeconfig_path = staticconf.read_string(
         f'clusters.{cluster}.kubeconfig_path')
     self._safe_to_evict_annotation = staticconf.read_string(
         f'clusters.{cluster}.pod_safe_to_evict_annotation',
         default='cluster-autoscaler.kubernetes.io/safe-to-evict',
     )
     self._nodes_by_ip = {}
Example #13
def et_scanner_main(args):
    """ Create an instance of ETScanner and run it once.
    """
    setup_config(args, 'ETScanner')
    sqs_scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    sqs_worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))
    scanner = ETScanner(TableConnection.get_connection('ScheduledJobs'),
                        sqs_scanner_queue, sqs_worker_queue, Mailer(args.run_local))
    scanner.run()
Example #14
def _init_session():
    global _session

    if not _session:
        _session = boto3.session.Session(
            staticconf.read_string('accessKeyId', namespace=CREDENTIALS_NAMESPACE),
            staticconf.read_string('secretAccessKey', namespace=CREDENTIALS_NAMESPACE),
            region_name=staticconf.read_string('aws.region')
        )
Example #15
def et_scanner_main(args):
    """ Create an instance of ETScanner and run it once.
    """
    setup_config(args, 'ETScanner')
    sqs_scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    sqs_worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))
    scanner = ETScanner(TableConnection.get_connection('ScheduledJobs'),
                        sqs_scanner_queue, sqs_worker_queue,
                        Mailer(args.run_local))
    scanner.run()
Example #16
 def __init__(self, cluster_name: str) -> None:
     self.client = sqs
     self.cluster = cluster_name
     self.drain_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.drain_queue_url')
     self.termination_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.termination_queue_url')
     self.draining_host_ttl_cache: Dict[str, arrow.Arrow] = {}
     self.warning_queue_url = staticconf.read_string(
         f'clusters.{cluster_name}.warning_queue_url',
         default=None,
     )
Example #17
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in the config file.
    Do not call this method unless config.yaml has been loaded.
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'))
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4))
    return simplejson.load(in_stream)
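
The keys read above come from config.yaml; a sketch using staticconf.DictConfiguration with illustrative values (the real endpoint and profile name are deployment-specific):

import staticconf

# Illustrative values only; in real use, load the project's config.yaml instead.
staticconf.DictConfiguration({
    'instance_profile_creds_url':
        'http://169.254.169.254/latest/meta-data/iam/security-credentials',
    'instance_profile_name': 'example-profile',
    'instance_profile_creds_timeout_in_seconds': 4,
})
# fetch_creds() would then block for up to 4 seconds while querying that URL.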
Example #18
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()

    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema'
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
Example #19
def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)

    print_status(listener)

    try:
        while True:
            try:
                stream.sample(languages=['en'])  # blocking!
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except (socket.error, httplib.HTTPException):
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)
Example #20
def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)

    print_status(listener)

    try:
        while True:
            try:
                # stream.sample()  # blocking!
                stream.filter(track=["#airpodsmax"])
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except (socket.error):
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        # listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)
Example #21
 def test_setup_config_with_env_vars(self):
     args = parse_cmd_args(['program', '--config=./config.yaml',
                            '--config-override=config-env-dev.yaml', '-r'])
     with staticconf.testing.MockConfiguration(MOCK_CONFIG):
         setup_config(args, 'test_worker')
         # pick some key and ensure it was loaded from config
         assert read_string('log_stream_name', 'default') != 'default'
Example #22
def process_queues(cluster_name: str) -> None:
    draining_client = DrainingClient(cluster_name)
    mesos_master_url = staticconf.read_string(
        f'clusters.{cluster_name}.mesos_master_fqdn')
    mesos_secret_path = staticconf.read_string(
        f'mesos.mesos_agent_secret_path', default=None)
    operator_client = operator_api(mesos_master_url, mesos_secret_path)
    logger.info('Polling SQS for messages every 5s')
    while True:
        draining_client.clean_processing_hosts_cache()
        draining_client.process_warning_queue()
        draining_client.process_drain_queue(
            mesos_operator_client=operator_client, )
        draining_client.process_termination_queue(
            mesos_operator_client=operator_client, )
        time.sleep(5)
Example #23
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 run_local,
                 emailer,
                 dummy_run=False):
        super(ImdWorker, self).__init__(
            config_loc,
            config_override_loc,
            emailer,
            num_processes=3,
        )
        for key in self.KEYS_TO_LOAD:
            self.__setattr__(key, staticconf.read_string(key))
        if dummy_run:
            log("Dummy worker! Skip the real etl process. Just for test.")
            import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
        else:
            import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
        self._should_run_local = run_local
        self.dummy_run = dummy_run
        self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
        self.queue_name = staticconf.get_string("sqs.et_queue_name")
        self.scanner_queue_name = staticconf.get_string(
            "sqs.et_scanner_queue_name")

        log("ImdWorker initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #24
    def run(self):
        while self.running:
            time.sleep(splay_event_time(
                self.run_interval,
                self.get_name() + staticconf.read_string('aws.region'),
            ))

            now = arrow.utcnow()
            with self.metrics_client.get_writer(METADATA) as writer:
                try:
                    with suppress_request_limit_exceeded():
                        self.write_prices(now, writer)
                except socket.timeout:
                    # We don't really care if we miss a few spot price changes so just continue here
                    logger.warn(f'Timed out getting spot prices:\n\n{format_exc()}')
                    continue

            # Report successful run to Sensu.
            sensu_args = dict(
                check_name='check_clusterman_spot_prices_running',
                output='OK: clusterman spot_prices was successful',
                check_every='1m',
                source=self.options.aws_region,
                ttl='10m',
                noop=self.options.disable_sensu,
            )
            sensu_checkin(**sensu_args)
Example #25
def fetch_creds():
    '''
    Return a dictionary holding temporary credentials from the metadata server.
    This function will block up to the timeout specified in the config file.
    Do not call this method unless config.yaml has been loaded.
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'))
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4
        )
    )
    return simplejson.load(in_stream)
Example #26
def test__get_key_name():
    log_name = 'x'
    log_version = 'y'
    return_value = _get_key_name(log_name, log_version)
    s3_log_prefix = staticconf.read_string('s3_log_prefix')
    path_re = re.compile(PATH_RE_PREFIX.format(s3_log_prefix))
    assert path_re.match(return_value) is not None
Example #27
    def configure_initial(self) -> None:
        setup_config(self.options)

        # Since we want to collect metrics for all the pools, we need to call setup_config
        # first to load the cluster config path, and then read all the entries in that directory
        self.pools: MutableMapping[str, List[str]] = {}
        for scheduler in {'mesos', 'kubernetes'}:
            self.pools[scheduler] = get_pool_name_list(self.options.cluster,
                                                       scheduler)
        for scheduler, pools in self.pools.items():
            for pool in pools:
                self.config.watchers.append({
                    f'{pool}.{scheduler}':
                    get_pool_config_path(self.options.cluster, pool,
                                         scheduler),
                })
                load_cluster_pool_config(self.options.cluster, pool, scheduler,
                                         None)

        self.region = staticconf.read_string('aws.region')
        self.run_interval = staticconf.read_int(
            'batches.cluster_metrics.run_interval_seconds')
        self.logger = logger

        self.metrics_client = ClustermanMetricsBotoClient(
            region_name=self.region)
Example #28
def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(staticconf.read_string("log_stream_name"),
                                    run_local, tag)
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors
Example #29
def test__get_key_name():
    log_name = 'x'
    log_version = 'y'
    return_value = _get_key_name(log_name, log_version)
    s3_log_prefix = staticconf.read_string('s3_log_prefix')
    path_re = re.compile(PATH_RE_PREFIX.format(s3_log_prefix))
    assert path_re.match(return_value) is not None
Example #30
 def parse_config(self, config_file_path):
     super(S3Feeder, self).parse_config(config_file_path)
     self.s3_event_notifications_queue_name = staticconf.read(
         's3_event_notifications_queue_name')
     self.number_messages = staticconf.read('number_messages', default=1)
     self.aws_region = staticconf.read('aws_region', default=None)
     self.owner_account_id = staticconf.read_string('owner_account_id')
     self.role_arn = staticconf.read('role_arn', default=None)
Example #31
def get_scanner_queue(etl_type):
    """
    Return the scanner SQS queue that jobs post a message to in order
    to wake up the scanner
    :param etl_type: et or load
    :type etl_type: string in ['et', 'load']
    """
    return SQSWrapper(read_string("sqs.{0}_scanner_queue_name".format(etl_type)))
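
A brief usage note: etl_type only selects which queue-name key is read (SQSWrapper comes from the surrounding project):

# 'et'   -> reads sqs.et_scanner_queue_name
# 'load' -> reads sqs.load_scanner_queue_name
scanner_queue = get_scanner_queue('et')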
Example #32
def get_sqs_connection():
    '''
    :returns: sqs connection
    '''
    return boto.sqs.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )
Example #33
def dynamodb_table_names():
    '''
    :returns: iterable of strings, each a DynamoDB table name used in mycroft
    '''
    table_names = []
    # append other table resources required by mycroft
    table_names.append(staticconf.read_string('aws_config.scheduled_jobs_table'))
    return table_names
Example #34
def get_dynamodb_connection():
    '''
    :returns: dynamodb2 connection
    '''
    return boto.dynamodb2.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )
Example #35
def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        727256357607464960, 727271714187522048, 727287317912817664,
        727302414039158785, 727317768509480960, 727332108876705794,
        727347714380419072, 727362055750176768, 727377660742123520,
        727393264060534784, 727407354162122753, 727422705876762624,
        727437555210293248, 727452651210809344, 727468761842876416,
        727483610119413760, 727498961741856768, 727513051440762881,
        727528910452305921, 727543248458149888, 727559107612422144,
        727574712830857221, 727588550133288961, 727603646221914113,
        727619000348348416
    ]

    for startid, endid in zip(big_ben_ids[13:], big_ben_ids[14:]):
        for tweet in limit_handled(Cursor(api.search,
                q=' OR '.join('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'),
                since_id=str(startid),
                max_id=str(endid),
                lang="en").items(2500)):
            print(json.dumps(tweet._json))
Example #36
def setup_config(args: argparse.Namespace) -> None:
    # load_default_config merges the 'module_config' key from the first file
    # and the 'module_env_config' key from the second file to configure packages.
    # This allows us to configure packages differently in different hiera envs by
    # changing 'module_env_config'. We use the same file for both keys.
    _load_module_configs(args.env_config_path)

    signals_branch_or_tag = getattr(args, 'signals_branch_or_tag', None)
    cluster_config_directory = getattr(args, 'cluster_config_directory',
                                       None) or DEFAULT_CLUSTER_DIRECTORY
    staticconf.DictConfiguration(
        {'cluster_config_directory': cluster_config_directory})

    aws_region = getattr(args, 'aws_region', None)
    cluster = getattr(args, 'cluster', None)
    pool = getattr(args, 'pool', None)
    scheduler = getattr(args, 'scheduler', None)
    if aws_region and cluster:
        raise argparse.ArgumentError(
            None, 'Cannot specify both cluster and aws_region')

    # If there is a cluster specified via --cluster, load cluster-specific attributes
    # into staticconf.  These values are not specified using hiera in srv-configs because
    # we might want to be operating on a cluster in one region while running from a
    # different region.
    elif cluster:
        aws_region = staticconf.read_string(f'clusters.{cluster}.aws_region',
                                            default=None)
        if pool:
            load_cluster_pool_config(cluster, pool, scheduler,
                                     signals_branch_or_tag)

    staticconf.DictConfiguration({'aws': {'region': aws_region}})

    boto_creds_file = staticconf.read_string('aws.access_key_file',
                                             default=None)
    if boto_creds_file:
        staticconf.JSONConfiguration(boto_creds_file,
                                     namespace=CREDENTIALS_NAMESPACE)

    if signals_branch_or_tag:
        staticconf.DictConfiguration(
            {'autoscale_signal': {
                'branch_or_tag': signals_branch_or_tag
            }})
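
For how this function is driven, compare the tests in Examples #5 and #41; a minimal sketch along those lines (values mirror the test fixtures and are illustrative only):

import argparse
import staticconf

# Mirrors test_setup_config_region; assumes the referenced config files exist.
args = argparse.Namespace(
    env_config_path='/nail/etc/config.yaml',
    aws_region='fake-region-A',
)
setup_config(args)
print(staticconf.read_string('aws.region'))  # -> 'fake-region-A'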
Example #37
def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(
            staticconf.read_string("log_stream_name"),
            run_local, tag
        )
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors
Example #38
 def __init__(self, cluster: str, pool: str) -> None:
     super().__init__(cluster, pool)
     mesos_master_fqdn = staticconf.read_string(f'clusters.{self.cluster}.mesos_master_fqdn')
     self.non_batch_framework_prefixes = self.pool_config.read_list(
         'non_batch_framework_prefixes',
         default=['marathon'],
     )
     self.api_endpoint = f'http://{mesos_master_fqdn}:5050/'
     logger.info(f'Connecting to Mesos masters at {self.api_endpoint}')
Example #39
def rs_cluster_restore(rs_mgmt, args):
    """ restore cluster from snapshot
    Output can be appended to a YAML config file
    """

    if not args.subnet_group_name:
        args.subnet_group_name = read_string('redshift_cluster_subnet_group_name')
    if not args.vpc_security_group:
        args.vpc_security_group = read_string('security_group_id')
    rs_mgmt.restore_from_cluster_snapshot(
        args.cluster_name,
        args.snapshot,
        args.parameter_group,
        args.vpc_security_group,
        args.subnet_group_name,
    )
    cluster_info = rs_mgmt.get_cluster_info(args.cluster_name)
    return cluster_info['Endpoint']['Address'], cluster_info['Endpoint']['Port']
Example #40
def test_load_cluster_pool_config(cluster, pool, pool_other_config,
                                  mock_config_files):
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)

    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config',
                               namespace=pool_namespace) == pool_other_config
    assert staticconf.read_string(f'resource_groups',
                                  namespace=pool_namespace) == cluster
Example #41
def test_setup_config_region(mock_load_module_configs, mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        aws_region='fake-region-A',
    )
    config.setup_config(args)
    assert staticconf.read_string('aws.region') == 'fake-region-A'
    assert mock_load_module_configs.call_args == mock.call(
        '/nail/etc/config.yaml')
Example #42
def ensure_account_id(cluster) -> None:
    current_account_id = sts.get_caller_identity()['Account']
    cluster_account_id = staticconf.read_string(
        f'clusters.{cluster}.aws_account_number')

    if (current_account_id != cluster_account_id):
        raise AccountNumberMistmatchError(
            f'ACCOUNT ID MISMATCH! Current account id: {current_account_id}. Cluster account id: {cluster_account_id}'
        )
Example #43
def setup_private(input_args):
    """
    setup_private exports the aws credentials required to run on the server
    into the appropriate environment variables.

    Args:
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns:
    None; the credentials are exported via os.environ.
    """

    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')
Example #44
def get_scanner_queue(etl_type):
    """
    Return the scanner SQS queue that jobs post a message to in order
    to wake up the scanner
    :param etl_type: et or load
    :type etl_type: string in ['et', 'load']
    """
    return SQSWrapper(
        read_string("sqs.{0}_scanner_queue_name".format(etl_type)))
Example #45
def dynamodb_table_names():
    '''
    :returns: iterable of strings, each a DynamoDB table name used in mycroft
    '''
    table_names = []
    # append other table resources required by mycroft
    table_names.append(
        staticconf.read_string('aws_config.scheduled_jobs_table'))
    return table_names
Example #46
def setup_private(input_args):
    """
    setup_private exports the aws credentials required to run on the server
    into the appropriate environment variables.

    Args:
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns:
    None; the credentials are exported via os.environ.
    """

    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')
Example #47
def s3_to_psv_main(args):

    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')

    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        mrjob,
        input_date=args.date
    )

    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)

    try:
        if not args.run_local:
            setup_private(args.private)
        # Create a psql instance based on args
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(
                LOG_STREAM, run_local=args.run_local
            )
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(
                    LOG_STREAM, args.private, run_local=args.run_local
                )
            )
        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et
        )
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)

    finally:
        clear_env(args.run_local)
Example #48
def pipeline_yaml_schema_file_path():
    """Return the full path of the yaml schema file for the pipeline. Do
    nothing if the path is already an S3 path
    """
    yaml_schema_file_path = read_string('pipeline.yaml_schema_file')
    if is_s3_path(yaml_schema_file_path):
        return yaml_schema_file_path
    return '{directory}/{filename}'.format(
        directory=os.environ['YELPCODE'],
        filename=yaml_schema_file_path,
    )
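
Because the non-S3 branch joins os.environ['YELPCODE'] with the configured file name, tests in this listing (Example #3) patch that variable; a rough sketch, assuming pipeline.yaml_schema_file has already been loaded into staticconf:

import os
import mock

# With YELPCODE patched, the helper returns '<YELPCODE>/<yaml_schema_file>'.
with mock.patch.dict(os.environ, {'YELPCODE': '.'}):
    print(pipeline_yaml_schema_file_path())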
Example #49
def fetch_creds_from_file():
    '''
    Returns a dictionary holding credentials from a file defined in config.yaml
    '''
    with open(staticconf.read_string('run_local.session_file'), 'r') as creds:
        if os.fstat(creds.fileno()).st_size == 0:
            raise Exception("session file is empty")
        creds_dict = simplejson.load(creds)
        creds_dict['Expiration'] = creds_dict.get('Expiration', MAX_UNIX_TIME)
        for optional_key in ['Token', 'LastUpdated']:
            creds_dict[optional_key] = creds_dict.get(optional_key)
        return creds_dict
Example #50
    def mail_result(self, final_status, msg, additional_info=None):
        link = self.link_temp.format(msg['uuid'])
        content = self.template.format(
            msg['uuid'], final_status, msg['log_name'], msg['log_schema_version'],
            msg['s3_path'], msg['redshift_id'], msg['start_date'], msg['end_date'], link,
            additional_info
        )

        new_msg = MIMEText(content)
        new_msg['Subject'] = self.subject.format(msg['uuid'])
        new_msg['From'] = self.address
        new_msg['To'] = ','.join(msg['contact_emails'])

        smtp_host = staticconf.read_string('smtp_host', 'localhost')
        smtp_port = staticconf.read_string('smtp_port', None)
        smtp_login = staticconf.read_string('smtp_login', None)
        smtp_password = staticconf.read_string('smtp_password', None)
        smtp_security = staticconf.read_string('smtp_security', None)

        if smtp_port is not None:
            smtp_host = "{0}:{1}".format(smtp_host, smtp_port)

        if smtp_security is not None:
            smtp_security = smtp_security.upper()

        if smtp_security == 'SSL':
            s = smtplib.SMTP_SSL(smtp_host)
            s.login(smtp_login, smtp_password)
        elif smtp_security == 'TLS':
            s = smtplib.SMTP(smtp_host)
            s.ehlo()
            s.starttls()
            s.login(smtp_login, smtp_password)
        else:
            s = smtplib.SMTP(smtp_host)

        s.sendmail(self.address, msg['contact_emails'], new_msg.as_string())
        s.quit()
Example #51
def search_log_source_by_keyword(request_body):
    disabled_logfinder = staticconf.read_bool('disable_logfinder_service')
    if disabled_logfinder:
        return {'logs': []}

    # send HTTP request
    search_endpoint = staticconf.read_string('log_finder_search_end_point')
    response = requests.post(search_endpoint, request_body)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()

    content = response.json()
    return content
Example #52
    def __init__(self, logstrm, psql_auth_file, run_local=False):

        self.run_local = run_local
        self.host = staticconf.read_string('redshift_host')
        self.port = staticconf.read_int('redshift_port')
        private_dict = YamlConfiguration(psql_auth_file)
        self.user = private_dict['redshift_user']
        self.password = private_dict['redshift_password']
        self.log_stream = logstrm
        self._aws_key = ''
        self._aws_secret = ''
        self._aws_token = ''
        self._aws_token_expiry = datetime.utcnow()
        self._whitelist = ['select', 'create', 'insert', 'update']
        self._set_aws_auth()
        psycopg2.extensions.set_wait_callback(wait_select_inter)
Example #53
def get_log_meta_data(bucket_name, log_name):
    if bucket_name is None or log_name is None:
        return None

    if staticconf.read_bool('disable_logfinder_service'):
        return None

    # send HTTP request
    endpoint = staticconf.read_string('log_finder_buckets_end_point') \
        + '/' + bucket_name + '/' + log_name
    response = requests.get(endpoint)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()

    return response.json()
Example #54
def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    # about to load new day, remove oldest
    rows_deleted = None
    if ttl_days is not None:
        rows_deleted = \
            delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
    if rows_deleted:
        logstream.write_msg('delete_ok',
                            extra_msg="{0} rows".format(rows_deleted))

    # Try to reclaim disk space.  If not needed, it will be fast.
    # Calling here and not in the 'if rows_deleted' code to prevent
    # scenario where rows were deleted but compact failed. Then on retry
    # there will be nothing to delete but since space is not reclaimed
    # there may not be enough for a new load, resulting in failure forever.
    if ttl_days is not None:
        compact_table(psql_helper, db_name, namespaced_table_name)

    delimiter = read_string('redshift_column_delimiter')
    delimiter = delimiter.decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))

    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql,
        db_name, " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs')
    )
    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start,
                            extra_msg=extra_msg)
    return result
Example #55
 def get_connection(cls, table_object_name):
     if table_object_name not in cls._connection_dict:
         if cls._region_conn is None:
             cls._region_conn = get_dynamodb_connection()
         table_properties = cls._TABLE_NAME_TO_PROPERTIES[table_object_name]
         avro_schema = get_avro_schema(table_properties['avro_schema'])
         table_name = read_string(table_properties['physical_id_key'])
         table = Table(
             table_name,
             connection=cls._region_conn
         )
         try:
             results = table.describe()
             raw_indexes = results['Table'].get('GlobalSecondaryIndexes', [])
             table.global_indexes = introspect_global_indexes(raw_indexes)
         except Exception:
             log_exception("Table Connection Failed")
         cls._connection_dict[table_object_name] = table_properties['class'](
             table,
             avro_schema
         )
     return cls._connection_dict[table_object_name]
Example #56
    def __init__(self, config_loc, config_override_loc, run_local, emailer, dummy_run=False):
        super(ImdWorker, self).__init__(
            config_loc,
            config_override_loc,
            emailer,
            num_processes=3,
        )
        for key in self.KEYS_TO_LOAD:
            self.__setattr__(key, staticconf.read_string(key))
        if dummy_run:
            log("Dummy worker! Skip the real etl process. Just for test.")
            import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
        else:
            import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
        self._should_run_local = run_local
        self.dummy_run = dummy_run
        self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
        self.queue_name = staticconf.get_string("sqs.et_queue_name")
        self.scanner_queue_name = staticconf.get_string("sqs.et_scanner_queue_name")

        log("ImdWorker initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #57
def get_redshift_schema():
    # note: we lowercase for backward compatibility
    return read_string('redshift_schema', DEFAULT_NAMESPACE).lower()
Example #58
def s3_to_redshift_main(args):

    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        's3_to_redshift',
        job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(), error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)
Example #59
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)