コード例 #1
0
def main():
    """Load task extensions, apply the override configuration and run Luigi.

    The override configuration file is added to Luigi's configuration only
    when it exists on disk. The listed modules are attached via
    ``luigi.hadoop.attach`` before ``luigi.run()`` is invoked inside the
    ``profile_if_necessary`` context, which is driven by the
    ``WORKFLOW_PROFILER`` and ``WORKFLOW_PROFILER_PATH`` environment variables.
    """
    # DEBUG-level logging is enabled so that errors during extension loading are visible.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    luigi_config = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file %s does not exist', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Using %s', OVERRIDE_CONFIGURATION_FILE)
        luigi_config.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - boto is used for all direct interactions with s3.
    # - cjson is used for all parsing event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    #   - dependencies of opaque_keys:  bson, stevedore
    hadoop_modules = (boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601)
    luigi.hadoop.attach(*hadoop_modules)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    profiler_name = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler_name, profiler_path):
        luigi.run()
コード例 #2
0
def main():
    """Load task extensions, apply the override configuration and run Luigi.

    When the override configuration file exists, its full contents are logged
    at DEBUG level and the file is added to Luigi's configuration. The listed
    modules are then attached via ``luigi.hadoop.attach`` and ``luigi.run()``
    is invoked.
    """
    # In order to see errors during extension loading, you can uncomment the next line.
    # logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    luigi_config = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('override.cfg does not exist')
    else:
        log.debug('Using override.cfg')
        with open(OVERRIDE_CONFIGURATION_FILE, 'r') as override_file:
            override_contents = override_file.read()
            log.debug(override_contents)
        luigi_config.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - boto is used for all direct interactions with s3.
    # - cjson is used for all parsing event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    #   - dependencies of opaque_keys:  bson, stevedore
    hadoop_modules = (boto, cjson, filechunkio, opaque_keys, bson, stevedore)
    luigi.hadoop.attach(*hadoop_modules)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    luigi.run()
コード例 #3
0
def main():
    """Parse launcher arguments, layer configuration files and run Luigi.

    ``--additional-config`` may be given multiple times. The override
    configuration file is added first (when it exists), followed by each
    existing additional file in the order supplied; missing files are only
    reported at DEBUG level. The listed modules are attached via
    ``luigi.hadoop.attach``; ``ccx_keys`` is imported and attached only when
    the ``[ccx] enabled`` option is true. Luigi is finally run with the
    cleaned command-line arguments inside the ``profile_if_necessary`` context.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--additional-config',
        action='append',
        default=None,
        help='additional configuration file to be loaded after default/override',
    )
    arguments = parser.parse_known_args()[0]

    # We get a cleaned command-line arguments list, free of the arguments *we* care about, since Luigi will throw
    # errors when it sees arguments that it or the workflow didn't specify.  We pass these in when invoking Luigi.
    cmdline_args = get_cleaned_command_line_args()

    # DEBUG-level logging is enabled so that errors during extension loading are visible.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Load the override configuration if it's specified/exists.
    configuration = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file \'%s\' does not exist!', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Loading override configuration \'%s\'...', OVERRIDE_CONFIGURATION_FILE)
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Load any additional configuration files passed in (none when the option was not given).
    for extra_path in arguments.additional_config or ():
        if not os.path.exists(extra_path):
            log.debug('Configuration file \'%s\' does not exist!', extra_path)
            continue
        log.debug('Loading additional configuration file \'%s\'...', extra_path)
        configuration.add_config_path(extra_path)

    # Tell luigi what dependencies to pass to the Hadoop nodes
    # - edx.analytics.tasks is used to load the pipeline code, since we cannot trust all will be loaded automatically.
    # - boto is used for all direct interactions with s3.
    # - cjson is used for all parsing event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    #   - opaque_keys extensions:  ccx_keys
    #   - dependencies of opaque_keys:  bson, stevedore
    luigi.hadoop.attach(edx.analytics.tasks)
    hadoop_modules = (boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601, requests)
    luigi.hadoop.attach(*hadoop_modules)

    ccx_enabled = configuration.getboolean('ccx', 'enabled', default=False)
    if ccx_enabled:
        import ccx_keys
        luigi.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    profiler_name = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler_name, profiler_path):
        luigi.run(cmdline_args)
コード例 #4
0
def test_extract_user_metrics(mocker):
    """Run ExtractUserMetricsMock on a patched input and check the output size.

    ``luigi.Task.input`` is patched to return the local user-profile fixture;
    the JSON written to the mock target must hold two entries under "root".
    """
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')

    profile_fixture = luigi.LocalTarget("data/user_profile.json")
    mocker.patch('luigi.Task.input', return_value=profile_fixture)

    task = ExtractUserMetricsMock(file_number=0)
    luigi.build([task], local_scheduler=True, no_lock=True, workers=1)

    written = MockTarget.fs.get_data('/tmp/a.txt')
    result = json.loads(written)
    assert len(result.get("root")) == 2
コード例 #5
0
def test_fetch_user_list(requests_mock):
    """Run FetchUserListMock against a mocked GET response and check the output.

    Exactly one HTTP request is expected, and the JSON written to the mock
    target must contain ten items.
    """
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')
    with open('data/publication_response.json') as input_file:
        medium_url = re.compile("https://medium.com/*.*")
        response_body = input_file.read()
        requests_mock.register_uri(method='GET', url=medium_url, text=response_body)

        task = FetchUserListMock()
        luigi.build([task], local_scheduler=True, no_lock=True, workers=1)

        assert requests_mock.call_count == 1
        written = MockTarget.fs.get_data('/tmp/a.txt')
        result = json.loads(written)
        assert len(result) == 10
コード例 #6
0
def test_fetch_user_profiles(mocker, requests_mock):
    """Run FetchUserProfileMock with a patched input and a mocked GET response.

    ``luigi.Task.input`` is patched to return the local user-list fixture.
    Eight HTTP requests are expected, and the JSON written to the mock target
    must hold eight entries under "root".
    """
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')
    with open('data/user_profile_response.json') as input_file:
        user_list_fixture = luigi.LocalTarget("data/user_list.json")
        mocker.patch('luigi.Task.input', return_value=user_list_fixture)

        medium_url = re.compile("https://medium.com/*.*")
        response_body = input_file.read()
        requests_mock.register_uri(method='GET', url=medium_url, text=response_body)

        task = FetchUserProfileMock(file_number=0)
        luigi.build([task], local_scheduler=True, no_lock=True, workers=1)

        assert requests_mock.call_count == 8
        written = MockTarget.fs.get_data('/tmp/a.txt')
        result = json.loads(written)
        assert len(result.get("root")) == 8
コード例 #7
0
def main():
    """Parse launcher arguments, layer configuration files and run Luigi.

    ``--additional-config`` may be given multiple times. The override
    configuration file is added first (when it exists), followed by each
    existing additional file in the order supplied; missing files are only
    reported at DEBUG level. The listed modules are attached via
    ``luigi.contrib.hadoop.attach``; ``ccx_keys`` is imported and attached
    only when the ``[ccx] enabled`` option is true. Luigi is finally invoked
    through ``luigi.retcodes.run_with_retcodes`` with the cleaned command-line
    arguments, inside the ``profile_if_necessary`` context.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--additional-config',
        action='append',
        default=None,
        help='additional configuration file to be loaded after default/override',
    )
    arguments = parser.parse_known_args()[0]

    # We get a cleaned command-line arguments list, free of the arguments *we* care about, since Luigi will throw
    # errors when it sees arguments that it or the workflow didn't specify.  We pass these in when invoking Luigi.
    cmdline_args = get_cleaned_command_line_args()

    # DEBUG-level logging is enabled so that errors during extension loading are visible.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Load the override configuration if it's specified/exists.
    configuration = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file \'%s\' does not exist!', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Loading override configuration \'%s\'...', OVERRIDE_CONFIGURATION_FILE)
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Load any additional configuration files passed in (none when the option was not given).
    for extra_path in arguments.additional_config or ():
        if not os.path.exists(extra_path):
            log.debug('Configuration file \'%s\' does not exist!', extra_path)
            continue
        log.debug('Loading additional configuration file \'%s\'...', extra_path)
        configuration.add_config_path(extra_path)

    # Tell luigi what dependencies to pass to the Hadoop nodes:
    # - edx.analytics.tasks is used to load the pipeline code, since we cannot trust all will be loaded automatically.
    # - boto is used for all direct interactions with s3.
    # - cjson is used for all parsing event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    #   - opaque_keys extensions:  ccx_keys
    #   - dependencies of opaque_keys:  bson, stevedore, six
    # - requests has several dependencies:
    #   - chardet, urllib3, certifi, idna
    luigi.contrib.hadoop.attach(edx.analytics.tasks)
    hadoop_modules = (
        boto, cjson, filechunkio, opaque_keys, bson, stevedore, six,
        ciso8601, chardet, urllib3, certifi, idna, requests,
    )
    luigi.contrib.hadoop.attach(*hadoop_modules)

    ccx_enabled = configuration.getboolean('ccx', 'enabled', default=False)
    if ccx_enabled:
        import ccx_keys
        luigi.contrib.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder
    profiler_name = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler_name, profiler_path):
        luigi.retcodes.run_with_retcodes(cmdline_args)
コード例 #8
0
 def test_add_without_install(self):
     """Adding a TOML config path must raise ImportError while the TOML parser is disabled.

     ``LuigiTomlParser.enabled`` is switched off for the duration of the
     check and restored to its previous value afterwards.
     """
     enabled = LuigiTomlParser.enabled
     LuigiTomlParser.enabled = False
     try:
         with self.assertRaises(ImportError):
             add_config_path('test/testconfig/luigi.toml')
     finally:
         # Restore the flag even when the assertion fails, so the modified
         # class attribute does not leak into other tests.
         LuigiTomlParser.enabled = enabled
コード例 #9
0
 def setUpClass(cls):
     """Register the TOML test configuration files, base file first."""
     toml_paths = (
         'test/testconfig/luigi.toml',
         'test/testconfig/luigi_local.toml',
     )
     for toml_path in toml_paths:
         add_config_path(toml_path)
コード例 #10
0
ファイル: config_toml_test.py プロジェクト: spotify/luigi
 def test_add_without_install(self):
     """Adding a TOML config path must raise ImportError while the TOML parser is disabled.

     ``LuigiTomlParser.enabled`` is switched off for the duration of the
     check and restored to its previous value afterwards.
     """
     enabled = LuigiTomlParser.enabled
     LuigiTomlParser.enabled = False
     try:
         with self.assertRaises(ImportError):
             add_config_path('test/testconfig/luigi.toml')
     finally:
         # Restore the flag even when the assertion fails, so the modified
         # class attribute does not leak into other tests.
         LuigiTomlParser.enabled = enabled
コード例 #11
0
ファイル: config_toml_test.py プロジェクト: spotify/luigi
 def setUpClass(cls):
     """Register the TOML test configuration files, base file first."""
     toml_paths = (
         'test/testconfig/luigi.toml',
         'test/testconfig/luigi_local.toml',
     )
     for toml_path in toml_paths:
         add_config_path(toml_path)