Example #1
def addModeledPrt():
    for tname in [line.rstrip() for line in open("../dat/casf_names.txt")]:
        luigi.build([AddCenter2Modeled(tname,
                                       version="0.7"),
                     AddCenter2Modeled(tname,
                                       version="0.5")],
                    local_scheduler=True)
Example #2
    def test_row_overload(self):
        """Overload the rows method and we should be able to insert data into database"""

        class SQLARowOverloadTest(sqla.CopyToTable):
            columns = [(["item", sqlalchemy.String(64)], {}), (["property", sqlalchemy.String(64)], {})]
            connection_string = CONNECTION_STRING
            table = "item_property"
            chunk_size = 1

            def rows(self):
                tasks = [
                    ("item0", "property0"),
                    ("item1", "property1"),
                    ("item2", "property2"),
                    ("item3", "property3"),
                    ("item4", "property4"),
                    ("item5", "property5"),
                    ("item6", "property6"),
                    ("item7", "property7"),
                    ("item8", "property8"),
                    ("item9", "property9"),
                ]
                for row in tasks:
                    yield row

        task = SQLARowOverloadTest()
        luigi.build([task], local_scheduler=True, workers=self.NUM_WORKERS)
        self._check_entries(self.engine)
Example #3
def main(tname, subset):
    luigi.build([Curate.PairWisePsScore(tname=tname,
                                        subset=subset),
                 Curate.PairWiseTanimoto(tname=tname,
                                         subset=subset)],
                local_scheduler=True)
Example #4
 def test_unified_snapshot(self):
     task = UnifiedSnapshotMock()
     luigi.build([task], local_scheduler=True)
     print(task.output().path)
     with task.output().open() as handle:
         lines = handle.readlines()
         self.assertEqual(186, len(lines))
Example #5
    def setUp(self):
        super(TestVisualiser, self).setUp()

        x = 'I scream for ice cream'
        task = UberTask(base_task=FailingMergeSort, x=x, copies=4)
        luigi.build([task], workers=1, scheduler_port=self.get_http_port())

        self.done = threading.Event()

        def _do_ioloop():
            # Enter ioloop for maximum TEST_TIMEOUT.  Check every 2s whether the test has finished.
            print('Entering event loop in separate thread')

            for i in range(TEST_TIMEOUT):
                try:
                    self.wait(timeout=1)
                except AssertionError:
                    pass
                if self.done.is_set():
                    break

            print('Exiting event loop thread')

        self.iothread = threading.Thread(target=_do_ioloop)
        self.iothread.start()
Example #6
def test():
    luigi.build(
        [
            Extract("1b9vA"),
            PrepareLigStateEnsemble("1b9vA"),
        ],
        local_scheduler=True)
Example #7
 def closest(self):
     task = VIAFLatestDate()
     luigi.build([task], local_scheduler=True)
     with task.output().open() as handle:
         date, _ = next(handle.iter_tsv(cols=('date', 'url')))
         dateobj = datetime.date(*map(int, date.split('-')))
     return dateobj
Example #8
def main():
    # get the arguments from the command line
    parser = build_arg_parser()
    cmdline_args = parser.parse_args()

    # get a named tuple of indexes v1...vN
    indexes = Load.label_indices(
        cmdline_args.backup_count + 1, cmdline_args.index)

    # get the end class
    task = Load(indexes=indexes,
                mapping_file=cmdline_args.mapping_file,
                settings_file=cmdline_args.settings_file,
                docs_file=cmdline_args.docs_file,
                table=cmdline_args.table,
                sql_filter=cmdline_args.sql_filter,
                marker_table=cmdline_args.marker_table, 
                es_timeout=cmdline_args.es_timeout)
    if cmdline_args.clear:
        clear(task)
    else:
        if cmdline_args.restart:
            clear(task)
        luigi.build([task], local_scheduler=True)
    # luigi suppresses exceptions
    if capture_task_exceptions:
        raise Exception(
            'A luigi.Task failed, see traceback in the Luigi Execution Summary')
Example #9
 def requires(self):
     task = SWBOpenDataDates(date=self.date)
     luigi.build([task], local_scheduler=True)
     with task.output().open() as handle:
         for row in handle.iter_tsv(cols=('date',)):
             dateobj = datetime.date(*map(int, row.date.split('-')))
             yield SWBOpenDataListified(date=dateobj)
Example #10
    def run(self):
        output = shellout("cut -f2 {input} | LANG=C sort | LANG=C uniq > {output}", input=self.input().path)
        with open(output) as handle:
            dates = [line.strip() for line in handle]

        with self.output().open('w') as output:
            for date in dates:
                dateobj = datetime.date(*map(int, date.split('-')))
                marc = SWBOpenDataMarc(date=dateobj)
                sdb = SWBOpenDataSeekMapDB(date=dateobj)
                luigi.build([marc, sdb], local_scheduler=True)
                with open(marc.output().path) as handle:
                    with sqlite3db(sdb.output().path) as cursor:
                        idset = df[df.date == date].id.values.tolist()
                        limit, offset = self.limit, 0
                        while True:
                            cursor.execute("""
                                SELECT offset, length
                                FROM seekmap WHERE id IN (%s)""" % (
                                    ','.join(("'%s'" % id for id in idset[offset:offset + limit]))))
                            rows = cursor.fetchall()
                            if not rows:
                                break
                            else:
                                copyregions(handle, output, rows)
                                offset += limit
Example #11
def test():
    task1 = VinaResultAccuracy("3ofl_JHM_B_1.pdb")
    task2 = QueryVinaResultOnIdenticalTemplate("3ofl_JHM_B_1.pdb")
    task3 = QueryVinaResultOnIdenticalTemplate("1yst_U10_L_1.pdb")
    task4 = QueryVinaResultOnIdenticalTemplate("3mbm_717_C_1.pdb")
    task5 = QueryVinaResultOnIdenticalTemplate("3jdw_ORN_A_1.pdb")
    luigi.build([task1, task2, task3, task4, task5], local_scheduler=True)
Example #12
    def test_update_table_task(self, mock_config):
        mock_config.get_config.return_value.get.return_value = AWS_ACCESS_KEY
        t = TestUpdateDynamoDBTableTask()

        # mock s3 location for writing output token
        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        s3_client.s3.create_bucket("mybucket")

        # create table
        table_name = "dynamo_table1"
        schema = [HashKey("my_hash", data_type=STRING)]
        indexes = [
            AllIndex(
                "IndexName", parts=[HashKey("my_hash", data_type=STRING), RangeKey("range_index", data_type=NUMBER)]
            )
        ]
        throughput = {"read": 2, "write": 4}
        client = DynamoDBClient(aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
        client.create_table(table_name, schema, throughput, indexes=indexes)

        luigi.build([t], local_scheduler=True)

        table = client.get_table("dynamo_table1")
        self.assertEqual(8, table.throughput["read"])
        self.assertEqual(16, table.throughput["write"])
Example #13
def test():
    luigi.build(
        [AddNativeCenter("3owjA00"), AddCenter2Modeled("3owjA00",
                                                       version="0.7"),
         AddCenter2Modeled("3owjA00",
                           version="0.5")],
        local_scheduler=True)
Example #14
    def test_column_row_separator(self):
        """
        Test alternate column row separator works
        :return:
        """

        class ModBaseTask(luigi.Task):
            def output(self):
                return MockFile("ModBaseTask", mirror_on_stderr=True)

            def run(self):
                out = self.output().open("w")
                tasks = ["item%d,property%d\n" % (i, i) for i in range(10)]
                for task in tasks:
                    out.write(task)
                out.close()

        class ModSQLATask(sqla.CopyToTable):
            columns = [(["item", sqlalchemy.String(64)], {}), (["property", sqlalchemy.String(64)], {})]
            connection_string = CONNECTION_STRING
            table = "item_property"
            column_separator = ","
            chunk_size = 1

            def requires(self):
                return ModBaseTask()

        task1, task2 = ModBaseTask(), ModSQLATask()
        luigi.build([task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)
        self._check_entries(self.engine)
Example #15
def mariobros(
        targets=('DEFAULT',), mariofile='mario.ini', print_ns=False, dry_run=False, workers=1,
        port=8082,
        **kwargs
):
    """Main mariobros entry point. Parse the configuration file and launch the build of targets.

    :param sequence targets: List of targets.
    :param unicode mariofile: MarioFile name.
    :param bool print_ns: Flag to print namespace.
    :param bool dry_run: Dry run flag.
    :param int workers: Number of workers.
    :param dict kwargs: Passed to the luigi.build function.
    """
    assert all(isinstance(target, str) for target in targets)

    if dry_run and workers > 1:
        workers = 1
        mario.LOGGER.warning('Dry run is incompatible with multiprocessing. Setting --workers=1')
    section_namespaces = mariofile_.parse_mariofile(mariofile)
    default_namespace, rendered_namespaces = mario.render_config(section_namespaces)
    if print_ns:
        namespaces = mario.print_namespaces(default_namespace, section_namespaces)
        print(namespaces)
    else:
        target_tasks = mario.mario(
            rendered_namespaces, default_namespace, targets=targets, dry_run=dry_run
        )
        luigi.build(target_tasks, workers=workers, scheduler_port=port, **kwargs)
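A hypothetical invocation of this entry point, assuming a mario.ini in the working directory that defines an "all" target (local_scheduler is forwarded to luigi.build via **kwargs):

# hypothetical usage: build the 'all' target with two workers on the local scheduler
mariobros(targets=('all',), mariofile='mario.ini', workers=2, local_scheduler=True)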
Example #16
    def test_multiple_tasks(self):
        """
        Test a case where there are multiple tasks
        :return:
        """

        class SmallSQLATask(sqla.CopyToTable):
            item = luigi.Parameter()
            property = luigi.Parameter()
            columns = [(["item", sqlalchemy.String(64)], {}), (["property", sqlalchemy.String(64)], {})]
            connection_string = CONNECTION_STRING
            table = "item_property"
            chunk_size = 1

            def rows(self):
                yield (self.item, self.property)

        class ManyBaseTask(luigi.Task):
            def requires(self):
                for t in TASK_LIST:
                    item, property = t.strip().split("\t")
                    yield SmallSQLATask(item=item, property=property)

        task2 = ManyBaseTask()
        luigi.build([task2], local_scheduler=True, workers=self.NUM_WORKERS)
        self._check_entries(self.engine)
Example #17
 def test_customized_worker(self):
     a = DummyTask(3)
     self.assertFalse(a.complete())
     self.assertFalse(self.worker.complete())
     luigi.build([a], scheduler_instance=self.scheduler, worker_instance=self.worker)
     self.assertTrue(a.complete())
     self.assertTrue(self.worker.complete())
Example #18
 def test_map_only(self):
     luigi.build([MapOnlyJob()], local_scheduler=True)
     c = []
     for line in File("luigitest-3").open("r"):
         c.append(line.strip())
     self.assertEqual(c[0], "kj")
     self.assertEqual(c[4], "ljoi")
Example #19
 def test_map_only(self):
     luigi.build([MapOnlyJob()], local_scheduler=True)
     c = []
     for line in File('luigitest-3').open('r'):
         c.append(line.strip())
     self.assertEqual(c[0], 'kj')
     self.assertEqual(c[4], 'ljoi')
Example #20
    def run(self):

        self.setup_arguments()
        self.process_arguments(self.parser.parse_args())
        self.log_arguments()

        luigi.build(self.create_tasks(), local_scheduler=self.local_scheduler, workers=self.workers)
Example #21
    def test_extract(self, mock_config):
        """
        Tests that the mysql extraction runs correctly.
        """
        output_file = os.path.join(tempfile.gettempdir(), 'extract-%s.txt' % uuid.uuid4().hex)
        conf = ConfigParser.ConfigParser()
        conf.add_section('mysql')
        conf.set('mysql', 'dbname', 'mydb')
        conf.set('mysql', 'host',   'myhost')
        conf.set('mysql', 'port', '3306')
        conf.set('mysql', 'user', 'myuser')
        conf.set('mysql', 'password', 'my_password')
        mock_config.get_config.return_value = conf
        with mock.patch("mortar.luigi.dbms.subprocess") as subprocess:
            subprocess_return = mock.Mock()
            subprocess_return.stdout = StringIO.StringIO()
            subprocess_return.stderr = StringIO.StringIO()
            subprocess_return.communicate.return_value = (None, None)
            subprocess_return.returncode = 0
            subprocess.Popen.return_value = subprocess_return

            t = dbms.ExtractFromMySQL(table='foo', output_path=output_file)
            self.assertFalse(t.output()[0].exists())
            luigi.build([t], local_scheduler=True)
            self.assertTrue(t.output()[0].exists())

            os.remove(output_file)
Example #22
def main():
    op = OptionParser()
    op.add_option('--interval', '-i', default=1000)
    op.add_option('--directory', '-d')
    op.add_option('--config', '-c')
    op.add_option('--start', '-s')
    op.add_option('--end', '-e')
    op.add_option('--workflow', '-w')
    op.add_option('--filetype', '-f', default='json')

    options, arguments = op.parse_args()

    if not options.config:
        op.error('No configuration YAML')
    if not options.directory:
        op.error('No input file directory')

    if not options.workflow:
        op.error('No workflow specified')

    files = glob.glob(
        os.path.join(
            options.directory, '*.{0}'.format(options.filetype)
        )
    )
    if not files:
        op.error(
            'Empty input file directory (no {0})'.format(options.filetype)
        )

    try:
        interval = int(options.interval)
    except (TypeError, ValueError):
        op.error('Non-integer interval value')

    try:
        start_index = int(options.start)
    except (TypeError, ValueError):
        start_index = 0
    try:
        end_index = int(options.end)
    except (TypeError, ValueError):
        end_index = len(files)

    # this only works for the workflows imported above (and they
    # need to be imported, obv). things like the eda, which aren't
    # working on a single file dependency tree, need something else
    try:
        workflow_class = getattr(sys.modules[__name__], options.workflow)
    except AttributeError:
        op.error('Unable to load workflow for {0}'.format(options.workflow))

    for i in range(start_index, end_index, interval):
        w = workflow_class(
            doc_dir=options.directory,
            yaml_file=options.config,
            start_index=i,
            end_index=(i + interval) if (i + interval) < end_index else end_index
        )
        luigi.build([w], local_scheduler=True)
Example #23
    def test_rows(self):
        task, task0 = SQLATask(), BaseTask()
        luigi.build([task, task0], local_scheduler=True, workers=self.NUM_WORKERS)

        for i, row in enumerate(task.rows()):
            given = TASK_LIST[i].strip("\n").split("\t")
            self.assertEqual(row, given)
Example #24
 def test_customized_worker(self):
     a = DummyTask(3)
     self.assertFalse(a.complete())
     self.assertFalse(self.worker_scheduler_factory.worker.complete())
     luigi.build([a], worker_scheduler_factory=self.worker_scheduler_factory)
     self.assertTrue(a.complete())
     self.assertTrue(self.worker_scheduler_factory.worker.complete())
Example #25
    def test_reflect(self):
        """
        If the table is setup already, then one can set reflect to True, and
        completely skip the columns part. It is not even required at that point.
        :return:
        """

        class AnotherSQLATask(sqla.CopyToTable):
            connection_string = CONNECTION_STRING
            table = "item_property"
            reflect = True
            chunk_size = 1

            def requires(self):
                return SQLATask()

            def copy(self, conn, ins_rows, table_bound):
                ins = (
                    table_bound.update()
                    .where(table_bound.c.property == sqlalchemy.bindparam("_property"))
                    .values({table_bound.c.item: sqlalchemy.bindparam("_item")})
                )
                conn.execute(ins, ins_rows)

            def rows(self):
                for line in TASK_LIST:
                    yield line.strip("\n").split("\t")

        task0, task1, task2 = AnotherSQLATask(), SQLATask(), BaseTask()
        luigi.build([task0, task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)
        self._check_entries(self.engine)
Example #26
 def test_use_json_as_data_interchange_format_job(test_case):
     job = UseJsonAsDataInteterchangeFormatJob(use_hdfs=test_case.use_hdfs)
     luigi.build([job], local_scheduler=True)
     c = []
     for line in job.output().open("r"):
         c.append(line)
     test_case.assertEqual(c, ['{"data type": "json"}\n'])
Example #27
def main(sdf):
    luigi.build([
        PairwiseCms(sdf, maximum_radius=8.0),
        PairwiseCmsEvenlyDistributed(sdf, maximum_radius=8.0),
        PairwiseCmsEvenlyGryration(sdf)
    ],
                local_scheduler=True)
Example #28
 def test_run_job_with_dump(self, mock_check_output):
     mock_check_output.side_effect = [
         'Your job 12345 ("test_job") has been submitted',
         ''
     ]
     task = TestJobTask(i=1, n_cpu=1, shared_tmp_dir='/tmp')
     luigi.build([task], local_scheduler=True)
     self.assertEqual(mock_check_output.call_count, 2)
Example #29
 def test_output_token(self):
     """
     Output should be the given token path and the class name
     """
     t = TestShellScriptTask(self.token_path)
     self.t = t
     luigi.build([t], local_scheduler=True)
     self.assertEqual('%s/%s' % (self.token_path, t.__class__.__name__), t.output()[0].path)
Example #30
 def test_map_only(test_case):
     job = MapOnlyJob(use_hdfs=test_case.use_hdfs)
     luigi.build([job], local_scheduler=True)
     c = []
     for line in job.output().open("r"):
         c.append(line.strip())
     test_case.assertEqual(c[0], "kj")
     test_case.assertEqual(c[4], "ljoi")
Example #31
 def test_inheritance(self):
     t = PowerSum(lo=42, hi=45, p=2)
     luigi.build([t], local_scheduler=True)
     self.assertEqual(t.s, 42**2 + 43**2 + 44**2)
Example #32
        yield bss2_handle


if __name__ == "__main__":
    # Set up the command line parameters
    PARSER = argparse.ArgumentParser(
        description="BS Seeker 2 Peak Caller Pipeline Wrapper")
    PARSER.add_argument("--genome_fa", help="")
    PARSER.add_argument("--genome_idx", help="")
    PARSER.add_argument("--aligner", help="")
    PARSER.add_argument("--aligner_path", help="")
    PARSER.add_argument("--bss_path", help="")
    PARSER.add_argument("--shared_tmp_dir", help="")
    PARSER.add_argument("--python_path", default=sys.executable, help="")

    # Get the matching parameters from the command line
    ARGS = PARSER.parse_args()

    SHARED_TMP_DIR = ARGS.shared_tmp_dir

    luigi.build([
        BSseeker2Indexer(genome_fa=ARGS.genome_fa,
                         genome_idx=ARGS.genome_idx,
                         aligner=ARGS.aligner,
                         aligner_path=ARGS.aligner_path,
                         bss_path=ARGS.bss_path,
                         user_python_path=ARGS.python_path)
    ],
                local_scheduler=True,
                workers=5)
Example #33
import state_to_state_transitions2 as sst
import pandas as pd
import numpy as np
import luigi


class wrapper(luigi.WrapperTask):
    def requires(self):
        files = ['Session.csv', 'lead.csv', 'opportunity.csv', 'complete.csv']
        task_list = []
        for i in range(1, len(files)):
            path = '/Users/emmanuels/Documents/AttributionData/Data/'
            one = path + str(files[i - 1])
            two = path + str(files[i])
            task_list.append(
                sst.state_to_state(first_file=one, second_file=two))
        # return after the loop so every transition task is required,
        # not only the first one
        return task_list

    def run(self):
        print('Wrapper ran')
        pd.DataFrame().to_csv(
            '/Users/emmanuels/Documents/AttributionData/Data/wrangler1.csv')

    def output(self):
        return luigi.LocalTarget(
            '/Users/emmanuels/Documents/AttributionData/Data/wrangler1.csv')


if __name__ == '__main__':
    luigi.build([wrapper()], workers=8, local_scheduler=True)
Example #34
 def test_priority_w_dep(self):
     x, y, z = PrioTask(25), PrioTask(15), PrioTask(5)
     a, b, c = PrioTask(24), PrioTask(14), PrioTask(4)
     luigi.build([a, b, c, x, y, z], local_scheduler=True)
     self.assertTrue(z.t < y.t < x.t < c.t < b.t < a.t)
Example #35
def run_tasks_for_dry_run(tasks_to_run):
    for type in [
            "failure",
            "success",
            "timeout",
            "process_failure",
            "processing_time",
            "broken_task",
    ]:
        os.makedirs(Path(constants.RESULTS_DIRECTORY) / type)

    run_result = luigi.build(
        tasks_to_run,
        local_scheduler=True,
        detailed_summary=True,
        workers=10,
        log_level='INFO',
    )
    for filename in glob('results/failure/*.json'):
        result = json.loads(open(filename, 'r').read())
        click.echo(
            colorclass.Color("{red}" + result.get('task_type') +
                             " failed{/red}"))
        click.echo(
            f"{yaml.safe_dump({'parameters':result.get('task_params')})}")
        click.echo("\n".join(result.get('exception_stack_trace')))
        click.echo('')
    exit_status_codes = {
        LuigiStatusCode.SUCCESS: 0,
        LuigiStatusCode.SUCCESS_WITH_RETRY: 0,
        LuigiStatusCode.FAILED: 1,
        LuigiStatusCode.FAILED_AND_SCHEDULING_FAILED: 2,
        LuigiStatusCode.SCHEDULING_FAILED: 3,
        LuigiStatusCode.NOT_RUN: 4,
        LuigiStatusCode.MISSING_EXT: 5,
    }

    click.echo("Dry run results")
    table_data = [
        [
            'Result', 'Launch', 'Account', 'Region', 'Current Version',
            'New Version', 'Notes'
        ],
    ]
    table = terminaltables.AsciiTable(table_data)
    for filename in glob('output/TerminateProductDryRunTask/*.json'):
        result = json.loads(open(filename, 'r').read())
        table_data.append([
            result.get('effect'),
            result.get('params').get('launch_name'),
            result.get('params').get('account_id'),
            result.get('params').get('region'),
            result.get('current_version'),
            result.get('new_version'),
            result.get('notes'),
        ])
    for filename in glob('output/ProvisionProductDryRunTask/*.json'):
        result = json.loads(open(filename, 'r').read())
        table_data.append([
            result.get('effect'),
            result.get('params').get('launch_name'),
            result.get('params').get('account_id'),
            result.get('params').get('region'),
            result.get('current_version'),
            result.get('new_version'),
            result.get('notes'),
        ])
    click.echo(table.table)
    sys.exit(exit_status_codes.get(run_result.status))
Example #36
 def run(cls):
     cls.logger.info('Running Systems Checks...')
     return luigi.build([SystemCheckTask()], local_scheduler=True)
Example #37
def main(config):
    luigi.build([groupEmbTags(config=config)])
Example #38
        combinations = {
            'experiment': 'FF',
            'setups': [setup],
            'iterations': [50000, 100000, 150000],
            'samples': [validation],
            'thresholds': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
            'merge_functions': ['mean_aff', 'max_aff'],
            'init_with_maxs': [False],
            'custom_fragments': True,
            'histogram_quantiless': [False],
            'discrete_queues': [True],
            'keep_segmentation': True,
            'dilate_mask': 0,
            'mask_fragments': True
        }

        range_keys = [
            'setups', 'iterations', 'samples', 'merge_functions',
            'init_with_maxs', 'histogram_quantiless', 'discrete_queues'
        ]

        jobs.append(EvaluateCombinations(combinations, range_keys))

    set_base_dir(os.path.abspath('..'))

    luigi.build(
        jobs,
        workers=50,
        scheduler_host='slowpoke1.int.janelia.org',
        logging_conf_file='/groups/saalfeld/home/funkej/.luigi/logging.conf')
Example #39
 def test_Ok(self):
     task = TestTask()
     luigi.build([task], local_scheduler=True)
Example #40
from luigi import build
import bqe_search

from bqe_search import combined_search

build(
    [
        combined_search(gene_list=["p53", "CCND1"],
                        combination_terms=["apoptosis", "cell cycle"])
    ],
    local_scheduler=True,
)
Example #41
def run(tasks,
        forced=None,
        forced_all=False,
        forced_all_upstream=False,
        confirm=True,
        workers=1,
        abort=True,
        **kwargs):
    """
    Run tasks locally. See luigi.build for additional details

    Args:
        tasks (obj, list): task or list of tasks
        forced (list): list of forced tasks
        forced_all (bool): force all tasks
        forced_all_upstream (bool): force all tasks including upstream
        confirm (list): confirm invalidating tasks
        workers (int): number of workers
        abort (bool): on errors raise exception
        kwargs: keywords to pass to luigi.build

    """
    if not isinstance(tasks, (list, )):
        tasks = [tasks]

    if forced_all:
        forced = tasks
    if forced_all_upstream:
        for t in tasks:
            invalidate_upstream(t, confirm=confirm)
    if forced is not None:
        if not isinstance(forced, (list, )):
            forced = [forced]
        invalidate = []
        for tf in forced:
            for tup in tasks:
                invalidate.append(d6tflow.taskflow_downstream(tf, tup))
        invalidate = set().union(*invalidate)
        invalidate = {t for t in invalidate if t.complete()}
        if len(invalidate) > 0:
            if confirm:
                print('Forced tasks', invalidate)
                c = input('Confirm invalidating forced tasks (y/n)')
            else:
                c = 'y'
            if c == 'y':
                [t.invalidate(confirm=False) for t in invalidate]
            else:
                return None

    opts = {
        **{
            'workers': workers,
            'local_scheduler': True,
            'log_level': d6tflow.settings.log_level
        },
        **kwargs
    }
    result = luigi.build(tasks, **opts)
    if abort and not result:
        raise RuntimeError('Exception found running flow, check trace')

    return result
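A hypothetical call of this helper, assuming a d6tflow graph where TaskB depends on TaskA; per the docstring, forcing TaskA also invalidates its downstream TaskB before rebuilding:

# hypothetical tasks: invalidate TaskA and everything downstream of it, then rebuild TaskB
run(TaskB(), forced=TaskA(), confirm=False, workers=2)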
Example #42
                    'Args': [
                        "spark-submit",
                        "s3a://tar7/agg.py",
                    ]
                }
            }])

# TODO: a step is still missing here to wait for the EMR steps to finish
    #client.terminate_job_flows(JobFlowIds=[clusters['Clusters'][0]['Id']])

    # busy-wait for up to 360 seconds before checking the cluster state
    while (time.time() - start_time < 360):

        flag = 1

    clusters = client.list_clusters()

    if (clusters['Clusters'][0]['Status']['State'] in ['TERMINATING']):

        #client.terminate_job_flows(JobFlowIds=[clusters['Clusters'][0]['Id']])

        print('cluster finished')

    else:
        sys.exit('time limit exceeded')

    os.system("aws s3 cp --recursive s3://tar7/promedio.parquet ./")

    # THE CODE GOES IN HERE -------------------------------------------------

luigi.build([StartPipeline()])
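One way to fill in the TODO above is an EMR step waiter; a minimal sketch, assuming boto3 and that the cluster id is taken from clusters['Clusters'][0]['Id'] (wait_for_steps is a hypothetical helper, not part of the snippet's pipeline):

import boto3

def wait_for_steps(cluster_id, region='us-east-1'):
    # hypothetical helper: block until every step on the cluster has finished
    client = boto3.client('emr', region_name=region)
    step_ids = [s['Id'] for s in client.list_steps(ClusterId=cluster_id)['Steps']]
    waiter = client.get_waiter('step_complete')
    for step_id in step_ids:
        # polls the step state until COMPLETED, raising on CANCELLED/FAILED
        waiter.wait(ClusterId=cluster_id, StepId=step_id)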
Example #43
def predict(device_mapping, target):
    max_jobs = len(device_mapping)
    tmp_folder = './tmp_inference'
    input_path = '/g/kreshuk/data/FIB25/data.n5'
    output_path = input_path
    roi_begin = roi_end = None

    in_key = 'volumes/raw/s0'
    out_key = {'volumes/affinities/s0': (0, 3)}

    mask_path = input_path
    mask_key = 'volumes/masks/minfilter/s5'

    config_folder = './config_inference'
    if not os.path.exists(config_folder):
        os.mkdir(config_folder)

    input_blocks = (192, ) * 3
    # remove (16, 16, 16) pixels from each side in the output
    output_blocks = (160, ) * 3
    halo = [(ib - ob) // 2 for ib, ob in zip(input_blocks, output_blocks)]
    print("Found halo", halo)

    shebang = "#! /g/kreshuk/pape/Work/software/conda/miniconda3/envs/torch/bin/python"
    global_config = InferenceLocal.default_global_config()
    global_config.update({
        'shebang': shebang,
        'block_shape': output_blocks,
        'roi_begin': roi_begin,
        'roi_end': roi_end
    })
    with open(os.path.join(config_folder, 'global.config'), 'w') as f:
        json.dump(global_config, f)

    config = InferenceLocal.default_task_config()
    config.update({
        'chunks': [ob // 2 for ob in output_blocks],
        'mem_limit': 32,
        'time_limit': 720,
        'threads_per_job': 4,
        'set_visible_device': False
    })
    with open(os.path.join(config_folder, 'inference.config'), 'w') as f:
        json.dump(config, f)

    ckpt = '/g/kreshuk/matskevych/boundary_map_prediction/project_folder_old/Weights'
    task = InferenceLocal if target == 'local' else InferenceSlurm
    t = task(tmp_folder=tmp_folder,
             max_jobs=max_jobs,
             config_dir=config_folder,
             input_path=input_path,
             input_key=in_key,
             output_path=output_path,
             output_key=out_key,
             mask_path=mask_path,
             mask_key=mask_key,
             checkpoint_path=ckpt,
             framework='inferno',
             halo=halo)
    ret = luigi.build([t], local_scheduler=True)
    assert ret, "Failure"
Example #44
import logging
import luigi
import os
from OrthoEvol.Tools.sge import SGEPipelineTask

# TIP Works on linux
logger = logging.getLogger('luigi-interface')

SGEPipelineTask.shared_tmp_dir = os.getcwd()
SGEPipelineTask.parallel_env = None


class TestPipelineTask(SGEPipelineTask):
    """Example pipeline task."""

    i = luigi.Parameter()

    def work(self):  # Use work instead of run to DEBUG
        logger.info('Running test job...')
        with open(self.output().path, 'w') as f:
            f.write('This is a test job.')

    def output(self):
        return luigi.LocalTarget(
            path=os.path.join(os.getcwd(), 'testjob_' + str(self.i) + '.txt'))


if __name__ == '__main__':
    tasks = [TestPipelineTask(i=str(i), select=i + 1) for i in range(3)]
    luigi.build(tasks, local_scheduler=True, workers=3)
Example #45
 def test_priority(self):
     p, q, r = PrioTask(1), PrioTask(2), PrioTask(3)
     luigi.build([p, q, r], local_scheduler=True)
     self.assertTrue(r.t < q.t < p.t)
Example #46
    def test_invoke(self):
        luigi.build([Popularity(datetime.date(2009, 1, 5))],
                    local_scheduler=True)

        self.assertEqual(
            MockTarget.fs.get_data('/tmp/popularity/2009-01-05.txt'), b'4\n')
Example #47
from sqlalchemy import String
import luigi
from luigi.contrib import sqla


class SQLATask(sqla.CopyToTable):
    # columns defines the table schema, with each element corresponding
    # to a column in the format (args, kwargs) which will be sent to
    # the sqlalchemy.Column(*args, **kwargs)

    reflect = True
    connection_string = "sqlite://"  # in memory SQLite database
    table = "Author"  # name of the table to store data

    columns = [(["First_Name", String(64)], {}), (["Last_Name",
                                                   String(64)], {})]

    def rows(self):
        for row in [("First_Name", "Babacar"), ("Last_Name", "Kane")]:
            yield row


if __name__ == '__main__':
    task = SQLATask()
    luigi.build([task], local_scheduler=True)
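As the comment in the class above notes, each (args, kwargs) pair in columns is expanded into sqlalchemy.Column(*args, **kwargs), so column keyword arguments can ride along in the second element; a hypothetical variation:

# equivalent to sqlalchemy.Column("First_Name", String(64), nullable=False)
columns = [(["First_Name", String(64)], {"nullable": False}),
           (["Last_Name", String(64)], {})]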
Example #48
import luigi
import os


class FileTarget(luigi.Target):
    def __init__(self, filename):
        self.filename = filename

    def exists(self):
        return os.path.isfile(self.filename)


class HelloTask(luigi.Task):
    def run(self):
        with open("hello.txt", "w") as f:
            f.write("Hello Mario!")

    def output(self):
        return FileTarget("hello.txt")


if __name__ == "__main__":
    luigi.build([HelloTask()])
Example #49
def downscale_seg(sample, max_jobs=8, target='local'):
    input_path = '/g/kreshuk/data/cremi/realigned/sample%s_small.n5' % sample
    input_key = 'segmentation/multicut/s0'

    config_dir = './config_ds_seg'
    tmp_folder = './tmp_ds_seg_%s' % sample

    try:
        os.mkdir(config_dir)
    except OSError:
        pass

    config = DownscalingWorkflow.get_config()
    global_config = config['global']
    shebang = '#! /g/kreshuk/pape/Work/software/conda/miniconda3/envs/cluster_env/bin/python'
    global_config.update({'shebang': shebang})
    with open(os.path.join(config_dir, 'global.config'), 'w') as f:
        json.dump(global_config, f)

    ds_config = config['downscaling']
    # FIXME majority vote downscaling is broken
    # ds_config.update({'library': 'skimage', 'threads_per_job': 8})
    ds_config.update({
        'library': 'vigra',
        'library_kwargs': {
            'order': 0
        },
        'threads_per_job': 8
    })
    with open(os.path.join(config_dir, 'downscaling.config'), 'w') as f:
        json.dump(ds_config, f)

    scale_factors = [[1, 2, 2], [1, 2, 2], [1, 2, 2], 2]
    halos = [[0, 10, 10], [0, 10, 10], [0, 10, 10], [10, 10, 10]]

    task = DownscalingWorkflow(tmp_folder=tmp_folder,
                               max_jobs=1,
                               config_dir=config_dir,
                               target='local',
                               input_path=input_path,
                               input_key='segmentation/multicut/s0',
                               output_key_prefix='segmentation/multicut',
                               scale_factors=scale_factors,
                               halos=halos)
    success = luigi.build([task], local_scheduler=True)

    # if the workflow succeeded, compare the downscaled segmentation
    # against the raw data and view both
    if success and target == 'local':
        with z5py.File(input_path) as f:
            # load the raw data at scale level 2
            ds = f['raw/s2']
            ds.n_threads = 8
            raw = ds[:]
            rshape = raw.shape
            # load the downscaled segmentation at scale level 2
            ds = f['segmentation/multicut/s2']
            ds.n_threads = 8
            seg = ds[:]
            mshape = seg.shape
            assert mshape == rshape, "%s %s" % (str(mshape), str(rshape))

        view([raw, seg])
Example #50
 def test_run(self, os_mock):
     t = MortarSqoopTaskTest(path=S3_PATH)
     luigi.build([t], local_scheduler=True)
     self.assertEqual(EXPECTED_ARGV, t.argv)
     self.assertEqual(os.environ['AWS_ACCESS_KEY'], AWS_ACCESS_KEY)
     self.assertEqual(os.environ['AWS_SECRET_KEY'], AWS_SECRET_KEY)
Example #51
class HelloTask(luigi.Task):

    to_whom = luigi.Parameter()

    def run(self):
        print("thinking about what to say...")
        time.sleep(5)
        with open("hello_%s.txt"%self.to_whom, "w") as f:
            f.write("Hello %s!"%self.to_whom)

    def output(self):
        return FileTarget("hello_%s.txt"%self.to_whom)

class ReplyTask(luigi.Task):

    def run(self):
        print("thinking about what to say...")
        time.sleep(5)
        with open("reply.txt", "w") as f:
            f.write("Hello Luigi!")

    def output(self):
        return FileTarget("reply.txt")

    def requires(self):
        return HelloTask(to_whom="Luigi")

if __name__ == "__main__":
    luigi.build([ReplyTask()])
Example #52
 def test_run_options_with_jdbc_jar(self, os_mock):
     t = MortarSqoopTaskTest(path=S3_PATH, jdbc_driver='some/path')
     luigi.build([t], local_scheduler=True)
     option_string = EXPECTED_ARGV + ['-j', 'some/path']
     self.assertEqual(option_string, t.argv)
Example #53
 def test_recursion(self):
     t = LinearSum(lo=42, hi=45)
     luigi.build([t], local_scheduler=True)
     self.assertEqual(t.s, 42 + 43 + 44)
Example #54
 def test_run_options_with_direct(self, os_mock):
     t = MortarSqoopTaskTest(path=S3_PATH, direct=True)
     luigi.build([t], local_scheduler=True)
     option_string = EXPECTED_ARGV + ['--direct']
     self.assertEqual(option_string, t.argv)
Example #55
 def test_run_job(self, mock_open, mock_communicate):
     if on_lsf_master():
         outfile = os.path.join(DEFAULT_HOME, 'testfile_1')
         tasks = [TestJobTask(i=str(i), n_cpu_flag=1) for i in range(3)]
         luigi.build(tasks, local_scheduler=True, workers=3)
         self.assertTrue(os.path.exists(outfile))
Example #56
def run_full_pipeline(project_abs_path, config_dir, config_file_name, local_scheduler_=True):
    # Run pipeline this way
    luigi.build([BuildSolution(project_abs_path, config_dir, config_file_name)], local_scheduler=local_scheduler_)
Example #57
    def test_build_internal(self):
        luigi.build([Fib(100)], local_scheduler=True)

        self.assertEqual(MockTarget.fs.get_data('/tmp/fib_10'), b'55\n')
        self.assertEqual(MockTarget.fs.get_data('/tmp/fib_100'),
                         b'354224848179261915075\n')
Example #58
def run_stacking_only(project_abs_path, config_dir, config_file_name, local_scheduler_=True):
    # Run pipeline this way
    luigi.build([MakeStackingPredictions(project_abs_path, config_dir, config_file_name)],
                local_scheduler=local_scheduler_)
Example #59
    def handle(self, *args, **options):
        build([LineDimLoad(subset=options["full"])], local_scheduler=True)
        df_LineDim = pd.read_parquet("./data/linedim/part.0.parquet",
                                     engine="fastparquet")

        build([AccountDimLoad(subset=options["full"])], local_scheduler=True)
        df_AccountDim = pd.read_parquet("./data/accountdim/part.0.parquet",
                                        engine="fastparquet")

        build([LimitFactLoad(subset=options["full"])], local_scheduler=True)
        df_LimitFact = pd.read_parquet("./data/limitfact/part.0.parquet",
                                       engine="fastparquet")

        build([ByMdn()], local_scheduler=True)
        local_root = "file://data/amplitude/by_mdn/"
        target = ParquetTarget(local_root, flag=False, glob="*.parquet")
        df_ActivityFact = target.read_dask(
            columns=["mdn", "event_time"],
            parse_dates=["event_time"]).compute()

        LineDim.objects.all().delete()

        with transaction.atomic():
            df_LineDim_objs = [
                LineDim(
                    MTN=line["MTN"],
                    Device_Grouping=line["DEVICE_GROUPING"],
                    Sales_Channel=line["SALES_CHANNEL"],
                    SVC_ACT_DT=line["SVC_ACT_DT"],
                ) for idx, line in df_LineDim.iterrows()
            ]
            LineDim.objects.bulk_create(df_LineDim_objs)

        AccountDim.objects.all().delete()
        with transaction.atomic():
            df_AcctDim_objs = [
                AccountDim(
                    Cust_Acct=acct["CUST_ACCT"],
                    Segment_Name=acct["SEGMENT_NAME"],
                    SVC_Plan=acct["SVC_PLAN"],
                ) for idx, acct in df_AccountDim.iterrows()
            ]
            AccountDim.objects.bulk_create(df_AcctDim_objs)

        LimitFact.objects.all().delete()
        with transaction.atomic():
            df_LimitFact_objs = [
                LimitFact(
                    MTN=LineDim.objects.get_or_create(MTN=limit["MTN"])[0],
                    Cust_Acct=AccountDim.objects.get_or_create(
                        Cust_Acct=limit["CUST_ACCT"])[0],
                    LIMIT_DT=limit["LIMITING_DT"],
                    LIMIT_TYPE=limit["LIMIT_TYPE"],
                ) for idx, limit in df_LimitFact.iterrows()
            ]
            LimitFact.objects.bulk_create(df_LimitFact_objs)

        ActivityFact.objects.all().delete()
        with transaction.atomic():
            df_ActivityFact_objs = [
                ActivityFact(
                    MTN=LineDim.objects.get_or_create(MTN=activity["mdn"])[0],
                    EVENT_DT=activity["event_time"],
                ) for idx, activity in df_ActivityFact.iterrows()
            ]
            ActivityFact.objects.bulk_create(df_ActivityFact_objs)
Example #60
def mws_segmentation(offsets, path, input_key, fg_mask_key, output_key,
                     tmp_folder, target, max_jobs, stitch_mode):
    task = MwsWorkflow
    qos = 'normal'

    config_folder = os.path.join(tmp_folder, 'configs')
    os.makedirs(config_folder, exist_ok=True)
    configs = task.get_config()

    # we use a smaller block shape to speed up MWS
    block_shape = [64, 256, 256]
    conf = configs['global']
    shebang = get_default_shebang()
    conf.update({'shebang': shebang, 'block_shape': block_shape})
    with open(os.path.join(config_folder, 'global.config'), 'w') as f:
        json.dump(conf, f)

    # write config for mws block task
    strides = [4, 4, 4]
    conf = configs['mws_blocks']
    conf.update({
        'randomize_strides': True,
        'strides': strides,
        'mem_limit': 12,
        'time_limit': 900
    })
    with open(os.path.join(config_folder, 'mws_blocks.config'), 'w') as f:
        json.dump(conf, f)

    # determine config for the given stitching mode
    if stitch_mode == '':
        stitch_mc = False
    elif stitch_mode == 'biased':
        stitch_mc = True
        beta1, beta2 = 0.5, 0.75
    elif stitch_mode == 'unbiased':
        stitch_mc = True
        beta1 = beta2 = 0.5
    else:
        raise ValueError("Unknown stitch mode: %s" % stitch_mode)

    if stitch_mc:
        # write config for stitching multicut
        conf = configs['stitching_multicut']
        conf.update({'beta1': beta1, 'beta2': beta2, 'qos': qos})
        with open(os.path.join(config_folder, 'stitching_multicut.config'),
                  'w') as f:
            json.dump(conf, f)

        conf = configs['write']
        conf.update({'mem_limit': 8, 'time_limit': 120, 'qos': qos})
        with open(os.path.join(config_folder, 'write.config'), 'w') as f:
            json.dump(conf, f)

        # write config for edge feature task
        conf = configs['block_edge_features']
        conf.update({'offsets': offsets, 'mem_limit': 4, 'qos': qos})
        with open(os.path.join(config_folder, 'block_edge_features.config'),
                  'w') as f:
            json.dump(conf, f)

        conf_names = [
            'merge_edge_features', 'merge_sub_graphs', 'map_edge_ids',
            'simple_stitch_assignments'
        ]
        for name in conf_names:
            conf = configs[name]
            conf.update({
                'mem_limit': 128,
                'time_limit': 240,
                'threads_per_job': 16,
                'qos': qos
            })
            with open(os.path.join(config_folder, '%s.config' % name),
                      'w') as f:
                json.dump(conf, f)

        conf = configs['stitching_multicut']
        # set time limit for the multicut task to 18 hours (in minutes)
        tlim_task = 18 * 60
        # set time limit for the solver to 16 hours (in seconds)
        tlim_solver = 16 * 60 * 60
        conf.update({
            'mem_limit': 256,
            'time_limit': tlim_task,
            'threads_per_job': 16,
            'qos': qos,
            'agglomerator': 'greedy-additive',
            'time_limit_solver': tlim_solver
        })
        with open(os.path.join(config_folder, 'stitching_multicut.config'),
                  'w') as f:
            json.dump(conf, f)

    t = task(tmp_folder=tmp_folder,
             config_dir=config_folder,
             max_jobs=max_jobs,
             target=target,
             input_path=path,
             input_key=input_key,
             output_path=path,
             output_key=output_key,
             mask_path=path,
             mask_key=fg_mask_key,
             stitch_via_mc=stitch_mc,
             offsets=offsets)

    ret = luigi.build([t], local_scheduler=True)
    if not ret:
        raise RuntimeError("Mws segmentation failed")