Example #1
def save_benchmark_timings(path, tablename, table_cache, instance_id: int):

    fn = os.path.join(path, "benchmark.bench")
    if not os.path.exists(fn):
        P.get_logger().warn(
            "file {} does not exist, no tool timings uploaded".format(
                fn))
    else:
        table = pandas.read_csv(fn, sep="\t")
        table["instance_id"] = instance_id
        table_cache.add_table(table, tablename)
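
A minimal sketch of the same read-and-annotate step with an inline buffer instead of a real benchmark.bench file; the column names and the instance_id value are illustrative:

import io
import pandas

# Hypothetical stand-in for a benchmark.bench file.
buf = io.StringIO("tool\twall_time\nbwa_mem\t12.5\nisaac\t8.1\n")
table = pandas.read_csv(buf, sep="\t")
table["instance_id"] = 42  # foreign key added before the table is cached
print(table)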
Example #2
    def __init__(self, database_url, schema):
        self.database_url = database_url
        self.schema = schema
        self.cache = {}

        self.total_size = 0
        self.sizes = collections.defaultdict(int)
        self.uploaded_sizes = collections.defaultdict(int)
        self.dtypes = {}
        self.logger = P.get_logger()
        self.indices = {}
        self.have_created_indices = False
Example #3
def check_unique(tool_functions,
                 input_combos=None,
                 input_regex=None,
                 input_alias=None,
                 is_test=False):
    # compute a list of task names
    names = []
    if input_combos:
        for toolf, input_files in itertools.product(tool_functions,
                                                    input_combos):
            taskf = copy.copy(toolf)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)
    else:
        for toolf in tool_functions:
            taskf = copy.copy(toolf)
            taskf.register_input(regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)

    counts = collections.Counter(names)
    for name, count in list(counts.items()):
        if count > 1:
            make_unique = True
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: {}={}".
                format(name, count))
            break
    else:
        make_unique = False

    return make_unique
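
The duplicate detection above hinges on Python's for/else: the else branch runs only if the loop never hits break. A self-contained sketch with made-up task names:

import collections

names = ["bwa_mem", "isaac", "bwa_mem"]  # illustrative task names
counts = collections.Counter(names)
for name, count in counts.items():
    if count > 1:
        make_unique = True  # at least one duplicate -> disambiguate names
        break
else:
    make_unique = False  # loop finished without break: all names unique
print(make_unique)  # True, because "bwa_mem" occurs twice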
Example #4
def add_columns_to_table(columns, table, tablename, engine):

    pandas_engine = pandas.io.sql.SQLDatabase(engine)
    pandas_table = pandas.io.sql.SQLTable(tablename,
                                          pandas_engine,
                                          frame=table)

    new_columns = set(columns)
    logger = P.get_logger()

    for column in pandas_table.table.columns:
        if column.name in new_columns:
            statement = "ALTER TABLE {} ADD COLUMN {} {}".format(
                tablename, column.name, column.type)

            logger.debug("SQL: {}".format(statement))
            try:
                retry_sql_execute(engine, statement)
            except (sqlalchemy.exc.OperationalError,
                    sqlite3.OperationalError) as ex:
                if "duplicate column name" not in str(ex):
                    raise
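
The tolerate-duplicate-columns pattern can be reproduced with the standard sqlite3 module alone; table and column names here are hypothetical:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE metrics (instance_id INTEGER)")
for _ in range(2):  # the second ALTER hits the already-existing column
    try:
        conn.execute("ALTER TABLE metrics ADD COLUMN score FLOAT")
    except sqlite3.OperationalError as ex:
        if "duplicate column name" not in str(ex):
            raise  # re-raise anything other than the expected error
print([row[1] for row in conn.execute("PRAGMA table_info(metrics)")])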
Example #5
def upload_metrics_tables(infiles: list,
                          run_id: int,
                          schema,
                          url: str,
                          max_workers: int = 10):

    logger = P.get_logger()

    engine = create_engine(url)

    Session = sessionmaker(bind=engine)
    session = Session()
    
    logger.info(f"{os.getpid()}: collecting upload items for {len(infiles)} input files")
    metric_f = generate_metric
    pool = multiprocessing.Pool(max_workers)
    metrics = pool.map(metric_f, infiles)
    pool.close()
    pool.join()

    logger.info(f"{os.getpid()}: instantiating {len(metrics)} metrics")
    data = list(tqdm.tqdm(instantiate_metrics(metrics, session, run_id),
                          total=len(metrics)))

    logger.info(f"{os.getpid()}: uploading {len(data)} items")
    upload_f = upload_metric
    initargs = (upload_f, url, schema)
    if max_workers == 1:
        setup_worker(*initargs)
        result = list(map(upload_f, data))
        global resource
        resource.table_cache.flush_all()
    else:
        logger.info(f"{os.getpid()}: loading data with {max_workers} cores")
        pool = multiprocessing.Pool(max_workers, initializer=setup_worker, initargs=initargs)
        pool.map(upload_f, data)
        pool.close()
        pool.join()
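
The worker setup used above (an initializer/initargs hook instead of pickling shared state with every task) in a minimal, self-contained sketch; _worker_state and the URL are stand-ins:

import multiprocessing

_worker_state = None  # populated once per worker via the initializer


def setup_worker(url):
    # Stand-in for opening an engine or table cache per worker process.
    global _worker_state
    _worker_state = {"url": url}


def upload_item(item):
    return (_worker_state["url"], item)


if __name__ == "__main__":
    with multiprocessing.Pool(2, initializer=setup_worker,
                              initargs=("sqlite:///./csvdb",)) as pool:
        print(pool.map(upload_item, range(4)))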
Example #6
def reconcile_columns(tablename, engine, table):

    logger = P.get_logger()
    existing_columns = set(get_columns(tablename, engine))
    proposed_columns = set(table.columns)

    obsolete_columns = existing_columns.difference(proposed_columns)
    if obsolete_columns:
        logger.warn("the following columns are obsolete in {}: {}. "
                    "empty data will be inserted".format(
                        tablename, ", ".join(obsolete_columns)))
        # create empty columns
        for column in obsolete_columns:
            table[column] = None

    new_columns = proposed_columns.difference(existing_columns)
    if new_columns:
        logger.warn("new columns found for {}: the following columns "
                    "will be added: {} ".format(tablename,
                                                ", ".join(new_columns)))

        add_columns_to_table(new_columns, table, tablename, engine)
        # clear cache of memoization function
        get_columns.delete(tablename, engine)
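
The set arithmetic driving reconcile_columns, sketched with toy column sets in place of a live database table:

import pandas

# Hypothetical columns of an already-existing database table.
existing_columns = {"instance_id", "score", "legacy_flag"}
table = pandas.DataFrame({"instance_id": [1], "score": [0.5], "rank": [3]})
proposed_columns = set(table.columns)

obsolete_columns = existing_columns.difference(proposed_columns)
for column in obsolete_columns:
    table[column] = None  # pad obsolete columns with empty data

new_columns = proposed_columns.difference(existing_columns)
print(sorted(obsolete_columns), sorted(new_columns))  # ['legacy_flag'] ['rank']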
Example #7
def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database.
    """

    logger = P.get_logger()
    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()

    # automap
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)
    base = automap_base(metadata=metadata)
    base.prepare()

    if schemas is None:
        insp = inspect(engine)
        schemas = insp.get_schema_names()
        # note: default sqlite schema is "main"
        if 'public' in schemas:
            schemas.remove('public')
        if 'information_schema' in schemas:
            schemas.remove('information_schema')

    logger.debug("getting instance_id list of run_id={}".format(run_id))
    instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
    logger.debug("found {} instances for run_id={}".format(len(instance_ids), run_id))
    non_metric_tables = ['run',
                         'instance',
                         'binary_data',
                         'metric_timings',
                         'tool_timings',
                         'metric_storage',
                         'tags']

    # delete from tables with field "instance_id"
    if instance_ids:
        for schema in schemas:
            # automap the schema
            metadata_schema = sqlalchemy.MetaData()
            metadata_schema.reflect(engine, schema=schema)
            base_schema = automap_base(metadata=metadata_schema)
            base_schema.prepare()
            for table_name in list(base_schema.metadata.tables.keys()):
                table = sqlalchemy.Table(table_name,
                                         metadata_schema,
                                         autoload=True)
                if "instance_id" not in table.c:
                    continue
                logger.debug("deleting data in {}".format(table_name))
                delete = table.delete().where(
                    table.c.instance_id.in_(instance_ids))
                if not dry_run:
                    connection.execute(delete)

    # delete from tables with field "run_id"
    for table_name in base.metadata.tables.keys():
        table = sqlalchemy.Table(table_name, metadata, autoload=True)
        if "run_id" not in table.c:
            continue
        logger.info("deleting data in {} for run_id {}".format(table_name, run_id))
        delete = table.delete().where(table.c.run_id == run_id)
        if not dry_run:
            connection.execute(delete)

    table = sqlalchemy.Table('run', metadata, autoload=True)
    delete = table.delete().where(table.c.id == run_id)
    logger.info("deleting data in 'run' for id {}".format(run_id))
    if not dry_run:
        connection.execute(delete)
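
Reflection followed by a keyed DELETE can be exercised end-to-end against an in-memory SQLite database; a minimal sketch in SQLAlchemy 1.4+ style with a hypothetical table:

import sqlalchemy

engine = sqlalchemy.create_engine("sqlite://")
with engine.begin() as conn:
    conn.execute(sqlalchemy.text(
        "CREATE TABLE tool_timings (instance_id INTEGER, seconds FLOAT)"))
    conn.execute(sqlalchemy.text(
        "INSERT INTO tool_timings VALUES (1, 2.5), (2, 0.7)"))

metadata = sqlalchemy.MetaData()
metadata.reflect(bind=engine)
table = metadata.tables["tool_timings"]
delete = table.delete().where(table.c.instance_id.in_({1}))
with engine.begin() as conn:
    conn.execute(delete)
    print(conn.execute(sqlalchemy.text(
        "SELECT COUNT(*) FROM tool_timings")).scalar())  # 1 row left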
Example #8
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as first argument to extras.  The
    configuration dictionary should have an element 'database' with the
    required field ``url`` and the optional field ``schema``.  For
    example, to upload to an sqlite database in the current directory
    called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    To use multiple cores, try::

        config = {"database": {"url": "sqlite:///./csvdb", "cores": 10}}

    Arguments
    ---------
    infiles: list
       List of files to upload. These should be the output
       of metric tasks in a benchmarking workflow.
    outfile: output file
       On success, an empty output file is created.
    extras: list
       List of one element containing a configuration dictionary
       (see above).

    """

    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError(
            "expecting only one extra argument "
            "(configuration dictionary)")

    config = extras[0]

    url = config["database"]["url"]
    max_workers = config["database"].get("cores", 1)
    
    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = create_engine(url)

    # Catch connection errors: if the database is not reachable,
    # warn and skip the upload.
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn(
            "could not connect to database at {}. "
            "The data will not be uploaded. Msg={}".format(
                url, str(msg)))
        return

    # Create schema if not exists
    if schema is not None:
        engine.execute(
            str(text("CREATE SCHEMA IF NOT EXISTS {}".format(schema))))

    pipeline_name = os.path.basename(sys.argv[0])
    logger.debug("uploading data to {}, schema={}".format(url, schema))
    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y) \
    #                  for x, y in enumerate(dependencies)])

    # need to set the creation time explicitly, important when re-loading
    # as otherwise all times will be the same.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    Session = sessionmaker(bind=engine)
    session = Session()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version().version,
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")

    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)

    session.commit()
    engine.dispose()
    del engine

    upload_metrics_tables(infiles,
                          benchmark_run.id,
                          schema,
                          url,
                          max_workers=max_workers)

    # upload table sizes
    # df_sizes = pandas.DataFrame.from_records(list(table_cache.uploaded_sizes.items()),
    #                                          columns=["tablename", "bytes_uploaded"])
    # df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    # df_sizes["run_id"] = benchmark_run.id
    # df_sizes["schema"] = schema
    # save_table(df_sizes,
    #            engine,
    #            "metric_storage",
    #            schema=None,
    #            is_sqlite3=is_sqlite3)

    mark_upload_complete(url, benchmark_run.id)

    logger.info("uploaded results under run_id {}".format(benchmark_run.id))
    touch(outfile)
Example #9
def save_metric_data(meta_data, table_cache, schema, instance_id: int, session):

    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warn("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output files
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if there are only blobs, there is no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warn("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - ignored".format(
                    output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warn("output file {} is empty - ignored".format(
                    output_file))
                continue

            # table = pandas.DataFrame({"values": [1, 2]})
            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            except ValueError as e:
                logger.warn("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            # pandas.errors.ParserError replaces pandas.parser.CParserError,
            # which was removed in newer pandas versions.
            except pandas.errors.ParserError as e:
                logger.warn("malformatted table {} can not be read: {}".format(
                    output_file, str(e)))
                continue

            if table.empty:
                logger.warn("table {} is empty - ignored".format(output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(tablename,
                                                                     table,
                                                                     instance_id,
                                                                     meta_data,
                                                                     table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(f"saving data {table.shape} from {output_file} to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        table = []
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])
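
A minimal sketch of the metric_no_upload filter semantics with hypothetical metadata: a regex value suppresses matching tables, while "*" (handled earlier in the function) suppresses the whole metric:

import re

meta_data = {"metric_no_upload": "^raw_"}  # illustrative configuration
metric_table_filter = re.compile(meta_data["metric_no_upload"])
for tablename in ["raw_counts", "summary"]:
    if metric_table_filter.search(tablename):
        print("upload turned off for", tablename)  # raw_counts only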
Example #10
def transform_table_before_upload(tablename, table, instance_id: int, meta_data, table_cache):

    dtypes = None

    logger = P.get_logger()

    # melt table if set by metric
    if "metric_upload_melted" in meta_data:
        if tablename in meta_data["metric_upload_melted"]:
            melt_data = meta_data["metric_upload_melted"][tablename]
            table = pandas.melt(
                table,
                id_vars=melt_data.get("id_vars", None),
                value_vars=melt_data.get("value_vars", None),
                var_name=melt_data.get("var_name", None),
                value_name=melt_data.get("value_name", None))
            logger.debug("melted data from table {}".format(tablename))

    if "metric_upload_transpose" in meta_data:
        if tablename in meta_data["metric_upload_transpose"]:
            table = table.transpose()

    # upload into a separate table suffixed by instance id
    if "metric_upload_separate" in meta_data:
        if tablename in meta_data["metric_upload_separate"]:
            tablename = "{}_{}".format(tablename, instance_id)

    # normalize table by factorizing a column and storing its ids
    # in a separate table
    if "metric_upload_normalize" in meta_data:
        if tablename in meta_data["metric_upload_normalize"]:
            for column in meta_data["metric_upload_normalize"][tablename]:
                if column not in table.columns:
                    raise ValueError(
                        "unknown column {} in table {} to be normalized".format(
                            column, tablename))
                factors, names = table[column].factorize()
                table[column] = factors
                table.rename(columns={column: column + "_id"}, inplace=True)

                factor_table = pandas.DataFrame(
                    {column: names,
                     "id": list(range(len(names)))})
                factor_table["instance_id"] = instance_id
                table_cache.add_table(factor_table, tablename + "_factors")

    # store table as a matrix
    if "metric_upload_as_matrix" in meta_data:
        if tablename in meta_data["metric_upload_as_matrix"]:
            groupby_columns = meta_data["metric_upload_as_matrix"][tablename]
            if not isinstance(groupby_columns, list):
                groupby_columns = [groupby_columns]
            take_columns = [x for x in table.columns if x not in groupby_columns]
            row_index_column = take_columns.pop(0)
            rows = []
            if not groupby_columns:
                # DataFrame.as_matrix() was removed in pandas 1.0 and
                # ndarray.tostring() in numpy 2.0; to_numpy()/tobytes()
                # are the direct replacements.
                matrix = table[take_columns].to_numpy()
                rows.append([",".join(map(str, table[row_index_column])),
                             ",".join(map(str, take_columns)),
                             str(matrix.dtype),
                             matrix.tobytes()])
            else:
                for key, group in table.groupby(by=groupby_columns):
                    if not isinstance(key, tuple):
                        key = [key]
                    matrix = group[take_columns].to_numpy()
                    rows.append(list(key) +
                                [",".join(map(str, group[row_index_column])),
                                 ",".join(map(str, take_columns)),
                                 str(matrix.dtype),
                                 matrix.tobytes()])
            table = pandas.DataFrame.from_records(
                rows,
                columns=groupby_columns + ["rows", "columns", "dtype", "data"])
            dtypes = {"data": LargeBinary}

    return tablename, table, dtypes
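
The normalization branch boils down to Series.factorize; a sketch with a toy table:

import pandas

table = pandas.DataFrame({"contig": ["chr1", "chr2", "chr1"],
                          "count": [3, 1, 2]})
factors, names = table["contig"].factorize()
table["contig"] = factors
table.rename(columns={"contig": "contig_id"}, inplace=True)

factor_table = pandas.DataFrame({"contig": names,
                                 "id": list(range(len(names)))})
print(table)         # contig_id holds the integer codes 0, 1, 0
print(factor_table)  # maps 0 -> chr1, 1 -> chr2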
Example #11
def add_metrics_to_pipeline(pipeline,
                            metrics,
                            map_metric_to_runner,
                            tool_runners,
                            suffix="tsv",
                            prefix=None,
                            config=None,
                            **kwargs):

    single_input_metric_functions = []

    for metric in metrics:
        metricc = map_metric_to_runner[metric.strip()]
        if metricc.name in config:
            conf = config[metricc.name]
        else:
            conf = {}

        conf = expand_generators(conf)
        configurations = build_combinations(conf)
        for configuration in configurations:
            single_input_metric_functions.append(metricc(**configuration))

    make_unique = check_unique(single_input_metric_functions,
                               input_combos=None,
                               input_regex=None,
                               input_alias=None)

    metric_runners = []
    for taskf in single_input_metric_functions:

        ignore = config.get(taskf.name, {}).get("ignore", [])
        taskf.register_input(make_unique=make_unique)
        unique_name = taskf.__name__

        # make task name unique by adding 'prefix' as this method might
        # be called multiple times for straight, collated and split tasks
        if prefix:
            taskf.__name__ = prefix + taskf.__name__

        filter_regex = ruffus.regex("(.*)/(.*).{}".format(suffix))
        result_dir = os.path.join(unique_name + ".dir")
        output = r"\1/{}/{}.tsv".format(result_dir, taskf.name)

        found = False
        # Note that ignore will only work on the static parts of a task
        # as result_dir contains a pattern that will be filled in at runtime,
        # e.g. \1/echidna_test.dir/echidna_test.tsv.
        for i in ignore:
            if i in result_dir:
                P.get_logger().warn("the following task will be ignored: "
                                    "{} matching {}".format(result_dir, i))
                found = True

        if found:
            continue

        metric_task = pipeline.transform(task_func=taskf,
                                         input=tool_runners,
                                         filter=filter_regex,
                                         output=output,
                                         **kwargs)

        metric_runners.append(metric_task)

    f = EmptyRunner()
    if prefix:
        f.__name__ = prefix + "metrics"
    else:
        f.__name__ = "metrics"
    pipeline.merge(task_func=f, input=metric_runners, output=None)

    return metric_runners
Example #12
def add_collations_to_pipeline(pipeline,
                               map_tool_to_runner,
                               collations,
                               tasks=None,
                               config=None,
                               **kwargs):

    runners = []

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    for coll in collations:

        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(coll))

        coll_info = config[coll]

        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError(
                    "section {} is missing required keyword '{}'".format(
                        coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name", coll_info["runner"]).strip()

        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)

        # automatically set alias through regex (required field)
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf._replicate_regex = coll_info.get("regex_replicate", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])
        result_dir = os.path.join(coll + ".dir")

        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn("the following task will be ignored: "
                                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.collate(task_func=taskf,
                                       input=input_tasks,
                                       filter=filter_regex,
                                       output=output,
                                       **kwargs).mkdir(input_tasks,
                                                       filter_regex,
                                                       output_dir)

        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(task_func=f,
                                         input=metric_task,
                                         output=output)

        runners.append(metric_task)

    return runners
Example #13
def add_tools_to_pipeline(pipeline,
                          map_tool_to_runner,
                          config=None,
                          input_files=None,
                          **kwargs):
    """add tools to a workflow pipeline.

    This function adds a task to the workflow for each
    combination of input and tool.

    The configuration dictionary should contain the following
    sections:

    input:
       Configuration of input files. Key/value pairs and possibly
       hierarchical.

       The following keys are optional:
          regex
          alias
          group_regex
          group_alias

    tool:
       A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"}, "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner: dict
        Dictionary mapping tool names to runner functions
        (see :ref:`tasks`).
    config: dict
        Configuration dictionary.
    input_files: list
        List of (optional) input files.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")

    if config["input"] is None:
        raise ValueError("input section is empty")

    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    replicate_alias = config["input"].pop("replicate_alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    do_replication = config["setup"].pop("replication", None)
    if do_replication:
        replications = int(do_replication)
        P.get_logger().info(
            "running experiment with {} replications".format(replications))
    else:
        replications = 1

    # update selected fields for testing purposes
    is_test = "is_test" in config
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

    # build input/tool combinations, optionally grouping them
    config_files = expand_globs(config["input"], is_test=is_test)
    if input_group_regex:
        config_files = group_files(config_files, input_group_regex,
                                   input_group_alias)

    input_combos = build_combinations(config_files)
    tool_runners = []

    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        for replication_idx in range(replications):
            # create a copy of the task function and give it its unique name
            # by mangling it with the input_files
            taskf = copy.copy(toolf)

            if do_replication:
                taskf.set_replication_id(replication_idx + 1)

            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 make_unique=make_unique,
                                 is_test=is_test,
                                 replicate_alias=replicate_alias)

            if "name" in input_files:
                # create copy of input_files without name, do
                # not modify original as different tools require
                # the 'name'
                input_files = dict([(x, y)
                                    for x, y in list(input_files.items())
                                    if x != "name"])

            result_dir = os.path.join(taskf.__name__ + ".dir")

            found = False

            for i in IOTools.val2list(ignore):
                if i in result_dir:
                    P.get_logger().warn("the following task will be ignored: "
                                        "{} matching {}".format(result_dir, i))
                    found = True
            if found:
                continue

            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, result_dir)
            if suffix is None:
                suffix = _suffix
            elif suffix != _suffix:
                raise ValueError(
                    "tools produce output files of different type, "
                    "got {}, expected {}".format(_suffix, suffix))

            tool_task = pipeline.merge(task_func=taskf,
                                       input=list(input_files.values()),
                                       output=output,
                                       **kwargs).mkdir(result_dir)

            # if there are multiple output files, split the task so that
            # each output file will be processed separately further down the
            # pipeline.
            if multiple_outputs:
                f = EmptyRunner()
                f.__name__ = taskf.__name__ + "_split"
                tool_task = pipeline.split(task_func=f,
                                           input=tool_task,
                                           output=output)

            tool_runners.append(tool_task)

    # convenience target
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f, input=tool_runners, output=None)

    return suffix, tool_runners
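
The outer expansion is a plain Cartesian product; a sketch with made-up tool names and input dictionaries:

import itertools

tool_functions = ["bwa_mem", "isaac"]                 # hypothetical tools
input_combos = [{"bam": "a.bam"}, {"bam": "b.bam"}]   # hypothetical inputs
for toolf, input_files in itertools.product(tool_functions, input_combos):
    print(toolf, input_files)                         # four tool/input tasks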
Example #14
def save_table(table: pandas.DataFrame,
               url: str,
               tablename: str,
               schema: str = None,
               dtypes=None,
               indices=["instance_id"]):
    logger = P.get_logger()
    table.columns = sql_sanitize_columns(table.columns)

    engine = create_engine(url)

    # pandas/sqlite3 prefers the raw connection, otherwise error:
    # AttributeError: 'Engine' object has no attribute 'rollback'
    if url.startswith("sqlite"):
        _engine = engine.raw_connection()
        # In pandas >= 0.23 and using sqlite as a backend, the
        # pandas.DataFrame.to_sql command fails with "OperationalError:
        # (sqlite3.OperationalError) too many SQL variables". The reason is a
        # fixed limit in sqlite, SQLITE_MAX_VARIABLE_NUMBER, which is by
        # default set to 999.
        sql_chunk_size = 999 // (len(table.columns) + 1)
    else:
        _engine = engine
        sql_chunk_size = None

    # lower case all table names. Otherwise issues with psql
    # mixed case access
    tablename = tablename.lower()
    create_index = False

    try:
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="fail",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)
        E.debug(f"table {tablename} was new")
        create_index = True
    except TableExistsException:
        E.debug(f"table {tablename} already exists - appending")

    if create_index:
        # sqlite requires an index name
        if schema:
            tablename = "{}.{}".format(schema, tablename)

        for field in indices:
            E.debug(f"creating index on {field} for {tablename}")
            try:
                retry_sql_execute(
                    _engine,
                    str(
                        text("CREATE INDEX {} ON {} ({})".format(
                            re.sub("[-.]", "_", tablename) + "_" + field,
                            tablename, field))))
            except IndexExistsException:
                pass
            except TypeError as ex:
                logger.warn("could not create index: {}".format(str(ex)))
            except sqlalchemy.exc.ProgrammingError as ex:
                logger.warn("could not create index: {}".format(str(ex)))
    else:
        reconcile_columns(tablename, engine, table)
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="append",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)
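
The chunk-size arithmetic can be checked in isolation: with method="multi", each INSERT binds on the order of rows x columns variables, so capping rows at 999 // (columns + 1) keeps a safety margin below SQLite's historical default limit of 999. A minimal sketch with a hypothetical table:

import sqlite3
import pandas

table = pandas.DataFrame({"a": range(500), "b": range(500)})
# 999 // (2 columns + 1) = 333 rows per multi-row INSERT.
sql_chunk_size = 999 // (len(table.columns) + 1)
conn = sqlite3.connect(":memory:")
table.to_sql("metrics", conn, index=False, method="multi",
             chunksize=sql_chunk_size)
print(conn.execute("SELECT COUNT(*) FROM metrics").fetchone())  # (500,)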