Example #1
def merge_and_load(infiles,
                   outfile,
                   suffix=None,
                   columns=(0, 1),
                   regex=None,
                   row_wise=True,
                   retry=True,
                   options="",
                   prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e., the contents of
    each file become a row.

    For example, the statement::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load")

    with the two files::

        > cat file1.txt
        Category    Result
        length      12
        width       100

        > cat file2.txt
        Category    Result
        length      20
        width       50

    will be added into table ``test_table`` as::

        track   length   width
        file1   12       100
        file2   20       50

    If `row_wise` is set to False::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load", row_wise=False)

    the table is loaded without the final transposition, so that each
    input file becomes a column in ``test_table``::

        track    file1 file2
        length   12    20
        width    100   50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off.  If
        `columns` is set to None, all columns will be taken. Here,
        column names will receive a prefix given by `prefixes`. If
        `prefixes` is None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a
        track name via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table.  This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each
        column. The number of `prefixes` and `infiles` needs to be the
        same.

    '''
    if len(infiles) == 0:
        raise ValueError("no files for merging")

    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix)) for x in infiles])
    elif regex:
        header = ",".join(
            ["-".join(re.search(regex, x).groups()) for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        # cut is 1-based, hence the shift of the 0-based column indices
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        column_filter = ""
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    # build one process substitution per input file, decompressing
    # gzipped input on the fly
    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        # rename the key column "bin" to "track" and transpose, so that
        # each input file becomes a row of the final table
        transform = """| perl -p -e "s/bin/track/"
        | python -m CGATCore.Table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(
        to_table(outfile),
        options="--add-index=track " + options,
        retry=retry)

    statement = """python -m CGATCore.Tables
    %(header_stmt)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(filenames)s
    %(transform)s
    | %(load_statement)s
    > %(outfile)s
    """
    # run() interpolates %(...)s from the local namespace and honours
    # to_cluster, so the load statement executes locally
    to_cluster = False
    run(statement)
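
A minimal usage sketch for ``merge_and_load`` (not from the original
module; the ``P`` import path, task name, and input filenames are
assumptions for illustration):

# hypothetical ruffus task mirroring the docstring example
from ruffus import merge
from CGATCore import Pipeline as P  # assumed import path

@merge(["file1.txt", "file2.txt"], "test_table.load")
def loadMergedTables(infiles, outfile):
    # one row per input file; track names are the filenames with the
    # ".txt" suffix removed
    P.merge_and_load(infiles, outfile, suffix=".txt")
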
Example #2
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns, where the first column
        is the row name, into a multi-column table. The value of
        `collapse` is used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        transpose.
    tablename : string
        Name to use for the table. If unset, it is derived from `outfile`.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """

    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if not tablename:
        tablename = to_table(outfile)

    statement = []

    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.collapse.log "
                         "--collapse=%(collapse)s")

    if transpose:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.transpose.log "
                         "--transpose "
                         "--set-transpose-field=%(transpose)s")

    if shuffle:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.shuffle.log "
                         "--method=randomize-rows")

    if limit > 0:
        # use awk instead of head to avoid a broken-pipe error; keep the
        # title row plus the first `limit` data rows
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        ignore_pipe_errors = True

    statement.append(
        build_load_statement(tablename, options=options, retry=retry))

    statement = " | ".join(statement) + " > %(outfile)s"

    to_cluster = False
    run(statement)
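
A minimal usage sketch for ``load`` (not from the original module; the
filenames, index column, and import path are assumptions):

# hypothetical ruffus task loading a random sample of 1000 rows
from ruffus import transform, suffix
from CGATCore import Pipeline as P  # assumed import path

@transform("results.tsv.gz", suffix(".tsv.gz"), "_sample.load")
def loadResultsSample(infile, outfile):
    # shuffle before applying the limit so the sample is random;
    # --add-index matches the option used elsewhere in this module
    P.load(infile, outfile, shuffle=True, limit=1000,
           options="--add-index=gene_id")
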
Example #3
def concatenate_and_load(infiles,
                         outfile,
                         regex_filename=None,
                         header=None,
                         cat="track",
                         has_titles=True,
                         missing_value="na",
                         retry=True,
                         tablename=None,
                         options="",
                         job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical concatenate-and-load task in ruffus would look like this::

        @merge("*.tsv.gz", "data.load")
        def loadData(infiles, outfile):
            P.concatenate_and_load(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, `regex_filename` is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they are
        added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``,
        it will add the columns ``track`` and ``method`` to the table.
    header : string
        Comma-separated list of values for header.
    cat : string
        Column title for column containing the track name. The track name
        is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    tablename : string
        Name to use for the table. If unset, it is derived from `outfile`.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.

    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if tablename is None:
        tablename = to_table(outfile)

    infiles = " ".join(infiles)

    passed_options = options
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    statement = '''python -m CGATCore.Tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    to_cluster = False
    run(statement)
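
A minimal usage sketch for ``concatenate_and_load`` (not from the
original module; the filenames and the regular expression are
assumptions):

# hypothetical ruffus task concatenating per-sample, per-method files
from ruffus import merge
from CGATCore import Pipeline as P  # assumed import path

@merge("*_counts.tsv.gz", "all_counts.load")
def loadAllCounts(infiles, outfile):
    # "sampleA_star_counts.tsv.gz" yields track=sampleA, method=star
    P.concatenate_and_load(
        infiles, outfile,
        regex_filename="(.*)_(.*)_counts.tsv.gz",
        cat="track,method")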