def merge_and_load(infiles,
                   outfile,
                   suffix=None,
                   columns=(0, 1),
                   regex=None,
                   row_wise=True,
                   retry=True,
                   options="",
                   prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e., the contents of
    each file are a row. For example, the statement::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load")

    with the two files::

        > cat file1.txt
        Category   Result
        length     12
        width      100

        > cat file2.txt
        Category   Result
        length     20
        width      50

    will be added into table ``test_table`` as::

        track   length   width
        file1   12       100
        file2   20       50

    If `row_wise` is not set::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load",
                       row_wise=False)

    ``test_table`` will be transposed and look like this::

        track    file1   file2
        length   12      20
        width    100     50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the
        filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off. If `columns`
        is set to None, all columns will be taken. Here, column names
        will receive a prefix given by `prefixes`. If `prefixes` is
        None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a track name
        via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table. This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each column.
        The number of `prefixes` and `infiles` needs to be the same.
    '''
    if len(infiles) == 0:
        raise ValueError("no files for merging")

    # build the comma-separated list of track names for the header
    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix))
                           for x in infiles])
    elif regex:
        header = ",".join(["-".join(re.search(regex, x).groups())
                           for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        # cut is 1-based, while the supplied column indices are 0-based
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        # all columns are taken; disambiguate column names via prefixes
        column_filter = ""
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        transform = """| perl -p -e "s/bin/track/"
        | python -m CGATCore.Table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(
        to_table(outfile),
        options="--add-index=track " + options,
        retry=retry)

    statement = """python -m CGATCore.Tables
    %(header_stmt)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(filenames)s
    %(transform)s
    | %(load_statement)s
    > %(outfile)s
    """

    # run locally; run() picks up to_cluster from the calling frame
    to_cluster = False

    run(statement)
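
# Usage sketch (illustrative only, not part of the original module; the
# filenames and the ``.stats`` suffix are hypothetical). Each input file
# is a two-column Category/Result table; the merged table will contain
# one row per input file, keyed by a ``track`` column.
def _example_merge_stats():
    merge_and_load(
        ["sampleA.stats", "sampleB.stats"],
        "stats.load",
        suffix=".stats",   # track names become sampleA, sampleB
        columns=(0, 1))    # key column plus the first value column
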
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by `outfile` without the ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns where the first column
        is the row name into a multi-column table. The value of
        `collapse` is the value used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        `transpose`.
    tablename : string
        Name to use for the table. If unset, it is derived from
        `outfile`.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if not tablename:
        tablename = to_table(outfile)

    statement = []

    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.collapse.log "
                         "--collapse=%(collapse)s")

    if transpose:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.transpose.log "
                         "--transpose "
                         "--set-transpose-field=%(transpose)s")

    if shuffle:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.shuffle.log "
                         "--method=randomize-rows")

    if limit > 0:
        # use awk to truncate the stream in order to avoid a broken
        # pipe error from head; the +1 accounts for the header row
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        ignore_pipe_errors = True

    statement.append(build_load_statement(tablename,
                                          options=options,
                                          retry=retry))

    statement = " | ".join(statement) + " > %(outfile)s"

    to_cluster = False

    run(statement)
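
# Usage sketch (illustrative only, not part of the original module; the
# filenames and the ``gene_id`` column are hypothetical). As described
# in the docstring, combining ``shuffle`` and ``limit`` loads a random
# sample of rows, which is handy for inspecting very large tables.
def _example_load_sample():
    load("counts.tsv.gz",
         "counts_sample.load",
         options="--add-index=gene_id",  # hypothetical index column
         shuffle=True,                   # randomize rows first ...
         limit=1000)                     # ... then keep the first 1000
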
def concatenate_and_load(infiles,
                         outfile,
                         regex_filename=None,
                         header=None,
                         cat="track",
                         has_titles=True,
                         missing_value="na",
                         retry=True,
                         tablename=None,
                         options="",
                         job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by `outfile` without the ".load" suffix.

    A typical concatenate and load task in ruffus would look like this::

        @merge("*.tsv.gz", ".load")
        def loadData(infiles, outfile):
            P.concatenate_and_load(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, `regex_filename` is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they
        are added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``,
        it will add the columns ``track`` and ``method`` to the table.
    header : string
        Comma-separated list of values for the header.
    cat : string
        Column title for the column containing the track name. The
        track name is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their
        first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    tablename : string
        Name to use for the table. If unset, it is derived from
        `outfile`.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if tablename is None:
        tablename = to_table(outfile)

    infiles = " ".join(infiles)

    passed_options = options
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    statement = '''python -m CGATCore.Tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    to_cluster = False

    run(statement)
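
# Usage sketch (illustrative only, not part of the original module; the
# filenames and the pattern are hypothetical). Stacks per-sample tables
# and splits each filename into ``track`` and ``method`` columns, e.g.
# ``liver_star.tsv.gz`` -> track=liver, method=star.
def _example_concatenate_counts():
    concatenate_and_load(
        ["liver_star.tsv.gz", "kidney_star.tsv.gz"],
        "counts.load",
        regex_filename="(.*)_(.*).tsv.gz",
        cat="track,method")
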