def getDatabaseName():
    '''Return the database name associated with the pipeline.

    This method looks in different sections in the ini file to permit
    both old style ``database`` and new style ``database_name``.

    This method has been implemented for backwards compatibility.

    Returns
    -------
    databasename : string
        Database name.

    Raises
    ------
    KeyError
        If no database name is found.

    '''
    locations = ["database_name", "database"]
    PARAMS = getParams()
    for location in locations:
        database = PARAMS.get(location, None)
        if database is not None:
            return database

    raise KeyError("database name not found")

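# A minimal sketch of the two configuration styles this lookup supports.
# The section layout below is an illustrative assumption about a typical
# pipeline.ini, not a prescribed format:
#
#   # old style
#   [general]
#   database=csvdb
#
#   # new style (section-prefixed key becomes ``database_name``)
#   [database]
#   name=csvdb
#
# After the configuration has been loaded, ``getDatabaseName()`` returns
# whichever of ``database_name`` or ``database`` is set, preferring the
# former.
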
def connect():
    """connect to the SQL database used in this pipeline.

    .. note::
        This method is currently only implemented for sqlite
        databases. It needs refactoring for generic access.
        Alternatively, use a full or partial ORM.

    If ``annotations_database`` is in PARAMS, this method
    will attach the named database as ``annotations``.

    Returns
    -------
    dbh
       a database handle

    """
    # Note that in the future this might return an sqlalchemy or
    # db.py handle.
    PARAMS = getParams()

    if PARAMS["database_backend"] == "sqlite":
        dbh = sqlite3.connect(getDatabaseName())

        if "annotations_database" in PARAMS:
            statement = '''ATTACH DATABASE '%s' as annotations''' % \
                        (PARAMS["annotations_database"])
            cc = dbh.cursor()
            cc.execute(statement)
            cc.close()
    else:
        raise NotImplementedError(
            "backend %s not implemented" % PARAMS["database_backend"])

    return dbh

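# A minimal usage sketch, assuming a configured sqlite backend and an
# ``annotations_database`` entry in PARAMS; the ``gene_info`` table name
# is hypothetical:
#
#   dbh = connect()
#   cc = dbh.cursor()
#   cc.execute("SELECT COUNT(*) FROM annotations.gene_info")
#   print(cc.fetchone()[0])
#   cc.close()
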
def build_load_statement(tablename, retry=True, options=""):
    """build a command line statement to upload data.

    Upload is performed via the :doc:`csv2db` script.

    The returned statement is suitable for use in a pipe expression.
    This method is aware of the configuration values for database
    access and the chosen database backend.

    For example::

        load_statement = P.build_load_statement("data")
        statement = "cat data.txt | %(load_statement)s"
        P.run()

    Arguments
    ---------
    tablename : string
        Tablename for upload
    retry : bool
        Add the ``--retry`` option to `csv2db.py`
    options : string
        Command line options to be passed on to `csv2db.py`

    Returns
    -------
    string

    """
    opts = []

    if retry:
        opts.append(" --retry ")

    PARAMS = getParams()
    backend = PARAMS["database_backend"]

    if backend not in ("sqlite", "mysql", "postgres"):
        raise NotImplementedError("backend %s not implemented" % backend)

    # collect the database connection options for csv2db
    opts.append("--database-backend=%s" % backend)
    opts.append("--database-name=%s" % PARAMS.get("database_name"))
    opts.append("--database-host=%s" % PARAMS.get("database_host", ""))
    opts.append("--database-user=%s" % PARAMS.get("database_username", ""))
    opts.append("--database-password=%s" % PARAMS.get("database_password", ""))
    opts.append("--database-port=%s" % PARAMS.get("database_port", 3306))

    db_options = " ".join(opts)

    statement = ('''
    cgat csv2db
    %(db_options)s
    %(options)s
    --table=%(tablename)s
    ''')

    load_statement = buildStatement(**locals())

    return load_statement

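# A minimal sketch of composing the returned fragment into a full upload
# command; the input filename, table name and index column are
# illustrative placeholders:
#
#   load_statement = build_load_statement("my_table",
#                                          options="--add-index=gene_id")
#   statement = "zcat data.tsv.gz | %(load_statement)s > my_table.load"
#   run()
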
def mergeAndLoad(infiles,
                 outfile,
                 suffix=None,
                 columns=(0, 1),
                 regex=None,
                 row_wise=True,
                 retry=True,
                 options="",
                 prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e., the contents of
    each file are a row.

    For example, the statement::

        mergeAndLoad(['file1.txt', 'file2.txt'], "test_table.load")

    with the two files::

        > cat file1.txt
        Category    Result
        length      12
        width       100

        > cat file2.txt
        Category    Result
        length      20
        width       50

    will be added into table ``test_table`` as::

        track   length   width
        file1   12       100
        file2   20       50

    If `row_wise` is not set::

        mergeAndLoad(['file1.txt', 'file2.txt'],
                     "test_table.load",
                     row_wise=False)

    ``test_table`` will be transposed and look like this::

        track    file1   file2
        length   12      20
        width    100     50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off. If `columns`
        is set to None, all columns will be taken. Here, column names
        will receive a prefix given by `prefixes`. If `prefixes` is
        None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a track name
        via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table. This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each column.
        The number of `prefixes` and `infiles` needs to be the same.

    '''
    PARAMS = getParams()
    if len(infiles) == 0:
        raise ValueError("no files for merging")

    # build the track names used as headers, one per input file
    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix)) for x in infiles])
    elif regex:
        header = ",".join(["-".join(re.search(regex, x).groups())
                           for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        column_filter = ""
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    # read each input through a process substitution, optionally
    # restricting to the selected columns
    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        transform = """| perl -p -e "s/bin/track/"
        | cgat table2table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(
        toTable(outfile),
        options="--add-index=track " + options,
        retry=retry)

    statement = """cgat combine_tables
    %(header_stmt)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(filenames)s
    %(transform)s
    | %(load_statement)s
    > %(outfile)s
    """

    to_cluster = False

    run()

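# A minimal ruffus task sketch using mergeAndLoad, assuming per-sample
# summary files ending in ``.stats``; the glob, suffix and output name
# are illustrative:
#
#   @merge("*.stats", "stats_summary.load")
#   def loadSummaryStats(infiles, outfile):
#       mergeAndLoad(infiles, outfile, suffix=".stats")
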
def concatenateAndLoad(infiles,
                       outfile,
                       regex_filename=None,
                       header=None,
                       cat="track",
                       has_titles=True,
                       missing_value="na",
                       retry=True,
                       tablename=None,
                       options="",
                       job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by outfile without the ".load" suffix.

    A typical concatenate and load task in ruffus would look like this::

        @merge("*.tsv.gz", ".load")
        def loadData(infiles, outfile):
            P.concatenateAndLoad(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, *regex_filename* is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they
        are added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``
        it will add the columns ``track`` and ``method`` to the table.
    header : string
        Comma-separated list of values for header.
    cat : string
        Column title for column containing the track name. The track
        name is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their
        first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    tablename : string
        Name to use for table. If unset, derive it from `outfile`.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.

    """
    PARAMS = getParams()
    if job_memory is None:
        job_memory = PARAMS["cluster_memory_default"]

    if tablename is None:
        tablename = toTable(outfile)

    infiles = " ".join(infiles)

    passed_options = options
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    statement = '''cgat combine_tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    to_cluster = False

    run()

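# A minimal sketch showing how regex_filename can add extra columns,
# assuming files named like ``sample1_method1.tsv.gz``; the glob, output
# name and column names are illustrative:
#
#   @merge("*.tsv.gz", "all_results.load")
#   def loadAllResults(infiles, outfile):
#       concatenateAndLoad(infiles, outfile,
#                          regex_filename="(.*)_(.*).tsv.gz",
#                          cat="track,method")
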
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by outfile without the ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns where the first column
        is the row name into a multi-column table. The value of
        `collapse` is the value used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        `transpose`.
    retry : bool
        If True, multiple attempts will be made if the data can not be
        loaded at the first try, for example if a table is locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.

    """
    PARAMS = getParams()
    if job_memory is None:
        job_memory = PARAMS["cluster_memory_default"]

    if not tablename:
        tablename = toTable(outfile)

    statement = []

    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append("cgat table2table --collapse=%(collapse)s")

    if transpose:
        statement.append(
            """cgat table2table --transpose
            --set-transpose-field=%(transpose)s""")

    if shuffle:
        statement.append("cgat randomize_lines --keep-header=1")

    if limit > 0:
        # use awk to filter rather than head in order to avoid a broken
        # pipe error
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        ignore_pipe_errors = True

    statement.append(build_load_statement(tablename,
                                          options=options,
                                          retry=retry))

    statement = " | ".join(statement) + " > %(outfile)s"

    to_cluster = False

    run()

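# A minimal ruffus task sketch, assuming gzipped tab-separated input and
# loading only a random sample of rows; the file names, index column and
# row limit are illustrative:
#
#   @transform("*.counts.tsv.gz", suffix(".counts.tsv.gz"), ".load")
#   def loadCounts(infile, outfile):
#       load(infile, outfile,
#            options="--add-index=gene_id",
#            limit=10000, shuffle=True)
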