Example 1
def _read_model_arguments(argv, use_argparse=False):
    if use_argparse:
        parser = argparse.ArgumentParser()
        parser.add_argument('database', metavar='DATABASE', type=str,
                            default="galaxy",
                            nargs='?',
                            help='database to target (galaxy, tool_shed, install)')
        populate_config_args(parser)
        args = parser.parse_args(argv[1:] if argv else [])
        return args.config_file, args.config_section, args.database
    else:
        config_file = None
        for arg in ["-c", "--config", "--config-file"]:
            if arg in argv:
                pos = argv.index(arg)
                argv.pop(pos)
                config_file = argv.pop(pos)
        config_section = None
        if "--config-section" in argv:
            pos = argv.index("--config-section")
            argv.pop(pos)
            config_section = argv.pop(pos)
        if argv and (argv[-1] in DATABASE):
            database = argv.pop()  # database name tool_shed, galaxy, or install.
        else:
            database = 'galaxy'
        return config_file, config_section, database
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--secret-key', help='Key to convert pages with', default='')
    parser.add_argument('-d', '--dry-run', help='No changes, just test it.', action='store_true')
    populate_config_args(parser)
    args = parser.parse_args()
    properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**properties)
    secret = args.secret_key or config.id_secret
    security_helper = SecurityHelper(id_secret=secret)
    object_store = build_object_store_from_config(config)
    if not config.database_connection:
        print("The database connection is empty. If you are using the default value, please uncomment that in your galaxy.yml")

    model = galaxy.config.init_models_from_config(config, object_store=object_store)
    session = model.context.current
    pagerevs = session.query(model.PageRevision).all()
    mock_trans = Bunch(app=Bunch(security=security_helper), model=model, user_is_admin=lambda: True, sa_session=session)
    for p in pagerevs:
        try:
            processor = _PageContentProcessor(mock_trans, _placeholderRenderForSave)
            processor.feed(p.content)
            newcontent = unicodify(processor.output(), 'utf-8')
            if p.content != newcontent:
                if not args.dry_run:
                    p.content = unicodify(processor.output(), 'utf-8')
                    session.add(p)
                    session.flush()
                else:
                    print("Modifying revision %s." % p.id)
                    diff = difflib.unified_diff(p.content.splitlines(), newcontent.splitlines())
                    print("\n".join(diff))
        except Exception:
            logging.exception("Error parsing page, rolling changes back and skipping revision %s.  Please report this error." % p.id)
            session.rollback()
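
Every example in this listing follows the same wiring: populate_config_args() adds the --config-file / --config-section options to the parser, app_properties_from_args() turns the parsed arguments into Galaxy app properties, and galaxy.config.Configuration plus init_models_from_config() give the script a database session. A minimal sketch of that shared pattern, distilled from the snippets themselves (the final query is only an illustrative placeholder):

import argparse

import galaxy.config
from galaxy.util.script import app_properties_from_args, populate_config_args


def main():
    parser = argparse.ArgumentParser(description='Example Galaxy maintenance script')
    populate_config_args(parser)  # adds --config-file / --config-section
    args = parser.parse_args()

    # Resolve the Galaxy configuration from the CLI arguments.
    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)

    # Build the model layer so the script can issue ORM queries.
    model = galaxy.config.init_models_from_config(config)
    session = model.context.current
    print(session.query(model.User.id).count())  # placeholder query


if __name__ == '__main__':
    main()
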
Example 3
def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Generate walltime statistics')
    parser.add_argument('tool_id', help='Tool (by ID) to collect stats about')
    parser.add_argument('--like',
                        action='store_true',
                        default=False,
                        help='Use SQL `LIKE` operator to find '
                        'a shed-installed tool using the tool\'s '
                        '"short" id')
    populate_config_args(parser)
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='Print extra info')
    parser.add_argument('-m',
                        '--min',
                        type=int,
                        default=-1,
                        help='Ignore runtimes less than MIN seconds')
    parser.add_argument('-M',
                        '--max',
                        type=int,
                        default=-1,
                        help='Ignore runtimes greater than MAX seconds')
    parser.add_argument('-u',
                        '--user',
                        help='Return stats for only this user (id, email, '
                        'or username)')
    parser.add_argument('-s',
                        '--source',
                        default='metrics',
                        help='Runtime data source (SOURCES: %s)' %
                        ', '.join(DATA_SOURCES))
    args = parser.parse_args()

    if args.like and '/' in args.tool_id:
        print('ERROR: Do not use --like with a tool shed tool id (the tool '
              'id should not contain `/` characters)')
        sys.exit(2)

    args.source = args.source.lower()
    if args.source not in ('metrics', 'history'):
        print('ERROR: Data source `%s` unknown, valid sources are: %s' %
              (args.source, ', '.join(DATA_SOURCES)))

    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    uri = args.config.get_database_url(config)

    names = {'database': 'dbname', 'username': 'user'}  # rename URL attributes to DB-API connect-arg keywords
    args.connect_args = url.make_url(uri).translate_connect_args(**names)

    if args.debug:
        print('Got options:')
        for i in vars(args).items():
            print('%s: %s' % i)

    return args
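
The translate_connect_args() call at the end converts the SQLAlchemy database URL into keyword arguments for the database driver, renaming database/username to the psycopg2-style dbname/user. A standalone sketch with a made-up URI and placeholder credentials:

# Standalone illustration of the connect-args translation above.
from sqlalchemy.engine import url

uri = 'postgresql://gxuser:secret@localhost:5432/galaxy'  # placeholder URI
names = {'database': 'dbname', 'username': 'user'}
connect_args = url.make_url(uri).translate_connect_args(**names)
print(connect_args)
# keys: host, dbname, user, password, port (values taken from the URI)
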
Example 4
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--secret-key', help='Key to convert pages with', default='')
    parser.add_argument('-d', '--dry-run', help='No changes, just test it.', action='store_true')
    populate_config_args(parser)
    args = parser.parse_args()
    properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**properties)
    secret = args.secret_key or config.id_secret
    security_helper = IdEncodingHelper(id_secret=secret)
    object_store = build_object_store_from_config(config)
    if not config.database_connection:
        print("The database connection is empty. If you are using the default value, please uncomment that in your galaxy.yml")

    model = galaxy.config.init_models_from_config(config, object_store=object_store)
    session = model.context.current
    pagerevs = session.query(model.PageRevision).all()
    mock_trans = Bunch(app=Bunch(security=security_helper), model=model, user_is_admin=lambda: True, sa_session=session)
    for p in pagerevs:
        try:
            processor = _PageContentProcessor(mock_trans, _placeholderRenderForSave)
            processor.feed(p.content)
            newcontent = unicodify(processor.output(), 'utf-8')
            if p.content != newcontent:
                if not args.dry_run:
                    p.content = unicodify(processor.output(), 'utf-8')
                    session.add(p)
                    session.flush()
                else:
                    print("Modifying revision %s." % p.id)
                    diff = difflib.unified_diff(p.content.splitlines(), newcontent.splitlines())
                    print("\n".join(diff))
        except Exception:
            logging.exception("Error parsing page, rolling changes back and skipping revision %s.  Please report this error." % p.id)
            session.rollback()
Example 5
    def __parse_args(self):
        parser = argparse.ArgumentParser()
        populate_config_args(parser)
        parser.add_argument('-d',
                            '--debug',
                            action='store_true',
                            default=False,
                            help='Enable debug logging (SQL queries)')
        parser.add_argument('--dry-run',
                            action='store_true',
                            default=False,
                            help="Dry run (rollback all transactions)")
        parser.add_argument('--force-retry',
                            action='store_true',
                            default=False,
                            help="Retry file removals (on applicable actions)")
        parser.add_argument(
            '-o',
            '--older-than',
            dest='days',
            type=int,
            default=14,
            help=
            'Only perform action(s) on objects that have not been updated since the specified number of days'
        )
        parser.add_argument('-U',
                            '--no-update-time',
                            action='store_false',
                            dest='update_time',
                            default=True,
                            help="Don't set update_time on updated objects")
        parser.add_argument(
            '-s',
            '--sequence',
            dest='sequence',
            default='',
            help='DEPRECATED: Comma-separated sequence of actions')
        parser.add_argument('-w',
                            '--work-mem',
                            dest='work_mem',
                            default=None,
                            help='Set PostgreSQL work_mem for this connection')
        parser.add_argument('-l',
                            '--log-dir',
                            default=DEFAULT_LOG_DIR,
                            help='Log file directory')
        parser.add_argument('actions',
                            nargs='*',
                            metavar='ACTION',
                            default=[],
                            help='Action(s) to perform, chosen from: %s' %
                            ', '.join(sorted(self.actions.keys())))
        self.args = parser.parse_args()

        # add deprecated sequence arg to actions
        self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]
        if self.args.sequence != ['']:
            self.args.actions.extend(self.args.sequence)
        if not self.args.actions:
            parser.error("Please specify one or more actions")
Example 6
def _read_model_arguments(argv, use_argparse=False):
    if use_argparse:
        parser = argparse.ArgumentParser()
        parser.add_argument('database', metavar='DATABASE', type=str,
                            default="galaxy",
                            nargs='?',
                            help='database to target (galaxy, tool_shed, install)')
        populate_config_args(parser)
        args = parser.parse_args(argv[1:] if argv else [])
        return args.config_file, args.config_section, args.database
    else:
        config_file = None
        for arg in ["-c", "--config", "--config-file"]:
            if arg in argv:
                pos = argv.index(arg)
                argv.pop(pos)
                config_file = argv.pop(pos)
        config_section = None
        if "--config-section" in argv:
            pos = argv.index("--config-section")
            argv.pop(pos)
            config_section = argv.pop(pos)
        if argv and (argv[-1] in DATABASE):
            database = argv.pop()  # database name tool_shed, galaxy, or install.
        else:
            database = 'galaxy'
        return config_file, config_section, database
Example 7
def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Generate walltime statistics')
    parser.add_argument('tool_id', help='Tool (by ID) to collect stats about')
    parser.add_argument('--like',
                        action='store_true',
                        default=False,
                        help='Use SQL `LIKE` operator to find '
                             'a shed-installed tool using the tool\'s '
                             '"short" id')
    populate_config_args(parser)
    parser.add_argument('-d', '--debug',
                        action='store_true',
                        default=False,
                        help='Print extra info')
    parser.add_argument('-m', '--min',
                        type=int,
                        default=-1,
                        help='Ignore runtimes less than MIN seconds')
    parser.add_argument('-M', '--max',
                        type=int,
                        default=-1,
                        help='Ignore runtimes greater than MAX seconds')
    parser.add_argument('-u', '--user',
                        help='Return stats for only this user (id, email, '
                             'or username)')
    parser.add_argument('-s', '--source',
                        default='metrics',
                        help='Runtime data source (SOURCES: %s)'
                             % ', '.join(DATA_SOURCES))
    args = parser.parse_args()

    if args.like and '/' in args.tool_id:
        print('ERROR: Do not use --like with a tool shed tool id (the tool '
              'id should not contain `/` characters)')
        sys.exit(2)

    args.source = args.source.lower()
    if args.source not in ('metrics', 'history'):
        print('ERROR: Data source `%s` unknown, valid sources are: %s'
              % (args.source, ', '.join(DATA_SOURCES)))

    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    uri = args.config.get_database_url(config)

    names = {'database': 'dbname', 'username': 'user'}  # rename URL attributes to DB-API connect-arg keywords
    args.connect_args = url.make_url(uri).translate_connect_args(**names)

    if args.debug:
        print('Got options:')
        for i in vars(args).items():
            print('%s: %s' % i)

    return args
Example 8
    def __parse_args(self):
        parser = argparse.ArgumentParser()
        populate_config_args(parser)
        parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='Enable debug logging', default=False)
        parser.add_argument('--dry-run', action='store_true', dest='dry_run', help="Dry run (rollback all transactions)", default=False)
        parser.add_argument('--force-retry', action='store_true', dest='force_retry', help="Retry file removals (on applicable actions)", default=False)
        parser.add_argument('-o', '--older-than', type=int, dest='days', help='Only perform action(s) on objects that have not been updated since the specified number of days', default=14)
        parser.add_argument('-U', '--no-update-time', action='store_false', dest='update_time', help="Don't set update_time on updated objects", default=True)
        parser.add_argument('-s', '--sequence', dest='sequence', help='Comma-separated sequence of actions, chosen from: %s' % self.action_names, default='')
        parser.add_argument('-w', '--work-mem', dest='work_mem', help='Set PostgreSQL work_mem for this connection', default=None)
        parser.add_argument('-l', '--log-dir', dest='log_dir', help='Log file directory', default=os.path.join(galaxy_root, 'scripts', 'cleanup_datasets'))
        self.args = parser.parse_args()

        self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]

        if self.args.sequence == ['']:
            print("Error: At least one action must be specified in the action sequence\n")
            parser.print_help()
            sys.exit(0)
Example 9
def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    populate_config_args(parser)
    args = parser.parse_args()

    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    model = galaxy.config.init_models_from_config(config)

    for row in model.context.query(model.Dataset):
        if row.uuid is None:
            row.uuid = uuid.uuid4()
            print("Setting dataset:", row.id, " UUID to ", row.uuid)
    model.context.flush()

    for row in model.context.query(model.Workflow):
        if row.uuid is None:
            row.uuid = uuid.uuid4()
            print("Setting Workflow:", row.id, " UUID to ", row.uuid)
    model.context.flush()
    print("Complete")
Example 10
def parse_arguments():
    parser = argparse.ArgumentParser(
        description=
        'Build a disk-backed Toolshed repository index and tool index for searching.'
    )
    populate_config_args(parser)
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='Print extra info')
    args = parser.parse_args()
    app_properties = app_properties_from_args(args)
    config = ts_config.ToolShedAppConfiguration(**app_properties)
    args.dburi = config.database_connection
    args.hgweb_config_dir = config.hgweb_config_dir
    args.whoosh_index_dir = config.whoosh_index_dir
    args.file_path = config.file_path
    if args.debug:
        log.setLevel(logging.DEBUG)
        log.debug('Full options:')
        for i in vars(args).items():
            log.debug('%s: %s' % i)
    return args
def main():
    """
    Datasets that are older than the specified cutoff and for which the tool_id
    contains the specified text will be marked as deleted in the user's history and
    the user will be notified by email using the specified template file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('legacy_config', metavar='CONFIG', type=str,
                        default=None,
                        nargs='?',
                        help='config file (legacy, use --config instead)')
    parser.add_argument("-d", "--days", dest="days", action="store",
                        type=int, help="number of days (60)", default=60)
    parser.add_argument("--tool_id", default=None,
                        help="Text to match against tool_id"
                        "Default: match all")
    parser.add_argument("--template", default=None,
                        help="Mako Template file to use as email "
                        "Variables are 'cutoff' for the cutoff in days, "
                        "'email' for users email and "
                        "'datasets' which is a list of tuples "
                        "containing 'dataset' and 'history' names. "
                        "Default: admin_cleanup_deletion_template.txt")
    parser.add_argument("-i", "--info_only", action="store_true",
                        dest="info_only", help="info about the requested action",
                        default=False)
    parser.add_argument("-e", "--email_only", action="store_true",
                        dest="email_only", help="Send emails only, don't delete",
                        default=False)
    parser.add_argument("--smtp", default=None,
                        help="SMTP Server to use to send email. "
                        "Default: [read from galaxy ini file]")
    parser.add_argument("--fromaddr", default=None,
                        help="From address to use to send email. "
                        "Default: [read from galaxy ini file]")
    populate_config_args(parser)

    args = parser.parse_args()
    config_override = None
    if args.legacy_config:
        config_override = args.legacy_config

    app_properties = app_properties_from_args(args, legacy_config_override=config_override)

    if args.smtp is not None:
        app_properties['smtp_server'] = args.smtp
    if app_properties.get('smtp_server') is None:
        parser.error("SMTP Server must be specified as an option (--smtp) "
                     "or in the config file (smtp_server)")

    if args.fromaddr is not None:
        app_properties['email_from'] = args.fromaddr
    if app_properties.get('email_from') is None:
        parser.error("From address must be specified as an option "
                     "(--fromaddr) or in the config file "
                     "(email_from)")

    scriptdir = os.path.dirname(os.path.abspath(__file__))
    template_file = args.template
    if template_file is None:
        default_template = os.path.join(scriptdir,
                                        'admin_cleanup_deletion_template.txt')
        sample_template_file = "%s.sample" % default_template
        if os.path.exists(default_template):
            template_file = default_template
        elif os.path.exists(sample_template_file):
            print("Copying %s to %s" % (sample_template_file, default_template))
            shutil.copyfile(sample_template_file, default_template)
            template_file = default_template
        else:
            parser.error("Default template (%s) or sample template (%s) not "
                         "found, please specify template as an option "
                         "(--template)." % default_template,
                         sample_template_file)
    elif not os.path.exists(template_file):
        parser.error("Specified template file (%s) not found." % template_file)

    config = galaxy.config.Configuration(**app_properties)

    app = CleanupDatasetsApplication(config)
    cutoff_time = datetime.utcnow() - timedelta(days=args.days)
    now = strftime("%Y-%m-%d %H:%M:%S")

    print("##########################################")
    print("\n# %s - Handling stuff older than %i days" % (now, args.days))

    if args.info_only:
        print("# Displaying info only ( --info_only )\n")
    elif args.email_only:
        print("# Sending emails only, not deleting ( --email_only )\n")

    administrative_delete_datasets(
        app, cutoff_time, args.days, tool_id=args.tool_id,
        template_file=template_file, config=config,
        email_only=args.email_only, info_only=args.info_only)
    app.shutdown()
    sys.exit(0)
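
The --template help above names the variables passed to the Mako template: cutoff, email and datasets. A hypothetical rendering sketch with placeholder values (the actual rendering happens inside administrative_delete_datasets, which is not shown here):

# Hypothetical illustration only; the template path and values are placeholders,
# the variable names come from the --template help text above.
from mako.template import Template

body = Template(filename='admin_cleanup_deletion_template.txt').render(
    cutoff=60,
    email='user@example.org',
    datasets=[('my_reads.fastq', 'RNA-seq history')],
)
print(body)
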
Example 12
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r',
                        '--report-directory',
                        help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g',
                        '--grt-config',
                        help='Path to GRT config file',
                        default=default_config)
    parser.add_argument(
        "-l",
        "--loglevel",
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        help="Set the logging level",
        default='warning')
    parser.add_argument("-b",
                        "--batch-size",
                        type=int,
                        default=1000,
                        help="Batch size for sql queries")
    parser.add_argument(
        "-m",
        "--max-records",
        type=int,
        default=0,
        help=
        "Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs."
    )
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig, app = _init(
        args, need_app=config['grt']['share_toolbox'])
    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch jobs COMPLETED with status OK that have not yet been sent.

    # Set up our arrays
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    annotate('san_init', 'Building Sanitizer')
    san = Sanitization(config['sanitization'], model, sa_session)
    annotate('san_end')

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end',
             'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    handle_job = open(REPORT_BASE + '.jobs.tsv', 'w')
    handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state',
                                'create_time')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                .filter(model.Job.id > offset_start) \
                .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # If the tool is blacklisted, exclude everywhere
            if job[2] in blacklisted_tools:
                continue

            handle_job.write(str(job[0]))  # id
            handle_job.write('\t')
            handle_job.write(job[2])  # tool_id
            handle_job.write('\t')
            handle_job.write(job[3])  # tool_version
            handle_job.write('\t')
            handle_job.write(job[4])  # state
            handle_job.write('\t')
            handle_job.write(str(job[5]))  # create_time
            handle_job.write('\n')
            # meta counts
            job_state_data[job[4]] += 1
            active_users[job[1]] += 1
            job_tool_map[job[0]] = job[2]

    handle_job.close()
    annotate('export_jobs_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w')
    handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) +
                            '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                .filter(model.JobMetricNumeric.job_id > offset_start) \
                .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if metric[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[metric[0]] in blacklisted_tools:
                continue

            handle_metric_num.write(str(metric[0]))
            handle_metric_num.write('\t')
            handle_metric_num.write(metric[1])
            handle_metric_num.write('\t')
            handle_metric_num.write(metric[2])
            handle_metric_num.write('\t')
            handle_metric_num.write(str(metric[3]))
            handle_metric_num.write('\n')
    handle_metric_num.close()
    annotate('export_metric_num_end')

    annotate('export_params_start', 'Export Job Parameters')
    handle_params = open(REPORT_BASE + '.params.tsv', 'w')
    handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \
                .filter(model.JobParameter.job_id > offset_start) \
                .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if param[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[param[0]] in blacklisted_tools:
                continue

            sanitized = san.sanitize_data(job_tool_map[param[0]], param[1],
                                          param[2])

            handle_params.write(str(param[0]))
            handle_params.write('\t')
            handle_params.write(param[1])
            handle_params.write('\t')
            handle_params.write(json.dumps(sanitized))
            handle_params.write('\n')
    handle_params.close()
    annotate('export_params_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'params'):
            handle.add(REPORT_BASE + '.' + name + '.tsv')

    for name in ('jobs', 'metric_num', 'params'):
        os.unlink(REPORT_BASE + '.' + name + '.tsv')

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_util.sha256,
                                           REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
        if config['grt']['share_toolbox']:
            toolbox = [(tool.id, tool.name, tool.version, tool.tool_shed,
                        tool.repository_id, tool.repository_name)
                       for tool_id, tool in app.toolbox._tools_by_id.items()]
        else:
            toolbox = None

        json.dump(
            {
                "version": 1,
                "galaxy_version": gxconfig.version_major,
                "generated": REPORT_IDENTIFIER,
                "report_hash": "sha256:" + sha,
                "metrics": {
                    "_times": _times,
                },
                "users": {
                    "active": len(active_users.keys()),
                    "total": sa_session.query(model.User.id).count(),
                },
                "jobs": job_state_data,
                "tools": toolbox
            }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
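
All of the export loops above walk the job table in half-open (start, end] id windows so that no single query grows unbounded. The idiom, condensed into one helper that reuses the script's own names (sa_session, model, and the batch boundaries):

# Condensed restatement of the batching idiom used by the export loops above.
def iter_job_batches(sa_session, model, last_job_sent, end_job_id, batch_size):
    for offset_start in range(last_job_sent, end_job_id, batch_size):
        upper = min(end_job_id, offset_start + batch_size)
        # Each batch covers the half-open id range (offset_start, upper].
        yield sa_session.query(model.Job.id, model.Job.tool_id, model.Job.state) \
            .filter(model.Job.id > offset_start) \
            .filter(model.Job.id <= upper) \
            .all()
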
Example 13
Encodes and decodes IDs, returns Dataset IDs if provided an HDA or LDDA id,
returns the disk path of a dataset.
"""

import argparse
import os
import sys

sys.path.insert(1, os.path.join(os.path.dirname(__file__), os.pardir, 'lib'))

import galaxy.config
from galaxy.security import idencoding
from galaxy.util.script import app_properties_from_args, populate_config_args

parser = argparse.ArgumentParser()
populate_config_args(parser)
parser.add_argument('-e', '--encode-id', dest='encode_id', help='Encode an ID')
parser.add_argument('-d', '--decode-id', dest='decode_id', help='Decode an ID')
parser.add_argument('--hda', dest='hda_id', help='Display HistoryDatasetAssociation info')
parser.add_argument('--ldda', dest='ldda_id', help='Display LibraryDatasetDatasetAssociation info')
args = parser.parse_args()

app_properties = app_properties_from_args(args)
config = galaxy.config.Configuration(**app_properties)
helper = idencoding.IdEncodingHelper(id_secret=app_properties.get('id_secret'))
model = galaxy.config.init_models_from_config(config)

if args.encode_id:
    print('Encoded "{}": {}'.format(args.encode_id, helper.encode_id(args.encode_id)))

if args.decode_id:
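
The snippet is cut off at the decode branch. A round-trip sketch of the same helper, assuming decode_id is the inverse of the encode_id call shown above:

# Round-trip sketch; the id_secret is a placeholder and decode_id is assumed
# to mirror encode_id.
from galaxy.security import idencoding

helper = idencoding.IdEncodingHelper(id_secret='some-id-secret')
encoded = helper.encode_id(42)
print(encoded)                    # secret-dependent hex string
print(helper.decode_id(encoded))  # 42
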
Example 14
    def __parse_args(self):
        parser = argparse.ArgumentParser()
        populate_config_args(parser)
        parser.add_argument('-d',
                            '--debug',
                            action='store_true',
                            dest='debug',
                            help='Enable debug logging',
                            default=False)
        parser.add_argument('--dry-run',
                            action='store_true',
                            dest='dry_run',
                            help="Dry run (rollback all transactions)",
                            default=False)
        parser.add_argument('--force-retry',
                            action='store_true',
                            dest='force_retry',
                            help="Retry file removals (on applicable actions)",
                            default=False)
        parser.add_argument(
            '-o',
            '--older-than',
            type=int,
            dest='days',
            help=
            'Only perform action(s) on objects that have not been updated since the specified number of days',
            default=14)
        parser.add_argument('-U',
                            '--no-update-time',
                            action='store_false',
                            dest='update_time',
                            help="Don't set update_time on updated objects",
                            default=True)
        parser.add_argument(
            '-s',
            '--sequence',
            dest='sequence',
            help='Comma-separated sequence of actions, chosen from: %s' %
            self.action_names,
            default='')
        parser.add_argument('-w',
                            '--work-mem',
                            dest='work_mem',
                            help='Set PostgreSQL work_mem for this connection',
                            default=None)
        parser.add_argument('-l',
                            '--log-dir',
                            dest='log_dir',
                            help='Log file directory',
                            default=os.path.join(galaxy_root, 'scripts',
                                                 'cleanup_datasets'))
        self.args = parser.parse_args()

        self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]

        if self.args.sequence == ['']:
            print(
                "Error: At least one action must be specified in the action sequence\n"
            )
            parser.print_help()
            sys.exit(0)
def main():
    """
    Datasets that are older than the specified cutoff and for which the tool_id
    contains the specified text will be marked as deleted in the user's history and
    the user will be notified by email using the specified template file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('legacy_config',
                        metavar='CONFIG',
                        type=str,
                        default=None,
                        nargs='?',
                        help='config file (legacy, use --config instead)')
    parser.add_argument("-d",
                        "--days",
                        dest="days",
                        action="store",
                        type=int,
                        help="number of days (60)",
                        default=60)
    parser.add_argument("--tool_id",
                        default=None,
                        help="Text to match against tool_id"
                        "Default: match all")
    parser.add_argument("--template",
                        default=None,
                        help="Mako Template file to use as email "
                        "Variables are 'cutoff' for the cutoff in days, "
                        "'email' for users email and "
                        "'datasets' which is a list of tuples "
                        "containing 'dataset' and 'history' names. "
                        "Default: admin_cleanup_deletion_template.txt")
    parser.add_argument("-i",
                        "--info_only",
                        action="store_true",
                        dest="info_only",
                        help="info about the requested action",
                        default=False)
    parser.add_argument("-e",
                        "--email_only",
                        action="store_true",
                        dest="email_only",
                        help="Send emails only, don't delete",
                        default=False)
    parser.add_argument("--smtp",
                        default=None,
                        help="SMTP Server to use to send email. "
                        "Default: [read from galaxy ini file]")
    parser.add_argument("--fromaddr",
                        default=None,
                        help="From address to use to send email. "
                        "Default: [read from galaxy ini file]")
    populate_config_args(parser)

    args = parser.parse_args()
    config_override = None
    if args.legacy_config:
        config_override = args.legacy_config

    app_properties = app_properties_from_args(
        args, legacy_config_override=config_override)

    if args.smtp is not None:
        app_properties['smtp_server'] = args.smtp
    if app_properties.get('smtp_server') is None:
        parser.error("SMTP Server must be specified as an option (--smtp) "
                     "or in the config file (smtp_server)")

    if args.fromaddr is not None:
        app_properties['email_from'] = args.fromaddr
    if app_properties.get('email_from') is None:
        parser.error("From address must be specified as an option "
                     "(--fromaddr) or in the config file "
                     "(email_from)")

    scriptdir = os.path.dirname(os.path.abspath(__file__))
    template_file = args.template
    if template_file is None:
        default_template = os.path.join(scriptdir,
                                        'admin_cleanup_deletion_template.txt')
        sample_template_file = "%s.sample" % default_template
        if os.path.exists(default_template):
            template_file = default_template
        elif os.path.exists(sample_template_file):
            print("Copying %s to %s" %
                  (sample_template_file, default_template))
            shutil.copyfile(sample_template_file, default_template)
            template_file = default_template
        else:
            parser.error(
                "Default template (%s) or sample template (%s) not "
                "found, please specify template as an option "
                "(--template)." % default_template, sample_template_file)
    elif not os.path.exists(template_file):
        parser.error("Specified template file (%s) not found." % template_file)

    config = galaxy.config.Configuration(**app_properties)

    app = CleanupDatasetsApplication(config)
    cutoff_time = datetime.utcnow() - timedelta(days=args.days)
    now = strftime("%Y-%m-%d %H:%M:%S")

    print("##########################################")
    print("\n# %s - Handling stuff older than %i days" % (now, args.days))

    if args.info_only:
        print("# Displaying info only ( --info_only )\n")
    elif args.email_only:
        print("# Sending emails only, not deleting ( --email_only )\n")

    administrative_delete_datasets(app,
                                   cutoff_time,
                                   args.days,
                                   tool_id=args.tool_id,
                                   template_file=template_file,
                                   config=config,
                                   email_only=args.email_only,
                                   info_only=args.info_only)
    app.shutdown()
    sys.exit(0)
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'lib')))

from galaxy.util.script import app_properties_from_args, populate_config_args
from galaxy.web.security import SecurityHelper

logging.basicConfig()
log = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument('action', metavar='ACTION', type=str,
                    default=None,
                    help='decode|encode')
parser.add_argument('value', metavar='VALUE', type=str,
                    default=None,
                    help='value to encode or decode')
populate_config_args(parser)
args = parser.parse_args()

app_properties = app_properties_from_args(args)
helper = SecurityHelper(id_secret=app_properties.get('id_secret'))

# We need the ID secret for configuring the security helper to decrypt
# galaxysession cookies.
if "id_secret" not in app_properties:
    log.warning('No ID_SECRET specified. Please set the "id_secret" in your galaxy.yml.')

id_secret = app_properties.get('id_secret', 'dangerous_default')

security_helper = SecurityHelper(id_secret=id_secret)
# And get access to the models
# Login manager to manage current_user functionality
Example 17
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r',
                        '--report-directory',
                        help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g',
                        '--grt-config',
                        help='Path to GRT config file',
                        default=default_config)
    parser.add_argument(
        "-l",
        "--loglevel",
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        help="Set the logging level",
        default='warning')
    parser.add_argument("-b",
                        "--batch-size",
                        type=int,
                        default=1000,
                        help="Batch size for sql queries")
    parser.add_argument(
        "-m",
        "--max-records",
        type=int,
        default=5000000,
        help=
        "Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs."
    )
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig = _init(args)

    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch jobs COMPLETED with status OK that have not yet been sent.

    # Set up our arrays
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end',
             'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    with io.open(REPORT_BASE + '.jobs.tsv', 'w',
                 encoding='utf-8') as handle_job:
        handle_job.write(u'\t'.join(('id', 'tool_id', 'tool_version', 'state',
                                     'create_time')) + '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))
            for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                    .filter(model.Job.id > offset_start) \
                    .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                    .all():
                # If the tool is blacklisted, exclude everywhere
                if job[2] in blacklisted_tools:
                    continue

                try:
                    line = [
                        str(job[0]),  # id
                        job[2],  # tool_id
                        job[3],  # tool_version
                        job[4],  # state
                        str(job[5])  # create_time
                    ]
                    cline = unicodify('\t'.join(line) + '\n')
                    handle_job.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_job' row. Ignoring the row.",
                        exc_info=True)
                    continue
                # meta counts
                job_state_data[job[4]] += 1
                active_users[job[1]] += 1
                job_tool_map[job[0]] = job[2]
    annotate('export_jobs_end')

    annotate('export_datasets_start', 'Exporting Datasets')
    with io.open(REPORT_BASE + '.datasets.tsv', 'w',
                 encoding='utf-8') as handle_datasets:
        handle_datasets.write(u'\t'.join(('job_id', 'dataset_id', 'extension',
                                          'file_size', 'param_name', 'type')) +
                              '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))

            # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset

            job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id,
                model.JobToInputDatasetAssociation.name) \
                .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \
                .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all()

            job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id,
                model.JobToOutputDatasetAssociation.name) \
                .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \
                .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all()

            # add type and concat
            job_to_hda_ids = [[list(i), "input"] for i in job_to_input_hda_ids] + \
                             [[list(i), "output"] for i in job_to_output_hda_ids]

            # put all of the hda_ids into a list
            hda_ids = [i[0][1] for i in job_to_hda_ids]

            hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id,
                model.HistoryDatasetAssociation.extension) \
                .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \
                .all()

            # put all the dataset ids into a list
            dataset_ids = [i[1] for i in hdas]

            # get the sizes of the datasets
            datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \
                .filter(model.Dataset.id.in_(dataset_ids)) \
                .all()

            # convert the hdas and datasets lists to dictionaries keyed by id for easy lookup
            hdas = {i[0]: i[1:] for i in hdas}
            datasets = {i[0]: i[1:] for i in datasets}

            for job_to_hda in job_to_hda_ids:

                job = job_to_hda[0]  # job_id, hda_id, name
                filetype = job_to_hda[1]  # input|output

                # No associated job
                if job[0] not in job_tool_map:
                    continue

                # If the tool is blacklisted, exclude everywhere
                if job_tool_map[job[0]] in blacklisted_tools:
                    continue

                hda_id = job[1]
                if hda_id is None:
                    continue

                dataset_id = hdas[hda_id][0]
                if dataset_id is None:
                    continue

                try:
                    line = [
                        str(job[0]),  # Job ID
                        str(hda_id),  # HDA ID
                        str(hdas[hda_id][1]),  # Extension
                        round_to_2sd(datasets[dataset_id][0]),  # File size
                        job[2],  # Parameter name
                        str(filetype)  # input/output
                    ]
                    cline = unicodify('\t'.join(line) + '\n')
                    handle_datasets.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_datasets' row. Ignoring the row.",
                        exc_info=True)
                    continue
    annotate('export_datasets_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    with io.open(REPORT_BASE + '.metric_num.tsv', 'w',
                 encoding='utf-8') as handle_metric_num:
        handle_metric_num.write(u'\t'.join(('job_id', 'plugin', 'name',
                                            'value')) + '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))
            for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                    .filter(model.JobMetricNumeric.job_id > offset_start) \
                    .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                    .all():
                # No associated job
                if metric[0] not in job_tool_map:
                    continue
                # If the tool is blacklisted, exclude everywhere
                if job_tool_map[metric[0]] in blacklisted_tools:
                    continue

                try:
                    line = [
                        str(metric[0]),  # job id
                        metric[1],  # plugin
                        metric[2],  # name
                        str(metric[3])  # value
                    ]

                    cline = unicodify('\t'.join(line) + '\n')
                    handle_metric_num.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_metric_num' row. Ignoring the row.",
                        exc_info=True)
                    continue
    annotate('export_metric_num_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'datasets'):
            path = REPORT_BASE + '.' + name + '.tsv'
            if os.path.exists(path):
                handle.add(path)

    for name in ('jobs', 'metric_num', 'datasets'):
        path = REPORT_BASE + '.' + name + '.tsv'
        if os.path.exists(path):
            os.unlink(path)

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_func=hash_util.sha256,
                                           path=REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
        json.dump(
            {
                "version": 3,
                "galaxy_version": gxconfig.version_major,
                "generated": REPORT_IDENTIFIER,
                "report_hash": "sha256:" + sha,
                "metrics": {
                    "_times": _times,
                },
                "users": {
                    "active": len(active_users.keys()),
                    "total": sa_session.query(model.User.id).count(),
                },
                "jobs": job_state_data,
            }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
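
Both report scripts hash the finished tarball with hash_util.memory_bound_hexdigest before recording report_hash. The underlying idea, shown here as a standalone hashlib sketch rather than Galaxy's implementation, is to digest the file in fixed-size chunks instead of reading it into memory at once:

# Standalone sketch of memory-bounded hashing (not Galaxy's hash_util code).
import hashlib


def sha256_hexdigest(path, chunk_size=1024 * 1024):
    digest = hashlib.sha256()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
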
Example 18
def main():
    """
    Managing library datasets is a bit complex, so here is a scenario that hopefully provides clarification.  The complexities
    of handling library datasets are mostly contained in the delete_datasets() method in this script.

    Assume we have 1 library dataset with: LibraryDatasetDatasetAssociation -> LibraryDataset and Dataset
    At this point, we have the following database column values:

    LibraryDatasetDatasetAssociation deleted: False
    LibraryDataset deleted: False, purged: False
    Dataset deleted: False, purged: False

    1. A user deletes the assumed dataset above from a data library via a UI menu option.
    This action results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: False
    LibraryDataset deleted: True*, purged: False
    Dataset deleted: False, purged: False

    2. After the number of days configured for the delete_datasets() method (option -6 below) have passed, execution
    of the delete_datasets() method results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: True*
    LibraryDataset deleted: True, purged: True*
    Dataset deleted: True*, purged: False

    3. After the number of days configured for the purge_datasets() method (option -3 below) have passed, execution
    of the purge_datasets() method results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: True
    LibraryDataset deleted: True, purged: True
    Dataset deleted: True, purged: True* (dataset file removed from disk if -r flag is used)

    This scenario is about as simple as it gets.  Keep in mind that a Dataset object can have many HistoryDatasetAssociations
    and many LibraryDatasetDatasetAssociations, and a LibraryDataset can have many LibraryDatasetDatasetAssociations.
    Another way of stating it is: LibraryDatasetDatasetAssociation objects map LibraryDataset objects to Dataset objects,
    and Dataset objects may be mapped to History objects via HistoryDatasetAssociation objects.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('legacy_config',
                        metavar='CONFIG',
                        type=str,
                        default=None,
                        nargs='?',
                        help='config file (legacy, use --config instead)')
    parser.add_argument("-d",
                        "--days",
                        dest="days",
                        action="store",
                        type=int,
                        help="number of days (60)",
                        default=60)
    parser.add_argument("-r",
                        "--remove_from_disk",
                        action="store_true",
                        dest="remove_from_disk",
                        help="remove datasets from disk when purged",
                        default=False)
    parser.add_argument("-i",
                        "--info_only",
                        action="store_true",
                        dest="info_only",
                        help="info about the requested action",
                        default=False)
    parser.add_argument("-f",
                        "--force_retry",
                        action="store_true",
                        dest="force_retry",
                        help="performs the requested actions, but ignores whether it might have been done before. Useful when -r wasn't used, but should have been",
                        default=False)
    parser.add_argument("-1",
                        "--delete_userless_histories",
                        action="store_true",
                        dest="delete_userless_histories",
                        default=False,
                        help="delete userless histories and datasets")
    parser.add_argument("-2",
                        "--purge_histories",
                        action="store_true",
                        dest="purge_histories",
                        default=False,
                        help="purge deleted histories")
    parser.add_argument("-3",
                        "--purge_datasets",
                        action="store_true",
                        dest="purge_datasets",
                        default=False,
                        help="purge deleted datasets")
    parser.add_argument("-4",
                        "--purge_libraries",
                        action="store_true",
                        dest="purge_libraries",
                        default=False,
                        help="purge deleted libraries")
    parser.add_argument("-5",
                        "--purge_folders",
                        action="store_true",
                        dest="purge_folders",
                        default=False,
                        help="purge deleted library folders")
    parser.add_argument("-6",
                        "--delete_datasets",
                        action="store_true",
                        dest="delete_datasets",
                        default=False,
                        help="mark deletable datasets as deleted and purge associated dataset instances")
    populate_config_args(parser)

    args = parser.parse_args()
    config_override = None
    if args.legacy_config:
        config_override = args.legacy_config

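    # The action flags are meant to be mutually exclusive: the XOR chain is only truthy when an
    # odd number of them is set (normally exactly one); otherwise show the help text and exit.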
    if not (args.purge_folders ^ args.delete_userless_histories
            ^ args.purge_libraries ^ args.purge_histories ^ args.purge_datasets
            ^ args.delete_datasets):
        parser.print_help()
        sys.exit(0)

    if args.remove_from_disk and args.info_only:
        parser.error("remove_from_disk and info_only are mutually exclusive")

    app_properties = app_properties_from_args(
        args, legacy_config_override=config_override)
    config = galaxy.config.Configuration(**app_properties)
    app = CleanupDatasetsApplication(config)
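    # Everything older than this cutoff (args.days days ago) is a candidate for the selected action.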
    cutoff_time = datetime.utcnow() - timedelta(days=args.days)
    now = strftime("%Y-%m-%d %H:%M:%S")

    log.info("##########################################")
    log.info("\n# %s - Handling stuff older than %i days" % (now, args.days))

    if args.info_only:
        log.info("# Displaying info only ( --info_only )\n")
    elif args.remove_from_disk:
        log.info("Datasets will be removed from disk.\n")
    else:
        log.info("Datasets will NOT be removed from disk.\n")

    if args.delete_userless_histories:
        delete_userless_histories(app,
                                  cutoff_time,
                                  info_only=args.info_only,
                                  force_retry=args.force_retry)
    elif args.purge_histories:
        purge_histories(app,
                        cutoff_time,
                        args.remove_from_disk,
                        info_only=args.info_only,
                        force_retry=args.force_retry)
    elif args.purge_datasets:
        purge_datasets(app,
                       cutoff_time,
                       args.remove_from_disk,
                       info_only=args.info_only,
                       force_retry=args.force_retry)
    elif args.purge_libraries:
        purge_libraries(app,
                        cutoff_time,
                        args.remove_from_disk,
                        info_only=args.info_only,
                        force_retry=args.force_retry)
    elif args.purge_folders:
        purge_folders(app,
                      cutoff_time,
                      args.remove_from_disk,
                      info_only=args.info_only,
                      force_retry=args.force_retry)
    elif args.delete_datasets:
        delete_datasets(app,
                        cutoff_time,
                        args.remove_from_disk,
                        info_only=args.info_only,
                        force_retry=args.force_retry)

    app.shutdown()
    sys.exit(0)
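# A minimal usage sketch (assuming this module is saved as, e.g., cleanup_datasets.py and that
# populate_config_args() wires up a --config-file option):
#
#   python cleanup_datasets.py --config-file config/galaxy.yml -d 60 -i -6
#
# would only report (-i / --info_only) which library datasets older than 60 days are eligible
# for the delete_datasets() pass, without modifying the database or removing anything from disk.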
Esempio n. 19
0
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r', '--report-directory', help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g', '--grt-config', help='Path to GRT config file',
                        default=default_config)
    parser.add_argument("-l", "--loglevel", choices=['debug', 'info', 'warning', 'error', 'critical'],
                        help="Set the logging level", default='warning')
    parser.add_argument("-b", "--batch-size", type=int, default=1000,
                        help="Batch size for sql queries")
    parser.add_argument("-m", "--max-records", type=int, default=0,
                        help="Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs.")
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

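    # The checkpoint file records the id of the last job already reported; a missing checkpoint
    # (first run) starts from -1 so every existing job is included.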
    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig, app = _init(args, need_app=config['grt']['share_toolbox'])

    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch the jobs that have not yet been reported; their states are tallied below.

    # Set up our counters
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    annotate('san_init', 'Building Sanitizer')
    san = Sanitization(config['sanitization'], model, sa_session)
    annotate('san_end')

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end', 'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    handle_job = open(REPORT_BASE + '.jobs.tsv', 'w')
    handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state', 'create_time')) + '\n')
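    # Export jobs in batches over the half-open id interval (last_job_sent, end_job_id] so a
    # large backlog does not turn into one huge query.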
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                .filter(model.Job.id > offset_start) \
                .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # If the tool is blacklisted, exclude everywhere
            if job[2] in blacklisted_tools:
                continue

            try:
                handle_job.write(str(job[0]))  # id
                handle_job.write('\t')
                handle_job.write(job[2])  # tool_id
                handle_job.write('\t')
                handle_job.write(job[3])  # tool_version
                handle_job.write('\t')
                handle_job.write(job[4])  # state
                handle_job.write('\t')
                handle_job.write(str(job[5]))  # create_time
                handle_job.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_job' row. Ignoring the row.", exc_info=True)
                continue
            # meta counts
            job_state_data[job[4]] += 1
            active_users[job[1]] += 1
            job_tool_map[job[0]] = job[2]

    handle_job.close()
    annotate('export_jobs_end')

    annotate('export_datasets_start', 'Exporting Datasets')
    handle_datasets = open(REPORT_BASE + '.datasets.tsv', 'w')
    handle_datasets.write('\t'.join(('job_id', 'dataset_id', 'extension', 'file_size', 'param_name', 'type')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))

        # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset

        job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id,
            model.JobToInputDatasetAssociation.name) \
            .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \
            .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
            .all()

        job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id,
            model.JobToOutputDatasetAssociation.name) \
            .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \
            .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
            .all()

        # add type and concat
        job_to_hda_ids = [[list(i), "input"] for i in job_to_input_hda_ids] + [[list(i), "output"] for i in job_to_output_hda_ids]

        # put all of the hda_ids into a list
        hda_ids = [i[0][1] for i in job_to_hda_ids]

        hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id,
            model.HistoryDatasetAssociation.extension) \
            .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \
            .all()

        # put all the dataset ids into a list
        dataset_ids = [i[1] for i in hdas]

        # get the sizes of the datasets
        datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \
            .filter(model.Dataset.id.in_(dataset_ids)) \
            .all()

        # index the HDAs and datasets by id for easy lookup
        hdas = {i[0]: i[1:] for i in hdas}
        datasets = {i[0]: i[1:] for i in datasets}

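        # Join each job/HDA association back to its HDA and Dataset rows and emit one line per
        # input or output dataset.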
        for job_to_hda in job_to_hda_ids:

            job = job_to_hda[0]  # job_id, hda_id, name
            filetype = job_to_hda[1]  # input|output

            # No associated job
            if job[0] not in job_tool_map:
                continue

            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[job[0]] in blacklisted_tools:
                continue

            hda_id = job[1]
            if hda_id is None or hda_id not in hdas:
                continue

            dataset_id = hdas[hda_id][0]
            if dataset_id is None or dataset_id not in datasets:
                continue

            try:
                handle_datasets.write(str(job[0]))
                handle_datasets.write('\t')
                handle_datasets.write(str(hda_id))
                handle_datasets.write('\t')
                handle_datasets.write(str(hdas[hda_id][1]))
                handle_datasets.write('\t')
                handle_datasets.write(str(datasets[dataset_id][0]))
                handle_datasets.write('\t')
                handle_datasets.write(str(job[2]))
                handle_datasets.write('\t')
                handle_datasets.write(str(filetype))
                handle_datasets.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_datasets' row. Ignoring the row.", exc_info=True)
                continue
    handle_datasets.close()
    annotate('export_datasets_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w')
    handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                .filter(model.JobMetricNumeric.job_id > offset_start) \
                .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if metric[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[metric[0]] in blacklisted_tools:
                continue

            try:
                handle_metric_num.write(str(metric[0]))
                handle_metric_num.write('\t')
                handle_metric_num.write(metric[1])
                handle_metric_num.write('\t')
                handle_metric_num.write(metric[2])
                handle_metric_num.write('\t')
                handle_metric_num.write(str(metric[3]))
                handle_metric_num.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_metric_num' row. Ignoring the row.", exc_info=True)
                continue
    handle_metric_num.close()
    annotate('export_metric_num_end')

    annotate('export_params_start', 'Export Job Parameters')
    handle_params = open(REPORT_BASE + '.params.tsv', 'w')
    handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \
                .filter(model.JobParameter.job_id > offset_start) \
                .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if param[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[param[0]] in blacklisted_tools:
                continue

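            # Sanitize the parameter value with the per-tool sanitization rules before serializing it.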
            try:
                sanitized = san.sanitize_data(job_tool_map[param[0]], param[1], param[2])

                handle_params.write(str(param[0]))
                handle_params.write('\t')
                handle_params.write(param[1])
                handle_params.write('\t')
                handle_params.write(json.dumps(sanitized))
                handle_params.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_params' row. Ignoring the row.", exc_info=True)
                continue
    handle_params.close()
    annotate('export_params_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'params', 'datasets'):
            handle.add(REPORT_BASE + '.' + name + '.tsv')

    for name in ('jobs', 'metric_num', 'params', 'datasets'):
        os.unlink(REPORT_BASE + '.' + name + '.tsv')

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_util.sha256, REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
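        # Only include the toolbox listing when the GRT configuration opts in to sharing it.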
        if config['grt']['share_toolbox']:
            toolbox = [
                (tool.id, tool.name, tool.version, tool.tool_shed, tool.repository_id, tool.repository_name)
                for tool_id, tool in app.toolbox._tools_by_id.items()
            ]
        else:
            toolbox = None

        json.dump({
            "version": 1,
            "galaxy_version": gxconfig.version_major,
            "generated": REPORT_IDENTIFIER,
            "report_hash": "sha256:" + sha,
            "metrics": {
                "_times": _times,
            },
            "users": {
                "active": len(active_users.keys()),
                "total": sa_session.query(model.User.id).count(),
            },
            "jobs": job_state_data,
            "tools": toolbox
        }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
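# A minimal usage sketch (the script name and paths are hypothetical, and it assumes
# populate_config_args() wires up a --config-file option):
#
#   python grt_export.py --config-file config/galaxy.yml -r /srv/galaxy/reports -l info -b 500
#
# would load the GRT config (falling back to the sample config), export jobs, datasets, numeric
# metrics and parameters newer than the last checkpoint into TSV files, bundle them into a
# <timestamp>.tar.gz under /srv/galaxy/reports, and write a matching <timestamp>.json summary.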
Esempio n. 20
0
def main():
    """
    Managing library datasets is a bit complex, so here is a scenario that hopefully provides clarification.  The complexity
    of handling library datasets is mostly contained in the delete_datasets() method in this script.

    Assume we have 1 library dataset with: LibraryDatasetDatasetAssociation -> LibraryDataset and Dataset
    At this point, we have the following database column values:

    LibraryDatasetDatasetAssociation deleted: False
    LibraryDataset deleted: False, purged: False
    Dataset deleted: False, purged: False

    1. A user deletes the assumed dataset above from a data library via a UI menu option.
    This action results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: False
    LibraryDataset deleted: True*, purged: False
    Dataset deleted: False, purged: False

    2. After the number of days configured for the delete_datasets() method (option -6 below) has passed, execution
    of the delete_datasets() method results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: True*
    LibraryDataset deleted: True, purged: True*
    Dataset deleted: True*, purged: False

    3. After the number of days configured for the purge_datasets() method (option -3 below) has passed, execution
    of the purge_datasets() method results in the following database column values (changes from previous step marked with *):

    LibraryDatasetDatasetAssociation deleted: True
    LibraryDataset deleted: True, purged: True
    Dataset deleted: True, purged: True* (dataset file removed from disk if -r flag is used)

    This scenario is about as simple as it gets.  Keep in mind that a Dataset object can have many HistoryDatasetAssociations
    and many LibraryDatasetDatasetAssociations, and a LibraryDataset can have many LibraryDatasetDatasetAssociations.
    Another way of stating it is: LibraryDatasetDatasetAssociation objects map LibraryDataset objects to Dataset objects,
    and Dataset objects may be mapped to History objects via HistoryDatasetAssociation objects.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('legacy_config', metavar='CONFIG', type=str,
                        default=None,
                        nargs='?',
                        help='config file (legacy, use --config instead)')
    parser.add_argument("-d", "--days", dest="days", action="store", type=int, help="number of days (60)", default=60)
    parser.add_argument("-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False)
    parser.add_argument("-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False)
    parser.add_argument("-f", "--force_retry", action="store_true", dest="force_retry", help="performs the requested actions, but ignores whether it might have been done before. Useful when -r wasn't used, but should have been", default=False)
    parser.add_argument("-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets")
    parser.add_argument("-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories")
    parser.add_argument("-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets")
    parser.add_argument("-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries")
    parser.add_argument("-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders")
    parser.add_argument("-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help="mark deletable datasets as deleted and purge associated dataset instances")
    populate_config_args(parser)

    args = parser.parse_args()
    config_override = None
    if args.legacy_config:
        config_override = args.legacy_config

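    # Require exactly one of the mutually exclusive action flags; the XOR chain is only truthy
    # when an odd number of them is set.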
    if not (args.purge_folders ^ args.delete_userless_histories ^
            args.purge_libraries ^ args.purge_histories ^
            args.purge_datasets ^ args.delete_datasets):
        parser.print_help()
        sys.exit(0)

    if args.remove_from_disk and args.info_only:
        parser.error("remove_from_disk and info_only are mutually exclusive")

    app_properties = app_properties_from_args(args, legacy_config_override=config_override)
    config = galaxy.config.Configuration(**app_properties)
    app = CleanupDatasetsApplication(config)
    cutoff_time = datetime.utcnow() - timedelta(days=args.days)
    now = strftime("%Y-%m-%d %H:%M:%S")

    print("##########################################")
    print("\n# %s - Handling stuff older than %i days" % (now, args.days))

    if args.info_only:
        print("# Displaying info only ( --info_only )\n")
    elif args.remove_from_disk:
        print("Datasets will be removed from disk.\n")
    else:
        print("Datasets will NOT be removed from disk.\n")

    if args.delete_userless_histories:
        delete_userless_histories(app, cutoff_time, info_only=args.info_only, force_retry=args.force_retry)
    elif args.purge_histories:
        purge_histories(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry)
    elif args.purge_datasets:
        purge_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry)
    elif args.purge_libraries:
        purge_libraries(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry)
    elif args.purge_folders:
        purge_folders(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry)
    elif args.delete_datasets:
        delete_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry)

    app.shutdown()
    sys.exit(0)