def __init__(self, basename, readers, group_name=None, project_name=None,
             **options):
    self.basename = basename
    self.group_name = group_name
    self.s3_base, self.job_base = \
        get_s3_and_job_prefixes(basename, group_name)

    # Expand the special 'all' selection into the full list of reader names.
    if 'all' in readers:
        self.readers = [rc.name.lower() for rc in get_reader_classes()]
    else:
        self.readers = readers

    self.project_name = project_name
    self.job_lists = {q_name: [] for q_name in self._job_queue_dict.keys()}
    self.options = options
    self.ids_per_job = None
    self.running = None
    self.submitting = False

    # Create a monitor only for queues that serve at least one of the
    # selected readers; other queues are skipped.
    self.monitors = {}
    for queue_name, reader_list in self._job_queue_dict.items():
        if not any(reader in reader_list for reader in self.readers):
            logger.info("Queue %s will not be used, no relevant readers "
                        "selected." % queue_name)
            continue
        self.monitors[queue_name] = \
            BatchMonitor(queue_name, self.job_lists[queue_name],
                         self.job_base, self.s3_base)
    return
def __init__(self, basename, readers, *args, **kwargs):
    # Expand the special 'all' selection into the full list of reader names.
    if 'all' in readers:
        self.readers = [rc.name.lower() for rc in get_reader_classes()]
    else:
        self.readers = readers
    self.ids_per_job = None
    super(ReadingSubmitter, self).__init__(basename, *args, **kwargs)
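# A minimal construction sketch for ReadingSubmitter. All argument values are
# illustrative, and this assumes the class (or a concrete subclass) defines
# the `_job_queue_dict` class attribute that the parent __init__ iterates
# over; the submission methods themselves are not shown here.
def _example_make_submitter():
    # group_name is forwarded to the parent __init__ via **kwargs.
    return ReadingSubmitter('my_run', ['all'], group_name='tests')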
def get_parser(description, input_desc):
    """Get a parser that is generic to reading scripts.

    Parameters
    ----------
    description : str
        A description of the tool, usually about one line long.
    input_desc : str
        A string describing the nature of the input file used by the reading
        tool.

    Returns
    -------
    parser : argparse.ArgumentParser instance
        An argument parser object, to which further arguments can be added.
    """
    parser = ArgumentParser(description=description)
    parser.add_argument(dest='input_file', help=input_desc)
    parser.add_argument(
        '-r', '--readers',
        choices=[rc.name.lower() for rc in get_reader_classes()],
        help='List of readers to be used.',
        nargs='+')
    parser.add_argument('-n', '--num_procs',
                        dest='n_proc',
                        help='Select the number of processes to use.',
                        type=int,
                        default=1)
    parser.add_argument(
        '-s', '--sample',
        dest='n_samp',
        help='Read a random sample of size N_SAMP of the inputs.',
        type=int)
    parser.add_argument(
        '-I', '--in_range',
        dest='range_str',
        help='Only read input lines in the range given as <start>:<end>.')
    parser.add_argument('-v', '--verbose',
                        help='Include output from the readers.',
                        action='store_true')
    parser.add_argument(
        '-q', '--quiet',
        help='Suppress most output. Overrides -v and -d options.',
        action='store_true')
    parser.add_argument('-d', '--debug',
                        help='Set the logging to debug level.',
                        action='store_true')
    # parser.add_argument(
    #     '-m', '--messy',
    #     help='Do not clean up directories created while reading.',
    #     action='store_true'
    # )
    return parser
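# A usage sketch for get_parser: a reading script extends the generic parser
# with its own arguments before parsing. The description, input text, and
# --output_path argument below are hypothetical.
def _example_build_parser():
    parser = get_parser(
        'Run the selected readers on a set of local files.',
        'A directory of files, or a file with one input path per line.')
    # Script-specific arguments can be layered on top of the generic ones.
    parser.add_argument('--output_path', default='.',
                        help='Directory in which to store results.')
    return parser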
def create_submit_parser():
    import argparse
    parent_submit_parser = argparse.ArgumentParser(add_help=False)
    parent_submit_parser.add_argument(
        'basename',
        help='Defines job names and S3 keys'
    )
    parent_submit_parser.add_argument(
        '--group_name',
        help="Indicate what group of jobs this batch is a part of."
    )
    parent_submit_parser.add_argument(
        '-r', '--readers',
        dest='readers',
        choices=[rc.name.lower() for rc in get_reader_classes()] + ['all'],
        default=['all'],
        nargs='+',
        help='Choose which reader(s) to use.'
    )
    parent_submit_parser.add_argument(
        '--project',
        help=('Set the project name. Default is DEFAULT_AWS_PROJECT in the '
              'config.')
    )
    return parent_submit_parser
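# Because create_submit_parser builds its parser with add_help=False, it is
# meant to be used as a parent parser rather than parsed directly. A sketch
# (the description and argument values are illustrative, and 'reach' assumes
# that name is among the available readers):
def _example_submit_args():
    import argparse
    parser = argparse.ArgumentParser(description='Submit reading jobs.',
                                     parents=[create_submit_parser()])
    return parser.parse_args(['my_run', '-r', 'reach',
                              '--group_name', 'tests'])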
def get_available_readers():
    # Map each reader name to the build args of its generated dockerfile;
    # readers for which no dockerfile can be generated are skipped.
    reader_spec = {}
    for rc in get_reader_classes():
        _, arg_list = get_docker_file(rc, logging=False)
        if arg_list is None:
            continue
        reader_spec[rc.name] = arg_list
    return reader_spec
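# A small sketch printing the result of get_available_readers, assuming each
# arg_list is a list of build-arg name strings:
def _print_available_readers():
    for name, arg_list in get_available_readers().items():
        print('%s: %s' % (name, ', '.join(arg_list)))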
def get_parser():
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(dest='basename',
                        help='The name of this specific group of jobs.')
    parser.add_argument(dest='job_name',
                        help='The name of this job.')
    parser.add_argument(
        dest='s3_base',
        help='Specify the s3 prefix. This is also used as a prefix for any '
             'files stored locally.',
        type=str)
    parser.add_argument(dest='out_dir',
                        help='The name of the temporary output directory')
    parser.add_argument(
        dest='read_mode',
        choices=['all', 'unread', 'none'],
        help=("Set the reading mode. If 'all', read everything, if "
              "'unread', only read content that does not have pre-existing "
              "readings of the same reader and version, if 'none', only "
              "use pre-existing readings."))
    parser.add_argument(
        dest='rslt_mode',
        choices=['all', 'unread', 'none'],
        help=("Choose from which readings to extract results. If 'all', all "
              "readings that are produced or retrieved will be used to "
              "produce results. If 'unread', only produce results from "
              "previously unread content. If 'none', do not produce any "
              "results (only readings will be produced)."))
    parser.add_argument(dest='num_cores',
                        help='Select the number of cores on which to run.',
                        type=int)
    parser.add_argument(
        dest='start_index',
        help='Select the index of the first pmid in the list to read.',
        type=int)
    parser.add_argument(
        dest='end_index',
        help='Select the index of the last pmid in the list to read.',
        type=int)
    parser.add_argument(
        '-r', '--readers',
        dest='readers',
        choices=[rc.name.lower() for rc in get_reader_classes()],
        nargs='+',
        help='Choose which reader(s) to use.')
    parser.add_argument('--test', action='store_true',
                        help="Use the test database.")
    parser.add_argument(
        '-b', '--batch',
        default=None,
        help="Select the size of batches for the content to be read.")
    return parser
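# A parsing sketch for this job-level parser. Every value below (job names,
# S3 prefix, paths, and indices) is illustrative, and 'reach' assumes that
# name is among the available readers:
def _example_job_args():
    parser = get_parser()
    return parser.parse_args([
        'my_run', 'my_run_job_0', 'reading_results/my_run', '/tmp/out',
        'unread', 'all', '8', '0', '1000', '-r', 'reach', '--batch', '500'
    ])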
def __init__(self, basename, readers, group_name=None, project_name=None,
             **options):
    self.basename = basename
    self.group_name = group_name
    self.s3_base, self.job_base = \
        get_s3_and_job_prefixes(basename, group_name)
    if 'all' in readers:
        self.readers = [rc.name.lower() for rc in get_reader_classes()]
    else:
        self.readers = readers
    self.project_name = project_name
    self.job_lists = {q_name: [] for q_name in self._job_queue_dict.keys()}
    self.options = options
    self.ids_per_job = None
    self.running = None

    # Create a monitor for every queue, regardless of reader selection.
    self.monitors = {}
    for queue_name in self._job_queue_dict.keys():
        self.monitors[queue_name] = \
            BatchMonitor(queue_name, self.job_lists[queue_name],
                         self.job_base, self.s3_base)
    return
def get_readers(*names, **kwargs):
    # Instantiate every reader class, or only those whose names were given.
    kwargs['ResultClass'] = rdb.DatabaseReadingData
    return [reader_class(**kwargs) for reader_class in get_reader_classes()
            if (not names or reader_class.name in names)]
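# Sketch: instantiate only the named readers. The names are illustrative
# (they must match reader_class.name exactly), and base_dir/n_proc mirror
# the keyword arguments used for reader construction elsewhere in this code.
def _example_readers():
    return get_readers('REACH', 'SPARSER', base_dir='/tmp/reading', n_proc=4)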
def main():
    from sys import argv

    # Provide some help
    if '--help' in argv or '-h' in argv:
        print_help()
        return

    # Allow the user to limit the readers used.
    only_include_readers = []
    if '--readers' in argv:
        next_idx = argv.index('--readers') + 1
        while next_idx < len(argv) and not argv[next_idx].startswith('-'):
            only_include_readers.append(argv[next_idx].upper())
            next_idx += 1
        if not only_include_readers:
            raise ValueError("At least one reader must be specified with "
                             "--readers.")
        logger.info("Updating: %s" % str(only_include_readers))
    else:
        logger.info("Updating all readers.")

    # Get the AWS clients.
    s3 = boto3.client('s3')
    cb = boto3.client('codebuild')

    for rc in get_reader_classes():
        if only_include_readers and rc.name not in only_include_readers:
            logger.info("%s not included. Skipping." % rc.name)
            continue

        # Put the latest dockerfile etc on s3
        zip_output, arg_list = make_zip_package(rc)
        if zip_output is None:
            continue
        s3_key = ('indra-db/{rdr}-dockerfile/{rdr}-autogen.zip'
                  .format(rdr=rc.name.lower()))
        logger.info("Writing %s to s3." % s3_key)
        s3.put_object(Bucket='bigmech', Key=s3_key, Body=zip_output)

        # Trigger the builds. ENV vars are overwritten based on CLI input.
        # ex: "--indra_branch dev" will set the env var INDRA_BRANCH=dev
        env_overrides = []
        for arg in arg_list:
            cli_arg = '--%s' % arg.lower()
            if cli_arg in argv:
                cli_idx = argv.index(cli_arg)
                env_overrides.append({'name': arg,
                                      'value': argv[cli_idx + 1],
                                      'type': 'PLAINTEXT'})

        logger.info("Triggering build for %s with env overrides:\n%s"
                    % (rc.name, env_overrides))
        project_name = 'indra_%s_reading_docker' % rc.name.lower()
        try:
            cb.start_build(projectName=project_name,
                           environmentVariablesOverride=env_overrides)
        except cb.exceptions.ResourceNotFoundException:
            logger.error("Project %s does not exist on AWS. Cannot trigger "
                         "build." % project_name)
    return
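# Example invocations of this updater (the script name is hypothetical).
# Build-arg overrides are passed space-separated, e.g. '--indra_branch dev'
# sets INDRA_BRANCH=dev in the triggered CodeBuild job:
#
#   python update_reading_images.py --readers reach sparser
#   python update_reading_images.py --readers reach --indra_branch dev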
def main():
    # Load arguments.
    parser = make_parser()
    args = parser.parse_args()
    if args.debug and not args.quiet:
        logger.setLevel(logging.DEBUG)

    # Load the input: either every file in a directory, or a list of paths
    # given one per line in a file.
    if path.isdir(args.input_file):
        file_list = [path.join(args.input_file, fname)
                     for fname in listdir(args.input_file)]
    elif path.isfile(args.input_file):
        with open(args.input_file, 'r') as f:
            file_list = [line.strip() for line in f.readlines()]
    else:
        raise ValueError("File or directory %s does not exist."
                         % args.input_file)
    logger.info("Found %d files." % len(file_list))
    for ftype in ['nxml', 'txt']:
        logger.debug('%d are %s' % (
            len([f for f in file_list if f.endswith(ftype)]), ftype
        ))

    # Select only a sample of the lines, if sample is chosen.
    if args.n_samp is not None:
        file_list = random.sample(file_list, args.n_samp)

    # If a range is specified, only use that range.
    if args.range_str is not None:
        start_idx, end_idx = [int(n) for n in args.range_str.split(':')]
        file_list = file_list[start_idx:end_idx]

    # Create a single base directory
    base_dir = get_dir('run_%s' % ('_and_'.join(args.readers)))

    # Set the verbosity. The quiet argument overrides the verbose argument.
    verbose = args.verbose and not args.quiet

    # Get the reader objects.
    readers = [reader_class(base_dir=base_dir, n_proc=args.n_proc)
               for reader_class in get_reader_classes()
               if reader_class.name.lower() in args.readers]

    # Read the files.
    outputs = read_files(file_list, readers, verbose=verbose)

    # Dump the outputs.
    reading_out_path = path.join(args.output_path, 'readings')
    if args.pickle:
        reading_out_path += '.pkl'
        with open(reading_out_path, 'wb') as f:
            pickle.dump(outputs, f)
    else:
        reading_out_path += '.json'
        dump_readings(outputs, reading_out_path)
    print("Reading outputs stored in %s." % reading_out_path)

    # Generate and dump the statements.
    stmts_dump_path = path.join(args.output_path, 'stmts')
    stmt_gen = (s for rd in outputs
                for s in rd.get_statements(add_metadata=args.add_metadata))
    if args.pickle:
        stmts_dump_path += ".pkl"
        stmts = list(stmt_gen)
        num_stmts = len(stmts)
        with open(stmts_dump_path, 'wb') as f:
            pickle.dump(stmts, f)
    else:
        stmt_jsons = [s.to_json() for s in stmt_gen]
        num_stmts = len(stmt_jsons)
        stmts_dump_path += '.json'
        with open(stmts_dump_path, 'w') as f:
            json.dump(stmt_jsons, f)
    print("Stored %d statements in %s." % (num_stmts, stmts_dump_path))
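# Example invocations of this entry point (the script name is hypothetical;
# -r, -n, -s, and -I come from the generic reading parser, while the output
# path and pickle options are whatever make_parser adds on top of it):
#
#   python read_files.py inputs_dir/ -r reach -n 4 -s 100
#   python read_files.py file_list.txt -r reach sparser -I 0:500 -v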