def __init__(self,
             basename,
             readers,
             group_name=None,
             project_name=None,
             **options):
    """Record the job configuration and set up a BatchMonitor for each
    queue that has at least one of the selected readers."""
    self.basename = basename
    self.group_name = group_name
    self.s3_base, self.job_base = \
        get_s3_and_job_prefixes(basename, group_name)
    # The special keyword 'all' expands to every known reader name.
    if 'all' not in readers:
        self.readers = readers
    else:
        self.readers = [rc.name.lower() for rc in get_reader_classes()]
    self.project_name = project_name
    self.job_lists = {q: [] for q in self._job_queue_dict}
    self.options = options
    self.ids_per_job = None
    self.running = None
    self.submitting = False
    self.monitors = {}
    for q_name, q_readers in self._job_queue_dict.items():
        # Skip queues whose reader set is disjoint from the selection.
        if not any(rdr in q_readers for rdr in self.readers):
            logger.info("Queue %s will not be used, no relevant readers "
                        "selected." % q_name)
            continue
        self.monitors[q_name] = \
            BatchMonitor(q_name, self.job_lists[q_name],
                         self.job_base, self.s3_base)
 def __init__(self, basename, readers, *args, **kwargs):
     """Resolve the reader selection, then defer to the parent class."""
     self.ids_per_job = None
     # The special keyword 'all' expands to every known reader name.
     if 'all' not in readers:
         self.readers = readers
     else:
         self.readers = [rc.name.lower() for rc in get_reader_classes()]
     super(ReadingSubmitter, self).__init__(basename, *args, **kwargs)
Example #3
0
def get_parser(description, input_desc):
    """Get a parser that is generic to reading scripts.

    Parameters
    ----------
    description : str
        A description of the tool, usually about one line long.
    input_desc : str
        A string describing the nature of the input file used by the reading
        tool.

    Returns
    -------
    parser : argparse.ArgumentParser instance
        An argument parser object, to which further arguments can be added.
    """
    parser = ArgumentParser(description=description)
    parser.add_argument(dest='input_file', help=input_desc)
    parser.add_argument(
        '-r',
        '--readers',
        choices=[rc.name.lower() for rc in get_reader_classes()],
        help='List of readers to be used.',
        nargs='+')
    parser.add_argument('-n',
                        '--num_procs',
                        dest='n_proc',
                        help='Select the number of processes to use.',
                        type=int,
                        default=1)
    parser.add_argument(
        '-s',
        '--sample',
        dest='n_samp',
        help='Read a random sample of size N_SAMP of the inputs.',
        type=int)
    parser.add_argument(
        '-I',
        '--in_range',
        dest='range_str',
        help='Only read input lines in the range given as <start>:<end>.')
    parser.add_argument('-v',
                        '--verbose',
                        help='Include output from the readers.',
                        action='store_true')
    parser.add_argument(
        '-q',
        '--quiet',
        help='Suppress most output. Overrides -v and -d options.',
        action='store_true')
    parser.add_argument('-d',
                        '--debug',
                        help='Set the logging to debug level.',
                        action='store_true')
    return parser
Example #4
0
def create_submit_parser():
    """Create the parent ArgumentParser shared by job-submission scripts.

    The parser is built with add_help=False so it can be used as a parent
    of other parsers without a duplicate -h/--help option.
    """
    import argparse
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'basename',
        help='Defines job names and S3 keys')
    parser.add_argument(
        '--group_name',
        help="Indicate what group of jobs this batch is a part of.")
    # Readers may be chosen individually, or all at once via 'all'.
    reader_choices = [rc.name.lower() for rc in get_reader_classes()]
    parser.add_argument(
        '-r', '--readers',
        dest='readers',
        nargs='+',
        default=['all'],
        choices=reader_choices + ['all'],
        help='Choose which reader(s) to use.')
    parser.add_argument(
        '--project',
        help=('Set the project name. Default is DEFAULT_AWS_PROJECT in the '
              'config.'))
    return parser
def get_available_readers():
    """Map each reader class name to its docker-build argument list.

    Readers for which get_docker_file yields no argument list are omitted.
    """
    specs = {}
    for reader_class in get_reader_classes():
        _, arg_list = get_docker_file(reader_class, logging=False)
        if arg_list is not None:
            specs[reader_class.name] = arg_list
    return specs
Example #6
0
def get_parser():
    """Build the command-line parser for a single reading job."""
    parser = ArgumentParser(description=__doc__)

    # Positional arguments: job identity and working locations. Their
    # declaration order is significant and must not change.
    parser.add_argument(dest='basename',
                        help='The name of this specific group of jobs.')
    parser.add_argument(dest='job_name', help='The name of this job.')
    parser.add_argument(
        dest='s3_base',
        type=str,
        help='Specify the s3 prefix. This is also used as a prefix for any '
             'files stored locally.')
    parser.add_argument(dest='out_dir',
                        help='The name of the temporary output directory')

    # Positional mode selectors controlling what is read and extracted.
    parser.add_argument(
        dest='read_mode',
        choices=['all', 'unread', 'none'],
        help=("Set the reading mode. If 'all', read everything, if "
              "'unread', only read content that does not have pre-existing "
              "readings of the same reader and version, if 'none', only "
              "use pre-existing readings. Default is 'unread'."))
    parser.add_argument(
        dest='rslt_mode',
        choices=['all', 'unread', 'none'],
        help=(
            "Choose from which readings to extract results. If 'all', all "
            "readings that are produced or retrieved will be used to produce "
            "results. If 'unread', only produce results from "
            "previously unread content. If 'none', do not produce any "
            "results (only readings will be produced)."))

    # Positional numeric parameters.
    parser.add_argument(dest='num_cores',
                        type=int,
                        help='Select the number of cores on which to run.')
    parser.add_argument(
        dest='start_index',
        type=int,
        help='Select the index of the first pmid in the list to read.')
    parser.add_argument(
        dest='end_index',
        type=int,
        help='Select the index of the last pmid in the list to read.')

    # Optional arguments.
    reader_choices = [rc.name.lower() for rc in get_reader_classes()]
    parser.add_argument(
        '-r', '--readers',
        dest='readers',
        nargs='+',
        choices=reader_choices,
        help='Choose which reader(s) to use.')
    parser.add_argument('--test',
                        action='store_true',
                        help="Use the test database.")
    parser.add_argument(
        '-b', '--batch',
        default=None,
        help="Select the size of batches for the content to be read.")
    return parser
Example #7
0
 def __init__(self, basename, readers, group_name=None, project_name=None,
              **options):
     """Record the job configuration and start a monitor for every queue."""
     self.basename = basename
     self.group_name = group_name
     self.s3_base, self.job_base = \
         get_s3_and_job_prefixes(basename, group_name)
     # The special keyword 'all' expands to every known reader name.
     if 'all' not in readers:
         self.readers = readers
     else:
         self.readers = [rc.name.lower() for rc in get_reader_classes()]
     self.project_name = project_name
     self.job_lists = {q: [] for q in self._job_queue_dict}
     self.options = options
     self.ids_per_job = None
     self.running = None
     # Unlike the filtered variant, every queue gets a monitor here.
     self.monitors = {
         q: BatchMonitor(q, self.job_lists[q], self.job_base, self.s3_base)
         for q in self._job_queue_dict
     }
Example #8
0
def get_readers(*names, **kwargs):
    """Instantiate readers for database reading, optionally by name.

    With no *names* given, every known reader class is instantiated.
    """
    kwargs['ResultClass'] = rdb.DatabaseReadingData
    selected = []
    for reader_class in get_reader_classes():
        if names and reader_class.name not in names:
            continue
        selected.append(reader_class(**kwargs))
    return selected
def main():
    """Upload reader docker packages to S3 and trigger their CodeBuild jobs.

    Command-line usage (argv is parsed by hand, not argparse):
      -h / --help            print help and exit.
      --readers R1 [R2 ...]  limit the update to the named readers.
      --<arg> <value>        override a docker build env var, e.g.
                             ``--indra_branch dev`` sets INDRA_BRANCH=dev.

    Raises
    ------
    ValueError
        If --readers is given with no reader names, or an env-override
        flag is given without a value.
    """
    from sys import argv

    # Provide some help
    if '--help' in argv or '-h' in argv:
        print_help()
        return

    # Allow the user to limit the readers used.
    only_include_readers = []
    if '--readers' in argv:
        next_idx = argv.index('--readers') + 1
        while next_idx < len(argv) and not argv[next_idx].startswith('-'):
            only_include_readers.append(argv[next_idx].upper())
            next_idx += 1
        if not only_include_readers:
            raise ValueError("At least one reader must be specified with "
                             "--readers.")
        logger.info("Updating: %s" % str(only_include_readers))
    else:
        logger.info("Updating all readers.")

    # Get the AWS clients.
    s3 = boto3.client('s3')
    cb = boto3.client('codebuild')

    for rc in get_reader_classes():
        if only_include_readers and rc.name not in only_include_readers:
            logger.info("%s not included. Skipping." % rc.name)
            continue

        # Put the latest dockerfile etc on s3
        zip_output, arg_list = make_zip_package(rc)
        if zip_output is None:
            continue
        s3_key = ('indra-db/{rdr}-dockerfile/{rdr}-autogen.zip'.format(
            rdr=rc.name.lower()))
        logger.info("Writing %s to s3." % s3_key)
        s3.put_object(Bucket='bigmech', Key=s3_key, Body=zip_output)

        # Trigger the builds. ENV vars are overwritten based on CLI input.
        # ex: "--indra_branch dev" will set the env var INDRA_BRANCH=dev.
        # (Values must be space-separated; "--indra_branch=dev" would not
        # match the token lookup below.)
        env_overrides = []
        for arg in arg_list:
            cli_arg = '--%s' % arg.lower()
            if cli_arg in argv:
                cli_idx = argv.index(cli_arg)
                # Guard against a flag given as the last token: the old
                # code would raise a bare IndexError here.
                if cli_idx + 1 >= len(argv):
                    raise ValueError("No value given for %s." % cli_arg)
                env_overrides.append({
                    'name': arg,
                    'value': argv[cli_idx + 1],
                    'type': 'PLAINTEXT'
                })

        logger.info("Triggering build for %s with env overrides:\n%s" %
                    (rc.name, env_overrides))
        project_name = 'indra_%s_reading_docker' % rc.name.lower()
        try:
            cb.start_build(projectName=project_name,
                           environmentVariablesOverride=env_overrides)
        except cb.exceptions.ResourceNotFoundException:
            logger.error("Project %s does not exist on AWS. Cannot trigger "
                         "build." % project_name)
    return
Example #10
0
def main():
    """Read a set of input files with the selected readers and dump results.

    The input (args.input_file) may be a directory, which is expanded to
    its contents, or a regular file listing one path per line. Readings
    and extracted statements are written under args.output_path, either
    pickled or as JSON depending on args.pickle.

    Raises
    ------
    ValueError
        If args.input_file is neither a file nor a directory.
    """
    # Load arguments.
    parser = make_parser()
    args = parser.parse_args()
    if args.debug and not args.quiet:
        logger.setLevel(logging.DEBUG)

    # Load the input file.
    if path.isdir(args.input_file):
        file_list = [path.join(args.input_file, fname)
                     for fname in listdir(args.input_file)]
    elif path.isfile(args.input_file):
        with open(args.input_file, 'r') as f:
            # Iterate the file object directly; readlines() would
            # materialize an intermediate list for no benefit.
            file_list = [line.strip() for line in f]
    else:
        raise ValueError("File or directory %s does not exist."
                         % args.input_file)

    logger.info("Found %d files." % len(file_list))
    for ftype in ['nxml', 'txt']:
        logger.debug('%d are %s' % (
            len([f for f in file_list if f.endswith(ftype)]), ftype
        ))

    # Select only a sample of the lines, if sample is chosen.
    if args.n_samp is not None:
        file_list = random.sample(file_list, args.n_samp)

    # If a range is specified, only use that range.
    if args.range_str is not None:
        start_idx, end_idx = [int(n) for n in args.range_str.split(':')]
        file_list = file_list[start_idx:end_idx]

    # Create a single base directory
    base_dir = get_dir('run_%s' % ('_and_'.join(args.readers)))

    # Set the verbosity. The quiet argument overrides the verbose argument.
    verbose = args.verbose and not args.quiet

    # Get the readers objects.
    readers = [reader_class(base_dir=base_dir, n_proc=args.n_proc)
               for reader_class in get_reader_classes()
               if reader_class.name.lower() in args.readers]

    # Read the files.
    outputs = read_files(file_list, readers, verbose=verbose)

    # Dump the outputs
    reading_out_path = path.join(args.output_path, 'readings')
    if args.pickle:
        reading_out_path += '.pkl'
        with open(reading_out_path, 'wb') as f:
            pickle.dump(outputs, f)
    else:
        reading_out_path += '.json'
        dump_readings(outputs, reading_out_path)
    print("Reading outputs stored in %s." % reading_out_path)

    # Generate and dump the statements.
    stmts_dump_path = path.join(args.output_path, 'stmts')
    stmt_gen = (s for rd in outputs
                for s in rd.get_statements(add_metadata=args.add_metadata))
    if args.pickle:
        stmts_dump_path += ".pkl"
        # These are Statement objects, not JSON dicts; the previous name
        # `stmts_json` in this branch was misleading.
        stmts = list(stmt_gen)
        num_stmts = len(stmts)
        with open(stmts_dump_path, 'wb') as f:
            pickle.dump(stmts, f)
    else:
        stmt_jsons = [s.to_json() for s in stmt_gen]
        num_stmts = len(stmt_jsons)
        stmts_dump_path += '.json'
        with open(stmts_dump_path, 'w') as f:
            json.dump(stmt_jsons, f)
    print("Stored %d statements in %s." % (num_stmts, stmts_dump_path))