Example #1
def table_to_job_data(path, input_file_param_util, output_file_param_util):
    """Parses a table of parameters from a TSV.

  Args:
    path: Path to a TSV file with the first line specifying the environment
    variables, input, and output parameters as column headings. Subsequent
    lines specify parameter values, one row per job.
    input_file_param_util: Utility for producing InputFileParam objects.
    output_file_param_util: Utility for producing OutputFileParam objects.

  Returns:
    job_data: an array of records, each containing a dictionary of
    'envs', 'inputs', and 'outputs' that defines the set of parameters and data
    for each job.

  Raises:
    ValueError: If no job records were provided
  """
    job_data = []

    param_file = dsub_util.load_file(path)

    # Read the first line and extract the fieldnames
    header = param_file.readline().rstrip()
    job_params = parse_job_table_header(header, input_file_param_util,
                                        output_file_param_util)
    reader = csv.reader(param_file, delimiter='\t')

    # Build a list of records from the parsed input table
    for row in reader:
        if len(row) != len(job_params):
            dsub_util.print_error(
                'Unexpected number of fields %s vs %s: line %s' %
                (len(row), len(job_params), reader.line_num))
            # Skip malformed rows; indexing a short row below would raise
            # an IndexError.
            continue

        # Each row can contain "envs", "inputs", "outputs"
        envs = []
        inputs = []
        outputs = []

        for i, param in enumerate(job_params):
            if isinstance(param, EnvParam):
                envs.append(EnvParam(param.name, row[i]))

            elif isinstance(param, InputFileParam):
                docker_path, remote_uri = input_file_param_util.parse_uri(
                    row[i], param.recursive)
                inputs.append(
                    InputFileParam(param.name, docker_path, remote_uri,
                                   param.recursive))

            elif isinstance(param, OutputFileParam):
                docker_path, remote_uri = output_file_param_util.parse_uri(
                    row[i], param.recursive)
                outputs.append(
                    OutputFileParam(param.name, docker_path, remote_uri,
                                    param.recursive))

        job_data.append({'envs': envs, 'inputs': inputs, 'outputs': outputs})

    # Ensure that there are jobs to execute (and not just a header)
    if not job_data:
        raise ValueError('No jobs found in %s' % path)

    return job_data
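For context, here is a self-contained sketch of the header-then-rows pattern this function follows. It is illustrative only: the '--env NAME' / '--input NAME' / '--output NAME' header syntax is an assumption inferred from the parameter classes above, and sketch_table_to_job_data is a hypothetical stand-in for the dsub helpers (dsub_util, parse_job_table_header) that this excerpt depends on.

import csv
import io

# Inline TSV standing in for a file on disk; the header syntax is assumed.
TSV = ('--env SAMPLE\t--input VCF\t--output RESULTS\n'
       'NA12878\tgs://bucket/NA12878.vcf\tgs://bucket/out/NA12878/*\n'
       'NA12891\tgs://bucket/NA12891.vcf\tgs://bucket/out/NA12891/*\n')

def sketch_table_to_job_data(param_file):
    # First line: column headings naming each parameter's kind and name.
    header = param_file.readline().rstrip().split('\t')
    columns = [tuple(col.lstrip('-').split(' ', 1)) for col in header]

    # Remaining lines: one row of parameter values per job.
    job_data = []
    for row in csv.reader(param_file, delimiter='\t'):
        record = {'envs': [], 'inputs': [], 'outputs': []}
        for (kind, name), value in zip(columns, row):
            record[kind + 's'].append((name, value))
        job_data.append(record)

    if not job_data:
        raise ValueError('No jobs found')
    return job_data

for job in sketch_table_to_job_data(io.StringIO(TSV)):
    print(job)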
Example #2
def tasks_file_to_job_data(tasks, input_file_param_util,
                           output_file_param_util):
    """Parses task parameters from a TSV.

  Args:
    tasks: Dict containing the path to a TSV file and task numbers to run
    variables, input, and output parameters as column headings. Subsequent
    lines specify parameter values, one row per job.
    input_file_param_util: Utility for producing InputFileParam objects.
    output_file_param_util: Utility for producing OutputFileParam objects.

  Returns:
    job_data: an array of records, each containing a dictionary of
    'envs', 'inputs', and 'outputs' that defines the set of parameters and data
    for each job.

  Raises:
    ValueError: If no job records were provided
  """
    job_data = []

    path = tasks['path']
    task_min = tasks.get('min')
    task_max = tasks.get('max')

    # Load the file and set up a Reader that tokenizes the fields
    param_file = dsub_util.load_file(path)
    reader = csv.reader(param_file, delimiter='\t')

    # Read the first line and extract the parameters
    header = next(reader)
    job_params = parse_tasks_file_header(header, input_file_param_util,
                                         output_file_param_util)

    # Build a list of records from the parsed input file
    for row in reader:
        # Tasks are numbered starting at 1 and since the first line of the TSV
        # file is a header, the first task appears on line 2.
        task_id = reader.line_num - 1
        if task_min and task_id < task_min:
            continue
        if task_max and task_id > task_max:
            continue

        if len(row) != len(job_params):
            dsub_util.print_error(
                'Unexpected number of fields %s vs %s: line %s' %
                (len(row), len(job_params), reader.line_num))
            # Skip malformed rows; indexing a short row below would raise
            # an IndexError.
            continue

        # Each row can contain "envs", "inputs", "outputs"
        envs = []
        inputs = []
        outputs = []

        for i, param in enumerate(job_params):
            if isinstance(param, EnvParam):
                envs.append(EnvParam(param.name, row[i]))

            elif isinstance(param, InputFileParam):
                docker_path, remote_uri = input_file_param_util.parse_uri(
                    row[i], param.recursive)
                inputs.append(
                    InputFileParam(param.name, row[i], docker_path, remote_uri,
                                   param.recursive))

            elif isinstance(param, OutputFileParam):
                docker_path, remote_uri = output_file_param_util.parse_uri(
                    row[i], param.recursive)
                outputs.append(
                    OutputFileParam(param.name, row[i], docker_path,
                                    remote_uri, param.recursive))

        job_data.append({
            'task_id': task_id,
            'envs': envs,
            'inputs': inputs,
            'outputs': outputs
        })

    # Ensure that there are jobs to execute (and not just a header)
    if not job_data:
        raise ValueError('No tasks added from %s' % path)

    return job_data
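To make the task-range bookkeeping concrete, here is a minimal standalone sketch of the min/max filtering above. The inline TSV, the tasks values, and the single '--env' column are illustrative assumptions.

import csv
import io

# Task IDs start at 1 on the line after the header, so line_num - 1 is the
# task number; 'min'/'max', when present, bound which rows become tasks.
TSV = '--env SAMPLE\nA\nB\nC\nD\n'
tasks = {'path': 'tasks.tsv', 'min': 2, 'max': 3}  # illustrative values

reader = csv.reader(io.StringIO(TSV), delimiter='\t')
next(reader)  # consume the header row (line 1)
selected = []
for row in reader:
    task_id = reader.line_num - 1
    if tasks.get('min') and task_id < tasks['min']:
        continue
    if tasks.get('max') and task_id > tasks['max']:
        continue
    selected.append((task_id, row))

print(selected)  # [(2, ['B']), (3, ['C'])]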