import os
import re

import arvados
# `errors` is assumed to be provided by the surrounding crunch script and to
# define InvalidArgumentError and FileAccessError.


def mount_gatk_gvcf_inputs(inputs_param="inputs"):
    # Get input gVCFs for this task
    print "Mounting task input collection"
    if inputs_param in arvados.current_task()['parameters']:
        inputs_dir = arvados.get_task_param_mount(inputs_param)
    else:
        inputs_dir = arvados.get_job_param_mount(inputs_param)

    # Sanity check input gVCFs
    input_gvcf_files = []
    for f in arvados.util.listdir_recursive(inputs_dir):
        if re.search(r'\.vcf\.gz$', f):
            input_gvcf_files.append(os.path.join(inputs_dir, f))
        elif re.search(r'\.tbi$', f):
            pass
        elif re.search(r'\.interval_list$', f):
            pass
        else:
            print "WARNING: collection contains unexpected file %s" % f
    if len(input_gvcf_files) == 0:
        raise errors.InvalidArgumentError(
            "Expected one or more .vcf.gz files in collection (found 0 while recursively searching %s)"
            % inputs_dir)

    # Ensure we can read the gVCF files and that they each have an index
    for gvcf_file in input_gvcf_files:
        if not os.access(gvcf_file, os.R_OK):
            raise errors.FileAccessError("gVCF file not readable: %s" %
                                         gvcf_file)

        # Ensure we have corresponding .tbi index and can read it as well
        (gvcf_file_base, gvcf_file_ext) = os.path.splitext(gvcf_file)
        assert (gvcf_file_ext == ".gz")
        tbi_file = gvcf_file_base + ".gz.tbi"
        if not os.access(tbi_file, os.R_OK):
            tbi_file = gvcf_file_base + ".tbi"
            if not os.access(tbi_file, os.R_OK):
                raise errors.FileAccessError(
                    "No readable gVCF index file for gVCF file: %s" %
                    gvcf_file)
    return input_gvcf_files
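# Usage sketch (not in the original; names are illustrative): collect the
# mounted gVCFs and pass them to a GATK-style command line, which takes one
# --variant flag per input file.
gvcfs = mount_gatk_gvcf_inputs(inputs_param="inputs")
variant_args = []
for gvcf in gvcfs:
    variant_args.extend(["--variant", gvcf])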
Example #2
def get_file_path(parameter, regex):
    """
    Return the path to the file whose name matches `regex` in the collection
    named by script parameter `parameter`.

    Replaces boilerplate like:

        ref_collection_id = this_job['script_parameters']['reference_index']
        ref_collection = coll(ref_collection_id)
        for name in ref_collection:
            if not re.search(r'.*f(ast)?a(\.gz)?$', name):
                continue
            ref_file = name
        ref_path = os.path.join(arvados.get_job_param_mount("reference_index"), ref_file)
    """
    collection_id = arvados.current_job()['script_parameters'][parameter]
    collection_handle = Collection(collection_id)
    out_file = None
    for name in collection_handle:
        if not re.search(regex, name):
            continue
        out_file = name
    if out_file is None:
        raise ValueError("no file matching %r in collection %s" % (regex, collection_id))
    return os.path.join(arvados.get_job_param_mount(parameter), out_file)
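# Usage sketch mirroring the docstring example (the parameter name is
# illustrative): locate the FASTA file in the 'reference_index' collection.
ref_path = get_file_path('reference_index', r'.*f(ast)?a(\.gz)?$')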
Example #3
def spawn_new_task_per_bed_line(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates one new task per BED line in each file of the collection whose
    name matches the regex.
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex, name):
            continue
        name_path = os.path.join(arvados.get_job_param_mount(script_parameter), name)
        with open(name_path, 'r') as bed_fh:
            bed_lines = [line.split() for line in bed_fh]

        # Start the biggest regions first
        def region_size(fields):
            return int(fields[2]) - int(fields[1])

        for bed_line in sorted(bed_lines, key=region_size, reverse=True):
            print bed_line
            new_task_attrs = {
                'job_uuid': arvados.current_job()['uuid'],
                'created_by_job_task_uuid': arvados.current_task()['uuid'],
                'sequence': if_sequence + 1,
                'parameters': {
                    'chrom': bed_line[0],
                    'start': bed_line[1],
                    'end': bed_line[2]
                }
            }
            arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}
                                         ).execute()
        exit()
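# Usage sketch for a crunch script (not in the original; the 'regions'
# parameter name is illustrative): queue one task per BED line, then read
# this task's region from its parameters in the spawned (sequence 1) tasks.
spawn_new_task_per_bed_line('regions', r'\.bed$', if_sequence=0, and_end_task=True)
this_task = arvados.current_task()
chrom = this_task['parameters']['chrom']
start = int(this_task['parameters']['start'])
end = int(this_task['parameters']['end'])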
Example #4
import imp
import time

import arvados
import numpy as np

########################################################################################################################
# Read constants
NUM_RETRIES = int(arvados.getjobparam('num-retries'))
assert NUM_RETRIES > 0, "'num-retries' must be strictly positive"

antigen_type = str(arvados.getjobparam('antigen-type'))
########################################################################################################################
# Set up the output collection and a timing log file to write out to
out = arvados.collection.Collection(num_retries=NUM_RETRIES)
time_logging_fh = out.open('time_log.txt', 'w')
########################################################################################################################
# Load settings
t0 = time.time()
settings = imp.load_source('settings', arvados.get_job_param_mount('settings'))
t1 = time.time()
time_logging_fh.write('Loading settings took %fs\n' % (t1 - t0))
########################################################################################################################
# Get path lengths and path integers
cr = arvados.CollectionReader(arvados.getjobparam('path-lengths'),
                              num_retries=NUM_RETRIES)
t0 = time.time()
with cr.open("path_integers.npy", 'r') as f:
    path_integers = np.load(f)
t1 = time.time()
with cr.open("path_lengths.npy", 'r') as f:
    path_lengths = np.load(f)
t2 = time.time()
time_logging_fh.write('Loading path integers took %fs\n' % (t1 - t0))
time_logging_fh.write('Loading path lengths took %fs\n' % (t2 - t1))
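# Epilogue sketch (not in the original excerpt), assuming the Collection
# API's save_new(); the collection name is illustrative. Close the log
# handle, then save the output collection.
time_logging_fh.close()
out.save_new(name='time-logs')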
Example #5
    # metadata:
    #   batch: your-arbitrary-batch-name
    algorithm:
      aligner: bwa
      mark_duplicates: true
      recalibrate: false
      realign: false
      variantcaller: freebayes
      platform: illumina
      quality_format: Standard
      # for targeted projects, set the region
      # variant_regions: /path/to/your.bed
''')

os.unlink("/usr/local/share/bcbio-nextgen/gemini_data")
os.symlink(arvados.get_job_param_mount("gemini_data"),
           "/usr/local/share/bcbio-nextgen/gemini_data")

os.chdir(arvados.current_task().tmpdir)

rcode = subprocess.call([
    "bcbio_nextgen.py", "--workflow", "template",
    "/tmp/crunch-job/freebayes-variant.yaml", "project1",
    subst.do_substitution(p, "$(file $(R1))"),
    subst.do_substitution(p, "$(file $(R2))")
])
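# Sketch (not in the original excerpt): fail fast if the bcbio template step
# returned a non-zero exit code, instead of silently continuing.
if rcode != 0:
    raise RuntimeError("bcbio_nextgen.py template failed with exit code %d" % rcode)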

os.chdir("project1/work")

os.symlink("/usr/local/share/bcbio-nextgen/galaxy/tool-data", "tool-data")
Example #6
                NUM_PHASES_TMP += 1
            elif not QUALITY:
                NUM_PHASES_TMP += 1
        NUM_CALLSETS += 1
t1 = time.time()
NUM_PHASES = NUM_PHASES_TMP/NUM_CALLSETS
assert float(NUM_PHASES) == NUM_PHASES_TMP/float(NUM_CALLSETS), "Unequal number of phases per callset"
time_logging_fh.write("Cursory reading of 'pythonic-tiling-callset-files' took %fs\n" % (t1-t0))

# Get callset phenotype files
# Unable to use the Collection API here: the csv/json read functions need plain file paths
t0 = time.time()
if arvados.getjobparam('callset-phenotypes') is None:
    phenotype_file_paths = []
else:
    phenotype_path = arvados.get_job_param_mount('callset-phenotypes')
    for root, dirs, files in os.walk(phenotype_path):
        assert len(dirs) == 0, "Expects 'callset-phenotypes' to be a flat directory"
        phenotype_file_paths = [os.path.join(root, f) for f in files]
t1 = time.time()
time_logging_fh.write("Getting job param mount (and file paths) of 'callset-phenotypes' took %fs\n" % (t1-t0))
########################################################################################################################
population, subjects, callset_names, size = fns.get_population(
    ACCEPTED_PATHS,
    path_integers,
    path_lengths,
    NUM_CALLSETS,
    NUM_PHASES,
    phenotype_file_paths,
    callset_collection_reader,
    CALLSET_NAME_REGEX,
Example #7
    #   batch: your-arbitrary-batch-name
    algorithm:
      aligner: bwa
      mark_duplicates: true
      recalibrate: false
      realign: false
      variantcaller: freebayes
      platform: illumina
      quality_format: Standard
      # for targeted projects, set the region
      # variant_regions: /path/to/your.bed
"""
    )

os.unlink("/usr/local/share/bcbio-nextgen/gemini_data")
os.symlink(arvados.get_job_param_mount("gemini_data"), "/usr/local/share/bcbio-nextgen/gemini_data")

os.chdir(arvados.current_task().tmpdir)

rcode = subprocess.call(
    [
        "bcbio_nextgen.py",
        "--workflow",
        "template",
        "/tmp/crunch-job/freebayes-variant.yaml",
        "project1",
        subst.do_substitution(p, "$(file $(R1))"),
        subst.do_substitution(p, "$(file $(R2))"),
    ]
)
Example #8
########################################################################################################################
# Read constants
NUM_RETRIES = int(arvados.getjobparam('num-retries'))
assert NUM_RETRIES > 0, "'num-retries' must be strictly positive"

antigen_type = str(arvados.getjobparam('antigen-type'))
########################################################################################################################
# Set up the output collection and log files to write out to
out = arvados.collection.Collection(num_retries=NUM_RETRIES)
time_logging_fh = out.open('time_log.txt', 'w')
info_fh = out.open('log.txt', 'w')
########################################################################################################################
# Load settings
t0 = time.time()
settings = imp.load_source('settings', arvados.get_job_param_mount('settings'))
t1 = time.time()
time_logging_fh.write('Loading settings took %fs\n' % (t1 - t0))
########################################################################################################################
# Parallelize based on settings
def one_task_per_classifier(num_classifiers_to_parameterize, if_sequence=0, and_end_task=True):
    if if_sequence != arvados.current_task()['sequence']:
        return
    api_client = arvados.api('v1')
    for i in range(num_classifiers_to_parameterize):
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'classifier_index': i,