Example #1
import dxpy


def main(**job_inputs):
    job_outputs = {}
    # Resolve each DNAnexus link to a GTable ID, then cache the table
    # descriptions and column names for the input checks below.
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    print(reads_inputs)
    print(reads_ids)
    print(reads_descriptions)
    print(reads_columns)

    all_reads_have_FlowReads_tag = all('FlowReads' in desc['types'] for desc in reads_descriptions.values())
    all_reads_have_LetterReads_tag = all('LetterReads' in desc['types'] for desc in reads_descriptions.values())
    reads_have_names = any('name' in columns for columns in reads_columns.values())
    reads_are_paired = any('sequence2' in columns for columns in reads_columns.values())
    reads_have_qualities = any('quality' in columns for columns in reads_columns.values())
    if reads_have_qualities:
        # Either all reads tables carry qualities or none do; a mixture is unsupported.
        assert all('quality' in columns for columns in reads_columns.values())
    if reads_are_paired:
        all_paired = all('sequence2' in columns for columns in reads_columns.values())
        if not all_paired:
            raise dxpy.AppError("Reads to be mapped must be either all paired or all unpaired. App input contains both paired and unpaired reads.")

    if job_inputs["algorithm"] == "bwasw":
        assert not reads_are_paired  # bwasw does not support paired inputs

    assert all_reads_have_FlowReads_tag or all_reads_have_LetterReads_tag

    reference_record_types = dxpy.describe(job_inputs['reference'])['types']
    if "BwaLetterContigSetV3" in reference_record_types:
        input_ref_is_indexed = True
    elif "ContigSet" in reference_record_types:
        input_ref_is_indexed = False
    else:
        raise dxpy.ProgramError("Unrecognized object passed as reference. It must be a ContigSet record or a BwaLetterContigSetV3 file")

    if input_ref_is_indexed:
        job_outputs['indexed_reference'] = job_inputs['reference']
    else:
        # Reuse a previously built BWA index linked to this reference if one is
        # findable; otherwise build the index now. Any match will do, so take the
        # first result from the search.
        found_cached_idx = False
        for result in dxpy.find_data_objects(classname='record',
                                             typename='BwaLetterContigSetV3',
                                             link=job_inputs['reference']['$dnanexus_link']):
            job_outputs['indexed_reference'] = dxpy.dxlink(result['id'])
            found_cached_idx = True
            break
        if not found_cached_idx:
            job_outputs['indexed_reference'] = dxpy.dxlink(make_indexed_reference(job_inputs))

    table_columns = [("sequence", "string")]
    if reads_have_names:
        table_columns.append(("name", "string"))
    if reads_have_qualities:
        table_columns.append(("quality", "string"))
    table_columns.extend([("status", "string"),
                          ("chr", "string"),
                          ("lo", "int32"),
                          ("hi", "int32"),
                          ("negative_strand", "boolean"),
                          ("error_probability", "uint8"),
                          ("qc_fail", "boolean"),
                          ("duplicate", "boolean"),
                          ("cigar", "string"),
                          ("template_id", "int64"),
                          ("read_group", "int32")])

    # Optional SAM fields: RG BC XC XT NM CM XN SM AM XM X0 X1 XG MD XA

    if reads_are_paired:
        table_columns.extend([("mate_id", "int32"), # TODO: int8
                              ("status2", "string"),
                              ("chr2", "string"),
                              ("lo2", "int32"),
                              ("hi2", "int32"),
                              ("negative_strand2", "boolean"),
                              ("proper_pair", "boolean")])

    if all_reads_have_FlowReads_tag:
        table_columns.extend([("flowgram", "string"),
                              ("flow_indices", "string"),
                              ("clip_qual_left", "int32"),
                              ("clip_qual_right", "int32"),
                              ("clip_adapter_left", "int32"),
                              ("clip_adapter_right", "int32")])

    table_columns.extend([("sam_field_BC", "string"),
                          ("sam_field_XC", "int32"),
                          ("sam_field_XT", "string"),
                          ("sam_field_NM", "int32"),
                          ("sam_field_CM", "int32"),
                          ("sam_field_XN", "int32"),
                          ("sam_field_SM", "int32"),
                          ("sam_field_AM", "int32"),
                          ("sam_field_XM", "int32"),
                          ("sam_field_X0", "int32"),
                          ("sam_field_X1", "int32"),
                          ("sam_field_XG", "int32"),
                          ("sam_field_MD", "string"),
                          ("sam_field_XA", "string"),
                          ("sam_optional_fields", "string")])

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, col_type) for name, col_type in table_columns]

    # Build the output mappings table with a genomic range index so rows can be
    # queried by (chr, lo, hi).
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    t = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])

    if input_ref_is_indexed:
        original_contigset = dxpy.get_details(job_inputs['reference'])['original_contigset']
    else:
        original_contigset = job_inputs['reference']
    t.set_details({'original_contigset': original_contigset})

    t.add_types(["LetterMappings", "Mappings", "gri"])

    # Name the output table
    if 'output_name' in job_inputs:
        t.rename(job_inputs['output_name'])
    else:
        first_reads_name = dxpy.DXGTable(job_inputs['reads'][0]).describe()['name']
        contig_set_name = dxpy.describe(job_inputs['reference'])['name']
        # If we're working on an indexed reference, we're not guaranteed access to
        # the original contigset, so strip the " (index...)" suffix from its name.
        if input_ref_is_indexed:
            contig_set_name = contig_set_name.split(' (index')[0]
        t.rename(first_reads_name + " mapped to " + contig_set_name)

    # Record how many paired or single reads are in each input reads table
    read_group_lengths = []
    for reads_id in reads_ids:
        current_length = reads_descriptions[reads_id]["length"]
        if 'sequence2' in reads_columns[reads_id]:
            num_pairs = current_length
            num_singles = 0
        else:
            num_pairs = 0
            num_singles = current_length

        read_group_lengths.append({"num_singles": num_singles, "num_pairs": num_pairs})

    details = t.get_details()
    details['read_groups'] = read_group_lengths
    t.set_details(details)

    # Compute each reads table's starting row within the virtual concatenation
    # of all input tables; row_cursor ends up as the total number of reads rows.
    row_offsets = []
    row_cursor = 0
    for reads_id in reads_ids:
        row_offsets.append(row_cursor)
        row_cursor += reads_descriptions[reads_id]["length"]
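    # For example (hypothetical sizes): input tables of 1,000,000 and 250,000
    # rows give row_offsets == [0, 1000000] and row_cursor == 1250000.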

    chunk_size = job_inputs["chunk_size"]

    map_job_inputs = job_inputs.copy()
    map_job_inputs["row_offsets"] = row_offsets
    map_job_inputs["num_rows"] = chunk_size
    map_job_inputs["table_id"] = t.get_id()
    map_job_inputs["indexed_reference"] = job_outputs['indexed_reference']

    postprocess_job_inputs = job_inputs.copy()
    postprocess_job_inputs["table_id"] = t.get_id()

    # Scatter: launch one "map" subjob per chunk of rows, then wire each
    # subjob's outputs into the postprocess job via job-based references.
    for start_row in range(0, row_cursor, chunk_size):
        map_job_inputs["start_row"] = start_row
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        print("Launched map job with", map_job_inputs)
        postprocess_job_inputs["chunk%dresult" % start_row] = {'job': map_job.get_id(), 'field': 'ok'}
        postprocess_job_inputs["chunk%ddebug" % start_row] = {'job': map_job.get_id(), 'field': 'debug'}

    postprocess_job = dxpy.new_dxjob(postprocess_job_inputs, "postprocess")

    job_outputs['mappings'] = {'job': postprocess_job.get_id(), 'field': 'mappings'}

    print "MAIN OUTPUT:", job_outputs
    return job_outputs
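
The scatter/gather wiring above is standard dxpy job chaining: dxpy.new_dxjob launches a subjob running another entry point of the same app, and a downstream job can accept {'job': ..., 'field': ...} job-based references as inputs, which the platform resolves once the upstream jobs finish. A minimal sketch of the same pattern, with hypothetical entry points process_one and gather and a made-up work_items input:

import dxpy

def launch_scatter_gather(work_items):
    gather_inputs = {}
    for i, item in enumerate(work_items):
        # One subjob per work item; "process_one" is a hypothetical entry point.
        subjob = dxpy.new_dxjob({"item": item}, "process_one")
        # Job-based reference: resolved by the platform when the subjob finishes.
        gather_inputs["result%d" % i] = {"job": subjob.get_id(), "field": "output"}
    gather_job = dxpy.new_dxjob(gather_inputs, "gather")
    # Propagate the gather job's output as this job's output, again by reference.
    return {"combined": {"job": gather_job.get_id(), "field": "combined"}}
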
Example #2
import subprocess
import time
from multiprocessing import cpu_count

import dxpy


def map(**job_inputs):  # entry-point name comes from the app spec; it shadows the builtin map()
    print("Map:", job_inputs)
    job_outputs = {}
    times = [('start', time.time())]
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    reads_are_paired = any('sequence2' in columns for columns in reads_columns.values())

    times.append(('preamble', time.time()))

    dxpy.download_dxfile(dxpy.get_details(job_inputs["indexed_reference"])['index_archive'], "reference.tar.xz")
    times.append(('download reference', time.time()))

    # TODO: Async everything below
    # subprocess.check_call("pixz -d reference.tar.xz && tar -xf reference.tar", shell=True)
    subprocess.check_call("tar -xJf reference.tar.xz", shell=True)

    if job_inputs["algorithm"] == "bwasw":
        bwa_algorithm = "bwasw"
    else:
        # algorithm = aln or auto. TODO: check what auto should do
        bwa_algorithm = "aln"

    aln_opts, sampe_opts, sw_opts, samse_opts = parse_bwa_cmd_opts(job_inputs)

    # Set BWA's thread-count option (-t) to the apparent number of CPUs.
    aln_opts += " -t " + str(cpu_count())
    sw_opts += " -t " + str(cpu_count())

    row_offsets = job_inputs['row_offsets']   # starting row of each reads table within the virtual concatenation of all tables
    start_row = job_inputs['start_row']       # start of this chunk, in concatenated-row coordinates
    num_rows = job_inputs['num_rows']         # number of rows to process in this chunk
    subjobs = []
    for i, reads_id in enumerate(reads_ids):
        reads_length = reads_descriptions[reads_id]["length"]
        read_group = i
        # Emit a subjob for each reads table that overlaps this chunk.
        # Two half-open intervals [A_start, A_end) and [B_start, B_end) overlap
        # iff (A_start < B_end) and (A_end > B_start), where:
        #   A (this reads table): A_start = row_offsets[i], A_end = row_offsets[i] + reads_length
        #   B (this chunk):       B_start = start_row,      B_end = start_row + num_rows
        if row_offsets[i] < (start_row + num_rows) and (row_offsets[i] + reads_length) > start_row:
            # Clip the chunk to this table, converting to table-relative row numbers.
            rel_start = max(start_row - row_offsets[i], 0)
            rel_end = min(reads_length, start_row - row_offsets[i] + num_rows)  # half-open interval: [rel_start, rel_end)
            subjobs.append({'reads_id': reads_id, 'start_row': rel_start, 'end_row': rel_end, 'read_group': read_group})
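    # Worked example (hypothetical sizes): with row_offsets == [0, 1000] and a
    # chunk covering rows [500, 1500), table 0 contributes relative rows
    # [500, 1000) and table 1 contributes [0, 500).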

    times.append(('parse parameters', time.time()))
    print('SUBJOBS:', subjobs)

    for subchunk_id, subjob in enumerate(subjobs):
        reads_id = subjob['reads_id']
        # TODO: FlowReads trimming support
        if 'quality' in reads_columns[reads_id]:
            if reads_are_paired:
                reads_file1 = "input"+str(subchunk_id)+"_1.fastq"
                reads_file2 = "input"+str(subchunk_id)+"_2.fastq"
                write_reads_to_fastq(reads_id, reads_file1, seq_col='sequence', qual_col='quality', start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fastq(reads_id, reads_file2, seq_col='sequence2', qual_col='quality2', start_row=subjob['start_row'], end_row=subjob['end_row'])
                times.append(('fetch reads (subchunk %d)' % subchunk_id, time.time()))
                # The 'run alignment' checkpoint is recorded once at the end of the
                # loop body; recording it here too would overwrite that measurement.
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
            else:
                reads_file1 = "input"+str(subchunk_id)+".fastq"
                write_reads_to_fastq(reads_id, reads_file1, start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
        else:  # No qualities; fall back to plain FASTA
            if reads_are_paired:
                reads_file1 = "input"+str(subchunk_id)+"_1.fasta"
                reads_file2 = "input"+str(subchunk_id)+"_2.fasta"
                write_reads_to_fasta(reads_id, reads_file1, seq_col='sequence', start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fasta(reads_id, reads_file2, seq_col='sequence2', start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
            else:
                reads_file1 = "input"+str(subchunk_id)+".fasta"
                write_reads_to_fasta(reads_id, reads_file1, start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)

        times.append(('run alignment (subchunk %d)' % subchunk_id, time.time()))

        cmd = "dx_storeSamAsMappingsTable_bwa"
        cmd += " --alignments '%s.sam'" % reads_file1
        cmd += " --table_id '%s'" % job_inputs["table_id"]
        cmd += " --reads_id '%s'" % reads_id
        cmd += " --start_row %d" % subjob['start_row']
        cmd += " --read_group %d" % subjob['read_group']

        if job_inputs.get('discard_unmapped_rows'):
            cmd += " --discard_unmapped_rows"
        run_shell(cmd)
        times.append(('run table upload (subchunk %d)' % subchunk_id, time.time()))

    job_outputs["ok"] = True

    # Convert the cumulative checkpoints in `times` into per-phase durations.
    timing_report = {}
    for i in range(len(times) - 1):
        timing_report[times[i + 1][0]] = times[i + 1][1] - times[i][1]
    job_outputs["debug"] = {'times': timing_report}
    return job_outputs
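
The checkpoint idiom used here generalizes: append (label, time.time()) after each phase, then take pairwise differences at the end. A minimal self-contained version of that reporting step (the helper name is illustrative, not part of the app):

import time

def phase_durations(checkpoints):
    """Turn [(label, timestamp), ...] checkpoints into {label: seconds since previous checkpoint}."""
    return {label: t - checkpoints[i][1]
            for i, (label, t) in enumerate(checkpoints[1:])}

# Example usage mirroring the map job above:
times = [('start', time.time())]
time.sleep(0.1)  # stand-in for real work
times.append(('phase one', time.time()))
print(phase_durations(times))  # {'phase one': ~0.1}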