def process(input_gtable_id, start_row, end_row, output_gtable_id):
    input_gtable = dxpy.DXGTable(input_gtable_id)

    # Using a context manager here ensures that flush() is called when
    # the block exits, so any rows queued with add_row() are written to
    # the platform.  The mode is set to "a" for "append".

    with dxpy.open_dxgtable(output_gtable_id, mode="a") as output_gtable:

        # The following loop iterates over each row from start_row to
        # end_row (not including end_row).  You can find documentation on
        # other useful GTable methods (such as iterating over a genomic
        # range query with iterate_query_rows) in the dxpy library here:
        # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html

        for row in input_gtable.iterate_rows(start_row, end_row):
            # Fill in code here to perform whatever computation is
            # necessary to process the row and compute the new row.
            #
            # *row* is an array where the first element is the row ID,
            # and the rest of the elements appear in the same order as
            # the GTable's column specification.  You can retrieve the
            # column specifications or names by using
            # input_gtable.get_columns() or input_gtable.get_col_names().

            new_row = []

            # The following line queues up the array new_row as a row
            # of data that should be added to the output GTable.
            # Queued rows will be flushed to the platform periodically.

            output_gtable.add_row(new_row)
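
A minimal sketch of how the placeholder loop body above might be filled in, assuming the output GTable was created with the same column specification as the input (the pass-through computation is hypothetical):

    for row in input_gtable.iterate_rows(start_row, end_row):
        # row[0] is the row ID; row[1:] holds the data columns in
        # column-specification order.  Here they are copied through unchanged.
        new_row = list(row[1:])
        output_gtable.add_row(new_row)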
Example #2
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id == None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)

    if opts.file_name != None:
        fh = open(opts.file_name, "w")
    else:
        fh = sys.stdout

    if 'quality' in mappingsTable.get_col_names():
        outputFastq = True
    else:
        outputFastq = False

    for row in mappingsTable.iterate_rows(want_dict=True):
        if outputFastq:
            writeFastq(row, fh)
        else:
            writeFasta(row, fh)
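
writeFastq and writeFasta are not defined in this listing. A minimal sketch of what they might look like, assuming the table carries name, sequence, and quality columns (the fallback to the row ID is an assumption):

    def writeFastq(row, fh):
        # FASTQ record: @name / sequence / "+" / quality
        fh.write("@" + str(row.get("name", row["__id__"])) + "\n")
        fh.write(row["sequence"] + "\n")
        fh.write("+\n")
        fh.write(row["quality"] + "\n")

    def writeFasta(row, fh):
        # FASTA record: >name / sequence
        fh.write(">" + str(row.get("name", row["__id__"])) + "\n")
        fh.write(row["sequence"] + "\n")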
Example #3
def process(gtable_id, start_row, end_row):
    input_gtable = dxpy.DXGTable(gtable_id)

    # The following loop iterates over each row from start_row to
    # end_row (not including end_row).  You can find documentation on
    # other useful GTable methods (such as iterating over a genomic
    # range query with iterate_query_rows) in the dxpy library here:
    # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html

    for row in input_gtable.iterate_rows(start_row, end_row):
        # Fill in code here to perform whatever computation is
        # necessary to process the row.
        #
        # *row* is an array where the first element is the row ID, and
        # the rest of the elements appear in the same order as the
        # GTable's column specification.  You can retrieve the column
        # specifications or names by using
        # input_gtable.get_columns() or input_gtable.get_col_names().

        pass

    # If your subproblem is to compute some value over the rows it was
    # given, you can return it here:

    return { "output": "placeholder value" }
Example #4
def postprocess(**job_inputs):
    print "Postprocess:", job_inputs
    job_outputs = {}

    # main() (Example #15) names each chunk's debug output "chunk<start_row>debug"
    time_report = {k: v for k, v in job_inputs.iteritems() if re.match(r"chunk\d+debug", k)}

    t = dxpy.DXGTable(job_inputs["table_id"])
    d = t.get_details()
    d['time_report'] = time_report
    t.set_details(d)
    t.close()
    job_outputs['mappings'] = dxpy.dxlink(t)
    return job_outputs
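
Given how main() in Example #15 names these chunk outputs, the stored details might look like this (values illustrative):

    # details['time_report'] == {
    #     "chunk0debug":        {"times": {"preamble": 1.5, ...}},
    #     "chunk10000000debug": {"times": {...}},
    # }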
Example #5
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    try:
        spans = dxpy.DXGTable(kwargs['Spans'])
    except:
        raise dxpy.AppError("Failed to open Spans object for export")

    spans_types = spans.describe()['types']

    if 'Genes' in spans_types:
        export_genes(spans, kwargs['output'])
    else:
        export_generic_bed(spans, kwargs['output'])
def dump_fastqa(reads_ID, output_base):
    if 'sequence2' in dxpy.DXGTable(reads_ID).get_col_names():
        paired = True
    else:
        paired = False

    if paired:
        run_shell(" ".join([
            "dx-reads-to-fastq", reads_ID, "--output " + output_base + "_1",
            "--output2 " + output_base + "_2"
        ]))
    else:
        run_shell(" ".join(
            ["dx-reads-to-fastq", reads_ID, "--output " + output_base + "_1"]))

    if paired:
        return output_base + "_1", output_base + "_2"
    else:
        return output_base + "_1", None
def check_reads(reads_tables):
    # validate that tables contain data that can be used together (all paired or all unpaired, etc)

    if len(reads_tables) == 0:
        raise dxpy.AppError("Please enter at least one Reads table as input")

    single = 0
    paired = 0

    for table in reads_tables:
        if 'sequence2' in dxpy.DXGTable(table).get_col_names():
            paired = paired + 1
        else:
            single = single + 1

    if single > 0 and paired > 0:
        raise dxpy.AppError(
            "Found both single and paired-end reads.  Please only input one type."
        )

    return
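
A usage sketch tying the two helpers above together (reads_tables is a hypothetical list of Reads table IDs):

    check_reads(reads_tables)  # all tables are now known to be one type
    for i, reads_id in enumerate(reads_tables):
        left, right = dump_fastqa(reads_id, "reads_%d" % i)
        # 'right' is None when the reads are unpaired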
Example #8
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(
            dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'.  Please install it to enable contaminant mapping"
        )

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({
        "reads": Reads,
        "reference": Contig,
        "discard_unmapped_rows": True,
        "chunk_size": 10000000
    })

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        # desc['columns'] is a list of column descriptors, so check the names
        if 'sequence2' in [col['name'] for col in desc['columns']]:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that waits for the mapping and calculates what % of reads mapped
    calc_job = dxpy.new_dxjob(
        {
            "num_reads": total_reads,
            "mappings": {
                "job": map_job.get_id(),
                "field": "mappings"
            }
        }, "calc_contam")

    return calc_job.get_id()
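
Downstream, the job ID returned here is wired into another job's input as a job-based object reference; Example #18 below consumes it exactly this way:

    contam_input.append({"job": calc_job, "field": "percent_mapped"})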
def postprocess(output_gtable_id):
    output_gtable = dxpy.DXGTable(output_gtable_id)
    output_gtable.close()
Example #10
def main(**kwargs):

    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.genes_id == None:
        parser.print_help()
        sys.exit(1)

    if opts.file_name != None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    tableId = opts.genes_id
    table = dxpy.DXGTable(tableId)

    genesTypes = {
        "exon": True,
        "CDS": True,
        "5' UTR": True,
        "3' UTR": True,
        "transcript": True,
        "gene": True
    }
    translatedTypes = {
        "transcript": "mRNA",
        "5' UTR": "five_prime_UTR",
        "3' UTR": "three_prime_UTR"
    }

    columns = table.get_col_names()
    idColumn = None
    parentColumn = None
    if "ID" in columns:
        idColumn = "ID"
    elif "Id" in columns:
        idColumn = "Id"
    elif "id" in columns:
        idColumn = "id"
    else:
        idColumn = "span_id"

    if "Parent" in columns:
        parentColumn = "Parent"
    elif "PARENT" in columns:
        parentColumn = "PARENT"
    elif "parent" in columns:
        parentColumn = "parent"
    else:
        parentColumn = "parent_id"

    for row in table.iterate_rows(want_dict=True):
        typ = row["type"]
        if opts.only_genes_types == False or genesTypes.get(typ) != None:
            if translatedTypes.get(typ) != None:
                typ = translatedTypes[typ]

            reservedColumns = [
                "chr", "lo", "hi", "span_id", "type", "strand", "score",
                "is_coding", "parent_id", "frame", "source", "__id__", "ID",
                "Id", "id", "Parent", "PARENT", "parent"
            ]
            attributes = ""

            rowId = str(row[idColumn])
            parentId = str(row[parentColumn])

            attributes += "ID=\"" + rowId + "\";"
            if not (parentColumn == "parent_id" and parentId == "-1"):
                attributes += "Parent=\"" + parentId + "\";"

            for k, v in row.iteritems():
                if k not in reservedColumns and v != '':
                    attributes += k + "=" + '"' + str(v) + '";'

            chromosome = row["chr"]
            lo = str(row["lo"] + 1)
            hi = str(row["hi"])

            strand = row["strand"]
            if strand == '':
                strand = '.'
            if row["frame"] == -1:
                frame = '.'
            else:
                frame = str(row["frame"])
            source = '.'

            # 2**31 and 2**31-1 are legacy null values that will be removed when possible
            if row.get("score") == None:
                score = "."
            elif row["score"] == dxpy.NULL or row["score"] == 2**31 - 1 or row[
                    "score"] == float(2**31):
                score = "."
            else:
                score = str(row["score"])

            if row.get("source") != None:
                if row["source"] != '':
                    source = row["source"]
            result = "\t".join([
                chromosome, source, typ, lo, hi, score, strand, frame,
                attributes.rstrip(";")
            ]) + "\n"
            if outputFile != None:
                outputFile.write(result)
            else:
                sys.stdout.write(result)
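
For reference, each emitted record is a nine-column, tab-separated GFF line (seqname, source, type, start, end, score, strand, frame, attributes); an illustrative, made-up example:

    # chr1  .  mRNA  1300  9000  .  +  .  ID="t1";Parent="g1"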
Example #11
def map(**job_inputs):
    print "Map:", job_inputs
    job_outputs = {}
    times = [('start', time.time())]
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])

    times.append(('preamble', time.time()))

    dxpy.download_dxfile(dxpy.get_details(job_inputs["indexed_reference"])['index_archive'], "reference.tar.xz")
    times.append(('download reference', time.time()))

    # TODO: Async everything below
    # subprocess.check_call("pixz -d reference.tar.xz && tar -xf reference.tar", shell=True)
    subprocess.check_call("tar -xJf reference.tar.xz", shell=True)

    if job_inputs["algorithm"] == "bwasw":
        bwa_algorithm = "bwasw"
    else:
        # algorithm = aln or auto. TODO: check what auto should do
        bwa_algorithm = "aln"

    aln_opts, sampe_opts, sw_opts, samse_opts = parse_bwa_cmd_opts(job_inputs)

    # Set the number of threads BWA parameter to the apparent number of CPUs.
    aln_opts += " -t " + str(cpu_count())
    sw_opts += " -t " + str(cpu_count())

    row_offsets = job_inputs['row_offsets']   # starting row for each reads table if you added them all up
    start_row = job_inputs['start_row']       # the position in this chunk relative to the row_offsets 'total'
    num_rows = job_inputs['num_rows']         # size of chunk to do this time
    subjobs = []
    for i in range(len(reads_ids)):
        reads_length = reads_descriptions[reads_ids[i]]["length"]
        read_group = i
        # see if the reads table is part of this chunk

        # if start is inside this reads table, add it
        # doing this in the form:   (A_start < B_end) and (A_end > B_start)
        # A is the reads tables
        # B is the current chunk
        # A_start = row_offsets[i]
        # A_end = row_offsets[i] + reads_length
        # B_start = start_row
        # B_end = start_row + num_rows
        if row_offsets[i] < (start_row+num_rows) and (row_offsets[i]+reads_length) > start_row:

            rel_start = max(start_row - row_offsets[i], 0)
            rel_end = min(reads_length, start_row - row_offsets[i] + num_rows) # Using half-open intervals: [start, end)
            subjobs.append({'reads_id': reads_ids[i], 'start_row': rel_start, 'end_row': rel_end, 'read_group':read_group})

    times.append(('parse parameters', time.time()))
    print 'SUBJOBS:', subjobs

    for subchunk_id in range(len(subjobs)):
        subjob = subjobs[subchunk_id]
        reads_id = subjob['reads_id']
        # TODO: FlowReads trimming support
        if 'quality' in reads_columns[reads_id]:
            if reads_are_paired:
                reads_file1 = "input"+str(subchunk_id)+"_1.fastq"
                reads_file2 = "input"+str(subchunk_id)+"_2.fastq"
                write_reads_to_fastq(reads_id, reads_file1, seq_col='sequence', qual_col='quality', start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fastq(reads_id, reads_file2, seq_col='sequence2', qual_col='quality2', start_row=subjob['start_row'], end_row=subjob['end_row'])
                times.append(('fetch reads (subchunk %d)' % subchunk_id, time.time()))
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
                # the 'run alignment' timestamp is recorded once for all paths at
                # the bottom of the loop; recording it here as well would overwrite
                # the measurement with a near-zero delta
            else:
                reads_file1 = "input"+str(subchunk_id)+".fastq"
                write_reads_to_fastq(reads_id, reads_file1, start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
        else: # No qualities, use plain fasta
            if reads_are_paired:
                reads_file1 = "input"+str(subchunk_id)+"_1.fasta"
                reads_file2 = "input"+str(subchunk_id)+"_2.fasta"
                write_reads_to_fasta(reads_id, reads_file1, seq_col='sequence', start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fasta(reads_id, reads_file2, seq_col='sequence2', start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
            else:
                reads_file1 = "input"+str(subchunk_id)+".fasta"
                write_reads_to_fasta(reads_id, reads_file1, start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts, sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)

        times.append(('run alignment (subchunk %d)' % subchunk_id, time.time()))

        cmd = "dx_storeSamAsMappingsTable_bwa"
        cmd += " --alignments '%s.sam'" % reads_file1
        cmd += " --table_id '%s'" % job_inputs["table_id"]
        cmd += " --reads_id '%s'" % reads_id
        cmd += " --start_row %d" % subjob['start_row']
        cmd += " --read_group %d" % subjob['read_group']

        if job_inputs.get('discard_unmapped_rows'):
            cmd += " --discard_unmapped_rows"
        run_shell(cmd)
        times.append(('run table upload (subchunk %d)' % subchunk_id, time.time()))

    job_outputs["ok"] = True

    timing_report = {}
    for i in range(len(times)-1):
        timing_report[times[i+1][0]] = times[i+1][1] - times[i][1]
    job_outputs["debug"] = {'times': timing_report}
    return job_outputs
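
The final loop converts the list of (label, timestamp) pairs into per-step durations. For instance:

    # times = [('start', 0.0), ('preamble', 1.5), ('download reference', 7.5)]
    # yields timing_report == {'preamble': 1.5, 'download reference': 6.0}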
Example #12
def main(**kwargs):

    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id == None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)
    idAsName = opts.id_as_name
    idPrepend = opts.id_prepend
    writeRowId = opts.write_row_id

    paired = "chr2" in mappingsTable.get_col_names()

    regions = []
    if opts.region_file != "":
        with open(opts.region_file, 'r') as region_fh:
            regions = re.findall(r"-L ([^:]*):(\d+)-(\d+)", region_fh.read())

    name = mappingsTable.describe()['name']

    if opts.reference != None:
        originalContig = opts.reference
    else:
        try:
            originalContig = mappingsTable.get_details(
            )['original_contigset']['$dnanexus_link']
        except:
            raise dxpy.AppError(
                "The original reference genome must be attached to mappings table"
            )

    try:
        contigDetails = dxpy.DXRecord(originalContig).get_details()['contigs']
    except:
        raise dxpy.AppError("Unable to access reference with ID " +
                            originalContig)

    contigNames = contigDetails['names']
    contigSizes = contigDetails['sizes']

    if opts.file_name != None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    header = ""

    for i in range(len(contigNames)):
        header += "@SQ\tSN:" + str(contigNames[i]) + "\tLN:" + str(
            contigSizes[i]) + "\n"

    assignReadGroup = opts.assign_read_group
    if assignReadGroup != "":
        header += "@RG\tID:" + assignReadGroup + "\tSM:Sample_0"
    else:
        for i in range(len(mappingsTable.get_details()['read_groups'])):
            header += "@RG\tID:" + str(i) + "\tSM:Sample_" + str(i)
            if opts.read_group_platform != '':
                header += "\tPL:" + opts.read_group_platform
            header += "\n"

    if outputFile != None:
        outputFile.write(header)
    else:
        sys.stdout.write(header)

    col = {}
    names = mappingsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1

    column_descs = mappingsTable.describe()['columns']

    sam_cols = []
    sam_col_names = []
    sam_col_types = {}
    for c in column_descs:
        if c['name'].startswith(
                "sam_field_") or c['name'] == "sam_optional_fields":
            sam_cols.append(c)
            sam_col_names.append(c['name'])
            sam_col_types[c['name']] = c['type']

    defaultCol = {
        "sequence": "",
        "name": "",
        "quality": "",
        "status": "UNMAPPED",
        "chr": "",
        "lo": 0,
        "hi": 0,
        "negative_strand": False,
        "error_probability": 0,
        "qc_fail": False,
        "duplicate": False,
        "cigar": "",
        "mate_id": -1,
        "status2": "",
        "chr2": "",
        "lo2": 0,
        "hi2": 0,
        "negative_strand2": False,
        "proper_pair": False,
        "read_group": 0
    }

    #unmappedFile = open("unmapped.txt", 'w')

    if len(regions) == 0:

        if opts.start_row > mappingsTable.describe()['length']:
            raise dxpy.AppError(
                "Starting row is larger than number of rows in table")
        elif opts.end_row < opts.start_row:
            raise dxpy.AppError("Ending row must not be before starting row")

        if opts.end_row > 0:
            generator = mappingsTable.iterate_rows(start=opts.start_row,
                                                   end=opts.end_row,
                                                   want_dict=True)
        else:
            generator = mappingsTable.iterate_rows(start=opts.start_row,
                                                   want_dict=True)

        # write each row unless we're throwing out unmapped
        for row in generator:
            if row["status"] == "UNMAPPED" and opts.discard_unmapped:
                continue
            # write the row if it passes the pairing / interchromosomal filters
            if (not paired
                    or (opts.no_interchromosomal and row["chr"] == row["chr2"])
                    or (opts.only_interchromosomal
                        and not opts.no_interchromosomal
                        and (row["chr"] != row["chr2"]
                             or (row["chr"] == "" and row["chr2"] == "")))
                    or (not opts.no_interchromosomal
                        and not opts.only_interchromosomal)):
                writeRow(row, col, defaultCol, outputFile, idAsName,
                         idPrepend, writeRowId, assignReadGroup,
                         column_descs, sam_cols, sam_col_names,
                         sam_col_types)

    else:
        for x in regions:
            # generate the query for this region
            query = mappingsTable.genomic_range_query(
                x[0],
                int(x[1]) + opts.region_index_offset,
                int(x[2]) + opts.region_index_offset,
                index='gri')
            for row in mappingsTable.get_rows(query=query, limit=1)['data']:
                startRow = row[0]
                for row in mappingsTable.iterate_rows(start=startRow,
                                                      want_dict=True):
                    if row["chr"] != x[0] or row["lo"] > int(
                            x[2]) + opts.region_index_offset:
                        break
                    if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                        if not paired:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal and row["chr"] == row[
                                "chr2"]:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.only_interchromosomal and opts.no_interchromosomal == False and (
                                row["chr"] != row["chr2"] or
                            (row["chr"] == "" and row["chr2"] == "")):
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)

    if outputFile != None:
        outputFile.close()
Example #13
def main(**kwargs):

    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.genes_id == None:
        parser.print_help()
        sys.exit(1)

    if opts.file_name != None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    tableId = opts.genes_id
    table = dxpy.DXGTable(tableId)

    transcripts = {}
    genes = {}

    acceptedTypes = {
        "CDS": "CDS",
        "start_codon": "start_codon",
        "stop_codon": "stop_codon",
        "5' UTR": "5UTR",
        "3' UTR": "3UTR",
        "intergenic": "inter",
        "intergenic_conserved": "inter_CNS",
        "exon": "exon"
    }

    biotypePresent = False
    if "gene_biotype" in table.get_col_names():
        biotypePresent = True

    for row in table.iterate_rows(want_dict=True):
        if row["type"] == "gene":
            if genes.get(row["span_id"]) == None:
                genes[row["span_id"]] = str(row["span_id"])
                if row.get("gene_id") != None:
                    if row["gene_id"] != "":
                        genes[row["span_id"]] = row["gene_id"]
                if row.get("name") != None and genes[row["span_id"]] == str(
                        row["span_id"]):
                    if row["name"] != '':
                        genes[row["span_id"]] = row["name"]
            else:
                raise dxpy.AppError(
                    "Error: span_id was not unique, in violation of the type spec for Genes. As a result, some gene_id data may be overwritten"
                )

        if row["type"] == "transcript":
            if transcripts.get(row["span_id"]) == None:
                transcriptInfo = {"name": str(row["span_id"])}
                if row.get("gene_id") != None:
                    if row["transcript_id"] != '':
                        transcriptInfo["name"] = row["transcript_id"]
                if row.get("name") != None and transcriptInfo["name"] == str(
                        row["span_id"]):
                    if row["name"] != '':
                        transcriptInfo["name"] = row["name"]
                transcriptInfo['parent'] = row["parent_id"]
                transcriptInfo['gene'] = ''
                transcripts[row["span_id"]] = transcriptInfo
            else:
                raise dxpy.AppError(
                    "Error: span_id was not unique, in violation of the type spec for Genes. As a result, some transcript_id data may be overwritten"
                )

    for k, v in transcripts.iteritems():
        if genes.get(v["parent"]) != None:
            transcripts[k]["gene"] = genes[v["parent"]]

    warnedGeneId = False
    warnedTranscriptId = False

    for row in table.iterate_rows(want_dict=True):
        if acceptedTypes.get(row["type"]) != None:
            reservedColumns = [
                "chr", "lo", "hi", "span_id", "type", "strand", "score",
                "is_coding", "parent_id", "frame", "source", "gene_id",
                "transcript_id", "__id__"
            ]
            attributes = ""

            transcriptId = ''
            geneId = ''
            try:
                transcriptId = transcripts[row["parent_id"]]["name"]

            except:
                if not warnedTranscriptId:
                    print "Warning, at least one position had a transcriptId that could not be determined. Future warnings of this type will not be printed"
                    print "Offending position - Chr: " + row[
                        "chr"] + " lo: " + str(row["lo"]) + " hi: "
                    warnedTranscriptId = True

            try:
                geneId = transcripts[row["parent_id"]]["gene"]
            except:
                if not warnedGeneId:
                    print "Warning, at least one position had a geneId that could not be determined. Future warnings of this type will not be printed"
                    print "Offending position - Chr: " + row[
                        "chr"] + " lo: " + str(row["lo"]) + " hi: "
                    warnedGeneId = True

            attributes += "gene_id " + '"' + geneId + '"' + ";"
            attributes += " transcript_id " + '"' + transcriptId + '"' + ";"

            for k, v in row.iteritems():
                if k not in reservedColumns and v != '':
                    attributes += " " + k + " " + '"' + str(v) + '";'

            if opts.add_gene_biotype and not biotypePresent:
                if row["is_coding"]:
                    entry = "protein_coding"
                else:
                    entry = "non_protein_coding"
                attributes += " gene_biotype " + '"' + entry + '"' + '";'

            chromosome = row["chr"]
            lo = str(row["lo"] + 1)
            hi = str(row["hi"])
            typ = acceptedTypes[row["type"]]
            strand = row["strand"]
            if strand == '':
                strand = '.'
            if row["frame"] == -1:
                frame = '.'
            else:
                frame = str(row["frame"])

            # Null values 2**31 and 2**31-1 are legacy values and will be removed when possible
            if row.get("score") == None:
                score = "."
            elif row["score"] == dxpy.NULL or row["score"] == 2**31 - 1 or row[
                    "score"] == float(2**31):
                score = "."
            else:
                score = str(row["score"])

            if row.get("source") != None:
                if row["source"] != '':
                    source = row["source"]
                if opts.add_gene_biotype and not biotypePresent:
                    if row["is_coding"]:
                        source = "protein_coding"
                    else:
                        source = "non_protein_coding"
            else:
                source = "."

            result = "\t".join([
                chromosome, source, typ, lo, hi, score, strand, frame,
                attributes.rstrip(";")
            ]) + "\n"
            if outputFile != None:
                outputFile.write(result)
            else:
                sys.stdout.write(result)
Example #14
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    if "end_row" not in kwargs:
        kwargs["end_row"] = None

    if kwargs["end_row"] is not None and kwargs["end_row"] <= kwargs[
            "start_row"]:
        arg_parser.error("End row %d must be greater than start row %d" %
                         (kwargs["end_row"], kwargs["start_row"]))

    try:
        table = dxpy.DXGTable(kwargs['reads_table'])
    except:
        raise dxpy.AppError("Failed to open table for export")

    existCols = table.get_col_names()

    ### sort out columns to download

    col = []
    col2 = []

    # if there's a second sequence, it's paired
    if "sequence2" in existCols:
        isPaired = True
    else:
        isPaired = False

    if "name" in existCols and kwargs['discard_names'] != True:
        hasName = True
        col.append("name")
        if isPaired == True:
            col2.append("name2")
    else:
        hasName = False

    col.append("sequence")
    if isPaired == True:
        col2.append("sequence2")

    if "quality" in existCols:
        hasQual = True
        col.append("quality")
        if isPaired == True:
            col2.append("quality2")
    else:
        hasQual = False
        # if we don't have quals we must output FASTA instead
        kwargs['output_FASTA'] = True

    if kwargs['output'] is None:
        raise dxpy.AppError("output parameter is required")

    with open(kwargs['output'], 'wb') as out_fh:
        exportToFile(columns=col,
                     table=table,
                     output_file=out_fh,
                     hasName=hasName,
                     hasQual=hasQual,
                     FASTA=kwargs['output_FASTA'],
                     start_row=kwargs['start_row'],
                     end_row=kwargs['end_row'])

    if isPaired == True:
        if kwargs['output2'] is None:
            raise dxpy.AppError(
                "output2 parameter is required for paired reads")
        with open(kwargs['output2'], 'wb') as out_fh2:
            exportToFile(columns=col2,
                         table=table,
                         output_file=out_fh2,
                         hasName=hasName,
                         hasQual=hasQual,
                         FASTA=kwargs['output_FASTA'],
                         start_row=kwargs['start_row'],
                         end_row=kwargs['end_row'])
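
exportToFile is not defined in this listing. A minimal sketch of what it might do, assuming iterate_rows accepts a columns filter as in the dxpy GTable bindings (the column-role mapping is hypothetical):

    def exportToFile(columns, table, output_file, hasName, hasQual, FASTA,
                     start_row, end_row):
        # map the requested columns onto name/sequence/quality roles
        name_col = seq_col = qual_col = None
        for c in columns:
            if c.startswith("name"):
                name_col = c
            elif c.startswith("sequence"):
                seq_col = c
            elif c.startswith("quality"):
                qual_col = c

        row_num = start_row
        for row in table.iterate_rows(start=start_row, end=end_row,
                                      columns=columns, want_dict=True):
            name = str(row[name_col]) if hasName else "row_%d" % row_num
            if FASTA:
                output_file.write(">" + name + "\n" + row[seq_col] + "\n")
            else:
                # FASTA is forced upstream when quality is absent, so
                # qual_col is guaranteed to be set here
                output_file.write("@" + name + "\n" + row[seq_col] + "\n" +
                                  "+\n" + row[qual_col] + "\n")
            row_num += 1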
Example #15
def main(**job_inputs):
    job_outputs = {}
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    print reads_inputs
    print reads_ids
    print reads_descriptions
    print reads_columns

    all_reads_have_FlowReads_tag = all(['FlowReads' in desc['types'] for desc in reads_descriptions.values()])
    all_reads_have_LetterReads_tag = all(['LetterReads' in desc['types'] for desc in reads_descriptions.values()])
    reads_have_names = any(['name' in columns for columns in reads_columns.values()])
    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])
    reads_have_qualities = any(['quality' in columns for columns in reads_columns.values()])
    if reads_have_qualities:
        assert(all(['quality' in columns for columns in reads_columns.values()]))
    if reads_are_paired:
        all_paired = all(['sequence2' in columns for columns in reads_columns.values()])
        if not all_paired:
            raise dxpy.AppError("Reads to be mapped must be either all paired or all unpaired.  App input contains both paired and unpaired reads.")

    if job_inputs["algorithm"] == "bwasw":
        assert(not reads_are_paired) # bwasw does not support paired inputs

    assert(all_reads_have_FlowReads_tag or all_reads_have_LetterReads_tag)

    reference_record_types = dxpy.describe(job_inputs['reference'])['types']
    if "BwaLetterContigSetV3" in reference_record_types:
        input_ref_is_indexed = True
    elif "ContigSet" in reference_record_types:
        input_ref_is_indexed = False
    else:
        raise dxpy.ProgramError("Unrecognized object passed as reference. It must be a ContigSet record or a BwaLetterContigSetV3 file")

    if input_ref_is_indexed:
        job_outputs['indexed_reference'] = job_inputs['reference']
    else:
        found_cached_idx = False
        for result in dxpy.find_data_objects(classname='record',
                                             typename='BwaLetterContigSetV3',
                                             link=job_inputs['reference']['$dnanexus_link']):
            job_outputs['indexed_reference'] = dxpy.dxlink(result['id'])
            found_cached_idx = True
            break
        if not found_cached_idx:
            job_outputs['indexed_reference'] = dxpy.dxlink(make_indexed_reference(job_inputs))

    table_columns = [("sequence", "string")]
    if reads_have_names:
        table_columns.append(("name", "string"))
    if reads_have_qualities:
        table_columns.append(("quality", "string"))
    table_columns.extend([("status", "string"),
                          ("chr", "string"),
                          ("lo", "int32"),
                          ("hi", "int32"),
                          ("negative_strand", "boolean"),
                          ("error_probability", "uint8"),
                          ("qc_fail", "boolean"),
                          ("duplicate", "boolean"),
                          ("cigar", "string"),
                          ("template_id", "int64"),
                          ("read_group", "int32")])

    # optional sam fields: RG BC XC XT NM CM XN SM AM XM X0 X1 XG MD XA

    if reads_are_paired:
        table_columns.extend([("mate_id", "int32"), # TODO: int8
                              ("status2", "string"),
                              ("chr2", "string"),
                              ("lo2", "int32"),
                              ("hi2", "int32"),
                              ("negative_strand2", "boolean"),
                              ("proper_pair", "boolean")])

    if all_reads_have_FlowReads_tag:
        table_columns.extend([("flowgram", "string"),
                              ("flow_indices", "string"),
                              ("clip_qual_left", "int32"),
                              ("clip_qual_right", "int32"),
                              ("clip_adapter_left", "int32"),
                              ("clip_adapter_right", "int32")])

    table_columns.extend([("sam_field_BC", "string"),
                          ("sam_field_XC", "int32"),
                          ("sam_field_XT", "string"),
                          ("sam_field_NM", "int32"),
                          ("sam_field_CM", "int32"),
                          ("sam_field_XN", "int32"),
                          ("sam_field_SM", "int32"),
                          ("sam_field_AM", "int32"),
                          ("sam_field_XM", "int32"),
                          ("sam_field_X0", "int32"),
                          ("sam_field_X1", "int32"),
                          ("sam_field_XG", "int32"),
                          ("sam_field_MD", "string"),
                          ("sam_field_XA", "string"),
                          ("sam_optional_fields", "string")])


    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]

    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    t = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])

    if input_ref_is_indexed:
        original_contigset = dxpy.get_details(job_inputs['reference'])['original_contigset']
    else:
        original_contigset = job_inputs['reference']
    t.set_details({'original_contigset': original_contigset})

    t.add_types(["LetterMappings", "Mappings", "gri"])

    # name table
    if 'output_name' in job_inputs:
        t.rename(job_inputs['output_name'])
    else:
        first_reads_name = dxpy.DXGTable( job_inputs['reads'][0] ).describe()['name']
        contig_set_name = dxpy.describe(job_inputs['reference'])['name']
        # if we're working on an indexed_reference we're not guaranteed to have access to original_contigset
        if input_ref_is_indexed:
            contig_set_name = contig_set_name.split(' (index')[0]
        t.rename(first_reads_name + " mapped to " + contig_set_name)

    # declare how many paired or single reads are in each reads table
    read_group_lengths = []
    for i in range(len(reads_ids)):
        current_length = reads_descriptions[reads_ids[i]]["length"]
        if 'sequence2' in dxpy.DXGTable(reads_ids[i]).get_col_names():
            num_pairs = current_length
            num_singles = 0
        else:
            num_pairs = 0
            num_singles = current_length

        read_group_lengths.append( {"num_singles":num_singles, "num_pairs":num_pairs} )

    details = t.get_details()
    details['read_groups'] = read_group_lengths
    t.set_details(details)

    row_offsets = []; row_cursor = 0
    for i in range(len(reads_ids)):
        row_offsets.append(row_cursor)
        row_cursor += reads_descriptions[reads_ids[i]]["length"]

    chunk_size = job_inputs["chunk_size"]

    map_job_inputs = job_inputs.copy()
    map_job_inputs["row_offsets"] = row_offsets
    map_job_inputs["num_rows"] = chunk_size
    map_job_inputs["table_id"] = t.get_id()
    map_job_inputs["indexed_reference"] = job_outputs['indexed_reference']

    postprocess_job_inputs = job_inputs.copy()
    postprocess_job_inputs["table_id"] = t.get_id()

    for start_row in xrange(0, row_cursor, chunk_size):
        map_job_inputs["start_row"] = start_row
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        print "Launched map job with", map_job_inputs
        postprocess_job_inputs["chunk%dresult" % start_row] = {'job': map_job.get_id(), 'field': 'ok'}
        postprocess_job_inputs["chunk%ddebug" % start_row] = {'job': map_job.get_id(), 'field': 'debug'}

    postprocess_job = dxpy.new_dxjob(postprocess_job_inputs, "postprocess")

    job_outputs['mappings'] = {'job': postprocess_job.get_id(), 'field': 'mappings'}

    print "MAIN OUTPUT:", job_outputs
    return job_outputs
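
Taken together, main() is the scatter step of a scatter/gather: it creates the shared output GTable, launches one "map" subjob (Example #11) per chunk_size rows, and a single "postprocess" job (Example #4) that waits on every chunk's ok/debug outputs, stores the timing report in the table details, and closes the table. Schematically (illustrative):

    # main -> map (start_row=0)           --ok/debug--> postprocess -> mappings
    #      -> map (start_row=chunk_size)  --ok/debug-->
    #      -> ...
    # (every map subjob writes its rows directly into the shared GTable t)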
Example #16
def generate_report(geneBody, inner_dist, junc_ann, read_dist, read_dup,
                    mappings, contam, names):

    report_details = {}

    # Gene Body Dist
    loc_in_gene = [n for n in range(100)]

    report_details['Gene Body Coverage'] = {
        "Normalized Location in Gene": loc_in_gene,
        "% of Reads Covering": geneBody
    }

    #########################
    # Inner Distance

    if inner_dist != None:

        dxpy.download_dxfile(inner_dist, "inner_dist.txt")

        inner_bucket = []
        inner_num_reads = []
        inner_total_reads = 0
        # if a bucket has less than 0.1% of reads in it then don't include it
        cutoff = 0.001

        with open("inner_dist.txt", "r") as fh:
            line = fh.readline().rstrip("\n")
            while line != "":
                inner_total_reads += int(line.split()[2])
                line = fh.readline().rstrip("\n")

        bucket_cutoff = cutoff * inner_total_reads
        print "Applying cutoff of: " + str(
            cutoff) + " for inner distance calculation"

        with open("inner_dist.txt", "r") as fh:
            line = fh.readline().rstrip("\n")
            while line != "":
                start, end, num_reads = [int(x) for x in line.split()]
                if num_reads > bucket_cutoff:
                    # store center position of this bucket
                    inner_bucket.append(int(end - ((end - start) / 2)))
                    inner_num_reads.append(num_reads)

                line = fh.readline().rstrip("\n")

        # find total to normalize
        inner_total_reads = sum(inner_num_reads)
        print "Total reads for inner distance calculation: " + str(
            inner_total_reads)
        inner_median = None
        running_total = 0
        inner_length_sum = 0
        for i in range(len(inner_bucket)):
            # multiply read length by number of observations for the mean
            inner_length_sum += inner_bucket[i] * inner_num_reads[i]

            # calculate median
            running_total += inner_num_reads[i]
            if running_total >= inner_total_reads / 2 and inner_median == None:
                inner_median = inner_bucket[i]

        inner_mean = inner_length_sum / inner_total_reads
        print "inner distance metrics: " + " ".join(
            [str(inner_length_sum),
             str(inner_total_reads)])

        # calc standard deviation
        std_sum = 0
        for i in range(len(inner_bucket)):
            std_sum += ((inner_bucket[i] - inner_mean)**2) * inner_num_reads[i]

        std_sum /= inner_total_reads
        inner_std = int(math.sqrt(std_sum) + 0.5)

        report_details['Paired Read Inner Distance'] = {
            "Inner Distance (bp)": inner_bucket,
            "Count": inner_num_reads,
            "Mean": inner_mean,
            "Median": inner_median,
            "Standard Deviation": inner_std
        }

    ############################
    # Junction Annotation

    dxpy.download_dxfile(junc_ann, "junc_ann.r")

    # initialize splicing values in case there was no splicing
    sj_k = 0
    sj_pn = 0
    sj_cn = 0

    se_k = 0
    se_pn = 0
    se_cn = 0

    if os.path.getsize("junc_ann.r") == 0:
        print "No splicing events found so setting all junction stats to 0"
    else:
        with open("junc_ann.r", "r") as fh:

            line = fh.readline()
            while line != "":
                line = line.rstrip("\n")
                if line.startswith("events"):
                    # parse out the % and assign them
                    se_pn, se_cn, se_k = [
                        float(n) / 100 for n in line[9:-1].split(",")
                    ]

                if line.startswith("junction"):
                    sj_pn, sj_cn, sj_k = [
                        float(n) / 100 for n in line[11:-1].split(",")
                    ]

                line = fh.readline()

    report_details['Junction Annotation'] = {
        "Splicing Junctions": {
            "known": sj_k,
            "partial novel": sj_pn,
            "complete novel": sj_cn
        },
        "Splicing Events": {
            "known": se_k,
            "partial novel": se_pn,
            "complete novel": se_cn
        }
    }

    ############################
    # read duplication

    dxpy.download_dxfile(read_dup, "read_dup.txt")

    pos_copy = []
    pos_num_reads = []
    pos_total_reads = 0
    seq_copy = []
    seq_num_reads = []
    seq_total_reads = 0

    with open("read_dup.txt", "r") as fh:
        # pull of first header
        line = fh.readline()
        line = fh.readline()
        # read until we hit the stats for sequence based duplication
        while not line.startswith("Occurrence"):
            c, r = [int(n) for n in line.split()]
            pos_copy.append(c)
            pos_num_reads.append(float(r))
            pos_total_reads += r
            line = fh.readline()

        # skip past the "Occurrence" header line to the first data line
        line = fh.readline()
        while line != "":
            c, r = [int(n) for n in line.split()]
            seq_copy.append(c)
            seq_num_reads.append(float(r))
            seq_total_reads += r
            line = fh.readline()

    pos_total_reads = float(pos_total_reads)
    seq_total_reads = float(seq_total_reads)

    for i in range(len(pos_num_reads)):
        pos_num_reads[i] /= pos_total_reads

    for i in range(len(seq_num_reads)):
        seq_num_reads[i] /= seq_total_reads

    report_details['Read Duplication'] = {
        "Position Based": {
            "Read Occurrences": pos_copy,
            "% Reads": pos_num_reads
        },
        "Sequence Based": {
            "Read Occurrences": seq_copy,
            "% Reads": seq_num_reads
        }
    }

    ############################
    # read distribution report
    if read_dist != None:
        dxpy.download_dxfile(read_dist, "read_dist.txt")

        report_details['Read Distribution'] = {}

        with open("read_dist.txt", "r") as rd_file:
            report_details['Read Distribution']['Total Reads'] = int(
                rd_file.readline().split()[-1])
            report_details['Read Distribution']['Total Tags'] = int(
                rd_file.readline().split()[-1])
            report_details['Read Distribution']['Total Assigned Tags'] = int(
                rd_file.readline().split()[-1])

            # pull out line of "="s
            rd_file.readline()
            # pull header line
            rd_file.readline()
            line = rd_file.readline()
            while not line.startswith("="):
                fields = line.split()
                report_details['Read Distribution'][fields[0]] = [
                    int(fields[1]),
                    int(fields[2]),
                    float(fields[3])
                ]
                line = rd_file.readline()

    #############################
    # add report of contaminations if calculated

    if contam != None:
        contam_report = []
        for i in range(len(contam)):
            contam_report.append({
                "Contaminant Name": names[i],
                "% Reads Mapping": contam[i]
            })

        report_details['Contamination'] = contam_report

    #############################
    # add link to mappings
    report_details['original_mappings'] = mappings

    report_name = dxpy.DXGTable(mappings).describe()['name'] + " RSeQC report"

    # create report
    report = dxpy.new_dxrecord(name=report_name,
                               details=report_details,
                               types=["Report", "RSeQC"])
    report.close()

    return {"Report": dxpy.dxlink(report.get_id())}
Example #17
def calc_contam(num_reads, mappings):
    percent_mapped = float(
        dxpy.DXGTable(mappings).describe()['length']) / float(num_reads)

    return {"percent_mapped": percent_mapped}
Example #18
def main(**job_inputs):
    output = {}
    reportInput = {}

    run_shell("dx-spans-to-bed --output genes.bed " +
              job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()
    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # get contaminant mapping started if we're doing it:
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError(
                "Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings"
            )

        name_input = []
        contam_input = []

        # spawn a mapping job for each contaminant ContigSet
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'],
                                       Contig=contaminant)

            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job": calc_job, "field": "percent_mapped"})

        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # output mappings as SAM for analysis modules
    run_shell(" ".join([
        "dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam",
        mappings_id
    ]))
    run_shell(" ".join(
        ["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))
    bam_id = dxpy.upload_local_file("mappings.bam",
                                    wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "geneBody_coverage")

    # if paired then do inner distance calculation
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob(
            {
                'BED_file': bed_id,
                "BAM_file": dxpy.dxlink(bam_id)
            }, "inner_distance")
    else:
        job2 = None

    job3 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "junction_annotation")

    job4 = dxpy.new_dxjob({"BAM_file": dxpy.dxlink(bam_id)},
                          "read_duplication")

    # implement this one when we can request a large RAM instance - requires 19GB for human genome
    job5 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "read_distribution")
    #                       {"systemRequirements": {"instanceType":"dx_m2.2xlarge"}} )

    reportInput['geneBody'] = {"job": job1.get_id(), "field": "results"}
    if job2 != None:
        reportInput['inner_dist'] = {"job": job2.get_id(), "field": "results"}
    else:
        reportInput['inner_dist'] = None

    reportInput['junc_ann'] = {"job": job3.get_id(), "field": "results"}
    reportInput['read_dup'] = {"job": job4.get_id(), "field": "results"}
    reportInput['read_dist'] = {"job": job5.get_id(), "field": "results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob(reportInput, "generate_report")

    output['report'] = {"job": reportJob.get_id(), "field": "Report"}

    return output