Python open_dxgtable Exemples, dxpy.open_dxgtable Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_dxpy.py Projet : jameslz/dx-toolkit

    def test_table_context_manager_error_handling(self):
        """Check that a rejected add_row surfaces as DXAPIError when the ``with`` block exits.

        Three ways of obtaining the table handler are exercised; each one
        queues a row whose "b" value does not fit the int32 column and
        expects DXAPIError to be raised out of the context manager.
        """
        # In each case, the flush that happens at the close of the context handler should wait for
        # the asynchronous requests and then raise the resulting error.

        # Note that this test assumes that the error is a semantic error in the add_row data that
        # is NOT caught by any local error checking.

        # Use new_dxgtable
        with self.assertRaises(DXAPIError):
            with dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                    dxpy.DXGTable.make_column_desc("b", "int32")], mode='w') as table1:
                table1.add_row(["", 68719476736]) # Not in int32 range

        # Use open_dxgtable and close table
        table2_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                       dxpy.DXGTable.make_column_desc("b", "int32")], mode='w').get_id()
        with self.assertRaises(DXAPIError):
            with dxpy.open_dxgtable(table2_id) as table2:
                table2.add_row(["", 68719476736]) # Not in int32 range
        # TODO: why does the flush in this table's destructor fail? Nothing should be getting
        # flushed then...

        # Use open_dxgtable and leave table open
        # open_dxgtable is given the table's ID, as in the scenario above, rather than
        # the handler object returned by new_dxgtable.
        table3_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                       dxpy.DXGTable.make_column_desc("b", "int32")]).get_id()
        with self.assertRaises(DXAPIError):
            with dxpy.open_dxgtable(table3_id, mode='a') as table3:
                table3.add_row(["", 68719476736]) # Not in int32 range

Exemple #2

0

Afficher le fichier

Fichier : test_dxpy.py Projet : jameslz/dx-toolkit

    def test_table_context_manager(self):
        """Write rows through GTable context managers and check the resulting row counts."""

        def two_column_spec():
            # Fresh column descriptors ("a": string, "b": int32) for every table created below.
            return [dxpy.DXGTable.make_column_desc("a", "string"),
                    dxpy.DXGTable.make_column_desc("b", "int32")]

        # A new table, written with explicit part numbers 1..64.
        with dxpy.new_dxgtable(two_column_spec(), mode='w') as self.dxgtable:
            for n in range(64):
                self.dxgtable.add_rows(data=[["row" + str(n), n]], part=n + 1)

        # A new table, written without part numbers.
        with dxpy.new_dxgtable(two_column_spec(), mode='w') as table2:
            table2_id = table2.get_id()
            for n in range(64):
                table2.add_rows(data=[["row" + str(n), n]])
        table2 = dxpy.open_dxgtable(table2_id)
        self.assertEqual(table2.describe()["length"], 64)
        table2.remove()

        # An existing table, reopened first in append mode and then in write mode;
        # each pass adds 64 rows.
        table3_id = dxpy.new_dxgtable(two_column_spec()).get_id()
        for open_mode in ('a', 'w'):
            with dxpy.open_dxgtable(table3_id, mode=open_mode) as table3:
                for n in range(64):
                    table3.add_rows(data=[["row" + str(n), n]])
        table3 = dxpy.open_dxgtable(table3_id)
        current_state = table3._get_state()
        self.assertTrue(current_state in ['closing', 'closed'])
        table3._wait_on_close()
        self.assertEqual(table3.describe()["length"], 128)
        table3.remove()

Exemple #3

0

Afficher le fichier

Fichier : code-input-gtable-output-gtable.py Projet : abiyani/dx-toolkit

def process(input_gtable_id, start_row, end_row, output_gtable_id):
    """Walk a row range of the input GTable and queue one row per input row on the output GTable.

    A handler is created for ``input_gtable_id``, the table ``output_gtable_id``
    is opened in append mode, and for every row yielded by
    ``iterate_rows(start_row, end_row)`` the list ``new_row`` is passed to
    ``add_row`` on the output handler.  As written, ``new_row`` is always
    empty; the loop body is where the per-row computation is meant to go.

    NOTE(review): the ``DX_APP_WIZARD_||_INPUT`` / ``DX_APP_WIZARD_||_OUTPUT``
    tokens are not valid Python identifiers as written; presumably they are
    placeholders replaced when this template is instantiated -- confirm.
    """
    DX_APP_WIZARD_||_INPUT = dxpy.DXGTable(input_gtable_id)

    # Using the context manager here is useful so that the flush()
    # method is called once the context manager exits, and any rows
    # added will be flushed to the platform.  The mode is set to "a"
    # for "append".

    with dxpy.open_dxgtable(output_gtable_id, mode="a") as DX_APP_WIZARD_||_OUTPUT:

        # The following loop iterates over each row from start_row to
        # end_row (not including end_row).  You can find documentation on
        # other useful GTable methods (such as iterating over a genomic
        # range query with iterate_query_rows) in the dxpy library here:
        # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html

        for row in DX_APP_WIZARD_||_INPUT.iterate_rows(start_row, end_row):
            # Fill in code here to perform whatever computation is
            # necessary to process the row and compute the new row.
            #
            # *row* is an array where the first element is the row ID,
            # and the rest of the elements appear in the same order as
            # the GTable's column specification.  You can retrieve the
            # column specifications or names by using
            # DX_APP_WIZARD_||_INPUT.get_columns() or DX_APP_WIZARD_||_INPUT.get_col_names().

            new_row = []

            # The following line queues up the array new_row as a row
            # of data that should be added to the output GTable.
            # Queued rows will be flushed to the platform periodically.

            DX_APP_WIZARD_||_OUTPUT.add_row(new_row)

Exemple #4

0

Afficher le fichier

def write_reads_to_fasta(reads_id, filename, seq_col='sequence', start_row=0, end_row=None):
    """Write one column of a GTable to *filename* as two-line FASTA records.

    Every row yielded by ``iterate_rows`` (restricted to ``seq_col`` and to the
    range ``start_row``..``end_row``) becomes a ``>N`` header line followed by
    the row's first element, where N counts up from ``start_row``.
    """
    with open(filename, "w") as fasta:
        reads_table = dxpy.open_dxgtable(reads_id)
        record_number = start_row
        for record in reads_table.iterate_rows(columns=[seq_col], start=start_row, end=end_row):
            # Build both lines before writing so a malformed row leaves no partial record.
            header, sequence = '>%d' % record_number, record[0]
            fasta.write(header + '\n')
            fasta.write('%s\n' % (sequence,))
            record_number += 1

Exemple #5

0

Afficher le fichier

Fichier : runGatk.py Projet : b1234561/gatk_unifiedgenotyper

def reduceGatk(**job_inputs):
    """Close the table named by ``job_inputs['tableId']`` and return a link to it.

    The result is a dict whose 'variants' entry is the dxlink built from the
    table's ID.
    """
    variants_table = dxpy.open_dxgtable(job_inputs['tableId'])
    print("Closing Table")
    variants_table.close()
    return {'variants': dxpy.dxlink(variants_table.get_id())}

Exemple #6

0

Afficher le fichier

def postprocess(process_outputs):
    """Close the table referenced by the first process output, if there is one.

    Only the first element of ``process_outputs`` is looked at; the remaining
    elements are ignored.  Always returns ``{"answer": None}``.
    """
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    pending = iter(process_outputs)
    try:
        first_output = next(pending)
    except StopIteration:
        # Nothing was produced upstream, so there is no table to close.
        return {"answer": None}

    dxpy.open_dxgtable(first_output).close()
    return {"answer": None}

Exemple #7

0

Afficher le fichier

Fichier : dx_variants_to_vcf.py Projet : jameslz/dx-toolkit

def _writeVcfHeader(outputFile, tableDetails, contigDetails):
    """Write the VCF meta-information lines and the #CHROM column header line.

    tableDetails is the details mapping of the variants table; its optional
    'infos', 'formats', 'alts', 'filters' and 'samples' entries drive the
    output.  contigDetails supplies the contig names and sizes.
    """
    infos = tableDetails.get('infos')
    formats = tableDetails.get('formats')
    alts = tableDetails.get('alts')
    filters = tableDetails.get('filters')
    # A table without a 'samples' detail is treated as having no samples
    # rather than failing on len(None).
    samples = tableDetails.get('samples') or []

    outputFile.write("##fileformat=VCFv4.1\n")
    if infos is not None:
        for k, v in sorted(infos.items()):
            outputFile.write("##INFO=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")

    if len(samples) > 0:
        outputFile.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
        outputFile.write("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n")
        outputFile.write("##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n")
    if formats is not None:
        for k, v in sorted(formats.items()):
            outputFile.write("##FORMAT=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")
    if alts is not None:
        for k, v in sorted(alts.items()):
            outputFile.write("##ALT=<ID="+k+",Description=\""+v['description']+"\">\n")
    if filters is not None:
        for k, v in sorted(filters.items()):
            outputFile.write("##FILTER=<ID="+k+",Description=\""+v+"\">\n")
    for i, name in enumerate(contigDetails['contigs']['names']):
        outputFile.write("##contig=<ID="+name+",length="+str(contigDetails['contigs']['sizes'][i])+">\n")
    outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

    if len(samples) > 0:
        outputFile.write("\tFORMAT")
        for x in samples:
            outputFile.write("\t"+x)
    outputFile.write("\n")


def main(**kwargs):
    """Export a variants GTable as VCF text to a file or to stdout.

    When called without keyword arguments the options are parsed from
    sys.argv.  Options read: 'path' (table to resolve), 'output' (file name,
    '-' for stdout, None to derive the name from the table), 'reference'
    (local flat reference file, None to download it), 'write_header',
    'chr' (chromosomes to restrict the export to, None for all),
    'export_ref_calls' and 'export_no_calls' (forwarded to writeBuffer).

    Raises dxpy.AppError when the table details carry no usable
    'original_contigset' or when the given reference is not a file.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(kwargs['path'], expected='entity')
    except Exception as details:
        parser.exit(1, fill(unicode(details)) + '\n')

    if entity_result is None:
        parser.exit(1, fill('Could not resolve ' + kwargs['path'] + ' to a data object') + '\n')

    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        filename = kwargs['output']
        if filename is None:
            filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"
        outputFile = open(filename, 'w')

    try:
        exportRef = kwargs['export_ref_calls']
        exportNoCall = kwargs['export_no_calls']

        variantsTable = dxpy.open_dxgtable(entity_result['id'])

        # The details are fetched once and reused for the contig set lookup
        # and for the header.
        try:
            tableDetails = variantsTable.get_details()
            originalContigSet = tableDetails['original_contigset']
        except Exception:
            raise dxpy.AppError("The original reference genome must be attached as a detail")
        contigDetails = dxpy.DXRecord(originalContigSet).get_details()

        downloadedReference = kwargs['reference'] is None
        if not downloadedReference:
            refFileName = kwargs['reference']
            if not os.path.isfile(refFileName):
                raise dxpy.AppError("The reference expected by the variants to vcf script was not a valid file")
        else:
            # Only the name is needed; close the handle right away so it is not leaked.
            with tempfile.NamedTemporaryFile(prefix='reference_', suffix='.txt', delete=False) as tmpRef:
                refFileName = tmpRef.name
            dxpy.download_dxfile(contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)

        if kwargs['write_header']:
            _writeVcfHeader(outputFile, tableDetails, contigDetails)

        contigNames = contigDetails['contigs']['names']
        chromosomeOffsets = {}
        for i, name in enumerate(contigNames):
            chromosomeOffsets[name] = contigDetails['contigs']['offsets'][i]

        try:
            with open(refFileName, 'r') as refFile:
                contigSequence = refFile.read()
        finally:
            # The downloaded copy is only needed for its contents.
            if downloadedReference:
                os.remove(refFileName)

        # Map column name -> position inside a row; positions start at 1
        # because row[0] is used below as the row ID.
        names = variantsTable.get_col_names()
        col = collections.OrderedDict(sorted((name, i + 1) for i, name in enumerate(names)))

        chromosomeList = contigNames
        if kwargs['chr'] is not None:
            chromosomeList = [x for x in contigNames if x in kwargs['chr']]

        for chromosome in chromosomeList:
            lastPosition = -1
            buff = []
            query = variantsTable.genomic_range_query(chr=chromosome, lo=0, hi=sys.maxint)
            # limit=1: only the first matching row is needed, as the starting
            # point for the sequential scan.
            for firstRow in variantsTable.get_rows(query=query, limit=1)['data']:
                for row in variantsTable.iterate_rows(start=firstRow[0]):
                    # NOTE(review): assumes the chromosome is stored at row[1] -- confirm.
                    if row[1] != chromosome:
                        break
                    # Rows sharing a 'lo' position are buffered and written together.
                    if lastPosition < row[col["lo"]]:
                        writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
                        buff = []
                    buff.append(row)
                    lastPosition = row[col["lo"]]
            writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
    finally:
        # Never close the interpreter's stdout.
        if outputFile is not sys.stdout:
            outputFile.close()

Exemple #8

0

Afficher le fichier

Fichier : runSamtools.py Projet : dnanexus/samtools_mpileup

def main(**job_inputs):
    """Create the variants table and fan the pileup out over "map" jobs plus one "reduce" job.

    Reads from job_inputs: 'mappings' (link to the mappings table),
    'reads_per_job', 'reference', 'compress_reference', 'compress_no_call',
    'infer_no_call' and, optionally, 'output_name'; the full dict is also
    handed to makeSamtoolsParameters / makeBcftoolsParameters.

    Returns a dict whose 'variants' entry is a job-based reference to the
    'variants' output field of the reduce job.

    Raises Exception when the mappings table details carry no usable
    'original_contigset' link.
    """
    mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"])
    mappingsTableId = mappingsTable.get_id()
    # Described once; both the row count and the name come from this result.
    mappingsDescription = mappingsTable.describe()

    # This controls the degree of parallelism
    chunks = int(mappingsDescription["length"] / job_inputs["reads_per_job"]) + 1

    # The details are fetched once instead of once per field.
    try:
        originalContigSet = mappingsTable.get_details()["original_contigset"]
        contigSetId = originalContigSet["$dnanexus_link"]
    except Exception:
        raise Exception("The original reference genome must be attached as a detail")

    # In the next major section of code, we construct a variants table. As regions of the genome are passed to each worker
    # and variants are called on them, the workers will add rows to this table concurrently.

    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"},
    ]

    # The information in these tags is elevated into specific columns, so additional columns for these tags will not be created
    elevatedTags = ["format_GT", "format_DP", "format_AD"]

    # The info and format tags are extracted from the header printed by samtools
    # If additional code will add a tag to the output of the program, modify this header to include the tag.
    # TODO: Allow the table to be created by the first job that finishes to avoid this step.
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    samples = []

    # NOTE(review): this list (including the lexicographic indices appended below) is built
    # but not passed to new_dxgtable, which receives only the genomic range index; see the
    # TODO further down.
    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")]

    # One "info_<tag>" column per INFO tag found in the header
    for k, v in headerInfo["tags"]["info"].iteritems():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})

    # For each sample, add the sample-specific columns to the schema, at present only one sample is supported
    numSamples = 1
    for i in range(numSamples):
        variants_schema.extend(
            [
                {"name": "genotype_" + str(i), "type": "string"},
                {"name": "phasing_" + str(i), "type": "string"},
                {"name": "type_" + str(i), "type": "string"},
                {"name": "variation_qual_" + str(i), "type": "double"},
                {"name": "genotype_qual_" + str(i), "type": "double"},
                {"name": "coverage_" + str(i), "type": "string"},
                {"name": "total_coverage_" + str(i), "type": "int32"},
            ]
        )
        indices.append(dxpy.DXGTable.lexicographic_index([["type_" + str(i), "ASC"]], "type_" + str(i)))
        samples.append("Sample_0")
        for k, v in headerInfo["tags"]["format"].iteritems():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + str(i), "type": translateTagTypeToColumnType(v)})

    # TODO: Add lexicographic indices when secondary indices are supported

    variants = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variants.get_id()
    # NOTE(review): the table is re-opened by ID right after creation; presumably so that this
    # job's handler does not close the table the map jobs still write to -- confirm.
    variants = dxpy.open_dxgtable(tableId)
    variants.add_types(["Variants", "gri"])

    details = {
        "samples": samples,
        "original_contigset": job_inputs["reference"],
        "original_mappings": job_inputs["mappings"],
        "formats": headerInfo["tags"]["format"],
        "infos": headerInfo["tags"]["info"],
    }
    # if headerInfo.get('filters') != {}:
    #  details['filters'] = headerInfo['filters']
    variants.set_details(details)

    if "output_name" in job_inputs:
        variants.rename(job_inputs["output_name"])
    else:
        variants.rename(mappingsDescription["name"] + " variant calls by Samtools mpileup")

    # Split the genome into evenly sized regions
    genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks)

    # Generate the command line arguments needed to run samtools and bcftools
    samOptions = makeSamtoolsParameters(**job_inputs)
    bcfOptions = makeBcftoolsParameters(**job_inputs)

    # The rest of the main function contains the map-reduce functionality. For each non-empty genome chunk,
    # an input spec is created for a new child job.
    reduce_job_inputs = {}
    for i, region in enumerate(genomeRegions):
        if len(region) == 0:
            continue
        map_job_inputs = {
            "mappings_table_id": mappingsTableId,
            "original_contig_set": contigSetId,
            "interval": region,
            "tableId": tableId,
            "compress_reference": job_inputs["compress_reference"],
            "compress_no_call": job_inputs["compress_no_call"],
            "infer_no_call": job_inputs["infer_no_call"],
            "sam_options": samOptions,
            "bcf_options": bcfOptions,
            "part_number": i,
        }
        # Run a "map" job for each chunk, passing in the inputspec from above and looking for a function entry point given as "map" (@dxpy.entry_point('map'))
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"}

    reduce_job_inputs["tableId"] = tableId

    # Run a "reduce" job, which only begins once all of the map jobs signal they have completed by sending 'ok':True
    # The reduce job closes the table. This step is explicitly needed because table closing must wait till the completion of the map jobs
    # By giving the reduce job the map jobs as input, the reduce job will wait to start.
    reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce")
    return {"variants": {"job": reduce_job.get_id(), "field": "variants"}}

Exemple #9

0

Afficher le fichier

Fichier : runSamtools.py Projet : dnanexus/samtools_mpileup

def reducePileup(**job_inputs):
    """Close the table named by ``job_inputs["tableId"]`` and return a link to it.

    The result is a dict whose "variants" entry is the dxlink built from the
    table's ID.
    """
    variants_table = dxpy.open_dxgtable(job_inputs["tableId"])
    print("Closing Table")
    variants_table.close()
    variants_link = dxpy.dxlink(variants_table.get_id())
    return {"variants": variants_link}

Exemple #10

0

Afficher le fichier

Fichier : runSamtools.py Projet : dnanexus/samtools_mpileup

def mapPileup(**job_inputs):
    """Run samtools mpileup / bcftools on one genome interval and import the calls.

    Reads from job_inputs: 'original_contig_set', 'interval',
    'mappings_table_id', 'tableId', 'sam_options', 'bcf_options',
    'compress_reference', 'infer_no_call' and 'compress_no_call'.
    Working files (ref.fa, regions.txt, input.sam, input.bam, regions.bed,
    output.vcf) are created in the current directory.

    Returns {"ok": True}; any failing external command raises
    subprocess.CalledProcessError.
    """
    # NOTE(review): every command below is assembled by interpolating job inputs into a string
    # that is run with shell=True. If those inputs can be user-controlled they should be quoted
    # or validated; the pipelines are left as shell strings here because they rely on pipes,
    # redirects and pre-joined option strings.
    print("Downloading Reference Genome")
    subprocess.check_call("dx-contigset-to-fasta %s ref.fa" % (job_inputs["original_contig_set"],), shell=True)

    # The mappings-to-sam script takes a file with a list of regions in the form -L chrX:lo-hi, as produced by the genome splitting region.
    # This generates the file
    with open("regions.txt", "w") as regionFile:
        regionFile.write(job_inputs["interval"])

    print("Indexing Dictionary")
    subprocess.check_call("samtools faidx ref.fa", shell=True)

    # The sam-to-mappings script in dx-toolkit, the region_index_offset option is specified to convert from 1-indexed to 0-indexed coordinates
    print("Converting Table to SAM")
    command = "dx-mappings-to-sam %s --output input.sam --region_index_offset -1 --region_file regions.txt" % (
        job_inputs["mappings_table_id"],
    )
    print("Running: " + command)
    subprocess.check_call(command, shell=True)
    if checkSamContainsRead("input.sam"):
        print("Converting to BAM")
        subprocess.check_call("samtools view -bS input.sam > input.bam", shell=True)
        print("Indexing")
        subprocess.check_call("samtools index input.bam", shell=True)

        command = "samtools mpileup -uf ref.fa"

        # Since samtools takes a bed file to specify the regions, this takes the interval from the -L chrX:lo-hi format and puts it into BED
        intervalMatch = re.findall(r"-L ([^:]*):(\d+)-(\d+)", job_inputs["interval"])
        # regions.bed is always (re)created, and always closed, even when no interval matched.
        with open("regions.bed", "w") as bedFile:
            for chrom, lo, hi in intervalMatch:
                bedFile.write(chrom + "\t" + lo + "\t" + hi + "\n")
        if len(intervalMatch) > 0:
            command += " -l regions.bed "
        command += job_inputs["sam_options"]
        command += " input.bam | bcftools view "
        command += job_inputs["bcf_options"]
        command += " - > output.vcf"

        print("Pileup Command: " + command)
        subprocess.check_call(command, shell=True)

        # Convert the vcf file into variants. The dx_vcfToVariants is a script provided as resources as it has not yet been incorporated into dx-toolkit
        command = "dx_vcfToVariants2 --table_id %s --vcf_file output.vcf --region_file regions.txt" % (
            job_inputs["tableId"],
        )
        if job_inputs["compress_reference"]:
            command += " --compress_reference"
        if job_inputs["infer_no_call"]:
            command += " --infer_no_call"
        if job_inputs["compress_no_call"]:
            command += " --compress_no_call"

        print("Import variants command: " + command)
        subprocess.check_call(command, shell=True)

    # Return 'ok', a signal which the reduce job looks for in order to know when it can begin closing the table
    return {"ok": True}

Exemple #11

0

Afficher le fichier

def _write_vcf_header(outputFile, tableDetails, contigDetails):
    """Write the VCF meta-information lines and the #CHROM column header line.

    tableDetails is the details mapping of the variants table; its optional
    'infos', 'formats', 'alts', 'filters' and 'samples' entries drive the
    output.  contigDetails supplies the contig names and sizes.
    """
    infos = tableDetails.get('infos')
    formats = tableDetails.get('formats')
    alts = tableDetails.get('alts')
    filters = tableDetails.get('filters')
    # A table without a 'samples' detail is treated as having no samples
    # rather than failing on len(None).
    samples = tableDetails.get('samples') or []

    outputFile.write("##fileformat=VCFv4.1\n")
    # .items() is used (not .iteritems()) so the loops work on Python 2 and 3.
    if infos is not None:
        for k, v in sorted(infos.items()):
            outputFile.write("##INFO=<ID=" + k + ",Number=" + v['number'] +
                             ",Type=" + v['type'] + ",Description=\"" +
                             v['description'] + "\">\n")

    if len(samples) > 0:
        outputFile.write(
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
        )
        outputFile.write(
            "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"
        )
        outputFile.write(
            "##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"
        )
    if formats is not None:
        for k, v in sorted(formats.items()):
            outputFile.write("##FORMAT=<ID=" + k + ",Number=" + v['number'] +
                             ",Type=" + v['type'] + ",Description=\"" +
                             v['description'] + "\">\n")
    if alts is not None:
        for k, v in sorted(alts.items()):
            outputFile.write("##ALT=<ID=" + k + ",Description=\"" +
                             v['description'] + "\">\n")
    if filters is not None:
        for k, v in sorted(filters.items()):
            outputFile.write("##FILTER=<ID=" + k + ",Description=\"" + v +
                             "\">\n")
    for i, name in enumerate(contigDetails['contigs']['names']):
        outputFile.write("##contig=<ID=" + name + ",length=" +
                         str(contigDetails['contigs']['sizes'][i]) + ">\n")
    outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

    if len(samples) > 0:
        outputFile.write("\tFORMAT")
        for x in samples:
            outputFile.write("\t" + x)
    outputFile.write("\n")


def main(**kwargs):
    """Export a variants GTable as VCF text to a file or to stdout.

    When called without keyword arguments the options are parsed from
    sys.argv.  Options read: 'path' (table to resolve), 'output' (file name,
    '-' for stdout, None to derive the name from the table), 'reference'
    (local flat reference file, None to download it), 'write_header',
    'chr' (chromosomes to restrict the export to, None for all),
    'export_ref_calls' and 'export_no_calls' (forwarded to writeBuffer).

    Raises dxpy.AppError when the table details carry no usable
    'original_contigset' or when the given reference is not a file.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(
            kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(str(details)) + '\n')

    if entity_result is None:
        parser.exit(
            1,
            fill('Could not resolve ' + kwargs['path'] + ' to a data object') +
            '\n')

    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        filename = kwargs['output']
        if filename is None:
            filename = entity_result['describe']['name'].replace(
                '/', '%2F') + ".vcf"
        outputFile = open(filename, 'w')

    try:
        exportRef = kwargs['export_ref_calls']
        exportNoCall = kwargs['export_no_calls']

        variantsTable = dxpy.open_dxgtable(entity_result['id'])

        # The details are fetched once and reused for the contig set lookup
        # and for the header.
        try:
            tableDetails = variantsTable.get_details()
            originalContigSet = tableDetails['original_contigset']
        except Exception:
            raise dxpy.AppError(
                "The original reference genome must be attached as a detail")
        contigDetails = dxpy.DXRecord(originalContigSet).get_details()

        downloadedReference = kwargs['reference'] is None
        if not downloadedReference:
            refFileName = kwargs['reference']
            if not os.path.isfile(refFileName):
                raise dxpy.AppError(
                    "The reference expected by the variants to vcf script was not a valid file"
                )
        else:
            # Only the name is needed; close the handle right away so it is
            # not leaked.
            with tempfile.NamedTemporaryFile(prefix='reference_',
                                             suffix='.txt',
                                             delete=False) as tmpRef:
                refFileName = tmpRef.name
            dxpy.download_dxfile(
                contigDetails['flat_sequence_file']['$dnanexus_link'],
                refFileName)

        if kwargs['write_header']:
            _write_vcf_header(outputFile, tableDetails, contigDetails)

        contigNames = contigDetails['contigs']['names']
        chromosomeOffsets = {}
        for i, name in enumerate(contigNames):
            chromosomeOffsets[name] = contigDetails['contigs']['offsets'][i]

        try:
            with open(refFileName, 'r') as refFile:
                contigSequence = refFile.read()
        finally:
            # The downloaded copy is only needed for its contents.
            if downloadedReference:
                os.remove(refFileName)

        # Map column name -> position inside a row; positions start at 1
        # because row[0] is used below as the row ID.
        names = variantsTable.get_col_names()
        col = collections.OrderedDict(
            sorted((name, i + 1) for i, name in enumerate(names)))

        chromosomeList = contigNames
        if kwargs['chr'] is not None:
            chromosomeList = [x for x in contigNames if x in kwargs['chr']]

        for chromosome in chromosomeList:
            buff = []
            lastPosition = -1
            query = variantsTable.genomic_range_query(chr=chromosome,
                                                      lo=0,
                                                      hi=sys.maxsize)
            # limit=1: only the first matching row is needed, as the
            # starting point for the sequential scan.
            for firstRow in variantsTable.get_rows(query=query,
                                                   limit=1)['data']:
                for row in variantsTable.iterate_rows(start=firstRow[0]):
                    # NOTE(review): assumes the chromosome is stored at
                    # row[1] -- confirm.
                    if row[1] != chromosome:
                        break
                    # Rows sharing a 'lo' position are buffered and written
                    # together.
                    if lastPosition < row[col["lo"]]:
                        writeBuffer(buff, col, outputFile, contigSequence,
                                    chromosomeOffsets, exportRef, exportNoCall)
                        buff = []
                    buff.append(row)
                    lastPosition = row[col["lo"]]
            writeBuffer(buff, col, outputFile, contigSequence,
                        chromosomeOffsets, exportRef, exportNoCall)
    finally:
        # Never close the interpreter's stdout.
        if outputFile is not sys.stdout:
            outputFile.close()

Exemple #12

0

Afficher le fichier

Fichier : runGatk.py Projet : b1234561/gatk_unifiedgenotyper

def main(**job_inputs):
    """Create the variants table and fan GATK out over "mapGatk" jobs plus one "reduceGatk" job.

    Reads from job_inputs: 'mappings' (list of links to mappings tables),
    'reads_per_job', 'output_mode', 'reference', 'call_multiple_samples',
    'genotype_likelihood_model', 'compress_reference', 'infer_no_call',
    'compress_no_call', 'intervals_merging' and, optionally, 'output_name',
    'intervals_to_process' and 'intervals_to_exclude'.  The dict is also
    handed to buildCommand, and 'infer_no_call' is forced to False when
    'output_mode' is "EMIT_VARIANTS_ONLY".

    Returns a dict whose 'variants' entry is a job-based reference to the
    'variants' output field of the reduce job.

    Raises Exception when the first mappings table carries no usable
    'original_contigset' link in its details.
    """
    os.environ['CLASSPATH'] = '/opt/jar/AddOrReplaceReadGroups.jar:/opt/jar/GenomeAnalysisTK.jar'

    if job_inputs['output_mode'] == "EMIT_VARIANTS_ONLY":
        job_inputs['infer_no_call'] = False

    mappingsTable = dxpy.open_dxgtable(job_inputs['mappings'][0]['$dnanexus_link'])

    # Each mapping is described once; the result serves both the read count
    # and (when calling multiple samples) the sample name.
    mappingDescriptions = [dxpy.DXGTable(x).describe() for x in job_inputs['mappings']]

    #This controls the degree of parallelism in GATK
    reads = sum(int(d['length']) for d in mappingDescriptions)
    chunks = int(reads/job_inputs['reads_per_job'])+1

    # Built once and shared by every map job; job_inputs is not modified after this point.
    command = buildCommand(job_inputs)

    #callVariantsOnSample(job_inputs, mappingsTable, command)

    # The details are fetched once instead of once per field.
    try:
        originalContigSet = mappingsTable.get_details()['original_contigset']
        contigSetId = originalContigSet['$dnanexus_link']
    except Exception:
        raise Exception("The original reference genome must be attached as a detail")

    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"}
    ]

    elevatedTags = ['format_GT', 'format_DP', 'format_AD']
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)

    # One "info_<tag>" column per INFO tag found in the header
    for k, v in headerInfo['tags']['info'].iteritems():
        variants_schema.append({"name": "info_"+k, "type":translateTagTypeToColumnType(v)})

    # "== False" is kept on purpose: a missing/None value must not select the
    # single-sample path.
    if job_inputs['call_multiple_samples'] == False:
        samples = ["Sample_0"]
    else:
        samples = [d['name'].replace(" ", "") for d in mappingDescriptions]
    numSamples = len(samples)

    #For each sample, write the sample-specific columns
    for i in range(numSamples):
        variants_schema.extend([
            {"name": "genotype_"+str(i), "type": "string"},
            {"name": "phasing_"+str(i), "type": "string"},
            {"name": "type_"+str(i), "type": "string"},
            {"name": "variation_qual_"+str(i), "type": "double"},
            {"name": "genotype_qual_"+str(i), "type": "double"},
            {"name": "coverage_"+str(i), "type": "string"},
            {"name": "total_coverage_"+str(i), "type": "int32"}
        ])
        for k, v in headerInfo['tags']['format'].iteritems():
            if "format_"+k not in elevatedTags:
                variants_schema.append({"name": "format_"+k+"_"+str(i), "type":translateTagTypeToColumnType(v)})

    variantsTable = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variantsTable.get_id()
    # NOTE(review): the table is re-opened by ID right after creation; presumably so that this
    # job's handler does not close the table the map jobs still write to -- confirm.
    variantsTable = dxpy.open_dxgtable(tableId)
    variantsTable.add_types(["Variants", "gri"])

    details = {'samples':samples, 'original_contigset':job_inputs['reference'], 'original_mappings':job_inputs['mappings'], 'formats':headerInfo['tags']['format'], 'infos':headerInfo['tags']['info']}
    #if headerInfo.get('filters') != {}:
    #  details['filters'] = headerInfo['filters']
    variantsTable.set_details(details)

    if 'output_name' in job_inputs:
        variantsTable.rename(job_inputs['output_name'])
    else:
        # The default name is the first mappings table's name plus a suffix
        # chosen by the genotype likelihood model.
        nameSuffixes = {
            "SNP": " SNP calls by GATK",
            "INDEL": " indel calls by GATK",
            "BOTH": " SNP and indel calls by GATK",
        }
        suffix = nameSuffixes.get(job_inputs['genotype_likelihood_model'], " variant calls by GATK")
        variantsTable.rename(mappingsTable.describe()['name'] + suffix)

    reduceInput = {}
    #commandList = splitGenomeLengthLargePieces(originalContigSet, job_inputs['intervals_to_process'], job_inputs['intervals_to_exclude'],  job_inputs['minimum_chunk_size'], job_inputs['maximum_chunks'])
    commandList = splitGenomeLengthLargePieces(originalContigSet, chunks)

    for i, interval in enumerate(commandList):
        if len(interval) == 0:
            continue
        mapInput = {
            'mappings_tables': job_inputs['mappings'],
            'original_contig_set': contigSetId,
            'interval': interval,
            'tableId': tableId,
            'command': command,
            'compress_reference': job_inputs['compress_reference'],
            'infer_no_call': job_inputs['infer_no_call'],
            'compress_no_call': job_inputs['compress_no_call'],
            'intervals_to_include': job_inputs.get('intervals_to_process'),
            'intervals_to_exclude': job_inputs.get('intervals_to_exclude'),
            'intervals_merging': job_inputs['intervals_merging'],
            'part_number': i,
            'samples': samples,
            'call_multiple_samples': job_inputs['call_multiple_samples']
        }
        # Run a "map" job for each chunk
        mapJobId = dxpy.new_dxjob(fn_input=mapInput, fn_name="mapGatk").get_id()
        # Referencing each map job's output makes the reduce job wait for it.
        reduceInput["mapJob" + str(i) + "TableId"] = {'job': mapJobId, 'field': 'id'}

    reduceInput['tableId'] = tableId
    reduceJobId = dxpy.new_dxjob(fn_input=reduceInput, fn_name="reduceGatk").get_id()

    return {'variants': {'job': reduceJobId, 'field': 'variants'}}