def setUpClass(cls):
        if RUN_JOB_ON_DX:
            if not project_name:
                print "'PROJ_NAME' environment variable must be defined!"
                sys.exit(1)
            working_project_id = dxpy.find_one_project(more_ok=False,
                                                       name=project_name)["id"]
            run_args = {}
            run_args["project"] = working_project_id
            run_args["name"] = "vcfscope-measure on chr21"
            run_args["folder"] = "/purge/" + app_name
            input_hash = {}
            input_hash["vcfgz"] = dxpy.dxlink("file-BkkjFkj098Gb2jZ1Yx533JFv",
                                              project_id)
            input_hash["bam"] = dxpy.dxlink("file-Bkkjj5Q098Gkvkb3Xx5Pxj1J",
                                            project_id)
            input_hash["bai"] = dxpy.dxlink("file-Bkkjj5Q098GzYx2bG5YJ3z34",
                                            project_id)
            input_hash["region"] = dxpy.dxlink("file-Bkkj22Q098Gz5yK1Q955G5gX",
                                               project_id)

            app = dxpy.DXApp(name=app_name, alias="9.9.7")
            cls.job = app.run(input_hash, **run_args)

        else:
            job_id = "job-F1JpY9Q0pVj0BgpYBp14f31Q"
            cls.job = dxpy.DXJob(job_id)

        cls.job.wait_on_done()
Beispiel #2
0
def main():

    inputs_file = open("inputs_stats.txt", 'w')

    print sys.argv[2]

    workflow = dxpy.DXWorkflow(sys.argv[2].split(":")[-1])
    fh = dxpy.DXFile(sys.argv[1].split(":")[-1])

    if "/Results" in fh.describe()['folder']:
        return

    app_id = sys.argv[3]

    if "applet" in app_id:
        app = dxpy.DXApplet(app_id)
    else:
        app = dxpy.DXApp(app_id)

    w_id = sys.argv[1].split(":")[1]

    existing_inputs = []
    for item in workflow.describe()['stages'][0]['input']:
        existing_inputs.append(item)
    print existing_inputs

    for x in app.describe()['inputSpec']:
        print x
        if x['class'] == 'file' and x['name'] not in existing_inputs:
            inputs_file.write(x['name'] + "\n")
            
    inputs_file.close()
Beispiel #3
0
    def __init__(self, name='bwa_mem_fastq_read_mapper', version='1.5.0'):
        """Look up a DNAnexus app by name and exact version.

        The search API only exposes an 'all_versions' boolean (default vs all
        versions), so we describe each hit and compare its version string.
        Exits the process if no app, or no matching version, is found.
        """
        self.name = name
        self.version = version
        self.dxid = None
        self.object = None

        # Materialize the generator once instead of consuming it for the
        # emptiness check and then issuing the same API query a second time.
        apps = list(dxpy.find_apps(
            name=name,
            all_versions=False))  # all_versions will not get most recent
        if not apps:
            print 'Error: Could not find any app with name: %s' % name
            sys.exit()
        for app in apps:
            app_description = dxpy.api.app_describe(app['id'])
            app_version = app_description['version']
            if app_version == self.version:
                self.dxid = app['id']
                break
            else:
                # Show the versions we skipped to aid debugging.
                print app_version
        if not self.dxid:
            print 'Could not find app: %s, version: %s' % (self.name,
                                                           self.version)
            sys.exit()
        self.object = dxpy.DXApp(
            dxid=self.dxid)  # bwa_mem : app-BXQy79Q0y7yQJVff3j9Y2B83
Beispiel #4
0
 def get_handler_from_desc(desc):
     """Return the dxpy handler matching a describe() hash's 'class'."""
     kind = desc['class']
     if kind == 'applet':
         return dxpy.DXApplet(desc['id'], project=desc['project'])
     if kind == 'app':
         return dxpy.DXApp(dxid=desc['id'])
     # Anything else is treated as a workflow.
     return dxpy.DXWorkflow(desc['id'], project=desc['project'])
Beispiel #5
0
def get_exec_handler(path, alias=None):
    """Resolve *path* to an executable handler (DXApp, DXApplet or DXWorkflow).

    path: a data path or app name that may match an app, applet or workflow.
    alias: when given, *path* is treated as an app name (an optional 'app-'
        prefix is stripped) and *alias* as its version/tag; path resolution
        is skipped entirely.

    Raises ResolutionError when nothing matches, or when several things match
    and the session is non-interactive.
    """
    handler = None
    # Map a describe() hash onto the matching dxpy handler class.
    def get_handler_from_desc(desc):
        if desc['class'] == 'applet':
            return dxpy.DXApplet(desc['id'], project=desc['project'])
        elif desc['class'] == 'app':
            return dxpy.DXApp(dxid=desc['id'])
        else:
            return dxpy.DXWorkflow(desc['id'], project=desc['project'])

    if alias is None:
        app_desc = get_app_from_path(path)
        try:
            # Look for applets and workflows
            _project, _folderpath, entity_results = resolve_existing_path(path,
                                                                          expected='entity',
                                                                          ask_to_resolve=False,
                                                                          expected_classes=['applet', 'record', 'workflow'],
                                                                          visibility="visible")
            def is_applet_or_workflow(i):
                return (i['describe']['class'] in ['applet', 'workflow'])
            if entity_results is not None:
                # The search may also return records; keep only runnables.
                entity_results = [i for i in entity_results if is_applet_or_workflow(i)]
                if len(entity_results) == 0:
                    entity_results = None
        except ResolutionError:
            # Path resolution failed; fall back to the app match, if any.
            if app_desc is None:
                raise
            else:
                entity_results = None

        # One data object and no same-named app: unambiguous.
        if entity_results is not None and len(entity_results) == 1 and app_desc is None:
            handler = get_handler_from_desc(entity_results[0]['describe'])
        elif entity_results is None and app_desc is not None:
            handler = get_handler_from_desc(app_desc)
        elif entity_results is not None:
            # Ambiguous: several data objects (and possibly an app too).
            if not INTERACTIVE_CLI:
                raise ResolutionError('Found multiple executables with the path ' + path)
            print('Found multiple executables with the path ' + path)
            choice_descriptions = [get_ls_l_desc(r['describe']) for r in entity_results]
            if app_desc is not None:
                choice_descriptions.append('app-' + app_desc['name'] + ', version ' + app_desc['version'])
            choice = pick(choice_descriptions)
            if choice < len(entity_results):
                # all applet/workflow choices show up before the app,
                # of which there is always at most one possible choice
                handler = get_handler_from_desc(entity_results[choice]['describe'])
            else:
                handler = get_handler_from_desc(app_desc)
        else:
            raise ResolutionError("No matches found for " + path)
    else:
        # Explicit alias: interpret path as an app name.
        if path.startswith('app-'):
            path = path[4:]
        handler = dxpy.DXApp(name=path, alias=alias)
    return handler
Beispiel #6
0
def run_bwa_mem(sample, fastq_dict, mapper_app_dxid, ref_genome_index,
                project_id):
    '''Launch a BWA-MEM mapping job for one sample.

    Args:
        sample: sample identifier, used only in error messages.
        fastq_dict: dict keyed by read number ('1', optionally '2') mapping
            to fastq file dxids.
        mapper_app_dxid: dxid of the mapper app to run.
        ref_genome_index: dxid of the genome index tarball.
        project_id: DNAnexus project to run the job in.

    Returns:
        A dict of job-based object references for the "BAM" and "BAI"
        outputs of the launched job.
    '''
    dxpy.set_workspace_id(project_id)
    mapper_app = dxpy.DXApp(mapper_app_dxid)

    # hg19 : file-B6qq53v2J35Qyg04XxG0000V
    mapper_input = {'genomeindex_targz': dxpy.dxlink(ref_genome_index)}

    # Guard clauses: only 1 (single-end) or 2 (paired-end) fastqs are valid.
    n_fastqs = len(fastq_dict)
    if n_fastqs == 0:
        print 'Error: No fastq files listed for sample %s' % sample
        sys.exit()
    if n_fastqs > 2:
        print 'Error: More than 2 fastq files passed for mapping sample %s' % sample
        sys.exit()
    mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
    if n_fastqs == 2:
        mapper_input['reads2_fastqgz'] = dxpy.dxlink(fastq_dict['2'])
    print mapper_input

    mapper_job = mapper_app.run(mapper_input)
    job_id = mapper_job.get_id()
    return {
        "BAM": {"job": job_id, "field": "sorted_bam"},
        "BAI": {"job": job_id, "field": "sorted_bai"},
    }
Beispiel #7
0
def map_contaminant(Contig, Reads):
    """Map reads tables against a contaminant contig, then launch a follow-up
    job that computes the fraction mapped; return that job's id."""
    # get ID of our mapper
    try:
        mapper_id = dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id']
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'.  Please install it to enable contaminant mapping"
        )
    bwa = dxpy.DXApp(mapper_id)

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_input = {
        "reads": Reads,
        "reference": Contig,
        "discard_unmapped_rows": True,
        "chunk_size": 10000000,
    }
    map_job = bwa.run(map_input)

    # Count input reads; tables with a 'sequence2' column hold read pairs,
    # so each row counts twice.
    total_reads = 0
    for reads_table in Reads:
        desc = dxpy.DXGTable(reads_table).describe()
        table_reads = desc['length']
        if 'sequence2' in desc['columns']:
            table_reads *= 2
        total_reads += table_reads

    # launch a job to wait for the mapping and will calculate what % has mapped
    calc_input = {
        "num_reads": total_reads,
        "mappings": {"job": map_job.get_id(), "field": "mappings"},
    }
    calc_job = dxpy.new_dxjob(calc_input, "calc_contam")

    return calc_job.get_id()
Beispiel #8
0
This script calls the DNAnexus app I built called SCGPM Clean Raw Data at https://platform.dnanexus.com/app/scgpm_clean_raw_data. It removes unwanted files (that drive up the storage costs) from the raw_data folder of a DNAnexus project containing sequencing results from the SCGPM sequencing workflow. Most of the files in the raw_data folder are removed. Moreover, the lane tarball is removed; the XML files RunInfo.xml and runParameters.xml are extracted from Interop.tar and then the tarball is removed; finally, metadata.tar is removed. The extracted XML files are uploaded back to the raw_data folder.

Queries DNAnexus for all projects billed to the specified org and that were created within the last -d days.

You must have the environment variable DX_SECURITY_CONTEXT set (described at http://autodoc.dnanexus.com/bindings/python/current/dxpy.html?highlight=token) in order to authenticate with DNAnexus.
"""

import subprocess
import argparse

import dxpy


RAW_DATA_FOLDER = "/raw_data" # The raw_data folder location in a SCGPM DNAnexus project.
APP_NAME = "scgpm_clean_raw_data" # App's name on DNAnexus
# NOTE(review): resolved by name only, with no version alias — presumably this
# binds to the app's default published version; confirm before pinning behavior.
APP = dxpy.DXApp(name="scgpm_clean_raw_data")

def get_parser():
  """Build and return the command-line argument parser for this script."""
  cli = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  cli.add_argument(
      '-d', "--days-ago", type=int, default=30, help="""
    The number of days ago to query for new projects that are billed to the org specified by --org.""")
  cli.add_argument(
      '-o', "--org", required=True, help="""
    Limits the project search to only those that belong to the specified DNAnexus org. Should 
    begin with 'org-'.""")
  return cli

def main():
  # NOTE(review): this function appears to be truncated in this paste — it
  # parses the CLI arguments and reads days_ago, but the body that would use
  # them is missing. Confirm against the original script before relying on it.
  parser = get_parser()
  args = parser.parse_args()
  days_ago = args.days_ago
 def test_base_input(self):
     job = dxpy.DXApp(self.app_id).run(self.base_input)
     print "Waiting for job to complete"
     job.wait_on_done()
     print json.dumps(job.describe()["output"])
def main(**job_inputs):
    print "Beginning processing of RNA data"

    output = {}

    check_reads(job_inputs['reads'])

    # Convert reads tables to FASTQ/FASTA files
    left_reads = []
    right_reads = []

    current_reads = 0
    for reads in job_inputs['reads']:
        print "Converting reads table " + str(reads['$dnanexus_link'])
        left, right = dump_fastqa(reads['$dnanexus_link'],
                                  "reads_" + str(current_reads))

        left_reads.append(left)
        if right != None:
            right_reads.append(right)

        current_reads += 1

    # Convert Genes Object to GFF file

    run_shell("dx-genes-to-gtf --output genes.gtf " +
              job_inputs['gene_model']['$dnanexus_link'])

    # Create or download indexed genome
    genome = dxpy.DXRecord(job_inputs['reference'])

    if not 'indexed_reference' in job_inputs:
        output['indexed_reference'] = dxpy.dxlink(
            make_indexed_reference(genome.get_id()))
    else:
        output['indexed_reference'] = job_inputs['indexed_reference']
        indexed_genome = dxpy.DXRecord(job_inputs['indexed_reference'])
        dxpy.download_dxfile(indexed_genome.get_details()['index_archive'],
                             "reference.tar.xz")
        run_shell("tar -xJf reference.tar.xz")

    # call tophat
    num_cpus = multiprocessing.cpu_count()

    cmd = " ".join([
        'tophat', "-p",
        str(num_cpus), job_inputs['tophat_options'], "-G genes.gtf",
        "--transcriptome-index=./genes", "-T", "indexed_ref", " ",
        ",".join(left_reads)
    ])

    if len(right_reads) != 0:
        cmd += " " + ",".join(right_reads)

    # Invoke tophat2 with FASTQ/A file(s) and indexed reference
    try:
        run_shell(cmd)
    except:
        raise dxpy.AppError(
            "Error while running Tophat.  This could be caused by an incompatible gene model and reference or incorrect optional parameters.  Please check that these are all correct"
        )

    # upload and import the BAM as a Mappings table
    accepted_hits_file = dxpy.upload_local_file('tophat_out/accepted_hits.bam',
                                                wait_on_close=True)
    name = job_inputs['output_name']
    name += "_mappings"
    sam_importer = dxpy.DXApp(name="sam_importer")
    print "Importing BAM output of Tophat"
    import_job = sam_importer.run({
        "file":
        dxpy.dxlink(accepted_hits_file.get_id()),
        "reference_genome":
        dxpy.dxlink(genome.get_id()),
        "name":
        name
    })

    cuff_cmd = " ".join(
        ['cufflinks', '-p',
         str(num_cpus), '-G genes.gtf', '-o cuff'])

    if 'cufflinks_options' in job_inputs:
        cuff_cmd += " " + job_inputs['cufflinks_options']

    cuff_cmd += " tophat_out/accepted_hits.bam"

    # now with mapped reads in hand we can run cufflinks
    try:
        run_shell(cuff_cmd)
    except:
        raise dxpy.AppError(
            "Error while running Cufflinks.  Please check that your parameters are valid"
        )

    print "Packing, uploading, and parsing cufflinks output"
    # package cufflinks output
    run_shell("tar -czf cufflinks_output.tar.gz cuff/")
    cuff_name = job_inputs['output_name'] + "_cufflinks_output.tar.gz"
    orig_trans_file = dxpy.upload_local_file("cufflinks_output.tar.gz")
    orig_trans_file.rename(cuff_name)
    transcripts_table = upload_transcripts_file('cuff/genes.fpkm_tracking',
                                                job_inputs['output_name'])

    output['mappings'] = {"job": import_job.get_id(), "field": "mappings"}
    output['transcripts'] = dxpy.dxlink(transcripts_table.get_id())
    output['cufflinks_output'] = dxpy.dxlink(orig_trans_file.get_id())

    print "DONE!"

    return output