Beispiel #1
0
    def build(incl_map):
        """Assemble the vg construct -> index workflow, optionally adding map.

        Args:
            incl_map: when truthy, a third "map" stage is appended and the
                workflow is named "vg_construct_index_map"; otherwise it is
                named "vg_construct_index".

        Returns:
            The workflow handler returned by ``dxpy.new_dxworkflow``.

        ``project``, ``folder``, ``git_revision`` and ``find_applet`` are
        taken from the surrounding scope (not visible in this block).
        """
        workflow_name = "vg_construct_index"
        if incl_map:
            workflow_name = "vg_construct_index_map"

        # The same string is used as title, name and description.
        workflow = dxpy.new_dxworkflow(
            title=workflow_name,
            name=workflow_name,
            description=workflow_name,
            project=project.get_id(),
            folder=folder,
            properties={"git_revision": git_revision},
        )

        # Stage 1: construct. No inputs are pre-bound on this stage.
        construct_stage_id = workflow.add_stage(
            find_applet("vg_construct"),
            stage_input={},
            name="construct",
        )

        # Stage 2: index, fed by the construct stage's "vg_tar" output.
        constructed_tar = dxpy.dxlink(
            {"stage": construct_stage_id, "outputField": "vg_tar"})
        index_stage_id = workflow.add_stage(
            find_applet("vg_index"),
            stage_input={"vg_tar": constructed_tar},
            name="index",
        )

        if not incl_map:
            return workflow

        # Stage 3 (optional): map, fed by the index stage's output.
        indexed_tar = dxpy.dxlink(
            {"stage": index_stage_id, "outputField": "vg_indexed_tar"})
        workflow.add_stage(
            find_applet("vg_map"),
            stage_input={"vg_indexed_tar": indexed_tar},
            name="map",
        )
        return workflow
def main(project, folder, name):
    """Create a workflow whose single stage points at an applet that is removed.

    The applet is uploaded from the current directory via ``upload_applet``,
    added as the only stage of a new workflow, and then removed so that the
    workflow is left referring to an applet that no longer exists.

    Args:
        project: project passed to ``dxpy.new_dxworkflow``.
        folder: folder passed to ``dxpy.new_dxworkflow``.
        name: name given to the new workflow.

    Returns:
        The ID of the created workflow.
    """
    # Upload the applet; only its ID is needed below.
    applet_id, _applet_desc = upload_applet('.', None)
    applet = dxpy.DXApplet(applet_id)

    # Create the workflow and attach the applet as its only stage.
    broken_workflow = dxpy.new_dxworkflow(folder=folder,
                                          project=project,
                                          name=name)
    broken_workflow.add_stage(applet_id)

    # Removing the applet afterwards is what breaks the workflow.
    applet.remove()

    return broken_workflow.get_id()
Beispiel #3
0
def new_workflow(args):
    """Create a workflow from parsed command-line arguments and print it.

    Args:
        args: parsed argument namespace. This function reads ``init``,
            ``output``, ``output_folder``, ``title``, ``summary``,
            ``description``, ``tags``, ``types``, ``hidden``, ``properties``,
            ``details``, ``parents``, ``brief`` and ``verbose``, and may
            overwrite ``args.output_folder`` with its resolved form.

    Prints the new workflow's ID when ``args.brief`` is set, otherwise its
    description. Failures while creating or describing the workflow are
    handed to ``err_exit()``.
    """
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)

    # Work out what (if anything) the new workflow is initialised from:
    # an analysis ID is passed through as-is, anything else is resolved to an
    # existing object and wrapped in a handler.
    init_from = None
    if args.init is not None:
        if is_analysis_id(args.init):
            init_from = args.init
        else:
            init_project, _init_folder, init_result = try_call(
                resolve_existing_path, args.init, expected='entity')
            init_from = dxpy.get_handler(init_result['id'],
                                         project=init_project)

    # Destination of the workflow object itself.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = dxpy.config.get("DX_CLI_WD", "/")
        name = None
    else:
        project, folder, name = try_call(dxpy.utils.resolver.resolve_path,
                                         args.output)

    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            _project, args.output_folder, _name = resolve_path(
                args.output_folder, expected='folder')
        except Exception:
            # Best effort: if it cannot be resolved, use the value directly.
            # Only ordinary errors are ignored; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            pass

    try:
        dxworkflow = dxpy.new_dxworkflow(title=args.title,
                                         summary=args.summary,
                                         description=args.description,
                                         output_folder=args.output_folder,
                                         project=project,
                                         name=name,
                                         tags=args.tags,
                                         types=args.types,
                                         hidden=args.hidden,
                                         properties=args.properties,
                                         details=args.details,
                                         folder=folder,
                                         parents=args.parents,
                                         init_from=init_from)
        if args.brief:
            print(dxworkflow.get_id())
        else:
            dxpy.utils.describe.print_desc(
                dxworkflow.describe(incl_properties=True, incl_details=True),
                args.verbose)
    except:
        # Left broad on purpose: every failure is routed through err_exit()
        # (defined elsewhere; presumably it reports the active exception and
        # exits -- confirm before narrowing).
        err_exit()
Beispiel #4
0
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId, appProjectId=None):
    '''Build a workflow with one stage per step in stepsToDo, then run it.

    Args:
        stepsToDo: ordered step names; each must be a key of STEPS. When
            empty (or None) nothing is created and None is returned.
        priors: maps file tokens to an existing file id, or to a list of
            file ids, for inputs not produced by an earlier step.
        extras: must contain 'name' and 'description' for the workflow, plus
            a value for every non-file parameter listed under a step's
            'params'.
        resultsFolder: folder for the workflow and for each stage's output.
        projectId: project in which the workflow is created.
        appProjectId: project searched for applets; defaults to projectId.

    Returns:
        The describe() result of the launched workflow run, or None when
        there are no steps.

    Exits the process with status 1 (after printing an error) when a step
    input or parameter cannot be found.
    '''
    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'],
                             folder=resultsFolder, project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        stepSpec = STEPS[step]
        app = dxencode.find_applet_by_name(stepSpec['app'], appProjectId)
        appInputs = {}

        # File inputs: prefer an output of an earlier stage, then fall back
        # to a pre-existing file named in priors.
        for fileToken, appInp in stepSpec['inputs'].items():
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                prior = priors[fileToken]
                if isinstance(prior, list):
                    appInputs[appInp] = [dxencode.get_file_link(fid) for fid in prior]
                else:
                    appInputs[appInp] = dxencode.get_file_link(prior)
            else:
                print("ERROR: step '" + step + "' can't find input '" + fileToken + "'!")
                sys.exit(1)

        # Non-file app inputs
        if 'params' in stepSpec:
            for param, appParam in stepSpec['params'].items():
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param + "' in extras.")
                    sys.exit(1)

        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)

        # Record links to this stage's outputs so later steps can consume
        # them. The output field name is the token itself (the dict key),
        # not the value stored under it.
        for fileToken in stepSpec['results']:
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId, 'outputField': fileToken})

    wfRun = wf.run({})
    return wfRun.describe()
def main():
    """Set up the Bismark DNA-methylation workflow for one experiment.

    Steps, in order:
      1. Parse arguments and require at least one replicate pattern.
      2. Resolve the ENCODE_DNA_ME_PROJECT_NAME project and make sure it has
         a '/<experiment>' folder.
      3. Find replicate files by glob, in this project when ``args.test`` is
         set, otherwise in ENCODE_SNAPSHOT_PROJECT (resolved at VIEW level).
      4. Unless testing, copy the files into the experiment folder.
      5. Exit with status 1 if no replicate files were found.
      6. Create the workflow and hand it to ``populate_workflow``.
    """
    args = get_args()
    if not args.replicates:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    print('Experiment to analyze: ' + args.experiment)
    experiment_folder = '/' + args.experiment
    if not project_has_folder(project, experiment_folder):
        project.new_folder(experiment_folder)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    # Each replicate argument is a glob pattern; collect every match.
    replicates = []
    for rep in args.replicates:
        replicates.extend(
            dxpy.find_data_objects(classname='file', name=rep,
                                   name_mode='glob', project=source_id,
                                   return_handler=False))

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), experiment_folder)

    if not replicates:
        print("No replicates found in project: " + project.name)
        print("Looking for " + ", ".join(args.replicates))
        sys.exit(1)

    paired = args.paired
    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata

    # Workflow names combine the experiment with each replicate's base name
    # (the part before the first '.').
    spec_name = args.experiment + '-' + '-'.join(r.split('.')[0] for r in args.replicates)
    wf = dxpy.new_dxworkflow(
        title='dx_dna_me_' + spec_name,
        name='ENCODE Bismark DNA-ME pipeline: ' + spec_name,
        # A space was missing before the experiment accession.
        description='The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment ' + args.experiment,
        folder=experiment_folder,
        project=project.get_id())

    populate_workflow(wf, replicates, args.experiment, paired, gender, organism, project.id)
Beispiel #6
0
def new_workflow(args):
    """Create a workflow from parsed command-line arguments and print it.

    Args:
        args: parsed argument namespace. This function reads ``init``,
            ``output``, ``output_folder``, ``title``, ``summary``,
            ``description``, ``tags``, ``types``, ``hidden``, ``properties``,
            ``details``, ``parents``, ``brief`` and ``verbose``, and may
            overwrite ``args.output_folder`` with its resolved form.

    Prints the new workflow's ID when ``args.brief`` is set, otherwise its
    description. Failures while creating or describing the workflow are
    handed to ``err_exit()``.
    """
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)

    # An analysis ID is used directly; anything else must resolve to an
    # existing object, which is wrapped in a handler.
    init_from = None
    if args.init is not None:
        if is_analysis_id(args.init):
            init_from = args.init
        else:
            init_project, _init_folder, init_result = try_call(resolve_existing_path, args.init, expected="entity")
            init_from = dxpy.get_handler(init_result["id"], project=init_project)

    # Destination of the workflow object itself.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = get_env_var("DX_CLI_WD", "/")
        name = None
    else:
        project, folder, name = dxpy.utils.resolver.resolve_path(args.output)

    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            _project, args.output_folder, _name = resolve_path(args.output_folder, expected="folder")
        except Exception:
            # Best effort: if it cannot be resolved, use the value directly.
            # Only ordinary errors are ignored; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            pass

    try:
        dxworkflow = dxpy.new_dxworkflow(
            title=args.title,
            summary=args.summary,
            description=args.description,
            output_folder=args.output_folder,
            project=project,
            name=name,
            tags=args.tags,
            types=args.types,
            hidden=args.hidden,
            properties=args.properties,
            details=args.details,
            folder=folder,
            parents=args.parents,
            init_from=init_from,
        )
        if args.brief:
            print(dxworkflow.get_id())
        else:
            dxpy.utils.describe.print_desc(dxworkflow.describe(incl_properties=True, incl_details=True), args.verbose)
    except:
        # Left broad on purpose: every failure is routed through err_exit()
        # (defined elsewhere; presumably it reports the active exception and
        # exits -- confirm before narrowing).
        err_exit()
Beispiel #7
0
def new_workflow(args):
    """Create a workflow from parsed command-line arguments and print it.

    Args:
        args: parsed argument namespace. This function reads ``init``,
            ``output``, ``output_folder``, ``title``, ``summary``,
            ``description``, ``tags``, ``types``, ``hidden``, ``properties``,
            ``details``, ``parents``, ``brief`` and ``verbose``, and may
            overwrite ``args.output_folder`` with its resolved form.

    Prints the new workflow's ID when ``args.brief`` is set, otherwise its
    description. Failures while creating or describing the workflow are
    handed to ``err_exit()``.
    """
    try_call(process_dataobject_args, args)
    try_call(process_single_dataobject_output_args, args)

    init_from = None
    if args.init is not None:
        try:
            init_project, _init_folder, init_result = try_call(resolve_existing_path,
                                                               args.init,
                                                               expected='entity')
            init_from = dxpy.get_handler(init_result['id'], project=init_project)
        except:
            # Fallback: when the value cannot be resolved to an existing
            # object, pass it through unchanged.
            # NOTE(review): kept as a bare except on purpose. try_call is
            # defined elsewhere and presumably exits (SystemExit) on failure,
            # which ``except Exception`` would not catch -- confirm before
            # narrowing.
            init_from = args.init

    # Destination of the workflow object itself.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = os.environ.get('DX_CLI_WD', '/')
        name = None
    else:
        project, folder, name = dxpy.utils.resolver.resolve_path(args.output)

    if args.output_folder is not None:
        try:
            # Try to resolve to a path in the project
            _project, args.output_folder, _name = resolve_path(args.output_folder, expected='folder')
        except Exception:
            # Best effort: if it cannot be resolved, use the value directly.
            # Only ordinary errors are ignored; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            pass

    try:
        dxworkflow = dxpy.new_dxworkflow(title=args.title, summary=args.summary,
                                         description=args.description,
                                         output_folder=args.output_folder,
                                         project=project, name=name,
                                         tags=args.tags, types=args.types,
                                         hidden=args.hidden, properties=args.properties,
                                         details=args.details,
                                         folder=folder,
                                         parents=args.parents, init_from=init_from)
        if args.brief:
            # Single-argument call form behaves the same on Python 2 and 3.
            print(dxworkflow.get_id())
        else:
            dxpy.utils.describe.print_desc(dxworkflow.describe(incl_properties=True, incl_details=True),
                                           args.verbose)
    except:
        # Left broad on purpose: every failure is routed through err_exit()
        # (defined elsewhere; presumably it reports the active exception and
        # exits -- confirm before narrowing).
        err_exit()
Beispiel #8
0
def build_workflow():
    """Build and close the WARDEN differential-expression workflow.

    Stages, in order: "COMBINE HTSEQ", then "LIMMA" (only when
    parameters["limma_runnable"] is "true"), then
    "SIMPLE DIFFERENTIAL_EXPRESSION". Selected stage outputs are published
    as workflow-level outputs before the workflow is closed.

    Reads the module-level names ``parameters``, ``app_names``, ``samples``,
    ``final_sample_list_id``, ``comparisons_limma_id`` and
    ``comparisons_all_id``.

    Returns:
        The ID of the closed workflow.
    """

    def find_closed_applet(key):
        # Look up the single closed applet registered under app_names[key].
        return dxpy.search.find_one_data_object(classname="applet",
                                                name=app_names[key],
                                                state="closed",
                                                return_handler=True)

    def stage_output(output_name, output_class, stage_id, field):
        # Workflow-level output spec that forwards one stage output field.
        return {
            "name": output_name,
            "class": output_class,
            "outputSource": {
                "$dnanexus_link": {
                    "stage": stage_id,
                    "outputField": field
                }
            }
        }

    # An explicit output folder is passed only when "folder_provided" is
    # the string "false".
    workflow_kwargs = {
        "name": 'WARDEN_workflow',
        "description": 'RNA-SEQ Workflow',
    }
    if parameters["folder_provided"] == "false":
        workflow_kwargs["output_folder"] = parameters["Output"]
    wf = dxpy.new_dxworkflow(**workflow_kwargs)
    wf_outputs = []

    combine_counts_applet = find_closed_applet("combine_counts")
    limma_applet = find_closed_applet("limma")
    simple_DE_applet = find_closed_applet("simple_DE")

    # --- Stage: combine the per-sample count files -------------------------
    htseq_results = list(map(dxpy.dxlink, samples.values()))
    combine_counts_stage_id = wf.add_stage(
        combine_counts_applet,
        stage_input={
            "count_files": htseq_results,
            "name_value": "htseq",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        },
        instance_type="azure:mem2_ssd1_x1",
        name="COMBINE HTSEQ")
    wf_outputs.append(
        stage_output("combined_counts", "file",
                     combine_counts_stage_id, "count_file"))

    # Optional viewer, given as "<project>:<file>"; the string "None"
    # means no viewer.
    viewer_spec = parameters["limma_DE_viewer"]
    has_viewer = viewer_spec != "None"
    if has_viewer:
        limma_viewer_project, limma_viewer_file = viewer_spec.split(":")
        limma_viewer_link = dxpy.dxlink({
            "project": limma_viewer_project,
            "id": limma_viewer_file
        })

    # --- Stage: LIMMA (optional) --------------------------------------------
    if parameters["limma_runnable"] == "true":
        limma_input = {
            "input_count_file": dxpy.dxlink({
                "stage": combine_counts_stage_id,
                "outputField": "count_file"
            }),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "calcNormFactors_method": parameters["calcNormFactors_method"],
            "filter_count_type": parameters["filter_count_type"],
            "filter_count": int(parameters["filter_count"]),
            "p_value_adjust": parameters["p_value_adjust"],
            "contrasts_file": dxpy.dxlink(comparisons_limma_id)
        }
        if has_viewer:
            limma_input["difex_viewer"] = limma_viewer_link
        limma_stage_id = wf.add_stage(limma_applet,
                                      stage_input=limma_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="LIMMA")
        wf_outputs.append(
            stage_output("limma_outfiles", "array:file",
                         limma_stage_id, "out_files"))
        wf_outputs.append(
            stage_output("limma_viewer", "record",
                         limma_stage_id, "viewer_bookmark"))

    # --- Stage: simple differential expression ------------------------------
    simple_DE_input = {
        "input_count_file": dxpy.dxlink({
            "stage": combine_counts_stage_id,
            "outputField": "count_file"
        }),
        "sample_list_file": dxpy.dxlink(final_sample_list_id),
        "contrasts_file": dxpy.dxlink(comparisons_all_id)
    }
    if has_viewer:
        simple_DE_input["difex_viewer"] = limma_viewer_link
    simple_DE_stage_id = wf.add_stage(simple_DE_applet,
                                      stage_input=simple_DE_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="SIMPLE DIFFERENTIAL_EXPRESSION")
    wf_outputs.append(
        stage_output("simple_DE_outfiles", "array:file",
                     simple_DE_stage_id, "out_files"))
    wf_outputs.append(
        stage_output("simple_DE_viewer", "record",
                     simple_DE_stage_id, "viewer_bookmark"))

    # Publish the collected outputs, then close the workflow.
    wf.update(workflow_outputs=wf_outputs)
    wf.close()
    return wf.get_id()
Beispiel #9
0
def build_workflow(experiment, biorep_n, input_shield_stage_input, accession,
                   use_existing_folders):
    """Create the mapping workflow for one biological replicate.

    Stages: an input-shield stage that gathers inputs, a mapping stage fed
    from the shield's outputs and, unless ``args.raw`` is set, a filter/QC
    stage followed by a cross-correlation stage.

    Args:
        experiment: mapping with an 'accession' entry (read via ``.get``).
        biorep_n: integer replicate number, used in folder and stage names.
        input_shield_stage_input: inputs bound to the input-shield stage;
            those of its keys that the mapping stage knows about are
            forwarded to it as links to the shield's outputs.
        accession: not used by this function; kept so existing callers
            continue to work.
        use_existing_folders: when falsy, the presence of any of the target
            output folders is an error.

    Returns:
        The workflow handler, or None when output folders already exist and
        ``use_existing_folders`` is falsy.

    Also reads the module-level ``args`` (outp, applets, outf, raw, assembly,
    tag, scrub, spp_version).
    """
    exp_accession = experiment.get('accession')

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s', output_project.name)

    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s', applet_project.name)

    mapping_applet = \
        find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id())
    logging.debug('Found applet %s', mapping_applet.name)

    input_shield_applet = \
        find_applet_by_name(INPUT_SHIELD_APPLET_NAME, applet_project.get_id())
    logging.debug('Found applet %s', input_shield_applet.name)

    folders = ['workflows', 'fastqs', 'raw_bams', 'bams']
    folder_paths = \
        ['/'.join([args.outf,
                   folder_name,
                   exp_accession,
                   'rep%d' % (biorep_n)])
         for folder_name in folders]

    # Resolve each candidate folder exactly once and keep the ones that
    # already exist (previously every path was resolved twice).
    resolved_paths = (resolve_folder(output_project, folder_path)
                      for folder_path in folder_paths)
    paths_exist = [resolved for resolved in resolved_paths if resolved]
    if paths_exist:
        msg = "%s: output paths already exist: %s" % (
            exp_accession, paths_exist)
        if use_existing_folders:
            logging.warning(msg)
        else:
            msg += "\nUse --use_existing_folders to supress but possibly create duplicate files"
            logging.error(msg)
            return None
    workflow_output_folder, fastq_output_folder, mapping_output_folder, final_output_folder = \
        tuple(create_folder(output_project, folder_path)
              for folder_path in folder_paths)

    if args.raw:
        workflow_title = \
            ('Map %s rep%d to %s (no filter)'
             % (exp_accession, biorep_n, args.assembly))
        workflow_name = 'ENCODE raw mapping pipeline'
    else:
        workflow_title = \
            ('Map %s rep%d to %s and filter'
             % (exp_accession, biorep_n, args.assembly))
        workflow_name = 'ENCODE mapping pipeline'

    if args.tag:
        workflow_title += ': %s' % (args.tag)

    workflow = dxpy.new_dxworkflow(title=workflow_title,
                                   name=workflow_name,
                                   project=output_project.get_id(),
                                   folder=workflow_output_folder)

    input_shield_stage_id = workflow.add_stage(
        input_shield_applet,
        name='Gather inputs %s rep%d' % (exp_accession, biorep_n),
        folder=fastq_output_folder,
        stage_input=input_shield_stage_input)

    # Forward only the inputs that were actually supplied to the shield;
    # each becomes a link to the shield output of the same name.
    input_names = \
        [name for name in ['reads1', 'reads2', 'crop_length', 'reference_tar',
         'bwa_version', 'bwa_aln_params', 'samtools_version', 'debug']
         if name in input_shield_stage_input]
    logging.debug('input_names: %s', input_names)
    mapping_stage_input = {
        input_name: dxpy.dxlink({
            'stage': input_shield_stage_id,
            'outputField': input_name
        })
        for input_name in input_names
    }
    logging.debug('mapping_stage_input: %s', mapping_stage_input)
    mapping_stage_id = workflow.add_stage(
        mapping_applet,
        name='Map %s rep%d' % (exp_accession, biorep_n),
        folder=mapping_output_folder,
        stage_input=mapping_stage_input)

    if not args.raw:
        filter_qc_applet = \
            find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id())
        logging.debug('Found applet %s', filter_qc_applet.name)

        filter_qc_stage_id = workflow.add_stage(
            filter_qc_applet,
            name='Filter and QC %s rep%d' % (exp_accession, biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam':
                dxpy.dxlink({
                    'stage': mapping_stage_id,
                    'outputField': 'mapped_reads'
                }),
                'paired_end':
                dxpy.dxlink({
                    'stage': mapping_stage_id,
                    'outputField': 'paired_end'
                }),
                'scrub':
                args.scrub
            })

        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME,
                                          applet_project.get_id())
        logging.debug('Found applet %s', xcor_applet.name)

        workflow.add_stage(
            xcor_applet,
            name='Calculate cross-correlation %s rep%d' %
            (exp_accession, biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam':
                dxpy.dxlink({
                    'stage': filter_qc_stage_id,
                    'outputField': 'filtered_bam'
                }),
                'paired_end':
                dxpy.dxlink({
                    'stage': filter_qc_stage_id,
                    'outputField': 'paired_end'
                }),
                'spp_version':
                args.spp_version
            })

    # NOTE: accessioning of the mapped outputs is intentionally not added as
    # a stage here; it should all be done in the shield's postprocess
    # entrypoint.
    return workflow
Beispiel #10
0
def main():
    args = get_args()

    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

    if args.nomap and (args.rep1pe is None or args.rep2pe is None) and not blank_workflow:
        logging.error("With --nomap, endedness of replicates must be specified with --rep1pe and --rep2pe")
        raise ValueError

    if not args.target:
        target_type = 'default'  # default
    else:
        target_type = args.target.lower()
    if target_type not in WF.keys():
        logging.error('Target type %s is not recognized')
        sys.exit(2)

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))    

    existing_folder = resolve_folder(output_project, args.outf)
    if not existing_folder:
        output_folder = create_folder(output_project, args.outf)
    elif args.use_existing_folders:
        output_folder = existing_folder
    else:
        assert (existing_folder and args.use_existing_folders), 'Output folder %s exists but --use_existing_folders is %s' % (existing_folder, args.use_existing_folders)

    logging.debug('Using output folder %s' % (output_folder))

    workflow = dxpy.new_dxworkflow(
        name=args.name or WF[target_type]['wf_name'],
        title=args.title or WF[target_type]['wf_title'],
        description=args.description or WF[target_type]['wf_description'],
        project=output_project.get_id(),
        folder=output_folder,
        properties={'pipeline_version': str(args.pipeline_version)})


    unary_control = args.unary_control or (args.rep1 and args.rep2 and args.ctl1 and not args.ctl2)

    if not args.genomesize:
        genomesize = None
    else:
        genomesize = args.genomesize
    if not args.chrom_sizes:
        chrom_sizes = None
    else:
        chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes))

    if not args.blacklist:
        blacklist = None
    else:
        blacklist = dxpy.dxlink(resolve_file(args.blacklist))

    run_idr = WF[target_type]['run_idr']

    if not args.nomap:
        # a "superstage" is just a dict with a name, name(s) of input files,
        # and then names and id's of stages that process that input
        # each superstage here could be implemented as a stage in a more
        # abstract workflow.  That stage would then call the various applets
        # that are separate
        # stages here.
        mapping_superstages = [  # the order of this list is important in that
            {'name': 'Rep1', 'input_args': args.rep1},
            {'name': 'Rep2', 'input_args': args.rep2},
            {'name': 'Ctl1', 'input_args': args.ctl1}
        ]
        if not unary_control:
            mapping_superstages.append(
                {'name': 'Ctl2', 'input_args': args.ctl2})

        mapping_applet = find_applet_by_name(
            MAPPING_APPLET_NAME, applet_project.get_id())
        # mapping_output_folder = resolve_folder(
        #     output_project, output_folder + '/' + mapping_applet.name)
        mapping_output_folder = mapping_applet.name
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(
            FILTER_QC_APPLET_NAME, applet_project.get_id())
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(
            XCOR_APPLET_NAME, applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        # in the first pass create the mapping stage id's so we can use JBOR's
        # to link inputs
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            mapped_stage_id = workflow.add_stage(
                mapping_applet,
                name='Map %s' % (superstage_name),
                folder=mapping_output_folder
            )
            mapping_superstage.update({'map_stage_id': mapped_stage_id})

        # in the second pass populate the stage inputs and build other stages
        rep1_stage_id = next(ss.get('map_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1')
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            superstage_id = mapping_superstage.get('map_stage_id')

            if mapping_superstage.get('input_args') or blank_workflow:
                mapping_stage_input = {}
                if superstage_name != "Rep1":
                    mapping_stage_input.update(
                        {'reference_tar': dxpy.dxlink(
                            {'stage': rep1_stage_id,
                             'inputField': 'reference_tar'})})
                else:
                    if args.reference:
                        mapping_stage_input.update(
                            {'reference_tar': dxpy.dxlink(
                                reference_tar.get_id())})
                if not blank_workflow:
                    for arg_index, input_arg in enumerate(mapping_superstage['input_args']): #read pairs assumed be in order read1,read2
                        reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                        mapping_stage_input.update({'reads%d' %(arg_index+1): reads})
                # this is now done in the first pass loop above
                # mapped_stage_id = workflow.add_stage(
                #     mapping_applet,
                #     name='Map %s' %(superstage_name),
                #     folder=mapping_output_folder,
                #     stage_input=mapping_stage_input
                # )
                # mapping_superstage.update({'map_stage_id': mapped_stage_id})
                workflow.update_stage(superstage_id, stage_input=mapping_stage_input)

                filter_qc_stage_id = workflow.add_stage(
                    filter_qc_applet,
                    name='Filter_QC %s' %(superstage_name),
                    folder=filter_qc_output_folder,
                    stage_input={
                        'input_bam': dxpy.dxlink({'stage': superstage_id, 'outputField': 'mapped_reads'}),
                        'paired_end': dxpy.dxlink({'stage': superstage_id, 'outputField': 'paired_end'})
                    }
                )
                mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id})

                xcor_stage_id = workflow.add_stage(
                    xcor_applet,
                    name='Xcor %s' %(superstage_name),
                    folder=xcor_output_folder,
                    stage_input={
                        'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}),
                        'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}),
                        'spp_version': args.spp_version
                    }
                )
                mapping_superstage.update({'xcor_stage_id': xcor_stage_id})

        exp_rep1_ta = dxpy.dxlink(
                    {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'),
                     'outputField': 'tagAlign_file'})
        exp_rep1_cc = dxpy.dxlink(
                    {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'),
                     'outputField': 'CC_scores_file'})
        exp_rep2_ta = dxpy.dxlink(
                    {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'),
                     'outputField': 'tagAlign_file'})
        exp_rep2_cc = dxpy.dxlink(
                    {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'),
                     'outputField': 'CC_scores_file'})
        ctl_rep1_ta = dxpy.dxlink(
                    {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl1'),
                     'outputField': 'tagAlign_file'})
        if unary_control:
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = dxpy.dxlink(
                        {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Ctl2'),
                         'outputField': 'tagAlign_file'})
        rep1_paired_end = dxpy.dxlink(
                        {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep1'),
                         'outputField': 'paired_end'})
        rep2_paired_end = dxpy.dxlink(
                        {'stage': next(ss.get('xcor_stage_id') for ss in mapping_superstages if ss['name'] == 'Rep2'),
                         'outputField': 'paired_end'})
    else: #skipped the mapping, so just bring in the inputs from arguments
        if not blank_workflow:
            exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
            exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
            ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
            ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
            exp_rep1_ta_desc = dxpy.describe(exp_rep1_ta)
            exp_rep2_ta_desc = dxpy.describe(exp_rep2_ta)
            exp_rep1_mapping_analysis_id = dxpy.describe(exp_rep1_ta_desc['createdBy']['job'])['analysis']
            exp_rep2_mapping_analysis_id = dxpy.describe(exp_rep2_ta_desc['createdBy']['job'])['analysis']
            exp_rep1_mapping_analysis = dxpy.describe(exp_rep1_mapping_analysis_id)
            exp_rep2_mapping_analysis = dxpy.describe(exp_rep2_mapping_analysis_id)


            exp_rep1_cc = next(
                stage['execution']['output']['CC_scores_file']
                for stage in exp_rep1_mapping_analysis.get('stages')
                if stage['execution']['executableName'] == 'xcor')

            exp_rep2_cc = next(
                stage['execution']['output']['CC_scores_file']
                for stage in exp_rep2_mapping_analysis.get('stages')
                if stage['execution']['executableName'] == 'xcor')

        else:
            exp_rep1_ta = None
            exp_rep2_ta = None
            ctl_rep1_ta = None
            ctl_rep2_ta = None

        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        # #here we need to calculate the cc scores files, because we're only being supplied tagAligns
        # #if we had mapped everything above we'd already have a handle to the cc file
        # xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id())
        # # xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name)
        # xcor_output_folder = xcor_only_applet.name
        # xcor_only_stages = []

        # exp_rep1_cc_stage_id = workflow.add_stage(
        #     xcor_only_applet,
        #     name="Rep1 cross-correlation",
        #     folder=xcor_output_folder,
        #     stage_input={
        #         'input_tagAlign': exp_rep1_ta,
        #         'paired_end': rep1_paired_end,
        #         'spp_version': args.spp_version
        #     }
        # )
        # xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        # exp_rep1_cc = dxpy.dxlink(
        #             {'stage': exp_rep1_cc_stage_id,
        #              'outputField': 'CC_scores_file'})

        # exp_rep2_cc_stage_id = workflow.add_stage(
        #     xcor_only_applet,
        #     name="Rep2 cross-correlation",
        #     folder=xcor_output_folder,
        #     stage_input={
        #         'input_tagAlign': exp_rep2_ta,
        #         'paired_end': rep2_paired_end,
        #         'spp_version': args.spp_version
        #     }
        # )
        # xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        # exp_rep2_cc = dxpy.dxlink(
        #             {'stage': exp_rep2_cc_stage_id,
        #              'outputField': 'CC_scores_file'})

    encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id())
    encode_macs2_stages = []
    # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name)
    peaks_output_folder = encode_macs2_applet.name

    macs2_stage_input = {
            'rep1_ta' : exp_rep1_ta,
            'rep2_ta' : exp_rep2_ta,
            'ctl1_ta': ctl_rep1_ta,
            'ctl2_ta' : ctl_rep2_ta,
            'rep1_xcor' : exp_rep1_cc,
            'rep2_xcor' : exp_rep2_cc,
            'rep1_paired_end': rep1_paired_end,
            'rep2_paired_end': rep2_paired_end,
            'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
            'broadpeak_as':  dxpy.dxlink(resolve_file(args.broadpeak_as))
        }
    if genomesize:
        macs2_stage_input.update({'genomesize': genomesize})
    if chrom_sizes:
        macs2_stage_input.update({'chrom_sizes': chrom_sizes})
    encode_macs2_stage_id = workflow.add_stage(
        encode_macs2_applet,
        name='ENCODE Peaks',
        folder=peaks_output_folder,
        stage_input=macs2_stage_input
        )
    encode_macs2_stages.append({'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id})

    if run_idr:
        encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id())
        encode_spp_stages = []
        # idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name)
        idr_peaks_output_folder = encode_spp_applet.name
        PEAKS_STAGE_NAME = 'SPP Peaks'
        peaks_stage_input = {
                    'rep1_ta' : exp_rep1_ta,
                    'rep2_ta' : exp_rep2_ta,
                    'ctl1_ta': ctl_rep1_ta,
                    'ctl2_ta' : ctl_rep2_ta,
                    'rep1_xcor' : exp_rep1_cc,
                    'rep2_xcor' : exp_rep2_cc,
                    'rep1_paired_end': rep1_paired_end,
                    'rep2_paired_end': rep2_paired_end,
                    'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                    'idr_peaks': True,
                    'spp_version': args.spp_version
                    }
        if chrom_sizes:
            peaks_stage_input.update({'chrom_sizes': chrom_sizes})
        else:
            peaks_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes'})})

        encode_spp_stage_id = workflow.add_stage(
            encode_spp_applet,
            name=PEAKS_STAGE_NAME,
            folder=idr_peaks_output_folder,
            stage_input=peaks_stage_input
            )
        encode_spp_stages.append({'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id})

        idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id())
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id())
        idr_stages = []
        # idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name)
        idr_output_folder = idr_applet.name
        if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow:
            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR True Replicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1_peaks'}),
                    'rep2_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooled_peaks'})
                }
            )
            idr_stages.append({'name': 'IDR True Replicates', 'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 1 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1pr1_peaks'}),
                    'rep2_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1pr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep1_peaks'})
                }
            )
            idr_stages.append({'name': 'IDR Rep 1 Self-pseudoreplicates', 'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 2 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2pr1_peaks'}),
                    'rep2_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2pr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'rep2_peaks'})
                }
            )
            idr_stages.append({'name': 'IDR Rep 2 Self-pseudoreplicates', 'stage_id': idr_stage_id})

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Pooled Pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooledpr1_peaks'}),
                    'rep2_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooledpr2_peaks'}),
                    'pooled_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_spp_stages if ss['name'] == PEAKS_STAGE_NAME),
                         'outputField': 'pooled_peaks'})
                }
            )
            idr_stages.append({'name': 'IDR Pooled Pseudoreplicates', 'stage_id': idr_stage_id})

            final_idr_stage_input = {
                    'reps_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR True Replicates'),
                         'outputField': 'IDR_peaks'}),
                    'r1pr_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'),
                         'outputField': 'IDR_peaks'}),
                    'r2pr_peaks' : dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'),
                         'outputField': 'IDR_peaks'}),
                    'pooledpr_peaks': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in idr_stages if ss['name'] == 'IDR Pooled Pseudoreplicates'),
                         'outputField': 'IDR_peaks'}),
                    'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                    'rep1_signal': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                         'outputField': 'rep1_fc_signal'}),
                    'rep2_signal': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                         'outputField': 'rep2_fc_signal'}),
                    'pooled_signal': dxpy.dxlink(
                        {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                         'outputField': 'pooled_fc_signal'})
                }
            if blacklist:
                final_idr_stage_input.update({'blacklist': blacklist})
            if chrom_sizes:
                final_idr_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                final_idr_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_spp_stage_id, 'inputField': 'chrom_sizes'})})

            final_idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_stage_input,

            )
            idr_stages.append({'name': 'Final IDR peak calls', 'stage_id': final_idr_stage_id})

    if target_type == 'histone':
        overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id())
        overlap_peaks_stages = []
        for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']:

            if peaktype == 'narrowpeaks':
                as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as))
                peak_type_extension = 'narrowPeak'

            elif peaktype == 'gappedpeaks':
                as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as))
                peak_type_extension = 'gappedPeak'

            elif peaktype == 'broadpeaks':
                as_file = dxpy.dxlink(resolve_file(args.broadpeak_as))
                peak_type_extension = 'broadPeak'

            overlap_peaks_stage_input = {
                'rep1_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep1_%s' %(peaktype)}),
                'rep2_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep2_%s' %(peaktype)}),
                'pooled_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooled_%s' %(peaktype)}),
                'pooledpr1_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooledpr1_%s' %(peaktype)}),
                'pooledpr2_peaks': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooledpr2_%s' %(peaktype)}),
                'as_file': as_file,
                'peak_type': peak_type_extension,
                'prefix': 'final',
                'rep1_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep1_fc_signal'}),
                'rep2_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'rep2_fc_signal'}),
                'pooled_signal': dxpy.dxlink(
                    {'stage': next(ss.get('stage_id') for ss in encode_macs2_stages if ss['name'] == 'ENCODE Peaks'),
                     'outputField': 'pooled_fc_signal'})
            }
            if chrom_sizes:
                overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                overlap_peaks_stage_input.update({'chrom_sizes': dxpy.dxlink({'stage': encode_macs2_stage_id, 'inputField': 'chrom_sizes'})})

            overlap_peaks_stage_id = workflow.add_stage(
                overlap_peaks_applet,
                name='Final %s' %(peaktype),
                folder=peaks_output_folder,
                stage_input=overlap_peaks_stage_input
            )
            overlap_peaks_stages.append({'name': 'Final %s' %(peaktype), 'stage_id': overlap_peaks_stage_id})

    if args.accession:
        accession_analysis_applet = find_applet_by_name(ACCESSION_ANALYSIS_APPLET_NAME, applet_project.get_id())
        accession_output_folder = accession_analysis_applet.name
        accession_stage_input = {
            'analysis_ids': ['self'],
            'force_patch': True,
            'wait_on_files': []
        }
        if target_type == 'histone':
            for stage in overlap_peaks_stages:
                for output_field in ['overlapping_peaks', 'overlapping_peaks_bb']:
                    accession_stage_input['wait_on_files'].append(
                        dxpy.dxlink({'stage': stage.get('stage_id'), 'outputField': output_field})
                    )
        elif run_idr:
            for output_field in ['conservative_set', 'conservative_set_bb', 'optimal_set', 'optimal_set_bb']:
                accession_stage_input['wait_on_files'].append(
                    dxpy.dxlink({'stage': final_idr_stage_id, 'outputField': output_field})
                )

        assert accession_stage_input['wait_on_files'], "ERROR: workflow has no wait_on_files defined, so --accession is not supported."
        accession_stage_id = workflow.add_stage(
            accession_analysis_applet,
            name='Accession results',
            folder=accession_output_folder,
            stage_input=accession_stage_input
        )

    if args.yes:
        if args.debug:
            job_id = workflow.run({}, folder=output_folder, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['255.255.255.255'])
        else:
            job_id = workflow.run({}, folder=output_folder, priority='normal')
        logging.info("Running as job %s" %(job_id))
def main():
    """Build the ENCODE long RNA-seq workflow for one experiment.

    Reads the command-line arguments, makes sure the experiment folder
    exists in the working project, looks up the requested replicate files
    (copying them into the experiment folder unless ``--test`` is set),
    creates a new workflow object and hands it to ``populate_workflow``.

    Exits the process if no replicate names were supplied or if none of the
    named files could be found.
    """
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')

    # NOTE(review): this long RNA-seq script resolves the DNA-ME project
    # constant -- confirm that is the intended working project.
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    print('Experiment to analyze: ' + args.experiment)

    experiment_folder = '/' + args.experiment
    if not project_has_folder(project, experiment_folder):
        project.new_folder(experiment_folder)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    # In test mode the files are looked up in the working project itself;
    # otherwise they come from the snapshot project.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    replicates = []
    for rep_name in args.replicates:
        replicates.extend(
            dxpy.find_data_objects(classname='file', name=rep_name,
                                   name_mode='exact', project=source_id,
                                   return_handler=False))

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), experiment_folder)

    if not replicates:
        print("No replicates found in project: " + project.name)
        print("Looking for " + ", ".join(args.replicates))
        sys.exit(1)

    #TODO determine paired or gender from ENCSR metadata
    inputs = {
        'rnd_seed': 12345,
        'paired': args.paired,
        'gender': args.gender,
        'organism': args.organism,
        'library_id': args.library,
        'nthreads': args.nthreads,
    }

    # Now create a new workflow ()
    # The spec name is the experiment followed by each replicate file name
    # truncated at its first '.'.
    spec_name = args.experiment + '-' + '-'.join(
        [rep_name.split('.')[0] for rep_name in args.replicates])
    inputs['spec_name'] = spec_name

    if args.paired:
        title_suffix, name_suffix = '_paired_end ', '(paired-end) '
        inputs['stranded'] = True
    else:
        title_suffix, name_suffix = '_single_end ', '(single-end) '
        inputs['stranded'] = False
    title_root = 'dx_long_rna_seq_' + title_suffix
    name_root = 'ENCODE Long RNA Seq: ' + name_suffix
    desc = 'The ENCODE RNA Seq pipeline for long RNAs'

    if args.export:
        # Exported workflows carry the generic title/name and are created in
        # the public project's folder.
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT,
                                           name_mode='exact',
                                           return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(title=title_root + spec_name,
                                 name=name_root + spec_name,
                                 description=desc + ' for experiment:' + args.experiment,
                                 folder=experiment_folder,
                                 project=project_id)

    # NOTE(review): populate_workflow always receives the working project's
    # id, even when the workflow was created in the public project -- confirm.
    populate_workflow(wf, replicates, args.experiment, inputs, project.id, args.export)
Beispiel #12
0
def main():
    """Build the ENCODE Bismark DNA-ME workflow for one experiment.

    Reads the command-line arguments, makes sure the experiment folder
    exists in the working project, resolves the input fastq files (either a
    single list of reads, or two lists for paired-end data), creates a new
    workflow object and hands it to ``populate_workflow``.

    Exits the process if the required replicate arguments are missing or if
    none of the named files could be found.
    """
    args = get_args()

    ## resolve projects
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    print('Experiment to analyze: ' + args.experiment)
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    # In test mode the files are looked up in the working project itself;
    # otherwise they come from the snapshot project.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    ## resolve replicates/fastq inputs
    paired = args.paired
    if not paired:
        if len(args.replicates) < 1:
            sys.exit('Need to have at least 1 replicate file (unpaired) use -r or --replicates')

        replicates = find_replicates(args.replicates, source_id, project, args.experiment, args.test)
        if not replicates:
            print("No replicates found in project: " + project.name)
            print("Looking for " + ", ".join(args.replicates))
            sys.exit(1)

        dx_reps = {
            'reads': [dxpy.dxlink(r) for r in replicates]
        }
        rnames = '-'.join([r.split('.')[0] for r in args.replicates])
    else:
        if len(args.pair1) < 1 or len(args.pair2) < 1:
            # The closing parenthesis was missing from this message.
            sys.exit("Need to have at least 1 replicate in pair1 (--r1/--pair1) and pair2 (--r2/--pair2)")

        pair1reps = find_replicates(args.pair1, source_id, project, args.experiment, args.test)
        if not pair1reps:
            print("No replicates for pair1 found in project: " + project.name)
            print("Looking for " + ", ".join(args.pair1))
            sys.exit(1)

        pair2reps = find_replicates(args.pair2, source_id, project, args.experiment, args.test)
        if not pair2reps:
            print("No replicates for pair2 found in project: " + project.name)
            print("Looking for " + ", ".join(args.pair2))
            sys.exit(1)

        dx_reps = {
            'pair1_reads': [dxpy.dxlink(r) for r in pair1reps],
            'pair2_reads': [dxpy.dxlink(r) for r in pair2reps]
        }
        rnames = '-'.join([r.split('.')[0] for r in args.pair1+args.pair2])

    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow ()
    # The spec name is the experiment followed by each input file name
    # truncated at its first '.'.
    spec_name = args.experiment+'-'+rnames
    title_root = 'dx_dna_me_'
    name_root = 'ENCODE Bismark DNA-ME pipeline: '
    desc = 'The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment'
    if paired:
        title_root = title_root + '_paired_end'
        name_root = name_root + '(paired-end)'
    else:
        title_root = title_root + '_single_end'
        name_root = name_root + '(single-end)'

    if args.export:
        # Exported workflows carry the generic title/name and are created in
        # the public project's folder.
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT, name_mode='exact', return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        # NOTE(review): title/name here do not include the paired/single-end
        # suffix computed above -- left as-is; confirm whether that is intended.
        wf = dxpy.new_dxworkflow(title='dx_dna_me_'+spec_name,
                                 name='ENCODE Bismark DNA-ME pipeline: '+spec_name,
                                 # A separating space keeps the accession from
                                 # running into the word "experiment".
                                 description=desc + ' ' + args.experiment,
                                 folder='/'+args.experiment,
                                 project=project_id)

    # NOTE(review): populate_workflow always receives the working project's
    # id, even when the workflow was created in the public project -- confirm.
    populate_workflow(wf, dx_reps, args.experiment, paired, gender, organism, project.id, args.export)
 def test_workflow_completion(self):
     # A freshly created workflow named "my workflow" must be offered when
     # completing "dx run my".
     workflow = dxpy.new_dxworkflow(name="my workflow")
     self.assert_completion("dx run my", "my workflow ")
     # After the workflow is hidden, the same prefix must yield no
     # completions at all.
     workflow.hide()
     self.assert_no_completions("dx run my")
def main():
    args = get_args()

    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

    if args.nomap and (args.rep1pe is None
                       or args.rep2pe is None) and not blank_workflow:
        logging.error(
            "With --nomap, endedness of replicates must be specified with --rep1pe and --rep2pe"
        )
        raise ValueError

    if not args.target:
        target_type = 'default'  # default
    else:
        target_type = args.target.lower()
    if target_type not in WF.keys():
        logging.error('Target type %s is not recognized')
        sys.exit(2)

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    existing_folder = resolve_folder(output_project, args.outf)
    if not existing_folder:
        output_folder = create_folder(output_project, args.outf)
    elif args.use_existing_folders:
        output_folder = existing_folder
    else:
        assert (
            existing_folder and args.use_existing_folders
        ), 'Output folder %s exists but --use_existing_folders is %s' % (
            existing_folder, args.use_existing_folders)

    logging.debug('Using output folder %s' % (output_folder))

    workflow = dxpy.new_dxworkflow(name=args.name
                                   or WF[target_type]['wf_name'],
                                   title=args.title
                                   or WF[target_type]['wf_title'],
                                   description=args.description
                                   or WF[target_type]['wf_description'],
                                   project=output_project.get_id(),
                                   folder=output_folder)

    unary_control = args.unary_control or (args.rep1 and args.rep2
                                           and args.ctl1 and not args.ctl2)

    if not args.genomesize:
        genomesize = None
    else:
        genomesize = args.genomesize
    if not args.chrom_sizes:
        chrom_sizes = None
    else:
        chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes))

    if not args.blacklist:
        blacklist = None
    else:
        blacklist = dxpy.dxlink(resolve_file(args.blacklist))

    run_idr = WF[target_type]['run_idr']

    if not args.nomap:
        # a "superstage" is just a dict with a name, name(s) of input files,
        # and then names and id's of stages that process that input
        # each superstage here could be implemented as a stage in a more
        # abstract workflow.  That stage would then call the various applets
        # that are separate
        # stages here.
        mapping_superstages = [  # the order of this list is important in that
            {
                'name': 'Rep1',
                'input_args': args.rep1
            }, {
                'name': 'Rep2',
                'input_args': args.rep2
            }, {
                'name': 'Ctl1',
                'input_args': args.ctl1
            }
        ]
        if not unary_control:
            mapping_superstages.append({
                'name': 'Ctl2',
                'input_args': args.ctl2
            })

        mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME,
                                             applet_project.get_id())
        # mapping_output_folder = resolve_folder(
        #     output_project, output_folder + '/' + mapping_applet.name)
        mapping_output_folder = mapping_applet.name
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME,
                                               applet_project.get_id())
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME,
                                          applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        # in the first pass create the mapping stage id's so we can use JBOR's
        # to link inputs
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            mapped_stage_id = workflow.add_stage(mapping_applet,
                                                 name='Map %s' %
                                                 (superstage_name),
                                                 folder=mapping_output_folder)
            mapping_superstage.update({'map_stage_id': mapped_stage_id})

        # in the second pass populate the stage inputs and build other stages
        rep1_stage_id = next(
            ss.get('map_stage_id') for ss in mapping_superstages
            if ss['name'] == 'Rep1')
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            superstage_id = mapping_superstage.get('map_stage_id')

            if mapping_superstage.get('input_args') or blank_workflow:
                mapping_stage_input = {}
                if superstage_name != "Rep1":
                    mapping_stage_input.update({
                        'reference_tar':
                        dxpy.dxlink({
                            'stage': rep1_stage_id,
                            'inputField': 'reference_tar'
                        })
                    })
                else:
                    if args.reference:
                        mapping_stage_input.update({
                            'reference_tar':
                            dxpy.dxlink(reference_tar.get_id())
                        })
                if not blank_workflow:
                    for arg_index, input_arg in enumerate(
                            mapping_superstage['input_args']
                    ):  #read pairs assumed be in order read1,read2
                        reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                        mapping_stage_input.update(
                            {'reads%d' % (arg_index + 1): reads})
                # this is now done in the first pass loop above
                # mapped_stage_id = workflow.add_stage(
                #     mapping_applet,
                #     name='Map %s' %(superstage_name),
                #     folder=mapping_output_folder,
                #     stage_input=mapping_stage_input
                # )
                # mapping_superstage.update({'map_stage_id': mapped_stage_id})
                workflow.update_stage(superstage_id,
                                      stage_input=mapping_stage_input)

                filter_qc_stage_id = workflow.add_stage(
                    filter_qc_applet,
                    name='Filter_QC %s' % (superstage_name),
                    folder=filter_qc_output_folder,
                    stage_input={
                        'input_bam':
                        dxpy.dxlink({
                            'stage': superstage_id,
                            'outputField': 'mapped_reads'
                        }),
                        'paired_end':
                        dxpy.dxlink({
                            'stage': superstage_id,
                            'outputField': 'paired_end'
                        })
                    })
                mapping_superstage.update(
                    {'filter_qc_stage_id': filter_qc_stage_id})

                xcor_stage_id = workflow.add_stage(xcor_applet,
                                                   name='Xcor %s' %
                                                   (superstage_name),
                                                   folder=xcor_output_folder,
                                                   stage_input={
                                                       'input_bam':
                                                       dxpy.dxlink({
                                                           'stage':
                                                           filter_qc_stage_id,
                                                           'outputField':
                                                           'filtered_bam'
                                                       }),
                                                       'paired_end':
                                                       dxpy.dxlink({
                                                           'stage':
                                                           filter_qc_stage_id,
                                                           'outputField':
                                                           'paired_end'
                                                       })
                                                   })
                mapping_superstage.update({'xcor_stage_id': xcor_stage_id})

        exp_rep1_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'tagAlign_file'
        })
        exp_rep1_cc = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'CC_scores_file'
        })
        exp_rep2_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'tagAlign_file'
        })
        exp_rep2_cc = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'CC_scores_file'
        })
        ctl_rep1_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Ctl1'),
            'outputField':
            'tagAlign_file'
        })
        if unary_control:
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = dxpy.dxlink({
                'stage':
                next(
                    ss.get('xcor_stage_id') for ss in mapping_superstages
                    if ss['name'] == 'Ctl2'),
                'outputField':
                'tagAlign_file'
            })
        rep1_paired_end = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'paired_end'
        })
        rep2_paired_end = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'paired_end'
        })
    else:  #skipped the mapping, so just bring in the inputs from arguments
        if not blank_workflow:
            exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
            exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
            ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
            ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
        else:
            exp_rep1_ta = None
            exp_rep2_ta = None
            ctl_rep1_ta = None
            ctl_rep2_ta = None

        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        #here we need to calculate the cc scores files, because we're only being supplied tagAligns
        #if we had mapped everything above we'd already have a handle to the cc file
        xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME,
                                               applet_project.get_id())
        # xcor_output_folder = resolve_folder(output_project, output_folder + '/' + xcor_only_applet.name)
        xcor_output_folder = xcor_only_applet.name
        xcor_only_stages = []

        exp_rep1_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep1 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep1_ta,
                'paired_end': rep1_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        exp_rep1_cc = dxpy.dxlink({
            'stage': exp_rep1_cc_stage_id,
            'outputField': 'CC_scores_file'
        })

        exp_rep2_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep2 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep2_ta,
                'paired_end': rep2_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        exp_rep2_cc = dxpy.dxlink({
            'stage': exp_rep2_cc_stage_id,
            'outputField': 'CC_scores_file'
        })

    encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME,
                                              applet_project.get_id())
    encode_macs2_stages = []
    # peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_macs2_applet.name)
    peaks_output_folder = encode_macs2_applet.name

    macs2_stage_input = {
        'rep1_ta': exp_rep1_ta,
        'rep2_ta': exp_rep2_ta,
        'ctl1_ta': ctl_rep1_ta,
        'ctl2_ta': ctl_rep2_ta,
        'rep1_xcor': exp_rep1_cc,
        'rep2_xcor': exp_rep2_cc,
        'rep1_paired_end': rep1_paired_end,
        'rep2_paired_end': rep2_paired_end,
        'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
        'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
        'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as))
    }
    if genomesize:
        macs2_stage_input.update({'genomesize': genomesize})
    if chrom_sizes:
        macs2_stage_input.update({'chrom_sizes': chrom_sizes})
    encode_macs2_stage_id = workflow.add_stage(encode_macs2_applet,
                                               name='ENCODE Peaks',
                                               folder=peaks_output_folder,
                                               stage_input=macs2_stage_input)
    encode_macs2_stages.append({
        'name': 'ENCODE Peaks',
        'stage_id': encode_macs2_stage_id
    })

    if run_idr:
        encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME,
                                                applet_project.get_id())
        encode_spp_stages = []
        # idr_peaks_output_folder = resolve_folder(output_project, output_folder + '/' + encode_spp_applet.name)
        idr_peaks_output_folder = encode_spp_applet.name
        PEAKS_STAGE_NAME = 'SPP Peaks'
        peaks_stage_input = {
            'rep1_ta': exp_rep1_ta,
            'rep2_ta': exp_rep2_ta,
            'ctl1_ta': ctl_rep1_ta,
            'ctl2_ta': ctl_rep2_ta,
            'rep1_xcor': exp_rep1_cc,
            'rep2_xcor': exp_rep2_cc,
            'rep1_paired_end': rep1_paired_end,
            'rep2_paired_end': rep2_paired_end,
            'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            'idr_peaks': True
        }
        if chrom_sizes:
            peaks_stage_input.update({'chrom_sizes': chrom_sizes})
        else:
            peaks_stage_input.update({
                'chrom_sizes':
                dxpy.dxlink({
                    'stage': encode_macs2_stage_id,
                    'inputField': 'chrom_sizes'
                })
            })

        encode_spp_stage_id = workflow.add_stage(
            encode_spp_applet,
            name=PEAKS_STAGE_NAME,
            folder=idr_peaks_output_folder,
            stage_input=peaks_stage_input)
        encode_spp_stages.append({
            'name': PEAKS_STAGE_NAME,
            'stage_id': encode_spp_stage_id
        })

        idr_applet = find_applet_by_name(IDR2_APPLET_NAME,
                                         applet_project.get_id())
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME,
                                                applet_project.get_id())
        idr_stages = []
        # idr_output_folder = resolve_folder(output_project, output_folder + '/' + idr_applet.name)
        idr_output_folder = idr_applet.name
        if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow:
            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR True Replicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooled_peaks'
                    })
                })
            idr_stages.append({
                'name': 'IDR True Replicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 1 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1pr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1pr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1_peaks'
                    })
                })
            idr_stages.append({
                'name': 'IDR Rep 1 Self-pseudoreplicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 2 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2pr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2pr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2_peaks'
                    })
                })
            idr_stages.append({
                'name': 'IDR Rep 2 Self-pseudoreplicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Pooled Pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooledpr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooledpr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooled_peaks'
                    })
                })
            idr_stages.append({
                'name': 'IDR Pooled Pseudoreplicates',
                'stage_id': idr_stage_id
            })

            final_idr_stage_input = {
                'reps_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in idr_stages
                        if ss['name'] == 'IDR True Replicates'),
                    'outputField':
                    'IDR_peaks'
                }),
                'r1pr_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in idr_stages
                        if ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'),
                    'outputField':
                    'IDR_peaks'
                }),
                'r2pr_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in idr_stages
                        if ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'),
                    'outputField':
                    'IDR_peaks'
                }),
                'pooledpr_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in idr_stages
                        if ss['name'] == 'IDR Pooled Pseudoreplicates'),
                    'outputField':
                    'IDR_peaks'
                }),
                'as_file':
                dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                'rep1_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep1_fc_signal'
                }),
                'rep2_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep2_fc_signal'
                }),
                'pooled_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'pooled_fc_signal'
                })
            }
            if blacklist:
                final_idr_stage_input.update({'blacklist': blacklist})
            if chrom_sizes:
                final_idr_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                final_idr_stage_input.update({
                    'chrom_sizes':
                    dxpy.dxlink({
                        'stage': encode_spp_stage_id,
                        'inputField': 'chrom_sizes'
                    })
                })

            idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_stage_input,
            )
            idr_stages.append({
                'name': 'Final IDR peak calls',
                'stage_id': idr_stage_id
            })

    if target_type == 'histone':
        overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME,
                                                   applet_project.get_id())
        overlap_peaks_stages = []
        for peaktype in ['narrowpeaks', 'gappedpeaks', 'broadpeaks']:

            if peaktype == 'narrowpeaks':
                as_file = dxpy.dxlink(resolve_file(args.narrowpeak_as))
                peak_type_extension = 'narrowPeak'

            elif peaktype == 'gappedpeaks':
                as_file = dxpy.dxlink(resolve_file(args.gappedpeak_as))
                peak_type_extension = 'gappedPeak'

            elif peaktype == 'broadpeaks':
                as_file = dxpy.dxlink(resolve_file(args.broadpeak_as))
                peak_type_extension = 'broadPeak'

            overlap_peaks_stage_input = {
                'rep1_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep1_%s' % (peaktype)
                }),
                'rep2_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep2_%s' % (peaktype)
                }),
                'pooled_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'pooled_%s' % (peaktype)
                }),
                'pooledpr1_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'pooledpr1_%s' % (peaktype)
                }),
                'pooledpr2_peaks':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'pooledpr2_%s' % (peaktype)
                }),
                'as_file':
                as_file,
                'peak_type':
                peak_type_extension,
                'prefix':
                'final',
                'rep1_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep1_fc_signal'
                }),
                'rep2_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'rep2_fc_signal'
                }),
                'pooled_signal':
                dxpy.dxlink({
                    'stage':
                    next(
                        ss.get('stage_id') for ss in encode_macs2_stages
                        if ss['name'] == 'ENCODE Peaks'),
                    'outputField':
                    'pooled_fc_signal'
                })
            }
            if chrom_sizes:
                overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes})
            else:
                overlap_peaks_stage_input.update({
                    'chrom_sizes':
                    dxpy.dxlink({
                        'stage': encode_macs2_stage_id,
                        'inputField': 'chrom_sizes'
                    })
                })

            overlap_peaks_stage_id = workflow.add_stage(
                overlap_peaks_applet,
                name='Final %s' % (peaktype),
                folder=peaks_output_folder,
                stage_input=overlap_peaks_stage_input)
            overlap_peaks_stages.append({
                'name': 'Final %s' % (peaktype),
                'stage_id': overlap_peaks_stage_id
            })

    if args.yes:
        if args.debug:
            job_id = workflow.run(
                {},
                folder=output_folder,
                priority='high',
                debug={'debugOn': ['AppInternalError', 'AppError']},
                delay_workspace_destruction=True,
                allow_ssh=['255.255.255.255'])
        else:
            job_id = workflow.run({}, folder=output_folder, priority='normal')
        logging.info("Running as job %s" % (job_id))
Beispiel #15
0
def main():
    """Create the ENCODE Bismark DNA-ME workflow for one experiment.

    Reads the command-line arguments via ``get_args()``, creates the
    ``/<experiment>`` folder in the analysis project if it is missing,
    resolves the replicate inputs (``args.replicates`` for single-end, or
    ``args.pair1``/``args.pair2`` for paired-end), creates a new workflow
    with ``dxpy.new_dxworkflow`` and passes it to ``populate_workflow``.

    Exits through ``sys.exit`` when a required replicate list is missing or
    empty, or when ``find_replicates`` returns nothing for a list.
    """
    args = get_args()

    ## resolve projects
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    # A single-argument print(...) prints the same text whether it is parsed
    # as a Python 2 print statement or as the print function.
    print('Project: ' + project.describe()['name'])
    print('Experiment to analyze: ' + args.experiment)
    experiment_folder = '/' + args.experiment
    if not project_has_folder(project, experiment_folder):
        project.new_folder(experiment_folder)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT,
                                    level='VIEW').get_id()

    def find_or_exit(file_names, label):
        """Return the replicates found for file_names, or exit with status 1."""
        found = find_replicates(file_names, source_id, project,
                                args.experiment, args.test)
        if not found:
            print("No replicates%s found in project: %s" %
                  (label, project.name))
            print("Looking for " + ", ".join(file_names))
            sys.exit(1)
        return found

    ## resolve replicates/fastq inputs
    paired = args.paired
    if not paired:
        # "not x" also rejects None, on which len() would raise TypeError
        # before the usage message could be shown.
        if not args.replicates:
            sys.exit(
                'Need to have at least 1 replicate file (unpaired) use -r or --replicates'
            )

        replicates = find_or_exit(args.replicates, '')
        dx_reps = {'reads': [dxpy.dxlink(r) for r in replicates]}
        read_files = args.replicates
    else:
        if not args.pair1 or not args.pair2:
            sys.exit(
                "Need to have at least 1 replicate in pair1 (--r1/--pair1) and pair2 (--r2/--pair2)"
            )

        # pair1 is resolved (and may exit) before pair2 is looked up.
        pair1reps = find_or_exit(args.pair1, ' for pair1')
        pair2reps = find_or_exit(args.pair2, ' for pair2')
        dx_reps = {
            'pair1_reads': [dxpy.dxlink(r) for r in pair1reps],
            'pair2_reads': [dxpy.dxlink(r) for r in pair2reps]
        }
        read_files = args.pair1 + args.pair2

    # Only the part of each file name before the first '.' goes into the name.
    rnames = '-'.join(r.split('.')[0] for r in read_files)

    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow ()
    spec_name = args.experiment + '-' + rnames
    title_root = 'dx_dna_me_'
    name_root = 'ENCODE Bismark DNA-ME pipeline: '
    desc = 'The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment'

    if args.export:
        # Exported workflows are generic: named after the read layout rather
        # than a specific experiment, and stored in the public project.
        # NOTE(review): the title ends up with a double underscore
        # ('dx_dna_me__paired_end'); left unchanged in case the title is
        # matched elsewhere.
        if paired:
            wf_title = title_root + '_paired_end'
            wf_name = name_root + '(paired-end)'
        else:
            wf_title = title_root + '_single_end'
            wf_name = name_root + '(single-end)'
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT,
                                           name_mode='exact',
                                           return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=wf_title,
                                 name=wf_name,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(
            title=title_root + spec_name,
            name=name_root + spec_name,
            # A space separates the sentence from the experiment accession.
            description=desc + ' ' + args.experiment,
            folder=experiment_folder,
            project=project_id)

    populate_workflow(wf, dx_reps, args.experiment, paired, gender, organism,
                      project.id, args.export)
Beispiel #16
0
def main():
    """Assemble the peak-calling workflow on DNAnexus and optionally run it.

    Configuration comes from ``get_args()``.  Unless ``args.nomap`` is set,
    a Map stage is created for each of Rep1, Rep2, Ctl1 (and Ctl2 unless
    ``args.unary_control``); every one of those that has input files (or all
    of them, when no input files were given at all) also gets a Filter_QC
    and an Xcor stage.  With ``args.nomap`` the files named on the command
    line are linked directly and two cross-correlation stages are added for
    the replicates.  An 'SPP Peaks' stage and an 'ENCODE Peaks' stage always
    follow; IDR stages may be added when ``args.idr`` is set.  When
    ``args.yes`` is set the finished workflow is launched.
    """
    args = get_args()

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    output_folder = resolve_folder(output_project, args.outf)
    logging.debug('Using output folder %s' % (output_folder))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    def applet_named(applet_name):
        # Find an applet by name inside the applet project.
        return find_applet_by_name(applet_name, applet_project.get_id())

    def folder_for(applet):
        # Resolve the output sub-folder that carries the applet's name.
        return resolve_folder(output_project,
                              output_folder + '/' + applet.name)

    def output_of(stage_id, field):
        # Link to an output field of a workflow stage.
        return dxpy.dxlink({'stage': stage_id, 'outputField': field})

    def input_of(stage_id, field):
        # Link to an input field of a workflow stage.
        return dxpy.dxlink({'stage': stage_id, 'inputField': field})

    def lookup(stages, wanted, key='stage_id'):
        # First record called `wanted` in a list of stage records; return
        # the value stored under `key` (None when the key is absent).
        return next(record.get(key) for record in stages
                    if record['name'] == wanted)

    workflow = dxpy.new_dxworkflow(
        name=args.name,
        title=args.title,
        description=WF_DESCRIPTION,
        project=output_project.get_id(),
        folder=output_folder)

    # A workflow with no input files at all is built as an empty template.
    blank_workflow = not any((args.rep1, args.rep2, args.ctl1, args.ctl2))

    genomesize = args.genomesize if args.genomesize else None

    chrom_sizes = None
    if args.chrom_sizes:
        chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes))

    blacklist = None
    if args.blacklist:
        blacklist = dxpy.dxlink(resolve_file(args.blacklist))

    if not args.nomap:
        # A "superstage" is a dict holding a name, the input file name(s),
        # and the ids of the stages that process that input.
        superstage_specs = [
            ('Rep1', args.rep1),
            ('Rep2', args.rep2),
            ('Ctl1', args.ctl1),
        ]
        if not args.unary_control:
            superstage_specs.append(('Ctl2', args.ctl2))
        mapping_superstages = [
            {'name': label, 'input_args': files}
            for label, files in superstage_specs
        ]

        mapping_applet = applet_named(MAPPING_APPLET_NAME)
        mapping_output_folder = folder_for(mapping_applet)
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = applet_named(FILTER_QC_APPLET_NAME)
        xcor_applet = applet_named(XCOR_APPLET_NAME)
        # Filter/QC and cross-correlation outputs share the mapping folder.
        filter_qc_output_folder = mapping_output_folder
        xcor_output_folder = mapping_output_folder

        # Pass 1: create every mapping stage so that its id exists before
        # any stage input refers to it.
        for superstage in mapping_superstages:
            superstage['map_stage_id'] = workflow.add_stage(
                mapping_applet,
                name='Map %s' % (superstage.get('name')),
                folder=mapping_output_folder)

        rep1_stage_id = lookup(mapping_superstages, 'Rep1',
                               key='map_stage_id')

        # Pass 2: fill in the mapping inputs and chain Filter_QC and Xcor.
        for superstage in mapping_superstages:
            label = superstage.get('name')
            map_stage_id = superstage.get('map_stage_id')

            if not (superstage.get('input_args') or blank_workflow):
                continue

            mapping_stage_input = {}
            if label != "Rep1":
                # Every other mapping stage reuses Rep1's reference input.
                mapping_stage_input['reference_tar'] = input_of(
                    rep1_stage_id, 'reference_tar')
            elif args.reference:
                mapping_stage_input['reference_tar'] = dxpy.dxlink(
                    reference_tar.get_id())

            if not blank_workflow:
                # Read files are assumed to be given in order read1, read2.
                for position, read_file in enumerate(
                        superstage['input_args'], 1):
                    mapping_stage_input['reads%d' % (position)] = dxpy.dxlink(
                        resolve_file(read_file).get_id())

            workflow.update_stage(map_stage_id,
                                  stage_input=mapping_stage_input)

            filter_qc_stage_id = workflow.add_stage(
                filter_qc_applet,
                name='Filter_QC %s' % (label),
                folder=filter_qc_output_folder,
                stage_input={
                    'input_bam': output_of(map_stage_id, 'mapped_reads'),
                    'paired_end': output_of(map_stage_id, 'paired_end'),
                })
            superstage['filter_qc_stage_id'] = filter_qc_stage_id

            xcor_stage_id = workflow.add_stage(
                xcor_applet,
                name='Xcor %s' % (label),
                folder=xcor_output_folder,
                stage_input={
                    'input_bam': output_of(filter_qc_stage_id,
                                           'filtered_bam'),
                    'paired_end': output_of(filter_qc_stage_id,
                                            'paired_end'),
                })
            superstage['xcor_stage_id'] = xcor_stage_id

        def xcor_output(label, field):
            # Link to an output of the Xcor stage of the named superstage.
            return output_of(
                lookup(mapping_superstages, label, key='xcor_stage_id'),
                field)

        exp_rep1_ta = xcor_output('Rep1', 'tagAlign_file')
        exp_rep1_cc = xcor_output('Rep1', 'CC_scores_file')
        exp_rep2_ta = xcor_output('Rep2', 'tagAlign_file')
        exp_rep2_cc = xcor_output('Rep2', 'CC_scores_file')
        ctl_rep1_ta = xcor_output('Ctl1', 'tagAlign_file')
        if args.unary_control:
            # A single control serves both replicates.
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = xcor_output('Ctl2', 'tagAlign_file')
        rep1_paired_end = xcor_output('Rep1', 'paired_end')
        rep2_paired_end = xcor_output('Rep2', 'paired_end')
    else:
        # Mapping was skipped: take the first file of each argument as is.
        def first_file_link(file_names):
            return dxpy.dxlink(resolve_file(file_names[0]).get_id())

        exp_rep1_ta = first_file_link(args.rep1)
        exp_rep2_ta = first_file_link(args.rep2)
        ctl_rep1_ta = first_file_link(args.ctl1)
        ctl_rep2_ta = first_file_link(args.ctl2)
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        # Only tagAligns were supplied, so the cross-correlation scores
        # still have to be computed by dedicated stages.
        xcor_only_applet = applet_named(XCOR_ONLY_APPLET_NAME)
        xcor_output_folder = folder_for(xcor_only_applet)
        xcor_only_stages = []

        def add_xcor_only(stage_name, record_key, tagalign, paired_end):
            # Add one cross-correlation stage, record its id, and return a
            # link to its scores file.
            cc_stage_id = workflow.add_stage(
                xcor_only_applet,
                name=stage_name,
                folder=xcor_output_folder,
                stage_input={
                    'input_tagAlign': tagalign,
                    'paired_end': paired_end,
                })
            xcor_only_stages.append({record_key: cc_stage_id})
            return output_of(cc_stage_id, 'CC_scores_file')

        exp_rep1_cc = add_xcor_only("Rep1 cross-correlation",
                                    'xcor_only_rep1_id',
                                    exp_rep1_ta, rep1_paired_end)
        exp_rep2_cc = add_xcor_only("Rep2 cross-correlation",
                                    'xcor_only_rep2_id',
                                    exp_rep2_ta, rep2_paired_end)

    # Inputs shared by both peak-calling stages.
    common_peak_inputs = {
        'rep1_ta': exp_rep1_ta,
        'rep2_ta': exp_rep2_ta,
        'ctl1_ta': ctl_rep1_ta,
        'ctl2_ta': ctl_rep2_ta,
        'rep1_xcor': exp_rep1_cc,
        'rep2_xcor': exp_rep2_cc,
        'rep1_paired_end': rep1_paired_end,
        'rep2_paired_end': rep2_paired_end,
    }

    encode_spp_applet = applet_named(ENCODE_SPP_APPLET_NAME)
    encode_spp_stages = []
    idr_peaks_output_folder = folder_for(encode_spp_applet)
    PEAKS_STAGE_NAME = 'SPP Peaks'
    peaks_stage_input = dict(common_peak_inputs)
    peaks_stage_input['as_file'] = dxpy.dxlink(
        resolve_file(args.narrowpeak_as))
    peaks_stage_input['idr_peaks'] = args.idr
    if chrom_sizes:
        peaks_stage_input['chrom_sizes'] = chrom_sizes
    encode_spp_stage_id = workflow.add_stage(
        encode_spp_applet,
        name=PEAKS_STAGE_NAME,
        folder=idr_peaks_output_folder,
        stage_input=peaks_stage_input)
    encode_spp_stages.append({
        'name': PEAKS_STAGE_NAME,
        'stage_id': encode_spp_stage_id,
    })

    encode_macs2_applet = applet_named(ENCODE_MACS2_APPLET_NAME)
    encode_macs2_stages = []
    peaks_output_folder = folder_for(encode_macs2_applet)

    macs2_stage_input = dict(common_peak_inputs)
    macs2_stage_input['narrowpeak_as'] = dxpy.dxlink(
        resolve_file(args.narrowpeak_as))
    macs2_stage_input['gappedpeak_as'] = dxpy.dxlink(
        resolve_file(args.gappedpeak_as))
    macs2_stage_input['broadpeak_as'] = dxpy.dxlink(
        resolve_file(args.broadpeak_as))
    if genomesize:
        macs2_stage_input['genomesize'] = genomesize
    if chrom_sizes:
        macs2_stage_input['chrom_sizes'] = chrom_sizes
    else:
        # Fall back to whatever the SPP stage is given at run time.
        macs2_stage_input['chrom_sizes'] = input_of(encode_spp_stage_id,
                                                    'chrom_sizes')
    encode_macs2_stage_id = workflow.add_stage(
        encode_macs2_applet,
        name='ENCODE Peaks',
        folder=peaks_output_folder,
        stage_input=macs2_stage_input)
    encode_macs2_stages.append({
        'name': 'ENCODE Peaks',
        'stage_id': encode_macs2_stage_id,
    })

    if args.idr:
        idr_applet = applet_named(IDR2_APPLET_NAME)
        encode_idr_applet = applet_named(ENCODE_IDR_APPLET_NAME)
        idr_stages = []
        idr_output_folder = folder_for(idr_applet)

        if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow:

            def spp_output(field):
                # Link to an output of the SPP peaks stage.
                return output_of(
                    lookup(encode_spp_stages, PEAKS_STAGE_NAME), field)

            def idr_output(idr_name):
                # Link to the IDR_peaks output of a named IDR stage.
                return output_of(lookup(idr_stages, idr_name), 'IDR_peaks')

            # (stage name, rep1_peaks field, rep2_peaks field,
            #  pooled_peaks field) for each IDR comparison, in stage order.
            idr_comparisons = (
                ('IDR True Replicates',
                 'rep1_peaks', 'rep2_peaks', 'pooled_peaks'),
                ('IDR Rep 1 Self-pseudoreplicates',
                 'rep1pr1_peaks', 'rep1pr2_peaks', 'rep1_peaks'),
                ('IDR Rep 2 Self-pseudoreplicates',
                 'rep2pr1_peaks', 'rep2pr2_peaks', 'rep2_peaks'),
                ('IDR Pooled Pseudoreplicates',
                 'pooledpr1_peaks', 'pooledpr2_peaks', 'pooled_peaks'),
            )
            for idr_name, first, second, pooled in idr_comparisons:
                idr_stage_id = workflow.add_stage(
                    idr_applet,
                    name=idr_name,
                    folder=idr_output_folder,
                    stage_input={
                        'rep1_peaks': spp_output(first),
                        'rep2_peaks': spp_output(second),
                        'pooled_peaks': spp_output(pooled),
                    })
                idr_stages.append({
                    'name': idr_name,
                    'stage_id': idr_stage_id,
                })

            stage_input = {
                'reps_peaks':
                idr_output('IDR True Replicates'),
                'r1pr_peaks':
                idr_output('IDR Rep 1 Self-pseudoreplicates'),
                'r2pr_peaks':
                idr_output('IDR Rep 2 Self-pseudoreplicates'),
                'pooledpr_peaks':
                idr_output('IDR Pooled Pseudoreplicates'),
                'as_file':
                dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            }
            if blacklist:
                stage_input['blacklist'] = blacklist
            if chrom_sizes:
                stage_input['chrom_sizes'] = chrom_sizes
            else:
                stage_input['chrom_sizes'] = input_of(encode_spp_stage_id,
                                                      'chrom_sizes')
            idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=stage_input)
            idr_stages.append({
                'name': 'Final IDR peak calls',
                'stage_id': idr_stage_id,
            })

    if args.nomap:
        logging.debug("xcor only stages: %s" % (xcor_only_stages))
    else:
        logging.debug("Mapping stages: %s" % (mapping_superstages))
    logging.debug("Peak stages: %s" % (encode_spp_stages))
    if args.idr:
        logging.debug("IDR stages: %s" % (idr_stages))

    if args.yes:
        run_options = {'priority': 'high'}
        if args.debug:
            # Keep the workspace around and allow SSH when app errors occur.
            run_options.update(
                debug={'debugOn': ['AppInternalError', 'AppError']},
                delay_workspace_destruction=True,
                allow_ssh=['255.255.255.255'])
        job_id = workflow.run({}, **run_options)
        logging.info("Running as job %s" % (job_id))
Beispiel #17
0
def main():
	"""Assemble the mapping, peak-calling and IDR stages of a DNAnexus workflow.

	Configuration comes from ``get_args()``.  A new workflow is created in
	the output project/folder and the following stages are added:

	* for each of Rep1, Rep2, Ctl1 and Ctl2 that has input files (or for all
	  four when no input files were given at all): Map, Filter_QC and Xcor;
	* 'Peaks Rep1' / 'Peaks Rep2' when the replicate and its control are
	  both present (or the workflow is blank);
	* when ``args.idr`` is set and all four inputs are present (or the
	  workflow is blank): 'Peaks for IDR' followed by four IDR stages.

	Every stage is added with ``instance_type=args.instance_type``.  The
	function only builds the workflow; it neither runs it nor returns it.
	"""
	args = get_args()

	output_project = resolve_project(args.outp, 'w')
	logging.info('Found output project %s', output_project.name)
	output_folder = resolve_folder(output_project, args.outf)
	logging.info('Using output folder %s', output_folder)
	applet_project = resolve_project(args.applets, 'r')
	logging.info('Found applet project %s', applet_project.name)

	workflow = dxpy.new_dxworkflow(
		title=WF_TITLE,
		name=args.name,
		description=WF_DESCRIPTION,
		project=output_project.get_id(),
		folder=output_folder)

	# A workflow with no input files at all is built as an empty template.
	blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

	def applet_named(applet_name):
		# Find an applet by name inside the applet project.
		return find_applet_by_name(applet_name, applet_project.get_id())

	def folder_for(applet):
		# Resolve the output sub-folder that carries the applet's name.
		return resolve_folder(output_project, output_folder + '/' + applet.name)

	def add_stage(applet, name, folder, stage_input):
		# Add a stage using the instance type chosen on the command line.
		return workflow.add_stage(
			applet,
			name=name,
			folder=folder,
			stage_input=stage_input,
			instance_type=args.instance_type)

	def output_link(stage_id, field):
		# Link to an output field of a workflow stage.
		return dxpy.dxlink({'stage': stage_id, 'outputField': field})

	def xcor_output(superstage_name, field):
		# Link to an output of the Xcor stage of the named superstage.
		xcor_id = next(
			ss.get('xcor_stage_id') for ss in mapping_superstages
			if ss['name'] == superstage_name)
		return output_link(xcor_id, field)

	def idr_input_peaks(field):
		# Link to an output of the 'Peaks for IDR' stage.
		peaks_id = next(
			ss.get('stage_id') for ss in encode_spp_stages
			if ss['name'] == 'Peaks for IDR')
		return output_link(peaks_id, field)

	# This whole strategy is fragile: subsequent code assumes that the
	# replicates come before the controls.
	# A "superstage" is a dict holding a name, the input file name(s), and
	# the ids of the stages that process that input.
	mapping_superstages = [
		{'name': 'Rep1', 'input_args': args.rep1},
		{'name': 'Rep2', 'input_args': args.rep2},
		{'name': 'Ctl1', 'input_args': args.ctl1},
		{'name': 'Ctl2', 'input_args': args.ctl2},
	]

	mapping_applet = applet_named(MAPPING_APPLET_NAME)
	mapping_output_folder = folder_for(mapping_applet)
	reference_tar = resolve_file(args.reference)
	filter_qc_applet = applet_named(FILTER_QC_APPLET_NAME)
	filter_qc_output_folder = mapping_output_folder
	xcor_applet = applet_named(XCOR_APPLET_NAME)
	xcor_output_folder = mapping_output_folder

	for mapping_superstage in mapping_superstages:
		superstage_name = mapping_superstage.get('name')

		if not (mapping_superstage.get('input_args') or blank_workflow):
			continue

		if blank_workflow:
			mapping_stage_input = None
		else:
			mapping_stage_input = {'reference_tar': dxpy.dxlink(reference_tar.get_id())}
			# Read files are assumed to be given in order read1, read2.
			for arg_index, input_arg in enumerate(mapping_superstage['input_args'], 1):
				mapping_stage_input['reads%d' % arg_index] = dxpy.dxlink(
					resolve_file(input_arg).get_id())

		mapped_stage_id = add_stage(
			mapping_applet,
			'Map %s' % superstage_name,
			mapping_output_folder,
			mapping_stage_input)
		mapping_superstage['map_stage_id'] = mapped_stage_id

		filter_qc_stage_id = add_stage(
			filter_qc_applet,
			'Filter_QC %s' % superstage_name,
			filter_qc_output_folder,
			{
				'input_bam': output_link(mapped_stage_id, 'mapped_reads'),
				'paired_end': output_link(mapped_stage_id, 'paired_end'),
			})
		mapping_superstage['filter_qc_stage_id'] = filter_qc_stage_id

		xcor_stage_id = add_stage(
			xcor_applet,
			'Xcor %s' % superstage_name,
			xcor_output_folder,
			{
				'input_bam': output_link(filter_qc_stage_id, 'filtered_bam'),
				'paired_end': output_link(filter_qc_stage_id, 'paired_end'),
			})
		mapping_superstage['xcor_stage_id'] = xcor_stage_id

	spp_applet = applet_named(SPP_APPLET_NAME)
	spp_stages = []
	peaks_output_folder = folder_for(spp_applet)
	# One per-replicate peak stage, each paired with its own control.
	replicate_pairs = (
		('Rep1', 'Ctl1', args.rep1, args.ctl1),
		('Rep2', 'Ctl2', args.rep2, args.ctl2),
	)
	for rep_name, ctl_name, rep_files, ctl_files in replicate_pairs:
		if (rep_files and ctl_files) or blank_workflow:
			peaks_name = 'Peaks %s' % rep_name
			spp_stage_id = add_stage(
				spp_applet,
				peaks_name,
				peaks_output_folder,
				{
					'experiment': xcor_output(rep_name, 'tagAlign_file'),
					'control': xcor_output(ctl_name, 'tagAlign_file'),
					'xcor_scores_input': xcor_output(rep_name, 'CC_scores_file'),
				})
			spp_stages.append({'name': peaks_name, 'stage_id': spp_stage_id})

	encode_spp_applet = applet_named(ENCODE_SPP_APPLET_NAME)
	encode_spp_stages = []
	if args.idr:
		idr_peaks_output_folder = folder_for(encode_spp_applet)
		if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow:
			encode_spp_stage_id = add_stage(
				encode_spp_applet,
				'Peaks for IDR',
				idr_peaks_output_folder,
				{
					'rep1_ta': xcor_output('Rep1', 'tagAlign_file'),
					'rep2_ta': xcor_output('Rep2', 'tagAlign_file'),
					'ctl1_ta': xcor_output('Ctl1', 'tagAlign_file'),
					'ctl2_ta': xcor_output('Ctl2', 'tagAlign_file'),
					# The field name must be the bare CC_scores_file used by
					# the per-replicate peak stages above; it used to be
					# written with literal double quotes inside the string,
					# which names a different (non-matching) output field.
					'rep1_xcor': xcor_output('Rep1', 'CC_scores_file'),
					'rep2_xcor': xcor_output('Rep2', 'CC_scores_file'),
					# Here we assume that a paired-end Rep1 means a
					# paired-end experiment - needs better error checking.
					'paired_end': xcor_output('Rep1', 'paired_end'),
				})
			encode_spp_stages.append({'name': 'Peaks for IDR', 'stage_id': encode_spp_stage_id})

	idr_applet = applet_named(IDR_APPLET_NAME)
	idr_stages = []
	if args.idr:
		idr_output_folder = folder_for(idr_applet)
		if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow:
			# (stage name, rep1_peaks field, rep2_peaks field,
			#  pooled_peaks field) for each IDR comparison, in stage order.
			# NOTE(review): 'Pseudoeplicates' is misspelled; the stage name
			# is left unchanged in case anything matches on it - confirm
			# before correcting.
			idr_comparisons = (
				('IDR True Replicates',
				 'rep1_peaks', 'rep2_peaks', 'pooled_peaks'),
				('IDR Rep 1 Self-pseudoreplicates',
				 'rep1pr1_peaks', 'rep1pr2_peaks', 'rep1_peaks'),
				('IDR Rep 2 Self-pseudoreplicates',
				 'rep2pr1_peaks', 'rep2pr2_peaks', 'rep2_peaks'),
				('IDR Pooled Pseudoeplicates',
				 'pooledpr1_peaks', 'pooledpr2_peaks', 'pooled_peaks'),
			)
			for idr_name, first_field, second_field, pooled_field in idr_comparisons:
				idr_stage_id = add_stage(
					idr_applet,
					idr_name,
					idr_output_folder,
					{
						'rep1_peaks': idr_input_peaks(first_field),
						'rep2_peaks': idr_input_peaks(second_field),
						'pooled_peaks': idr_input_peaks(pooled_field),
					})
				idr_stages.append({'name': idr_name, 'stage_id': idr_stage_id})

	logging.debug("Mapping stages: %s", mapping_superstages)
	logging.debug("Peak stages: %s", spp_stages)
	logging.debug("Peaks for IDR stages: %s", encode_spp_stages)
	logging.debug("IDR stages: %s", idr_stages)
Beispiel #18
0
def _stage_output(stage_id, field):
    """Return a dxlink to output ``field`` of the workflow stage ``stage_id``."""
    return dxpy.dxlink({'stage': stage_id, 'outputField': field})


def main():
    """Assemble a peak-calling workflow from the command line and optionally run it.

    The workflow is created with ``dxpy.new_dxworkflow`` in the output project
    and folder named by the arguments, and stages are appended in this order:

    1. Unless ``args.nomap`` is set: a Map, Filter_QC and Xcor stage for each
       of Rep1, Rep2, Ctl1 and Ctl2 that has inputs.  With ``args.nomap`` the
       replicate/control arguments are used directly as tagAlign inputs and
       only a cross-correlation stage is added for each experiment replicate.
    2. An "ENCODE Peaks" stage followed by one "Overlap ..." stage per peak
       type (narrow, gapped, broad).
    3. If ``args.idr`` is set: four IDR comparison stages and a
       "Final IDR peak calls" stage.

    Steps 2 and 3 only add stages when all four of rep1/rep2/ctl1/ctl2 were
    supplied, or when none were (a "blank" template workflow with every stage).

    When ``args.yes`` is set the workflow is launched at high priority; with
    ``args.debug`` it is launched with debug-on-error, delayed workspace
    destruction and ssh access enabled.

    Returns:
        None.
    """
    args = get_args()

    output_project = resolve_project(args.outp, 'w')
    logging.info('Found output project %s', output_project.name)
    output_folder = resolve_folder(output_project, args.outf)
    logging.info('Using output folder %s', output_folder)
    applet_project = resolve_project(args.applets, 'r')
    logging.info('Found applet project %s', applet_project.name)

    workflow = dxpy.new_dxworkflow(name=args.name,
                                   title=args.title,
                                   description=WF_DESCRIPTION,
                                   project=output_project.get_id(),
                                   folder=output_folder)

    # No replicate or control inputs at all: build every stage, unpopulated.
    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)
    # Peak calling, overlap and IDR need all four inputs unless building the
    # blank template.
    call_peaks = (args.rep1 and args.ctl1 and args.rep2
                  and args.ctl2) or blank_workflow

    if not args.nomap:
        # A "superstage" is just a dict with a name, the input file
        # argument(s), and then the ids of the stages that process that
        # input.  Each superstage could be a single stage in a more abstract
        # workflow that calls the applets added separately here.
        # NOTE: 'Pooled Reps' / 'Pooled Controls' superstages are not built;
        # the idea was to create stub stages and populate their inputs from
        # the output of a pooling stage.
        mapping_superstages = [
            {'name': 'Rep1', 'input_args': args.rep1},
            {'name': 'Rep2', 'input_args': args.rep2},
            {'name': 'Ctl1', 'input_args': args.ctl1},
            {'name': 'Ctl2', 'input_args': args.ctl2},
        ]

        mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME,
                                             applet_project.get_id())
        mapping_output_folder = resolve_folder(
            output_project, output_folder + '/' + mapping_applet.name)
        # NOTE(review): args.reference is resolved even when it is falsy
        # (the blank-workflow path below tests it afterwards) -- confirm
        # resolve_file tolerates that.
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME,
                                               applet_project.get_id())
        # Filter/QC and cross-correlation outputs share the mapping folder.
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME,
                                          applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            input_args = mapping_superstage.get('input_args')
            if not (input_args or blank_workflow):
                continue

            if blank_workflow and not args.reference:
                mapping_stage_input = None
            else:
                mapping_stage_input = {
                    'reference_tar': dxpy.dxlink(reference_tar.get_id())
                }
                if not blank_workflow:
                    # Read pairs are assumed to be in order read1, read2.
                    for arg_index, input_arg in enumerate(input_args):
                        reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                        mapping_stage_input['reads%d' % (arg_index + 1)] = reads

            mapped_stage_id = workflow.add_stage(
                mapping_applet,
                name='Map %s' % (superstage_name),
                folder=mapping_output_folder,
                stage_input=mapping_stage_input)
            mapping_superstage['map_stage_id'] = mapped_stage_id

            filter_qc_stage_id = workflow.add_stage(
                filter_qc_applet,
                name='Filter_QC %s' % (superstage_name),
                folder=filter_qc_output_folder,
                stage_input={
                    'input_bam': _stage_output(mapped_stage_id,
                                               'mapped_reads'),
                    'paired_end': _stage_output(mapped_stage_id,
                                                'paired_end')
                })
            mapping_superstage['filter_qc_stage_id'] = filter_qc_stage_id

            xcor_stage_id = workflow.add_stage(
                xcor_applet,
                name='Xcor %s' % (superstage_name),
                folder=xcor_output_folder,
                stage_input={
                    'input_bam': _stage_output(filter_qc_stage_id,
                                               'filtered_bam'),
                    'paired_end': _stage_output(filter_qc_stage_id,
                                                'paired_end')
                })
            mapping_superstage['xcor_stage_id'] = xcor_stage_id

        # Superstages that were skipped above have no xcor stage; their id is
        # None here, exactly as a .get() on the superstage dict would give.
        xcor_ids = dict((ss['name'], ss.get('xcor_stage_id'))
                        for ss in mapping_superstages)

        exp_rep1_ta = _stage_output(xcor_ids['Rep1'], 'tagAlign_file')
        exp_rep1_cc = _stage_output(xcor_ids['Rep1'], 'CC_scores_file')
        exp_rep2_ta = _stage_output(xcor_ids['Rep2'], 'tagAlign_file')
        exp_rep2_cc = _stage_output(xcor_ids['Rep2'], 'CC_scores_file')
        ctl_rep1_ta = _stage_output(xcor_ids['Ctl1'], 'tagAlign_file')
        ctl_rep2_ta = _stage_output(xcor_ids['Ctl2'], 'tagAlign_file')
        rep1_paired_end = _stage_output(xcor_ids['Rep1'], 'paired_end')
        rep2_paired_end = _stage_output(xcor_ids['Rep2'], 'paired_end')
    else:
        # Mapping was skipped, so just bring in the inputs from arguments.
        # NOTE(review): this indexes args.rep1/rep2/ctl1/ctl2 directly, so it
        # presumably requires all four to be supplied with --nomap -- confirm.
        exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
        exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
        ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
        ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        # Only tagAligns were supplied, so the CC scores files must be
        # calculated here; had we mapped above we would already have a
        # handle to them.
        xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME,
                                               applet_project.get_id())
        xcor_output_folder = resolve_folder(
            output_project, output_folder + '/' + xcor_only_applet.name)
        xcor_only_stages = []

        exp_rep1_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep1 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep1_ta,
                'paired_end': rep1_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        exp_rep1_cc = _stage_output(exp_rep1_cc_stage_id, 'CC_scores_file')

        exp_rep2_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep2 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep2_ta,
                'paired_end': rep2_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        exp_rep2_cc = _stage_output(exp_rep2_cc_stage_id, 'CC_scores_file')

    encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME,
                                              applet_project.get_id())
    encode_macs2_stages = []
    # Initialised up front: this list is logged at the end even when no
    # overlap stage is created (only some of the inputs were supplied).
    overlap_peaks_stages = []
    peaks_output_folder = resolve_folder(
        output_project, output_folder + '/' + encode_macs2_applet.name)

    if call_peaks:
        encode_macs2_stage_id = workflow.add_stage(
            encode_macs2_applet,
            name='ENCODE Peaks',
            folder=peaks_output_folder,
            stage_input={
                'rep1_ta': exp_rep1_ta,
                'rep2_ta': exp_rep2_ta,
                'ctl1_ta': ctl_rep1_ta,
                'ctl2_ta': ctl_rep2_ta,
                'rep1_xcor': exp_rep1_cc,
                'rep2_xcor': exp_rep2_cc,
                'rep1_paired_end': rep1_paired_end,
                'rep2_paired_end': rep2_paired_end,
                'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
                'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)),
                'genomesize': args.genomesize
            })
        encode_macs2_stages.append({
            'name': 'ENCODE Peaks',
            'stage_id': encode_macs2_stage_id
        })

        # Naive peak processing, similar to IDR: one overlap stage per peak
        # type, each fed from the matching outputs of the ENCODE Peaks stage.
        overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME,
                                                   applet_project.get_id())
        # (ENCODE Peaks output suffix, .as file argument, peak_type value)
        peak_types = [
            ('narrowpeaks', args.narrowpeak_as, 'narrowPeak'),
            ('gappedpeaks', args.gappedpeak_as, 'gappedPeak'),
            ('broadpeaks', args.broadpeak_as, 'broadPeak'),
        ]
        for peaktype, as_arg, peak_type_extension in peak_types:
            as_file = dxpy.dxlink(resolve_file(as_arg))
            overlap_stage_name = 'Overlap %s' % (peaktype)
            overlap_peaks_stage_id = workflow.add_stage(
                overlap_peaks_applet,
                name=overlap_stage_name,
                folder=peaks_output_folder,
                stage_input={
                    'rep1_peaks': _stage_output(
                        encode_macs2_stage_id, 'rep1_%s' % (peaktype)),
                    'rep2_peaks': _stage_output(
                        encode_macs2_stage_id, 'rep2_%s' % (peaktype)),
                    'pooled_peaks': _stage_output(
                        encode_macs2_stage_id, 'pooled_%s' % (peaktype)),
                    'pooledpr1_peaks': _stage_output(
                        encode_macs2_stage_id, 'pooledpr1_%s' % (peaktype)),
                    'pooledpr2_peaks': _stage_output(
                        encode_macs2_stage_id, 'pooledpr2_%s' % (peaktype)),
                    'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                    'as_file': as_file,
                    'peak_type': peak_type_extension
                })
            overlap_peaks_stages.append({
                'name': overlap_stage_name,
                'stage_id': overlap_peaks_stage_id
            })

    # TODO - IDR on gapped and broad peaks
    if args.idr:
        idr_applet = find_applet_by_name(IDR_APPLET_NAME,
                                         applet_project.get_id())
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME,
                                                applet_project.get_id())
        idr_output_folder = resolve_folder(
            output_project, output_folder + '/' + idr_applet.name)
        idr_stages = []
        if call_peaks:
            idr_version = int(args.idrversion)
            # (stage name, then the ENCODE Peaks outputs wired to the IDR
            # applet's rep1_peaks, rep2_peaks and pooled_peaks inputs)
            idr_comparisons = [
                ('IDR True Replicates',
                 'rep1_narrowpeaks', 'rep2_narrowpeaks',
                 'pooled_narrowpeaks'),
                ('IDR Rep 1 Self-pseudoreplicates',
                 'rep1pr1_narrowpeaks', 'rep1pr2_narrowpeaks',
                 'rep1_narrowpeaks'),
                ('IDR Rep 2 Self-pseudoreplicates',
                 'rep2pr1_narrowpeaks', 'rep2pr2_narrowpeaks',
                 'rep2_narrowpeaks'),
                ('IDR Pooled Pseudoreplicates',
                 'pooledpr1_narrowpeaks', 'pooledpr2_narrowpeaks',
                 'pooled_narrowpeaks'),
            ]
            idr_stage_ids = {}
            for (stage_name, rep1_field, rep2_field,
                 pooled_field) in idr_comparisons:
                idr_stage_id = workflow.add_stage(
                    idr_applet,
                    name=stage_name,
                    folder=idr_output_folder,
                    stage_input={
                        'rep1_peaks': _stage_output(encode_macs2_stage_id,
                                                    rep1_field),
                        'rep2_peaks': _stage_output(encode_macs2_stage_id,
                                                    rep2_field),
                        'pooled_peaks': _stage_output(encode_macs2_stage_id,
                                                      pooled_field),
                        'idr_version': idr_version
                    })
                idr_stages.append({
                    'name': stage_name,
                    'stage_id': idr_stage_id
                })
                idr_stage_ids[stage_name] = idr_stage_id

            # The final stage consumes the IDR_peaks output of all four
            # comparison stages.
            final_idr_stage_input = {
                'reps_peaks': _stage_output(
                    idr_stage_ids['IDR True Replicates'], 'IDR_peaks'),
                'r1pr_peaks': _stage_output(
                    idr_stage_ids['IDR Rep 1 Self-pseudoreplicates'],
                    'IDR_peaks'),
                'r2pr_peaks': _stage_output(
                    idr_stage_ids['IDR Rep 2 Self-pseudoreplicates'],
                    'IDR_peaks'),
                'pooledpr_peaks': _stage_output(
                    idr_stage_ids['IDR Pooled Pseudoreplicates'],
                    'IDR_peaks'),
                'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as))
            }
            if args.blacklist:
                final_idr_stage_input['blacklist'] = dxpy.dxlink(
                    resolve_file(args.blacklist))
            idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_stage_input)
            idr_stages.append({
                'name': 'Final IDR peak calls',
                'stage_id': idr_stage_id
            })

    if not args.nomap:
        logging.debug("Mapping stages: %s", mapping_superstages)
    else:
        logging.debug("xcor only stages: %s", xcor_only_stages)
    logging.debug("Peaks for ENCODE stages: %s", encode_macs2_stages)
    logging.debug("Peak overlap stages: %s", overlap_peaks_stages)
    if args.idr:
        logging.debug("IDR stages: %s", idr_stages)

    if args.yes:
        run_options = {'priority': 'high'}
        if args.debug:
            run_options.update(
                debug={'debugOn': ['AppInternalError', 'AppError']},
                delay_workspace_destruction=True,
                allow_ssh=['255.255.255.255'])
        job_id = workflow.run({}, **run_options)
        logging.info("Running as job %s", job_id)
Beispiel #19
0
def main():
    """Create a workflow for one experiment's replicate files and populate it.

    Steps, in order:
      * exit unless at least one replicate file name was given;
      * make sure the '/<experiment>' folder exists in the working project;
      * look the replicate files up by exact name (in the working project
        with ``args.test``, otherwise in the snapshot project, from which
        they are then copied into the experiment folder);
      * exit with status 1 if no replicate file was found;
      * create the workflow, either in the public project/folder
        (``args.export``) or in the experiment folder, and hand it to
        ``populate_workflow`` together with the collected inputs.
    """
    args = get_args()
    if len(args.replicates) == 0:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    print('Experiment to analyze: ' + args.experiment)
    experiment_folder = '/' + args.experiment
    if not project_has_folder(project, experiment_folder):
        project.new_folder(experiment_folder)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    source_id = (project.get_id() if args.test else resolve_project(
        ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id())

    # Every match of every requested file name, in the order requested.
    replicates = [
        found
        for rep_name in args.replicates
        for found in dxpy.find_data_objects(classname='file',
                                            name=rep_name,
                                            name_mode='exact',
                                            project=source_id,
                                            return_handler=False)
    ]

    if not args.test:
        replicates = copy_files(replicates, project.get_id(),
                                experiment_folder)

    if not replicates:
        print("No replicates found in project: " + project.name)
        print("Looking for " + ", ".join(args.replicates))
        sys.exit(1)

    #TODO determine paired or gender from ENCSR metadata
    inputs = {
        'rnd_seed': 12345,
        'paired': args.paired,
        'gender': args.gender,
        'organism': args.organism,
        'library_id': args.library,
        'nthreads': args.nthreads,
    }
    # The spec name is the experiment followed by each replicate file name
    # truncated at its first '.'.
    replicate_stems = [fname.split('.')[0] for fname in args.replicates]
    inputs['spec_name'] = args.experiment + '-' + '-'.join(replicate_stems)

    # Now create a new workflow ()
    desc = 'The ENCODE RNA Seq pipeline for long RNAs'
    if args.paired:
        title_suffix, name_suffix = '_paired_end ', '(paired-end) '
        inputs['stranded'] = True
    else:
        title_suffix, name_suffix = '_single_end ', '(single-end) '
        inputs['stranded'] = False
    title_root = 'dx_long_rna_seq_' + title_suffix
    name_root = 'ENCODE Long RNA Seq: ' + name_suffix

    if args.export:
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT,
                                           name_mode='exact',
                                           return_handler=False)['id']
        workflow_spec = dict(title=title_root,
                             name=name_root,
                             description=desc,
                             folder=PUBLIC_FOLDER,
                             project=project_id)
    else:
        project_id = project.get_id()
        workflow_spec = dict(title=title_root + inputs['spec_name'],
                             name=name_root + inputs['spec_name'],
                             description=desc + ' for experiment:' +
                             args.experiment,
                             folder=experiment_folder,
                             project=project.get_id())
    wf = dxpy.new_dxworkflow(**workflow_spec)

    populate_workflow(wf, replicates, args.experiment, inputs, project.id,
                      args.export)
Beispiel #20
0
def build_workflow():
    """Build, close and return the ID of the WARDEN RNA-Seq workflow.

    Configuration is read from module-level names rather than arguments:

    * ``parameters`` -- string-valued settings; flags are compared against
      the strings ``'true'`` / ``'false'`` / ``"None"`` / ``"null"``.
    * ``samples`` -- maps a sample name to a sequence whose first item is
      the forward-read file and whose second item is the reverse-read file,
      or ``"-"`` when there is no reverse read.
    * ``app_names`` -- maps applet keys to applet names.
    * ``final_sample_list_id``, ``comparisons_limma_id`` and
      ``comparisons_all_id`` -- files handed to the combine / differential
      expression stages.

    Per sample the workflow gets (optionally) FastQC stages, a STAR
    alignment stage, an optional name-sort stage, an HTSeq stage and
    optional coverage / bigWig stages; these are followed by the combine,
    viewer and differential-expression stages.

    Side effect: when two-pass alignment is enabled, a link to the combined
    first-pass splice junctions is stored in ``parameters["pass1_sj_out"]``.

    Returns:
        The ID of the closed workflow.
    """
    workflow_kwargs = {
        "name": 'WARDEN_workflow',
        "description": 'RNA-SEQ Workflow',
    }
    if parameters["folder_provided"] == "false":
        workflow_kwargs["output_folder"] = parameters["Output"]
    wf = dxpy.new_dxworkflow(**workflow_kwargs)
    wf_outputs = []

    # Every applet is resolved up front, so a missing applet fails the build
    # before any stage is added, whether or not the applet ends up being used.
    fastqc_applet = _closed_applet("fastqc")
    star_applet = _closed_applet("star")
    combine_sj_tab_applet = _closed_applet("combine_sj_out")
    sort_bam_applet = _closed_applet("sort_bam")
    htseq_applet = _closed_applet("htseq")
    genome_cov_applet = _closed_applet("genome_coverage")
    bigwig_applet = _closed_applet("bigwig")
    combine_counts_applet = _closed_applet("combine_counts")
    combine_flagstat_applet = _closed_applet("combine_flagstat")
    limma_applet = _closed_applet("limma")
    simple_DE_applet = _closed_applet("simple_DE")
    bw_viewer_applet = _closed_applet("bw_viewer")

    # Reference files are configured as "<project>:<file id>" strings.
    index_project, index_id = parameters["index_file"].split(":")
    gtf_project, gtf_id = parameters["gtf_file"].split(":")
    genome_length_project, genome_length_id = parameters[
        "genome_sizes_file"].split(":")
    gene_length_project, gene_length_id = parameters[
        "gene_length_file"].split(":")

    htseq_results = []
    bigwig_files = []
    flagstat_files_arr = []
    fpkm_results = []
    fpkm_log2_results = []

    # STAR options shared by the first-pass and the final alignment stages.
    star_alignment_opts = {
        "outSAMunmapped": parameters["outSAMunmapped"],
        "outSAMattributes": parameters["outSAMattributes"],
        "outFilterMultimapNmax": int(parameters["outFilterMultimapNmax"]),
        "outFilterMismatchNmax": int(parameters["outFilterMismatchNmax"]),
        "alignIntronMax": int(parameters["alignIntronMax"]),
        "outSAMstrandField": parameters["outSAMstrandField"],
        "chimSegmentMin": int(parameters["chimSegmentMin"]),
        "sjdbOverhang": int(parameters["sjdbOverhang"]),
        "chimJunctionOverhangMin": int(parameters["chimJunctionOverhangMin"]),
        "subsample_target": int(parameters["STAR_subsample_n_reads"]),
    }

    # (output-name suffix, STAR output field) exported for every sample.
    star_exports = (
        ("_star_bam", "sorted_by_coord_bam"),
        ("_star_log", "log_final_out"),
        ("_flagstat", "flagstat_out"),
        ("_star_splice_junctions", "sj_tab_out"),
        ("_star_chimeric_bam", "chimeric_bam"),
        ("_star_chimeric_junction", "chimeric_junction"),
    )

    if parameters["two_pass_alignment"] == 'true':
        # First pass: align every sample once, then merge the discovered
        # splice junctions so the final alignment below can use them.
        pass1_sj_links = []
        for sample_name in samples:
            align_input = {"first_pass": True}
            align_input.update(star_alignment_opts)
            align_input["read_file1"] = dxpy.dxlink(samples[sample_name][0])
            align_input["mark_duplicates"] = False
            align_input["generate_transcriptome_BAM"] = False
            align_input["star_index_archive"] = _project_link(
                index_project, index_id)
            sjdb_link = _user_sjdb_link()
            if sjdb_link is not None:
                align_input["sjdbFileChrStartEnd"] = sjdb_link
            if parameters["indexed_with_gtf"] != "true":
                align_input["transcriptome_gtf"] = _project_link(
                    gtf_project, gtf_id)
            align_input["output_prefix"] = sample_name
            if samples[sample_name][1] != "-":
                align_input["read_file2"] = dxpy.dxlink(
                    samples[sample_name][1])
            first_align_stage_id = wf.add_stage(
                star_applet,
                stage_input=align_input,
                instance_type=parameters["star_instance"],
                folder="ALIGN_Pass1",
                name=sample_name + ":ALIGN_Pass1")
            pass1_sj_links.append(
                _stage_link(first_align_stage_id, "sj_tab_out"))

        combine_sj_out_stage_pass1_id = wf.add_stage(
            combine_sj_tab_applet,
            stage_input={"sj_out_files": pass1_sj_links},
            instance_type="azure:mem2_ssd1_x1",
            name="COMBINE SJ OUT PASS1",
            folder="COMBINED_JUNCTIONS_PASS1")
        parameters["pass1_sj_out"] = _stage_link(
            combine_sj_out_stage_pass1_id, "combined_sj_out")

    for sample_name in samples:
        forward_link = dxpy.dxlink(samples[sample_name][0])
        has_reverse = samples[sample_name][1] != "-"
        run_fastqc = parameters["run_FastQC"] == 'true'

        # --- FastQC (forward, then reverse when present) ---
        if run_fastqc:
            wf_outputs += _add_fastqc_stage(wf, fastqc_applet, forward_link,
                                            sample_name, "Forward")
        if has_reverse:
            reverse_link = dxpy.dxlink(samples[sample_name][1])
            if run_fastqc:
                wf_outputs += _add_fastqc_stage(wf, fastqc_applet,
                                                reverse_link, sample_name,
                                                "Reverse")

        # --- STAR alignment ---
        align_input = {}
        align_input.update(star_alignment_opts)
        # Anything other than the literal "false" enables duplicate marking.
        align_input["mark_duplicates"] = (
            parameters["mark_duplicates"] != "false")
        align_input["generate_transcriptome_BAM"] = (
            parameters["generate_transcriptome_BAM"] == "true")
        align_input["read_file1"] = forward_link
        align_input["star_index_archive"] = _project_link(
            index_project, index_id)
        if parameters["indexed_with_gtf"] != "true":
            align_input["transcriptome_gtf"] = _project_link(
                gtf_project, gtf_id)
        if "pass1_sj_out" in parameters:
            # Junctions combined from the first pass take precedence over a
            # user-supplied junction file.
            align_input["sjdbFileChrStartEnd"] = parameters["pass1_sj_out"]
        else:
            sjdb_link = _user_sjdb_link()
            if sjdb_link is not None:
                align_input["sjdbFileChrStartEnd"] = sjdb_link
        align_input["output_prefix"] = sample_name
        if has_reverse:
            align_input["read_file2"] = reverse_link
        align_stage_id = wf.add_stage(
            star_applet,
            stage_input=align_input,
            instance_type=parameters["star_instance"],
            folder="STAR",
            name=sample_name + ":ALIGN")
        flagstat_files_arr.append(_stage_link(align_stage_id, "flagstat_out"))
        wf_outputs += [
            _workflow_output(sample_name + suffix, align_stage_id, field)
            for suffix, field in star_exports
        ]
        if parameters["generate_transcriptome_BAM"] == "true":
            wf_outputs.append(
                _workflow_output(sample_name + "_star_transcriptome_bam",
                                 align_stage_id, "to_transcriptome_bam"))

        # --- HTSeq counting (on the name-sorted BAM when requested) ---
        if parameters["generate_name_sorted_BAM"] == "true":
            sort_stage_id = wf.add_stage(
                sort_bam_applet,
                stage_input={
                    "input_bam":
                    _stage_link(align_stage_id, "sorted_by_coord_bam")
                },
                instance_type="azure:mem2_ssd1_x2",
                name=sample_name + ":NAME SORT BAM",
                folder="STAR")
            wf_outputs.append(
                _workflow_output(sample_name + "_name_sorted_bam",
                                 sort_stage_id, "output_bam"))
            htseq_input = {
                "input_bam": _stage_link(sort_stage_id, "output_bam"),
                "order": "name",
            }
        else:
            htseq_input = {
                "input_bam":
                _stage_link(align_stage_id, "sorted_by_coord_bam"),
                "order": "pos",
            }
        htseq_input["annotation_file"] = _project_link(gtf_project, gtf_id)
        htseq_input["gene_length_file"] = _project_link(
            gene_length_project, gene_length_id)
        htseq_input["prefix"] = sample_name
        htseq_input["strand"] = parameters["strandedness"]
        # These settings are forwarded to the HTSeq applet unchanged.
        for option in ("feature_type", "id_attribute", "mode", "nonunique",
                       "secondary_alignments", "supplementary_alignments"):
            htseq_input[option] = parameters[option]
        htseq_stage_id = wf.add_stage(
            htseq_applet,
            stage_input=htseq_input,
            instance_type=parameters["htseq_instance"],
            name=sample_name + ":HTSEQ COUNT",
            folder="HTSEQ")
        htseq_results.append(_stage_link(htseq_stage_id, "htseq_counts"))
        wf_outputs.append(
            _workflow_output(sample_name + "_htseqcounts", htseq_stage_id,
                             "htseq_counts"))
        if parameters["id_attribute"] == "gene_name":
            fpkm_results.append(_stage_link(htseq_stage_id, "fpkm"))
            fpkm_log2_results.append(
                _stage_link(htseq_stage_id, "fpkm_log2"))
            wf_outputs += [
                _workflow_output(sample_name + "_fpkm", htseq_stage_id,
                                 "fpkm"),
                _workflow_output(sample_name + "_fpkm_log2", htseq_stage_id,
                                 "fpkm_log2"),
            ]

        # --- Coverage and bigWig conversion ---
        if parameters["run_coverage"] == 'true':
            gcb_input = {
                "input_bam":
                _stage_link(align_stage_id, "sorted_by_coord_bam"),
                "genome_sizes_file":
                _project_link(genome_length_project, genome_length_id),
                "strandedness": parameters["strandedness"],
                "output_prefix": sample_name,
            }
            gcb_stage_id = wf.add_stage(genome_cov_applet,
                                        stage_input=gcb_input,
                                        instance_type="azure:mem3_ssd1_x8",
                                        name=sample_name + ":COVERAGE",
                                        folder="COVERAGE")
            # Per-strand tracks are only produced for stranded libraries.
            bigwig_tags = ["ALL"]
            if parameters["strandedness"] != "no":
                bigwig_tags += ["POS", "NEG"]
            for tag in bigwig_tags:
                bigwig_stage_id = _add_bigwig_stage(
                    wf, bigwig_applet, gcb_stage_id, tag,
                    genome_length_project, genome_length_id, sample_name)
                bigwig_files.append(_stage_link(bigwig_stage_id, "bigwig"))
                wf_outputs.append(
                    _workflow_output(
                        sample_name + "_" + tag.lower() + "_bigwig",
                        bigwig_stage_id, "bigwig"))

    # --- Cross-sample combine stages ---
    combine_input = {
        "count_files": htseq_results,
        "name_value": "htseq",
        "sample_files": [dxpy.dxlink(final_sample_list_id)]
    }
    combine_counts_stage_id = wf.add_stage(combine_counts_applet,
                                           stage_input=combine_input,
                                           instance_type="azure:mem2_ssd1_x1",
                                           name="COMBINE HTSEQ")
    wf_outputs.append(
        _workflow_output("combined_counts", combine_counts_stage_id,
                         "count_file"))

    if parameters["id_attribute"] == "gene_name":
        combine_fpkm_input = {
            "count_files": fpkm_results,
            "name_value": "fpkm",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_stage_id = wf.add_stage(
            combine_counts_applet,
            stage_input=combine_fpkm_input,
            instance_type="azure:mem2_ssd1_x1",
            name="COMBINE FPKM")
        combine_fpkm_log2_input = {
            "count_files": fpkm_log2_results,
            "name_value": "fpkm.log2",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_log2_stage_id = wf.add_stage(
            combine_counts_applet,
            stage_input=combine_fpkm_log2_input,
            instance_type="azure:mem2_ssd1_x1",
            name="COMBINE FPKMlog2")
        wf_outputs += [
            _workflow_output("combined_fpkm", combine_fpkm_stage_id,
                             "count_file"),
            _workflow_output("combined_fpkm_log2",
                             combine_fpkm_log2_stage_id, "count_file"),
        ]

    combine_flagstat_input = {
        "flagstat_files": flagstat_files_arr,
        "sample_list": dxpy.dxlink(final_sample_list_id)
    }
    combine_flagstat_stage_id = wf.add_stage(
        combine_flagstat_applet,
        stage_input=combine_flagstat_input,
        instance_type="azure:mem2_ssd1_x1",
        name="COMBINE FLAGSTAT",
        folder="STAR")
    wf_outputs.append(
        _workflow_output("combined_flagstat", combine_flagstat_stage_id,
                         "combined_flagstat"))

    # --- bigWig viewer bookmark ---
    if parameters["BW_VIEWER"] != "None" and parameters[
            "run_coverage"] == 'true':
        bw_project, bw_file = parameters["BW_VIEWER"].split(":")
        bw_viewer_input = {
            "viewer": _project_link(bw_project, bw_file),
            "bigwig_files": bigwig_files
        }
        bw_viewer_stage_id = wf.add_stage(bw_viewer_applet,
                                          stage_input=bw_viewer_input,
                                          instance_type="azure:mem2_ssd1_x1",
                                          name="BIGWIG_VIEWER",
                                          folder="BIGWIG")
        wf_outputs.append(
            _workflow_output("bw_viewer", bw_viewer_stage_id,
                             "viewer_bookmark", output_class="record"))

    # --- Differential expression ---
    # The viewer is optional. It stays None when not configured so both DE
    # stages below can simply omit "difex_viewer"; previously the simple-DE
    # input referenced an unbound name in that case.
    limma_viewer_link = None
    if parameters["limma_DE_viewer"] != "None":
        limma_viewer_project, limma_viewer_file = parameters[
            "limma_DE_viewer"].split(":")
        limma_viewer_link = _project_link(limma_viewer_project,
                                          limma_viewer_file)

    if parameters["run_limma"] == 'true' and parameters[
            "limma_runnable"] == "true":
        limma_input = {
            "input_count_file":
            _stage_link(combine_counts_stage_id, "count_file"),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "calcNormFactors_method": parameters["calcNormFactors_method"],
            "filter_count_type": parameters["filter_count_type"],
            "filter_count": int(parameters["filter_count"]),
            "p_value_adjust": parameters["p_value_adjust"],
            "contrasts_file": dxpy.dxlink(comparisons_limma_id)
        }
        if limma_viewer_link is not None:
            limma_input["difex_viewer"] = limma_viewer_link
        limma_stage_id = wf.add_stage(limma_applet,
                                      stage_input=limma_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="LIMMA")
        wf_outputs += [
            _workflow_output("limma_outfiles", limma_stage_id, "out_files",
                             output_class="array:file"),
            _workflow_output("limma_viewer", limma_stage_id,
                             "viewer_bookmark", output_class="record"),
        ]

    if parameters["run_simple_dif_ex"] == 'true':
        simple_DE_input = {
            "input_count_file":
            _stage_link(combine_counts_stage_id, "count_file"),
            "sample_list_file": dxpy.dxlink(final_sample_list_id),
            "contrasts_file": dxpy.dxlink(comparisons_all_id)
        }
        if limma_viewer_link is not None:
            simple_DE_input["difex_viewer"] = limma_viewer_link
        simple_DE_stage_id = wf.add_stage(
            simple_DE_applet,
            stage_input=simple_DE_input,
            instance_type="azure:mem1_ssd1_x4",
            name="SIMPLE DIFFERENTIAL_EXPRESSION")
        wf_outputs += [
            _workflow_output("simple_DE_outfiles", simple_DE_stage_id,
                             "out_files", output_class="array:file"),
            _workflow_output("simple_DE_viewer", simple_DE_stage_id,
                             "viewer_bookmark", output_class="record"),
        ]

    wf.update(workflow_outputs=wf_outputs)
    wf.close()
    return wf.get_id()


def _closed_applet(app_key):
    """Return a handler for the closed applet named ``app_names[app_key]``."""
    return dxpy.search.find_one_data_object(classname="applet",
                                            name=app_names[app_key],
                                            state="closed",
                                            return_handler=True)


def _stage_link(stage_id, output_field):
    """Return a link to ``output_field`` of the workflow stage ``stage_id``."""
    return dxpy.dxlink({"stage": stage_id, "outputField": output_field})


def _project_link(project_id, file_id):
    """Return a link to ``file_id`` qualified with its ``project_id``."""
    return dxpy.dxlink({"project": project_id, "id": file_id})


def _workflow_output(name, stage_id, output_field, output_class="file"):
    """Return one ``workflow_outputs`` entry exposing a stage output."""
    return {
        "name": name,
        "class": output_class,
        "outputSource": {
            "$dnanexus_link": {
                "stage": stage_id,
                "outputField": output_field
            }
        }
    }


def _user_sjdb_link():
    """Return a link to the configured splice-junction file, or None.

    ``parameters["sjdbFileChrStartEnd"]`` counts as unset when it is
    ``"null"`` or the empty string; otherwise it is "<project>:<file id>".
    """
    sjdb_spec = parameters["sjdbFileChrStartEnd"]
    if sjdb_spec == "null" or sjdb_spec == '':
        return None
    sjdb_project, sjdb_id = sjdb_spec.split(":")
    return _project_link(sjdb_project, sjdb_id)


def _add_fastqc_stage(wf, fastqc_applet, read_link, sample_name, direction):
    """Add a FastQC stage for one read file and return its output entries.

    ``direction`` is ``"Forward"`` or ``"Reverse"``; it is used verbatim in
    the stage name and lower-cased in the workflow-output names.
    """
    stage_id = wf.add_stage(fastqc_applet,
                            stage_input={"fastq_input": read_link},
                            instance_type="azure:mem2_ssd1_x2",
                            folder="FASTQC",
                            name=sample_name + ":" + direction + " FASTQC")
    output_prefix = sample_name + "_" + direction.lower() + "_fastqc_"
    return [
        _workflow_output(output_prefix + "html", stage_id, "html_file"),
        _workflow_output(output_prefix + "zip", stage_id, "zip_file"),
    ]


def _add_bigwig_stage(wf, bigwig_applet, coverage_stage_id, tag,
                      genome_sizes_project, genome_sizes_id, sample_name):
    """Add a bedGraph-to-bigWig stage for one coverage track; return its ID.

    ``tag`` is ``"ALL"``, ``"POS"`` or ``"NEG"``. It selects the
    ``<tag>_coverage_file`` output of the coverage stage (lower-cased) and
    is used verbatim in the stage name.
    """
    bigwig_input = {
        "bedgraph_file":
        _stage_link(coverage_stage_id, tag.lower() + "_coverage_file"),
        "genome_sizes_file":
        _project_link(genome_sizes_project, genome_sizes_id),
        "output_prefix": sample_name,
    }
    return wf.add_stage(bigwig_applet,
                        stage_input=bigwig_input,
                        instance_type="azure:mem2_ssd1_x4",
                        name=sample_name + ":BED To BW-" + tag,
                        folder="BIGWIG")
Beispiel #21
0
def build_workflow():
    """Assemble the ``tcga_mc3_full_run`` variant-calling workflow.

    Stages are added in this order: pindel, radia, somaticsniper, samtools
    pileup (normal, then tumor), muse, varscan, mutect, three fpfilter
    stages, seven vcf-filter stages and seven vcf-reheader stages.

    The pindel stage is added without explicit inputs; every other stage
    that needs the BAM/BAI/reference files links to pindel's *input*
    fields (``inputField`` links), while downstream stages consume
    upstream results through ``outputField`` links.

    Relies on the module-level names ``args`` (``project`` and ``folder``
    attributes), ``git_revision`` and ``find_applet``.

    Returns:
        The workflow handler created by ``dxpy.new_dxworkflow``. This
        function does not call ``close()`` on it.
    """
    wf = dxpy.new_dxworkflow(title='tcga_mc3_full_run',
                             name='tcga_mc3_full_run',
                             description='TCGA mc3 variant calling pipeline',
                             project=args.project,
                             folder=args.folder,
                             properties={"git_revision": git_revision})

    # Referenced by both the muse and the mutect stage.
    dbsnp_file_id = "file-Bj1V0400kF9Z3GqJY4ZbYbYj"
    cosmic_file_id = "file-Bk9g2kQ0kF9f9XG6VZf7VGKQ"

    def output_of(stage_id, field):
        """Return a link to output ``field`` of stage ``stage_id``."""
        return dxpy.dxlink({"stage": stage_id, "outputField": field})

    # ------------------------------------------------------------------
    # variant calling tools
    # ------------------------------------------------------------------
    pindel_stage_id = wf.add_stage(find_applet("pindel-tool"))

    def pindel_input(field):
        """Return a link to input ``field`` of the pindel stage."""
        return dxpy.dxlink({"stage": pindel_stage_id, "inputField": field})

    radia_stage_id = wf.add_stage(
        find_applet("radia-tool"),
        stage_input={
            "dnaNormalBam": pindel_input("normalInputBamFile"),
            "dnaTumorBam": pindel_input("tumorInputBamFile"),
            "fasta": pindel_input("inputReferenceFile"),
        })

    somaticsniper_stage_id = wf.add_stage(
        find_applet("somaticsniper-tool"),
        stage_input={
            "normal": pindel_input("normalInputBamFile"),
            "tumor": pindel_input("tumorInputBamFile"),
            "reference": pindel_input("inputReferenceFile"),
        },
        instance_type="mem2_hdd2_x1")

    samtools_pileup_applet = find_applet("samtools-pileup-tool")

    def add_pileup(bam_field, bai_field, instance_type):
        """Add one samtools-pileup stage fed from the given pindel inputs."""
        return wf.add_stage(
            samtools_pileup_applet,
            stage_input={
                "input1": pindel_input(bam_field),
                "input1_index": pindel_input(bai_field),
                "reference": pindel_input("inputReferenceFile"),
            },
            instance_type=instance_type)

    samtools_pileup_normal_stage_id = add_pileup(
        "normalInputBamFile", "normalInputBaiFile", "mem2_hdd2_x1")
    samtools_pileup_tumor_stage_id = add_pileup(
        "tumorInputBamFile", "tumorInputBaiFile", "mem2_hdd2_x2")

    def tumor_normal_inputs():
        """Return the BAM/BAI/reference inputs shared by muse and mutect."""
        return {
            "tumor_bam": pindel_input("tumorInputBamFile"),
            "tumor_bai": pindel_input("tumorInputBaiFile"),
            "normal_bam": pindel_input("normalInputBamFile"),
            "normal_bai": pindel_input("normalInputBaiFile"),
            "reference": pindel_input("inputReferenceFile"),
        }

    muse_applet = find_applet("muse-tool")
    muse_input = tumor_normal_inputs()
    muse_input["dbsnp"] = dxpy.dxlink(dbsnp_file_id)
    muse_stage_id = wf.add_stage(muse_applet, stage_input=muse_input)

    varscan_stage_id = wf.add_stage(
        find_applet("varscan-tool"),
        stage_input={
            "normal_pileup": output_of(samtools_pileup_normal_stage_id,
                                       "pileup"),
            "tumor_pileup": output_of(samtools_pileup_tumor_stage_id,
                                      "pileup"),
        },
        instance_type="mem2_hdd2_x2")

    mutect_applet = find_applet("mutect-tool")
    mutect_input = tumor_normal_inputs()
    mutect_input["dbsnp"] = dxpy.dxlink(dbsnp_file_id)
    mutect_input["cosmic"] = dxpy.dxlink(cosmic_file_id)
    mutect_stage_id = wf.add_stage(mutect_applet, stage_input=mutect_input)

    # ------------------------------------------------------------------
    # fpfilter (somaticSniper, Varscan)
    # ------------------------------------------------------------------
    fpfilter_applet = find_applet("fpfilter-tool")

    def add_fpfilter(label, source_stage_id, vcf_field):
        """Add an fpfilter stage reading ``vcf_field`` of ``source_stage_id``."""
        return wf.add_stage(
            fpfilter_applet,
            stage_input={
                "vcf": output_of(source_stage_id, vcf_field),
                "bam": pindel_input("tumorInputBamFile"),
                "reference": pindel_input("inputReferenceFile"),
            },
            name="fpfilter-tool(%s)" % label,
            folder="fpfiltered")

    somaticsniper_fpfilter_stage_id = add_fpfilter(
        "somaticSniper", somaticsniper_stage_id, "vcf")
    varscan_snp_fpfilter_stage_id = add_fpfilter(
        "varscan SNP", varscan_stage_id, "snp_vcf")
    varscan_indel_fpfilter_stage_id = add_fpfilter(
        "varscan INDEL", varscan_stage_id, "indel_vcf")

    # ------------------------------------------------------------------
    # vcf_filter (All variant callers)
    # ------------------------------------------------------------------
    vcf_filter_applet = find_applet("tcga-vcf-filter-tool")

    def add_vcf_filter(label, source_stage_id, vcf_field, filter_rejects):
        """Add a vcf-filter stage reading ``vcf_field`` of ``source_stage_id``."""
        return wf.add_stage(
            vcf_filter_applet,
            stage_input={
                "input_vcf": output_of(source_stage_id, vcf_field),
                "filterRejects": filter_rejects,
            },
            name="vcffilter-tool(%s)" % label,
            folder="final_filtered")

    radia_vcf_filter_stage_id = add_vcf_filter(
        "radia", radia_stage_id, "filtered_output_vcf", False)
    somaticsniper_vcf_filter_stage_id = add_vcf_filter(
        "somaticsniper", somaticsniper_fpfilter_stage_id,
        "annotated_output", False)
    varscan_snp_vcf_filter_stage_id = add_vcf_filter(
        "varscan SNP", varscan_snp_fpfilter_stage_id,
        "annotated_output", True)
    varscan_indel_vcf_filter_stage_id = add_vcf_filter(
        "varscan INDEL", varscan_indel_fpfilter_stage_id,
        "annotated_output", True)
    muse_vcf_filter_stage_id = add_vcf_filter(
        "muse", muse_stage_id, "mutations", False)
    pindel_vcf_filter_stage_id = add_vcf_filter(
        "pindel", pindel_stage_id, "outputSomaticVcf", False)
    mutect_vcf_filter_stage_id = add_vcf_filter(
        "mutect", mutect_stage_id, "mutations", True)

    # ------------------------------------------------------------------
    # vcf_reheader (All variant callers)
    # ------------------------------------------------------------------
    vcf_reheader_applet = find_applet("tcga-vcf-reheader")

    def add_vcf_reheader(label, filter_stage_id, software_name,
                         software_version, software_params, center):
        """Add a vcf-reheader stage for the output of ``filter_stage_id``."""
        return wf.add_stage(
            vcf_reheader_applet,
            stage_input={
                "input_vcf": output_of(filter_stage_id, "output_vcf"),
                "software_name": software_name,
                "software_version": software_version,
                "software_params": software_params,
                "center": center,
            },
            name="vcf-reheader(%s)" % label,
            folder="final_reheadered")

    # NOTE: per-sample metadata (platform, participant/aliquot ids, BAM
    # names, ...) is not wired between the reheader stages; an earlier,
    # disabled variant linked those fields to the radia reheader stage's
    # inputs.
    varscan_params = "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1"

    add_vcf_reheader(
        "radia", radia_vcf_filter_stage_id, "radia", "1",
        "--dnaNormalMinTotalBases 4 --dnaNormalMinAltBases 2 --dnaNormalBaseQual 10 --dnaNormalMapQual 10 --dnaTumorDescription TumorDNASample --dnaTumorMinTotalBases 4 --dnaTumorMinAltBases 2 --dnaTumorBaseQual 10 --dnaTumorMapQual 10 --dnaNormalMitochon=MT --dnaTumorMitochon=MT --genotypeMinDepth 2 --genotypeMinPct 0.100",
        "ucsc.edu")
    add_vcf_reheader(
        "somaticsniper", somaticsniper_vcf_filter_stage_id,
        "somaticsniper", "v1.0.5.0",
        "-Q 40 -n NORMAL -q 1 -s 0.01 -r 0.001",
        "wustl.edu")
    add_vcf_reheader(
        "varscan SNP", varscan_snp_vcf_filter_stage_id,
        "varscan", "2.3.9", varscan_params, "wustl.edu")
    add_vcf_reheader(
        "varscan INDEL", varscan_indel_vcf_filter_stage_id,
        "varscan", "2.3.9", varscan_params, "wustl.edu")
    add_vcf_reheader(
        "muse", muse_vcf_filter_stage_id, "muse", "v1.0rc",
        "--mode wxs",
        "mdanderson.org")
    add_vcf_reheader(
        "pindel", pindel_vcf_filter_stage_id, "pindel", "v0.2.5b8",
        "--max_range_index 1 --window_size 5 --sequencing_error_rate 0.010000 --sensitivity 0.950000 --maximum_allowed_mismatch_rate 0.020000 --NM 2 --additional_mismatch 1 --min_perfect_match_around_BP 3 --min_inversion_size 50 --min_num_matched_bases 30 --balance_cutoff 0 --anchor_quality 0 --minimum_support_for_event 3 --report_long_insertions --report_duplications --report_inversions --report_breakpoints",
        "wustl.edu")
    add_vcf_reheader(
        "mutect", mutect_vcf_filter_stage_id, "mutect", "1.1.5",
        "--initial_tumor_lod 4.0 --tumor_lod 10.0",
        "broad.org")

    return wf
                                            return_handler=False)
        replicates = [dxpy.dxlink(r) for r in replicates]
        controls = dxpy.find_data_objects(classname='file',
                                          name='*.bam',
                                          name_mode='glob',
                                          project=project.get_id(),
                                          folder=CONTROLS_FOLDER,
                                          return_handler=False)
        controls = [dxpy.dxlink(c) for c in controls]
    else:
        if (len(args.replicates) < 1) or (len(args.controls) < 1):
            sys.exit(
                'Need to have at least 1 replicate file and 1 control file.')
        project.new_folder(REPLICATES_FOLDER, True)
        project.new_folder(CONTROLS_FOLDER, True)
        replicates = copy_files(args.replicates, project, REPLICATES_FOLDER)
        controls = copy_files(args.controls, project, CONTROLS_FOLDER)

    if (len(replicates) < 1) or (len(controls) < 1):
        sys.exit('Need to have at least 1 replicate file and 1 control file.')

    # Now create a new workflow
    wf = dxpy.new_dxworkflow(title='dx_chip_seq',
                             name='ENCODE ChIP-Seq 2.0',
                             description='The ENCODE ChIP-Seq Pipeline 2.0',
                             project=project.get_id())
    populate_workflow(wf, replicates, controls,
                      project.describe()['name'],
                      args.sort_filter_and_remove_dups,
                      args.duplicates_removed, args.gender, applets_project_id)
Beispiel #23
0
def build_workflow(experiment, biorep_n, input_shield_stage_input, key):
    """Create the mapping workflow for one biological replicate.

    The workflow always has an input-shield stage followed by a mapping
    stage. Unless ``args.raw`` is set, a filter/QC stage and a
    cross-correlation stage are appended.

    Args:
        experiment: mapping-like object; only ``experiment.get('accession')``
            is read.
        biorep_n: replicate number, formatted with ``%d`` into folder
            paths, the workflow title and stage names.
        input_shield_stage_input: passed unchanged as the input-shield
            stage's ``stage_input``.
        key: not used in this function.

    Relies on the module-level ``args`` (``outp``, ``applets``, ``outf``,
    ``raw``, ``assembly``, ``tag``).

    Returns:
        The workflow handler created by ``dxpy.new_dxworkflow``.
    """
    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))

    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    def lookup_applet(applet_name):
        """Find an applet in the applet project and log its name."""
        found = find_applet_by_name(applet_name, applet_project.get_id())
        logging.debug('Found applet %s' % (found.name))
        return found

    mapping_applet = lookup_applet(MAPPING_APPLET_NAME)
    input_shield_applet = lookup_applet(INPUT_SHIELD_APPLET_NAME)

    accession = experiment.get('accession')
    rep_label = 'rep%d' % (biorep_n)

    def replicate_folder(category):
        """Resolve <outf>/<category>/<accession>/rep<N> in the output project."""
        return resolve_folder(
            output_project,
            args.outf + '/' + category + '/' + accession + '/' + rep_label)

    workflow_output_folder = replicate_folder('workflows')
    fastq_output_folder = replicate_folder('fastqs')
    mapping_output_folder = replicate_folder('raw_bams')

    if args.raw:
        title_template = 'Map %s rep%d to %s (no filter)'
        workflow_name = 'ENCODE raw mapping pipeline'
    else:
        title_template = 'Map %s rep%d to %s and filter'
        workflow_name = 'ENCODE mapping pipeline'
    workflow_title = title_template % (accession, biorep_n, args.assembly)

    if args.tag:
        workflow_title += ': %s' % (args.tag)

    workflow = dxpy.new_dxworkflow(title=workflow_title,
                                   name=workflow_name,
                                   project=output_project.get_id(),
                                   folder=workflow_output_folder)

    def output_of(stage_id, field):
        """Return a link to output ``field`` of stage ``stage_id``."""
        return dxpy.dxlink({'stage': stage_id, 'outputField': field})

    input_shield_stage_id = workflow.add_stage(
        input_shield_applet,
        name='Gather inputs %s rep%d' % (accession, biorep_n),
        folder=fastq_output_folder,
        stage_input=input_shield_stage_input)

    mapping_stage_id = workflow.add_stage(
        mapping_applet,
        name='Map %s rep%d' % (accession, biorep_n),
        folder=mapping_output_folder,
        stage_input={
            'input_JSON': output_of(input_shield_stage_id, 'output_JSON'),
        })

    # Raw mode stops after mapping; no filtering or cross-correlation.
    if args.raw:
        return workflow

    final_output_folder = replicate_folder('bams')

    filter_qc_applet = lookup_applet(FILTER_QC_APPLET_NAME)
    filter_qc_stage_id = workflow.add_stage(
        filter_qc_applet,
        name='Filter and QC %s rep%d' % (accession, biorep_n),
        folder=final_output_folder,
        stage_input={
            'input_bam': output_of(mapping_stage_id, 'mapped_reads'),
            'paired_end': output_of(mapping_stage_id, 'paired_end'),
        })

    xcor_applet = lookup_applet(XCOR_APPLET_NAME)
    workflow.add_stage(
        xcor_applet,
        name='Calculate cross-correlation %s rep%d' % (accession, biorep_n),
        folder=final_output_folder,
        stage_input={
            'input_bam': output_of(filter_qc_stage_id, 'filtered_bam'),
            'paired_end': output_of(filter_qc_stage_id, 'paired_end'),
        })

    # NOTE: accessioning of the mapped reads is intentionally not added as
    # a stage here; per the original author it "should all be done in the
    # shield's postprocess entrypoint".
    return workflow
def build_workflow(experiment, biorep_n, input_shield_stage_input, accession, use_existing_folders):
    """Create the mapping workflow for one biological replicate.

    Output folders ``<outf>/{workflows,fastqs,raw_bams,bams}/<accession>/rep<N>``
    are checked first. If any already exists the function either warns and
    continues (``use_existing_folders`` truthy) or logs an error and returns
    ``None``. The workflow always has an input-shield stage followed by a
    mapping stage; unless ``args.raw`` is set, a filter/QC stage and a
    cross-correlation stage are appended.

    Args:
        experiment: mapping-like object; only ``experiment.get('accession')``
            is read.
        biorep_n: replicate number, formatted with ``%d`` into folder
            paths, the workflow title and stage names.
        input_shield_stage_input: passed unchanged as the input-shield
            stage's ``stage_input``; its keys also decide which
            input-shield outputs are forwarded to the mapping stage.
        accession: not used in this function.
        use_existing_folders: when truthy, pre-existing output folders only
            produce a warning instead of aborting.

    Relies on the module-level ``args`` (``outp``, ``applets``, ``outf``,
    ``raw``, ``assembly``, ``tag``, ``scrub``, ``spp_version``).

    Returns:
        The workflow handler created by ``dxpy.new_dxworkflow``, or ``None``
        when output folders already exist and ``use_existing_folders`` is
        falsy.
    """
    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s', output_project.name)

    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s', applet_project.name)

    mapping_applet = \
        find_applet_by_name(MAPPING_APPLET_NAME, applet_project.get_id())
    logging.debug('Found applet %s', mapping_applet.name)

    input_shield_applet = \
        find_applet_by_name(INPUT_SHIELD_APPLET_NAME, applet_project.get_id())
    logging.debug('Found applet %s', input_shield_applet.name)

    experiment_accession = experiment.get('accession')

    folders = ['workflows', 'fastqs', 'raw_bams', 'bams']
    folder_paths = \
        ['/'.join([args.outf,
                   folder_name,
                   experiment_accession,
                   'rep%d' % (biorep_n)])
         for folder_name in folders]

    # Resolve every folder exactly once; the result doubles as both the
    # existence test and the value that gets reported.
    resolved_paths = \
        [resolve_folder(output_project, folder_path)
         for folder_path in folder_paths]
    paths_exist = [resolved for resolved in resolved_paths if resolved]
    if paths_exist:
        msg = "%s: output paths already exist: %s" % (experiment_accession, paths_exist)
        if use_existing_folders:
            logging.warning(msg)
        else:
            msg += "\nUse --use_existing_folders to supress but possibly create duplicate files"
            logging.error(msg)
            return None

    # Same order as ``folders`` above.
    workflow_output_folder, fastq_output_folder, mapping_output_folder, final_output_folder = \
        [create_folder(output_project, folder_path)
         for folder_path in folder_paths]

    if args.raw:
        workflow_title = \
            ('Map %s rep%d to %s (no filter)'
             % (experiment_accession, biorep_n, args.assembly))
        workflow_name = 'ENCODE raw mapping pipeline'
    else:
        workflow_title = \
            ('Map %s rep%d to %s and filter'
             % (experiment_accession, biorep_n, args.assembly))
        workflow_name = 'ENCODE mapping pipeline'

    if args.tag:
        workflow_title += ': %s' % (args.tag)

    workflow = dxpy.new_dxworkflow(
        title=workflow_title,
        name=workflow_name,
        project=output_project.get_id(),
        folder=workflow_output_folder
    )

    input_shield_stage_id = workflow.add_stage(
        input_shield_applet,
        name='Gather inputs %s rep%d' % (experiment_accession, biorep_n),
        folder=fastq_output_folder,
        stage_input=input_shield_stage_input
    )

    # Forward only those input-shield outputs whose names were actually
    # supplied as input-shield inputs.
    input_names = \
        [name for name in ['reads1', 'reads2', 'crop_length', 'reference_tar',
         'bwa_version', 'bwa_aln_params', 'samtools_version', 'debug']
         if name in input_shield_stage_input]
    logging.debug('input_names: %s', input_names)
    mapping_stage_input = {
        input_name: dxpy.dxlink(
            {'stage': input_shield_stage_id, 'outputField': input_name})
        for input_name in input_names
    }
    logging.debug('mapping_stage_input: %s', mapping_stage_input)
    mapping_stage_id = workflow.add_stage(
        mapping_applet,
        name='Map %s rep%d' % (experiment_accession, biorep_n),
        folder=mapping_output_folder,
        stage_input=mapping_stage_input
    )

    if not args.raw:
        filter_qc_applet = \
            find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project.get_id())
        logging.debug('Found applet %s', filter_qc_applet.name)

        filter_qc_stage_id = workflow.add_stage(
            filter_qc_applet,
            name='Filter and QC %s rep%d' % (experiment_accession, biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'mapped_reads'}),
                'paired_end': dxpy.dxlink({'stage': mapping_stage_id, 'outputField': 'paired_end'}),
                'scrub': args.scrub
            }
        )

        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project.get_id())
        logging.debug('Found applet %s', xcor_applet.name)

        workflow.add_stage(
            xcor_applet,
            name='Calculate cross-correlation %s rep%d' % (experiment_accession, biorep_n),
            folder=final_output_folder,
            stage_input={
                'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}),
                'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}),
                'spp_version': args.spp_version
            }
        )

    # NOTE: accessioning of the mapped reads is intentionally not added as
    # a stage here; per the original author it "should all be done in the
    # shield's postprocess entrypoint".
    return workflow
def main():
    """Assemble a ChIP-seq workflow on DNAnexus from command-line arguments.

    Options come from ``get_args()``.  A new workflow is created in the output
    project and stages are added to it:

    * unless ``args.nomap`` is set: a mapping, a filter/QC and a
      cross-correlation stage for Rep1, Ctl1 and, when applicable, Rep2 and
      Ctl2;
    * when ``args.nomap`` is set and no input files were given (a "blank"
      workflow): cross-correlation-only stages for Rep1 and, when
      applicable, Rep2;
    * unless ``args.maponly`` is set: the MACS2 peak-calling stage, followed
      by the SPP/IDR stages when ``WF[target_type]['run_idr']`` is true and
      by the overlap stages when the target type is ``'histone'``.

    When ``args.yes`` is set the workflow is run; when ``args.accession`` is
    also set, an accessioning job depending on that analysis is launched,
    retrying up to 10 times.

    Exits with status 2 if the target type is not a key of ``WF``.

    Raises:
        AssertionError: if required inputs are missing, or if the output
            folder already exists and ``args.use_existing_folders`` is not
            set.
    """
    args = get_args()

    def stage_output_link(stage_id, field):
        # Link to an output field of another stage of this workflow.
        return dxpy.dxlink({'stage': stage_id, 'outputField': field})

    def stage_input_link(stage_id, field):
        # Link to an input field of another stage of this workflow.
        return dxpy.dxlink({'stage': stage_id, 'inputField': field})

    def xcor_stage_description_for(tagalign_link):
        # Walk from a tagAlign file to the job that created it, to that
        # job's analysis, and return the analysis stage whose executable is
        # named 'xcor'.
        ta_desc = dxpy.describe(tagalign_link)
        analysis_id = dxpy.describe(ta_desc['createdBy']['job'])['analysis']
        analysis_desc = dxpy.describe(analysis_id)
        return next(
            stage
            for stage in analysis_desc.get('stages')
            if stage['execution']['executableName'] == 'xcor')

    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

    if not blank_workflow:
        assert args.rep1, "Reads are required for rep1"
        assert args.ctl1, "Reads are required for ctl1"
        assert not args.nomap or args.rep1pe is not None, "With --nomap, endedness of rep1 must be specified with --rep1pe"
        assert not args.nomap or (not args.rep2 or args.rep2pe is not None), "With --nomap, endedness of rep2 must be specified with --rep2pe"

    if not args.target:
        target_type = 'default'  # default
    else:
        target_type = args.target.lower()
    if target_type not in WF:
        logging.error('Target type %s is not recognized' % (target_type))
        sys.exit(2)

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    existing_folder = resolve_folder(output_project, args.outf)
    if not existing_folder:
        output_folder = create_folder(output_project, args.outf)
    elif args.use_existing_folders:
        output_folder = existing_folder
    else:
        # Raised explicitly (rather than via ``assert``) so the check still
        # fires under ``python -O`` instead of leaving output_folder unbound.
        raise AssertionError(
            'Output folder %s exists but --use_existing_folders is %s'
            % (existing_folder, args.use_existing_folders))

    logging.debug('Using output folder %s' % (output_folder))

    workflow = dxpy.new_dxworkflow(
        name=args.name or WF[target_type]['wf_name'],
        title=args.title or WF[target_type]['wf_title'],
        description=args.description or WF[target_type]['wf_description'],
        project=output_project.get_id(),
        folder=output_folder,
        properties={'pipeline_version': str(args.pipeline_version)})

    unary_control = args.unary_control or (not blank_workflow and args.ctl2 is None)
    simplicate_experiment = args.simplicate_experiment or (args.rep1 and not args.rep2)

    genomesize = args.genomesize if args.genomesize else None
    if not args.chrom_sizes:
        chrom_sizes = None
    else:
        chrom_sizes = dxpy.dxlink(resolve_file(args.chrom_sizes))

    if not args.blacklist:
        blacklist = None
    else:
        blacklist = dxpy.dxlink(resolve_file(args.blacklist))

    run_idr = WF[target_type]['run_idr']

    if not args.nomap:
        # a "superstage" is just a dict with a name, name(s) of input files,
        # and then names and id's of stages that process that input
        # each superstage here could be implemented as a stage in a more
        # abstract workflow.  That stage would then call the various applets
        # that are separate stages here.
        mapping_superstages = [
            {'name': 'Rep1', 'input_args': args.rep1}
        ]
        if not simplicate_experiment:
            mapping_superstages.append(
                {'name': 'Rep2', 'input_args': args.rep2})
        mapping_superstages.append(
            {'name': 'Ctl1', 'input_args': args.ctl1})
        if not unary_control and not simplicate_experiment:
            mapping_superstages.append(
                {'name': 'Ctl2', 'input_args': args.ctl2})

        mapping_applet = find_applet_by_name(
            MAPPING_APPLET_NAME, applet_project.get_id())
        # All three per-sample stages write into a folder named after the
        # mapping applet.
        mapping_output_folder = mapping_applet.name
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(
            FILTER_QC_APPLET_NAME, applet_project.get_id())
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(
            XCOR_APPLET_NAME, applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        # in the first pass create the mapping stage id's so we can use JBOR's
        # to link inputs
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            mapped_stage_id = workflow.add_stage(
                mapping_applet,
                name='Map %s' % (superstage_name),
                folder=mapping_output_folder
            )
            mapping_superstage.update({'map_stage_id': mapped_stage_id})

        # in the second pass populate the stage inputs and build other stages
        rep1_stage_id = next(
            ss.get('map_stage_id')
            for ss in mapping_superstages if ss['name'] == 'Rep1')
        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')
            superstage_id = mapping_superstage.get('map_stage_id')

            if not (mapping_superstage.get('input_args') or blank_workflow):
                continue

            mapping_stage_input = {}
            if superstage_name != "Rep1":
                # Every other mapping stage reuses whatever reference Rep1's
                # mapping stage is given.
                mapping_stage_input['reference_tar'] = stage_input_link(
                    rep1_stage_id, 'reference_tar')
            elif args.reference:
                mapping_stage_input['reference_tar'] = dxpy.dxlink(
                    reference_tar.get_id())
            if not blank_workflow:
                # read pairs assumed be in order read1,read2
                for arg_index, input_arg in enumerate(mapping_superstage['input_args']):
                    reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                    mapping_stage_input['reads%d' % (arg_index + 1)] = reads
            # The mapping stage itself was created in the first pass; only
            # its inputs are filled in here.
            workflow.update_stage(superstage_id, stage_input=mapping_stage_input)

            filter_qc_stage_input = {
                'input_bam': stage_output_link(superstage_id, 'mapped_reads'),
                'paired_end': stage_output_link(superstage_id, 'paired_end')
            }
            if args.scrub is not None:
                filter_qc_stage_input.update({'scrub': args.scrub})
            filter_qc_stage_id = workflow.add_stage(
                filter_qc_applet,
                name='Filter_QC %s' % (superstage_name),
                folder=filter_qc_output_folder,
                stage_input=filter_qc_stage_input
            )
            mapping_superstage.update({'filter_qc_stage_id': filter_qc_stage_id})

            xcor_stage_id = workflow.add_stage(
                xcor_applet,
                name='Xcor %s' % (superstage_name),
                folder=xcor_output_folder,
                stage_input={
                    'input_bam': stage_output_link(filter_qc_stage_id, 'filtered_bam'),
                    'paired_end': stage_output_link(filter_qc_stage_id, 'paired_end'),
                    'spp_version': args.spp_version
                }
            )
            mapping_superstage.update({'xcor_stage_id': xcor_stage_id})

        def xcor_stage_for(name):
            # Id of the xcor stage of the named superstage (None if that
            # superstage had no inputs and so got no xcor stage).
            return next(
                ss.get('xcor_stage_id')
                for ss in mapping_superstages if ss['name'] == name)

        rep1_xcor_stage_id = xcor_stage_for('Rep1')
        exp_rep1_ta = stage_output_link(rep1_xcor_stage_id, 'tagAlign_file')
        exp_rep1_cc = stage_output_link(rep1_xcor_stage_id, 'CC_scores_file')
        rep1_paired_end = stage_output_link(rep1_xcor_stage_id, 'paired_end')
        if not simplicate_experiment:
            rep2_xcor_stage_id = xcor_stage_for('Rep2')
            exp_rep2_ta = stage_output_link(rep2_xcor_stage_id, 'tagAlign_file')
            exp_rep2_cc = stage_output_link(rep2_xcor_stage_id, 'CC_scores_file')
            rep2_paired_end = stage_output_link(rep2_xcor_stage_id, 'paired_end')
        else:
            exp_rep2_ta = None
            exp_rep2_cc = None
            rep2_paired_end = None

        ctl_rep1_ta = stage_output_link(xcor_stage_for('Ctl1'), 'tagAlign_file')
        if not unary_control and not simplicate_experiment:
            ctl_rep2_ta = stage_output_link(xcor_stage_for('Ctl2'), 'tagAlign_file')
        else:
            ctl_rep2_ta = None

    else:  # skipped the mapping, so just bring in the inputs from arguments
        if not blank_workflow:
            exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
            rep1_xcor_stage_description = xcor_stage_description_for(exp_rep1_ta)
            exp_rep1_cc = rep1_xcor_stage_description['execution']['output']['CC_scores_file']
            if args.rep1pe is None:
                print("Inferring rep1 PE-ness from analysis")
                rep1_paired_end = rep1_xcor_stage_description['execution']['output']['paired_end']
            else:
                rep1_paired_end = args.rep1pe
            if not simplicate_experiment:
                exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
                rep2_xcor_stage_description = xcor_stage_description_for(exp_rep2_ta)
                exp_rep2_cc = rep2_xcor_stage_description['execution']['output']['CC_scores_file']
                # rep2's endedness must come from rep2's own analysis or
                # from --rep2pe, never from rep1's.
                if args.rep2pe is None:
                    print("Inferring rep2 PE-ness from analysis")
                    rep2_paired_end = rep2_xcor_stage_description['execution']['output']['paired_end']
                else:
                    rep2_paired_end = args.rep2pe
            else:
                exp_rep2_ta = None
                exp_rep2_cc = None
                rep2_paired_end = None

            ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
            if not unary_control and not simplicate_experiment:
                ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
            else:
                ctl_rep2_ta = None
        else:  # blank workflow
            ctl_rep1_ta = None
            ctl_rep2_ta = None

            # here we need to calculate the cc scores files, because we're only
            # being supplied tagAligns
            # if we had mapped everything above we'd already have a handle to
            # the cc file
            xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project.get_id())
            xcor_output_folder = xcor_only_applet.name
            rep1_xcor_input = {'spp_version': args.spp_version}
            if args.rep1pe is not None:
                rep1_xcor_input.update({'paired_end': args.rep1pe})
            exp_rep1_cc_stage_id = workflow.add_stage(
                xcor_only_applet,
                name="Rep1 cross-correlation",
                folder=xcor_output_folder,
                stage_input=rep1_xcor_input
            )
            exp_rep1_cc = stage_output_link(exp_rep1_cc_stage_id, 'CC_scores_file')
            rep1_paired_end = stage_output_link(exp_rep1_cc_stage_id, 'paired_end')
            # Downstream stages take the same tagAlign that is supplied as
            # this stage's input.
            exp_rep1_ta = stage_input_link(exp_rep1_cc_stage_id, 'input_tagAlign')
            if not simplicate_experiment:
                rep2_xcor_input = {'spp_version': args.spp_version}
                if args.rep2pe is not None:
                    rep2_xcor_input.update({'paired_end': args.rep2pe})
                exp_rep2_cc_stage_id = workflow.add_stage(
                    xcor_only_applet,
                    name="Rep2 cross-correlation",
                    folder=xcor_output_folder,
                    stage_input=rep2_xcor_input
                )
                exp_rep2_cc = stage_output_link(exp_rep2_cc_stage_id, 'CC_scores_file')
                rep2_paired_end = stage_output_link(exp_rep2_cc_stage_id, 'paired_end')
                exp_rep2_ta = stage_input_link(exp_rep2_cc_stage_id, 'input_tagAlign')
            else:
                exp_rep2_cc = None
                exp_rep2_ta = None
                rep2_paired_end = None

    if not args.maponly:
        encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project.get_id())
        peaks_output_folder = encode_macs2_applet.name

        # for simplicate experiments and/or unary controls, some of the ta inputs
        # will have the value None
        macs2_stage_input_mapping = {
            'rep1_ta': exp_rep1_ta,
            'rep2_ta': exp_rep2_ta,
            'ctl1_ta': ctl_rep1_ta,
            'ctl2_ta': ctl_rep2_ta,
            'rep1_xcor': exp_rep1_cc,
            'rep2_xcor': exp_rep2_cc,
            'rep1_paired_end': rep1_paired_end,
            'rep2_paired_end': rep2_paired_end,
            'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
            'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)),
            'genomesize': genomesize,
            'chrom_sizes': chrom_sizes
        }

        # have to prune out any arguments with value None because DX will error
        # with arguments with null values
        macs2_stage_input = {
            k: v for k, v in macs2_stage_input_mapping.items() if v is not None}

        encode_macs2_stage_id = workflow.add_stage(
            encode_macs2_applet,
            name='ENCODE Peaks',
            folder=peaks_output_folder,
            stage_input=macs2_stage_input
        )

        if run_idr:
            encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project.get_id())
            idr_peaks_output_folder = encode_spp_applet.name
            PEAKS_STAGE_NAME = 'SPP Peaks'
            # for simplicate experiments and/or unary controls, some of the ta inputs
            # will have the value None
            peaks_stage_input_mapping = {
                'rep1_ta': exp_rep1_ta,
                'rep2_ta': exp_rep2_ta,
                'ctl1_ta': ctl_rep1_ta,
                'ctl2_ta': ctl_rep2_ta,
                'rep1_xcor': exp_rep1_cc,
                'rep2_xcor': exp_rep2_cc,
                'rep1_paired_end': rep1_paired_end,
                'rep2_paired_end': rep2_paired_end,
                'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                'idr_peaks': True,
                'spp_version': args.spp_version,
                'spp_instance': args.spp_instance
            }
            if chrom_sizes:
                peaks_stage_input_mapping.update({'chrom_sizes': chrom_sizes})
            else:
                # No file given: follow whatever the MACS2 stage receives.
                peaks_stage_input_mapping.update(
                    {'chrom_sizes': stage_input_link(encode_macs2_stage_id, 'chrom_sizes')})
            # have to prune out any arguments with value None because DX will error
            # with arguments with null values
            peaks_stage_input = {
                k: v for k, v in peaks_stage_input_mapping.items() if v is not None}

            encode_spp_stage_id = workflow.add_stage(
                encode_spp_applet,
                name=PEAKS_STAGE_NAME,
                folder=idr_peaks_output_folder,
                stage_input=peaks_stage_input
            )

            # TODO here I think we should abstract out all the IDR to one step like the two peak-calling steps
            idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project.get_id())
            encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project.get_id())
            idr_output_folder = idr_applet.name

            def add_idr_stage(stage_name, rep1_field, rep2_field, pooled_field):
                # Add one IDR stage comparing two peak sets produced by the
                # SPP stage against a pooled set from that same stage.
                return workflow.add_stage(
                    idr_applet,
                    name=stage_name,
                    folder=idr_output_folder,
                    stage_input={
                        'rep1_peaks': stage_output_link(encode_spp_stage_id, rep1_field),
                        'rep2_peaks': stage_output_link(encode_spp_stage_id, rep2_field),
                        'pooled_peaks': stage_output_link(encode_spp_stage_id, pooled_field)
                    }
                )

            if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow or simplicate_experiment:
                r1pr_idr_stage_id = add_idr_stage(
                    'IDR Rep 1 Self-pseudoreplicates',
                    'rep1pr1_peaks', 'rep1pr2_peaks', 'rep1_peaks')

                if not simplicate_experiment:
                    r2pr_idr_stage_id = add_idr_stage(
                        'IDR Rep 2 Self-pseudoreplicates',
                        'rep2pr1_peaks', 'rep2pr2_peaks', 'rep2_peaks')
                    reps_idr_stage_id = add_idr_stage(
                        'IDR True Replicates',
                        'rep1_peaks', 'rep2_peaks', 'pooled_peaks')
                    pooledpr_idr_stage_id = add_idr_stage(
                        'IDR Pooled Pseudoreplicates',
                        'pooledpr1_peaks', 'pooledpr2_peaks', 'pooled_peaks')

                final_idr_stage_input = {
                    'r1pr_peaks': stage_output_link(r1pr_idr_stage_id, 'IDR_peaks'),
                    'rep1_ta': exp_rep1_ta,
                    'rep1_xcor': exp_rep1_cc,
                    'paired_end': rep1_paired_end,  # applies to replicated experiments, too
                    'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                    'rep1_signal': stage_output_link(encode_macs2_stage_id, 'rep1_fc_signal')
                }
                if not simplicate_experiment:
                    final_idr_stage_input.update({
                        'reps_peaks': stage_output_link(reps_idr_stage_id, 'IDR_peaks'),
                        'r2pr_peaks': stage_output_link(r2pr_idr_stage_id, 'IDR_peaks'),
                        'pooledpr_peaks': stage_output_link(pooledpr_idr_stage_id, 'IDR_peaks'),
                        'rep2_ta': exp_rep2_ta,
                        'rep2_xcor': exp_rep2_cc,
                        'rep2_signal': stage_output_link(encode_macs2_stage_id, 'rep2_fc_signal'),
                        'pooled_signal': stage_output_link(encode_macs2_stage_id, 'pooled_fc_signal')
                    })

                if blacklist:
                    final_idr_stage_input.update({'blacklist': blacklist})
                if chrom_sizes:
                    final_idr_stage_input.update({'chrom_sizes': chrom_sizes})
                else:
                    final_idr_stage_input.update(
                        {'chrom_sizes': stage_input_link(encode_spp_stage_id, 'chrom_sizes')})
                workflow.add_stage(
                    encode_idr_applet,
                    name='Final IDR peak calls',
                    folder=idr_output_folder,
                    stage_input=final_idr_stage_input,
                )

        if target_type == 'histone':
            overlap_peaks_applet = find_applet_by_name(OVERLAP_PEAKS_APPLET_NAME, applet_project.get_id())
            # (MACS2 output prefix, .as file argument, value of 'peak_type')
            peak_types = (
                ('narrowpeaks', args.narrowpeak_as, 'narrowPeak'),
                ('gappedpeaks', args.gappedpeak_as, 'gappedPeak'),
                ('broadpeaks', args.broadpeak_as, 'broadPeak'),
            )
            for peaktype, as_arg, peak_type_extension in peak_types:
                as_file = dxpy.dxlink(resolve_file(as_arg))

                if not simplicate_experiment:
                    overlap_peaks_stage_input = {
                        'rep1_peaks': stage_output_link(encode_macs2_stage_id, 'rep1_%s' % (peaktype)),
                        'rep2_peaks': stage_output_link(encode_macs2_stage_id, 'rep2_%s' % (peaktype)),
                        'pooled_peaks': stage_output_link(encode_macs2_stage_id, 'pooled_%s' % (peaktype)),
                        'pooledpr1_peaks': stage_output_link(encode_macs2_stage_id, 'pooledpr1_%s' % (peaktype)),
                        'pooledpr2_peaks': stage_output_link(encode_macs2_stage_id, 'pooledpr2_%s' % (peaktype)),
                        'rep1_ta': exp_rep1_ta,
                        'rep1_xcor': exp_rep1_cc,
                        'rep2_ta': exp_rep2_ta,
                        'rep2_xcor': exp_rep2_cc,
                        'paired_end': rep1_paired_end,  # applies to replicated experiments, too
                        'as_file': as_file,
                        'peak_type': peak_type_extension,
                        'prefix': 'final',
                        'rep1_signal': stage_output_link(encode_macs2_stage_id, 'rep1_fc_signal'),
                        'rep2_signal': stage_output_link(encode_macs2_stage_id, 'rep2_fc_signal'),
                        'pooled_signal': stage_output_link(encode_macs2_stage_id, 'pooled_fc_signal')
                    }
                else:
                    # With a single replicate, its two pseudoreplicates stand
                    # in for the replicates and rep1 stands in for the pool.
                    overlap_peaks_stage_input = {
                        'rep1_peaks': stage_output_link(encode_macs2_stage_id, 'rep1pr1_%s' % (peaktype)),
                        'rep2_peaks': stage_output_link(encode_macs2_stage_id, 'rep1pr2_%s' % (peaktype)),
                        'pooled_peaks': stage_output_link(encode_macs2_stage_id, 'rep1_%s' % (peaktype)),
                        'rep1_ta': exp_rep1_ta,
                        'rep1_xcor': exp_rep1_cc,
                        'paired_end': rep1_paired_end,
                        'as_file': as_file,
                        'peak_type': peak_type_extension,
                        'prefix': 'final',
                        'rep1_signal': stage_output_link(encode_macs2_stage_id, 'rep1_fc_signal')
                    }
                if chrom_sizes:
                    overlap_peaks_stage_input.update({'chrom_sizes': chrom_sizes})
                else:
                    overlap_peaks_stage_input.update(
                        {'chrom_sizes': stage_input_link(encode_macs2_stage_id, 'chrom_sizes')})

                workflow.add_stage(
                    overlap_peaks_applet,
                    name='Final %s' % (peaktype),
                    folder=peaks_output_folder,
                    stage_input=overlap_peaks_stage_input
                )

    if args.yes:
        if args.debug:
            analysis = workflow.run({}, folder=output_folder, priority='high', debug={'debugOn': ['AppInternalError', 'AppError']}, delay_workspace_destruction=True, allow_ssh=['*'])
        else:
            analysis = workflow.run({}, folder=output_folder, priority='normal')

        analysis.set_properties({
            "target_type": target_type,
            "unreplicated_experiment": str(simplicate_experiment),
            "unary_control": str(unary_control)
        })
        print("Running %s as %s" % (analysis.name, analysis.get_id()))

        if args.accession:
            accession_analysis_applet = find_applet_by_name(ACCESSION_ANALYSIS_APPLET_NAME, applet_project.get_id())
            accession_output_folder = '/' + accession_analysis_applet.name
            accession_job_input = {
                'analysis_ids': [analysis.get_id()],
                'wait_on_files': []
            }
            if args.fqcheck is not None:
                accession_job_input.update({'fqcheck': args.fqcheck})
            if args.skip_control is not None:
                accession_job_input.update({'skip_control': args.skip_control})
            if args.force_patch is not None:
                accession_job_input.update({'force_patch': args.force_patch})
            time.sleep(5)
            max_retries = 10
            retries = max_retries
            # Remembered outside the except clause so the final failure
            # message can report what actually went wrong.
            last_error = None
            while retries:
                try:
                    accession_job = accession_analysis_applet.run(
                        accession_job_input,
                        name='Accession %s' % (analysis.name),
                        folder=accession_output_folder,
                        depends_on=analysis.describe()['dependsOn']
                    )
                except Exception as e:
                    last_error = e
                    logging.error("%s launching auto-accession ... %d retries left" % (e, retries))
                    time.sleep(5)
                    retries -= 1
                    continue
                else:
                    logging.info("Auto-accession will run as %s %s" % (accession_job.name, accession_job.get_id()))
                    break
            else:
                # Only reached when every retry failed (no ``break``).
                logging.error("Auto-accession failed with %s" % (last_error))
def main():
    """Assemble the peak-calling workflow on DNAnexus and optionally run it.

    Command-line options come from ``get_args()``.  A new workflow is created
    in the output project/folder and populated with stages in this order:

    1. Unless ``args.nomap`` is set, a ``Map``, ``Filter_QC`` and ``Xcor``
       stage for each of Rep1, Rep2, Ctl1 (and Ctl2 unless
       ``args.unary_control`` is set).  With ``args.nomap`` the replicate and
       control inputs are taken directly from the arguments and only two
       cross-correlation stages (Rep1, Rep2) are added.
    2. The ``SPP Peaks`` stage and the ``ENCODE Peaks`` stage.
    3. If ``args.idr`` is set and the required inputs are present (or the
       workflow is blank), four IDR stages plus ``Final IDR peak calls``.

    A workflow is "blank" when none of rep1/rep2/ctl1/ctl2 were supplied; its
    mapping stages are created without read inputs.

    When ``args.unary_control`` is set, control 1 is used for both replicates
    in the mapping branch and in the ``args.nomap`` branch alike, so
    ``args.ctl2`` is never read.

    If ``args.yes`` is set the workflow is launched with high priority
    (with debug hold/ssh options when ``args.debug`` is also set).
    """
    args = get_args()

    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s', output_project.name)
    output_folder = resolve_folder(output_project, args.outf)
    logging.debug('Using output folder %s', output_folder)
    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s', applet_project.name)
    applet_project_id = applet_project.get_id()

    def stage_output(stage_id, field):
        """Return a link to output ``field`` of workflow stage ``stage_id``."""
        return dxpy.dxlink({'stage': stage_id, 'outputField': field})

    def stage_input_ref(stage_id, field):
        """Return a link to input ``field`` of workflow stage ``stage_id``."""
        return dxpy.dxlink({'stage': stage_id, 'inputField': field})

    def applet_folder(applet):
        """Resolve the output sub-folder named after ``applet``."""
        return resolve_folder(output_project, output_folder + '/' + applet.name)

    def file_link(path):
        """Resolve ``path`` to a file and return a link to its ID."""
        return dxpy.dxlink(resolve_file(path).get_id())

    workflow = dxpy.new_dxworkflow(
        name=args.name,
        title=args.title,
        description=WF_DESCRIPTION,
        project=output_project.get_id(),
        folder=output_folder)

    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

    # Optional inputs: None means "leave the stage input unset".
    genomesize = args.genomesize or None
    chrom_sizes = (
        dxpy.dxlink(resolve_file(args.chrom_sizes)) if args.chrom_sizes else None)
    blacklist = (
        dxpy.dxlink(resolve_file(args.blacklist)) if args.blacklist else None)

    if not args.nomap:
        # A "superstage" is just a dict with a name, the input file name(s),
        # and the IDs of the stages that process that input.  The order of
        # this list determines the order of the stages in the workflow.
        mapping_superstages = [
            {'name': 'Rep1', 'input_args': args.rep1},
            {'name': 'Rep2', 'input_args': args.rep2},
            {'name': 'Ctl1', 'input_args': args.ctl1},
        ]
        if not args.unary_control:
            mapping_superstages.append({'name': 'Ctl2', 'input_args': args.ctl2})

        mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME, applet_project_id)
        mapping_output_folder = applet_folder(mapping_applet)
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME, applet_project_id)
        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME, applet_project_id)
        # Filter/QC and cross-correlation outputs share the mapping folder.
        filter_qc_output_folder = mapping_output_folder
        xcor_output_folder = mapping_output_folder

        # First pass: create every mapping stage so that its ID exists before
        # any other stage input needs to refer to it.
        for superstage in mapping_superstages:
            superstage['map_stage_id'] = workflow.add_stage(
                mapping_applet,
                name='Map %s' % superstage['name'],
                folder=mapping_output_folder)

        map_stage_ids = {ss['name']: ss['map_stage_id'] for ss in mapping_superstages}
        rep1_stage_id = map_stage_ids['Rep1']

        # Second pass: populate the mapping inputs and add downstream stages.
        for superstage in mapping_superstages:
            if not (superstage['input_args'] or blank_workflow):
                continue
            superstage_name = superstage['name']
            map_stage_id = superstage['map_stage_id']

            mapping_stage_input = {}
            if superstage_name != 'Rep1':
                # Every other mapping stage takes its reference from whatever
                # is given to the Rep1 mapping stage.
                mapping_stage_input['reference_tar'] = stage_input_ref(
                    rep1_stage_id, 'reference_tar')
            elif args.reference:
                mapping_stage_input['reference_tar'] = dxpy.dxlink(
                    reference_tar.get_id())
            if not blank_workflow:
                # Read files are assumed to be in the order read1, read2.
                for read_number, input_arg in enumerate(
                        superstage['input_args'], start=1):
                    mapping_stage_input['reads%d' % read_number] = file_link(input_arg)
            workflow.update_stage(map_stage_id, stage_input=mapping_stage_input)

            filter_qc_stage_id = workflow.add_stage(
                filter_qc_applet,
                name='Filter_QC %s' % superstage_name,
                folder=filter_qc_output_folder,
                stage_input={
                    'input_bam': stage_output(map_stage_id, 'mapped_reads'),
                    'paired_end': stage_output(map_stage_id, 'paired_end'),
                })
            superstage['filter_qc_stage_id'] = filter_qc_stage_id

            superstage['xcor_stage_id'] = workflow.add_stage(
                xcor_applet,
                name='Xcor %s' % superstage_name,
                folder=xcor_output_folder,
                stage_input={
                    'input_bam': stage_output(filter_qc_stage_id, 'filtered_bam'),
                    'paired_end': stage_output(filter_qc_stage_id, 'paired_end'),
                })

        # A superstage skipped above has no 'xcor_stage_id'; its entry is None
        # here, exactly as a by-name lookup with .get() would produce.
        xcor_stage_ids = {
            ss['name']: ss.get('xcor_stage_id') for ss in mapping_superstages}

        exp_rep1_ta = stage_output(xcor_stage_ids['Rep1'], 'tagAlign_file')
        exp_rep1_cc = stage_output(xcor_stage_ids['Rep1'], 'CC_scores_file')
        exp_rep2_ta = stage_output(xcor_stage_ids['Rep2'], 'tagAlign_file')
        exp_rep2_cc = stage_output(xcor_stage_ids['Rep2'], 'CC_scores_file')
        ctl_rep1_ta = stage_output(xcor_stage_ids['Ctl1'], 'tagAlign_file')
        if args.unary_control:
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = stage_output(xcor_stage_ids['Ctl2'], 'tagAlign_file')
        rep1_paired_end = stage_output(xcor_stage_ids['Rep1'], 'paired_end')
        rep2_paired_end = stage_output(xcor_stage_ids['Rep2'], 'paired_end')
    else:
        # Mapping was skipped, so bring the inputs in straight from arguments.
        exp_rep1_ta = file_link(args.rep1[0])
        exp_rep2_ta = file_link(args.rep2[0])
        ctl_rep1_ta = file_link(args.ctl1[0])
        if args.unary_control:
            # Same rule as the mapping branch: one control serves both
            # replicates, so args.ctl2 (possibly absent) is never indexed.
            ctl_rep2_ta = ctl_rep1_ta
        else:
            ctl_rep2_ta = file_link(args.ctl2[0])
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        # Only tagAligns were supplied, so the cross-correlation score files
        # still have to be calculated; the mapping branch gets them from its
        # Xcor stages instead.
        xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME, applet_project_id)
        xcor_output_folder = applet_folder(xcor_only_applet)
        xcor_only_stages = []

        exp_rep1_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep1 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep1_ta,
                'paired_end': rep1_paired_end,
            })
        xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        exp_rep1_cc = stage_output(exp_rep1_cc_stage_id, 'CC_scores_file')

        exp_rep2_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep2 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep2_ta,
                'paired_end': rep2_paired_end,
            })
        xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        exp_rep2_cc = stage_output(exp_rep2_cc_stage_id, 'CC_scores_file')

    encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME, applet_project_id)
    encode_spp_stages = []
    idr_peaks_output_folder = applet_folder(encode_spp_applet)
    PEAKS_STAGE_NAME = 'SPP Peaks'
    # The narrowPeak .as file feeds several stages; resolve it once.
    narrowpeak_as = dxpy.dxlink(resolve_file(args.narrowpeak_as))

    # Inputs common to both peak-calling stages.
    replicate_inputs = {
        'rep1_ta': exp_rep1_ta,
        'rep2_ta': exp_rep2_ta,
        'ctl1_ta': ctl_rep1_ta,
        'ctl2_ta': ctl_rep2_ta,
        'rep1_xcor': exp_rep1_cc,
        'rep2_xcor': exp_rep2_cc,
        'rep1_paired_end': rep1_paired_end,
        'rep2_paired_end': rep2_paired_end,
    }

    peaks_stage_input = dict(replicate_inputs)
    peaks_stage_input['as_file'] = narrowpeak_as
    peaks_stage_input['idr_peaks'] = args.idr
    if chrom_sizes:
        peaks_stage_input['chrom_sizes'] = chrom_sizes
    encode_spp_stage_id = workflow.add_stage(
        encode_spp_applet,
        name=PEAKS_STAGE_NAME,
        folder=idr_peaks_output_folder,
        stage_input=peaks_stage_input)
    encode_spp_stages.append(
        {'name': PEAKS_STAGE_NAME, 'stage_id': encode_spp_stage_id})

    # When no chrom_sizes file was given, later stages take theirs from
    # whatever is supplied to the SPP peaks stage.
    chrom_sizes_or_spp_input = (
        chrom_sizes if chrom_sizes
        else stage_input_ref(encode_spp_stage_id, 'chrom_sizes'))

    encode_macs2_applet = find_applet_by_name(ENCODE_MACS2_APPLET_NAME, applet_project_id)
    peaks_output_folder = applet_folder(encode_macs2_applet)

    macs2_stage_input = dict(replicate_inputs)
    macs2_stage_input['narrowpeak_as'] = narrowpeak_as
    macs2_stage_input['gappedpeak_as'] = dxpy.dxlink(resolve_file(args.gappedpeak_as))
    macs2_stage_input['broadpeak_as'] = dxpy.dxlink(resolve_file(args.broadpeak_as))
    if genomesize:
        macs2_stage_input['genomesize'] = genomesize
    macs2_stage_input['chrom_sizes'] = chrom_sizes_or_spp_input
    workflow.add_stage(
        encode_macs2_applet,
        name='ENCODE Peaks',
        folder=peaks_output_folder,
        stage_input=macs2_stage_input)

    if args.idr:
        idr_applet = find_applet_by_name(IDR2_APPLET_NAME, applet_project_id)
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME, applet_project_id)
        idr_stages = []
        idr_output_folder = applet_folder(idr_applet)

        def add_idr_stage(stage_name, rep1_field, rep2_field, pooled_field):
            """Add one IDR stage fed by three outputs of the SPP peaks stage."""
            stage_id = workflow.add_stage(
                idr_applet,
                name=stage_name,
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks': stage_output(encode_spp_stage_id, rep1_field),
                    'rep2_peaks': stage_output(encode_spp_stage_id, rep2_field),
                    'pooled_peaks': stage_output(encode_spp_stage_id, pooled_field),
                })
            idr_stages.append({'name': stage_name, 'stage_id': stage_id})
            return stage_id

        if (args.rep1 and args.ctl1 and args.rep2) or blank_workflow:
            true_reps_id = add_idr_stage(
                'IDR True Replicates',
                'rep1_peaks', 'rep2_peaks', 'pooled_peaks')
            rep1_self_id = add_idr_stage(
                'IDR Rep 1 Self-pseudoreplicates',
                'rep1pr1_peaks', 'rep1pr2_peaks', 'rep1_peaks')
            rep2_self_id = add_idr_stage(
                'IDR Rep 2 Self-pseudoreplicates',
                'rep2pr1_peaks', 'rep2pr2_peaks', 'rep2_peaks')
            pooled_pr_id = add_idr_stage(
                'IDR Pooled Pseudoreplicates',
                'pooledpr1_peaks', 'pooledpr2_peaks', 'pooled_peaks')

            final_idr_input = {
                'reps_peaks': stage_output(true_reps_id, 'IDR_peaks'),
                'r1pr_peaks': stage_output(rep1_self_id, 'IDR_peaks'),
                'r2pr_peaks': stage_output(rep2_self_id, 'IDR_peaks'),
                'pooledpr_peaks': stage_output(pooled_pr_id, 'IDR_peaks'),
                'as_file': narrowpeak_as,
            }
            if blacklist:
                final_idr_input['blacklist'] = blacklist
            final_idr_input['chrom_sizes'] = chrom_sizes_or_spp_input
            final_idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_input)
            idr_stages.append(
                {'name': 'Final IDR peak calls', 'stage_id': final_idr_stage_id})

    if not args.nomap:
        logging.debug("Mapping stages: %s", mapping_superstages)
    else:
        logging.debug("xcor only stages: %s", xcor_only_stages)
    logging.debug("Peak stages: %s", encode_spp_stages)
    if args.idr:
        logging.debug("IDR stages: %s", idr_stages)

    if args.yes:
        run_options = {'priority': 'high'}
        if args.debug:
            # Keep failed executions around and reachable for inspection.
            run_options.update(
                debug={'debugOn': ['AppInternalError', 'AppError']},
                delay_workspace_destruction=True,
                allow_ssh=['255.255.255.255'])
        launched = workflow.run({}, **run_options)
        logging.info("Running as job %s", launched)
Beispiel #27
0
def main():
    args = get_args()

    output_project = resolve_project(args.outp, 'w')
    logging.info('Found output project %s' % (output_project.name))
    output_folder = resolve_folder(output_project, args.outf)
    logging.info('Using output folder %s' % (output_folder))
    applet_project = resolve_project(args.applets, 'r')
    logging.info('Found applet project %s' % (applet_project.name))

    workflow = dxpy.new_dxworkflow(name=WF_NAME,
                                   title=args.name,
                                   description=WF_DESCRIPTION,
                                   project=output_project.get_id(),
                                   folder=output_folder)

    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)

    if not args.nomap:
        #a "superstage" is just a dict with a name, name(s) of input files, and then names and id's of stages that process that input
        #each superstage here could be implemented as a stage in a more abstract workflow.  That stage would then call the various applets that are separate
        #stages here.
        mapping_superstages = [
            {
                'name': 'Rep1',
                'input_args': args.rep1
            }, {
                'name': 'Rep2',
                'input_args': args.rep2
            }, {
                'name': 'Ctl1',
                'input_args': args.ctl1
            }, {
                'name': 'Ctl2',
                'input_args': args.ctl2
            }
            # {'name': 'Pooled Reps', 'input_args': (args.rep1 and args.rep2)},
            # {'name': 'Pooled Controls', 'input_args': (args.ctl1 and args.ctl2)} ##idea is to create a "stub" stage and then populate it's input with the output of the pool stage, defined below
        ]

        mapping_applet = find_applet_by_name(MAPPING_APPLET_NAME,
                                             applet_project.get_id())
        mapping_output_folder = resolve_folder(
            output_project, output_folder + '/' + mapping_applet.name)
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet_by_name(FILTER_QC_APPLET_NAME,
                                               applet_project.get_id())
        filter_qc_output_folder = mapping_output_folder
        xcor_applet = find_applet_by_name(XCOR_APPLET_NAME,
                                          applet_project.get_id())
        xcor_output_folder = mapping_output_folder

        for mapping_superstage in mapping_superstages:
            superstage_name = mapping_superstage.get('name')

            if mapping_superstage.get('input_args') or blank_workflow:
                if blank_workflow:
                    mapping_stage_input = None
                else:
                    mapping_stage_input = {
                        'reference_tar': dxpy.dxlink(reference_tar.get_id())
                    }
                    for arg_index, input_arg in enumerate(
                            mapping_superstage['input_args']
                    ):  #read pairs assumed be in order read1,read2
                        reads = dxpy.dxlink(resolve_file(input_arg).get_id())
                        mapping_stage_input.update(
                            {'reads%d' % (arg_index + 1): reads})

                mapped_stage_id = workflow.add_stage(
                    mapping_applet,
                    name='Map %s' % (superstage_name),
                    folder=mapping_output_folder,
                    stage_input=mapping_stage_input)
                mapping_superstage.update({'map_stage_id': mapped_stage_id})

                filter_qc_stage_id = workflow.add_stage(
                    filter_qc_applet,
                    name='Filter_QC %s' % (superstage_name),
                    folder=filter_qc_output_folder,
                    stage_input={
                        'input_bam':
                        dxpy.dxlink({
                            'stage': mapped_stage_id,
                            'outputField': 'mapped_reads'
                        }),
                        'paired_end':
                        dxpy.dxlink({
                            'stage': mapped_stage_id,
                            'outputField': 'paired_end'
                        })
                    })
                mapping_superstage.update(
                    {'filter_qc_stage_id': filter_qc_stage_id})

                xcor_stage_id = workflow.add_stage(xcor_applet,
                                                   name='Xcor %s' %
                                                   (superstage_name),
                                                   folder=xcor_output_folder,
                                                   stage_input={
                                                       'input_bam':
                                                       dxpy.dxlink({
                                                           'stage':
                                                           filter_qc_stage_id,
                                                           'outputField':
                                                           'filtered_bam'
                                                       }),
                                                       'paired_end':
                                                       dxpy.dxlink({
                                                           'stage':
                                                           filter_qc_stage_id,
                                                           'outputField':
                                                           'paired_end'
                                                       })
                                                   })
                mapping_superstage.update({'xcor_stage_id': xcor_stage_id})

        exp_rep1_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'tagAlign_file'
        })
        exp_rep1_cc = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'CC_scores_file'
        })
        exp_rep2_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'tagAlign_file'
        })
        exp_rep2_cc = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'CC_scores_file'
        })
        ctl_rep1_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Ctl1'),
            'outputField':
            'tagAlign_file'
        })
        ctl_rep2_ta = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Ctl2'),
            'outputField':
            'tagAlign_file'
        })
        rep1_paired_end = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep1'),
            'outputField':
            'paired_end'
        })
        rep2_paired_end = dxpy.dxlink({
            'stage':
            next(
                ss.get('xcor_stage_id') for ss in mapping_superstages
                if ss['name'] == 'Rep2'),
            'outputField':
            'paired_end'
        })
    else:  #skipped the mapping, so just bring in the inputs from arguments
        exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
        exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
        ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
        ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        #here we need to calculate the cc scores files, because we're only being supplied tagAligns
        #if we had mapped everything above we'd already have a handle to the cc file
        xcor_only_applet = find_applet_by_name(XCOR_ONLY_APPLET_NAME,
                                               applet_project.get_id())
        xcor_output_folder = resolve_folder(
            output_project, output_folder + '/' + xcor_only_applet.name)
        xcor_only_stages = []

        exp_rep1_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep1 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep1_ta,
                'paired_end': rep1_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
        exp_rep1_cc = dxpy.dxlink({
            'stage': exp_rep1_cc_stage_id,
            'outputField': 'CC_scores_file'
        })

        exp_rep2_cc_stage_id = workflow.add_stage(
            xcor_only_applet,
            name="Rep2 cross-correlation",
            folder=xcor_output_folder,
            stage_input={
                'input_tagAlign': exp_rep2_ta,
                'paired_end': rep2_paired_end
            })
        xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
        exp_rep2_cc = dxpy.dxlink({
            'stage': exp_rep2_cc_stage_id,
            'outputField': 'CC_scores_file'
        })

    # if not args.idronly:
    # 	spp_applet = find_applet_by_name(SPP_APPLET_NAME, applet_project.get_id())
    # 	peaks_output_folder = resolve_folder(output_project, output_folder + '/' + spp_applet.name)
    # 	spp_stages = []
    # 	if (args.rep1 and args.ctl1) or blank_workflow:
    # 		rep1_spp_stage_id = workflow.add_stage(
    # 			spp_applet,
    # 			name='Peaks Rep1',
    # 			folder=peaks_output_folder,
    # 			stage_input={
    # 				'experiment': exp_rep1_ta,
    # 				'control': ctl_rep1_ta,
    # 				'xcor_scores_input': exp_rep1_cc,
    # 				'bigbed': True,
    # 				'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
    # 				'as_file': dxpy.dxlink(resolve_file(args.as_file))
    # 			}

    # 		)
    # 		spp_stages.append({'name': 'Peaks Rep1', 'stage_id': rep1_spp_stage_id})
    # 	if (args.rep2 and args.ctl2) or blank_workflow:
    # 		rep2_spp_stage_id = workflow.add_stage(
    # 			spp_applet,
    # 			name='Peaks Rep2',
    # 			folder=peaks_output_folder,
    # 			stage_input={
    # 				'experiment': exp_rep2_ta,
    # 				'control': ctl_rep2_ta,
    # 				'xcor_scores_input': exp_rep2_cc,
    # 				'bigbed': True,
    # 				'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
    # 				'as_file': dxpy.dxlink(resolve_file(args.as_file))
    # 			}
    # 		)
    # 		spp_stages.append({'name': 'Peaks Rep2', 'stage_id': rep2_spp_stage_id})

    encode_spp_applet = find_applet_by_name(ENCODE_SPP_APPLET_NAME,
                                            applet_project.get_id())
    encode_spp_stages = []
    idr_peaks_output_folder = resolve_folder(
        output_project, output_folder + '/' + encode_spp_applet.name)
    PEAKS_STAGE_NAME = 'SPP Peaks'
    if (args.rep1 and args.ctl1 and args.rep2 and args.ctl2) or blank_workflow:
        encode_spp_stage_id = workflow.add_stage(
            encode_spp_applet,
            name=PEAKS_STAGE_NAME,
            folder=idr_peaks_output_folder,
            stage_input={
                'rep1_ta': exp_rep1_ta,
                'rep2_ta': exp_rep2_ta,
                'ctl1_ta': ctl_rep1_ta,
                'ctl2_ta': ctl_rep2_ta,
                'rep1_xcor': exp_rep1_cc,
                'rep2_xcor': exp_rep2_cc,
                'rep1_paired_end': rep1_paired_end,
                'rep2_paired_end': rep2_paired_end,
                'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                'as_file': dxpy.dxlink(resolve_file(args.as_file)),
                'idr_peaks': args.idr
            })

    encode_spp_stages.append({
        'name': PEAKS_STAGE_NAME,
        'stage_id': encode_spp_stage_id
    })

    if args.idr:
        idr_applet = find_applet_by_name(IDR_APPLET_NAME,
                                         applet_project.get_id())
        encode_idr_applet = find_applet_by_name(ENCODE_IDR_APPLET_NAME,
                                                applet_project.get_id())
        idr_stages = []
        idr_output_folder = resolve_folder(
            output_project, output_folder + '/' + idr_applet.name)
        if (args.rep1 and args.ctl1 and args.rep2
                and args.ctl2) or blank_workflow:
            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR True Replicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooled_peaks'
                    }),
                    'idr_version':
                    int(args.idrversion)
                })
            idr_stages.append({
                'name': 'IDR True Replicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 1 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1pr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1pr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep1_peaks'
                    }),
                    'idr_version':
                    int(args.idrversion)
                })
            idr_stages.append({
                'name': 'IDR Rep 1 Self-pseudoreplicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Rep 2 Self-pseudoreplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2pr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2pr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'rep2_peaks'
                    }),
                    'idr_version':
                    int(args.idrversion)
                })
            idr_stages.append({
                'name': 'IDR Rep 2 Self-pseudoreplicates',
                'stage_id': idr_stage_id
            })

            idr_stage_id = workflow.add_stage(
                idr_applet,
                name='IDR Pooled Pseudoeplicates',
                folder=idr_output_folder,
                stage_input={
                    'rep1_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooledpr1_peaks'
                    }),
                    'rep2_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooledpr2_peaks'
                    }),
                    'pooled_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in encode_spp_stages
                            if ss['name'] == PEAKS_STAGE_NAME),
                        'outputField':
                        'pooled_peaks'
                    }),
                    'idr_version':
                    int(args.idrversion)
                })
            idr_stages.append({
                'name': 'IDR Pooled Pseudoreplicates',
                'stage_id': idr_stage_id
            })

            blacklist = resolve_file(args.blacklist)
            idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input={
                    'reps_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in idr_stages
                            if ss['name'] == 'IDR True Replicates'),
                        'outputField':
                        'IDR_peaks'
                    }),
                    'r1pr_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in idr_stages if
                            ss['name'] == 'IDR Rep 1 Self-pseudoreplicates'),
                        'outputField':
                        'IDR_peaks'
                    }),
                    'r2pr_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in idr_stages if
                            ss['name'] == 'IDR Rep 2 Self-pseudoreplicates'),
                        'outputField':
                        'IDR_peaks'
                    }),
                    'pooledpr_peaks':
                    dxpy.dxlink({
                        'stage':
                        next(
                            ss.get('stage_id') for ss in idr_stages
                            if ss['name'] == 'IDR Pooled Pseudoreplicates'),
                        'outputField':
                        'IDR_peaks'
                    }),
                    'blacklist':
                    dxpy.dxlink(blacklist.get_id()),
                    'chrom_sizes':
                    dxpy.dxlink(resolve_file(args.chrom_sizes)),
                    'as_file':
                    dxpy.dxlink(resolve_file(args.as_file))
                })
            idr_stages.append({
                'name': 'Final IDR peak calls',
                'stage_id': idr_stage_id
            })

    if not (args.nomap):
        logging.debug("Mapping stages: %s" % (mapping_superstages))
    else:
        logging.debug("xcor only stages: %s" % (xcor_only_stages))
    # if not args.idronly:
    # 	logging.debug("Peak stages: %s" %(spp_stages))
    logging.debug("Peak stages: %s" % (encode_spp_stages))
    if args.idr:
        logging.debug("IDR stages: %s" % (idr_stages))

    if args.yes:
        job_id = workflow.run({}, delay_workspace_destruction=True)
        logging.info("Running as job %s" % (job_id))
def main():
    """Assemble the mapping / peak-calling / IDR workflow and optionally run it.

    Everything is driven by the parsed command-line arguments returned by
    ``get_args()``:

    * A new workflow is created in the output project and folder.
    * Unless ``args.nomap`` is set, a Map -> Filter_QC -> Xcor chain of stages
      is added for each of Rep1, Rep2, Ctl1 and Ctl2 that has input files, or
      for all four (with no mapping input) when no inputs were given at all
      (a "blank" workflow).
    * With ``args.nomap`` the replicate/control arguments are used directly
      as the tagAlign inputs and a cross-correlation stage is added for each
      of the two replicates.
    * When all four inputs are present, or the workflow is blank, an
      "ENCODE Peaks" stage and one "Overlap <peaktype>" stage per peak type
      are added.
    * With ``args.idr`` the four IDR comparison stages and the
      "Final IDR peak calls" stage are added under the same condition.
    * With ``args.yes`` the finished workflow is run.

    Returns:
        None.
    """
    args = get_args()

    output_project = resolve_project(args.outp, 'w')
    logging.info('Found output project %s', output_project.name)
    output_folder = resolve_folder(output_project, args.outf)
    logging.info('Using output folder %s', output_folder)
    applet_project = resolve_project(args.applets, 'r')
    logging.info('Found applet project %s', applet_project.name)

    def find_applet(applet_name):
        # Every applet used by this workflow is looked up in the applet project.
        return find_applet_by_name(applet_name, applet_project.get_id())

    def stage_output(stage_id, field):
        # Link that feeds one stage's named output into another stage's input.
        return dxpy.dxlink({'stage': stage_id, 'outputField': field})

    workflow = dxpy.new_dxworkflow(
        title=WF_TITLE,
        name=args.name,
        description=WF_DESCRIPTION,
        project=output_project.get_id(),
        folder=output_folder)

    # No inputs at all means "build an empty template workflow".
    blank_workflow = not (args.rep1 or args.rep2 or args.ctl1 or args.ctl2)
    # Peak calling (and everything downstream) needs both replicates and both
    # controls, unless we are only building the blank template.
    build_peak_stages = bool(
        (args.rep1 and args.ctl1 and args.rep2 and args.ctl2)
        or blank_workflow)

    if not args.nomap:
        # A "superstage" is just a dict with a name, the input files, and then
        # the ids of the stages that process that input.  Each superstage
        # could be implemented as a single stage in a more abstract workflow
        # that calls the applets which are separate stages here.
        mapping_superstages = [
            {'name': 'Rep1', 'input_args': args.rep1},
            {'name': 'Rep2', 'input_args': args.rep2},
            {'name': 'Ctl1', 'input_args': args.ctl1},
            {'name': 'Ctl2', 'input_args': args.ctl2},
        ]

        mapping_applet = find_applet(MAPPING_APPLET_NAME)
        # Filter_QC and Xcor stages share the mapping output folder.
        mapping_output_folder = resolve_folder(
            output_project, output_folder + '/' + mapping_applet.name)
        reference_tar = resolve_file(args.reference)
        filter_qc_applet = find_applet(FILTER_QC_APPLET_NAME)
        xcor_applet = find_applet(XCOR_APPLET_NAME)

        for superstage in mapping_superstages:
            if not (superstage.get('input_args') or blank_workflow):
                continue
            superstage_name = superstage['name']

            if blank_workflow:
                mapping_stage_input = None
            else:
                mapping_stage_input = {
                    'reference_tar': dxpy.dxlink(reference_tar.get_id())}
                # Read pairs are assumed to be in order read1, read2.
                for read_number, input_arg in enumerate(
                        superstage['input_args'], start=1):
                    mapping_stage_input['reads%d' % read_number] = dxpy.dxlink(
                        resolve_file(input_arg).get_id())

            mapped_stage_id = workflow.add_stage(
                mapping_applet,
                name='Map %s' % superstage_name,
                folder=mapping_output_folder,
                stage_input=mapping_stage_input)
            superstage['map_stage_id'] = mapped_stage_id

            filter_qc_stage_id = workflow.add_stage(
                filter_qc_applet,
                name='Filter_QC %s' % superstage_name,
                folder=mapping_output_folder,
                stage_input={
                    'input_bam': stage_output(mapped_stage_id, 'mapped_reads'),
                    'paired_end': stage_output(mapped_stage_id, 'paired_end'),
                })
            superstage['filter_qc_stage_id'] = filter_qc_stage_id

            xcor_stage_id = workflow.add_stage(
                xcor_applet,
                name='Xcor %s' % superstage_name,
                folder=mapping_output_folder,
                stage_input={
                    'input_bam': stage_output(
                        filter_qc_stage_id, 'filtered_bam'),
                    'paired_end': stage_output(
                        filter_qc_stage_id, 'paired_end'),
                })
            superstage['xcor_stage_id'] = xcor_stage_id

        # Superstages that were skipped have no xcor stage; their id is None,
        # which is harmless because the peak stages are not built in that case.
        xcor_stage_ids = {
            ss['name']: ss.get('xcor_stage_id') for ss in mapping_superstages}
        exp_rep1_ta = stage_output(xcor_stage_ids['Rep1'], 'tagAlign_file')
        exp_rep1_cc = stage_output(xcor_stage_ids['Rep1'], 'CC_scores_file')
        exp_rep2_ta = stage_output(xcor_stage_ids['Rep2'], 'tagAlign_file')
        exp_rep2_cc = stage_output(xcor_stage_ids['Rep2'], 'CC_scores_file')
        ctl_rep1_ta = stage_output(xcor_stage_ids['Ctl1'], 'tagAlign_file')
        ctl_rep2_ta = stage_output(xcor_stage_ids['Ctl2'], 'tagAlign_file')
        rep1_paired_end = stage_output(xcor_stage_ids['Rep1'], 'paired_end')
        rep2_paired_end = stage_output(xcor_stage_ids['Rep2'], 'paired_end')
    else:
        # Mapping was skipped, so just bring in the inputs from the arguments.
        # NOTE(review): this indexes all four arguments unconditionally, so
        # --nomap requires rep1, rep2, ctl1 and ctl2 to be given -- confirm
        # that a blank --nomap workflow is not meant to be supported.
        exp_rep1_ta = dxpy.dxlink(resolve_file(args.rep1[0]).get_id())
        exp_rep2_ta = dxpy.dxlink(resolve_file(args.rep2[0]).get_id())
        ctl_rep1_ta = dxpy.dxlink(resolve_file(args.ctl1[0]).get_id())
        ctl_rep2_ta = dxpy.dxlink(resolve_file(args.ctl2[0]).get_id())
        rep1_paired_end = args.rep1pe
        rep2_paired_end = args.rep2pe

        # Only tagAligns were supplied, so the CC scores files still have to
        # be calculated; had we mapped above we would already have them.
        xcor_only_applet = find_applet(XCOR_ONLY_APPLET_NAME)
        xcor_output_folder = resolve_folder(
            output_project, output_folder + '/' + xcor_only_applet.name)
        xcor_only_stages = []
        cc_links = []
        replicate_inputs = (
            (exp_rep1_ta, rep1_paired_end),
            (exp_rep2_ta, rep2_paired_end),
        )
        for rep_number, (tag_align, paired_end) in enumerate(
                replicate_inputs, start=1):
            cc_stage_id = workflow.add_stage(
                xcor_only_applet,
                name="Rep%d cross-correlation" % rep_number,
                folder=xcor_output_folder,
                stage_input={
                    'input_tagAlign': tag_align,
                    'paired_end': paired_end,
                })
            xcor_only_stages.append(
                {'xcor_only_rep%d_id' % rep_number: cc_stage_id})
            cc_links.append(stage_output(cc_stage_id, 'CC_scores_file'))
        exp_rep1_cc, exp_rep2_cc = cc_links

    encode_macs2_applet = find_applet(ENCODE_MACS2_APPLET_NAME)
    encode_macs2_stages = []
    # Initialised unconditionally so the debug logging at the end cannot hit
    # an unbound name when the peak stages are not built.
    overlap_peaks_stages = []
    peaks_output_folder = resolve_folder(
        output_project, output_folder + '/' + encode_macs2_applet.name)
    if build_peak_stages:
        encode_macs2_stage_id = workflow.add_stage(
            encode_macs2_applet,
            name='ENCODE Peaks',
            folder=peaks_output_folder,
            stage_input={
                'rep1_ta': exp_rep1_ta,
                'rep2_ta': exp_rep2_ta,
                'ctl1_ta': ctl_rep1_ta,
                'ctl2_ta': ctl_rep2_ta,
                'rep1_xcor': exp_rep1_cc,
                'rep2_xcor': exp_rep2_cc,
                'rep1_paired_end': rep1_paired_end,
                'rep2_paired_end': rep2_paired_end,
                'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                'narrowpeak_as': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
                'gappedpeak_as': dxpy.dxlink(resolve_file(args.gappedpeak_as)),
                'broadpeak_as': dxpy.dxlink(resolve_file(args.broadpeak_as)),
                'genomesize': args.genomesize,
            })
        encode_macs2_stages.append(
            {'name': 'ENCODE Peaks', 'stage_id': encode_macs2_stage_id})

        # Naive (overlap-based) peak processing, one stage per peak type.
        overlap_peaks_applet = find_applet(OVERLAP_PEAKS_APPLET_NAME)
        peak_types = (
            # (output-field suffix, .as file argument, peak_type input)
            ('narrowpeaks', args.narrowpeak_as, 'narrowPeak'),
            ('gappedpeaks', args.gappedpeak_as, 'gappedPeak'),
            ('broadpeaks', args.broadpeak_as, 'broadPeak'),
        )
        for peaktype, as_file_arg, peak_type_extension in peak_types:
            as_file = dxpy.dxlink(resolve_file(as_file_arg))
            overlap_stage_name = 'Overlap %s' % peaktype
            overlap_peaks_stage_id = workflow.add_stage(
                overlap_peaks_applet,
                name=overlap_stage_name,
                folder=peaks_output_folder,
                stage_input={
                    'rep1_peaks': stage_output(
                        encode_macs2_stage_id, 'rep1_%s' % peaktype),
                    'rep2_peaks': stage_output(
                        encode_macs2_stage_id, 'rep2_%s' % peaktype),
                    'pooled_peaks': stage_output(
                        encode_macs2_stage_id, 'pooled_%s' % peaktype),
                    'pooledpr1_peaks': stage_output(
                        encode_macs2_stage_id, 'pooledpr1_%s' % peaktype),
                    'pooledpr2_peaks': stage_output(
                        encode_macs2_stage_id, 'pooledpr2_%s' % peaktype),
                    'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                    'as_file': as_file,
                    'peak_type': peak_type_extension,
                })
            overlap_peaks_stages.append(
                {'name': overlap_stage_name,
                 'stage_id': overlap_peaks_stage_id})

    # TODO - IDR on gapped and broad peaks
    if args.idr:
        # The applet must be looked up before its name is used to build the
        # output folder path.
        idr_applet = find_applet(IDR_APPLET_NAME)
        encode_idr_applet = find_applet(ENCODE_IDR_APPLET_NAME)
        idr_stages = []
        idr_output_folder = resolve_folder(
            output_project, output_folder + '/' + idr_applet.name)
        if build_peak_stages:
            idr_comparisons = (
                # (stage name, rep1_peaks, rep2_peaks, pooled_peaks), where
                # the last three are output fields of the ENCODE Peaks stage.
                ('IDR True Replicates',
                 'rep1_narrowpeaks', 'rep2_narrowpeaks',
                 'pooled_narrowpeaks'),
                ('IDR Rep 1 Self-pseudoreplicates',
                 'rep1pr1_narrowpeaks', 'rep1pr2_narrowpeaks',
                 'rep1_narrowpeaks'),
                ('IDR Rep 2 Self-pseudoreplicates',
                 'rep2pr1_narrowpeaks', 'rep2pr2_narrowpeaks',
                 'rep2_narrowpeaks'),
                # The stage name now matches the name recorded in idr_stages
                # (it used to be misspelled "Pseudoeplicates").
                ('IDR Pooled Pseudoreplicates',
                 'pooledpr1_narrowpeaks', 'pooledpr2_narrowpeaks',
                 'pooled_narrowpeaks'),
            )
            for stage_name, rep1_field, rep2_field, pooled_field in (
                    idr_comparisons):
                idr_stage_id = workflow.add_stage(
                    idr_applet,
                    name=stage_name,
                    folder=idr_output_folder,
                    stage_input={
                        'rep1_peaks': stage_output(
                            encode_macs2_stage_id, rep1_field),
                        'rep2_peaks': stage_output(
                            encode_macs2_stage_id, rep2_field),
                        'pooled_peaks': stage_output(
                            encode_macs2_stage_id, pooled_field),
                        'idr_version': int(args.idrversion),
                    })
                idr_stages.append(
                    {'name': stage_name, 'stage_id': idr_stage_id})

            idr_stage_ids = {ss['name']: ss['stage_id'] for ss in idr_stages}
            final_idr_stage_input = {
                'reps_peaks': stage_output(
                    idr_stage_ids['IDR True Replicates'], 'IDR_peaks'),
                'r1pr_peaks': stage_output(
                    idr_stage_ids['IDR Rep 1 Self-pseudoreplicates'],
                    'IDR_peaks'),
                'r2pr_peaks': stage_output(
                    idr_stage_ids['IDR Rep 2 Self-pseudoreplicates'],
                    'IDR_peaks'),
                'pooledpr_peaks': stage_output(
                    idr_stage_ids['IDR Pooled Pseudoreplicates'],
                    'IDR_peaks'),
                'chrom_sizes': dxpy.dxlink(resolve_file(args.chrom_sizes)),
                'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
            }
            # The blacklist is optional; only wire it in when one was given.
            if args.blacklist:
                final_idr_stage_input['blacklist'] = dxpy.dxlink(
                    resolve_file(args.blacklist))
            idr_stage_id = workflow.add_stage(
                encode_idr_applet,
                name='Final IDR peak calls',
                folder=idr_output_folder,
                stage_input=final_idr_stage_input)
            idr_stages.append(
                {'name': 'Final IDR peak calls', 'stage_id': idr_stage_id})

    if not args.nomap:
        logging.debug("Mapping stages: %s", mapping_superstages)
    else:
        logging.debug("xcor only stages: %s", xcor_only_stages)
    logging.debug("Peaks for ENCODE stages: %s", encode_macs2_stages)
    logging.debug("Peak overlap stages: %s", overlap_peaks_stages)
    if args.idr:
        logging.debug("IDR stages: %s", idr_stages)

    if args.yes:
        job_id = workflow.run({}, delay_workspace_destruction=True)
        logging.info("Running as job %s", job_id)
Beispiel #29
0
def build_workflow():
    """Create the ``tcga_mc3_full_run`` workflow and return its handler.

    Stages are appended in this order:

    1. variant callers: pindel, radia, somaticsniper, samtools pileup
       (normal, then tumor), muse, varscan, mutect;
    2. three ``fpfilter-tool`` stages (somaticSniper, varscan SNP,
       varscan INDEL);
    3. seven ``tcga-vcf-filter-tool`` stages, one per call set;
    4. seven ``tcga-vcf-reheader`` stages, one per call set.

    The pindel stage is added with no bound inputs.  Every other stage that
    takes BAM/BAI/reference files links to the pindel stage's *input* fields
    instead of to concrete files.

    Uses the module-level ``args`` (``project`` and ``folder``),
    ``git_revision`` and ``find_applet``.

    Returns:
        The workflow handler produced by ``dxpy.new_dxworkflow``.
    """
    wf = dxpy.new_dxworkflow(
        title='tcga_mc3_full_run',
        name='tcga_mc3_full_run',
        description='TCGA mc3 variant calling pipeline',
        project=args.project,
        folder=args.folder,
        properties={"git_revision": git_revision})

    # --- variant calling tools ------------------------------------------
    pindel_stage_id = wf.add_stage(find_applet("pindel-tool"))

    def pindel_input(field):
        # Link to one of the pindel stage's own input fields.
        return dxpy.dxlink({"stage": pindel_stage_id, "inputField": field})

    def stage_output(stage_id, field):
        # Link to an output field of a previously added stage.
        return dxpy.dxlink({"stage": stage_id, "outputField": field})

    def tumor_normal_inputs():
        # BAM/BAI pairs plus reference, shared by the muse and mutect stages.
        return {
            "tumor_bam": pindel_input("tumorInputBamFile"),
            "tumor_bai": pindel_input("tumorInputBaiFile"),
            "normal_bam": pindel_input("normalInputBamFile"),
            "normal_bai": pindel_input("normalInputBaiFile"),
            "reference": pindel_input("inputReferenceFile"),
        }

    radia_applet = find_applet("radia-tool")
    radia_stage_id = wf.add_stage(
        radia_applet,
        stage_input={
            "dnaNormalBam": pindel_input("normalInputBamFile"),
            "dnaTumorBam": pindel_input("tumorInputBamFile"),
            "fasta": pindel_input("inputReferenceFile"),
        })

    somaticsniper_applet = find_applet("somaticsniper-tool")
    somaticsniper_stage_id = wf.add_stage(
        somaticsniper_applet,
        stage_input={
            "normal": pindel_input("normalInputBamFile"),
            "tumor": pindel_input("tumorInputBamFile"),
            "reference": pindel_input("inputReferenceFile"),
        },
        instance_type="mem2_hdd2_x1")

    pileup_applet = find_applet("samtools-pileup-tool")

    def add_pileup_stage(bam_field, bai_field, instance):
        # One samtools pileup stage over a single BAM and its index.
        return wf.add_stage(
            pileup_applet,
            stage_input={
                "input1": pindel_input(bam_field),
                "input1_index": pindel_input(bai_field),
                "reference": pindel_input("inputReferenceFile"),
            },
            instance_type=instance)

    normal_pileup_stage_id = add_pileup_stage(
        "normalInputBamFile", "normalInputBaiFile", "mem2_hdd2_x1")
    tumor_pileup_stage_id = add_pileup_stage(
        "tumorInputBamFile", "tumorInputBaiFile", "mem2_hdd2_x2")

    muse_applet = find_applet("muse-tool")
    muse_inputs = tumor_normal_inputs()
    muse_inputs["dbsnp"] = dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj")
    muse_stage_id = wf.add_stage(muse_applet, stage_input=muse_inputs)

    varscan_applet = find_applet("varscan-tool")
    varscan_stage_id = wf.add_stage(
        varscan_applet,
        stage_input={
            "normal_pileup": stage_output(normal_pileup_stage_id, "pileup"),
            "tumor_pileup": stage_output(tumor_pileup_stage_id, "pileup"),
        },
        instance_type="mem2_hdd2_x2")

    mutect_applet = find_applet("mutect-tool")
    mutect_inputs = tumor_normal_inputs()
    mutect_inputs["dbsnp"] = dxpy.dxlink("file-Bj1V0400kF9Z3GqJY4ZbYbYj")
    mutect_inputs["cosmic"] = dxpy.dxlink("file-Bk9g2kQ0kF9f9XG6VZf7VGKQ")
    mutect_stage_id = wf.add_stage(mutect_applet, stage_input=mutect_inputs)

    # --- fpfilter (somaticSniper, Varscan) ------------------------------
    fpfilter_applet = find_applet("fpfilter-tool")
    # (stage label, stage producing the VCF, output field holding the VCF)
    fpfilter_plan = (
        ("somaticSniper", somaticsniper_stage_id, "vcf"),
        ("varscan SNP", varscan_stage_id, "snp_vcf"),
        ("varscan INDEL", varscan_stage_id, "indel_vcf"),
    )
    fpfilter_stage_ids = {}
    for label, vcf_stage_id, vcf_field in fpfilter_plan:
        fpfilter_stage_ids[label] = wf.add_stage(
            fpfilter_applet,
            stage_input={
                "vcf": stage_output(vcf_stage_id, vcf_field),
                "bam": pindel_input("tumorInputBamFile"),
                "reference": pindel_input("inputReferenceFile"),
            },
            name="fpfilter-tool(%s)" % label,
            folder="fpfiltered")

    # --- vcf_filter (all variant callers) -------------------------------
    vcf_filter_applet = find_applet("tcga-vcf-filter-tool")
    # (call set label, upstream stage, upstream output field, filterRejects)
    filter_plan = (
        ("radia", radia_stage_id, "filtered_output_vcf", False),
        ("somaticsniper", fpfilter_stage_ids["somaticSniper"],
         "annotated_output", False),
        ("varscan SNP", fpfilter_stage_ids["varscan SNP"],
         "annotated_output", True),
        ("varscan INDEL", fpfilter_stage_ids["varscan INDEL"],
         "annotated_output", True),
        ("muse", muse_stage_id, "mutations", False),
        ("pindel", pindel_stage_id, "outputSomaticVcf", False),
        ("mutect", mutect_stage_id, "mutations", True),
    )
    filter_stage_ids = {}
    for label, upstream_id, upstream_field, reject in filter_plan:
        filter_stage_ids[label] = wf.add_stage(
            vcf_filter_applet,
            stage_input={
                "input_vcf": stage_output(upstream_id, upstream_field),
                "filterRejects": reject,
            },
            name="vcffilter-tool(%s)" % label,
            folder="final_filtered")

    # --- vcf reheader (all variant callers) -----------------------------
    # Only the fields listed in reheader_plan are bound.  A disabled variant
    # of this code also linked per-sample metadata inputs (platform,
    # participant_uuid, disease_code, normal/tumor analysis and aliquot
    # identifiers, BAM names) of each reheader stage to the corresponding
    # inputs of the radia reheader stage; that linkage is not active.
    vcf_reheader_applet = find_applet("tcga-vcf-reheader")
    varscan_params = "--output-vcf 1 --min-coverage 3 --normal-purity 1 --p-value 0.99 --min-coverage-normal 8 --min-freq-for-hom 0.75 --min-var-freq 0.08 --somatic-p-value 0.05 --min-coverage-tumor 6 --tumor-purity 1"
    # (call set label, software_name, software_version, software_params, center)
    reheader_plan = (
        ("radia", "radia", "1",
         "--dnaNormalMinTotalBases 4 --dnaNormalMinAltBases 2 --dnaNormalBaseQual 10 --dnaNormalMapQual 10 --dnaTumorDescription TumorDNASample --dnaTumorMinTotalBases 4 --dnaTumorMinAltBases 2 --dnaTumorBaseQual 10 --dnaTumorMapQual 10 --dnaNormalMitochon=MT --dnaTumorMitochon=MT --genotypeMinDepth 2 --genotypeMinPct 0.100",
         "ucsc.edu"),
        ("somaticsniper", "somaticsniper", "v1.0.5.0",
         "-Q 40 -n NORMAL -q 1 -s 0.01 -r 0.001",
         "wustl.edu"),
        ("varscan SNP", "varscan", "2.3.9", varscan_params, "wustl.edu"),
        ("varscan INDEL", "varscan", "2.3.9", varscan_params, "wustl.edu"),
        ("muse", "muse", "v1.0rc", "--mode wxs", "mdanderson.org"),
        ("pindel", "pindel", "v0.2.5b8",
         "--max_range_index 1 --window_size 5 --sequencing_error_rate 0.010000 --sensitivity 0.950000 --maximum_allowed_mismatch_rate 0.020000 --NM 2 --additional_mismatch 1 --min_perfect_match_around_BP 3 --min_inversion_size 50 --min_num_matched_bases 30 --balance_cutoff 0 --anchor_quality 0 --minimum_support_for_event 3 --report_long_insertions --report_duplications --report_inversions --report_breakpoints",
         "wustl.edu"),
        ("mutect", "mutect", "1.1.5",
         "--initial_tumor_lod 4.0 --tumor_lod 10.0",
         "broad.org"),
    )
    for label, software, version, params, center in reheader_plan:
        wf.add_stage(
            vcf_reheader_applet,
            stage_input={
                "input_vcf": stage_output(filter_stage_ids[label], "output_vcf"),
                "software_name": software,
                "software_version": version,
                "software_params": params,
                "center": center,
            },
            name="vcf-reheader(%s)" % label,
            folder="final_reheadered")

    return wf
def build_workflow(experiment, biorep_n, input_shield_stage_input, key):
    """Build the ENCODE mapping workflow for one replicate of an experiment.

    The workflow always contains a "Gather inputs" stage (input shield
    applet) followed by a "Map" stage fed from the shield's ``output_JSON``.
    Unless ``args.raw`` is set, a "Filter and QC" stage and a
    "Calculate cross-correlation" stage are appended after mapping.

    Uses the module-level ``args`` (``outp``, ``applets``, ``outf``, ``raw``,
    ``assembly``, ``tag``) and the ``*_APPLET_NAME`` constants.

    Args:
        experiment: object whose ``get('accession')`` value is used in
            folder paths, the workflow title and stage names.
        biorep_n: replicate number, rendered with ``%d``.
        input_shield_stage_input: stage input passed to the input shield
            stage unchanged.
        key: not used by this function.

    Returns:
        The workflow handler produced by ``dxpy.new_dxworkflow``.
    """
    output_project = resolve_project(args.outp, 'w')
    logging.debug('Found output project %s' % (output_project.name))

    applet_project = resolve_project(args.applets, 'r')
    logging.debug('Found applet project %s' % (applet_project.name))

    def load_applet(applet_name):
        # Look the applet up in the applet project and log what was found.
        applet = find_applet_by_name(applet_name, applet_project.get_id())
        logging.debug('Found applet %s' % (applet.name))
        return applet

    def replicate_folder(category):
        # <outf><category><accession>/rep<N> inside the output project.
        return resolve_folder(
            output_project,
            args.outf + category + experiment.get('accession') + '/' + 'rep%d' % (biorep_n))

    mapping_applet = load_applet(MAPPING_APPLET_NAME)
    input_shield_applet = load_applet(INPUT_SHIELD_APPLET_NAME)

    workflow_output_folder = replicate_folder('/workflows/')
    fastq_output_folder = replicate_folder('/fastqs/')
    mapping_output_folder = replicate_folder('/raw_bams/')

    if args.raw:
        title_template = 'Map %s rep%d to %s (no filter)'
        workflow_name = 'ENCODE raw mapping pipeline'
    else:
        title_template = 'Map %s rep%d to %s and filter'
        workflow_name = 'ENCODE mapping pipeline'
    workflow_title = title_template % (
        experiment.get('accession'), biorep_n, args.assembly)
    if args.tag:
        workflow_title += ': %s' % (args.tag)

    workflow = dxpy.new_dxworkflow(
        title=workflow_title,
        name=workflow_name,
        project=output_project.get_id(),
        folder=workflow_output_folder)

    input_shield_stage_id = workflow.add_stage(
        input_shield_applet,
        name='Gather inputs %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=fastq_output_folder,
        stage_input=input_shield_stage_input)

    shield_json = dxpy.dxlink(
        {'stage': input_shield_stage_id, 'outputField': 'output_JSON'})
    mapping_stage_id = workflow.add_stage(
        mapping_applet,
        name='Map %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=mapping_output_folder,
        stage_input={'input_JSON': shield_json})

    # Accessioning of the mapped reads is not a stage of this workflow; per
    # the original author's note it should all be done in the shield's
    # postprocess entrypoint.
    if args.raw:
        # Raw mode stops after mapping: no filtering, no cross-correlation.
        return workflow

    final_output_folder = replicate_folder('/bams/')

    filter_qc_applet = load_applet(FILTER_QC_APPLET_NAME)
    filter_qc_stage_id = workflow.add_stage(
        filter_qc_applet,
        name='Filter and QC %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=final_output_folder,
        stage_input={
            'input_bam': dxpy.dxlink(
                {'stage': mapping_stage_id, 'outputField': 'mapped_reads'}),
            'paired_end': dxpy.dxlink(
                {'stage': mapping_stage_id, 'outputField': 'paired_end'}),
        })

    xcor_applet = load_applet(XCOR_APPLET_NAME)
    workflow.add_stage(
        xcor_applet,
        name='Calculate cross-correlation %s rep%d' % (experiment.get('accession'), biorep_n),
        folder=final_output_folder,
        stage_input={
            'input_bam': dxpy.dxlink(
                {'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}),
            'paired_end': dxpy.dxlink(
                {'stage': filter_qc_stage_id, 'outputField': 'paired_end'}),
        })

    return workflow
Beispiel #31
0
def build_workflow():
    if parameters["folder_provided"] == "false":
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow',
            output_folder=parameters["Output"],
        )
    else:
        wf = dxpy.new_dxworkflow(
            name='WARDEN_workflow',
            description='RNA-SEQ Workflow',
        )
    wf_outputs = []

    htseq_applet = dxpy.search.find_one_data_object(classname="applet",
                                                    name=app_names["htseq"],
                                                    state="closed",
                                                    return_handler=True)
    genome_cov_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["genome_coverage"],
        state="closed",
        return_handler=True)
    bigwig_applet = dxpy.search.find_one_data_object(classname="applet",
                                                     name=app_names["bigwig"],
                                                     state="closed",
                                                     return_handler=True)
    combine_counts_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["combine_counts"],
        state="closed",
        return_handler=True)
    limma_applet = dxpy.search.find_one_data_object(classname="applet",
                                                    name=app_names["limma"],
                                                    state="closed",
                                                    return_handler=True)
    simple_DE_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["simple_DE"],
        state="closed",
        return_handler=True)
    bw_viewer_applet = dxpy.search.find_one_data_object(
        classname="applet",
        name=app_names["bw_viewer"],
        state="closed",
        return_handler=True)

    sample_num = 0
    htseq_results = []
    bigwig_files = []
    index_project, index_id = parameters["index_file"].split(":")
    gtf_project, gtf_id = parameters["gtf_file"].split(":")
    genome_length_project, genome_length_id = parameters[
        "genome_sizes_file"].split(":")
    gene_length_project, gene_length_id = parameters["gene_length_file"].split(
        ":")
    fpkm_results = []
    fpkm_log2_results = []

    for sample_name in samples:
        bam_id = samples[sample_name]
        bam_link = dxpy.dxlink(bam_id)
        htseq_input = {"input_bam": bam_link}
        if parameters["sort_order"] == "position":
            htseq_input["order"] = "pos"
        else:
            htseq_input["order"] = "name"
        htseq_input["annotation_file"] = dxpy.dxlink({
            "project": gtf_project,
            "id": gtf_id
        })
        htseq_input["gene_length_file"] = dxpy.dxlink({
            "project": gene_length_project,
            "id": gene_length_id
        })
        htseq_input["prefix"] = sample_name
        htseq_input["strand"] = parameters["strandedness"]
        htseq_input["feature_type"] = parameters["feature_type"]
        htseq_input["id_attribute"] = parameters["id_attribute"]
        htseq_input["mode"] = parameters["mode"]
        htseq_input["nonunique"] = parameters["nonunique"]
        htseq_input["secondary_alignments"] = parameters[
            "secondary_alignments"]
        htseq_input["supplementary_alignments"] = parameters[
            "supplementary_alignments"]
        htseq_stage_id = wf.add_stage(
            htseq_applet,
            stage_input=htseq_input,
            instance_type=parameters["htseq_instance"],
            name=sample_name + ":HTSEQ COUNT",
            folder="HTSEQ")
        htseq_results.append(
            dxpy.dxlink({
                "stage": htseq_stage_id,
                "outputField": "htseq_counts"
            }))
        wf_outputs += [
            {
                "name": sample_name + "_htseqcounts",
                "class": "file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": htseq_stage_id,
                        "outputField": "htseq_counts"
                    }
                }
            },
        ]
        if parameters["id_attribute"] == "gene_name":
            fpkm_results.append((dxpy.dxlink({
                "stage": htseq_stage_id,
                "outputField": "fpkm"
            })))
            fpkm_log2_results.append((dxpy.dxlink({
                "stage": htseq_stage_id,
                "outputField": "fpkm_log2"
            })))
            wf_outputs += [
                {
                    "name": sample_name + "_fpkm",
                    "class": "file",
                    "outputSource": {
                        "$dnanexus_link": {
                            "stage": htseq_stage_id,
                            "outputField": "fpkm"
                        }
                    }
                },
                {
                    "name": sample_name + "_fpkm_log2",
                    "class": "file",
                    "outputSource": {
                        "$dnanexus_link": {
                            "stage": htseq_stage_id,
                            "outputField": "fpkm_log2"
                        }
                    }
                },
            ]

        if parameters["run_coverage"] == 'true':
            gcb_input = {}
            gcb_input["input_bam"] = bam_link
            if parameters["sort_order"] == "name":
                gcb_input["sorted"] = False
            else:
                gcb_input["sorted"] = True
            gcb_input["genome_sizes_file"] = dxpy.dxlink({
                "project": genome_length_project,
                "id": genome_length_id
            })
            gcb_input["strandedness"] = parameters["strandedness"]
            gcb_input["output_prefix"] = sample_name
            gcb_stage_id = wf.add_stage(genome_cov_applet,
                                        stage_input=gcb_input,
                                        instance_type="azure:mem3_ssd1_x8",
                                        name=sample_name + ":COVERAGE",
                                        folder="COVERAGE")

            bg2bw_all_input = {}
            bg2bw_all_input["bedgraph_file"] = dxpy.dxlink({
                "stage":
                gcb_stage_id,
                "outputField":
                "all_coverage_file"
            })
            bg2bw_all_input["genome_sizes_file"] = dxpy.dxlink({
                "project":
                genome_length_project,
                "id":
                genome_length_id
            })
            bg2bw_all_input["output_prefix"] = sample_name
            bg2bw_all_stage_id = wf.add_stage(
                bigwig_applet,
                stage_input=bg2bw_all_input,
                instance_type="azure:mem2_ssd1_x4",
                name=sample_name + ":BED To BW-ALL",
                folder="BIGWIG")
            bigwig_files.append(
                dxpy.dxlink({
                    "stage": bg2bw_all_stage_id,
                    "outputField": "bigwig"
                }))
            wf_outputs += [
                {
                    "name": sample_name + "_all_bigwig",
                    "class": "file",
                    "outputSource": {
                        "$dnanexus_link": {
                            "stage": bg2bw_all_stage_id,
                            "outputField": "bigwig"
                        }
                    }
                },
            ]

            if parameters["strandedness"] != "no":
                bg2bw_pos_input = {}
                bg2bw_pos_input["bedgraph_file"] = dxpy.dxlink({
                    "stage":
                    gcb_stage_id,
                    "outputField":
                    "pos_coverage_file"
                })
                bg2bw_pos_input["genome_sizes_file"] = dxpy.dxlink({
                    "project":
                    genome_length_project,
                    "id":
                    genome_length_id
                })
                bg2bw_pos_input["output_prefix"] = sample_name
                bg2bw_pos_stage_id = wf.add_stage(
                    bigwig_applet,
                    stage_input=bg2bw_pos_input,
                    instance_type="azure:mem2_ssd1_x4",
                    name=sample_name + ":BED To BW-POS",
                    folder="BIGWIG")
                wf_outputs += [
                    {
                        "name": sample_name + "_pos_bigwig",
                        "class": "file",
                        "outputSource": {
                            "$dnanexus_link": {
                                "stage": bg2bw_pos_stage_id,
                                "outputField": "bigwig"
                            }
                        }
                    },
                ]

                bg2bw_neg_input = {}
                bg2bw_neg_input["bedgraph_file"] = dxpy.dxlink({
                    "stage":
                    gcb_stage_id,
                    "outputField":
                    "neg_coverage_file"
                })
                bg2bw_neg_input["genome_sizes_file"] = dxpy.dxlink({
                    "project":
                    genome_length_project,
                    "id":
                    genome_length_id
                })
                bg2bw_neg_input["output_prefix"] = sample_name
                bg2bw_neg_stage_id = wf.add_stage(
                    bigwig_applet,
                    stage_input=bg2bw_neg_input,
                    instance_type="azure:mem2_ssd1_x4",
                    name=sample_name + ":BED To BW-NEG",
                    folder="BIGWIG")
                wf_outputs += [
                    {
                        "name": sample_name + "_neg_bigwig",
                        "class": "file",
                        "outputSource": {
                            "$dnanexus_link": {
                                "stage": bg2bw_neg_stage_id,
                                "outputField": "bigwig"
                            }
                        }
                    },
                ]

                bigwig_files.append(
                    dxpy.dxlink({
                        "stage": bg2bw_pos_stage_id,
                        "outputField": "bigwig"
                    }))
                bigwig_files.append(
                    dxpy.dxlink({
                        "stage": bg2bw_neg_stage_id,
                        "outputField": "bigwig"
                    }))
        sample_num += 1

    combine_input = {
        "count_files": htseq_results,
        "name_value": "htseq",
        "sample_files": [dxpy.dxlink(final_sample_list_id)]
    }
    combine_counts_stage_id = wf.add_stage(combine_counts_applet,
                                           stage_input=combine_input,
                                           instance_type="azure:mem2_ssd1_x1",
                                           name="COMBINE HTSEQ")
    wf_outputs += [
        {
            "name": "combined_counts",
            "class": "file",
            "outputSource": {
                "$dnanexus_link": {
                    "stage": combine_counts_stage_id,
                    "outputField": "count_file"
                }
            }
        },
    ]
    if parameters["id_attribute"] == "gene_name":
        combine_fpkm_input = {
            "count_files": fpkm_results,
            "name_value": "fpkm",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_stage_id = wf.add_stage(
            combine_counts_applet,
            stage_input=combine_fpkm_input,
            instance_type="azure:mem2_ssd1_x1",
            name="COMBINE FPKM")
        combine_fpkm_log2_input = {
            "count_files": fpkm_log2_results,
            "name_value": "fpkm.log2",
            "sample_files": [dxpy.dxlink(final_sample_list_id)]
        }
        combine_fpkm_log2_stage_id = wf.add_stage(
            combine_counts_applet,
            stage_input=combine_fpkm_log2_input,
            instance_type="azure:mem2_ssd1_x1",
            name="COMBINE FPKMlog2")
        wf_outputs += [
            {
                "name": "combined_fpkm",
                "class": "file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": combine_fpkm_stage_id,
                        "outputField": "count_file"
                    }
                }
            },
            {
                "name": "combined_fpkm_log2",
                "class": "file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": combine_fpkm_log2_stage_id,
                        "outputField": "count_file"
                    }
                }
            },
        ]

    if parameters["BW_VIEWER"] != "None" and parameters[
            "run_coverage"] == 'true':
        bw_project, bw_file = parameters["BW_VIEWER"].split(":")
        viewer_link = dxpy.dxlink({"project": bw_project, "id": bw_file})
        bw_viewer_input = {"viewer": viewer_link, "bigwig_files": bigwig_files}
        bw_viewer_stage_id = wf.add_stage(bw_viewer_applet,
                                          stage_input=bw_viewer_input,
                                          instance_type="azure:mem2_ssd1_x1",
                                          name="BIGWIG_VIEWER",
                                          folder="BIGWIG")
        wf_outputs += [
            {
                "name": "bw_viewer",
                "class": "record",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": bw_viewer_stage_id,
                        "outputField": "viewer_bookmark"
                    }
                }
            },
        ]

    if parameters["limma_DE_viewer"] != "None":
        limma_viewer_project, limma_viewer_file = parameters[
            "limma_DE_viewer"].split(":")
        limma_viewer_link = dxpy.dxlink({
            "project": limma_viewer_project,
            "id": limma_viewer_file
        })

    if parameters["run_limma"] == 'true' and parameters[
            "limma_runnable"] == "true":
        limma_input = {
            "input_count_file":
            dxpy.dxlink({
                "stage": combine_counts_stage_id,
                "outputField": "count_file"
            }),
            "sample_list_file":
            dxpy.dxlink(final_sample_list_id),
            "calcNormFactors_method":
            parameters["calcNormFactors_method"],
            "filter_count_type":
            parameters["filter_count_type"],
            "filter_count":
            int(parameters["filter_count"]),
            "p_value_adjust":
            parameters["p_value_adjust"],
            "contrasts_file":
            dxpy.dxlink(comparisons_limma_id)
        }
        if parameters["limma_DE_viewer"] != "None":
            limma_input["difex_viewer"] = limma_viewer_link
        limma_stage_id = wf.add_stage(limma_applet,
                                      stage_input=limma_input,
                                      instance_type="azure:mem1_ssd1_x4",
                                      name="LIMMA")
        wf_outputs += [
            {
                "name": "limma_outfiles",
                "class": "array:file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": limma_stage_id,
                        "outputField": "out_files"
                    }
                }
            },
            {
                "name": "limma_viewer",
                "class": "record",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": limma_stage_id,
                        "outputField": "viewer_bookmark"
                    }
                }
            },
        ]
    if parameters["run_simple_dif_ex"] == 'true':
        simple_DE_input = {
            "input_count_file":
            dxpy.dxlink({
                "stage": combine_counts_stage_id,
                "outputField": "count_file"
            }),
            "sample_list_file":
            dxpy.dxlink(final_sample_list_id),
            "contrasts_file":
            dxpy.dxlink(comparisons_all_id),
            "difex_viewer":
            limma_viewer_link
        }
        if parameters["limma_DE_viewer"] != "None":
            simple_DE_input["difex_viewer"] = limma_viewer_link
        simple_DE_stage_id = wf.add_stage(
            simple_DE_applet,
            stage_input=simple_DE_input,
            instance_type="azure:mem1_ssd1_x4",
            name="SIMPLE DIFFERENTIAL_EXPRESSION")
        wf_outputs += [
            {
                "name": "simple_DE_outfiles",
                "class": "array:file",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": simple_DE_stage_id,
                        "outputField": "out_files"
                    }
                }
            },
            {
                "name": "simple_DE_viewer",
                "class": "record",
                "outputSource": {
                    "$dnanexus_link": {
                        "stage": simple_DE_stage_id,
                        "outputField": "viewer_bookmark"
                    }
                }
            },
        ]

    wf.update(workflow_outputs=wf_outputs)
    wf.close()
    return wf.get_id()
 def test_workflow_completion(self):
     """Check `dx run` tab completion for a workflow before and after hiding it."""
     # A freshly created workflow should be completed by name (with the
     # trailing space included in the expected completion).
     workflow = dxpy.new_dxworkflow(name="my workflow")
     self.assert_completion("dx run my", "my workflow ")
     # After hide() the same prefix is expected to yield no completions.
     workflow.hide()
     self.assert_no_completions("dx run my")
Beispiel #33
0
def createWorkflow(stepsToDo,
                   priors,
                   extras,
                   resultsFolder,
                   projectId,
                   appProjectId=None):
    '''Build a workflow with one stage per entry of stepsToDo, then run it.

    Each step is looked up in the module-level STEPS table, which supplies
    the applet name ('app'), the file inputs ('inputs': file token -> applet
    input name), optional non-file parameters ('params': extras key ->
    applet input name) and the produced results ('results', keyed by file
    token).  File inputs are wired to the output of an earlier stage when
    one produced that token, otherwise to the file(s) given in priors.

    Args:
        stepsToDo: sequence of step names (keys of STEPS), in run order.
        priors: dict mapping file token to a file id, or to a list of file
            ids, as accepted by dxencode.get_file_link.
        extras: dict holding at least 'name' and 'description' for the
            workflow, plus every key listed in a step's 'params'.
        resultsFolder: folder used for the workflow and for every stage.
        projectId: project in which the workflow is created.
        appProjectId: project searched for the applets; defaults to
            projectId when None.

    Returns:
        None when stepsToDo is empty (or None); otherwise the result of
        describe() on the object returned by running the workflow.

    Exits the process with status 1 (after printing an error) when a step's
    file input is found neither in earlier results nor in priors, or when a
    step parameter is missing from extras.
    '''

    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    wf = dxpy.new_dxworkflow(title=extras['name'],
                             name=extras['name'],
                             folder=resultsFolder,
                             project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        stepSpec = STEPS[step]
        app = dxencode.find_applet_by_name(stepSpec['app'], appProjectId)
        appInputs = {}

        # file inputs: results of earlier stages take precedence over priors
        for fileToken, appInp in stepSpec['inputs'].items():
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                prior = priors[fileToken]
                if isinstance(prior, list):
                    appInputs[appInp] = [
                        dxencode.get_file_link(fid) for fid in prior
                    ]
                else:
                    appInputs[appInp] = dxencode.get_file_link(prior)
            else:
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3.
                print("ERROR: step '" + step + "' can't find input '" +
                      fileToken + "'!")
                sys.exit(1)

        # Non-file app inputs
        if 'params' in stepSpec:
            for param, appParam in stepSpec['params'].items():
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param +
                          "' in extras.")
                    sys.exit(1)

        # Add wf stage
        stageId = wf.add_stage(app,
                               stage_input=appInputs,
                               folder=resultsFolder)

        # outputs, which later stages will need to link to; the output field
        # name is the file token itself (the key), not the mapped value
        for fileToken in stepSpec['results']:
            prevStepResults[fileToken] = dxpy.dxlink({
                'stage': stageId,
                'outputField': fileToken
            })

    wfRun = wf.run({})
    return wfRun.describe()