def main():
    """Launch the 'fastqc-exp' applet on qualifying mouse ENCODE3 experiments.

    Queries the ENCODE server for experiments matching ASSAY_TERM_ID and, for
    each one, inspects the library of its first replicate.  Experiments are
    skipped when they have no replicates, when the library size_range is not
    '>200', or when the starting quantity is fewer than 20 cells.  Every
    remaining experiment gets one applet run, until ``cmnd.number`` runs have
    been launched.
    """
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all&frame=embedded&replicates.library.biosample.donor.organism.name=mouse&files.file_format=fastq' % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER+query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    launched = 0
    for exp in exps:
        acc = exp['accession']
        if len(exp['replicates']) == 0:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # Only the first replicate's library is used to qualify the experiment.
        library = exp['replicates'][0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))
        launched += 1
        # '>=' rather than '>': the counter was already incremented for this
        # launch, so '>' would start one job more than requested.
        if launched >= cmnd.number:
            break
# Example #2
# 0
def main():
    """Run ./launchDnaMe.py once per replicate of matching mouse experiments.

    Experiments are fetched from the ENCODE server with a search on
    ASSAY_TERM_ID.  For every replicate a shell command is assembled and
    printed; it is executed unless ``cmnd.test`` is set.  The loop stops at
    the first experiment reached after ``cmnd.numberjobs`` commands have been
    counted.
    """
    cmnd = get_args()

    AUTHID, AUTHPW, SERVER = dxencode.processkey('www')

    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
             '&limit=all&files.file_format=fastq&frame=embedded'
             '&replicates.library.biosample.donor.organism.name=mouse') % ASSAY_TERM_ID
    res = requests.get(
        SERVER+query,
        headers=HEADERS,
        auth=(AUTHID, AUTHPW),
        allow_redirects=True,
        stream=True,
    )

    # The flag is passed to the launcher and also ends up in the log file name.
    lambdaqc = '--maplambda' if cmnd.maplambda else ''
    pid = os.getpid()
    template = "./launchDnaMe.py %s --gzip -e %s --br %s --tr %s > runs/launch%s-%s-%s.%s%s.out"

    n = 0
    for exp in res.json()['@graph']:
        acc = exp['accession']
        # The limit is only checked between experiments, never mid-experiment.
        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break
        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                runcmd = template % (lambdaqc, acc, br, tr, acc, br, tr, pid, lambdaqc)
                print(runcmd)
                if not cmnd.test:
                    os.system(runcmd)
                n += 1
            except KeyError as e:
                # A replicate without replicate numbers is reported and skipped.
                print("%s failed: %s" % (acc, e))
def main():
    """Run ./lrnaLaunch.py once per replicate of qualifying mouse experiments.

    Experiments come from an ENCODE search on ASSAY_TERM_ID.  When
    ``cmnd.only`` is set, every other accession is skipped.  Experiments whose
    first replicate's library has a size_range other than '>200', or starts
    from fewer than 20 cells, are skipped as well.  For each replicate of the
    remaining experiments a launch command is built from the fastq mapping and
    printed; it is executed unless ``cmnd.test`` is set.
    """
    cmnd = get_args()

    AUTHID, AUTHPW, SERVER = dxencode.processkey('www')
    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
             '&limit=all&frame=embedded'
             '&replicates.library.biosample.donor.organism.name=mouse'
             '&files.file_format=fastq') % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER+query, AUTHID=AUTHID, AUTHPW=AUTHPW)

    template = "./lrnaLaunch.py -e %s -r %s -tr %s %s -a M4 --project %s --resultsLoc /runs --run > runs/launch%s-%s-%s-M4.%s.out"

    n = 0
    for exp in res.json()['@graph']:
        acc = exp['accession']
        if cmnd.only and acc != cmnd.only:
            print("skipping %s" % acc)
            continue

        if len(exp['replicates']) > 0:
            # Qualification looks at the first replicate's library only.
            library = exp['replicates'][0]['library']
            size_range = library.get('size_range', "")
            if size_range != '>200':
                print("Skipping %s with wrong library size (%s)" % (acc, size_range))
                continue
            units = library.get('nucleic_acid_starting_quantity_units', "")
            if units == "cells":
                ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                    continue

        # The limit is checked per experiment, before its replicates are launched.
        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break

        exp_mapping = dxencode.choose_mapping_for_experiment(exp)
        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                mapping = exp_mapping[(br, tr)]
                opts = [
                    "-o %s" % GENOME_MAPPING[mapping['organism']],
                    "-l %s" % mapping['library'],
                    "-g %s" % mapping['sex'],
                ]
                if mapping['paired']:
                    # Sort each mate's file name into the list for its read end.
                    by_end = {'1': [], '2': []}
                    for (first, second) in mapping['paired']:
                        for mate in (first, second):
                            by_end[mate['paired_end']].append(mate['accession']+".fastq.gz")
                    opts.append("-1 " + " ".join(by_end['1']))
                    opts.append("-2 " + " ".join(by_end['2']))
                else:
                    single = [f['accession']+".fastq.gz" for f in mapping['unpaired']]
                    opts.append("-1 " + " ".join(single))

                runcmd = template % (acc, br, tr, " ".join(opts), PROJECT_NAME,
                                     acc, br, tr, os.getpid())
                print(runcmd)
                if not cmnd.test:
                    # probably should be subprocess.Popen()
                    os.system(runcmd)
                n += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
def main(accession, key=None, debug=False):
    """Upload the pending ('uploading') original files of an experiment.

    Fetches the experiment ``accession`` from the ENCODE server, then fetches
    each entry of its 'original_files'.  For every file whose status is
    'uploading', the DNAnexus file id is read from the 'dx-id' entry of the
    file's JSON 'notes', the file is downloaded locally under its DNAnexus
    name, and it is passed to dxencode.encoded_upload_existing.

    Args:
        accession: experiment accession to look up.
        key: passed to dxencode.processkey to obtain credentials and server.
        debug: when true the module logger is set to DEBUG, otherwise INFO.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey(key)

    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)
    #get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)

    exp = response.json()

    for file_ref in exp.get('original_files', []):
        # What to name in the error log.  Until the file object has been
        # fetched only the reference string is known; indexing it with
        # ['accession'] would raise inside the except handler.
        label = file_ref
        try:
            fr = dxencode.encoded_get(SERVER + file_ref, AUTHID, AUTHPW)
            ff = fr.json()
            label = ff.get('accession', file_ref)
            if ff['status'] != 'uploading':
                continue
            notes = json.loads(ff['notes'])
            dxid = notes['dx-id']
        except Exception as e:
            # Best effort: report the file that failed and move on to the next.
            logger.error("Error getting dx id: %s for %s" % (e, label))
            continue

        dx_file = dxpy.DXFile(dxid)
        local_file = dx_file.describe()['name']
        dxpy.download_dxfile(dxid, local_file)
        item = dxencode.encoded_upload_existing(local_file, ff['accession'],
                                                SERVER, AUTHID, AUTHPW)
        print(item)
def main(accession, key=None, debug=False):
    """Push files still marked 'uploading' for one experiment to the server.

    The experiment ``accession`` is fetched from the ENCODE server and each
    of its 'original_files' is fetched in turn.  Files whose status is not
    'uploading' are ignored.  For the others, the 'dx-id' stored in the JSON
    'notes' field identifies a DNAnexus file, which is downloaded to the
    working directory and handed to dxencode.encoded_upload_existing.

    Args:
        accession: experiment accession to process.
        key: credential key given to dxencode.processkey.
        debug: selects DEBUG (true) or INFO (false) for the module logger.
    """
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey(key)

    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)
    #get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)

    exp = response.json()

    for file_path in exp.get('original_files', []):
        # Identifier for error messages.  It starts as the raw reference so
        # that a failed fetch or JSON decode can still be reported; the
        # handler must not index the reference string with ['accession'].
        ident = file_path
        try:
            fr = dxencode.encoded_get(SERVER+file_path, AUTHID, AUTHPW)
            ff = fr.json()
            ident = ff.get('accession', file_path)
            if ff['status'] != 'uploading':
                continue
            notes = json.loads(ff['notes'])
            dxid = notes['dx-id']
        except Exception as e:
            # Deliberately broad: one bad file should not stop the others.
            logger.error("Error getting dx id: %s for %s" % (e, ident))
            continue

        dx_file = dxpy.DXFile(dxid)
        local_file = dx_file.describe()['name']
        dxpy.download_dxfile(dxid, local_file)
        item = dxencode.encoded_upload_existing(local_file, ff['accession'], SERVER, AUTHID, AUTHPW)
        print(item)
# Example #6
# 0
def main():
    """Start 'fastqc-exp' applet runs for qualifying mouse ENCODE3 experiments.

    An ENCODE search on ASSAY_TERM_ID supplies the experiments.  An
    experiment is skipped when it has no replicates, when the library of its
    first replicate has a size_range other than '>200', or when that library
    starts from fewer than 20 cells.  Otherwise the applet is run with the
    experiment accession.  Launching stops once ``cmnd.number`` runs have
    been started.
    """
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all&frame=embedded&replicates.library.biosample.donor.organism.name=mouse&files.file_format=fastq' % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    n = 0
    for exp in exps:
        acc = exp['accession']
        if len(exp['replicates']) > 0:
            # The first replicate's library stands in for the whole experiment.
            library = exp['replicates'][0]['library']
            if library.get('size_range', "") != '>200':
                print("Skipping %s with wrong library size (%s)" % (
                    acc, library.get('size_range', "")))
                continue
            units = library.get('nucleic_acid_starting_quantity_units', "")
            if units == "cells":
                ncells = float(library.get(
                    'nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    print("Skipping %s as single-cell (%s %s)" % (
                        acc, units, ncells))
                    continue
            run = applet.run({"accession": acc}, project=pid)
            print("Running: %s for %s" % (run, acc))
            n = n + 1
            # n already counts this launch, so compare with '>=':
            # '>' would allow one launch beyond the requested number.
            if n >= cmnd.number:
                break
        else:
            print("Skipping %s (0 replicates)" % acc)
# Example #7
# 0
def main():
    """Shell out to ./launchDnaMe.py for each replicate found by the search.

    The ENCODE server is searched for mouse experiments matching
    ASSAY_TERM_ID.  A launch command is printed for every replicate and, when
    ``cmnd.test`` is not set, executed with os.system.  No further experiment
    is started once ``cmnd.numberjobs`` commands have been counted.
    """
    cmnd = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')

    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all'
             '&files.file_format=fastq&frame=embedded'
             '&replicates.library.biosample.donor.organism.name=mouse') % ASSAY_TERM_ID
    request_options = {
        'headers': HEADERS,
        'auth': (AUTHID, AUTHPW),
        'allow_redirects': True,
        'stream': True,
    }
    res = requests.get(SERVER + query, **request_options)
    exps = res.json()['@graph']

    pid = os.getpid()
    lambdaqc = ''
    if cmnd.maplambda:
        lambdaqc = '--maplambda'

    count = 0
    for exp in exps:
        acc = exp['accession']
        if count >= cmnd.numberjobs:
            print("Stopping at %s replicates" % count)
            break
        for rep in exp.get('replicates', []):
            try:
                bio = rep['biological_replicate_number']
                tech = rep['technical_replicate_number']
                # Output goes to a per-replicate log named after accession,
                # replicate numbers, this process id and the lambda flag.
                outfile = "runs/launch%s-%s-%s.%s%s.out" % (acc, bio, tech, pid, lambdaqc)
                runcmd = "./launchDnaMe.py %s --gzip -e %s --br %s --tr %s > %s" % (
                    lambdaqc, acc, bio, tech, outfile)
                print(runcmd)
                if not cmnd.test:
                    os.system(runcmd)
                count += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
# Example #8
# 0
def main():
    """Plan and launch the workflow for one experiment replicate.

    In order, this:
      * fetches ``args.experiment`` from the ENCODE server and selects the
        mapping for replicate (``args.br``, ``args.tr``);
      * picks the genome: 'lambda' when ``args.maplambda`` is set, otherwise
        'mm10' for mouse or 'hg19' for human;
      * decides paired-end vs single-end from the replicate's read files;
      * creates the results folder (unless ``args.test``), looks up prior
        results, read files and reference files, and works out which steps
        still need to run;
      * prints the plan, moves superseded results to '<results>/deprecated',
        and creates and logs the workflow run.

    The process exits via sys.exit(1) on unusable input, and via sys.exit(0)
    when there is nothing to do or when ``args.test`` is set.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    # A replicate must have exactly one kind of reads.
    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes one list; passing both to a single len() call
        # raises TypeError before the message can be printed.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting." % (
            len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment, replicate,
                                    mapping['library'], pairedEnd, args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    #    args.resultsLoc = RESULT_FOLDER_DEFAULT + '/' + genome
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Sort each mate's fastq name into the list for its read end.
        paired_fqs = {
            '1': [],
            '2': []
        }
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession']+".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession']+".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession']+".fastq.gz" for f in mapping['unpaired']]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda is passed as True regardless of args.maplambda;
    #       left unchanged - confirm this is intended.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(priors, paired_fqs['1'], args.test, 'pair1_reads', resultsFolder, arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(priors, paired_fqs['2'], args.test, 'pair2_reads', resultsFolder, arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(priors, unpaired_fqs, args.test, 'reads', resultsFolder, arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles, projectId, force=args.force)

    # Report the plans
    print("Running '"+extras['title']+"'")
    print("     on "+extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print("  " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print("  " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print("  " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)
    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print("  If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print("  rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)
    else:
        print("- Steps to run:")
        if pairedEnd:
            steps = STEP_ORDER['pe']
        else:
            steps = STEP_ORDER['se']
        for step in steps:
            STEPS[step] = calculate_steps(step)
            if step in stepsToDo:
                print("  * "+STEPS[step]['app']+" will be run")
            elif not step.startswith('concat'):
                # Steps whose name starts with 'concat' are not reported here.
                print("    "+STEPS[step]['app']+" has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print("Would move "+str(len(deprecateFiles))+" prior result file(s) to '" +
                  resultsFolder+"/deprecated'.")
            for fid in deprecateFiles:
                print("  " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving "+str(len(deprecateFiles))+" prior result file(s) to '" +
                  resultsFolder+"/deprecated'...")
            dxencode.move_files(deprecateFiles, resultsFolder+"/deprecated", projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print("  We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print("  Launched " + wfRun['id'])
    print("(success)")
# Example #9
# 0
def main():
    """Submit a finished replicate's pipeline result files to the ENCODE server.

    In order, this:
      * fetches ``args.experiment`` and selects the mapping for replicate
        (``args.br``, ``args.tr``), recording the replicate label and genome
        on the mapping;
      * decides paired-end vs single-end from the replicate's read files;
      * looks up prior results and read files and asks which pipeline steps
        still need to run; if any do, it reports them and exits;
      * otherwise walks every prior-result token that has an entry in
        POST_TEMPLATES, fills in the file metadata and either fakes the
        submission (``args.test``) or runs the 'validate-post' applet and
        waits for it.

    A token whose template lists 'derived_from' tokens is pushed back onto
    the queue until at least one of those tokens has been submitted.

    The process exits via sys.exit(1) on unusable input and via sys.exit(0)
    when the pipeline is incomplete or when ``args.test`` is set.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    #replicate = "rep%s_%s" % (args.br, args.tr)
    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    # Look the organism up once so the error message cannot itself raise
    # KeyError when the mapping has no 'organism' entry.
    organism = mapping.get('organism', "Not Found")
    try:
        mapping['genome'] = GENOME_MAPPING[organism]
    except KeyError:
        print("Organism %s not currently supported" % organism)
        sys.exit(1)

    # A replicate must have exactly one kind of reads.
    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes one list; passing both to a single len() call
        # raises TypeError before the message can be printed.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting." % (
            len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    psv = pipeline_specific_vars(args, mapping, pairedEnd)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    ## TODO this is a bunch of ugly
    if pairedEnd:
        paired_fqs = {
            '1': [],
            '2': []
        }
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession']+".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession']+".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession']+".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']

    for step in pipePath:
        STEPS[step] = calculate_steps(step)

    pipeSteps = STEPS
    ## warning ugly kludge here
    # Flatten every step's result-token -> glob entries into one dictionary.
    file_globs = {}
    for app in STEPS.keys():
        for token in STEPS[app]['results'].keys():
            file_globs[token] = STEPS[app]['results'][token]

    print("Checking for prior results...")

    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'], file_globs, projectId)

    # 'submitted' maps a token to the accessions already on the server; the
    # read files are the starting point for the derived_from chains.
    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {
            'all_reads': read1s + read2s
        }
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {
            'all_reads': [f['accession'] for f in mapping['unpaired']],
        }

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors, deprecateFiles, projectId, verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if len(stepsToDo):
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # iteration counter; also used to number fake accessions
    # Report the actual experiment rather than the literal text "args.experiment".
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))

    # Which server key the validate-post applet should use.
    if args.testserver:
        server = 'test'
    else:
        server = 'www'

    while to_submit:
        # Guard against tokens that are re-queued forever because nothing
        # they derive from is ever submitted.
        if n > len(priors) * len(priors):
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if not f_ob:
            continue

        derive_check = f_ob.get('derived_from', [])
        if derive_check:
            derived = [submitted[f] for f in derive_check if submitted.get(f)]
            if not derived:
                # None of the parent tokens is submitted yet; retry later.
                to_submit.append(token)
                continue
            f_ob['derived_from'] = list(itertools.chain(*derived))

        dxFile = dxpy.DXFile(dxid=priors[token])
        print("Post File: %s %s" % (token, dxFile.name))
        f_ob['dataset'] = args.experiment
        f_ob['lab'] = '/labs/j-michael-cherry/'
        f_ob['award'] = '/awards/U41HG006992/'
        f_ob['assembly'] = mapping['genome']
        ## temporary haxors until file display works
        f_ob['replicate'] = mapping['replicate_id']
        f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
        print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))

        if args.test:
            fake_acc = 'ENCFF%03dAAA' % n
            print("Fake submission: %s" % fake_acc)
            submitted[token] = [fake_acc]
        else:
            applet = dxencode.find_applet_by_name('validate-post', projectId)
            job = applet.run({
                "pipe_file": dxpy.dxlink(dxFile),
                "file_meta": f_ob,
                "key": server,
                "debug": True,
                "skipvalidate": args.skipvalidate or False
                })
            print("Submitting %s" % job.id)
            job.wait_on_done(interval=1)
            # Describe the finished job once and read both outputs from it.
            output = job.describe()['output']
            accession = output.get('accession', "Unknown Acc")
            error = output.get('error', "Unknown Error")
            submitted[token] = [accession]
            print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
# Example #10
# 0
def main():
    """Issue one ./lrnaLaunch.py command per replicate of qualifying experiments.

    Mouse experiments matching ASSAY_TERM_ID are fetched from the ENCODE
    server.  An experiment is passed over when ``cmnd.only`` names a
    different accession, when its first replicate's library size_range is not
    '>200', or when that library starts from fewer than 20 cells.  Each
    replicate's command is printed and, unless ``cmnd.test`` is set, executed.
    """
    cmnd = get_args()

    AUTHID, AUTHPW, SERVER = dxencode.processkey('www')
    query = ('/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all'
             '&frame=embedded&replicates.library.biosample.donor.organism.name=mouse'
             '&files.file_format=fastq') % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    fastq_suffix = ".fastq.gz"
    launched = 0
    for exp in exps:
        acc = exp['accession']
        if cmnd.only and acc != cmnd.only:
            print("skipping %s" % acc)
            continue

        if len(exp['replicates']) > 0:
            # The first replicate's library decides whether the experiment qualifies.
            lib = exp['replicates'][0]['library']
            lib_size = lib.get('size_range', "")
            if lib_size != '>200':
                print("Skipping %s with wrong library size (%s)" % (acc, lib_size))
                continue
            qty_units = lib.get('nucleic_acid_starting_quantity_units', "")
            if qty_units == "cells":
                ncells = float(lib.get('nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    print("Skipping %s as single-cell (%s %s)" % (acc, qty_units, ncells))
                    continue

        if launched >= cmnd.numberjobs:
            print("Stopping at %s replicates" % launched)
            break

        exp_mapping = dxencode.choose_mapping_for_experiment(exp)
        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                mapping = exp_mapping[(br, tr)]
                genome = GENOME_MAPPING[mapping['organism']]
                pieces = ["-o %s" % genome]
                pieces.append("-l %s" % mapping['library'])
                pieces.append("-g %s" % mapping['sex'])
                if mapping['paired']:
                    # Group file names by the read end each mate declares.
                    ends = {'1': [], '2': []}
                    for (mate_a, mate_b) in mapping['paired']:
                        ends[mate_a['paired_end']].append(mate_a['accession'] + fastq_suffix)
                        ends[mate_b['paired_end']].append(mate_b['accession'] + fastq_suffix)
                    pieces.append("-1 " + " ".join(ends['1']))
                    pieces.append("-2 " + " ".join(ends['2']))
                else:
                    names = [f['accession'] + fastq_suffix for f in mapping['unpaired']]
                    pieces.append("-1 " + " ".join(names))
                args = " ".join(pieces)

                runcmd = "./lrnaLaunch.py -e %s -r %s -tr %s %s -a M4 --project %s --resultsLoc /runs --run > runs/launch%s-%s-%s-M4.%s.out" % (
                    acc, br, tr, args, PROJECT_NAME, acc, br, tr, os.getpid())
                print(runcmd)
                if not cmnd.test:
                    # probably should be subprocess.Popen()
                    os.system(runcmd)
                launched += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
        # NOTE(review): `folder`, `project`, `rep_str` and `labels` are not
        # assigned anywhere in the visible lines above; this looks like an
        # orphaned fragment of another function - confirm before relying on it.

        # Parse the map report for the main results folder and for its
        # "lambda" subfolder.
        mapped = parse_map_report(folder, project)
        lambda_mapped = parse_map_report(folder + "/lambda", project)

        # Missing report entries fall back to negative sentinel values.
        nreads = float(mapped.get("Sequences analysed in total", -999))
        # The efficiency is stored as a percentage string; convert to a fraction.
        mapeff = float(mapped.get("Mapping efficiency", "-99999%").strip("%")) / 100.0
        # reads * mapped fraction * read length, divided by APPROX_HAPLOID_GENOME_SIZE.
        coverage = nreads * mapeff * float(rep["read_length"]) / APPROX_HAPLOID_GENOME_SIZE
        # One tab-separated row: accession, replicate label, the `labels`
        # values from each report, then the coverage estimate.
        print "\t".join(
            [acc, rep_str]
            + [mapped.get(v, "-999.9") for v in labels]
            + [lambda_mapped.get(v, "-999.9") for v in labels]
            + ["%2.2f" % coverage]
        )


# Module-level credentials and server, resolved once from the "default" key.
AUTHID, AUTHPW, SERVER = dxencode.processkey("default")


def process_exp(acc, project, skipfq):
    """Collect fastqc metrics for an experiment, then report bismark stats.

    The experiment record is obtained from get_fastqc.get_exp_time.  Unless
    ``skipfq`` is true, get_fastqc.get_fastqc is called for every file of
    the record whose 'file_format' is 'fastq'.  The collected metrics are
    kept in a local dictionary only; nothing is returned.
    """
    expr = get_fastqc.get_exp_time(acc, project, skip=skipfq)

    fqc_metrics = {}
    if not skipfq:
        fqc_metrics[expr["accession"]] = {}
        # Select the fastq files first, then fetch metrics for each of them.
        fastq_files = [entry for entry in expr["files"] if entry["file_format"] == "fastq"]
        for fastq in fastq_files:
            metrics = get_fastqc.get_fastqc(fastq["accession"], project)
            fqc_metrics[expr["accession"]][fastq["accession"]] = metrics
    # print json.dumps(fqc_metrics, indent=4)
    get_bismark_stats(expr, project)

        # NOTE(review): this fragment is indented deeper than the function
        # above it and uses `fqs`, `elapsed`, `total_reads`, `paired` and `exp`,
        # none of which are assigned in the visible lines - it appears to be the
        # tail of another function whose header is missing; confirm.

        # For each fastq: record its 'Total Sequences' count and file it under
        # its replicate label ('_rep<bio>_<tech>').
        for fq in fqs:
            rep = fq['replicate']
            repstr = '_rep'+str(rep['biological_replicate_number'])+'_'+str(rep['technical_replicate_number'])
            total_reads[fq['accession']] = get_fastqc(fq['accession'], project)['Total Sequences']
            elapsed[repstr]['fastqs'].append(fq['accession'])
            # Any fastq with a truthy 'paired_end' marks the output as 'Paired'.
            if fq.get('paired_end', None):
                paired = 'Paired'

        # One tab-separated row per replicate: accession, replicate label,
        # time, then accession/read-count pairs for its fastqs, then `paired`.
        for repstr in elapsed.keys():
            sizes = "\t".join([ "\t".join((fq, str(total_reads[fq]))) for fq in elapsed[repstr]['fastqs']])
            print "\t".join((exp['accession'],repstr, str(elapsed[repstr]['time']), sizes, paired))

        return exp

# Module-level globals: credentials and server resolved from the 'default' key.
AUTHID, AUTHPW, SERVER = dxencode.processkey('default')

def main():
    """Parse the command line, resolve the project and, if a file argument
    was given, fetch that file's JSON record from the server.

    NOTE(review): this body appears to be truncated -- ``encff`` is assigned
    on the last line and never used in the code that is present here.
    """
    argparser = get_args()
    args = argparser.parse_args()

    project = dxencode.get_project(args.project)

    if args.file:
        # args.file is appended directly to the module-level SERVER value to
        # form the request URL.
        getr = dxencode.encoded_get(SERVER+args.file, AUTHID=AUTHID, AUTHPW=AUTHPW)
        try:
            getr.raise_for_status()
        except:
            # Report which file lookup failed, then let the original
            # exception propagate to the caller.
            print "Could not find %s in db" % args.file
            raise
        encff = getr.json()
Example #13
0
def main():

    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (
        args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates') or len(exp['replicates']) < 1:
        print "No replicates found in %s\n%s" % (args.experiment, exp)
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print "Specified replicate: %s could not be found in mapping." % replicate
        print reps_mapping
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    else:
        if mapping['organism'] == 'mouse':
            genome = 'mm10'
        elif mapping['organism'] == 'human':
            genome = 'hg19'
        else:
            print "Organism %s not currently supported" % mapping['organism']
            sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print "Replicate has no reads either paired or unpaired"
        print mapping
        sys.exit(1)
    else:
        print "Replicate has both paired(%s) and unpaired(%s) reads, quitting." % (
            len(mapping['paired'], len(mapping['unpaired'])))
        print mapping
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment,
                                    replicate, mapping['library'], pairedEnd,
                                    args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    #    args.resultsLoc = RESULT_FOLDER_DEFAULT + '/' + genome
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print "Generating workflow steps (paired-end)..."
    else:
        unpaired_fqs = [
            f['accession'] + ".fastq.gz" for f in mapping['unpaired']
        ]
        steps = STEP_ORDER['se']
        print "Generating workflow steps (single-end)..."
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print "Checking for prior results..."
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    priors = findPriorResults(pairedEnd,
                              resultsFolder,
                              projectId,
                              maplambda=True)

    print "Checking for read files..."
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(priors,
                                                   paired_fqs['1'],
                                                   args.test,
                                                   'pair1_reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(priors,
                                                   paired_fqs['2'],
                                                   args.test,
                                                   'pair2_reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(priors,
                                                   unpaired_fqs,
                                                   args.test,
                                                   'reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)

    print "Looking for reference files..."
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print "Determining steps to run..."
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = [
    ]  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd,
                                   priors,
                                   deprecateFiles,
                                   projectId,
                                   force=args.force)

    # Report the plans
    print "Running '" + extras['title'] + "'"
    print "     on " + extras['subTitle']
    if pairedEnd:
        print "- Reads1: "
    else:
        print "- Reads: "
    for fid in reads1:
        print "  " + dxencode.file_path_from_fid(fid)
    if pairedEnd:
        print "- Reads2: "
        for fid in reads2:
            print "  " + dxencode.file_path_from_fid(fid)
    print "- Reference files:"
    for token in GENOME_REFERENCES.keys():
        print "  " + dxencode.file_path_from_fid(priors[token], True)
    print "- Results written to: " + args.project + ":" + resultsFolder
    if len(stepsToDo) == 0:
        print "* All expected results are in the results folder, so there is nothing to do."
        print "  If this experiment/replicate needs to be rerun, then use the --force flag to "
        print "  rerun all steps; or remove suspect results from the folder before launching."
        sys.exit(0)
    else:
        print "- Steps to run:"
        steps = []
        if pairedEnd:
            steps = STEP_ORDER['pe']
        else:
            steps = STEP_ORDER['se']
        for step in steps:
            STEPS[step] = calculate_steps(step)
            if step in stepsToDo:
                print "  * " + STEPS[step]['app'] + " will be run"
            else:
                if not step.find('concat') == 0:
                    print "    " + STEPS[step]['app'] + " has already been run"

    print "Checking for currently running analyses..."
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print "Would move "+str(len(deprecateFiles))+" prior result file(s) to '" + \
                                                                    resultsFolder+"/deprecated'."
            for fid in deprecateFiles:
                print "  " + dxencode.file_path_from_fid(fid)
        else:
            print "Moving "+str(len(deprecateFiles))+" prior result file(s) to '" + \
                                                                resultsFolder+"/deprecated'..."
            dxencode.move_files(deprecateFiles, resultsFolder + "/deprecated",
                                projectId)

    # Exit if test only
    if args.test:
        print "TEST ONLY - exiting."
        sys.exit(0)

    print "Launch sequence initiating..."
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print "  We have liftoff!"
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print "  Launched " + wfRun['id']
    print "(success)"