Example #1
def gdv_create(self,ex):
    from bbcflib import gdv
    import json  # used for the debug dump below; imported at module level in the original source
    # Look up an existing GDV project using the credentials from the global configuration.
    project = gdv.get_project(mail=self.globals['gdv']['email'],
                              key=self.globals['gdv']['key'],
                              project_key=self.job.options['gdv_key'])
    if 'error' in project:
        # No project under this key: create one and archive its JSON description.
        self.log_write("Creating GDV project.")
        project = gdv.new_project( self.globals['gdv']['email'],
                                   self.globals['gdv']['key'],
                                   self.job.description,
                                   self.job.assembly.id,
                                   self.globals['gdv']['url'] )
        self.debug_write("\nGDV project: "+json.dumps(project))
        add_pickle( ex, project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
    self.job.options['gdv_project'] = project
    return True
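
The method above is a get-or-create pattern against the GDV API: look up the project by key first, and create a new one only when the lookup reports an error. A minimal standalone sketch of the same two calls, with placeholder credentials and assembly id (not real values):

from bbcflib import gdv

email, key = "user@example.org", "SECRET_KEY"  # placeholders
project = gdv.get_project(mail=email, key=key, project_key="PROJECT_KEY")
if 'error' in project:
    # Lookup failed: create the project with the same credentials.
    project = gdv.new_project(email, key, "job description", 1, "http://gdv.example.org")
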
Example #2
def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path=gl['script_path']
    file_names = {}
    job_groups=job.groups
    resFiles={}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = os.path.join(file_path,primersFilename)
        ex.add(primersFile,description=set_file_descr(primersFilename,groupId=gid,step="init",type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = os.path.join(file_path,paramsFilename)
        ex.add(paramsFile,description=set_file_descr(paramsFilename,groupId=gid,step="init",type="txt"))
        params = load_paramsFile(paramsFile)

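        # Count reads (FASTQ stores each read on 4 lines) and split any file over
        # 10M lines into 8M-line chunks so exonerate can process them in parallel.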
        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid,run in group['runs'].iteritems():
            infiles.append(run)
            n=count_lines(ex,run)
            tot_counts += n/4
            if n>10000000:
                allSubFiles.extend(split_file(ex,run,n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate( ex, allSubFiles, primersFile,
                                                                           (gid, group['name']), via=via, **params )
        # NB: `run` is whatever file the loop above visited last, so only that run
        # is gzipped and archived here (Example #3 archives the concatenation of all runs).
        gzipfile(ex,run)
        ex.add(run+".gz",description=set_file_descr(group['name']+"_full_fastq.gz",
                                                    groupId=gid,step='exonerate',view='debug',type="fastq"))
        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex,primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,resExonerate,seqToFilter,gid,group['name'],via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" %filteredFastq);logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
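        # When filtering produced output, collect the pre-filtering FASTQs into a
        # single tar.gz (opened here, filled in the loop, registered after it).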
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k,f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex,f)/4
            if k in filteredFastq:
                file_names[gid][k] = group['name']+"_"+k+"_filtered"
                ex.add(filteredFastq[k],description=set_file_descr(file_names[gid][k]+".fastq",
                                                                   groupId=gid,step="final",
                                                                   type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,filteredFastq[k])/4
                tgz.add( f, arcname=group['name']+"_"+k+".fastq" )
            else:
                file_names[gid][k] = group['name']+"_"+k
                ex.add(f,description=set_file_descr(file_names[gid][k]+".fastq",
                                                    groupId=gid,step="final",
                                                    type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,description=set_file_descr(group['name']+"_unfiltered_fastq.tgz",
                                                      groupId=gid,step="exonerate",
                                                      type="tar"))

        # Prepare report per group of runs
        report_ok,reportFile = prepareReport(ex,group['name'],
                                             tot_counts, counts_primers,counts_primers_filtered,
                                             tot_ambiguous, tot_discarded)
        ex.add(reportFile,description = set_file_descr(
                group['name']+"_report_demultiplexing.txt",
                groupId=gid,step="final",type="txt",view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex,reportFile,reportFile_pdf,script_path)
            ex.add(reportFile_pdf,description=set_file_descr(
                    group['name']+"_report_demultiplexing.pdf",
                    groupId=gid,step="final",type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n");logfile.flush()
    add_pickle( ex, file_names,
                set_file_descr('file_names',step="final",type='py',view='admin') )
    return resFiles  # left empty in this version: outputs are registered via ex.add above
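
A hypothetical driver for this workflow, reusing the MiniLIMS/execution machinery shown in Example #4 below (paths and file names are placeholders):

M = MiniLIMS("/path/to/minilims")                       # hypothetical path
(job, gl) = frontend.parseConfig("demultiplex.config")  # hypothetical config file
with execution(M, description=job.description) as ex:
    demultiplex_workflow(ex, job, gl, file_path="../", via='local')
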
Example #3
def demultiplex_workflow(ex,
                         job,
                         gl,
                         file_path="../",
                         via='lsf',
                         logfile=sys.stdout,
                         debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile",
                                os.path.join(file_path, primersFilename))
        ex.add(primersFile,
               description=set_file_descr(primersFilename,
                                          groupId=gid,
                                          step="init",
                                          type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile",
                               os.path.join(file_path, paramsFilename))
        ex.add(paramsFile,
               description=set_file_descr(paramsFilename,
                                          groupId=gid,
                                          step="init",
                                          type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous,
         tot_discarded) = parallel_exonerate(ex,
                                             allSubFiles,
                                             primersFile, (gid, group['name']),
                                             via=via,
                                             **params)

        # Concatenate all runs' FASTQs, then gzip and register the combined file
        # (binding the cat() result so the ".gz" path below points at the right file).
        all_runs = cat(infiles)
        gzipfile(ex, all_runs)
        ex.add(all_runs + ".gz",
               description=set_file_descr(group['name'] + "_full_fastq.gz",
                                          groupId=gid,
                                          step='exonerate',
                                          view='debug',
                                          type="fastq"))
        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,
                                  resExonerate,
                                  seqToFilter,
                                  gid,
                                  group['name'],
                                  via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        global bcDelimiter  # read-only here; the delimiter is set at module level
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            if k in filteredFastq:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,
                                                         filteredFastq[k]) / 4
                tgz.add(f,
                        arcname=group['name'] + "_" +
                        k.replace(bcDelimiter, "_") + ".fastq")
            else:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f,
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,
                   description=set_file_descr(group['name'] +
                                              "_unfiltered_fastq.tgz",
                                              groupId=gid,
                                              step="exonerate",
                                              type="tar"))

        # Prepare report per group of runs
        report_ok, reportFile = prepareReport(ex, group['name'], tot_counts,
                                              counts_primers,
                                              counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile,
               description=set_file_descr(group['name'] +
                                          "_report_demultiplexing.txt",
                                          groupId=gid,
                                          step="final",
                                          type="txt",
                                          view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf,
                   description=set_file_descr(group['name'] +
                                              "_report_demultiplexing.pdf",
                                              groupId=gid,
                                              step="final",
                                              type="pdf"))
        else:
            logfile.write(
                "*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n"
            )
            logfile.flush()
    add_pickle(
        ex, file_names,
        set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles
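
Example #3 is the same workflow after refactoring: the debug archive covers the concatenation of all runs rather than only the last one, and barcode keys are normalized by replacing the module-level bcDelimiter before they appear in file names. A minimal sketch of that normalization, assuming bcDelimiter is e.g. '|':

bcDelimiter = "|"                    # assumed value; defined at module level in the real code
k = "primerA" + bcDelimiter + "BC1"  # hypothetical exonerate result key
k2 = k.replace(bcDelimiter, "_")     # -> "primerA_BC1", safe for use in file names
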
Example #4
def main(argv=None):
    via = "lsf"
    limspath = None
    hts_key = ''
    working_dir = None
    config_file = None
    if argv is None:
        argv = sys.argv
    try:
        try:
            # Parse the normalized argv (not sys.argv) so main() accepts a custom list.
            opts,args = getopt.getopt(argv[1:],"hu:k:d:w:c:",
                                      ["help","via=","key=","minilims=",
                                       "working-directory=","config="])
        except getopt.error, msg:
            raise Usage(msg)
        for o, a in opts:
            if o in ("-h", "--help"):
                print __doc__
                print usage
                return 0
            elif o in ("-u", "--via"):
                if a=="local":
                    via = "local"
                elif a=="lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,))
            elif o in ("-w", "--working-directory"):
                if os.path.exists(a):
                    os.chdir(a)
                    working_dir = a
                else:
                    raise Usage("Working directory '%s' does not exist." % a)
            elif o in ("-d", "--minilims"):
                limspath = a
            elif o in ("-k", "--key"):
                hts_key = a
            elif o in ("-c", "--config"):
                config_file = a
            else:
                raise Usage("Unhandled option: " + o)
        if not(limspath and os.path.exists(limspath)
               and (hts_key or (config_file and os.path.exists(config_file)))):
            raise Usage("Need a minilims and a job key or a configuration file")
        M = MiniLIMS( limspath )
        if len(hts_key)>1:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=gl['hts_mapseq']['url'] )
            job = htss.job( hts_key )
            # Clear any previous failed executions recorded under this key.
            for x in M.search_executions(with_description=hts_key,fails=True):
                M.delete_execution(x)
        elif config_file and os.path.exists(config_file):
            (job,gl) = frontend.parseConfig( config_file )
            hts_key = job.description
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")
        g_rep = genrep.GenRep( url=gl["genrep_url"], root=gl["bwt_root"],
                               intype=job.options.get('input_type_id') or 0 )
        assembly = g_rep.assembly( job.assembly_id )
        if 'lims' in gl:
            dafl = dict((loc,daflims.DAFLIMS( username=gl['lims']['user'], password=pwd ))
                        for loc,pwd in gl['lims']['passwd'].iteritems())
        else:
            dafl = None
        if 'compute_densities' not in job.options:
            job.options['compute_densities'] = True
        elif isinstance(job.options['compute_densities'],str):
            job.options['compute_densities'] = job.options['compute_densities'].lower() in ['1','true','t']
        if 'ucsc_bigwig' not in job.options:
            job.options['ucsc_bigwig'] = True
        elif isinstance(job.options['ucsc_bigwig'],str):
            job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'].lower() in ['1','true','t']
        job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities']
        if 'create_gdv_project' not in job.options:
            job.options['create_gdv_project'] = False
        elif isinstance(job.options['create_gdv_project'],str):
            job.options['create_gdv_project'] = job.options['create_gdv_project'].lower() in ['1','true','t']
        if job.options.get('read_extension'):
            job.options['read_extension'] = int(job.options['read_extension'])
        if job.options.get('merge_strands'):
            job.options['merge_strands'] = int(job.options['merge_strands'])
        logfile = open(hts_key+".log",'w')
        with execution( M, description=hts_key, remote_working_directory=working_dir ) as ex:
            logfile.write("Enter execution, fetch fastq files.\n");logfile.flush()
            job = get_fastq_files( job, ex.working_directory, dafl )
            logfile.write("Map reads.\n");logfile.flush()
            mapped_files = map_groups( ex, job, ex.working_directory, assembly, {'via': via} )
            logfile.write("Make stats:\n");logfile.flush()
            for k,v in job.groups.iteritems():
                logfile.write(str(k)+str(v['name'])+"\t");logfile.flush()
                pdf = add_pdf_stats( ex, mapped_files,
                                     {k:v['name']},
                                     gl.get('script_path') or '',
                                     description=set_file_descr(v['name']+"_mapping_report.pdf",groupId=k,step='stats',type='pdf') )
            if job.options['compute_densities']:
                logfile.write("computing densities.\n");logfile.flush()
                if not(job.options.get('read_extension')>0):
                    job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length']
                density_files = densities_groups( ex, job, mapped_files, assembly.chromosomes, via=via )
                logfile.write("Finished computing densities.\n");logfile.flush()
                if job.options['create_gdv_project']:
                    logfile.write("Creating GDV project.\n");logfile.flush()
                    gdv_project = gdv.create_gdv_project( gl['gdv']['key'], gl['gdv']['email'],
                                                          job.description,
                                                          assembly.nr_assembly_id,
                                                          gdv_url=gl['gdv']['url'], public=True )
                    logfile.write("GDV project: "+str(gdv_project['project_id']+"\n"));logfile.flush()
                    add_pickle( ex, gdv_project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
        allfiles = get_files( ex.id, M )
        if job.options['ucsc_bigwig'] and g_rep.intype == 0:
            ucscfiles = get_files( ex.id, M, select_param={'ucsc':'1'} )
            with open(hts_key+".bed",'w') as ucscbed:
                for ftype,fset in ucscfiles.iteritems():
                    for ffile,descr in fset.iteritems():
                        if re.search(r' \(.*\)',descr): continue
                        ucscbed.write(track_header(descr,ftype,gl['hts_mapseq']['download'],ffile))
        if job.options['create_gdv_project']:
            allfiles['url'] = {gdv_project['public_url']: 'GDV view'}
            download_url = gl['hts_mapseq']['download']
            for k,f in allfiles['sql'].iteritems():
                gdv.add_gdv_track( gl['gdv']['key'], gl['gdv']['email'],
                                   gdv_project['project_id'],
                                   url=download_url+str(k),
                                   name=re.sub(r'\.sql','',str(f)),
                                   gdv_url=gl['gdv']['url'] )
        logfile.close()
        print json.dumps(allfiles)
        with open(hts_key+".done",'w') as done:
            json.dump(allfiles,done)
        if 'email' in gl:
            r = email.EmailReport( sender=gl['email']['sender'],
                                   to=str(job.email),
                                   subject="Mapseq job "+str(job.description),
                                   smtp_server=gl['email']['smtp'] )
            r.appendBody('''
Your mapseq job has finished.

The description was:
'''+str(job.description)+'''
and its unique key is '''+hts_key+'''.

You can now retrieve the results at this url:
'''+gl['hts_mapseq']['url']+"jobs/"+hts_key+"/get_results")
            r.send()
        return 0
    except Usage, err:
        # Assumed completion: the snippet is truncated after `return 0`; the
        # conventional handler closing the outer try block prints the message
        # and usage string, as in the standard getopt template.
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2
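
The script follows the classic getopt main() template; a hypothetical entry point and command line matching the options parsed above (script name and paths are placeholders):

if __name__ == '__main__':
    # e.g.: python mapseq_run.py -u lsf -d /path/to/minilims -k JOBKEY -w /scratch
    sys.exit(main())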