def gdv_create(self,ex):
    from bbcflib import gdv
    project = gdv.get_project(mail=self.globals['gdv']['email'],
                              key=self.globals['gdv']['key'],
                              project_key=self.job.options['gdv_key'])
    if 'error' in project:
        self.log_write("Creating GDV project.")
        project = gdv.new_project( self.globals['gdv']['email'],
                                   self.globals['gdv']['key'],
                                   self.job.description,
                                   self.job.assembly.id,
                                   self.globals['gdv']['url'] )
        self.debug_write("\nGDV project: "+json.dumps(project))
        add_pickle( ex, project,
                    description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
    self.job.options['gdv_project'] = project
    return True
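# Illustrative sketch only (not part of the original module): exercising the GDV
# lookup/creation calls used in gdv_create() directly. The e-mail address, API key,
# project key, assembly id and server URL below are placeholder assumptions; the
# call signatures mirror the ones above.
def _gdv_example():
    from bbcflib import gdv
    # Look up an existing project; the returned dict contains 'error' if it is missing.
    project = gdv.get_project(mail="user@example.org", key="MY_GDV_KEY",
                              project_key="EXISTING_PROJECT_KEY")
    if 'error' in project:
        # Create a new project on the assumed GDV server for a placeholder assembly id.
        project = gdv.new_project("user@example.org", "MY_GDV_KEY",
                                  "Demo project", 7, "http://gdv.example.org")
    return project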
def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path=gl['script_path']
    file_names = {}
    job_groups=job.groups
    resFiles={}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}
        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = os.path.join(file_path,primersFilename)
        ex.add(primersFile,description=set_file_descr(primersFilename,groupId=gid,step="init",type="fa"))
        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = os.path.join(file_path,paramsFilename)
        ex.add(paramsFile,description=set_file_descr(paramsFilename,groupId=gid,step="init",type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid,run in group['runs'].iteritems():
            infiles.append(run)
            n=count_lines(ex,run)
            tot_counts += n/4
            if n>10000000:
                allSubFiles.extend(split_file(ex,run,n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate( ex, allSubFiles,
                                                                           primersFile, (gid, group['name']),
                                                                           via=via, **params )
        gzipfile(ex,run)
        ex.add(run+".gz",description=set_file_descr(group['name']+"_full_fastq.gz",
                                                    groupId=gid,step='exonerate',view='debug',type="fastq"))

        logfile.write("Will get sequences to filter\n");logfile.flush()
        seqToFilter = getSeqToFilter(ex,primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,resExonerate,seqToFilter,gid,group['name'],via=via)
        logfile.write("After filterSeq, filteredFastq=%s\n" %filteredFastq);logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k,f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex,f)/4
            if k in filteredFastq:
                file_names[gid][k] = group['name']+"_"+k+"_filtered"
                ex.add(filteredFastq[k],description=set_file_descr(file_names[gid][k]+".fastq",
                                                                   groupId=gid,step="final", type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,filteredFastq[k])/4
                tgz.add( f, arcname=group['name']+"_"+k+".fastq" )
            else:
                file_names[gid][k] = group['name']+"_"+k
                ex.add(f,description=set_file_descr(file_names[gid][k]+".fastq",
                                                    groupId=gid,step="final", type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,description=set_file_descr(group['name']+"_unfiltered_fastq.tgz",
                                                      groupId=gid,step="exonerate", type="tar"))

        # Prepare report per group of runs
        report_ok,reportFile = prepareReport(ex,group['name'],
                                             tot_counts, counts_primers,counts_primers_filtered,
                                             tot_ambiguous, tot_discarded)
        ex.add(reportFile,description = set_file_descr(
                group['name']+"_report_demultiplexing.txt",
                groupId=gid,step="final",type="txt",view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex,reportFile,reportFile_pdf,script_path)
            ex.add(reportFile_pdf,description=set_file_descr(
                    group['name']+"_report_demultiplexing.pdf",
                    groupId=gid,step="final",type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n");logfile.flush()

    add_pickle( ex, file_names,
                set_file_descr('file_names',step="final",type='py',view='admin') )
    return resFiles
def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}
        # Register the barcode and parameter files of the group.
        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile", os.path.join(file_path, primersFilename))
        ex.add(primersFile, description=set_file_descr(primersFilename, groupId=gid, step="init", type="fa"))
        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile", os.path.join(file_path, paramsFilename))
        ex.add(paramsFile, description=set_file_descr(paramsFilename, groupId=gid, step="init", type="txt"))
        params = load_paramsFile(paramsFile)

        # Split fastq files above 10M lines into 8M-line chunks before running exonerate in parallel.
        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate(ex, allSubFiles,
                                                                          primersFile, (gid, group['name']),
                                                                          via=via, **params)

        # Concatenate all runs of the group and archive the gzipped full fastq.
        run = cat(infiles)
        gzipfile(ex, run)
        ex.add(run + ".gz", description=set_file_descr(group['name'] + "_full_fastq.gz",
                                                       groupId=gid, step='exonerate', view='debug', type="fastq"))

        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex, resExonerate, seqToFilter, gid, group['name'], via=via)
        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        global bcDelimiter  # module-level delimiter inside primer keys, replaced by "_" in file names
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            if k in filteredFastq:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                  groupId=gid, step="final", type="fastq"))
                counts_primers_filtered[k] = count_lines(ex, filteredFastq[k]) / 4
                tgz.add(f, arcname=group['name'] + "_" + k.replace(bcDelimiter, "_") + ".fastq")
            else:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f, description=set_file_descr(file_names[gid][k2] + ".fastq",
                                                     groupId=gid, step="final", type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive, description=set_file_descr(group['name'] + "_unfiltered_fastq.tgz",
                                                       groupId=gid, step="exonerate", type="tar"))

        # Prepare report per group of runs
        report_ok, reportFile = prepareReport(ex, group['name'],
                                              tot_counts, counts_primers, counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile, description=set_file_descr(group['name'] + "_report_demultiplexing.txt",
                                                      groupId=gid, step="final", type="txt", view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf, description=set_file_descr(group['name'] + "_report_demultiplexing.pdf",
                                                              groupId=gid, step="final", type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n")
            logfile.flush()

    add_pickle(ex, file_names,
               set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles
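# Illustrative sketch only (not part of the original module): driving
# demultiplex_workflow() from a bein execution. The MiniLIMS path and configuration
# file name are placeholder assumptions; frontend.parseConfig is used the same way
# as in main() below.
def _demultiplex_example():
    from bein import MiniLIMS, execution
    from bbcflib import frontend
    M = MiniLIMS("demultiplex_minilims")                       # assumed local MiniLIMS repository
    job, gl = frontend.parseConfig("demultiplex_config.txt")   # assumed job configuration file
    with execution(M, description=job.description) as ex:
        return demultiplex_workflow(ex, job, gl, file_path="../", via="local")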
def main(argv = None):
    via = "lsf"
    limspath = None
    hts_key = ''
    working_dir = None
    config_file = None
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts,args = getopt.getopt(sys.argv[1:],"hu:k:d:w:c:",
                                      ["help","via=","key=","minilims=",
                                       "working-directory=","config="])
        except getopt.error, msg:
            raise Usage(msg)
        for o, a in opts:
            if o in ("-h", "--help"):
                print __doc__
                print usage
                return 0
            elif o in ("-u", "--via"):
                if a=="local":
                    via = "local"
                elif a=="lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,))
            elif o in ("-w", "--working-directory"):
                if os.path.exists(a):
                    os.chdir(a)
                    working_dir = a
                else:
                    raise Usage("Working directory '%s' does not exist." % a)
            elif o in ("-d", "--minilims"):
                limspath = a
            elif o in ("-k", "--key"):
                hts_key = a
            elif o in ("-c", "--config"):
                config_file = a
            else:
                raise Usage("Unhandled option: " + o)
        if not(limspath and os.path.exists(limspath)
               and (hts_key != None or (config_file and os.path.exists(config_file)))):
            raise Usage("Need a minilims and a job key or a configuration file")
        M = MiniLIMS( limspath )
        # Load the job either from the HTSstation frontend (job key) or from a local configuration file.
        if len(hts_key)>1:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=gl['hts_mapseq']['url'] )
            job = htss.job( hts_key )
            [M.delete_execution(x) for x in M.search_executions(with_description=hts_key,fails=True)]
        elif os.path.exists(config_file):
            (job,gl) = frontend.parseConfig( config_file )
            hts_key = job.description
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")
        g_rep = genrep.GenRep( url=gl["genrep_url"], root=gl["bwt_root"],
                               intype=job.options.get('input_type_id') or 0 )
        assembly = g_rep.assembly( job.assembly_id )
        if 'lims' in gl:
            dafl = dict((loc,daflims.DAFLIMS( username=gl['lims']['user'], password=pwd ))
                        for loc,pwd in gl['lims']['passwd'].iteritems())
        else:
            dafl = None
        # Normalize job options that may arrive as strings.
        if not('compute_densities' in job.options):
            job.options['compute_densities'] = True
        elif isinstance(job.options['compute_densities'],str):
            job.options['compute_densities'] = job.options['compute_densities'].lower() in ['1','true','t']
        if not('ucsc_bigwig' in job.options):
            job.options['ucsc_bigwig'] = True
        elif isinstance(job.options['ucsc_bigwig'],str):
            job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'].lower() in ['1','true','t']
        job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities']
        if not('create_gdv_project' in job.options):
            job.options['create_gdv_project'] = False
        elif isinstance(job.options['create_gdv_project'],str):
            job.options['create_gdv_project'] = job.options['create_gdv_project'].lower() in ['1','true','t']
        if job.options.get('read_extension'):
            job.options['read_extension'] = int(job.options['read_extension'])
        if job.options.get('merge_strands'):
            job.options['merge_strands'] = int(job.options['merge_strands'])
        logfile = open(hts_key+".log",'w')
        # Run the mapping workflow inside a bein execution.
        with execution( M, description=hts_key, remote_working_directory=working_dir ) as ex:
            logfile.write("Enter execution, fetch fastq files.\n");logfile.flush()
            job = get_fastq_files( job, ex.working_directory, dafl )
            logfile.write("Map reads.\n");logfile.flush()
            mapped_files = map_groups( ex, job, ex.working_directory, assembly, {'via': via} )
            logfile.write("Make stats:\n");logfile.flush()
            for k,v in job.groups.iteritems():
                logfile.write(str(k)+str(v['name'])+"\t");logfile.flush()
                pdf = add_pdf_stats( ex, mapped_files, {k:v['name']},
                                     gl.get('script_path') or '',
                                     description=set_file_descr(v['name']+"_mapping_report.pdf",groupId=k,step='stats',type='pdf') )
            if job.options['compute_densities']:
                logfile.write("computing densities.\n");logfile.flush()
                if not(job.options.get('read_extension')>0):
                    job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length']
                density_files = densities_groups( ex, job, mapped_files, assembly.chromosomes, via=via )
                logfile.write("Finished computing densities.\n");logfile.flush()
                if job.options['create_gdv_project']:
                    logfile.write("Creating GDV project.\n");logfile.flush()
                    gdv_project = gdv.create_gdv_project( gl['gdv']['key'], gl['gdv']['email'],
                                                          job.description,
                                                          assembly.nr_assembly_id,
                                                          gdv_url=gl['gdv']['url'], public=True )
                    logfile.write("GDV project: "+str(gdv_project['project_id'])+"\n");logfile.flush()
                    add_pickle( ex, gdv_project,
                                description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
        allfiles = get_files( ex.id, M )
        if job.options['ucsc_bigwig'] and g_rep.intype == 0:
            # Write a track header file for the files flagged for UCSC upload.
            ucscfiles = get_files( ex.id, M, select_param={'ucsc':'1'} )
            with open(hts_key+".bed",'w') as ucscbed:
                for ftype,fset in ucscfiles.iteritems():
                    for ffile,descr in fset.iteritems():
                        if re.search(r' \(.*\)',descr): continue
                        ucscbed.write(track_header(descr,ftype,gl['hts_mapseq']['download'],ffile))
        if job.options['create_gdv_project']:
            allfiles['url'] = {gdv_project['public_url']: 'GDV view'}
            download_url = gl['hts_mapseq']['download']
            [gdv.add_gdv_track( gl['gdv']['key'], gl['gdv']['email'],
                                gdv_project['project_id'],
                                url=download_url+str(k),
                                name = re.sub('\.sql','',str(f)),
                                gdv_url=gl['gdv']['url'] )
             for k,f in allfiles['sql'].iteritems()]
        logfile.close()
        print json.dumps(allfiles)
        with open(hts_key+".done",'w') as done:
            json.dump(allfiles,done)
        if 'email' in gl:
            r = email.EmailReport( sender=gl['email']['sender'],
                                   to=str(job.email),
                                   subject="Mapseq job "+str(job.description),
                                   smtp_server=gl['email']['smtp'] )
            r.appendBody('''
Your mapseq job has finished.

The description was:
'''+str(job.description)+'''
and its unique key is '''+hts_key+'''.

You can now retrieve the results at this url:
'''+gl['hts_mapseq']['url']+"jobs/"+hts_key+"/get_results")
            r.send()
        return 0