def main(argv=None):
    """Entry point when the program starts."""
    genrep = None
    assembly = None
    lims = None
    job = None
    config = None
    config_file = None
    background = ""
    matrix = ""
    original_sql_data = ""
    random_sql_data = ""
    track_filtered = ""
    track_scanned = ""
    project = ""
    username = ""
    identity_file = ""
    host = ""
    website = ""
    remote_path = ""
    result_path = ""
    track_regions_path = ""
    via = ""
    limspath = ""
    fdr = 0
    runs = {}
    logging.basicConfig(filename='run_scanning.log', level=logging.INFO)
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "hu:c:",
                ["help", "via=", "host=", "remote_path=", "website=",
                 "minilims=", "config=", "matrix=", "username=",
                 "identity_file=", "project="]
            )
        except getopt.error, msg:
            raise Usage(msg)
        for option, value in opts:
            if option in ("-h", "--help"):
                print __doc__
                print USAGE
                sys.exit(0)
            elif option in ("-u", "--via"):
                if value == "local":
                    via = "local"
                elif value == "lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (value,))
            elif option == "--website":
                website = normalize_url(value)
            elif option == "--minilims":
                limspath = normcase(expanduser(value))
            elif option == "--host":
                host = value
            elif option == "--identity_file":
                identity_file = value
            elif option == "--remote_path":
                remote_path = normcase(expanduser(value))
                if not remote_path.endswith(sep):
                    remote_path += sep
            elif option == "--matrix":
                matrix = {basename(value): normcase(expanduser(value))}
            elif option == "--username":
                username = value
            elif option == "--project":
                project = value
            elif option in ("-c", "--config"):
                config_file = normcase(expanduser(value))
            else:
                raise Usage("Unhandled option: " + option)

        # read the config file
        if config_file is None or not exists(config_file) or not isfile(config_file):
            raise Usage("Config file missing")
        else:
            job, config = parseConfig(normcase(expanduser(config_file)))

        # command-line values take precedence; fall back to the job options from the config
        if project == "":
            project = job.description
        if matrix == "":
            if "matrix" in job.options:
                path = normcase(expanduser(job.options["matrix"]))
                matrix = {basename(path): path}
            else:
                raise Usage("You need to give a matrix file")
        if limspath == "":
            if "minilims" in job.options:
                limspath = job.options["minilims"]
            else:
                raise Usage("You need to give a minilims path/name")
        if via == "":
            if "via" in job.options:
                via = job.options["via"]
            else:
                via = "lsf"
        if host == "" and "host" in job.options:
            host = job.options["host"]
        if identity_file == "" and "identity_file" in job.options:
            identity_file = job.options["identity_file"]
        if remote_path == "" and "remote_path" in job.options:
            remote_path = job.options["remote_path"]
        if username == "" and "username" in job.options:
            username = job.options["username"]
        if website == "" and "website" in job.options:
            website = job.options["website"]

        genrep = GenRep(config=config)
        assembly = genrep.assembly(job.assembly_id)
        lims = MiniLIMS(limspath)
        json = create_gdv_project(
            config["gdv"]["key"], config["gdv"]["email"],
            project,
            assembly.nr_assembly_id,
            config["gdv"]["url"],
            public=True
        )
        project_id = get_project_id(json)

        # compute the false discovery rate
        with execution(lims, description=job.description) as ex:
            background = genrep.statistics(
                assembly,
                output=unique_filename_in(),
                frequency=True,
                matrix_format=True
            )
            if len(job.groups) > 2:
                raise ValueError("There are more than 2 groups in the config file")

            # collect each run's experimental and control files, downloading URLs when needed
            for group_number in job.groups:
                group = job.groups[group_number]
                for run_number in group["runs"]:
                    run_iter = job.groups[group_number]["runs"][run_number]
                    if "url" in run_iter:
                        url = run_iter["url"]
                        uri = ""
                        if run_iter["run"] not in runs:
                            runs[run_iter["run"]] = {"name": None, "control": None, "experimental": None}
                        if url.startswith("http") or url.startswith("www."):
                            url = normalize_url(url)
                            # download data
                            data = urllib2.urlopen(url)
                            uri = unique_filename_in()
                            with open(uri, "w") as opening_file:
                                opening_file.write(data.read())
                        else:
                            uri = normcase(expanduser(url))
                        if group["control"]:
                            runs[run_iter["run"]]["control"] = uri
                            runs[run_iter["run"]]["name"] = basename(uri)
                        else:
                            runs[run_iter["run"]]["experimental"] = uri

            for run in runs:
                current_run = runs[run]
                original_sql_data = unique_filename_in()
                random_sql_data = unique_filename_in()
                track_filtered = unique_filename_in()
                logging.info("[%s]" % job.description)
                logging.info("alias %s => %s" % (current_run["experimental"], track_filtered))

                # convert data to sql
                with Track(current_run["experimental"], chrmeta=assembly.chromosomes) as track:
                    # get a sqlite file if it is not already in this format
                    if track.format not in ("sql", "db", "sqlite"):
                        track.convert(original_sql_data, format="sql")
                    else:
                        original_sql_data = current_run["experimental"]
                    # generate a random population from the original if no control is given in the config file
                    if current_run["control"] is None:
                        # create a random track
                        track.shuffle_track(random_sql_data, repeat_number=5)
                    else:
                        with Track(current_run["control"], chrmeta=assembly.chromosomes) as track_random:
                            # get a sqlite file if it is not already in this format
                            if track_random.format not in ("sql", "db", "sqlite"):
                                track_random.convert(random_sql_data, format="sql")
                            else:
                                random_sql_data = current_run["control"]

                # scan the original and random tracks with the matrix and select a score cutoff (fdr)
                track_scanned, fdr, p_value = sqlite_to_false_discovery_rate(
                    ex,
                    matrix,
                    background,
                    genrep,
                    assembly.chromosomes,
                    original_sql_data,
                    random_sql_data,
                    threshold=-100,
                    via=via,
                    keep_max_only=False,
                    alpha=0.05,
                    nb_sample=5.0
                )

                # filter the track with fdr as threshold
                with new(track_filtered, format="sql", datatype="qualitative") as track_out:
                    chromosome_used = {}
                    track_out.meta_track = {"source": basename(current_run["experimental"])}
                    track_out.meta_track.update({"k": "v"})
                    with Track(track_scanned, format="sql", chrmeta=assembly.chromosomes) as track_in:
                        meta = dict([(v["name"], dict([("length", v["length"])]))
                                     for v in track_in.chrmeta.values()])
                        for chromosome in track_in.all_chrs:
                            data_list = []
                            # keep only the features whose score is at least the selected cutoff
                            for data in track_in.read(
                                {"chr": chromosome, "score": (fdr, sys.maxsize)},
                                fields=Track.qualitative_fields
                            ):
                                data_list.append(data)
                                chromosome_used[chromosome] = meta[chromosome]
                            if len(data_list) > 0:
                                track_out.write(chromosome, data_list)
                        track_out.chrmeta = chromosome_used

                ex.add(track_filtered, "sql: filtered %s" % track_filtered)
                logging.info("scanned: %s" % track_scanned)
                logging.info("score selected: %f with p: %.3f" % (fdr, p_value))
                logging.info("filtered: %s" % track_filtered)

                # fix the scanned track
                track_scanned_signal = fix_sqlite_db(track_scanned)
                logging.info("scanned signal: %s" % track_scanned_signal)
                ex.add(track_scanned_signal,
                       description="%s: sql track signal %s" % (job.description, track_scanned_signal))

                # send the filtered track and the scanned track to the remote host
                if host != "" and remote_path != "" and username != "":
                    args = []
                    if identity_file != "":
                        args = ["-i", normcase(expanduser(identity_file)), "-C"]
                    source_filtered = normcase(expanduser(track_filtered))
                    source_scanned = normcase(expanduser(track_scanned_signal))
                    result_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered)
                    result_path = "%s%s%s.db" % (website, sep, track_filtered)
                    track_regions_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal)
                    track_regions_path = "%s%s%s.db" % (website, sep, track_scanned_signal)
                    scp(ex, source_filtered, result_destination, args=args)
                    scp(ex, source_scanned, track_regions_destination, args=args)
                else:
                    result_path = track_filtered

                # send the filtered track to GDV
                add_gdv_track(
                    config["gdv"]["key"], config["gdv"]["email"],
                    project_id,
                    result_path,
                    name="filtered_%s" % (splitext(basename(current_run["experimental"]))[0]),
                    gdv_url=config["gdv"]["url"]
                )
                # send the scanned track to GDV
                add_gdv_track(
                    config["gdv"]["key"], config["gdv"]["email"],
                    project_id,
                    track_regions_path,
                    name="regions_%s" % (splitext(basename(current_run["experimental"]))[0]),
                    gdv_url=config["gdv"]["url"]
                )
                logging.info("++++++++++++")

        logging.info("-------------------END--------------------")
    except Usage, err:
        # assumed: standard Usage error handler; how the outer try block is closed
        # is not shown in the original excerpt
        print >>sys.stderr, err.msg
        print >>sys.stderr, USAGE
        return 2
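# A minimal entry-point sketch for running the scanning pipeline above as a standalone
# script (an assumption; this guard is not part of the original excerpt):
#
#     if __name__ == '__main__':
#         sys.exit(main())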
def main(argv=None):
    via = "lsf"
    limspath = None
    hts_key = ''
    working_dir = None
    config_file = None
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hu:k:d:w:c:",
                                       ["help", "via=", "key=", "minilims=",
                                        "working-directory=", "config="])
        except getopt.error, msg:
            raise Usage(msg)
        for o, a in opts:
            if o in ("-h", "--help"):
                print __doc__
                print usage
                return 0
            elif o in ("-u", "--via"):
                if a == "local":
                    via = "local"
                elif a == "lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,))
            elif o in ("-w", "--working-directory"):
                if os.path.exists(a):
                    os.chdir(a)
                    working_dir = a
                else:
                    raise Usage("Working directory '%s' does not exist." % a)
            elif o in ("-d", "--minilims"):
                limspath = a
            elif o in ("-k", "--key"):
                hts_key = a
            elif o in ("-c", "--config"):
                config_file = a
            else:
                raise Usage("Unhandled option: " + o)

        if not(limspath and os.path.exists(limspath)
               and (hts_key != '' or (config_file and os.path.exists(config_file)))):
            raise Usage("Need a minilims and a job key or a configuration file")

        M = MiniLIMS(limspath)
        if len(hts_key) > 1:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend(url=gl['hts_mapseq']['url'])
            job = htss.job(hts_key)
            [M.delete_execution(x) for x in M.search_executions(with_description=hts_key, fails=True)]
        elif os.path.exists(config_file):
            (job, gl) = frontend.parseConfig(config_file)
            hts_key = job.description
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")

        g_rep = genrep.GenRep(url=gl["genrep_url"], root=gl["bwt_root"],
                              intype=job.options.get('input_type_id') or 0)
        assembly = g_rep.assembly(job.assembly_id)
        if 'lims' in gl:
            dafl = dict((loc, daflims.DAFLIMS(username=gl['lims']['user'], password=pwd))
                        for loc, pwd in gl['lims']['passwd'].iteritems())
        else:
            dafl = None

        # normalize job options that may arrive as strings from the frontend
        if not('compute_densities' in job.options):
            job.options['compute_densities'] = True
        elif isinstance(job.options['compute_densities'], str):
            job.options['compute_densities'] = job.options['compute_densities'].lower() in ['1', 'true', 't']
        if not('ucsc_bigwig' in job.options):
            job.options['ucsc_bigwig'] = True
        elif isinstance(job.options['ucsc_bigwig'], str):
            job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'].lower() in ['1', 'true', 't']
        job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities']
        if not('create_gdv_project' in job.options):
            job.options['create_gdv_project'] = False
        elif isinstance(job.options['create_gdv_project'], str):
            job.options['create_gdv_project'] = job.options['create_gdv_project'].lower() in ['1', 'true', 't']
        if job.options.get('read_extension'):
            job.options['read_extension'] = int(job.options['read_extension'])
        if job.options.get('merge_strands'):
            job.options['merge_strands'] = int(job.options['merge_strands'])

        logfile = open(hts_key + ".log", 'w')
        with execution(M, description=hts_key, remote_working_directory=working_dir) as ex:
            logfile.write("Enter execution, fetch fastq files.\n"); logfile.flush()
            job = get_fastq_files(job, ex.working_directory, dafl)
            logfile.write("Map reads.\n"); logfile.flush()
            mapped_files = map_groups(ex, job, ex.working_directory, assembly, {'via': via})
            logfile.write("Make stats:\n"); logfile.flush()
            for k, v in job.groups.iteritems():
                logfile.write(str(k) + str(v['name']) + "\t"); logfile.flush()
                pdf = add_pdf_stats(ex, mapped_files, {k: v['name']},
                                    gl.get('script_path') or '',
                                    description=set_file_descr(v['name'] + "_mapping_report.pdf",
                                                               groupId=k, step='stats', type='pdf'))
            if job.options['compute_densities']:
                logfile.write("computing densities.\n"); logfile.flush()
                if not(job.options.get('read_extension') > 0):
                    job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length']
                density_files = densities_groups(ex, job, mapped_files, assembly.chromosomes, via=via)
                logfile.write("Finished computing densities.\n"); logfile.flush()
            if job.options['create_gdv_project']:
                logfile.write("Creating GDV project.\n"); logfile.flush()
                gdv_project = gdv.create_gdv_project(gl['gdv']['key'], gl['gdv']['email'],
                                                     job.description,
                                                     assembly.nr_assembly_id,
                                                     gdv_url=gl['gdv']['url'], public=True)
                logfile.write("GDV project: " + str(gdv_project['project_id']) + "\n"); logfile.flush()
                add_pickle(ex, gdv_project,
                           description=set_file_descr("gdv_json", step='gdv', type='py', view='admin'))

        allfiles = get_files(ex.id, M)
        if job.options['ucsc_bigwig'] and g_rep.intype == 0:
            # write UCSC track header lines for the files flagged 'ucsc'
            ucscfiles = get_files(ex.id, M, select_param={'ucsc': '1'})
            with open(hts_key + ".bed", 'w') as ucscbed:
                for ftype, fset in ucscfiles.iteritems():
                    for ffile, descr in fset.iteritems():
                        if re.search(r' \(.*\)', descr):
                            continue
                        ucscbed.write(track_header(descr, ftype, gl['hts_mapseq']['download'], ffile))
        if job.options['create_gdv_project']:
            allfiles['url'] = {gdv_project['public_url']: 'GDV view'}
            download_url = gl['hts_mapseq']['download']
            [gdv.add_gdv_track(gl['gdv']['key'], gl['gdv']['email'],
                               gdv_project['project_id'],
                               url=download_url + str(k),
                               name=re.sub(r'\.sql', '', str(f)),
                               gdv_url=gl['gdv']['url'])
             for k, f in allfiles['sql'].iteritems()]
        logfile.close()
        print json.dumps(allfiles)
        with open(hts_key + ".done", 'w') as done:
            json.dump(allfiles, done)
        if 'email' in gl:
            r = email.EmailReport(sender=gl['email']['sender'],
                                  to=str(job.email),
                                  subject="Mapseq job " + str(job.description),
                                  smtp_server=gl['email']['smtp'])
            r.appendBody('''
Your mapseq job has finished.

The description was:
''' + str(job.description) + '''
and its unique key is ''' + hts_key + '''.

You can now retrieve the results at this url:
''' + gl['hts_mapseq']['url'] + "jobs/" + hts_key + "/get_results")
            r.send()
        return 0
    except Usage, err:
        # assumed: standard Usage error handler; how the outer try block is closed
        # is not shown in the original excerpt
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2