Exemple #1
0
def main(argv = None):
    """
    Entry point when program start
    """
    genrep              = None
    assembly            = None
    lims                = None
    job                 = None
    config              = None
    config_file         = None
    background          = ""
    matrix              = ""
    original_sql_data   = ""
    random_sql_data     = ""
    track_filtered      = ""
    track_scanned       = ""
    project             = ""
    username            = ""
    identity_file       = ""
    host                = ""
    website             = ""
    remote_path         = ""
    result_path         = ""
    track_regions_path  = ""
    via                 = ""
    limspath            = ""
    fdr                 = 0
    runs                = {}
    logging.basicConfig(filename='run_scanning.log',level=logging.INFO)
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt   (
                                            argv[1:],"hu:c:"  ,
                                            [
                                                "help", "via = ", "host = "     ,
                                                "remote_path = " , "website = " ,
                                                "minilims = ","config = "       ,
                                                "matrix = ", "username = "******"identity_file = ", "project = "
                                            ]
                                        )
        except getopt.error, msg:
            raise Usage(msg)
        for option, value in opts:
            if option in ("-h", "--help"):
                print __doc__
                print USAGE
                sys.exit(0)
            elif option == "--via":
                if value == "local":
                    via = "local"
                elif value == "lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (value,))
            elif option == "--website":
                website = normalize_url(value)
            elif option == "--minilims":
                limspath = normcase(expanduser(value))
            elif option == "--host":
                host = value
            elif option == "--identity_file":
                identity_file = value
            elif option == "--remote_path":
                remote_path = normcase(expanduser(value))
                if not remote_path.endswith(sep):
                    remote_path += sep
            elif option == "--matrix":
                matrix = {basename(value):normcase(expanduser(value))}
            elif option == "--username":
                username = value
            elif option == "--project":
                project = value
            elif option in ("-c", "--config"):
                config_file = normcase(expanduser(value))
            else:
                raise Usage("Unhandled option: " + option)

        # read config file
        if config_file is None or not exists(config_file) or not isfile(config_file):
            raise Usage("Config file missing")
        else:
            job, config = parseConfig(normcase(expanduser(config_file)))

        if project == "":
            project = job.description
        if matrix == "":
            if "matrix" in job.options:
                path = normcase(expanduser(job.options["matrix"]))
                matrix = {basename(path): path}
            else:
                raise Usage("You need give value matrix file ")
        if limspath == "":
            if "minilims" in job.options:
                limspath = job.options["minilims"]
            else:
                raise Usage("You need give value minilims path/name")
        if via == "":
            if "via" in job.options:
                via = job.options["via"]
            else:
                via = "lsf"
        if host == "" and "host" in job.options:
            host = job.options["host"]
        if identity_file == "" and "identity_file" in job.options:
            identity_file = job.options["identity_file"]
        if remote_path == "" and "remote_path" in job.options:
            remote_path = job.options["remote_path"]
        if username == "" and "username" in job.options:
            username = job.options["username"]
        if website == "" and "website" in job.options:
            website = job.options["website"]

        genrep      = GenRep(config = config)
        assembly    = genrep.assembly(job.assembly_id)
        lims        = MiniLIMS(limspath)
        json        = create_gdv_project(
                                            config["gdv"]["key"], config["gdv"]["email"],
                                            project,
                                            assembly.nr_assembly_id,
                                            config["gdv"]["url"],
                                            public = True
                                        )
        project_id  = get_project_id( json )
        # compute false discovery rate
        with execution(lims, description = job.description) as ex:
            background = genrep.statistics  (
                                                assembly,
                                                output = unique_filename_in(),
                                                frequency = True,
                                                matrix_format = True
                                            )
            if len(job.groups) >2:
                raise ValueError("They are more than 2 group in config file")

            for group_number in job.groups:
                group = job.groups[group_number]
                for run_number in group["runs"]:
                    run_iter = job.groups[group_number]["runs"][run_number]
                    if "url" in run_iter:
                        url = run_iter["url"]
                        uri = ""
                        if run_iter["run"] not in runs:
                            runs[run_iter["run"]] = {"name":None, "control":None, "experimental":None}
                        if url.startswith("http") or url.startswith("www."):
                            url = normalize_url(url)
                            # download data
                            data    = urllib2.urlopen(url)
                            uri     = unique_filename_in()
                            with open(uri, "w") as opening_file:
                                opening_file.write(data.read())
                        else:
                            uri = normcase(expanduser(url))
                        if group["control"]:
                            runs[run_iter["run"]]["control"]   = uri
                            runs[run_iter["run"]]["name"]      = basename(uri)
                        else:
                            runs[run_iter["run"]]["experimental"] = uri

            for run in runs:
                current_run         = runs[run]
                original_sql_data   = unique_filename_in()
                random_sql_data     = unique_filename_in()
                track_filtered      = unique_filename_in()
                logging.info( "[%s]" % job.description )
                logging.info( "alias %s => %s" % (current_run["experimental"], track_filtered) )

                # convert data to sql
                with Track(current_run["experimental"], chrmeta = assembly.chromosomes) as track:
                    # Get sqlite file if is not arleady in this format
                    if track.format != "sql" or track.format != "db" or track.format != "sqlite":
                        track.convert(original_sql_data, format = "sql")
                    else:
                        original_sql_data = current_run["experimental"]
                    # Generate a random population from orginal if it is not give from config file
                    if current_run["control"] is None:
                        # create random track
                        track.shuffle_track(random_sql_data, repeat_number = 5)
                    else:
                        with Track(current_run["control"], chrmeta = assembly.chromosomes) as track_random:
                            # Get sqlite file if is not arleady in this format
                            if track_random.format != "sql" or \
                                track_random.format != "db" or \
                                track_random.format != "sqlite":
                                track_random.convert(random_sql_data, format = "sql")
                            else:
                                random_sql_data = current_run["control"]
                track_scanned, fdr, p_value = sqlite_to_false_discovery_rate(
                                                                                ex,
                                                                                matrix,
                                                                                background,
                                                                                genrep,
                                                                                assembly.chromosomes,
                                                                                original_sql_data,
                                                                                random_sql_data,
                                                                                threshold = -100,
                                                                                via = via,
                                                                                keep_max_only = False,
                                                                                alpha = 0.05,
                                                                                nb_sample = 5.0
                                                                            )

                # filter track with fdr as treshold
                with new(track_filtered, format = "sql", datatype = "qualitative") as track_out:
                    chromosome_used     = {}
                    track_out.meta_track = {"source": basename(current_run["experimental"])}
                    track_out.meta_track.update({"k":"v"})
                    with Track(track_scanned, format = "sql", chrmeta = assembly.chromosomes) as track_in:
                        meta = dict([(v["name"], dict([("length", v["length"])])) for v in track_in.chrmeta.values()])
                        for chromosome in track_in.all_chrs:
                            data_list = []
                            for data in track_in.read   (
                                                            {"chr": chromosome, "score": (fdr, sys.maxsize)},
                                                            fields = Track.qualitative_fields
                                                        ):
                                data_list.append(data)
                                chromosome_used[chromosome] = meta[chromosome]
                            if len(data_list) > 0:
                                track_out.write(chromosome, data_list)
                        track_out.chrmeta = chromosome_used
                ex.add(track_filtered,      "sql: filtred %s" % track_filtered)
                logging.info( "scanned: %s" % track_scanned )
                logging.info( "score selected: %f with p: %.3f" % (fdr, p_value) )
                logging.info( "filtred: %s" % track_filtered )


                # fix track
                track_scanned_signal = fix_sqlite_db(track_scanned)
                logging.info( "scanned signal: %s" % track_scanned_signal )
                ex.add(track_scanned_signal, description="%s: sql track signal %s" % (job.description, track_scanned_signal))

                # send filtred track and scanned track to remote
                if host != "" and remote_path != "" and username != "":
                    args = []
                    if identity_file != "":
                        args = ["-i", normcase(expanduser(identity_file)), "-C" ]
                    source_filtred      = normcase(expanduser(track_filtered))
                    source_scanned      = normcase(expanduser(track_scanned_signal))
                    result_destination          = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered)
                    result_path                 = "%s%s%s.db" % (website, sep, track_filtered)
                    track_regions_destination   = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal)
                    track_regions_path          = "%s%s%s.db" % (website, sep, track_scanned_signal)
                    scp(ex, source_filtred, result_destination, args = args)
                    scp(ex, source_scanned, track_regions_destination, args = args)
                else:
                    result_path = track_filtered
                # Send to GDV filtred track
                add_gdv_track  (
                                    config["gdv"]["key"], config["gdv"]["email"],
                                    project_id, result_path,
                                    name    = "filtred_%s" % (splitext( basename( current_run["experimental"] ) )[0]),
                                    gdv_url = config["gdv"]["url"]
                                )
                # Send to GDV scanned track
                add_gdv_track  (
                                    config["gdv"]["key"], config["gdv"]["email"],
                                    project_id, track_regions_path,
                                    name    = "regions_%s" % (splitext( basename( current_run["experimental"] ) )[0]),
                                    gdv_url = config["gdv"]["url"]
                                )
                logging.info( "++++++++++++")
            logging.info( "-------------------END--------------------")
Exemple #2
0
def main(argv = None):
    """
    Entry point of the mapseq script (Python 2 code).

    Parses the command line, loads the job either from the frontend (job key,
    -k) or from a configuration file (-c), then inside a MiniLIMS execution
    fetches the fastq files, maps the groups, adds the pdf statistics and,
    depending on the job options, computes densities and creates a GDV
    project.  Afterwards it writes "<key>.bed" for the files selected with
    ucsc='1', adds the sql files as GDV tracks, prints the file listing as
    JSON and dumps it to "<key>.done", and sends an e-mail report when the
    global settings contain an 'email' entry.

    argv -- argument list with the program name first; defaults to sys.argv.

    Returns 0 on success (and after printing the help).
    Usage/ValueError are raised for invalid or missing inputs.
    NOTE(review): they are raised inside the outer try, whose handlers are not
    visible in this excerpt.
    """
    via = "lsf"
    limspath = None
    hts_key = ''
    working_dir = None
    config_file = None
    # Stays None unless a GDV project is actually created below.
    gdv_project = None
    if argv is None:
        argv = sys.argv
    try:
        try:
            # Parse the argv that was passed in, not always sys.argv.
            opts,args = getopt.getopt(argv[1:],"hu:k:d:w:c:",
                                      ["help","via=","key=","minilims=",
                                       "working-directory=","config="])
        except getopt.error, msg:
            raise Usage(msg)
        for o, a in opts:
            if o in ("-h", "--help"):
                print __doc__
                print usage
                return 0
            elif o in ("-u", "--via"):
                if a not in ("local", "lsf"):
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,))
                via = a
            elif o in ("-w", "--working-directory"):
                if os.path.exists(a):
                    os.chdir(a)
                    working_dir = a
                else:
                    raise Usage("Working directory '%s' does not exist." % a)
            elif o in ("-d", "--minilims"):
                limspath = a
            elif o in ("-k", "--key"):
                hts_key = a
            elif o in ("-c", "--config"):
                config_file = a
            else:
                raise Usage("Unhandled option: " + o)
        # hts_key defaults to '' so it must be tested for emptiness (the old
        # "!= None" test was always true), and config_file may be None so it
        # is checked before being handed to os.path.exists.
        has_config = bool(config_file) and os.path.exists(config_file)
        if not(limspath and os.path.exists(limspath) and (hts_key or has_config)):
            raise Usage("Need a minilims and a job key or a configuration file")
        M = MiniLIMS( limspath )
        if len(hts_key)>1:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=gl['hts_mapseq']['url'] )
            job = htss.job( hts_key )
            # Remove earlier failed executions recorded under the same key.
            for x in M.search_executions(with_description=hts_key,fails=True):
                M.delete_execution(x)
        elif has_config:
            (job,gl) = frontend.parseConfig( config_file )
            hts_key = job.description
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")
        g_rep = genrep.GenRep( url=gl["genrep_url"], root=gl["bwt_root"],
                               intype=job.options.get('input_type_id') or 0 )
        assembly = g_rep.assembly( job.assembly_id )
        if 'lims' in gl:
            dafl = dict((loc,daflims.DAFLIMS( username=gl['lims']['user'], password=pwd ))
                        for loc,pwd in gl['lims']['passwd'].iteritems())
        else:
            dafl = None
        # Boolean job options: apply the default when absent, parse strings.
        for opt_name, opt_default in (('compute_densities', True),
                                      ('ucsc_bigwig', True),
                                      ('create_gdv_project', False)):
            if not(opt_name in job.options):
                job.options[opt_name] = opt_default
            elif isinstance(job.options[opt_name],str):
                job.options[opt_name] = job.options[opt_name].lower() in ['1','true','t']
        job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities']
        if job.options.get('read_extension'):
            job.options['read_extension'] = int(job.options['read_extension'])
        if job.options.get('merge_strands'):
            job.options['merge_strands'] = int(job.options['merge_strands'])
        # The with statement closes the log file even when a step raises.
        with open(hts_key+".log",'w') as logfile:
            with execution( M, description=hts_key, remote_working_directory=working_dir ) as ex:
                logfile.write("Enter execution, fetch fastq files.\n");logfile.flush()
                job = get_fastq_files( job, ex.working_directory, dafl )
                logfile.write("Map reads.\n");logfile.flush()
                mapped_files = map_groups( ex, job, ex.working_directory, assembly, {'via': via} )
                logfile.write("Make stats:\n");logfile.flush()
                for k,v in job.groups.iteritems():
                    logfile.write(str(k)+str(v['name'])+"\t");logfile.flush()
                    add_pdf_stats( ex, mapped_files,
                                   {k:v['name']},
                                   gl.get('script_path') or '',
                                   description=set_file_descr(v['name']+"_mapping_report.pdf",groupId=k,step='stats',type='pdf') )
                if job.options['compute_densities']:
                    logfile.write("computing densities.\n");logfile.flush()
                    # No positive read extension configured: use the read
                    # length found in the stats of the first mapped file.
                    if not(job.options.get('read_extension')>0):
                        job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length']
                    density_files = densities_groups( ex, job, mapped_files, assembly.chromosomes, via=via )
                    logfile.write("Finished computing densities.\n");logfile.flush()
                    if job.options['create_gdv_project']:
                        logfile.write("Creating GDV project.\n");logfile.flush()
                        gdv_project = gdv.create_gdv_project( gl['gdv']['key'], gl['gdv']['email'],
                                                              job.description,
                                                              assembly.nr_assembly_id,
                                                              gdv_url=gl['gdv']['url'], public=True )
                        # str() wraps only the id; the newline is appended after.
                        logfile.write("GDV project: "+str(gdv_project['project_id'])+"\n");logfile.flush()
                        add_pickle( ex, gdv_project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') )
            allfiles = get_files( ex.id, M )
            # Test the job option itself; the bare string literal used before
            # was always truthy.
            if job.options['ucsc_bigwig'] and g_rep.intype == 0:
                ucscfiles = get_files( ex.id, M, select_param={'ucsc':'1'} )
                with open(hts_key+".bed",'w') as ucscbed:
                    for ftype,fset in ucscfiles.iteritems():
                        for ffile,descr in fset.iteritems():
                            if re.search(r' \(.*\)',descr): continue
                            ucscbed.write(track_header(descr,ftype,gl['hts_mapseq']['download'],ffile))
            # gdv_project exists only when densities were computed as well.
            if job.options['create_gdv_project'] and gdv_project is not None:
                allfiles['url'] = {gdv_project['public_url']: 'GDV view'}
                download_url = gl['hts_mapseq']['download']
                for k,f in allfiles['sql'].iteritems():
                    gdv.add_gdv_track( gl['gdv']['key'], gl['gdv']['email'],
                                       gdv_project['project_id'],
                                       url=download_url+str(k),
                                       name = re.sub(r'\.sql','',str(f)),
                                       gdv_url=gl['gdv']['url'] )
        print json.dumps(allfiles)
        with open(hts_key+".done",'w') as done:
            json.dump(allfiles,done)
        if 'email' in gl:
            r = email.EmailReport( sender=gl['email']['sender'],
                                   to=str(job.email),
                                   subject="Mapseq job "+str(job.description),
                                   smtp_server=gl['email']['smtp'] )
            r.appendBody('''
Your mapseq job has finished.

The description was:
'''+str(job.description)+'''
and its unique key is '''+hts_key+'''.

You can now retrieve the results at this url:
'''+gl['hts_mapseq']['url']+"jobs/"+hts_key+"/get_results")
            r.send()
        return 0