Example 1
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, password):

    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=level)

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data  to {}".format(out))
    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_runs = erna.load(earliest_night, latest_night, data_dir, source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)

    logger.info("Would process {} jobs with {} runs per job".format(len(df_runs)//num_runs, num_runs))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df_runs, engine, queue, vmem, num_runs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
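
A note on the log-level handling: all of the examples on this page map the log_level option to a logging constant with the same if/elif chain. A minimal sketch of an equivalent lookup-table version (the parse_log_level helper is hypothetical, not part of erna):

import logging

# hypothetical helper, equivalent to the if/elif chains in the examples
LOG_LEVELS = {'DEBUG': logging.DEBUG, 'WARN': logging.WARN, 'INFO': logging.INFO}

def parse_log_level(log_level):
    # unknown values fall back to INFO, matching the behaviour of the chains above
    return LOG_LEVELS.get(log_level, logging.INFO)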
Example 2
def main(jar, xml, out, mc_path, queue, walltime, engine, num_jobs, vmem, log_level, port, local):
    '''
    Script to execute fact-tools on Monte Carlo files. Use the MC_PATH argument to specify the folders containing the MC files.
    '''
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=level)

    erna.ensure_output(out)
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    drspath = erna.mc_drs_file()
    logger.info('Using drs file at {}'.format(drspath))

    # get data files
    files = []
    for folder in tqdm(mc_path):
        # print("Entering folder {}".format(folder))
        pattern = path.join(folder, '**/*_Events.fit*')
        f = glob.glob(pattern, recursive=True)
        files = files + f

    num_files = len(files)
    logger.info("Found {} files.".format(num_files))
    if num_files == 1:
        logger.error("Need more than one file to work with.")
        return
    if num_jobs > num_files:
        logger.error("You specified more jobs than files. This doesn't make sense.")
        return

    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    mc_paths_array = np.array(files)
    drs_paths_array = np.repeat(np.array(drspath), len(mc_paths_array))

    job_list = make_jobs(jarpath, xmlpath, mc_paths_array, drs_paths_array, engine, queue, vmem, num_jobs, walltime)

    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out)
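
make_jobs itself is not shown in these examples; a common way to distribute the collected MC files over num_jobs gridmap jobs is np.array_split. A minimal sketch under that assumption (split_into_jobs is hypothetical):

import numpy as np

# hypothetical sketch: split the MC and DRS path arrays into num_jobs
# roughly equal chunks, one chunk per job
def split_into_jobs(mc_paths_array, drs_paths_array, num_jobs):
    mc_chunks = np.array_split(mc_paths_array, num_jobs)
    drs_chunks = np.array_split(drs_paths_array, num_jobs)
    return list(zip(mc_chunks, drs_chunks))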
Example 3
def main(
    earliest_night,
    latest_night,
    data_dir,
    jar,
    xml,
    db,
    out,
    queue,
    mail,
    walltime,
    engine,
    num_runs,
    qjobs,
    vmem,
    log_level,
    port,
    source,
    conditions,
    max_delta_t,
    local,
    password,
):

    level = logging.INFO
    if log_level is "DEBUG":
        level = logging.DEBUG
    elif log_level is "WARN":
        level = logging.WARN
    elif log_level is "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(levelname)s - " + "%(message)s"), level=level)

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    logger.info("Output data will be written to {}".format(out))

    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_loaded = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )
    df_loaded.to_hdf(out + ".tmp", "loaded", mode="a")

    logger.info("Processing {} jobs with {} runs per job.".format(int(len(df_loaded) / num_runs), num_runs))
    click.confirm("Do you want to continue processing and start jobs?", abort=True)

    # ensure that the maximum number of queueable jobs is not larger than the total number of jobs
    if qjobs > len(df_loaded):
        qjobs = len(df_loaded)

    nfinished = 0
    nsubmited = 1  # start at 1 so the submission loop below is entered at least once
    running_jobs = []
    pending_jobs = []
    last_finished = []
    jobids = []
    job_output_paths = []
    df_submitted = pd.DataFrame()

    # copy the dataframe with the loaded runs that still need to be submitted
    df_runs = df_loaded.copy()

    # run the submission loop as long as jobs still need to be submitted
    while nfinished < nsubmited:
        n_toqueue = qjobs - (len(pending_jobs) + len(running_jobs))
        logger.info("{} jobs to be queued".format(n_toqueue))

        if (n_toqueue > 0) and (len(df_runs) > 0):
            df_to_submit = df_runs.head(n_toqueue * num_runs).copy()
            processing_identifier = "{}_{}".format(source, time.strftime("%Y%m%d%H%M"))
            df_submitted_last = submit_qsub_jobs(
                processing_identifier,
                jarpath,
                xmlpath,
                db_path,
                df_to_submit,
                engine,
                queue,
                vmem,
                num_runs,
                walltime,
                db,
                mail,
            )
            df_submitted = df_submitted.append(df_submitted_last)

            # collect the job ids of all jobs submitted so far
            jobids = df_submitted["JOBID"].unique()
            df_runs = df_runs.drop(df_to_submit.index)
            nsubmited = len(jobids)
            logger.info("Submitted {} jobs in last bunch".format(len(df_submitted_last)))
            logger.info("Submitted {} jobs in total".format(nsubmited))

        finished_jobs = q.get_finished_jobs(jobids)
        running_jobs = q.get_running_jobs(jobids)
        pending_jobs = q.get_pending_jobs(jobids)

        nfinished = len(finished_jobs)
        logger.info(
            "Processing Status: running: {}, pending: {}, queued: {}, finished: {}/{}".format(
                len(running_jobs), len(pending_jobs), nsubmited - nfinished, nfinished, nsubmited
            )
        )

        # jobs that have finished since the last check
        last_finished = np.setdiff1d(finished_jobs, last_finished)

        if len(last_finished) > 0:
            last_paths = last_finished_out_paths(df_submitted, last_finished)
            job_output_paths = np.append(job_output_paths, last_paths)

        # remember all finished jobs for the next iteration
        last_finished = finished_jobs
        if nfinished < nsubmited:
            logger.info("Checking qstat in 5 min again")
            time.sleep(5 * 60)

    logger.info("All jobs have been finished, processing done")

    job_outputs = read_outputs_to_list(job_output_paths)
    erna.collect_output(job_outputs, out, df_started_runs=df_loaded)
    # erna.collect_output(job_output_paths, out)
    df_loaded.to_hdf(out, "loaded", mode="a")
    df_submitted.to_hdf(out, "jobinfo", mode="a")
    os.remove(out + ".tmp")
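
The final HDF5 file contains the "loaded" and "jobinfo" tables written above; a minimal sketch of reading them back with pandas (the file name is a placeholder for whatever was passed as out):

import pandas as pd

# read back the bookkeeping tables written at the end of the run;
# 'crab_analysis.hdf5' is a placeholder for the actual `out` path
df_loaded = pd.read_hdf('crab_analysis.hdf5', key='loaded')
df_jobinfo = pd.read_hdf('crab_analysis.hdf5', key='jobinfo')
print(len(df_loaded), 'runs loaded,', len(df_jobinfo), 'submitted job entries')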