Example #1
0
def main(earliest_night, latest_night, data_dir, source,  max_delta_t, parts, password, conditions):
    '''  This script connects to the rundb and fetches all runs belonging to the specified source.
        Provide the time range by specifying the earliest and latest night to fetch, as in 20131001, 20141001.
        This script will produce a json file containing the paths to the data files and their drs files. The
        path prefix is specified by the DATA_DIR argument. The files in this folder will not actually be read
        by this script; it simply needs the path to construct the json file containing the full paths to the raw data files.
    '''

    logging.basicConfig(level=logging.INFO)

    factdb = create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))

    data_conditions = dcc.conditions[conditions]
    mapping = erna.load(earliest_night, latest_night, data_dir,  source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)
    if mapping.empty:
        logger.error('No entries matching the conditions could be found in the RunDB')
        return


    if parts > 1:
        split_indices = np.array_split(np.arange(len(mapping)), parts)
        for num, indices in enumerate(split_indices):
            # select exactly the rows that belong to this part
            df = mapping.iloc[indices]
            filename = "{}_{}_{}_part_{}.json".format(earliest_night, latest_night, source.replace(' ', '_'), num)
            logger.info("Writing {} entries to json file {}".format(len(df), filename))
            df.to_json(filename, orient='records', date_format='epoch')
    else:
        filename = "{}_{}_{}.json".format(earliest_night, latest_night, source.replace(' ', '_'))
        logger.info("Writing list to json file {}".format(filename))
        mapping.to_json(filename, orient='records', date_format='epoch')
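The json files written above can later be read back with pandas to recover the run mapping. A minimal sketch, assuming one of the part files produced by this script (the filename and the Crab source name are only illustrative):

import pandas as pd

# hypothetical part file following the naming pattern used above
filename = "20131001_20141001_Crab_part_0.json"

# the files are written with orient='records', so read them back the same way
mapping = pd.read_json(filename, orient='records')
print("{} runs in this part".format(len(mapping)))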
Example #2
0
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, password):

    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=level)

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data to {}".format(out))
    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_runs = erna.load(earliest_night, latest_night, data_dir, source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)

    logger.info("Would process {} jobs with {} runs per job".format(len(df_runs)//num_runs, num_runs))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df_runs,  engine, queue, vmem, num_runs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
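make_jobs is called above but not shown in this example. The sketch below illustrates one way it might chunk the run list and wrap each chunk in a gridmap job; the worker function run_facttools and the exact keyword arguments passed to gridmap.Job are assumptions, so treat this as an illustration rather than the actual erna implementation.

import numpy as np
import gridmap


def run_facttools(df_chunk, jar, xml, db_path, output_directory):
    # hypothetical worker executed on the cluster node; here it only reports
    # which runs it would have processed
    return df_chunk.to_dict(orient='records')


def make_jobs(jarpath, xmlpath, db_path, output_directory, df_runs, engine, queue, vmem, num_runs, walltime):
    jobs = []
    # split the run list into chunks of roughly num_runs runs each
    n_chunks = max(len(df_runs) // num_runs, 1)
    for indices in np.array_split(np.arange(len(df_runs)), n_chunks):
        df_chunk = df_runs.iloc[indices]
        jobs.append(
            gridmap.Job(
                run_facttools,
                [df_chunk, jarpath, xmlpath, db_path, output_directory],
                queue=queue,
                mem_free='{}mb'.format(vmem),
            )
        )
    return jobs

The resulting list can then be handed to gridmap.process_jobs as shown above.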
Example #3
0
def main(
    earliest_night,
    latest_night,
    data_dir,
    jar,
    xml,
    db,
    out,
    queue,
    mail,
    walltime,
    engine,
    num_runs,
    qjobs,
    vmem,
    log_level,
    port,
    source,
    conditions,
    max_delta_t,
    local,
    password,
):

    level = logging.INFO
    if log_level == "DEBUG":
        level = logging.DEBUG
    elif log_level == "WARN":
        level = logging.WARN
    elif log_level == "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(levelname)s - " + "%(message)s"), level=level)

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    logger.info("Output data will be written to {}".format(out))

    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_loaded = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )
    df_loaded.to_hdf(out + ".tmp", "loaded", mode="a")

    logger.info("Processing {} jobs with {} runs per job.".format(int(len(df_loaded) / num_runs), num_runs))
    click.confirm("Do you want to continue processing and start jobs?", abort=True)

    # make sure the maximum number of queueable jobs does not exceed the total number of jobs
    if qjobs > len(df_loaded):
        qjobs = len(df_loaded)

    nfinished = 0
    nsubmited = 1
    running_jobs = []
    pending_jobs = []
    last_finished = []
    jobids = []
    job_output_paths = []
    df_submitted = pd.DataFrame()

    # copy the dataframe with the loaded runs that still have to be submitted
    df_runs = df_loaded.copy()

    # submission loop: keep going until all submitted jobs have finished
    while nfinished < nsubmited:
        n_toqueue = qjobs - (len(pending_jobs) + len(running_jobs))
        logger.info("{} jobs to be queued".format(n_toqueue))

        if (n_toqueue > 0) and (len(df_runs) > 0):
            df_to_submit = df_runs.head(n_toqueue * num_runs).copy()
            processing_identifier = "{}_{}".format(source, time.strftime("%Y%m%d%H%M"))
            df_submitted_last = submit_qsub_jobs(
                processing_identifier,
                jarpath,
                xmlpath,
                db_path,
                df_to_submit,
                engine,
                queue,
                vmem,
                num_runs,
                walltime,
                db,
                mail,
            )
            df_submitted = df_submitted.append(df_submitted_last)

            # collect the job ids of all submitted jobs
            jobids = df_submitted["JOBID"].unique()
            df_runs = df_runs.drop(df_to_submit.index)
            nsubmited = len(jobids)
            logger.info("Submitted {} jobs in last bunch".format(len(df_submitted_last)))
            logger.info("Submitted {} jobs in total".format(nsubmited))

        finished_jobs = q.get_finished_jobs(jobids)
        running_jobs = q.get_running_jobs(jobids)
        pending_jobs = q.get_pending_jobs(jobids)

        nfinished = len(finished_jobs)
        logger.info(
            "Processing Status: running: {}, pending: {}, queued: {}, finished: {}/{}".format(
                len(running_jobs), len(pending_jobs), nsubmited - nfinished, nfinished, nsubmited
            )
        )

        last_finished = np.setdiff1d(finished_jobs, last_finished)

        if len(last_finished) > 0:
            last_paths = last_finished_out_paths(df_submitted, last_finished)
            job_output_paths = np.append(job_output_paths, last_paths)

        last_finished = finished_jobs
        if nfinished < nsubmited:
            logger.info("Checking qstat in 5 min again")
            time.sleep(5 * 60)

    logger.info("All jobs have been finished, processing done")

    job_outputs = read_outputs_to_list(job_output_paths)
    erna.collect_output(job_outputs, out, df_started_runs=df_loaded)
    # erna.collect_output(job_output_paths, out)
    df_loaded.to_hdf(out, "loaded", mode="a")
    df_submitted.to_hdf(out, "jobinfo", mode="a")
    os.remove(out + ".tmp")
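Once the script has finished, the output HDF5 file contains at least the 'loaded' and 'jobinfo' keys written above, in addition to whatever erna.collect_output stores. A minimal sketch of inspecting it afterwards, with a purely illustrative output path:

import pandas as pd

out = "crab_20131001_20141001.hdf5"  # hypothetical output path passed as OUT

# the run list fetched from the RunDB
df_loaded = pd.read_hdf(out, "loaded")

# per-run submission info, including the JOBID column used for polling qstat
df_jobinfo = pd.read_hdf(out, "jobinfo")

print("{} runs loaded, {} submitted entries".format(len(df_loaded), len(df_jobinfo)))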