def main(earliest_night, latest_night, data_dir, source,
         max_delta_t, parts, password, conditions):
    '''
    This script connects to the rundb and fetches all runs belonging to the
    specified source. Provide the time range by specifying the earliest and
    latest night to fetch, e.g. 20131001, 20141001.
    This script will produce a json file containing paths to the data files
    and their drs files. The path prefix is specified by the DATA_DIR
    argument. The files in this folder will not actually be read by this
    script. It simply needs the path to construct the json file containing
    the full paths to the raw data files.
    '''
    logging.basicConfig(level=logging.INFO)

    factdb = create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password)
    )
    data_conditions = dcc.conditions[conditions]

    mapping = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )

    if mapping.empty:
        logger.error('No entries matching the conditions could be found in the RunDB')
        return

    if parts > 1:
        split_indices = np.array_split(np.arange(len(mapping)), parts)
        for num, indices in enumerate(split_indices):
            # select the rows belonging to this part; iloc keeps the last run
            # of each chunk, which a min():max() slice would drop
            df = mapping.iloc[indices]
            filename = "{}_{}_{}_part_{}.json".format(
                earliest_night, latest_night, source.replace(' ', '_'), num
            )
            logger.info("Writing {} entries to json file {}".format(len(df), filename))
            df.to_json(filename, orient='records', date_format='epoch')
    else:
        filename = "{}_{}_{}.json".format(earliest_night, latest_night, source)
        logger.info("Writing list to json file {}".format(filename))
        mapping.to_json(filename, orient='records', date_format='epoch')
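# --- Illustrative sketch, not part of the original script --------------------
# The json files written above use orient='records' and epoch timestamps, so a
# downstream step can read them back into a DataFrame. The filename used here
# is a hypothetical example following the naming pattern produced above.
import pandas as pd


def read_run_mapping(filename="20131001_20141001_Crab_part_0.json"):
    # load the list of raw data / drs file paths written by main()
    return pd.read_json(filename, orient='records')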
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue,
         walltime, engine, num_runs, vmem, log_level, port, source,
         conditions, max_delta_t, local, password):
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create the output directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data to {}".format(out))

    factdb = sqlalchemy.create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password)
    )
    data_conditions = dcc.conditions[conditions]

    df_runs = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )

    logger.info("Would process {} jobs with {} runs per job".format(
        len(df_runs) // num_runs, num_runs
    ))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    job_list = make_jobs(
        jarpath, xmlpath, db_path, output_directory, df_runs,
        engine, queue, vmem, num_runs, walltime
    )
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue,
         mail, walltime, engine, num_runs, qjobs, vmem, log_level, port,
         source, conditions, max_delta_t, local, password):
    level = logging.INFO
    if log_level == "DEBUG":
        level = logging.DEBUG
    elif log_level == "WARN":
        level = logging.WARN
    elif log_level == "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=level,
    )

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    logger.info("Output data will be written to {}".format(out))

    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create the output directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)

    factdb = sqlalchemy.create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password)
    )
    data_conditions = dcc.conditions[conditions]

    df_loaded = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )
    df_loaded.to_hdf(out + ".tmp", "loaded", mode="a")

    logger.info("Processing {} jobs with {} runs per job.".format(
        len(df_loaded) // num_runs, num_runs
    ))
    click.confirm("Do you want to continue processing and start jobs?", abort=True)

    # ensure that the max number of queueable jobs is not larger than the total number of jobs
    if qjobs > len(df_loaded):
        qjobs = len(df_loaded)

    nfinished = 0
    nsubmited = 1
    running_jobs = []
    pending_jobs = []
    last_finished = []
    jobids = []
    job_output_paths = []
    df_submitted = pd.DataFrame()

    # copy the dataframe with the loaded runs that still have to be submitted
    df_runs = df_loaded.copy()

    # submission loop: keep going as long as not all submitted jobs have finished
    while nfinished < nsubmited:
        n_toqueue = qjobs - (len(pending_jobs) + len(running_jobs))
        logger.info("{} jobs to be queued".format(n_toqueue))

        if (n_toqueue > 0) and (len(df_runs) > 0):
            df_to_submit = df_runs.head(n_toqueue * num_runs).copy()
            processing_identifier = "{}_{}".format(source, time.strftime("%Y%m%d%H%M"))
            df_submitted_last = submit_qsub_jobs(
                processing_identifier,
                jarpath,
                xmlpath,
                db_path,
                df_to_submit,
                engine,
                queue,
                vmem,
                num_runs,
                walltime,
                db,
                mail,
            )
            # append the freshly submitted jobs (pd.concat replaces the
            # DataFrame.append method removed in pandas 2.0)
            df_submitted = pd.concat([df_submitted, df_submitted_last])
            jobids = df_submitted["JOBID"].unique()
            df_runs = df_runs.drop(df_to_submit.index)
            nsubmited = len(jobids)

            logger.info("Submitted {} jobs in last bunch".format(len(df_submitted_last)))
            logger.info("Submitted {} jobs in total".format(nsubmited))

        finished_jobs = q.get_finished_jobs(jobids)
        running_jobs = q.get_running_jobs(jobids)
        pending_jobs = q.get_pending_jobs(jobids)
        nfinished = len(finished_jobs)

        logger.info(
            "Processing Status: running: {}, pending: {}, queued: {}, finished: {}/{}".format(
                len(running_jobs), len(pending_jobs),
                nsubmited - nfinished, nfinished, nsubmited,
            )
        )

        # collect the output paths of the jobs that finished since the last check
        last_finished = np.setdiff1d(finished_jobs, last_finished)
        if len(last_finished) > 0:
            last_paths = last_finished_out_paths(df_submitted, last_finished)
            job_output_paths = np.append(job_output_paths, last_paths)
        last_finished = finished_jobs

        if nfinished < nsubmited:
            logger.info("Checking qstat in 5 min again")
            time.sleep(5 * 60)

    logger.info("All jobs have been finished, processing done")

    job_outputs = read_outputs_to_list(job_output_paths)
    erna.collect_output(job_outputs, out, df_started_runs=df_loaded)

    df_loaded.to_hdf(out, "loaded", mode="a")
    df_submitted.to_hdf(out, "jobinfo", mode="a")
    os.remove(out + ".tmp")