def get_simulations_from_big_experiments(experiment_id):
    # Retrieve the simulations of a very large experiment by querying in date windows,
    # since COMPS caps the size of a single query result.
    e = get_experiment_by_id(experiment_id)
    start_date = end_date = e.date_created
    import pytz
    limit_date = datetime.today().replace(tzinfo=pytz.utc)
    interval = 60
    stop_flag = False
    results = {}
    while start_date < limit_date:
        start_date = end_date + timedelta(minutes=interval)
        try:
            batch = Simulation.get(query_criteria=QueryCriteria()
                                   .select(['id', 'state', 'date_created']).select_children('tags')
                                   .where(["experiment_id={}".format(experiment_id),
                                           "date_created>={}".format(end_date.strftime('%Y-%m-%d %T')),
                                           "date_created<={}".format(start_date.strftime('%Y-%m-%d %T'))]))
        except Exception:
            # The query failed (most likely too many results): halve the window and retry
            interval /= 2
            continue

        if not batch:
            # Empty window: widen it once more; two empty windows in a row means we are done
            if stop_flag:
                break
            else:
                interval = 120
                stop_flag = True
        else:
            stop_flag = False
            for s in batch:
                results[s.id] = s
        end_date = start_date

    return results.values()
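# Usage sketch (not part of the original source): count the simulations of a very large
# experiment by state. Assumes COMPS_login(...) has already been called; the experiment
# id default is a hypothetical placeholder.
def example_count_big_experiment_states(experiment_id='00000000-0000-0000-0000-000000000000'):
    from collections import Counter
    sims = get_simulations_from_big_experiments(experiment_id)
    return Counter(str(s.state) for s in sims)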
def get_asset_collection_by_id(collection_id, query_criteria=None):
    query_criteria = query_criteria or QueryCriteria().select_children('assets')
    try:
        return AssetCollection.get(collection_id, query_criteria)
    except (RuntimeError, ValueError):
        return None
def sims_from_suite_id(suite_id):
    exps = Experiment.get(query_criteria=QueryCriteria().where('suite_id=%s' % suite_id))
    sims = []
    for e in exps:
        sims += sims_from_experiment(e)
    return sims
def get_all_experiments_for_user(user):
    # COMPS limits retrieval to 1000 results, so to make sure we get all experiments
    # for a given user we need to be clever.
    # COMPS also has no order_by, so we have to go through all date ranges instead.
    interval = 365
    results = {}
    end_date = start_date = datetime.today()
    limit_date = datetime.strptime("2014-03-31", '%Y-%m-%d')  # Oldest simulation in COMPS

    while start_date > limit_date:
        start_date = end_date - timedelta(days=interval)
        batch = Experiment.get(query_criteria=QueryCriteria().where(
            ["owner={}".format(user),
             "date_created<={}".format(end_date.strftime('%Y-%m-%d')),
             "date_created>={}".format(start_date.strftime('%Y-%m-%d'))]))

        if len(batch) == 1000:
            # We hit the limit: reduce the interval and run again
            interval = interval / 2
            continue

        if len(batch) == 0:
            interval *= 2
        else:
            # Add the experiments to the dict
            for e in batch:
                results[e.id] = e

        # Go from there
        end_date = start_date

    return results.values()
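# Usage sketch (not from the original source): print the id and name of every experiment
# owned by a user; 'jdoe' is a hypothetical username and COMPS_login(...) is assumed to
# have been called already.
def example_list_user_experiments(user='jdoe'):
    for exp in get_all_experiments_for_user(user):
        print('{} : {}'.format(exp.id, exp.name))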
def workdirs_from_suite_id(suite_id):
    # print('Simulation working directories for SuiteId = %s' % suite_id)
    s = Suite.get(suite_id)
    exps = s.get_experiments(QueryCriteria().select('id'))
    sims = []
    for e in exps:
        sims.extend(sims_from_experiment(e))
    return workdirs_from_simulations(sims)
def query_simulation(sid, criteria=None, children=None):
    from COMPS.Data import Simulation
    from COMPS.Data import QueryCriteria

    if children:
        criteria = criteria or QueryCriteria()
        criteria.select_children(children)

    return Simulation.get(sid, query_criteria=criteria)
def get_asset_files_for_simulation_id(sim_id, paths, output_directory=None, flatten=False, remove_prefix=None):
    """
    Obtains AssetManager-contained files from a given simulation.

    :param sim_id: A simulation id to retrieve files from
    :param paths: File paths relative to the Assets folder
    :param remove_prefix: If a prefix is given, remove it from the paths
    :param output_directory: Write requested files into this directory if specified
    :param flatten: If True, all the files will be written to the root of output_directory.
        If False, the directory structure will be kept.
        NOTE: the current implementation always writes files flat (see below).
    :return: Dictionary associating file name and content
    """
    # Get the collection_id from the simulation
    collection_id = get_asset_collection_id_for_simulation_id(sim_id=sim_id)

    # Retrieve the asset collection
    query_criteria = QueryCriteria().select_children('assets')
    asset_collection = AssetCollection.get(id=collection_id, query_criteria=query_criteria)

    # Return dictionary
    ret = {}

    # For each requested path, get the file content
    for rpath in paths:
        if remove_prefix and rpath.startswith(remove_prefix):
            path = rpath[len(remove_prefix):]
        else:
            path = rpath

        # Retrieve the relative_path and the file_name for the given path
        relative_path, file_name = os.path.split(path)
        relative_path = relative_path.strip('\\').strip('/')

        # Look for the asset file in the collection
        af = None
        for asset_file in asset_collection.assets:
            if asset_file.file_name == file_name and (asset_file.relative_path or '') == relative_path:
                af = asset_file
                break

        # We did not find the asset in the collection -> error
        if af is None:
            raise Exception('Asset not found:\n%s %s \n%s'
                            % (relative_path, file_name,
                               pretty_display_assets_from_collection(asset_collection.assets)))

        # Retrieve the file
        result = af.retrieve()

        # Write the file - result is written as output_directory/file_name (with no pathing)
        if output_directory:
            output_file = os.path.normpath(os.path.join(output_directory, os.path.split(path)[1]))
            dirname = os.path.dirname(output_file)
            os.makedirs(dirname, exist_ok=True)
            with open(output_file, 'wb') as f:
                f.write(result)

        # No matter what, add to the return
        ret[rpath] = result

    return ret
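# Usage sketch (not from the original source): fetch two asset files from a simulation
# and write them to disk. The simulation id and the file paths are hypothetical
# placeholders; COMPS_login(...) is assumed.
def example_fetch_simulation_assets(sim_id='00000000-0000-0000-0000-000000000000'):
    files = get_asset_files_for_simulation_id(sim_id,
                                              paths=['demographics.json', 'campaign.json'],
                                              output_directory='asset_dump')
    print('Retrieved {} file(s)'.format(len(files)))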
def query_collection(cid=None, cname=None, criteria=None):
    from COMPS.Data import QueryCriteria
    from COMPS.Data import AssetCollection

    criteria = criteria or QueryCriteria().select_children('assets')
    if cid:
        return AssetCollection.get(id=cid, query_criteria=criteria)

    criteria.where_tag('Name={}'.format(cname))
    results = AssetCollection.get(query_criteria=criteria)
    if len(results) >= 1:
        return results[0]
def query_experiment(eid=None, criteria=None, children=None):
    from COMPS.Data import Experiment
    from COMPS.Data import QueryCriteria

    criteria = criteria or QueryCriteria()
    children = children or ["tags"]
    criteria.select_children(children)
    exp = Experiment.get(eid, query_criteria=criteria)
    return exp
def get_asset_collection_by_tag(tag_name, tag_value, query_criteria=None):
    """
    Looks for an asset collection matching a given tag.

    :param tag_name: The name of the tag to match
    :param tag_value: The value of the tag to match
    :return: The first matching asset collection, else None if there is no match
    """
    query_criteria = query_criteria or QueryCriteria().select_children('assets')
    query_criteria.where_tag('%s=%s' % (tag_name, tag_value))
    result = AssetCollection.get(query_criteria=query_criteria)
    if len(result) >= 1:
        return result[0]
    return None
def get_asset_collection(collection_id_or_name, query_criteria=None):
    if not collection_id_or_name:
        return None

    query_criteria = query_criteria or QueryCriteria().select_children('assets')

    # Try by id first
    collection = get_asset_collection_by_id(collection_id_or_name, query_criteria)
    if collection:
        return collection

    # And then by name
    collection = get_asset_collection_by_tag("Name", collection_id_or_name, query_criteria)
    return collection
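# Usage sketch (not from the original source): resolve a collection first by id, then by
# its Name tag. 'my_collection' is a hypothetical collection name.
def example_resolve_collection(id_or_name='my_collection'):
    collection = get_asset_collection(id_or_name)
    if collection is None:
        print('No collection matched %s' % id_or_name)
    else:
        print('Found collection {} with {} asset(s)'.format(collection.id, len(collection.assets)))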
def download_asset_collection(collection, output_folder):
    if not isinstance(collection, AssetCollection):
        collection = AssetCollection.get(collection,
                                         query_criteria=QueryCriteria().select_children('assets'))

    # Get the files
    if len(collection.assets) > 0:
        # Download the collection as zip
        zip_path = os.path.join(output_folder, 'temp.zip')
        with open(zip_path, 'wb') as outfile:
            outfile.write(collection.retrieve_as_zip())

        # Extract it
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_folder)

        # Delete the temporary zip
        os.remove(zip_path)
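# Usage sketch (not from the original source): download every asset of a collection into
# a local folder. The collection id is a hypothetical placeholder; the folder is created
# first because the temporary zip is written inside output_folder.
def example_download_collection(collection_id='00000000-0000-0000-0000-000000000000'):
    os.makedirs('collection_dump', exist_ok=True)
    download_asset_collection(collection_id, output_folder='collection_dump')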
def get_experiment_info(experiment, cache):
    """
    Adds the experiment information for a given experiment to the cache:
    - raw_size: the size in bytes
    - size: the formatted size (in KB, MB or GB)
    - sims: the number of simulations

    This function is used by the process pool to parallelize the retrieval of experiment info.

    :param experiment: The experiment to analyze
    """
    if experiment.id in cache and not DiskSpaceUsage.FORCE_REFRESH:
        return

    # Log into COMPS
    with SetupParser.TemporarySetup(temporary_block='HPC') as sp:
        endpoint = sp.get('server_endpoint')
        COMPS_login(endpoint)

    # Try to get the simulations
    try:
        simulations = experiment.get_simulations(
            query_criteria=QueryCriteria().select(['id']).select_children(['hpc_jobs']))
    except KeyError:
        # No simulations found or error -> cache None
        cache.set(experiment.id, None)
        return

    # Calculate the size
    size = sum(s.hpc_jobs[0].output_directory_size for s in simulations if s.hpc_jobs)

    # Set the info for this particular experiment in the cache
    cache.set(experiment.id,
              ExperimentInfo(experiment.id, experiment.name, experiment.owner, size, len(simulations)))
def get_experiments_per_user_and_date(user, limit_date):
    limit_date_str = limit_date.strftime("%Y-%m-%d")
    return Experiment.get(query_criteria=QueryCriteria().where(
        'owner=%s,DateCreated>%s' % (user, limit_date_str)))
def get_experiment_ids_for_user(user):
    exps = Experiment.get(query_criteria=QueryCriteria().select(['id']).where(
        ['owner={}'.format(user)]))
    return [str(exp.id) for exp in exps]
def get_asset_collection_id_for_simulation_id(sim_id):
    query_criteria = QueryCriteria().select_children('configuration')
    simulation = Simulation.get(id=sim_id, query_criteria=query_criteria)
    collection_id = simulation.configuration.asset_collection_id
    return collection_id
def display(users, top=15, save=False, refresh=False):
    DiskSpaceUsage.OWNERS = users
    DiskSpaceUsage.TOP_COUNT = top if top else 15
    DiskSpaceUsage.FORCE_REFRESH = refresh

    # Create/open the cache
    current_folder = os.path.dirname(os.path.realpath(__file__))
    cache_folder = os.path.join(current_folder, "cache")
    cache = FanoutCache(shards=6, directory=cache_folder)

    # All experiments
    all_experiments = list(itertools.chain(
        *(Experiment.get(query_criteria=QueryCriteria().where(["owner={}".format(owner)]))
          for owner in DiskSpaceUsage.OWNERS)))
    all_experiments_len = len(all_experiments)

    # Create the pool of workers
    p = Pool(6)
    r = p.starmap_async(DiskSpaceUsage.get_experiment_info,
                        itertools.product(all_experiments, (cache,)))
    p.close()

    print("Analyzing disk space for:")
    print(" | {} experiments".format(all_experiments_len))
    print(" | Users: {}".format(", ".join(DiskSpaceUsage.OWNERS)))

    # Wait for completion and display progress
    sys.stdout.write(" | Experiments analyzed: 0/{}".format(all_experiments_len))
    sys.stdout.flush()

    # While we are analyzing, display the status
    while not r.ready():
        # Estimate how many experiments remain. This is only an estimate and needs to be bounded.
        remaining = max(0, min(all_experiments_len, r._number_left * r._chunksize))
        sys.stdout.write("\r {} Experiments analyzed: {}/{}".format(
            next(animation), all_experiments_len - remaining, all_experiments_len))
        sys.stdout.flush()
        time.sleep(.5)

    sys.stdout.write("\r | Experiments analyzed: {}/{}".format(
        all_experiments_len, all_experiments_len))
    sys.stdout.flush()

    # Get all the results
    experiments_info = [cache.get(e.id) for e in all_experiments if cache.get(e.id)]
    cache.close()

    # Display
    print("\n\n---------------------------")
    DiskSpaceUsage.top_count_experiments(experiments_info)
    print("\n---------------------------")
    DiskSpaceUsage.total_size_per_user(experiments_info)
    print("\n---------------------------")
    DiskSpaceUsage.top_count_experiments_per_user(experiments_info)

    # Save to a csv file
    if save:
        DiskSpaceUsage.save_to_file(experiments_info)
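# Usage sketch (not from the original source): report disk usage for two hypothetical
# users, keeping the top 10 experiments and saving the summary to csv.
def example_disk_usage_report():
    display(users=['jdoe', 'asmith'], top=10, save=True)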
import os
import sys

import pandas as pd

from COMPS.Data import QueryCriteria
# NOTE: the simtools imports below are assumed; the exact module paths may vary by dtk-tools version
from simtools.SetupParser import SetupParser
from simtools.AnalyzeManager.AnalyzeManager import AnalyzeManager
from simtools.Utilities.COMPSUtilities import get_experiment_by_id

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from PythonHelperFunctions.utils import load_output_path
from SCOutputAnalyzer import SCOutputAnalyzer

SetupParser.default_block = "HPC"

if __name__ == "__main__":
    SetupParser.init()
    exp_list = pd.read_csv(os.path.join(load_output_path(), 'Experiment_tracking.csv'),
                           index_col="Index")

    for index, row in exp_list.iterrows():
        exp = row['Experiment ID']
        print('Checking experiment ' + exp)
        tmp = get_experiment_by_id(exp, query_criteria=QueryCriteria().select_children(["tags"]))

        foldername = row['Description']
        outDir = os.path.join(load_output_path(), foldername, 'simOutputs')
        if not os.path.exists(outDir):
            os.mkdir(outDir)

        am = AnalyzeManager(exp, analyzers=SCOutputAnalyzer(
            filenames=['output/InsetChart.json', 'output/PropertyReport.json'],
            output_path=outDir))
        am.analyze()
def exps_for_suite_id(suite_id):
    try:
        return Experiment.get(query_criteria=QueryCriteria().where('suite_id=%s' % suite_id))
    except Exception:
        return None
def get_experiments_by_name(name, user):
    return Experiment.get(query_criteria=QueryCriteria().where(
        ['name~%s' % name, 'owner=%s' % user]))
def sims_from_experiment_id(exp_id):
    return Simulation.get(query_criteria=QueryCriteria().select(
        ['id', 'state']).where('experiment_id=%s' % exp_id))
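# Usage sketch (not from the original source): summarize the simulation states of an
# experiment; the experiment id is a hypothetical placeholder.
def example_state_summary(exp_id='00000000-0000-0000-0000-000000000000'):
    from collections import Counter
    sims = sims_from_experiment_id(exp_id)
    print(Counter(str(s.state) for s in sims))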
def generate_climate_files(self):
    # See InputDataWorker for other work options
    self.wo = InputDataWorker(
        demographics_file_path=self.demographics_file_path,
        wo_output_path=self.work_order_path,
        project_info=self.climate_project,
        start_year=self.start_year,
        num_years=self.num_years,
        resolution=self.resolution,
        idRef=self.idRef)

    # Log into COMPS (if not already logged in) to submit the climate files generation work order
    self.wo.wo_2_json()

    from COMPS.Data.WorkItem import WorkerOrPluginKey, WorkItemState
    from COMPS.Data import QueryCriteria
    from COMPS.Data import WorkItem, WorkItemFile
    from COMPS.Data import AssetCollection

    workerkey = WorkerOrPluginKey(name='InputDataWorker', version='1.0.0.0_RELEASE')
    wi = WorkItem('dtk-tools InputDataWorker WorkItem', workerkey, SetupParser.get('environment'))
    wi.set_tags({'dtk-tools': None, 'WorkItem type': 'InputDataWorker dtk-tools'})

    with open(self.work_order_path, 'rb') as workorder_file:
        # wi.AddWorkOrder(workorder_file.read())
        wi.add_work_order(data=workorder_file.read())

    with open(self.demographics_file_path, 'rb') as demog_file:
        wi.add_file(WorkItemFile(os.path.basename(self.demographics_file_path), 'Demographics', ''),
                    data=demog_file.read())

    wi.save()

    print("Created request for climate files generation.")
    print("Commissioning...")

    wi.commission()

    while wi.state not in (WorkItemState.Succeeded, WorkItemState.Failed, WorkItemState.Canceled):
        om('Waiting for climate generation to complete (current state: ' + str(wi.state) + ')',
           style='flushed')
        time.sleep(5)
        wi.refresh()

    if wi.state != WorkItemState.Succeeded:
        # Do not claim success when the work item failed or was canceled
        raise RuntimeError('Climate generation failed (final state: %s)' % str(wi.state))

    print("Climate files SUCCESSFULLY generated")

    # Get the collection with our files
    collections = wi.get_related_asset_collections()
    collection_id = collections[0].id
    comps_collection = AssetCollection.get(collection_id,
                                           query_criteria=QueryCriteria().select_children('assets'))

    # Get the files
    if len(comps_collection.assets) > 0:
        print("Found output files:")
        for asset in comps_collection.assets:
            print("- %s (%s)" % (asset.file_name, file_size(asset.length)))

        print("\nDownloading to %s..." % self.climate_files_output_path)

        # Download the collection
        download_asset_collection(comps_collection, self.climate_files_output_path)

        # Return file names; this use of re in conjunction with glob is not great; consider refactoring
        rain_bin_re = os.path.abspath(self.climate_files_output_path + '/*rain*.bin')
        humidity_bin_re = os.path.abspath(self.climate_files_output_path + '/*humidity*.bin')
        temperature_bin_re = os.path.abspath(self.climate_files_output_path + '/*temperature*.bin')

        rain_file_name = os.path.basename(glob.glob(rain_bin_re)[0])
        humidity_file_name = os.path.basename(glob.glob(humidity_bin_re)[0])
        temperature_file_name = os.path.basename(glob.glob(temperature_bin_re)[0])

        print('Climate files SUCCESSFULLY stored.')

        return rain_file_name, temperature_file_name, humidity_file_name
    else:
        print('No output files found')
def copy_simulation(simulation, to_experiment):
    simulation.refresh(query_criteria=QueryCriteria().select_children(['files', 'hpc_jobs', 'tags']))

    new_simulation = Simulation(simulation.name, description=simulation.description)
    new_simulation.experiment_id = to_experiment.id

    tags = copy.copy(simulation.tags)
    tags["CopiedFromSimulation"] = simulation.id
    new_simulation.set_tags(tags)

    job = simulation.hpc_jobs[-1]

    # Override any fields here as necessary...
    if job and job.configuration:
        new_simulation.configuration = Configuration(
            environment_name=job.configuration.environment_name,
            simulation_input_args=job.configuration.simulation_input_args,
            working_directory_root=job.configuration.working_directory_root,
            executable_path=job.configuration.executable_path,
            maximum_number_of_retries=SetupParser.get(parameter='num_retries'),
            priority=SetupParser.get(parameter='priority'),
            min_cores=job.configuration.min_cores,
            max_cores=job.configuration.max_cores,
            exclusive=job.configuration.exclusive,
            node_group_name=SetupParser.get(parameter='node_group'),
            asset_collection_id=job.configuration.asset_collection_id)

    with tempfile.TemporaryDirectory() as tmp_dir:
        files_to_add_last = {}
        for f in simulation.files:
            if f.file_name == 'config.json':
                dest_file = os.path.join(tmp_dir, 'config.json')
                with open(dest_file, 'wb') as fp:
                    fp.write(f.retrieve())
                modify_config_json(dest_file)
                # with open(dest_file, 'rb') as fp:
                #     data = fp.read()
                filename = dest_file
                # checksum = hashlib.md5(data).hexdigest()
                sf = SimulationFile(file_name=filename, file_type=f.file_type, description=f.description)
                files_to_add_last[filename] = sf
            else:
                filename = f.file_name
                checksum = f.md5_checksum
                sf = SimulationFile(file_name=filename, file_type=f.file_type,
                                    description=f.description, md5_checksum=checksum)
                new_simulation.add_file(sf)

        new_simulation.save(return_missing_files=False)

        if len(files_to_add_last) > 0:
            for file_path, sf in files_to_add_last.items():
                new_simulation.add_file(sf, file_path=file_path)
            new_simulation.save(return_missing_files=False)

    print('new sim = ' + str(new_simulation.id))

    return new_simulation
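# Usage sketch (not from the original source): copy every simulation of one experiment
# into another, using the helpers above. Both experiment ids are hypothetical
# placeholders; copy_simulation refreshes each simulation itself before copying.
def example_copy_all_simulations(src_exp_id, dest_exp_id):
    dest = query_experiment(dest_exp_id)
    for sim in sims_from_experiment_id(src_exp_id):
        copy_simulation(sim, dest)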
def get_experiments_by_name(name, user=None):
    filters = ["name~{}".format(name)]
    if user:
        filters.append("owner={}".format(user))
    return Experiment.get(query_criteria=QueryCriteria().where(filters))
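# Usage sketch (not from the original source): find experiments whose name contains a
# fragment, optionally restricted to one owner. 'Malaria' is a hypothetical fragment.
def example_find_by_name(fragment='Malaria', user=None):
    for exp in get_experiments_by_name(fragment, user):
        print('{} | {} | {}'.format(exp.id, exp.owner, exp.name))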
def sims_from_experiment(e):
    return e.get_simulations(QueryCriteria().select(['id', 'state']).select_children('hpc_jobs'))
from simtools.Utilities.COMPSUtilities import get_experiment_by_id
from COMPS import Client
from COMPS.Data import QueryCriteria

Client.login('https://comps.idmod.org')

exp_list = ['ea1e506d-25a7-e811-a2c0-c4346bcb7275',
            '96583fdd-27a7-e811-a2c0-c4346bcb7275',
            'cb60414a-fda7-e811-a2c0-c4346bcb7275',
            '7e7edfb2-fda7-e811-a2c0-c4346bcb7275',
            '30c13b96-9ba8-e811-a2c0-c4346bcb7275',
            'f626a243-9ca8-e811-a2c0-c4346bcb7275',
            'b95874b8-9ca8-e811-a2c0-c4346bcb7275',
            '64ef6572-9da8-e811-a2c0-c4346bcb7275',
            'b9314306-9fa8-e811-a2c0-c4346bcb7275',
            '7bf00e89-44a9-e811-a2c0-c4346bcb7275',
            '521a5f3f-5faa-e811-a2c0-c4346bcb7275',
            'f5ffd5d3-5faa-e811-a2c0-c4346bcb7275']

rural_mult = [1.0, 1.0, 1.5, 1.5, 1.0, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0]
coverage = [0, 0.5, 0, 0.5, 0, 0.5, 0, 0.5, 0, 0.5, 0, 0.5]
migration = [.2, .2, .2, .2, .2, .2, .2, .2, .02, .02, .002, .002]
AgeAtVacc = [270, 270, 270, 270, 180, 180, 180, 180, 270, 270, 270, 270]

for ii in range(len(exp_list)):
    exp = get_experiment_by_id(exp_list[ii], query_criteria=QueryCriteria().select_children(["tags"]))
    exp.merge_tags({'Rural_Infectivity_Multiplier': rural_mult[ii],
                    'META_Campaign_Coverage': coverage[ii],
                    'META_Migration': migration[ii],
                    'MCV1_Dose_Days': AgeAtVacc[ii]})
def experiment_needs_commission(e):
    return e.get_simulations(QueryCriteria().select(['id']).where(
        "state=%d" % SimulationState.Created.value))
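# Usage sketch (not from the original source): commission an experiment only if it still
# has simulations in the Created state. Experiment.commission() is assumed from pyCOMPS.
def example_commission_if_needed(experiment):
    if experiment_needs_commission(experiment):
        experiment.commission()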
# Header reconstructed from the call site in copy_simulation (modify_config_json(dest_file))
def modify_config_json(filename):
    with open(filename, 'r') as fp:
        config = json.load(fp)

    # EDIT
    # Change this to be the config update you actually want, e.g.:
    # config['parameters']['AIDS_Duration_In_Months'] = 999

    with open(filename, 'w') as fp:
        json.dump(config, fp)
    return None
    #### END EDIT #######################################################################


def copy_simulation(simulation, to_experiment):
    simulation.refresh(query_criteria=QueryCriteria().select_children(['files', 'hpc_jobs', 'tags']))

    new_simulation = Simulation(simulation.name, description=simulation.description)
    new_simulation.experiment_id = to_experiment.id

    tags = copy.copy(simulation.tags)
    tags["CopiedFromSimulation"] = simulation.id
    new_simulation.set_tags(tags)

    job = simulation.hpc_jobs[-1]

    # Override any fields here as necessary...
    if job and job.configuration:
        new_simulation.configuration = Configuration(
            environment_name=job.configuration.environment_name,
            simulation_input_args=job.configuration.simulation_input_args,
def COMPS_experiment_to_local_db(exp_id, endpoint, verbose=False, save_new_experiment=True):
    """
    Return a DB object representing an experiment coming from COMPS.
    This function saves the newly retrieved experiment in the DB by default, but this
    behavior can be changed by switching the save_new_experiment parameter, allowing the
    caller to get an experiment object back and save it later (with a batch, for example).

    :param exp_id:
    :param endpoint:
    :param verbose:
    :param save_new_experiment:
    :return:
    """
    # Make sure we are logged in
    COMPS_login(endpoint)

    # Ensure exp_id is a string
    exp_id = str(exp_id)

    # If the experiment already exists in the local db and is done, skip it
    experiment = DataStore.get_experiment(exp_id)
    if experiment and experiment.is_done():
        if verbose:
            print("Experiment ('%s') already exists in local db." % exp_id)
        # Do not bother with finished experiments
        return None

    from COMPS.Data import QueryCriteria
    try:
        query_criteria = QueryCriteria().select_children('tags')
        exp_comps = get_experiment_by_id(exp_id, query_criteria) \
                    or get_experiments_by_name(exp_id, query_criteria)[-1]
    except Exception:
        if verbose:
            print("The experiment ('%s') doesn't exist in COMPS." % exp_id)
        return None

    # Case: experiment doesn't exist in local db
    if not experiment:
        # Cast the creation_date
        experiment = DataStore.create_experiment(
            exp_id=str(exp_comps.id),
            suite_id=str(exp_comps.suite_id) if exp_comps.suite_id else None,
            exp_name=exp_comps.name,
            tags=exp_comps.tags,
            date_created=utc_to_local(exp_comps.date_created).replace(tzinfo=None),
            location='HPC',
            selected_block='HPC',
            endpoint=endpoint)

    # Note: the experiment may be new or may come from the local db.
    # Get the associated simulations of the experiment
    sims = exp_comps.get_simulations(QueryCriteria().select(['id', 'state', 'date_created'])
                                     .select_children('tags'))

    # Skip empty experiments or experiments that have the same number of sims
    if len(sims) == 0 or len(sims) == len(experiment.simulations):
        if verbose:
            if len(sims) == 0:
                print("Skip empty experiment ('%s')." % exp_id)
            elif len(sims) == len(experiment.simulations):
                print("Skip experiment ('%s') since local one has the same number of simulations." % exp_id)
        return None

    # Go through the sims and create them
    for sim in sims:
        # Cast the simulation tags and create the simulation
        simulation = DataStore.create_simulation(
            id=str(sim.id),
            status=sim.state,  # this is already a SimulationState object
            tags={tag: cast_number(val) for tag, val in sim.tags.items()},
            date_created=utc_to_local(sim.date_created).replace(tzinfo=None))

        # Add to the experiment
        experiment.simulations.append(simulation)

    # Save it to the DB
    if save_new_experiment:
        DataStore.save_experiment(experiment, verbose=verbose)

    return experiment
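# Usage sketch (not from the original source): mirror a COMPS experiment into the local
# dtk-tools DB without saving immediately, then save explicitly. The experiment id and
# endpoint are placeholders.
def example_sync_experiment(exp_id='00000000-0000-0000-0000-000000000000',
                            endpoint='https://comps.idmod.org'):
    experiment = COMPS_experiment_to_local_db(exp_id, endpoint, verbose=True,
                                              save_new_experiment=False)
    if experiment:
        DataStore.save_experiment(experiment, verbose=True)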