def run(self, need_onnx=False):
    utils.makedirs(self.model_dir)
    param_filename = os.path.join(self.model_dir, 'params.npz')
    params_loaded = self.is_up_to_date(param_filename)
    if params_loaded:
        chainer.serializers.load_npz(param_filename, self.model)
    chainer.config.train = False
    inputs = utils.as_list(self.model.inputs())
    if need_onnx:
        need_onnx = self.gen_onnx_model(inputs)
    self.model.to_gpu()
    gpu_inputs = utils.to_gpu(inputs)
    gpu_outputs = self.model(*gpu_inputs)
    gpu_outputs = utils.as_list(gpu_outputs)
    outputs = utils.to_cpu(gpu_outputs)
    self.inputs = inputs
    self.outputs = outputs
    if need_onnx:
        self.gen_onnx_test(inputs, outputs)
    if not params_loaded:
        chainer.serializers.save_npz(param_filename, self.model)
    return inputs, outputs

def match_usage(self, candidate_usage, usage_requested, usage_matching, usage_exclude=None):
    """Match candidate usage with the requested usage type"""
    usage_exclude = as_list(usage_exclude)
    if any(x in candidate_usage for x in usage_exclude):
        return False
    # no spec. usage
    if usage_requested is None:
        return True
    # no matching
    if len(usage_requested) == 1 and usage_requested[0] == "ignore":
        return True
    candidate_usage = as_list(candidate_usage)
    if usage_matching != "all":
        usage_requested = as_list(usage_requested)
    if usage_matching == "all":
        return True
    if usage_matching == "exact":
        if not equal_lengths(candidate_usage, usage_requested):
            return False
        return set(candidate_usage) == set(usage_requested)
    elif usage_matching == "subset":
        return all(x in candidate_usage for x in usage_requested)
    elif usage_matching == "any":
        return any(x in usage_requested for x in candidate_usage)
    else:
        error(f"Specified undefined usage matching: {usage_matching}")

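# A hedged usage sketch for match_usage above (the concrete usage names are invented,
# not taken from the codebase): "subset" requires every requested usage to appear among
# the candidate's usages, "any" needs a single overlap, and "exact" additionally compares lengths.
#
#   self.match_usage(["train", "labels"], ["labels"], "subset")                  # -> True
#   self.match_usage(["train"], ["labels", "train"], "any")                      # -> True
#   self.match_usage(["train"], ["labels", "train"], "exact")                    # -> False (lengths differ)
#   self.match_usage(["train"], ["labels"], "subset", usage_exclude=["train"])   # -> False (excluded)
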
def add_feeders(self, chain_names, component_names):
    """Add feeders"""
    if chain_names is not None:
        chain_names = as_list(chain_names)
        for chain_name in chain_names:
            self.feeder_chains.append(chain_name)
    if component_names is not None:
        component_names = as_list(component_names)
        for component_name in component_names:
            self.feeder_components.append(component_name)

def request_data(self, data_type, usage, client, usage_matching="exact", usage_exclude=None,
                 must_be_single=True, on_error_message="Data request failed:", reference_data=None):
    """Get data from the data pool

    Args:
        data_type (str): Name of datatype
        usage (str): Name or class of usage
        usage_matching (str): How to match the usage, candidates are "exact", "any", "all",
            "subset", "ignore". Defaults to "exact"
        client ([type]): [description]
        must_be_single (bool, optional): Singleton enforcer. Defaults to True.
        on_error_message (str, optional): What to print on error. Defaults to "Data request failed:".
        reference_data (list, optional): Data list to draw candidates from. Defaults to None,
            which is resolved to the current chain feeders.

    Returns:
        [type]: [description]
    """
    # get the data available to the client
    if reference_data is None:
        curr_inputs = self.get_current_inputs()
    else:
        curr_inputs = reference_data
    res = []
    # all to string
    if data_type is not None:
        # data_type = data_type.get_matching_names() if type(data_type) is not str and issubclass(data_type, Datatype) else data_type
        data_type = data_type.name if type(data_type) is not str and issubclass(data_type, Datatype) else data_type
    if usage is not None:
        usage = as_list(usage)
        if any(type(x) is not str and issubclass(x, DataUsage) for x in usage):
            # usage = [x.get_matching_names() if type(x) is not str and issubclass(x, DataUsage) else x for x in usage]
            # flatten
            # usage = [k for x in usage for k in x]
            usage = [x.name if type(x) is not str and issubclass(x, DataUsage) else x for x in usage]
    if usage_exclude is not None:
        usage_exclude = as_list(usage_exclude)
        usage_exclude = [x.name if type(x) is not str and issubclass(x, DataUsage) else x for x in usage_exclude]

    for data in curr_inputs:
        matches_usage = self.match_usage(data.get_usage_names(), usage, usage_matching, usage_exclude)
        if matches_usage and (data_type is None or data.get_datatype() in data_type):
            res.append(data)

    if must_be_single:
        if len(res) != 1:
            if len(curr_inputs) == 0:
                warning("No available current inputs to fetch requested data from! Did you omit a cross-chain linkage?")
            else:
                warning(f"Examined current inputs for requesting client {client}:")
                for i, c in enumerate(curr_inputs):
                    warning(f"{i+1}/{len(curr_inputs)}: {str(c)}")
            warning(f"Feeder chains: {self.feeder_chains}, components: {self.feeder_components}")
            error(on_error_message + f" Requested: type: {data_type}, usages: {'/'.join(usage)}, usage-matching: {usage_matching}. \n num matches: {len(res)}.")
        res = res[0]
    else:
        # else keep all and drop empty ones
        res = drop_empty_datapacks(res)
    return res

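# A hedged call sketch for request_data above; the datatype / usage strings and the
# data_pool attribute are illustrative assumptions, not names confirmed by the codebase.
# With must_be_single=True a single matching datapack is returned (error() fires on zero
# or multiple matches); otherwise the full match list is returned with empty packs dropped.
#
#   embeddings = data_pool.request_data(
#       "numeric", usage="embeddings", client=self.get_name(),
#       usage_matching="subset",
#       on_error_message="Component requires embedding input:")
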
def __init__(self, config):
    self.config = config
    Manipulation.__init__(self)
    self.tag = config.tag
    self.target_tags = self.config.target_tags
    if self.target_tags is not None:
        self.target_tags = as_list(self.target_tags)

def __init__(self, config): """Constructor for the learner configuration""" super().__init__(config) if config is None: return self.name = config["name"] try: self.sequence_length = self.get_value("sequence_length", default=1, base=config) except KeyError: self.sequence_length = 1 if "num_clusters" in config: self.num_clusters = self.get_value("num_clusters", default=None, base=config) self.layers = self.get_value("layers", base=config) self.layers = as_list(self.layers) self.use_gpu = self.get_value("use_gpu", default=True, base=config) self.do_test = self.get_value("do_test", default=True, base=config) self.model_id = self.get_value("model_id", base=config) self.retain_embedding_matrix = self.get_value("retain_embeddings", default=False, base=config) # training parameters self.train = learner_conf.train() trconf = config["train"] if "train" in config else {} getval = lambda x, y, et=None: self.get_value(x, default=y, base=trconf, expected_type=et) self.train.epochs = getval("epochs", 50) self.train.batch_size = getval("batch_size", 50) self.train.train_embedding = getval("train_embedding", False) self.train.optimizer = getval("optimizer", "sgd") self.train.lr_scheduler = getval("lr_scheduler", None) self.train.base_lr = getval("base_lr", 0.01) self.train.folds = getval("folds", None) self.train.validation_portion = getval("validation_portion", None) self.train.early_stopping_patience = getval("early_stopping_patience", None) self.save_interval = getval("save_interval", 1)
def run_first(self, task, inputs, sample_outputs):
    self.model = task.model
    self.model.to_gpu()
    self.inputs = utils.to_gpu(inputs)
    gpu_outputs = self.run_task()
    gpu_outputs = utils.as_list(gpu_outputs)
    outputs = utils.to_cpu(gpu_outputs)
    return outputs

def generator(task, **kwargs):
    keys = []
    values = []
    for k, v in kwargs.items():
        keys.append(k)
        values.append(as_list(v))
    args_list = [dict(zip(keys, items)) for items in itertools.product(*values)]
    for args in args_list:
        yield partial(task, **args)

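# A hedged usage sketch for the generator above: assuming as_list wraps scalar keyword
# values into one-element lists, every combination of the per-keyword value lists yields
# one ready-to-call partial. train_model is a hypothetical task function.
#
#   jobs = list(generator(train_model, lr=[0.1, 0.01], layers=[[64], [64, 32]], seed=7))
#   # -> 4 partials: (lr=0.1, layers=[64]), (lr=0.1, layers=[64, 32]),
#   #    (lr=0.01, layers=[64]), (lr=0.01, layers=[64, 32]), each with seed=7
#   for job in jobs:
#       job()
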
def __init__(self, config): """Constructor for the sampling component configuration""" super().__init__(config) self.label_dict = self.get_value("label_dict", base=config) self.min_freq = self.get_value("min_freq", base=config) self.max_freq = self.get_value("max_freq", base=config) self.exclude_tags = self.get_value("exclude_tags", base=config) if self.exclude_tags is not None: self.exclude_tags = as_list(self.exclude_tags)
def from_dict(cls, d):
    """ Override default, adding the capture of members. """
    o = super(DistributionList, cls).from_dict(d)
    o.members = []
    if 'dlm' in d:
        o.members = [utils.get_content(member) for member in utils.as_list(d["dlm"])]
    return o

def __init__(self, data, usage=None, source=None, chain=None):
    self.data = data
    self.usages = []
    if usage is not None:
        usage = as_list(usage)
        self.usages.extend(usage)
    if source is not None:
        self.source = source
    if chain is not None:
        self.chain = chain

def __init__(self, synctrex, name, **params):
    # Note: Synctrex initializes values using the yaml-generated data.
    # That means they are unsafe, untyped, and that there are not yet
    # cross-references between objects.
    super().__init__()
    self.synctrex = synctrex
    self.name = name
    self.method = params.get('method')
    self.shell = params.get('shell') or 'bash'
    self.source = Address(params.get('source') or '')
    self.dest = Address(params.get('dest') or '')
    self.source_dir = Address(params.get('source_dir') or '')
    self.dest_dir = Address(params.get('dest_dir') or '')
    self.options = params.get('options') or []
    self.options = as_list(self.options)
    self.group = params.get('group') or []
    self.groups = params.get('groups') or []
    self.groups = as_list(self.groups)
    # self.require = params.get('require') or []
    # self.require = as_list_of(str, self.require)
    self.files = params.get('files') or []
    self.files = as_list_of(Address, self.files)
    self.exclude = params.get('exclude') or []
    self.exclude = as_list_of(str, self.exclude)
    self.mode = params.get('mode') or 'normal'

    handlers = params.get('events') or {}
    for type, cmds in handlers.items():
        commands = as_list_of(str, cmds)
        for command in commands:
            if command[0] == '`' == command[-1]:
                command = events.Shell(command[1:-1], self.shell)
            else:
                command = events.Script(command)
            self.add_handler(type, command)

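# A hedged sketch of the params mapping this constructor consumes; key names mirror the
# .get() calls above, the concrete values are invented, and the Sync class name is an
# assumption. Event commands wrapped in backticks become events.Shell handlers, anything
# else becomes an events.Script handler.
#
#   params = {
#       "method": "rsync",
#       "source": "user@host:/srv/data",
#       "dest": "/backup/data",
#       "options": "-az",                        # as_list() turns the scalar into ["-az"]
#       "exclude": ["*.tmp", "cache/"],
#       "events": {
#           "done": ["`notify-send sync-done`",  # backtick-wrapped -> events.Shell
#                    "post_sync.py"],            # plain string     -> events.Script
#       },
#   }
#   sync = Sync(synctrex, "nightly-backup", **params)
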
def get_production(self, chain_name):
    self.produces = as_list(self.produces) if self.produces is not None else []
    res = []
    for pr in self.produces:
        try:
            dtype, usage = pr
        except (ValueError, TypeError):
            dtype, usage = pr, None
        res.append(Produces(dtype, usage, self.get_name(), chain_name))
    return res

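# Hedged illustration of the two "produces" forms the try/except above accepts
# (the datatype and usage names are hypothetical): a bare datatype, or a (datatype, usage) pair.
#
#   self.produces = ["vectors", ("labels", "ground_truth")]
#   self.get_production("chain_A")
#   # -> [Produces("vectors", None, <component name>, "chain_A"),
#   #     Produces("labels", "ground_truth", <component name>, "chain_A")]
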
def __init__(self, instances, tags, epi=None, skip_empty=True):
    # only numbers
    super().__init__([Numeric])
    instances = as_list(instances)
    tags = as_list(tags)
    if epi is None:
        epi = [np.ones((len(ind), ), np.int32) for ind in instances]
    self.elements_per_instance = epi
    self.instances = []
    self.tags = []
    for i in range(len(instances)):
        inst = instances[i]
        if len(inst) == 0:
            if skip_empty:
                continue
        self.instances.append(np.asarray(inst))
        try:
            tag = tags[i]
            self.tags.append(tag)
        except (TypeError, IndexError):
            pass

def __init__(self, config=None):
    super().__init__(config)
    if config is None:
        config = {}
    self.baselines = self.get_value("baselines", base=config)
    self.averages = self.get_value("averages", base=config, default=True)
    self.top_k = self.get_value("top_k", base=config, default=3, expected_type=int)
    self.iter_aggregations = self.get_value("iter_aggregations", base=config)
    self.label_aggregations = self.get_value("label_aggregations", base=config)
    self.measures = self.get_value("measures", default=["f1"], base=config)
    self.measures = as_list(self.measures)
    self.print_individual_models = self.get_value("print_individual_models", default=False, base=config)
    self.label_distribution = self.get_value("show_label_distributions", base=config, default="logs")

def performImputation():
    X, y, ids = ut.load_ndvi_uts(cfg.data_path, ut.as_list(2015), cfg.balance_flag)
    print("nonimputed Dataset length")
    print(X.shape)
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    x_non_missing = imputer.fit_transform(X)
    concat = np.concatenate(
        [x_non_missing, np.expand_dims(ids, axis=1), np.expand_dims(y, axis=1)], axis=1)
    print("Imputed dataset length")
    print(concat.shape)
    return concat

def get_topK_preds(self, predictions, label_mapping, only_report_labels):
    """
    Return topK predictions and predicted classes from a predictions matrix
    and index label mapping dict
    """
    # get top k from input / static parameters, or revert to default
    if self.topk is None:
        try:
            self.topk = self.input_parameters.top_k
        except KeyError:
            self.topk = self.params.top_k
        except AttributeError:
            self.topk = 5
            self.messages.append(f"Defaulted to top_k of {self.topk}")
    if predictions.size == 0:
        top_k_preds = []
        top_k_predicted_classes = []
    else:
        # argsort the column prediction probas descending, get top k
        top_k_idxs = np.argsort(predictions, axis=1)[:, ::-1][:, :self.topk]
        # make a reordered probs container
        top_k_preds = [row[top_k_idxs[row_idx]].tolist() for row_idx, row in enumerate(predictions)]
        # take the classes corresponding to the argsorted index probs
        top_k_predicted_classes = [[label_mapping[ix] for ix in idxs] for idxs in top_k_idxs]

    if only_report_labels is not None:
        only_report_labels = as_list(only_report_labels)
        for i in range(len(top_k_predicted_classes)):
            retained_label_idxs = []
            for lbl in only_report_labels:
                if lbl not in top_k_predicted_classes[i]:
                    error(f"Requested to only report label {lbl} but top {self.topk} predicted labels are {top_k_predicted_classes[i]}")
                lbl_idx = top_k_predicted_classes[i].index(lbl)
                retained_label_idxs.append(lbl_idx)
            top_k_predicted_classes[i] = [top_k_predicted_classes[i][l] for l in retained_label_idxs]
            top_k_preds[i] = [top_k_preds[i][l] for l in retained_label_idxs]
    return top_k_preds, top_k_predicted_classes

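# A minimal, runnable sketch (outside the class, names invented) of the top-k selection
# used above: argsort each row of the probability matrix in descending order, keep the
# first k columns, then map the column indices to labels.
import numpy as np

predictions = np.array([[0.1, 0.7, 0.2],
                        [0.5, 0.2, 0.3]])
label_mapping = {0: "cat", 1: "dog", 2: "bird"}
k = 2

top_k_idxs = np.argsort(predictions, axis=1)[:, ::-1][:, :k]
top_k_preds = [row[top_k_idxs[i]].tolist() for i, row in enumerate(predictions)]
top_k_classes = [[label_mapping[ix] for ix in idxs] for idxs in top_k_idxs]
# top_k_classes -> [['dog', 'bird'], ['cat', 'bird']]
# top_k_preds   -> [[0.7, 0.2], [0.5, 0.3]]
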
def __init__(self, cid: str, logic_player):
    super(Player, self).__init__(cid, cid, logic_player.team.color,
                                 utils.first(logic_player.tags).location.x,
                                 utils.first(logic_player.tags).location.y,
                                 font=("Helvetica", 23), ctag='Player')
    self.logic_player = logic_player
    self.tags_component = [
        TextComponent(tag.tag_id, '.', Player.colors[index], 0, 0, ctag='Tag', font=("Helvetica", 30))
        for index, tag in enumerate(utils.as_list(logic_player.tags))
    ]
    self.info = TextComponent(str(cid) + "_info", "", "white", 0, 0, font=("Helvetica", 6))

@author: daniyalusmani1
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

import utils as ut
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

X, y, ids = ut.load_ndvi_uts(cfg.data_path, ut.as_list(2015), cfg.balance_flag)
print(X.head(5))

orig = pd.read_csv('../ndvi_1_30_365_all_years_for_daniyal.csv')
print(orig.head(5))

data = pd.read_csv('../crop_tsc_balanced_imputed_2015.csv', header=None, index_col=None)
data = data.drop(columns=[9])
print(data.head(5))

# labels = torch.Tensor(data.values[:, 9]).long()
# print(labels.unique())

data = data.to_numpy()
sel = VarianceThreshold(threshold=.8)
print(sel.fit_transform(data))

def main(input_path, only_report=False, force_dir=False, no_config_check=False,
         restart=False, is_testing_run=False, manual_config_tag=None):
    # settable parameters
    ############################################################
    email = "*****@*****.**"
    passw = None
    ############################################################

    # set the experiment parameters
    error("Non-existent input path: {} ".format(input_path), not exists(input_path))
    if isdir(input_path):
        # assume a single .yml file in the directory
        ymls = [x for x in listdir(input_path)
                if any(x.endswith(suff) for suff in [".yaml", ".yml"])]
        error("Input path {} is a directory with no yaml configuration files.".format(input_path), not ymls)
        error("Input path {} is a directory with more than one yaml configuration file.".format(input_path), len(ymls) > 1)
        config_file = join(input_path, ymls[0])
    else:
        config_file = input_path

    # if input file is existing csv scores, just print them
    if config_file.endswith(".csv"):
        print_existing_csv_results(config_file)
        return

    conf = read_ordered_yaml(config_file)
    try:
        exps = conf[EXPERIMENTS_KEY_NAME]
    except KeyError:
        error(f"Need an [{EXPERIMENTS_KEY_NAME}] key for large-scale experiments.")

    # folder to run experiments in
    run_dir = exps["run_folder"]
    if force_dir:
        warning("Overriding experiment folder from yml value: {} to current dir: {}, due to force-dir".format(run_dir, dirname(run_dir)))
        run_dir = dirname(input_path)
    if not isabs(run_dir):
        run_dir = join(os.getcwd(), run_dir)

    # dir checks
    # ----------
    # virtualenv folder
    venv_dir = conf[EXPERIMENTS_KEY_NAME]["venv"] if "venv" in conf[EXPERIMENTS_KEY_NAME] else None
    # results csv file
    # results_file = conf["experiments"]["results_file"]
    results_file = join(run_dir, "run_results.csv")

    if venv_dir and not exists(venv_dir):
        error("Virtualenv dir {} not found".format(venv_dir))
    if not exists(run_dir):
        info("Run dir {} not found, creating.".format(run_dir))
        makedirs(run_dir)
    else:
        error("Specified a non-dir path as the running directory: {}".format(run_dir), not isdir(run_dir))
        if restart:
            warning("Specified restart, and experiment dir {} exists. Deleting!".format(run_dir))
            rmtree(run_dir)
            makedirs(run_dir)

    # logging
    os.makedirs(run_dir, exist_ok=True)
    setup_simple_logging(conf["print"]["log_level"], logging_dir=run_dir)

    info("Generating configurations from source file {}".format(config_file))

    # evaluation measures
    try:
        eval_measures = as_list(exps["measures"]) if "measures" in exps else ["f1-score", "accuracy"]
        print(eval_measures)
        aggr_measures = as_list(exps["label_aggregation"]) if "label_aggregation" in exps \
            else ["macro", "micro"]
        stat_functions = as_list(exps["fold_aggregation"]) if "fold_aggregation" in exps else ["mean"]
        run_types = as_list(exps["run_types"]) if "run_types" in exps else ["run"]
        do_sstests = "sstests" in exps
        if not do_sstests:
            warning("No statistical tests specified.")
        else:
            sstests = ["tukeyhsd"] if "names" not in exps["sstests"] else as_list(exps["sstests"]["names"])
            sstests_measures = ["f1-score"] if "measures" not in exps["sstests"] else as_list(exps["sstests"]["measures"])
            sstests_aggregations = ["macro"] if "aggregations" not in exps["sstests"] else as_list(exps["sstests"]["aggregations"])
            sstests_limit_vars = None if "limit_variables" not in exps["sstests"] else as_list(exps["sstests"]["limit_variables"])
    except Exception as ex:
        error("Failed to read evaluation / testing options due to: [{}]".format(ex))

    # folder where run scripts are
    sources_dir = exps["sources_dir"] if "sources_dir" in exps else os.getcwd()
    warning("Defaulting sources folder to the current directory: {}".format(sources_dir))
    error("Main module: {} not found. Is the sources dir ok?".format(join(sources_dir, "main.py")),
          not exists(join(sources_dir, "main.py")))

    configs = make_configs(conf, run_dir, sources_dir)
    # check run id uniqueness
    if len(set([c.id for c in configs])) != len(configs):
        error("Duplicate run folders from the input: {}".format([c.id for c in configs]))
    if len(set([c['folders']['run'] for c in configs])) != len(configs):
        error("Duplicate run folders from the input: {}".format([c["folders"]["run"] for c in configs]))

    # if we're running a testing suite, filter out incompatible configs
    if is_testing_run:
        configs = filter_testing(configs, config_file)

    # mail
    do_send_mail = exps["send_mail"] if "send_mail" in exps else None
    if do_send_mail:
        passw = getpass.getpass()

    # copy the experiments configuration file in the target directory
    experiments_conf_path = join(run_dir, basename(config_file))
    if exists(experiments_conf_path):
        # make sure it's the same effing config, unless check is overriden
        if not no_config_check:
            config_to_copy = OrderedDict({k: v for (k, v) in conf.items() if k != EXPERIMENTS_KEY_NAME})
            existing_exp_conf = read_ordered_yaml(experiments_conf_path)
            existing_exp_conf = OrderedDict({k: v for (k, v) in existing_exp_conf.items() if k != EXPERIMENTS_KEY_NAME})
            equal, diff = compare_dicts(config_to_copy, existing_exp_conf)
            if not equal:
                error("The workflow contents derived from the original config [{}] differ from the ones in the experiment directory: [{}]!\nDifference is: {}".format(config_file, experiments_conf_path, diff))
    else:
        if not only_report:
            info("Copying experiments configuration at {}".format(experiments_conf_path))
            with open(experiments_conf_path, "w") as f:
                write_ordered_dump(OrderedDict(conf), f)
        else:
            info("Only-report run: will not copy experiment configuration at {}".format(experiments_conf_path))

    results, result_paths = {}, {}
    #################################################################################
    skipped_configs = []

    # prelim experiments
    for conf_index, conf in enumerate(configs):
        run_id = conf.id
        # prepend a configuration id tag, if supplied
        if manual_config_tag is not None:
            run_id += manual_config_tag
            experiment_dir = conf["folders"]["run"] + manual_config_tag
        else:
            experiment_dir = conf["folders"]["run"]
        info("Running experiments for configuration {}/{}: {}".format(conf_index + 1, len(configs), run_id))
        completed_file = join(experiment_dir, "completed")
        error_file = join(experiment_dir, "error")
        # results to run folders, if not specified otherwise
        respath = join(experiment_dir, "results")
        if not isabs(respath):
            conf["folders"]["results"] = join(experiment_dir, respath)

        if exists(completed_file):
            info("Skipping completed experiment {}".format(run_id))
        elif only_report:
            info("Only-report execution: skipping non-completed experiment {}".format(run_id))
            skipped_configs.append(run_id)
            continue
        else:
            # run it
            if exists(error_file):
                os.remove(error_file)
            makedirs(experiment_dir, exist_ok=True)
            conf_path = join(experiment_dir, "config.yml")
            if exists(conf_path) and not no_config_check:
                warning("Configuration file at {} already exists!".format(conf_path))
                existing = read_ordered_yaml(conf_path)
                equal, diff = compare_dicts(existing, conf)
                if not equal:
                    error("Different local config encountered: {} \nDifference: {}".format(conf_path, diff))
                # if not (OrderedDict(conf) == existing):
                #     error("Different local config encountered at {}".format(conf_path))
            else:
                with open(conf_path, "w") as f:
                    write_ordered_dump(OrderedDict(conf), f)
                info("Configuration file: {}".format(conf_path))
            # write the run script file
            script_path = join(experiment_dir, "run.sh")
            with open(script_path, "w") as f:
                if venv_dir:
                    f.write("source \"{}/bin/activate\"\n".format(venv_dir))
                f.write("cd \"{}\"\n".format(sources_dir))
                f.write("python3 \"{}\" \"{}\" && touch \"{}\" && exit 0\n".format(
                    join(sources_dir, "main.py"), conf_path, completed_file))
                f.write("touch '{}' && exit 1\n".format(error_file))
            subprocess.run(["/usr/bin/env", "bash", script_path])
            if exists(error_file):
                print("An error has occurred in the run, exiting.")
                info("An error has occurred in the run, exiting.")
                if do_send_mail:
                    sendmail(email, passw, "an error occurred")
                exit(1)
        # read experiment results
        exp_res_file = join(experiment_dir, "results", "results.pkl")
        with open(exp_res_file, "rb") as f:
            res_data = pickle.load(f)
        results[run_id] = res_data
        result_paths[run_id] = exp_res_file

    # messages = []
    total_results = {}
    # show results
    for stat in stat_functions:
        info("Results regarding {} statistic:".format(stat))
        print_vals = {}
        for run_id in results:
            print_vals[run_id] = {}
            for m in eval_measures:
                for run in run_types:
                    for ag in aggr_measures:
                        try:
                            results[run_id][run][m][ag]
                        except KeyError:
                            continue
                        header = "{}.{}.{}.{}".format(run[:3], m[:3], ag[:3], stat)
                        if stat in "var mean std".split():
                            val = results[run_id][run][m][ag][stat]
                            if val is None:
                                continue
                            val = round(val, 4)
                            print_vals[run_id][header] = val
        # print'em
        info("SCORES:")
        print_dataframe_results(print_vals)
        total_results[stat] = print_vals

    info("Writing these results to file {}".format(results_file))
    total_df = pd.DataFrame.from_dict(total_results, orient='index')
    if total_df.size == 0:
        info("No results parsed.")
    else:
        total_df.to_csv(results_file)

    if skipped_configs:
        for s, sk in enumerate(skipped_configs):
            info("Skipped incomplete config: {}/{} : {}".format(s + 1, len(skipped_configs), sk))

    if do_sstests:
        do_stat_sig_testing(sstests, sstests_measures, sstests_aggregations, configs, results, sstests_limit_vars)

    # [info(msg) for msg in messages]
    if do_send_mail:
        sendmail(email, passw, "run complete.")

def set_tag(self, tag: Tag):
    if tag.tag_id not in [t.tag_id for t in utils.as_list(self.tags)]:
        return False
    self.tags[tag.tag_id] = tag
    return True

def speed(self):
    return utils.as_list(self.tags)[0].speed

def direction(self):
    return utils.as_list(self.tags)[0].direction

def acceleration(self):
    return utils.as_list(self.tags)[0].acceleration

def get_player_locations(self):
    return [t.location for t in utils.as_list(self.tags)]

def __init__(self, config):
    super().__init__(config)
    if config is None:
        return
    # pass the linking value(s)
    self.links = as_list(config)

""" self.already_run.append((sync, None)) def can_sync_run(self, sync, args = None): """ Return False if a sync should not be run, True otherwise """ if sync.name in self.exclude: logger.debug('sync "%s" has been excluded', sync.name) return False if not self.allow_rerun and (sync, args) in self.already_run: logger.debug('sync "%s" (%s) has already been run', sync.name, args) return False return True # run! run! run! args = args.parse_args() if args.get('verbosity'): logging.basicConfig(level=logging.DEBUG) synctrex = Synctrex(**args) config = args.get('config') for config in utils.as_list(config): synctrex.load(config) synctrex.prepare() synctrex.run()