def preprocess(self, dataset):
    """Extract features/labels from every file in *dataset*, in parallel.

    Feature extraction is dispatched to a pathos worker pool. If the
    pipeline holds GPU-backed (cupy) objects that cannot be pickled for
    the workers, fall back to a synchronous loop in this process.

    :param dataset: Dataset whose data files are run through the pipeline.
    """
    try:
        pool = Pool(nodes=self.n_jobs)
        results = [
            pool.apipe(self._extract_features, data_file, self.pipeline,
                       dataset.is_metamapped())
            for data_file in dataset.get_data_files()
        ]
        # Poll until every asynchronous job has finished.
        while not all(job.ready() for job in results):
            time.sleep(1)
        for job in results:
            X, y = job.get()
            self.X_data += X
            self.y_data += y
    except TypeError as error:
        if str(error) == "can not serialize 'cupy.core.core.ndarray' object":
            logging.info(
                'Ran into GPU error. Switching to synchronous preprocessing...'
            )
            # BUG FIX: the original used '==' (a no-op comparison) instead
            # of '=', so results gathered before the failure were never
            # cleared and would have been duplicated by the retry below.
            self.X_data = []
            self.y_data = []
            for data_file in dataset.get_data_files():
                features, labels = self._extract_features(
                    data_file, self.pipeline, dataset.is_metamapped())
                self.X_data += features
                self.y_data += labels
        else:
            # Unrelated TypeErrors were silently swallowed before,
            # leaving the data half-populated; re-raise them instead.
            raise
def preprocess(self):
    """Run every csv extraction step on a process pool and collect results."""
    # The orderline table is large, so its extraction is configurable.
    extract_orderline = conf['extract_orderline']

    worker_pool = Pool()
    start = time.time()

    # Dispatch every extraction step asynchronously.
    if extract_orderline:
        async_orderline = worker_pool.apipe(self.create_orderline,
                                            return_dataframe=False)
    async_warehouse = worker_pool.apipe(self.create_warehouse)
    async_district = worker_pool.apipe(self.create_district)
    async_order = worker_pool.apipe(self.create_order)
    async_customer = worker_pool.apipe(self.create_customer)
    async_stock = worker_pool.apipe(self.create_stock)

    # Wait for all workers to finish before gathering their results.
    worker_pool.close()
    worker_pool.join()

    list_of_processed_files = [
        async_warehouse.get(),
        async_district.get(),
        async_order.get(),
        async_customer.get(),
        async_stock.get(),
    ]
    if extract_orderline:
        list_of_processed_files.append(async_orderline.get())

    end = time.time()
    self.debug("Preprocessing of csv file took {}s".format(end - start))
    return list_of_processed_files
def main():
    """Launch NUM_RUNS GA searches per test function and genome encoding.

    5 trials; writes online performance to text file
    function_representation_trial#.txt.

    The endpoint in each interval must be end - step to make sure the
    number of discrete points on each axis is a power of 2. E.g. if the
    literature says the search space is -5.12 <= x <= 5.12 with resolution
    \\delta x = 0.01, input (-5.12, 5.11, 0.01) as the interval.
    """
    pool = Pool(mp.cpu_count())
    jobs = []
    funcs = [tf.f1, tf.f2, tf.f3, tf.f4, tf.f5]
    ranges = [(-5.12, 5.11, 0.01), (-2.048, 2.047, 0.001),
              (-5.12, 5.11, 0.01), (-1.28, 1.27, 0.01),
              (-65.536, 65.535, 0.001)]
    # One (encoding constant, filename tag) pair per genome representation.
    # Collapsing the four near-identical apipe calls removes the copy/paste
    # duplication; the unused 'search' lambda was dead code and is dropped.
    encodings = [(NGG_CODE, "NGG"), (UBL_CODE, "UBL"),
                 (GRAY_CODE, "BRG"), (BINARY_CODE, "BIN")]
    for j in range(1, len(funcs) + 1):
        print(str(funcs[j - 1]))
        for i in range(1, NUM_RUNS + 1):
            for code, tag in encodings:
                jobs.append(pool.apipe(
                    GA_SEARCH, m, c, p, g, code,
                    "f" + str(j) + "_" + tag + "_T" + str(i),
                    funcs[j - 1], ranges[j - 1], min))
    # Block until every run has completed.
    for job in jobs:
        job.get()
def fit(self, dataset):
    """
    Runs dataset through the designated pipeline, extracts features, and
    fits a conditional random field.

    :param training_data_loader: Instance of Dataset.
    :return model: a trained instance of a sklearn_crfsuite.CRF model.
    """
    # Guard clauses: validate inputs before any expensive work.
    if not isinstance(dataset, Dataset):
        raise TypeError(
            "Must pass in an instance of Dataset containing your training files"
        )
    if not isinstance(self.pipeline, BasePipeline):
        raise TypeError(
            "Model object must contain a medacy pipeline to pre-process data"
        )

    pool = Pool(nodes=self.n_jobs)
    async_results = [
        pool.apipe(self._extract_features, data_file, self.pipeline,
                   dataset.is_metamapped())
        for data_file in dataset.get_data_files()
    ]

    # Poll once per second until every worker reports completion.
    while not all(res.ready() for res in async_results):
        time.sleep(1)

    # Accumulate the per-file feature and label sequences.
    for res in async_results:
        X, y = res.get()
        self.X_data += X
        self.y_data += y

    logging.info("Currently Waiting")

    learner_name, learner = self.pipeline.get_learner()
    logging.info("Training: %s", learner_name)

    assert self.X_data, "Training data is empty."

    train_data = [x[0] for x in self.X_data]
    learner.fit(train_data, self.y_data)
    logging.info("Successfully Trained: %s", learner_name)

    self.model = learner
    return self.model
def wrapper(*args, **kwargs):
    """Parallel dispatcher: splits the data argument into batches, maps
    `fn` over them on a pathos pool, and merges the per-batch results.

    NOTE(review): `fn`, `workers`, `batch_size`, `shuffle`, `after_hook`
    and `reduce_seqs` come from the enclosing decorator scope, which is
    not visible here — confirm against the decorator factory.
    """
    obj, data, _args = tuple(), tuple(), tuple()
    # Bound-method case: args[0] is the instance and args[1] the data;
    # otherwise the data is the first positional argument.
    if hasattr(args[0].__class__, fn.__name__):
        obj, data, *_args = args
        obj = (obj, )
    else:
        data, *_args = args
    if type(data) != list:
        data = list(data)
    total_size = len(data)
    # Default batch size spreads the data roughly evenly across workers.
    _batch_size = total_size // workers + 1 if batch_size is None else batch_size
    # assert type(data) == list, "Type of data must be list"
    print(
        f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: parallel for {fn.__qualname__}."
    )
    if shuffle:
        print(
            f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: shuffle data for {fn.__qualname__}."
        )
        # In-place shuffle; note this mutates the caller's list when a
        # list was passed in.
        random.shuffle(data)
    pool = Pool(workers)
    # Reset the pool in case a previous (possibly failed) run left it in
    # a dirty state — presumably needed because pathos pools are cached;
    # TODO confirm.
    pool.terminate()
    pool.restart()
    proc = []
    # Pair each batch start offset with its (exclusive) end offset.
    for beg, end in zip(
            range(0, total_size, _batch_size),
            range(_batch_size, total_size + _batch_size, _batch_size)):
        batch = data[beg:end]
        p = pool.apipe(fn, *obj, batch, *_args, **kwargs)
        proc.append(p)
    pool.close()
    pool.join()
    # Gather per-batch outputs in submission order and flatten them.
    result = reduce_seqs([p.get() for p in proc])
    if after_hook is not None:
        result = after_hook(result)
    return result
def _exec_sample(X):
    """Evaluate `func` over the sample X, split across n_cpus workers.

    Returns an ot.NumericalSample of the flattened per-chunk results;
    falls back to a direct synchronous call on ValueError.
    """
    from pathos.multiprocessing import ProcessingPool
    try:
        p = ProcessingPool(n_cpus)
        X = np.array(X)
        x = np.array_split(X, n_cpus)
        # BUG FIX: np.array_split always yields n_cpus chunks, padding with
        # empty arrays when len(X) < n_cpus. Dispatching those empty chunks
        # made func raise and forced the slow fallback path below. Only
        # dispatch the non-empty chunks (mirrors the fix applied in the
        # ot.Sample variant of this helper).
        n_active = min(len(X), n_cpus)
        pipe = []
        for i in range(n_active):
            pipe.append(p.apipe(func, x[i]))
        rs = []
        for i in range(n_active):
            rs.append(pipe[i].get())
        # Flatten the list of per-chunk result lists.
        rs = [item for sublist in rs for item in sublist]
        return ot.NumericalSample(rs)
    except ValueError:
        # Get there if the chunk size left some single evaluations left
        return func(X)
def preprocess(self, dataset, asynchronous=False):
    """
    Preprocess dataset into a list of sequences and tags.

    :param dataset: Dataset object to preprocess.
    :param asynchronous: Boolean for whether the preprocessing should be done asynchronously.
    """
    # Start from clean accumulators in either mode.
    self.X_data = []
    self.y_data = []

    if asynchronous:
        logging.info('Preprocessing data asynchronously...')
        pool = Pool(nodes=self.n_jobs)
        jobs = [pool.apipe(self._extract_features, data_file)
                for data_file in dataset]

        # Poll once per second until every worker is done.
        while not all(job.ready() for job in jobs):
            time.sleep(1)

        for job in jobs:
            X, y = job.get()
            self.X_data += X
            self.y_data += y
    else:
        logging.info('Preprocessing data synchronously...')
        # Run all Docs through the pipeline before extracting features,
        # allowing for pipeline components that require inter-dependent
        # doc objects
        docs = [self._run_through_pipeline(data_file)
                for data_file in dataset]
        for doc in docs:
            features, labels = self._extract_features(doc)
            self.X_data += features
            self.y_data += labels
def _exec_sample(X):
    """Evaluate `func` over X in parallel chunks; fall back to a direct
    call if the chunking left some single evaluations behind."""
    from pathos.multiprocessing import ProcessingPool
    try:
        worker_pool = ProcessingPool(n_cpus)
        X = np.array(X)
        chunks = np.array_split(X, n_cpus)
        # array_split pads with empty arrays when len(X) < n_cpus, so only
        # the first min(len(X), n_cpus) chunks carry work.
        n_active = min(len(X), n_cpus)
        handles = [worker_pool.apipe(func, chunks[k]) for k in range(n_active)]
        per_chunk = [handles[k].get() for k in range(n_active)]
        # Flatten the per-chunk result lists into a single flat list.
        flat = [item for chunk_result in per_chunk for item in chunk_result]
        return ot.Sample(flat)
    except ValueError:
        return func(X)
def main():
    """Parse CLI arguments and run vanilla-policy-gradient experiments,
    one subprocess per experiment/seed."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--logdir', '-dir', type=str, default='data')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna',
                        action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=1)
    parser.add_argument('--size', '-s', type=int, default=32)
    parser.add_argument('--gae', '-gae', action='store_true')
    parser.add_argument('--lambd', '-ld', type=float, default=1.0)
    parser.add_argument('--threads', '-th', type=int, default=1)
    parser.add_argument('--max_threads_pool', '-max_tp', type=int, default=16)
    parser.add_argument('--thread_timeout', '-th_to', type=int, default=None)
    parser.add_argument('--offpol', '-ofp', action='store_true')
    parser.add_argument('--n_iter_pol', '-np', type=int, default=1)
    parser.add_argument('--n_iter_pol_sched', '-nps', type=str,
                        default='const', choices=['const', 'exp_dec'])
    parser.add_argument('--n_iter_pol_exp_base', '-npexpb', type=int,
                        default=5)
    parser.add_argument('--n_iter_pol_exp_decay', '-npexpd', type=float,
                        default=0.95)
    parser.add_argument('--weight_importance_samp', '-wis',
                        action='store_true')
    parser.add_argument('--record', '-rec', type=int, default=None)
    args = parser.parse_args()

    # Optional schedule for the number of off-policy iterations:
    # exponentially decays n_iter_pol with the outer iteration number.
    it_pol_fn = None
    if args.offpol:
        if args.n_iter_pol_sched == 'exp_dec':
            it_pol_fn = lambda it: \
                int(np.ceil(args.n_iter_pol *
                            pow(args.n_iter_pol_exp_decay,
                                it / args.n_iter_pol_exp_base)))

    # Create a timestamped log directory under args.logdir.
    if not (os.path.exists(args.logdir)):
        os.makedirs(args.logdir)
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(args.logdir, logdir)
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    # ep_len <= 0 means "no cap on episode length".
    max_path_length = args.ep_len if args.ep_len > 0 else None

    start = time.time()
    for e in range(args.n_experiments):
        # Each experiment gets a distinct, deterministic seed.
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_PG(exp_name=args.exp_name,
                     env_name=args.env_name,
                     n_iter=args.n_iter,
                     gamma=args.discount,
                     min_timesteps_per_batch=args.batch_size,
                     max_path_length=max_path_length,
                     learning_rate=args.learning_rate,
                     reward_to_go=args.reward_to_go,
                     animate=args.render,
                     logdir=os.path.join(logdir, '%d' % seed),
                     normalize_advantages=not (args.dont_normalize_advantages),
                     nn_baseline=args.nn_baseline,
                     seed=seed,
                     n_layers=args.n_layers,
                     size=args.size,
                     gae=args.gae,
                     lambd=args.lambd,
                     threads=args.threads,
                     max_threads_pool=args.max_threads_pool,
                     thread_timeout=args.thread_timeout,
                     offpol=args.offpol,
                     n_it_pol=args.n_iter_pol,
                     n_it_pol_fn=it_pol_fn,
                     wis=args.weight_importance_samp,
                     record=args.record)

        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        # p = Process(target=train_func)
        # p.start()
        # p.join()
        # A fresh one-worker pool per experiment gives each run its own
        # process (and thus a fresh TF state); .get() blocks until done.
        p = ProcessingPool(1)
        p.apipe(train_func).get()
        p.clear()
    print('All training took: {:.3f}s'.format(time.time() - start))
class RPKI_Validator_Wrapper:
    """This class gets validity data from ripe

    Manages the lifecycle of RIPE's RPKI Validator 3: installs it on
    first use, serves it the prefix/origin input file via RPKI_File,
    starts it in a background process, and queries its HTTP API.
    """

    __slots__ = ['total_prefix_origin_pairs', "_process", "_table_input",
                 "_rpki_file"]

    # Sorry for the crazy naming scheme, must be done to avoid
    # having install file names in multiple locations
    temp_install_path = "/tmp/temp_rpki_validator_install"
    rpki_package_path = RPKI_PACKAGE_PATH
    rpki_run_name = RPKI_RUN_NAME
    rpki_run_path = RPKI_PACKAGE_PATH + RPKI_RUN_NAME
    rpki_db_paths = [RPKI_PACKAGE_PATH + x for x in ["db/", "rsync/"]]
    port = 8080
    api_url = "http://[::1]:8080/api/"

    def __init__(self, **kwargs):
        """Configures logging and installs the validator if it is missing."""
        config_logging(kwargs.get("stream_level", logging.INFO),
                       kwargs.get("section"))
        self._table_input = kwargs.get("table_input", "mrt_rpki")
        # Absence of the package directory is treated as "not installed".
        if not os.path.exists(self.rpki_package_path):
            logging.warning("Looks like validator is not installed")
            logging.warning("Installing validator now")
            RPKI_Validator_Wrapper.install(**kwargs)

    #################################
    ### Context Manager Functions ###
    #################################

    def __enter__(self):
        """Runs the RPKI Validator"""

        utils.kill_port(self.port)
        # Must remove these to ensure a clean run
        utils.clean_paths(self.rpki_db_paths)
        # NOTE(review): chown to root:root implies this is expected to run
        # with root privileges — confirm deployment assumptions.
        cmds = [f"cd {self.rpki_package_path}",
                f"chown -R root:root {self.rpki_package_path}"]
        utils.run_cmds(cmds)
        # Writes validator file and serves it
        # Can't use cntext manager here since it returns it
        self._rpki_file = RPKI_File(self._table_input)
        self._rpki_file.spawn_process()
        # Start the validator itself in a background worker process.
        self._process = ProcessingPool()
        self._process.apipe(self._start_validator)
        self.total_prefix_origin_pairs = self._rpki_file.total_lines
        return self

    def __exit__(self, type, value, traceback):
        """Closes RPKI Validator"""

        self._process.close()
        self._process.terminate()
        self._process.join()
        self._process.clear()
        utils.kill_port(self.port, wait=False)
        logging.debug("Closed rpki validator")
        self._rpki_file.close()

    def _start_validator(self):
        """Sends start cmd to RPKI Validator"""

        logging.info("Starting RPKI Validator")
        utils.run_cmds((f"cd {self.rpki_package_path} && "
                        f"./{self.rpki_run_name}"))

    #########################
    ### Wrapper Functions ###
    #########################

    def load_trust_anchors(self):
        """Loads all trust anchors"""

        utils.write_to_stdout(f"{datetime.now()}: Loading RPKI Validator\n",
                              logging.root.level)
        # Give the validator a minute to come up, then poll every 10s
        # until all trust anchors report completed validation.
        time.sleep(60)
        while self._get_validation_status() is False:
            time.sleep(10)
            utils.write_to_stdout(".", logging.root.level)
        utils.write_to_stdout("\n", logging.root.level)
        self._wait(30, "Waiting for upload to bgp preview")

    def make_query(self, api_endpoint: str, data=True) -> dict:
        """Makes query to api of rpki validator"""

        result = utils.get_json(os.path.join(self.api_url, api_endpoint),
                                RPKI_Validator_Wrapper.get_headers())
        # Most endpoints wrap the payload in a "data" key.
        return result["data"] if data else result

    def get_validity_data(self) -> dict:
        """Gets the data from ripe and formats it for csv insertions"""

        logging.info("Getting data from ripe")
        assert self.total_prefix_origin_pairs < 10000000, "page size too small"
        # Then we get the data from the ripe RPKI validator
        # Todo for later, change 10mil to be total count
        return self.make_query("bgp/?pageSize=10000000")

    ########################
    ### Helper Functions ###
    ########################

    def _wait(self, time_to_sleep: int, msg: str):
        """logs a message and waits

        NOTE(review): when the root log level is not INFO this returns
        immediately without sleeping — confirm that is intended.
        """

        logging.debug(msg)
        if logging.root.level == logging.INFO:
            # Number of times per second to update tqdm
            divisor = 100
            for _ in trange(time_to_sleep * divisor, desc=msg):
                time.sleep(1 / divisor)

    def _get_validation_status(self) -> bool:
        """Returns row count of json object for waiting"""

        try:
            for x in self.make_query("trust-anchors/statuses"):
                if x["completedValidation"] is False:
                    # If anything has not been validated return false
                    return False
            # All are validated. Return true
            return True
        except urllib.error.URLError as e:
            # Validator is not (yet) accepting connections; back off.
            self._wait(60, "Connection was refused")
            return False

    ######################
    ### Static methods ###
    ######################

    @staticmethod
    def get_validity_dict() -> dict:
        """Returns the validity dict for the RPKI Validator to decode results

        I could have this as a class attribute but too messy I think.
        """

        return {"VALID": ROA_Validity.VALID.value,
                "UNKNOWN": ROA_Validity.UNKNOWN.value,
                "INVALID_LENGTH": ROA_Validity.INVALID_BY_LENGTH.value,
                "INVALID_ASN": ROA_Validity.INVALID_BY_ORIGIN.value}

    @staticmethod
    def get_headers() -> dict:
        """Gets the headers for all url queries to the validator"""

        # NOTE(review): "Upgrade-Insecure-Requests" is an int value while
        # every other header is a str — verify the HTTP client accepts it.
        return {"Connection": "keep-alive",
                "Cache-Control": "max-age=0",
                "Upgrade-Insecure-Requests": 1,
                "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64)"
                               " AppleWebKit/537.36 (KHTML, like Gecko) "
                               "Chrome/73.0.3683.86 Safari/537.36"),
                "Accept": ("text/html,application/xhtml+xml,"
                           "application/xml;q=0.9,image/webp,"
                           "image/apng,*/*;q=0.8,"
                           "application/signed-exchange;v=b3"),
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"}

    #########################
    ### Install Functions ###
    #########################

    @staticmethod
    def install(**kwargs):
        """Installs RPKI validator with our configs.

        This might break in the future, but we need to do it this way
        for now to be able to do what we want with our own prefix origin
        table.
        """

        config_logging(kwargs.get("stream_level", logging.DEBUG),
                       kwargs.get("section"))
        # Wipe any previous (partial) install before starting over.
        utils.delete_paths([RPKI_Validator_Wrapper.rpki_package_path,
                            RPKI_Validator_Wrapper.temp_install_path])
        RPKI_Validator_Wrapper._download_validator()
        RPKI_Validator_Wrapper._change_file_hosted_location()
        path = RPKI_Validator_Wrapper._change_server_address()
        RPKI_Validator_Wrapper._config_absolute_paths(path)

    @staticmethod
    def _download_validator():
        """Downloads validator into proper location"""

        rpki_url = ("https://ftp.ripe.net/tools/rpki/validator3/beta/generic/"
                    "rpki-validator-3-latest-dist.tar.gz")
        arin_tal = ("https://www.arin.net/resources/manage/rpki/"
                    "arin-ripevalidator.tal")
        # This is the java version they use so we will use it
        cmds = [f"mkdir {RPKI_Validator_Wrapper.temp_install_path}",
                f"cd {RPKI_Validator_Wrapper.temp_install_path}",
                "sudo apt-get -y install openjdk-8-jre",
                f"wget {rpki_url}",
                "tar -xvf rpki-validator-3-latest-dist.tar.gz",
                "rm -rf rpki-validator-3-latest-dist.tar.gz",
                f"mv rpki-validator* {RPKI_Validator_Wrapper.rpki_package_path}",
                f"cd {RPKI_Validator_Wrapper.rpki_package_path}",
                "cd preconfigured-tals",
                f"wget {arin_tal}"]
        utils.run_cmds(cmds)

    @staticmethod
    def _change_file_hosted_location():
        """Changes location of input ann for bgp preview file"""

        # Changes where the file is hosted
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application-defaults.properties")
        prepend = "rpki.validator.bgp.ris.dump.urls="
        replace = ("https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz,"
                   "https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz")
        # Point the validator at our locally-hosted RPKI_File instead of
        # RIPE's public RIS dumps.
        replace_with = (f"http://localhost:{RPKI_File.port}"
                        f"/{RPKI_File.hosted_name}")
        utils.replace_line(path, prepend, replace, replace_with)

    @staticmethod
    def _change_server_address():
        """Prob because of a proxy, but on our server this is necessary"""

        # Changes the server address
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application.properties")
        prepend = "server.address="
        replace = "localhost"
        replace_with = "0.0.0.0"
        utils.replace_line(path, prepend, replace, replace_with)
        # Returned so _config_absolute_paths can edit the same file.
        return path

    @staticmethod
    def _config_absolute_paths(path):
        """Configure rpki validator to run off absolute paths

        This is necessary due to script being called from elsewhere
        In other words not from inside the RPKI dir.
        """

        # Since I am calling the script from elsewhere these must be
        # absolute paths
        prepend = "rpki.validator.data.path="
        replace = "."
        # Must remove trailing backslash at the end
        replace_with = RPKI_Validator_Wrapper.rpki_package_path[:-1]
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.preconfigured.trust.anchors.directory="
        replace = "./preconfigured-tals"
        replace_with = (f"{RPKI_Validator_Wrapper.rpki_package_path}"
                        "preconfigured-tals")
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.rsync.local.storage.directory="
        replace = "./rsync"
        replace_with = f"{RPKI_Validator_Wrapper.rpki_package_path}rsync"
        utils.replace_line(path, prepend, replace, replace_with)
class RPKI_File:
    """This class gets validity data from ripe

    Builds the unique prefix/origin csv, gzips it, and serves it over a
    local HTTP server so the RPKI Validator can ingest it.
    """

    __slots__ = ["path", "total_lines", "_process"]

    _dir = "/tmp/"
    hosted_name = "upo_csv_path.csv.gz"
    port = 8000

    def __init__(self, table_input):
        """Downloads and stores roas from a json"""

        # Uncompressed csv path; the .gz is produced by _gzip_file below.
        self.path = self._dir + self.hosted_name.replace(".gz", "")
        with Unique_Prefix_Origins_Table(clear=True) as _db:
            _db.fill_table(table_input)
            _db.copy_table(self.path)
        self.total_lines = utils.get_lines_in_file(self.path)
        self._gzip_file()

    #################################
    ### Context Manager Functions ###
    #################################

    def __enter__(self):
        """What to do when the context manager is called on this class

        Starts the process for serving the file"""

        self.spawn_process()
        return self

    def __exit__(self, type, value, traceback):
        """Closes the file process"""

        self.close()

    ############################
    ### Serve File Functions ###
    ############################

    def spawn_process(self):
        """Spawns file serving process"""

        # Free the port first, then serve the file from a worker process.
        utils.kill_port(self.port)
        self._process = ProcessingPool()
        self._process.apipe(self._serve_file)
        logging.debug("Served RPKI File")

    def close(self):
        """Closes file process"""

        utils.kill_port(self.port, wait=False)
        self._process.close()
        self._process.terminate()
        self._process.join()
        self._process.clear()
        # changed to absolute path
        utils.delete_paths(os.path.join(self._dir, self.hosted_name))
        logging.debug("Closed RPKI File")

    ########################
    ### Helper Functions ###
    ########################

    def _gzip_file(self):
        """gzips the file for proper formatting in rpki validator"""

        with open(self.path, 'rb') as f_in, gzip.open(
                os.path.join(self._dir, self.hosted_name), 'wb') as f_out:
            f_out.writelines(f_in)
        # The uncompressed original is no longer needed.
        utils.delete_paths(self.path)

    def _serve_file(self):
        """Makes a simple http server and serves a file in /tmp"""

        # NOTE(review): this subclass adds no behavior over
        # SimpleHTTPRequestHandler — possibly a leftover; confirm.
        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

        # Changes directory to be in /tmp
        os.chdir(self._dir)
        # Serve the file on port 8000
        socketserver.TCPServer(("", RPKI_File.port),
                               Handler).serve_forever()