from contextlib import contextmanager
import torch.multiprocessing as mp

@contextmanager
def fs_sharing():
    # Temporarily switch to the 'file_system' sharing strategy, restoring the
    # previous strategy when the with-block exits (even on error).
    prev_strategy = mp.get_sharing_strategy()
    mp.set_sharing_strategy('file_system')
    try:
        yield
    finally:
        mp.set_sharing_strategy(prev_strategy)
from contextlib import contextmanager
from torch import multiprocessing

@contextmanager
def fs_sharing():
    # Same pattern as above, using the full module name instead of the `mp` alias.
    prev_strategy = multiprocessing.get_sharing_strategy()
    multiprocessing.set_sharing_strategy('file_system')
    try:
        yield
    finally:
        multiprocessing.set_sharing_strategy(prev_strategy)
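# A minimal usage sketch for the fs_sharing() context manager above (illustrative
# only; assumes the @contextmanager decorator and a plain torch DataLoader, with
# toy data standing in for a real dataset). The 'file_system' strategy is active
# only inside the with-block; the previous strategy is restored afterwards.
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(64, 3))  # toy data for illustration

with fs_sharing():
    loader = DataLoader(dataset, batch_size=16, num_workers=2)
    for (batch,) in loader:
        pass  # worker-to-main tensor transfers use 'file_system' sharing here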
def read_data(dataset: Union[Video_2D_Inference, Video_3D_Inference],
              batch_size: int, num_worker: int, data_queue: mp.Queue):
    mp.set_sharing_strategy('file_system')
    for item in DataLoader(dataset, batch_size=batch_size, num_workers=num_worker):
        data_queue.put(item)
def main(): print("{:=^100}".format(' Test ')) print("run parameters: {}".format(sys.argv)) import torch.multiprocessing as mp mp.set_sharing_strategy("file_system") #is this necessary?ss # if output path does not exist, create it if not os.path.exists(output_path): os.makedirs(output_path) model = MPNN(T=args.T, p=args.p, target=args.target, output_type=args.output_type, output_dim=args.output_dim, readout_dim=args.readout_dim) model.load_state_dict(torch.load(args.model_path)) model.eval() processes = [] for rank in range(args.n_test_process): p = mp.Process(target=test, args=(rank, model)) p.start() processes.append(p) print("joining {} processes.".format(len(processes))) for p in processes: p.join()
def run_spawn(config):
    world_size = config.world_size
    master_address = config.federator_host
    nic = config.nic
    mp.set_sharing_strategy("file_system")
    mp.set_start_method("spawn", True)
    mp.spawn(run_single,
             args=(world_size, master_address, config, nic),
             nprocs=world_size,
             join=True)
def full_run(self, n_runs_per_device, n_processes_per_device, devices_list,
             epochs_per_simulation: Union[int, List[int]]):
    """
    Runs the experiment with multiple seeds, distributing the simulations
    across different devices and saving the results to disk.
    """
    if isinstance(epochs_per_simulation, int):
        epochs_per_simulation = [epochs_per_simulation]

    multiprocessing.set_sharing_strategy('file_system')
    context = multiprocessing.get_context('spawn')

    experiment_id = 0
    for simulation_factory, epochs in zip(self.simulation_factories,
                                          cycle(epochs_per_simulation)):
        # Create a pool for each device.
        pools = []
        for device in devices_list:
            pools.append(context.Pool(processes=n_processes_per_device))

        # For each pool execute jobs.
        results = []
        for pool_id, (pool, device) in enumerate(zip(pools, devices_list)):
            args = []
            for i in range(n_runs_per_device):
                args.append((
                    simulation_factory,
                    pool_id * n_runs_per_device + i,  # seed
                    device,
                    epochs))
            results.append(pool.starmap_async(_job, args))

        # Save all runs of the current simulation configuration to disk.
        all_outputs = []
        for pool, result in zip(pools, results):
            all_outputs += result.get()
            pool.close()
            pool.join()

        # Now process all the outputs to save only what we need.
        processed_outputs = []
        for simulation in all_outputs:
            simulation_history, used_seed, used_device = simulation
            processed_output = self.handle_simulation_output(simulation_history)
            processed_outputs.append((processed_output, used_seed, used_device))

        # We create a new simulation object to get an identifier.
        simulation_identifier = self.construct_simulation_identifier(
            simulation_factory(0, torch.device('cpu')))

        # Write processed_outputs to disk.
        file_path = _outputs_prefix + self.name + '/experiment_' + \
            str(experiment_id)
        save_to_disk((simulation_identifier, processed_outputs), file_path)
        experiment_id += 1
def run_in_parallel(times_per_device, n_processes_per_device,
                    simulation_parameters, pytorch_configs,
                    process_initialiser=None, initialiser_args=()):
    """
    Executes the set-up experiment on the given devices.

    Parameters:
        times_per_device        The number of times the given experiment has to
                                be repeated on each device.
        n_processes_per_device  The number of independent processes to be used
                                for each pytorch_config.
        simulation_parameters   An instance of a subclass of the
                                SimulationParameters class.
        pytorch_configs         A list of PyTorchConfig objects specifying the
                                devices and data types to be used. For example,
                                this list could contain two GPUs; then, setting
                                n_processes_per_device = 2 (or more), both GPUs
                                will be used to execute the simulations.
        process_initialiser     Each process will be initialised by calling this
                                function.
        initialiser_args        Arguments to be passed to the initialiser
                                function. This parameter must be a list,
                                providing the arguments for each different
                                device.
    """
    multiprocessing.set_sharing_strategy('file_system')
    context = multiprocessing.get_context('spawn')

    pools = []
    for config, initargs in \
            itertools.zip_longest(pytorch_configs, initialiser_args):
        pools.append(
            context.Pool(processes=n_processes_per_device,
                         initializer=process_initialiser,
                         initargs=initargs))

    results = []
    for pool_id, (pool, config) in enumerate(zip(pools, pytorch_configs)):
        args = []
        for i in range(times_per_device):
            args.append((pool_id * times_per_device + i,
                         simulation_parameters, config))
        results.append(pool.starmap_async(job, args))

    all_outputs = []
    for pool, result in zip(pools, results):
        all_outputs += result.get()
        pool.close()
        pool.join()

    return _process_all_outputs(all_outputs)
def main():
    import torch.multiprocessing as mp
    mp.set_sharing_strategy("file_system")
    mp = mp.get_context("forkserver")  # use a forkserver context for the worker processes

    from src.model import MPNN

    torch.manual_seed(args.seed)
    print("{:=^100}".format(' Train '))
    print("experiment: {}".format(args.exp_name))
    print("run parameters: {} \n".format(sys.argv))

    model_path = args.model_path

    print("instantiating model...")
    model = MPNN(T=args.T,
                 p=args.p,
                 target=args.target,
                 output_type=args.output_type,
                 output_dim=args.output_dim,
                 readout_dim=args.readout_dim)
    if model_path is not None:
        model.load_state_dict(torch.load(model_path))
    model.share_memory()
    print(model)

    # Train the model
    print("Training Model...")
    processes = []
    for rank in range(args.n_train_process):
        p = mp.Process(target=train, args=(rank, args, model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print("Finished training model")
def log(args):
    '''Folder settings when saving training results'''
    if not os.path.exists('result') and not args.debug:
        os.makedirs('result')
    if not os.path.exists('result/' + args.info) and not args.debug:
        os.mkdir('result/' + args.info)
    if not os.path.exists('result/' + args.info + '/img') and not args.debug:
        os.mkdir('result/' + args.info + '/img')
    if not os.path.exists('result/' + args.info + '/scripts') and not args.debug:
        os.mkdir('result/' + args.info + '/scripts')
    if not os.path.exists('result/' + args.info + '/ckp') and not args.debug:
        os.mkdir('result/' + args.info + '/ckp')

    print('[*] Info:', time.ctime())
    print('[*] Info:', os.path.basename(__file__))

    # if not args.debug and args.log == True and args.resume == False:
    if not args.debug and not args.resume:
        from shutil import copyfile, copytree
        copyfile(os.path.basename(__file__),
                 'result/' + args.info + '/scripts/' + os.path.basename(__file__))
        copyfile('config.py', 'result/' + args.info + '/scripts/config.py')
        copyfile('head.py', 'result/' + args.info + '/scripts/head.py')
        copyfile('train.py', 'result/' + args.info + '/scripts/train.py')
        copyfile('test.py', 'result/' + args.info + '/scripts/test.py')
        copytree('./data_loader/', 'result/' + args.info + '/scripts/data_loader')
        copytree('./model/', 'result/' + args.info + '/scripts/model')
        copytree('./utils/', 'result/' + args.info + '/scripts/utils')

    sys.stdout = Unbuffered(sys.stdout)
    torch.cuda.set_device(args.gpu_idx)

    from torch import multiprocessing
    multiprocessing.set_sharing_strategy('file_system')
    torch.set_num_threads(1)
def main(data_dir: str, save_dir: str, segment: int):
    mp.set_sharing_strategy("file_system")
    os.makedirs(save_dir, exist_ok=True)

    wav2mel = Wav2Mel()
    file2mel = partial(process_files, wav2mel=wav2mel)

    meta_data = {}
    speakers = sorted(os.listdir(data_dir))
    for spk in tqdm(speakers):
        spk_dir = os.path.join(data_dir, spk)
        wav_files = librosa.util.find_files(spk_dir)
        mels = [file2mel(wav_file) for wav_file in wav_files]
        mels = list(
            filter(lambda x: x is not None and x.shape[-1] > segment, mels))
        rnd_paths = [f"{uuid4().hex}.pt" for _ in range(len(mels))]
        for mel, path in zip(mels, rnd_paths):
            torch.save(mel, os.path.join(save_dir, path))
        meta_data[spk] = rnd_paths

    with open(os.path.join(save_dir, "metadata.json"), "w") as f:
        json.dump(meta_data, f, indent=4)
def set_sharing_strategy(new_strategy=None):
    """
    https://pytorch.org/docs/stable/multiprocessing.html
    https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
    https://stackoverflow.com/questions/66426199/how-does-one-setup-the-set-sharing-strategy-strategy-for-multiprocessing-in-pyto
    """
    from sys import platform
    if new_strategy is not None:
        mp.set_sharing_strategy(new_strategy=new_strategy)
    else:
        if platform == 'darwin':  # OS X
            # only sharing strategy available on OS X
            mp.set_sharing_strategy('file_system')
        else:
            # ulimit -n 32767 or ulimit -n unlimited (perhaps later do try/except
            # to execute this increase of the fd limit)
            mp.set_sharing_strategy('file_descriptor')
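# A minimal sketch of how the platform-dependent helper above might be called
# (illustrative only; the dataset and loader are toy placeholders). By default it
# selects 'file_system' on macOS and 'file_descriptor' elsewhere; the latter may
# require raising the open-file limit (e.g. `ulimit -n`).
import torch
from torch.utils.data import DataLoader, TensorDataset

set_sharing_strategy()                   # platform-dependent default
# set_sharing_strategy('file_system')    # or force a specific strategy explicitly

dataset = TensorDataset(torch.randn(128, 8))  # toy data for illustration
loader = DataLoader(dataset, batch_size=32, num_workers=2)
for (batch,) in loader:
    pass  # worker-to-main tensor transfers use the chosen strategy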
def actor(rank, args, T, BEST, memory_queue, model_queue, p2):
    mp.set_sharing_strategy('file_system')
    # rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    # resource.setrlimit(resource.RLIMIT_NOFILE, (args.nofile, rlimit[1]))
    torch.manual_seed(args.seed + rank)
    print("Process {} fighting with {}".format(rank, p2))

    env = gym.make(args.env, java_env_path="..", port=args.port + rank * 2, p2=p2)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    shared_average_model = ActorCritic(env.observation_space, env.action_space,
                                       args.hidden_size)
    memory = EpisodicReplayMemory(args.num_processes, args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:  # Actor loop
        t_value = T.value()
        discard = False
        round_score = 0
        episode_length = 0
        sum_entropy = 0

        if not model_queue.empty():
            print("Process {} going to load new model at EPISODE {}......".format(rank, t_value))
            received_obj = model_queue.get()
            model_dict, average_model_dict = copy.deepcopy(received_obj)
            model.load_state_dict(model_dict)
            shared_average_model.load_state_dict(average_model_dict)
            print("Process {} finished loading new model at EPISODE {}!!!!!!".format(rank, t_value))
            del received_obj

        # Reset or pass on hidden state
        if done:
            hx, avg_hx = torch.zeros(1, args.hidden_size), torch.zeros(1, args.hidden_size)
            cx, avg_cx = torch.zeros(1, args.hidden_size), torch.zeros(1, args.hidden_size)
            # Reset environment and done flag
            try:
                with timeout(seconds=30):
                    s = env.reset()
            except TimeoutError:
                print("Time out to reset env")
                env.close()
                continue
            state = state_to_tensor(s)
            action_mask = [[False for _ in range(56)]]
            action_mask = torch.BoolTensor(action_mask)
            done = False
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()

        # Lists of outputs for training
        policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

        while not done:
            # Calculate policy and values
            policy, Q, V, (hx, cx) = model(state, (hx, cx), action_mask)
            average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                state, (avg_hx, avg_cx), action_mask)

            # Sample action
            action = torch.multinomial(policy, 1)[0, 0]
            sum_entropy += Categorical(probs=policy.detach()).entropy()

            # Step
            next_state, reward, done, info = env.step(action.item())

            # Get valid actions
            valid_actions = info.get('my_action_enough', {})
            if len(valid_actions) > 0:
                action_mask = [[False if i in valid_actions else True for i in range(56)]]
            else:
                action_mask = [[False for _ in range(56)]]
            action_mask = torch.BoolTensor(action_mask)

            round_score += reward
            if info.get('no_data_receive', False):
                env.close()
                discard = True
                memory.append_transition(state, None, None, None,
                                         action_mask.detach(), discard=discard)
                break

            next_state = state_to_tensor(next_state)
            reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards

            # Save (beginning part of) transition for offline training
            memory.append_transition(state, action, reward, policy.detach(),
                                     action_mask.detach())

            # Save just tensors
            for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies),
                               (policy, Q, V, torch.LongTensor([[action]]),
                                torch.Tensor([[reward]]), average_policy)):
                arr.append(el)

            # Increment counters
            t += 1
            episode_length += 1  # Increase episode counter

            # Update state
            state = next_state

        if discard:
            done = True
            continue

        # Finish on-policy episode
        # No need to increment T in the actor
        # T.increment()
        print("""Process: {}, EPISODE: {}, BEST: {}, episode: {}, round_reward: {}"""
              .format(rank, t_value, BEST.value(), t, round_score))

        # Save terminal state for offline training
        memory.append_transition(state, None, None, None, action_mask.detach())
        last_trajectory = copy.deepcopy(memory.last_trajectory())
        on_policy_data = (last_trajectory,
                          (episode_length, round_score, sum_entropy / episode_length))
        send_object = copy.deepcopy(on_policy_data)
        memory_queue.put(send_object)
        print("Process {} send trajectory".format(rank))
        # TODO: add TD error of the trajectory as the priority
        done = True

    env.close()
import torch.multiprocessing as mp
from torch.multiprocessing import set_sharing_strategy, set_start_method

try:
    set_start_method('spawn')
    set_sharing_strategy("file_descriptor")
except RuntimeError:
    pass

import torch
import os, sys, pdb
from eval_env import FileEnv
from utils import log

done = mp.Event()


class ParallelSampler:
    """ Manages multithreaded sampling from a FileEnv """
    NUM_WORKERS = 1

    def __init__(self, file_env_args, tac_template, agent, train=False):
        "sampler"
        self.tac_template = tac_template
        self.file_env_args = file_env_args
        self.agent = agent
        self.train = train
        agent.model.share_memory()
        # os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    def sample_trajectories(self, n_epochs=1, **kwargs):
'''
no limitation of timestep one batch
1. separate ProcessUnit to preprocess.py
2. add shared noise table
'''
import os
import click
import gym
import torch
import time
import pickle
import logging
import numpy as np
import torch.multiprocessing as mp

mp.set_sharing_strategy('file_system')

from config import N_POPULATION, N_GENERATION, LR, SIGMA, TIMESTEP_LIMIT, reference_batch_size
from optimizer import SGD
from train import train, test, explore_for_vbn

torch.set_num_threads(1)

LogFolder = os.path.join(os.getcwd(), 'log')
model_storage_path = '/home/yyl/model/es-rl/'
Small_value = -1000000


def setup_logging(logfile):
    if logfile == 'default.log':
        timenow = time.localtime(time.time())
def detection_by_tracking(frame_dir,
                          json_file,
                          tracker_model,
                          detection_threshold=0.9,
                          tracking_threshold=0.9,
                          save_json_file="data/demo_tracking/detection_by_tracking.tracking_json",
                          offset=0,
                          low=None,
                          high=None,
                          step=1,
                          parallel=False,
                          multithreading=False):
    # Load annotations
    data = json.load(open(json_file, "r"))
    annotations = dict()
    for annotation in data['annotations']:
        if annotation['image_id'] in annotations:
            annotations[annotation['image_id']] += [annotation]
        else:
            annotations[annotation['image_id']] = [annotation]

    # Load frames
    frame_files = general_utils.get_all_files(frame_dir, keep_dir=True, sort=True)
    num_frame = len(frame_files)

    tracking_data = dict()
    tracking_data["images"] = data["images"]
    tracking_data["categories"] = data["categories"]
    tracking_data["annotations"] = list()

    if low is None:
        low = -int(1e9)
    if high is None:
        high = int(1e9)

    start = time.time()
    last_count = 0

    # Set up parallel processing
    if parallel:
        mp.set_start_method('spawn', force=True)
        mp.set_sharing_strategy('file_system')
        pool = Pool()
    else:
        pool = None
    results = [None for _ in range(num_frame)]

    # Set up multithreading processing
    if multithreading:
        executor = ThreadPoolExecutor()
    else:
        executor = None

    # Loop over frames
    for frame_id in range(num_frame):
        # Align id
        frame_id += offset
        num_box = len(annotations[frame_id])

        # Count boxes with high confidence
        count = 0
        for box_id in range(num_box):
            score = annotations[frame_id][box_id]["score"]
            if score > detection_threshold:
                count += 1

        # If this frame has more boxes, track from it for certain; else check skip criteria
        if count <= last_count:
            last_count = count
            # Skip frame
            if frame_id % step != 0:
                continue
        else:
            last_count = count

        print("Process frame ", frame_id)
        forward_tracker = build_tracker(tracker_model)
        backward_tracker = build_tracker(tracker_model)

        # Loop over detection boxes
        for box_id in range(num_box):
            # print("=> Process box ", box_id)
            # Filter by detection score
            score = annotations[frame_id][box_id]["score"]
            if score < detection_threshold:
                # print("==> Skip")
                continue

            if multithreading:
                print(f"---> Multithread tracking for box {box_id} frame {frame_id}")
                executor.submit(single_box_in_single_frame_tracking,
                                (frame_files, frame_id, box_id, annotations,
                                 tracking_threshold, forward_tracker,
                                 backward_tracker, offset, low, high))
            if parallel:
                print(f"---> Parallel tracking for box {box_id} frame {frame_id}")
                results[frame_id - offset] = pool.apply_async(
                    single_box_in_single_frame_tracking,
                    [frame_files, frame_id, box_id, annotations,
                     tracking_threshold, forward_tracker, backward_tracker,
                     offset, low, high])
            if not multithreading and not parallel:
                tracking_data["annotations"] += single_box_in_single_frame_tracking(
                    frame_files, frame_id, box_id, annotations,
                    tracking_threshold, forward_tracker, backward_tracker,
                    offset, low, high)

    for result in results:
        if result is not None:
            tracking_data["annotations"] += result.get()

    end = time.time()
    print(f"Total time: {(end - start)} s")

    with open(save_json_file, "w") as outfile:
        json.dump(tracking_data, outfile)
from caffe2.torch.fb.distributed.model_parallel.share_memory import (
    ShareMemoryRPCPickler,
)
from caffe2.torch.fb.distributed.pytorch.adagrad_jit import (
    Adagrad as FunctionalAdagrad,
    RowWiseSparseAdagrad,
)
from caffe2.torch.fb.training_toolkit.backend.data.dpp_session import DppSession
from torch import multiprocessing, nn
from torch.distributed import rpc
from torch.distributed.rpc.api import _use_rpc_pickler
from torch.nn import functional as F

from .iteration_controller import IterationControllerFactory

# only support "file_system". See comments in comm.ShareMemory for detail
multiprocessing.set_sharing_strategy("file_system")

_BATCH_COUNT_PER_PRINT = 100


class Trainer:
    r"""
    Multi threading Hogwild trainer with EASGD and DPP
    """

    def __init__(
        self,
        model: nn.Module,
        ea_client: ElasticAveragingClient,
        use_multithread_hogwild: bool,
        hogwild_workers_names: List[str],
        iteration_controller_factory: IterationControllerFactory,
        loss_fn: Optional[torch.jit.ScriptModule] = None,
from typing import Dict, Tuple, List

#
# Multiprocess input pipeline
# -------------------------------
#
# Single-epoch batch generators with multiple subprocesses; each subprocess works
# on its own file until the file is parsed completely.
#
# - the processes have as little communication as possible (because it is
#   prohibitively expensive in python)
# - the finished batches go into shared memory and then the queue to be picked up
#   by the train/validation loops
#

mp.get_logger().setLevel(logging.WARNING)  # ignore useless process start console logs
mp.set_sharing_strategy(
    "file_system"
)  # VERY MUCH needed for linux !! makes everything MUCH faster -> from 10 to 30+ batches/s

fasttext_vocab_cached_mapping = None
fasttext_vocab_cached_data = None


#
# we need to wrap the individual process queues, because they might be filled in
# different order; now we make sure to always get the same training samples in
# the same order for all runs
#
class DeterministicQueue():

    def __init__(self, distributed_queues):
        self.distributed_queues = distributed_queues
        self.num_queues = len(distributed_queues)
        self.current_idx = 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DescEmb', action='store_true',
                        help='True if DescEmb, False if CodeEmb')
    parser.add_argument('--source_file', choices=['mimic', 'eicu', 'both'],
                        type=str, default='mimic', help='both for pooling')
    parser.add_argument('--target',
                        choices=['readmission', 'mortality', 'los>3day',
                                 'los>7day', 'dx_depth1_unique'],
                        type=str, default='readmission')
    parser.add_argument('--item', choices=['all'], type=str, default='lab')
    parser.add_argument('--time_window',
                        choices=['12', '24', '36', '48', 'Total'],
                        type=str, default='12')
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.3)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=256)
    parser.add_argument('--n_epochs', type=int, default=1000)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--max_length', type=str, default='150')
    parser.add_argument('--bert_model',
                        choices=['bert', 'bio_clinical_bert', 'bio_bert',
                                 'pubmed_bert', 'blue_bert', 'bert_mini',
                                 'bert_tiny', 'bert_small'],
                        type=str)
    parser.add_argument('--cls_freeze', action='store_true')
    parser.add_argument('--input_path', type=str,
                        default='/home/jylee/data/pretrained_ehr/input_data/',
                        help='data directory')
    parser.add_argument('--path', type=str,
                        default='/home/jylee/data/pretrained_ehr/output/KDD_output/',
                        help='model saving directory')
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    if args.DescEmb:
        from dataset.DescEmb_dataloader import DescEmb_get_dataloader as get_dataloader
        from trainer.DescEmb_trainer import DescEmb_Trainer as Trainer
        if args.cls_freeze:
            print('DescEmb-FR')
        else:
            print('DescEmb-FT')
    elif not args.DescEmb:
        from dataset.CodeEmb_dataloader import CodeEmb_get_dataloader as get_dataloader
        from trainer.CodeEmb_trainer import Trainer
        print('CodeEmb')

    mp.set_sharing_strategy('file_system')

    SEED = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029]
    for seed in SEED:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if use multi-GPU
        torch.backends.cudnn.deterministic = True

        args.seed = seed
        print('seed_number', args.seed)

        train_loader = get_dataloader(args=args, data_type='train')
        trainer = Trainer(args, train_loader, device)
        trainer.train()
        print('Finished training seed: {}'.format(seed))
import torch.multiprocessing as mp

mp.set_sharing_strategy('file_system')  # otherwise, weird bug

import numpy as np
import time
import torch
import os
from torch.utils.data import Dataset, DataLoader
from librosa.core import load as loadwav
import json
import pickle
from tqdm import tqdm
import pdb

from scattering_autoencoder.scattering_recurrent import RecurrentScatteringNP


def get_files_timit(path, **kwargs):
    """
    Explores the TIMIT folder to retrieve all wav addresses
    """
    all_files = {}
    regions = os.listdir(path)
    for id_region in range(len(regions)):
        speakers = os.listdir(os.path.join(path, regions[id_region]))
        for id_speaker in range(len(speakers)):
            subdir = os.path.join(path, regions[id_region], speakers[id_speaker])
            files = [f for f in os.listdir(subdir) if '.WAV' in f]
            for f in files:
                prefix = str.split(f, '.')[0]
                key = (regions[id_region], speakers[id_speaker], prefix)
import torch.nn.functional as F
import torch

# DIST
import torch.distributed as dist
import torch.multiprocessing as multiprocessing
from torch.multiprocessing import Process

from datasets import DatasetManager
from fid_score import *
from inception import *
from time import sleep, time
import random
import sys
from scipy import stats
from queue import Queue

multiprocessing.set_sharing_strategy('file_system')
torch.autograd.set_detect_anomaly(True)

# Set random seed for reproducibility
manualSeed = 999
# manualSeed = random.randint(1, 10000)  # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)


def weights_init_normal(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        torch.nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
        torch.nn.init.constant_(m.bias.data, 0.0)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DescEmb', action='store_true')
    parser.add_argument('--source_file', choices=['mimic', 'eicu', 'both'], type=str)
    parser.add_argument('--few_shot',
                        choices=[0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                        type=float)  # training_dataset_size ratio
    parser.add_argument('--target',
                        choices=['readmission', 'mortality', 'los>3day',
                                 'los>7day', 'dx_depth1_unique'],
                        type=str)
    parser.add_argument('--item', choices=['all'], type=str)
    parser.add_argument('--max_length', type=str, default='150')
    parser.add_argument('--bert_model',
                        choices=['bio_clinical_bert', 'bio_bert', 'pubmed_bert',
                                 'blue_bert', 'bert', 'bert_mini', 'bert_small'],
                        type=str)
    parser.add_argument('--path', type=str,
                        default='/home/jylee/data/pretrained_ehr/output/KDD_output2/')
    parser.add_argument('--cls_freeze', action='store_true')
    parser.add_argument('--input_path', type=str,
                        default='/home/jylee/data/pretrained_ehr/input_data/')
    args = parser.parse_args()

    args.time_window = '12'
    args.rnn_model_type = 'gru'
    args.batch_size = 512
    args.n_epochs = 1000

    # hyperparameter tuning
    args.dropout = 0.3
    args.embedding_dim = 128
    args.hidden_dim = 256
    args.lr = 1e-4

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    mp.set_sharing_strategy('file_system')

    SEED = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029]
    for seed in SEED:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

        args.seed = seed

        train_loader = get_dataloader(args=args, data_type='train')
        trainer = DataSize_Trainer(args, train_loader, device)
        trainer.train()
def embed(n_epochs, eval_every, gpu, train_threads, sparse, tensorboard_dir,
          embed_manifold_name, embed_manifold_dim, embed_manifold_params,
          loss_params, conformal_loss_params, sample_neighbors_every,
          resume_training, model, _log):
    model_ingredient_data = model
    device = torch.device(f'cuda:{gpu}' if gpu >= 0 else 'cpu')
    torch.set_num_threads(1)
    logging_thread.initialize(tensorboard_dir, _log)

    curvature_scale = [
        torch.nn.Parameter(torch.tensor(0.)),
        torch.nn.Parameter(torch.tensor(0.)),
        torch.tensor(0., requires_grad=False)
    ]
    embed_manifold_params = embed_manifold_params.copy()
    embed_manifold_params["curvature_scale"] = curvature_scale
    embed_manifold = RiemannianManifold.from_name_params(embed_manifold_name,
                                                         embed_manifold_params)
    tensorboard_watch = {
        "hyper_scale": curvature_scale[0],
        "sphere_scale": curvature_scale[1]
    }

    data, eval_data = load_dataset(embed_manifold)
    embed_eval.initialize_eval(adjacent_list=get_adjacency_dict(data))

    if resume_training:
        model, save_data = load_model()
        model.to(device)
        if "features" in save_data:
            model = FeaturizedModelEmbedding(model,
                                             data.features,
                                             save_data["in_manifold"],
                                             embed_manifold,
                                             embed_manifold_dim,
                                             device=device)
    else:
        model = gen_model(data, device, embed_manifold, embed_manifold_dim)

    if train_threads > 1:
        mp.set_sharing_strategy('file_system')
        model = model.share_memory()

    if model_ingredient_data["input_manifold"] == "Spherical":
        feature_manifold = RiemannianManifold.from_name_params("SphericalManifold", None)
    else:
        feature_manifold = RiemannianManifold.from_name_params("EuclideanManifold", None)

    shared_params = {
        "manifold": embed_manifold,
        "dimension": embed_manifold_dim,
        "objects": data.objects,
        "in_manifold": feature_manifold
    }

    if hasattr(model, "get_additional_embeddings") and \
            model.get_additional_embeddings() is not None:
        optimizer = RiemannianSGD(
            [
                {'params': model.get_savable_model().parameters()},
                # {'params': model.main_deltas.parameters(), 'lr': 300},
                # {'params': model.additional_deltas.parameters(), 'lr': 300},
                # {'params': curvature_scale[:2], 'lr': 0.001},
                {'params': model.get_additional_embeddings().parameters(),
                 'lr': get_fixed_embedding_lr()}
            ],
            lr=get_base_lr(),
            adam_for_euc=False)
        # optimizer = RiemannianSGD(list(model.get_savable_model().parameters()) + list(model.get_additional_embeddings().parameters()) + curvature_scale[1:], lr=get_base_lr(), adam_for_euc=False)
    else:
        optimizer = RiemannianSGD(
            [
                {'params': model.get_savable_model().parameters()}
                # {'params': curvature_scale[:2], 'lr': 0.001}
            ],
            lr=get_base_lr(),
            adam_for_euc=False)

    lr_scheduler = get_lr_scheduler(optimizer)

    threads = []
    if train_threads > 1:
        try:
            for i in range(train_threads):
                args = [device, model, embed_manifold, embed_manifold_dim, data,
                        optimizer, loss_params, n_epochs, eval_every,
                        sample_neighbors_every, lr_scheduler, shared_params, i,
                        feature_manifold, conformal_loss_params,
                        tensorboard_watch, eval_data]
                threads.append(mp.Process(target=train, args=args))
                threads[-1].start()
            for thread in threads:
                thread.join()
        finally:
            for thread in threads:
                try:
                    thread.close()
                except:
                    thread.terminate()
            # embed_eval.close_thread(wait_to_finish=True)
            logging_thread.close_thread(wait_to_finish=True)
    else:
        args = [device, model, embed_manifold, embed_manifold_dim, data,
                optimizer, loss_params, n_epochs, eval_every,
                sample_neighbors_every, lr_scheduler, shared_params, 0,
                feature_manifold, conformal_loss_params, tensorboard_watch,
                eval_data]
        try:
            train(*args)
        finally:
            # embed_eval.close_thread(wait_to_finish=True)
            logging_thread.close_thread(wait_to_finish=True)
import logging
import math
import numpy as np
import torch
import torch.multiprocessing as mp
import sys
import matplotlib.pyplot as plt

from src.optimizer import optimize_parallel
from src.train import train_individual, train_individual_cpu, test
from src.train import train_parallel, train_serial
from src.model import build_model, build_mean, build_sigma
from src.util import mk_folder, save, load, setup_logging
from src.vbn import explore_for_vbn

# set up multiprocessing
mp.set_sharing_strategy("file_system")

# log and save path setting
torch.set_num_threads(1)


class ARGS(object):
    """
    Global shared setting.
    """
    env_type = "atari"
    state_dim = 0
    action_dim = 0
    action_lim = 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DescEmb', action='store_true')
    parser.add_argument('--source_file', choices=['mimic', 'eicu'],
                        type=str, default='mimic')
    parser.add_argument('--test_file', choices=['mimic', 'eicu', 'both'],
                        type=str, default='eicu')
    parser.add_argument('--few_shot', type=float,
                        choices=[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0], default=0.0)
    parser.add_argument('--target',
                        choices=['readmission', 'mortality', 'los>3day',
                                 'los>7day', 'dx_depth1_unique'],
                        type=str, default='readmission')
    parser.add_argument('--item', choices=['all'], type=str, default='med')
    parser.add_argument('--time_window',
                        choices=['12', '24', '36', '48', 'Total'],
                        type=str, default='12')
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--dropout', type=float, default=0.3)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=256)
    parser.add_argument('--n_epochs', type=int, default=1000)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--max_length', type=str, default='150')
    parser.add_argument('--bert_model',
                        choices=['bio_clinical_bert', 'bio_bert', 'pubmed_bert',
                                 'blue_bert', 'bert_mini', 'bert_tiny'],
                        type=str, default='bio_bert')
    parser.add_argument('--input_path', type=str,
                        default='/home/jylee/data/pretrained_ehr/input_data/',
                        help='data directory')
    parser.add_argument('--path', type=str,
                        default='/home/jylee/data/pretrained_ehr/output/KDD_output/',
                        help='model parameter directory')
    parser.add_argument('--cls_freeze', action='store_true')
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    if args.source_file == args.test_file:
        assert args.few_shot == 0.0, "there is no few_shot if source and test file are the same"

    mp.set_sharing_strategy('file_system')

    SEED = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029]
    for seed in SEED:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

        args.seed = seed

        train_loader = get_test_dataloader(args=args, data_type='train')
        valid_loader = get_test_dataloader(args=args, data_type='eval')
        test_loader = get_test_dataloader(args=args, data_type='test')
        tester = Tester(args, train_loader, valid_loader, test_loader, device, seed)

        if args.few_shot == 0.0:
            print('Only test')
            tester.zero_shot_test()
        else:
            print('Train then test')
            tester.train()