def cast_data(header, tablename, data):
    """Cast every value in ``data`` to the SQL type declared for its column.

    Parameters
    ----------
    header : list of column names, aligned with the values in each row.
    tablename : key used to look up the column-type mapping.
    data : iterable of rows (each row aligned with ``header``).

    Returns the rows with each value converted by its column's caster.
    """
    typedict = get_typedict(tablename)
    type_casters = []
    for column in header:
        sql_type = typedict[column]
        if sql_type == text_type:
            # Encode text to UTF-8 bytes. Parameter renamed so it no longer
            # shadows the builtin ``str``.
            type_casters.append(lambda value: value.encode('UTF-8'))
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)
        else:
            # Unknown SQL types previously produced a short caster list,
            # silently shifting every later column; pass such values through
            # unchanged instead.
            type_casters.append(lambda value: value)
    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        # Pair each value with its column's caster; rows are assumed to be
        # aligned with ``header``.
        return [caster(value) for caster, value in zip(type_casters, dataln)]

    tpool = Pool(processes=6)
    try:
        ret = tpool.map(cast_line, data)
    finally:
        tpool.close()
        tpool.join()  # reap workers even if map raised
    return ret
def process_experiment(_experiment, _overwrite=False):
    """Run ``process_series`` in parallel for every series of the experiment."""
    _tasks = []
    for _file_name in paths.image_files(paths.serieses(_experiment)):
        # Series files are named "<prefix>_<id>"; field 1 is the numeric id.
        _series_id = int(_file_name.split('_')[1])
        _tasks.append((_experiment, _series_id, _overwrite))
    _worker_pool = Pool(CPUS_TO_USE)
    _worker_pool.starmap(process_series, _tasks)
    _worker_pool.close()
def process_experiment(_experiment, _overwrite=False):
    """Run ``process_group`` in parallel for every group of the experiment."""
    # Each tuple is (experiment, series_id, group); append the overwrite flag.
    _jobs = [
        (_tuple_experiment, _series_id, _group, _overwrite)
        for _tuple_experiment, _series_id, _group in
        load.experiment_groups_as_tuples(_experiment)
    ]
    _worker_pool = Pool(CPUS_TO_USE)
    _worker_pool.starmap(process_group, _jobs)
    _worker_pool.close()
def runMultiProcessTrajectories(self, repeat):
    """Evaluate ``runNtrajectory`` at every initial position in parallel.

    One worker per start point; returns (mean cost, mean trajectory).
    """
    worker_pool = Pool(processes=len(self.posIni))
    start_points = [(px, py) for px, py in self.posIni]
    outcomes = worker_pool.map(partial(self.runNtrajectory, repeat=repeat),
                               start_points)
    worker_pool.close()
    worker_pool.join()
    # Average the (cost, trajectory) pairs; float start keeps true division.
    n_points = len(outcomes)
    total_cost = sum((point_cost for point_cost, _ in outcomes), 0.)
    total_traj = sum((point_traj for _, point_traj in outcomes), 0.)
    return total_cost / n_points, total_traj / n_points
def runMultiProcessTrajectories(self, repeat):
    """Run ``nTraj`` over all start positions in parallel and average the
    per-position (cost, trajectory) results."""
    pool = Pool(processes=len(self.posIni))
    positions = [(px, py) for px, py in self.posIni]
    per_point = pool.map(partial(self.nTraj, repeat=repeat), positions)
    pool.close()
    pool.join()
    n = len(per_point)
    cost_sum = sum(point_cost for point_cost, _ in per_point)
    traj_sum = sum(point_traj for _, point_traj in per_point)
    return cost_sum / n, traj_sum / n
def filter(dirty_data):
    """Clean raw comment data.

    First drops comments that are deleted or not English, then strips links
    and punctuation from the survivors.

    Parameters
    ----------
    dirty_data : iterable of raw comment strings.

    Returns the list of cleaned comment strings.
    """
    log("starting filter")
    tpool = Pool(processes=cpus)
    log("filtering deleted and not english")
    # Each worker returns (comment, keep_flag); keep only flagged comments.
    ret = [line[0]
           for line in tpool.map(Filter.__is_not_deleted_or_not_non_english,
                                 dirty_data)
           if line[1]]

    def clean_links_and_punctuation(comment):
        # Filter each whitespace-separated word, then rebuild the comment.
        words = map(Filter.__filter_links, comment.split(" "))
        # str.join replaces the quadratic reduce(lambda x, y: x + " " + y).
        return " ".join(words)

    log("filtering links and punctuation")
    ret = tpool.map(clean_links_and_punctuation, ret)
    tpool.close()
    tpool.join()  # make sure workers are reaped before returning
    log("filter done")
    return ret
def connect_structures_find_saddles(sequence, structure_list):
    """Compute saddle energies between all structure pairs (in parallel) and,
    for each structure, pick the lowest saddle leading to a strictly
    lower-energy structure.

    Parameters
    ----------
    sequence : the RNA sequence (currently unused; kept for interface
        compatibility with callers).
    structure_list : list of objects exposing ``.Structure`` and ``.Energy``.

    Returns a list of ``(index, neighbor_index, saddle_energy)`` tuples;
    ``neighbor_index`` is -1 when no lower-energy neighbor exists (root).
    """
    pairs = {}
    fp_pool = Pool(Max_Threads)
    res_list = []
    # Enqueue one find_saddle job per unordered structure pair.
    for i, se_1 in enumerate(structure_list):
        for j in range(i + 1, len(structure_list)):
            se_2 = structure_list[j]
            a = fp_pool.apply_async(
                find_saddle, args=(se_1.Structure, se_2.Structure, i, j))
            res_list.append(a)
    fp_pool.close()
    # Saddle energies are symmetric; store both directions for easy lookup.
    for a in res_list:
        i, j, saddle_energy_kcal = a.get()
        pairs[(i, j)] = saddle_energy_kcal
        pairs[(j, i)] = saddle_energy_kcal
    fp_pool.join()  # all jobs have completed once every get() returned
    # get lowest saddle for each structure that ends in a structure with
    # lower energy than the first structure.
    minimal_saddle_list = []
    for i, se_1 in enumerate(structure_list):
        min_saddle_energy = sys.maxsize
        tree_neighbor = None
        for j, se_2 in enumerate(structure_list):
            if i == j:
                continue
            saddle_energy = pairs[(i, j)]
            if saddle_energy <= min_saddle_energy and se_2.Energy < se_1.Energy:
                min_saddle_energy = saddle_energy
                tree_neighbor = j
        if tree_neighbor is None:
            # No downhill neighbor: it could be the root.
            tree_neighbor = -1
        minimal_saddle_list.append((i, tree_neighbor, min_saddle_energy))
    return minimal_saddle_list
def save_word_error_parallel(results_file, output_file, X_test, y_test):
    """Score every line of ``results_file`` in parallel with ``process_line``
    and write one accuracy per line to ``output_file``.

    Parameters
    ----------
    results_file : path of the input file, one result per line.
    output_file : path where accuracies are written, one per line.
    X_test, y_test : forwarded unchanged to ``process_line``.
    """
    # ``with`` guarantees the file is closed even if readlines raises.
    with open(results_file, 'r') as f:
        lines = f.readlines()
    # uses up 4 cores per call (for 8 core machine), so change this to match
    # half your cpu count
    pool = Pool(4)
    # packs a tuple - line, index, X_test, y_test, verbose (should print or not)
    data = [(l, i, X_test, y_test, True) for i, l in enumerate(lines)]
    # imap maintains input order while processing in parallel
    accuracies = pool.imap(process_line, data)
    pool.close()
    # Consuming the iterator waits for the workers to finish.
    f_vals = [str(acc) + "\n" for acc in accuracies]
    pool.join()
    with open(output_file, 'w') as f:
        f.writelines(f_vals)
# Top-level NEAT training loop: evaluate every phenotype in parallel,
# then evolve the next generation from the fitness scores.
# NOTE(review): positional arguments to NEAT inferred from names only —
# confirm (population size, input count, output count) against NEAT's API.
neat = NEAT(nrOfOrgamisms, 1890, 4)
pyplot.show(block=False)  # non-blocking so the training loop keeps running
stillRunning = True
# NOTE(review): stillRunning is never set to False in this chunk; either the
# loop is meant to run until killed or the exit lives elsewhere — confirm.
while stillRunning:
    results = {}
    # randomPhenotype = random.choice(neat.phenotypes)
    # randomPhenotype.toDraw = True
    pool = Pool(nrOfInstances)
    # Shared integer counter so workers can report evaluation progress.
    finishedIndex = multiprocessing.Manager().Value('i', 0)
    for i, phenotype in enumerate(neat.phenotypes):
        results[i] = pool.apply_async(
            testOrganism,
            (phenotype, instances, finishedIndex, len(neat.phenotypes)))
        # results[i] = pool.apply_async(LogExceptions(testOrganism), (phenotype, instances, finishedIndex, len(neat.phenotypes)))
    pool.close()
    pool.join()
    # Each async result is a (distance, fitness) pair, judging by how the
    # two components are consumed below.
    distances = [result.get()[0] for func, result in results.items()]
    fitnessScores = [result.get()[1] for func, result in results.items()]
    # Write each phenotype's evaluated distance back onto its genome.
    for p, d in zip(neat.phenotypes, distances):
        p.genome.distance = d
    print("")
    print("-----------------------------------------------------")
    print("Running epoch")
    # Evolve: replace the population with the next generation.
    neat.phenotypes = neat.epoch(fitnessScores)
    print(distances)
    print("Generation: " + str(neat.generation))
    print("Number of species: " + str(len(neat.species)))
costAll, trajTimeAll = np.zeros(repeat), np.zeros(repeat) for i in range(repeat): costAll[i], trajTimeAll[i] = self.runOneTrajectoryOpti(x, y) meanCost = np.mean(costAll) meanTrajTime = np.mean(trajTimeAll) self.costStore.append([x, y, meanCost]) self.trajTimeStore.append([x, y, meanTrajTime]) return meanCost, meanTrajTime def mapableTrajecrtoryFunction(self,x,y,useless): return self.runOneTrajectory(x, y) def runNtrajectoryMulti(self, (x, y), repeat): pool=Pool(processes=4) result = pool.map(partial(self.mapableTrajecrtoryFunction,x,y) , range(repeat)) pool.close() pool.join() meanCost, meanTraj=0., 0. for Cost, traj in result: meanCost+=Cost meanTraj+=traj size = len(result) return meanCost/size, meanTraj/size def runOneTrajectoryOpti(self, x, y): #self.tm.saveTraj = True cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y) #cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y) #print "Exp local x y cost : ", x, y, cost if lastX != -1000:
def pre_stat(paras, df_microsatellites):
    """Compute per-(repeat_unit, repeat_times) summary statistics over a
    sample of microsatellites and write them to ``<case>.stat`` plus
    per-group ``.repeat`` / ``.qual`` files.

    Parameters
    ----------
    paras : dict of run options; reads "output", "output_tmp", "reference",
        "prefix_len", "suffix_len" and "threads".
    df_microsatellites : microsatellite table passed to
        ``microsatellites_sampling``.
    """
    # reference=paras["reference"]
    path_pre_stat = paras["output"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    path_pre_stat_tmp = paras["output_tmp"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    file_all_stat = open(path_pre_stat, "w")
    # Header row of the aggregate stats file.
    file_all_stat.write("\t".join([
        "repeat_unit_length", "repeat_times", "num_forward", "num_reversed",
        "this_repeat_mean_mean", "this_repeat_mean_std",
        "this_repeat_std_mean", "this_repeat_std_std", "forward_prefix",
        "forward_ms", "forward_suffix", "reversed_prefix", "reversed_ms",
        "reversed_suffix"
    ]) + "\n")
    df_microsatellites_download_sample = microsatellites_sampling(
        df_microsatellites, paras)
    # Nested mapping: repeat_unit -> repeat_times -> DataFrame of sites.
    for repeat_unit, info in df_microsatellites_download_sample.items():
        for repeat_times, ms_infos in info.items():
            logger.info("Processing repeat unit: " + str(repeat_unit) +
                        " repeat times: " + str(repeat_times))
            infos = []
            # NOTE(review): the loop variables below shadow the builtin `id`
            # and the outer `info`; harmless here because the outer
            # iterators were already captured, but worth renaming.
            for id, info in ms_infos.iterrows():
                # Attach run options to each row so workers are
                # self-contained.
                info["reference"] = paras["reference"]
                info["prefix_len"] = paras["prefix_len"]
                info["suffix_len"] = paras["suffix_len"]
                infos.append(info)
            pool = Pool(processes=paras["threads"])
            res_infos = pool.map(process_one_ms, infos)
            pool.close()
            pool.join()
            suffix_str = "." + str(repeat_unit) + "." + str(repeat_times)
            file = open(path_pre_stat_tmp + suffix_str + ".repeat", "w")
            # Accumulators over all successfully processed sites of this
            # (repeat_unit, repeat_times) group.
            this_repeat_means = []
            this_repeat_stds = []
            num_forward = 0
            num_reversed = 0
            prefix_forward = []
            suffix_forward = []
            ms_forward = []
            prefix_reversed = []
            suffix_reversed = []
            ms_reversed = []
            for res in res_infos:
                # Skip sites where any field failed to compute.
                if None not in res:
                    file.write("\t".join(map(str, res[:-2])) + "\n")
                    this_repeat_means.append(res[3])
                    this_repeat_stds.append(res[4])
                    # res[-1] is a dict of per-strand quality/count payloads.
                    prefix_forward.extend(res[-1]["prefix_forward"])
                    suffix_forward.extend(res[-1]["suffix_forward"])
                    ms_forward.extend(res[-1]["ms_forward"])
                    prefix_reversed.extend(res[-1]["prefix_reversed"])
                    suffix_reversed.extend(res[-1]["suffix_reversed"])
                    ms_reversed.extend(res[-1]["ms_reversed"])
                    num_forward += res[-1]["num_forward"]
                    num_reversed += res[-1]["num_reversed"]
            file.close()
            # Too few reads to produce meaningful statistics.
            if num_forward + num_reversed < 2:
                continue
            this_repeat_mean_mean = np.mean(this_repeat_means)
            this_repeat_mean_std = np.std(this_repeat_means)
            this_repeat_std_mean = np.mean(this_repeat_stds)
            this_repeat_std_std = np.std(this_repeat_stds)
            # Per-position mean qualities, forward strand.
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_forward), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_forward), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_forward), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".forward.qual")
            # Per-position mean qualities, reverse strand.
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_reversed), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_reversed), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_reversed), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".reversed.qual")
            # Scalar means over all positions for the aggregate stats row.
            forward_prefix = np.nanmean(prefix_forward)
            forward_ms = np.nanmean(ms_forward)
            forward_suffix = np.nanmean(suffix_forward)
            reversed_prefix = np.nanmean(prefix_reversed)
            reversed_ms = np.nanmean(ms_reversed)
            reversed_suffix = np.nanmean(suffix_reversed)
            this_info_list = list(
                map(str, [
                    repeat_unit, repeat_times, num_forward, num_reversed,
                    this_repeat_mean_mean, this_repeat_mean_std,
                    this_repeat_std_mean, this_repeat_std_std,
                    forward_prefix, forward_ms, forward_suffix,
                    reversed_prefix, reversed_ms, reversed_suffix
                ]))
            file_all_stat.write("\t".join(this_info_list) + "\n")
    file_all_stat.close()
    return
def create_test_raw_data(self, ticker_list=None, start_date=None,
                         finish_date=None, folder_prefix=None):
    """Downloads FX tick data from DukasCopy and then dumps each ticker in
    a separate HDF5 file if a folder is specified. If no folder is specified
    returns a list of DataFrames (note: can be a very large list in memory)

    Parameters
    ----------
    ticker_list : str (list)
        List of FX tickers to download

    start_date : datetime/str
        Start date of FX tick data download

    finish_date : datetime/str
        Finish date of FX tick data download

    folder_prefix : str
        Folder to dump everything

    Returns
    -------
    DataFrame (list)
    """
    from findatapy.market import MarketDataRequest, MarketDataGenerator, \
        Market

    # No dates given: pick a random 90-day window ending at least ~30 days
    # in the past.
    if start_date is None and finish_date is None:
        finish_date = datetime.datetime.utcnow().date() - timedelta(days=30)
        start_date = finish_date - timedelta(days=30 * 15)
        start_date = self._compute_random_date(start_date, finish_date)
        finish_date = start_date + timedelta(days=90)

    df_list = []
    result = []

    # From multiprocessing.dummy import Pool # threading
    from multiprocess.pool import Pool  # actually new processes
    import time

    # If we don't specify a folder
    if folder_prefix is None:
        # Download two tickers at a time in a fresh 2-worker pool.
        # NOTE(review): this branch fires downloads via apply_async and only
        # joins the pool; the async results in `result` are never read.
        mini_ticker_list = self._split_list(ticker_list, 2)

        # Use multiprocess to speed up the download
        for mini in mini_ticker_list:
            pool = Pool(processes=2)

            for ticker in mini:
                time.sleep(1)  # stagger requests to the data source
                self.logger.info("Loading " + ticker)

                md_request = MarketDataRequest(
                    start_date=start_date, finish_date=finish_date,
                    category='fx', tickers=ticker,
                    fields=['bid', 'ask', 'bidv', 'askv'],
                    data_source='dukascopy', freq='tick')

                # self._download(md_request)
                result.append(
                    pool.apply_async(self._download, args=(
                        md_request, folder_prefix,)))

            pool.close()
            pool.join()
    else:
        # Sequential download through the Market facade.
        market = Market(market_data_generator=MarketDataGenerator())

        for ticker in ticker_list:
            md_request = MarketDataRequest(
                start_date=start_date, finish_date=finish_date,
                category='fx', tickers=ticker,
                fields=['bid', 'ask', 'bidv', 'askv'],
                data_source='dukascopy', freq='tick')

            df = market.fetch_market(md_request=md_request)
            df.columns = ['bid', 'ask', 'bidv', 'askv']
            df['venue'] = 'dukascopy'
            df['ticker'] = ticker

            # print(df)
            if folder_prefix is not None:
                self.dump_hdf5_file(df,
                                    folder_prefix + "_" + ticker + ".h5")
                # df.to_csv(folder_prefix + "_" + ticker + ".csv")
                # CSV files can be very large, so try to avoid
            else:
                # NOTE(review): unreachable — this branch only runs when
                # folder_prefix is not None; confirm intended behavior.
                df_list.append(df)

    return df_list
class ParallelBackend:
    """
    The unified backend for parallelization.

    Currently, we support `multiprocess`, `dask`, `sharedmem` and `loky`.
    `multiprocess` usually has better performance on single-node machines,
    while `dask` can be used for multi-node parallelization. Note the
    following known issues: when used for sampling, (1) `dask` and `loky` do
    not respect the global bayesfast random seed; (2) `sharedmem` may not
    display the progress messages correctly (multiple messages in the same
    line); (3) `loky` does not print any messages at all in Jupyter. So we
    recommend using the default `multiprocess` backend when possible.

    Parameters
    ----------
    backend : None, int, Pool, Client or MapReduce, optional
        The backend for parallelization. If `None` or `int`, will be passed
        as the `processes` argument to initialize a Pool in a with context.
        Set to `None` by default.
    """
    def __new__(cls, backend=None):
        # Wrapping an existing ParallelBackend is a no-op: return it as-is.
        if isinstance(backend, ParallelBackend):
            return backend
        else:
            return super(ParallelBackend, cls).__new__(cls)

    def __init__(self, backend=None):
        if isinstance(backend, ParallelBackend):
            # __new__ returned the existing instance; don't re-initialize it.
            return
        self.backend = backend

    def __enter__(self):
        if self.backend is None or isinstance(self.backend, int):
            # We own this Pool; it is torn down in __exit__.
            self._backend_activated = Pool(self.backend)
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            self.backend.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.backend is None or isinstance(self.backend, int):
            # Only close pools created by __enter__; user-supplied backends
            # are left untouched.
            self._backend_activated.close()
            self._backend_activated.join()
            self._backend_activated = None
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            self.backend.__exit__(exc_type, exc_val, exc_tb)

    @property
    def backend(self):
        return self._backend

    @backend.setter
    def backend(self, be):
        # Accept only the backend kinds enumerated in `kind`; raise early
        # on anything else.
        if be is None or (isinstance(be, int) and be > 0):
            pass
        elif isinstance(be, Pool):
            pass
        elif HAS_RAY and isinstance(be, RayPool):
            pass
        elif HAS_DASK and isinstance(be, Client):
            pass
        elif HAS_SHAREDMEM and isinstance(be, MapReduce):
            pass
        elif HAS_LOKY and isinstance(
            be, reusable_executor._ReusablePoolExecutor):
            pass
        else:
            raise ValueError('invalid value for backend.')
        self._backend_activated = be
        self._backend = be

    @property
    def backend_activated(self):
        # The object actually used for dispatch; a fresh Pool when backend
        # was None/int and we are inside a with context.
        return self._backend_activated

    @property
    def kind(self):
        """Return a string naming the kind of the configured backend."""
        if self.backend is None or isinstance(self.backend, int):
            return 'multiprocess'
        elif isinstance(self.backend, Pool):
            return 'multiprocess'
        elif HAS_RAY and isinstance(self.backend, RayPool):
            return 'ray'
        elif HAS_DASK and isinstance(self.backend, Client):
            return 'dask'
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            return 'sharedmem'
        elif HAS_LOKY and isinstance(self.backend,
                                     reusable_executor._ReusablePoolExecutor):
            return 'loky'
        else:
            raise RuntimeError('unexpected value for self.backend.')

    def map(self, fun, *iters):
        """Synchronously map `fun` over the zipped `iters` on the active
        backend and return the list of results."""
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return self.backend_activated.starmap(fun, zip(*iters))
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            # https://github.com/ray-project/ray/issues/11451
            # that's why I need to explicitly convert it to a list for now
            return self.backend_activated.starmap(fun, list(zip(*iters)))
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.gather(self.backend_activated.map(fun, *iters))
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            return self.backend_activated.map(fun, list(zip(*iters)),
                                              star=True)
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return self.gather(self.backend_activated.map(fun, *iters))
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')

    def map_async(self, fun, *iters):
        """Asynchronously map `fun` over the zipped `iters`; resolve the
        returned handle with `gather`."""
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return self.backend_activated.starmap_async(fun, zip(*iters))
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            return self.backend_activated.starmap_async(fun,
                                                        list(zip(*iters)))
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.backend_activated.map(fun, *iters)
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            # sharedmem has no async API; fall back to the blocking map.
            warnings.warn(
                'sharedmem does not support map_async. Using map '
                'instead.', RuntimeWarning)
            return self.backend_activated.map(fun, list(zip(*iters)),
                                              star=True)
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return self.backend_activated.map(fun, *iters)
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')

    def gather(self, async_result):
        """Resolve a handle returned by `map_async` into concrete results."""
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return async_result.get()
        # Bug fix: guard with HAS_RAY like every other branch, so this line
        # cannot raise NameError on RayPool when ray is not installed.
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            return async_result.get()
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.backend_activated.gather(async_result)
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            # sharedmem's map_async already returned concrete results.
            return async_result
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return list(async_result)
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')
('SN20_Bleb_fromStart', 14, 0, 1, -235, 30), ('SN20_Bleb_fromStart', 14, 0, 2, 120, 230), ('SN20_Bleb_fromStart', 14, 0, 3, -230, 105), ('SN20_Bleb_fromStart', 14, 0, 4, 205, 35), ('SN20_Bleb_fromStart', 14, 1, 2, 110, -180), ('SN20_Bleb_fromStart', 14, 1, 3, -220, 25), ('SN20_Bleb_fromStart', 14, 1, 4, -150, 0), ('SN20_Bleb_fromStart', 14, 2, 3, 160, -130), ('SN20_Bleb_fromStart', 14, 2, 4, -75, 210), ('SN20_Bleb_fromStart', 14, 3, 4, 220, 105), ('SN20_Bleb_fromStart', 15, 0, 1, 0, 235), ('SN20_Bleb_fromStart', 16, 0, 1, 0, -225), ('SN20_Bleb_fromStart', 16, 0, 2, -80, 130), ('SN20_Bleb_fromStart', 16, 1, 2, -60, -120), ('SN20_Bleb_fromStart', 17, 0, 2, -180, 0), ('SN20_Bleb_fromStart', 17, 0, 3, 155, 0), ('SN20_Bleb_fromStart', 17, 1, 2, -225, -115), ('SN20_Bleb_fromStart', 17, 1, 3, -135, 20), ('SN20_Bleb_fromStart', 18, 0, 1, -110, -175), ('SN20_Bleb_fromStart', 19, 0, 1, 70, -150), ('SN20_Bleb_fromStart', 19, 1, 2, -100, 115), ('SN20_Bleb_fromStart', 19, 1, 3, 60, -170), ('SN20_Bleb_fromStart', 19, 2, 3, 135, 185), ('SN20_Bleb_fromStart', 20, 0, 1, 175, 20), ('SN20_Bleb_fromStart', 20, 0, 2, 205, -60), ('SN20_Bleb_fromStart', 20, 1, 2, -135, 80), ] _p = Pool(CPUS_TO_USE) _answers = _p.starmap(process_fake_following, _arguments) _p.close()
class DataPipelineWithReward:
    """
    Creates a data pipeline that also outputs discounted reward.
    """

    def __init__(self,
                 observables: List[AgentHandler],
                 actionables: List[AgentHandler],
                 mission_handlers: List[AgentHandler],
                 nsteps,
                 gamma,
                 data_directory,
                 num_workers,
                 worker_batch_size,
                 min_size_to_dequeue):
        """
        Sets up a tensorflow dataset to load videos from a given data
        directory.

        :param data_directory: the directory of the data to be loaded, eg:
            'minerl.herobraine_parse/output/rendered/'
        """
        self.data_dir = data_directory
        self.observables = observables
        self.actionables = actionables
        self.mission_handlers = mission_handlers
        # self.vectorizer = vectorizer
        self.number_of_workers = num_workers
        self.worker_batch_size = worker_batch_size
        self.size_to_dequeue = min_size_to_dequeue
        self.nsteps = nsteps
        self.gamma = gamma
        # Worker pool that parses recordings; results flow through a
        # manager-backed queue shared with the workers.
        self.processing_pool = Pool(self.number_of_workers)
        self.m = multiprocessing.Manager()
        self.data_queue = self.m.Queue(
            maxsize=self.size_to_dequeue // self.worker_batch_size * 4)
        # Priority queue keyed by a random float — used to shuffle examples
        # as they stream in.
        pool_size = self.size_to_dequeue * 4
        self.random_queue = PriorityQueue(maxsize=pool_size)

    def batch_iter(self, batch_size):
        """
        Returns a generator for iterating through batches of the dataset.

        Workers parse recordings into the data queue; examples are pushed
        with random priorities into the random queue and drawn out in
        shuffled batches of ``batch_size``.

        :param batch_size: number of examples per yielded batch
        :return: generator of (observations, actions, mission handlers,
            next observations, discounted rewards, elapsed) batches
        """
        logger.info("Starting batch iterator on {}".format(self.data_dir))
        data_list = self._get_all_valid_recordings(self.data_dir)

        load_data_func = self._get_load_data_func(self.data_queue,
                                                  self.nsteps,
                                                  self.worker_batch_size,
                                                  self.mission_handlers,
                                                  self.observables,
                                                  self.actionables,
                                                  self.gamma)
        map_promise = self.processing_pool.map_async(load_data_func,
                                                     data_list)

        # We map the files -> load_data -> batch_pool -> random shuffle ->
        # yield.
        # batch_pool = []
        start = 0
        incr = 0
        # Keep draining until the workers are done AND both queues are empty.
        while not map_promise.ready() or not self.data_queue.empty(
        ) or not self.random_queue.empty():
            #print("d: {} r: {}".format(data_queue.qsize(), random_queue.qsize()))

            # Move examples from the data queue into the shuffle queue with
            # random priorities.
            while not self.data_queue.empty() and not self.random_queue.full():
                for ex in self.data_queue.get():
                    if not self.random_queue.full():
                        r_num = np.random.rand(1)[0] * (1 - start) + start
                        self.random_queue.put((r_num, ex))
                        incr += 1
                        # print("d: {} r: {} rqput".format(data_queue.qsize(), random_queue.qsize()))
                    else:
                        break

            # Only start yielding once enough examples have been buffered.
            if incr > self.size_to_dequeue:
                if self.random_queue.qsize() < (batch_size):
                    if map_promise.ready():
                        break
                    else:
                        continue
                batch_with_incr = [
                    self.random_queue.get() for _ in range(batch_size)
                ]

                r1, batch = zip(*batch_with_incr)
                start = 0
                traj_obs, traj_acts, traj_handlers, traj_n_obs, \
                    discounted_rewards, elapsed = zip(*batch)

                # Re-group per-trajectory tuples into per-handler collections.
                observation_batch = [
                    HandlerCollection({
                        o: np.asarray(traj_ob[i])
                        for i, o in enumerate(self.observables)
                    }) for traj_ob in traj_obs
                ]
                action_batch = [
                    HandlerCollection({
                        a: np.asarray(traj_act[i])
                        for i, a in enumerate(self.actionables)
                    }) for traj_act in traj_acts
                ]
                mission_handler_batch = [
                    HandlerCollection({
                        m: np.asarray(traj_handler[i])
                        for i, m in enumerate(self.mission_handlers)
                    }) for traj_handler in traj_handlers
                ]
                next_observation_batch = [
                    HandlerCollection({
                        o: np.asarray(traj_n_ob[i])
                        for i, o in enumerate(self.observables)
                    }) for traj_n_ob in traj_n_obs
                ]
                yield observation_batch, action_batch, \
                    mission_handler_batch, next_observation_batch, \
                    discounted_rewards, elapsed

                # Move on to the next batch bool.
                # Todo: Move to a running pool, sampling as we enqueue.
                # This is basically the random queue impl.
                # Todo: This will prevent the data from getting arbitrarily
                # segmented.
                # batch_pool = []

        try:
            map_promise.get()
        except RuntimeError as e:
            logger.error("Failure in data pipeline: {}".format(e))
        logger.info("Epoch complete.")

    def close(self):
        # Shut down the worker pool; blocks until workers exit.
        self.processing_pool.close()
        self.processing_pool.join()

    ############################
    ## PRIVATE METHODS
    #############################

    @staticmethod
    def _get_load_data_func(data_queue, nsteps, worker_batch_size,
                            mission_handlers, observables, actionables,
                            gamma):
        # Build a picklable closure that parses one recording directory and
        # feeds worker_batch_size-sized chunks into data_queue.
        def _load_data(inst_dir):
            recording_path = str(os.path.join(inst_dir, 'recording.mp4'))
            univ_path = str(os.path.join(inst_dir, 'univ.json'))

            try:
                cap = cv2.VideoCapture(recording_path)
                # Litty uni
                with open(univ_path, 'r') as f:
                    univ = {int(k): v for (k, v) in (json.load(f)).items()}
                    univ = OrderedDict(univ)
                    univ = np.array(list(univ.values()))

                # Litty viddy
                batches = []
                rewards = []
                # Fixed-size FIFO: when full, the oldest frame is nsteps
                # behind the current one.
                frames_queue = Queue(maxsize=nsteps)

                # Loop through the video and construct frames
                # of observations to be sent via the multiprocessing queue
                # in chunks of worker_batch_size to the batch_iter loop.
                frame_num = 0
                while True:
                    ret, frame = cap.read()
                    if not ret or frame_num >= len(univ):
                        break
                    else:
                        #print("Batches {} and worker batch size {}".format(len(batches), self.worker_batch_size))
                        if len(batches) >= worker_batch_size:
                            data_queue.put(batches)
                            batches = []

                        try:
                            # Construct a single observation object.
                            vf = (np.clip(frame[:, :, ::-1], 0, 255))
                            uf = univ[frame_num]
                            frame = {'pov': vf}
                            frame.update(uf)

                            # Sum the rewards reported by all RewardHandlers
                            # for this frame.
                            cur_reward = 0
                            for m in mission_handlers:
                                try:
                                    if isinstance(m, RewardHandler):
                                        cur_reward += m.from_universal(frame)
                                except NotImplementedError:
                                    pass
                            rewards.append(cur_reward)

                            #print("Frames queue size {}".format(frames_queue.qsize()))
                            frames_queue.put(frame)
                            if frames_queue.full():
                                # Current frame is the "next" observation;
                                # the dequeued frame (nsteps earlier) is the
                                # one we emit a transition for.
                                next_obs = [
                                    o.from_universal(frame)
                                    for o in observables
                                ]
                                frame = frames_queue.get()
                                obs = [
                                    o.from_universal(frame)
                                    for o in observables
                                ]
                                act = [
                                    a.from_universal(frame)
                                    for a in actionables
                                ]
                                mission = []
                                for m in mission_handlers:
                                    try:
                                        mission.append(
                                            m.from_universal(frame))
                                    except NotImplementedError:
                                        mission.append(None)
                                        pass

                                batches.append(
                                    (obs, act, mission, next_obs,
                                     DataPipelineWithReward.
                                     _calculate_discount_rew(
                                         rewards[-nsteps:], gamma),
                                     frame_num + 1 - nsteps))
                        except Exception as e:
                            # If there is some error constructing the batch
                            # we just start a new sequence at the point that
                            # the exception was observed
                            logger.warn(
                                "Exception {} caught in the middle of "
                                "parsing {} in "
                                "a worker of the data pipeline.".format(
                                    e, inst_dir))

                        frame_num += 1

                return batches
            except Exception as e:
                logger.error("Caught Exception")
                raise e
            return None

        return _load_data

    @staticmethod
    def _calculate_discount_rew(rewards, gamma):
        # Discounted sum: rewards[0] + gamma*rewards[1] + gamma^2*rewards[2]…
        total_reward = 0
        for i, rew in enumerate(rewards):
            total_reward += (gamma**i) * rew
        return total_reward

    @staticmethod
    def _get_all_valid_recordings(path):
        # Recursively collect directories containing both an .mp4 recording
        # and a .json metadata file, then return them in shuffled order.
        directoryList = []

        # return nothing if path is a file
        if os.path.isfile(path):
            return []

        # add dir to directorylist if it contains .txt files
        if len([f for f in os.listdir(path) if f.endswith('.mp4')]) > 0:
            if len([f for f in os.listdir(path) if f.endswith('.json')]) > 0:
                directoryList.append(path)

        for d in os.listdir(path):
            new_path = os.path.join(path, d)
            if os.path.isdir(new_path):
                directoryList += \
                    DataPipelineWithReward._get_all_valid_recordings(new_path)

        directoryList = np.array(directoryList)
        np.random.shuffle(directoryList)
        return directoryList.tolist()