def process(cls, named_seqs: List[List[str]]):
    if named_seqs is None:
        return 0, None

    #  Resolve generator names to their BaseGenerator instances
    seqs: List[List[BaseGenerator]] = [
        list(map(lambda x: cls.generators[x], s)) for s in named_seqs
    ]

    max_seq_trials = cls.args.max_seq_trials
    results: List[Dict] = []
    for idx, seq in enumerate(seqs):
        engine = RandProgEngine(seq, cls.args)
        for _ in range(max_seq_trials):
            try:
                spec: ExplorationSpec = engine.generate()
            except Exception as e:
                if cls.args.debug:
                    logger.warn("Encountered exception for", named_seqs[idx])
                    logger.log(e)
                    logging.exception(e)
                continue

            if spec is None:
                continue

            dpoint = {
                'inputs': spec.inputs,
                'output': spec.output,
                'intermediates': spec.intermediates,
                'program_str': str(spec.program),
                'program': spec.program,
                'function_sequence': named_seqs[idx],
                'generator_tracking': spec.tracking
            }

            #  Confirm the data point is picklable. Sometimes unpickling throws
            #  an error when the main process is receiving the msg, and things
            #  break down in a very, very nasty manner.
            #  TODO : Can we switch to dill while using multiprocessing/pebble?
            try:
                pickle.loads(pickle.dumps(dpoint))
            except Exception:
                continue

            results.append(dpoint)
            break

    return len(named_seqs), results
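#  A minimal sketch of the dill-based alternative floated in the TODO above.
#  dill serializes more object types (lambdas, closures, bound methods) than
#  pickle, so the round-trip check could be swapped out as below. The helper
#  name is hypothetical and not part of the original code; assumes dill is
#  installed.
import dill

def is_transmissible(dpoint) -> bool:
    #  Mirror the pickle round-trip above: serialize and immediately
    #  deserialize to catch objects that break on the receiving end
    try:
        dill.loads(dill.dumps(dpoint))
        return True
    except Exception:
        return False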
def generate(self):
    self.init()
    num_generated = 0
    num_processed = 0

    #  If an index file exists, use the raw-point count to estimate time-to-completion
    num_raw_points = -1
    if os.path.exists(self.args.raw_data_path + '.index'):
        reader = IndexedFileReader(self.args.raw_data_path)
        num_raw_points = len(reader)
        reader.close()

    start_time = time.time()
    with pebble.ProcessPool(max_workers=self.args.processes,
                            initializer=FunctionSeqDataGenerator.Worker.init,
                            initargs=(self.args,)) as p:
        chunksize = self.args.processes * self.args.chunksize
        for chunk in misc.grouper(chunksize, self.raw_data_iterator()):
            future = p.map(FunctionSeqDataGenerator.Worker.process, chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()
            idx = -1
            while True:
                idx += 1
                if idx < len(chunk) and chunk[idx] is not None:
                    num_processed += 1

                try:
                    result = next(res_iter)
                    if chunk[idx] is None:
                        #  Skip padding introduced by grouper in the last chunk
                        continue
                    if result is not None:
                        self.process_result(result)
                        num_generated += 1
                except StopIteration:
                    break
                except TimeoutError:
                    #  The worker exceeded task_timeout for this item; move on
                    pass
                except Exception as e:
                    try:
                        logger.warn("Failed for", chunk[idx])
                        logging.exception(e)
                    except Exception:
                        pass
                finally:
                    speed = round(num_processed / (time.time() - start_time), 1)
                    if num_raw_points != -1:
                        time_remaining = round((num_raw_points - num_processed) / speed, 1)
                    else:
                        time_remaining = '???'

                    logger.log("Generated/Processed : {}/{} ({}/s, TTC={}s)".format(
                        num_generated, num_processed, speed, time_remaining),
                        end='\r')

        p.stop()
        try:
            p.join(10)
        except Exception:
            pass

    self.fwriter.close()
    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info("Generated {} training points from {} raw data points".format(
        num_generated, num_processed))
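#  Both generate() methods lean on misc.grouper to batch work for the pool.
#  Its implementation is not shown in this section; the None-checks above
#  suggest it pads the final chunk, which matches the standard itertools
#  recipe sketched below. This is an assumption about the project's actual
#  helper, which in RawDataGenerator.generate apparently also accepts a list
#  of sizes for the first and subsequent chunks.
import itertools

def grouper(n, iterable, fillvalue=None):
    #  Collect items into fixed-length tuples, padding the last with fillvalue
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)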
def generate(self):
    self.init()
    num_generated = 0
    num_processed = 0
    num_required = self.args.num_training_points
    self.sequences = self.load_sequences()
    start_time = time.time()
    speed = 0
    time_remaining = 'inf'

    with pebble.ProcessPool(max_workers=self.args.processes,
                            initializer=RawDataGenerator.Worker.init,
                            initargs=(self.args,)) as p:
        #  Start with smaller chunks so the blacklist can take effect early
        chunksize = self.args.processes * self.args.chunksize
        if self.args.blacklist_threshold == -1:
            chunksize_blacklist = chunksize
        else:
            chunksize_blacklist = max(
                self.args.blacklist_threshold // self.args.max_seq_trials,
                1) * len(self.sequences)

        for chunk in misc.grouper([chunksize_blacklist, chunksize],
                                  self.gen_named_seqs()):
            if not p.active:
                break

            future = p.map(RawDataGenerator.Worker.process, chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()
            idx = -1
            while True:
                idx += 1
                if num_generated >= num_required:
                    #  Enough training points collected; shut the pool down
                    p.stop()
                    try:
                        p.join(10)
                    except Exception:
                        pass
                    break

                try:
                    returned = next(res_iter)
                    if returned is None:
                        self.report_error_seqs(chunk[idx])
                        continue

                    num_input_seqs, results = returned
                    num_processed += num_input_seqs
                    if results is not None and len(results) > 0:
                        for seq in chunk[idx]:
                            self.whitelist.add(tuple(seq))
                        for result in results:
                            num_generated += 1
                            self.process_dpoint(result)

                        speed = round(num_generated / (time.time() - start_time), 1)
                        time_remaining = round((num_required - num_generated) / speed, 1)
                    elif num_input_seqs > 0:
                        self.report_error_seqs(chunk[idx])

                    logger.log("Num Generated : {} ({}/s, TTC={}s)".format(
                        num_generated, speed, time_remaining), end='\r')
                except StopIteration:
                    break
                except TimeoutError:
                    pass
                except Exception:
                    logger.warn("Failed for", chunk[idx])

        p.stop()
        try:
            p.join(10)
        except Exception:
            pass

    self.fwriter.close()
    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info("Number of sequences processed :", num_processed)
    logger.info("Number of training points generated :", num_generated)
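#  Worked example of the blacklist chunk sizing above, with illustrative
#  numbers (all values hypothetical, not from the original code): each
#  sequence gets roughly blacklist_threshold // max_seq_trials slots in the
#  first, smaller chunk, so blacklisting can kick in before full-size chunks
#  are scheduled.
blacklist_threshold = 100  # hypothetical args.blacklist_threshold
max_seq_trials = 10        # hypothetical args.max_seq_trials
num_sequences = 25         # hypothetical len(self.sequences)

chunksize_blacklist = max(blacklist_threshold // max_seq_trials, 1) * num_sequences
assert chunksize_blacklist == 250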