class Counter(object):
    """Integer counter stored in a shared ``Value`` and guarded by its lock.

    Provides an atomic increment-and-read (``incrementAndGet``) and an
    atomic read (``value``).
    """

    def __init__(self, initval=0):
        """Create the counter.

        Args:
            initval (int): starting value of the counter.
        """
        self.val = Value('i', initval)

    def incrementAndGet(self):
        """Add one to the counter and hand back the new value.

        Returns:
            int: the counter value right after the increment.
        """
        shared = self.val
        with shared.get_lock():
            shared.value = shared.value + 1
            return shared.value

    def value(self):
        """Read the counter while holding its lock.

        Returns:
            int: the current counter value.
        """
        shared = self.val
        with shared.get_lock():
            return shared.value
def run_check(ok: Value, path: Path, args):
    """Run the integrity checks for one file without updating progress.

    The checks are, in order: the file exists, it passes the gzip
    integrity check, and its ``<path>.md5`` companion (if any) matches.
    When the ``.md5`` file is missing the behaviour is driven by ``args``.

    Args:
        ok: shared flag; set to ``False`` (under its lock) on any failure.
        path: file to check.
        args: options object; reads ``missing_md5_error``, ``compute_md5``
            and ``create_md5_fail_error``.
    """
    path_md5 = Path("%s.md5" % path)

    def mark_failed():
        # Every failure path flips the shared flag under its lock.
        with ok.get_lock():
            ok.value = False

    # Existence is checked first so that a missing file is reported as
    # missing instead of being handed to the gzip integrity check.
    if not path.exists():  # does not exist => error
        logger.error("Does not exist: %s", path)
        mark_failed()
    elif not _check_gz_integrity(path):
        logger.error("GZip file integrity check failed: %s", path)
        mark_failed()
    elif path_md5.exists():
        logger.debug("MD5 file exists: %s", path_md5)
        if _check_md5(path):
            logger.debug("  => MD5 OK")
        else:
            logger.error("  => MD5 mismatch for %s", path)
            mark_failed()
    elif args.missing_md5_error:  # => .md5 missing and is error => error
        logger.error("MD5 file does not exist for: %s", path)
        mark_failed()
    elif args.compute_md5:  # .md5 missing and is not error => recreate
        recreated = _recreate_md5_for(path)
        if recreated:
            logger.info("Created MD5 file: %s", path_md5)
        elif args.create_md5_fail_error:
            logger.error("Could not create MD5 file for: %s", path)
            mark_failed()
    else:
        logger.info("Not attempting to recreate %s", path_md5)
def _test_all_options(position: int, death_count: Value, kill_count: Value, play_map: ManuelCalculatedGame,
                      is_not_6th_step: bool, move_list: []):
    """Try every move for the player at ``position`` and recurse to the next player.

    For the last player the outcome of each move is tallied: a dead player
    at index 0 increments ``death_count``, any other dead player increments
    ``kill_count``. For earlier players one child process per move is
    started for ``position + 1``; all children are joined before returning.
    """
    children = []
    is_last_player = position == len(play_map.players) - 1

    for move in move_list:
        candidate = deepcopy(play_map)
        # Only a surviving player actually gets the move applied.
        if play_map.players[position].surviving:
            candidate = _calculate_move(position, move, candidate, is_not_6th_step)

        if not is_last_player:
            # NOTE(review): the child receives a copy of the untouched play_map,
            # not `candidate`, so the move computed above is not carried into
            # the recursion - confirm this is intended.
            child = Process(target=_test_all_options,
                            args=(position + 1, death_count, kill_count, deepcopy(play_map),
                                  is_not_6th_step, move_list))
            children.append(child)
            child.start()
            continue

        # Last player reached: evaluate the resulting board.
        for index, player in enumerate(candidate.players):
            if player.surviving:
                continue
            tally = death_count if index == 0 else kill_count
            with tally.get_lock():
                tally.value += 1

    for child in children:
        child.join()
def work(simulate_one, queue, n_eval: Value, n_acc: Value, n: int,
         check_max_eval: bool, max_eval: int, all_accepted: bool,
         sample_factory):
    """Worker loop: simulate until enough particles were accepted.

    Each iteration reserves an evaluation id from ``n_eval``, runs
    ``simulate_one`` and records the result in the current sample. When a
    simulation is accepted, ``n_acc`` is incremented, ``(id, sample)`` is
    put on ``queue`` and a fresh sample is started. ``DONE`` is put on the
    queue when the loop ends.
    """
    # NOTE(review): unpickles whatever bytes were passed in; only safe when
    # the payload comes from a trusted parent process.
    if isinstance(simulate_one, bytes):
        simulate_one = pickle.loads(simulate_one)

    random.seed()
    np.random.seed()

    def more_work_needed():
        # Same short-circuit order as a single chained `and` condition.
        if n_acc.value >= n:
            return False
        if all_accepted and n_eval.value >= n:
            return False
        if check_max_eval and n_eval.value >= max_eval:
            return False
        return True

    sample = sample_factory()
    while more_work_needed():
        with n_eval.get_lock():
            particle_id = n_eval.value
            n_eval.value = particle_id + 1

        new_sim = simulate_one()
        sample.append(new_sim)
        if not new_sim.accepted:
            continue

        # one more accepted particle
        with n_acc.get_lock():
            n_acc.value += 1
        queue.put((particle_id, sample))
        # start recording afresh until the next acceptance
        sample = sample_factory()

    # tell the consumer this worker is finished
    queue.put(DONE)
def work(simulate_one, queue, n_eval: Value, n_acc: Value, n: int,
         all_accepted: bool, sample_factory):
    """Worker loop: simulate until ``n`` particles were accepted.

    Each iteration reserves an evaluation id from ``n_eval``, runs
    ``simulate_one`` and appends the result to the current sample. On an
    accepted simulation ``n_acc`` is incremented, ``(id, sample)`` is put on
    ``queue`` and a new sample is started. With ``all_accepted`` the loop
    also stops once ``n_eval`` reaches ``n``. ``DONE`` is queued at the end.
    """
    random.seed()
    np.random.seed()

    def more_work_needed():
        if n_acc.value >= n:
            return False
        return not (all_accepted and n_eval.value >= n)

    sample = sample_factory()
    while more_work_needed():
        with n_eval.get_lock():
            particle_id = n_eval.value
            n_eval.value = particle_id + 1

        new_sim = simulate_one()
        sample.append(new_sim)
        if not new_sim.accepted:
            continue

        # one more accepted particle
        with n_acc.get_lock():
            n_acc.value += 1
        queue.put((particle_id, sample))
        # start recording afresh until the next acceptance
        sample = sample_factory()

    # tell the consumer this worker is finished
    queue.put(DONE)
class TTS:
    """Front end for a text-to-speech worker running in its own process.

    Text is handed over through a bounded queue; three shared integer
    flags (`run`, `done_loading`, `_is_speaking`) are passed to the worker.
    """

    def __init__(self):
        self.text_queue = Queue(maxsize=3)
        self.run = Value('i', 1)
        self.done_loading = Value('i', 0)
        self._is_speaking = Value('i', 0)
        worker_args = (self.text_queue, self.run, self.done_loading,
                       self._is_speaking)
        self.process = Process(target=tts_worker, args=worker_args)

    @property
    def is_speaking(self):
        """Current value of the shared speaking flag."""
        return self._is_speaking.value

    def say(self, text, block=False):
        """Queue ``text`` for speaking.

        The speaking flag is raised before the text is queued. With
        ``block`` the call polls every 0.1 s until the flag is cleared again.
        """
        with self._is_speaking.get_lock():
            self._is_speaking.value = 1
        self.text_queue.put(text)
        if not block:
            return
        while self.is_speaking:
            time.sleep(0.1)

    def start(self):
        """Start the worker process and poll until it reports loading is done."""
        logging.info('Starting TTS process...')
        self.process.start()
        while True:
            if self.done_loading.value:
                break
            time.sleep(0.1)
        logging.info('TTS process finished starting.')

    def stop(self):
        """Clear the shared run flag and wait for the worker process to exit."""
        run_flag = self.run
        with run_flag.get_lock():
            run_flag.value = 0
        self.process.join()
class ProgressTracker(Thread):
    """Thread that forwards progress updates to registered callbacks.

    Producers call ``update(file, progress)``; the thread wakes up and calls
    every registered callback with the latest ``(file, progress)`` pair.
    ``complete()`` makes ``run`` return once any pending update has been
    delivered.
    """

    def __init__(self):
        super().__init__()
        self.lock = Condition()
        self.done = Value("H", 0)
        # NOTE(review): c_wchar_p stores a pointer, which is only meaningful
        # inside the process that wrote it.
        self.file = Value(c_wchar_p, "")
        self.progress = Value("d", 0.0)
        self.callbacks = []
        # True while an update has been stored but not yet dispatched;
        # guarded by self.lock so a notification is never lost.
        self._pending = False

    def update(self, file, progress):
        """Store the latest file/progress pair and wake the tracker thread."""
        with self.lock:
            # Write into the shared values instead of rebinding the attributes.
            with self.file.get_lock(), self.progress.get_lock():
                self.file.value = file
                self.progress.value = progress
            self._pending = True
            self.lock.notify_all()

    def complete(self):
        """Mark the tracker as done and wake the thread so it can exit."""
        with self.done.get_lock():
            self.done.value = 1
        with self.lock:
            self.lock.notify_all()

    def registerUpdateCallback(self, callback):
        """Register ``callback(file, progress)`` to be called on each update."""
        self.callbacks.append(callback)

    def run(self):
        """Dispatch updates to the callbacks until ``complete()`` was called."""
        while True:
            with self.lock:
                # wait() must be called with the condition held; looping on
                # the predicate also copes with spurious wake-ups.
                while not self._pending and not self.done.value:
                    self.lock.wait()
                if not self._pending:
                    # done and nothing left to deliver
                    break
                self._pending = False
                with self.file.get_lock(), self.progress.get_lock():
                    snapshot = (self.file.value, self.progress.value)
            # Callbacks run outside the condition so producers are not blocked.
            for callback in list(self.callbacks):
                callback(*snapshot)
def work(simulate_one, queue, n_eval: Value, n_particles: Value,
         sample_factory):
    """Worker loop: simulate until no more particles are required.

    Each iteration reserves an evaluation id from ``n_eval``, runs
    ``simulate_one`` and appends the result to the current sample. On an
    accepted simulation ``n_particles`` is decremented, ``(id, sample)`` is
    put on ``queue`` and a new sample is started. ``DONE`` is queued when
    ``n_particles`` is no longer positive.
    """
    random.seed()
    np.random.seed()

    sample = sample_factory()
    while True:
        if n_particles.value <= 0:
            break

        with n_eval.get_lock():
            particle_id = n_eval.value
            n_eval.value = particle_id + 1

        new_sim = simulate_one()
        sample.append(new_sim)
        if not new_sim.accepted:
            continue

        # one particle fewer is still required
        with n_particles.get_lock():
            n_particles.value -= 1
        queue.put((particle_id, sample))
        # start recording afresh until the next acceptance
        sample = sample_factory()

    # tell the consumer this worker is finished
    queue.put(DONE)
class Counter(object):
    """Shared integer counter with locked in-place arithmetic.

    ``+`` and ``-`` mutate the counter and return the same object, so
    ``counter += 1`` keeps working on the shared value.
    """

    def __init__(self, initval=0):
        self.val = Value('i', initval)

    def _shift(self, delta):
        # Single place where the shared value is changed by an offset.
        shared = self.val
        with shared.get_lock():
            shared.value += delta

    def __repr__(self):
        return str(self.val.value)

    def __add__(self, other):
        self._shift(other)
        return self

    def __sub__(self, other):
        self._shift(-other)
        return self

    def increase(self, other=1):
        """Add ``other`` to the counter."""
        self._shift(other)

    def decrease(self, other=1):
        """Subtract ``other`` from the counter."""
        self._shift(-other)

    def set(self, value):
        """Overwrite the counter with ``value``."""
        shared = self.val
        with shared.get_lock():
            shared.value = value

    @property
    def value(self):
        """Current counter value (read without taking the lock)."""
        return self.val.value
def worker(inq: Queue, outq: Queue, sharedAlpha: Value):
    """Search worker: evaluate queued positions until a ``None`` arrives.

    Every job is a tuple ``(board, turn, depth, alpha, beta, future,
    options)``. The last move on ``board`` is scored with ``negamax`` and
    ``(move, score, tree)`` is put on ``outq``. ``sharedAlpha`` is read to
    tighten ``alpha`` before the search and raised afterwards when the new
    score is better.
    """
    # A None job is the stop signal.
    for job in iter(inq.get, None):
        board, turn, depth, alpha, beta, future, options = job
        move = board.peek()
        maxDepth = options.get("maxDepth", depth)

        # `future` holds the expected line as concatenated 4-char moves
        # (e.g. f8b4c2c3f6e4). With at least three moves known, the move
        # matching the third one is searched at full depth.
        if len(future) >= 12 and move.uci() == future[8:12]:
            depth = maxDepth

        with sharedAlpha.get_lock():
            alpha = max(alpha, sharedAlpha.value)

        raw_score, tree = negamax(board, turn, depth - 1, -beta, -alpha, move.uci())
        score = -raw_score

        with sharedAlpha.get_lock():
            if sharedAlpha.value < score:
                sharedAlpha.value = score

        # TODO: a `score >= beta` early report was left disabled here
        # ("Is it ever possible?").
        outq.put((move, score, tree))
class Query_To_Database(object):
    """Request/reply client over RabbitMQ for database reads and writes.

    Requests are published to ``Write_Queue`` / ``Read_Queue``; the reply
    is matched by correlation id and converted into a ``Response``. Both
    request kinds share one lock (``write_lock``).
    """

    def __init__(self):
        self.params = pika.ConnectionParameters(host='rmq')
        self.params.heartbeat = 0
        self.params.socket_timeout = 2
        self.connection = pika.BlockingConnection(self.params)
        self.channel = self.connection.channel()

        self.write_response_data = None
        self.write_correl_id = None
        self.read_correl_id = None
        self.read_response_data = None
        self.write_lock = Value('i', 0)

        self.write_result = self.channel.queue_declare(queue='', exclusive=True)
        self.read_result = self.channel.queue_declare(queue='Response_Queue', exclusive=True)
        self.write_callback_queue = self.write_result.method.queue

        self.channel.basic_consume(queue=self.write_callback_queue,
                                   on_message_callback=self.write_response,
                                   auto_ack=True)
        self.channel.basic_consume(queue='Response_Queue',
                                   on_message_callback=self.read_response,
                                   auto_ack=True)

    def write_response(self, channel, method, properties, message_body):
        """Consumer callback: keep the reply that matches the pending write."""
        if properties.correlation_id == self.write_correl_id:
            self.write_response_data = json.loads(message_body)

    def read_response(self, channel, method, properties, message_body):
        """Consumer callback: keep the reply that matches the pending read."""
        if properties.correlation_id == self.read_correl_id:
            self.read_response_data = json.loads(message_body)

    def _publish(self, routing_key, reply_to, correlation_id, request_data):
        # Serialise the request's JSON body and publish it on the default exchange.
        data_to_send = request_data.get_json()
        self.channel.basic_publish(
            exchange='',
            routing_key=routing_key,
            body=json.dumps(data_to_send),
            properties=pika.BasicProperties(reply_to=reply_to,
                                            correlation_id=correlation_id,))

    @staticmethod
    def _reply_to_response(reply):
        # Turn the decoded reply into the HTTP response given to the caller.
        return Response("{}".format(reply['data']),
                        status=reply['status_code'],
                        mimetype='application/json')

    @staticmethod
    def _error_response(error):
        # Any failure is reported as a 500 carrying the exception text.
        return Response("{}".format(error), status=500, mimetype='application/json')

    def write_query_to_database(self, request_data):
        """Send a write request and wait for its reply.

        Returns a ``Response`` built from the reply, or a 500 ``Response``
        with the exception text if anything raises.
        """
        try:
            with self.write_lock.get_lock():
                self.write_response_data = None
                self.write_correl_id = str(uuid.uuid4())
                self._publish('Write_Queue', self.write_callback_queue,
                              self.write_correl_id, request_data)
                # Pump the connection until the matching reply has arrived.
                while self.write_response_data is None:
                    self.connection.process_data_events()
                return self._reply_to_response(self.write_response_data)
        except Exception as e:
            return self._error_response(e)

    def read_query_from_database(self, request_data):
        """Send a read request and wait for its reply.

        Returns a ``Response`` built from the reply, or a 500 ``Response``
        with the exception text if anything raises.
        """
        try:
            with self.write_lock.get_lock():
                self.read_response_data = None
                self.read_correl_id = str(uuid.uuid4())
                self._publish('Read_Queue', 'Response_Queue',
                              self.read_correl_id, request_data)
                # Pump the connection until the matching reply has arrived.
                while self.read_response_data is None:
                    self.connection.process_data_events()
                return self._reply_to_response(self.read_response_data)
        except Exception as e:
            return self._error_response(e)
class SyncedCrawlingProgress:
    """Progress table printer whose counters live in shared ``Value`` objects.

    A header row is printed on construction; ``inc`` prints a new row every
    time the count has advanced by at least ``update_every``.
    """

    def __init__(self, total_count=1000, update_every=100000):
        # Variables that need to be synced across workers
        self.count = Value('i', 0)
        self.last_time = Value('d', time.time())
        self.last_count = Value('i', 0)

        self.start_time = time.time()
        self.update_every = update_every
        self.total_count = total_count

        print(self.row_string(COLUMNS))
        print(ROW_SEPARATOR * (len(COLUMNS) * COL_WIDTH + len(COLUMNS) - 1))

    def row_string(self, values):
        """Return ``values`` as centred, separator-joined columns."""
        return COL_SEPARATOR.join(str(value).center(COL_WIDTH) for value in values)

    def inc(self, by=1):
        """Advance the count by ``by`` and print a row when one is due."""
        with self.count.get_lock():
            self.count.value += by
            if self.count.value - self.last_count.value >= self.update_every:
                # Print first: the row is computed from the previous
                # last_count / last_time, which are refreshed afterwards.
                self.print_update()
                with self.last_time.get_lock(), self.last_count.get_lock():
                    self.last_count.value = self.count.value
                    self.last_time.value = time.time()

    def print_update(self):
        """Print count, total, percentage, runtime, rate and estimated remaining time."""
        now = time.time()
        count = self.count.value

        # A zero total (e.g. via set_total_count(0)) must not crash the printer.
        percentage = count / self.total_count * 100 if self.total_count else 0
        runtime = now - self.start_time

        # The clock may not have advanced between two updates on coarse
        # timers; treat that as "rate unknown" instead of dividing by zero.
        elapsed = now - self.last_time.value
        if elapsed > 0:
            increases_per_second = (count - self.last_count.value) / elapsed
        else:
            increases_per_second = 0.0

        if increases_per_second > 0:
            remaining = self.time_str((self.total_count - count) / increases_per_second)
        else:
            remaining = "--:--:--"

        print(
            self.row_string([
                count,
                self.total_count,
                "%02.0d%%" % percentage,
                self.time_str(runtime),
                "%.02f" % increases_per_second,
                remaining,
            ]))

    def time_str(self, seconds):
        """Format a duration in seconds as ``HH:MM:SS``."""
        return '%02d:%02d:%02d' % (seconds / 3600, seconds / 60 % 60, seconds % 60)

    def set_total_count(self, total_count):
        """Replace the expected total used for percentage and remaining time."""
        self.total_count = total_count
class StopWatch(Process):
    """Stop watch running in its own process and stopped by a hotkey.

    The process is launched from the constructor and then idles until
    ``start()`` is called. Timing ends when the hotkey is pressed (or when
    ``join(forceStop=True)`` is used); the measured interval is stored in a
    shared value read by ``getValue()``.
    """

    def __init__(self, hotkey=None):
        super().__init__(name='Stop Watch')
        self._stop_event = Event()
        self._stop_event.clear()
        self._waiter = Event()
        self._waiter.clear()
        # NOTE(review): 'Q' is an unsigned integer slot, so this only works
        # if `timer()` returns integers - confirm against the timer in use.
        self._elapsed_time = Value('Q', 0)
        self._hotkey = hotkey or 'space'
        super().start()
        # Waits till the process is started.
        # NOTE(review): the child sets and immediately clears this event, so
        # the hand-shake relies on this wait already being in progress.
        self._stop_event.wait()

    def start(self):
        """Begin timing.

        Raises:
            RuntimeError: if the background process is not running.
        """
        # is_alive must be *called*; the bare bound method is always truthy,
        # which made the error branch unreachable.
        if self.is_alive():
            self._stop_event.set()
        else:
            raise RuntimeError()

    def join(self, *args, forceStop=False, **kwarks):
        """Join the process; with ``forceStop`` end the measurement first.

        If the measurement had not been ended when ``join`` was called, the
        stored elapsed time is reset to 0 afterwards.
        """
        zeroTimer = not self._waiter.is_set()
        if forceStop and self.is_alive():
            self._waiter.set()
        super().join(*args, **kwarks)
        if zeroTimer:
            with self._elapsed_time.get_lock():
                self._elapsed_time.value = 0

    def getValue(self):
        """Return the stored elapsed time, or ``None`` while it is still 0."""
        return self._elapsed_time.value or None

    def run(self):
        """Process body: wait for start, time until the hotkey, store the result."""
        s, e = None, None

        def hotkey_action():
            # The hotkey is ignored until timing has actually begun.
            if s is not None:
                self._waiter.set()

        remove = add_hotkey(self._hotkey, hotkey_action)
        # Inform the main process that this process is started
        self._stop_event.set()
        # Clears the stop event so that process halts till it is started again with start function.
        self._stop_event.clear()
        self._stop_event.wait()
        s = timer()
        self._waiter.wait()
        e = timer()
        with self._elapsed_time.get_lock():
            self._elapsed_time.value = e - s
        remove_hotkey(remove)
class Counter:
    """Shared integer counter; every access goes through the value's lock."""

    def __init__(self):
        self.counter = Value('i', 0)

    def increment(self):
        """Add one to the counter."""
        shared = self.counter
        with shared.get_lock():
            shared.value = shared.value + 1

    def get_value(self):
        """Return the current count."""
        shared = self.counter
        with shared.get_lock():
            current = shared.value
        return current
class PoolManager:
    """Owns a worker ``Pool`` plus a counter of tasks still outstanding.

    Usable as a context manager: ``__enter__`` opens the pool and
    ``__exit__`` closes it, waiting for outstanding tasks first.
    """

    def __init__(self, n_workers=None):
        self.n_workers = ensure_n_workers(n_workers)
        self.ready = False
        self.workers = None  # type: Pool
        self._remain_tasks = None
        self._work_done_event = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def open(self):
        """Create the task counter, the completion event and the pool.

        Returns:
            PoolManager: ``self``, so the call can be chained.
        """
        assert not self.ready
        self._remain_tasks = Value('i', 0)
        self._work_done_event = Event()
        self.workers = Pool(self.n_workers, initializer=pool_init)
        self.ready = True
        return self

    def close(self, force=False):
        """Close and join the pool.

        Args:
            force: when false (default), first wait until the task counter
                has dropped to zero.
        """
        assert self.ready
        if not force:
            self._work_done_event.clear()
            # Re-check the counter after every wake-up; the event only says
            # "some task finished", not "all tasks finished".
            while self.count_remaining_tasks() > 0:
                self._work_done_event.wait()
                self._work_done_event.clear()
        self.workers.close()
        self.workers.join()
        # Mark the manager as closed so it can be opened again and a second
        # close() is caught by the assertion above.
        self.ready = False

    def count_remaining_tasks(self):
        """Return the number of tasks not yet reported as finished."""
        with self._remain_tasks.get_lock():
            return self._remain_tasks.value

    def increase_task_counter(self):
        """Record that one more task was submitted."""
        with self._remain_tasks.get_lock():
            self._remain_tasks.value += 1

    def decrease_task_counter(self):
        """Record that a task finished and wake a pending ``close()``."""
        with self._remain_tasks.get_lock():
            self._remain_tasks.value -= 1
        self._work_done_event.set()
class Control(object):
    """Shared (long) control variable exchanged between main and workers.

    Args:
        initial_value: value the shared control variable starts with.
    """

    def __init__(self, initial_value=CONTROL_ACTIVE):
        self.control = Value('l', initial_value)

    def check_value(self, value, lock=False):
        """Tell whether the control variable currently equals ``value``.

        Args:
            value: value to compare against.
            lock: take the variable's lock while reading.

        Returns:
            True if the values are equal.
        """
        current = self.get_value(lock=lock)
        return current == value

    def check_value_positive(self, lock=False):
        """Tell whether the control variable is currently greater than zero.

        Args:
            lock: take the variable's lock while reading.
        """
        current = self.get_value(lock=lock)
        return current > 0

    def get_value(self, lock=True):
        """Read the control variable.

        Args:
            lock: take the variable's lock while reading.
        """
        shared = self.control
        if not lock:
            return shared.value
        with shared.get_lock():
            return shared.value

    def set_value(self, value):
        """Write the control variable; the lock is always taken.

        Args:
            value: new value to store.
        """
        shared = self.control
        with shared.get_lock():
            shared.value = value
class Counter:
    """Context manager counting how many ``with`` blocks are currently open.

    Entering adds one to the shared integer, leaving subtracts one again.
    """

    def __init__(self):
        self.value = Value(ctypes.c_int)

    def _bump(self, step):
        # Apply `step` to the shared integer under its lock.
        shared = self.value
        with shared.get_lock():
            shared.value += step

    def __enter__(self):
        self._bump(1)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._bump(-1)

    def __repr__(self):
        return str(self.value.value)
class Control(object):
    """Shared (long) control variable exchanged between main and workers.

    Args:
        initial_value: value the shared control variable starts with.
    """

    def __init__(self, initial_value=CONTROL_ACTIVE):
        self.control = Value('l', initial_value)

    def check_value(self, value, lock=False):
        """Tell whether the control variable currently equals ``value``.

        Args:
            value: value to compare against.
            lock: take the variable's lock while reading.

        Returns:
            True if the values are equal.
        """
        current = self.get_value(lock=lock)
        return current == value

    def check_value_positive(self, lock=False):
        """Tell whether the control variable is currently greater than zero.

        Args:
            lock: take the variable's lock while reading.
        """
        current = self.get_value(lock=lock)
        return current > 0

    def get_value(self, lock=True):
        """Read the control variable.

        Args:
            lock: take the variable's lock while reading.
        """
        shared = self.control
        if not lock:
            return shared.value
        with shared.get_lock():
            return shared.value

    def set_value(self, value):
        """Write the control variable; the lock is always taken.

        Args:
            value: new value to store.
        """
        shared = self.control
        with shared.get_lock():
            shared.value = value
class TemporaryChatAvatarsManager:
    """Manage the short-lived avatars in /static/img/temporary_chat_avatars.

    The directory holds chat avatars that users pick while creating a chat,
    before the chat itself is finally created. Each picture is saved and
    removed again 10 seconds later; this manager drives that process.
    """

    def __init__(self):
        # Directory in which the temporary avatars are stored
        self.dir = os.path.join(PATH_TO_ROOT, "static", "img", "temporary_chat_avatars")
        # A temporary file is named after its sequence number, hence the counter
        self.files_counter = Value('i', 0)
        # Counter values whose files have already been removed
        self.released_values = []
        self.clear_temporary_chat_avatars_dir()

    def clear_temporary_chat_avatars_dir(self):
        """Remove every file in the directory except ``*.md`` files."""
        for file in os.listdir(self.dir):
            if not file.endswith(".md"):
                os.remove(os.path.join(self.dir, file))

    def load_avatar(self, data: bytes) -> str:
        """Save the uploaded picture via ``make_icon`` and schedule its removal.

        Returns:
            The avatar path relative to ``app``.
        """
        with self.files_counter.get_lock():
            self.files_counter.value += 1
            current_value = self.files_counter.value
        # Use the number reserved under the lock; re-reading the counter here
        # could pick up a value taken by a concurrent upload.
        filename = f"{current_value}.png"
        path_to_avatar = os.path.join(self.dir, filename)
        make_icon(data, path_to_avatar)
        self.delete_avatar(path_to_avatar, current_value)
        return os.path.relpath(path_to_avatar, "app")

    @delayed_procedure(10)
    def delete_avatar(self, path_to_avatar, value):
        """Remove the avatar and release the name that was given to it."""
        try:
            os.remove(path_to_avatar)
        except FileNotFoundError:
            # Already gone (e.g. the directory was cleared); the name must
            # still be released.
            pass
        self.release_value(value)

    def release_value(self, value):
        """Record ``value`` as released and roll the counter back as far as possible.

        The counter is only lowered while its current (highest) value has
        been released, so a number that still belongs to an existing file is
        never handed out again.
        """
        with self.files_counter.get_lock():
            self.released_values.append(value)
            while self.files_counter.value in self.released_values:
                self.released_values.remove(self.files_counter.value)
                self.files_counter.value -= 1
class WaitGroup(object):
    """Counter of outstanding work items that can be waited on.

    ``add`` registers pending items, ``done`` marks one as finished and
    ``wait`` polls until the counter is back at zero.
    """

    def __init__(self):
        self.counter = Value('i', 0)

    def wait(self, interval=0.001):
        """Sleep in ``interval``-second steps until the counter reads zero."""
        while True:
            if self.counter.value == 0:
                return
            time.sleep(interval)

    def add(self, count):
        """Register ``count`` additional pending items."""
        pending = self.counter
        with pending.get_lock():
            pending.value = pending.value + count

    def done(self):
        """Mark one pending item as finished."""
        pending = self.counter
        with pending.get_lock():
            pending.value = pending.value - 1
class Counter(object):
    """Pair of shared counters: number of errors and number of attempts."""

    def __init__(self):
        self.e = Value('i', 0)  # errors seen so far
        self.t = Value('i', 0)  # total attempts so far

    def inc_error(self):
        """Count one more error."""
        with self.e.get_lock():
            self.e.value += 1

    def inc_total(self):
        """Count one more attempt."""
        with self.t.get_lock():
            self.t.value += 1

    def print_error(self):
        """Print the error rate as a percentage of the total.

        With no attempts counted yet the rate is reported as 0.0 instead of
        raising ``ZeroDivisionError``.
        """
        total = self.t.value
        rate = self.e.value / total * 100 if total else 0.0
        print(rate, "% error.")
class Transformator(Device):
    """Device that counts the particles it passes on to the next device."""

    def __init__(self, nomenclature="", width=0., height=0.):
        Device.__init__(self, nomenclature, width, height)
        self.count = Value('i', 0)

    def __repr__(self):
        return (str(self) + "("
                + "width=" + str(self.width) + "m, "
                + "height=" + str(self.height) + "m, "
                + "length=" + str(self.length) + "m, "
                + "count=" + str(self.count.value) + ")")

    def transport(self, particle):
        """Count ``particle`` unless it is lost, then hand it to the next device.

        Returns:
            Whatever the next device's ``transport`` returns, or ``None``
            when the particle is lost or there is no next device.
        """
        if not self.is_particle_lost(particle):
            with self.count.get_lock():
                self.count.value += 1
            if self.next:
                return self.next.transport(particle)

    def reset(self):
        """Zero the counter of this device and of every following device."""
        # Take the lock here as well so a reset cannot interleave with the
        # read-modify-write in transport().
        with self.count.get_lock():
            self.count.value = 0
        if self.next:
            self.next.reset()
def start(self):
    """Run one gridpack job per worker in a pool and report progress.

    A shared counter (``n_finished``, published as a module global and
    handed to each worker through the pool initializer) is polled every
    five seconds and written to stdout until the asynchronous map is ready.
    The results are printed before the pool is closed and joined.
    """
    global n_finished
    n_finished = Value('i', 0)

    process_names = ', '.join(' %s' % i for i in self.processes)
    print('generating gridpacks for %s:' % self.card_dir + process_names)
    print('starting pool with %i workers' % self.worker)

    pool = Pool(processes=self.worker, initializer=init, initargs=(n_finished, ))
    jobs = [(self, i) for i in range(self.worker)]
    result = pool.map_async(submit_job_unpack, jobs, chunksize=1)

    while True:
        if result.ready():
            break
        with n_finished.get_lock():
            done = n_finished.value
        # "\r" rewrites the same terminal line on every poll.
        sys.stdout.write("\r(" + str(done) + "/" + str(self.worker) + ") done.")
        sys.stdout.flush()
        time.sleep(5)

    print(result.get())
    pool.close()
    pool.join()
class PoseEstimator:
    """Front end for a pose-estimation worker running in its own process.

    Keypoints produced by the worker arrive through a bounded queue; the
    shared flags `run` and `done_loading` are passed to the worker.
    """

    def __init__(self, camera_class, draw=True):
        self.camera_class = camera_class
        self.draw = draw
        self.keypoint_queue = Queue(maxsize=10)
        self.run = Value('i', 1)
        self.done_loading = Value('i', 0)
        worker_args = (self.keypoint_queue, self.run, self.done_loading,
                       self.camera_class, self.draw)
        self.process = Process(target=pose_estimation_worker, args=worker_args)

    @property
    def keypoints_available(self):
        """True when the keypoint queue reports it is not empty."""
        queue_is_empty = self.keypoint_queue.empty()
        return not queue_is_empty

    def get_keypoints(self):
        """Take the next item from the keypoint queue (blocks until one exists)."""
        return self.keypoint_queue.get()

    def start(self):
        """Start the worker process and poll until it reports loading is done."""
        logging.info('Starting PoseEstimator process...')
        self.process.start()
        while True:
            if self.done_loading.value:
                break
            time.sleep(0.1)
        logging.info('PoseEstimator process finished starting.')

    def stop(self):
        """Clear the shared run flag and wait for the worker process to exit."""
        run_flag = self.run
        with run_flag.get_lock():
            run_flag.value = 0
        self.process.join()
class counter_obj(object):
    """Counter persisted in ``./static/count.txt``.

    The shared ``Value`` is initialised from the file; its lock serialises
    the file reads and writes done by this class.
    """

    def __init__(self):
        # start from the count that was saved earlier
        saved_count = get_val_from_file()
        self.val = Value('i', saved_count)

    def increment(self):
        """Read the count from the file, add one and write it back."""
        with self.val.get_lock():
            next_count = int(get_val_from_file() + 1)
            # persist the new count
            with open(r'./static/count.txt', 'w') as count_file:
                count_file.write(str(next_count))

    def get_value(self):
        """Return the count currently stored in the counter file."""
        with self.val.get_lock():
            stored = get_val_from_file()
            return stored
def _calibrate_axis(self, axis_cur: multiprocessing.Value, axis_label, axis_min, axis_max, axis_calibration_to_max):
    """Home one axis and synchronise its stored coordinate.

    Sends ``G28`` for ``axis_label`` with a positive or negative
    ``config.CALIBRATION_DISTANCE`` depending on ``axis_calibration_to_max``.
    If the controller answers ``RESPONSE_OK``, ``axis_cur`` is set to
    ``axis_max`` / ``axis_min`` and the same coordinate is pushed back with
    ``G92``.

    Returns:
        The controller's reply to ``G92``, or its reply to ``G28`` when
        that was not ``RESPONSE_OK``.
    """
    with axis_cur.get_lock():
        if axis_calibration_to_max:
            distance, target = config.CALIBRATION_DISTANCE, axis_max
        else:
            distance, target = -config.CALIBRATION_DISTANCE, axis_min

        self._smc.write("G28 {0}{1}".format(axis_label, distance))
        # "ok\r\n"
        response = self._smc.read_some()
        if not response == self.RESPONSE_OK:
            return response
        axis_cur.value = target

        # set fresh current coordinates on smoothie too
        self._smc.write("G92 {0}{1}".format(axis_label, axis_cur.value))
        # "ok\r\n"
        return self._smc.read_some()
def run_with_exception_except_test(self):
    """
    An exception raised inside an overridden `run_with_exception`
    must surface from `join`
    """
    class IncrementThread(StoppableExceptionThread):
        """ Increments a shared value and raises once it exceeds 5 """
        def __init__(self, *args, **kwargs):
            shared, remaining = args[0], args[1:]
            self.x = shared
            StoppableExceptionThread.__init__(self, *remaining, **kwargs)

        def run_with_exception(self):
            while True:
                if self._stop.is_set():
                    break
                with self.x.get_lock():
                    self.x.value += 1
                    if self.x.value > 5:
                        raise ValueError('x > 5')

    x = Value('i', 0)
    st = IncrementThread(x)
    st.start()
    sleep(1)
    assert_equals(st.stopped, False)
    with self.assertRaises(ValueError):
        st.join()
    assert_equals(st.is_alive(), False)
    with x.get_lock():
        assert_equals(x.value, 6)
class TPSBucket:
    """Token bucket refilled to ``expected_tps`` tokens once per second.

    ``start()`` launches a daemon thread doing the refills, ``get_token()``
    takes one token if any is left, ``stop()`` ends the refill thread.
    """

    def __init__(self, expected_tps):
        self.number_of_tokens = Value('i', 0)
        self.expected_tps = expected_tps
        # Set by stop(); also used as an interruptible one-second sleep.
        self._stop_event = threading.Event()
        self.bucket_refresh_thread = threading.Thread(
            target=self.refill_bucket_per_second)
        self.bucket_refresh_thread.daemon = True

    def refill_bucket_per_second(self):
        """Thread body: refill, wait one second, repeat until stopped."""
        while not self._stop_event.is_set():
            self.refill_bucket()
            self._stop_event.wait(1)

    def refill_bucket(self):
        """Reset the number of available tokens to ``expected_tps``."""
        with self.number_of_tokens.get_lock():
            self.number_of_tokens.value = self.expected_tps

    def start(self):
        """Start the background refill thread."""
        self.bucket_refresh_thread.start()

    def stop(self):
        """Ask the refill thread to finish and wait for it.

        Threads cannot be killed, so the loop is ended through an event.
        Calling this before ``start()`` is a no-op apart from setting it.
        """
        self._stop_event.set()
        if self.bucket_refresh_thread.is_alive():
            self.bucket_refresh_thread.join()

    def get_token(self):
        """Take one token.

        Returns:
            bool: True if a token was available and has been consumed.
        """
        # Cheap unlocked pre-check; the decision is re-made under the lock.
        if self.number_of_tokens.value <= 0:
            return False
        with self.number_of_tokens.get_lock():
            if self.number_of_tokens.value > 0:
                self.number_of_tokens.value -= 1
                return True
        return False
class ScriptRunnerCallbacks(DefaultRunnerCallbacks):
    """Runner callbacks that advance a progress bar once per finished host.

    Failed, ok and unreachable hosts each advance the bar; skipped hosts
    are only logged.
    """

    def __init__(self, pbar):
        self.pbar = pbar
        self.counter = Value('i', 0)
        super(ScriptRunnerCallbacks, self).__init__()

    def update_pbar(self):
        """Increment the shared counter and show the new count on the bar."""
        finished = self.counter
        with finished.get_lock():
            finished.value += 1
            self.pbar.update(finished.value)

    def on_failed(self, host, res, ignore_errors=False):
        self.update_pbar()

    def on_ok(self, host, res):
        self.update_pbar()

    def on_unreachable(self, host, res):
        self.update_pbar()

    def on_skipped(self, host, item=None):
        logger.warning('{host} skipped'.format(host=host))

    def on_no_hosts(self):
        print('no hosts matched\n', file=sys.stderr)
def run_stop_test(self):
    """
    A StoppableThread subclass overriding `run` must stop when asked to
    """
    class IncrementThread(StoppableThread):
        """ Keeps incrementing a shared value until stopped """
        def __init__(self, *args, **kwargs):
            shared, remaining = args[0], args[1:]
            self.x = shared
            super(IncrementThread, self).__init__(*remaining, **kwargs)

        def run(self):
            while True:
                if self._stop.is_set():
                    break
                with self.x.get_lock():
                    self.x.value += 1

    x = Value('i', 0)
    st = IncrementThread(x)
    st.start()
    assert_equals(st.stopped, False)
    assert_equals(st.is_alive(), True)

    # give the thread time to increment at least once
    sleep(0.5)
    st.stop()
    assert_equals(st.stopped, True)

    st.join()
    assert_equals(st.is_alive(), False)
    with x.get_lock():
        assert_greater(x.value, 0)
def process(page: int, page_leap: int, parsing: mp.Value):
    """Scrape result pages starting at ``page`` in steps of ``page_leap``.

    For every event found on a page the tournament is inserted, its id is
    looked up and the event itself is parsed. The first page without events
    sets the shared ``parsing`` flag to 0, which ends the loop (and is
    visible to every other holder of the flag). In production mode
    exceptions are logged as warnings, otherwise they are re-raised.
    """
    logger = init_logging(f'mtgtop8_scrapper_{page}.log')
    try:
        with psycopg2.connect(user=user, dbname=database) as con:
            con.autocommit = True
            with con.cursor() as cursor:
                while parsing.value > 0:
                    base_url = re.match('.*.com/', search_url).group()
                    logger.info(f'Fetching for page {page} in format {url_format} from url {search_url}')
                    child_page_value = {'cp': page}  # set page value
                    url_soup = get_and_wait(search_url, child_page_value)
                    events = prs.get_events_from_page(url_soup, base_url, logger)

                    if not events:
                        # nothing left to scrape: clear the shared flag
                        with parsing.get_lock():
                            parsing.value = 0
                        continue

                    for event_name, event_date, event_url in events:
                        dbc.insert_into_tournament_info(event_name, event_date, url_format, event_url,
                                                        cursor, logger, prod_mode)
                        tourny_id = dbc.get_tournament_info_id(event_name, event_date, url_format,
                                                               event_url, cursor)
                        parse_event(tourny_id, event_url, base_url, cursor, logger, prod_mode)
                    page += page_leap
    except Exception as e:
        if not prod_mode:
            raise e
        logger.warning(str(e))
def start_log_server(
    host: str,
    logname: str,
    event: threading.Event,
    port: multiprocessing.Value,
    filename: str,
    logging_config: Dict,
    output_dir: str,
) -> None:
    """Set up logging, then serve log records on a randomly chosen port.

    Ports are drawn at random from 10000-65535. The chosen port is stored
    in the shared ``port`` value before serving starts. Any ``OSError``
    raised while creating the receiver or while serving leads to another
    attempt with a new random port.
    """
    setup_logger(filename=filename, logging_config=logging_config, output_dir=output_dir)

    # Keep trying until a receiver has served to completion.
    while True:
        candidate_port = random.randint(10000, 65535)
        try:
            receiver = LogRecordSocketReceiver(
                host=host,
                port=candidate_port,
                logname=logname,
                event=event,
            )
            with port.get_lock():
                port.value = candidate_port
            receiver.serve_until_stopped()
            return
        except OSError:
            pass
def create_image(queue: mp.Queue, magnification: int, n_generations: int, n_calculated: mp.Value):
    """Render queued board states to PNG files, forever.

    Each queue item is a mapping with the keys ``"board"`` (2-D array),
    ``"ants"`` (indexable positions) and ``"i"`` (generation number). Truthy
    cells are painted black, other cells white and ants red, every cell
    covering the pixel range returned by ``scale_indexes_to_range``. The
    image is written to ``results/gen<i>.png`` and ``n_calculated`` is
    incremented. ``n_generations`` is not used. The loop has no exit.
    """
    white, black, red = [255, 255, 255], [0, 0, 0], [255, 0, 0]

    while True:
        data = queue.get()
        matrix: np.ndarray = data["board"]
        ants = data["ants"]
        n_gen = data["i"]
        print(f"creating {n_gen} image")

        # NOTE(review): rows are sized from shape[1] and columns from
        # shape[0], while cells are addressed as [y, x] - confirm this is
        # right for non-square boards.
        image = np.zeros((matrix.shape[1] * magnification, matrix.shape[0] * magnification, 3))

        for y, x in np.ndindex(matrix.shape):
            x_lower, x_upper, y_lower, y_upper = scale_indexes_to_range(x, y, magnification)
            cell_color = black if matrix[y, x] else white
            image[y_lower:y_upper, x_lower:x_upper, :] = cell_color

        for ant in ants:
            x_lower, x_upper, y_lower, y_upper = scale_indexes_to_range(ant[0], ant[1], magnification)
            image[y_lower:y_upper, x_lower:x_upper, :] = red

        picture = Image.fromarray(image.astype('uint8'))
        picture.save(f'results/gen{n_gen}.png', 'PNG')

        with n_calculated.get_lock():
            n_calculated.value += 1
class Stat:
    """Collects a processed-line total and per-function error counts."""

    def __init__(self):
        self.total = Value('L')
        self.cerr = Counter()

    def inc(self):
        """Count one more processed line."""
        with self.total.get_lock():
            self.total.value += 1

    def err(self, id):
        """Count one more error for the function identified by ``id``."""
        self.cerr[id] += 1

    def print(self):
        """Write a separator followed by the error table."""
        tqdm.write('-------------------------------')
        self.print_errors()

    def print_errors(self):
        """Write the per-function error table and the overall error rate.

        With no lines processed the rate is reported as 0 instead of
        raising ``ZeroDivisionError``.
        """
        tqdm.write('%-20s | %s' % ('function', 'errors'))
        tqdm.write('---------------------+---------')
        for id, cnt in self.cerr.most_common():
            tqdm.write('%-20s | %d' % (id, cnt))

        total = self.total.value
        error_rate = sum(self.cerr.values()) / total * 100 if total else 0.0
        tqdm.write('Processed lines: %d (%0.2f%% errors)' % (total, error_rate))
        tqdm.write('')
class Control(object):
    """Shared (long) control variable with optional locking on reads."""

    def __init__(self, initial_value):
        self.control = Value('l', initial_value)

    def check_value(self, value, lock=False):
        """Return True when the control variable currently equals ``value``."""
        current = self.get_value(lock=lock)
        return current == value

    def check_value_positive(self, lock=False):
        """Return True when the control variable is currently above zero."""
        current = self.get_value(lock=lock)
        return current > 0

    def get_value(self, lock=True):
        """Read the control variable, taking its lock unless ``lock`` is false."""
        shared = self.control
        if not lock:
            return shared.value
        with shared.get_lock():
            return shared.value

    def set_value(self, value):
        """Write the control variable; the lock is always taken."""
        shared = self.control
        with shared.get_lock():
            shared.value = value
class Counter(object):
    """Shared integer counter with a locked increment and a plain read."""

    def __init__(self):
        self.val = Value('i', 0)

    @property
    def value(self):
        """Current counter value (read without taking the lock)."""
        return self.val.value

    def increment(self, n=1):
        """Add ``n`` to the counter while holding its lock."""
        shared = self.val
        with shared.get_lock():
            shared.value = shared.value + n
class AtomicCounter(object):
    """Shared integer counter whose updates return the resulting value."""

    def __init__(self, init_value=0):
        self._val = Value('i', init_value)

    @property
    def lock(self):
        """The lock guarding the counter, for callers that need to hold it."""
        return self._val.get_lock()

    @property
    def value(self):
        """Current counter value, read under the lock."""
        with self.lock:
            return self._val.value

    def increase(self, incr=1):
        """Add ``incr`` and return the new value."""
        with self.lock:
            self._val.value = self._val.value + incr
            return self._val.value

    def decrease(self, decr=1):
        """Subtract ``decr`` and return the new value."""
        with self.lock:
            self._val.value = self._val.value - decr
            return self._val.value
def run_in_parallel(cmds_queue, NPROC):
    """Run shell commands in forked children, at most ``NPROC`` at a time.

    Args:
        cmds_queue: mutable collection of command strings supporting
            ``len()`` and ``pop()`` (for example a list or a set); it is
            emptied by this call.
        NPROC: maximum number of commands running concurrently.

    The function returns once every command has finished.
    """
    running_ps = Value("i")
    child_pids = []

    while len(cmds_queue) > 0:
        # if we already have the maximum number of processes running,
        # then sleep and wait for a process to become free
        if running_ps.value >= NPROC:
            time.sleep(1)
            continue

        # get commands to run; if we can't then we are out of commands.
        # set.pop() raises KeyError, list.pop() raises IndexError.
        try:
            cmds = cmds_queue.pop()
        except (KeyError, IndexError):
            break

        with running_ps.get_lock():
            running_ps.value += 1

        # fork a process. The parent only records the child and moves on.
        pid = os.fork()
        if pid != 0:
            print("FORKED")
            child_pids.append(pid)
            continue

        # Child: run the command, then always release the slot and exit,
        # even if running the command raised.
        try:
            print(cmds)
            os.system(cmds)
        finally:
            with running_ps.get_lock():
                running_ps.value -= 1
            os._exit(0)

    # wait for outstanding processes to finish
    while True:
        with running_ps.get_lock():
            if running_ps.value == 0:
                break
        time.sleep(1)

    # Reap the children so they do not linger as zombies.
    for pid in child_pids:
        try:
            os.waitpid(pid, 0)
        except OSError:
            # already reaped elsewhere (e.g. SIGCHLD is being ignored)
            pass
    return
class Counter(object):
    """Shared progress counter together with a shared, growable maximum."""

    def __init__(self, maximum):
        self.max = Value('i', maximum)
        self.val = Value('i', 0)

    @property
    def value(self):
        """Current count (read without taking the lock)."""
        return self.val.value

    @property
    def maximum(self):
        """Current maximum (read without taking the lock)."""
        return self.max.value

    def increment(self, n=1):
        """Add ``n`` to the count and return the resulting count."""
        current = self.val
        with current.get_lock():
            current.value = current.value + n
            result = self.value
        return result

    def increment_both(self):
        """Raise the maximum by one, then the count by one; return the count."""
        ceiling = self.max
        with ceiling.get_lock():
            ceiling.value = ceiling.value + 1
        return self.increment()
class count(object):
    """Endless iterator of consecutive integers kept in a shared ``Value``.

    Each ``next()`` returns the stored number and advances it by one while
    holding the value's lock.
    """

    def __init__(self, c=0):
        self.c = Value('L', c)

    def __iter__(self):
        return self

    def __next__(self):
        shared = self.c
        with shared.get_lock():
            current = shared.value
            shared.value = current + 1
        return current

    def next(self):
        """Alias of ``__next__`` (Python 2 iterator protocol)."""
        return self.__next__()
class State(object):
    """Shared counter plus the process time recorded at construction."""

    def __init__(self):
        self.counter = Value('i', 0)
        created_at = time.process_time()
        self.start_ticks = Value('d', created_at)

    @property
    def value(self):
        """Current counter value (read without taking the lock)."""
        return self.counter.value

    @property
    def start(self):
        """Process time that was stored when the object was created."""
        return self.start_ticks.value

    def increment(self, n=1):
        """Add ``n`` to the counter while holding its lock."""
        shared = self.counter
        with shared.get_lock():
            shared.value = shared.value + n
def target_except_test(self):
    """
    An exception raised by the target function must propagate
    through `join`
    """
    def target_with_exception(x, stop_event):
        stop_event.wait(0.5)
        while True:
            if stop_event.is_set():
                break
            with x.get_lock():
                x.value += 1
                if x.value > 5:
                    raise ValueError('x > 5')

    x = Value('i', 0)
    st = StoppableExceptionThread(target=target_with_exception, args=(x,))
    st.start()
    assert_equals(st.stopped, False)
    assert_equals(st.is_alive(), True)

    with self.assertRaises(ValueError):
        st.join()
    assert_equals(st.stopped, False)
    assert_equals(st.is_alive(), False)

    with x.get_lock():
        assert_equals(x.value, 6)
def target_with_args_finishes_test(self):
    """ run target function with arguments """
    def target_finite(x, stop_event):
        # Give the caller time to observe the running thread first.
        stop_event.wait(0.5)
        while True:
            if stop_event.is_set():
                break
            with x.get_lock():
                x.value += 1
                if x.value > 5:
                    break

    shared = Value('i', 0)
    worker = StoppableExceptionThread(target=target_finite, args=(shared,))
    worker.start()

    # Right after start the thread is alive and has not been stopped.
    assert_equals(worker.stopped, False)
    assert_equals(worker.is_alive(), True)

    worker.join()

    # The target ended on its own, so the thread was never "stopped".
    assert_equals(worker.stopped, False)
    assert_equals(worker.is_alive(), False)

    with shared.get_lock():
        assert_equals(shared.value, 6)
class ltaSlave():
    """Slave that pulls jobs from a master's queue and runs them.

    The configuration is read from a Python module whose name is passed to
    the constructor. ``serve`` connects to the master through a
    ``SyncManager`` and starts one ``executer`` per job, keeping at most
    ``parallelJobs`` of them running.
    """

    # Configuration names copied one-to-one onto the instance, in the order
    # in which they are read.
    _CONFIG_NAMES = (
        'host', 'ltacpport', 'mailSlCommand', 'jobsdir', 'logger', 'logdir',
        'ltaClient', 'exportClient', 'momClient', 'pipelineRetry',
        'momRetry', 'ltaRetry', 'srmRetry', 'srmInit', 'momServer',
        'masterAddress', 'masterPort', 'masterAuth',
    )

    def __init__(self, config):
        """Load the configuration module named ``config``.

        Exits the process with status 2 when the configuration cannot be
        loaded or lacks a required name.
        """
        configFile = config
        try:
            self.readConfig(configFile)
        except Exception as e:
            print ('\n%s' % e)
            print('The Configuration is incomplete, exiting')
            # Same effect as exit(2), without relying on the site module.
            raise SystemExit(2)
        # Number of jobs currently running; shared with the executers.
        self.jobs = Value('i', 0)
        self.logger.info('Slave %s initialized' % self.host)

    def readConfig(self, configFile):
        """Import module ``configFile`` and copy its settings onto ``self``.

        Args:
            configFile: Importable module name (dotted names are allowed).

        Raises:
            ImportError: The module cannot be imported.
            AttributeError: The module lacks one of the required settings.
        """
        # A plain import replaces the former exec(eval(...)) star-import,
        # which cannot create function locals on Python 3 and executed a
        # string built from the caller-supplied name.
        import importlib
        settings = importlib.import_module(configFile)
        for name in self._CONFIG_NAMES:
            setattr(self, name, getattr(settings, name))
        # These two are stored under a different attribute name / last.
        self.maxTalkQueue = settings.maxSlaveTalkerQueue
        self.parallelJobs = settings.parallelJobs

    def serve(self):
        """Register with the master and process jobs forever."""
        class Manager(SyncManager):
            pass
        Manager.register('add_slave')
        Manager.register('remove_slave')
        Manager.register('slave_done')
        self.manager = Manager(address=(self.masterAddress, self.masterPort),
                               authkey=self.masterAuth)
        self.manager.connect()
        self.logger.debug('Master found')
        self.queue = self.manager.add_slave(self.host)
        self.momTalker = momTalker(self.logger, self.exportClient,
                                   self.momRetry, self.maxTalkQueue)
        self.momTalker.start()
        talker = self.momTalker.getQueue()
        self.logger.info('Slave %s started' % self.host)
        while True:
            if self.jobs.value < self.parallelJobs:
                # Free slot: wait up to 10 s for the next job.
                try:
                    job = self.queue.get(True, 10)
                except QueueEmpty:
                    job = None
                if job:
                    # Count the job before starting it; the executer gets
                    # the shared counter (self.jobs) as an argument.
                    with self.jobs.get_lock():
                        self.jobs.value += 1
                    runner = executer(self.logger, self.logdir, job, talker,
                                      self.jobs, self.momClient,
                                      self.ltaClient, self.host,
                                      self.ltacpport, self.mailSlCommand,
                                      self.manager, self.pipelineRetry,
                                      self.momRetry, self.ltaRetry,
                                      self.srmRetry, self.srmInit)
                    runner.start()
            else:
                # All slots busy: check again later.
                time.sleep(10)
class Task(object):
    """ Container of Jobs"""
    # TODO: Implement timeout support in add/delJob

    def __init__(self, name, timeout=0, onstart=None, ondone=None, params=None,
                 stdout=sys.stdout, stderr=sys.stderr):
        """Initialize task, which is a group of jobs to be executed

        name  - task name
        timeout  - execution timeout. Default: 0, means infinity
        onstart  - callback which is executed on the task starting (before
            the execution started) in the CONTEXT OF THE CALLER (main
            process) with the single argument, the task. Default: None
            ATTENTION: must be lightweight
        ondone  - callback which is executed on successful completion of the
            task in the CONTEXT OF THE CALLER (main process) with the single
            argument, the task. Default: None
            ATTENTION: must be lightweight
        params  - additional parameters to be used in callbacks
        stdout  - None or file name or PIPE for the buffered output to be
            APPENDED
        stderr  - None or file name or PIPE or STDOUT for the unbuffered
            error output to be APPENDED
            ATTENTION: PIPE is a buffer in RAM, so do not use it if the
            output data is huge or unlimited

        tstart  - start time is filled automatically on the execution start
            (before onstart). Default: None
        tstop  - termination / completion time after ondone
        """
        assert isinstance(name, str) and timeout >= 0, "Parameters validaiton failed"
        self.name = name
        self.timeout = timeout
        self.params = params
        # Bind the callbacks so that they receive the task as their argument.
        self.onstart = types.MethodType(onstart, self) if onstart else None
        self.ondone = types.MethodType(ondone, self) if ondone else None
        self.stdout = stdout
        self.stderr = stderr
        self.tstart = None
        self.tstop = None  # Termination / completion time after ondone
        # Private attributes
        # Number of jobs that were added and not yet deleted
        self._jobsnum = Value(ctypes.c_uint)
        # Graceful completion of all jobs, or at least one was terminated
        self._graceful = Value(ctypes.c_bool)
        self._graceful.value = True

    def addJob(self):
        """Add one more job to the task

        The first job sets tstart and runs onstart (if given).

        return  - updated task
        """
        initial = False
        with self._jobsnum.get_lock():
            if self._jobsnum.value == 0:
                initial = True
            self._jobsnum.value += 1
        # Run onstart if required
        if initial:
            self.tstart = time.time()
            if self.onstart:
                self.onstart()
        return self

    def delJob(self, graceful):
        """Delete one job from the task

        graceful  - the job is successfully completed or it was terminated

        When the last job is deleted, tstop is set; ondone runs only if every
        job of the task completed gracefully.

        return  - None
        """
        final = False
        with self._jobsnum.get_lock():
            self._jobsnum.value -= 1
            if self._jobsnum.value == 0:
                final = True
        if not graceful:
            self._graceful.value = False
        # Finalize if required. This is deliberately not an "elif": a task
        # whose last job was terminated must still get its stop time, while
        # ondone stays guarded by the graceful flag.
        if final:
            if self.ondone and self._graceful.value:
                self.ondone()
            self.tstop = time.time()
        return None
class imagequeue: """ This class keeps a queue of images which may be worked on in threads. :param SAXS.calibration Cal: The SAXS Calibration to use for the processing :param optparser options: The object with the comandline options of the saxsdog :param list args: List of command line options """ def __init__(self,Cals,options,args,conf): self.pool=[] self.cals=Cals self.conf=conf self.options=options self.picturequeue=Queue() self.histqueue=Queue() self.args=args self.allp=Value('i',0) self.stopflag=Value('i',0) self.dirwalker=False if not options.plotwindow: plt.switch_backend("Agg") self.fig=plt.figure() if options.plotwindow: plt.ion() def getlastdata(self): print "getdatata" + str(self.lastfile) return self.lastfile,self.lastdata def fillqueuewithexistingfiles(self): """ Fill the queue with the list of images that is already there. """ if self.options.walkdirinthreads: self.dirwalker=Process(target=filler,args=(self.picturequeue,self.args[0])) self.dirwalker.start() else: self.dirwalker=Process() self.dirwalker.start() filler(self.picturequeue,self.args[0]) def procimage(self,picture,threadid): #im=Image.open(picture,"r") #im.tag.tags max=60 if not self.options.silent: print "[",threadid,"] open: ",picture for i in range(max): try: image=misc.imread(picture) #tif = TiffFile(picture) #image = tif.asarray() except KeyboardInterrupt: return except IOError as e: try: print "cannot open ", picture, ", lets wait.", max-i ," s" print e.message, sys.exc_info()[0] time.sleep(1) continue except KeyboardInterrupt: return except: print "############" print sys.exc_info() continue if image.shape==tuple(self.cals[0].config["Geometry"]["Imagesize"]): break print "cannot open ", picture, ", lets wait.", max-i ," s" time.sleep(1) else: print "image ", picture, " has wrong format" return if self.options.outdir!="": basename=self.options.outdir+os.sep+('_'.join(picture.replace('./','').split(os.sep))[:-3]).replace('/',"_") basename=basename.replace(':', '').replace('.','') 
else: reldir=os.path.join( os.path.dirname(picture), self.options.relpath) if not os.path.isdir(reldir): os.mkdir(reldir) basename=os.path.join( reldir, os.path.basename(picture)[:-3]) data=[] for calnum,cal in enumerate(self.cals): basename+="_"+str(calnum) if not self.options.resume or not os.path.isfile(basename+'.chi'): data.append((cal.integratechi(image,basename+".chi").tolist())) if threadid==0 and self.options.plotwindow: # this is a hack it really schould be a proper GUI cal.plot(image,fig=self.fig) plt.draw() if self.options.writesvg: if not self.options.resume or not os.path.isfile(basename+'.svg'): cal.plot(image,basename+".svg",fig=self.fig) if self.options.writepng: if not self.options.resume or not os.path.isfile(basename+'.svg'): misc.imsave(basename+".png",image) #self.picturequeue.task_done() with self.allp.get_lock(): self.allp.value+=1 if self.options.silent: if np.mod(self.allp.value,100)==0: print "[",threadid,"] ",self.allp.value else: print "[",threadid,"] write: ",basename+".chi" return basename ,data def start(self): """ Start threads and directory observer. """ #start threads for threadid in range(1,self.options.threads): print "start proc [",threadid,"]" worker=Process(target=funcworker, args=(self,threadid)) worker.daemon=True self.pool.append(worker) worker.start() #self.processimage(picture,options) self.starttime=time.time() if self.options.watch: eventhandler=addtoqueue(self.picturequeue) observer = Observer() observer.schedule(eventhandler, self.args[0], recursive=True) observer.start() #We let the master process do some work because its useful for matplotlib. 
if not self.dirwalker: self.dirwalker=Process() self.dirwalker.start() if self.options.servermode: context = zmq.Context() socket = context.socket(zmq.REQ) tokenlist= self.conf['Server'].split(":") server=":".join([tokenlist[0],tokenlist[1],self.options.serverport]) print server socket.connect (server) from Leash import addauthentication try: while ( self.options.servermode or (not self.picturequeue.empty()) or self.dirwalker.is_alive() or self.options.watch): try: picture = self.picturequeue.get(timeout=1) except KeyboardInterrupt : break except Empty: continue lastfile, data =self.procimage(picture,0) self.histqueue.put(time.time()) if self.options.servermode: request={"command":"putplotdata","argument":{"data":{ "result":"plot","data":{"filename":lastfile,"array":data, "stat":{}} }}} socket.send_multipart([json.dumps(addauthentication( request,self.conf))]) socket.recv() if np.mod(self.allp.value,500)==0: self.timreport() except KeyboardInterrupt: if self.options.watch: observer.stop() observer.join() if self.options.servermode: context.destroy() self.stop() self.timreport() return self.allp.value, time.time()-self.starttime def stop(self): print "\n\nWaiting for the processes to terminate." self.stopflag.value=1 for worker in self.pool: worker.join(3) def timreport(self): tottime=time.time()-self.starttime if self.allp.value==0: print "We didn't do any pictures " else: print "\n\nelapsed time: ",tottime print "\nProcessed: ",self.allp.value," pic" print " time per pic: ", tottime/self.allp.value,"[s]" print " pic per second: ",self.allp.value/tottime,"[/s]"
class imagequeue: """ This class keeps a queue of images which may be worked on in threads. :param SAXS.calibration Cal: The SAXS Calibration to use for the processing :param optparser options: The object with the comandline options of the saxsdog :param list args: List of command line options """ def __init__(self,Cals,options,directory,conf): self.pool=[] self.cals=Cals self.conf=conf self.options=options self.picturequeue=Queue() self.histqueue=Queue(maxsize=10000) self.plotdataqueue=Queue(maxsize=1) self.directory=directory self.allp=Value('i',0) self.stopflag=Value('i',0) self.dirwalker=None self.observer=None if not options.plotwindow: plt.switch_backend("Agg") self.fig=plt.figure() if options.plotwindow: plt.ion() def getlastdata(self): print "getdatata" + str(self.lastfile) return self.lastfile,self.lastdata def fillqueuewithexistingfiles(self): """ Fill the queue with the list of images that is already there. """ if self.options.walkdirinthreads: self.dirwalker=Thread(target=filler,args=(self.picturequeue,self.directory)) self.dirwalker.start() else: filler(self.picturequeue,self.directory) def procimage(self,picture,threadid): filelist={} max=60 if not self.options.silent: print "[",threadid,"] open: ",picture for i in range(max): try: image=misc.imread(picture) except KeyboardInterrupt: return except IOError as e: try: print "cannot open ", picture, ", lets wait.", max-i ," s" print e.message, sys.exc_info()[0] time.sleep(1) continue except KeyboardInterrupt: return except: print "############" print sys.exc_info() continue if image.shape==tuple(self.cals[0].config["Geometry"]["Imagesize"]): break print "cannot open ", picture, ", lets wait.", max-i ," s" time.sleep(1) else: print "image ", picture, " has wrong format" return if self.options.outdir!="": basename=self.options.outdir+os.sep+('_'.join(picture.replace('./','').split(os.sep))[:-3]).replace('/',"_") basename=basename.replace(':', '').replace('.','') else: reldir=os.path.join( 
os.path.dirname(picture), self.options.relpath) if not os.path.isdir(reldir): os.mkdir(reldir) basename=os.path.join( reldir, os.path.basename(picture)[:-4]) data=[] integparams={} imgMetaData=datamerge.readtiff(picture) if "date" in imgMetaData: imgTime=imgMetaData["date"] else: imgTime="" for calnum,cal in enumerate(self.cals): if len(list(enumerate(self.cals)))==1: filename=basename else: filename=basename+"_c"+cal.kind[0]+str(calnum) chifilename=filename+".chi" if self.options.GISAXSmode == True and calnum==0: #pass on GISAXSmode information to calibration.integratechi chifilename="xxx" filelist[cal.kind+str(calnum)]=chifilename if not self.options.resume or not os.path.isfile(chifilename): result=cal.integratechi(image,chifilename,picture) result["Image"]=picture if "Integparam" in result: integparams[cal.kind[0]+str(calnum)]=result["Integparam"] data.append(result) if threadid==0 and self.options.plotwindow: # this is a hack it really schould be a proper GUI cal.plot(image,fig=self.fig) plt.draw() if self.options.writesvg: if not self.options.resume or not os.path.isfile(filename+'.svg'): cal.plot(image,filename+".svg",fig=self.fig) if self.options.writepng: if not self.options.resume or not os.path.isfile(filename+'.svg'): misc.imsave(filename+".png",image) if self.options.silent: if np.mod(self.allp.value,100)==0: print "[",threadid,"] ",self.allp.value else: print "[",threadid,"] write: ",filename+".chi" with self.allp.get_lock(): self.allp.value+=1 filelist["JSON"]=basename+".json" try: self.histqueue.put({"Time":float(time.time()), "ImgTime":imgTime, "FileList":filelist, "BaseName":basename, "IntegralParameters":integparams},block=False) except Full: print "Full" return basename ,data def clearqueue(self): while self.histqueue.empty()==False: self.histqueue.get() print "History Queue cleared" def start(self): """ Start threads and directory observer. 
""" #start threads for threadid in range(1,self.options.threads): print "start proc [",threadid,"]" worker=Process(target=funcworker, args=(self,threadid)) worker.daemon=True self.pool.append(worker) worker.start() #self.processimage(picture,options) self.starttime=time.time() if self.options.watch: eventhandler=addtoqueue(self.picturequeue) self.observer = Observer() self.observer.schedule(eventhandler, self.args[0], recursive=True) self.observer.start() #We let the master process do some work because its useful for matplotlib. if not self.options.nowalk: self.fillqueuewithexistingfiles() if self.options.servermode: from Leash import addauthentication try: while ( self.options.servermode or (not self.picturequeue.empty()) or (self.dirwalker and self.dirwalker.is_alive() ) or self.options.watch): try: picture = self.picturequeue.get(timeout=1) except Empty: continue lastfile, data =self.procimage(picture,0) if self.options.servermode: request={"command":"putplotdata","argument":{"data":{ "result":"plot","data":{"filename":lastfile,"graphs":data, "stat":{}} }}} self.plotdataqueue.put(request) if np.mod(self.allp.value,500)==0: self.timreport() except KeyboardInterrupt: pass self.stop() self.timreport() return self.allp.value, time.time()-self.starttime def stop(self): print "\n\nWaiting for the processes to terminate." 
if self.observer: self.observer.stop() self.observer.observer.join(1) self.stopflag.value=1 for worker in self.pool: print "join worker" worker.join(1) if self.dirwalker: self.dirwalker.join(1) print "empty pic queue" while True: try: self.picturequeue.get(False) except Empty: break print "empty hist queue" while True: try: self.histqueue.get(False) except Empty: break print "empty plot queue" while True: try: self.plotdataqueue.get(False) except Empty: break if os.sys.platform!="win32": try: self.histqueue.close() self.plotdataqueue.close() except Exception as e: print e def timreport(self): tottime=time.time()-self.starttime count=self.allp.value #print count if count==0: print "We didn't do any pictures " else: print "\n\nelapsed time: ",tottime print "\nProcessed: ",count," pic" print " time per pic: ", tottime/count,"[s]" print " pic per second: ",count/tottime,"[/s]" time.sleep(1)
class HogwildWorld(World):
    """World that hands examples to a pool of ``HogwildProcess`` workers.

    One inner world is built for this object, and ``opt['numthreads']``
    worker processes are created and started. The workers receive these
    shared objects:

    - ``queued_items``: a Semaphore that ``parley`` releases once per queued
      example (and ``shutdown`` once per worker to wake it up).
    - ``epochDone``: a Condition that ``synchronize`` waits on until no
      examples remain.
    - ``terminate``: a boolean Value set by ``shutdown``.
    - ``cnt``: an integer Value incremented by ``parley``; ``synchronize``
      waits for it to reach zero.
    """

    def __init__(self, world_class, opt, agents):
        self.inner_world = world_class(opt, agents)

        # Shared synchronisation state.
        self.queued_items = Semaphore(0)    # counts num exs to be processed
        self.epochDone = Condition()        # notifies when exs are finished
        self.terminate = Value('b', False)  # tells threads when to shut down
        self.cnt = Value('i', 0)            # number of exs that remain

        self.threads = []
        num_workers = opt['numthreads']
        for worker_id in range(num_workers):
            worker = HogwildProcess(worker_id, world_class, opt, agents,
                                    self.queued_items, self.epochDone,
                                    self.terminate, self.cnt)
            self.threads.append(worker)
        for worker in self.threads:
            worker.start()

    def __iter__(self):
        raise NotImplementedError('Iteration not available in hogwild.')

    def display(self):
        """Shut the workers down and report that display is unsupported."""
        self.shutdown()
        message = ('Hogwild does not support displaying in-run'
                   ' task data. Use `--numthreads 1`.')
        raise NotImplementedError(message)

    def episode_done(self):
        return False

    def parley(self):
        """Queue one item to be processed."""
        counter_lock = self.cnt.get_lock()
        with counter_lock:
            self.cnt.value = self.cnt.value + 1
        self.queued_items.release()

    def getID(self):
        return self.inner_world.getID()

    def report(self):
        return self.inner_world.report()

    def save_agents(self):
        self.inner_world.save_agents()

    def synchronize(self):
        """Sync barrier: will wait until all queued examples are processed."""
        def nothing_left():
            return self.cnt.value == 0

        with self.epochDone:
            self.epochDone.wait_for(nothing_left)

    def shutdown(self):
        """Set shutdown flag and wake threads up to close themselves"""
        # Raise the flag first ...
        flag_lock = self.terminate.get_lock()
        with flag_lock:
            self.terminate.value = True
        # ... then queue one fake example per worker so each one wakes up ...
        for _ in self.threads:
            self.queued_items.release()
        # ... and finally wait for all of them to finish.
        for worker in self.threads:
            worker.join()
class StressRunner(object):
    """This class contains functionality related to producing/consuming queries for the
    purpose of stress testing Impala.

    Queries will be executed in separate processes since python threading is limited
    to the use of a single CPU.
    """

    # This is the point at which the work queue will block because it is full.
    WORK_QUEUE_CAPACITY = 10

    def __init__(self):
        self._mem_broker = None

        # Synchronized blocking work queue for producer/consumers.
        self._query_queue = Queue(self.WORK_QUEUE_CAPACITY)

        # The Value class provides cross-process shared memory.
        self._mem_mb_needed_for_next_query = Value("i", 0)

        # All values below are cumulative.
        self._num_queries_dequeued = Value("i", 0)
        self._num_queries_started = Value("i", 0)
        self._num_queries_finished = Value("i", 0)
        self._num_queries_exceeded_mem_limit = Value("i", 0)
        self._num_queries_cancelled = Value("i", 0)
        self._num_queries_timedout = Value("i", 0)

        self.cancel_probability = 0
        self.spill_probability = 0

    @staticmethod
    def _increment(shared_value):
        """Add one to a shared Value while holding its lock.

        'value += 1' is a separate read and write; without the lock two runner
        processes can interleave and lose an update.
        """
        with shared_value.get_lock():
            shared_value.value += 1

    def run_queries(self, queries, impala, num_queries_to_run, mem_overcommit_pct,
                    should_print_status):
        """Runs queries randomly chosen from 'queries' and stops after
        'num_queries_to_run' queries have completed.

        Before a query is run, a mem limit will be chosen. 'spill_probability'
        determines the likelihood of choosing a mem limit that will cause spilling.
        To induce spilling, a value is randomly chosen below the min memory needed to
        avoid spilling but above the min memory needed with spilling. So the min/max
        query memory requirements must be determined before calling this method.

        If 'mem_overcommit_pct' is zero, an exception will be raised if any queries
        fail for any reason other than cancellation (controlled by the
        'cancel_probability' property), since each query should have enough memory to
        run successfully. If non-zero, failures due to insufficient memory will be
        ignored if memory was overcommitted at any time during execution.

        If a query completes without error, the result will be verified. An error
        will be raised upon a result mismatch.
        """
        self._mem_broker = MemBroker(
            impala.min_impalad_mem_mb,
            int(impala.min_impalad_mem_mb * mem_overcommit_pct / 100))

        # Print the status to show the state before starting.
        if should_print_status:
            self._print_status_header()
            self._print_status()
            lines_printed = 1
            last_report_secs = 0

        # Start producing queries.
        def enque_queries():
            try:
                for _ in range(num_queries_to_run):
                    self._query_queue.put(choice(queries))
            except Exception as e:
                current_thread().error = e
                raise e
        enqueue_thread = create_and_start_daemon_thread(enque_queries)

        # Start a thread to check if more producers are needed. More producers are
        # needed when no queries are currently dequeued and waiting to be started.
        runners = list()

        def start_additional_runners_if_needed():
            try:
                while self._num_queries_started.value < num_queries_to_run:
                    # Remember num dequeued/started are cumulative.
                    if self._num_queries_dequeued.value \
                            == self._num_queries_started.value:
                        impalad = impala.impalads[len(runners) % len(impala.impalads)]
                        runner = Process(target=self._start_single_runner,
                                         args=(impalad, ))
                        runner.daemon = True
                        runners.append(runner)
                        runner.start()
                    sleep(1)
            except Exception as e:
                current_thread().error = e
                raise e
        runners_thread = create_and_start_daemon_thread(
            start_additional_runners_if_needed)

        # Wait for everything to finish but exit early if anything failed.
        sleep_secs = 0.1
        while enqueue_thread.is_alive() or runners_thread.is_alive() or runners:
            if enqueue_thread.error or runners_thread.error:
                sys.exit(1)
            # Iterate over a snapshot: removing from the list being enumerated
            # would skip the element that follows each removed runner. The
            # list object itself must stay the same because the runners thread
            # appends to it.
            for runner in list(runners):
                if runner.exitcode is None:
                    continue
                if runner.exitcode != 0:
                    sys.exit(runner.exitcode)
                runners.remove(runner)
            sleep(sleep_secs)
            if should_print_status:
                last_report_secs += sleep_secs
                if last_report_secs > 5:
                    last_report_secs = 0
                    # Repeat the header every 50 status lines.
                    lines_printed %= 50
                    if lines_printed == 0:
                        self._print_status_header()
                    self._print_status()
                    lines_printed += 1

        # And print the final state.
        if should_print_status:
            self._print_status()

    def _start_single_runner(self, impalad):
        """Consumer function to take a query of the queue and run it.

        This is intended to run in a separate process so validating the result set
        can use a full CPU.
        """
        runner = QueryRunner()
        runner.impalad = impalad
        runner.connect()

        while not self._query_queue.empty():
            try:
                query = self._query_queue.get(True, 1)
            except Empty:
                continue
            with self._num_queries_dequeued.get_lock():
                query_idx = self._num_queries_dequeued.value
                self._num_queries_dequeued.value += 1

            if not query.required_mem_mb_without_spilling:
                mem_limit = query.required_mem_mb_with_spilling
                solo_runtime = query.solo_runtime_secs_with_spilling
            elif self.spill_probability < random():
                mem_limit = query.required_mem_mb_without_spilling
                solo_runtime = query.solo_runtime_secs_without_spilling
            else:
                mem_limit = randrange(query.required_mem_mb_with_spilling,
                                      query.required_mem_mb_without_spilling + 1)
                solo_runtime = query.solo_runtime_secs_with_spilling

            # Queries are started in the order in which they were dequeued.
            while query_idx > self._num_queries_started.value:
                sleep(0.1)
            self._mem_mb_needed_for_next_query.value = mem_limit

            with self._mem_broker.reserve_mem_mb(mem_limit) as reservation_id:
                self._increment(self._num_queries_started)
                should_cancel = self.cancel_probability > random()
                if should_cancel:
                    timeout = randrange(1, max(int(solo_runtime), 2))
                else:
                    timeout = solo_runtime * max(
                        10,
                        self._num_queries_started.value
                        - self._num_queries_finished.value)
                report = runner.run_query(query, timeout, mem_limit)
                if report.timed_out and should_cancel:
                    report.was_cancelled = True
                self._update_from_query_report(report)
                if report.non_mem_limit_error:
                    error_msg = str(report.non_mem_limit_error)
                    # There is a possible race during cancellation. If a fetch
                    # request fails (for example due to hitting a mem limit), just
                    # before the cancellation request, the server may have already
                    # unregistered the query as part of the fetch failure. In that
                    # case the server gives an error response saying the handle is
                    # invalid.
                    if "Invalid query handle" in error_msg and report.timed_out:
                        continue
                    # Occasionally the network connection will fail, and depending
                    # on when the failure occurred during run_query(), an attempt
                    # to get the profile may be made which results in "Invalid
                    # session id" since the server destroyed the session upon
                    # disconnect.
                    if "Invalid session id" in error_msg:
                        continue
                    raise Exception("Query failed: %s" % str(report.non_mem_limit_error))
                if report.mem_limit_exceeded \
                        and not self._mem_broker.was_overcommitted(reservation_id):
                    raise Exception("Unexpected mem limit exceeded; mem was not overcommitted\n"
                                    "Profile: %s" % report.profile)
                if not report.mem_limit_exceeded \
                        and not report.timed_out \
                        and report.result_hash != query.result_hash:
                    raise Exception("Result hash mismatch; expected %s, got %s"
                                    % (query.result_hash, report.result_hash))

    def _print_status_header(self):
        print(" Done | Running | Mem Exceeded | Timed Out | Canceled | Mem Avail | Mem Over "
              "| Next Qry Mem")

    def _print_status(self):
        print("%5d | %7d | %12d | %9d | %8d | %9d | %8d | %12d" % (
            self._num_queries_finished.value,
            self._num_queries_started.value - self._num_queries_finished.value,
            self._num_queries_exceeded_mem_limit.value,
            self._num_queries_timedout.value - self._num_queries_cancelled.value,
            self._num_queries_cancelled.value,
            self._mem_broker.available_mem_mb,
            self._mem_broker.overcommitted_mem_mb,
            self._mem_mb_needed_for_next_query.value))

    def _update_from_query_report(self, report):
        """Fold the outcome flags of one finished query into the shared counters."""
        self._increment(self._num_queries_finished)
        if report.mem_limit_exceeded:
            self._increment(self._num_queries_exceeded_mem_limit)
        if report.was_cancelled:
            self._increment(self._num_queries_cancelled)
        if report.timed_out:
            self._increment(self._num_queries_timedout)
class MemBroker(object):
    """Coordinates memory usage between clients that run in different processes.

    A reservation request blocks until the requested amount fits into what is
    still available, so the sum of all reservations never exceeds the real
    memory plus the 'overcommitable' amount. The lock that belongs to
    '_available' also protects the other shared members.
    """

    def __init__(self, real_mem_mb, overcommitable_mem_mb):
        """'real_mem_mb' is the amount of memory each impalad is able to use.
        'overcommitable_mem_mb' is the amount that will be dispensed on top of
        the 'real' amount.
        """
        total_mb = real_mem_mb + overcommitable_mem_mb
        self._available = Value("i", total_mb)
        self._max_overcommitment = overcommitable_mem_mb

        # Reservation ids increase monotonically. Whenever a reservation
        # leaves memory overcommitted, its id is remembered in
        # '_last_overcommitted_reservation_id' so that clients can check
        # whether memory was overcommitted since their own reservation was
        # made (this is a race, but a wrong answer errs on the conservative
        # side).
        self._next_reservation_id = Value("L", 0)
        self._last_overcommitted_reservation_id = Value("L", 0)

    @property
    def overcommitted_mem_mb(self):
        shortfall = self._max_overcommitment - self._available.value
        return max(shortfall, 0)

    @property
    def available_mem_mb(self):
        return self._available.value

    @property
    def last_overcommitted_reservation_id(self):
        return self._last_overcommitted_reservation_id.value

    @contextmanager
    def reserve_mem_mb(self, mem_mb):
        """Blocks until 'mem_mb' of memory is available and taken for the caller.

        Use this in a 'with' block; the memory is given back automatically when
        the block exits. The yielded numeric id can be passed to
        was_overcommitted() to find out whether memory was overcommitted since
        the reservation was obtained:

          with broker.reserve_mem_mb(100) as reservation_id:
            # Run query using 100 MB of memory
            if <query failed>:
              # Immediately check broker.was_overcommitted(reservation_id)
        """
        granted_id = self._wait_until_reserved(mem_mb)
        try:
            yield granted_id
        finally:
            self._release(mem_mb)

    def _wait_until_reserved(self, req):
        """Poll every 0.1 s until 'req' MB can be taken; return the new id."""
        while True:
            with self._available.get_lock():
                fits = req <= self._available.value
                if fits:
                    self._available.value -= req
                    LOG.debug("Reserved %s MB; %s MB available; %s MB overcommitted",
                              req, self._available.value, self.overcommitted_mem_mb)
                    new_id = self._next_reservation_id.value
                    self._next_reservation_id.value += 1
                    if self.overcommitted_mem_mb > 0:
                        self._last_overcommitted_reservation_id.value = new_id
                    return new_id
            sleep(0.1)

    def _release(self, req):
        """Give 'req' MB back to the pool."""
        with self._available.get_lock():
            self._available.value += req
            LOG.debug("Released %s MB; %s MB available; %s MB overcommitted",
                      req, self._available.value, self.overcommitted_mem_mb)

    def was_overcommitted(self, reservation_id):
        """Returns True if memory was overcommitted since the given reservation
        was made. For an accurate answer, call this just after the query ends or
        while it is still running.
        """
        last_id = self._last_overcommitted_reservation_id.value
        return reservation_id <= last_id
class FixedDialogTeacher(Teacher):
    """A teacher agent for all teachers involved in tasks with fixed data.

    This class provides the following functionality for its subclasses:

    - Resets a teacher
    - Provides an observe method
    - Computes and retrieves the next episode index for a teacher
    - Provides a threadpool option for loading data (especially useful for
      large data, e.g. images)

    To utilize the DataLoader for threadpool loading, a teacher should
    implement the ``submit_load_request`` function to send a load request
    to the DataLoader by calling ``self.data_loader.request_load`` with the
    appropriate arguments (``receive_fn, load_fn, args``). The DataLoader then
    returns the data to the teacher's ``data_queue``, which the teacher can
    poll in its ``act`` method.

    The following is an example of the DataLoader usage in the VQA-V1 teacher.

    1. In the teacher's ``init`` function, the teacher calls its
       ``submit_load_request`` function to preload an image.
    2. The ``submit_load_request`` function gets the next ``episode_idx``,
       and computes the image path for the load request.
    3. At the end of ``submit_load_request``, the teacher calls
       ``self.data_loader.request_load`` with three args:

       - ``self.receive_data`` - the function that the DataLoader calls to
         return the loaded object
       - ``self.image_loader.load`` - the function used to load the image
         from the image path
       - ``[img_path]`` - a list of arguments for the load function, which
         in this case is the path of the image.

    4. In the teacher's ``act`` function, the teacher loads the data from
       its data queue.
    5. At the end of the ``act`` function, the teacher calls
       ``submit_load_request`` to preload an image for the next example.
    """

    def __init__(self, opt, shared=None):
        """Set up data loading and, when batch sorting is enabled, the batches.

        :param opt: option dict; keys read here are ``datatype``, ``datafile``,
            ``batchsize``, ``batchindex``, ``batch_sort``, ``context_length``
            and ``include_labels``.
        :param shared: optional dict produced by ``share()`` of another
            instance; when given, the index (and data loader / batches if
            present) are reused instead of being created.
        """
        super().__init__(opt, shared)

        # Subclasses may set these before calling super().__init__.
        if not hasattr(self, 'datatype'):
            self.datatype = opt['datatype']
        if not hasattr(self, 'random'):
            self.random = self.datatype == 'train'
        if not hasattr(self, 'training'):
            self.training = self.datatype.startswith('train')
        if not hasattr(self, 'datafile'):
            self.datafile = opt.get('datafile')

        # set up support for multithreaded data loading
        self.data_queue = queue.Queue()
        if shared:
            self.index = shared['index']
            if 'data_loader' in shared:
                self.data_loader = shared['data_loader']
        else:
            self.index = AttrDict(value=-1)

        if not hasattr(self, 'data_loader'):
            self.data_loader = DataLoader(opt)
            self.data_loader.start()

        # set up batching
        self.bsz = opt.get('batchsize', 1)
        self.batchindex = opt.get('batchindex', 0)

        dt = opt.get('datatype', '').split(':')
        self.use_batch_act = (opt.get('batch_sort', False) and self.bsz > 1
                              and 'stream' not in dt)

        if self.use_batch_act:
            if shared:
                self.lastYs = shared['lastYs']
                if 'sorted_data' in shared:
                    self.sorted_data = shared['sorted_data']
                    self.batches = shared['batches']
            else:
                self.lastYs = [None] * self.bsz
                # Build a single-threaded, ordered copy of this task to read
                # the whole dataset once for sorting.
                ordered_opt = opt.copy()
                ordered_opt['datatype'] = ':'.join((dt[0], 'ordered'))
                ordered_opt['batchsize'] = 1
                ordered_opt['numthreads'] = 1
                ordered_teacher = create_task_agent_from_taskname(ordered_opt)[0]

                clen = opt.get('context_length', -1)
                incl = opt.get('include_labels', True)

                if ordered_teacher.num_examples() > 1000000:  # one million
                    print('WARNING: this dataset is large, and batch sorting '
                          'may use too much RAM or take too long to set up. '
                          'Consider disabling batch sorting, setting '
                          'context-length to a small integer (if this dataset '
                          'has episodes of multiple examples), or streaming '
                          'the data using a streamed data mode if supported.')

                flatdata = flatten(ordered_teacher,
                                   context_length=clen, include_labels=incl)
                self.sorted_data = sort_data(flatdata)
                self.batches = make_batches(self.sorted_data, self.bsz)

    def _lock(self):
        """Return the index's lock if it has one, otherwise a no-op context."""
        if hasattr(self.index, 'get_lock'):
            return self.index.get_lock()
        else:
            return no_lock()

    def reset(self):
        """Reset the dialog so that it is at the start of the epoch,
        and all metrics are reset.
        """
        super().reset()
        self.metrics.clear()
        self.lastY = None
        self.episode_done = True
        self.epochDone = False
        self.data_queue = queue.Queue()

        self.episode_idx = -1
        with self._lock():
            self.index.value = -1
        if self.use_batch_act and self.random and hasattr(self, 'batches'):
            random.shuffle(self.batches)

    def submit_load_request(self):
        """An agent should implement this method to submit requests to the
        data loader. At the end of this method, the agent should call
        ``self.data_loader.request_load()`` with the appropriate args.
        """
        pass

    def receive_data(self, future):
        """Function for receiving data from the data loader."""
        data = future.result()
        self.data_queue.put(data)

    def share(self):
        """Return the dict of state to hand to copies of this teacher."""
        shared = super().share()

        if hasattr(self, 'lastYs'):
            # share lastYs to communicate between batch_act and observe
            shared['lastYs'] = self.lastYs

        if self.opt.get('numthreads', 1) > 1:
            if not isinstance(self.index,
                              multiprocessing.sharedctypes.Synchronized):
                # for multithreading need to move index into threadsafe memory
                self.index = Value('l', -1)
            if hasattr(self, 'sorted_data'):
                shared['sorted_data'] = self.sorted_data
                shared['batches'] = self.batches
        else:
            shared['data_loader'] = self.data_loader
        shared['index'] = self.index

        return shared

    def next_episode_idx(self, num_eps=None, loop=None):
        """Return the index of the next episode to serve.

        :param num_eps: number of episodes; defaults to ``num_episodes()``.
        :param loop: wrap the index around ``num_eps``; defaults to
            ``self.training``.
        """
        if num_eps is None:
            num_eps = self.num_episodes()
        if loop is None:
            loop = self.training
        if self.random:
            new_idx = random.randrange(num_eps)
        else:
            with self._lock():
                self.index.value += 1
                if loop:
                    self.index.value %= num_eps
                new_idx = self.index.value
        return new_idx

    def next_example(self):
        """Return ``(example, epoch_done)`` for the next entry to serve."""
        if self.episode_done:
            self.episode_idx = self.next_episode_idx()
            self.entry_idx = 0
        else:
            self.entry_idx += 1

        if self.episode_idx >= self.num_episodes():
            # Ran past the end of the data (only possible when not looping).
            return {'episode_done': True}, True

        ex = self.get(self.episode_idx, self.entry_idx)
        self.episode_done = ex['episode_done']

        epoch_done = bool(
            not self.random
            and self.episode_done
            and self.episode_idx + 1 >= self.num_episodes()
        )

        return ex, epoch_done

    def next_batch(self):
        """Return the next pre-built batch, updating ``self.epochDone``."""
        # get next batch
        with self._lock():
            self.index.value += 1
            if self.training:
                self.index.value %= len(self.batches)
            batch_idx = self.index.value

            if batch_idx + 1 >= len(self.batches):
                if self.random:
                    random.shuffle(self.batches)
                self.epochDone = True
            else:
                self.epochDone = False

        if batch_idx >= len(self.batches):
            return [{'episode_done': True, 'id': self.getID()}] * self.bsz

        return self.batches[batch_idx]

    def num_episodes(self):
        """Get the number of episodes in this dataset."""
        if self.use_batch_act:
            # when using batch_act, this is length of sorted data
            return len(self.sorted_data)
        raise RuntimeError('"num_episodes" must be overriden by children.')

    def num_examples(self):
        """Get the total number of examples in this dataset."""
        if self.use_batch_act:
            # when using batch_act, this is length of sorted data
            return len(self.sorted_data)
        raise RuntimeError('"num_examples" must be overriden by children.')

    def get(self, episode_idx, entry_idx=0):
        """Get the specified episode and the specified entry in that episode.

        Many datasets have only single-entry episodes, so entry_idx defaults to
        zero. Children must override this method in order to inherit the
        `next_example` method.
        """
        raise RuntimeError('"Get" method must be overriden by children.')

    def observe(self, observation):
        """Process observation for metrics."""
        if self.use_batch_act:
            self.lastY = self.lastYs[self.batchindex]
            self.lastYs[self.batchindex] = None

        if hasattr(self, 'lastY') and self.lastY is not None:
            self.metrics.update(observation, self.lastY)
            self.lastY = None
        return observation

    def batch_act(self, observations):
        """Return the next batch, padded to ``self.bsz`` examples.

        ``observations`` is ignored.
        """
        # we ignore observations
        if not hasattr(self, 'epochDone'):
            # reset if haven't yet
            self.reset()

        batch = self.next_batch()
        # pad batch
        if len(batch) < self.bsz:
            # Build a new list instead of using ``+=``: ``batch`` is the very
            # list stored in ``self.batches`` (which may be shared), and
            # extending it in place would permanently add padding to the data.
            padding = [{'episode_done': True, 'id': self.getID()}] * (
                self.bsz - len(batch))
            batch = list(batch) + padding

        # remember correct answer if available (for padding, None)
        for i, ex in enumerate(batch):
            self.lastYs[i] = ex.get('labels', ex.get('eval_labels'))

        return batch

    def act(self):
        """Send new dialog message."""
        if not hasattr(self, 'epochDone'):
            # reset if haven't yet
            self.reset()

        # get next example, action is episode_done dict if already out of exs
        action, self.epochDone = self.next_example()
        action['id'] = self.getID()

        # remember correct answer if available
        self.lastY = action.get('labels', None)
        if not self.datatype.startswith('train') and 'labels' in action:
            # move labels to eval field so not used for training
            # but this way the model can use the labels for perplexity or loss
            action['eval_labels'] = action.pop('labels')

        return action
class Brute_Force():
    """Brute-force hash cracker that fans key prefixes out to worker processes.

    Work is split into "chunks": a prefix plus every suffix of
    ``charactersToCheck`` characters over ``alphabet``. Keys short enough to
    need no prefix are handled by ``check_short_keys``; all others are taken
    from ``queue`` by ``check_keys`` workers. Results are reported on
    ``result_queue`` as ``(tag, payload)`` tuples with tag ``'w'`` (win),
    ``'f'`` (chunk failed) or ``'e'`` (possibilities exhausted).

    NOTE(review): all attributes below are class-level, so the mutable ones
    (``children``, ``hash_list``, ``queue``, ``countey``, ``done``) are shared
    by every instance until an instance rebinds them.
    """

    minKeyLength = 6
    maxKeyLength = 16
    alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits  # + string.punctuation
    algorithm = None
    origHash = None
    key = ''
    rec = None
    charactersToCheck = 3
    queue = Queue(cpu_count()*5)
    chunk_size = 0
    countey = Value('I', 0)
    done = Value('b', False)
    total_work_units = 1
    possibilities_exhausted = False
    first_unit = True
    children = []
    result_queue = None
    processes_running = False
    hashlib_mode = False
    hash_list = []
    list_mode = False
    # Algorithms verified through passlib; anything else goes through hashlib.
    schemes = ["sha1_crypt", "sha256_crypt", "sha512_crypt", "md5_crypt", "des_crypt",
               'ldap_salted_sha1', 'ldap_salted_md5', 'ldap_sha1', 'ldap_md5',
               'ldap_plaintext', "mysql323"]
    myctx = CryptContext(schemes)

    def __init__(self):
        # Only set the process authkey when this module is run as a script.
        if not __name__ == '__main__':
            return
        current_process().authkey = "Popcorn is awesome!!!"

    def set_params(self, alphabet, algorithm, origHash, min_key_length, max_key_length):
        """Store the cracking parameters and recompute the chunk sizing."""
        self.alphabet = alphabet
        self.algorithm = algorithm
        self.origHash = origHash
        self.minKeyLength = min_key_length
        self.maxKeyLength = max_key_length
        self.set_chars_to_check()

    def resetVariables(self):
        """Rebind the per-run state on this instance to fresh defaults."""
        self.minKeyLength = 1
        self.maxKeyLength = 16
        self.alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation
        self.algorithm = ""
        self.origHash = ''
        self.key = ''
        self.rec = None
        self.charactersToCheck = 3
        self.queue = Queue(cpu_count()*5)
        self.chunk_size = 0
        self.countey = Value('I', 0)
        self.done = Value('b', False)
        self.total_work_units = 0
        self.possibilities_exhausted = False
        self.first_unit = True

    def isFound(self):
        """Return the shared 'done' flag."""
        return self.done.value

    def returnKey(self):
        """Return the key recorded by ``isSolution`` (empty string if none)."""
        return self.key

    def possibilitiesEhausted(self):
        """Return the ``possibilities_exhausted`` flag."""
        return self.possibilities_exhausted

    def set_result_queue(self, result_queue):
        """Set the queue on which results are reported."""
        self.result_queue = result_queue

    def get_total_chunks(self):
        """Return the number of work units computed by ``set_chars_to_check``."""
        return self.total_work_units

    # start pool of check_keys workers
    def start_processes(self):
        """Start one ``check_keys`` worker per CPU, unless already running."""
        if not self.processes_running:
            for _ in range(0, cpu_count()):
                # Start the process just created rather than indexing
                # ``self.children`` from 0: the list is shared and may already
                # hold earlier processes.
                worker = Process(target=self.check_keys, args=(self.queue,))
                self.children.append(worker)
                worker.start()
            self.processes_running = True

    # shutdown process pool
    def terminate_processes(self):
        """Terminate every known worker, waiting briefly for each to exit."""
        for process in self.children:
            process.terminate()
            process.join(timeout=.1)

    # checks keys of length minKeyLength to charsToCheck, these keys will not use a prefix
    def check_short_keys(self):
        """Try every prefix-less key; return True on a hit, False otherwise.

        Returns None immediately if the 'done' flag is already set.
        """
        if self.done.value:
            return
        # compound iterable creates strings with a range of lengths
        keylist = itertools.chain.from_iterable(
            itertools.product(self.alphabet, repeat=j)
            for j in range(self.minKeyLength, self.charactersToCheck+1))
        for key in keylist:
            tempkey = ''.join(key)
            if self.isSolution(tempkey):
                self.result_queue.put(('w', tempkey))
                if not self.list_mode:
                    # Found the single target: discard the pending work units.
                    while not self.queue.empty():
                        self.queue.get()
                    self.countey.value += 1
                    return True
            self.countey.value += 1
        # The odd prefix value is the marker ``run_chunk`` maps to "no prefix".
        params = "bruteforce\n" + self.algorithm + "\n" + self.origHash + "\n" + self.alphabet + "\n" \
                 + str(self.minKeyLength) + "\n" + str(self.maxKeyLength) + "\n" \
                 + "-99999999999999999999999999999999999" + "\n0\n0\n0"
        self.result_queue.put(('f', params))
        return False

    # take prefixes from the job queue and iterate through the possibilities for keys starting with that prefix
    def check_keys(self, queue):
        """Worker loop: take work units from ``queue`` and test their keys.

        Returns True when a key was found (non-list mode), None if the 'done'
        flag is set or the result could not be reported.
        """
        while queue:
            if self.done.value:
                return
            # get a workunit off the queue
            workunit = queue.get()
            if workunit.prefix == "******possibilities exhausted******":
                time.sleep(10)
                self.result_queue.put(('e', "sadness"))
                # NOTE(review): execution continues and the sentinel is then
                # processed like an ordinary prefix - confirm this is intended.
            self.algorithm = workunit.algorithm
            self.origHash = workunit.hash
            prefix = ''.join(workunit.prefix)
            # create an iterable to produce suffixes to append to the prefix
            keylist = itertools.product(self.alphabet, repeat=self.charactersToCheck)
            # check possibilities until iterable is consumed
            for key in keylist:
                tempkey = prefix + ''.join(key)
                if self.isSolution(tempkey):
                    try:
                        # send key with success message
                        if self.list_mode:
                            self.result_queue.put(
                                ('w', (tempkey + '\n' + hashlib.new(self.algorithm, tempkey).hexdigest(),)))
                        else:
                            self.result_queue.put(('w', tempkey), timeout=1)
                    except Exception:
                        return
                    if not self.list_mode:
                        # Found the single target: discard the pending work units.
                        while not self.queue.empty():
                            queue.get()
                        self.countey.value += 1
                        queue.close()
                        return True
                self.countey.value += 1
            # send back parameters with a fail result
            params = "bruteforce\n" + self.algorithm + "\n" + self.origHash + "\n" + self.alphabet + "\n" \
                     + str(self.minKeyLength) + "\n" + str(self.maxKeyLength) + "\n" + prefix + "\n0\n0\n0"
            self.result_queue.put(('f', params))
        return False

    # get_prefix() is an iterator which produces all possible prefixes of appropriate length
    # as defined by min/max key lengths and charactersToCheck
    def get_prefix(self):
        """Yield every prefix to search, then the "exhausted" sentinel."""
        if self.minKeyLength < self.charactersToCheck:
            yield ''
            # all keys up to charsToCheck will be handled by check_short_keys, so start with 1 char prefixes
            min_length = 1
        else:
            min_length = self.minKeyLength-self.charactersToCheck
        for i in range(min_length, (self.maxKeyLength - self.charactersToCheck + 1)):
            for prefix in itertools.product(self.alphabet, repeat=i):
                if self.done.value:
                    return
                yield ''.join(prefix)
        yield "******possibilities exhausted******"

    # Hash a possible key and check if it is equal to the hashed input.
    def isSolution(self, key):
        """Return True if ``key`` hashes to the target hash (or one of
        ``hash_list`` in list mode); record the key and set 'done' on a
        single-target hit.
        """
        if self.hashlib_mode:
            temp_key = hashlib.new(self.algorithm, key).hexdigest()
            if self.list_mode:
                # List mode only reports matches; it never sets the 'done' flag.
                return temp_key in self.hash_list
            if temp_key == self.origHash:
                self.rec = "found"
                with self.done.get_lock():
                    self.done.value = True
                self.key = key
                return True
            return False
        else:
            if self.myctx.verify(key, self.origHash):
                print("Solution found!\nKey is : %s\nWith a hash of %s" % (key, self.origHash))
                self.done.value = True
                self.key = key
                return True
            else:
                return False

    # setup here is to make chunks large enough that constant network communications are avoided but that won't last
    # forever on slower machines. A maximum chunk size of 10M hashes seemed a reasonable compromise.
    def set_chars_to_check(self):
        """Pick the largest suffix length whose chunk is at most 10M keys and
        add the resulting number of chunks to ``total_work_units``.

        NOTE(review): the sizing loop never terminates for an alphabet of
        fewer than two characters.
        """
        alphabet_size = len(self.alphabet)
        self.charactersToCheck = 1
        iterations = alphabet_size
        while True:
            iterations *= alphabet_size
            self.charactersToCheck += 1
            if iterations > 10000000:
                self.charactersToCheck -= 1
                break
        # calculate chunk size and total number of chunks
        self.chunk_size = alphabet_size ** self.charactersToCheck
        for i in range(self.minKeyLength, self.maxKeyLength+1):
            # ``//`` keeps the integer division the Python 2 ``/`` performed here.
            self.total_work_units += ((alphabet_size ** i) // self.chunk_size)

    # get_chunk() is an iterator which yields a new chunk of data each time get_chunk.next() is called.
    # BROKEN, DO NOT USE!
    # def get_chunk(self):
    #
    #     for prefix in self.get_prefix():
    #
    #         print "get chunk prefix: %s" % prefix
    #         if prefix == '':
    #             prefix = "-99999999999999999999999999999999999"
    #
    #         chunk = Chunk.Chunk()
    #         chunk.params = "bruteforce\n" + self.algorithm + "\n" + self.origHash + "\n" + self.alphabet + "\n" + str(self.minKeyLength) + "\n" + str(self.maxKeyLength) + "\n" + prefix + "\n0\n0\n0"
    #
    #         yield chunk
    #     self.possibilities_exhausted = True

    # run_chunk takes an object of type Chunk.Chunk(), checks all possibilities within the parameters of the chunk,
    # sets global variables according to the chunk data and returns True or False to indicate if the cracking succeeded.
    def run_chunk(self, chunk):
        """Queue (or start) the work described by ``chunk.params``.

        ``chunk.params`` is a newline-separated record; fields 1-6 are
        algorithm, hash, alphabet, min length, max length and prefix.
        Returns True if the 'done' flag is set, False otherwise.
        """
        settings = chunk.params
        settings_list = settings.split('\n')
        prefix = settings_list[6]
        if self.first_unit:
            self.algorithm = settings_list[1]
            # hashlib is used for every algorithm passlib does not handle.
            self.hashlib_mode = self.algorithm not in self.schemes
            self.origHash = settings_list[2]
            self.alphabet = settings_list[3]
            self.minKeyLength = int(settings_list[4])
            self.maxKeyLength = int(settings_list[5])
            self.set_chars_to_check()
        if prefix == "-99999999999999999999999999999999999":
            prefix = ''
        if prefix == '' and self.first_unit:
            shorts = Process(target=self.check_short_keys)
            shorts.start()
        else:
            if self.done.value:
                return True
            else:
                self.queue.put(WorkUnit(prefix, self.charactersToCheck, self.alphabet, self.algorithm, self.origHash))
        self.first_unit = False
        # Test the flag's value: the Value wrapper object itself is always
        # truthy, which made this method return True unconditionally.
        if self.done.value:
            return True
        else:
            return False
class Downloader:
    """Runs downloads through ``you-get`` and prints a rotating status line.

    ``queue`` maps a download id to its latest status dict, ``downloading``
    lists the ids that have started, and ``index`` selects which active
    download the status line currently shows. ``print_lock`` serialises
    updates to this state and the terminal output.
    """

    def __init__(self):
        self.manager = Manager()
        self.queue = self.manager.dict()
        self.downloading = self.manager.list()
        self.index = Value('i', 0)
        self.print_lock = Lock()
        self.multi = MultitaskQueue(self.start_download)
        self.status = 0
        # Kick off the periodic status printer.
        Timer(0.5, self.print_daemon).start()
        self.FNULL = open(os.devnull, 'w')

    options = {
        'format': 'flv',
        'logger': DownloadLogger(),
    }

    def start_download(self, dic):
        """Download ``dic['url']`` into ``video/<id>/`` and record the outcome.

        ``dic`` must provide ``name``, ``url`` and ``id``. The download is
        reported as an error when ``you-get`` exits non-zero or its log
        contains the text 'error'.
        """
        writeln('[' + color('DOWN', 'cyan') + '] Starting download of %s from %s, saving as ID %d'
                % (dic['name'], dic['url'], dic['id']))
        # cur_option = self.options
        # cur_option['progress_hooks'] = [partial(self.download_progress, dic['id'])]
        # cur_option['outtmpl'] = 'video/' + str(dic['id']) + '/' + str(dic['id']) + r'.%(title)s-%(id)s.%(ext)s'
        # downloader = youtube_dl.YoutubeDL(cur_option)
        # try:
        #     downloader.download([dic['url']])
        #     self.download_progress(dic['id'], {'status': 'complete'})
        # except youtube_dl.DownloadError as e:
        #     writeln('[' + color('ERROR', 'red') + '] youtube_dl error for %s: ' % dic['name'] + e.message)
        #     self.download_progress(dic['id'], {'status': 'error'})
        self.download_progress(dic['id'], {'status': 'downloading'})
        outpath = 'video/' + str(dic['id']) + '/'
        try:
            os.makedirs(outpath)
        except OSError:
            # Best effort: the directory usually exists already.
            pass
        log_path = outpath + 'log.txt'
        with open(log_path, 'w') as log:
            process = subprocess.Popen(["you-get", dic['url']], stdout=log,
                                       stderr=subprocess.STDOUT, cwd=outpath)
            retcode = process.wait()
        # Re-open for reading; the context manager closes the handle, which
        # was previously left open.
        with open(log_path, 'r') as log:
            log_has_error = ' '.join(log.readlines()).find('error') != -1
        if retcode != 0 or log_has_error:
            self.download_progress(dic['id'], {'status': 'error'})
        else:
            self.download_progress(dic['id'], {'status': 'complete'})

    def print_progress(self):
        """Refresh the status line for the download selected by ``index``."""
        # ``with`` guarantees the lock is released on the early return below
        # and on any exception; the manual acquire/release used before
        # returned with the lock still held when nothing was downloading.
        with self.print_lock:
            down_cnt, all_cnt = len(self.downloading), len(self.queue)
            if down_cnt == 0:
                return
            with self.index.get_lock():
                index = self.index.value
                if index >= down_cnt:
                    # Wrap around once the index runs past the active list.
                    self.index.value = 0
                    index = 0
            _, columns = map(int, os.popen('stty size', 'r').read().split())
            dic = self.queue[self.downloading[index]]
            message = ''
            message += color('Job %d/%d' % (index + 1, down_cnt), 'green')
            if all_cnt > down_cnt:
                message += color(' (%d pending)' % (all_cnt - down_cnt), 'green')
            message += color(': ' + dic['name'] + ' Part %d' % (dic['done_part'] + 1), 'green')
            # Optional fields are appended only while the line still fits.
            total, down = dic.get('total_bytes', None), dic.get('downloaded_bytes', None)
            # ``total`` is 0 when the size is unknown; skip the percentage
            # rather than dividing by zero.
            if total and down is not None:
                submessage = ' %6.2lf%%' % (float(down) * 100 / float(total))
                if length(message) + length(submessage) <= columns:
                    message += submessage
            eta = dic.get('eta', None)
            if eta is not None:
                submessage = color(' ETA:', 'cyan') + ' %ds' % eta
                if length(message) + length(submessage) <= columns:
                    message += submessage
            speed = dic.get('speed', None)
            if speed is not None:
                units = [('MB/s', 10 ** 6), ('KB/s', 10 ** 3)]
                unit = ('B/s', 1)
                for x in units:
                    if speed > x[1]:
                        unit = x
                        break
                submessage = color(' Speed:', 'cyan') + ' %.1lf%s' % (speed / unit[1], unit[0])
                if length(message) + length(submessage) <= columns:
                    message += submessage
            refresh(message)

    REFRESH_TIME = 1

    def print_daemon(self):
        """Advance the displayed job, print it, and re-arm the timer."""
        if len(self.downloading) > 0:
            with self.index.get_lock():
                self.index.value += 1
            self.print_progress()
        Timer(self.REFRESH_TIME, self.print_daemon).start()

    def download_progress(self, id, dic):
        """Merge the status update ``dic`` into the entry for download ``id``.

        ``dic['status']`` is one of 'downloading', 'finished', 'complete' or
        'error'. Unknown ids are reported and ignored.
        """
        with self.print_lock:
            try:
                name = self.queue[id]['name']
                done_part = self.queue[id]['done_part']
            except KeyError:
                print('KeyError')
                return
            if self.queue[id]['status'] == 'none':
                # First update for this id: it is now an active download.
                self.downloading.append(id)
            dic['name'] = name
            dic['time'] = time.time()
            if dic['status'] in ['complete', 'error']:
                if dic['status'] == 'complete':
                    message = 'All %d parts of %s downloaded' % (done_part, name)
                    writeln(color('[DONE] ' + message, 'green'))
                else:
                    message = 'Download of %s aborted due to error' % name
                    writeln(color('[ABORT] ' + message, 'red'))
                self.downloading.remove(id)
                self.queue.pop(id)
            elif dic['status'] == 'finished':
                dic['done_part'] = done_part + 1
                message = 'Finished downloading part %d of %s. File saved to %s' \
                          % (dic['done_part'], name, dic['filename'])
                # message = 'Finished downloading part %d/%d of %s. File saved to %s' \
                #           % (dic['fragment_index'], dic['fragment_count'], name, dic['filename'])
                down = dic.get('downloaded_bytes', None)
                if down is not None:
                    message += ', size is %.1lfMB' % (float(down) / (10 ** 6))
                writeln('[' + color('DOWN', 'green') + '] ' + message)
                self.queue[id] = dic
            elif dic['status'] == 'downloading':
                dic['done_part'] = done_part
                total = 0
                for x in ['total_bytes', 'total_bytes_estimate']:
                    if x in dic and dic[x] is not None:
                        total = dic[x]
                        break
                dic['total_bytes'] = total
                self.queue[id] = dic
        # print_progress takes print_lock itself, so call it after releasing.
        if dic['status'] == 'finished':
            self.print_progress()

    def download(self, name, url, id):
        """Register a download and hand it to the task queue."""
        self.queue[id] = {
            'name': name,
            'status': 'none',
            'time': time.time(),
            'done_part': 0,
        }
        self.multi.add({'name': name, 'url': url, 'id': id})
class FastqFilter(object):
    """
    @class  FastqFilter
    @brief  Main class of the package
    Require the third party package Biopython
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~FONDAMENTAL METHODS~~~~~~~#

    def __init__(self, R1, R2,
                 quality_filter=None,
                 adapter_trimmer=None,
                 outdir="./fastq/",
                 input_qual="fastq-sanger",
                 numprocs=None,
                 compress_output=True):
        """
        Instantiate the object by storing call parameters and init shared memory counters for
        interprocess communication, then run the whole filtering pipeline to completion.
        A reader process iterates over the input paired fastq files and adds coupled R1 and R2
        sequences as Biopython seqRecord to a first shared queue. Filter processes pull seqRecord
        couples from that queue and apply quality filtering and/or adapter trimming. Couples
        passing through the filters are added to a second shared queue, from which a writer
        process writes them to the output fastq files.

        @param R1 Path to the forward read fastq file (can be gzipped)
        @param R2 Path to the reverse read fastq file (can be gzipped)
        @param quality_filter A QualityFilter object, if a quality filtering is required.
        @param adapter_trimmer An AdapterTrimmer object, if a adapter trimming is required.
        @param outdir Directory where to write the filtered fastq sequences.
        @param input_qual Quality scale of the fastq (fastq-sanger for illumina 1.8+)
        @param numprocs Number of parallel filter processes. If not provided, the value
        returned by cpu_count() is used.
        @param compress_output If True the output fastq will be written directly in a gzipped file.
        False will generate an uncompressed file.
        """
        # Start a timer
        start_time = time()

        # Create object variables
        self.numprocs = numprocs if numprocs else cpu_count()
        self.qual = quality_filter
        self.adapt = adapter_trimmer
        self.input_qual = input_qual
        self.R1_in = R1
        self.R2_in = R2
        self.outdir = outdir
        self.compress_output = compress_output
        if compress_output:
            self.R1_out = path.join(self.outdir, file_basename(self.R1_in)+"_1_filtered.fastq.gz")
            self.R2_out = path.join(self.outdir, file_basename(self.R2_in)+"_2_filtered.fastq.gz")
        else:
            self.R1_out = path.join(self.outdir, file_basename(self.R1_in)+"_1_filtered.fastq")
            self.R2_out = path.join(self.outdir, file_basename(self.R2_in)+"_2_filtered.fastq")

        # Init shared memory counters
        self.total = Value('i', 0)
        self.pass_qual = Value('i', 0)
        self.pass_trim = Value('i', 0)
        self.total_pass = Value('i', 0)
        if self.qual:
            self.min_qual_found = Value('i', 100)
            self.max_qual_found = Value('i', 0)
            self.weighted_mean = Value('d', 0.0)
        if self.adapt:
            self.seq_untrimmed = Value('i', 0)
            self.seq_trimmed = Value('i', 0)
            self.base_trimmed = Value('i', 0)
            self.len_pass = Value('i', 0)
            self.len_fail = Value('i', 0)

        # Count lines in fastq file to prepare a counter of progression
        print("Count the number of fastq sequences")
        self.nseq = count_seq(R1, "fastq")
        print("fastq files contain {} sequences to align".format(self.nseq))
        self.nseq_list = [int(self.nseq*i/100.0) for i in range(5, 101, 5)]  # 5 percent steps

        # Init queues for input file reading and output file writing (limited to 10000 objects)
        self.inq = Queue(maxsize=10000)
        self.outq = Queue(maxsize=10000)

        # Init processes for file reading, distributed filtering and file writing
        self.pin = Process(target=self.reader, args=())
        self.ps = [Process(target=self.filter, args=()) for i in range(self.numprocs)]
        self.pout = Process(target=self.writer, args=())

        # Start processes
        self.pin.start()
        self.pout.start()
        for p in self.ps:
            p.start()

        # Blocks until the process is finished
        self.pin.join()
        print("\tReading done")
        for p in self.ps:
            p.join()
        print("\tFiltering done")
        self.pout.join()
        print("\tWriting done\n")

        # Stop timer and store the value
        self.exec_time = round(time()-start_time, 3)

    def __repr__(self):
        """Return a multi-line report of the run parameters and counters."""
        msg = "FASTQ FILTER Parallel Processing\n"
        msg += "\tExecution time : {} s\n".format(self.exec_time)
        msg += "\tInput fastq files\n\t\t{}\n\t\t{}\n".format(self.R1_in, self.R2_in)
        msg += "\tOutput fastq files\n\t\t{}\n\t\t{}\n".format(self.R1_out, self.R2_out)
        msg += "\tInput quality score : {}\n".format(self.input_qual)
        msg += "\tNumber of parallel processes : {}\n".format(self.numprocs)
        msg += "\tTotal pair processed : {}\n".format(self.total.value)
        msg += "\tTotal pair passed : {}\n".format(self.total_pass.value)
        if self.qual:
            total = self.total.value
            # Two reads per pair; report 0.0 instead of dividing by zero when
            # no pair was processed.
            mean_qual = self.weighted_mean.value/total/2 if total else 0.0
            msg += "QUALITY FILTER\n"
            msg += "\tPair pass quality filter : {}\n".format(self.pass_qual.value)
            msg += "\tMean quality value : {}\n".format(mean_qual)
            msg += "\tMin quality value : {}\n".format(self.min_qual_found.value)
            msg += "\tMax quality value : {}\n".format(self.max_qual_found.value)
        if self.adapt:
            msg += "ADAPTER TRIMMER\n"
            msg += "\tPair pass adapter Trimming : {}\n".format(self.pass_trim.value)
            msg += "\tSequences untrimmed : {}\n".format(self.seq_untrimmed.value)
            msg += "\tSequences trimmed : {}\n".format(self.seq_trimmed.value)
            msg += "\tDNA base trimmed : {}\n".format(self.base_trimmed.value)
            msg += "\tFail len filtering: {}\n".format(self.len_fail.value)
            msg += "\tPass len filtering : {}\n".format(self.len_pass.value)
        return msg

    def __str__(self):
        return "<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)

    def get(self, key):
        """Return the instance attribute named ``key``."""
        return self.__dict__[key]

    def getCTypeVal(self, key):
        """Return ``.value`` of the shared counter stored under ``key``."""
        return self.__dict__[key].value

    def getTrimmed(self):
        """Return the pair of output file paths ``(R1_out, R2_out)``."""
        return (self.R1_out, self.R2_out)

    def set(self, key, value):
        """Set the instance attribute named ``key`` to ``value``."""
        self.__dict__[key] = value

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    @staticmethod
    def _open_fastq(fp):
        """Open ``fp`` for binary reading, through gzip if the name ends in 'gz'."""
        if fp[-2:].lower() == "gz":
            return gzip.open(fp, "rb")
        return open(fp, "rb")

    def reader(self):
        """
        Initialize SeqIO.parse generators to iterate over paired fastq files. Data are sent over
        inqueue for the workers to do their thing and n = numprocs STOP pills are added at the
        end of the queue, one for each worker.
        The STOP pills are sent even when the input files cannot be opened or parsing fails, so
        the filter and writer processes always terminate instead of blocking on an empty queue.
        """
        in_R1 = in_R2 = None
        try:
            try:
                # Open input fastq streams for reading
                in_R1 = self._open_fastq(self.R1_in)
                in_R2 = self._open_fastq(self.R2_in)
            except (IOError, TypeError, ValueError) as E:
                # Nothing can be read: report and stop. (The bare ``exit``
                # used here before was a no-op and execution fell through.)
                print(E)
                return

            # Init generators to iterate over files
            genR1 = SeqIO.parse(in_R1, self.input_qual)
            genR2 = SeqIO.parse(in_R2, self.input_qual)

            i = 0
            while True:
                # Parse sequences in generators until one of them is empty
                seqR1 = next(genR1, None)
                seqR2 = next(genR2, None)
                if not seqR1 or not seqR2:
                    break
                # Add a tuple seqR1 and seqR2 to the end of the queue
                self.inq.put((seqR1, seqR2))
                i += 1
                if i in self.nseq_list:
                    print("\t{} sequences: {}%".format(i, int(i*100.0/self.nseq)))
        finally:
            # Close files
            if in_R1 is not None:
                in_R1.close()
            if in_R2 is not None:
                in_R2.close()
            # Add a STOP pill to the queue for each worker
            for _ in range(self.numprocs):
                self.inq.put("STOP")

    def filter(self):
        """
        Parallelized filter that takes as input a sequence couple in inqueue until a STOP pill is
        found. Sequences go through a QualityFilter and an AdapterTrimmer object and if the
        couple is able to pass filters then it is put at the end of outqueue. At the end of the
        process a STOP pill is added to the outqueue.
        """
        # Consume inq and produce answers on outq
        for seqR1, seqR2 in iter(self.inq.get, "STOP"):

            with self.total.get_lock():
                self.total.value += 1

            # Quality filtering
            if self.qual:
                seqR1 = self.qual.filter(seqR1)
                seqR2 = self.qual.filter(seqR2)
                if not seqR1 or not seqR2:
                    continue

            with self.pass_qual.get_lock():
                self.pass_qual.value += 1

            # Adapter trimming and size filtering
            if self.adapt:
                seqR1 = self.adapt.trimmer(seqR1)
                seqR2 = self.adapt.trimmer(seqR2)
                if not seqR1 or not seqR2:
                    continue

            with self.pass_trim.get_lock():
                self.pass_trim.value += 1

            # If both filters passed = add to the output queue
            self.outq.put((seqR1, seqR2))

        # Add a STOP pill to the queue
        self.outq.put("STOP")

        # Fill shared memory counters from process specific object instances.
        if self.qual:
            # The min/max compare-and-set is done under the same lock so the
            # filter processes cannot interleave between the test and the write.
            with self.weighted_mean.get_lock():
                self.weighted_mean.value += (self.qual.get_mean_qual()*self.qual.get('total'))
                if self.qual.get_min_qual() < self.min_qual_found.value:
                    self.min_qual_found.value = self.qual.get_min_qual()
                if self.qual.get_max_qual() > self.max_qual_found.value:
                    self.max_qual_found.value = self.qual.get_max_qual()

        if self.adapt:
            with self.seq_untrimmed.get_lock():
                self.seq_untrimmed.value += self.adapt.get('seq_untrimmed')
            with self.seq_trimmed.get_lock():
                self.seq_trimmed.value += self.adapt.get('seq_trimmed')
            with self.base_trimmed.get_lock():
                self.base_trimmed.value += self.adapt.get('base_trimmed')
            with self.len_pass.get_lock():
                self.len_pass.value += self.adapt.get('len_pass')
            with self.len_fail.get_lock():
                self.len_fail.value += self.adapt.get('len_fail')

    def writer(self):
        """
        Write sequence couples from outqueue in a pair of fastq files (gzipped when
        compress_output is set). Sequences will remain paired (ie at the same index in the 2
        files) but they may not be in the same order than in the input fastq files. The process
        will continue until n = numprocs STOP pills were found in the outqueue.
        """
        # Open output fastq streams for writing
        opener = gzip.open if self.compress_output else open
        out_R1 = opener(self.R1_out, "wb")
        try:
            out_R2 = opener(self.R2_out, "wb")
            try:
                # Keep running until all numprocs STOP pills has been passed
                for _ in range(self.numprocs):
                    # Will exit the loop as soon as a Stop pill will be found
                    for seqR1, seqR2 in iter(self.outq.get, "STOP"):
                        out_R1.write(seqR1.format("fastq-sanger"))
                        out_R2.write(seqR2.format("fastq-sanger"))
                        with self.total_pass.get_lock():
                            self.total_pass.value += 1
            finally:
                out_R2.close()
        finally:
            # Close the outputs even if writing fails part-way.
            out_R1.close()
class Pipeline:
    """Streams data from a generator through a chain of pipes served by worker processes.

    Pipes are created from the builders registered with :meth:`add`.  They are
    wired so that the ``outbound`` queue of one pipe is the ``inbound`` queue of
    the next one.  :meth:`stream` starts a reader process that puts the source
    data on the first pipe's ``inbound`` queue and yields everything that
    arrives on the last pipe's ``outbound`` queue until
    ``END_OF_STREAM_SIGNAL`` is received.

    A pipeline instance can be streamed once; call :meth:`reset` before
    streaming again.
    """

    # sentinel put on the queues to mark the end of the stream
    END_OF_STREAM_SIGNAL = "!end_of_stream!"

    RUNNING_STATUS_STANDBY = 0
    RUNNING_STATUS_RUNNING = 1
    RUNNING_STATUS_FINISH = 2
    RUNNING_STATUS_INTERRUPTED = -999

    @staticmethod
    def is_end_of_stream(data):
        """Return True when *data* is the end-of-stream sentinel."""
        return data == Pipeline.END_OF_STREAM_SIGNAL

    def __init__(self, alias=None):
        """
        :param alias: optional name of this pipeline
        """
        self.logger = _get_logger(__name__)
        self._alias = alias
        self._pipe_builders = []
        self._init_run_state()

    def reset(self):
        """Drop all per-run state so the pipeline can be streamed again.

        The registered pipe builders are kept.
        """
        self._init_run_state()

    def _init_run_state(self):
        """Initialise every attribute that belongs to a single run of the pipeline."""
        self._pipes = {}
        self._pipe_processes = []
        self._first_pipe = None
        self._last_pipe = None
        self._func_read_stream = (lambda: range(0))
        self._cleanups = []
        self._already_cleanup = Value(ctypes.c_bool, False)
        self._running_status = Value(ctypes.c_int, Pipeline.RUNNING_STATUS_STANDBY)
        self._interrupted_by_exception = False
        self._thread_watching_running_status = None
        self._thread_watching_remaining_processes = None
        self._stream_reader_process = None

    def add(self, builder):
        """
        Register a pipe builder; pipes are chained in registration order.

        :param builder: object whose ``build()`` returns a pipe
        :return: Pipeline (self, to allow chaining of calls)
        """
        self._pipe_builders.append(builder)
        return self

    def stream(self, generator=None):
        """
        start to stream data from generator into pipeline, yielding data passed through pipeline

        Registered cleanups are always executed when the returned generator
        finishes, fails or is closed.

        :param generator: DataGenerator, Iterable or generator function
        :return: generator over the data coming out of the last pipe
        :raises Exception: when the pipeline is not in standby, has no pipes,
            got an unsupported generator, or was interrupted by an error
        """
        self._check_if_runnable()
        try:
            # change running status
            self._mark_started()

            # determine stream generator
            self._configure_stream_reader(generator)

            # configure pipes and create processes for them
            self._configure_pipes()

            # open pipes in a new process respectively
            self._open_pipes()

            # start process reading stream from generator
            self._start_streaming_data()

            # yield data passed through this pipeline
            self.logger.info("start to yield streams passed through pipeline...")
            while True:
                message = self._last_pipe.outbound.get()
                if Pipeline.is_end_of_stream(message):
                    break
                yield message
            self.logger.info("finished yielding streams passed through pipeline")

            # if interrupted
            if self._interrupted_by_exception:
                raise Exception("processing was interrupted by unexpected exception")

            self.logger.info("finished successfully")
        finally:
            self._cleanup()

    def _mark_started(self):
        """Switch to RUNNING and arm the status reset and the status watcher thread."""
        self.set_running_status_to_running()
        self._add_running_status_reset_func_to_cleanup()
        self._configure_running_status_watcher()

    def _add_running_status_reset_func_to_cleanup(self):
        """Register a cleanup that marks the run as finished unless it was interrupted."""
        def cleanup_func_reset_running_status():
            # set_running_status_to_finish() takes the same lock again, so this
            # relies on the Value's lock being re-entrant (the default RLock)
            with self._running_status.get_lock():
                if self.running_status != Pipeline.RUNNING_STATUS_INTERRUPTED:
                    self.set_running_status_to_finish()

        self._add_cleanup_func("reset running status of pipeline",
                               cleanup_func_reset_running_status)

    def _configure_running_status_watcher(self):
        """Start a daemon thread that force-stops the pipeline once it is marked interrupted."""
        def watch_running_status(pipeline=None):
            pipeline.logger.info("start thread watching running status...")
            while True:
                if pipeline.running_status == Pipeline.RUNNING_STATUS_INTERRUPTED:
                    pipeline.logger.error("got an interruption, stops pipeline, see logs")
                    pipeline._interrupted_by_exception = True
                    pipeline.stop_force()
                    pipeline.set_running_status_to_finish()
                    break
                elif pipeline.running_status == Pipeline.RUNNING_STATUS_FINISH:
                    break
                time.sleep(0.001)
            pipeline.logger.info("stop thread watching running status")

        self._thread_watching_running_status = Thread(
            name="running_status_watcher",
            target=watch_running_status,
            kwargs={"pipeline": self})
        self._thread_watching_running_status.daemon = True
        self._thread_watching_running_status.start()

    def _start_streaming_data(self):
        """Start the reader process and register its termination as a cleanup."""
        self.logger.info("start process for streaming data into pipeline...")
        self._add_cleanup_func("terminate the stream reader process",
                               lambda: self._stream_reader_process.terminate())
        self._stream_reader_process.start()

    def _all_pipe_processes(self):
        """Return the processes of all pipes as one flat list, in pipe order."""
        return [process for group in self._pipe_processes for process in group]

    def _terminate_pipe_processes(self):
        """Terminate every pipe process."""
        for process in self._all_pipe_processes():
            process.terminate()

    def _open_pipes(self):
        """Start every pipe process, then register their termination as a cleanup."""
        self.logger.info("start Processes for pipes(%s)...", len(self._pipe_processes))
        # explicit loop: a side-effect map() is lazy on Python 3 and would start nothing
        for process in self._all_pipe_processes():
            process.start()
        self._add_cleanup_func("terminate all the pipe processes",
                               self._terminate_pipe_processes)

    def _configure_stream_reader(self, generator):
        """Pick the callable producing the source data and create the reader process.

        :param generator: DataGenerator, Iterable or generator function
        :raises Exception: when *generator* is none of the supported kinds
        """
        # the Iterable ABC is only available from collections.abc on recent Pythons
        try:
            from collections.abc import Iterable
        except ImportError:  # Python 2
            from collections import Iterable

        if isinstance(generator, DataGenerator):
            self._func_read_stream = generator.produce
        elif isinstance(generator, Iterable):
            self._func_read_stream = (lambda: generator)
        elif inspect.isgeneratorfunction(generator):
            self._func_read_stream = generator
        else:
            raise Exception("generator should be either Producer or Iterable")

        self._stream_reader_process = create_process_with(
            process_alias="stream_reader",
            target_func=lambda: self._read_and_stream_from_generator())

    def _check_if_runnable(self):
        """Raise unless the pipeline is in standby."""
        # check running status
        if self.running_status != Pipeline.RUNNING_STATUS_STANDBY:
            raise Exception("invalid running status. Call reset() before call this")

    def _configure_pipes(self):
        """Build the pipes, chain their queues and create the worker processes.

        :raises Exception: when no pipe builder was registered
        """
        if not self._pipe_builders:
            raise Exception("There are no pipes to stream data")

        # chaining pipes from the last one to the first one; reversed() leaves
        # the builder list untouched even when a build() call fails
        pipes = []
        pipe_outbound = Queue()
        for builder in reversed(self._pipe_builders):
            pipe = builder.build()
            pipe.outbound = pipe_outbound
            pipes.append(pipe)
            pipe_outbound = pipe.inbound
        pipes.reverse()
        self._pipes = pipes

        # capture entry and terminal
        self._first_pipe = self._pipes[0]
        self._last_pipe = self._pipes[-1]

        # one group of processes per pipe; built eagerly so that each process
        # is bound to its own pipe
        self._pipe_processes = [
            [create_process_with(process_alias="process-%s-%s" % (pipe.alias, i),
                                 target_func=func_to_be_invoked_with_new_process,
                                 target_pipe=pipe,
                                 pipeline_running_status=self._running_status)
             for i in range(pipe.number_of_consumer)]
            for pipe in self._pipes]

    def _read_and_stream_from_generator(self):
        """Feed the source data, then the end-of-stream sentinel, into the first pipe.

        Any exception marks the pipeline as interrupted instead of propagating.
        """
        try:
            for message in self._func_read_stream():
                self.__stream_data(message)
            self.__stream_data(Pipeline.END_OF_STREAM_SIGNAL)
        except Exception as e:
            self.logger.error("while reading stream from generator, an unexpected exception occurred, stopping pipeline. "
                              "see cause -> %s\n%s", e, traceback.format_exc())
            self.set_running_status_to_interrupted()

    def __stream_data(self, data):
        """Put *data* on the inbound queue of the first pipe."""
        self._first_pipe.inbound.put(data)

    def _join_pipes(self):
        """Join every pipe process while a daemon thread logs the ones still alive."""
        def watch_remaining_processes(pipeline=None, processes=None):
            pipeline.logger.info("start thread watching pipe processes remaining...")
            while True:
                processes_alive = [p for group in processes for p in group if p.is_alive()]
                if not processes_alive:
                    pipeline.logger.info("no remaining processes")
                    break
                pipeline.logger.info("%s remaining processes : %s",
                                     len(processes_alive),
                                     [(p.pid, p.name) for p in processes_alive])
                time.sleep(5)
            pipeline.logger.info("stop thread watching pipe processes remaining")

        self._thread_watching_remaining_processes = Thread(
            name="remaining_processes_watcher",
            target=watch_remaining_processes,
            kwargs={"pipeline": self, "processes": self._pipe_processes}
        )
        self._thread_watching_remaining_processes.daemon = True
        self._thread_watching_remaining_processes.start()

        for p in self._all_pipe_processes():
            self.logger.info("joining(waiting) the process(name:%s, id:%s, alive:%s)...",
                             p.name, p.pid, p.is_alive())
            p.join()
            self.logger.info("released joining the process(name:%s, id:%s, alive:%s)",
                             p.name, p.pid, p.is_alive())
        self._thread_watching_remaining_processes.join()

    def _add_cleanup_func(self, desc="", func=(lambda: None)):
        """
        Register a callable to be run by :meth:`_cleanup`.

        :param desc: description logged right before the callable is invoked
        :param func: callable taking no arguments
        """
        self._cleanups.append((desc, func))

    def _cleanup(self):
        """Run the registered cleanups in registration order, at most once per run."""
        with self._already_cleanup.get_lock():
            if self._already_cleanup.value:
                return
            self.logger.info("start cleaning up...")
            for desc, func in self._cleanups:
                self.logger.info("call cleanup func -> %s", desc)
                func()
            self.logger.info("finished cleaning up")
            self._already_cleanup.value = True

    def stop_force(self):
        """
        terminate all spawned processes and unblock the consumer of stream()

        :return: void
        """
        # call registered cleanups
        self._cleanup()

        # send end signal to terminal queue for pipeline
        self._last_pipe.outbound.put(Pipeline.END_OF_STREAM_SIGNAL)

    @property
    def running_status(self):
        """Current running status (one of the ``RUNNING_STATUS_*`` constants)."""
        return self._running_status.value

    def set_running_status_to_standby(self):
        self._set_running_status(Pipeline.RUNNING_STATUS_STANDBY)

    def set_running_status_to_running(self):
        self._set_running_status(Pipeline.RUNNING_STATUS_RUNNING)

    def set_running_status_to_finish(self):
        self._set_running_status(Pipeline.RUNNING_STATUS_FINISH)

    def set_running_status_to_interrupted(self):
        self._set_running_status(Pipeline.RUNNING_STATUS_INTERRUPTED)

    def _set_running_status(self, value):
        """Store *value* in the shared status while holding its lock."""
        with self._running_status.get_lock():
            self._running_status.value = value
print "[+] Starting requests..." if (arg_verbosity > 0): print "[+]\t\tURL: ", arg_url print "[+]\t\tThreads: ", arg_nthreads print threads = [] stdout_lock = Lock() if (arg_nthreads): for i in range(arg_nthreads): t = Process(target=wracost.run, args=(stdout_lock, arg_getreq)) threads.append(t) t.start() else: paramsdictionary = parser.get_params_dict() for singleparam in paramsdictionary: # Meed a .copy() because each thread needs it's own object t = Process(target=wracost.run, args=(stdout_lock, singleparam.copy())) threads.append(t) t.start() if arg_auto or raw_input("[+] All threads synchronised! Launch attack?(Y/n): ") != 'n': with shared_lock_launch.get_lock(): shared_lock_launch.value = False print "[+] Requests launched!" else: for thread in threads: thread.terminate()
class Child:
    """An abstraction upon a process for our :class:`~sw.pool.Pool`. Serves to more easily house a separate process and
    communicate with it crossprocess.

    A Child is not entirely a separate container that is spawned from Pool and given free rein. The bulk of a Child is
    stored on the primary thread with the Pool, UI, and Reporting. However, :py:func:`~sw.child.Child.think` is on a
    separate `multiprocessing.Process` along with the provided *func* and GhostDriver / PhantomJS.

    All communication between Pool and Child is conducted over Child.statusVar (:py:func:`~multiprocessing.Value`)
    and Child.cq / Child.wq (:py:class:`~multiprocessing.Queue`) to avoid locks (they are multiprocess-safe). The
    off-thread child handles its own log, status reporting, error reporting, and getting new jobs.

    Once the process is started control is handed back over to the Pool which then manages the processes.

    :param cq: ChildQueue reference from :class:`~sw.pool.Pool`. Used to transmit the status of this Child to our Pool.
    :param wq: WorkQueue reference from :class:`~sw.pool.Pool`. This Child pops a function off this Queue then
        executes it, then repeats.
    :param num: Number of the Child relevant to :class:`~sw.pool.Pool`'s self.data array. This index is used to easily
        communicate results and relate them to the child in that array. This number is actually one less than the
        index displayed on the console (which starts at 1 for the end user's sake).
    :param log: Base log directory which we spit logs and screenshots into. Just a string which should never change.
    :param options: Dict of kwargs which contain specific options passed to our wrapper.

    :return: Child (self)
    """

    def __init__( self, cq, wq, num, log, options ):
        self.cq = cq  # Our shared output queue (childqueue) (multiprocessing)
        self.wq = wq  # Our shared input queue (workqueue) (multiprocessing)
        self.num = num
        self.driver = None
        self.log = log
        self.lh = ""
        self.options = options
        self.level = self.options.get( 'level', NOTICE )
        self.func = None
        self.sleepTime = self.options.get( 'childsleeptime', 1 )
        self.cache = ElementCache( )
        self.statusVar = Value( 'i', STARTING )
        # Defined up front so is_alive( ) and stop( ) work even if start( ) fails early
        self.proc = None

        self.start( )

    def think( self ):
        """This method is spawned on a separate process from our main thread. It takes no arguments, just reads from
        self variables set in :py:class:`~sw.child.Child` that are multiprocess-safe: wq, cq, and statusVar (and
        various static variables). It also uses various on class variables for storage which are not touched by pool.

        The purpose of this method is to cleanly start a loop of running PhantomJS with a fresh function pulled from
        wq every time. The loop ends when the work queue is empty or a job raises anything other than a
        TimeoutException. When think ends, our Child process ends as well.

        :return: None
        """
        # multiprocessing queues raise the Empty exception of the standard queue module
        try:
            from queue import Empty
        except ImportError:  # Python 2
            from Queue import Empty

        # Change our status ASAP so users actually see it change to black (and know the Child started).
        self.display( DISP_START )

        wq = self.wq
        cq = self.cq

        # This allows custom service arguments to be forced into PhantomJS, as it is not supported with the Python
        # bindings by default.
        webdriver.phantomjs.webdriver.Service = PhantomJSNoImages

        sargs = [ ''.join( [ '--load-images=', str( self.options['images'] ).lower( ) ] ),
                  ''.join( [ '--disk-cache=', str( self.options.get( 'browsercache', "true" ) ).lower( ) ] ),
                  ''.join( [ '--ignore-ssl-errors=', str( self.options.get( 'ignoresslerrors', "yes" ) ).lower( ) ] ) ]

        if 'proxy' in self.options:
            sargs.append( ''.join( [ '--proxy=', self.options['proxy'] ] ) )
        if 'proxytype' in self.options:
            sargs.append( ''.join( [ '--proxy-type=', self.options['proxytype'] ] ) )

        try:
            self.driver = webdriver.PhantomJS(
                service_log_path=os.path.join( self.log, self.options.get( 'ghostdriverlog', "ghostdriver.log" ) ),
                service_args=sargs )
        except Exception as e:
            self.logMsg( ''.join( [ "Webdriver failed to load: ", str( e ), "\n", traceback.format_exc( ) ] ),
                         CRITICAL )
            # Best effort only: there is usually no driver to quit at this point.
            if self.driver is not None:
                try:
                    self.driver.quit( )
                except Exception:
                    pass
            return

        # This enables custom callbacks from WebDriver to this Child. Primarily used to read options and throw errors.
        # Usage can be seen in `sw.utils`.
        self.driver.child = self

        # WebDriver, by default, waits 15 seconds while intensively scanning the DOM for an element. This forces it to
        # throw an error instantly if the element does not exist.
        self.driver.implicitly_wait( 0 )

        cq.put( [ self.num, READY, "" ] )
        self.logMsg( "Child process started and loaded" )

        while not wq.empty( ):
            # Another child may take the last job between the empty( ) check and this get( ); treat the resulting
            # timeout as "no more work" so that PhantomJS is still shut down cleanly below.
            try:
                self.func = wq.get( True, 5 )
            except Empty:
                break

            start = 0

            # Below we set to an error / done and wait.
            # FIXME: Waiting a second to show a status isn't appropriate. The Pool should change the status
            # for the child after enough time has elapsed.
            self.status( RUNNING )
            try:
                self.cache.clear( )
                start = time.time( )
                self.display( DISP_GOOD )
                cq.put( [ self.num, MESSAGE, time.time( ), R_JOB_START ] )
                self.func( self.driver )
            except TimeoutException as e:
                # A timeout fails this job only; the child moves on to the next one.
                self.display( DISP_ERROR )
                screen = self.logError( str( e ) )
                self.logMsg( ''.join( [ "Stack trace: ", traceback.format_exc( ) ] ), CRITICAL )
                cq.put( [ self.num, FAILED, ( time.time( ) - start ), str( e ), screen ] )
                self.logMsg( "Timeout when finding element." )
                time.sleep( 1 )
            except Exception as e:
                # Any other error fails the job and ends this child's loop.
                self.display( DISP_ERROR )
                screen = self.logError( str( e ) )  # Capture the exception and log it
                self.logMsg( ''.join( [ "Stack trace: ", traceback.format_exc( ) ] ), CRITICAL )
                cq.put( [ self.num, FAILED, ( time.time( ) - start ), str( e ), screen ] )
                time.sleep( 1 )
                break
            else:
                self.display( DISP_FINISH )
                t = time.time( ) - start
                cq.put( [ self.num, DONE, ( time.time( ) - start ), "" ] )
                self.logMsg( ''.join( [ "Successfully finished job (", format( t ), "s)" ] ) )
                time.sleep( 0.5 )

        # This line will cleanly kill PhantomJs for us.
        self.driver.quit( )
        self.display( DISP_DONE )
        self.status( FINISHED )

    def logError( self, e, screenshot=True ):
        """Takes a JSON-encoded Selenium exception's text and spits it into the log in a more meaningful format.
        Can optionally take a screenshot too.

        :param e: Unicode JSON-encoded string from a WebDriver-thrown exception. *Must be a String*.
        :param True screenshot: Take a screenshot of the error automatically.
        :return: String for screenshot location, or None when no screenshot was requested.
        """
        o = pformat( formatError( e, "log" ) )
        self.logMsg( o, CRITICAL )
        if screenshot:
            return self.screenshot( CRITICAL )

    def screenshot( self, level=NOTICE ):
        """Saves a screenshot to error_#.png and prints a message into the log specifying the file logged to.

        :param NOTICE level: This determines whether or not the error message will be logged according to the level
            set in self.level. The screenshot will print anyway. If this error is not greater or equal to the level
            specified in self.level, it is not printed. If it is, the message is printed into log.txt with the level
            specified by the timestamp.
        :return: String for screenshot location
        :raises ValueError: If the log directory does not exist.
        """
        if not os.path.exists( self.log ):
            raise ValueError( ''.join( [ "Cannot write to a log directory that doesn't exist. ", self.log ] ),
                              CRITICAL )

        # If we are writing several errors, number them appropriately: take the first unused file name
        i = 0
        while True:
            fn = os.path.join( self.log, ''.join( [ 'error_', str( i ), '.png' ] ) )
            i += 1
            if not os.path.isfile( fn ):
                break

        self.driver.save_screenshot( fn )
        self.logMsg( ''.join( [ "Wrote screenshot to: ", fn ] ), level )
        return fn

    def logMsg( self, e, level=NOTICE, **kwargs ):
        """Writes to our message log if level is greater than or equal to our level (in self.level).

        :param e: The message to be written to the log.
        :param NOTICE level: This determines whether or not the error message will be logged according to the level
            set in self.level. If this error is not greater or equal to the level specified in self.level, it is not
            printed. If it is, the message is printed into log.txt with the level specified by the timestamp.
        :Kwargs:
            * **locals** (*None*): Optional locals dict to print out cleanly.
        :return: None
        """
        local_vars = kwargs.get( 'locals', None )

        # Send error if appropriate
        if level >= ERROR:
            self.display( DISP_ERROR )

        # Determine if we're logging this low
        if level < self.level:
            return

        # Get our timestamp
        timestamp = datetime.now( ).strftime( "%H:%M:%S" )

        # String
        w = ''.join( [ "[", timestamp, "] ", errorLevelToStr( level ), "\t", e, "\n" ] )

        # Locals if specified (written before the message itself)
        if local_vars is not None:
            self.logMsg( ''.join( [ "Local variables: ", pformat( local_vars ) ] ), level )

        try:
            self.lh.write( w )
        except ( AttributeError, ValueError, IOError ):
            # No usable handle (never opened, or already closed): reopen the log and retry once. Line buffering
            # flushes after every message; unbuffered text files are not supported on Python 3.
            self.lh = open( os.path.join( self.log, ''.join( [ 'log-', str( self.num + 1 ), '.txt' ] ) ), 'a+', 1 )
            self.lh.write( w )

    def display( self, t ):
        """Sends a display message to the main loop, which is then translated to the UI.

        :param t: The status this child will now show, a constant starting with DISP in const.py.
        :returns: None
        """
        self.cq.put( [ self.num, DISPLAY, t ] )

    def is_alive( self ):
        """Checks if the child's process is still running, if it is then it returns True, otherwise False.
        There's a check for if the process is None, which is set when a child terminates.

        :return: Boolean for if Child process is still active (different from if a child is processing data).
        """
        if self.proc is not None:
            return self.proc.is_alive( )
        return False

    def status( self, type=None ):
        """Uses a multiprocess-safe variable to transmit our status upstream. These values are listed under
        universal status types in const.py. The status types allow better logging and, for example, prevent
        children that were already terminated from being terminated again (and throwing an exception).

        When called with a type it will set this child's status on both the main process and the child's process.
        When called without it, it reads from the status variable.

        :param None type: The new value of our status.
        :returns: If type isn't specified, our status. If it is, it sets our type and returns None.
        """
        if type is None:
            return self.statusVar.value
        with self.statusVar.get_lock( ):
            self.statusVar.value = type

    def start( self, flag=DISP_LOAD ):
        """Starts our child process off properly, used after a restart typically.

        :param DISP_LOAD flag: A custom flag to change the display color of the child, if desired.
        :return: None
        """
        # Not stopped anymore
        self.status( STARTING )

        # Create our path
        if not os.path.isdir( self.log ):
            os.makedirs( self.log )

        # Open our handle
        self.lh = open( os.path.join( self.log, ''.join( [ 'log-', str( self.num + 1 ), '.txt' ] ) ), 'a+' )

        # Show loading
        self.display( flag )

        # Our process
        self.proc = Process( target=self.think, args=( ) )
        self.proc.start( )

    def restart( self, msg="restarting", flag=None ):
        """Restarts the child process and gets webdriver running again.

        :param "RESTARTING" msg: A message to print out in parenthesis.
        :param None flag: A custom flag to change the display color of the child, if desired.
        :return: None
        """
        if flag is not None:
            self.stop( msg, flag )
            self.start( flag )
        else:
            self.stop( msg )
            self.start( )

    def stop( self, msg="", flag=FINISHED, disp_flag=DISP_DONE ):
        """Stops a child process properly and sets its self.proc to None. Optionally takes a message to print out.
        Does nothing when there is no process.

        :param "" msg: A message to show in parenthesis on the console next to ``Child #: STOPPING (msg)``.
        :param FINISHED flag: A custom status flag for if the child is finished, paused, stopped, or whatever is
            desired.
        :param DISP_DONE disp_flag: A custom display flag for the status of the child after stopping.
        :return: None
        """
        if self.proc is None:
            return

        # Prevent the pool from trying to restart us
        self.status( flag )

        if msg != "":
            self.logMsg( ''.join( [ "Stopping child process: \"", msg, "\"" ] ) )
        else:
            self.logMsg( "Stopping child process" )

        # Kill our process
        if os.name != "posix":
            kill_cmd = [ 'taskkill', '/F', '/T', '/PID', str( self.proc.pid ) ]
        else:
            kill_cmd = [ 'pkill', '-TERM', '-P', str( self.proc.pid ) ]

        # Silence the kill command; the handle is closed again instead of being leaked on every stop
        with open( os.devnull, 'wb' ) as devnull:
            subprocess.call( kill_cmd, stdout=devnull, stderr=devnull )

        self.proc.join( )
        self.proc = None

        # Inform the TUI that we're done.
        self.display( disp_flag )

        # Close our log
        self.lh.close( )

    def flush( self ):
        """Flushes our log so that messages are retained on an internal error.

        :return: None
        """
        self.lh.flush( )
class Infinity:
    '''
    Main class, does everything: parses the command line, reads the thread's
    JSON and downloads every image attached to its posts.
    '''

    # number of download threads used when --workers is not given
    DEFAULT_WORKERS = 10

    def __init__(self):
        """ Parses the command line and sets up the download state """
        parser = argparse.ArgumentParser()
        parser.add_argument("thread", help="URL of thread to scrape")
        parser.add_argument("--directory", "-d",
                            help="Specify dir to save to (Default: ~/4chan)")
        parser.add_argument("--name", "-n",
                            help="Specify name of dir to download to (Default: Topic/OP Post number)")
        parser.add_argument("--workers", type=int,
                            help="Number of threads to run (Default: 10)")
        parser.add_argument("--version", "-v", action="version",
                            version=VERSION)
        self.args = parser.parse_args()

        save_path = self.args.directory or os.path.join(
            os.path.expanduser('~'), "4chan")

        # Adjacent literals instead of a backslash continuation inside the
        # string, which would embed the source indentation in the header value.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                          '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,'
                      'application/xml;q=0.9,*/*;q=0.8'}
        self.thread_url = self.args.thread
        self.board = self.thread_url.split('/')[3]
        self.thread_name = ""
        self.downloads = []
        self.filename = []
        self.save_path = save_path
        self.counter = Value('i', 0)      # images downloaded so far
        self.total_count = Value('i', 0)  # images found in the thread
        self.workers = self.args.workers
        self.down_dir = ""

    def url_open(self):
        """ Returns raw data from thread """
        url = "{0}.json".format(os.path.splitext(self.thread_url)[0])
        return requests.get(url, headers=self.header)

    def jsonify(self):
        """ Converts raw data to text to json object """
        jsonable = self.url_open().text
        return json.loads(jsonable)

    def download_image(self, url, filename):
        """ Creates new dir if doesn't exist, downloads image

        Args:
            url (str): address of the image
            filename (str): name of the file written inside the thread dir
        """
        down_dir = os.path.join(self.save_path, self.thread_name.title())
        img_dir = os.path.join(down_dir, filename)

        if not os.path.isdir(down_dir):
            try:
                os.makedirs(down_dir)
            except OSError:
                # another download may have created it in the meantime
                if not os.path.isdir(down_dir):
                    raise

        image = requests.get(url, headers=self.header, stream=True)
        try:
            with open(img_dir, 'wb') as location:
                image.raw.decode_content = True
                shutil.copyfileobj(image.raw, location)
        finally:
            # give the connection back even when the copy fails
            image.close()
        self.down_dir = down_dir

    def _download_and_report(self, job):
        """ Downloads one (url, filename) pair and updates the progress """
        image_url, filename = job
        self.download_image(image_url, filename)
        # the lock also keeps the progress output of the threads from interleaving
        with self.counter.get_lock():
            self.counter.value += 1
            update_progress(self.counter.value, self.total_count.value)

    def image_urls(self):
        """ Iterates over json obj, gets image links
            Downloads every image exactly once on a pool of worker threads
        """
        # Threads, not processes: the work is network bound and the bound
        # method shares this object's counters, which cannot be pickled for a
        # process pool.
        from multiprocessing.pool import ThreadPool

        json_obj = self.jsonify()

        jobs = []
        for post in json_obj['posts']:
            if 'ext' in post:
                filename = str(post['tim']) + post['ext']
                image_url = 'https://i.4cdn.org/{board}/{file}'.format(
                    board=self.board, file=filename)
                jobs.append((image_url, filename))

        with self.total_count.get_lock():
            self.total_count.value += len(jobs)

        self.thread_name = json_obj['posts'][0]['semantic_url']

        for image_url, filename in jobs:
            self.filename.append(filename)
            self.downloads.append(image_url)

        if not jobs:
            return

        pool = ThreadPool(self.workers or self.DEFAULT_WORKERS)
        try:
            # map() re-raises the first download error instead of dropping it
            pool.map(self._download_and_report, jobs)
        except KeyboardInterrupt:
            print("Aborting")
        finally:
            pool.terminate()
            pool.join()