def evaluate_parallel(self, ast):
    from multiprocessing import Pool, cpu_count, RLock, Manager
    workers = self.num_process
    if workers == -1:
        workers = cpu_count()
    times = int(ast.times.val)
    each_time = int(times / workers)
    # equally divide work between count - 1 threads
    work_times = [int(each_time)] * (workers - 1)
    # dump all the extra work on last thread
    work_times.append(times - (each_time * (workers - 1)))
    ast_list = [ast.val] * len(work_times)
    randomer = [random.Random() for _ in work_times]
    optional_arg = [(w, r) for w, r in zip(work_times, randomer)]
    manager = Manager()
    l = manager.RLock()
    u = manager.dict()
    pool = Pool(processes=workers, initializer=init_child, initargs=(u, l))
    ret = pool.starmap(self.visit_optional, zip(ast_list, optional_arg))
    pool.close()
    if not self.generate_only:
        result = []
        for y in ret:
            result.extend(y[0])
        return result
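# Hypothetical sketch (not part of the snippet above): `init_child` and
# `visit_optional` are not shown. A pool initializer for this pattern usually
# stashes the managed dict and RLock in module-level globals so that code run
# inside each worker process can reach them; names here are illustrative.
_shared_dict = None
_shared_lock = None


def init_child(shared_dict, shared_lock):
    """Runs once in every worker process created by Pool(initializer=...)."""
    global _shared_dict, _shared_lock
    _shared_dict = shared_dict
    _shared_lock = shared_lock


def record_result(key, value):
    """Example task-side helper: update the shared dict under the shared lock."""
    with _shared_lock:
        _shared_dict[key] = value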
def main_filter(args):
    """The main function for filtering the documents."""
    install_mp_handler()
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
    logging.info('Filtering frequent paragraphs from index {}...'.format(
        args.index))
    with Pool(args.processes, initializer=init_filter,
              initargs=[args.frequents, args.old_frequents]) as pool:
        m = Manager()
        frequents_seen = m.dict()
        lock = m.RLock()
        group_it = enumerate(grouper2(read_index(args.index), args.documents),
                             start=1)
        f = partial(filter_file, args=args,
                    frequents_seen=frequents_seen, lock=lock)
        sum_stats = FilterStats()
        for stats in pool.starmap(f, group_it):
            sum_stats += stats
        pool.close()
        pool.join()
    logging.info(
        'Done filtering: documents {} -> {}, paragraphs {} -> {}.'.format(
            sum_stats.old_docs, sum_stats.new_docs,
            sum_stats.old_ps, sum_stats.new_ps))
def __init__(self):
    manager = Manager()
    self.detail_dict = manager.dict()
    self.waiting_urls = manager.list()
    self.running_urls = manager.list()
    self.finished_urls = manager.list()
    self.failed_urls = manager.list()
    self.lock = manager.RLock()
def train_ac(max_workers, global_var_kv):
    _max_workers = min(cpu_count(), max_workers) - 1
    manager = Manager()
    lock = manager.RLock()
    result_queue = manager.Queue()
    with ProcessPoolExecutor(max_workers=_max_workers, ) as executor:
        # executor.map(
        #     train_worker,
        #     #cpu_count
        #     [(lock, master_agent, worker_idx) for worker_idx in range(_max_workers)])
        future = executor.submit(train_worker, lock, result_queue,
                                 master_agent, 0)
        print(future.result())
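# Hypothetical sketch of the `train_worker` submitted above (the real worker is
# not included in the snippet): it would typically use the managed RLock to
# serialise shared updates or output and report results through the managed queue.
def train_worker(lock, result_queue, master_agent, worker_idx):
    with lock:
        print(f"worker {worker_idx} starting")
    # ... the actual training loop driven by master_agent would go here ...
    result_queue.put((worker_idx, "done"))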
def generate_chart_data(rts, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating
    charts.
    '''
    stopwatch = timer.Timer()
    plugin = retrieve_plugin(func)

    if not plugin:
        available_plugins = inventory.available_analyses()
        raise exceptions.UnknownPluginError(plugin, available_plugins)
    plugin = getattr(plugin, func)
    feedback(func, rts)

    tasks = JoinableQueue()
    result = JoinableQueue()

    mgr = Manager()
    lock = mgr.RLock()
    obs = dict()
    obs_proxy = mgr.dict(obs)

    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
    editors = db.retrieve_distinct_keys('editor')
    #editors = editors[:500]
    if rts.collection.find('editors_dataset') > -1:
        min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
        kwargs['min_year'] = min_year
        kwargs['max_year'] = max_year

    fmt = kwargs.pop('format', 'long')
    time_unit = kwargs.pop('time_unit', 'year')
    var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)

    try:
        print 'Determining whether plugin requires preloaded data...'
        preloader = getattr(plugin, 'preload')
        print 'Preloading data...'
        data = preloader(rts)
    except Exception, error:
        data = None
def evaluate_population(self, population: Population) -> np.ndarray:
    """
    Evaluates population of solutions.

    Parameters
    ----------
    population : Population
        Collection of solutions wrapped as `Population`.

    Returns
    -------
    np.ndarray
        An array of fitness values.
        Order is the same as input population.
    """
    solutions = population.get_not_evaluated_solutions()

    if self.VERBOSE:
        print(f'\nEvaluating population of {population.size} solutions\n')
        solutions = tqdm(solutions)

    if self.MULTIPROCESSING:
        pool = Pool()
        manager = Manager()
        lock = manager.RLock()
        fitness_map = pool.map(
            partial(self.evaluate_solution, gene_order=self.gene_order,
                    lock=lock),
            solutions)
        for solution, fitness in zip(solutions, fitness_map):
            solution.fitness = fitness
    else:
        for solution in solutions:
            solution.fitness = self.evaluate_solution(solution)

    return population.fitness
def evaluate(num_sub_iterations, evaluator):
    overall_results = collections.OrderedDict()
    start = time.time()
    for iteration, (train, test) in enumerate(evaluator._cv):
        start = time.time()
        manager = Manager()
        results = manager.dict()
        lock = manager.RLock()
        pool = Pool(processes=int(multiprocessing.cpu_count()))
        for _ in xrange(0, num_sub_iterations):
            pool.apply_async(_run_subiteration,
                             args=(lock, results, evaluator, train, test))
        pool.close()
        pool.join()
        for key, value in results.items():
            if overall_results.get(key) is None:
                overall_results[key] = numpy.array(value)
            else:
                overall_results[key] = numpy.vstack(
                    [overall_results[key], numpy.array(value)])
        print "Iteration:", iteration + 1, " Time:", (time.time() - start)
    return overall_results
def run_instances(database, instances, filter_string, ubxlib_dir, working_dir,
                  clean, summary_report_file, test_report_file, debug_file):
    '''Run the given instances'''
    return_value = 0
    processes = []
    platform_locks = []
    misc_locks = {}
    alive_count = 0
    report_thread = None
    report_queue = None
    reporter = None
    summary_report_file_path = None
    test_report_file_path = None
    debug_file_path = None
    summary_report_handle = None

    manager = Manager()

    # Create a lock to cover things that cross
    # platforms or that any process of u_run.main()
    # may need to perform outside of its working
    # directory
    misc_locks["system_lock"] = manager.RLock()

    # Create a lock which can be used on Nordic
    # platforms (nRF5 and Zephyr): performing a
    # JLink download to a board while JLink RTT logging
    # is active on any other board will often stop
    # the RTT logging even though the sessions are
    # aimed at debuggers with entirely different
    # serial numbers.
    misc_locks["jlink_lock"] = manager.RLock()

    # Create a "lock" that can be used on STM32F4
    # platforms to ensure that all downloads are
    # completed before logging commences.  We
    # can do this, rather than locking a tool for the
    # whole time as we have to do with Nordic, because
    # each STM32F4 board only runs a single instance
    misc_locks["stm32f4_downloads_list"] = manager.list()

    # It is possible for some platforms to be a bit
    # pants at running in multiple instances
    # hence here we create a lock per platform and pass it
    # into the instance for it to be able to manage
    # multiplicity if required
    create_platform_locks(database, instances, manager, platform_locks)

    # Launch a thread that prints stuff out
    # nicely from multiple sources
    print_queue = manager.Queue()
    print_thread = u_utils.PrintThread(print_queue)
    print_thread.start()

    # Set up a printer for this thread to print to the queue
    printer = u_utils.PrintToQueue(print_queue, None, True)

    if summary_report_file:
        # Launch a thread that manages reporting
        # from multiple sources
        summary_report_file_path = working_dir + os.sep + summary_report_file
        summary_report_handle = open(summary_report_file_path, "w")
        if summary_report_handle:
            printer.string("{}writing overall summary report to \"{}\".".
                           format(PROMPT, summary_report_file_path))
        else:
            printer.string("{}unable to open file \"{}\" for overall summary report.".
                           format(PROMPT, summary_report_file_path))
        report_queue = manager.Queue()
        report_thread = u_report.ReportThread(report_queue, summary_report_handle)
        report_thread.start()
        reporter = u_report.ReportToQueue(report_queue, None, None, printer)
        reporter.open()

    # From this post:
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    # ...create a pool of worker processes to run our
    # instances, then they will handle sigint correctly
    # and tidy up after themselves.
    # SIGINT is ignored while the pool is created
    original_sigint_handler = signal(SIGINT, SIG_IGN)
    pool = NoDaemonPool(len(instances))
    signal(SIGINT, original_sigint_handler)

    # Create locks for connections
    u_connection.init_locks(manager)

    try:
        # Set up all the instances
        for instance in instances:
            # Provide a working directory that is unique
            # for each instance and make sure it exists
            if working_dir:
                this_working_dir = working_dir + os.sep +       \
                                   INSTANCE_DIR_PREFIX +        \
                                   u_utils.get_instance_text(instance).replace(".", "_")
            else:
                this_working_dir = os.getcwd() + os.sep +       \
                                   INSTANCE_DIR_PREFIX +        \
                                   u_utils.get_instance_text(instance).replace(".", "_")
            if not os.path.isdir(this_working_dir):
                os.makedirs(this_working_dir)
            # Only clean the working directory if requested
            if clean:
                u_utils.deltree(this_working_dir, printer, PROMPT)
                os.makedirs(this_working_dir)

            # Create the file paths for this instance
            if summary_report_file:
                summary_report_file_path = this_working_dir + os.sep + summary_report_file
            if test_report_file:
                test_report_file_path = this_working_dir + os.sep + test_report_file
            if debug_file:
                debug_file_path = this_working_dir + os.sep + debug_file

            # Start u_run.main in each worker thread
            process = {}
            process["platform"] = u_data.get_platform_for_instance(
                database, instance)
            process["instance"] = instance
            process["platform_lock"] = None
            process["connection_lock"] = u_connection.get_lock(instance)
            for platform_lock in platform_locks:
                if process["platform"] == platform_lock["platform"]:
                    process["platform_lock"] = platform_lock["lock"]
                    break
            process["handle"] = pool.apply_async(
                u_run.main,
                (database, instance, filter_string, True,
                 ubxlib_dir, this_working_dir,
                 process["connection_lock"], process["platform_lock"],
                 misc_locks, print_queue, report_queue,
                 summary_report_file_path, test_report_file_path,
                 debug_file_path))
            alive_count += 1
            processes.append(process.copy())

        # Wait for all the launched processes to complete
        printer.string("{}all instances now launched.".format(PROMPT))
        loop_count = 0
        while alive_count > 0:
            for process in processes:
                instance_text = u_utils.get_instance_text(process["instance"])
                if not "dealt_with" in process and process["handle"].ready():
                    try:
                        # If the return value has gone negative, i.e.
                        # an infrastructure failure, leave it there,
                        # else add the number of test failures to it
                        if (return_value >= 0 and process["handle"].get() > 0) or \
                           (return_value <= 0 and process["handle"].get() < 0):
                            return_value += process["handle"].get()
                    except KeyboardInterrupt as ex:
                        raise KeyboardInterrupt from ex
                    except Exception as ex:
                        # If an instance threw an exception then flag an
                        # infrastructure error
                        return_value = -1
                        printer.string("{}instance {} threw exception \"{}:"
                                       " {}\" but I can't tell you where"
                                       " I'm afraid.".
                                       format(PROMPT, instance_text,
                                              type(ex).__name__, str(ex)))
                        if reporter:
                            reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE,
                                           u_report.EVENT_FAILED,
                                           "instance {} threw exception \"{}: {}\"".
                                           format(instance_text, type(ex).__name__,
                                                  str(ex)))
                    alive_count -= 1
                    process["dealt_with"] = True
                if not process["handle"].ready() and \
                   (loop_count == STILL_RUNNING_REPORT_SECONDS):
                    printer.string("{}instance {} still running.".
                                   format(PROMPT, instance_text))
            loop_count += 1
            if loop_count > STILL_RUNNING_REPORT_SECONDS:
                loop_count = 0
            sleep(1)
    except KeyboardInterrupt:
        # Pools can tidy themselves up on SIGINT
        printer.string(
            "{}caught CTRL-C, terminating instances...".format(PROMPT))
        if reporter:
            reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE,
                           u_report.EVENT_FAILED,
                           "CTRL-C received, terminating")
        pool.terminate()
        return_value = -1

    # Tidy up
    pool.close()
    pool.join()
    if reporter:
        reporter.event_extra_information("return value overall {} (0 = success, negative ="
                                         " probable infrastructure failure, positive ="
                                         " failure(s) (may still be due to infrastructure))".
                                         format(return_value))
        reporter.close()

    # Wait for the print and report queues to empty
    # and stop the print process
    printer.string("{}all runs complete, return value {}.".format(
        PROMPT, return_value))
    sleep(1)
    print_thread.stop_thread()
    print_thread.join()

    # Stop the reporting process
    if report_thread:
        report_thread.stop_thread()
        report_thread.join()

    if summary_report_handle:
        summary_report_handle.close()

    return return_value
import re
from datetime import datetime
from app import app, db, u
from sqlalchemy.orm.exc import NoResultFound, StaleDataError, MultipleResultsFound
import os, glob
from werkzeug.security import generate_password_hash, check_password_hash
from search import add_to_index, remove_from_index, query_index
from flask_login import UserMixin
from app import login
from flask_login import current_user
from diagnostic_text import *
from multiprocessing import RLock, Manager
from pathlib import Path

commit_manager = Manager()
# Recursive lock because db.session.commit() is a non-thread-safe operation
commit_lock = commit_manager.RLock()


class User(UserMixin, db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(64), index=True, unique=True)
    email = db.Column(db.String(120), index=True, unique=True)
    password_hash = db.Column(db.String(128))
    last_seen = db.Column(db.DateTime, default=datetime.utcnow)

    def set_password(self, password):
        self.password_hash = generate_password_hash(password)

    def check_password(self, password):
        return check_password_hash(self.password_hash, password)
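# Illustrative sketch only (no such helper exists in the original module): the
# intent behind commit_lock is that calls to db.session.commit() elsewhere in
# the app are wrapped like this, so concurrent workers never commit at once.
def safe_commit():
    with commit_lock:
        db.session.commit()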
def main():
    time0 = time.time()
    parser = OptionParser()
    parser.add_option(
        '--years', dest='s_years', action='store', type=str,
        help='Give a list of years as a string, such as "1980,1981". Optional.')
    parser.add_option('--noverify', dest='do_noverify', action='store_true',
                      default=False,
                      help='If chosen, do not verify the SSL connection.')
    parser.add_option('--local', dest='do_local', action='store_true',
                      default=False,
                      help='Check for locally running plex server.')
    parser.add_option(
        '--dirname', dest='dirname', action='store', type=str,
        default=os.getcwd(),
        help='Directory into which to store those plots. Default is %s.' %
        os.getcwd())
    opts, args = parser.parse_args()
    #
    ## function to do the processing
    step = 0
    print('%d, started on %s' %
          (step, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p')))
    if opts.s_years is not None:
        try:
            years = sorted(
                set(map(lambda tok: int(tok), opts.s_years.split(','))))
        except:
            step += 1
            print('%d, did not give a valid set of years.' % step)
            years = []
    else:
        years = []
    #
    ## get plex server token
    dat = plexcore.checkServerCredentials(doLocal=True)
    if dat is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access local Plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1,
             datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    fullURL, token = dat
    #
    ## first find out which libraries are the TV show ones
    library_dict = plexcore.get_libraries(token, fullURL=fullURL, do_full=True)
    if library_dict is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access libraries in plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1,
             datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    #
    valid_keys = list(
        filter(lambda key: library_dict[key][-1] == 'show', library_dict))
    if len(valid_keys) == 0:
        step += 1
        print('\n'.join([
            '%d, Error, could not find a TV show library in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1,
             datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    tvlib_title = library_dict[max(valid_keys)][0]
    step += 1
    print('%d, found TV library: %s.' % (step, tvlib_title))
    #
    ## now get the TV shows
    tvdata = plexcore.get_library_data(tvlib_title, token=token,
                                       fullURL=fullURL, num_threads=16)
    showsToExclude = plextvdb.get_shows_to_exclude(tvdata)
    if len(showsToExclude) != 0:
        step += 1
        print('%d, excluding these TV shows: %s.' %
              (step, '; '.join(showsToExclude)))
    #
    ## now actual meat of the computation
    tvdata_date_dict = plextvdb.get_tvdata_ordered_by_date(tvdata)
    min_year = min(tvdata_date_dict.keys()).year
    max_year = max(tvdata_date_dict.keys()).year
    possible_years_set = set(map(lambda date: date.year, tvdata_date_dict))
    step += 1
    if len(years) == 0:
        years = sorted(possible_years_set)
        print('%d, no years specified. We will use %s total: %s.' %
              (step, _print_years(len(years)), ', '.join(
                  map(lambda year: '%d' % year, years))))
    else:
        cand_years = sorted(set(years) & possible_years_set)
        if len(cand_years) == 0:
            print('\n'.join([
                '%d, no intersection between the %s chosen (%s) and the %d years in the library.'
                % (step, _print_years(len(years)), ', '.join(
                    map(lambda year: '%d' % year, years)),
                   len(possible_years_set)),
                'Instead, we will use %s total: %s.' %
                (_print_years(len(possible_years_set)), ', '.join(
                    map(lambda year: '%d' % year, sorted(possible_years_set))))
            ]))
            years = sorted(possible_years_set)
        else:
            print('%d, we found %s to use: %s.' %
                  (step, _print_years(len(cand_years)), ', '.join(
                      map(lambda year: '%d' % year, cand_years))))
            years = cand_years
    step += 1
    print('%d, started processing %s of TV shows after %0.3f seconds.' %
          (step, _print_years(len(years)), time.time() - time0))
    manager = Manager()
    shared_step = manager.Value('step', step)
    num_procced = manager.Value('nump', 0)
    lock = manager.RLock()
    pool = Pool(processes=cpu_count())

    def _process_year(year):
        plextvdb.create_plot_year_tvdata(tvdata_date_dict, year,
                                         shouldPlot=True,
                                         dirname=opts.dirname)
        lock.acquire()
        shared_step.value += 1
        num_procced.value += 1
        print(
            '%d, finished processing year = %d (%02d / %02d) in %0.3f seconds.'
            % (shared_step.value, year, num_procced.value, len(years),
               time.time() - time0))
        lock.release()

    _ = list(pool.map(_process_year, years))
    step = shared_step.value + 1
    print('\n'.join([
        '%d, processed all %s in %0.3f seconds.' %
        (step, _print_years(len(years)), time.time() - time0),
        '%d, finished everything on %s.' %
        (step + 1,
         datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
    ]))
class MocaSharedMemory:
    """
    This class can manage the shared data between processes.

    Attributes
    ----------
    self._manager: Manager
        the instance of multiprocessing.Manager.
    self._data_dict: dict
        {"some_name": self._manager.list([data_value, self._manager.RLock()])}
    """

    def __init__(
            self,
            other_shared_data_manager=None,
    ):
        self._data_dict: dict
        # {"some_name": self._manager.list([data_value, self._manager.RLock()])}
        if other_shared_data_manager is None:
            self._manager = Manager()
            self._data_dict = self._manager.dict()  # Create a shared dict for data
        else:
            self._manager = other_shared_data_manager._manager
            self._data_dict = other_shared_data_manager._data_dict

    def get(self, name: str, default: Any = None) -> Any:
        """Get value (copy) by name."""
        data = self._data_dict.get(name, None)
        if data is not None:
            return data[_DATA]
        return copy(default)

    def set(self, name: str, value: Any) -> None:
        """Set value by name."""
        if name in self._data_dict:
            self._data_dict[name][_DATA] = value
        else:
            self._data_dict[name] = self._manager.list(
                (value, self._manager.RLock()))

    def increment(self, name: str, value: int = 1) -> int:
        """Increment the value."""
        with self.lock(name):
            data = self.get(name, 0)
            data = data + value if isinstance(data, int) else 1
            self.set(name, data)
            return data

    def decrement(self, name: str, value: int) -> int:
        """Decrement the value."""
        with self.lock(name):
            data = self.get(name, 0)
            data = data - value if isinstance(data, int) else -1
            self.set(name, data)
            return data

    def change(self, name: str, func: Callable, *args, **kwargs) -> Any:
        """
        Call the function with the current value, and set the return value as
        the new value. If no value can be found for `name`, the current value
        will be None.
        :param name: the name of target data.
        :param func: the function to call.
        :param args: the arguments of the function.
        :param kwargs: the keyword arguments of the function.
        :return: the return value of the function.
        """
        with self.lock(name):
            data = self.get(name, None)
            new_value = func(data, *args, **kwargs)
            self.set(name, new_value)
            return new_value

    def lock(self, name):
        """
        Get a lock on the resource by name
        (with a wait if the lock is already captured).
        Returns a ContextManager-object.
        """
        if name not in self._data_dict:
            self._data_dict[name] = self._manager.list(
                (None, self._manager.RLock()))
        return MocaMultiProcessLock(self._data_dict[name][_LOCK], True)

    def try_lock(self, name):
        """
        Get a lock on the resource by name
        (without waiting, if the lock is already captured).
        Returns a ContextManager-object.
        """
        if name not in self._data_dict:
            self._data_dict[name] = self._manager.list(
                (None, self._manager.RLock()))
        return MocaMultiProcessLock(self._data_dict[name][_LOCK], False)
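# Illustrative usage sketch (assumes MocaSharedMemory, MocaMultiProcessLock and
# the _DATA/_LOCK indices are importable from the original module):
memory = MocaSharedMemory()
memory.set('status', 'running')
memory.increment('jobs_done')        # -> 1
memory.change('status', str.upper)   # -> 'RUNNING'
with memory.lock('jobs_done'):       # serialise a compound read-modify-write
    current = memory.get('jobs_done', 0)
    memory.set('jobs_done', current + 10)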
    bin_locks[bin].release()
    #seq_dict_lock.release()
    #bin_list_lock.release()


if __name__ == '__main__':
    mgr = Manager()
    bins = ['a', 'b', 'c', 'd', 'e']
    num_bins = len(bins)
    seq_dict = mgr.dict()        # seq, bin tuple to idx number
    bin_list = mgr.list()        # idx to counts
    bin_locks = mgr.dict()       # lock for each bin
    seq_dict_lock = mgr.RLock()  # lock for seq_dict
    bin_list_lock = mgr.RLock()  # lock for whole bin list
    for bin in bins:
        bin_locks[bin] = mgr.RLock()
    seqs = mgr.dict()
    seqs['a'] = ['GATC', 'GATC', 'ATC', 'AAG']
    seqs['b'] = ['GATC', 'CCTC', 'TTC', 'GGT']
    seqs['c'] = ['GATC', 'CCTC', 'GCG', 'TCT']
    seqs['d'] = ['GATC', 'AAGT', 'CTC', 'CTC']
    seqs['e'] = ['GATC', 'AAGT', 'CCT', 'CCT']
    ppool = list()
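# Hypothetical worker for the demo above (the real per-bin function is not part
# of this fragment): assign each (seq, bin) pair an index under seq_dict_lock,
# then bump its count under that bin's own lock.
def count_bin(bin, seqs, seq_dict, bin_list, bin_locks, seq_dict_lock, bin_list_lock):
    for seq in seqs[bin]:
        with seq_dict_lock:
            if (seq, bin) not in seq_dict:
                with bin_list_lock:
                    seq_dict[(seq, bin)] = len(bin_list)
                    bin_list.append(0)
        idx = seq_dict[(seq, bin)]
        with bin_locks[bin]:
            bin_list[idx] += 1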
class AsynchronousDataStore(AbstractDataStore):

    def __init__(self):
        super(AsynchronousDataStore).__init__()
        self._manager = Manager()
        self._managed = self._manager.Namespace()
        self._managed.name = "data-logger"
        self._managed.path = "."
        self._managed.entries = self._manager.list()
        self._managed.data = self._manager.dict()
        self._managed.lockers = self._manager.dict()
        self._managed.counters = self._manager.dict()
        self._managed.on_push_callables = self._manager.dict()
        self._managed.on_reset_callables = self._manager.dict()
        self._managed.on_dump_callables = self._manager.dict()

    def get_name(self):
        return self._managed.name

    def set_name(self, name):
        self._managed.name = name

    def get_path(self):
        return self._managed.path

    def is_empty(self):
        return len(self._managed.entries) == 0

    def set_path(self, path):
        self._managed.path = path

    def declare_entry(self, entry, on_push_callables, on_dump_callables,
                      on_reset_callables):
        self._managed.entries.append(entry)
        self._managed.lockers[entry] = self._manager.RLock()
        self._managed.data[entry] = self._manager.dict()
        self._managed.counters[entry] = 0
        self._managed.on_push_callables[entry] = self._manager.list(
            on_push_callables)
        self._managed.on_reset_callables[entry] = self._manager.list(
            on_reset_callables)
        self._managed.on_dump_callables[entry] = self._manager.list(
            on_dump_callables)

    def has_entry(self, entry):
        return entry in self._managed.entries

    def get_entries(self):
        return self._managed.entries

    def get_locker(self, entry):
        return self._managed.lockers[entry]

    def get_push_callables(self, entry):
        return self._managed.on_push_callables[entry]

    def get_reset_callables(self, entry):
        return self._managed.on_reset_callables[entry]

    def get_dump_callables(self, entry):
        return self._managed.on_dump_callables[entry]

    def get_data(self, entry):
        return self._managed.data[entry]

    def append_data(self, entry, time, data):
        self._managed.data[entry][time] = data
        self._managed.counters[entry] += 1

    def clear_data(self, entry):
        self._managed.data[entry] = dict()
        self._managed.counters[entry] = 0

    def get_counter(self, entry):
        return self._managed.counters[entry]
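# Illustrative usage sketch (not part of the original class): entries are
# declared up front, and get_locker() hands back the per-entry managed RLock
# for callers that need to guard concurrent writes.
store = AsynchronousDataStore()
store.declare_entry("loss", on_push_callables=[], on_dump_callables=[],
                    on_reset_callables=[])
with store.get_locker("loss"):
    store.append_data("loss", time=0, data=0.42)
print(store.get_counter("loss"))  # -> 1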
class Browser():
    """
    A browser site manager, with some data collection capabilities.
    When using more than one browser it's highly recommended to open
    them all before using any of them.
    """

    def __init__(self, sites:Iterable[Union[str, Site]]=[], load_timeout:float=20,
                 max_tabs:int=25, headless:bool=False, load_images:bool=True,
                 autoload_videos:bool=False, load_wait:float=3,
                 disable_downloads:bool=False,
                 proxy_dict:Dict[str,List[str]]={'socksProxy':[],'httpProxy':[],'ftpProxy':[],'sslProxy':[]},
                 *, jq_filename:str=jq_file):
        ### Start loading
        jq = AGenerator()
        jq.append(load(jq_filename))

        ## Shared memory
        # Create a manager to create shared objects
        self._manager = Manager()
        # Check sites to be an OrderedList of {Site:depth}
        # Workaround. Improvement needed
        self._links = parse_sites_arg(sites, _manager=self._manager)
        # List to store timed out links
        self._timed_out_links = self._manager.list()
        # Some semaphores to aid in some functionalities
        self.__lock_loaded = self._manager.RLock()
        self.__lock_sited = self._manager.RLock()
        self.__lock_timed = self._manager.RLock()
        self.__lock_proxy = self._manager.RLock()
        # Store all visited sites/domains and how many times each was visited
        self._visited_sites_counter = self._manager.dict()
        self._visited_domains_counter = self._manager.dict()
        # All proxys to be accessed
        self.proxys = self._manager.dict()
        for key, proxy_list in proxy_dict.items():
            shuffle(proxy_list)
            self.proxys[key] = Browser.check_proxys(proxy_list)

        ### PERFORMANCE settings
        # How many tabs per process the browser will be able to open.
        # This setting severely affects the RAM usage and speed of the crawling
        self.max_tabs = max_tabs
        # Should it load the images in the sites it visits?
        # This setting is meant to be used in low-speed connections
        self.load_images = load_images
        # If videos will be automatically loaded without the user's interaction.
        # May be necessary when video scraping
        self.autoload_videos = autoload_videos
        # How much time to wait before raising a timeout exception on a site.
        # This setting can have false positives on low-conn networks, but
        # it is meant to close blank urls
        self.load_timeout = load_timeout
        self.load_wait = load_wait
        # Store the jQuery script as plain text. This should take less time now,
        # since it has been loading async
        self.__jQuery_script = next(jq)

        ### OTHER configurations
        # If the browser will be shown
        self.headless = headless
        # Whether to block download/open file requests
        self.disable_downloads = disable_downloads

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, _, __, ___):
        self.close()

    def open(self, *, options=None, profile=None, capabilities=None,
             overwrite_config:bool=True, proxys:Dict[str,List[str]]=None):
        from selenium import webdriver
        from os import getcwd
        try:
            from Settings.browser_configuration import get_configuration
            from Structures.Async_generator import AGenerator
        except ModuleNotFoundError:
            from .Settings.browser_configuration import get_configuration
            from .Structures.Async_generator import AGenerator
        print(f"Starting the opening of a Browser{' (headless)' if self.headless else ''}")
        #breakpoint()
        if(not proxys is None):
            self.proxys = proxys
        if(options is None or profile is None or capabilities is None or not overwrite_config):
            options2, profile2, capabilities2 = get_configuration(
                tabs_per_window=1000,
                headless=self.headless,
                load_images=self.load_images,
                autoload_videos=self.autoload_videos,
                disable_downloads=self.disable_downloads,
                proxys=self.get_proxy(http=True, ssl=True, ftp=True, socks=True),
                options=options if not overwrite_config else None,
                profile=profile if not overwrite_config else None,
                capabilities=capabilities if not overwrite_config else None)
            if(not overwrite_config or options is None):
                options = options2
            if(not overwrite_config or profile is None):
                profile = profile2
            if(not overwrite_config or capabilities is None):
                capabilities = capabilities2
            del options2, profile2, capabilities2
        print("Configuration complete. Trying to run the drivers. This could take some time...")
        try:
            self.driver = webdriver.Firefox(executable_path=f"{getcwd()}//geckodriver",
                                            options=options,
                                            firefox_profile=profile,
                                            capabilities=capabilities)
            #self._profile = profile
            print("Drivers ran successfully!")
        except Exception as ex:
            print(f"Looks like something failed. Error message: {ex}")
            self.driver = webdriver.Firefox()
            self.driver.get("about:config")
            self.driver.execute_script("document.querySelector('button').click();")
            #breakpoint()

        ### After driver configs
        #self.driver.set_page_load_timeout(0)
        self.driver.set_page_load_timeout(self.load_timeout)

        ### Non-shared memory
        self._site_from_tab = {}
        ## Async site_sleep
        self._async_site_sleep = AGenerator()
        print("New Browser opened")

    def close(self):
        self.driver.quit()

    def _get_sites(self, *, _number:int=1):
        # Ask for access
        self.__lock_sited.acquire()
        # There's work to be done. It's not the same a depth-0 site as a
        # depth-N one. Should be able to get more than one in one request
        if(not len(self._links)):
            self.__lock_sited.release()
            return None
        result = self._links[0]
        self._links.pop(0)
        self.__lock_sited.release()
        return result

    def _get_loaded_sites(self, *, wait:bool=True) -> Iterable:
        sites = []
        while(not len(sites)):
            sites = list(filter(None, next(self._async_site_sleep)))
            for site in sites:
                self._site_from_tab[site.tab] = site
            if(len(self._site_from_tab) or not wait):
                return self._site_from_tab.values()

    def open_tab(self, *, link:str=None, site:Site=None) -> int:
        self.__lock_timed.acquire()
        self.driver.switch_to.window(self.driver.window_handles[0])
        self.driver.execute_script("window.open();")
        # Switch to new tab instead of next
        self.driver.switch_to.window(self.driver.window_handles[
            self.driver.window_handles.index(self.driver.current_window_handle)+1])
        tab = self.driver.current_window_handle
        self.__lock_timed.release()
        if(not link is None):
            self.open_link(link, new_tab=False, site=site)
        return tab

    def open_link(self, link:str, *, new_tab:bool=False, site:Site=None, wait_load:bool=True):
        tab = self.driver.current_window_handle
        bef_tab = self.driver.window_handles.index(tab)
        if(new_tab):
            tab = self.open_tab(link=None)
        self.__lock_timed.acquire()
        try:
            self.driver.switch_to.window(tab)
            self.driver.get(link)
        except TimeoutException:
            pass
        self.__lock_timed.release()
        # window.array = [performance.getEntries()]; window.interval = setInterval(function foo(){var a = performance.getEntries(); if(window.array[window.array.length-1].length != a.length){window.array.push(a);}},200)
        domain = Browser.domain_from_link(link)
        self._visited_sites_counter[link] = self._visited_sites_counter.get(link,0) + 1
        self._visited_domains_counter[domain] = self._visited_domains_counter.get(domain,0) + 1
        if(site is None):
            site = Site(link, tab=tab)
        else:
            site.tab = tab
            site.link = link
        self._site_from_tab[tab] = site
        if(wait_load):
            self._async_site_sleep.append(self.async_sleep(tab, self.load_wait, _return=site))
        return site

    async def async_sleep(self, tab, load_wait:float, *, _return=None, period:float=0.1):
        load_start = datetime.now()
        timeout = load_start + timedelta(seconds=int(load_wait))
        while(not ready_state(self.driver, tab, self.__lock_timed) and (datetime.now() < timeout)):
            self.__lock_timed.release()
            await asleep(period)
        self.__lock_timed.release()
        if(timeout > datetime.now()):
            return
        await asleep(load_wait)
        return _return

    # open_link seems to be faster than async_open_link
    def async_open_link(self, link:str, *, new_tab:bool=False, site:Site=None):
        #raise DeprecationWarning()
        if(not link is None):
            self._async_site_sleep.append(self.__a_open_link(link, new_tab=new_tab, site=site))

    async def __a_open_link(self, link:str, *, new_tab:bool=False, site:Site=None):
        bef_tab = self.driver.window_handles.index(self.driver.current_window_handle)
        if(new_tab):
            tab = self.open_tab(link=None)
        else:
            tab = self.driver.current_window_handle
        try:
            self.driver.get(link)
        except TimeoutException:
            print(f"The site timed out. Skipping {link}")
            self._timed_out_links.append(link)
            self.driver.execute_script("window.close();")
            self.switch_to_window(self.driver.window_handles[bef_tab%len(self.driver.window_handles)])
            return
        domain = Browser.domain_from_link(link)
        self._visited_sites_counter[link] = self._visited_sites_counter.get(link,0) + 1
        self._visited_domains_counter[domain] = self._visited_domains_counter.get(domain,0) + 1
        if(site is None):
            site = Site(link, tab=tab)
        else:
            site.tab = tab
            site.link = link
        self._site_from_tab[tab] = site
        await asleep(self.load_wait)
        return site

    def old_open_link(self, link:str, *, new_tab:bool=False, site:Site=None):
        if(new_tab):
            tab = self.open_tab(link=None)
        else:
            tab = self.driver.current_window_handle
        try:
            self.driver.get(link)
        except TimeoutException:
            print(f"The site timed out. Skipping {link}")
            self._timed_out_links.append(link)
            self.driver.execute_script("window.close();")
            self.switch_to_window(self.driver.window_handles[0])
            return
        domain = Browser.domain_from_link(link)
        self._visited_sites_counter[link] = self._visited_sites_counter.get(link,0) + 1
        self._visited_domains_counter[domain] = self._visited_domains_counter.get(domain,0) + 1
        if(site is None):
            site = Site(link, tab=tab)
        else:
            site.tab = tab
            site.link = link
        self._site_from_tab[tab] = site
        sleep(self.load_wait)
        return site

    def switch_to_window(self, tab):
        if(tab in self.driver.window_handles):
            self.driver.switch_to_window(tab)
            return
        self.driver.switch_to_window(self.driver.window_handles[0])

    def restore_timed_out(self):
        self.__lock_timed.acquire()
        for zombie_tab in [tab for tab in self.driver.window_handles[1:]
                           if not tab in self._site_from_tab.keys()]:
            print("Found a non-properly closed tab. Closing it...")
            self.close_tab(zombie_tab)
        self._links.extend(self._timed_out_links)
        while(len(self._timed_out_links)):
            self._timed_out_links.pop(0)
        self.__lock_timed.release()

    def extract_text(self, element:str="document") -> str:
        return self.driver.execute_script(
            f"var query={element}.evaluate('//*[not(self::script)][not(self::style)]/text()',{element},null,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,null);"
            "return Array(query.snapshotLength).fill(0).map((element,index) => query.snapshotItem(index)"
            ").map(x => function(e){if(e.data.replace('\\n','').trim()){return e.data;} return '';}(x)).join('\\n');")

    def extract_hrefs(self, element:str="document", *, site:Site=None):
        if(site is None):
            return list(filter(validate_url, self.driver.execute_script(
                f"var query={element}.evaluate('//body[1]//@href[not(self::script)][not(self::link)][not(self::style)]',{element},null,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,null);"
                "return Array.from(new Set(Array(query.snapshotLength).fill(0).map((element,index) => query.snapshotItem(index).value)"
                ".map(x => function(e){if(e[0] == '/'){return location.origin+e;} return e}(x))))"
            )))
        site.hrefs = list(filter(validate_url, self.driver.execute_script(
            f"var query={element}.evaluate('//@href',{element},null,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,null);"
            "return Array.from(new Set(Array(query.snapshotLength).fill(0).map((element,index) => query.snapshotItem(index).value)"
            ".map(x => function(e){if(e[0] == '/'){return location.origin+e;} return e}(x))))"
        )))
        return site.hrefs

    def extract_buttons(self):
        return self.driver.execute_script("""
            var _ = document.evaluate("//*", document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
            return (jQuery(Array.from(new Set(Array(_.snapshotLength).fill(0).map((element, index) => _.snapshotItem(index).nodeName))).filter(function(value,_,_){return value!="IFRAME"}).join(", ")).contents().filter(function b(e){return jQuery._data(e, 'events') != undefined;})).toArray()
        """)

    """
    var c = (jQuery(window.some||Array.from(new Set(Array(_.snapshotLength).fill(0).map((element, index) => _.snapshotItem(index).nodeName))).filter(function(value,_,_){return value!="IFRAME"}).join(", ")).contents().toArray().filter(function b(e){return e.click != undefined;}));
    window.some = Array.from(new Set(c.map(function d(e){return e.nodeName;}))).filter(function f(e){return e != "IFRAME"}).join(', ')
    """

    def extract_video(self):
        return self.driver.execute_script("return performance.getEntries().map(e => (e.initiatorType == 'xmlhttprequest' && e.name.split('?')[0])).filter(e => (e && (e.endsWith('.mp4') || e.endsWith('.m3u8'))));")

    def store_cookies(self, filename:str="cookies.dat"):
        import pickle
        pickle.dump(self.driver.get_cookies(), open(filename, 'ab'))

    def load_cookies(self, filename:str="cookies.dat"):
        import pickle
        link_bef = self.driver.current_url
        for cookie in pickle.load(open(filename,'rb')):
            self.inject_cookie(cookie)
            print(f"Loaded cookie: {cookie}")
        self.driver.get(link_bef)

    def inject_cookie(self, cookie:dict):
        domain = cookie.get("domain",None)
        if(not domain is None):
            if(domain[0] == '.'):
                domain = "www"+domain
            if("about" in self.driver.current_url or Browser.domain_from_link(self.driver.current_url) != domain):
                self.driver.get(f"http://{domain}")
        else:
            if("about" in self.driver.current_url):
                self.driver.get("https://duckduckgo.com")
        if(isinstance(cookie, str)):
            raise NotImplementedError
            #cookie = cookie.split(":")
            #self.driver.add_cookie(dict(zip([][:len(cookie)],cookie)))
        self.driver.add_cookie(cookie)

    def inject_jQuery(self):
        self.driver.execute_script("window.el = document.createElement('script'); n = function(e){"
                                   "e.type='text/javascript';"
                                   f"e.innerHTML={self.__jQuery_script};e.onload=function()"
                                   "{console.log('Checking jQuery');jQuery.noConflict();console.log('jQuery injected')};"
                                   "console.log('Injecting');document.head.appendChild(e);console.log(e);}(el);")

    def get_proxy(self, *, http:bool=False, ssl:bool=False, ftp:bool=False, socks:bool=False) -> dict:
        result = {}
        self.__lock_proxy.acquire()
        if(http and len(self.proxys.get('httpProxy', []))):
            aux = self.proxys['httpProxy'][0]
            result['httpProxy'] = aux
            self.proxys['httpProxy'].pop(0)
            self.proxys['httpProxy'].append(aux)
        if(ssl and len(self.proxys.get('sslProxy', []))):
            aux = self.proxys['sslProxy'][0]
            result['sslProxy'] = aux
            self.proxys['sslProxy'].pop(0)
            self.proxys['sslProxy'].append(aux)
        if(ftp and len(self.proxys.get('ftpProxy', []))):
            aux = self.proxys['ftpProxy'][0]
            result['ftpProxy'] = aux
            self.proxys['ftpProxy'].pop(0)
            self.proxys['ftpProxy'].append(aux)
        if(socks and len(self.proxys.get('socksProxy', []))):
            aux = self.proxys['socksProxy'][0]
            result['socksProxy'] = aux
            self.proxys['socksProxy'].pop(0)
            self.proxys['socksProxy'].append(aux)
        self.__lock_proxy.release()
        return result

    def set_proxy(self, httpProxy:str=None, sslProxy:str=None, ftpProxy:str=None, socksProxy:str=None) -> None:
        self.driver.execute("SET_CONTEXT", {"context": "chrome"})
        if(not httpProxy is None):
            try:
                self.driver.execute_script("""
                    Services.prefs.setCharPref("network.proxy.http", arguments[0]);
                    Services.prefs.setIntPref("network.proxy.http_port", Number(arguments[1]));
                """, *httpProxy.split(':'))
            except:
                pass
        if(not sslProxy is None):
            try:
                self.driver.execute_script("""
                    Services.prefs.setCharPref("network.proxy.ssl", arguments[0]);
                    Services.prefs.setIntPref("network.proxy.ssl_port", Number(arguments[1]));
                """, *sslProxy.split(':'))
            except:
                pass
        if(not ftpProxy is None):
            try:
                self.driver.execute_script("""
                    Services.prefs.setCharPref('network.proxy.ftp', arguments[0]);
                    Services.prefs.setIntPref('network.proxy.ftp_port', Number(arguments[1]));
                """, *ftpProxy.split(':'))
            except:
                pass
        if(not socksProxy is None):
            try:
                self.driver.execute_script("""
                    Services.prefs.setCharPref('network.proxy.socks', arguments[0]);
                    Services.prefs.setIntPref('network.proxy.socks_port', Number(arguments[1]));
                """, *socksProxy.split(':'))
            except:
                pass
        self.driver.execute("SET_CONTEXT", {"context": "content"})

    def close_tab(self, tab:int=None):
        if(not tab is None):
            if(not tab in self.driver.window_handles):
                #breakpoint()
                self._site_from_tab.pop(tab)
                for zombie_tab in [tab for tab in self.driver.window_handles[1:]
                                   if not tab in self._site_from_tab.keys()]:
                    print("Found a non-properly closed tab. Closing it...")
                    self.close_tab(zombie_tab)
                return
            self.__lock_timed.acquire()
            self.driver.switch_to.window(tab)
        else:
            self.__lock_timed.acquire()
        actual_tab = self.driver.current_window_handle
        self.driver.execute_script("window.close();")
        self.__lock_timed.release()
        # Suddenly this stopped working. Whatever
        try:
            self._site_from_tab.pop(actual_tab)
        except KeyError:
            pass

    # Aesthetics function
    def set_size_pos(self, size:Tuple[int,int], position:Tuple[int,int]=(0,0)) -> None:
        self.driver.set_window_rect(*position, *size)

    @staticmethod
    def domain_from_link(link:str) -> Optional[str]:
        return re_search(r"(?<=:\/\/)?(([A-Z]|[a-z]|[0-9])+\.)+([A-Z]|[a-z]|[0-9])+", link).group()

    @staticmethod
    def check_proxys(proxys:list) -> list:
        return proxys
def run(self):
    time0 = time.time()
    final_data_out = {}
    mytxt = '0, started loading in data on %s.' % (
        datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
    logging.info(mytxt)
    self.emitString.emit(mytxt)
    #
    libraries_dict = plexcore.get_libraries(self.token, fullURL=self.fullURL,
                                            do_full=True)
    if not any(
            map(lambda value: 'show' in value[-1], libraries_dict.values())):
        raise ValueError('Error, could not find TV shows.')
    library_name = max(
        map(
            lambda key: libraries_dict[key][0],
            filter(lambda key: libraries_dict[key][1] == 'show',
                   libraries_dict)))
    final_data_out['library_name'] = library_name
    mytxt = '1, found TV library in %0.3f seconds.' % (time.time() - time0)
    logging.info(mytxt)
    self.emitString.emit(mytxt)
    #
    if self.tvdata_on_plex is None:
        self.tvdata_on_plex = plexcore.get_library_data(
            library_name, fullURL=self.fullURL, token=self.token,
            num_threads=self.num_threads)
    if self.tvdata_on_plex is None:
        raise ValueError('Error, could not find TV shows on the server.')
    mytxt = '2, loaded TV data from Plex server in %0.3f seconds.' % (
        time.time() - time0)
    logging.info(mytxt)
    self.emitString.emit(mytxt)
    #
    ## using a stupid-ass pattern to shave some seconds off...
    manager = Manager()
    shared_list = manager.list()
    myLock = manager.RLock()
    myStage = manager.Value('stage', 2)

    #
    def _process_didend():
        if self.didend is not None:
            shared_list.append(('didend', self.didend))
            return
        didEnd = plextvdb.get_all_series_didend(self.tvdata_on_plex,
                                                verify=self.verify,
                                                tvdb_token=self.tvdb_token)
        myLock.acquire()
        myStage.value += 1
        mytxt = '%d, added information on whether shows ended in %0.3f seconds.' % (
            myStage.value, time.time() - time0)
        logging.info(mytxt)
        self.emitString.emit(mytxt)
        myLock.release()
        shared_list.append(('didend', didEnd))

    def _process_missing():
        if self.toGet is not None:
            shared_list.append(('toGet', self.toGet))
            return
        toGet = plextvdb.get_remaining_episodes(
            self.tvdata_on_plex, showSpecials=False,
            showsToExclude=self.showsToExclude, verify=self.verify,
            token=self.tvdb_token)
        myLock.acquire()
        myStage.value += 1
        mytxt = '%d, found missing episodes in %0.3f seconds.' % (
            myStage.value, time.time() - time0)
        logging.info(mytxt)
        self.emitString.emit(mytxt)
        myLock.release()
        shared_list.append(('toGet', toGet))

    def _process_plot_tvshowstats():
        tvdata_date_dict = plextvdb.get_tvdata_ordered_by_date(
            self.tvdata_on_plex)
        years_have = set(map(lambda date: date.year, tvdata_date_dict))
        with multiprocessing.Pool(
                processes=multiprocessing.cpu_count()) as pool:
            figdictdata = dict(
                pool.map(
                    lambda year: (year, plextvdb.create_plot_year_tvdata(
                        tvdata_date_dict, year, shouldPlot=False)),
                    years_have))
        myLock.acquire()
        myStage.value += 1
        mytxt = '%d, made plots of tv shows added in %d years in %0.3f seconds.' % (
            myStage.value, len(years_have), time.time() - time0)
        logging.info(mytxt)
        self.emitString.emit(mytxt)
        myLock.release()
        shared_list.append(('plotYears', figdictdata))

    jobs = [
        Process(target=_process_didend),
        Process(target=_process_missing)
    ]  # Process( target = _process_plot_tvshowstats ) ]
    for process in jobs:
        process.start()
    for process in jobs:
        process.join()
    #
    final_data = dict(shared_list)
    assert (set(final_data) == set(['didend', 'toGet']))
    didend = final_data['didend']
    toGet = final_data['toGet']
    for seriesName in self.tvdata_on_plex:
        self.tvdata_on_plex[seriesName]['didEnd'] = didend[seriesName]
    final_data_out['tvdata_on_plex'] = self.tvdata_on_plex
    mytxt = '%d, finished loading in all data on %s.' % (
        myStage.value + 1,
        datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
    logging.info(mytxt)
    self.emitString.emit(mytxt)
    missing_eps = dict(
        map(
            lambda seriesName: (seriesName, toGet[seriesName]['episodes']),
            set(self.tvdata_on_plex) & set(toGet) -
            set(self.showsToExclude)))
    final_data_out['missing_eps'] = missing_eps
    self.finalData.emit(final_data_out)
class DataLogger(metaclass=Singleton):
    """Stores and saves various types of data under various forms."""

    @staticmethod
    def _futures_callback(future: Future):
        """Called at future completion."""
        if future.exception():
            print(
                f"Future {future} raised the exception {repr(future.exception())}"
            )

    @staticmethod
    def _push(managed, entry, value, time):
        """Push method called by the pool executors"""
        with managed.lockers[entry]:
            managed.data[entry][time] = value
            managed.counters[entry] += 1
            for f in managed.on_push_callables[entry]:
                try:
                    f(entry, managed.data[entry], path=managed.path)
                except Exception as e:
                    logging.getLogger("datalogger").warning(
                        f"{managed.name} DataLogger: function {f} of {entry} failed: {e}"
                    )

    @staticmethod
    def _dump(managed, entry):
        """Dump method called by the pool executors"""
        with managed.lockers[entry]:
            for f in managed.on_dump_callables[entry]:
                try:
                    f(entry, managed.data[entry], path=managed.path)
                except Exception as e:
                    logging.getLogger("datalogger").warning(
                        f"{managed.name} DataLogger: function {f} of {entry} failed: {e}"
                    )

    @staticmethod
    def _reset(managed, entry):
        """Inner reset method called by the pool executor"""
        with managed.lockers[entry]:
            for f in managed.on_reset_callables[entry]:
                try:
                    f(entry, managed.data[entry], path=managed.path)
                except Exception as e:
                    logging.getLogger("datalogger").warning(
                        f"{managed.name} DataLogger: function {f} of {entry} failed: {e}"
                    )
            managed.data[entry].clear()
            managed.counters[entry] = 0

    def __init__(self):
        # Init and set attributes
        super(DataLogger, self).__init__()
        # Managed resources (accessible by remote threads or remote processes)
        self._manager = Manager()
        self._managed = self._manager.Namespace()
        self._managed.name = "data-logger"
        self._managed.path = "."
        self._managed.entries = self._manager.list()
        self._managed.data = self._manager.dict()
        self._managed.lockers = self._manager.dict()
        self._managed.counters = self._manager.dict()
        self._managed.on_push_callables = self._manager.dict()
        self._managed.on_reset_callables = self._manager.dict()
        self._managed.on_dump_callables = self._manager.dict()
        self.tick = datetime.datetime.now()
        self.futures = list()
        self.pool = ThreadPoolExecutor(max_workers=1)
        # Log
        logging.getLogger("datalogger").info(
            "{} DataLogger initialized!".format(self._managed.name))

    def set_path(self, path):
        """Sets the root path of the logger. Used by all the handlers that
        write on disk.

        :param string path: A valid path to write the data in.
        """
        if len(self._managed.lockers) != 0:
            raise Exception(
                "You tried to change logger path after having registered some entries."
            )
        os.makedirs(path, exist_ok=True)
        self._managed.path = path

    def set_pool(self, pool, n_par=5):
        """Sets the executor to be used to call handlers.

        :param string pool: The type of executor to use to call handlers.
            Either "thread" or "process".
        :param int n_par: The number of executors to use.
        """
        if len(self._managed.lockers) != 0:
            raise Exception(
                "You tried to set the pool after having registered some entries.")
        if pool == "thread":
            self.pool = ThreadPoolExecutor(max_workers=n_par)
        elif pool == "process":
            self.pool = ProcessPoolExecutor(max_workers=n_par)
        else:
            raise Exception(f"Unknown pool type `{pool}`")

    def set_name(self, name):
        """Sets the name of the logger.

        :param string name: Name of the logger
        """
        self._managed.name = name

    def declare(self, entry, on_push_callables, on_dump_callables,
                on_reset_callables):
        """Register a recurring log entry. Registering an entry gives access
        to the `push`, `reset` and `dump` methods. Note that all the handlers
        must be able to handle the data that will be pushed.

        :param string entry: Name of the log entry.
        :param List[handlers] on_push_callables: Handlers called on data when `push` is called.
        :param List[handlers] on_reset_callables: Handlers called on data when `reset` is called.
        :param List[handlers] on_dump_callables: Handlers called on the data when `dump` is called.
        """
        if entry in self._managed.entries:
            raise Exception("You tried to declare an existing log entry")
        self._managed.entries.append(entry)
        self._managed.lockers[entry] = self._manager.RLock()
        self._managed.data[entry] = self._manager.dict()
        self._managed.counters[entry] = 0
        self._managed.on_push_callables[entry] = self._manager.list(
            on_push_callables)
        self._managed.on_reset_callables[entry] = self._manager.list(
            on_reset_callables)
        self._managed.on_dump_callables[entry] = self._manager.list(
            on_dump_callables)
        if os.path.dirname(entry) != "":
            os.makedirs(os.path.join(self._managed.path,
                                     os.path.dirname(entry)),
                        exist_ok=True)

    def push(self, entry, value, time=None):
        """Append data to a recurring log. All handlers registered for the
        `on_push` event will be called.

        :param string entry: Name of the log entry
        :param Any value: Object containing the data to log. Should be of same
            type from call to call...
        :param int or None time: Date of the logging (epoch, iteration, tic ...).
            Will be used as key in the data dictionary. If `None`, the last
            data key plus one will be used.
        """
        future = self.pool.submit(
            DataLogger._push, self._managed, entry, value,
            time if time is not None else self._managed.counters[entry])
        future.add_done_callback(DataLogger._futures_callback)
        self.futures.append(future)

    def dump(self):
        """Calls handlers declared for `on_dump` event, for all registered log
        entries.
        """
        for entry in self._managed.entries:
            future = self.pool.submit(DataLogger._dump, self._managed, entry)
            future.add_done_callback(DataLogger._futures_callback)
            self.futures.append(future)

    def reset(self, entry):
        """Resets the data of a recurring log entry. All handlers registered
        for the `on_reset` event will be called before the storage is emptied.

        :param string entry: name of the log entry.
        """
        future = self.pool.submit(DataLogger._reset, self._managed, entry)
        future.add_done_callback(DataLogger._futures_callback)
        self.futures.append(future)

    def get_entry_length(self, entry):
        """Retrieves the number of data saved for a log entry.

        :param string entry: Name of the log entry
        :return: Number of data pieces in the entry storage
        :rtype: int
        """
        return self._managed.counters[entry]

    def get_serie(self, entry):
        """Returns the data in a list ordered by keys.

        :param string entry: Name of the log entry
        :return: Serie of data ordered by key
        :rtype: List[any]
        """
        return [i[1] for i in sorted(self._managed.data[entry].items())]

    def wait(self, log_durations=True):
        """Wait for the handling queue to be emptied.

        :param bool log_durations: Whether to log the wait duration.
        """
        b = datetime.datetime.now()
        while True:
            self.futures = list(filter(lambda x: not x.done(), self.futures))
            if self.futures:
                time.sleep(.1)
            else:
                break
        if log_durations:
            logging.getLogger("datalogger").info(
                f"{self._managed.name} DataLogger: Last wait occurred {self.tick - b} ago."
            )
            logging.getLogger("datalogger").info(
                f"{self._managed.name} DataLogger: Waited {datetime.datetime.now() - b} for completion."
            )
        self.tick = datetime.datetime.now()
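# Illustrative usage sketch (handler signature inferred from _push/_dump above:
# callable(entry, data, path=...)); the entry name and values are made up.
def print_handler(entry, data, path="."):
    print(f"{entry}: {len(data)} records (root: {path})")


logger = DataLogger()
logger.set_path("./logs")
logger.declare("train/loss",
               on_push_callables=[print_handler],
               on_dump_callables=[print_handler],
               on_reset_callables=[])
logger.push("train/loss", 0.73)          # time defaults to the entry counter
logger.push("train/loss", 0.58, time=1)
logger.dump()
logger.wait()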