def CommandPull():
    """Run the PBGet pull command over the configured packages.

    Exits the process with status 1 if ``UE4Editor.exe`` is running.
    Packages returned by ``IgnoreExistingInstallations`` are handed to
    ``ProcessPackage`` on a pool of ``cpu_count()`` threads, and the call
    blocks until every worker has finished.
    """
    LogSuccess("\nInitiating PBGet pull command...", False)
    print("\n*************************\n")

    # Refuse to run while the editor process is alive.
    editor_running = PBTools.CheckRunningProcess("UE4Editor.exe")
    if editor_running:
        LogError(
            "Unreal Editor is running. Please close it before running pull command"
        )
        sys.exit(1)

    # Read the package manifest, then print the table header.
    manifest = ET.parse(config_name)
    row_format = '{:<28} {:<37} {:<10}'
    print(row_format.format(" ~Package Name~", "~Version~", "~Result~"))

    remaining = IgnoreExistingInstallations(manifest.getroot())
    work_items = list(remaining.findall("package"))

    # Dispatch every package, then wait for the workers to drain.
    workers = ThreadPool(cpu_count())
    workers.map_async(ProcessPackage, work_items)
    workers.close()
    workers.join()
def __run_test_read_hive_inserts(self, unique_database, partitioned):
    """Check that Impala can read a single insert only ACID table (over)written
    by Hive several times. Consistency can be checked by using incremental
    values for overwrites ('run') and inserts ('i').

    Creates ``<unique_database>.test_read_hive_inserts`` (partitioned by
    ``p int`` when ``partitioned`` is true), then runs the "impala" and "hive"
    roles concurrently on two threads and waits up to 600 seconds for both.
    An exception raised by either role is printed and re-raised here.
    """
    tbl_name = "%s.test_read_hive_inserts" % unique_database
    part_expr = "partitioned by (p int)" if partitioned else ""
    CREATE_SQL = (
        "create table %s (run int, i int) %s TBLPROPERTIES ( "
        "'transactional_properties' = 'insert_only', 'transactional' = 'true') "
    ) % (tbl_name, part_expr)
    self.client.execute(CREATE_SQL)

    def do_role(role):
        try:
            if role == "hive":
                self.__hive_role_write_hive_inserts(tbl_name, partitioned)
            else:
                self.__impala_role_read_hive_inserts(tbl_name)
        except Exception:
            # Print in the worker thread so the failing role is identifiable.
            traceback.print_exc()
            raise

    # TODO: CTRL+C can't interrupt the test
    pool = ThreadPool(processes=2)
    try:
        pool.map_async(do_role, ["impala", "hive"]).get(600)
    finally:
        # Shut the pool down on success, failure and timeout alike so its
        # threads are not left behind for the rest of the test session.
        pool.terminate()
def addBounties(bounties):
    """Add a list of bounties in parallel using multiprocessing.Pool for verification

    ``verify`` and ``internalVerify`` are mapped over ``bounties`` on a
    thread pool. One status code is returned per bounty, in input order:

        -3  the ``internalVerify`` result is falsy
        -2  the ``verify`` result is falsy
        -1  the bounty is already in ``bountyList``
         0  ``internalVerify`` returned -1
         1  none of the above; the bounty is passed to ``addValidBounty``
    """
    from multiprocessing.pool import ThreadPool
    pool = ThreadPool()
    try:
        safeprint("Mapping verifications", verbosity=3)
        # Start the external check first so it overlaps with the internal one.
        # (The result used to be bound to ``async``, a reserved word since
        # Python 3.7.)
        pending_external = pool.map_async(verify, bounties)
        internal = pool.map(internalVerify, bounties)
        safeprint("Waiting for verifications", verbosity=3)
        external = pending_external.get()
        safeprint("Received verifications", verbosity=3)
    finally:
        # All work has been collected (or failed); let the threads exit.
        pool.close()

    rvals = []
    safeprint(internal)
    safeprint(external)
    total = len(bounties)
    for position, (bounty, internal_ok, external_ok) in enumerate(
            zip(bounties, internal, external), 1):
        safeprint("Finishing the processing of bounty " + str(position) + "/" + str(total),
                  verbosity=2)
        if not internal_ok:
            code = -3
        elif not external_ok:
            code = -2
        elif bounty in bountyList:
            code = -1
        elif internal_ok == -1:
            code = 0
        else:
            code = 1
        rvals.append(code)
        safeprint("Passed first if", verbosity=3)
        if code == 1:
            addValidBounty(bounty)
    safeprint("Verifications parsed", verbosity=3)
    return rvals
def follow(self, penguin_id, dx=0, dy=0):
    """Start following the penguin identified by ``penguin_id``.

    ``dx`` and ``dy`` are validated through ``_require_int`` and stored,
    with the target id, in ``self._follow``. The target's clothing
    attributes are copied onto ``self``, ``walk`` is called with the
    target's position plus the offset, and ``add_buddy`` is called, all on
    a thread pool that is joined before returning.
    """
    @self._safe
    def equip(item_name):
        # Mirror a single attribute of the target onto this penguin.
        setattr(self, item_name, getattr(target, item_name))

    dx = self._require_int("dx", dx)
    dy = self._require_int("dy", dy)
    target = self.get_penguin(penguin_id)
    self._info('Following "{}"...'.format(target.name))

    if target.id == self.id:
        self._error("Cannot follow self")

    if target.id not in self._penguins:
        # Not in the current room: only an online buddy can be joined.
        online_buddy = (target.id in self.buddies
                        and self.buddies[target.id].online)
        if not online_buddy:
            self._error('Penguin "{}" not in room'.format(target.name))
        self.room = self.find_buddy(target.id)

    self._follow = (target.id, dx, dy)

    outfit = [
        "color", "head", "face", "neck", "body", "hand", "feet", "pin",
        "background"
    ]
    destination = (target.x + dx, target.y + dy)

    workers = ThreadPool()
    workers.map_async(equip, outfit)
    workers.apply_async(self._safe(self.walk), destination)
    workers.apply_async(self._safe(self.add_buddy), (penguin_id, ))
    workers.close()
    workers.join()
def get_rxns_kegg(reactome, threads=20):
    """Apply ``get_kegg_rxn_kegg`` to every item of ``reactome`` on a thread pool.

    Returns the per-item results as a list, in input order. If any worker
    call raises, the failure is not reported and an empty list is returned.
    """
    workers = ThreadPool(processes=threads)
    outcome = workers.map_async(get_kegg_rxn_kegg, reactome)
    workers.close()
    workers.join()

    # After join() the result is ready; it only carries values on success.
    if not outcome.successful():
        return []
    return list(outcome.get())
def got_password_entries(self):
    """Probe every stored login for a password-policy endpoint.

    When ``GlobalState.options.no_password_policies`` is set, the
    ChoosePasswordsPanel is shown immediately. Otherwise each entry of
    ``GlobalState.logins`` is checked on a thread pool by fetching
    ``<scheme>://<domain>/.well-known/password-policy``. A usable answer is
    stored as ``login['rule']``. The panel is shown from the pool callback
    once all checks have finished, and a 'wait' message is published in the
    meantime.
    """
    if GlobalState.options.no_password_policies:
        self.controller.show_panel(views.ChoosePasswordsPanel)
        return

    def check_password_update_endpoint(login):
        print("%s %s" % (time.time(), threading.current_thread()))
        if not login.get('domain'):
            return None
        print("checking %s" % login['domain'])
        scheme = login['scheme'] if GlobalState.options.ssl_not_required else 'https'
        announce_url = "%s://%s/.well-known/password-policy" % (scheme, login['domain'])
        try:
            result = requests.get(announce_url, verify=True, allow_redirects=False, timeout=5)
        except Exception as e:
            print(e)
            return
        if result.status_code != 200:
            return
        try:
            # SECURITY(review): this parses a document served by a remote
            # site. Plain yaml.load can construct arbitrary Python objects;
            # switch to yaml.safe_load unless custom tags are required.
            data = yaml.load(result.content)
        except Exception as e:
            print(e)
            return
        # Only a mapping with a site-relative 'endpoint' is usable.
        if not isinstance(data, dict) or not data.get('endpoint') or not data['endpoint'].startswith('/'):
            return
        login['rule'] = PasswordEndpointRule(login['domain'], announce_url, data)
        print("got %s %s" % (login['domain'], data))

    def check_complete(results):
        print("DONE")
        # Invoked from a pool thread; hand the UI update to wx via CallAfter.
        wx.CallAfter(self.controller.show_panel, views.ChoosePasswordsPanel)

    # NOTE(review): the callback only fires if no worker raises; an
    # unexpected exception in a check would leave the wait panel up.
    pool = ThreadPool(processes=50)
    pool.map_async(check_password_update_endpoint, GlobalState.logins, callback=check_complete)
    # No more work will be submitted: let the 50 threads exit once the queued
    # checks are done. close() does not block.
    pool.close()
    pub.sendMessage('wait')
def main():
    """Run ``worker`` over 100 one-element argument tuples on a 3-thread pool."""
    # ``processes`` sets the number of pool workers; the default is the CPU count.
    # TODO: what happens to stdout/stderr written by the threads, and why?
    workers = ThreadPool(processes=3)

    # Alternatives that were tried and left disabled:
    #   workers.map(worker, tasks)       -- blocking, like the built-in map
    #   workers.apply_async(worker, args=("test_worker", ), kwds={}, callback=None)
    # TODO: chunksize?
    # TODO: callback?
    tasks = [("test_worker_{}".format(index), ) for index in range(100)]
    workers.map_async(worker, tasks)

    # TODO: what happened? Look at _worker_handler / _task_handler /
    # _result_handler, the pool class, and the _multiprocessing C lib.
    workers.close()
    workers.join()
def createthreadparser(thread_count, files):
    """Run ``getinn`` over ``files`` on ``int(thread_count)`` threads.

    Blocks until all work has finished; always returns None.
    """
    worker_total = int(thread_count)
    workers = ThreadPool(worker_total)
    workers.map_async(getinn, files)
    workers.close()
    workers.join()
def command_clean():
    """Run the PBGet clean command over every package in the config file.

    Exits the process with status 1 if ``UE4Editor.exe`` is running.
    ``clean_package`` is called for each ``package`` element, sequentially
    when ``no_threading`` is set, otherwise on a pool of ``cpu_count()``
    threads that is joined before returning.
    """
    log_success("\nInitiating PBGet clean command...", False)
    print("\n*************************\n")

    # Do not execute if Unreal Editor is running
    if PBTools.check_running_process("UE4Editor.exe"):
        # This message used to say "pull command", copied from command_pull.
        log_error(
            "Unreal Editor is running. Please close it before running clean command"
        )
        sys.exit(1)

    # Parse packages xml file
    config_xml = ET.parse(config_name)
    packages = config_xml.getroot().findall("package")

    if no_threading:
        for package in packages:
            clean_package(package)
        return

    # Async process packages, then release the threads.
    pool = ThreadPool(cpu_count())
    pool.map_async(clean_package, packages)
    pool.close()
    pool.join()
def command_pull():
    """Run the PBGet pull command over the configured packages.

    Exits the process with status 1 if ``UE4Editor.exe`` is running.
    Packages returned by ``ignore_existing_installations`` go to
    ``process_package``, one by one when ``no_threading`` is set, otherwise
    on a pool of ``cpu_count()`` threads that is joined before returning.
    """
    log_success("\nInitiating PBGet pull command...", False)
    print("\n*************************\n")

    # Refuse to run while the editor process is alive.
    editor_running = PBTools.check_running_process("UE4Editor.exe")
    if editor_running:
        log_error(
            "Unreal Editor is running. Please close it before running pull command"
        )
        sys.exit(1)

    # Read the package manifest, then print the table header.
    manifest = ET.parse(config_name)
    row_format = '{:<28} {:<37} {:<10}'
    print(row_format.format(" ~Package Name~", "~Version~", "~Result~"))

    remaining = ignore_existing_installations(manifest.getroot())
    work_items = remaining.findall("package")

    if no_threading:
        for item in work_items:
            process_package(item)
        return

    workers = ThreadPool(cpu_count())
    workers.map_async(process_package, list(work_items))
    workers.close()
    workers.join()
def __init__(self, filename, pw, th):
    """Load ids from ``filename`` and run ``self.run`` over them on ``th`` threads.

    Blank lines are dropped and every id is stripped. The module-level
    ``arg`` options are then applied in this order: ``random`` shuffles the
    list, ``reverse`` reverses it, ``number`` truncates it. The constructor
    blocks until the pool has finished, or an interrupt/error stops the
    wait, and then calls ``print_data``.
    """
    # Read through a context manager so the file handle is always closed.
    with open(filename) as id_file:
        self.id_list = [line.strip() for line in id_file if line.strip()]
    if arg.random:
        random.shuffle(self.id_list)
    if arg.reverse:
        self.id_list = self.id_list[::-1]
    if arg.number:
        self.id_list = self.id_list[:arg.number]
    self.pw = pw
    # <-- data result -->
    self.data = {'succeeded': [], 'checkpoint': [], 'failed': []}
    self.t = 0
    self.raw = 0
    self.start = time.time()
    p = ThreadPool(int(th))
    try:
        # get() with a timeout keeps the main thread able to see Ctrl+C.
        p.map_async(self.run, self.id_list).get(9999)
    except KeyboardInterrupt:
        p.close()
    except Exception:
        # Best effort: stop the pool and still report what was collected.
        p.terminate()
    self.print_data()
    p.close()
def _parallel_execute(datasources, options, outs_dir, pabot_args, suite_names):
    """Run ``execute_and_wait_with`` for every suite on a thread pool.

    When ``pabot_args`` has a truthy "vectors" entry, each suite is run once
    per vector; otherwise once with ``None`` in the vector position.
    ``keyboard_interrupt`` is installed as the SIGINT handler for the
    duration of the run and the previous handler is restored afterwards.
    """
    previous_handler = signal.signal(signal.SIGINT, keyboard_interrupt)
    pool = ThreadPool(pabot_args['processes'])

    vectors = pabot_args['vectors'] if pabot_args.get("vectors") else [None]
    jobs = [
        (datasources, outs_dir, options, suite, pabot_args['command'],
         pabot_args['verbose'], vector)
        for suite in suite_names
        for vector in vectors
    ]
    pending = pool.map_async(execute_and_wait_with, jobs)
    pool.close()

    # The keyboard interrupt handler runs in the main thread, so poll in
    # short sleeps to give it a chance to execute.
    while not pending.ready():
        try:
            time.sleep(0.1)
        except IOError:
            keyboard_interrupt()
    signal.signal(signal.SIGINT, previous_handler)
def get_list_of_cities_async(city_href_list: list, processes_count=-1) -> list:
    """Fetch city attributes for ``city_href_list`` on a thread pool.

    The hrefs are split into chunks of ``len(city_href_list) // processes_count``
    items and each chunk is passed to ``get_city_attr_async``. With
    ``processes_count=-1`` one thread per href is used.

    Returns the collected results flattened twice with ``flatten``, or an
    empty list when there is nothing to do or ``processes_count`` exceeds
    the number of hrefs. A message is printed in both of those cases.
    """
    if processes_count == -1:
        processes_count = len(city_href_list)

    try:
        chunk_size = len(city_href_list) // processes_count
        list_splitted = [
            city_href_list[start:start + chunk_size]
            for start in range(0, len(city_href_list), chunk_size)
        ]
    except ZeroDivisionError:
        print("No tasks available...")
        return []
    except ValueError:
        # range() rejects a step of 0, which is what the chunk size becomes
        # when more processes are requested than there are hrefs. Catching
        # only this error keeps KeyboardInterrupt and genuine bugs visible.
        print(
            "ERROR! Too many processes. To use maximum number of threads set 'processes_count' key-arg to: -1"
        )
        return []

    result_list = []

    pool = ThreadPool(processes=processes_count)
    # The callback receives the whole list of per-chunk results at once.
    pool.map_async(get_city_attr_async, list_splitted, callback=result_list.append)
    pool.close()
    pool.join()
    return flatten(flatten(result_list))
def addBounties(bounties):
    """Add a list of bounties in parallel using multiprocessing.Pool for verification

    ``verify`` and ``internalVerify`` are mapped over ``bounties`` on a
    thread pool. One status code is returned per bounty, in input order:

        -3  the ``internalVerify`` result is falsy
        -2  the ``verify`` result is falsy
        -1  the bounty is already in ``bountyList``
         0  ``internalVerify`` returned -1
         1  none of the above; the bounty is passed to ``addValidBounty``
    """
    from multiprocessing.pool import ThreadPool
    pool = ThreadPool()
    try:
        safeprint("Mapping verifications", verbosity=3)
        # Start the external check first so it overlaps with the internal one.
        # (The result used to be bound to ``async``, a reserved word since
        # Python 3.7.)
        pending_external = pool.map_async(verify, bounties)
        internal = pool.map(internalVerify, bounties)
        safeprint("Waiting for verifications", verbosity=3)
        external = pending_external.get()
        safeprint("Received verifications", verbosity=3)
    finally:
        # All work has been collected (or failed); let the threads exit.
        pool.close()

    rvals = []
    safeprint(internal)
    safeprint(external)
    total = len(bounties)
    for position, (bounty, internal_ok, external_ok) in enumerate(
            zip(bounties, internal, external), 1):
        safeprint("Finishing the processing of bounty " + str(position) + "/" + str(total),
                  verbosity=2)
        if not internal_ok:
            rvals.append(-3)
        elif not external_ok:
            rvals.append(-2)
        elif bounty in bountyList:
            rvals.append(-1)
        elif internal_ok == -1:
            rvals.append(0)
        else:
            rvals.append(1)
            addValidBounty(bounty)
        safeprint("Passed first if", verbosity=3)
    safeprint("Verifications parsed", verbosity=3)
    return rvals
def upload(stack, args):
    """Upload ``args.file_or_directory`` to ``stack``.

    A single file is uploaded directly. For a directory, every directory
    yielded by ``directories`` is created first, then every path yielded by
    ``files`` is uploaded on ``args.threads`` threads. A path for which
    ``stack.file`` does not raise ``StackException`` is skipped.
    """
    def upload_file(path):
        try:
            stack.file(path)
        except StackException:
            stack.upload(path)
            log(f'Uploaded: {path!r}')
        else:
            log(f'Skipping: {path!r} (already exists)')

    source = args.file_or_directory
    if os.path.isfile(source):
        return upload_file(source)

    log('Setting up directory structure..', prefix='+')

    # Directories are created sequentially: with threads, a sub directory
    # might be created before its parent exists.
    for directory in directories(source):
        log(f'Creating directory: {directory!r}')
        stack.mkdir(directory)

    log('Starting upload..', prefix='+')

    workers = ThreadPool(processes=args.threads)
    workers.map_async(upload_file, files(source))
    workers.close()
    workers.join()
def start_bots():
    """Start ``run_bot`` for each of the three bot names, one thread per bot.

    The work is only dispatched, not awaited; 'Success' is returned at once.
    """
    bot_names = ['sound_bot', 'ambience_bot', 'music_bot']
    workers = ThreadPool(processes=len(bot_names))
    workers.map_async(run_bot, bot_names)
    workers.close()
    return 'Success'
def main():
    """Run ``request_ngx`` for each host and pass the result list to ``Analysis_keys``."""
    # Alternative host set kept for reference:
    #   ('192.168.200.134', '192.168.200.134')
    targets = ['192.168.200.134']
    workers = ThreadPool(processes=3)
    # Blocking variant kept for reference: workers.map(request_ngx, targets)
    workers.map_async(request_ngx, targets, callback=Analysis_keys)
    workers.close()
    workers.join()
# _parallel_execute_arrange: run suites in ordered groups around marker suites.
#
# What the statements below do, as far as this one-line layout shows:
#   * When para_mode == "single", keyboard_interrupt is installed as the
#     SIGINT handler and the previous handler is restored at the end.
#   * The indexes of suites whose name contains "Init" or "Destory" (sic) are
#     recorded in suiteGroupNum.
#   * suiteGroup is built from those indexes: each marker suite is added as a
#     single name, and the suites lying between two markers are added as a
#     list. With no markers, suiteGroup is suite_names itself.
#   * Each str entry is run through execute_and_wait_with on ThreadPool(1),
#     once per entry of pabot_args['argumentfiles'] (default [("", None)]).
#     Each list entry is run on ThreadPool(pabot_args['processes']).
#
# NOTE(review): statement nesting cannot be recovered from this layout --
# confirm against the properly indented file, in particular
#   - whether suiteGroup.append(list_middle) sits inside the
#     `if(suiteGroupNum[j]+1<suiteGroupNum[j+1])` branch,
#   - whether pool.close()/pool.join() follow the argfile loop or sit in it,
#   - where the `while not result.ready()` loop belongs. If it runs after
#     pool.join(), the result is presumably already ready and it never sleeps.
# NOTE(review): suite_length is assigned but not read in this function.
# NOTE(review): with no marker suites every entry takes the single-thread
# branch (assuming suite names are str) -- confirm this is intended.
def _parallel_execute_arrange(datasources, options, outs_dir, pabot_args, suite_names, para_mode): if (para_mode=="single"): original_signal_handler = signal.signal(signal.SIGINT, keyboard_interrupt) suite_length=len(suite_names) suiteGroupNum=[] i=0 for suite in suite_names: if("Init" in suite): suiteGroupNum.append(i) elif("Destory" in suite): suiteGroupNum.append(i) i=i+1 suiteGroup=[] if(len(suiteGroupNum)==0): suiteGroup=suite_names else: for j in range(0,len(suiteGroupNum)): list_start=suite_names[suiteGroupNum[j]] suiteGroup.append(list_start) list_middle=[] if(j==(len(suiteGroupNum)-1) and suiteGroupNum[j]==(len(suite_names)-1)): break elif(j==(len(suiteGroupNum)-1) and suiteGroupNum[j]<(len(suite_names)-1)): suiteGroupNum.append(len(suite_names)) if(suiteGroupNum[j]+1<suiteGroupNum[j+1]): for m in range(suiteGroupNum[j]+1,suiteGroupNum[j+1]): list_middle.append(suite_names[m]) suiteGroup.append(list_middle) for i in range(0,len(suiteGroup)): suite_names=suiteGroup[i] if(isinstance(suite_names,str)): print "Running Init or End tests" pool = ThreadPool(1) suite=suite_names for argfile in pabot_args['argumentfiles'] or [("", None)]: pollArgsList=[(datasources, outs_dir, options, suite,pabot_args['command'], pabot_args['verbose'], argfile)] result=pool.map_async(execute_and_wait_with,pollArgsList) pool.close() pool.join() else: print "Running Middle tests" pool = ThreadPool(pabot_args['processes']) result = pool.map_async(execute_and_wait_with, ((datasources, outs_dir, options, suite, pabot_args['command'], pabot_args['verbose'], argfile) for suite in suite_names for argfile in pabot_args['argumentfiles'] or [("", None)])) pool.close() pool.join() while not result.ready(): #keyboard interrupt is executed in main thread #and needs this loop to get time to get executed try: time.sleep(0.3) except IOError: keyboard_interrupt() if (para_mode == "single"): signal.signal(signal.SIGINT, original_signal_handler)
def get_discord_audio_function():
    """Dispatch ``check_bot_run`` for the requested audio and return at once.

    ``audio_url`` and ``audio_type`` are read from the request's query
    arguments. ``check_bot_run(audio_type, audio_url=audio_url)`` runs on a
    single background thread that is not awaited.
    """
    audio_url = request.args.get('audio_url')
    audio_type = request.args.get('audio_type')

    play = functools.partial(check_bot_run, audio_url=audio_url)
    worker = ThreadPool(processes=1)
    worker.map_async(play, [audio_type])
    worker.close()
    return 'Playing music in discord!'
def get_reactome_kegg(genome, threads=20):
    """Apply ``get_kegg_rxns_from_gene_kegg`` to every item of ``genome``.

    Each call is expected to return an iterable; the union of all returned
    items is returned as a set. If any worker call raises, the failure is
    not reported and an empty set is returned.
    """
    workers = ThreadPool(processes=threads)
    outcome = workers.map_async(get_kegg_rxns_from_gene_kegg, genome)
    workers.close()
    workers.join()

    # After join() the result is ready; it only carries values on success.
    per_gene = outcome.get() if outcome.successful() else []
    reactions = set()
    for gene_reactions in per_gene:
        reactions.update(gene_reactions)
    return reactions
def main():
    """Download every song of the albums whose ids are given on the command line.

    Requires the ``MIXIA_ACCESS_TOKEN`` environment variable. For each
    album id the song details are fetched on a 10-thread pool, each track
    is downloaded with ``wget`` into a directory named after the album id,
    and ID3 tags plus cover art are written with ``eyed3`` when that module
    is available. Without it the file is only renamed.

    Raises:
        FetchFailed: if the access token or the album ids are missing.
    """
    ACCESS_TOKEN = os.getenv('MIXIA_ACCESS_TOKEN')
    if not ACCESS_TOKEN:
        raise FetchFailed("`MIXIA_ACCESS_TOKEN` not found.")

    user = account.MiXiaUser.from_access_token(ACCESS_TOKEN)
    client = user.mixia_client

    # Slicing argv never raises, so the old try/except could not detect a
    # missing id; check for an empty list explicitly.
    album_ids = sys.argv[1:]
    if not album_ids:
        raise FetchFailed("Album id not found.")

    for aid in album_ids:
        thread_pool = ThreadPool(processes=10)
        album = song.MiXiaAlbum.from_id(aid, client)
        thread_pool.map_async(
            lambda s: s.fetch_detail(client, consts.TRACK_HIGH_QUALITY),
            album.songs)
        thread_pool.close()
        thread_pool.join()

        album_dir = str(album.album_id)
        ensure_dir(album_dir)
        album_logo_resp = requests.get(
            album.big_logo,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/51.0.2704.103 Safari/537.36'
            })
        album_logo_resp.raise_for_status()
        album_logo = album_logo_resp.content

        for s in album.songs:
            detail = s.track_detail
            filename = './{}/{}.mp3'.format(album.album_id, s.song_id)
            print(filename)
            subprocess.call(['wget', '-O', filename, detail.track_url])
            song_name = '{}_{}_{}'.format(detail.cd_serial, detail.track,
                                          detail.song_name.replace('/', '_'))
            if not eyed3:
                print("no eyed3, skip update ID3.")
                # os.path.join needs a string; the album id is stringified
                # the same way as for ensure_dir above.
                os.rename(filename, os.path.join(album_dir, song_name))
                continue
            song_id3 = eyed3.load(filename)
            song_id3.initTag()
            song_id3.rename(song_name)
            song_id3.tag.images.set(type_=3,
                                    img_data=album_logo,
                                    mime_type='image/jpeg')
            song_id3.tag.title = detail.song_name
            song_id3.tag.album = detail.album_name
            song_id3.tag.album_artist = detail.artist_name
            song_id3.tag.artist = detail.artist_name
            song_id3.tag.disc_num = (detail.cd_serial, album.cd_count)
            song_id3.tag.track_num = (detail.track, album.song_count)
            song_id3.tag.save()
def thread(self, user):
    """Ask for a thread count, then run ``self.klon`` over ``user`` on a pool.

    The prompt repeats until an integer is entered. ``self.result()`` is
    called once the pool has finished. A keyboard interrupt during the run
    is swallowed.
    """
    # Loop instead of recursing: the recursive retry used to fall through
    # after returning, which ran the whole pool a second time.
    while True:
        try:
            self.t = int(raw_input("[?] Threads : "))
            break
        except ValueError:
            # User-facing message (Indonesian), roughly "enter a number for
            # the thread count".
            print("[!] Masukan angka pada thread")

    p = ThreadPool(self.t)
    try:
        # get() with a timeout keeps the main thread able to see Ctrl+C.
        p.map_async(self.klon, user).get(9999)
        self.result()
    except KeyboardInterrupt:
        pass
    finally:
        # Close on every path so the worker threads can exit.
        p.close()
def search(self, query, num_results=10, prefetch_pages=True, num_prefetch_threads=10):
    '''Perform the Google search.

    Parameters:
        query: string to search.
        num_results: minimum number of results wanted; determines how many
            result pages are requested.
        prefetch_pages: when true, ``SearchResult.get_text`` is run for
            every result on a thread pool before returning.
        num_prefetch_threads: number of threads used to prefetch the pages.

    Returns a SearchResponse built from the collected results and the total
    hit count parsed from the first page.
    '''
    search_results = []
    pages = int(
        math.ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
    total = None
    thread_pool = ThreadPool(num_prefetch_threads) if prefetch_pages else None

    for page_index in range(pages):
        start = page_index * GoogleSearch.RESULTS_PER_PAGE
        opener = urllib.build_opener()
        opener.addheaders = GoogleSearch.DEFAULT_HEADERS

        url = GoogleSearch.SEARCH_URL + "?hl=en&q=" + urllib.quote(query)
        if start != 0:
            url += "&start=" + str(start)
        with closing(opener.open(url)) as response:
            soup = BeautifulSoup(response.read(), "lxml")

        if total is None:
            # The total is the first child of the first matching element.
            total_node = soup.select(GoogleSearch.TOTAL_SELECTOR)[0]
            totalText = next(total_node.children)
            digits = re.search("(([0-9]+[', ])*[0-9]+)", totalText).group(1)
            total = int(re.sub("[', ]", "", digits))

        if page_index == 0:
            selector = GoogleSearch.RESULT_SELECTOR_PAGE1
        else:
            selector = GoogleSearch.RESULT_SELECTOR
        self.results = self.parse_results(soup.select(selector), page_index)
        # Truncation to num_results is intentionally disabled:
        # if len(search_results) + len(self.results) > num_results:
        #     del self.results[num_results - len(search_results):]
        search_results += self.results
        if thread_pool is not None:
            thread_pool.map_async(SearchResult.get_text, self.results)

    if thread_pool is not None:
        thread_pool.close()
        thread_pool.join()
    return SearchResponse(search_results, total)
def scan_remote(self):
    """
    Log the size of the scan, then call ``self.request`` for every entry of
    ``self.all_files`` on a thread pool and wait for completion
    :return: None
    """
    file_count = len(self.all_files)
    local_count = len(self._files_local)
    self.log(f"Scanning remote host ({file_count} files over {local_count} {self.mode}s)..")

    workers = ThreadPool(self.__max_remote_threads)
    workers.map_async(self.request, self.all_files)
    workers.close()
    workers.join()
def generate_complete_href_list():
    """Run ``get_all_wiki_href`` for every country and flatten the results.

    One thread is used per entry of ``COUNTRIES``. If any worker call
    raises, nothing is collected and ``flatten([])`` is returned.
    """
    country_names = [entry[COUNTRY] for entry in COUNTRIES]
    collected = []

    workers = ThreadPool(processes=len(country_names))
    # The callback receives the full list of per-country results.
    workers.map_async(get_all_wiki_href, country_names, callback=collected.extend)
    workers.close()
    workers.join()
    return flatten(collected)
def _run_command_threaded(self, cmd):
    """
    Runs the command `cmd` threaded. If the instance's ``async`` attribute
    is true, all nodes are handled concurrently; otherwise a single thread
    is used. The single thread is needed so that the repl is not blocking.

    The call returns immediately and does not wait for the commands.

    Parameter
    ---------
    cmd : str
        The command to execute in the qemu instance
    """
    # ``async`` is a reserved word from Python 3.7 on, so ``self.async`` no
    # longer parses; getattr reads the same attribute on every version.
    run_concurrently = getattr(self, "async")
    # one thread per node for async, else 1
    pool = ThreadPool(processes=len(self.nodes) if run_concurrently else 1)
    pool.map_async(self.run_command, zip(self.nodes, repeat(cmd)))
    # Nothing else is submitted: let the threads exit when the queued
    # commands are done. close() does not block the caller.
    pool.close()
def main(*xunitfile_and_result_dirs):
    """Run ``star_test`` for every assembly listed in the given xunit files.

    Each argument is an ``(xunit_filename, result_dir)`` pair. Every
    ``assembly`` element of a file contributes one
    ``(filename, test_dir, result_dir)`` tuple, where ``test_dir`` is the
    directory containing the xunit file. The call blocks until all tests
    have run and re-raises a worker failure.
    """
    tests = []
    for xunit_filename, result_dir in xunitfile_and_result_dirs:
        test_dir = os.path.dirname(os.path.abspath(xunit_filename))
        root = ElementTree.parse(xunit_filename).getroot()
        tests.extend(
            (node.attrib['filename'], test_dir, result_dir)
            for node in root.findall('.//assembly'))

    threads = ThreadPool()
    pending = threads.map_async(star_test, tests)
    pending.get()
class ImageReader():
    """Load images batch by batch through a thread pool.

    ``get_image`` is applied to ``batch_size`` names at a time. ``pos`` is
    the index of the next unread name.
    """

    def __init__(self, image_names, batch_size, threads):
        self.pool = ThreadPool(processes=threads)
        self.image_names = image_names
        self.batch_size = batch_size
        self.pos = 0

    def prefetch(self):
        """Start loading the next batch; return False when none is left."""
        total = len(self.image_names)
        if self.pos < total:
            stop = min(self.pos + self.batch_size, total)
            batch = self.image_names[self.pos:stop]
            self.pos += self.batch_size
            self.p = self.pool.map_async(get_image, batch)
            return True
        return False

    def get_next(self):
        """Return the next batch as float32, or None when exhausted."""
        if not self.prefetch():
            print('Iterator exceed length')
            return None
        loaded = self.p.get()
        return np.float32(loaded)
def _parallel_execute(datasources, options, outs_dir, pabot_args, suite_names):
    """Run ``execute_and_wait_with`` for every (suite, host) pair on a thread pool.

    Hosts are read one per line from ``pabot_args["hostsfile"]`` when that
    key is present; otherwise ``None`` is passed to ``TestsuitesHosts``.
    With ``pabot_args["verbose"]`` the pairs are printed first.
    ``keyboard_interrupt`` is installed as the SIGINT handler for the
    duration of the run and the previous handler is restored afterwards.
    """
    original_signal_handler = signal.signal(signal.SIGINT, keyboard_interrupt)
    pool = ThreadPool(pabot_args['processes'])

    if "hostsfile" in pabot_args:
        # Context manager so the hosts file is closed after reading.
        with open(pabot_args["hostsfile"]) as hosts_file:
            hosts = [host.rstrip('\r\n') for host in hosts_file]
    else:
        hosts = None

    # Build the pairs once so what is printed is exactly what is executed.
    assignments = [(suite, host)
                   for (suite, host) in TestsuitesHosts(suite_names, hosts)]
    if pabot_args["verbose"]:
        print(assignments)

    result = pool.map_async(
        execute_and_wait_with,
        [(datasources, outs_dir, options, suite, pabot_args['command'],
          pabot_args['verbose'], host) for (suite, host) in assignments])
    pool.close()
    while not result.ready():
        # keyboard interrupt is executed in main thread and needs this loop
        # to get time to get executed
        try:
            time.sleep(0.1)
        except IOError:
            keyboard_interrupt()
    signal.signal(signal.SIGINT, original_signal_handler)
def discover(self):
    """Extract the subnet from the given IP and scan it for reachable hosts.

    POST: the server IP is read from the JSON body's "serverIP" field.
    GET, or a POST without a usable IP: ``get_own_ip()`` is used instead.
    ``check_connection`` is run for every address from ``get_allIP(subnet)``
    on 200 threads for at most 2 seconds. Whatever was collected in
    ``self.ip_addr["ip"]`` is returned as JSON and the list is reset.
    """
    if request.method == "POST":
        payload = request.json
        server_ip = payload.get("serverIP") if isinstance(payload, dict) else None
        if not server_ip:
            logging.warning("No IP given, using local subnet")
            server_ip = self.get_own_ip()
    elif request.method == 'GET':
        server_ip = self.get_own_ip()
    else:
        logging.error("Unknown Method API Call")
        # server_ip would be unbound below; answer with an explicit error
        # instead of failing with a NameError.
        # NOTE(review): assumes a Flask-style (body, status) return -- confirm.
        return jsonify({"error": "Unknown Method API Call"}), 405

    logging.info("Server ip: {}".format(server_ip))
    subnet = ".".join(server_ip.split(".")[0:-1])

    start_time = timeit.default_timer()
    p = ThreadPool(200)
    try:
        result = p.map_async(self.check_connection, self.get_allIP(subnet))
        # Bounded wait: hosts that have not answered in time are dropped.
        result.wait(timeout=2)
    except Exception:
        # Stay best-effort, but leave a trace instead of hiding the error.
        logging.exception("Subnet scan failed")
    finally:
        p.terminate()
    elapsed = timeit.default_timer() - start_time
    logging.info("Time elapsed: {} secs".format(round(elapsed, 2)))

    ip_list = self.ip_addr["ip"]  # store list in another variable
    self.ip_addr["ip"] = []  # Clear the list for next call
    # if len(ip_list) > 1:
    #     logging.warning(" More than one IP address received")
    return jsonify({"ip": ip_list})
def launch_parallel_tests(self):
    """Make sure the docker image is current, then run every test in parallel.

    The image is built when it does not exist, and again when the hash of
    the sorted requirements differs from the one stored in
    ``.last_requirements``. Each entry of ``self.tests`` is passed as a
    one-element list to ``run_tests_for_project`` on a thread pool. Ctrl+C
    terminates the pool.
    """
    image_name = "django_parallel_tests/%s" % self.project_name
    if not self.docker.images(name=image_name):
        self.build_image()

    # NOTE(review): sha224 over a str only works on Python 2; Python 3
    # needs bytes here.
    req_hash = hashlib.sha224(str(sorted(self.requirements))).hexdigest()
    try:
        with open(".last_requirements") as f:
            last_req_hash = f.read().strip()
    except IOError:
        # No readable record of a previous run: treat as "changed".
        last_req_hash = None

    if req_hash != last_req_hash:
        self.build_image()
        with open(".last_requirements", "w") as f:
            f.write(req_hash)

    pool = ThreadPool()
    tests = [[test] for test in self.tests]
    run_tests = partial(run_tests_for_project, self.project_name)
    result = pool.map_async(run_tests, tests)
    try:
        # Poll in short sleeps so the main thread can receive Ctrl+C.
        while not result.ready():
            time.sleep(0.1)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        return
    # The old `return` inside the try block skipped this cleanup entirely.
    pool.close()
    pool.join()
    print("got result %s" % (result.get(),))
def async_request(n_request):
    """Call ``make_request(LONG_TEXT)`` ``n_request`` times on a thread pool.

    Returns the list of results, blocking until all requests have finished.
    """
    payloads = [LONG_TEXT] * n_request
    workers = ThreadPool()
    pending = workers.map_async(make_request, payloads)
    workers.close()
    return pending.get()
def check_online_streams(self):
    """Check every stream's online state on a pool and refresh the display.

    Each stream dict gets its ``'online'`` key set from ``_check_stream``.
    ``all_streams_offline`` stays True only if no stream is online. While
    the checks run, the status line shows progress every 0.1 seconds.
    """
    self.all_streams_offline = True
    self.set_status(' Checking online streams...')

    finished = queue.Queue()

    def check_stream_managed(job):
        # The second tuple element is the same queue as ``finished``.
        url, _job_queue = job
        status = self._check_stream(url)
        finished.put(url)
        return status

    pool = Pool(self.config.CHECK_ONLINE_THREADS)
    jobs = [(stream['url'], finished) for stream in self.streams]
    pending = pool.map_async(check_stream_managed, jobs)

    n_streams = len(self.streams)
    while not pending.ready():
        sleep(0.1)
        progress = ' Checked {0}/{1} streams...'.format(finished.qsize(), n_streams)
        self.set_status(progress)
        self.s.refresh()

    results = pending.get()
    for stream, online in zip(self.streams, results):
        stream['online'] = online
        if online:
            self.all_streams_offline = False

    self.refilter_streams()
    self.last_autocheck = int(time())

    pool.close()
class ThreadMailer(object):
    """Send one message to several receivers through a thread pool.

    NOTE(review): ``sendEmail`` rewrites ``self.message.To`` on the shared
    message object from several threads at once, so concurrent sends can
    overwrite each other's receiver -- confirm whether that is acceptable.
    """

    def __init__(self, message, mailer):
        self.message = message
        self.mailer = mailer
        self.threadPool = ThreadPool()

    def initMessage(self, *args, **kwargs):
        """Build and return a ``Message`` from the given arguments."""
        return Message(*args, **kwargs)

    def initMailer(self, host, user, pwd):
        """Build a ``Mailer`` for ``host`` and log in with ``user``/``pwd``."""
        mailer = Mailer(host)
        mailer.login(user, pwd)
        return mailer

    def sendEmail(self, receiver=None):
        """Send the message, to ``receiver`` when one is given."""
        if receiver:
            self.message.To = receiver
        return self.mailer.send(self.message)

    def send(self):
        """Send to every entry of ``self.message.To``.

        Returns the list of send results, or None if any send raised.
        """
        result = self.threadPool.map_async(self.sendEmail, self.message.To)
        try:
            return result.get()
        except Exception:
            # ``except Exception, e`` only parses on Python 2. Log the
            # traceback too, so the cause of the failure is not lost.
            logger.exception("send mail error.")
            return None
def check_online_streams(self):
    """Check every stream's online state on a pool and refresh the display.

    Each stream dict gets its ``'online'`` key set from ``_check_stream``.
    ``all_streams_offline`` stays True only if no stream is online. While
    the checks run, the status line shows progress every 0.1 seconds.
    """
    self.all_streams_offline = True
    self.set_status(' Checking online streams...')

    finished = queue.Queue()

    def check_stream_managed(job):
        # The second tuple element is the same queue as ``finished``.
        url, _job_queue = job
        status = self._check_stream(url)
        finished.put(url)
        return status

    pool = Pool(self.config.CHECK_ONLINE_THREADS)
    jobs = [(stream['url'], finished) for stream in self.streams]
    pending = pool.map_async(check_stream_managed, jobs)

    n_streams = len(self.streams)
    while not pending.ready():
        sleep(0.1)
        progress = ' Checked {0}/{1} streams...'.format(finished.qsize(), n_streams)
        self.set_status(progress)
        self.s.refresh()

    results = pending.get()
    for stream, online in zip(self.streams, results):
        stream['online'] = online
        if online:
            self.all_streams_offline = False

    self.refilter_streams()
    self.last_autocheck = int(time())

    pool.close()
def extract_reports_mp(city_list, process_count=12, path='./reports', report_count=50, page_count=1, mapping_csv='location_mapping.csv'):
    '''
    Extracts reports for specified cities from newspaper website.

    city_list : List of city str.
        A list of cities, for which reports are to be extracted.
        Available city names: 'Delhi', 'Mumbai', 'Bangalore', 'Kolkata'.
    process_count : Int.
        Number of parallel threads on which the scraping will happen.
    path : Valid path to directory (str).
        Path to the folder where extracted reports will be written on disk.
    report_count : Int.
        Number of reports to be extracted.
    page_count : Int.
        Number of report pages to be scraped (each page contains ~30 reports).
    mapping_csv : Valid path to csv file (str).
        Path to the csv file where each report and its location mapping
        will be preserved.

    ``extract_reports`` is called once per city with a
    ``(city, path, report_count, page_count)`` tuple. The mappings it
    returns are merged and written to ``mapping_csv`` with the columns
    ``filename`` and ``location``.
    '''
    argument_list = [(city, path, report_count, page_count)
                     for city in city_list]

    # The context manager shuts the pool down once the results are in;
    # previously the pool was never closed.
    with ThreadPool(process_count) as p:
        mappings = p.map(extract_reports, argument_list)

    master_location_mapping = {}
    for mapping in mappings:
        master_location_mapping.update(mapping)

    # Materialise the items view so DataFrame gets a plain list of pairs.
    result_df = pd.DataFrame(list(master_location_mapping.items()),
                             columns=['filename', 'location'])
    result_df.to_csv(mapping_csv, index=False)
    print(f''' Reports location : {path} location mapping : {mapping_csv}''')
def runSqlCmdWithTimeOut(sql, user, host, port, tmpPath, database="postgres", mpprcFile="", needmpara=False, timeout=60):
    """
    function: run sql cmd with timeout
    input : sql, user, host, port, tmpPath, database
            mpprcFile, needmpara, timeout
    output : str -- the output of executeSql, or "" when it reports
             "NO RESULT"
    raises : SQLCommandException when the output starts with "ERROR" or
             the command does not finish within ``timeout`` seconds
    """
    infoList = [[sql, user, host, port, tmpPath, database, mpprcFile, needmpara]]
    pool = ThreadPool(1)
    result = pool.map_async(executeSql, infoList)
    # Nothing else is submitted, so the pool can be closed straight away.
    pool.close()

    # Use the public AsyncResult API instead of the private _ready/_value
    # fields: wait() returns as soon as the command finishes, not on the
    # next 1-second poll.
    result.wait(timeout)
    if not result.ready():
        raise SQLCommandException(
            sql, "Running timeout, exceed the limit %s seconds" % timeout)

    # get() re-raises an exception from executeSql as-is.
    output = result.get()[0]
    if output == "NO RESULT":
        return ""
    if output.startswith("ERROR"):
        raise SQLCommandException(sql, output)
    return output
def run(self):
    """Start a worker build per platform and fail if any of them failed.

    ``select_and_start_cluster`` runs for every platform on its own thread.
    If any of those calls raises, all worker builds are cancelled and the
    exception is re-raised.

    Raises:
        RuntimeError: when no platform is enabled.
        PluginFailedException: with a JSON dump of the per-platform fail
            reasons, when a worker build is missing or did not succeed.
    """
    if not self.platforms:
        raise RuntimeError("No enabled platform to build on")

    def cancel(build_info):
        return build_info.cancel_build()

    thread_pool = ThreadPool(len(self.platforms))
    result = thread_pool.map_async(self.select_and_start_cluster,
                                   self.platforms)

    try:
        result.get()
    except Exception:
        # Always clean up worker builds on any error to avoid runaway
        # worker builds (includes orchestrator build cancellation).
        thread_pool.terminate()
        self.log.info('build cancelled, cancelling worker builds')
        if self.worker_builds:
            ThreadPool(len(self.worker_builds)).map(cancel, self.worker_builds)
        while not result.ready():
            result.wait(1)
        raise

    thread_pool.close()
    thread_pool.join()

    fail_reasons = {}
    for build_info in self.worker_builds:
        succeeded = build_info.build and build_info.build.is_succeeded()
        if not succeeded:
            fail_reasons[build_info.platform] = build_info.get_fail_reason()

    if fail_reasons:
        raise PluginFailedException(json.dumps(fail_reasons))
def get(self):
    """Poll a channel, or start fetching the last tweets of some handles.

    With a ``key`` request parameter, the channel is read and the response
    is JSON with ``next_key`` and ``result``. Without it, a new channel key
    is created, ``get_last_tweet(key, handle)`` is dispatched for every
    ``handles`` parameter on background threads, and the response is JSON
    with the new ``next_key``.
    """
    key = self.request.get('key')
    if key:
        next_key, result = channel.read(key)
        response = json.dumps(dict(
            next_key=next_key,
            result=result,
        ))
        self.response.write(response)
        return

    handles = self.request.get_all('handles')
    key = channel.create()
    # ThreadPool(0) raises ValueError, so only build a pool when there is
    # at least one handle to look up.
    if handles:
        pool = ThreadPool(len(handles))
        pool.map_async(lambda handle: get_last_tweet(key, handle), handles)
        # Fire-and-forget: close() does not block, it only lets the threads
        # exit once the queued lookups are done.
        pool.close()
    response = dict(next_key=key)
    self.response.write(json.dumps(response))
def index(request):
    """
    Pretty much runs a Map-Reduce job on the Title-Value Sets

    return_objs looks like:
        [
            {T1: [V1, V2, V3]},
            {T2: [V1, V2, V3]},
            {T3: [V1, V2, V3]},
            {T4: [V1, V2, V3]},
        ]

    Only titles with more than ``limit`` matching values are included. The
    result is rendered with the "list.html" template as ``list_titles``.
    """
    # Limit represents the number of times the title occurs on amazon.com's
    # query sets. Used for filtering by weight.
    return_objs = []
    limit = 1000

    def append_title(title):
        """Append the title and the value set belonging to it to return_objs."""
        matched_values = Value.objects.filter(query_title__name=title).exclude(name=None).values("name")
        if len(matched_values) <= limit:
            print("Returned")
            return

        # Collect the value names, skipping the literal name "None". The
        # old filter compared each row dict with the string "None", which
        # is never equal, so nothing was filtered and "None" could end up
        # in the list twice. A comprehension also replaces map() used for
        # its side effects, which does nothing once map is lazy.
        touched_values = [row["name"] for row in matched_values.values()
                          if row["name"] != "None"]

        # Sets the value set of the title if the value set is not empty.
        if touched_values:
            touched_values = list(set(touched_values))
            touched_values.append(u"None")
            print("Added Values")
            return_objs.append({
                title: reversed(touched_values)
            })

    # Loads all values for the queried titles into RAM
    titles = Title.objects.all().exclude(name=None).prefetch_related('values').values('name').distinct()

    # Makes a unique set of title strings
    titles = list(set([title["name"] for title in titles]))

    # Multithreads the queried titles to grab each value set for the title.
    pool = ThreadPool()
    try:
        pool.map_async(append_title, titles).get()
    finally:
        pool.close()

    # Returns the object to the Django Template as a dictionary.
    return render_to_response("list.html", dict(list_titles=return_objs), context_instance=RequestContext(request))
def get_list(self):
    """Return the sorted names of all distros whose path passes ``check_url_path``.

    Every entry of ``self.distros`` is checked on its own thread. An empty
    list is returned when there are no distros.
    """
    def validate_distro(distro):
        # Implicitly returns None for a distro whose path is not reachable.
        if check_url_path(distro['path']):
            return distro['name']

    n_processes = len(self.distros.keys())
    # ThreadPool(0) raises ValueError, so there is nothing to do (and no
    # pool to create) when no distro is configured.
    if n_processes < 1:
        return []

    pool = ThreadPool(processes=n_processes)
    map_res = pool.map_async(validate_distro, self.distros.values())
    pool.close()
    pool.join()

    res = list(set(map_res.get()) - set([None]))
    return sorted(res)
def _parallel_execute(items, processes):
    """Run ``execute_and_wait_with`` over ``items`` on ``processes`` threads.

    Items are dispatched one per task (chunk size 1).
    ``keyboard_interrupt`` is installed as the SIGINT handler for the
    duration of the run and the previous handler is restored afterwards.
    """
    previous_handler = signal.signal(signal.SIGINT, keyboard_interrupt)
    workers = ThreadPool(processes)
    pending = workers.map_async(execute_and_wait_with, items, chunksize=1)
    workers.close()

    # The keyboard interrupt handler runs in the main thread, so poll in
    # short sleeps to give it a chance to execute.
    while True:
        if pending.ready():
            break
        try:
            time.sleep(0.1)
        except IOError:
            keyboard_interrupt()

    signal.signal(signal.SIGINT, previous_handler)
def run(self):
    """Start a worker build per platform and summarise the outcome.

    ``select_and_start_cluster`` runs for every platform on its own thread.
    If any of those calls raises, all worker builds are cancelled and the
    exception is re-raised. Otherwise annotations and labels are collected,
    the upload dir and build infos are stored in the plugin workspace, and
    a ``BuildResult`` is returned. It carries the JSON-encoded fail reasons
    when a worker build is missing or did not succeed.

    Raises:
        RuntimeError: when no platform is enabled.
    """
    if not self.platforms:
        raise RuntimeError("No enabled platform to build on")
    self.set_build_image()

    def cancel(build_info):
        return build_info.cancel_build()

    thread_pool = ThreadPool(len(self.platforms))
    result = thread_pool.map_async(self.select_and_start_cluster,
                                   self.platforms)

    try:
        result.get()
    except Exception:
        # Always clean up worker builds on any error to avoid runaway
        # worker builds (includes orchestrator build cancellation).
        thread_pool.terminate()
        self.log.info('build cancelled, cancelling worker builds')
        if self.worker_builds:
            ThreadPool(len(self.worker_builds)).map(cancel, self.worker_builds)
        while not result.ready():
            result.wait(1)
        raise

    thread_pool.close()
    thread_pool.join()

    worker_annotations = {}
    for build_info in self.worker_builds:
        if build_info.build:
            worker_annotations[build_info.platform] = build_info.get_annotations()
    annotations = {'worker-builds': worker_annotations}

    self._apply_repositories(annotations)

    labels = self._make_labels()

    fail_reasons = {}
    for build_info in self.worker_builds:
        succeeded = build_info.build and build_info.build.is_succeeded()
        if not succeeded:
            fail_reasons[build_info.platform] = build_info.get_fail_reason()

    workspace = self.workflow.plugin_workspace.setdefault(self.key, {})
    workspace[WORKSPACE_KEY_UPLOAD_DIR] = self.koji_upload_dir
    build_info_by_platform = {}
    for build_info in self.worker_builds:
        build_info_by_platform[build_info.platform] = build_info
    workspace[WORKSPACE_KEY_BUILD_INFO] = build_info_by_platform

    if fail_reasons:
        return BuildResult(fail_reason=json.dumps(fail_reasons),
                           annotations=annotations, labels=labels)

    return BuildResult.make_remote_image_result(annotations, labels=labels)
def execute(filename):
    """Run ``cjpeg`` on *filename* once for every entry of ``size_block``.

    The invocations are spread over a pool of ``_MAX_THREADS`` threads and
    the call returns once all of them have finished.  The pool is closed
    and joined before returning so repeated calls do not accumulate idle
    worker threads.

    Args:
        filename: path of the image passed to ``cjpeg``.
    """
    def run(size):
        cmd = ['cjpeg', '-q', '-n {}'.format(size), '--no-save',
               '{}'.format(filename)]
        try:
            process = subprocess.Popen(cmd)
            process.wait()
        except Exception:
            # NOTE(review): failures are swallowed, presumably on purpose so
            # one failing invocation does not abort the others -- confirm.
            pass

    runner = ThreadPool(processes=_MAX_THREADS)
    try:
        result = runner.map_async(run, size_block)
        result.wait()
    finally:
        runner.close()
        runner.join()
def get_list(self):
    """Return the sorted names of the distros whose path check succeeds.

    Each value of ``self.distros`` is checked with ``check_url_path`` on its
    own worker thread; names of distros that fail the check are omitted.
    """
    def validate_distro(distro):
        # None marks a distro that failed the check.
        return distro['name'] if check_url_path(distro['path']) else None

    worker_count = len(self.distros.keys())

    # Avoid problems if for some reason the files are not in the right
    # place, or were deleted, or moved, or are not supported on the arch.
    if worker_count < 1:
        return []

    workers = ThreadPool(processes=worker_count)
    pending = workers.map_async(validate_distro, self.distros.values())
    workers.close()
    workers.join()

    valid_names = set(pending.get())
    valid_names.discard(None)
    return sorted(valid_names)
def main():
    """Fetch the exercise list and run ``process_exercise`` on each entry.

    Exits with status 1 when the HTTP request does not return 200 or when
    the run is interrupted from the keyboard.  Prints a summary of how many
    results were ``True`` and how many were not.
    """
    print("Fetching exercise data...")
    response = requests.get("http://khanacademy.org/api/v1/exercises")
    if response.status_code != 200:
        print("Error: failed to fetch exercises")
        sys.exit(1)

    exercises = []
    for entry in response.json():
        exercises.append((entry["name"], entry["ka_url"]))

    workers = ThreadPool()
    try:
        # A timeout on get() keeps the wait interruptible;
        # see http://stackoverflow.com/a/1408476
        pending = workers.map_async(process_exercise, exercises)
        results = pending.get(99999)
    except KeyboardInterrupt:
        sys.exit(1)

    success_count = results.count(True)
    failure_count = len(results) - success_count
    summary = "Done (%s successes, %s failures)" % (success_count,
                                                    failure_count)
    print(summary)
def process_all(self):
    """Process all pending desired_results in the database.

    When ``self.interface.parallel_compile`` is set, every claimed result is
    compiled concurrently (one thread per result) and the results are then
    run one after another.  Otherwise each claimed result is run directly.
    The thread pool is closed on every exit path, including errors.
    """
    self.lap_timer()  # reset timer
    q = self.query_pending_desired_results()

    if not self.interface.parallel_compile:
        for dr in q.all():
            if self.claim_desired_result(dr):
                self.run_desired_result(dr)
        return

    def compile_result(args):
        interface, data, result_id = args
        return interface.compile(data, result_id)

    desired_results = []
    thread_args = []
    for dr in q.all():
        if self.claim_desired_result(dr):
            desired_results.append(dr)
            thread_args.append(
                (self.interface, dr.configuration.data, dr.id))

    # ThreadPool cannot be created with zero workers.
    if not desired_results:
        return

    thread_pool = ThreadPool(len(desired_results))
    try:
        try:
            # Use map_async instead of map because of bug where
            # keyboardinterrupts are ignored.  See
            # http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
            compile_results = thread_pool.map_async(
                compile_result, thread_args).get(9999999)
        except Exception:
            # Need to kill other processes because only one thread receives
            # the exception.
            self.interface.kill_all()
            raise

        # A distinct loop name keeps the nested compile_result function
        # from being shadowed.
        for dr, compiled in zip(desired_results, compile_results):
            # Make sure compile was successful
            self.run_desired_result(dr, compiled, dr.id)
            try:
                self.interface.cleanup(dr.id)
            except RuntimeError as e:
                print(e)
    finally:
        thread_pool.close()
def _parallel_execute(datasources, options, outs_dir, pabot_args, suite_names): if suite_names: original_signal_handler = signal.signal(signal.SIGINT, keyboard_interrupt) pool = ThreadPool(pabot_args['processes']) if pabot_args['seed']: rand = random.Random( pabot_args['seed'] ) rand.shuffle( suite_names ) if pabot_args['load_balancing']: shared_resources = (None, None) if pabot_args['resources']: shared_resources = _create_queue( pabot_args['resources'] ) suite_names_distrib = [([i], shared_resources) for i in suite_names] else: shared_resources = [ (None, None) ] * pabot_args['processes'] if pabot_args['resources']: assert( pabot_args['processes'] == len(pabot_args['resources']) ) shared_resources = [ _create_queue([e]) for e in pabot_args['resources'] ] suite_names_distrib = zip( _pre_compute_distrib( suite_names, pabot_args['processes'] ), shared_resources ) if pabot_args['verbose']: print 'Parallel execution of suites: ' for (suite, (resources_queue, resources_names)) in suite_names_distrib: print '- %s' % (str(suite) if len(suite) > 1 else suite[0]), if resources_names: print "using resource %s" % ("from %s" % str(resources_names) if len(resources_names) > 1 else "'%s'" % resources_names[0]), print result = pool.map_async(execute_and_wait_with, [(datasources, outs_dir, options, suite, pabot_args['command'], pabot_args['verbose'], resources_queue) for (suite, (resources_queue, resources_names)) in suite_names_distrib]) while not result.ready(): # keyboard interrupt is executed in main thread and needs this loop to get time to get executed try: time.sleep(0.1) except IOError: keyboard_interrupt() pool.close() pool.join() result.get() #throw exception from workers if any signal.signal(signal.SIGINT, original_signal_handler)
def _parallel_execute(datasources, options, outs_dir, pabot_args, suite_names):
    """Run every suite/argument-file combination on a thread pool.

    One ``execute_and_wait_with`` task is created per (suite, argfile)
    pair; when ``pabot_args['argumentfiles']`` is empty a single
    ``("", None)`` placeholder is used instead.  While the work is running,
    SIGINT is routed to ``keyboard_interrupt``; the previous handler is
    restored on every exit path.

    Args:
        datasources, options, outs_dir: passed through unchanged to every
            ``execute_and_wait_with`` call.
        pabot_args: dict read for the keys 'processes', 'command',
            'verbose' and 'argumentfiles'.
        suite_names: iterable of suites to execute.
    """
    original_signal_handler = signal.signal(signal.SIGINT, keyboard_interrupt)
    try:
        pool = ThreadPool(pabot_args['processes'])
        argfiles = pabot_args['argumentfiles'] or [("", None)]
        result = pool.map_async(
            execute_and_wait_with,
            ((datasources, outs_dir, options, suite, pabot_args['command'],
              pabot_args['verbose'], argfile)
             for suite in suite_names
             for argfile in argfiles))
        pool.close()
        while not result.ready():
            # keyboard interrupt is executed in main thread
            # and needs this loop to get time to get executed
            try:
                time.sleep(0.1)
            except IOError:
                keyboard_interrupt()
        # All tasks are done and the pool is closed, so this only reaps the
        # worker threads.
        pool.join()
    finally:
        signal.signal(signal.SIGINT, original_signal_handler)
def _parallel_execute(datasources, options, outs_dir, pabot_args, suite_names):
    """Run ``execute_and_wait_with`` once per suite on a thread pool.

    While the work is running, SIGINT is routed to ``keyboard_interrupt``;
    the previous handler is restored on every exit path.

    Args:
        datasources, options, outs_dir: passed through unchanged to every
            ``execute_and_wait_with`` call.
        pabot_args: dict read for the keys "processes", "command" and
            "verbose".
        suite_names: iterable of suites to execute.
    """
    original_signal_handler = signal.signal(signal.SIGINT, keyboard_interrupt)
    try:
        pool = ThreadPool(pabot_args["processes"])
        tasks = [
            (datasources, outs_dir, options, suite, pabot_args["command"],
             pabot_args["verbose"])
            for suite in suite_names
        ]
        result = pool.map_async(execute_and_wait_with, tasks)
        pool.close()
        while not result.ready():
            # keyboard interrupt is executed in main thread and needs this
            # loop to get time to get executed
            try:
                time.sleep(0.1)
            except IOError:
                keyboard_interrupt()
        # All tasks are done and the pool is closed, so this only reaps the
        # worker threads.
        pool.join()
    finally:
        signal.signal(signal.SIGINT, original_signal_handler)
def _map_async(pool_size, func, args): """Async map (threading), handling python 2.6 edge case. :param pool_size: Maximum number of threads. :param func: Function to run. :param args: Iterable of arguments (one per thread). This is necessary since using `map` will in general prevent keyboard interrupts from functioning properly (see this thread for more details - http://stackoverflow.com/a/1408476/1062617), but `map_async` hangs in python 2.6. """ pool = ThreadPool(pool_size) if sys.version_info <= (2, 6): return pool.map(func, args) else: return pool.map_async(func, args).get(1 << 31)
def main():
    """Run ``execute`` over every image under the root directory, 10 times.

    The root directory is ``argv[1]`` when given, otherwise the current
    directory.  Files are discovered recursively for a fixed set of image
    patterns.  Returns early when no files are found.
    """
    pattener = ["*.jpg", '*.jpeg', '*.tiff', '*.bmp', "*.png"]
    times = 10

    root = argv[1] if len(argv) > 1 else '.'
    files = []
    for pat in pattener:
        files += glob.glob('{}/**/{}'.format(root, pat), recursive=True)

    stdout.write('{} files found\n'.format(len(files)))
    if not files:
        return

    # Created only once there is work, and always closed afterwards.
    _pool = ThreadPool(processes=_MAX_THREADS)
    try:
        # Inclusive upper bound so that exactly `times` passes run and the
        # last progress line reads "run 10/10".
        for i in range(1, times + 1):
            print('run {}/{}'.format(i, times))
            result = _pool.map_async(execute, files)
            result.wait()
            print(result)
    finally:
        _pool.close()
        _pool.join()
def process_all(self):
    '''
    process all desired_results in the database

    Only acts when ``self.interface.parallel_compile`` is set: every claimed
    result is compiled concurrently (one thread per result) and the results
    are then run one after another.  Returns early when no result could be
    claimed.
    '''
    self.lap_timer()  # reset timer
    q = (self.session.query(DesiredResult)
         .filter_by(tuning_run=self.tuning_run,
                    state='REQUESTED')
         .order_by(DesiredResult.generation,
                   DesiredResult.priority.desc()))

    if self.interface.parallel_compile:
        desired_results = []
        thread_args = []

        def compile_result(args):
            interface, data, result_id = args
            return interface.compile(data, result_id)

        for dr in q.all():
            if self.claim_desired_result(dr):
                desired_results.append(dr)
                thread_args.append(
                    (self.interface, dr.configuration.data, dr.id))

        # ThreadPool raises ValueError when created with zero workers.
        if not desired_results:
            return

        thread_pool = ThreadPool(len(desired_results))
        try:
            try:
                # Use map_async instead of map because of bug where
                # keyboardinterrupts are ignored.  See
                # http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
                compile_results = thread_pool.map_async(
                    compile_result, thread_args).get(9999999)
            except Exception:
                # Need to kill other processes because only one thread
                # receives exception
                self.interface.kill_all()
                raise

            # A distinct loop name keeps the nested compile_result function
            # from being shadowed.
            for dr, compiled in zip(desired_results, compile_results):
                # Make sure compile was successful
                self.run_desired_result(dr, compiled, dr.id)
                try:
                    self.interface.cleanup(dr.id)
                except RuntimeError as e:
                    print(e)
        finally:
            thread_pool.close()
class CloudBigTableDataStore(data_store.DataStore):
    """GCP CloudBigtable based data storage system.

    Note Cloud Bigtable only supports timestamp precision in milliseconds.
    All other GRR datastores support microseconds.

    Note that currently it isn't safe to use the bigtable garbage collection
    to make data disappear out from under the system, except for the two
    cases we use by default here.

    Also, exposing the full power of the bigtable garbage collection system
    via configuration is very complicated. You can have nested AND and OR
    garbage collection rules, see http://goo.gl/L6Oh9i. If we decide to use
    this more extensively in the future we'll provide a sensible default gc
    strategy and tell people to modify using the bigtable client if they want
    to change it later.
    """

    # Column family name -> garbage collection settings ("age" and/or
    # "versions"); an empty dict means no gc rule.
    COLUMN_FAMILIES = {
        "aff4": {},
        "metadata": {
            "versions": 1
        },
        "flow": {
            "versions": 1
        },
        "index": {},
        "notify": {},
        "kw_index": {},
        "task": {},
    }

    def __init__(self):
        super(CloudBigTableDataStore, self).__init__()
        self.lock = threading.RLock()
        self.instance = None
        self.table = None
        self._CalculateAttributeStorageTypes()

    # We can deprecate this once there is something included in the library:
    # https://github.com/GoogleCloudPlatform/gcloud-python/issues/2117
    def WaitOnOperation(self, operation, max_tries=4, delay=1, backoff=2):
        """Poll operation.finished(), sleeping with a growing delay.

        Returns:
          The operation once it reports finished, or None if it has not
          finished after max_tries polls.
        """
        tries = 0
        while tries < max_tries:
            if operation.finished():
                return operation
            delay *= backoff**tries
            time.sleep(delay)
            tries += 1

    def GetInstance(self, btclient, instance_id):
        """Return the instance with the given id, or None if not listed."""
        instances, _ = btclient.list_instances()
        for instance in instances:
            if instance.instance_id == instance_id:
                return instance
        return None

    def GetTable(self, instance, table_name):
        """Return the table with the given id, or None if not listed."""
        for table in instance.list_tables():
            if table.table_id == table_name:
                return table
        return None

    def StartClient(self, project_id=None, instance_id=None):
        # Connection to bigtable is fairly expensive so we open one and
        # re-use it.
        # https://cloud.google.com/bigtable/docs/performance
        self.btclient = bigtable.Client(project=project_id)
        self.instance = self.btclient.instance(instance_id)
        self.table = self.instance.table(
            config_lib.CONFIG["CloudBigtable.table_name"])

    def Initialize(self, project_id=None, instance_id=None):
        """Create instance/table if needed, then open client and pool.

        Raises:
          AccessError: if no project id is given or configured.
        """
        super(CloudBigTableDataStore, self).Initialize()
        project_id = (project_id or
                      config_lib.CONFIG["CloudBigtable.project_id"])
        if not project_id:
            raise AccessError(
                "No Google Cloud project ID specified, can't create instance.")

        instance_id = (instance_id or
                       config_lib.CONFIG["CloudBigtable.instance_id"])

        self.CreateInstanceAndTable(
            project_id=project_id, instance_id=instance_id)
        self.StartClient(project_id=project_id, instance_id=instance_id)
        self.pool = ThreadPool(
            config_lib.CONFIG["CloudBigtable.threadpool_size"])

    def CreateInstanceAndTable(self, project_id=None, instance_id=None):
        """Create the instance, table and column families when missing.

        Returns:
          The bigtable instance (existing or newly created).
        """
        # The client must be created with admin=True because it will create
        # a table.
        btclient = bigtable.Client(project=project_id, admin=True)
        tablename = config_lib.CONFIG["CloudBigtable.table_name"]
        instance_name = config_lib.CONFIG["CloudBigtable.instance_name"]

        btinstance = self.GetInstance(btclient, instance_id)
        if not btinstance:
            logging.info("Creating cloud bigtable: %s.%s in %s", instance_id,
                         tablename, project_id)
            btinstance = btclient.instance(
                instance_id,
                display_name=instance_name,
                serve_nodes=config_lib.CONFIG["CloudBigtable.serve_nodes"],
                location=config_lib.CONFIG["CloudBigtable.instance_location"])
            operation = btinstance.create()
            self.WaitOnOperation(operation)

        table = self.GetTable(btinstance, tablename)
        if not table:
            table = btinstance.table(tablename)
            table.create()
            for column, gc_rules in self.COLUMN_FAMILIES.iteritems():
                gc_rule = None
                if gc_rules:
                    # When both are configured the "versions" rule wins.
                    age = gc_rules.get("age", None)
                    if age:
                        gc_rule = bigtable.column_family.MaxAgeGCRule(age)

                    version_max = gc_rules.get("versions", None)
                    if version_max:
                        gc_rule = bigtable.column_family.MaxVersionsGCRule(
                            version_max)

                cf = table.column_family(column, gc_rule=gc_rule)
                cf.create()

        return btinstance

    def DeleteSubject(self, subject, sync=False, token=None):
        self.DeleteSubjects([subject], sync=sync, token=token)

    def DeleteSubjects(self, subjects, sync=False, token=None):
        """Delete the rows of all given subjects via the thread pool."""
        self.security_manager.CheckDataStoreAccess(token, subjects, "w")

        # Currently there is no multi-row mutation support, but it exists in
        # the RPC API.
        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2411
        # So we delete all subjects at once using a threadpool
        pool_args = []
        for subject in subjects:
            row = self.table.row(utils.SmartStr(subject))
            row.delete()
            pool_args.append(((row.commit, "delete"), {}))

        if sync:
            self.pool.map(self._WrapCallWithRetry, pool_args)
        else:
            self.pool.map_async(self._WrapCallWithRetry, pool_args)

    def _CalculateAttributeStorageTypes(self):
        """Build a mapping between column names and types.

        Since BT only stores bytes, we need to record the basic types that
        are required to be stored for each column.
        """
        self._attribute_types = {}

        for attribute in aff4.Attribute.PREDICATES.values():
            self._attribute_types[attribute.predicate] = (
                attribute.attribute_type.data_store_type)

    def Encode(self, attribute, value):
        """Encode the value for the attribute."""
        required_type = self._attribute_types.get(attribute, "bytes")
        if required_type in ("integer", "unsigned_integer"):
            return structs.VarintEncode(int(value))
        elif hasattr(value, "SerializeToString"):
            return value.SerializeToString()
        else:
            # Types "string" and "bytes" are stored as strings here.
            return utils.SmartStr(value)

    def Decode(self, attribute, value):
        """Decode the value to the required type."""
        required_type = self._attribute_types.get(attribute, "bytes")
        if required_type in ("integer", "unsigned_integer"):
            return structs.VarintReader(value, 0)[0]
        elif required_type == "string":
            return utils.SmartUnicode(value)
        else:
            return value

    def DBSubjectLock(self, subject, lease_time=None, token=None):
        return CloudBigtableLock(
            self, subject, lease_time=lease_time, token=token)

    def DatetimeToMicroseconds(self, datetime_utc):
        """Convert a UTC datetime to integer microseconds since the epoch.

        Raises:
          ValueError: if the datetime's tzinfo is not pytz.utc.
        """
        # How much do I hate datetime? let me count the ways.
        if datetime_utc.tzinfo != pytz.utc:
            raise ValueError(
                "DatetimeToMicroseconds can only safely convert UTC datetimes")
        epoch = datetime.datetime(1970, 1, 1, 0, 0, tzinfo=pytz.utc)  # pylint: disable=g-tzinfo-datetime
        diff = datetime_utc - epoch
        return int(diff.total_seconds() * 1e6)

    def DatetimeFromMicroseconds(self, time_usec):
        """Convert microseconds since the epoch to a UTC datetime."""
        seconds = float(time_usec) / 1000000
        dt = datetime.datetime.utcfromtimestamp(seconds)
        return dt.replace(tzinfo=pytz.utc)  # pylint: disable=g-tzinfo-replace

    def GetFamilyColumn(self, attribute):
        """Split "family:column" at the first colon."""
        return utils.SmartStr(attribute).split(":", 1)

    def _DeleteAllTimeStamps(self, row, attribute_list):
        """Add delete mutations to row, but don't commit."""
        delete_dict = {}
        # Group column families together so we can use delete_cells
        for attribute in attribute_list:
            family, column = self.GetFamilyColumn(attribute)
            delete_dict.setdefault(family, []).append(column)
        for family, column in delete_dict.iteritems():
            row.delete_cells(family, column)

    def Set(self,
            subject,
            attribute,
            value,
            timestamp=None,
            token=None,
            replace=True,
            sync=True):
        self.MultiSet(
            subject, {attribute: [value]},
            timestamp,
            token=token,
            replace=replace,
            sync=sync)

    def MultiSet(self,
                 subject,
                 values,
                 timestamp=None,
                 replace=True,
                 sync=True,
                 to_delete=None,
                 token=None):
        """Write several attribute values to one subject row and commit."""
        self.security_manager.CheckDataStoreAccess(token, [subject], "w")
        row = self.table.row(utils.SmartStr(subject))
        if to_delete:
            self._DeleteAllTimeStamps(row, to_delete)

        for attribute, value_list in values.items():
            # Attributes must be strings
            family, column = self.GetFamilyColumn(attribute)

            if replace:
                row.delete_cell(family, column)

            for value in value_list:
                element_timestamp = timestamp
                # A (value, timestamp) tuple overrides the call's timestamp.
                if isinstance(value, tuple):
                    try:
                        value, element_timestamp = value
                    except (TypeError, ValueError):
                        pass

                if element_timestamp is None:
                    datetime_ts = datetime.datetime.utcnow()
                else:
                    datetime_ts = self.DatetimeFromMicroseconds(
                        element_timestamp)

                # Value parameter here is bytes, so we need to encode unicode
                # to a byte string:
                # https://googlecloudplatform.github.io/google-cloud-python/stable/bigtable-row.html#google.cloud.bigtable.row.DirectRow.set_cell
                value = self.Encode(attribute, value)
                row.set_cell(family, column, value, timestamp=datetime_ts)

        if sync:
            self.CallWithRetry(row.commit, "write")
        else:
            self.pool.map_async(self._WrapCallWithRetry,
                                [((row.commit, "write"), {})])

    def DeleteAttributes(self,
                         subject,
                         attributes,
                         start=None,
                         end=None,
                         sync=True,
                         token=None):
        self.MultiDeleteAttributes(
            [subject], attributes, start=start, end=end, sync=sync,
            token=token)

    def MultiDeleteAttributes(self,
                              subjects,
                              attributes,
                              start=None,
                              end=None,
                              sync=True,
                              token=None):
        """Delete attributes (optionally a time range) from many subjects.

        Raises:
          ValueError: if attributes is a single string.
        """
        subjects = [utils.SmartStr(subject) for subject in subjects]
        self.security_manager.CheckDataStoreAccess(token, subjects, "w")

        if isinstance(attributes, basestring):
            raise ValueError(
                "String passed to DeleteAttributes (non string iterable expected).")

        attributes = [utils.SmartStr(x) for x in attributes]

        for subject in subjects:
            row = self.table.row(subject)
            for attribute in attributes:
                if start is None and end is None:
                    self._DeleteAllTimeStamps(row, [attribute])
                else:
                    family, column = self.GetFamilyColumn(attribute)
                    row.delete_cell(
                        family,
                        column,
                        time_range=self._TimestampRangeFromTuple((start, end)))

            # One commit per subject row.
            if sync:
                self.CallWithRetry(row.commit, "delete")
            else:
                self.pool.map_async(self._WrapCallWithRetry,
                                    [((row.commit, "delete"), {})])

    def _TimestampRangeFromTuple(self, ts_tuple):
        """Convert a (start, end) microsecond tuple to a TimestampRange."""
        start, end = ts_tuple
        if start is not None:
            if start == 0:
                start = None
            else:
                # Convert RDFDatetime to usec
                start = float(start)
                # Bigtable can only handle ms precision:
                # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2626
                # If we give it a filter with usec values it raises RPC error
                # with "Timestamp granularity mismatch". Truncate to ms here.
                start -= start % 1000
                start = self.DatetimeFromMicroseconds(start)

        if end is not None:
            # Convert RDFDatetime to usec
            end = float(end)
            # Some searches use 2**64 signed int to signal "no upper limit",
            # there's a better way to do that with the API using None.
            if end >= (2**64) / 2:
                end = None
            else:
                # Truncate to ms
                end -= end % 1000
                # GRR expects inclusive timestamps for upper and lower.
                # TimestampRange is exclusive on the end. So we add 1ms to
                # the upper bound, which is the next smallest timestamp
                # bigtable will accept.
                # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2608
                end += 1000
                end = self.DatetimeFromMicroseconds(end)
        return row_filters.TimestampRange(start=start, end=end)

    def _TimestampToFilter(self, timestamp):
        """Translate a timestamp specification into a row filter (or None).

        Raises:
          data_store.Error: if the specification is not recognised.
        """
        if timestamp == data_store.DataStore.ALL_TIMESTAMPS:
            return None

        if (timestamp is None or
                timestamp == data_store.DataStore.NEWEST_TIMESTAMP):
            # Latest value only
            return row_filters.CellsColumnLimitFilter(1)

        if isinstance(timestamp, tuple):
            return row_filters.TimestampRangeFilter(
                self._TimestampRangeFromTuple(timestamp))

        raise data_store.Error(
            "Invalid timestamp specification: %s." % timestamp)

    def CallWithRetry(self, callback, mode, *args, **kwargs):
        """Make the bigtable RPC with retries.

        Args:
          callback: a function to call, typically a bigtable row
            mutation.commit
          mode: A string to indicate what kind of db operation this is
            "read", "write", "delete".
          *args: args to pass to the callback
          **kwargs: keyword args to pass to the callback

        Returns:
          Callback result.

        Raises:
          AccessError: if we hit our RPC retry limit, or the RPC error isn't
            retryable.
          ValueError: if you pass an unknown operation in mode.
        """
        if mode not in set(["read", "write", "delete"]):
            raise ValueError("Mode must be 'read', 'write', 'delete'")

        retry_count = 0
        # Pre-bound so the failure report below cannot hit an unbound name
        # when the configured retry limit allows no attempt at all.
        last_error = None
        last_traceback = ""
        sleep_interval = config_lib.CONFIG["CloudBigtable.retry_interval"]
        while retry_count < config_lib.CONFIG[
                "CloudBigtable.retry_max_attempts"]:
            try:
                response = callback(*args, **kwargs)
                return response
            except (face.ExpirationError, face.AbortionError) as e:
                last_error = e
                last_traceback = traceback.format_exc()
                # Reported through logging rather than printed to stdout.
                logging.info("Retrying %s", last_traceback)

            time.sleep(sleep_interval.seconds)
            logging.info("Retrying callback: %s", callback)
            retry_count += 1
            stats.STATS.IncrementCounter(
                "grr_cloud_bigtable_%s_retries" % mode)
            sleep_interval *= config_lib.CONFIG[
                "CloudBigtable.retry_multiplier"]

        stats.STATS.IncrementCounter("grr_cloud_bigtable_%s_failures" % mode)
        logging.error("Gave up on %s %s after %s retries. %s", mode, callback,
                      retry_count, last_traceback)
        raise AccessError(
            "Giving up on %s callback:%s after %s retries. Last error: %s." %
            (mode, callback, retry_count, last_error))

    def _WrapCallWithRetry(self, argstuple):
        """Workaround not being able to pass kwargs to threadpool callback."""
        callargs, kwargs = argstuple
        return self.CallWithRetry(*callargs, **kwargs)

    def _SortResultsByAttrTimestampValue(self, result_list):
        """Sort order: attribute ASC, timestamp DESC, value ASC."""
        # Each entry is an (attribute, value, timestamp) tuple.
        return sorted(result_list, key=lambda r: (r[0], -r[2], r[1]))

    def _GetSubjectResults(self, result, limit):
        """Flatten a row into (attribute, value, timestamp) tuples.

        Returns:
          (results, remaining limit); collection stops once the limit is
          used up.
        """
        subject_results = []
        for attribute, cells in result.to_dict().iteritems():
            for cell in cells:
                subject_results.append(
                    (attribute, self.Decode(attribute, cell.value),
                     self.DatetimeToMicroseconds(cell.timestamp)))
                limit -= 1
                if limit <= 0:
                    return subject_results, limit
        return subject_results, limit

    def MultiResolvePrefix(self,
                           subjects,
                           attribute_prefix,
                           timestamp=None,
                           limit=None,
                           token=None):
        """Get results from multiple rows matching multiple attributes.

        We could implement this using read_rows, but it is a table scan. Our
        current data model makes that slow because it is a directory
        hierarchy that includes entries for subdirectories interleaved. So
        if you want all the results for a directory you need to skip those
        in the scan.

        Instead we make an RPC for each subject all at once using a
        threadpool. We pay more in RPC overhead but we get to do it
        concurrently.

        Args:
          subjects: A list of subjects.
          attribute_prefix: The attribute prefix.
          timestamp: A range of times for consideration (In microseconds).
            Can be a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or
            a tuple of ints (start, end).
          limit: The total number of result values to return.
          token: An ACL token.

        Yields:
          A list of tuples:
          (subject, [(attribute, value string, timestamp)])

          that can be simply converted to a dict.

          Values with the same attribute (happens when timestamp is not
          NEWEST_TIMESTAMP, but ALL_TIMESTAMPS or time range) are guaranteed
          to be ordered in the decreasing timestamp order.

        Raises:
          AccessError: if anything goes wrong.
          ValueError: if we get a string instead of a list of subjects.
        """
        self.security_manager.CheckDataStoreAccess(
            token, subjects, self.GetRequiredResolveAccess(attribute_prefix))

        if isinstance(subjects, basestring):
            raise ValueError(
                "Expected list of subjects, got string: %s" % subjects)

        if isinstance(attribute_prefix, basestring):
            attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
        else:
            attribute_prefix_list = [
                utils.SmartStr(x) for x in attribute_prefix
            ]

        timestamp_filter = self._TimestampToFilter(timestamp)
        filter_union = []

        for attribute_prefix in attribute_prefix_list:
            family, column = self.GetFamilyColumn(attribute_prefix)

            family_filter = row_filters.FamilyNameRegexFilter(family)
            row_filter_list = [family_filter]

            if column:
                # Make it an actual regex
                column += ".*"
                col_filter = row_filters.ColumnQualifierRegexFilter(column)
                row_filter_list.append(col_filter)

            if timestamp_filter:
                row_filter_list.append(timestamp_filter)

            if len(row_filter_list) > 1:
                row_filter = row_filters.RowFilterChain(
                    filters=row_filter_list)
            else:
                row_filter = row_filter_list[0]

            filter_union.append(row_filter)

        # More than one set of prefixes, use a union, otherwise just use the
        # existing filter chain.
        if len(filter_union) > 1:
            attribute_filter = row_filters.RowFilterUnion(
                filters=filter_union)
        else:
            attribute_filter = filter_union[0]

        # Apply those filters to each subject as a separate RPC using a
        # threadpool
        pool_args = []
        original_subject_map = {}
        for subject in subjects:
            # List of *args, **kwargs to pass to the RPC caller
            pool_args.append(
                ((self.table.read_row, "read", utils.SmartStr(subject)), {
                    "filter_": attribute_filter
                }))

            # We're expected to return subjects as their original type,
            # which can be URN, unicode, or string. Keep a mapping in this
            # dict.
            original_subject_map[utils.SmartStr(subject)] = subject

        max_results = limit or 2**64
        for result in self.pool.imap_unordered(self._WrapCallWithRetry,
                                               pool_args):
            if max_results <= 0:
                break
            if result:
                subject_results, max_results = self._GetSubjectResults(
                    result, max_results)
                yield original_subject_map[
                    result.row_key], self._SortResultsByAttrTimestampValue(
                        subject_results)

    @utils.Synchronized
    def Flush(self):
        """Wait for threadpool jobs to finish, then make a new pool."""
        self.pool.close()
        self.pool.join()
        self.pool = ThreadPool(
            config_lib.CONFIG["CloudBigtable.threadpool_size"])

    def Resolve(self, subject, attribute, token=None):
        """Retrieve the latest value set for a subject's attribute.

        Args:
          subject: The subject URN.
          attribute: The attribute.
          token: The security token used in this call.

        Returns:
          A (string, timestamp in microseconds) stored in the bigtable
          cell, or (None, 0).

        Raises:
          AccessError: if anything goes wrong.
        """
        subject = utils.SmartStr(subject)
        self.security_manager.CheckDataStoreAccess(
            token, [subject], self.GetRequiredResolveAccess(attribute))

        attribute = utils.SmartStr(attribute)
        family, column = self.GetFamilyColumn(attribute)

        col_filter = row_filters.ColumnRangeFilter(
            family, start_column=column, end_column=column)

        # Most recent
        latest_filter = row_filters.CellsColumnLimitFilter(1)

        row_filter = row_filters.RowFilterChain(
            filters=[col_filter, latest_filter])
        row_data = self.table.read_row(subject, filter_=row_filter)

        if row_data:
            for cell in row_data.cells[family][column]:
                return self.Decode(
                    attribute,
                    cell.value), self.DatetimeToMicroseconds(cell.timestamp)

        return None, 0

    def ResolveMulti(self,
                     subject,
                     attributes,
                     timestamp=None,
                     limit=None,
                     token=None):
        """Resolve multiple attributes for a subject.

        Results will be returned in arbitrary order (i.e. not ordered by
        attribute or timestamp).

        Args:
          subject: The subject to resolve.
          attributes: The attribute string or list of strings to match.
            Note this is an exact match, not a regex.
          timestamp: A range of times for consideration (In microseconds).
            Can be a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or
            a tuple of ints (start, end).
          limit: The maximum total number of results we return.
          token: The security token used in this call.

        Yields:
          A unordered list of (attribute, value string, timestamp).

        Raises:
          AccessError: if anything goes wrong.
        """
        subject = utils.SmartStr(subject)
        self.security_manager.CheckDataStoreAccess(
            token, [subject], self.GetRequiredResolveAccess(attributes))

        if isinstance(attributes, basestring):
            attributes = [utils.SmartStr(attributes)]
        else:
            attributes = [utils.SmartStr(x) for x in attributes]

        filter_union = []
        for attribute in attributes:
            family, column = self.GetFamilyColumn(attribute)
            col_filter = row_filters.ColumnRangeFilter(
                family, start_column=column, end_column=column)
            filter_union.append(col_filter)

        # More than one attribute, use a union, otherwise just use the
        # existing filter.
        if len(filter_union) > 1:
            filter_union = row_filters.RowFilterUnion(filters=filter_union)
        else:
            filter_union = filter_union[0]

        # Essentially timestamp AND (attr1 OR attr2)
        timestamp_filter = self._TimestampToFilter(timestamp)
        if timestamp_filter:
            row_filter = row_filters.RowFilterChain(
                filters=[filter_union, timestamp_filter])
        else:
            row_filter = filter_union

        row_data = self.CallWithRetry(
            self.table.read_row, "read", subject, filter_=row_filter)

        if row_data:
            max_results = limit or 2**64
            # Walk every family present in the row: the requested attributes
            # may span several families, not only the one parsed last above.
            for family, column_dict in row_data.cells.iteritems():
                for column, cells in column_dict.iteritems():
                    attribute = ":".join((family, column))
                    for cell in cells:
                        if max_results <= 0:
                            # A plain return ends the generator; raising
                            # StopIteration here is an error under PEP 479.
                            return
                        max_results -= 1
                        yield attribute, self.Decode(
                            attribute,
                            cell.value), self.DatetimeToMicroseconds(
                                cell.timestamp)

    def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
        """Build one filter matching any of the given attributes."""
        filters = []
        for attribute_prefix in attributes:
            family, column = self.GetFamilyColumn(attribute_prefix)

            family_filter = row_filters.FamilyNameRegexFilter(family)
            row_filter_list = [family_filter]

            if column:
                col_filter = row_filters.ColumnQualifierRegexFilter(column)
                row_filter_list.append(col_filter)

            if timestamp_filter:
                row_filter_list.append(timestamp_filter)

            if len(row_filter_list) > 1:
                row_filter = row_filters.RowFilterChain(
                    filters=row_filter_list)
            else:
                row_filter = row_filter_list[0]

            filters.append(row_filter)

        # More than one attribute, use a union, otherwise just use the
        # existing filter.
        if len(filters) > 1:
            filters = row_filters.RowFilterUnion(filters=filters)
        else:
            filters = filters[0]

        return filters

    def _ReOrderRowResults(self, row_data):
        """Map each attribute to its (timestamp, value) entries, newest first.

        An attribute with exactly one entry maps to that single tuple
        instead of a one-element list.
        """
        subject_results = {}
        for family, column_dict in row_data.cells.iteritems():
            for column, cells in column_dict.iteritems():
                attribute = ":".join((family, column))
                subject_results[attribute] = []
                for cell in cells:
                    subject_results[attribute].append(
                        (self.DatetimeToMicroseconds(cell.timestamp),
                         self.Decode(attribute, cell.value)))

                subject_results[attribute] = sorted(
                    subject_results[attribute], key=lambda x: -x[0])
                if len(subject_results[attribute]) == 1:
                    subject_results[attribute] = subject_results[attribute][0]
        return subject_results

    def ScanAttributes(self,
                       subject_prefix,
                       attributes,
                       after_urn=None,
                       max_records=None,
                       token=None,
                       relaxed_order=False):
        """Read the latest attribute values of all rows under a prefix.

        Returns:
          A list of (subject, results) tuples sorted by subject.
        """
        subject_prefix = self._CleanSubjectPrefix(subject_prefix)
        after_urn = self._CleanAfterURN(after_urn, subject_prefix)
        # Turn subject prefix into an actual regex
        subject_prefix += ".*"
        self.security_manager.CheckDataStoreAccess(token, [subject_prefix],
                                                   "rq")

        subject_filter = row_filters.RowKeyRegexFilter(
            utils.SmartStr(subject_prefix))
        latest_value = row_filters.CellsColumnLimitFilter(1)
        attribute_filters = self._GetAttributeFilterUnion(attributes)
        # Subject AND (attr1 OR attr2) AND latest_value
        query_filter = row_filters.RowFilterChain(
            [subject_filter, attribute_filters, latest_value])

        # The API results include the start row, we want to exclude it,
        # append a null to do so.
        if after_urn is not None:
            after_urn += "\x00"

        rows_data = self.CallWithRetry(
            self.table.read_rows,
            "read",
            start_key=after_urn,
            limit=max_records,
            filter_=query_filter)

        # Ideally we should be able to stream and yield, but it seems we
        # can't:
        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/1812
        self.CallWithRetry(rows_data.consume_all, "read")

        results = []
        if rows_data.rows:
            for subject, row_data in rows_data.rows.iteritems():
                subject_results = self._ReOrderRowResults(row_data)
                results.append((subject, subject_results))
        return sorted(results, key=lambda x: x[0])
"""Benchmark a thread-pooled primality test over the first LIMIT integers."""
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from timeit import default_timer

from pyprimes import isprime_division as isprime

LIMIT = 1000000
# cpu_count must be imported explicitly; it is not provided by
# multiprocessing.pool.
CONCURRENCY = cpu_count()


def check_prime(num):
    """Return ``(is_prime, num)`` for the given number."""
    return isprime(num), num


class benchmark(object):
    """Context manager that prints the wall time spent inside its block."""

    # staticmethod keeps the timer from ever being bound to the instance.
    timer = staticmethod(default_timer)

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = self.timer()

    def __exit__(self, ty, val, tb):
        end = self.timer()
        print("%s : %0.3f seconds" % (self.name, end-self.start))
        # Never suppress an exception raised inside the block.
        return False


pool = ThreadPool(CONCURRENCY)
print("Starting...")
with benchmark("multithreaded primality test"):
    results = pool.map_async(check_prime, xrange(LIMIT))
    results.get()
# No further work will be submitted; let the worker threads exit.
pool.close()
pool.join()
print("{0} prime(s) detected.".format(sum(1 for res in results.get() if res[0])))
def run_parallel(func, args, threads, callback):
    """Map *func* over *args* on a pool of *threads* workers.

    *callback* is handed to ``map_async`` and receives the list of results.
    The call blocks until the pool has finished all of its work.
    """
    workers = Pool(processes=threads)
    workers.map_async(func, args, callback=callback)
    # Nothing else will be submitted, so close the pool and wait for it.
    workers.close()
    workers.join()
ok = verify_file_cmd(args, file, cmd) status = 'OK ' if ok else 'FAIL' report_write(args, '%s %s\n' % (status, file)) if not ok: time.sleep(0.5) # to break it with Ctrl+C elif args.mode in ('write', 'append'): if os.path.exists(args.out): backup_name = args.out + '.orig' os.system('cp %s %s' %\ (escape_file(args.out), escape_file(backup_name))) if args.out == '-': args.o = sys.stdout elif args.mode == 'write': args.o = open(args.out, 'w') elif args.mode == 'append': args.o = open(args.out, 'a') base_dir = args.dir files = list_files(base_dir) if args.mode in ('write', 'append') \ and args.reuse in ('yes', 'verify'): files.sort(key=lambda file: 0 if file in file2cmd else 1) if append: o_write(args, "# PlowBackup begin\n") pool = ThreadPool(args.workers) async_result = pool.map_async(do_upload, files) while not async_result.ready(): time.sleep(1) # to break it with Ctrl+C if append: o_write(args, "# PlowBackup end\n")
from multiprocessing.pool import ThreadPool
import random
from itertools import count
import time


def process(index):
    """Simulate a slow task: sleep 1-10 seconds, then return a done marker."""
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("trying")
    time.sleep(random.randint(1, 10))
    val = 'Done {}'.format(index)
    return val


def callback(x):
    """Print the full list of results once every task has succeeded."""
    print(x)


pool = ThreadPool(10)
# Chunk size 3: tasks are handed to the workers three at a time.
pool.map_async(process, range(10), 3, callback=callback)
print('Done Done')
# The pool's worker threads are daemons: without waiting here the script
# would reach its end and exit before the tasks (and the callback) ran.
pool.close()
pool.join()
class Impala(Service):
    """An Impala service running on a cluster.

    Provides helpers for common chores such as opening a connection to an
    impalad or checking whether any queries are still running. Operations
    that touch every impalad are fanned out through a shared thread pool.
    """

    def __init__(self, cluster, impalads):
        Service.__init__(self, cluster)
        self.impalads = impalads
        # Give each impalad a back-reference to the service that owns it.
        for impalad in impalads:
            impalad.impala = self
        self._thread_pool = ThreadPool()

    @property
    def warehouse_dir(self):
        """The warehouse directory of the cluster's Hive service."""
        return self.cluster.hive.warehouse_dir

    def connect(self, db_name=None, impalad=None):
        """Open a connection to 'impalad', or to a random impalad if none is given."""
        target = impalad or choice(self.impalads)
        connection = ImpalaConnection(
            host_name=target.host_name,
            port=target.hs2_port,
            user_name=self.cluster.hadoop_user_name,
            db_name=db_name,
            use_kerberos=self.cluster.use_kerberos,
            use_ssl=self.cluster.use_ssl,
            ca_cert=self.cluster.ca_cert,
        )
        connection.cluster = self.cluster
        return connection

    @contextmanager
    def cursor(self, db_name=None, impalad=None):
        """Yield a cursor on a fresh connection; both are closed on exit."""
        with self.connect(db_name=db_name, impalad=impalad) as conn, \
                conn.cursor() as cur:
            yield cur

    def find_stopped_impalads(self):
        """Return the impalads for which no pid could be found."""
        pids = self.for_each_impalad(lambda impalad: impalad.find_pid())
        return [self.impalads[idx] for idx, pid in enumerate(pids) if not pid]

    def find_and_set_path_to_running_impalad_binary(self):
        self.for_each_impalad(
            lambda impalad: impalad.find_and_set_path_to_running_binary())

    def cancel_queries(self):
        self.for_each_impalad(lambda impalad: impalad.cancel_queries())

    def get_version_info(self):
        """Return a dict mapping each impalad to its version info."""
        return self.for_each_impalad(
            lambda impalad: impalad.get_version_info(), as_dict=True)

    def queries_are_running(self):
        """Return True if any impalad reports running queries."""
        statuses = self.for_each_impalad(
            lambda impalad: impalad.queries_are_running())
        return any(statuses)

    def find_impalad_mem_mb_limit(self):
        return self.for_each_impalad(
            lambda impalad: impalad.find_process_mem_mb_limit())

    def find_impalad_mem_mb_reported_usage(self):
        return self.for_each_impalad(
            lambda impalad: impalad.find_reported_mem_mb_usage())

    def find_impalad_mem_mb_actual_usage(self):
        return self.for_each_impalad(
            lambda impalad: impalad.find_actual_mem_mb_usage())

    def find_crashed_impalads(self, start_time):
        """Return a crash info message for every impalad that is not running.

        Any impalad found stopped is assumed to have crashed. The result is a
        dict keyed by impalad; see Impalad.find_last_crash_message() for
        details about the messages. 'start_time' is used to filter log
        messages and core dumps and should be the time at which the Impala
        service was started. Impalads with non-generic crash info come last
        in the returned dict.
        """
        stopped = self.find_stopped_impalads()
        if not stopped:
            return dict.fromkeys(stopped)
        crash_messages = self.for_each_impalad(
            lambda impalad: impalad.find_last_crash_message(start_time),
            impalads=stopped)
        report = OrderedDict()
        detailed = dict()
        for impalad, message in izip(stopped, crash_messages):
            if not message:
                report[impalad] = (
                    "%s crashed but no info could be found" % impalad.host_name)
                continue
            detailed[impalad] = "%s crashed:\n%s" % (impalad.host_name, message)
        # Appending the detailed entries afterwards places them last.
        report.update(detailed)
        return report

    def for_each_impalad(self, func, impalads=None, as_dict=False):
        """Run 'func' on each impalad in parallel and collect the results.

        'impalads' defaults to every impalad of the service. The results are
        returned as a list in impalad order, or as a dict keyed by impalad if
        'as_dict' is true.
        """
        targets = self.impalads if impalads is None else impalads
        # Python doesn't handle ctrl-c well unless a timeout is provided.
        results = self._thread_pool.map_async(func, targets).get(maxint)
        if as_dict:
            return dict(izip(targets, results))
        return results

    def restart(self):
        raise NotImplementedError()