def testingStepFinished(self, testingStep, executionSessions):
    pool = multiprocessing.Pool(self.config['testing_video_generation_processes'], maxtasksperchild=1)

    futures = []
    for session in executionSessions:
        future = pool.apply_async(func=createDebugVideoSubProcess,
                                  args=(self.config.serialize(), str(session.id), "",
                                        False, False, None, None, "annotated_videos"))
        futures.append((session, future))

    for session, future in futures:
        value = future.get(timeout=self.config['debug_video_generation_timeout'])
        if value:
            getLogger().error(value)
        # Retry logic (disabled): on billiard.exceptions.WorkerLostError or
        # BrokenPipeError, resubmit createDebugVideoSubProcess up to 5 times
        # before re-raising.

    pool.close()
    pool.join()
def getMovies():
    '''Fetch information pertaining to all released movies'''
    r = requests.get(settings.MOVIES_LINK)
    movies_objects = []

    if r.status_code == 200:
        try:
            soup = BeautifulSoup(r.text, 'html5lib')
            movies_html = soup.find('ul', {'class': 'drop-list masterBorderColor'})
            movies = [settings.NOS_CINEMAS_URL + movie['href']
                      for movie in movies_html.find_all('a', {'class': 'list-item'})]

            # Partially apply getMovie so pool workers only receive the URL
            getMoviePart = partial(getMovie, released=True)
            with multiprocessing.Pool(15) as proc_pool:
                movies_objects = proc_pool.map(getMoviePart, movies)
        except Exception as e:
            print(f"[getMovies] Error while parsing HTML: {str(e)}")
            raise Exception(f"[getMovies] {str(e)}")
    else:
        print("[getMovies] Could not fetch the movie list")
        raise Exception(f"GET {settings.MOVIES_LINK} returned unexpected response code: {r.status_code}")

    return list(filter(lambda x: x is not None, movies_objects))
def generateAllCharts(config, applicationId=None, enableCumulativeCoverage=False):
    getLogger().info("Generating charts based on results.")

    pool = multiprocessing.Pool(config['chart_generation_workers'], initializer=setupLocalLogging)

    futures = []
    futures.append(pool.apply_async(generateRewardChart, [config.serialize(), applicationId]))
    futures.append(pool.apply_async(generateFitnessChart, [config.serialize(), applicationId]))
    futures.append(pool.apply_async(generateTracesWithNewBranchesChart, [config.serialize(), applicationId]))
    if enableCumulativeCoverage:
        futures.append(pool.apply_async(generateCoverageChart, [config.serialize(), applicationId]))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'totalLosses', "Total Loss", 'total_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'presentRewardLosses', "Present Reward Loss", 'present_reward_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'discountedFutureRewardLosses', "Discounted Future Reward Loss", 'discounted_future_reward_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'stateValueLosses', "State Value Loss", 'state_value_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'advantageLosses', "Advantage Loss", 'advantage_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'actionProbabilityLosses', "Action Probability Loss", 'action_probability_loss_chart.png']))
    if config['chart_enable_cumulative_coverage_chart'] and enableCumulativeCoverage:
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 100]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 25]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 10]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 5]))
    if config['chart_enable_cumulative_errors_chart']:
        futures.append(pool.apply_async(generateCumulativeErrorsFoundChart, [config.serialize(), applicationId]))

    for future in futures:
        future.get()

    pool.close()
    pool.join()
    getLogger().info("Completed generating all the charts.")
def get_intersection_buffers(roads, road_bounds, intersection_buffer_units, tile_max_units):
    """Buffers all intersections

    :param roads: List of shapely geometries representing road segments
    :param road_bounds: Bounding box of the roads shapefile
    :param intersection_buffer_units: Number of units to use for buffer radius
    :param tile_max_units: Maximum number of units for each side of a tile
    """
    # As an optimization, the road network is divided up into a grid of tiles,
    # and intersections are calculated within each tile.
    def roads_per_tile_iter():
        """Generator which yields a set of roads for each tile"""
        min_x, min_y, max_x, max_y = road_bounds
        bounds_width = max_x - min_x
        bounds_height = max_y - min_y
        x_divisions = ceil(bounds_width / tile_max_units)
        y_divisions = ceil(bounds_height / tile_max_units)
        tile_width = bounds_width / x_divisions
        tile_height = bounds_height / y_divisions

        # Create a spatial index for roads to efficiently match up roads to tiles
        logger.info('Generating spatial index for intersections')
        roads_index = rtree.index.Index()
        for idx, road in enumerate(roads):
            roads_index.insert(idx, road.bounds)

        logger.info('Number of tiles: {}'.format(int(x_divisions * y_divisions)))
        for x_offset in range(0, int(x_divisions)):
            for y_offset in range(0, int(y_divisions)):
                road_ids_in_tile = roads_index.intersection([
                    min_x + x_offset * tile_width,
                    min_y + y_offset * tile_height,
                    min_x + (1 + x_offset) * tile_width,
                    min_y + (1 + y_offset) * tile_height
                ])
                roads_in_tile = [roads[road_id] for road_id in road_ids_in_tile]
                if len(roads_in_tile) > 1:
                    yield roads_in_tile

    # Allocate one worker per core, and parallelize the discovery of intersections
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    tile_intersections = pool.imap(get_intersections, roads_per_tile_iter())
    pool.close()
    pool.join()

    logger.info('Buffering intersections')
    # Note: tile_intersections is a list of multipoints (which is a list of points).
    # itertools.chain.from_iterable flattens the list into a list of single points.
    buffered_intersections = [intersection.buffer(intersection_buffer_units)
                              for intersection in itertools.chain.from_iterable(tile_intersections)]

    # If intersection buffers overlap, union them to treat them as one
    logger.info('Performing unary union on buffered intersections')
    return unary_union(buffered_intersections)
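A usage sketch for get_intersection_buffers(), assuming the module-level get_intersections(roads) helper returns the intersection points for a list of roads (it is referenced above but not shown); the two crossing segments are illustrative:

from shapely.geometry import LineString
from shapely.ops import unary_union

if __name__ == '__main__':
    # Two diagonal roads crossing at (5, 5)
    roads = [LineString([(0, 0), (10, 10)]), LineString([(0, 10), (10, 0)])]
    road_bounds = unary_union(roads).bounds  # (min_x, min_y, max_x, max_y)
    buffered = get_intersection_buffers(roads, road_bounds,
                                        intersection_buffer_units=1.0,
                                        tile_max_units=20.0)  # one tile covers everything
    print(buffered.area)  # ~pi for a single unit-radius intersection buffer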
def _grid_optimize_pool(self, X, Y, regressor_class, grid):
    pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
    pool_res = []
    results = []

    for params in self._grid_iterator(grid):
        regressor = regressor_class(**params)
        r = pool.apply_async(_pickle_bypass, (self, "_cross_validation", X, Y, regressor))
        pool_res.append((r, params))

    for r, params in pool_res:
        start_time = time()
        err_msg = False
        try:
            score, start_time = r.get(self.timeout + 5)
            if self.verbose > 1:
                print("* %.5f *" % score, regressor_class.__name__, params)
            results.append((score, params))
        except billiard.SoftTimeLimitExceeded:
            print("TIME LIMIT")
            err_msg = True
        except billiard.TimeoutError:
            print("* TIMEOUT *")
            err_msg = True
        if self.verbose > 1 or err_msg:
            print("time: %.2fs" % (time() - start_time))

    pool.terminate()  # TODO: still not sure this is the right shutdown for billiard
    pool.join()
    if self.verbose > 1:
        print("Terminated")

    results.sort(reverse=self.maximise)
    return results[0] if results else None
def computeCumulativeCoverageForTestingSteps(testingStepIds, config):
    futures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for stepId in testingStepIds:
        futures.append(pool.apply_async(computeCumulativeBranchTraceForTestingSteps, [stepId, config]))

    cumulativeBranchTrace = {}

    for future in futures:
        branchTrace = future.get()
        for fileName in branchTrace:
            if fileName not in cumulativeBranchTrace:
                cumulativeBranchTrace[fileName] = branchTrace[fileName]
            else:
                cumulativeBranchTrace[fileName] = cumulativeBranchTrace[fileName].maximum(branchTrace[fileName])

    total = 0
    executedAtleastOnce = 0
    for fileName in cumulativeBranchTrace:
        total += cumulativeBranchTrace[fileName].shape[0]
        executedAtleastOnce += len(numpy.nonzero(cumulativeBranchTrace[fileName])[0])

    # Guard against division by zero
    if total == 0:
        total += 1

    pool.close()
    pool.join()

    return float(executedAtleastOnce) / float(total), executedAtleastOnce, total
def multiprocess(tasks, pool_size=get_num_cpus()):
    """
    Executes several tasks concurrently via Python ``multiprocessing``
    processes, puts the results into a queue, and yields them back to
    the caller.
    """
    pool = billiard.Pool(pool_size)
    result_q = Queue()

    def build_results(result):
        if type(result) in (types.GeneratorType, list, tuple, set):
            for r in result:
                result_q.put(r)
        else:
            result_q.put(result)

    # Submit all tasks first, then wait for them; calling get() inside the
    # submit loop would serialize the work instead of running it concurrently.
    runs = [pool.apply_async(task.func, args=task.args, callback=build_results)
            for task in tasks]
    for run in runs:
        run.get()

    pool.close()
    pool.join()

    while not result_q.empty():
        result = result_q.get_nowait()
        yield result
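The function above only assumes each task exposes .func and .args; a minimal usage sketch with a hypothetical Task namedtuple (not part of the source):

from collections import namedtuple

Task = namedtuple('Task', ['func', 'args'])

def square(n):
    # Worker function; must be importable at module level so it can be pickled
    return n * n

if __name__ == '__main__':
    # Eight independent tasks spread over four worker processes
    tasks = [Task(func=square, args=(n,)) for n in range(8)]
    for result in multiprocess(tasks, pool_size=4):
        print(result)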
def _cross_validation_pool(self, X, Y, regressor):
    pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
    kf = cv.KFold(X.shape[0], n_folds=self.folds, shuffle=self.shuffle,
                  random_state=self.random_state)
    Y_prime = np.zeros(Y.shape)
    cv_res = []

    for train_ind, test_ind in kf:
        X_train, X_test, Y_train = X[train_ind], X[test_ind], Y[train_ind]
        r = pool.apply_async(regressor, (X_train, Y_train, X_test))
        cv_res.append((r, test_ind))

    try:
        for r, test_ind in cv_res:
            if len(Y_prime.shape) == 1:
                Y_prime[test_ind] = r.get(self.timeout + 5)
            else:
                Y_prime[test_ind, :] = r.get(self.timeout + 5)
    except (billiard.SoftTimeLimitExceeded, billiard.TimeoutError):
        pool.terminate()
        if self.verbose > 2:
            print("TERMINATED")
        return None

    pool.close()
    pool.join()
    if self.verbose > 2:
        print("CLOSED")
    return Y_prime
def concurrent_twitter_query_wad(self, username, tweets_num=10, threads=2):
    if current_cred >= len(all_creds):
        raise Exception('API keys exhausted.')

    try:
        all_tweets = twitterdata.get_all_tweets(username, tweets_num)
    except:
        raise

    try:
        tweets_output = getOutput(all_tweets)
    except:
        raise

    pool = mp.Pool(threads)
    try:
        pool_results = pool.starmap(wad_helper, [(tweets_output, 'weapon'),
                                                 (tweets_output, 'drugs'),
                                                 (tweets_output, 'alcohol')])
        self.update_state(state="PROGRESS")
    except:
        raise
    finally:
        pool.terminate()

    weapon_vals = pool_results[0]
    drug_vals = pool_results[1]
    alcohol_vals = pool_results[2]

    print('Data acquired concurrently for user: ' + username)
    result = {
        'weapons': weapon_vals,
        'alcohol': alcohol_vals,
        'drugs': drug_vals
    }
    result = {username: result}
    print(result)
    self.update_state("FINISHED")
    return {"result": result}
def do_prediction(self, intbl, selections, gene_names,
                  filteropt=1, filterval=1, spec_ecutoff=0.4, nonspec_ecutoff=0.35):
    '''
    intbl: preprocessed table
    filteropt: 1 for highest t-val, 2 for p-val cutoff
    filterval: # TFs for opt 1 and p-val cutoff for opt 2
    '''
    if type(intbl) is str:  # got an error in the pipeline from inittbl
        return {'current': 1, 'total': 1, 'error': intbl}

    # intbl columns: rowidx, seq, val, diff, t, pbmname, escore_seq
    start_time = time.time()

    predfiles = [app.config['PREDDIR'] + "/" + s for s in selections]  # os.listdir(preddir)
    # Chunk the predfiles, one chunk per process
    preds = [l for l in utils.chunkify(predfiles, app.config['PCOUNT']) if len(l) != 0]

    # Collect the short2long_map -- shared, so only one I/O
    emap = pd.read_csv("%s/index_short_to_long.csv" % (app.config["ESCORE_DIR"]),
                       header=0, index_col=0, sep=',', dtype='Int32')
    emap = np.array(emap[emap.columns[0]]) - 1

    # ---- MULTIPROCESSING PART ----
    pool = mp.Pool(processes=app.config['PCOUNT'])
    # A Manager is needed here so the counter is shared across processes
    shared_ready_sum = mp.Manager().Value('i', 0)

    predict_partial = ft.partial(predict, **{
        'dataset': intbl, 'ready_count': shared_ready_sum, 'emap': emap,
        'filteropt': filteropt, 'filterval': filterval,
        'spec_ecutoff': spec_ecutoff, 'nonspec_ecutoff': nonspec_ecutoff
    })
    async_pools = [pool.apply_async(predict_partial, (preds[i],)) for i in range(0, len(preds))]

    # Run the job, updating the progress bar as workers finish
    total = len(predfiles)
    while not all([p.ready() for p in async_pools]):
        time.sleep(2)  # important: avoid busy-waiting on every loop iteration
        self.update_state(state='PROGRESS',
                          meta={'current': shared_ready_sum.value, 'total': total,
                                'status': 'Processing input data...'})
    res = [p.get() for p in async_pools]
    self.update_state(state='PROGRESS',
                      meta={'current': shared_ready_sum.value, 'total': total,
                            'status': 'post-processing'})

    print("Terminate all children process..")
    pool.terminate()  # terminate to kill all child processes and avoid a memory leak

    datavalues = postprocess(res, predfiles, gene_names, filteropt, filterval)

    # Persist the values; significance_score can be a z-score or p-value
    # depending on the out_type
    savetomongo(self.request.id, datavalues.to_dict('records'), app.config['USER_DATA_EXPIRY'])

    return {'current': shared_ready_sum.value, 'total': len(predfiles),
            'status': 'Task completed!', 'result': 'done',
            'taskid': self.request.id, 'time': (time.time() - start_time)}
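The shared progress counter above is the standard Manager().Value pattern: a proxy object is handed to every pool worker, each increments it, and the parent polls it to report progress. A self-contained sketch (all names illustrative, not from the source):

import multiprocessing as mp
import time

def work(item, ready_count):
    time.sleep(0.1)          # simulate one chunk of prediction work
    ready_count.value += 1   # not atomic; add a Manager().Lock() if exact counts matter
    return item * 2

if __name__ == '__main__':
    manager = mp.Manager()
    ready = manager.Value('i', 0)
    with mp.Pool(4) as pool:
        asyncs = [pool.apply_async(work, (i, ready)) for i in range(8)]
        while not all(a.ready() for a in asyncs):
            time.sleep(0.2)
            print("progress: %d/8" % ready.value)
        results = [a.get() for a in asyncs]
    print(results)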
def fast_calculate_features(self, words):
    import billiard
    pool = billiard.Pool(4)
    feats = pool.map(bio_chunk_features, words)
    pool.close()
    return feats
def generateCoverageChart(config, applicationId):
    getLogger().info("Generating the coverage chart")
    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId)
         if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    coverageValueFutures = []
    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])
    for step in testingSteps:
        coverageValueFutures.append(pool.apply_async(computeCumulativeCoverageForTestingSteps, [[step.id], config]))

    coverageValues = [future.get()[0] for future in coverageValueFutures]
    executedLinesValues = [future.get()[1] for future in coverageValueFutures]
    totalLinesValues = [future.get()[2] for future in coverageValueFutures]

    coverageValues = scipy.signal.medfilt(coverageValues, kernel_size=9)
    executedLinesValues = scipy.signal.medfilt(executedLinesValues, kernel_size=9)
    totalLinesValues = scipy.signal.medfilt(totalLinesValues, kernel_size=9)

    fig, ax = plt.subplots()
    ax.plot(range(len(coverageValues)), coverageValues, color='green')
    ax.set(xlabel='Testing Step #', ylabel='Coverage', title='Code Coverage')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        config.saveKwolaFileData("charts", "coverage_chart.png", f.read())
    os.unlink(localFilePath)

    fig, ax = plt.subplots()
    ax.plot(range(len(executedLinesValues)), executedLinesValues, color='green')
    ax2 = ax.twinx()
    ax2.plot(range(len(totalLinesValues)), totalLinesValues, color='red')
    ax.set(xlabel='Testing Step #', ylabel='Lines Executed (green)', title='Lines Available / Lines Triggered')
    ax2.set(ylabel="Lines Available (red)")
    ax.grid()
    ax2.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        config.saveKwolaFileData("charts", "lines_triggered.png", f.read())
    os.unlink(localFilePath)

    pool.close()
    pool.join()
def _multiple_pool(self, X, Y, regressor_classes, grids):
    # The only valid solution
    pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
    pool_res = []
    results = []
    timer = time()

    if type(grids) is dict:
        grids = [grids] * len(regressor_classes)
    else:
        assert len(regressor_classes) == len(grids)

    for reg_c, grid in izip(regressor_classes, grids):
        for params in self._grid_iterator(grid):
            regressor = reg_c(**params)
            r = pool.apply_async(_pickle_bypass, (self, "_cross_validation", X, Y, regressor))
            pool_res.append((r, reg_c, params))

    for r, reg_c, params in pool_res:
        start_time = time()
        try:
            score, start_time = r.get(self.timeout + 5)
            if self.verbose > 1:
                print("* %.5f *" % score, reg_c, params)
            results.append((score, reg_c, params))
        except billiard.SoftTimeLimitExceeded:
            print("TIME LIMIT")
        except billiard.TimeoutError:
            print("* TIMEOUT *")
        if self.verbose > 1:
            print("time: %.2fs" % (time() - start_time))

    pool.terminate()  # TODO: still not sure this is the right shutdown for billiard
    pool.join()

    best_reg = None
    best_score = -np.inf if self.maximise else np.inf
    best_params = None
    for score, reg_c, prms in results:
        if (self.maximise and score > best_score) or (not self.maximise and score < best_score):
            best_score = score
            best_reg = reg_c
            best_params = prms

    if self.verbose > 1:
        print("multiple done in %.2fs" % (time() - timer))
        print(best_score, best_reg, best_params)
    return best_score, best_reg, best_params
def updateSessionsAvailability(date):
    '''
    Fetch availability for sessions after the given date
    :param date: lower limit for session date
    '''
    print('[updateSessionsAvailability] Updating session information...')

    sessions = Session.objects \
        .filter(start_date__gte=date) \
        .all()

    p = multiprocessing.Pool(processes=15)
    sessions_updated = p.map(getSessionAvailability, sessions)
    p.close()
    p.join()

    Session.objects.bulk_update(sessions_updated, ['availability'])
    print('[updateSessionsAvailability] Sessions updated!')
def testingStepFinished(self, testingStep, executionSessions):
    pool = multiprocessing.Pool(self.config['video_generation_processes'], maxtasksperchild=1)

    futures = []
    for session in executionSessions:
        future = pool.apply_async(func=createDebugVideoSubProcess,
                                  args=(self.config.configurationDirectory, str(session.id), "",
                                        False, False, None, None, "annotated_videos"))
        futures.append(future)

    for future in futures:
        future.get()

    pool.close()
    pool.join()
def load_data():
    pool = mp.Pool(8)
    jobs = []

    # Read the file in ~100KB batches of lines and post each batch asynchronously
    with open('sample.csv', 'r', buffering=(2 << 16)) as f:
        lines = f.readlines(100000)
        while lines:
            jobs.append(pool.apply_async(post_objects, [lines]))
            lines = f.readlines(100000)

    for job in jobs:
        job.get()

    # Clean up
    pool.close()
    pool.join()
def generateCumulativeErrorsFoundChart(configDir, applicationId):
    getLogger().info("Generating the cumulative errors chart")
    config = KwolaCoreConfiguration(configDir)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId)
         if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    bugsByTestingStepId = {step.id: 0 for step in testingSteps}
    for bug in loadAllBugs(config, applicationId):
        if bug.testingStepId in bugsByTestingStepId:
            bugsByTestingStepId[bug.testingStepId] += 1

    cumulativeErrorsFound = []
    currentTotal = 0
    for step in testingSteps:
        currentTotal += bugsByTestingStepId[step.id]
        cumulativeErrorsFound.append(currentTotal)

    fig, ax = plt.subplots()
    ax.plot(range(len(cumulativeErrorsFound)), cumulativeErrorsFound, color='green')
    ax.set(xlabel='Testing Step #', ylabel='Total Errors Found', title='Cumulative Errors Found')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = f"{config.getKwolaUserDataDirectory('charts')}/errors_found.png"
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)
def generateRewardChart(configDir, applicationId):
    getLogger().info("Generating the reward chart")
    config = KwolaCoreConfiguration(configDir)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId)
         if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    rewardValueFutures = []
    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])
    for step in testingSteps:
        rewardValueFutures.append(pool.apply_async(averageRewardForTestingStep, [config, step.id]))

    rewardValues = [future.get() for future in rewardValueFutures if future.get() is not None]

    fig, ax = plt.subplots()
    rewardValues = scipy.signal.medfilt(rewardValues, kernel_size=9)
    ax.plot(range(len(rewardValues)), rewardValues, color='green')
    ax.set_ylim(0, 15)
    ax.set(xlabel='Testing Step #', ylabel='Reward', title='Reward per session')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = f"{config.getKwolaUserDataDirectory('charts')}/reward_chart.png"
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)

    pool.close()
    pool.join()
def generateVideoFilesForBugs(self, testingStep, bugObjects):
    pool = multiprocessing.Pool(self.config['video_generation_processes'], maxtasksperchild=1)

    futures = []
    for bugIndex, bug in enumerate(bugObjects):
        future = pool.apply_async(func=createDebugVideoSubProcess,
                                  args=(self.config.configurationDirectory, str(bug.executionSessionId),
                                        f"{bug.id}_bug", False, False, bug.stepNumber,
                                        bug.stepNumber + 3, "bugs"))
        futures.append(future)

    for future in futures:
        future.get()

    pool.close()
    pool.join()
def generateLossChart(configDir, applicationId, attribute, title, fileName):
    getLogger().info(f"Generating the loss chart for {attribute}")
    config = KwolaCoreConfiguration(configDir)

    trainingStepIds = findAllTrainingStepIds(config, applicationId=applicationId)

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    lossValueFutures = []
    for id in trainingStepIds:
        lossValueFutures.append(pool.apply_async(loadTrainingStepLossData, [config, id, attribute]))

    lossValuesSorted = sorted(
        [future.get() for future in lossValueFutures if future.get()[2] == "completed"],
        key=lambda result: result[1], reverse=False)
    lossValues = [result[0] for result in lossValuesSorted]

    fig, ax = plt.subplots()
    lossValues = scipy.signal.medfilt(lossValues, kernel_size=9)
    ax.plot(range(len(lossValues)), lossValues, color='green')
    ax.set_ylim(0, numpy.percentile(lossValues, 99))
    ax.set(xlabel='Training Step #', ylabel='Loss', title=title)
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = os.path.join(config.getKwolaUserDataDirectory('charts'), fileName)
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)

    pool.close()
    pool.join()
def twitter_bulk_query_wad(self, user_list, tweets_num=10, threads=2):
    if current_cred >= len(all_creds):
        raise Exception('You have reached the daily limit of 1500 requests!')

    print(user_list)
    results = {}

    pool = mp.Pool(threads)
    try:
        pool_results = pool.starmap(twitter_query_wad, [(user, tweets_num) for user in user_list])
        self.update_state(state="PROGRESS")
    except:
        raise
    finally:
        pool.terminate()

    results = [{user_list[i]: pool_results[i]} for i in range(len(user_list))]
    print('twitter_bulk_query_wad() completed.')
    self.update_state(state="FINISHED")
    print(results)
    return {"result": results}
def generateFitnessChart(config, applicationId):
    getLogger().info("Generating the fitness chart")
    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId)
         if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    fitnessValueFutures = []
    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])
    for step in testingSteps:
        fitnessValueFutures.append(pool.apply_async(averageFitnessForTestingStep, [config, step.id]))

    fitnessValues = [future.get() for future in fitnessValueFutures if future.get() is not None]

    if len(fitnessValues) > 0:
        bestFitness = numpy.max(fitnessValues)

        fig, ax = plt.subplots()
        fitnessValues = scipy.signal.medfilt(fitnessValues, kernel_size=9)
        ax.plot(range(len(fitnessValues)), fitnessValues, color='green')
        ax.set_ylim(0, 100)
        ax.set(xlabel='Testing Step #', ylabel='Fitness', title='Fitness per session')
        ax.grid()

        _, localFilePath = tempfile.mkstemp(suffix=".png")
        fig.savefig(localFilePath)
        with open(localFilePath, 'rb') as f:
            config.saveKwolaFileData("charts", "fitness_chart.png", f.read())
        os.unlink(localFilePath)

        getLogger().info(f"Best Fitness Value: {bestFitness}")

    pool.close()
    pool.join()
def getDiagonalOfInverse(matrix, stop_at=0):
    global p_solver
    global p_num_variables

    if __name__ == '__main__' or __name__ == 'gmrf.pool_inverse':
        p_solver = scipy.sparse.linalg.factorized(matrix)
        p_num_variables = matrix.shape[0]
        if stop_at == 0:
            stop_at = p_num_variables

        # The with-block terminates the pool on exit, so no explicit
        # close()/terminate() is needed
        with billiard.Pool() as pool:
            diagonal = pool.map(invOfColumn, [i for i in range(0, stop_at)])
        return np.array(diagonal)
    else:
        return 0
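A usage sketch for getDiagonalOfInverse(), assuming invOfColumn(i) uses the module-global p_solver to solve for column i of the inverse (it is referenced above but not shown), and that processes are started via fork so workers inherit p_solver; the tridiagonal precision matrix is illustrative:

import numpy as np
import scipy.sparse

if __name__ == '__main__':
    # Small sparse symmetric positive-definite matrix in CSC form,
    # as required by scipy.sparse.linalg.factorized
    Q = scipy.sparse.diags([-1.0, 2.0, -1.0], [-1, 0, 1], shape=(50, 50), format='csc')
    diag = getDiagonalOfInverse(Q)
    # Cross-check against a dense inverse
    assert np.allclose(diag, np.diag(np.linalg.inv(Q.toarray())))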
def get_dataset(features: list, dataset_id: str, nthread: int = -1):
    all_features = features
    dataframe = pd.DataFrame()

    if nthread > 0:
        # Load features in chunks of ~32 across a process pool
        for features in np.array_split(all_features, int(len(all_features) / 32) + 1):
            with mp.Pool(nthread) as p:
                partial_create_features = functools.partial(get_feature, dataset_id=dataset_id)
                dataframes = p.map(partial_create_features, features)
            if len(dataframe) > 0:
                dataframes.append(dataframe)
            dataframe = pd.concat(dataframes, axis=1)
            del dataframes  # free the intermediate frames
    else:
        for feature_name in tqdm(features):
            if (feature_name, dataset_id) in FEATURES.keys():
                f = FEATURES[(feature_name, dataset_id)]
                df = f.load_or_create()
                if len(df.columns) == 1:
                    dataframe[feature_name] = df[f.feature_name]
                else:
                    if len(dataframe) > 0:
                        dataframe = pd.concat([dataframe, df], axis=1)
                    else:
                        dataframe = df
            else:
                raise Exception(f"Feature {feature_name} not found")

    # Some columns are not in the format XGB expects, so cast them to the right format
    for column in dataframe.columns:
        if str(dataframe[column].dtype).lower()[:3] == "int":
            dataframe[column] = dataframe[column].fillna(0).astype(np.int64, copy=False)
        elif str(dataframe[column].dtype).lower() == "boolean":
            dataframe[column] = dataframe[column].fillna(False).astype(bool, copy=False)

    return dataframe
def get_dataset_batch(features: list, dataset_id: str, total_n_split: int, split_n: int, sample: float):
    assert split_n < total_n_split, "split_n parameter should be less than total_n_split parameter"

    if sample < 1:
        with mp.Pool(16) as p:
            partial_create_features = functools.partial(
                get_feature_batch, dataset_id=dataset_id,
                total_n_split=total_n_split, split_n=split_n, sample=sample)
            dataframe = pd.concat(p.map(partial_create_features, features), axis=1)
    else:
        dataframe = pd.DataFrame()
        for feature_name in tqdm(features):
            if (feature_name, dataset_id) in FEATURES.keys():
                f = FEATURES[(feature_name, dataset_id)]
                df = np.array_split(f.load_or_create(), total_n_split)[split_n]
                if len(df.columns) == 1:
                    dataframe[feature_name] = df[f.feature_name]
                else:
                    if len(dataframe) > 0:
                        dataframe = pd.concat([dataframe, df], axis=1)
                    else:
                        dataframe = df
            else:
                raise Exception(f"Feature {feature_name} not found")

    # Some columns are not in the format XGB expects, so cast them to the right format
    for column in dataframe.columns:
        if str(dataframe[column].dtype).lower()[:3] == "int":
            dataframe[column] = dataframe[column].fillna(0).astype(np.int64, copy=False)
        elif str(dataframe[column].dtype).lower() == "boolean":
            dataframe[column] = dataframe[column].fillna(False).astype(bool, copy=False)

    return dataframe
def generateTracesWithNewBranchesChart(config, applicationId):
    getLogger().info("Generating the traces with new branches chart")
    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId)
         if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    countTracesWithNewBranchesFutures = []
    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])
    for step in testingSteps:
        countTracesWithNewBranchesFutures.append(pool.apply_async(averageTracesWithNewBranchesForTestingStep, [config, step.id]))

    countTracesWithNewBranchesValues = [future.get() for future in countTracesWithNewBranchesFutures if future.get() is not None]

    if len(countTracesWithNewBranchesValues) > 0:
        fig, ax = plt.subplots()
        countTracesWithNewBranchesValues = scipy.signal.medfilt(countTracesWithNewBranchesValues, kernel_size=9)
        ax.plot(range(len(countTracesWithNewBranchesValues)), countTracesWithNewBranchesValues, color='green')
        ax.set_ylim(0, config['testing_sequence_length'])
        ax.set(xlabel='Testing Step #', ylabel='Traces with new branches', title='# of testing traces that have new branches')
        ax.grid()

        _, localFilePath = tempfile.mkstemp(suffix=".png")
        fig.savefig(localFilePath)
        with open(localFilePath, 'rb') as f:
            config.saveKwolaFileData("charts", "traces_with_new_branches.png", f.read())
        os.unlink(localFilePath)

    pool.close()
    pool.join()
def poolHandle(zip, nid):  # note: the parameter name shadows the builtin zip()
    if DEBUG_LEVEL == 0:
        p = Pool(80)
        for sub in zip.namelist():
            fobj = getSubFobj(zip, sub)
            if fobj is not None:
                p.apply_async(handleSub, args=(fobj, nid))
        p.close()
        p.join()
    elif DEBUG_LEVEL == 1:
        p = billiard.Pool()
        _finalizers.append(Finalize(p, p.terminate))
        try:
            p.map_async(handleSub, [(getSubFobj(zip, sub), nid) for sub in zip.namelist()])
            p.close()
            p.join()
        finally:
            p.terminate()
    else:
        for sub in zip.namelist():
            fobj = getSubFobj(zip, sub)
            if fobj is not None:
                handleSub(fobj, nid)
    zip.close()
def generateVideoFilesForBugs(self, testingStep, bugObjects):
    pool = multiprocessing.Pool(self.config['testing_video_generation_processes'], maxtasksperchild=1)

    futures = []
    for bugIndex, bug in enumerate(bugObjects):
        future = pool.apply_async(func=createDebugVideoSubProcess,
                                  args=(self.config.serialize(), str(bug.executionSessionId),
                                        f"{bug.id}_bug", False, False, bug.stepNumber,
                                        bug.stepNumber + 3, "bugs"))
        futures.append((bugIndex, bug, future))

    for bugIndex, bug, future in futures:
        value = future.get(timeout=self.config['debug_video_generation_timeout'])
        if value:
            getLogger().error(value)
        # Retry logic (disabled): on billiard.exceptions.WorkerLostError or
        # BrokenPipeError, resubmit createDebugVideoSubProcess up to 5 times
        # before re-raising.

    pool.close()
    pool.join()
def deepspeech_run(video, ds_wav_path):
    msg = "\nTRANSCRIBING FILE : %s" % time.ctime()
    if DEBUG:
        start_timer = timer()

    lang = video.main_lang
    vad = VoiceActivityDetector(ds_wav_path)
    seg_list, sample_rate = vad.vad_segment_generator()

    if seg_list:
        msg += "\n- Start Transcript Process : %s" % time.ctime()
        p = billiard.Pool(processes=NB_WORKERS_POOL, initializer=initfunc,
                          initargs=(lang,), threads=True)
        res = p.map_async(deepspeech_aux, seg_list).get()
        p.close()
        p.join()
        msg += "\n- End Transcript Process : %s" % time.ctime()

        if DEBUG:
            end_timer = timer()
            print('Transcription duration : %f s' % (end_timer - start_timer))

        msg2, webvtt = createVTT(res, vad.sample_rate)
        msg += msg2
        msg += saveVTT(video, webvtt)

    return msg
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9,
             correlation_overrides=None, check_recoded=False,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram. The default is 10.
    check_correlation : boolean
        Whether or not to check correlation. It's `True` by default.
    correlation_threshold: float
        Threshold to determine if the variable pair is correlated. The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it's an expensive computation it can be activated for small datasets.
        `check_correlation` must be true to disable this check. It's `False` by default.
    pool_size : int
        Number of workers in the thread pool. The default is equal to the number of CPUs.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes
    -----
        * The section dedicated to checking the correlation should be externalized
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # Reset matplotlib style before use
        # Fails in matplotlib 1.4.x, so the plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})

    # Describe all variables in a univariate way
    if pool_size == 1:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        pool.close()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variables
    if check_correlation is True:
        '''
        TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9.
        If x~y and y~z but not x~z, it would be better to delete only y.
        A better way would be to find out which variable causes the highest
        increase in multicollinearity.
        '''
        corr = dfcorrPear.copy()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue
            for y, corr in corr_x.iteritems():
                if x == y:
                    break
                if corr > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems()
                                     if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue
                confusion_matrix = pd.crosstab(data1, data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}
    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    # Columns whose type is supported (i.e. not S_TYPE_UNSUPPORTED)
    supported_columns = variable_stats.transpose()[variable_stats.transpose().type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0]
                     if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None)
                 for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
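A minimal usage sketch for describe(); the toy DataFrame and column names are illustrative:

import pandas as pd

if __name__ == '__main__':
    df = pd.DataFrame({
        'age': [23, 45, 31, 45, 23, 38],
        'city': ['Lisbon', 'Porto', 'Lisbon', 'Porto', 'Faro', 'Faro'],
    })
    report = describe(df, bins=5, pool_size=2)
    print(report['table']['n'], report['table']['nvar'])  # 6 2
    print(report['correlations']['pearson'])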