def main():
    # match.objects.all().delete()
    s_docs_count = scopus_doc.objects.filter(DO__exists=True, shingle__exists=True).count()
    print(s_docs_count)
    model = Doc2Vec.load("/queries/title_model")
    s_docs_count = 10025  # NOTE: overrides the computed count above
    chunk_size = 10000
    for i in range(s_docs_count // chunk_size + 1):
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > s_docs_count:
            l = s_docs_count  # slice end is exclusive, so don't subtract 1
        s_docs = scopus_doc.objects.filter(DO__exists=True, shingle__exists=True)[f:l]
        matches = []
        sims = []
        pool = Pool(processes=16)
        # matches.append(pool.map(partial(find_match,), s_docs))
        sims.append(pool.map(partial(find_sim,), s_docs))
        pool.terminate()
        # matches = [x for x in matches[0] if x is not None]
        # matches = list(filter(None.__ne__, matches[0]))
        sims = list(filter(None.__ne__, sims[0]))
        # match.objects.insert(matches)
        similarity.objects.insert(sims)
def run():
    def init_pool():
        logging.info('Init pool')
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    pool = Pool(args.process, init_pool)
    try:
        task_inputs = gen_inputs()
        if args.process == 1:
            results = itertools.imap(task, task_inputs)
        else:
            results = pool.imap(task, task_inputs, 1)
            # results = imap_wrap(pool, task, task_inputs, 100)
        data_header = ['chrom', 'start', 'end', 'region_id', 'nsamples']
        param_header = [
            'status', 'ia', 'ib', 'ic', 'a', 'b', 'c',
            'top_cn', 'top_theta', 'theta', 'll', 'step'
        ]
        out_header = data_header + param_header
        print(*out_header, sep='\t')
        for data, params in results:
            row = [data[c] for c in data_header] \
                + [params[c] for c in param_header]
            print(*row, sep='\t')
        pool.close()
    except Exception as e:
        pool.terminate()
        raise e
    finally:
        pool.join()
def run_parallel_fep(self, mutant_params, system_idx, mutant_idx, n_steps, n_iterations, windows):
    logger.debug('Computing FEP for {}...'.format(self.name))
    if not self.opt:
        mutant_systems = mutant_params.build_fep_systems(system_idx, mutant_idx, windows)
    else:
        mutant_systems = mutant_params

    nstates = len(mutant_systems)
    chunk = math.ceil(nstates / self.num_gpu)
    groups = grouper(range(nstates), chunk)
    pool = Pool(processes=self.num_gpu)

    system = copy.deepcopy(self.wt_system)
    box_vectors = self.input_pdb.topology.getPeriodicBoxVectors()
    system.setDefaultPeriodicBoxVectors(*box_vectors)
    system.addForce(mm.MonteCarloBarostat(1 * unit.atmospheres,
                                          self.temperature * unit.kelvin, 25))
    ###
    fep = partial(run_fep, sim=self, system=system, pdb=self.extended_pdb,
                  n_steps=n_steps, n_iterations=n_iterations, all_mutants=mutant_systems)
    u_kln = pool.map(fep, groups)
    pool.close()
    pool.join()
    pool.terminate()
    ddg = FSim.gather_dg(self, u_kln, nstates)
    return ddg
def main():
    model = Doc2Vec.load("/usr/local/apsis/queries/title_model")
    do_shingle = True
    print(scopus_doc.objects.filter(TI__isnull=True).count())
    if do_shingle:
        unshingled_s_docs = scopus_doc.objects.filter(shingle__exists=False)
        print(unshingled_s_docs.count())
        for sd in unshingled_s_docs:
            if not hasattr(sd, 'TI'):
                print(sd)
                sd.delete()
            else:
                # try:
                sd.shingle = list(shingle(sd.TI, 2))
                sd.save()
                # except:
                #     pass

    scopus_docs_all = scopus_doc.objects.filter(shingle__exists=True,
                                                DO__exists=True,
                                                doc2vec_checked=False)
    s_docs_i = scopus_docs_all.count()
    # s_docs_i = 10
    chunk_size = 3
    # similarity.objects.all().delete()
    for i in range(s_docs_i // chunk_size + 1):
        print(i)
        # t0 = time.time()
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > s_docs_i:
            l = s_docs_i  # slice end is exclusive, so don't subtract 1
        s_docs = scopus_docs_all[f:l]
        print(s_docs)
        # initialise an empty list, and append sim items to it in parallel
        sims = []
        pool = Pool(processes=chunk_size)
        sims.append(pool.map(partial(compare, model=model), s_docs))
        pool.terminate()
        # try:
        #     sims = [item for sublist in sims for item in sublist]
        # except:
        #     pass
        # Flatten and remove Nones
        # print(sims)
        sims = flatten(sims)
        sims = list(filter(None.__ne__, sims))
        similarity.objects.insert(sims)
        s_docs.update(doc2vec_checked=True)
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores)         # create pool
    out = mainpool.map(fun, args)  # return list
    mainpool.terminate()
    del mainpool                   # delete pool
    return out
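# A minimal usage sketch for pcall_mp (not part of the original source). It assumes
# the worker function and its inputs can be shipped to the pool's backend; `square`
# and the input range below are hypothetical illustration values.
def square(x):
    return x * x

if __name__ == '__main__':
    results = pcall_mp(square, range(8), cores=4)
    print(results)  # -> [0, 1, 4, 9, 16, 25, 36, 49]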
def handle(self, *args, **options):
    qid = options['qid']
    q = Query.objects.get(pk=qid)
    docs = Doc.objects.filter(query=q,
                              wosarticle__cr__isnull=False,
                              cdo__citation__isnull=True)
    ndocs = docs.count()
    print(ndocs)
    # Chunk size, so as to prevent overuse of memory
    chunk_size = 1000
    for i in range(ndocs // chunk_size + 1):
        cdos = []
        f = i * chunk_size
        print(f)
        l = (i + 1) * chunk_size
        if l > ndocs:
            l = ndocs  # slice end is exclusive, so don't subtract 1
        chunk_docs = docs[f:l]
        pool = Pool(processes=4)
        cdos.append(pool.map(doc_cites, chunk_docs))
        pool.terminate()
        gc.collect()
        django.db.connections.close_all()
        cdos = flatten(cdos)
        CDO.objects.bulk_create(cdos)
def msolve(A, Y, init=None):
    if use_cuda:
        Z = np.asarray(np.hstack(list(
            map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=True),
                np.split(Y, n_jobs, axis=1)))))
    else:
        if n_jobs <= 1:
            Z = np.asarray(np.hstack(
                [scipy.sparse.linalg.cg(A, Y[:, i],
                                        x0=init[:, i] if init is not None else None,
                                        tol=tol, atol=atol)[0][:, np.newaxis]
                 for i in range(Y.shape[1])]))
        else:
            p = Pool(n_jobs)
            try:
                if init is None:
                    Z = np.asarray(np.hstack(list(
                        p.map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=False),
                              np.split(Y, Y.shape[1], axis=1)))))
                else:
                    # Pool.map passes one (y, init_) tuple per call, so unpack it
                    # inside the lambda rather than declaring two parameters.
                    Z = np.asarray(np.hstack(list(
                        p.map(lambda y_init: cg(A, y_init[0], x0=y_init[1],
                                                tol=tol, atol=atol, use_cuda=False),
                              zip(np.split(Y, Y.shape[1], axis=1),
                                  np.split(init, init.shape[1], axis=1))))))
                p.close()
                p.join()
            except KeyboardInterrupt:
                print("Caught KeyboardInterrupt, terminating workers")
                p.terminate()
                p.join()
                raise
    return Z
def convertpool(self):
    if len(self.todo) > 0:
        if self.type in [".h264", ".mp4", ".avi"]:
            pool = Pool(min(self.pools, len(self.todo)))
            try:
                pool.map(self.conv_single, self.todo)
                pool.close()
                lineprint("Done converting all videofiles!", label="pirecorder")
            except KeyboardInterrupt:
                lineprint("User terminated converting pool..", label="pirecorder")
                self.terminated = True
                pool.terminate()
                return
            except Exception as e:
                excep = "Got exception: %r, terminating pool" % (e, )
                lineprint(excep, label="pirecorder")
                pool.terminate()
            finally:
                pool.join()

            if self.delete:
                for filein in self.todo:
                    os.remove(filein)
                lineprint("Deleted all original videofiles..", label="pirecorder")

        elif self.type in [".jpg", ".jpeg", ".png"]:
            vidname = commonpref(self.todo)
            lineprint("Start converting " + str(len(self.todo)) + " images",
                      label="pirecorder")
            frame_array = []
            for filename in self.todo:
                frame = cv2.imread(filename)
                frame_array.append(frame)
                # os.rename(filename, self.outdir+"/"+filename)
            h, w, _ = frame_array[0].shape
            if self.outdir != "":
                vidname = self.outdir + "/" + os.path.basename(vidname)
            vidout = videowriter(vidname, w, h, self.imgfps, self.resizeval)
            for i in range(len(frame_array)):
                vidout.write(frame_array[i])
            vidout.release()
            lineprint("Finished converting " + os.path.basename(vidname),
                      label="pirecorder")
    else:
        lineprint("No video or image files found..", label="pirecorder")
def get_gameplays():
    PlayTypeDict = {}
    PlayTypeStrings = {
        'Pass': ['pass incomplete', 'pass complete', 'sacked'],
        'Admin': ['spiked the ball', 'Timeout', 'Penalty', 'aborted'],
        'Kneel': ['knee', 'knelt'],
        'Punt': ['Punts'],
        'Field Goal': ['field goal', 'no good'],
        'Special Teams': ['kicks off', 'kicks onside', 'extra point', 'two point'],
        'Run': [
            'left end', 'right end', ' for ', 'up the middle', 'middle for',
            'left tackle', 'left guard', 'right guard', 'right tackle'
        ],
    }
    YearStart = 1998
    YearsToGo = 20
    for Year in range(YearStart, YearStart + YearsToGo):
        PlayTypeCounts = {
            'Pass': 0, 'Run': 0, 'Punt': 0, 'Field Goal': 0,
            'Admin': 0, 'Kneel': 0, 'Special Teams': 0
        }
        for GameNumber in range(1, 17):
            print('Game', GameNumber, 'in', Year, 'Time: ', datetime.now())
            PlayTypeDict = {}
            PathList = []
            for Team in TeamLookup:
                for GameLocation in ['H', 'A']:
                    path = 'https://widgets.sports-reference.com/wg.fcgi?css=1&site=pfr&url=%2Fplay-index%2Fplay_finder.cgi%3Frequest%3D1%26match%3Dall%26year_min%3D{YEAR}%26year_max%3D{YEAR}%26game_type%3DR%26game_num_min%3D{GameNumber}%26game_num_max%3D{GameNumber}%26week_num_min%3D0%26week_num_max%3D99%26game_location%3D{GameLocation}%26minutes_max%3D15%26seconds_max%3D0%26minutes_min%3D0%26seconds_min%3D0%26team_id%3D{TEAM}%26field_pos_min_field%3Dteam%26field_pos_max_field%3Dteam%26end_field_pos_min_field%3Dteam%26end_field_pos_max_field%3Dteam%26type%255B%255D%3DPASS%26type%255B%255D%3DRUSH%26type%255B%255D%3DPUNT%26type%255B%255D%3DKOFF%26type%255B%255D%3DONSD%26type%255B%255D%3DFG%26type%255B%255D%3DXP%26type%255B%255D%3D2PC%26no_play%3DN%26turnover_type%255B%255D%3Dinterception%26turnover_type%255B%255D%3Dfumble%26score_type%255B%255D%3Dtouchdown%26score_type%255B%255D%3Dfield_goal%26score_type%255B%255D%3Dsafety%26order_by%3Dyds_to_go&div=div_all_plays&del_col=1,11,12,13,14'.format(
                        YEAR=Year, GameNumber=GameNumber, TEAM=Team, GameLocation=GameLocation)
                    PathList.append(path)
                    # req = get(path)

            p = Pool(8)  # Pool tells how many at a time
            records = p.map(GetAndParsePath, PathList)
            p.terminate()
            p.join()

            with open('output/PlayTypeCounts-Year-' + str(Year) + '-Game-' +
                      str(GameNumber) + '.json', 'w') as outfile:
                json.dump(PlayTypeDict, outfile)
def fmultiprocess(log, function, inputArray, poolSize=False, timeout=3600, **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- limit the number of CPU that are used in multiprocess job
        - ``timeout`` -- time in sec after which to raise a timeout error if the processes have not completed

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName, poolSize=10, timeout=300,
                                   inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.debug('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    if not poolSize:
        poolSize = psutil.cpu_count()

    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    if "log" in inspect.getargspec(function)[0]:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray)
    else:
        mapfunc = partial(function, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray)

    resultArray = resultArray.get(timeout=timeout)

    p.close()
    p.terminate()

    log.debug('completed the ``multiprocess`` function')
    return resultArray
def fmultiprocess(log, function, inputArray, poolSize=False, **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName,
                                   inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.info('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    # if cpu_count() > 1:
    #     poolSize = cpu_count() - 1
    # else:
    #     poolSize = 1
    # if len(inputArray) < poolSize:
    #     poolSize = len(inputArray)
    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    try:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map(mapfunc, inputArray)
    except:
        try:
            mapfunc = partial(function, **kwargs)
            resultArray = p.map(mapfunc, inputArray)
        except:
            mapfunc = partial(function, log=log, **kwargs)
            resultArray = p.map(mapfunc, inputArray)

    p.close()
    p.terminate()
    p.join()

    log.info('completed the ``multiprocess`` function')
    return resultArray
def inner(*args):
    pool = Pool(processes=1)
    res = pool.apply_async(f, args)
    try:
        v = res.get(timeout=sec)
    except Exception as inst:
        print(inst)
        v = None
    finally:
        pool.terminate()
    return v
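# Hedged sketch (not from the original source) of the enclosing context this
# `inner` function is usually generated by: a timeout decorator that runs the
# wrapped function in a one-worker pool and gives up after `sec` seconds. The
# decorator name `timeout` and the example function are illustrative assumptions.
from multiprocess import Pool

def timeout(sec):
    def decorator(f):
        def inner(*args):
            pool = Pool(processes=1)
            res = pool.apply_async(f, args)
            try:
                v = res.get(timeout=sec)  # raises TimeoutError after `sec` seconds
            except Exception as inst:
                print(inst)
                v = None
            finally:
                pool.terminate()
            return v
        return inner
    return decorator

@timeout(2)
def slow_add(a, b):
    return a + b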
def loop_fourier(snaps, modes, settings, Rd=1.47492, folder='./', offsets=None,
                 max_amp=False, parttype='stars'):
    """ Measure the Fourier modes for a range of snapshots """
    pool = Pool()
    snapbase = 'snap_'
    settings_array = []
    for i, s in enumerate(snaps):
        settings_array.append(settings.copy())
        settings_array[i]['filename'] = s
    nsnaps = len(snaps)
    if offsets is None:
        use_offset = False
    else:
        use_offset = True
        argd = utils.check_args(snaps, offsets=offsets)
        offsets = argd['offsets']
        for i, offset in enumerate(offsets):
            settings_array[i]['offset'] = offset
    try:
        amps = pool.map(fourier_mode_helper, settings_array,
                        [Rd] * nsnaps, [modes] * nsnaps,
                        [max_amp] * nsnaps, [use_offset] * nsnaps)
    except KeyboardInterrupt:
        print('got ^C while pool mapping, terminating the pool')
        pool.terminate()
        print('pool is terminated')
    except Exception as e:
        print('got exception: %r, terminating the pool' % (e, ))
        pool.terminate()
        print('pool is terminated')
    return amps
def apply_function(self, function, *args):
    """
    Map a user supplied function over the snapshots.

    Uses pathos.multiprocessing (https://github.com/uqfoundation/pathos.git).
    """
    pool = Pool()
    try:
        val = pool.map(function, self.snaps)
        return val
    except KeyboardInterrupt:
        print('got ^C while pool mapping, terminating the pool')
        pool.terminate()
        print('pool is terminated')
    except Exception as e:
        print('got exception: %r, terminating the pool' % (e, ))
        pool.terminate()
        print('pool is terminated')
def get_mutant_energy(self, parameters, dcd, top, num_frames):
    chunk = math.ceil(len(parameters) / self.num_gpu)
    groups = grouper(range(len(parameters)), chunk)
    pool = Pool(processes=self.num_gpu)
    mutant_eng = partial(mutant_energy, sim=self, dcd=dcd, top=top,
                         num_frames=num_frames, all_mutants=parameters)
    mutants_systems_energies = pool.map(mutant_eng, groups)
    pool.close()
    pool.join()
    pool.terminate()
    mutants_systems_energies = [x for y in mutants_systems_energies for x in y]
    return mutants_systems_energies
def eval_EFG(self, x, num_procs=None, info=False):
    from multiprocess import Pool, cpu_count
    if not num_procs:
        num_procs = cpu_count()
    num_samples = self.parameters['num_samples']
    pool = Pool(num_procs)
    num = int(np.ceil(float(num_samples) / float(num_procs)))
    results = list(zip(*pool.map(lambda i: self.eval_EFG_sequential(x, num, i, info),
                                 range(num_procs), chunksize=1)))
    pool.terminate()
    pool.join()
    if not info:
        assert (len(results) == 4)
    else:
        assert (len(results) == 5)
    assert (all([len(vals) == num_procs for vals in results]))
    return [sum(vals) / float(num_procs) for vals in results]
class ProcessPoolExecutor(Executor):
    """Process Pool Executor"""

    def __init__(self):
        super(ProcessPoolExecutor, self).__init__()
        import os
        from multiprocess import Pool
        self.pool = Pool(os.cpu_count() or 1)

    def submit(self, func, *args, **kwargs):
        from concurrent.futures import Future
        fut = Future()
        self.tasks[fut] = self.pool.apply_async(
            func, args, kwargs, fut.set_result, fut.set_exception
        )
        fut.add_done_callback(self.tasks.pop)
        return fut

    def shutdown(self, wait=True):
        super(ProcessPoolExecutor, self).shutdown(wait)
        self.pool.terminate()
        self.pool.join()
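# A minimal usage sketch (not from the original source). It assumes the base
# `Executor` class provides the `self.tasks` dict used above; the worker
# function `cube` is a hypothetical example.
def cube(x):
    return x ** 3

if __name__ == '__main__':
    executor = ProcessPoolExecutor()
    futures = [executor.submit(cube, i) for i in range(4)]
    print([f.result() for f in futures])  # -> [0, 1, 8, 27]
    executor.shutdown()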
def _fi_multi_process(fi_func, arg_list, progressbar=True, n_jobs=None):
    executor_instance = Pool(n_jobs)
    mapper = executor_instance.imap if progressbar else executor_instance.map
    if progressbar:
        warn("Progress bars slow down runs by 10-20%. For slightly \n"
             "faster runs, do progress_bar=False")
        n_iter = len(arg_list)
        p = ProgressBar(n_iter, units='features')
    try:
        importance_dicts = []
        for importance in mapper(fi_func, arg_list):
            importance_dicts.append(importance)
            if progressbar:
                p.animate()
    except:
        warn("Multiprocessing failed, going single process")
        importance_dicts = FeatureImportance._fi_single_process(
            fi_func, arg_list, progressbar=progressbar)
    finally:
        executor_instance.close()
        executor_instance.join()
        executor_instance.terminate()
    return importance_dicts
def run_parallel_dynamics(self, output_folder, name, n_steps, equi, mutant_parameters):
    system = copy.deepcopy(self.wt_system)
    n_steps = math.ceil(n_steps / self.num_gpu)
    n_steps = n_steps * 2500

    if mutant_parameters is not None:
        non_bonded_force = system.getForce(self.nonbonded_index)
        self.apply_nonbonded_parameters(non_bonded_force, mutant_parameters[0],
                                        mutant_parameters[1], mutant_parameters[2],
                                        mutant_parameters[3])

    box_vectors = self.input_pdb.topology.getPeriodicBoxVectors()
    system.setDefaultPeriodicBoxVectors(*box_vectors)
    system.addForce(mm.MonteCarloBarostat(1 * unit.atmospheres,
                                          self.temperature * unit.kelvin, 25))
    ###
    dcd_names = [output_folder + name + '_gpu' + str(i) + '.dcd'
                 for i in range(self.num_gpu)]
    groups = grouper(dcd_names, 1)
    pool = Pool(processes=self.num_gpu)
    run = partial(run_dynamics, system=system, sim=self, equi=equi, n_steps=n_steps)
    pool.map(run, groups)
    pool.close()
    pool.join()
    pool.terminate()
    return dcd_names
def get_missing_ids(self):
    ## First argument: Make function that inputs a enterez id and spits out a hugo id
    ## Second argument: Make a list of all of the missing
    missing = []
    values = self.df.values
    for i in range(len(values)):
        # if type(values[i][0]) is type(3.6) or values[i][0] == "nan":
        enterez = values[i][1]
        missing.append({'index': i, 'enterez': enterez})

    p = Pool(self.num_workers)
    hugo_ids = p.map(get_id, missing)
    p.terminate()
    p.join()

    ## Change Hugo Ids to new ids
    number_found = 0
    for i in range(len(hugo_ids)):
        self.df.iloc[missing[i]['index'], 0] = hugo_ids[i]
        if hugo_ids[i] != "nan":
            number_found += 1
    print("Found ", number_found, "/", len(missing), "of the missing values")
    return self.df
def eval_EQ(self, p, num_procs=None, quiet=True):
    """
    Evaluates E[Q(p,r)] and its gradient in parallel.

    Parameters
    ----------
    p : generator powers
    num_procs : number of parallel processes
    quiet : flag
    """
    from multiprocess import Pool, cpu_count
    if not num_procs:
        num_procs = cpu_count()
    num_samples = self.parameters['num_samples']
    pool = Pool(num_procs)
    num = int(np.ceil(float(num_samples) / float(num_procs)))
    results = list(zip(*pool.map(lambda i: self.eval_EQ_sequential(p, num, i, quiet),
                                 range(num_procs), chunksize=1)))
    pool.terminate()
    pool.join()
    assert (len(results) == 2)
    assert (all([len(vals) == num_procs for vals in results]))
    return [sum(vals) / float(num_procs) for vals in results]
def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                       grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                       sampling_strategy='random-choice', n_samples=1000, bin_count=50,
                       return_metadata=False, progressbar=True, variance_type='estimate'):
    """
    Approximates the partial dependence of the predict_fn with respect to the
    variables passed.

    Parameters:
    -----------
    feature_ids: list
        the names/ids of the features for which partial dependence is to be computed.
        Note that the algorithm's complexity scales exponentially with additional
        features, so generally one should only look at one or two features at a time.
        These feature ids must be available in the class's associated DataSet.
        As of now, we only support looking at 1 or 2 features at a time.
    modelinstance: skater.model.model.Model subtype
        an estimator function of a fitted model used to derive prediction.
        Supports classification(binary, multi-class) and regression.
        predictions = predict_fn(data)
        Can either be a skater.model.remote.DeployedModel or a skater.model.local.InMemoryModel
    filter_classes: array type
        The classes to run partial dependence on. Default None invokes all classes.
        Only used in classification models.
    grid: numpy.ndarray
        2 dimensional array on which we fix values of features. Note this is
        determined automatically if not given based on the percentiles of the dataset.
    grid_resolution: int
        how many unique values to include in the grid. If the percentile range is
        5% to 95%, then that range will be cut into <grid_resolution> equally size
        bins. Defaults to 30.
    n_jobs: int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to using all cores(-1).
    grid_range: tuple
        the percentile extrama to consider. 2 element tuple, increasing, bounded
        between 0 and 1.
    sample: boolean
        Whether to sample from the original dataset.
    sampling_strategy: string
        If sampling, which approach to take. See DataSet.generate_sample for details.
    n_samples: int
        The number of samples to use from the original dataset. Note this is only
        active if sample = True and sampling strategy = 'uniform'. If using
        'uniform-over-similarity-ranks', use samples per bin
    bin_count: int
        The number of bins to use when using the similarity based sampler. Note this
        is only active if sample = True and sampling_strategy = 'uniform-over-similarity-ranks'.
        total samples = bin_count * samples per bin.
    samples_per_bin: int
        The number of samples to collect for each bin within the sampler. Note this
        is only active if sample = True and sampling_strategy = 'uniform-over-similarity-ranks'.
        If using sampling_strategy = 'uniform', use n_samples.
        total samples = bin_count * samples per bin.
    variance_type: string
    return_metadata: boolean

    :Example:
    >>> from skater.model import InMemoryModel
    >>> from skater.core.explanations import Interpretation
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_boston
    >>> boston = load_boston()
    >>> X = boston.data
    >>> y = boston.target
    >>> features = boston.feature_names
    >>> rf = RandomForestClassifier()
    >>> rf.fit(X,y)
    >>> model = InMemoryModel(rf, examples=X)
    >>> interpreter = Interpretation()
    >>> interpreter.load_data(X)
    >>> feature_ids = ['ZN','CRIM']
    >>> interpreter.partial_dependence.partial_dependence(features,model)
    """
    if self.data_set is None:
        load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                       "Please call Interpretation.load_data \n" \
                                       "before running this method."
        raise (exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

    feature_ids = self._check_features(feature_ids)

    if filter_classes:
        err_msg = "members of filter classes must be \n" \
                  "members of modelinstance.classes. \n" \
                  "Expected members of: \n" \
                  "{0}\n" \
                  "got: \n" \
                  "{1}".format(modelinstance.target_names, filter_classes)
        filter_classes = list(filter_classes)
        assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

    # TODO: There might be a better place to do this check
    if not isinstance(modelinstance, ModelType):
        raise (exceptions.ModelError(
            "Incorrect estimator function used for computing partial dependence, try one \n"
            "creating one with skater.model.local.InMemoryModel or \n"
            "skater.model.remote.DeployedModel"))

    if modelinstance.model_type == 'classifier' and modelinstance.probability is False:
        if modelinstance.unique_values is None:
            raise (exceptions.ModelError(
                'If using classifier without probability scores, unique_values cannot \n'
                'be None'))
        self.interpreter.logger.warn(
            "Classifiers with probability scores can be explained \n"
            "more granularly than those without scores. If a prediction method with \n"
            "scores is available, use that instead.")

    # TODO: This we can change easily to functional style
    missing_feature_ids = []
    for feature_id in feature_ids:
        if feature_id not in self.data_set.feature_ids:
            missing_feature_ids.append(feature_id)

    if missing_feature_ids:
        missing_feature_id_err_msg = "Features {0} not found in \n" \
                                     "Interpretation.data_set.feature_ids \n" \
                                     "{1}".format(missing_feature_ids, self.data_set.feature_ids)
        raise (KeyError(missing_feature_id_err_msg))

    if grid_range is None:
        grid_range = (.05, 0.95)
    else:
        if not hasattr(grid_range, "__iter__"):
            err_msg = "Grid range {} needs to be an iterable".format(grid_range)
            raise (exceptions.MalformedGridRangeError(err_msg))

    self._check_grid_range(grid_range)

    if not modelinstance.has_metadata:
        examples = self.data_set.generate_sample(strategy='random-choice',
                                                 sample=True,
                                                 n_samples=10)
        examples = DataManager(examples, feature_names=self.data_set.feature_ids)
        modelinstance._build_model_metadata(examples)

    # if you dont pass a grid, build one.
    grid = np.array(grid)
    if not grid.any():
        # Currently, if a given feature has fewer unique values than the value
        # of grid resolution, then the grid will be set to those unique values.
        # Otherwise it will take the percentile
        # range according with grid_resolution bins.
        grid = self.data_set.generate_grid(feature_ids,
                                           grid_resolution=grid_resolution,
                                           grid_range=grid_range)
    else:
        # want to ensure all grids have 2 axes
        if len(grid.shape) == 1 and \
                (StaticTypes.data_types.is_string(grid[0]) or
                 StaticTypes.data_types.is_numeric(grid[0])):
            grid = grid[:, np.newaxis].T
            grid_resolution = grid.shape[1]

    self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
    self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

    # make sure data_set module is giving us correct data structure
    self._check_grid(grid, feature_ids)

    # generate data
    data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                sample=sample,
                                                n_samples=n_samples,
                                                bin_count=bin_count)

    assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                         "Theres a type mismatch between\n" \
                                                         "the sampled data and the original\n" \
                                                         "training set. Check Skater.models\n"

    _pdp_metadata = self._build_metadata_dict(modelinstance,
                                              feature_ids,
                                              self.data_set.feature_ids,
                                              filter_classes,
                                              variance_type)

    self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
    self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
    self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

    # cartesian product of grid
    grid_expanded = pd.DataFrame(list(product(*grid))).values

    if grid_expanded.shape[0] <= 0:
        empty_grid_expanded_err_msg = "Must have at least 1 pdp value" \
                                      "grid shape: {}".format(grid_expanded.shape)
        raise (exceptions.MalformedGridError(empty_grid_expanded_err_msg))

    predict_fn = modelinstance._get_static_predictor()

    n_jobs = None if n_jobs < 0 else n_jobs
    pd_func = functools.partial(_compute_pd,
                                estimator_fn=predict_fn,
                                grid_expanded=grid_expanded,
                                pd_metadata=_pdp_metadata,
                                input_data=data_sample,
                                filter_classes=filter_classes)
    arg_list = [i for i in range(grid_expanded.shape[0])]
    executor_instance = Pool(n_jobs)

    if progressbar:
        self.interpreter.logger.warn("Progress bars slow down runs by 10-20%. For slightly \n"
                                     "faster runs, do progress_bar=False")
        mapper = executor_instance.imap
        p = ProgressBar(len(arg_list), units='grid cells')
    else:
        mapper = executor_instance.map

    pd_list = []
    try:
        if n_jobs == 1:
            raise ValueError("Skipping to single processing")
        for pd_row in mapper(pd_func, arg_list):
            if progressbar:
                p.animate()
            pd_list.append(pd_row)
    except:
        self.interpreter.logger.warn("Multiprocessing failed, going single process")
        for pd_row in map(pd_func, arg_list):
            if progressbar:
                p.animate()
            pd_list.append(pd_row)
    finally:
        executor_instance.close()
        executor_instance.join()
        executor_instance.terminate()

    if return_metadata:
        return pd.DataFrame(list(pd_list)), _pdp_metadata
    else:
        return pd.DataFrame(list(pd_list))
"--image_dir", required=True, help="path to the input dir") ap.add_argument("-p", "--plot", type=bool, default=False, required=False, help="plot results") ap.add_argument("-ps", "--pool_size", type=int, default=1, required=False, help="pool size for multiprocessing") args = vars(ap.parse_args()) images = glob.glob(args["image_dir"] + '*') plot = bool(args["plot"]) pool_size = int(args["pool_size"]) print(args["image_dir"], plot) pool = Pool(pool_size) pool_outputs = pool.map(partial(get_boxes, config=config, plot=plot), images[:]) pool.close() pool.join() pool.terminate() for output in pool_outputs: cv2.imwrite(output[2].replace('\\in', '\\out'), output[3])
def main():
    qid = sys.argv[1]
    very_par = True
    ## Get query
    global q
    q = Query.objects.get(pk=qid)
    i = 0
    n_records = 0
    records = []
    if very_par == True:
        record = []
    else:
        record = {}
    max_chunk_size = 2000
    chunk_size = 0
    print(q.title)
    title = str(q.id)
    with open(settings.QUERY_DIR + title + "/s_results.txt", encoding="utf-8") as res:
        for line in res:
            if '\ufeff' in line:  # BOM on first line
                continue
            if 'ER -' in line:  # end of record - save it and start a new one
                n_records += 1
                records.append(record)
                if very_par:
                    record = []
                else:
                    record = {}
                chunk_size += 1
                if chunk_size == max_chunk_size:
                    # print("doing chunk starting from record:")
                    # print(records[0])
                    # parallely add docs
                    pool = Pool(processes=32)
                    if very_par:
                        pool.map(add_doc_text, records)
                    else:
                        pool.map(add_doc, records)
                    pool.terminate()
                    records = []
                    chunk_size = 0
                continue
            if re.match("^EF", line):  # end of file
                # done!
                continue
            record.append(line)

    print(chunk_size)
    if chunk_size < max_chunk_size:
        # parallely add docs
        pool = Pool(processes=32)
        if very_par:
            pool.map(add_doc_text, records)
        else:
            pool.map(add_doc, records)
        pool.terminate()

    django.db.connections.close_all()
    q.r_count = len(Doc.objects.filter(query=q))
    q.save()
def fmultiprocess(log, function, inputArray, poolSize=False, timeout=3600, **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- limit the number of CPU that are used in multiprocess job
        - ``timeout`` -- time in sec after which to raise a timeout error if the processes have not completed

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName, poolSize=10, timeout=300,
                                   inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.debug('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    if not poolSize:
        poolSize = psutil.cpu_count()

    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    cpuCount = psutil.cpu_count()
    chunksize = int((len(inputArray) + 1) / (cpuCount * 3))
    if chunksize == 0:
        chunksize = 1

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    if "log" in inspect.getargspec(function)[0]:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)
    else:
        mapfunc = partial(function, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)

    resultArray = resultArray.get(timeout=timeout)

    p.close()
    p.terminate()

    log.debug('completed the ``multiprocess`` function')
    return resultArray
def read_wos(res, q, update, deduplicate=False):
    from django.db import connection
    connection.close()
    if deduplicate:
        print("nonstandard WoS, searching for duplicates")
    i = 0
    n_records = 0
    records = []
    record = {}
    mfields = ['AU', 'AF', 'CR', 'C1', 'WC']
    max_chunk_size = 2000
    chunk_size = 0
    q.doc_set.clear()
    p = 4
    for line in res:
        if '\ufeff' in line:  # BOM on first line
            continue
        if line == 'ER\n':  # end of record - save it and start a new one
            n_records += 1
            records.append(record)
            record = {}
            chunk_size += 1
            if chunk_size == max_chunk_size:
                # parallely add docs
                if deduplicate:
                    print("adding as if scopus")
                    pool = Pool(processes=1)  # was commented out in the source, but pool is used below
                    pool.map(partial(add_scopus_doc, q=q, update=update), records)
                else:
                    pool = Pool(processes=p)
                    pool.map(partial(add_doc, q=q, update=update), records)
                pool.terminate()
                records = []
                chunk_size = 0
            continue
        if re.match("^EF", line):  # end of file
            if chunk_size < max_chunk_size:
                # parallely add docs
                # pool.map(update_doc, records)
                if deduplicate:
                    print("adding as if scopus")
                    from django.db import connection
                    connection.close()
                    pool = Pool(processes=1)
                    pool.map(partial(add_scopus_doc, q=q, update=update), records)
                else:
                    pool = Pool(processes=p)
                    pool.map(partial(add_doc, q=q, update=update), records)
                pool.terminate()
            # done!
            break
        if re.match("(^[A-Z][A-Z1-9]) (.*)", line):
            s = re.search("(^[A-Z][A-Z1-9]) (.*)", line)
            key = s.group(1).strip()
            value = s.group(2).strip()
            if key in mfields:
                record[key] = [value]
            else:
                record[key] = value
        elif len(line) > 1:
            if key in mfields:
                record[key].append(line.strip())
            else:
                try:
                    record[key] += " " + line.strip()
                except:
                    print(line)
                    print(record)
    return n_records
def main():
    qid = sys.argv[1]
    ## Get query
    global q
    q = Query.objects.get(pk=qid)
    # Doc.objects.filter(query=qid).delete()  # doesn't seem like a good idea
    i = 0
    n_records = 0
    records = []
    record = {}
    mfields = ['AU', 'AF', 'CR', 'C1', 'WC']
    max_chunk_size = 2000
    chunk_size = 0
    print(q.title)
    title = str(q.id)
    with open(settings.QUERY_DIR + title + "/results.txt", encoding="utf-8") as res:
        for line in res:
            if '\ufeff' in line:  # BOM on first line
                continue
            if line == 'ER\n':  # end of record - save it and start a new one
                n_records += 1
                records.append(record)
                record = {}
                chunk_size += 1
                if chunk_size == max_chunk_size:
                    # parallely add docs
                    pool = Pool(processes=50)
                    pool.map(update_doc, records)
                    # pool.map(partial(add_doc, q=q), records)
                    pool.terminate()
                    records = []
                    chunk_size = 0
                continue
            if re.match("^EF", line):  # end of file
                if chunk_size < max_chunk_size:
                    # parallely add docs
                    pool = Pool(processes=50)
                    pool.map(update_doc, records)
                    # pool.map(partial(add_doc, q=q), records)
                    pool.terminate()
                # done!
                break
            if re.match("(^[A-Z][A-Z1-9])", line):
                s = re.search("(^[A-Z][A-Z1-9]) (.*)", line)
                key = s.group(1).strip()
                value = s.group(2).strip()
                if key in mfields:
                    record[key] = [value]
                else:
                    record[key] = value
            elif len(line) > 1:
                if key in mfields:
                    record[key].append(line.strip())
                else:
                    record[key] += " " + line.strip()

    django.db.connections.close_all()
    q.r_count = n_records
    q.save()
def recursive_function(self, remotepath, *args, **kwargs):
    """The recursive wrapper around the original function."""
    recursive = kwargs.pop('recursive', False)
    recursive_se = kwargs.pop('recursivese', None)
    if recursive_se is not None and recursive == False:
        recursive = True
    list_file = kwargs.pop('list', None)
    parallel = kwargs.pop('parallel', 1)
    if 'verbose' in kwargs:
        verbose = kwargs['verbose']
    else:
        verbose = False
    if isinstance(recursive, str):
        regex = re.compile(recursive)
        recursive = True
    else:
        regex = None
    if list_file is not None:
        list_file = open(list_file, 'wt')
    good = 0
    bad = 0

    def is_good(path):
        if verbose:
            print_(self.iterating + " " + path)
        try:
            ret = self.function(path, *args, **kwargs)
        except Exception as e:
            print_(self.iterating + " " + path + " failed.")
            print_(e)
            return False, path
        else:
            if ret == 0:
                return True, path
            else:
                return False, path

    if recursive is True:
        if parallel > 1:
            # Deal with signal weirdness when using a Pool
            # Otherwise we won't be able to kill things with CTRL-C
            def abort(*args, **kwargs):
                print_("%d Aborting!" % (os.getpid(), ))
                raise Exception("%d Aborting!" % (os.getpid(), ))

            orig_sigint = signal.getsignal(signal.SIGINT)
            orig_sigterm = signal.getsignal(signal.SIGTERM)
            signal.signal(signal.SIGINT, abort)
            signal.signal(signal.SIGTERM, abort)
            p = Pool(parallel)
            mapper = p.imap
        else:
            mapper = map
        try:
            for g, path in mapper(is_good,
                                  utils.remote_iter_recursively(remotepath, regex,
                                                                se=recursive_se,
                                                                ignore_exceptions=True)):
                if g:
                    good += 1
                else:
                    bad += 1
                    if list_file is not None:
                        list_file.write(path + '\n')
        except Exception:
            if parallel > 1:
                # Kill all child processes
                for proc in p._pool:
                    os.kill(proc.pid, signal.SIGTERM)
            raise
        finally:
            if parallel > 1:
                # Clean up processing pool
                p.terminate()
                p.join()
                del p
                # Resetting signal handlers
                signal.signal(signal.SIGINT, orig_sigint)
                signal.signal(signal.SIGTERM, orig_sigterm)
        if verbose:
            print_("%s %d files. %d files failed." % (self.iterated, good, bad))
        if list_file is not None:
            list_file.close()
        if bad == 0:
            return 0
        else:
            return 1
    else:
        g, path = is_good(remotepath)
        if list_file is not None:
            if not g:
                list_file.write(remotepath + '\n')
            list_file.close()
        if g:
            return 0
        else:
            return 1
def main():
    print("starting topic model - about to read WoS results")
    # df = scrapeWoS.readWoS('../query/results.txt')
    df = scrapeWoS.readWoS('results.txt')
    # filter articles with missing abstracts, and only include articles and reviews
    df = df[(pd.notnull(df.AB)) & (pd.notnull(df.UT)) &
            (df.DT.isin(["Article", "Review"]))].reset_index()
    # Randomly reorder (or take a sample) the docs
    df = df.sample(frac=0.1).reset_index(drop=True)
    print("found " + str(len(df)) + " articles")
    sys.stdout.flush()
    df.IN = df.index.tolist()

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = len(df)
    documentstoanalyze = int(D)
    iterations = D // batchsize
    # The number of topics
    try:
        K = int(sys.argv[1])
        print(K)
    except:
        K = 100

    docset = list(df.AB)

    # ####### making a corpus
    stoplist = set(nltk.corpus.stopwords.words("english"))
    stoplist.add('Elsevier')
    mycorpus = MyCorpus(docset, stoplist)
    dictionary = mycorpus.dictionary
    # .filter_extremes(no_below=5, no_above=0.9)
    dictionary.filter_extremes(no_below=5, no_above=0.9)

    x = list(dictionary.values())
    y = list(dictionary)
    vocab = sorted(zip(y, x))
    W = len(vocab)
    print("found " + str(W) + " terms")
    sys.stdout.flush()

    run_id = db.init()

    # add terms to db
    db.add_terms(vocab)

    # add empty topics to db
    db.add_topics(K)
    # for k in range(K):
    #     db.add_topic(k)

    # add all docs
    def f_doc(d):
        django.db.connections.close_all()
        db.add_doc(d[0], d[1], d[2], d[3], d[4], d[5])
        django.db.connections.close_all()

    def f_gamma(d):
        django.db.connections.close_all()
        doc_size = len(docset[d])
        doc_id = iteration * 100 + d + doc_diff
        for k in range(len(gamma[d])):
            db.add_doc_topic(doc_id, k, gamma[d][k], gamma[d][k] / doc_size)
        django.db.connections.close_all()

    def f_lambda(topic_no):
        django.db.connections.close_all()
        lambda_sum = sum(ldalambda[topic_no])
        db.clear_topic_terms(topic_no)
        for term_no in range(len(ldalambda[topic_no])):
            db.add_topic_term(topic_no, term_no,
                              ldalambda[topic_no][term_no] / lambda_sum)
        django.db.connections.close_all()

    # Take the information we need from the doc list then delete to free up memory
    docs = zip(df.TI, df.AB, df.IN, df.UT, df.PY, df.AU)
    all_docs = list(df.AB)

    # Add the documents to the database
    print("Adding docs to database")
    pool = Pool(processes=8)
    pool.map(f_doc, docs)
    pool.terminate()

    global doc_diff
    doc_diff = db.docdiff(df.IN[-1])

    del (df)
    del (docs)
    gc.collect()

    # docs = zip(df.TI,df.AB,df.IN,df.UT,df.PY,df.AU)
    # pool = Pool(processes=8)
    # pool.map(f_auth,docs)
    # pool.terminate()

    print("All docs added, initialising topic model")

    olda = gensim.models.LdaMulticore(num_topics=K, id2word=dictionary, workers=8)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, iterations):
        t0 = time.time()
        firstDoc = batchsize * iteration
        lastDoc = batchsize * (iteration + 1)
        if lastDoc > D:
            lastDoc = D
        docset = all_docs[firstDoc:lastDoc]
        mycorpus = MiniCorpus(docset, stoplist, dictionary)

        olda_t0 = time.time()
        olda.update(mycorpus)
        elapsed = time.time() - t0
        print("updated with docs " + str(firstDoc) + " to " + str(lastDoc) +
              ": took " + str(elapsed))

        # Get the gamma (doc-topic matrix) and add to database
        gamma = olda.inference(mycorpus)
        gamma = gamma[0]
        docs = range(len(gamma))
        pool = Pool(processes=8)
        pool.map(f_gamma, docs)
        pool.terminate()

        # Every 10th iteration, get the lambda (topic-term matrix) and add to database
        if (iteration % 20 == 0):
            ldalambda = olda.inference(mycorpus, collect_sstats=True)[1]
            numpy.savetxt('lambda-%d.dat' % iteration, ldalambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
            topics = range(len(ldalambda))
            pool = Pool(processes=8)
            pool.map(f_lambda, topics)
            pool.terminate()

        gc.collect()
        sys.stdout.flush()

        elapsed = time.time() - t0
        print(elapsed)
        db.increment_batch_count()

    olda.save("olda" + str(run_id))
def test(): print('cpuCount() = %d\n' % cpuCount()) # # Create pool # PROCESSES = 4 print('Creating pool with %d processes\n' % PROCESSES) pool = Pool(PROCESSES) # # Tests # TASKS = [(mul, (i, 7)) for i in range(10)] + \ [(plus, (i, 8)) for i in range(10)] results = [pool.apply_async(calculate, t) for t in TASKS] imap_it = pool.imap(calculatestar, TASKS) imap_unordered_it = pool.imap_unordered(calculatestar, TASKS) print('Ordered results using pool.apply_async():') for r in results: print('\t', r.get()) print() print('Ordered results using pool.imap():') for x in imap_it: print('\t', x) print() print('Unordered results using pool.imap_unordered():') for x in imap_unordered_it: print('\t', x) print() print('Ordered results using pool.map() --- will block till complete:') for x in pool.map(calculatestar, TASKS): print('\t', x) print() # # Simple benchmarks # N = 100000 print('def pow3(x): return x**3') t = time.time() A = list(map(pow3, range(N))) print('\tmap(pow3, range(%d)):\n\t\t%s seconds' % \ (N, time.time() - t)) t = time.time() B = pool.map(pow3, range(N)) print('\tpool.map(pow3, range(%d)):\n\t\t%s seconds' % \ (N, time.time() - t)) t = time.time() C = list(pool.imap(pow3, range(N), chunksize=N // 8)) print('\tlist(pool.imap(pow3, range(%d), chunksize=%d)):\n\t\t%s' \ ' seconds' % (N, N//8, time.time() - t)) assert A == B == C, (len(A), len(B), len(C)) print() L = [None] * 1000000 print('def noop(x): pass') print('L = [None] * 1000000') t = time.time() A = list(map(noop, L)) print('\tmap(noop, L):\n\t\t%s seconds' % \ (time.time() - t)) t = time.time() B = pool.map(noop, L) print('\tpool.map(noop, L):\n\t\t%s seconds' % \ (time.time() - t)) t = time.time() C = list(pool.imap(noop, L, chunksize=len(L) // 8)) print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \ (len(L)//8, time.time() - t)) assert A == B == C, (len(A), len(B), len(C)) print() del A, B, C, L # # Test error handling # print('Testing error handling:') try: print(pool.apply(f, (5, ))) except ZeroDivisionError: print('\tGot ZeroDivisionError as expected from pool.apply()') else: raise AssertionError('expected ZeroDivisionError') try: print(pool.map(f, range(10))) except ZeroDivisionError: print('\tGot ZeroDivisionError as expected from pool.map()') else: raise AssertionError('expected ZeroDivisionError') try: print(list(pool.imap(f, range(10)))) except ZeroDivisionError: print('\tGot ZeroDivisionError as expected from list(pool.imap())') else: raise AssertionError('expected ZeroDivisionError') it = pool.imap(f, range(10)) for i in range(10): try: x = it.next() except ZeroDivisionError: if i == 5: pass except StopIteration: break else: if i == 5: raise AssertionError('expected ZeroDivisionError') assert i == 9 print('\tGot ZeroDivisionError as expected from IMapIterator.next()') print() # # Testing timeouts # print('Testing ApplyResult.get() with timeout:', end='') res = pool.apply_async(calculate, TASKS[0]) while 1: sys.stdout.flush() try: sys.stdout.write('\n\t%s' % res.get(0.02)) break except TimeoutError: sys.stdout.write('.') print() print() print('Testing IMapIterator.next() with timeout:', end='') it = pool.imap(calculatestar, TASKS) while 1: sys.stdout.flush() try: sys.stdout.write('\n\t%s' % it.next(0.02)) except StopIteration: break except TimeoutError: sys.stdout.write('.') print() print() # # Testing callback # print('Testing callback:') A = [] B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729] r = pool.apply_async(mul, (7, 8), callback=A.append) r.wait() r = pool.map_async(pow3, range(10), 
callback=A.extend) r.wait() if A == B: print('\tcallbacks succeeded\n') else: print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)) # # Check there are no outstanding tasks # assert not pool._cache, 'cache = %r' % pool._cache # # Check close() methods # print('Testing close():') for worker in pool._pool: assert worker.is_alive() result = pool.apply_async(time.sleep, [0.5]) pool.close() pool.join() assert result.get() is None for worker in pool._pool: assert not worker.is_alive() print('\tclose() succeeded\n') # # Check terminate() method # print('Testing terminate():') pool = Pool(2) ignore = pool.apply(pow3, [2]) results = [pool.apply_async(time.sleep, [10]) for i in range(10)] pool.terminate() pool.join() for worker in pool._pool: assert not worker.is_alive() print('\tterminate() succeeded\n') # # Check garbage collection # print('Testing garbage collection:') pool = Pool(2) processes = pool._pool ignore = pool.apply(pow3, [2]) results = [pool.apply_async(time.sleep, [10]) for i in range(10)] del results, pool time.sleep(0.2) for worker in processes: assert not worker.is_alive() print('\tgarbage collection succeeded\n')
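# test() above depends on helpers that are not shown in this excerpt (calculate,
# calculatestar, mul, plus, f, pow3, noop). The definitions below are reconstructions
# consistent with how they are used and with the classic multiprocessing Pool example;
# treat them as an assumption rather than the original source.
import time
import random
import multiprocessing

def calculate(func, args):
    result = func(*args)
    return '%s says that %s%s = %s' % (
        multiprocessing.current_process().name,
        func.__name__, args, result)

def calculatestar(args):
    return calculate(*args)

def mul(a, b):
    time.sleep(0.5 * random.random())
    return a * b

def plus(a, b):
    time.sleep(0.5 * random.random())
    return a + b

def f(x):
    return 1.0 / (x - 5.0)  # raises ZeroDivisionError at x == 5, as the error tests expect

def pow3(x):
    return x ** 3

def noop(x):
    pass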
def upload_dtm(run_id, output_path): stat = RunStats.objects.get(pk=run_id) print("upload dtm results to db") info = readInfo(os.path.join(output_path, "lda-seq/info.dat")) topic_ids = db.add_topics(stat.K, stat.run_id) vocab_ids = [] input_path = output_path.replace("-output-", "-input-") with open(os.path.join(input_path, 'foo-vocab.dat'), 'r') as f: for l in f: try: vocab_ids.append(int(l.split(':')[0].strip())) except: pass ids = [] docsizes = [] with open(os.path.join(input_path, 'foo-docids.dat'), 'r') as f: for l in f: try: id, s = [int(x.strip()) for x in l.split(':')] ids.append(id) docsizes.append(s) except: pass time_range = sorted([tp.n for tp in stat.periods.all().order_by('n')]) ################################# # TopicTerms print("writing topic terms") topics = range(info['NUM_TOPICS']) pool = Pool(processes=8) pool.map( partial(dtm_topic, info=info, topic_ids=topic_ids, vocab_ids=vocab_ids, ys=time_range, run_id=run_id, output_path=output_path), topics) pool.terminate() gc.collect() ###################################### # Doctopics print("writing doctopics") gamma = np.fromfile(os.path.join(output_path, 'lda-seq/gam.dat'), dtype=float, sep=" ") gamma = gamma.reshape((int(len(gamma) / stat.K), stat.K)) gamma = find(csr_matrix(gamma)) glength = len(gamma[0]) chunk_size = 100000 ps = 16 parallel_add = True all_dts = [] make_t = 0 add_t = 0 for i in range(glength // chunk_size + 1): dts = [] values_list = [] f = i * chunk_size l = (i + 1) * chunk_size if l > glength: l = glength docs = range(f, l) doc_batches = [] for p in range(ps): doc_batches.append([x for x in docs if x % ps == p]) pool = Pool(processes=ps) make_t0 = time() values_list.append( pool.map( partial(db.f_gamma_batch, gamma=gamma, docsizes=docsizes, docUTset=ids, topic_ids=topic_ids, run_id=run_id), doc_batches)) pool.terminate() make_t += time() - make_t0 django.db.connections.close_all() add_t0 = time() values_list = [item for sublist in values_list for item in sublist] pool = Pool(processes=ps) pool.map(db.insert_many, values_list) pool.terminate() add_t += time() - add_t0 gc.collect() sys.stdout.flush() stat = RunStats.objects.get(run_id=run_id) stat.last_update = timezone.now() stat.status = 3 # 3 = finished stat.save() management.call_command('update_run', run_id)
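# upload_dtm() above splits the document indices of each chunk into `ps` round-robin
# batches (x % ps == p) before mapping a batch worker (db.f_gamma_batch) over them.
# A tiny standalone sketch of that batching step, with toy data and a hypothetical
# worker in place of the database call:
from functools import partial
from multiprocessing import Pool

def batch_sum(batch, weights):
    # stand-in for db.f_gamma_batch: process one batch of document indices
    return sum(weights[i] for i in batch)

if __name__ == '__main__':
    ps = 4
    docs = range(10)                       # one chunk of document indices
    weights = {i: float(i) for i in docs}  # toy per-document values
    doc_batches = [[x for x in docs if x % ps == p] for p in range(ps)]
    # doc_batches == [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]
    with Pool(processes=ps) as pool:
        totals = pool.map(partial(batch_sum, weights=weights), doc_batches)
    print(totals)  # [12.0, 15.0, 8.0, 10.0]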
def run_blei_dtm(stat, call_to_blei_algorithm=True, dtm_path="/home/galm/software/dtm/dtm/main", archiving=True): """ Run Blei's dynamic topic model (DTM) on utterances (speeches) or paragraphs from the parliament data :param stat: RunStats object with the parameters to run the model with :param call_to_blei_algorithm: boolean whether to call the Blei algorithm or not :param dtm_path: path to the dtm binary :param archiving: boolean whether to zip and remove the input/output folders afterwards :return: 0 if successful, 1 otherwise """ start_datetime = timezone.now() print("starting topic model for runstat with settings:") for field in stat._meta.fields: field_value = getattr(stat, field.name) if field_value: print("{}: {}".format(field.name, field_value)) run_id = stat.run_id s = Search.objects.get(pk=stat.psearch.id) ########################## ## create input and output folder input_path = './dtm-input-sid{}_{}'.format(stat.psearch.id, stat.pk) output_path = './dtm-output-sid{}_{}'.format(stat.psearch.id, stat.pk) if os.path.isdir(input_path): if call_to_blei_algorithm: shutil.rmtree(input_path) os.mkdir(input_path) else: os.mkdir(input_path) if os.path.isdir(output_path): if call_to_blei_algorithm: shutil.rmtree(output_path) os.mkdir(output_path) else: os.mkdir(output_path) # load text from database if s.search_object_type == 1: ps = Paragraph.objects.filter(search_matches=s) docs = ps.filter( text__iregex=r'\w').order_by('utterance__document__parlperiod__n') texts, docsizes, ids = process_texts(docs) tc = ps.order_by().values( 'utterance__document__parlperiod__n').annotate( count=models.Count('utterance__document__parlperiod__n')) time_counts = { item['utterance__document__parlperiod__n']: item['count'] for item in tc } pps = ParlPeriod.objects.filter( document__utterance__paragraph__in=ps).distinct() elif s.search_object_type == 2: uts = Utterance.objects.filter( search_matches=s).order_by('document__parlperiod__n') texts, docsizes, ids = merge_utterance_paragraphs(uts) tc = uts.order_by().values('document__parlperiod__n').annotate( count=models.Count('document__parlperiod__n')) time_counts = { item['document__parlperiod__n']: item['count'] for item in tc } pps = ParlPeriod.objects.filter(document__utterance__in=uts).distinct() else: print("search object type invalid") return 1 for i, pp in enumerate(pps.order_by('n')): try: tp, created = TimePeriod.objects.get_or_create(parlperiod=pp, n=i, ys=pp.years, title=str(pp)) except MultipleObjectsReturned: tp = TimePeriod.objects.filter( parlperiod=pp, n=i, ys=pp.years, title=str(pp)).order_by('id').first() stat.periods.add(tp) time_range = [s.n for s in stat.periods.all().order_by('n')] ######################### ## Get the features now print("Extracting word features...") if stat.language == "german": stemmer = SnowballStemmer("german") tokenizer = german_stemmer() stopword_list = [stemmer.stem(t) for t in stopwords.words("german")] elif stat.language == "english": stemmer = SnowballStemmer("english") stopword_list = [stemmer.stem(t) for t in stopwords.words("english")] tokenizer = snowball_stemmer() else: print("Language not recognized: {}".format(stat.language)) return 1 if stat.extra_stopwords: stopword_list = list(set(stopword_list) | set(stat.extra_stopwords)) if stat.max_features == 0: n_features = 100000000 else: n_features = stat.max_features vectorizer = CountVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(1, stat.ngram), tokenizer=tokenizer, stop_words=stopword_list) t0 = time() dtm = vectorizer.fit_transform(texts) print("done in %0.3fs." 
% (time() - t0)) with open(os.path.join(input_path, 'foo-doctexts.dat'), 'w') as f: for i, text in enumerate(texts): f.write("D#{}: ".format(i) + text + "\n") f.write('\n') del texts gc.collect() print("Save terms to DB") # Get the vocab, add it to db vocab = vectorizer.get_feature_names() vocab_ids = [] pool = Pool(processes=8) vocab_ids.append(pool.map(partial(db.add_features, run_id=run_id), vocab)) pool.terminate() vocab_ids = vocab_ids[0] with open(os.path.join(input_path, 'foo-vocab.dat'), 'w') as f: for i, w in enumerate(vocab): f.write(str(vocab_ids[i]) + ": " + w + "\n") f.write('\n') del vocab django.db.connections.close_all() print("write input files for Blei algorithm") with open(os.path.join(input_path, 'foo-mult.dat'), 'w') as mult: for d in range(dtm.shape[0]): words = find(dtm[d]) uwords = len(words[0]) mult.write(str(uwords) + " ") for w in range(uwords): index = words[1][w] count = words[2][w] mult.write(str(index) + ": " + str(count) + " ") mult.write('\n') ########################## ##put counts per time step in the seq file with open(os.path.join(input_path, 'foo-seq.dat'), 'w') as seq: seq.write(str(len(time_range))) for key, value in time_counts.items(): seq.write('\n') seq.write(str(value)) ########################## # Run the dtm if call_to_blei_algorithm: print("Calling Blei algorithm") process_output = open( os.path.join(output_path, 'blei_dtm_algorithm.log'), 'w') subprocess.Popen([ dtm_path, "--ntopics={}".format( stat.K), "--mode=fit", "--rng_seed={}".format(stat.rng_seed), "--initialize_lda=true", "--corpus_prefix={}".format( os.path.join(os.path.abspath(input_path), 'foo')), "--outname={}".format( os.path.abspath(output_path)), "--top_chain_var={}".format( stat.top_chain_var), "--alpha={}".format(stat.alpha), "--lda_sequence_min_iter=10", "--lda_sequence_max_iter={}".format( stat.max_iter), "--lda_max_em_iter=20" ], stdout=process_output, stderr=process_output).wait() print("Blei algorithm done") ########################## ## Upload the dtm results to the db print("upload dtm results to db") info = readInfo(os.path.join(output_path, "lda-seq/info.dat")) topic_ids = db.add_topics(stat.K, stat.run_id) ################################# # TopicTerms print("writing topic terms") topics = range(info['NUM_TOPICS']) pool = Pool(processes=8) pool.map( partial(dtm_topic, info=info, topic_ids=topic_ids, vocab_ids=vocab_ids, ys=time_range, run_id=run_id, output_path=output_path), topics) pool.terminate() gc.collect() ###################################### # Doctopics print("writing doctopics") gamma = np.fromfile(os.path.join(output_path, 'lda-seq/gam.dat'), dtype=float, sep=" ") gamma = gamma.reshape((int(len(gamma) / stat.K), stat.K)) gamma = find(csr_matrix(gamma)) glength = len(gamma[0]) chunk_size = 100000 ps = 16 parallel_add = True all_dts = [] make_t = 0 add_t = 0 for i in range(glength // chunk_size + 1): dts = [] values_list = [] f = i * chunk_size l = (i + 1) * chunk_size if l > glength: l = glength docs = range(f, l) doc_batches = [] for p in range(ps): doc_batches.append([x for x in docs if x % ps == p]) pool = Pool(processes=ps) make_t0 = time() values_list.append( pool.map( partial(db.f_gamma_batch, gamma=gamma, docsizes=docsizes, docUTset=ids, topic_ids=topic_ids, run_id=run_id), doc_batches)) pool.terminate() make_t += time() - make_t0 django.db.connections.close_all() add_t0 = time() values_list = [item for sublist in values_list for item in sublist] pool = Pool(processes=ps) if s.search_object_type == 1: pool.map(db.insert_many_pars, values_list) elif 
s.search_object_type == 2: pool.map(db.insert_many_utterances, values_list) pool.terminate() add_t += time() - add_t0 gc.collect() sys.stdout.flush() stat = RunStats.objects.get(run_id=run_id) stat.last_update = timezone.now() stat.runtime = timezone.now() - start_datetime stat.status = 3 # 3 = finished stat.save() management.call_command('update_run', run_id) totalTime = time() - t0 tm = int(totalTime // 60) ts = int(totalTime - (tm * 60)) print("done! total time: " + str(tm) + " minutes and " + str(ts) + " seconds") print("a maximum of " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) + " MB was used") if archiving: if not call_to_blei_algorithm: process_output = open(os.path.join(output_path, 'blei_dtm_algorithm.log'), 'w') # otherwise undefined when the Blei call was skipped subprocess.Popen([ "zip", "-r", output_path + ".zip", output_path + "/", ], stdout=process_output, stderr=process_output).wait() subprocess.Popen([ "zip", "-r", input_path + ".zip", input_path + "/", ], stdout=process_output, stderr=process_output).wait() subprocess.Popen(["rm", "-r", output_path], stdout=process_output, stderr=process_output).wait() subprocess.Popen(["rm", "-r", input_path], stdout=process_output, stderr=process_output).wait() return 0
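# run_blei_dtm() above serialises each row of the sparse document-term matrix into the
# LDA-C style "N index:count index:count ..." lines that the DTM binary reads from
# foo-mult.dat (the code above additionally writes a space after each colon). A toy,
# standalone illustration of that conversion:
from scipy.sparse import csr_matrix, find

dtm = csr_matrix([[2, 0, 1],
                  [0, 3, 0]])

lines = []
for d in range(dtm.shape[0]):
    rows, cols, vals = find(dtm[d])
    terms = " ".join("{}:{}".format(c, v) for c, v in zip(cols, vals))
    lines.append("{} {}".format(len(cols), terms))

print("\n".join(lines))
# 2 0:2 2:1
# 1 1:3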
def feature_importance(self, model_instance, ascending=True, filter_classes=None, n_jobs=-1, progressbar=True, n_samples=5000, method='prediction-variance', scorer_type='default', use_scaling=False): """ Computes feature importance of all features related to a model instance. Supports classification, multi-class classification, and regression. Wei, Pengfei, Zhenzhou Lu, and Jingwen Song. "Variable Importance Analysis: A Comprehensive Review". Reliability Engineering & System Safety 142 (2015): 399-432. Parameters ---------- model_instance: skater.model.model.Model subtype the machine learning model "prediction" function to explain, such that predictions = predict_fn(data). ascending: boolean, default True Orders the returned importances ascending vs. descending. filter_classes: array type The classes to compute importance for. Default None invokes all classes. Only used in classification models. n_jobs: int How many concurrent processes to use. Defaults to -1, which grabs as many as are available. Use 1 to avoid multiprocessing altogether. progressbar: bool Whether to display progress. This affects which function we use to operate on the pool of processes; enabling the progress bar slows runs by roughly 10-20%. n_samples: int How many samples to use when computing importance. method: string, default 'prediction-variance' How to compute feature importance; 'model-scoring' uses an estimator-specific scoring metric and requires Interpretation.training_labels. Note that this choice only rarely makes a significant difference. prediction-variance: mean absolute value of changes in predictions, given perturbations. model-scoring: difference in log_loss or MAE of training_labels given perturbations. scorer_type: string only used when method='model-scoring', and in this case defines which scoring function to use. Default value is 'default', which evaluates to: regressors: mean absolute error classifiers with probabilities: cross entropy classifiers without probabilities: f1 score See Skater.model.scorers for details. use_scaling: bool Whether to weight the importance values by the strength of the perturbations. Generally doesn't affect results unless n_samples is very small. Returns ------- importances : sorted pandas Series Examples -------- >>> from skater.model import InMemoryModel >>> from skater.core.explanations import Interpretation >>> from sklearn.ensemble import RandomForestClassifier >>> rf = RandomForestClassifier() >>> rf.fit(X, y) >>> model = InMemoryModel(rf.predict_proba, examples=X) >>> interpreter = Interpretation() >>> interpreter.load_data(X) >>> interpreter.feature_importance.feature_importance(model) """ if filter_classes: err_msg = "members of filter classes must be " \ "members of model_instance.classes. " 
\ "Expected members of: {0}\n" \ "got: {1}".format(model_instance.target_names, filter_classes) filter_classes = list(filter_classes) assert all([ i in model_instance.target_names for i in filter_classes ]), err_msg if n_samples <= self.data_set.n_rows: inputs, labels = self.data_set.generate_sample( strategy='random-choice', include_y=True, sample=True, n_samples=n_samples) else: inputs, labels = self.data_set.X, self.data_set.y if method == 'model-scoring' and labels is None: raise FeatureImportanceError( "If labels are not set, you" "can only use feature importance methods that do " "not require ground truth labels") original_predictions = model_instance.predict(inputs) n_jobs = None if n_jobs < 0 else n_jobs predict_fn = model_instance._get_static_predictor() executor_instance = Pool(n_jobs) arg_list = self.data_set.feature_ids scorer = model_instance.scorers.get_scorer_function( scorer_type=scorer_type) if progressbar: self.interpreter.logger.warn( "Progress bars slow down runs by 10-20%. For slightly \n" "faster runs, do progress_bar=False") n_iter = len(self.data_set.feature_ids) p = ProgressBar(n_iter, units='features') mapper = executor_instance.imap else: mapper = executor_instance.map fi_func = partial(compute_feature_importance, input_data=inputs, estimator_fn=predict_fn, original_predictions=original_predictions, feature_info=self.data_set.feature_info, feature_names=self.data_set.feature_ids, training_labels=labels, method=method, scaled=use_scaling, scorer=scorer) importances = {} try: if n_jobs == 1: raise ValueError("Skipping to single processing") importance_dicts = [] for importance in mapper(fi_func, arg_list): importance_dicts.append(importance) if progressbar: p.animate() except: self.interpreter.logger.warn( "Multiprocessing failed, going single process") importance_dicts = [] for importance in map(fi_func, arg_list): importance_dicts.append(importance) if progressbar: p.animate() finally: executor_instance.close() executor_instance.join() executor_instance.terminate() for i in importance_dicts: importances.update(i) importances = pd.Series(importances).sort_values(ascending=ascending) if not importances.sum() > 0: self.interpreter.logger.debug( "Importances that caused a bug: {}".format(importances)) raise (FeatureImportanceError( "Something went wrong. Importances do not sum to a positive value\n" "This could be due to:\n" "1) 0 or infinite divisions\n" "2) perturbed values == original values\n" "3) feature is a constant\n" "Please submit an issue here:\n" "https://github.com/datascienceinc/Skater/issues")) importances = divide_zerosafe( importances, (np.ones(importances.shape[0]) * importances.sum())) return importances
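# divide_zerosafe() used at the end of feature_importance() is not shown in this
# excerpt. A minimal reconstruction consistent with how it is called (element-wise
# division that returns 0 wherever the denominator is 0, used here to normalise the
# importances to sum to 1) might look like the sketch below; treat it as an
# assumption, not Skater's actual implementation.
import numpy as np

def divide_zerosafe(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    out = np.zeros_like(a)
    np.divide(a, b, out=out, where=(b != 0))
    return out

if __name__ == '__main__':
    importances = np.array([0.2, 0.5, 0.3, 0.0])
    totals = np.ones(importances.shape[0]) * importances.sum()
    print(divide_zerosafe(importances, totals))  # [0.2 0.5 0.3 0. ]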