def format_to_lines(args):
    """Split raw json files into train/valid/test shards.

    Uses SplitRawFiles to build mapping files under ``args.map_path``, then
    assigns each raw ``*.json`` file to a split based on those mappings and
    writes shards of up to ``args.shard_size`` tokenized documents to
    ``args.save_path`` as ``<split>.<n>.json``.

    Cleanup: removed a dead loop counter (`i`) that was incremented but
    never read, plus commented-out debug code.
    """
    if not os.path.isdir(args.map_path):
        os.makedirs(args.map_path)
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    data_splitter = SplitRawFiles(args.raw_path, args.map_path)
    data_splitter.get_and_split_filenames()
    data_splitter.save_fnames_to_corresponding_files()

    # Build a {filename: 1} lookup table per split from the mapping files.
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line)
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # NOTE(review): '/'-based split assumes POSIX paths — TODO confirm
        # this is never run on Windows.
        real_name = f.split('/')[-1]
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # Flush the final, smaller-than-shard_size shard (if any).
        if (len(dataset) > 0):
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def createZips(self):
    # Build all zip archives in parallel and report elapsed wall time.
    t1 = time()
    # NOTE(review): a __main__ guard inside a method is unusual — it makes
    # this a no-op when the module is imported. Presumably a guard against
    # recursive pool spawning on Windows; confirm intent.
    if __name__ == '__main__':
        self.get_list_of_id()  # get set of string id
        p = Pool()
        p.map(self.createZip, range(self.count_zips))
        p.close()
        p.join()
        print('Create .zip files time = ' + str(time() - t1) + 's')
def format_to_lines(args):
    """Shard raw json documents into train/valid/test files.

    No mapping file is used here: every document is assigned to a split by
    a seeded random draw, giving a reproducible 8:1:1 partition
    (train:valid:test).
    """
    train_files, valid_files, test_files = [], [], []
    import random
    random.seed(1)  # fixed seed -> identical split on every run
    for fname in glob.glob(pjoin(args.raw_path, '*.json')):
        draw = random.random()
        if draw <= 0.1:
            valid_files.append(fname)
        elif draw <= 0.2:
            test_files.append(fname)
        else:
            train_files.append(fname)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        job_args = [(fname, args) for fname in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        shard = []
        shard_no = 0
        for doc in pool.imap_unordered(_format_to_lines, job_args):
            shard.append(doc)
            if len(shard) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, shard_no)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(shard))
                shard_no += 1
                shard = []
        pool.close()
        pool.join()
        # Whatever is left becomes one final, smaller shard.
        if shard:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, shard_no)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(shard))
            shard_no += 1
            shard = []
def format_to_lines(args):
    """Assign raw json files to train/valid/test and write tokenized shards.

    Split membership comes either from hashhex mapping files (when
    ``args.map_on`` is set and ``args.map_path`` is not 'empty') or from
    ``manual_corp_assign``.

    Bug fix: the Windows-style ``f.split('\\\\')`` result was immediately
    overwritten by the '/'-based split — the dead statement is removed.
    """
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        if args.map_on and args.map_path != 'empty':
            # Mapping files hold one (hashed) document name per line.
            for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
                temp.append(hashhex(line.strip()))
            corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
        else:
            tr, va, te = manual_corp_assign(args)
            corpus_mapping['valid'] = va
            corpus_mapping['test'] = te
            corpus_mapping['train'] = tr

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        if args.map_on and args.map_path != 'empty':
            real_name = f.split('/')[-1].split('.')[0]
        else:
            # Manual assignment keys on the full path, not the basename.
            real_name = f
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # Flush the final partial shard.
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def get_gameplays():
    # Scrape pro-football-reference play-finder pages for every team, game
    # slot and season, then dump the parsed play-type data to one json file
    # per (year, game number).
    PlayTypeDict = {}
    # Keyword fragments used to classify a play-by-play description string.
    # NOTE(review): PlayTypeStrings and PlayTypeCounts are built here but
    # not referenced in this function body — presumably consumed by
    # GetAndParsePath via module globals; confirm.
    PlayTypeStrings = {
        'Pass': ['pass incomplete', 'pass complete', 'sacked'],
        'Admin': ['spiked the ball', 'Timeout', 'Penalty', 'aborted'],
        'Kneel': ['knee', 'knelt'],
        'Punt': ['Punts'],
        'Field Goal': ['field goal', 'no good'],
        'Special Teams': ['kicks off', 'kicks onside', 'extra point', 'two point'],
        'Run': [
            'left end', 'right end', ' for ', 'up the middle', 'middle for',
            'left tackle', 'left guard', 'right guard', 'right tackle'
        ],
    }
    YearStart = 1998
    YearsToGo = 20
    for Year in range(YearStart, YearStart + YearsToGo):
        PlayTypeCounts = {
            'Pass': 0,
            'Run': 0,
            'Punt': 0,
            'Field Goal': 0,
            'Admin': 0,
            'Kneel': 0,
            'Special Teams': 0
        }
        # Regular season games 1..16.
        for GameNumber in range(1, 17):
            print('Game', GameNumber, 'in', Year, 'Time: ', datetime.now())
            PlayTypeDict = {}  # reset per game; filled by GetAndParsePath (presumably via a global)
            PathList = []
            # One widget URL per (team, home/away) combination.
            for Team in TeamLookup:
                for GameLocation in ['H', 'A']:
                    path = 'https://widgets.sports-reference.com/wg.fcgi?css=1&site=pfr&url=%2Fplay-index%2Fplay_finder.cgi%3Frequest%3D1%26match%3Dall%26year_min%3D{YEAR}%26year_max%3D{YEAR}%26game_type%3DR%26game_num_min%3D{GameNumber}%26game_num_max%3D{GameNumber}%26week_num_min%3D0%26week_num_max%3D99%26game_location%3D{GameLocation}%26minutes_max%3D15%26seconds_max%3D0%26minutes_min%3D0%26seconds_min%3D0%26team_id%3D{TEAM}%26field_pos_min_field%3Dteam%26field_pos_max_field%3Dteam%26end_field_pos_min_field%3Dteam%26end_field_pos_max_field%3Dteam%26type%255B%255D%3DPASS%26type%255B%255D%3DRUSH%26type%255B%255D%3DPUNT%26type%255B%255D%3DKOFF%26type%255B%255D%3DONSD%26type%255B%255D%3DFG%26type%255B%255D%3DXP%26type%255B%255D%3D2PC%26no_play%3DN%26turnover_type%255B%255D%3Dinterception%26turnover_type%255B%255D%3Dfumble%26score_type%255B%255D%3Dtouchdown%26score_type%255B%255D%3Dfield_goal%26score_type%255B%255D%3Dsafety%26order_by%3Dyds_to_go&div=div_all_plays&del_col=1,11,12,13,14'.format(
                        YEAR=Year, GameNumber=GameNumber, TEAM=Team,
                        GameLocation=GameLocation)
                    PathList.append(path)
                    #req = get(path)
            # Fetch/parse all URLs for this game with 8 concurrent workers.
            p = Pool(8)  # Pool tells how many at a time
            records = p.map(GetAndParsePath, PathList)
            p.terminate()
            p.join()
            # Persist the per-game dictionary.
            with open(
                    'output/PlayTypeCounts-Year-' + str(Year) + '-Game-' +
                    str(GameNumber) + '.json', 'w') as outfile:
                json.dump(PlayTypeDict, outfile)
def mp_pooler(self, nCORES, func, *args):
    """Run ``func`` asynchronously over consecutive pairs of ``args[0]``.

    Each task receives (args[0][i], args[0][i+1], *args[1:], i); the GUI
    progress bar is advanced once per submitted task. Returns the list of
    AsyncResult handles after the pool has drained.
    """
    pool = Pool(nCORES)
    print('computing with', nCORES, 'processes in parallel')
    results = []
    for idx in range(len(args[0]) - 1):
        task = (args[0][idx], args[0][idx + 1], *args[1:], idx,)
        results.append(pool.apply_async(func, task))
        # Keep the UI responsive while tasks are being queued.
        self.loading.progress2['value'] += 1
        self.update()
    pool.close()
    pool.join()
    return results
def format_to_lines_tfds(args):
    """
    Formats source text and target text as pt file.

    Reads tokenized json files from per-split subdirectories of
    ``args.raw_path`` and writes shards of up to ``args.shard_size``
    documents to ``args.save_path`` as ``<dataset>.<split>.<n>.json``
    (the 'validation' split is renamed 'valid' in file names).

    Cleanup: removed a dead trailing ``files = []`` that was never read.
    """
    tokenized_sub_dirs = os.listdir(args.raw_path)
    dataset_name = os.path.dirname(args.save_path).split('/')[-1]

    # Make directory
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    # Create file list for each split directory
    corpora = {}
    for tokenized_sub_dir in tokenized_sub_dirs:
        path = pjoin(args.raw_path, tokenized_sub_dir)
        files = []
        for f in glob.glob(pjoin(path, '*.json')):
            files.append(f)
        corpora[tokenized_sub_dir] = files

    for corpus_type in tokenized_sub_dirs:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # NOTE: save files according to shard_size
            if (len(dataset) >= args.shard_size):
                type_name = 'valid' if corpus_type == 'validation' else corpus_type
                pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name, p_ct)
                with open(pjoin(args.save_path, pt_file), 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # For the last few data (< shard size)
        if (len(dataset) > 0):
            type_name = 'valid' if corpus_type == 'validation' else corpus_type
            pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name, p_ct)
            with open(pjoin(args.save_path, pt_file), 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def calculate(self, data): """ run graph calculations """ # make sure data is valid when using schema if self._schema: try: import jsonschema except ImportError: msg = 'jsonschema package is needed for validating data' raise ImportError(msg) jsonschema.validate(instance=data, schema=self._schema) t1 = dt.datetime.utcnow() LOGGER.info('Starting calculation...') self._data = Data(data) self._data.check_inputs(self.sim_inputs, self.sim_outputs) if not self._sorted_dep: self._topological_sort() for items in self._sorted_dep: # loading node with inputs for item in items: node = self._get_node(item) inputs = [i for i in node.inputs_without_constants] for inp in inputs: node.set_value_to_input(inp.name, self._data[inp.map]) # running nodes if self._parallel: try: from multiprocess import Pool except ImportError: msg = 'multiprocess package is needed for parralelism' raise ImportError(msg) pool = Pool(self._pool_size) results = pool.map(Graph.run_node, [self._get_node(i) for i in items]) pool.close() pool.join() results = {k: v for k, v in results} else: results = {} for item in items: node = self._get_node(item) res = node.run_with_loaded_inputs() results[node.id] = res # save results for item in items: node = self._get_node(item) res = results[node.id] if len(node.outputs) == 1: self._data[node.outputs[0].map] = res else: for i, out in enumerate(node.outputs): self._data[out.map] = res[i] t2 = dt.datetime.utcnow() LOGGER.info('Calculation finished in {}'.format(t2 - t1)) return res
def fmultiprocess(log, function, inputArray, poolSize=False, **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- number of worker processes; falsy means let
          Pool() pick the default (cpu_count)

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName,
                                   inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.info('starting the ``multiprocess`` function')

    # DEFINTE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    # if cpu_count() > 1:
    #     poolSize = cpu_count() - 1
    # else:
    #     poolSize = 1
    # if len(inputArray) < poolSize:
    #     poolSize = len(inputArray)
    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    # NOTE(review): the bare excepts below are a deliberate fallback chain:
    # first try passing ``log`` through to the workers, then without it
    # (for functions that take no ``log``), then retry with ``log`` so the
    # original exception resurfaces if both fail. A bare except also
    # swallows KeyboardInterrupt — consider ``except Exception`` instead.
    try:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map(mapfunc, inputArray)
    except:
        try:
            mapfunc = partial(function, **kwargs)
            resultArray = p.map(mapfunc, inputArray)
        except:
            mapfunc = partial(function, log=log, **kwargs)
            resultArray = p.map(mapfunc, inputArray)

    # NOTE(review): terminate() right after close() kills workers rather
    # than letting them drain; results are already collected by map() so
    # this is safe, but close()+join() alone would be the ordinary pattern.
    p.close()
    p.terminate()
    p.join()

    log.info('completed the ``multiprocess`` function')
    return resultArray
def extract_patterns_matching_async(self):
    # Extract patterns from all ngram files on a worker pool and report
    # the total wall-clock time. (Python 2 module: print statements.)
    startTime = time.time()
    print "running on {} processors".format(WORKERS)
    # NOTE(review): Pool only forwards ``initargs`` when an ``initializer``
    # callable is also given; without one these initargs are silently
    # ignored, so sent_locker/lock/sentence_counter never reach the
    # workers. Likely a latent bug — confirm against the worker code.
    pool = Pool(processes=WORKERS, initargs=(sent_locker, lock, sentence_counter))
    pool.map(self.extract_patterns_from_file, self.data_wrapper.ngrams_files)
    pool.close()
    pool.join()
    total_time = time.time() - startTime
    print "extract_patterns_matching_async running time: {}".format(
        total_time)
def zte_gpon_svlan_check():
    """Collect GPON svlan entries from every ZTE OLT carrying a GTGO card
    and record them through ``svlan_entry`` (serialized by a shared lock)."""
    clear_log()
    records = graph.cypher.execute(
        "match(n:Olt)--(c:Card) where c.name='GTGO' return n.ip,collect(c.slot)")
    olt_iter = ((record[0], record[1]) for record in records)
    fetch = lambda item: zte_gpon_svlan(ip=item[0], slots=item[1])
    pool = Pool(8)
    lock = Manager().Lock()
    writer = partial(svlan_entry, lock)
    # map is wrapped in list() purely to force evaluation of all tasks.
    list(pool.map(compose(writer, fetch), olt_iter))
    pool.close()
    pool.join()
def prime_calculate(self):
    """Split the range [0, maximum_prime] into ``cores`` contiguous chunks
    and hand each chunk to ``prime_calculator`` on its own worker process.
    ``update_num`` receives each worker's result as it completes."""
    span = self.maximum_prime + 1
    # Chunk boundaries: chunk k covers [ceil(span*k/cores), ceil(span*(k+1)/cores)).
    break_points = [
        {"start": int(math.ceil((span + 0.0) / cores * k)),
         "stop": int(math.ceil((span + 0.0) / cores * (k + 1)))}
        for k in range(cores)
    ]
    pool = Pool(cores)  # one process per chunk
    for kwds in break_points:
        pool.apply_async(self.prime_calculator, kwds=kwds, args=tuple(),
                         callback=self.update_num)
    pool.close()  # no further submissions
    pool.join()   # wait for every worker to finish
def func_parallel(self, processes=4):
    # Map self.func over worker indices 0..processes-1 on a process pool
    # and print the collected results. (Python 2 module: print statement.)
    #
    # The commented-out block below is an earlier shared-memory variant
    # (RawArray + pool initializer); kept for reference.
    # global shared_arr
    # shared_arr = sharedctypes.RawArray(ctypes.c_double, self.size)
    # arr = np.frombuffer(shared_arr)
    # arr[:] = self.x
    # arr_orig = arr.copy()
    # p = Pool(processes=4, initializer=init, initargs=(shared_arr,))
    p = Pool(processes=processes)
    res = p.map(self.func, range(processes))
    p.close()
    p.join()
    print res
def svlan_check():
    """Fetch svlan data from every OLT in parallel and log each entry via
    ``svlan_entry`` behind a shared manager lock."""
    clear_log()
    olt_nodes = graph.find('Olt')
    olts = [(node['ip'], node['company'], node['area']) for node in olt_nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    record = partial(svlan_entry, lock)
    # list() forces the lazy map so every task actually runs.
    list(pool.map(compose(record, get_svlan), olts))
    pool.close()
    pool.join()
def zte_gpon_svlan_check():
    """Query svlans of GTGO-equipped ZTE OLTs and persist the entries.

    Each worker composes a fetch (zte_gpon_svlan) with a locked write
    (svlan_entry) so log output stays serialized.
    """
    clear_log()
    rows = graph.cypher.execute(
        "match(n:Olt)--(c:Card) where c.name='GTGO' return n.ip,collect(c.slot)"
    )
    targets = ((row[0], row[1]) for row in rows)
    pull = lambda pair: zte_gpon_svlan(ip=pair[0], slots=pair[1])
    workers = Pool(8)
    shared_lock = Manager().Lock()
    store = partial(svlan_entry, shared_lock)
    list(workers.map(compose(store, pull), targets))
    workers.close()
    workers.join()
def interface_check_m():
    """Collect interface tables from every switch in parallel and write
    them out through ``output_interface_m`` (guarded by a shared lock)."""
    clear_log()
    # Earlier, model-filtered variants of this query are kept in VCS history.
    cmd = "match(s: Switch) return s.ip, s.model"
    rows = graph.cypher.execute(cmd)
    switchs = [(row[0], row[1]) for row in rows]
    pool = Pool(16)
    lock = Manager().Lock()
    emit = partial(output_interface_m, lock)
    list(pool.map(compose(emit, get_interface), switchs))
    pool.close()
    pool.join()
def format_to_bert(args):
    """Convert json shards of the selected dataset split(s) into BERT
    ``.bert.pt`` files using a worker pool."""
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['test']
    # Optional per-document sentence selection produced by an earlier stage.
    if len(args.sent_numbers_file) > 0:
        sent_numbers = pickle.load(open(args.sent_numbers_file, "rb"))
    else:
        sent_numbers = None
    check_path_existence(args.save_path)
    for corpus_type in datasets:
        a_lst = []
        c = 0
        for json_f in glob.glob(pjoin(args.raw_path, corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            c += 1
            a_lst.append(
                (corpus_type, json_f, args,
                 pjoin(args.save_path, real_name.replace('json', 'bert.pt')),
                 sent_numbers, 1))
        print("Number of files: " + str(c))

        ##########################
        ###### <DEBUGGING> #######
        ##########################
        # for a in a_lst:
        #     _format_to_bert(a)
        # single
        # json_f = args.raw_path + '/train.6.json'
        # _format_to_bert(('val', str(json_f), args, pjoin(args.save_path, str(json_f).replace('json', 'bert.pt')), kws, bart,
        # sent_numbers, 25))
        ##########################
        ###### <DEBUGGING> #######
        ##########################

        pool = Pool(args.n_cpus)
        print('Processing {} set with {} json files...'.format(corpus_type, len(a_lst)))
        all_papers_count = 0
        all_paper_ids = {}
        # _format_to_bert returns a 3-tuple; d[0] keys the id map, d[1] is
        # its payload, d[2] a count — presumably (paper_id, ids, n_papers);
        # TODO confirm against _format_to_bert.
        for d in tqdm(pool.imap(_format_to_bert, a_lst), total=len(a_lst), desc=''):
            all_paper_ids[d[0]] = d[1]
            all_papers_count += d[2]
        pool.close()
        pool.join()
def add_infs():
    """Pull interface lists from every OLT using the vendor-specific getter
    and persist them via ``_add_infs`` under a shared lock."""
    vendor_getters = {'zte': Zte.get_infs, 'hw': Huawei.get_infs}
    get_infs = partial(_company, vendor_getters)
    clear_log()
    rows = graph.cypher.execute(
        'match (n:Olt) return n.ip as ip,n.company as company')
    olts = [dict(ip=row['ip'], company=row['company']) for row in rows]
    pool = Pool(128)
    lock = Manager().Lock()
    persist = partial(_add_infs, lock)
    list(pool.map(compose(persist, get_infs), olts))
    pool.close()
    pool.join()
def main(args):
    """Run ``calls2d`` for every slab model in parallel.

    Each worker receives the shared (database, filedate, slablist) prefix
    plus the index of the slab it should process.
    """
    filedate = args.filedate
    database = args.database
    slablist = ['alu','cal','cam','car','cas','cot','hal','hel','him','hin','izu','jap','ker','kur','mak','man','mue','pam','png','phi','puy','ryu','sam','sco','sol','sul','sum','van']
    indices = range(len(slablist))
    worker = partial(calls2d, database, filedate, slablist)
    pool1 = Pool(args.nCores)
    pts = pool1.map(worker, indices)
    pool1.close()
    pool1.join()
def format_to_lines(args):
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    '''
    # Assign each raw json file to a split, then write tokenized shards of
    # up to args.shard_size documents per output file.
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        f = str(f)
        # NOTE(review): the split is decided by the character at absolute
        # path index 97 ('a'/'e'/'r' — presumably from 'valid'/'test'/
        # 'train' embedded in the path). This silently drops files and
        # breaks if the base path length changes; confirm data layout.
        if (f[97] == "a"):
            valid_files.append(f)
        elif (f[97] == "e"):
            test_files.append(f)
        elif (f[97] == "r"):
            train_files.append(f)
        # else:
        #     train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                # pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # Flush the last partial shard.
        if (len(dataset) > 0):
            pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def format_to_lines(args):
    """Split raw json files into train/valid/test using mapping files and
    write tokenized shards.

    Files absent from every mapping file are logged and defaulted into the
    training split.

    Bug fix: ``logger.info("not in mapping file", f)`` passed an argument
    with no %-placeholder in the message, which makes the logging module
    raise an internal formatting error; replaced with lazy ``%s``.
    """
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    logger.info("txt read finished")

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            logger.info("not in mapping file %s", f)
            train_files.append(f)
    logger.info("data split over")

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # Flush the final partial shard.
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def convertpool(self):
    """Convert the queued media files.

    Videos (.h264/.mp4/.avi) are converted one-per-worker on a process
    pool; image sequences (.jpg/.jpeg/.png) are read into memory and
    stitched into a single video file.
    """
    if len(self.todo) > 0:
        if self.type in [".h264", ".mp4", ".avi"]:
            # Never spawn more workers than there are files.
            pool = Pool(min(self.pools, len(self.todo)))
            try:
                pool.map(self.conv_single, self.todo)
                pool.close()
                lineprint("Done converting all videofiles!")
            except KeyboardInterrupt:
                lineprint("User terminated converting pool..")
                pool.terminate()
            except Exception as e:
                excep = "Got exception: %r, terminating pool" % (e, )
                lineprint(excep)
                pool.terminate()
            finally:
                # join() runs on every path so workers are reaped.
                pool.join()
            if self.delete:
                # Originals are removed only after the pool fully finished.
                for filein in self.todo:
                    os.remove(filein)
                lineprint("Deleted all original videofiles..")
        elif self.type in [".jpg", ".jpeg", ".png"]:
            vidname = commonpref(self.todo)
            lineprint("Start converting " + str(len(self.todo)) + " images")
            # All frames are held in memory before writing — may be large.
            frame_array = []
            for filename in self.todo:
                frame = cv2.imread(filename)
                frame_array.append(frame)
                #os.rename(filename, self.outdir+"/"+filename)
            h, w, _ = frame_array[0].shape
            if self.outdir != "":
                vidname = self.outdir + "/" + os.path.basename(vidname)
            vidout = videowriter(vidname, w, h, self.imgfps, self.resizeval)
            for i in range(len(frame_array)):
                vidout.write(frame_array[i])
            vidout.release()
            lineprint("Finished converting " + os.path.basename(vidname))
    else:
        lineprint("No video or image files found..")
def _cv_scores_importances(self, X, y, groups=None, n_jobs=1, **fit_params):
    """Compute cross-validated base scores and feature importances.

    Runs ``_parallel_cv_scores_sub`` for each CV split on a worker pool
    and returns two flat lists: per-fold base scores and per-fold feature
    importances concatenated together.

    Bug fix: the original unpacked results with
    ``map(list, zip(*result))[0]`` — indexing a ``map`` object, which is
    Python-2-only (TypeError on Python 3). Replaced with tuple unpacking.

    NOTE(review): the ``n_jobs`` parameter is accepted but the pool is
    sized from ``self.n_jobs`` — kept as-is to preserve the interface.
    """
    assert self.cv is not None
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    pool = Pool(self.n_jobs)  # , maxtasksperchild=1)
    result = pool.map(
        lambda train_test: self._parallel_cv_scores_sub(X, y, *train_test, **fit_params),
        cv.split(X, y, groups),
        chunksize=1)
    # close and join the pools
    pool.close()
    pool.join()
    # Each result entry is (scores_for_fold, importances_for_fold);
    # transpose, then flatten each group into a single list.
    flatten = lambda nested: [item for sub in nested for item in sub]
    scores_per_fold, importances_per_fold = zip(*result)
    base_scores = flatten(scores_per_fold)  # type: List[float]
    feature_importances = flatten(importances_per_fold)  # type: List
    return base_scores, feature_importances
def main():
    # Template worker-pool driver: repeatedly distribute `worklist` items
    # to a pool of `workers` processes.
    workers = 5
    while True:
        try:
            # Worklist contains data to be distributed
            worklist = []
            # NOTE(review): worklist is always empty here, so map() returns
            # immediately and this loop spins, creating and destroying a
            # pool per iteration. Presumably a skeleton to be filled in
            # with real work-item production — confirm before shipping.
            # Launch workers
            process = Pool(workers)
            # Map data to worker_main function
            process.map(worker_main, worklist)
            # Block until all work completed
            process.close()
            process.join()
        except Exception as ex:
            # Errors are printed and the loop restarts with a fresh pool.
            print(str(ex))
def hostname_check():
    """Query every OLT's hostname in parallel, then sync the collected
    (ip, hostname) pairs from the result file into the graph database."""
    clear_log()
    olt_nodes = graph.find('Olt')
    olts = [(node['ip'], node['company']) for node in olt_nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    record = partial(hostname_entry, lock)
    list(pool.map(compose(record, get_hostname), olts))
    pool.close()
    pool.join()
    # Second pass: read back the csv-ish result file and update the graph.
    ip_hostname = (line.split(',') for line in open(result_file))
    cmd = "match (n:Olt) where n.ip={ip} set n.hostname={hostname}"
    for pair in ip_hostname:
        graph.cypher.execute(cmd, ip=pair[0], hostname=pair[1])
def monteCarlo(agent, maxDepth=3, trials=12, frequency=10):
    """Generator-based Monte-Carlo action selection for an RL agent.

    Protocol per step (order matters — the caller must alternate
    next()/send()): first yield emits the chosen action; the bare
    ``(yield)`` then receives (reward, nextState, isTerminal) from the
    caller; a final bare ``yield`` ends the step.

    Each candidate action is evaluated by ``trials`` virtual rollout
    agents run concurrently; the action with the highest average score
    wins.
    """
    PROCESSES = 4  # NOTE(review): unused — the pool below is hard-coded to 8; confirm which is intended
    model = VelocityModel(
        regressionModel=joblib.load('models/gradient-m.model'),
        frequency=frequency)
    actions = np.array(agent.getActions())
    initialState, isTerminal = agent.getState(), 0
    # One rollout job per (action, trial) pair.
    jobs = [None] * len(actions) * trials
    while bool(isTerminal) is False:
        initialState = agent.getState()
        qs = {i: [] for i in actions}  # scores collected per action
        for index, a in enumerate(np.repeat(actions, trials)):
            virtualAgent, isTerminal = RLAgent(
                'virtual', alternativeModel=model,
                decisionFrequency=math.inf, maxDepth=maxDepth,
                initialState=initialState), False
            virtualAgent.setReward(reward)
            virtualAgent.goal = agent.getGoal()
            virtualAgent.goalMargins = agent.getGoalMargins()
            virtualAgent.setRl(
                partial(monteCarloSearch,
                        actions=getRandomActions(a, actions, maxDepth)))
            jobs[index] = virtualAgent
        pool = Pool(8)
        results = [pool.apply_async(job.run) for job in jobs]
        for result in results:
            # Each rollout returns (action, score).
            action, score = result.get()
            qs[action].append(score)
        pool.close()
        pool.join()
        # Emit the action with the best mean rollout score.
        yield actions[np.argmax([np.average(qs[a]) for a in actions])]
        r, nextState, isTerminal = (yield)
        f = 1 / (nextState.lastUpdate - initialState.lastUpdate)
        # correct for deviations from desired freq.
        model.frequency = f
        agent.logger.info(f)
        yield
def get_vlan_usersP(bras):
    """Collect vlan-user counts from all BRAS devices in parallel and merge
    the per-device dicts by summing values key-wise."""
    def _get_vlan_users(bas):
        # Dispatch to the model-specific getter via _model.
        getters = {'m6k': M6k.get_vlan_users, 'me60': ME60.get_vlan_users}
        return partial(_model, getters)(bas)

    devices = [dict(ip=item[0], model=item[1], inf=item[2]) for item in bras]
    pool = Pool(len(devices))  # one worker per device
    raw = pool.map(_get_vlan_users, devices)
    pool.close()
    pool.join()
    # Keep only the truthy payloads (second tuple element) and fold them
    # together, summing counts for keys that appear on several devices.
    payloads = [entry[1] for entry in raw if entry[1]]
    return reduce(lambda acc, cur: merge_with(sum, acc, cur), payloads)
def create_csv(self):
    """Write the two csv headers, then parse all zip archives in parallel.

    Improvement: the header files are now opened with ``with`` so they are
    closed even if a write fails (the original used bare open/close).
    """
    t1 = time()
    with open(self.out_csv1, "w") as file1:
        file1.write("id" + ',' + "level" + '\n')
    with open(self.out_csv2, "w") as file2:
        file2.write("id" + ',' + "object_name" + '\n')
    # NOTE(review): the __main__ guard inside a method is unusual — it
    # makes the parallel phase a no-op on import; presumably a Windows
    # spawn-safety guard. Confirm intent.
    if __name__ == '__main__':
        i = range(len(self.list_of_zips))
        p = Pool()
        p.map(self.parse_Zip, i)
        p.close()
        p.join()
        print('Create .csv files time = ' + str(time() - t1) + 's')
def preprocess(self):
    """Tokenize every raw json shard of the selected split(s) into
    ``*.bert.pt`` files using a worker pool."""
    datasets = [self.args.dataset] if self.args.dataset != '' else ['dev', 'train', 'test']
    for corpus_type in datasets:
        jobs = []
        pattern = pjoin(self.args.raw_path, '*' + corpus_type + '.*.json')
        for json_f in glob.glob(pattern):
            real_name = json_f.split('/')[-1]
            target = pjoin(self.args.save_path, real_name.replace('json', 'bert.pt'))
            jobs.append((corpus_type, json_f, self.args, target))
        print(jobs)
        pool = Pool(self.args.n_cpus)
        # Drain the iterator; _preprocess works by side effect.
        for _ in pool.imap(self._preprocess, jobs):
            pass
        pool.close()
        pool.join()
def hostname_check():
    """Gather hostnames from all OLTs on a 16-worker pool, then push the
    (ip, hostname) pairs from the result file into the graph."""
    clear_log()
    found = graph.find('Olt')
    targets = [(item['ip'], item['company']) for item in found]
    workers = Pool(16)
    shared_lock = Manager().Lock()
    store = partial(hostname_entry, shared_lock)
    list(workers.map(compose(store, get_hostname), targets))
    workers.close()
    workers.join()
    # Read the collected results back and update the graph in one pass.
    pairs = (row.split(',') for row in open(result_file))
    cmd = "match (n:Olt) where n.ip={ip} set n.hostname={hostname}"
    list(map(lambda row: graph.cypher.execute(cmd, ip=row[0], hostname=row[1]), pairs))
def evaluate_fitness(self): """ Calculates fitness of all organisms in the generation and sorts by most fit. """ # Run fitness calculations in parallel on multiple cores pool = Pool(processes=self.num_jobs) fitness_output = np.array(pool.map(self.fitness_func, \ np.transpose(self._organisms, (1,0,2)))) self._fitness_lists = fitness_output.transpose().reshape((\ self.num_organisms, self.gen_size)) pool.close() pool.join() self._sort() # Make sure to sort at the end for fitness
def format_to_bert(args):
    """Convert the train/test json shards into BERT ``.bert.pt`` files
    using a pool of ``args.n_cpus`` workers."""
    for corpus_type in ['train', 'test']:
        jobs = []
        pattern = pjoin(args.raw_path, '*' + corpus_type + '*.json')
        for json_f in glob.glob(pattern):
            real_name = json_f.split('/')[-1]
            target = pjoin(args.save_path, real_name.replace('json', 'bert.pt'))
            jobs.append((corpus_type, json_f, args, target))
        print(jobs)
        pool = Pool(args.n_cpus)
        # _format_to_bert works by side effect; just drain the iterator.
        for _ in pool.imap(_format_to_bert, jobs):
            pass
        pool.close()
        pool.join()
def multi_align_tr(self, imstack, TrM, nsz, shx, shy, stfolder, sfn, nCORES, fnames, ext):
    """Apply the transformation ``TrM`` to every image of the stack in
    parallel, saving results under ``stfolder/sfn``. Returns the list of
    AsyncResult handles."""
    if sfn not in os.listdir(stfolder):
        os.makedirs(os.path.join(stfolder, sfn))
        print('directory created')
    pool = Pool(nCORES)
    print('applying transformations with', nCORES, 'processes in parallel ')
    results = []
    for idx in range(len(imstack)):
        handle = pool.apply_async(
            transform,
            (imstack[idx], TrM, nsz, shx, shy, stfolder, sfn, fnames, idx, ext,))
        results.append(handle)
        # Advance the GUI progress bar as tasks are queued.
        self.loading.progress2['value'] += 1
        self.update()
    pool.close()
    pool.join()
    print('successfully transformed all the images in the stack')
    return results
def format_to_lines(args):
    """Route raw json files into splits via the hashhex mapping files and
    write tokenized shards; files missing from every mapping default to
    the training split."""
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        hashes = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            hashes.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in hashes}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        print("Real name is:", real_name)
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)
        else:
            # Bryan edit this out later (original)
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        job_args = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        shard = []
        shard_no = 0
        for doc in pool.imap_unordered(_format_to_lines, job_args):
            shard.append(doc)
            if len(shard) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, shard_no)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(shard))
                shard_no += 1
                shard = []
        pool.close()
        pool.join()
        # Final partial shard.
        if shard:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, shard_no)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(shard))
            shard_no += 1
            shard = []
def format_to_lines(args):
    """Partition raw json files into train/valid/test via hashhex mapping
    files and write tokenized shards of up to ``args.shard_size`` docs."""
    corpus_mapping = {}
    for corpus_type in ["valid", "test", "train"]:
        mapping_path = pjoin(args.map_path, "mapping_" + corpus_type + ".txt")
        hashed = [hashhex(line.strip()) for line in open(mapping_path)]
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in hashed}

    train_files, valid_files, test_files = [], [], []
    for fname in glob.glob(pjoin(args.raw_path, "*.json")):
        base = fname.split("/")[-1].split(".")[0]
        if base in corpus_mapping["valid"]:
            valid_files.append(fname)
        elif base in corpus_mapping["test"]:
            test_files.append(fname)
        elif base in corpus_mapping["train"]:
            train_files.append(fname)
        # Unmapped files are intentionally dropped.

    corpora = {"train": train_files, "valid": valid_files, "test": test_files}
    for corpus_type in ["train", "valid", "test"]:

        def _dump(shard, count):
            # Write one shard file and return the next shard index.
            out = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, count)
            with open(out, "w") as save:
                save.write(json.dumps(shard))
            return count + 1

        job_args = [(fname, args) for fname in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        shard = []
        shard_no = 0
        for doc in pool.imap_unordered(_format_to_lines, job_args):
            shard.append(doc)
            if len(shard) > args.shard_size:
                shard_no = _dump(shard, shard_no)
                shard = []
        pool.close()
        pool.join()
        if len(shard) > 0:
            shard_no = _dump(shard, shard_no)
            shard = []
def calculate(self, data):
    """Run the graph: execute nodes in dependency order and return the
    result of the last node processed.

    Inputs are copied so the caller's ``data`` is never mutated. Node
    groups from the topological sort may run sequentially or on a
    multiprocess pool (``self._parallel``).
    """
    t1 = dt.datetime.utcnow()
    LOGGER.info('Starting calculation...')
    # deepcopy so graph execution cannot mutate the caller's data.
    self._data = deepcopy(data)
    self._check_inputs(data)
    dep = self._dependencies()
    sorted_dep = topological_sort(dep)
    for items in sorted_dep:
        # loading node with inputs
        for item in items:
            node = self._get_node(item)
            # Positional inputs are everything not supplied via kwargs.
            args = [i_name for i_name in node.input_names
                    if i_name not in node.kwargs]
            data_to_pass = []
            for arg in args:
                data_to_pass.append(self._data[arg])
            kwargs_to_pass = {}
            for kwarg in node.kwargs:
                kwargs_to_pass[kwarg] = self._data[kwarg]
            node.load_inputs(data_to_pass, kwargs_to_pass)
        # running nodes
        if self._parallel:
            pool = Pool(self._pool_size)
            results = pool.map(
                Graph.run_node,
                [self._get_node(i) for i in items]
            )
            pool.close()
            pool.join()
            # run_node evidently yields (node_id, result) pairs.
            results = {k: v for k, v in results}
        else:
            results = {}
            for item in items:
                node = self._get_node(item)
                res = node.run_with_loaded_inputs()
                results[node.id] = res
        # save results
        for item in items:
            node = self._get_node(item)
            res = results[node.id]
            if len(node.output_names) == 1:
                self._data[node.output_names[0]] = res
            else:
                # Multi-output nodes return an indexable sequence.
                for i, out in enumerate(node.output_names):
                    self._data[out] = res[i]
    t2 = dt.datetime.utcnow()
    LOGGER.info('Calculation finished in {}'.format(t2-t1))
    # NOTE(review): returns only the last node's result.
    return res
def zhongji_check():
    """Poll every OLT for Etrunk/port data and mirror it into the graph DB."""
    clear_log()
    # All Olt nodes; narrow with property_key/property_value when debugging one box.
    nodes = graph.find('Olt')
    olts = [(record['ip'], record['company']) for record in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    entry = partial(zhongji_entry, lock)
    # get_zhongji collects per-OLT data; entry records it under the shared lock.
    list(pool.map(compose(entry, get_zhongji), olts))
    pool.close()
    pool.join()
    # Replay the comma-separated result rows into the graph.
    ports = (line.split(',') for line in open(result_file))
    cmd = """match(n: Olt) where n.ip = {ip} merge(n) - [:HAS]->(m: Etrunk{name: {sm}}) merge(m) - [:Include]->(p: Port{name: {interface}})"""
    for fields in ports:
        graph.cypher.execute(cmd, ip=fields[0], sm=fields[1], interface=fields[2])
def eval_EFG(self, x, num_procs=None, info=False):
    """Evaluate the EFG quantities at x by averaging per-process partial results.

    Splits self.parameters['num_samples'] samples across num_procs workers
    (defaults to the CPU count), runs eval_EFG_sequential in each, and
    averages the per-process values component-wise. With info=True a fifth
    component is expected in each worker's result.
    """
    from multiprocess import Pool, cpu_count
    if not num_procs:
        num_procs = cpu_count()
    num_samples = self.parameters['num_samples']
    pool = Pool(num_procs)
    # Samples handled by each worker (last one may be short on the callee side).
    per_proc = int(np.ceil(float(num_samples) / float(num_procs)))
    worker = lambda i: self.eval_EFG_sequential(x, per_proc, i, info)
    chunks = pool.map(worker, range(num_procs), chunksize=1)
    pool.terminate()
    pool.join()
    # Transpose: one tuple of per-process values per output component.
    results = list(zip(*chunks))
    expected = 5 if info else 4
    assert len(results) == expected
    assert all(len(group) == num_procs for group in results)
    return [sum(group) / float(num_procs) for group in results]
def parallel_cdist(data1, data2, n_rows_per_job=100):
    """Compute the pairwise distance matrix cdist(data1, data2) in parallel.

    data1 is split into chunks of n_rows_per_job rows; each worker computes
    the distances from its chunk to all of data2, and the partial matrices
    are concatenated back in row order.

    Args:
        data1: array-like of shape (m, d).
        data2: array-like of shape (k, d).
        n_rows_per_job: number of rows of data1 per worker task.

    Returns:
        ndarray of shape (m, k) with the pairwise (Euclidean) distances.
    """
    from scipy.spatial.distance import cdist

    data1 = np.array(data1)
    data2 = np.array(data2)

    pool = Pool(12)
    start_indices = np.arange(0, data1.shape[0], n_rows_per_job)
    end_indices = start_indices + n_rows_per_job - 1
    # BUG FIX: the original used `lambda (si, ei): ...`. Tuple-parameter
    # unpacking was removed in Python 3 (PEP 3113), making that a SyntaxError;
    # unpack inside the body instead. NumPy slicing clips ei+1 past the end,
    # so the final short chunk is handled correctly.
    partial_distance_matrices = pool.map(
        lambda bounds: cdist(data1[bounds[0]:bounds[1] + 1].copy(), data2),
        zip(start_indices, end_indices))
    pool.close()
    pool.join()
    distance_matrix = np.concatenate(partial_distance_matrices)
    return distance_matrix
def add_power_info():
    """Collect power information from every healthy switch, model-dispatched."""
    # Each switch model maps to the vendor module that knows how to query it.
    modules = {'S8508': S85, 'S8505': S85, 'T64G': T64, 'S8905': S89,
               'S8905E': S8905E, 'S9306': S93, 'S9303': S93}
    funcs = {model: mod.get_power_info for model, mod in modules.items()}
    get_power_info = partial(_model, funcs)
    nodes = graph.cypher.execute(
        "match (s:Switch) where s.snmpState='normal' return s.ip as ip,s.model as model")
    switches = [dict(ip=record['ip'], model=record['model']) for record in nodes]
    pool = Pool(processor)
    lock = Manager().Lock()
    # _add_power_info consumes each collected result under the shared lock.
    record_result = partial(_add_power_info, lock)
    list(pool.map(compose(record_result, get_power_info), switches))
    pool.close()
    pool.join()
def add_traffics():
    """Collect interface traffic from every healthy switch, model-dispatched."""
    # Each switch model maps to the vendor module that knows how to query it.
    modules = {'S8508': S85, 'S8505': S85, 'T64G': T64, 'S8905': S89,
               'S8905E': S8905E, 'S9306': S93, 'S9303': S93}
    funcs = {model: mod.get_traffics for model, mod in modules.items()}
    get_traffics = partial(_model, funcs)
    nodes = graph.cypher.execute(
        "match (s:Switch)--(i:Inf) where s.snmpState='normal' return s.ip as ip,collect(i.name) as infs,s.model as model")
    records = [dict(ip=x['ip'], infs=x['infs'], model=x['model']) for x in nodes]
    pool = Pool(processor)
    lock = Manager().Lock()
    # _add_traffics consumes each collected result under the shared lock.
    record_result = partial(_add_traffics, lock)
    list(pool.map(compose(record_result, get_traffics), records))
    pool.close()
    pool.join()
class ProcessPoolExecutor(Executor):
    """Executor backed by a multiprocess worker pool (one worker per CPU)."""

    def __init__(self):
        super(ProcessPoolExecutor, self).__init__()
        import os
        from multiprocess import Pool
        # Fall back to a single worker when the CPU count is unknown.
        self.pool = Pool(os.cpu_count() or 1)

    def submit(self, func, *args, **kwargs):
        """Schedule func(*args, **kwargs) on the pool; return a Future for it."""
        from concurrent.futures import Future
        future = Future()
        self.tasks[future] = self.pool.apply_async(
            func, args, kwargs,
            future.set_result,     # success callback
            future.set_exception,  # failure callback
        )
        # Drop the bookkeeping entry as soon as the future resolves.
        future.add_done_callback(self.tasks.pop)
        return future

    def shutdown(self, wait=True):
        """Stop the pool; outstanding work is terminated, not drained."""
        super(ProcessPoolExecutor, self).shutdown(wait)
        self.pool.terminate()
        self.pool.join()
def compute_jaccard_pairwise(indices, square_form=True, parallel=True, return_poses=False):
    """Jaccard scores for every ordered pair (indices[i], indices[j>i]).

    Returns the condensed scores (or a square matrix when square_form=True),
    plus the per-pair poses when return_poses=True.
    """
    n = len(indices)
    # One task per seed: (seed index, every later index).
    tasks = [(indices[i], indices[i + 1:]) for i in range(n)]
    if parallel:
        pool = Pool(16)
        scores_poses_tuples = pool.map(
            lambda pair: compute_jaccard_i_vs_list(pair[0], pair[1]), tasks)
        pool.close()
        pool.join()
    else:
        scores_poses_tuples = [compute_jaccard_i_vs_list(seed, rest)
                               for seed, rest in tasks]
    pairwise_scores = np.array([scores for scores, poses in scores_poses_tuples])
    if square_form:
        pairwise_scores = squareform(np.concatenate(pairwise_scores))
    if not return_poses:
        return pairwise_scores
    poses = np.array([poses for scores, poses in scores_poses_tuples])
    return pairwise_scores, poses
def eval_EQ(self, p, num_procs=None, quiet=True):
    """Evaluate E[Q(p,r)] and its gradient in parallel.

    Parameters
    ----------
    p : generator powers
    num_procs : number of parallel processes (defaults to the CPU count)
    quiet : suppress per-worker output flag
    """
    from multiprocess import Pool, cpu_count
    if not num_procs:
        num_procs = cpu_count()
    num_samples = self.parameters['num_samples']
    pool = Pool(num_procs)
    # Samples handled by each worker.
    per_proc = int(np.ceil(float(num_samples) / float(num_procs)))
    chunks = pool.map(lambda i: self.eval_EQ_sequential(p, per_proc, i, quiet),
                      range(num_procs), chunksize=1)
    pool.terminate()
    pool.join()
    # Transpose: (value, gradient) tuples across processes.
    results = list(zip(*chunks))
    assert len(results) == 2
    assert all(len(group) == num_procs for group in results)
    return [sum(group) / float(num_procs) for group in results]
def test():
    """Exercise Pool end to end: mapping APIs, benchmarks, error handling,
    timeouts, callbacks, close(), terminate() and garbage collection.

    NOTE(review): uses `xrange` and `IMapIterator.next()` (Python 2 idioms)
    alongside `print(..., end='')` — presumably the file imports
    `print_function` from __future__ and targets Python 2; confirm before
    running under Python 3.
    """
    print('cpuCount() = %d\n' % cpuCount())
    #
    # Create pool
    #
    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)
    #
    # Tests: 20 tasks, half multiplications and half additions
    #
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]
    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)
    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()
    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()
    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()
    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()
    #
    # Simple benchmarks: serial map vs pool.map vs chunked pool.imap
    #
    N = 100000
    print('def pow3(x): return x**3')
    t = time.time()
    A = list(map(pow3, xrange(N)))
    print('\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))
    t = time.time()
    B = pool.map(pow3, xrange(N))
    print('\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))
    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print('\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))
    # All three strategies must agree on the result.
    assert A == B == C, (len(A), len(B), len(C))
    print()
    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')
    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))
    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))
    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))
    assert A == B == C, (len(A), len(B), len(C))
    print()
    del A, B, C, L
    #
    # Test error handling: f raises ZeroDivisionError (at i == 5 for ranges)
    #
    print('Testing error handling:')
    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')
    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')
    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')
    # The iterator must deliver all 10 slots even though one item failed.
    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()
    #
    # Testing timeouts: poll with a tiny timeout until the result arrives
    #
    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
    #
    # Testing callback: results are appended/extended into A as they finish
    #
    print('Testing callback:')
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()
    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()
    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))
    #
    # Check there are no outstanding tasks
    #
    assert not pool._cache, 'cache = %r' % pool._cache
    #
    # Check close() methods: close drains pending work, then workers exit
    #
    print('Testing close():')
    for worker in pool._pool:
        assert worker.is_alive()
    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()
    assert result.get() is None
    for worker in pool._pool:
        assert not worker.is_alive()
    print('\tclose() succeeded\n')
    #
    # Check terminate() method: pending sleeps are killed immediately
    #
    print('Testing terminate():')
    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()
    for worker in pool._pool:
        assert not worker.is_alive()
    print('\tterminate() succeeded\n')
    #
    # Check garbage collection: dropping the last reference stops the workers
    #
    print('Testing garbage collection:')
    pool = Pool(2)
    processes = pool._pool
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    del results, pool
    time.sleep(0.2)
    for worker in processes:
        assert not worker.is_alive()
    print('\tgarbage collection succeeded\n')
# Stitch the per-tile label maps (5000x5000 tiles) into one big label map,
# upload it, clean up the tile files, then run cell detection in parallel.
# NOTE(review): relies on names defined earlier in the script (origins,
# img_h/img_w, input_fps, labelmap_alltiles, input_img_fp, filenames, ...).

# Tile origins in row-major order, matching the order of input_fps.
for iy, y0 in enumerate(np.arange(0, img_h, 5000)):
    for ix, x0 in enumerate(np.arange(0, img_w, 5000)):
        origins.append((x0, y0))
alg = 'cellprofiler'
big_labelmap = np.zeros((img_h, img_w), dtype=np.int64)
n = 0  # running label offset so labels stay unique across tiles
for i, input_fp in enumerate(input_fps):
    prefix = os.path.splitext(input_fp)[0]  # NOTE(review): unused
    # astype(np.int64) is important, otherwise results in negative label values.
    labelmap = labelmap_alltiles[i].astype(np.int64)
    x0, y0 = origins[i]
    # Only copy non-background pixels, shifted by the running offset.
    big_labelmap[y0:y0+5000, x0:x0+5000][labelmap != 0] = labelmap[labelmap != 0] + n
    n += labelmap.max()
labelmap_fp = os.path.splitext(input_img_fp)[0] + '_labelmap_%(alg)s.bp' % dict(alg=alg)
bp.pack_ndarray_file(big_labelmap, labelmap_fp)
upload_to_s3(labelmap_fp)
# Remove the per-tile intermediates now that the merged map is saved.
for fp in input_fps:
    execute_command('rm ' + fp)
t = time.time()
# NOTE(review): NUM_CORES/2 is float division under Python 3 and Pool requires
# an int — presumably this script runs under Python 2; confirm or use //.
pool = Pool(NUM_CORES/2)
pool.map(detect_cells, filenames)
pool.close()
pool.join()
sys.stderr.write('Overall time: %.2f seconds.\n' % (time.time()-t))
def aggregate(self, feature_files):
    """
    This aggregator is a front-end to the pymir3 stats module. The
    statistics that must be computed are found in the simple_aggregation
    key in the experiment file. Optionally computes texture windows first
    (simple_aggregation.texture_windows), then writes the aggregated
    feature matrix to the configured scratch directory.

    :param feature_files: a list of FeatureTrack filenames
    :type feature_files: list[str]
    :return:
    :rtype: None

    .. note::
        These keys are expected to be set in the experiment file:
            * ['simple_aggregation']['mean']
            * ['simple_aggregation']['delta']
            * ['simple_aggregation']['variance']
            * ['simple_aggregation']['acceleration']
            * ['simple_aggregation']['slope']
            * ['simple_aggregation']['limits']
            * ['simple_aggregation']['csv']
            * ['simple_aggregation']['normalize']
            * ['general']['scratch_directory']
            * ['feature_aggregation']['aggregated_output']
        and, when texture windows are enabled:
            * ['simple_aggregation']['texture_windows']
            * ['simple_aggregation']['texture_window_length']
            * ['simple_aggregation']['tw_buffer_size']
            * ['simple_aggregation']['tw_workers']
    """
    features = load_feature_files(feature_files)

    if self.params['simple_aggregation']['texture_windows']:
        # One job per track: (track, window length, output filename).
        # calc_textures presumably writes the "_tw" file itself — confirm.
        jobs = []
        out_idx = 0
        for f in features:
            jobs.append((f, self.params['simple_aggregation']['texture_window_length'], feature_files[out_idx] ))
            out_idx+=1
        # NOTE(review): num_files / output_buffer_size are only used by a
        # removed buffered-output variant; kept for config compatibility.
        num_files = len(jobs)
        output_buffer_size = self.params['simple_aggregation']['tw_buffer_size']
        pool = Pool(processes=self.params['simple_aggregation']['tw_workers'])
        pool.map(calc_textures, jobs)
        pool.close()
        pool.join()
        # Force a reload below so aggregation sees the texture-window output.
        features = None

    if features == None:
        features = load_feature_files(feature_files)

    stats = feat_stats.Stats()
    m = stats.stats(features,
                    mean=self.params['simple_aggregation']['mean'],
                    delta=self.params['simple_aggregation']['delta'],
                    variance=self.params['simple_aggregation']['variance'],
                    acceleration=self.params['simple_aggregation']['acceleration'],
                    slope=self.params['simple_aggregation']['slope'],
                    limits=self.params['simple_aggregation']['limits'],
                    csv=self.params['simple_aggregation']['csv'],
                    normalize=self.params['simple_aggregation']['normalize'])
    out = open(self.params['general']['scratch_directory'] + "/" + self.params['feature_aggregation']['aggregated_output'], "w")
    m.save(out)
    out.close()
def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                       grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                       sampling_strategy='random-choice', n_samples=1000,
                       bin_count=50, return_metadata=False,
                       progressbar=True, variance_type='estimate'):
    """Approximate the partial dependence of the model's predictions on
    the given features.

    Parameters
    ----------
    feature_ids: list
        Names/ids of the features to compute partial dependence for; must
        exist in self.data_set.feature_ids. Complexity grows exponentially
        with the number of features (1 or 2 supported).
    modelinstance: skater.model.model.Model subtype
        Fitted model wrapper (InMemoryModel or DeployedModel); supports
        classification (binary/multi-class) and regression.
    filter_classes: array type
        Classes to run partial dependence on (classification only);
        None means all classes. Must be members of modelinstance.target_names.
    grid: numpy.ndarray
        2-D array of fixed feature values; built automatically from data
        percentiles when not given.
    grid_resolution: int
        Number of unique values per feature in the auto-built grid.
    n_jobs: int
        CPUs used to compute the PDs; -1 means all CPUs.
    grid_range: tuple
        Percentile extrema (increasing, within [0, 1]); default (.05, 0.95).
    sample: boolean
        Whether to sample from the original dataset.
    sampling_strategy: string
        See DataSet.generate_sample for details.
    n_samples: int
        Samples to draw (active for sample=True with 'uniform' strategy).
    bin_count: int
        Bins for the similarity-based sampler
        ('uniform-over-similarity-ranks' strategy only).
    return_metadata: boolean
        Also return the PD metadata dict.
    variance_type: string
        Passed through to the metadata builder — see _build_metadata_dict.

    Returns
    -------
    pandas.DataFrame of PD rows, plus the metadata dict when
    return_metadata=True.
    """
    if self.data_set is None:
        load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                       "Please call Interpretation.load_data \n" \
                                       "before running this method."
        raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

    feature_ids = self._check_features(feature_ids)

    if filter_classes:
        err_msg = "members of filter classes must be \n" \
                  "members of modelinstance.classes. \n" \
                  "Expected members of: \n" \
                  "{0}\n" \
                  "got: \n" \
                  "{1}".format(modelinstance.target_names,
                               filter_classes)
        filter_classes = list(filter_classes)
        assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

    # TODO: There might be a better place to do this check
    if not isinstance(modelinstance, ModelType):
        raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence, try one \n"
                                    "creating one with skater.model.local.InMemoryModel or \n"
                                    "skater.model.remote.DeployedModel"))

    if modelinstance.model_type == 'classifier' and modelinstance.probability is False:
        if modelinstance.unique_values is None:
            raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot \n'
                                        'be None'))
        self.interpreter.logger.warn("Classifiers with probability scores can be explained \n"
                                     "more granularly than those without scores. If a prediction method with \n"
                                     "scores is available, use that instead.")

    # TODO: This we can change easily to functional style
    missing_feature_ids = []
    for feature_id in feature_ids:
        if feature_id not in self.data_set.feature_ids:
            missing_feature_ids.append(feature_id)

    if missing_feature_ids:
        missing_feature_id_err_msg = "Features {0} not found in \n" \
                                     "Interpretation.data_set.feature_ids \n" \
                                     "{1}".format(missing_feature_ids, self.data_set.feature_ids)
        raise(KeyError(missing_feature_id_err_msg))

    if grid_range is None:
        grid_range = (.05, 0.95)
    else:
        if not hasattr(grid_range, "__iter__"):
            err_msg = "Grid range {} needs to be an iterable".format(grid_range)
            raise(exceptions.MalformedGridRangeError(err_msg))

    self._check_grid_range(grid_range)

    # Build model metadata lazily from a tiny sample if it is missing.
    if not modelinstance.has_metadata:
        examples = self.data_set.generate_sample(strategy='random-choice',
                                                 sample=True,
                                                 n_samples=10)
        examples = DataManager(examples, feature_names=self.data_set.feature_ids)
        modelinstance._build_model_metadata(examples)

    # if you dont pass a grid, build one.
    grid = np.array(grid)
    if not grid.any():
        # Currently, if a given feature has fewer unique values than the value
        # of grid resolution, then the grid will be set to those unique values.
        # Otherwise it will take the percentile
        # range according with grid_resolution bins.
        grid = self.data_set.generate_grid(feature_ids,
                                           grid_resolution=grid_resolution,
                                           grid_range=grid_range)
    else:
        # want to ensure all grids have 2 axes
        if len(grid.shape) == 1 and \
                (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
            grid = grid[:, np.newaxis].T
            grid_resolution = grid.shape[1]

    self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
    self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

    # make sure data_set module is giving us correct data structure
    self._check_grid(grid, feature_ids)

    # generate data
    data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                sample=sample,
                                                n_samples=n_samples,
                                                bin_count=bin_count)

    assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                         "Theres a type mismatch between\n" \
                                                         "the sampled data and the origina\nl" \
                                                         "training set. Check Skater.models\n"

    _pdp_metadata = self._build_metadata_dict(modelinstance,
                                              feature_ids,
                                              self.data_set.feature_ids,
                                              filter_classes, variance_type)

    self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
    self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
    self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

    # cartesian product of grid
    grid_expanded = pd.DataFrame(list(product(*grid))).values

    if grid_expanded.shape[0] <= 0:
        empty_grid_expanded_err_msg = "Must have at least 1 pdp value" \
                                      "grid shape: {}".format(grid_expanded.shape)
        raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

    predict_fn = modelinstance._get_static_predictor()

    # Pool(None) lets the pool pick the worker count for n_jobs < 0.
    n_jobs = None if n_jobs < 0 else n_jobs

    pd_func = functools.partial(_compute_pd,
                                estimator_fn=predict_fn,
                                grid_expanded=grid_expanded,
                                pd_metadata=_pdp_metadata,
                                input_data=data_sample,
                                filter_classes=filter_classes)
    arg_list = [i for i in range(grid_expanded.shape[0])]
    executor_instance = Pool(n_jobs)

    if progressbar:
        self.interpreter.logger.warn("Progress bars slow down runs by 10-20%. For slightly \n"
                                     "faster runs, do progress_bar=False")
        mapper = executor_instance.imap
        p = ProgressBar(len(arg_list), units='grid cells')
    else:
        mapper = executor_instance.map

    pd_list = []
    try:
        # Forcing the fallback for n_jobs == 1 avoids pool overhead entirely.
        if n_jobs == 1:
            raise ValueError("Skipping to single processing")
        for pd_row in mapper(pd_func, arg_list):
            if progressbar:
                p.animate()
            pd_list.append(pd_row)
    except:
        # Deliberately broad: any multiprocessing failure (pickling, worker
        # crash, ...) falls back to computing everything in-process.
        self.interpreter.logger.warn("Multiprocessing failed, going single process")
        for pd_row in map(pd_func, arg_list):
            if progressbar:
                p.animate()
            pd_list.append(pd_row)
    finally:
        executor_instance.close()
        executor_instance.join()
        executor_instance.terminate()

    if return_metadata:
        return pd.DataFrame(list(pd_list)), _pdp_metadata
    else:
        return pd.DataFrame(list(pd_list))
def compute_jaccard_list_vs_all(seed_indices):
    """Jaccard affinity of each seed index against the whole collection."""
    workers = Pool(14)
    # compute_jaccard_i_vs_all can be mapped directly; the original wrapped
    # it in a pass-through lambda.
    per_seed = workers.map(compute_jaccard_i_vs_all, seed_indices)
    workers.close()
    workers.join()
    return np.array(per_seed)
def compute_spm_histograms(labelmap, sample_locs, patch_size, M):
    """
    Compute spatial-pyramid-matching (SPM) histograms at three pyramid
    levels, using integral images of per-label masks for O(1) box sums.

    Args:
        labelmap (2d-ndarray of int):
        sample_locs (2d-ndarray): List of (x,y) locations at which to sample the SPM histograms
        patch_size (int): side length of the sampled patch, in pixels
        M (int): number of unique SIFT descriptor words, aka. size of vocabulary

    Returns:
        hists_arr0 ((1,M)-array of int): level-0 histogram per location
        hists_arr1 ((4,M)-array of int): level-1 (2x2 grid) histograms
        hists_arr2 ((16,M)-array of int): level-2 (4x4 grid) histograms

    NOTE(review): this is Python 2 code (`print x` statements, and
    `patch_size / 2**l` relies on integer division); port prints and use
    `//` before running under Python 3.
    """
    # The worker reads the labelmap via a module global so forked pool
    # workers can see it without pickling the array per task.
    global labelmap_global
    labelmap_global = labelmap

    # compute level-2 histograms
    l = 2
    grid_size = patch_size / 2**l
    # Grid-cell offsets (in units of grid_size) relative to each sample
    # location, for each pyramid level; only l == 2 is exercised here.
    if l == 2:
        rx = [-2, -1, 0, 1]
        ry = [-2, -1, 0, 1]
    elif l == 1:
        rx = [-1, 0]
        ry = [-1, 0]
    elif l == 0:
        rx = [-.5]
        ry = [-.5]
    rxs, rys = np.meshgrid(rx, ry)

    # For every grid cell, the bounding box of that cell at every sample loc.
    patch_coords_allGrid = []
    for grid_i, (rx, ry) in enumerate(np.c_[rxs.flat, rys.flat]):
        patch_xmin = sample_locs[:,0] + rx * grid_size
        patch_ymin = sample_locs[:,1] + ry * grid_size
        patch_xmax = sample_locs[:,0] + (rx + 1) * grid_size
        patch_ymax = sample_locs[:,1] + (ry + 1) * grid_size
        patch_coords_allGrid.append([patch_xmin, patch_ymin, patch_xmax, patch_ymax])

    # Stack per-cell coordinate arrays side by side: row 0 = all xmins, etc.
    all_coords = np.hstack(patch_coords_allGrid)
    patch_xmin = all_coords[0]
    patch_ymin = all_coords[1]
    patch_xmax = all_coords[2]
    patch_ymax = all_coords[3]

    def compute_histogram_particular_label(i):
        # Count of label i inside each box, via the integral image identity:
        # sum(box) = I(y0,x0) + I(y1,x1) - I(y1,x0) - I(y0,x1).
        m = (labelmap_global == i).astype(np.uint8)
        mi = cv2.integral(m)
        ci = mi[patch_ymin, patch_xmin] + mi[patch_ymax, patch_xmax] - mi[patch_ymax, patch_xmin] - mi[patch_ymin, patch_xmax]
        return ci

    t = time.time()
    # hists = Parallel(n_jobs=16)(delayed(compute_histogram_particular_label)(i) for i in range(1, M+1))
    # hists = Parallel(n_jobs=8)(delayed(compute_histogram_particular_label)(i) for i in range(1, M+1))
    # NOTE(review): mapping a nested function requires a fork-based or
    # dill-backed Pool (e.g. multiprocess) — stdlib spawn would fail to
    # pickle it; confirm which Pool is imported at file top.
    pool = Pool(8)
    hists = pool.map(compute_histogram_particular_label, range(1, M+1))
    # pool.terminate()
    pool.close()
    pool.join()
    # del pool
    sys.stderr.write('done in %f seconds\n' % (time.time() - t)) # ~ 13 seconds

    n_grid = (2**l)**2
    # (M, 16, n_locs) -> transposed to (n_locs, 16, M).
    hists_arr2 = np.transpose(np.reshape(hists, (M, n_grid, -1)))
    print hists_arr2.shape

    # compute level-1 histograms based on level-2 histograms
    # Each 2x2 block of level-2 cells sums into one level-1 cell.
    hists_arr1 = np.transpose([hists_arr2[:, [0,1,4,5], :].sum(axis=1),
                               hists_arr2[:, [2,3,6,7], :].sum(axis=1),
                               hists_arr2[:, [8,9,12,13], :].sum(axis=1),
                               hists_arr2[:, [10,11,14,15], :].sum(axis=1)],
                              [1,0,2])
    print hists_arr1.shape

    # compute level-0 histograms based on level-1 histograms
    hists_arr0 = hists_arr1.sum(axis=1)
    print hists_arr0.shape

    return hists_arr0, hists_arr1, hists_arr2