def post_frame(ans, params, args):
    # Analysis!
    from numpy.linalg import eigh
    from numpy import argsort

    # Eigen-decompose the symmetric overlap matrix and sort the spectrum
    # in descending order
    ev, vecs = eigh(ans["overlap"])
    idx = argsort(ev)[::-1]
    ans["ev"] = ev[idx] / params["snapshots"]
    ans["vecs"] = vecs[:, idx]

    print("Sanity check volume: {:f}".format(ans["volume"]))
    print(
        ans["x_min"], ans["x_max"],
        ans["y_min"], ans["y_max"],
        ans["z_min"], ans["z_max"],
    )
    print(ans["overlap"])
    print("Singular values:")
    print(ans["ev"])
    print(ans["vecs"])

    # Spill the results to an out-of-core Chest, keyed by (time, name)
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
    c = Chest(path=cpath)
    for key in ans.keys():
        c[ans['time'], key] = ans[key]
    ans.clear()
    c.flush()
    ans["cpath"] = cpath
    return
def outer_process(job):
    """ Process to be executed in the outer IPython.parallel map """

    # Split the arguments
    args, params, frame = job

    # always need these
    from importlib import import_module
    MR = import_module(args.mapreduce)

    # Initialize the MapReduce data with base cases
    # Returns job list to pass to map
    jobs = MR.MR_init(args, params, frame)

    # Copy a base case in which to reduce the results
    from copy import deepcopy
    ans = deepcopy(jobs[0][4])

    # Map!
    import time as time_
    ttime = time_.time()
    if args.thread < 2:
        results = map(inner_process, jobs)
    else:
        from multiprocessing import Pool
        p = Pool(processes=args.thread)
        results = p.imap_unordered(inner_process, jobs, chunksize=1)
    if args.verbose:
        print('  Map took {:f}s on {:d} processes'.format(
            time_.time() - ttime, args.thread))

    # Reduce!
    ttime = time_.time()
    for r in results:
        MR.reduce_(ans, r)
    if args.thread >= 2:
        p.close()
    if args.verbose:
        print('  Reduce took {:f}s on {:d} processes'.format(
            time_.time() - ttime, args.thread))
    ans["frame"] = frame

    # Analysis!
    post = import_module(args.post)
    post.post_frame(ans, params, args)
    post.plot_frame(ans, params, args)

    # Save the results to file!
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.name, frame)
    c = Chest(path=cpath)
    for key in ans.keys():
        c[ans['time'], key] = ans[key]
    c.flush()

    return cpath
def post_frame(ans, params, args):
    # Save the analysis results to an out-of-core Chest, keyed by (time, name)
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
    c = Chest(path=cpath)
    for key in ans.keys():
        c[ans['time'], key] = ans[key]
    ans.clear()
    c.flush()
    ans["cpath"] = cpath
    return
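# A minimal round-trip sketch (not from the projects above) of the Chest
# pattern these post_frame() functions rely on: values are written to disk by
# flush() and can be reloaded later from the same path. The (time, key) tuple
# key and the 'demo-chest' path are illustrative assumptions.
from chest import Chest

c = Chest(path='demo-chest')
c[0.5, 'volume'] = 1.0               # any hashable key works, tuples included
c.flush()                            # spill in-memory values to disk

c2 = Chest(path='demo-chest')        # re-open the same on-disk store
assert c2[0.5, 'volume'] == 1.0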
def post_frame(ans, params, args):
    import numpy as np

    # Analysis!
    ans['TAbs'] = max(ans['TMax'], -ans['TMin'])
    ans['PeCell'] = ans['UAbs'] * ans['dx_max'] / params['conductivity']
    ans['ReCell'] = ans['UAbs'] * ans['dx_max'] / params['viscosity']

    # Mixing height
    L = params["extent_mesh"][2] - params["root_mesh"][2]
    t_proj = ans["t_proj_z"].to_array()
    tmax = np.max(t_proj)
    tmin = np.min(t_proj)
    tzero = (tmax + tmin) / 2

    # Cabot-style integral mixing height
    h_cabot = 0.
    for i in range(t_proj.shape[0]):
        if t_proj[i] < tzero:
            h_cabot += t_proj[i] - tmin
        else:
            h_cabot += tmax - t_proj[i]
    ans["h"] = h_cabot

    # Visual mixing height from the threshold crossings of the profile
    zs = ans['z_z'].to_array()
    from utils.my_utils import find_root
    h_visual = (
        find_root(zs, t_proj, y0=tmax - (tmax - tmin) * .01)
        - find_root(zs, t_proj, y0=tmin + (tmax - tmin) * 0.1)) / 2.
    h_exp = find_root(zs, np.array(ans["t_max_z"].to_array()), y0=0.0)
    ans["H"] = h_visual
    ans["H_exp"] = h_exp

    plot_frame(ans, params, args)

    # Save the results to a Chest; slice objects are converted to plain arrays
    from interfaces.abstract import AbstractSlice
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
    c = Chest(path=cpath)
    for key in ans.keys():
        if isinstance(ans[key], AbstractSlice):
            c[ans['time'], key] = ans[key].to_array()
        else:
            c[ans['time'], key] = ans[key]
    ans.clear()
    c.flush()
    ans["cpath"] = cpath
    return
def main():
    from collections import defaultdict
    from chest import Chest

    dico = Chest(path='francais_tatoeba_5bis-char-max')
    dico2 = defaultdict(set)
    i = 0
    with open(
            "/Users/korantin/Documents/Projects/Lexiques/francais_col123.txt",
            'r') as ba:
        tmp = ba.read().splitlines()
        for phrase in tmp:
            (ind, ln, phrase) = phrase.strip().split('\t')
            if len(phrase) <= 10:
                print(phrase)
                print(i, len(tmp))
                # estime() is an estimator defined elsewhere in the project
                estime(phrase, tmp, memoire=dico, d=dico2)
                i += 1

    # Rescale each stored value by corpus size / occurrence count
    for x, y in dico2.items():
        dico[x] *= (len(tmp) / len(y))
    dico.flush()
    print(*dico.items(), sep='\n')
# Set up the frame arguments
from mapcombine import outer_process
jobs = [[args, params, i] for i in range(args.frame, args.frame_end + 1)]

# schedule the frames, one IPython process each
# if only one process or parallel not set, use normal map
import time
start_time = time.time()
if len(jobs) > 1 and args.parallel:
    from IPython.parallel import Client
    p = Client(profile='mpi')
    stuff = p.load_balanced_view().map_async(outer_process, jobs)
else:
    stuff = map(outer_process, jobs)

# insert new results into the out-of-core dictionary (Chest)
nelm = params["shape_mesh"][0] * params["shape_mesh"][1] * params["shape_mesh"][2]
from chest import Chest
for i, cpath in enumerate(stuff):
    # outer_process returns the path of the per-frame chest; merge it into
    # the main chest and drop the per-frame store
    c1 = Chest(path=cpath)
    c = Chest(path=args.chest_path)
    c.update(c1)
    c.flush()
    c1.drop()

    # Print a progress update
    run_time = time.time() - start_time
    print("Processed {:d} frames after {:f}s ({:f} eps)".format(
        i + 1, run_time, (i + 1) * nelm / run_time))
class ChestCacheTransformer(TransformerBase):
    """
        A transformer that caches the results of the constituent (inner) transformer.
        This is instantiated using the `~` operator on any transformer.

        Caching is unique based on the configuration of the pipeline, as read by
        executing repr() on the pipeline. Caching lookup is based on the qid, so any
        change in query _formulation_ will not be reflected in a cache's results.

        Example Usage::

            dataset = pt.get_dataset("trec-robust-2004")
            # use for first pass and 2nd pass
            BM25 = pt.BatchRetrieve(index, wmodel="BM25")

            # used for query expansion
            RM3 = pt.rewrite.RM3(index)
            pt.Experiment([
                    ~BM25,
                    (~BM25) >> RM3 >> BM25
                ],
                dataset.get_topics(),
                dataset.get_qrels(),
                eval_metrics=["map"]
            )

        In the above example, we use the `~` operator on the first pass retrieval
        using BM25, but not on the 2nd pass retrieval, as the query formulation
        will differ during the second pass.

        Caching is not supported for re-ranking transformers.
    """
    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        on = "qid"
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()

        # we take the md5 of the __repr__ of the pipeline to make a unique
        # identifier for the pipeline; all different pipelines should return
        # unique __repr__() values, as these are intended to be unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            warn(
                "Cannot cache pipeline %s as a component has not overridden __repr__" % trepr)
            self.disable = True

        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            with open(definition_file, "w") as f:
                f.write(trepr)

        from chest import Chest
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame) else pickle.dump(
                data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        return self.hits / self.requests if self.requests > 0 else 0

    # don't double-cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        return self.inner

    def transform(self, input_res):
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching of %s for re-ranking is not supported. Caching currently"
                " only supports input dataframes with queries as inputs and cannot"
                " be used for re-rankers." % self.inner.__repr__())
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        rtr = []
        todo = []
        # We cannot remove this iterrows() without knowing how to take named
        # tuples into a dataframe
        for index, row in input_res.iterrows():
            qid = str(row["qid"])
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            except:
                # occasionally we have file not founds,
                # lets remove from the cache and continue
                del self.chest[qid]
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            todo_df = pd.concat(todo)
            todo_res = self.inner.transform(todo_df)
            for row in todo_df.itertuples():
                qid = row.qid
                this_query_res = todo_res[todo_res["qid"] == qid]
                self.chest[qid] = this_query_res
                rtr.append(this_query_res)
            self.chest.flush()
        return pd.concat(rtr)
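# A short sketch (paths and sample data are illustrative assumptions, not from
# PyTerrier) of the pluggable-serialization pattern used above: Chest accepts
# dump/load callables that receive an open file handle, so DataFrames can go
# through pandas' pickle round-trip while Chest's internal '.keys' file still
# uses plain pickle.
import pickle
import pandas as pd
from chest import Chest

def dump(data, f):
    # DataFrames via pandas; everything else (e.g. the key index) via pickle
    if isinstance(data, pd.DataFrame):
        pd.DataFrame.to_pickle(data, f)
    else:
        pickle.dump(data, f, protocol=1)

def load(f):
    return pickle.load(f) if ".keys" in f.name else pd.read_pickle(f)

c = Chest(path='df-chest-demo', dump=dump, load=load)
c['q1'] = pd.DataFrame({"qid": ["q1"], "docno": ["d1"], "score": [1.0]})
c.flush()

c2 = Chest(path='df-chest-demo', dump=dump, load=load)
print(c2['q1'])                      # reloaded as a DataFrame via read_pickle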
import os
from collections import deque
from collections.abc import Iterable
from datetime import datetime
from pathlib import PurePath
from typing import Tuple

from chest import Chest


class Timeseries():
    # === Constructors, Destructors, and Properties ====================
    def __init__(self, name, maxLength, location=os.getcwd()):
        assert isinstance(maxLength, int), "'maxLength' must be an integer!"
        assert isinstance(location, str) | isinstance(location, PurePath), \
            "'location' must be a string or PurePath!"
        assert isinstance(name, str), "'name' must be a string!"
        assert maxLength >= 1, "'maxLength' must be at-least one!"

        # Private variables
        self.__maxLength = 0
        self.__data = deque([], maxLength)
        self.__name = name
        self.__location = location
        self.__storage = Chest(path=location)  # Change dumping/loading method here

        self.maxLength = maxLength
        self.load()

    @property
    def name(self):
        return self.__name

    @property
    def location(self):
        return self.__location

    @property
    def storage(self):
        return self.__storage

    @property
    def maxLength(self):
        # Assert that the value aligns with reality.
        assert self.__maxLength == self.__data.maxlen, \
            "Disparity between self.__maxLength and self.__data.maxlen! This is most likely a bug."
        return self.__maxLength

    @maxLength.setter
    def maxLength(self, value):
        assert isinstance(value, int), "'maxLength' must be an integer!"
        assert value >= 1, "'maxLength' must be greater than or equal to one!"

        # Define initial state
        prevValue = self.__maxLength
        setValue = value
        delta = setValue - prevValue

        # Resize container: rebuild the deque with the new bound and left-pad
        # with None when the series grows
        if delta != 0:
            oldValues = list(self.__data)
            self.__data = deque(oldValues, setValue)
            for _ in range(delta):
                self.__data.appendleft(None)

        # Reflect changes
        self.__maxLength = setValue
        return self.__maxLength

    # === Public Start ==================================================
    def insert(self, obj=None):
        self.__data.append(obj)  # Add the element itself if it's not iterable.
        return self

    def insertRange(self, iterable=None):
        if iterable is not None:
            for element in iterable:
                self.__data.append(element)
        return self

    def __repr__(self):
        return self.__data.__repr__()

    def load(self):
        # If there is something to work with in the storage, then continue on.
        if self.__name in self.__storage:
            self.insertRange(list(self.__storage[self.__name]))
        else:
            # If not, save ourselves so that we have something to work with next time.
            self.save()

    def save(self):
        self.__storage[self.__name] = list(self.__data)
        self.__storage.flush()

    def RawData(self):
        return self.__data

    def AsList(self):
        return list(self.__data)

    def AsPlottable(self) -> Tuple[list, list]:
        # [[a,b], [a,b], ..., [a,b]] -> [[a,a,...,a], [b,b,...,b]]
        (x, y) = ([None], [None])
        for elem in self.__data:
            if isinstance(elem, Iterable):
                x.append(datetime.fromtimestamp(elem[0]))
                y.append(elem[1])
        return (x, y)
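# A hypothetical usage sketch (names, path, and sample data are illustrative,
# not from the original source) for the Timeseries class above: it keeps at
# most maxLength points in a bounded deque and persists them through a Chest
# at 'location'.
import time

ts = Timeseries("cpu_load", maxLength=5, location='ts-demo')
ts.insert((time.time(), 0.42))                    # one (timestamp, value) pair
ts.insertRange([(time.time(), 0.55), (time.time(), 0.61)])
ts.save()                                         # flush through the Chest

xs, ys = ts.AsPlottable()                         # datetimes and values to plot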