Example #1
def post_frame(ans, params, args):
    # Analysis!
    from numpy.linalg import eigh
    from numpy import argsort

    # Diagonalize the (symmetric) overlap matrix and sort the
    # eigenpairs into descending order
    ev, vecs = eigh(ans["overlap"])
    idx = argsort(ev)[::-1]
    ans["ev"] = ev[idx] / params["snapshots"]
    ans["vecs"] = vecs[:, idx]

    print("Sanity check volume: {:f}".format(ans["volume"]))
    print(
        ans["x_min"],
        ans["x_max"],
        ans["y_min"],
        ans["y_max"],
        ans["z_min"],
        ans["z_max"],
    )
    print(ans["overlap"])
    print("Singular values:")
    print(ans["ev"])
    print(ans["vecs"])

    # Save the results into an on-disk Chest, keyed by (time, key) tuples
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
    c = Chest(path=cpath)
    for key in ans.keys():
        c[ans['time'], key] = ans[key]
    ans.clear()
    c.flush()

    # Keep only the location of the on-disk results
    ans["cpath"] = cpath

    return
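post_frame stores every result under a (time, key) tuple and then forgets everything except cpath. Below is a minimal round-trip sketch of that pattern; the path and values are illustrative, and it assumes only the dict-like Chest behavior (flush, drop, reopening a path) that the examples here already exercise:

from chest import Chest

c = Chest(path='demo-chest-000')        # illustrative path
c[(0.0, "volume")] = 1.0                # same (time, key) convention as above
c.flush()                               # spill the in-memory items to disk

c2 = Chest(path='demo-chest-000')       # reopening the same path reloads the keys
print(c2[(0.0, "volume")])              # -> 1.0
c2.drop()                               # delete the on-disk data when done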
Example #2
def outer_process(job):
    """
  Process to be executed in the outer IPython.parallel map
  """

    # Split the arguments
    args, params, frame = job

    # always need these
    from importlib import import_module
    MR = import_module(args.mapreduce)

    # Initialize the MapReduce data with base cases
    # Returns job list to pass to map
    jobs = MR.MR_init(args, params, frame)
    # Copy a base case in which to reduce the results
    from copy import deepcopy
    ans = deepcopy(jobs[0][4])

    # Map!
    import time as time_
    ttime = time_.time()
    if args.thread < 2:
        results = map(inner_process, jobs)
    else:
        from multiprocessing import Pool
        p = Pool(processes=args.thread)
        results = p.imap_unordered(inner_process, jobs, chunksize=1)
    if args.verbose:
        print('  Map took {:f}s on {:d} processes'.format(
            time_.time() - ttime, args.thread))

    # Reduce!
    ttime = time_.time()
    for r in results:
        MR.reduce_(ans, r)
    if args.thread >= 2:
        p.close()
    if args.verbose:
        print('  Reduce took {:f}s on {:d} processes'.format(
            time_.time() - ttime, args.thread))

    ans["frame"] = frame

    # Analysis!
    post = import_module(args.post)
    post.post_frame(ans, params, args)
    post.plot_frame(ans, params, args)

    # Save the results to file!
    from chest import Chest
    cpath = '{:s}-chest-{:03d}'.format(args.name, frame)
    c = Chest(path=cpath)
    for key in ans.keys():
        c[ans['time'], key] = ans[key]
    c.flush()

    return cpath
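The map-then-reduce structure above is independent of the MapReduce module it drives. Here is a minimal, self-contained sketch of the same pattern; square and reduce_ are toy stand-ins for inner_process and MR.reduce_, not part of the original code:

from multiprocessing import Pool

def square(x):            # toy stand-in for inner_process
    return x * x

def reduce_(ans, r):      # toy stand-in for MR.reduce_
    ans["total"] += r

if __name__ == "__main__":
    ans = {"total": 0}
    # imap_unordered streams results back as workers finish them
    with Pool(processes=4) as p:
        for r in p.imap_unordered(square, range(8), chunksize=1):
            reduce_(ans, r)
    print(ans)            # {'total': 140}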
Example #3
def post_frame(ans, params, args):
  # Save the results into an on-disk Chest, keyed by (time, key) tuples
  from chest import Chest
  cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
  c = Chest(path=cpath)
  for key in ans.keys():
    c[ans['time'], key] = ans[key]
  ans.clear()
  c.flush()

  ans["cpath"] = cpath

  return 
Example #4
def post_frame(ans, params, args):
  # Analysis!
  import numpy as np

  ans['TAbs'] = max(ans['TMax'], -ans['TMin'])
  ans['PeCell'] = ans['UAbs']*ans['dx_max']/params['conductivity']
  ans['ReCell'] = ans['UAbs']*ans['dx_max']/params['viscosity']

  # Mixing height (Cabot-style integral measure)
  L = params["extent_mesh"][2] - params["root_mesh"][2]
  t_proj = ans["t_proj_z"].to_array()
  tmax = np.max(t_proj)
  tmin = np.min(t_proj)
  tzero = (tmax + tmin) / 2
  h_cabot = 0.
  for i in range(t_proj.shape[0]):
    if t_proj[i] < tzero:
      h_cabot += (t_proj[i] - tmin)
    else:
      h_cabot += (tmax - t_proj[i])
  ans["h"] = h_cabot

  zs = ans['z_z'].to_array()
  from utils.my_utils import find_root
  h_visual = ( find_root(zs, t_proj, y0=tmax - (tmax - tmin)*0.01)
             - find_root(zs, t_proj, y0=tmin + (tmax - tmin)*0.1)) / 2.

  h_exp = find_root(zs, np.array(ans["t_max_z"].to_array()), y0=0.0)

  ans["H"] = h_visual
  ans["H_exp"] = h_exp
  plot_frame(ans, params, args)

  # Slices must be converted to plain arrays before they can be stored
  from interfaces.abstract import AbstractSlice
  from chest import Chest
  cpath = '{:s}-chest-{:03d}'.format(args.chest_path, ans["frame"])
  c = Chest(path=cpath)
  for key in ans.keys():
    if isinstance(ans[key], AbstractSlice):
      c[ans['time'], key] = ans[key].to_array()
    else:
      c[ans['time'], key] = ans[key]
  ans.clear()
  c.flush()

  ans["cpath"] = cpath

  return 
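find_root is imported from utils.my_utils and its implementation is not shown here. A plausible stand-in, purely an assumption about its behavior rather than the project's actual code, is a linear interpolation of the first crossing of the profile through the level y0:

import numpy as np

def find_root_sketch(z, t, y0=0.0):
    # Hypothetical stand-in for utils.my_utils.find_root: return the z at
    # which the profile t first crosses the level y0, by linear interpolation.
    s = np.sign(t - y0)
    (crossings,) = np.nonzero(s[:-1] * s[1:] < 0)   # sign changes bracket a root
    i = crossings[0]                                # IndexError if there is no crossing
    return z[i] + (y0 - t[i]) * (z[i + 1] - z[i]) / (t[i + 1] - t[i])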
Example #5
def main():
    from collections import defaultdict
    from chest import Chest

    dico = Chest(path='francais_tatoeba_5bis-char-max')
    dico2 = defaultdict(set)
    with open(
            "/Users/korantin/Documents/Projects/Lexiques/francais_col123.txt",
            'r') as ba:
        tmp = ba.read().splitlines()
        for i, phrase in enumerate(tmp):
            (ind, ln, phrase) = phrase.strip().split('\t')
            if len(phrase) <= 10:
                print(phrase)
                print(i, len(tmp))
                # estime() is defined elsewhere in the source module
                estime(phrase, tmp, memoire=dico, d=dico2)
        # Rescale each entry by len(corpus) / number of sentences containing it
        for x, y in dico2.items():
            dico[x] *= (len(tmp) / len(y))
        dico.flush()
    print(*dico.items(), sep='\n')
Example #6
# Set up the frame arguments
from mapcombine import outer_process
jobs = [[args, params, i] for i in range(args.frame, args.frame_end + 1)]

# schedule the frames, one IPython process each
# if only one process or parallel not set, use normal map
import time
start_time = time.time()
if len(jobs) > 1 and args.parallel:
    # IPython.parallel became the separate ipyparallel package in IPython 4
    from IPython.parallel import Client
    p = Client(profile='mpi')
    stuff = p.load_balanced_view().map_async(outer_process, jobs)
else:
    stuff = map(outer_process, jobs)

# insert new results into the out-of-core dictionary (Chest)
nelm = (params["shape_mesh"][0] * params["shape_mesh"][1]
        * params["shape_mesh"][2])
from chest import Chest
for i, res in enumerate(stuff):
    c1 = Chest(path=res['cpath'])
    c = Chest(path=args.chest_path)
    c.update(c1)
    c.flush()
    c1.drop()

    # Print a progress update
    run_time = time.time() - start_time
    print("Processed frame {:d} after {:f}s ({:f} eps)".format(
        i + 1, run_time, (i + 1) * nelm / run_time))
Example #7
class ChestCacheTransformer(TransformerBase):
    """
        A transformer that cache the results of the consituent (inner) transformer. 
        This is instantiated using the `~` operator on any transformer.

        Caching is unqiue based on the configuration of the pipeline, as read by executing
        retr() on the pipeline. Caching lookup is based on the qid, so any change in query
        _formulation_ will not be reflected in a cache's results.

        Example Usage::

            dataset = pt.get_dataset("trec-robust-2004")
            # use for first pass and 2nd pass
            BM25 = pt.BatchRetrieve(index, wmodel="BM25")

            # used for query expansion
            RM3 = pt.rewrite.RM3(index)
            pt.Experiment([
                    ~BM25,
                    (~BM25) >> RM3 >> BM25
                ],
                dataset.get_topics(),
                dataset.get_qrels(),
                eval_metrics=["map"]
            )

        In the above example, we use the `~` operator on the first pass retrieval using BM25, but not on the 2nd pass retrieval, 
        as the query formulation will differ during the second pass.

        Caching is not supported for re-ranking transformers.        
    """
    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        on = "qid"
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()

        # we take the md5 of the __repr__ of the pipeline to make a unique identifier for the pipeline
        # all different pipelines should return unique __repr__() values, as these are intended to be
        # unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            warn(
                "Cannot cache pipeline %s, as a component has not overridden __repr__"
                % trepr)
            self.disable = True

        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            with open(definition_file, "w") as f:
                f.write(trepr)
        from chest import Chest
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame) else pickle.dump(
                data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        return self.hits / self.requests if self.requests > 0 else 0

    # don't double-cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        return self.inner

    def transform(self, input_res):
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching of %s for re-ranking is not supported. Caching currently only supports input dataframes with queries as inputs and cannot be used for re-rankers."
                % self.inner.__repr__())
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        rtr = []
        todo = []

        # We cannot remove this iterrows() without knowing how to turn named tuples into a dataframe
        for index, row in input_res.iterrows():
            qid = str(row["qid"])
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            except Exception:
                # occasionally files go missing from the cache;
                # remove the stale key and recompute
                del self.chest[qid]
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            todo_df = pd.concat(todo)
            todo_res = self.inner.transform(todo_df)
            for row in todo_df.itertuples():
                qid = row.qid
                this_query_res = todo_res[todo_res["qid"] == qid]
                self.chest[qid] = this_query_res
                rtr.append(this_query_res)
        self.chest.flush()
        return pd.concat(rtr)
Example #8

import os
from collections import deque
from collections.abc import Iterable
from datetime import datetime
from pathlib import PurePath
from typing import Tuple

from chest import Chest


class Timeseries():

    # === Constructors, Destructors, and Properties ====================
    def __init__(self, name, maxLength, location=os.getcwd()):
        assert isinstance(maxLength, int), "'maxLength' must be an integer!"
        assert isinstance(location, (str, PurePath)), "'location' must be a string or PurePath!"
        assert isinstance(name, str), "'name' must be a string!"
        assert maxLength >= 1, "'maxLength' must be at least one!"

        # Private variables
        self.__maxLength = 0
        self.__data = deque([], maxLength)
        self.__name = name
        self.__location = location
        self.__storage = Chest(path=location) # Change dumping/loading method here

        self.maxLength = maxLength
        self.load()

    @property
    def name(self): return self.__name
    @property
    def location(self): return self.__location
    @property
    def storage(self): return self.__storage

    @property
    def maxLength(self):
        assert self.__maxLength == self.__data.maxlen, "Disparity between self.__maxLength and self.__data.maxlen! This is most likely a bug."  # Assert that the value aligns with reality.
        return self.__maxLength
    @maxLength.setter
    def maxLength(self, value):
        assert isinstance(value, int), "'maxLength' must be an integer!"
        assert value >= 1, "'maxLength' must be greater than or equal to one!"

        # Resize the container; when growing, pad on the left with None
        delta = value - self.__maxLength
        if delta != 0:
            oldValues = list(self.__data)
            self.__data = deque(oldValues, value)
            for _ in range(delta):
                self.__data.appendleft(None)

        # Reflect changes
        self.__maxLength = value

        return self.__maxLength



    # === Public Start ==================================================
    def insert(self, obj=None):
        self.__data.append(obj)  # Append the element itself, whether or not it is iterable.
        return self
        
    def insertRange(self, iterable=None):
        if iterable is not None:
            for element in iterable:
                self.__data.append(element)
        return self

    def __repr__(self):
        return self.__data.__repr__()

    def load(self):
        if self.__name in self.__storage:  # If the storage already holds data under our name, pull it in.
            self.insertRange(list(self.__storage[self.__name]))
        else:  # If not, save ourselves so that we have something to work with next time.
            self.save()

    def save(self):
        self.__storage[self.__name] = list(self.__data)
        self.__storage.flush()
    
    def RawData(self): return self.__data
        
    def AsList(self): 
        return list(self.__data)

    def AsPlottable(self) -> Tuple[list, list]:
        # [[a,b], [a,b], ..., [a,b]] -> [[a,a,...,a], [b,b,...,b]]
        (x, y) = ([None], [None])
        for elem in self.__data:
            if isinstance(elem, Iterable):
                x.append(datetime.fromtimestamp(elem[0]))
                y.append(elem[1])
        return (x, y)
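A short usage sketch for Timeseries; the name, length, path, and values are illustrative, and the leading None entries come from the padding done by the maxLength setter:

ts = Timeseries("cpu_load", maxLength=4, location="/tmp/ts-demo")
ts.insertRange([(1700000000.0, 0.31), (1700000060.0, 0.42)])
ts.save()                      # persist the deque into the on-disk Chest
xs, ys = ts.AsPlottable()      # timestamps become datetimes, values pass through
print(ts.AsList())             # [None, None, (1700000000.0, 0.31), (1700000060.0, 0.42)]
                               # (assuming a fresh chest at that path)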