def estime(ba, memoire: Chest):
    """Estimate the maximal parameters over a training set (BA).

    The maximal parameter estimation is done by computing the powerset of
    each element of BA ("Base d'Apprentissage" / training set).  A
    tf-idf-like weighting is then computed, where the vocabulary is made of
    the powerset components rather than plain tokens.

    :param ba: iterable of training elements (each a sized sequence that
        ``powerset`` can decompose).
    :param memoire: ``Chest`` mapping each powerset component to its raw
        frequency; filled in place and normalised before returning.
    :return: ``None`` — results are stored in ``memoire``.
    """
    # NOTE(review): multiprocessing cannot pickle lambdas, so the original
    # `lambda i: powerset(element, i)` would fail under Pool.map; partial()
    # is picklable as long as `powerset` is a module-level function.
    from functools import partial

    with Pool(processes=5) as p:
        for element in ba:
            for subsets in p.map(partial(powerset, element), range(len(element))):
                for subset in subsets:
                    # Fixed: the original tested/stored `memoire[x]` where
                    # `x` is the whole (unhashable) result list; the key must
                    # be the individual subset whose frequency is computed.
                    if not memoire.get(subset):
                        memoire[subset] = frequence_brute(subset, element, True)
    # Normalise every stored frequency by the vocabulary size.
    for key in memoire:
        memoire[key] /= len(memoire)
def generate_regex(seq: OptimString, memo: Chest = None):
    """Breadth-first generation of regex-like variants of *seq*.

    Starting from *seq*, each dequeued candidate is yielded (unless already
    present in *memo*), then expanded by adding a wildcard point and by
    moving the existing point rightwards; the expansions are re-enqueued.

    NOTE(review): ``memo`` defaults to ``None`` but is dereferenced with
    ``memo.get(...)`` — calling without a memo raises AttributeError;
    presumably callers always pass a Chest.  TODO confirm.

    :param seq: the seed OptimString to generalise.
    :param memo: Chest of already-seen candidates used to suppress
        duplicate yields.
    """
    file = deque()    # work queue of candidates still to expand
    sortie = deque()  # NOTE(review): never used below — dead local?
    file.appendleft(seq)
    while file:
        current = file.pop()
        # Skip candidates that are entirely wildcards ('.'): nothing left
        # to generalise or emit.
        if not all(x == '.' for x in str(current)):
            # Yield only candidates not already recorded in the memo.
            if not memo.get(current):
                yield current
            # If the last pointed element is not yet a Point, create the
            # variant with an extra point and enqueue it.
            if not isinstance(current.data_pointe[-1], Point):
                current = current.add_point()
                file.appendleft(current)
            # Slide the current point over every remaining position to the
            # right, enqueuing each displaced variant.
            # NOTE(review): nesting of this loop relative to the isinstance
            # branch is reconstructed from collapsed source — confirm.
            for _ in range(
                    current.control.get(current.get_point)[-1] + 1,
                    len(current.data)):
                current = current.deplace_point()
                file.appendleft(current)
class ChestCacheTransformer(TransformerBase):
    """Caches the results of the inner transformer on disk, keyed on query id.

    The on-disk location is derived from the md5 of ``repr(inner)``, so two
    pipelines with identical configurations share a cache.  Lookup is by
    ``qid`` only, so caching cannot be used for re-ranking transformers.
    """

    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()
        # we take the md5 of the __repr__ of the pipeline to make a unique
        # identifier for the pipeline - all different pipelines should return
        # unique __repr__() values, as these are intended to be unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            warn(
                "Cannot cache pipeline %s has a component has not overridden __repr__"
                % trepr)
            self.disable = True
        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        # record the pipeline definition alongside the cache for debugging
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            with open(definition_file, "w") as f:
                f.write(trepr)
        # DataFrames are stored via to_pickle/read_pickle; everything else
        # (e.g. the Chest ".keys" index) via plain pickle.
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame)
            else pickle.dump(data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name
            else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        """Return the cache hit rate, or 0 if nothing has been requested yet."""
        return self.hits / self.requests if self.requests > 0 else 0

    # dont double cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        """The uncached inner transformer."""
        return self.inner

    def transform(self, input_res):
        """Transform *input_res* through the cache (or directly if disabled).

        :raises ValueError: if *input_res* contains document columns, since
            caching re-rankers by qid alone would be incorrect.
        """
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching currently only supports input dataframes with queries as inputs and cannot be used for re-rankers"
            )
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        """Answer each qid from the cache; batch the misses to ``inner``."""
        rtr = []   # per-qid result frames, cached and fresh
        todo = []  # single-row frames for cache misses
        for index, row in input_res.iterrows():
            qid = row["qid"]
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            # narrowed from a bare except: occasionally we have
            # file-not-founds - remove the entry from the cache and continue
            except Exception:
                del self.chest[qid]
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            todo_df = pd.concat(todo)  # fixed typo: was "tood_df"
            todo_res = self.inner.transform(todo_df)
            for indx, row in todo_df.iterrows():
                qid = row["qid"]
                this_query_res = todo_res[todo_res["qid"] == qid]
                self.chest[qid] = this_query_res
                rtr.append(this_query_res)
            self.chest.flush()
        return pd.concat(rtr)
class ChestCacheTransformer(TransformerBase):
    """A transformer that caches the results of the constituent (inner)
    transformer.  This is instantiated using the `~` operator on any
    transformer.

    Caching is unique based on the configuration of the pipeline, as read by
    executing repr() on the pipeline.  Caching lookup is based on the qid, so
    any change in query _formulation_ will not be reflected in a cache's
    results.

    Example Usage::

        dataset = pt.get_dataset("trec-robust-2004")
        # use for first pass and 2nd pass
        BM25 = pt.BatchRetrieve(index, wmodel="BM25")
        # used for query expansion
        RM3 = pt.rewrite.RM3(index)
        pt.Experiment([
                ~BM25,
                (~BM25) >> RM3 >> BM25
            ],
            dataset.get_topics(),
            dataset.get_qrels(),
            eval_metrics=["map"]
        )

    In the above example, we use the `~` operator on the first pass retrieval
    using BM25, but not on the 2nd pass retrieval, as the query formulation
    will differ during the second pass.

    Caching is not supported for re-ranking transformers.
    """

    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()
        # we take the md5 of the __repr__ of the pipeline to make a unique
        # identifier for the pipeline - all different pipelines should return
        # unique __repr__() values, as these are intended to be unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            warn(
                "Cannot cache pipeline %s has a component has not overridden __repr__"
                % trepr)
            self.disable = True
        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        # record the pipeline definition alongside the cache for debugging
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            with open(definition_file, "w") as f:
                f.write(trepr)
        from chest import Chest
        # DataFrames are stored via to_pickle/read_pickle; everything else
        # (e.g. the Chest ".keys" index) via plain pickle.
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame)
            else pickle.dump(data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name
            else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        """Return the cache hit rate, or 0 if nothing has been requested yet."""
        return self.hits / self.requests if self.requests > 0 else 0

    # dont double cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        """The uncached inner transformer."""
        return self.inner

    def transform(self, input_res):
        """Transform *input_res* through the cache (or directly if disabled).

        :raises ValueError: if *input_res* contains document columns, since
            caching re-rankers by qid alone would be incorrect.
        """
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching of %s for re-ranking is not supported. Caching currently only supports input dataframes with queries as inputs and cannot be used for re-rankers."
                % self.inner.__repr__())
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        """Answer each qid from the cache; batch the misses to ``inner``."""
        rtr = []   # per-qid result frames, cached and fresh
        todo = []  # single-row frames for cache misses
        # We cannot remove this iterrows() without knowing how to take named
        # tuples into a dataframe
        for index, row in input_res.iterrows():
            qid = str(row["qid"])
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            # narrowed from a bare except: occasionally we have
            # file-not-founds - remove the entry from the cache and continue
            except Exception:
                del self.chest[qid]
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            todo_df = pd.concat(todo)
            todo_res = self.inner.transform(todo_df)
            for row in todo_df.itertuples():
                qid = row.qid
                this_query_res = todo_res[todo_res["qid"] == qid]
                # Fixed: store under str(qid) so the key matches the str()
                # applied at lookup time above; otherwise non-string qids
                # could never produce a cache hit.
                self.chest[str(qid)] = this_query_res
                rtr.append(this_query_res)
            self.chest.flush()
        return pd.concat(rtr)