def gen_cb(dataset, gen_profile=None, tw_width=0, tw_step=1):
    """Build a temporal case base from a time series dataset.

    Every instance read from the dataset becomes one `cbr.TSSequence` that is
    stored in the case base under its positional index; the same index is
    used as the sequence id.

    Args:
        dataset (str): Full path of the dataset "arff" file.
        gen_profile (Callable): Function to generate problem profiles and/or
            queries out of a data sequence. Its signature should be
            (data, tw_width, tw_step). Passed to `cbr.TSSequence` unchanged.
        tw_width (int): If > 0, width of the 'moving' time window; otherwise,
            the 'expanding' time window approach is applied.
        tw_step (int): Number of steps (in terms of data points in the TS)
            taken at each update, i.e. the number of data points changed at
            each update.

    Returns:
        cbr.TCaseBase: The populated case base.

    """
    logger.info("Loading time series dataset: {}".format(dataset))
    sequences, labels = read_ts(dataset)

    # Window settings are identical for every sequence of the case base.
    window_settings = dict(tw_width=tw_width,
                           tw_step=tw_step,
                           gen_profile=gen_profile)

    case_base = cbr.TCaseBase()
    logger.info("Generating cb")
    for seq_id, seq_data in enumerate(sequences):
        case_base[seq_id] = cbr.TSSequence(data=seq_data,
                                           solution=labels[seq_id],
                                           seq_id=seq_id,
                                           **window_settings)

    logger.info(".. CB unique solutions: {}".format(case_base.solution_set()))
    logger.info(".. CB generated: {} sequences containing {} cases".format(
        len(case_base), case_base.size()))
    return case_base
def run(self):
    """Runs the Jumping Iteration experiment.

    For every test sequence, one solver using `rank.TopDownIterator` and one
    solver per entry of `self.jump_at_lst` using `rank.JumpingIterator` are
    advanced update by update, and the gain (100 - calc percentage) of each
    is recorded.

    Returns:
        pd.DataFrame: Output of the `ExpJumpData.process()`
            columns -> ["update", "gain", "jumpat"]

    """
    results = ExpJumpData()

    # Generate test problems
    cb_train, cb_test = exp_common.split_cb(self.cb, self.test_size)
    n_test = len(cb_test)
    # This will be passed to Anytime Lazy KNN, not the `ExpInsightsEngine.cb`
    cb_train = cbr.TCaseBase(cb=cb_train)

    for seq_no, sequence in enumerate(cb_test, start=1):
        logger.info(".. Testing with problem sequence {} of {} (seq_id: {})".format(
            seq_no, n_test, sequence.seq_id))

        # One solver for TopDown, one per `jump_at` for the Jumping Iterator.
        # Note: 'exp_insights_raw' is not provided to any of them.
        top_down_solver = exp_common.SolveSequence(
            cb_train, self.k, sequence, self.similarity,
            rank.TopDownIterator())
        jumping_solvers = {
            jump_at: exp_common.SolveSequence(
                cb_train, self.k, sequence, self.similarity,
                rank.JumpingIterator(jump_at=jump_at))
            for jump_at in self.jump_at_lst
        }

        for stop_update in range(sequence.n_profiles()):
            # Top-down run up to the end of `stop_update`; it is recorded
            # with a `jump_at` of 0.
            logger.debug(".... TOP_DOWN_solver launching for stop_update: {}".format(
                stop_update))
            _, _, pct_top_down = top_down_solver.solve(stop_update=stop_update)
            results.add(stop_update, 100. - pct_top_down, 0)

            # Jumping runs, jumping after every `jump_at` calcs
            for jump_at in self.jump_at_lst:
                logger.debug(
                    ".... JUMPING_solver w/ jump at {} launching for stop_update: {}".format(
                        jump_at, stop_update))
                _, _, pct_jumping = jumping_solvers[jump_at].solve(
                    stop_update=stop_update)
                results.add(stop_update, 100. - pct_jumping, jump_at)

        # Help garbage collector to release the memory as soon as possible
        del top_down_solver, jumping_solvers

    return results.process()
def run(self):
    """Runs a total of `n_exp` insights experiments.

    Each experiment re-splits `self.cb`, solves every test sequence without
    interruption while collecting insights, and appends the raw insights of
    the experiment to `self.all_exp_insights_raw`.

    Returns:
        ExpInsightsProcessed: Output of `ProcessExpInsightsRaw.process()` over
            the raw insights of all experiments.

    """
    self.all_exp_insights_raw = []
    for exp_no in range(1, self.n_exp + 1):
        logger.info(".. Experiment {} of {}".format(exp_no, self.n_exp))
        cb_train, cb_test = exp_common.split_cb(self.cb, self.test_size)
        # A new insights object for the new experiment
        insights = ExpInsightsRaw()
        n_test = len(cb_test)
        # This will be passed to Anytime Lazy KNN, not the `ExpInsightsEngine.cb`
        cb_train = cbr.TCaseBase(cb=cb_train)

        for seq_no, sequence in enumerate(cb_test, start=1):
            logger.info(".... Testing with problem sequence {} of {} (seq_id: {})".format(
                seq_no, n_test, sequence.seq_id))
            # Save insights of each sequence separately
            insights.add_new_sequence()
            # A new instance of the `RankIterator` of choice with its given
            # keyword arguments
            iterator = self.cls_rank_iterator(**self.cls_rank_iterator_kwargs)
            # All sequence updates w/o interruption, collecting insights data.
            # The solver is not bound to a name so that it can be released as
            # soon as the call returns.
            exp_common.SolveSequence(cb_train, self.k, sequence,
                                     self.similarity, iterator,
                                     insights).solve()

        self.all_exp_insights_raw.append(insights)

    return ProcessExpInsightsRaw(self.all_exp_insights_raw).process()
def run(self):
    """Runs the Exploit Candidates Iteration experiment.

    For every test sequence, a `rank.TopDownIterator` solver and a
    `rank.ExploitCandidatesIterator` solver are advanced update by update,
    and the gain (100 - calc percentage) of each is recorded under the
    iterator's `abbrv`.

    Returns:
        pd.DataFrame: Output of the `ExpExploitData.process()`
            columns -> ["update", "gain", "iterator"]

    """
    results = ExpExploitData()

    # Generate test problems
    cb_train, cb_test = exp_common.split_cb(self.cb, self.test_size)
    n_test = len(cb_test)
    # This will be passed to Anytime Lazy KNN, not the `ExpInsightsEngine.cb`
    cb_train = cbr.TCaseBase(cb=cb_train)

    for seq_no, sequence in enumerate(cb_test, start=1):
        logger.info(".. Testing with problem sequence {} of {} (seq_id: {})".format(
            seq_no, n_test, sequence.seq_id))

        # Two solvers per problem; 'exp_insights_raw' is not provided.
        top_down_solver = exp_common.SolveSequence(
            cb_train, self.k, sequence, self.similarity,
            rank.TopDownIterator())
        exploit_solver = exp_common.SolveSequence(
            cb_train, self.k, sequence, self.similarity,
            rank.ExploitCandidatesIterator())

        # (debug message, solver, iterator class) in execution order
        runs = (
            (".... TOP_DOWN_solver launching for stop_update: {}",
             top_down_solver, rank.TopDownIterator),
            (".... EXPLOIT_CANDIDATES_solver launching for stop_update: {}",
             exploit_solver, rank.ExploitCandidatesIterator),
        )

        for stop_update in range(sequence.n_profiles()):
            for message, solver, iterator_cls in runs:
                logger.debug(message.format(stop_update))
                _, _, calc_pct = solver.solve(stop_update=stop_update)
                results.add(stop_update, 100. - calc_pct, iterator_cls.abbrv)

        # Help garbage collector to release the memory as soon as possible
        # (`runs` also references both solvers)
        del runs, top_down_solver, exploit_solver

    return results.process()
def run(self):
    """Runs the classification experiment.

    The case base is split into a training case base and test sequences. For
    every test sequence and each of its updates, up to three solvers are run:

    1) an uninterrupted solver, whose kNNs and suggested solution are the
       reference;
    2) a solver interrupted at the calc counts that
       `pdp.get_calcs_for_conf_tholds` returns for `self.conf_tholds`;
    3) only if `self.stop_w_soln` is set, a solver launched with
       `alk_classifier.STOP_CALC_FOR_STOP_W_SOLN`.

    Gain records and solution hits are accumulated in an
    `ExpClassifierRawData` instance.

    Returns:
        ExpClassifierProcessedData: Output of `ExpClassifierRawData.process()`.

    """
    # Create an ExpClassifierRawData instance to save experiment data
    exp_classifier_data = ExpClassifierRawData(z=self.z)

    # Generate test problems
    CB_train, CB_test = exp_common.split_cb(self.cb, self.test_size)
    len_test = len(CB_test)
    # This will be passed to Anytime Lazy KNN, not the `ExpInsightsEngine.cb`
    CB_train = cbr.TCaseBase(cb=CB_train)

    # Conduct tests for each sequence in cb_test
    for idx, sequence in enumerate(CB_test):
        logger.info(".. Testing with problem sequence {} of {} (seq_id: {})".format(
            idx + 1, len_test, sequence.seq_id))

        # Every solver gets its own instance of the `RankIterator` of choice,
        # built with the given keyword arguments.
        rank_iterator_unint = self.cls_rank_iterator(**self.cls_rank_iterator_kwargs)
        rank_iterator_int = self.cls_rank_iterator(**self.cls_rank_iterator_kwargs)
        # Note: 'exp_insights_raw' not provided
        uninterrupted_solver = SolveSequenceClassifier(
            CB_train, self.k, sequence, self.similarity,
            rank_iterator_unint, self.reuse)
        interrupted_solver = SolveSequenceClassifier(
            CB_train, self.k, sequence, self.similarity,
            rank_iterator_int, self.reuse)
        if self.stop_w_soln:
            rank_iterator_int_w_soln = self.cls_rank_iterator(
                **self.cls_rank_iterator_kwargs)
            interrupted_w_soln_solver = SolveSequenceClassifier(
                CB_train, self.k, sequence, self.similarity,
                rank_iterator_int_w_soln, self.reuse)

        # Run tests for each update
        for stop_update in range(sequence.n_profiles()):
            # ------------------------------------------------------------------
            # 1) Run uninterrupted_solver to stop at the end of the stop_update
            logger.debug(".... UNINTERRUPTED_solver launching for stop_update: {}".format(
                stop_update))
            kNN_uninterrupted, _, _ = uninterrupted_solver.solve(
                stop_update=stop_update)
            # Reference solution
            soln_uninterrupted, _ = (
                uninterrupted_solver.anytime_lazy_knn_classifier.suggest_solution())

            # Set the stop_calc_list for testing
            if stop_update == 0:
                # At initial problem (i.e. 0th update) we don't interrupt
                stop_calc_list = [None]
            else:
                stop_calc_list = pdp.get_calcs_for_conf_tholds(
                    pdp_file=self.pdp_file,
                    update=stop_update,
                    conf_tholds=self.conf_tholds,
                    z=self.z,
                    knn_i=-1)  # last kNN member, i.e. kNN[k-1]
                # Make sure you iterate whole RANK in the end.
                stop_calc_list.append(None)
            logger.debug(".... stop_calc_list : {}".format(str(stop_calc_list)))

            # ------------------------------------------------------------------
            # 2) Run interrupted_solver to stop at each stop_calc in the stop_update
            interrupted = common.APP.INTRPT.W_CALC
            for stop_calc_ind, stop_calc in enumerate(stop_calc_list):
                # Only execute if the previous run was interrupted AND the
                # stop_calc differs from the previous one (the same calc can
                # be returned for different confidence thresholds).
                # TODO (OM, 20200505): ? Check if this info still holds
                if interrupted == common.APP.INTRPT.W_CALC and (
                        stop_calc_ind == 0
                        or stop_calc != stop_calc_list[stop_calc_ind - 1]):
                    logger.debug(
                        ".... INTERRUPTED_solver launching for stop_update: {}, stop_calc: {}".format(
                            stop_update, stop_calc))
                    kNN_interrupted, interrupted, calc_pct = interrupted_solver.solve(
                        stop_update=stop_update,
                        stop_calc_at_stop_update=stop_calc)

                    # `thold` and `gain` do not depend on the kNN member, so
                    # they are computed once per run. This also keeps `thold`
                    # bound for the hit record below when `self.k` is 0.
                    if self.conf_tholds is None:
                        thold = None
                    elif stop_calc is None:
                        # uninterrupted (last run of the interruption points)
                        thold = 1.
                    else:
                        thold = self.conf_tholds[stop_calc_ind]
                    gain = 100.0 - calc_pct

                    # For each kNN[i], calculate "confidence", its "std_dev",
                    # "quality" and "sim" values
                    for knn_i in range(self.k):
                        knn_unint_i = kNN_uninterrupted[knn_i]
                        knn_int_i = kNN_interrupted[knn_i]
                        if stop_update != 0:
                            conf, std_dev = pdp.get_conf_for_calc(
                                pdp_file=self.pdp_file,
                                update=stop_update,
                                knn_i=knn_i,
                                calc=stop_calc)
                        else:
                            # In 0^th update we don't interrupt, so the kNNs
                            # should be the same.
                            conf, std_dev = (1.0, 0.0)
                        quality_ = (knn_int_i.sim / knn_unint_i.sim
                                    if knn_unint_i.sim != 0. else 0.)
                        sim = self.similarity(
                            CB_train.get_case_query(knn_unint_i.case_id),
                            CB_train.get_case_query(knn_int_i.case_id))
                        # Append the data to the gain_data for the result file
                        exp_classifier_data.add_gain(update=stop_update,
                                                     calc=stop_calc,
                                                     knn_i=knn_i,
                                                     conf=conf,
                                                     std_dev=std_dev,
                                                     quality=quality_,
                                                     gain=gain,
                                                     sim=sim,
                                                     thold=thold,
                                                     stop_w_soln=False,
                                                     intrpt_w_soln=False)
                        # Compare knn_unint_i and knn_int_i given the confidence
                        intrpt.log_unint_int_comparison(
                            conf, std_dev, stop_update, stop_calc, knn_i,
                            knn_unint_i, knn_int_i, quality_, sim)

                    if interrupted == common.APP.INTRPT.W_CALC:
                        # Solution upon interruption vs. the reference
                        soln_interrupted, _ = (
                            interrupted_solver.anytime_lazy_knn_classifier.suggest_solution())
                        hit_interrupted = soln_interrupted == soln_uninterrupted
                        exp_classifier_data.add_hit(update=stop_update,
                                                    thold=thold,
                                                    stop_w_soln=False,
                                                    intrpt_w_soln=False,
                                                    hit=hit_interrupted)
                        logger.debug("...... Solution hit for conf_thold {}: {}".format(
                            thold, hit_interrupted))

            # ------------------------------------------------------------------
            # 3) Run interrupted_w_soln_solver to stop at the moment when an
            #    exact solution is guaranteed
            if self.stop_w_soln:
                logger.debug(
                    ".... INTERRUPTED_W_SOLN_solver launching for stop_update: {}, stop_calc: {}".format(
                        stop_update, alk_classifier.STOP_CALC_FOR_STOP_W_SOLN))
                # The 0th update is never interrupted, hence `None` there.
                _, interrupted, calc_pct = interrupted_w_soln_solver.solve(
                    stop_update=stop_update,
                    stop_calc_at_stop_update=(
                        alk_classifier.STOP_CALC_FOR_STOP_W_SOLN
                        if stop_update > 0 else None))
                # Append the data to the gain_data for the result file
                exp_classifier_data.add_gain(
                    update=stop_update,
                    calc=None,
                    knn_i=None,
                    conf=None,
                    std_dev=None,
                    quality=None,
                    gain=100.0 - calc_pct,
                    sim=None,
                    thold=None,
                    stop_w_soln=True,
                    intrpt_w_soln=interrupted == common.APP.INTRPT.W_SOLN)

                # Save solution hit upon interruption w/ exact soln
                if interrupted == common.APP.INTRPT.W_SOLN:
                    # The suggestion has to come from the solver that was
                    # stopped with a solution. Asking `interrupted_solver`
                    # instead would compare a solver that is normally done
                    # with this update, which makes the check meaningless.
                    soln_interrupted_w_soln, _ = (
                        interrupted_w_soln_solver.anytime_lazy_knn_classifier.suggest_solution())
                    hit_interrupted_w_soln = soln_uninterrupted == soln_interrupted_w_soln
                    exp_classifier_data.add_hit(update=stop_update,
                                                thold=None,
                                                stop_w_soln=True,
                                                intrpt_w_soln=True,
                                                hit=hit_interrupted_w_soln)
                    logger.debug("...... Solution hit for stop_w_soln: {}".format(
                        hit_interrupted_w_soln))
                    if not hit_interrupted_w_soln:
                        logger.error(
                            "...... Solution hit NOT achieved for stop_w_soln !!!")

                # Run interrupted_w_soln_solver to complete the search for the
                # remaining ki's w/o interruption
                if stop_update > 0:
                    # The resume always runs with no stop calc; log that value
                    # explicitly instead of a variable left over from step 2.
                    logger.debug(
                        ".... INTERRUPTED_W_SOLN_solver *resuming* for stop_update: {}, stop_calc: {}".format(
                            stop_update, None))
                    interrupted_w_soln_solver.solve(
                        stop_update=stop_update,
                        stop_calc_at_stop_update=None)

        # Help garbage collector to release the memory as soon as possible
        del uninterrupted_solver
        del interrupted_solver
        if self.stop_w_soln:
            del interrupted_w_soln_solver

    return exp_classifier_data.process()
def sim_hist(cb, similarity, bins=10, test_size=10, upd_select=None):
    """Calculates similarities between a proportion of cases over the rest of the case base

    `cb` is split into test sequences and a training case base. Every query
    taken from the test sequences is searched linearly in the training case
    base, and the similarity of every assessed case is counted in one of
    `bins` equal-width bins over [0, 1]. Bins are right-closed; a similarity
    of 0 falls into the first bin. Similarities outside [0, 1] are counted in
    the nearest edge bin.

    Args:
        cb (cbr.TCaseBase): Temporal case base
        similarity (Callable): Similarity metric
        bins (int): Number of bins for the histogram
        test_size (Union[float, int]): test size to split the `cb` into test
            sequences and CB_train. If float, should be between 0.0 and 1.0
            and represent the proportion of the `cb` to generate test
            sequences; if int, represents the absolute number of test
            sequences
        upd_select (int): Particular update index to use as the query for each
            sequence. If None, every update of every test sequence is used.

    Returns:
        numpy.ndarray: 1D float array of length `bins` holding the similarity
            distribution as fractions of all counted similarities (the values
            sum to 1, they are not scaled to 0-100). All zeros if no
            similarity was counted.

    Raises:
        SystemExit: If the split yields no test sequence to take queries from.

    """
    logger.info(
        ".. Calculating similarity distribution for a proportion of cases over the case base"
    )
    CB_train, CB_test = exp_common.split_cb(cb, test_size)
    CB_train = cbr.TCaseBase(cb=CB_train)
    linear_search = alk.LinearSearch(cb=CB_train, similarity=similarity)
    if upd_select is None:
        total_queries = sum(seq.n_profiles() for seq in CB_test)
    else:
        total_queries = len(CB_test)
    logger.info(".... Linear search with {} queries X {} cases".format(
        total_queries, CB_train.size()))

    distr_arr = np.full((bins, ), 0, dtype=int)
    bin_width = 1. / bins
    last_bin = bins - 1
    total_calcs = 0
    try:
        # Seed the min/max feature counts with the very first query
        n_features_min = n_features_max = len(CB_test[0].profile(0))
    except IndexError:
        logger.error("No queries to test. Check your case base and test size.")
        sys.exit(1)
    sim_min, sim_max, sim_mean = 1., 0., 0.
    sim_cntr = 0

    for sequence in CB_test:
        if upd_select is None:
            upd_ids = range(sequence.n_profiles())
        else:
            upd_ids = [upd_select]
        for upd_id in upd_ids:
            query = sequence.profile(upd_id)  # Get query for the sequence update
            n_features = len(query)
            if n_features > n_features_max:
                n_features_max = n_features
            elif n_features < n_features_min:
                n_features_min = n_features
            stage, calcs = linear_search.search(query)  # Search
            for assess in stage.nn:
                sim_cntr += 1
                # Right-closed bins: (0, w] -> 0, (w, 2w] -> 1, ...
                # Clamping maps sim == 0 to bin 0. It also stops a negative
                # value from wrapping around to the last bin, and a value
                # slightly above 1.0 from indexing past the array.
                bin_idx = min(max(math.ceil(assess.sim / bin_width) - 1, 0),
                              last_bin)
                distr_arr[bin_idx] += 1  # Update sim occurrence array
                if assess.sim < sim_min:
                    sim_min = assess.sim
                if assess.sim > sim_max:
                    sim_max = assess.sim
                # Incrementally update mean
                sim_mean = sim_mean + (assess.sim - sim_mean) / sim_cntr
            total_calcs += calcs

    logger.info(".... Number of features min: {}, max: {}".format(
        n_features_min, n_features_max))
    logger.info(".... Total computations: {}".format(total_calcs))
    logger.info(".... Similarity min: {}, max: {}, avg: {}".format(
        sim_min, sim_max, sim_mean))

    total_sims = distr_arr.sum()
    if total_sims == 0:
        # Nothing was assessed; dividing would produce an array of NaNs.
        return np.zeros(bins, dtype=float)
    return distr_arr / total_sims
def run(self):
    """Runs the interruption experiment.

    For every test sequence and each of its updates, an uninterrupted solver
    provides the reference kNNs. A second solver is interrupted at the calc
    counts that `pdp.get_calcs_for_conf_tholds` returns for
    `self.conf_tholds`, followed by a final run without a stop calc. Each
    interrupted kNN member is compared against its reference counterpart.

    `self.conf_tholds` is replaced by its sorted copy if it is not None.

    Returns:
        pd.DataFrame: Output of the `ExpIntrptData.process()`
            columns -> ["update", "calc", "knni", "conf", "std", "quality", "gain", "sim",
                        "abserr", "abspcterr", "effcysim", "effcyq", "confthold"]

    """
    results = ExpIntrptData(z=self.z)
    if self.conf_tholds is not None:
        # sort just in case the arguments are not given in ascending order...
        self.conf_tholds = sorted(self.conf_tholds)

    # Generate test problems
    cb_train, cb_test = exp_common.split_cb(self.cb, self.test_size)
    n_test = len(cb_test)
    # This will be passed to Anytime Lazy KNN, not the `ExpInsightsEngine.cb`
    cb_train = cbr.TCaseBase(cb=cb_train)

    for seq_no, sequence in enumerate(cb_test, start=1):
        logger.info(".. Testing with problem sequence {} of {} (seq_id: {})".format(
            seq_no, n_test, sequence.seq_id))

        # Two solvers per problem, each with its own instance of the
        # `RankIterator` of choice; 'exp_insights_raw' is not provided.
        unint_iterator = self.cls_rank_iterator(**self.cls_rank_iterator_kwargs)
        int_iterator = self.cls_rank_iterator(**self.cls_rank_iterator_kwargs)
        unint_solver = exp_common.SolveSequence(
            cb_train, self.k, sequence, self.similarity, unint_iterator)
        int_solver = exp_common.SolveSequence(
            cb_train, self.k, sequence, self.similarity, int_iterator)

        for stop_update in range(sequence.n_profiles()):
            # ------------------------------------------------------------------
            # 1) Reference run: stop at the end of the stop_update
            logger.debug(".... UNINTERRUPTED_solver launching for stop_update: {}".format(
                stop_update))
            knn_unint, _, _ = unint_solver.solve(stop_update=stop_update)

            if stop_update == 0:
                # At initial problem (i.e. 0th update) we don't interrupt
                stop_calcs = [None]
            else:
                stop_calcs = pdp.get_calcs_for_conf_tholds(
                    pdp_file=self.pdp_file,
                    update=stop_update,
                    conf_tholds=self.conf_tholds,
                    z=self.z,
                    knn_i=-1)  # last kNN member, i.e. kNN[k-1]
                # Make sure you iterate whole RANK in the end.
                stop_calcs.append(None)
            logger.debug(".... stop_calc_list : {}".format(str(stop_calcs)))

            # ------------------------------------------------------------------
            # 2) Interrupted runs: stop at each stop_calc in the stop_update
            interrupted = common.APP.INTRPT.W_CALC
            for pos, stop_calc in enumerate(stop_calcs):
                # TODO (OM, 20200505): ? Check if this info still holds
                # Only execute if you have been interrupted before ...
                if not interrupted == common.APP.INTRPT.W_CALC:
                    continue
                # ... AND if stop_calc is different than the previous one (the
                # same calc can be returned for different confidence thresholds)
                if pos > 0 and stop_calc == stop_calcs[pos - 1]:
                    continue

                logger.debug(
                    ".... INTERRUPTED_solver launching for stop_update: {}, stop_calc: {}".format(
                        stop_update, stop_calc))
                knn_int, interrupted, calc_pct = int_solver.solve(
                    stop_update=stop_update,
                    stop_calc_at_stop_update=stop_calc)

                # For each kNN[i], calculate "confidence", its "std_dev",
                # "quality", "gain" and "sim" values
                for knn_i in range(self.k):
                    ref_nn = knn_unint[knn_i]
                    got_nn = knn_int[knn_i]

                    if stop_update == 0:
                        # In 0^th update we don't interrupt, so the kNNs
                        # should be the same.
                        conf, std_dev = 1.0, 0.0
                    else:
                        conf, std_dev = pdp.get_conf_for_calc(
                            pdp_file=self.pdp_file,
                            update=stop_update,
                            knn_i=knn_i,
                            calc=stop_calc)

                    if ref_nn.sim != 0.:
                        quality = got_nn.sim / ref_nn.sim
                    else:
                        quality = 0.
                    gain = 100.0 - calc_pct
                    sim = self.similarity(
                        cb_train.get_case_query(ref_nn.case_id),
                        cb_train.get_case_query(got_nn.case_id))

                    if self.conf_tholds is None:
                        thold = None
                    elif stop_calc is None:
                        # uninterrupted (last run of the interruption points)
                        thold = 1.
                    else:
                        thold = self.conf_tholds[pos]

                    # Append the data for the result file
                    results.add(update=stop_update,
                                calc=stop_calc,
                                knn_i=knn_i,
                                conf=conf,
                                std_dev=std_dev,
                                quality=quality,
                                gain=gain,
                                sim=sim,
                                thold=thold)
                    # Compare the reference and interrupted kNN members given
                    # the confidence
                    log_unint_int_comparison(conf, std_dev, stop_update,
                                             stop_calc, knn_i, ref_nn, got_nn,
                                             quality, sim)

        # Help garbage collector to release the memory as soon as possible
        del unint_solver, int_solver

    return results.process()