def save(self, path, echo=None):
    """
    :type path: str or Path
    :type echo: bool or NoneType
    """
    if echo is None:
        echo = self._echo
    progress_bar = ProgressBar(total=len(self.memories_dictionary) + 2, echo=echo)
    progress_amount = 0

    path = Path(path=path)
    path.make_dir()

    progress_bar.show(amount=progress_amount, text='saving parameters')
    (path + 'parameters.pensieve').save(obj=self.parameters)
    progress_amount += 1

    memory_keys = []
    for key, memory in self.memories_dictionary.items():
        progress_bar.show(amount=progress_amount, text=f'saving "{key}" memory')
        memory.save(path=path + key)
        progress_amount += 1
        memory_keys.append(key)

    progress_bar.show(amount=progress_amount, text='saving memory keys')
    (path + 'memory_keys.pensieve').save(obj=memory_keys)
    progress_amount += 1
    progress_bar.show(amount=progress_amount)
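
# Hypothetical usage sketch: persisting a pensieve to a directory.
# Assumes `Pensieve` is the class that owns `save` above, that its
# constructor accepts `safe`, and that keys can be assigned functions;
# the path and keys here are made up for illustration.
def _example_save():
    pensieve = Pensieve(safe=True)
    pensieve['numbers'] = lambda: [1, 2, 3]
    pensieve['total'] = lambda numbers: sum(numbers)
    pensieve.save(path='saved_pensieve', echo=True)
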
def _find_best_matching_rows(
        strings, right, right_on, na_ratio, two_na_ratio, case_sensitivity,
        score_name, num_threads, similarity_function, weights, num_results, echo
):
    """
    :param strings: values from the left row that are being matched
    :param DataFrame right: the dataframe to search for matches
    :param right_on: columns of `right` to compare against `strings`
    :param na_ratio:
    :param two_na_ratio:
    :param case_sensitivity:
    :param str score_name: name of the column that will hold the similarity score
    :param int num_threads:
    :param similarity_function:
    :param weights:
    :param int num_results: number of best-matching rows to return
    :param echo:
    :rtype: DataFrame
    """
    right = right.copy()
    if num_threads == 1:
        right[score_name] = ProgressBar.apply(
            data=right,
            function=lambda row: _get_similarity_between_strings_and_row(
                strings=strings, row=row, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function, weights=weights
            ),
            echo=echo
        )
    else:
        parallel = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        progress_bar = ProgressBar(total=len(right) + 1, echo=echo)
        right[score_name] = parallel(
            delayed(_get_similarity_between_strings_and_row)(
                strings=strings, row=row, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function, weights=weights
            )
            for index, row in iterate(right.iterrows(), progress_bar=progress_bar)
        )
        progress_bar.show(amount=len(right) + 1)
    right = right.sort_values(by=score_name, ascending=False)
    return right.iloc[0:num_results]
def get_similarities(
        string, strings, method='jaro_winkler', similarity_function=None,
        case_sensitivity=1.0, first_char_weight=0.0, first_word_weight=0.0, echo=0
):
    """
    :type string: str
    :type strings: list[str]
    :param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :rtype: list[float]
    """
    echo = max(0, echo)
    string = str(string)
    text = string + ' ? '
    return list(ProgressBar.map(
        function=lambda x: get_similarity(
            s1=string, s2=x, method=method, similarity_function=similarity_function,
            first_char_weight=first_char_weight, case_sensitivity=case_sensitivity,
            first_word_weight=first_word_weight
        ),
        iterable=strings, iterable_text=strings, text=text, echo=echo
    ))
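
# Hypothetical usage sketch: scoring one string against several candidates.
# Assumes `get_similarity` (used above) returns a float similarity per pair.
def _example_get_similarities():
    candidates = ['color', 'colander', 'cooler']
    scores = get_similarities(string='colour', strings=candidates, method='jaro_winkler', echo=0)
    return dict(zip(candidates, scores))
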
def find_most_similar(
        strings, candidates, candidate_ids=None, string_ids=None,
        method='jaro_winkler', similarity_function=None, case_sensitivity=1.0,
        first_char_weight=0.0, first_word_weight=0.0, num_results=1, echo=0
):
    """
    :type strings: str or list[str]
    :type candidates: list[str]
    :type candidate_ids: list or NoneType
    :type string_ids: list or NoneType
    :param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :type num_results: int
    :rtype: pd.DataFrame
    """
    echo = max(0, echo)
    if isinstance(strings, str):
        # a single string would otherwise be zipped character by character below
        strings = [strings]
    if string_ids is None:
        string_ids = range(len(strings))
    return pd.concat(list(ProgressBar.map(
        function=lambda x: find_most_similar_for_one_string(
            string=x[0], string_id=x[1], candidates=candidates,
            candidate_ids=candidate_ids, method=method,
            similarity_function=similarity_function,
            case_sensitivity=case_sensitivity,
            first_char_weight=first_char_weight,
            first_word_weight=first_word_weight,
            num_results=num_results, echo=echo - 1
        ),
        iterable=list(zip(strings, string_ids)), iterable_text=strings, echo=echo
    ))).reset_index(drop=True)[[
        'string_id', 'string', 'candidate_id', 'candidate', 'similarity_rank', 'similarity'
    ]]
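
# Hypothetical usage sketch: finding the best candidate for each input string.
# The column names in the result come from `find_most_similar_for_one_string`.
def _example_find_most_similar():
    return find_most_similar(
        strings=['jon smith', 'mary jane'],
        candidates=['john smith', 'maryjane', 'bob jones'],
        num_results=1, echo=0
    )
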
def zip_directory(path, zip_path, compression=ZIP_DEFLATED, echo=0):
    echo = max(0, echo)
    progress_bar = ProgressBar(echo=echo, total=None)
    compression = compression or ZIP_STORED
    amount = 0
    with ZipFile(file=zip_path, mode='w', compression=compression) as zip_file:
        for root, dirs, files in os.walk(path):
            for file in files:
                # note: the file's full path (including `root`) is stored in the archive
                zip_file.write(os.path.join(root, file))
                progress_bar.show(amount=amount, text=f'"{file}" zipped into {zip_path}')
                amount += 1
    progress_bar.show(amount=amount, text=f'{zip_path} complete!')
    return zip_path
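
# Hypothetical usage sketch: archiving a directory with the helper above.
# The paths are made up for illustration.
def _example_zip_directory():
    return zip_directory(path='data', zip_path='data.zip', echo=1)
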
def perturb_data(data, perturbation, echo=1):
    # `echo` was referenced without being defined; it is assumed to be a parameter here
    list_of_data = []
    progress_bar = ProgressBar(total=len(perturbation), echo=echo)
    progress = 0
    for column, perturbation_list in perturbation.items():
        progress_bar.show(amount=progress, text=column)
        for delta in perturbation_list:
            list_of_data.append(perturb_numeric_column(data=data, column=column, delta=delta))
        progress += 1
    data = concat(list_of_data)
    progress_bar.show(amount=progress)
    return data
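
# Hypothetical usage sketch: perturbing two numeric columns by fixed deltas.
# Assumes `perturb_numeric_column` shifts the named column by `delta`.
def _example_perturb_data():
    import pandas as pd
    data = pd.DataFrame({'x': [1.0, 2.0], 'y': [10.0, 20.0]})
    return perturb_data(data=data, perturbation={'x': [-0.1, 0.1], 'y': [-1.0, 1.0]}, echo=0)
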
def get_function_influence(function, data, x_columns, num_deltas=200, num_threads=1, echo=1):
    """
    :param callable function: function that gives a y for a given data
    :param pd.DataFrame data:
    :param list[str] or str x_columns:
    :param int num_deltas:
    :param int num_threads:
    :type echo: int
    :rtype: pd.DataFrame
    """
    if isinstance(x_columns, str):
        x_columns = [x_columns]
    elif not isinstance(x_columns, list):
        raise TypeError(f'x_columns should be either a string or a list, not a {type(x_columns)}')

    def get_influence_data(the_tuple):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            x_column, this_data, this_num_deltas, this_function = the_tuple
            record = get_single_column_influence(
                function=this_function, data=this_data,
                x_column=x_column, num_deltas=this_num_deltas
            )
        return record

    tuples = [
        (x_column, data, num_deltas, function)
        for x_column in iterate(x_columns, echo=echo, text='preparing data ...')
    ]
    progress_bar = ProgressBar(total=len(x_columns) + 1, echo=echo)
    progress_bar.show(amount=len(x_columns), text='data prepared.')

    if num_threads == 1:
        dataframes = [
            get_influence_data(t)
            for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
        ]
    else:
        dataframes = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')(
            delayed(get_influence_data)(t)
            for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
        )
    result = pd.concat(dataframes).sort_values(by=['variable', 'x_delta'])
    progress_bar.show(amount=len(dataframes) + 1, text='influence measured.')
    return result
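
# Hypothetical usage sketch: measuring how a fitted model's predictions respond
# to changes in each feature. `model.predict` stands in for any callable that
# maps a dataframe to predictions; the column names are made up.
def _example_get_function_influence(model, data):
    return get_function_influence(
        function=model.predict, data=data,
        x_columns=['age', 'income'], num_deltas=100, num_threads=1, echo=0
    )
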
def simulate(self, echo=1):
    """
    :type echo: int
    :rtype: pd.DataFrame
    """
    def _get_influence(column, perturbation, function):
        influence_result = self.get_single_influence(
            column=column, perturbation=perturbation, function=function
        )
        influence_result['column'] = column
        influence_result['perturbation'] = perturbation
        return influence_result

    progress_bar = ProgressBar(total=2 + len(self._influencers), echo=echo)
    if self._num_threads == 1:
        result = [
            _get_influence(column=column, perturbation=perturbation, function=self._function)
            for column, influences in iterate(
                self._influencers.items(), progress_bar=progress_bar, text='measuring influence ...'
            )
            for perturbation in influences
        ]
    else:
        result = self.parallel_process(
            delayed(_get_influence)(column=column, perturbation=perturbation, function=deepcopy(self._function))
            for column, influences in iterate(
                self._influencers.items(), progress_bar=progress_bar, text='measuring influence in parallel ...'
            )
            for perturbation in influences
        )
    progress_bar._total = 2 + len(self._influencers)
    progress_bar.show(amount=1 + len(self._influencers))
    result_data = pd.DataFrame.from_records(result)
    progress_bar.show(amount=2 + len(self._influencers))
    return result_data
@classmethod
def load(cls, path, echo=True):
    """
    :type path: str or Path
    :type echo: bool
    """
    path = Path(path=path)
    parameters = (path + 'parameters.pensieve').load()
    pensieve = cls(safe=parameters['safe'])
    for name, value in parameters.items():
        setattr(pensieve, f'_{name}', value)

    memory_keys = (path + 'memory_keys.pensieve').load()
    progress_bar = ProgressBar(total=len(memory_keys), echo=echo)
    progress_amount = 0
    pensieve._memories_dictionary = {}
    for key in memory_keys:
        if echo:
            progress_bar.show(amount=progress_amount, text=f'loading "{key}" memory')
        memory = Memory.load(path=path + key, pensieve=pensieve)
        pensieve._memories_dictionary[key] = memory
        progress_amount += 1
    if echo:
        progress_bar.show(amount=progress_amount)
    return pensieve
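
# Hypothetical usage sketch: restoring a pensieve saved by `save` above.
# Assumes `Pensieve` owns both methods and the path already exists.
def _example_load():
    restored = Pensieve.load(path='saved_pensieve', echo=False)
    return restored
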
def evaluate(self, keys=None, output=False):
    """
    evaluates multiple memories, in parallel if the number of jobs is not 1
    :type keys: list[str] or NoneType or str
    :type output: bool
    :rtype: list or NoneType
    """
    if keys is None:
        keys = list(self.memories_dictionary.keys())
    elif isinstance(keys, str):
        keys = [keys]

    if self._n_jobs == 1:
        if output:
            return [self[key] for key in keys]
        else:
            for key in keys:
                self.memories_dictionary[key].evaluate()
    else:
        def get_content(p):
            return p.content

        memories = [self.memories_dictionary[key] for key in keys]
        schedule = self.get_update_schedule(keys=keys)
        progress_bar = ProgressBar(
            total=sum([len(schedule_round) for schedule_round in schedule]),
            echo=self._echo
        )
        progress_amount = 0
        for schedule_round in schedule:
            progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
            self.processor(delayed(get_content)(job) for job in schedule_round)
            progress_amount += len(schedule_round)
        if progress_amount > 0:
            progress_bar.show(amount=progress_amount, text='memories updated!')
        contents = self.processor(delayed(get_content)(p) for p in memories)
        if output:
            return list(contents)
def encode(self, data, drop_encoded=True, echo=0):
    echo = max(0, echo)
    result = data.copy()
    progress_bar = ProgressBar(total=len(self._column_values), echo=echo)
    progress_amount = 0
    for col_name, only_include in self._column_values.items():
        if echo:
            progress_bar.show(amount=progress_amount, text=f'DM creating dummies for {col_name}')
        progress_amount += 1
        temp_data = result[[col_name]].copy()
        if self._lowercase:
            temp_data[col_name] = temp_data[col_name].str.lower()
        temp_data[col_name] = np.where(
            temp_data[col_name].isin(only_include),
            temp_data[col_name],
            self._replacement
        )
        dummies = pd.get_dummies(
            data=temp_data[[col_name]], prefix=col_name, prefix_sep='_',
            dummy_na=self._encode_na, sparse=True
        )
        result = pd.concat([result, dummies], axis=1)

    for col_name in self._one_hot_columns:
        if col_name not in result.columns:
            result[col_name] = 0

    if echo:
        progress_bar.show(amount=progress_amount, text=f'DM created dummies for {self._encoded_columns}')

    if drop_encoded:
        result = result.drop(columns=self.encoded_columns)

    extra_columns = [
        x for x in result.columns
        if x not in self.encoded_columns + self.one_hot_columns and x not in data.columns
    ]
    if len(extra_columns) > 0:
        result = result.drop(columns=extra_columns)
    return result
def train(self, data, echo=0):
    echo = max(0, echo)
    result = data.copy()
    one_hot_columns = []
    non_numeric_cols = data.select_dtypes(exclude=['bool', 'number', 'datetime64', 'datetime']).columns
    if self._include is not None:
        non_numeric_cols = [col for col in non_numeric_cols if col in self._include]
    if self._exclude is not None:
        non_numeric_cols = [col for col in non_numeric_cols if col not in self._exclude]

    progress_bar = ProgressBar(total=len(non_numeric_cols), echo=echo)
    progress_amount = 0
    self._encoded_columns = []
    for col_name in non_numeric_cols:
        if echo:
            progress_bar.show(amount=progress_amount, text=f'DM training dummies for {col_name}')
        progress_amount += 1
        try:
            temp_data = data[[col_name]].copy()
            if self._lowercase:
                temp_data[col_name] = temp_data[col_name].str.lower()
            temp_data['count'] = 1
            counts = temp_data.groupby(col_name).sum().reset_index(drop=False)
            if self._top is not None:
                # sort by frequency, not by the column's values, to keep the most common categories
                top_counts = counts.sort_values(by='count', ascending=False).head(self._top)
                only_include = set(top_counts[col_name])
                temp_data[col_name] = np.where(
                    temp_data[col_name].isin(only_include),
                    temp_data[col_name],
                    self._replacement
                )
            else:
                only_include = set(counts[col_name])
            dummies = pd.get_dummies(
                data=temp_data[[col_name]], prefix=col_name, prefix_sep='_',
                dummy_na=self._encode_na, sparse=False
            )
            dummies = dummies[[col for col in dummies if not dummies[col].nunique() == 1]]
            result = pd.concat([result, dummies], axis=1)
            one_hot_columns += list(dummies.columns)
            self._column_values[col_name] = only_include
            self._encoded_columns.append(col_name)
        except Exception as e:
            print(f'exception raised for column: {col_name}')
            raise e
    self._one_hot_columns = one_hot_columns
    if echo:
        progress_bar.show(amount=progress_amount, text=f'DM trained dummies for {self._encoded_columns}')
    return result
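
# Hypothetical usage sketch for the dummy encoder above. `train` learns which
# categories of the non-numeric columns to one-hot encode; `encode` applies
# the learned encoding to new data. The encoder instance is assumed to be
# constructed elsewhere.
def _example_dummy_encoding(encoder, training_data, new_data):
    encoder.train(data=training_data, echo=0)
    return encoder.encode(data=new_data, drop_encoded=True, echo=0)
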
def eliminate(self, num_rounds=None, echo=1):
    """
    :type num_rounds: int or NoneType
    :type echo: bool or int
    :rtype: Regression
    """
    model = self
    if isinstance(model.fit, BrokenModel):
        return self
    num_rounds = num_rounds or len(self.variables)
    progress_bar = ProgressBar(total=num_rounds, echo=echo)
    round_number = 0
    previous_lse = None
    previous_formula = None
    for round_number in range(num_rounds):
        if model.num_insignificant_effects < 1:
            break
        else:
            y = model.formula.dependent_variable
            lse = model.least_significant_insignificant_effect
            xs = model.all_effects_but_least_significant
            progress_bar.show(
                amount=round_number,
                text=f'effects: {len(xs) + 1}, eliminating {lse} with p={lse.p}'
            )
            new_formula = Formula.from_variables(independent_variables=xs, dependent_variable=y)
            if previous_lse == lse:
                # debugging output for a stuck elimination loop
                print('eliminating', lse, '\n\n')
                print('is main effect insignificant?', model.is_main_effect_significant(lse), '\n\n')
                for interaction in model.significant_interaction_effects:
                    print(interaction, 'contains?', interaction.contains(lse))
                for interaction in model.insignificant_interaction_effects:
                    print(interaction, interaction.p)
                print(model.least_significant_insignificant_effect, lse)
                print('\n\ninsignificant effects\n', model.insignificant_effects)
                print(new_formula.display())
                for x in xs:
                    print(x, x.num_interactions, x.p)
                raise RuntimeError(f'repeating the elimination of {lse}')
            previous_formula = new_formula
            previous_lse = lse
            new_model = model.__class__(
                data=model.data,
                formula=new_formula,
                significance_level=model.significance_level,
                groups=model.groups,
                family=model.family,
                model_builder=model._model_builder,
                parent=model
            )
            if isinstance(new_model.fit, BrokenModel):
                break
            else:
                model = new_model
    progress_bar.show(
        amount=round_number + 1,
        text=f'effects: {len(model.formula.independent_variables)}'
    )
    return model
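
# Hypothetical usage sketch: backward elimination on a fitted regression.
# Assumes `regression_model` is an instance of the class owning `eliminate`
# above, built from data and a formula elsewhere.
def _example_eliminate(regression_model):
    reduced = regression_model.eliminate(num_rounds=None, echo=1)
    return reduced.formula.independent_variables
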
def get_content_and_reference(self):
    if self.n_jobs == 1:
        precursor_keys_to_contents = {p.key: p.content for p in self.precursors}
    else:
        def get_content(p):
            return p.content

        precursors = self.precursors
        schedule = self.get_update_schedule()
        progress_bar = ProgressBar(
            total=sum([len(schedule_round) for schedule_round in schedule]),
            echo=self.pensieve._echo
        )
        progress_amount = 0
        for schedule_round in schedule:
            progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
            self.pensieve.processor(delayed(get_content)(job) for job in schedule_round)
            progress_amount += len(schedule_round)
        if progress_amount > 0:
            progress_bar.show(amount=progress_amount, text=f'{self.key} updated!')
        contents = self.pensieve.processor(delayed(get_content)(p) for p in precursors)
        keys = [precursor.key for precursor in precursors]
        precursor_keys_to_contents = {key: content for key, content in zip(keys, contents)}

    if len(self.precursor_keys) == 0:
        new_reference = get_source(self._original_function)
        if new_reference == self._precursors_reference and not self.is_stale:
            new_content = self._content
        elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
            new_content = self.backup_content
        else:
            timer = Timer(start_now=True, unit='timedelta')
            new_content = self._function()
            timer.stop()
            self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)
    elif len(self.precursor_keys) == 1:
        precursor_content = list(precursor_keys_to_contents.values())[0]
        new_reference = (get_source(self._original_function), precursor_keys_to_contents)
        if new_reference == self._precursors_reference and not self.is_stale:
            new_content = self._content
        elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
            new_content = self.backup_content
        else:
            timer = Timer(start_now=True, unit='timedelta')
            new_content = self._function(precursor_content)
            timer.stop()
            self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)
    else:
        inputs = EvaluationInput(inputs=precursor_keys_to_contents)
        new_reference = (get_source(self._original_function), precursor_keys_to_contents)
        if new_reference == self._precursors_reference and not self.is_stale:
            new_content = self._content
        elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
            new_content = self.backup_content
        else:
            timer = Timer(start_now=True, unit='timedelta')
            new_content = self._function(inputs.originals)
            timer.stop()
            self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

    self._content_type = get_type(new_content)
    self._content_access_count += 1
    if self.backup_directory and new_reference != self.backup_precursors_reference:
        self.backup_content = new_content
        self.backup_precursors_reference = new_reference
    return new_content, new_reference
def validate(
        self, problem_type, evaluation_function=None, model=None, model_name=None,
        model_grid=None, num_threads=None, return_models=False, raise_error=False,
        main_metric=None, best_model_criteria=None, measure_influence=True,
        num_influence_points=400, echo=None
):
    """
    :param str problem_type: either 'regression' or 'classification'
    :param callable evaluation_function:
    :param model: a regressor or classifier
    :param str or NoneType model_name:
    :param ModelGrid or NoneType model_grid: a grid of models built from a parameter grid
    :param int num_threads: for parallel computing
    :param bool return_models: whether or not trained models should be returned
    :param bool raise_error:
    :param str main_metric: the metric to be used for choosing the best model
    :param str best_model_criteria: 'highest' means the model with the highest metric value should be chosen
    :param bool measure_influence: whether the influence of each feature on the best model should be measured
    :param int num_influence_points:
    :param int or bool or ProgressBar echo:
    :rtype: dict
    """
    echo = echo or self._echo
    num_threads = num_threads or self._num_threads

    if model is not None:
        try:
            model_class_name = model.__name__
        except AttributeError:
            model_class_name = None
    else:
        model_class_name = None

    if problem_type.lower().startswith('class'):
        model_name = model_name or model_class_name or 'classifier'
        main_metric = main_metric or 'f1_score'
        best_model_criteria = best_model_criteria or 'highest'
        evaluation_function = evaluation_function or evaluate_classification
    else:
        model_name = model_name or model_class_name or 'regressor'
        main_metric = main_metric or 'rmse'
        best_model_criteria = best_model_criteria or 'lowest'
        evaluation_function = evaluation_function or evaluate_regression

    if model is not None and model_grid is None:
        models = {model_name: model}
        parameters_data = None
        parameters_dictionary = None
    elif model is None and model_grid is not None:
        models = model_grid.models
        parameters_data = model_grid.parameter_table
        parameters_dictionary = model_grid.parameters
    else:
        raise ValueError('either a preset model should be given or a model grid')

    model_folds = [
        {'model_name': model_name, 'model': model, 'fold_num': fold_num + 1, 'fold': fold}
        for model_name, model in models.items()
        for fold_num, fold in enumerate(self.folds)
    ]
    shared_memory = {
        'progress_bar': ProgressBar(total=1 + len(models) * len(self.folds), echo=echo),
        'progress_amount': 0
    }

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        if num_threads == 1:
            result = [
                validate_fold(model_fold=model_fold, shared_memory=shared_memory, evaluation_function=evaluation_function)
                for model_fold in model_folds
            ]
        else:
            parallel = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
            result = parallel(
                delayed(validate_fold)(model_fold=model_fold, shared_memory=shared_memory, evaluation_function=evaluation_function)
                for model_fold in model_folds
            )

    training = []
    test = []
    trained_models = []
    feature_importances_list = []
    coefficients_list = []
    for record in result:
        model = record['model']
        training_evaluation = record['training_evaluation']
        test_evaluation = record['test_evaluation']
        training_evaluation['fold_num'] = record['fold_num']
        test_evaluation['fold_num'] = record['fold_num']
        training_evaluation['model_name'] = record['model_name']
        test_evaluation['model_name'] = record['model_name']
        training.append(training_evaluation)
        test.append(test_evaluation)
        feature_importances = get_feature_importances(
            model=model, columns=self.x_columns, model_name=record['model_name'],
            fold_num=record['fold_num'], raise_error=raise_error
        )
        if feature_importances is not None:
            feature_importances_list.append(feature_importances)
        coefficients = get_coefficients(
            model=model, columns=self.x_columns, model_name=record['model_name'],
            fold_num=record['fold_num'], raise_error=raise_error
        )
        if coefficients is not None:
            coefficients_list.append(coefficients)
        if return_models:
            trained_models.append(model)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        feature_importances = DataFrame.from_records(feature_importances_list)
        coefficients = DataFrame.from_records(coefficients_list)
        training = bring_to_front(data=DataFrame.from_records(training), columns=['model_name', 'fold_num'])
        test = bring_to_front(data=DataFrame.from_records(test), columns=['model_name', 'fold_num'])
        result = Dictionary({'training': training, 'test': test})

        if feature_importances.shape[1] > 0:
            result['feature_importances'] = bring_to_front(
                data=feature_importances, columns=['model_name', 'fold_num']
            )
            result['mean_feature_importances'] = result['feature_importances'].drop(
                columns='fold_num'
            ).groupby(['model_name']).mean().reset_index()

        if coefficients.shape[1] > 0:
            result['coefficients'] = bring_to_front(data=coefficients, columns=['model_name', 'fold_num'])
            result['mean_coefficients'] = result['coefficients'].drop(
                columns='fold_num'
            ).groupby(['model_name']).mean().reset_index()

        if return_models:
            result['models'] = trained_models

        shared_memory['progress_amount'] += 1
        shared_memory['progress_bar'].show(amount=shared_memory['progress_amount'], text='validation complete.')

        if main_metric is not None and main_metric in result['test'].columns:
            aggregated_training = training.drop(columns='fold_num').groupby('model_name').mean().reset_index()
            aggregated_test = test.drop(columns='fold_num').groupby('model_name').mean().reset_index()
            aggregated_result = aggregated_training.merge(
                right=aggregated_test, on='model_name', how='outer', suffixes=['_training', '_test']
            )
            if parameters_data is not None:
                aggregated_result = parameters_data.merge(
                    right=aggregated_result, on='model_name', how='outer', suffixes=['_parameter', '']
                )
            aggregated_result = aggregated_result.sort_values(
                f'{main_metric}_test', ascending=best_model_criteria != 'highest'
            )
            result['aggregate'] = aggregated_result

            best_model_name = aggregated_result.head(1)['model_name'].values[0]
            if parameters_dictionary is not None:
                result['best_model_parameters'] = parameters_dictionary[best_model_name]
            best_model = models[best_model_name]
            trained_best_model = deepcopy(best_model)
            trained_best_model.fit(X=self.validation.X, y=self.validation.y)
            best_model_performance = evaluation_function(
                trained_best_model.predict(self.holdout.X), self.holdout.y
            )
            result['best_model'] = best_model
            result['best_model_trained'] = trained_best_model
            result['best_model_performance'] = best_model_performance

            best_model_feature_importances = get_feature_importances(
                model=trained_best_model, columns=self.validation.x_columns,
                model_name=best_model_name, raise_error=raise_error
            )
            best_model_coefficients = get_coefficients(
                model=trained_best_model, columns=self.validation.x_columns,
                model_name=best_model_name, raise_error=raise_error
            )
            if best_model_feature_importances is not None:
                result['best_model_feature_importances'] = best_model_feature_importances
            if best_model_coefficients is not None:
                result['best_model_coefficients'] = best_model_coefficients

            if measure_influence:
                result['influence'] = self.get_influence(
                    model=trained_best_model, num_threads=num_threads,
                    num_points=num_influence_points, echo=echo
                )

            model = result['best_model_trained']
            transformer = self._transformer
            predictor = Predictor(
                model=model, transformer=transformer, model_is_fitted=True,
                transformer_is_fitted=True, x_columns=self.x_columns
            )
            result['predictor'] = predictor

    return result
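
# Hypothetical usage sketch: validating a single scikit-learn style classifier.
# Assumes `validator` is an instance of the class that owns `validate` above;
# the model name is made up. The keys accessed exist when a main metric is used.
def _example_validate(validator, model):
    results = validator.validate(
        problem_type='classification', model=model,
        model_name='my_classifier', num_threads=1, echo=1
    )
    return results['test'], results['best_model_performance']
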
def fuzzy_left_merge(
        left, right, left_on=None, right_on=None, on=None, suffixes=('_x', '_y'),
        score_name='match_ratio', na_ratio=0.5, two_na_ratio=0.75,
        similarity_function=None, weights=None, case_sensitivity=0.5,
        num_results=1, num_threads=-1, echo=1
):
    """
    :type left: DataFrame
    :type right: DataFrame
    :type left_on: list[str] or str or NoneType
    :type right_on: list[str] or str or NoneType
    :type on: list[str] or str or NoneType
    :type score_name: str
    :type na_ratio: float
    :type two_na_ratio: float
    :type similarity_function: callable or NoneType
    :type case_sensitivity: float
    :type num_results: int
    :type num_threads: int
    :type echo: int or bool or ProgressBar
    :rtype: DataFrame
    """
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')
    data1 = left.copy()
    data2 = right.copy()
    if on is None:
        on = data1.columns.intersection(data2.columns)
    if left_on is None:
        left_on = on
    if right_on is None:
        right_on = on

    missing_left = [col for col in left_on if col not in data1.columns]
    if len(missing_left) > 0:
        raise KeyError(f'missing columns on left: {missing_left}')
    missing_right = [col for col in right_on if col not in data2.columns]
    if len(missing_right) > 0:
        raise KeyError(f'missing columns on right: {missing_right}')

    data1['fuzzy_id'] = range(len(data1))
    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1, echo=echo,
            function=lambda row: _match_rows(
                row=row, right=data2, left_on=left_on, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity, score_name=score_name,
                num_results=num_results, similarity_function=similarity_function,
                weights=weights, num_threads=1, echo=echo - 1
            )
        )
    else:
        parallel = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)
        results = parallel(
            delayed(_match_rows)(
                row=row, right=data2, left_on=left_on, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity, score_name=score_name,
                num_results=num_results, similarity_function=similarity_function,
                weights=weights, num_threads=1, echo=echo - 1
            )
            for index, row in iterate(data1.iterrows(), progress_bar=progress_bar)
        )
        progress_bar.show(amount=len(data1) + 1)

    data2 = concat(results).reset_index(drop=True)
    return data1.merge(right=data2, on='fuzzy_id', how='left', suffixes=suffixes).drop(columns='fuzzy_id')
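
# Hypothetical usage sketch: fuzzy-matching names between two small tables.
# The data is made up; the match quality lands in the `match_ratio` column.
def _example_fuzzy_left_merge():
    import pandas as pd
    left = pd.DataFrame({'name': ['jon smith', 'mary jane']})
    right = pd.DataFrame({'name': ['john smith', 'maryjane'], 'age': [40, 30]})
    return fuzzy_left_merge(left=left, right=right, on=['name'], num_threads=1, echo=0)
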
def __init__(
        self, data, x_columns, y_column, id_columns=None, holdout_ratio=0.2,
        num_validation_splits=5, random_state=None, transformer=None,
        num_threads=-1, echo=1, cross_validation=None
):
    self._echo = echo
    self._num_threads = num_threads
    self._num_validation_splits = num_validation_splits
    self._random_state = random_state
    progress_bar = ProgressBar(total=num_validation_splits + 1, echo=self._echo)
    progress_amount = 0
    progress_bar.show(amount=progress_amount, text='preparing validation Xy')

    if isinstance(id_columns, str):
        id_columns = [id_columns]
    if cross_validation is None:
        cross_validation = get_cross_validation_by_group(
            data=data, id_columns=id_columns, num_splits=num_validation_splits,
            holdout_ratio=holdout_ratio, random_state=random_state
        )
    self._cross_validation = cross_validation
    super().__init__(
        data=data, x_columns=x_columns, y_column=y_column, id_columns=id_columns,
        training_rows=cross_validation['validation'], training_name='validation',
        test_rows=cross_validation['holdout'], test_name='holdout',
        transformer=deepcopy(transformer)
    )
    progress_amount += 1
    progress_bar.show(amount=progress_amount, text='preparing folds')

    def transform_training_test_xy(fold):
        return TransformedTrainingTestXy(
            data=data, x_columns=x_columns, y_column=y_column, id_columns=id_columns,
            training_rows=fold['training'], test_rows=fold['test'],
            transformer=deepcopy(transformer)
        )

    if self.num_threads == 1:
        self._folds = [
            transform_training_test_xy(fold=fold)
            for fold in iterate(cross_validation['folds'], text='preparing folds (single-threaded)')
        ]
    else:
        processor = Parallel(n_jobs=self.num_threads, backend='threading', require='sharedmem')
        self._folds = processor(
            delayed(transform_training_test_xy)(fold=fold)
            for fold in iterate(cross_validation['folds'], text='preparing folds (multi-threaded)')
        )
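
# Hypothetical usage sketch: constructing the cross-validated container above.
# The class name `ValidationContainer` and the column names are assumptions
# made purely for illustration.
def _example_validation_container(data):
    container = ValidationContainer(
        data=data, x_columns=['age', 'income'], y_column='churn',
        id_columns='customer_id', holdout_ratio=0.2,
        num_validation_splits=5, num_threads=1, echo=0
    )
    return container
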