def _find_best_matching_rows(
        strings, right, right_on, na_ratio, two_na_ratio, case_sensitivity, score_name, num_threads,
        similarity_function, weights, num_results, echo
):
    """
    Scores every row of ``right`` against ``strings`` and returns the highest scoring rows.

    The score of a row is whatever ``_get_similarity_between_strings_and_row`` returns for it.
    Scores are written to a new ``score_name`` column on a copy of ``right``; the argument itself
    is not modified.

    :param strings: forwarded unchanged to the scoring function
    :param right: table whose rows are scored
    :param right_on: forwarded unchanged to the scoring function
    :param na_ratio: forwarded unchanged to the scoring function
    :param two_na_ratio: forwarded unchanged to the scoring function
    :param case_sensitivity: forwarded unchanged to the scoring function
    :param score_name: name of the column that receives the scores
    :param num_threads: 1 scores the rows through ``ProgressBar.apply``, any other value
        scores them through a ``Parallel`` pool using the threading backend
    :param similarity_function: forwarded unchanged to the scoring function
    :param weights: forwarded unchanged to the scoring function
    :param num_results: number of rows kept from the top of the ranking
    :param echo: forwarded to the progress bar
    :rtype: DataFrame
    """
    scored = right.copy()

    # everything the scoring function needs apart from the row itself
    scoring_arguments = dict(
        strings=strings,
        right_on=right_on,
        na_ratio=na_ratio,
        two_na_ratio=two_na_ratio,
        case_sensitivity=case_sensitivity,
        similarity_function=similarity_function,
        weights=weights,
    )

    if num_threads == 1:
        def _score(row):
            return _get_similarity_between_strings_and_row(row=row, **scoring_arguments)

        scored[score_name] = ProgressBar.apply(data=scored, function=_score, echo=echo)
    else:
        pool = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        progress_bar = ProgressBar(total=len(scored) + 1, echo=echo)
        scored[score_name] = pool(
            delayed(_get_similarity_between_strings_and_row)(row=row, **scoring_arguments)
            for _, row in iterate(scored.iterrows(), progress_bar=progress_bar)
        )
        progress_bar.show(amount=len(scored) + 1)

    ranked = scored.sort_values(by=score_name, ascending=False)
    return ranked.iloc[0:num_results]
def zip_directory(path, zip_path, compression=ZIP_DEFLATED, echo=0):
    """
    Zips every file found (recursively) under a directory into one archive.

    Each file is stored under the path produced by joining the walked directory with the file
    name, exactly as ``os.walk`` reports it. If the archive itself lives inside ``path`` it is
    skipped, so a half-written archive is never added to itself.

    :param path: directory to walk
    :param zip_path: where the archive is written
    :param compression: a ``zipfile`` compression constant; a falsy value means ``ZIP_STORED``
    :param echo: verbosity passed to the progress bar; negative values are treated as 0
    :return: ``zip_path``, unchanged
    """
    echo = max(0, echo)
    progress_bar = ProgressBar(echo=echo, total=None)
    compression = compression or ZIP_STORED

    # Only a real filesystem location can collide with a walked file; a file-like object cannot.
    if isinstance(zip_path, (str, os.PathLike)):
        archive_real_path = os.path.realpath(zip_path)
    else:
        archive_real_path = None

    amount = 0
    with ZipFile(file=zip_path, mode='w', compression=compression) as zip_file:
        for root, _, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                # the archive is created before the walk starts, so the walk can find it
                if archive_real_path is not None and os.path.realpath(file_path) == archive_real_path:
                    continue
                zip_file.write(file_path)
                progress_bar.show(amount=amount, text=f'"{file}" zipped into {zip_path}')
                amount += 1
    progress_bar.show(amount=amount, text=f'{zip_path} complete!')
    return zip_path
def save(self, path, echo=None):
    """
    Writes the parameters, every memory and the list of memory keys into a directory.

    The directory is created through ``Path.make_dir`` and receives ``parameters.pensieve``,
    one entry per memory (named after its key) and ``memory_keys.pensieve``.

    :type path: str or Path
    :param echo: verbosity of the progress bar; ``None`` falls back to ``self._echo``
    :type echo: bool
    """
    if echo is None:
        echo = self._echo

    # two extra steps on top of the memories: the parameters and the key list
    progress_bar = ProgressBar(total=len(self.memories_dictionary) + 2, echo=echo)

    directory = Path(path=path)
    directory.make_dir()

    progress_bar.show(amount=0, text='saving parameters')
    (directory + 'parameters.pensieve').save(obj=self.parameters)

    memory_keys = []
    for step, (key, memory) in enumerate(self.memories_dictionary.items(), start=1):
        progress_bar.show(amount=step, text=f'saving "{key}" memory')
        memory.save(path=directory + key)
        memory_keys.append(key)

    progress_bar.show(amount=len(memory_keys) + 1, text='saving memory keys')
    (directory + 'memory_keys.pensieve').save(obj=memory_keys)
    progress_bar.show(amount=len(memory_keys) + 2)
def perturb_data(data, perturbation):
    """
    Builds one perturbed copy of ``data`` per (column, delta) pair and stacks them.

    :param data: table handed to ``perturb_numeric_column`` for every pair
    :param perturbation: mapping of column name to an iterable of deltas
    :return: the concatenation of all perturbed tables, in mapping order
    """
    # NOTE(review): `echo` is not a parameter here; it has to come from an enclosing or
    # module-level scope that is not visible in this block - confirm.
    progress_bar = ProgressBar(total=len(perturbation), echo=echo)

    perturbed_tables = []
    columns_done = 0
    for column, deltas in perturbation.items():
        progress_bar.show(amount=columns_done, text=column)
        perturbed_tables.extend(
            perturb_numeric_column(data=data, column=column, delta=delta)
            for delta in deltas
        )
        columns_done += 1

    combined = concat(perturbed_tables)
    progress_bar.show(amount=columns_done)
    return combined
def get_function_influence(function, data, x_columns, num_deltas=200, num_threads=1, echo=1):
    """
    Measures the influence of each x column with ``get_single_column_influence`` and stacks
    the per-column results, sorted by ``variable`` and ``x_delta``.

    :param callable function: function that gives a y for a given data
    :param pd.DataFrame data: data handed to every single-column measurement
    :param list[str] or str x_columns: one column name or a list of column names
    :param int num_deltas: forwarded to ``get_single_column_influence``
    :param int num_threads: 1 measures the columns one after another, any other value uses a
        ``Parallel`` pool with the threading backend
    :type echo: int
    :rtype: pd.DataFrame
    :raises TypeError: if ``x_columns`` is neither a string nor a list
    """
    if isinstance(x_columns, str):
        x_columns = [x_columns]
    if not isinstance(x_columns, list):
        raise TypeError(f'x_columns should be either a string or a list not a {type(x_columns)}')

    def _measure(job):
        # a job is (x_column, data, num_deltas, function)
        x_column, job_data, job_num_deltas, job_function = job
        with warnings.catch_warnings():
            # warnings raised while measuring are silenced on purpose
            warnings.simplefilter('ignore')
            return get_single_column_influence(
                function=job_function, data=job_data, x_column=x_column, num_deltas=job_num_deltas
            )

    jobs = [
        (x_column, data, num_deltas, function)
        for x_column in iterate(x_columns, echo=echo, text='preparing data ...')
    ]

    progress_bar = ProgressBar(total=len(x_columns) + 1, echo=echo)
    progress_bar.show(amount=len(x_columns), text='data prepared.')

    if num_threads == 1:
        dataframes = []
        for job in iterate(iterable=jobs, progress_bar=progress_bar, text='measuring influence ...'):
            dataframes.append(_measure(job))
    else:
        pool = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        dataframes = pool(
            delayed(_measure)(job)
            for job in iterate(iterable=jobs, progress_bar=progress_bar, text='measuring influence ...')
        )

    stacked = pd.concat(dataframes)
    result = stacked.sort_values(by=['variable', 'x_delta'])
    progress_bar.show(amount=len(dataframes) + 1, text='influence measured.')
    return result
def load(cls, path, echo=True):
    """
    Rebuilds an instance from a directory containing ``parameters.pensieve``,
    ``memory_keys.pensieve`` and one saved memory per key.

    Every saved parameter is restored onto the new instance as an attribute named
    ``_<parameter name>``; memories are loaded in the order of the saved key list.

    :param path: directory to read from
    :param echo: when truthy, progress is shown while the memories are loaded
    :return: the restored instance
    """
    directory = Path(path=path)

    parameters = (directory + 'parameters.pensieve').load()
    pensieve = cls(safe=parameters['safe'])
    for name, value in parameters.items():
        setattr(pensieve, f'_{name}', value)

    memory_keys = (directory + 'memory_keys.pensieve').load()
    progress_bar = ProgressBar(total=len(memory_keys))

    pensieve._memories_dictionary = {}
    num_loaded = 0
    for key in memory_keys:
        if echo:
            progress_bar.show(amount=num_loaded, text=f'loading "{key}" memory')
        pensieve._memories_dictionary[key] = Memory.load(path=directory + key, pensieve=pensieve)
        num_loaded += 1

    if echo:
        progress_bar.show(amount=num_loaded)
    return pensieve
def encode(self, data, drop_encoded=True, echo=0):
    """
    Appends dummy columns to a copy of ``data`` using the values stored in
    ``self._column_values``.

    For every stored column the values are optionally lower-cased, values outside the stored
    set are replaced with ``self._replacement``, and sparse dummies are appended. Any column
    listed in ``self._one_hot_columns`` that did not come out of this data is added and
    filled with 0. Finally, columns that are neither encoded columns, one-hot columns nor
    columns of the original ``data`` are dropped.

    :param data: table to encode; it is not modified
    :param drop_encoded: when True the encoded (source) columns are removed from the result
    :param echo: progress is shown when this is positive; negative values are treated as 0
    :return: the encoded copy of ``data``
    """
    echo = max(0, echo)
    result = data.copy()

    progress_bar = ProgressBar(total=len(self._column_values))
    progress_amount = 0

    for col_name, only_include in self._column_values.items():
        if echo:
            progress_bar.show(amount=progress_amount, text=f'DM creating dummies for {col_name}')
        progress_amount += 1

        temp_data = result[[col_name]].copy()
        if self._lowercase:
            temp_data[col_name] = temp_data[col_name].str.lower()
        # values outside the stored set collapse into the replacement value
        temp_data[col_name] = np.where(
            temp_data[col_name].isin(only_include), temp_data[col_name], self._replacement
        )
        dummies = pd.get_dummies(
            data=temp_data[[col_name]], prefix=col_name, prefix_sep='_',
            dummy_na=self._encode_na, sparse=True
        )
        result = pd.concat([result, dummies], axis=1)

    # one-hot columns that this data did not produce still have to exist
    for col_name in self._one_hot_columns:
        if col_name not in result.columns:
            result[col_name] = 0

    if echo:
        progress_bar.show(
            amount=progress_amount, text=f'DM created dummies for {self._encoded_columns}'
        )

    if drop_encoded:
        result = result.drop(columns=self.encoded_columns)

    # Built once as a set: the list used to be rebuilt and scanned for every result column.
    expected_columns = set(self.encoded_columns) | set(self.one_hot_columns)
    extra_columns = [
        column for column in result.columns
        if column not in expected_columns and column not in data.columns
    ]
    if extra_columns:
        result = result.drop(columns=extra_columns)

    return result
def simulate(self, echo=1):
    """
    Measures the influence of every (column, perturbation) pair in ``self._influencers``
    and returns the measurements as a table.

    Each measurement comes from ``self.get_single_influence`` and is tagged with the column
    and perturbation it belongs to. With ``self._num_threads == 1`` the pairs are processed
    one after another; otherwise they are handed to ``self.parallel_process`` and each job
    receives its own deep copy of ``self._function``.

    :type echo: int
    :rtype: pd.DataFrame
    """
    def _get_influence(column, perturbation, function):
        record = self.get_single_influence(
            column=column, perturbation=perturbation, function=function
        )
        record['column'] = column
        record['perturbation'] = perturbation
        return record

    # one step per influencer plus two closing steps
    total = 2 + len(self._influencers)
    progress_bar = ProgressBar(total=total, echo=echo)

    if self._num_threads == 1:
        records = []
        tracked = iterate(
            self._influencers.items(), progress_bar=progress_bar, text='measuring influence ...'
        )
        for column, influences in tracked:
            for perturbation in influences:
                records.append(
                    _get_influence(column=column, perturbation=perturbation, function=self._function)
                )
    else:
        def _jobs(tracked):
            for column, influences in tracked:
                for perturbation in influences:
                    yield delayed(_get_influence)(
                        column=column, perturbation=perturbation, function=deepcopy(self._function)
                    )

        records = self.parallel_process(_jobs(iterate(
            self._influencers.items(), progress_bar=progress_bar,
            text='measuring influence in parallel ...'
        )))

    # the total is written back before the closing steps are reported
    progress_bar._total = total
    progress_bar.show(amount=total - 1)
    result_data = pd.DataFrame.from_records(records)
    progress_bar.show(amount=total)
    return result_data
def evaluate(self, keys=None, output=False):
    """
    Evaluates several memories, through ``self.processor`` unless ``self._n_jobs`` is 1.

    :param keys: keys of the memories to evaluate; ``None`` means every memory and a single
        string is treated as a one-element list
    :type keys: list[str] or NoneType or str
    :param output: when True the contents are returned in the order of ``keys``
    :type output: bool
    :rtype: list or NoneType
    """
    if keys is None:
        keys = list(self.memories_dictionary.keys())
    elif isinstance(keys, str):
        keys = [keys]

    if self._n_jobs == 1:
        if output:
            return [self[key] for key in keys]
        for key in keys:
            self.memories_dictionary[key].evaluate()
        return None

    def _read_content(memory):
        return memory.content

    memories = [self.memories_dictionary[key] for key in keys]
    schedule = self.get_update_schedule(keys=keys)
    progress_bar = ProgressBar(
        total=sum(len(schedule_round) for schedule_round in schedule),
        echo=self._echo
    )

    # rounds run one after another; the jobs inside a round go to the processor together
    num_updated = 0
    for schedule_round in schedule:
        progress_bar.show(amount=num_updated, text=f'updating {len(schedule_round)} memories')
        self.processor(delayed(_read_content)(job) for job in schedule_round)
        num_updated += len(schedule_round)

    if num_updated > 0:
        # NOTE(review): `self.key` is read on the object that owns `memories_dictionary`;
        # nothing in this block shows that it has a `key` attribute - confirm.
        progress_bar.show(amount=num_updated, text=f'{self.key} updated!')

    contents = self.processor(delayed(_read_content)(memory) for memory in memories)
    if output:
        return list(contents)
    return None
def train(self, data, echo=0):
    """
    Learns which values of each eligible column get a dummy column and returns ``data``
    with those dummy columns appended.

    Eligible columns are those that are not bool, numeric or datetime, narrowed by
    ``self._include`` / ``self._exclude`` when those are set. When ``self._top`` is set only
    the ``self._top`` most frequent values of a column are kept and every other value is
    replaced with ``self._replacement``. Dummy columns holding a single distinct value are
    discarded.

    Updates ``self._encoded_columns``, ``self._column_values`` and ``self._one_hot_columns``.

    :param data: training table; it is not modified
    :param echo: progress is shown when this is positive; negative values are treated as 0
    :return: a copy of ``data`` with the dummy columns appended
    """
    echo = max(0, echo)
    result = data.copy()
    one_hot_columns = []

    non_numeric_cols = data.select_dtypes(
        exclude=['bool', 'number', 'datetime64', 'datetime']
    ).columns
    if self._include is not None:
        non_numeric_cols = [col for col in non_numeric_cols if col in self._include]
    if self._exclude is not None:
        non_numeric_cols = [col for col in non_numeric_cols if col not in self._exclude]

    progress_bar = ProgressBar(total=len(non_numeric_cols))
    progress_amount = 0
    self._encoded_columns = []

    for col_name in non_numeric_cols:
        if echo:
            progress_bar.show(amount=progress_amount, text=f'DM training dummies for {col_name}')
        progress_amount += 1
        try:
            temp_data = data[[col_name]].copy()
            if self._lowercase:
                temp_data[col_name] = temp_data[col_name].str.lower()

            # value_counts() orders values from most to least frequent and skips missing
            # values. Ranking by frequency (not by the values themselves) is what "top"
            # means, and no helper column is needed that could clash with a real column.
            counts = temp_data[col_name].value_counts()

            if self._top is not None:
                only_include = set(counts.head(self._top).index)
                temp_data[col_name] = np.where(
                    temp_data[col_name].isin(only_include), temp_data[col_name], self._replacement
                )
            else:
                only_include = set(counts.index)

            dummies = pd.get_dummies(
                data=temp_data[[col_name]], prefix=col_name, prefix_sep='_',
                dummy_na=self._encode_na, sparse=False
            )
            # a dummy with a single distinct value carries no information
            dummies = dummies[[col for col in dummies if not dummies[col].nunique() == 1]]

            result = pd.concat([result, dummies], axis=1)
            one_hot_columns += list(dummies.columns)
            self._column_values[col_name] = only_include
            self._encoded_columns.append(col_name)
        except Exception:
            # name the offending column, then let the original error propagate untouched
            print(f'exception being raised for column: {col_name}')
            raise

    self._one_hot_columns = one_hot_columns
    if echo:
        progress_bar.show(
            amount=progress_amount, text=f'DM trained dummies for {self._encoded_columns}'
        )
    return result
def __init__(
        self, data, x_columns, y_column, id_columns=None, holdout_ratio=0.2, num_validation_splits=5,
        random_state=None, transformer=None, num_threads=-1, echo=1, cross_validation=None
):
    """
    Prepares the validation/holdout split and one transformed training/test pair per fold.

    :param data: full data set
    :param x_columns: forwarded to the parent initialiser and to every fold
    :param y_column: forwarded to the parent initialiser and to every fold
    :param id_columns: a single column name is wrapped into a list
    :param holdout_ratio: used only when ``cross_validation`` is not given
    :param num_validation_splits: number of splits requested from
        ``get_cross_validation_by_group`` when ``cross_validation`` is not given
    :param random_state: used only when ``cross_validation`` is not given
    :param transformer: every consumer (the parent and each fold) receives its own deep copy
    :param num_threads: 1 prepares the folds one after another, any other value uses a
        ``Parallel`` pool with the threading backend
    :param echo: verbosity of the progress bar
    :param cross_validation: optional ready-made split; the keys read here are
        ``'validation'``, ``'holdout'`` and ``'folds'`` (each fold provides ``'training'``
        and ``'test'``)
    """
    self._echo = echo
    self._num_threads = num_threads
    self._num_validation_splits = num_validation_splits
    self._random_state = random_state

    progress_bar = ProgressBar(total=num_validation_splits + 1, echo=self._echo)
    progress_bar.show(amount=0, text='preparing validation Xy')

    if isinstance(id_columns, str):
        id_columns = [id_columns]

    if cross_validation is None:
        cross_validation = get_cross_validation_by_group(
            data=data, id_columns=id_columns, num_splits=num_validation_splits,
            holdout_ratio=holdout_ratio, random_state=random_state
        )
    self._cross_validation = cross_validation

    super().__init__(
        data=data, x_columns=x_columns, y_column=y_column, id_columns=id_columns,
        training_rows=cross_validation['validation'], training_name='validation',
        test_rows=cross_validation['holdout'], test_name='holdout',
        transformer=deepcopy(transformer)
    )
    progress_bar.show(amount=1, text='preparing folds')

    def _prepare_fold(fold):
        return TransformedTrainingTestXy(
            data=data, x_columns=x_columns, y_column=y_column, id_columns=id_columns,
            training_rows=fold['training'], test_rows=fold['test'],
            transformer=deepcopy(transformer)
        )

    if self.num_threads == 1:
        folds = []
        for fold in iterate(cross_validation['folds'], text='preparing folds (single-threaded)'):
            folds.append(_prepare_fold(fold=fold))
        self._folds = folds
    else:
        processor = Parallel(n_jobs=self.num_threads, backend='threading', require='sharedmem')
        self._folds = processor(
            delayed(_prepare_fold)(fold=fold)
            for fold in iterate(cross_validation['folds'], text='preparing folds (multi-threaded)')
        )
def get_content_and_reference(self):
    """
    Produces the content of this memory together with the reference that identifies the
    inputs it was produced from.

    The precursor contents are collected first (through the pensieve's processor unless
    ``self.n_jobs`` is 1). The reference is the source of the original function, paired with
    the precursor contents when there are precursors. The content is then taken, in this
    order of preference, from:

    1. the current content, when the reference is unchanged and the memory is not stale;
    2. the backup, when a backup directory is set, the backup reference matches and the
       backup content exists;
    3. a fresh, timed call of the function.

    The function receives no argument with no precursors, the single precursor's content
    with one precursor, and ``EvaluationInput(...).originals`` otherwise.

    :return: tuple of (new content, new reference)
    """
    if self.n_jobs == 1:
        precursor_keys_to_contents = {p.key: p.content for p in self.precursors}
    else:
        def _read_content(memory):
            return memory.content

        precursors = self.precursors
        schedule = self.get_update_schedule()
        progress_bar = ProgressBar(
            total=sum(len(schedule_round) for schedule_round in schedule),
            echo=self.pensieve._echo
        )

        num_updated = 0
        for schedule_round in schedule:
            progress_bar.show(amount=num_updated, text=f'updating {len(schedule_round)} memories')
            self.pensieve.processor(delayed(_read_content)(job) for job in schedule_round)
            num_updated += len(schedule_round)
        if num_updated > 0:
            progress_bar.show(amount=num_updated, text=f'{self.key} updated!')

        contents = self.pensieve.processor(delayed(_read_content)(p) for p in precursors)
        keys = [precursor.key for precursor in precursors]
        precursor_keys_to_contents = dict(zip(keys, contents))

    # Only the reference and the way the function is called depend on the number of
    # precursors; the reuse / backup / recompute decision below is shared.
    num_precursors = len(self.precursor_keys)
    if num_precursors == 0:
        new_reference = get_source(self._original_function)

        def _compute():
            return self._function()
    elif num_precursors == 1:
        precursor_content = list(precursor_keys_to_contents.values())[0]
        new_reference = (get_source(self._original_function), precursor_keys_to_contents)

        def _compute():
            return self._function(precursor_content)
    else:
        inputs = EvaluationInput(inputs=precursor_keys_to_contents)
        new_reference = (get_source(self._original_function), precursor_keys_to_contents)

        def _compute():
            return self._function(inputs.originals)

    if new_reference == self._precursors_reference and not self.is_stale:
        new_content = self._content
    elif (
            self.backup_directory
            and new_reference == self.backup_precursors_reference
            and self.backup_content_exists()
    ):
        new_content = self.backup_content
    else:
        timer = Timer(start_now=True, unit='timedelta')
        new_content = _compute()
        timer.stop()
        self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

    self._content_type = get_type(new_content)
    self._content_access_count += 1

    # the backup is refreshed only when it was made from different inputs
    if self.backup_directory and new_reference != self.backup_precursors_reference:
        self.backup_content = new_content
        self.backup_precursors_reference = new_reference

    return new_content, new_reference
def eliminate(self, num_rounds=None, echo=1):
    """
    Repeatedly refits the model without its least significant insignificant effect.

    Elimination stops when the round limit is reached, when the current model has no
    insignificant effect left, or when the refitted model turns out to be broken (in which
    case the last working model is kept). A model whose own fit is broken is returned as is.

    :param num_rounds: maximum number of rounds; ``None`` (or 0) means ``len(self.variables)``
    :type num_rounds: int or NoneType
    :param echo: verbosity of the progress bar
    :type echo: bool or int
    :return: the last successfully fitted model
    :rtype: Regression
    :raises RuntimeError: if the same effect is selected for elimination twice in a row;
        diagnostic details are printed before the error is raised
    """
    model = self
    if isinstance(model.fit, BrokenModel):
        return self

    num_rounds = num_rounds or len(self.variables)
    progress_bar = ProgressBar(total=num_rounds, echo=echo)

    # named so that the builtin `round` is not shadowed; also read after the loop
    round_index = 0
    previous_lse = None

    for round_index in range(num_rounds):
        if model.num_insignificant_effects < 1:
            break

        dependent_variable = model.formula.dependent_variable
        lse = model.least_significant_insignificant_effect
        remaining_effects = model.all_effects_but_least_significant
        progress_bar.show(
            amount=round_index,
            text=f'effects: {len(remaining_effects)+1}, eliminating {lse} with p={lse.p}'
        )
        new_formula = Formula.from_variables(
            independent_variables=remaining_effects, dependent_variable=dependent_variable
        )

        if previous_lse == lse:
            # picking the same effect again means the previous round did not remove it
            print('eliminating', lse, '\n\n')
            print('is main effect insignificant?', model.is_main_effect_significant(lse), '\n\n')
            for interaction in model.significant_interaction_effects:
                print(interaction, 'contains?', interaction.contains(lse))
            for interaction in model.insignificant_interaction_effects:
                print(interaction, interaction.p)
            print(model.least_significant_insignificant_effect, lse)
            print('\n\ninsignificant effects\n', model.insignificant_effects)
            print(new_formula.display())
            for effect in remaining_effects:
                print(effect, effect.num_interactions, effect.p)
            raise RuntimeError(f'repeating the elimination of {lse}')

        previous_lse = lse
        new_model = model.__class__(
            data=model.data, formula=new_formula,
            significance_level=model.significance_level,
            groups=model.groups, family=model.family,
            model_builder=model._model_builder, parent=model
        )
        if isinstance(new_model.fit, BrokenModel):
            break
        model = new_model

    progress_bar.show(
        amount=round_index + 1,
        text=f'effects: {len(model.formula.independent_variables)}'
    )
    return model
def fuzzy_left_merge(
        left, right, left_on=None, right_on=None, on=None, suffixes=('_x', '_y'),
        score_name='match_ratio', na_ratio=0.5, two_na_ratio=0.75, similarity_function=None,
        weights=None, case_sensitivity=0.5, num_results=1, num_threads=-1, echo=1
):
    """
    Left-merges two tables on approximately matching values.

    Every row of ``left`` is handed to ``_match_rows`` together with ``right``; the matches
    of all rows are stacked and joined back to ``left`` through a temporary ``fuzzy_id``
    column, which is dropped from the result.

    :type left: DataFrame
    :type right: DataFrame
    :param left_on: column(s) of ``left`` to match on; defaults to ``on``
    :type left_on: list[str] or str or NoneType
    :param right_on: column(s) of ``right`` to match on; defaults to ``on``
    :type right_on: list[str] or str or NoneType
    :param on: column(s) present in both tables; defaults to the columns they have in common
    :type on: list[str] or str or NoneType
    :param suffixes: forwarded to ``DataFrame.merge``
    :param score_name: forwarded to ``_match_rows``; must not be a column of either table
    :param na_ratio: forwarded to ``_match_rows``
    :param two_na_ratio: forwarded to ``_match_rows``
    :type similarity_function: callable
    :param weights: forwarded to ``_match_rows``
    :type case_sensitivity: float
    :type num_results: int
    :param num_threads: 1 processes the rows through ``ProgressBar.apply``, any other value
        uses a ``Parallel`` pool with the threading backend
    :type num_threads: int
    :param echo: verbosity; ``_match_rows`` receives ``echo - 1``
    :type echo: int or bool or ProgressBar
    :rtype: DataFrame
    :raises ValueError: if ``score_name`` is already a column of ``left`` or ``right``
    :raises KeyError: if a requested column is missing from the corresponding table
    """
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')

    data1 = left.copy()
    data2 = right.copy()

    if on is None:
        # Index.intersection is the explicit set operation; `&` between two Index objects
        # is an element-wise logical operation in current pandas.
        on = data1.columns.intersection(data2.columns)
    if left_on is None:
        left_on = on
    if right_on is None:
        right_on = on

    # A single column name is allowed; without wrapping it the checks below would walk
    # through the name character by character.
    if isinstance(left_on, str):
        left_on = [left_on]
    if isinstance(right_on, str):
        right_on = [right_on]

    missing_left = [col for col in left_on if col not in data1.columns]
    if missing_left:
        raise KeyError(f'missing columns on left: {missing_left}')

    missing_right = [col for col in right_on if col not in data2.columns]
    if missing_right:
        raise KeyError(f'missing columns on right: {missing_right}')

    # temporary key that ties each set of matches back to the left row it came from
    data1['fuzzy_id'] = range(len(data1))

    match_arguments = dict(
        right=data2,
        left_on=left_on,
        right_on=right_on,
        na_ratio=na_ratio,
        two_na_ratio=two_na_ratio,
        case_sensitivity=case_sensitivity,
        score_name=score_name,
        num_results=num_results,
        similarity_function=similarity_function,
        weights=weights,
        num_threads=1,
        echo=echo - 1,
    )

    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1, echo=echo,
            function=lambda row: _match_rows(row=row, **match_arguments)
        )
    else:
        parallel = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)
        results = parallel(
            delayed(_match_rows)(row=row, **match_arguments)
            for _, row in iterate(data1.iterrows(), progress_bar=progress_bar)
        )
        progress_bar.show(amount=len(data1) + 1)

    data2 = concat(results).reset_index(drop=True)
    merged = data1.merge(right=data2, on='fuzzy_id', how='left', suffixes=suffixes)
    return merged.drop(columns='fuzzy_id')