Example 1
	def save(self, path, echo=None):
		"""
		:type path: str or Path
		:type echo: bool
		"""
		if echo is None:
			echo = self._echo

		progress_bar = ProgressBar(total=len(self.memories_dictionary)+2, echo=echo)
		progress_amount = 0

		path = Path(path=path)
		path.make_dir()

		progress_bar.show(amount=progress_amount, text='saving parameters')
		(path + 'parameters.pensieve').save(obj=self.parameters)
		progress_amount += 1

		memory_keys = []

		for key, memory in self.memories_dictionary.items():
			progress_bar.show(amount=progress_amount, text=f'saving "{key}" memory')
			memory.save(path=path + key)
			progress_amount += 1
			memory_keys.append(key)

	progress_bar.show(amount=progress_amount, text='saving memory keys')
		(path + 'memory_keys.pensieve').save(obj=memory_keys)
		progress_amount += 1

		progress_bar.show(amount=progress_amount)
Example 2
def _find_best_matching_rows(strings, right, right_on, na_ratio, two_na_ratio,
                             case_sensitivity, score_name, num_threads,
                             similarity_function, weights, num_results, echo):
    """
	:param strings:
	:param right:
	:param right_on:
	:param na_ratio:
	:param two_na_ratio:
	:param case_sensitivity:
	:param score_name:
	:param num_threads:
	:param num_results:
	:param echo:
	:rtype: DataFrame
	"""
    right = right.copy()

    if num_threads == 1:

        right[score_name] = ProgressBar.apply(
            data=right,
            function=lambda row: _get_similarity_between_strings_and_row(
                strings=strings,
                row=row,
                right_on=right_on,
                na_ratio=na_ratio,
                two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function,
                weights=weights),
            echo=echo)

    else:

        parallel = Parallel(n_jobs=num_threads,
                            backend='threading',
                            require='sharedmem')
        progress_bar = ProgressBar(total=len(right) + 1, echo=echo)
        right[score_name] = parallel(
            delayed(_get_similarity_between_strings_and_row)(
                strings=strings,
                row=row,
                right_on=right_on,
                na_ratio=na_ratio,
                two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function,
                weights=weights)
            for index, row in iterate(right.iterrows(),
                                      progress_bar=progress_bar))
        progress_bar.show(amount=len(right) + 1)

    right = right.sort_values(by=score_name, ascending=False)
    return right.iloc[0:num_results]
Example 3
def get_similarities(string,
                     strings,
                     method='jaro_winkler',
                     similarity_function=None,
                     case_sensitivity=1.0,
                     first_char_weight=0.0,
                     first_word_weight=0.0,
                     echo=0):
    """
	:type string: str
	:type strings: list[str]
	:type treat_as_sentence: bool
	:param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
	:type case_sensitivity: float
	:type first_char_weight: float
	:type first_word_weight: float
	:rtype:
	"""
    echo = max(0, echo)
    string = str(string)
    text = string + ' ? '
    return list(
        ProgressBar.map(function=lambda x: get_similarity(
            s1=string,
            s2=x,
            method=method,
            similarity_function=similarity_function,
            first_char_weight=first_char_weight,
            case_sensitivity=case_sensitivity,
            first_word_weight=first_word_weight),
                        iterable=strings,
                        iterable_text=strings,
                        text=text,
                        echo=echo))
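
A hedged usage sketch of get_similarities; the candidate strings below are invented, and only the signature above is assumed:

# compare one string against several candidates; returns one similarity per candidate
scores = get_similarities(
    string='colour',
    strings=['color', 'collar', 'cooler'],
    method='jaro_winkler',
    echo=0)
# scores is a list of three floats, one per candidate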
Example 4
def find_most_similar(strings,
                      candidates,
                      candidate_ids=None,
                      string_ids=None,
                      method='jaro_winkler',
                      similarity_function=None,
                      case_sensitivity=1.0,
                      first_char_weight=0.0,
                      first_word_weight=0.0,
                      num_results=1,
                      echo=0):
    """
	:type strings: str or list[str]
	:type candidates: list[str]
	:type candidate_ids: list or NoneType
	:type string_ids: list or NoneType
	:param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
	:type treat_as_sentence: bool
	:type first_char_weight: float
	:type first_word_weight: float
	:type case_sensitivity: float
	:rtype: pd.DataFrame
	"""
    echo = max(0, echo)
    if isinstance(strings, str):
        strings = [strings]
    if string_ids is None:
        string_ids = range(len(strings))

    return pd.concat(
        list(
            ProgressBar.map(
                function=lambda x: find_most_similar_for_one_string(
                    string=x[0],
                    string_id=x[1],
                    candidates=candidates,
                    candidate_ids=candidate_ids,
                    method=method,
                    similarity_function=similarity_function,
                    case_sensitivity=case_sensitivity,
                    first_char_weight=first_char_weight,
                    first_word_weight=first_word_weight,
                    num_results=num_results,
                    echo=echo - 1),
                iterable=list(zip(strings, string_ids)),
                iterable_text=strings,
                echo=echo))).reset_index(drop=True)[[
                    'string_id', 'string', 'candidate_id', 'candidate',
                    'similarity_rank', 'similarity'
                ]]
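
A hedged usage sketch of find_most_similar; the names are invented and pandas is assumed to be imported as pd, as in the function body:

matches = find_most_similar(
    strings=['jon smith'],
    candidates=['John Smith', 'Jane Smyth', 'J. Smith'],
    num_results=2,
    echo=0)
# matches is a pd.DataFrame with columns:
# string_id, string, candidate_id, candidate, similarity_rank, similarity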
Example 5
File: zip.py Project: idin/disk
import os
from zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED

# ProgressBar is this project's progress utility; its import is elided here, as in the other examples
def zip_directory(path, zip_path, compression=ZIP_DEFLATED, echo=0):
	"""
	zips every file under path into a single archive at zip_path
	:type path: str
	:type zip_path: str
	:type echo: int
	:rtype: str
	"""
	echo = max(0, echo)
	progress_bar = ProgressBar(echo=echo, total=None)

	compression = compression or ZIP_STORED
	amount = 0
	with ZipFile(file=zip_path, mode='w', compression=compression) as zip_file:
		for root, dirs, files in os.walk(path):
			for file in files:
				# files are archived under their full walked paths (no arcname given)
				zip_file.write(os.path.join(root, file))
				progress_bar.show(amount=amount, text=f'"{file}" zipped into {zip_path}')
				amount += 1
	progress_bar.show(amount=amount, text=f'{zip_path} complete!')
	return zip_path
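
A minimal usage sketch; the paths are invented:

# zips every file under 'reports/' into 'reports.zip', echoing progress
zip_directory(path='reports', zip_path='reports.zip', echo=1)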
Example 6
def perturb_data(data, perturbation, echo=0):
    list_of_data = []
    progress_bar = ProgressBar(total=len(perturbation), echo=echo)
    progress = 0
    for column, perturbation_list in perturbation.items():
        progress_bar.show(amount=progress, text=column)
        for delta in perturbation_list:
            list_of_data.append(
                perturb_numeric_column(data=data,
                                       column=column,
                                       delta=delta))
        progress += 1
    data = concat(list_of_data)
    progress_bar.show(amount=progress)
    return data
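
A hedged usage sketch; pandas as pd, the toy frame, and the deltas are invented, while perturb_numeric_column and concat come from the surrounding module:

import pandas as pd

df = pd.DataFrame({'age': [30, 40], 'income': [50.0, 60.0]})
# one perturbed copy of df per delta, stacked into a single frame
perturbed = perturb_data(data=df, perturbation={'age': [-1, 1], 'income': [-5.0, 5.0]}, echo=0)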
Example 7
def get_function_influence(function, data, x_columns, num_deltas=200, num_threads=1, echo=1):
	"""
	:param callable function: function that gives a y for a given data
	:param pd.DataFrame data:
	:param list[str] or str x_columns:
	:param int num_deltas:
	:param int num_threads:
	:type echo: int
	:rtype: pd.DataFrame
	"""
	if isinstance(x_columns, str):
		x_columns = [x_columns]
	elif not isinstance(x_columns, list):
		raise TypeError(f'x_columns should be either a string or a list, not {type(x_columns)}')

	def get_influence_data(the_tuple):
		with warnings.catch_warnings():
			warnings.simplefilter('ignore')
			x_column, this_data, this_num_deltas, this_function = the_tuple
			record = get_single_column_influence(
				function=this_function, data=this_data, x_column=x_column, num_deltas=this_num_deltas
			)
		return record

	tuples = [
		(x_column, data, num_deltas, function)
		for x_column in iterate(x_columns, echo=echo, text='preparing data ...')
	]

	progress_bar = ProgressBar(total=len(x_columns) + 1, echo=echo)
	progress_bar.show(amount=len(x_columns), text='data prepared.')

	if num_threads == 1:
		dataframes = [
			get_influence_data(t)
			for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
		]
	else:
		dataframes = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')(
			delayed(get_influence_data)(t)
			for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
		)

	result = pd.concat(dataframes).sort_values(by=['variable', 'x_delta'])
	progress_bar.show(amount=len(dataframes) + 1, text='influence measured.')
	return result
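
A hedged usage sketch with a toy model function; the data and the lambda are invented:

import pandas as pd

df = pd.DataFrame({'x1': range(10), 'x2': range(10, 20)})
influence = get_function_influence(
    function=lambda data: data['x1'] * 2 + data['x2'],
    data=df,
    x_columns=['x1', 'x2'],
    num_deltas=50,
    num_threads=1,
    echo=0)
# influence is a pd.DataFrame sorted by ['variable', 'x_delta']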
Example 8
    def simulate(self, echo=1):
        """
		:type echo: int
		:rtype: list[dict]
		"""
        def _get_influence(column, perturbation, function):
            influence_result = self.get_single_influence(
                column=column, perturbation=perturbation, function=function)
            influence_result['column'] = column
            influence_result['perturbation'] = perturbation
            return influence_result

        progress_bar = ProgressBar(total=2 + len(self._influencers), echo=echo)

        if self._num_threads == 1:
            result = [
                _get_influence(column=column,
                               perturbation=perturbation,
                               function=self._function) for column, influences
                in iterate(self._influencers.items(),
                           progress_bar=progress_bar,
                           text='measuring influence ...')
                for perturbation in influences
            ]
        else:
            result = self.parallel_process(
                delayed(_get_influence)(column=column,
                                        perturbation=perturbation,
                                        function=deepcopy(self._function))
                for column, influences in iterate(
                    self._influencers.items(),
                    progress_bar=progress_bar,
                    text='measuring influence in parallel ...')
                for perturbation in influences)

        progress_bar.show(amount=1 + len(self._influencers))

        result_data = pd.DataFrame.from_records(result)
        progress_bar.show(amount=2 + len(self._influencers))
        return result_data
Example 9
	@classmethod
	def load(cls, path, echo=True):
		"""
		:type path: str or Path
		:type echo: bool
		:rtype: Pensieve
		"""
		path = Path(path=path)
		parameters = (path + 'parameters.pensieve').load()
		pensieve = cls(safe=parameters['safe'])
		for name, value in parameters.items():
			setattr(pensieve, f'_{name}', value)
		memory_keys = (path + 'memory_keys.pensieve').load()
		progress_bar = ProgressBar(total=len(memory_keys))
		progress_amount = 0
		pensieve._memories_dictionary = {}
		for key in memory_keys:
			if echo:
				progress_bar.show(amount=progress_amount, text=f'loading "{key}" memory')
			memory = Memory.load(path=path + key, pensieve=pensieve)
			pensieve._memories_dictionary[key] = memory
			progress_amount += 1
		if echo:
			progress_bar.show(amount=progress_amount)
		return pensieve
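
Together with Example 1, save and load round-trip a pensieve through a directory. A hedged sketch; the path is invented and Pensieve is the class these methods belong to:

pensieve.save(path='backups/my_pensieve', echo=True)
restored = Pensieve.load(path='backups/my_pensieve', echo=True)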
Example 10
	def evaluate(self, keys=None, output=False):
		"""
		evaluates multiple memories, in parallel if n_jobs != 1
		:type keys: list[str] or NoneType or str
		:type output: bool
		:rtype: list or NoneType
		"""
		if keys is None:
			keys = list(self.memories_dictionary.keys())
		elif isinstance(keys, str):
			keys = [keys]

		if self._n_jobs == 1:
			if output:
				return [self[key] for key in keys]
			else:
				for key in keys:
					self.memories_dictionary[key].evaluate()
		else:
			def get_content(p):
				return p.content

			memories = [self.memories_dictionary[key] for key in keys]
			schedule = self.get_update_schedule(keys=keys)

			progress_bar = ProgressBar(
				total=sum([len(schedule_round) for schedule_round in schedule]),
				echo=self._echo
			)

			progress_amount = 0
			for schedule_round in schedule:
				progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
				self.processor(delayed(get_content)(job) for job in schedule_round)
				progress_amount += len(schedule_round)
			if progress_amount > 0:
				progress_bar.show(amount=progress_amount, text='memories updated!')

			if output:
				contents = self.processor(delayed(get_content)(p) for p in memories)
				return list(contents)
Example 11
    def encode(self, data, drop_encoded=True, echo=0):
        echo = max(0, echo)
        result = data.copy()
        progress_bar = ProgressBar(total=len(self._column_values))
        progress_amount = 0
        for col_name, only_include in self._column_values.items():
            if echo:
                progress_bar.show(amount=progress_amount,
                                  text=f'DM creating dummies for {col_name}')
            progress_amount += 1
            temp_data = result[[col_name]].copy()
            if self._lowercase:
                temp_data[col_name] = temp_data[col_name].str.lower()
            temp_data[col_name] = np.where(
                temp_data[col_name].isin(only_include), temp_data[col_name],
                self._replacement)
            dummies = pd.get_dummies(data=temp_data[[col_name]],
                                     prefix=col_name,
                                     prefix_sep='_',
                                     dummy_na=self._encode_na,
                                     sparse=True)
            result = pd.concat([result, dummies], axis=1)
        for col_name in self._one_hot_columns:
            if col_name not in result.columns:
                result[col_name] = 0
        if echo:
            progress_bar.show(
                amount=progress_amount,
                text=f'DM created dummies for {self._encoded_columns}')

        if drop_encoded:
            result = result.drop(columns=self.encoded_columns)

        extra_columns = [
            x for x in result.columns if x not in self.encoded_columns +
            self.one_hot_columns and x not in data.columns
        ]
        if len(extra_columns) > 0:
            result = result.drop(columns=extra_columns)

        return result
Example 12
    def train(self, data, echo=0):
        echo = max(0, echo)
        result = data.copy()
        one_hot_columns = []
        non_numeric_cols = data.select_dtypes(
            exclude=['bool', 'number', 'datetime64', 'datetime']).columns

        if self._include is not None:
            non_numeric_cols = [
                col for col in non_numeric_cols if col in self._include
            ]
        if self._exclude is not None:
            non_numeric_cols = [
                col for col in non_numeric_cols if col not in self._exclude
            ]

        progress_bar = ProgressBar(total=len(non_numeric_cols))
        progress_amount = 0
        self._encoded_columns = []
        for col_name in non_numeric_cols:
            if echo:
                progress_bar.show(amount=progress_amount,
                                  text=f'DM training dummies for {col_name}')
            progress_amount += 1
            try:
                temp_data = data[[col_name]].copy()
                if self._lowercase:
                    temp_data[col_name] = temp_data[col_name].str.lower()
                temp_data['count'] = 1
                counts = temp_data.groupby(col_name).sum().reset_index(
                    drop=False)

                if self._top is not None:
                    top_counts = counts.sort_values(by='count',
                                                    ascending=False).head(
                                                        self._top)
                    only_include = set(top_counts[col_name])
                    temp_data[col_name] = np.where(
                        temp_data[col_name].isin(only_include),
                        temp_data[col_name], self._replacement)

                else:
                    only_include = set(counts[col_name])

                dummies = pd.get_dummies(data=temp_data[[col_name]],
                                         prefix=col_name,
                                         prefix_sep='_',
                                         dummy_na=self._encode_na,
                                         sparse=False)

                dummies = dummies[[
                    col for col in dummies if dummies[col].nunique() > 1
                ]]

                result = pd.concat([result, dummies], axis=1)
                one_hot_columns += list(dummies.columns)

                self._column_values[col_name] = only_include
                self._encoded_columns.append(col_name)
            except Exception as e:
                print(f'exception being raised for column: {col_name}')
                raise e
        self._one_hot_columns = one_hot_columns
        if echo:
            progress_bar.show(
                amount=progress_amount,
                text=f'DM trained dummies for {self._encoded_columns}')
        return result
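
The train/encode pair above behaves like fit/transform: train learns which category values each dummy column represents, and encode reuses them on new data. A hedged sketch, assuming the enclosing class (not named in this listing) has been instantiated as encoder:

training_with_dummies = encoder.train(data=training_data, echo=1)
encoded_new_data = encoder.encode(data=new_data, drop_encoded=True, echo=1)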
Example 13
	def eliminate(self, num_rounds=None, echo=1):
		"""
		:type num_rounds: int or NoneType
		:type echo: bool or int
		:rtype: Regression
		"""
		model = self
		if isinstance(model.fit, BrokenModel):
			return self

		num_rounds = num_rounds or len(self.variables)
		progress_bar = ProgressBar(total=num_rounds, echo=echo)
		round_number = 0
		previous_lse = None
		for round_number in range(num_rounds):

			if model.num_insignificant_effects < 1:
				break

			else:
				y = model.formula.dependent_variable
				lse = model.least_significant_insignificant_effect
				xs = model.all_effects_but_least_significant
				progress_bar.show(
					amount=round_number,
					text=f'effects: {len(xs)+1}, eliminating {lse} with p={lse.p}'
				)
				new_formula = Formula.from_variables(
					independent_variables=xs,
					dependent_variable=y
				)

				if previous_lse == lse:
					print('eliminating', lse, '\n\n')
					print('is main effect significant?', model.is_main_effect_significant(lse), '\n\n')
					for interaction in model.significant_interaction_effects:
						print(interaction, 'contains?', interaction.contains(lse))

					for interaction in model.insignificant_interaction_effects:
						print(interaction, interaction.p)

					print(model.least_significant_insignificant_effect, lse)
					print('\n\ninsignificant effects\n', model.insignificant_effects)
					print(new_formula.display())

					for x in xs:
						print(x, x.num_interactions, x.p)
					raise RuntimeError(f'repeating the elimination of {lse}')

				previous_lse = lse

				new_model = model.__class__(
					data=model.data, formula=new_formula, significance_level=model.significance_level,
					groups=model.groups, family=model.family, model_builder=model._model_builder,
					parent=model
				)

				if isinstance(new_model.fit, BrokenModel):
					break

				else:
					model = new_model

		progress_bar.show(
			amount=round_number + 1,
			text=f'effects: {len(model.formula.independent_variables)}'
		)
		return model
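
eliminate performs backward elimination: each round drops the least significant insignificant effect and refits, stopping when no insignificant effects remain or the refit breaks. A hedged sketch, assuming regression is an instance of this Regression class:

reduced = regression.eliminate(echo=1)
print(len(reduced.formula.independent_variables), 'effects remain')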
Example 14
	def get_content_and_reference(self):
		if self.n_jobs == 1:
			precursor_keys_to_contents = {p.key: p.content for p in self.precursors}
		else:
			def get_content(p):
				return p.content

			precursors = self.precursors

			schedule = self.get_update_schedule()

			progress_bar = ProgressBar(
				total=sum([len(schedule_round) for schedule_round in schedule]),
				echo=self.pensieve._echo
			)

			progress_amount = 0
			for schedule_round in schedule:
				progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
				self.pensieve.processor(delayed(get_content)(job) for job in schedule_round)
				progress_amount += len(schedule_round)
			if progress_amount > 0:
				progress_bar.show(amount=progress_amount, text=f'{self.key} updated!')

			contents = self.pensieve.processor(delayed(get_content)(p) for p in precursors)
			keys = [precursor.key for precursor in precursors]
			precursor_keys_to_contents = {key: content for key, content in zip(keys, contents)}

		if len(self.precursor_keys) == 0:
			new_reference = get_source(self._original_function)

			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content

			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content

			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function()
				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		elif len(self.precursor_keys) == 1:
			precursor_content = list(precursor_keys_to_contents.values())[0]
			new_reference = (get_source(self._original_function), precursor_keys_to_contents)

			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content

			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content

			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function(precursor_content)
				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		else:
			inputs = EvaluationInput(inputs=precursor_keys_to_contents)
			new_reference = (get_source(self._original_function), precursor_keys_to_contents)
			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content
			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content
			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function(inputs.originals)

				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		self._content_type = get_type(new_content)

		self._content_access_count += 1
		if self.backup_directory and new_reference != self.backup_precursors_reference:
			self.backup_content = new_content
			self.backup_precursors_reference = new_reference
		return new_content, new_reference
Example 15
    def validate(self,
                 problem_type,
                 evaluation_function=None,
                 model=None,
                 model_name=None,
                 model_grid=None,
                 num_threads=None,
                 return_models=False,
                 raise_error=False,
                 main_metric=None,
                 best_model_criteria=None,
                 measure_influence=True,
                 num_influence_points=400,
                 echo=None):
        """
		:param callable evaluation_function:
		:param str problem_type: either 'regression' of 'classification'
		:param model: a regressor or classifier
		:param str or NoneType model_name:
		:param ModelGrid or NoneType model_grid:
		:param dict[str, list or str or int or float] parameter_grid: a dictionary telling how the grid of models should be built
		:param int num_threads: for parallel computing
		:param bool return_models: whether or not trained models should be returned
		:param bool raise_error:
		:param str main_metric: the metric to be used for choosing the best model
		:param str or list[str] or NoneType other_metrics: other metrics to be included in the aggregate
		:param str best_model_criteria: 'highest' means the model with the highest metric value should be chosen
		:param int or bool or ProgressBar echo:
		:rtype: dict[str,]
		"""
        echo = echo or self._echo
        num_threads = num_threads or self._num_threads

        if model is not None:
            try:
                model_class_name = model.__name__
            except AttributeError:
                model_class_name = None
        else:
            model_class_name = None

        if problem_type.lower().startswith('class'):
            model_name = model_name or model_class_name or 'classifier'
            main_metric = main_metric or 'f1_score'
            best_model_criteria = best_model_criteria or 'highest'
            evaluation_function = evaluation_function or evaluate_classification
        else:
            model_name = model_name or model_class_name or 'regressor'
            main_metric = main_metric or 'rmse'
            best_model_criteria = best_model_criteria or 'lowest'
            evaluation_function = evaluation_function or evaluate_regression

        if model is not None and model_grid is None:
            models = {model_name: model}
            parameters_data = None
            parameters_dictionary = None

        elif model is None and model_grid is not None:
            models = model_grid.models
            parameters_data = model_grid.parameter_table
            parameters_dictionary = model_grid.parameters

        else:
            raise ValueError(
                'either a preset model should be given or a model grid')

        model_folds = [{
            'model_name': model_name,
            'model': model,
            'fold_num': fold_num + 1,
            'fold': fold
        } for model_name, model in models.items()
                       for fold_num, fold in enumerate(self.folds)]

        shared_memory = {
            'progress_bar':
            ProgressBar(total=1 + len(models) * len(self.folds), echo=echo),
            'progress_amount':
            0
        }

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if num_threads == 1:
                result = [
                    validate_fold(model_fold=model_fold,
                                  shared_memory=shared_memory,
                                  evaluation_function=evaluation_function)
                    for model_fold in model_folds
                ]
            else:
                parallel = Parallel(n_jobs=num_threads,
                                    backend='threading',
                                    require='sharedmem')
                result = parallel(
                    delayed(validate_fold)(
                        model_fold=model_fold,
                        shared_memory=shared_memory,
                        evaluation_function=evaluation_function)
                    for model_fold in model_folds)

        training = []
        test = []
        trained_models = []
        feature_importances_list = []
        coefficients_list = []

        for record in result:
            model = record['model']
            training_evaluation = record['training_evaluation']
            test_evaluation = record['test_evaluation']
            training_evaluation['fold_num'] = record['fold_num']
            test_evaluation['fold_num'] = record['fold_num']
            training_evaluation['model_name'] = record['model_name']
            test_evaluation['model_name'] = record['model_name']
            training.append(training_evaluation)
            test.append(test_evaluation)

            feature_importances = get_feature_importances(
                model=model,
                columns=self.x_columns,
                model_name=record['model_name'],
                fold_num=record['fold_num'],
                raise_error=raise_error)
            if feature_importances is not None:
                feature_importances_list.append(feature_importances)

            coefficients = get_coefficients(model=model,
                                            columns=self.x_columns,
                                            model_name=record['model_name'],
                                            fold_num=record['fold_num'],
                                            raise_error=raise_error)
            if coefficients is not None:
                coefficients_list.append(coefficients)

            if return_models:
                trained_models.append(model)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            feature_importances = DataFrame.from_records(
                feature_importances_list)
            coefficients = DataFrame.from_records(coefficients_list)

            training = bring_to_front(data=DataFrame.from_records(training),
                                      columns=['model_name', 'fold_num'])
            test = bring_to_front(data=DataFrame.from_records(test),
                                  columns=['model_name', 'fold_num'])
            result = Dictionary({'training': training, 'test': test})

            if feature_importances.shape[1] > 0:
                result['feature_importances'] = bring_to_front(
                    data=feature_importances,
                    columns=['model_name', 'fold_num'])
                result['mean_feature_importances'] = result[
                    'feature_importances'].drop(columns='fold_num').groupby(
                        ['model_name']).mean().reset_index()

            if coefficients.shape[1] > 0:
                result['coefficients'] = bring_to_front(
                    data=coefficients, columns=['model_name', 'fold_num'])
                result['mean_coefficients'] = result['coefficients'].drop(
                    columns='fold_num').groupby(['model_name'
                                                 ]).mean().reset_index()

            if return_models:
                result['models'] = trained_models

            shared_memory['progress_amount'] += 1
            shared_memory['progress_bar'].show(
                amount=shared_memory['progress_amount'],
                text='validation complete.')

            if main_metric is not None and main_metric in result[
                    'test'].columns:
                aggregated_training = training.drop(
                    columns='fold_num').groupby(
                        'model_name').mean().reset_index()
                aggregated_test = test.drop(columns='fold_num').groupby(
                    'model_name').mean().reset_index()
                aggregated_result = aggregated_training.merge(
                    right=aggregated_test,
                    on='model_name',
                    how='outer',
                    suffixes=['_training', '_test'])
                if parameters_data is not None:
                    aggregated_result = parameters_data.merge(
                        right=aggregated_result,
                        on='model_name',
                        how='outer',
                        suffixes=['_parameter', ''])

                aggregated_result = aggregated_result.sort_values(
                    f'{main_metric}_test',
                    ascending=best_model_criteria != 'highest')

                result['aggregate'] = aggregated_result
                best_model_name = aggregated_result.head(
                    1)['model_name'].values[0]

                if parameters_dictionary is not None:
                    result['best_model_parameters'] = parameters_dictionary[
                        best_model_name]

                best_model = models[best_model_name]
                trained_best_model = deepcopy(best_model)
                trained_best_model.fit(X=self.validation.X,
                                       y=self.validation.y)
                best_model_performance = evaluation_function(
                    trained_best_model.predict(self.holdout.X), self.holdout.y)
                result['best_model'] = best_model
                result['best_model_trained'] = trained_best_model
                result['best_model_performance'] = best_model_performance

                best_model_feature_importances = get_feature_importances(
                    model=trained_best_model,
                    columns=self.validation.x_columns,
                    model_name=best_model_name,
                    raise_error=raise_error)

                best_model_coefficients = get_coefficients(
                    model=trained_best_model,
                    columns=self.validation.x_columns,
                    model_name=best_model_name,
                    raise_error=raise_error)

                if best_model_feature_importances is not None:
                    result[
                        'best_model_feature_importances'] = best_model_feature_importances

                if best_model_coefficients is not None:
                    result['best_model_coefficients'] = best_model_coefficients

                if measure_influence:
                    result['influence'] = self.get_influence(
                        model=trained_best_model,
                        num_threads=num_threads,
                        num_points=num_influence_points,
                        echo=echo)

        model = result['best_model_trained']
        transformer = self._transformer
        predictor = Predictor(model=model,
                              transformer=transformer,
                              model_is_fitted=True,
                              transformer_is_fitted=True,
                              x_columns=self.x_columns)
        result['predictor'] = predictor
        return result
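
A hedged sketch of calling validate on an instance of the enclosing cross-validation class (here called validator); the scikit-learn model is an invented stand-in:

from sklearn.ensemble import RandomForestClassifier

results = validator.validate(
    problem_type='classification',
    model=RandomForestClassifier(n_estimators=100),
    num_threads=1,
    echo=1)
# results['aggregate'] ranks models by mean f1_score on the test folds;
# results['predictor'] bundles the best trained model with the transformer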
Example 16
def fuzzy_left_merge(left,
                     right,
                     left_on=None,
                     right_on=None,
                     on=None,
                     suffixes=('_x', '_y'),
                     score_name='match_ratio',
                     na_ratio=0.5,
                     two_na_ratio=0.75,
                     similarity_function=None,
                     weights=None,
                     case_sensitivity=0.5,
                     num_results=1,
                     num_threads=-1,
                     echo=1):
    """
	:type left: DataFrame
	:type right: DataFrame
	:type left_on: list[str] or str or NoneType
	:type right_on: list[str] or str or NoneType
	:type on: list[str] or str or NoneType
	:type how: str or NoneType
	:type case_sensitivity: float
	:type num_results: int
	:type similarity_function: callable
	:type echo: int or bool or ProgressBar
	:type num_threads: int
	:rtype: DataFrame
	"""
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')

    data1 = left.copy()
    data2 = right.copy()

    if on is None:
        on = data1.columns.intersection(data2.columns)

    if left_on is None:
        left_on = on
    if right_on is None:
        right_on = on

    missing_left = [col for col in left_on if col not in data1.columns]
    if len(missing_left) > 0:
        raise KeyError(f'missing columns on left: {missing_left}')
    missing_right = [col for col in right_on if col not in data2.columns]
    if len(missing_right) > 0:
        raise KeyError(f'missing columns on right: {missing_right}')

    data1['fuzzy_id'] = range(len(data1))

    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1,
            echo=echo,
            function=lambda row: _match_rows(row=row,
                                             right=data2,
                                             left_on=left_on,
                                             right_on=right_on,
                                             na_ratio=na_ratio,
                                             two_na_ratio=two_na_ratio,
                                             case_sensitivity=case_sensitivity,
                                             score_name=score_name,
                                             num_results=num_results,
                                             similarity_function=
                                             similarity_function,
                                             weights=weights,
                                             num_threads=1,
                                             echo=echo - 1))

    else:
        parallel = Parallel(n_jobs=num_threads,
                            backend='threading',
                            require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)

        results = parallel(
            delayed(_match_rows)(row=row,
                                 right=data2,
                                 left_on=left_on,
                                 right_on=right_on,
                                 na_ratio=na_ratio,
                                 two_na_ratio=two_na_ratio,
                                 case_sensitivity=case_sensitivity,
                                 score_name=score_name,
                                 num_results=num_results,
                                 similarity_function=similarity_function,
                                 weights=weights,
                                 num_threads=1,
                                 echo=echo - 1)
            for index, row in iterate(data1.iterrows(),
                                      progress_bar=progress_bar))
        progress_bar.show(amount=len(data1) + 1)

    data2 = concat(results).reset_index(drop=True)

    return data1.merge(right=data2,
                       on='fuzzy_id',
                       how='left',
                       suffixes=suffixes).drop(columns='fuzzy_id')
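
A hedged usage sketch with two toy frames; only the keyword names come from the signature above, and DataFrame is assumed to be pandas':

from pandas import DataFrame

left = DataFrame({'name': ['Jon Smith', 'Ann Lee'], 'city': ['NY', 'LA']})
right = DataFrame({'name': ['John Smith', 'Anne Lee'], 'zip': ['10001', '90001']})
# each left row gains its best-matching right row plus a match_ratio score
merged = fuzzy_left_merge(left=left, right=right, on=['name'], num_threads=1, echo=0)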
Example 17
    def __init__(self,
                 data,
                 x_columns,
                 y_column,
                 id_columns=None,
                 holdout_ratio=0.2,
                 num_validation_splits=5,
                 random_state=None,
                 transformer=None,
                 num_threads=-1,
                 echo=1,
                 cross_validation=None):
        self._echo = echo
        self._num_threads = num_threads
        self._num_validation_splits = num_validation_splits
        self._random_state = random_state
        progress_bar = ProgressBar(total=num_validation_splits + 1,
                                   echo=self._echo)
        progress_amount = 0
        progress_bar.show(amount=progress_amount,
                          text='preparing validation Xy')

        if isinstance(id_columns, str):
            id_columns = [id_columns]

        if cross_validation is None:
            cross_validation = get_cross_validation_by_group(
                data=data,
                id_columns=id_columns,
                num_splits=num_validation_splits,
                holdout_ratio=holdout_ratio,
                random_state=random_state)
        self._cross_validation = cross_validation

        super().__init__(data=data,
                         x_columns=x_columns,
                         y_column=y_column,
                         id_columns=id_columns,
                         training_rows=cross_validation['validation'],
                         training_name='validation',
                         test_rows=cross_validation['holdout'],
                         test_name='holdout',
                         transformer=deepcopy(transformer))
        progress_amount += 1
        progress_bar.show(amount=progress_amount, text='preparing folds')

        def transform_training_test_xy(fold):
            return TransformedTrainingTestXy(data=data,
                                             x_columns=x_columns,
                                             y_column=y_column,
                                             id_columns=id_columns,
                                             training_rows=fold['training'],
                                             test_rows=fold['test'],
                                             transformer=deepcopy(transformer))

        if self.num_threads == 1:
            self._folds = [
                transform_training_test_xy(fold=fold)
                for fold in iterate(cross_validation['folds'],
                                    text='preparing folds (single-threaded)')
            ]
        else:
            processor = Parallel(n_jobs=self.num_threads,
                                 backend='threading',
                                 require='sharedmem')
            self._folds = processor(
                delayed(transform_training_test_xy)(fold=fold)
                for fold in iterate(cross_validation['folds'],
                                    text='preparing folds (multi-threaded)'))