Example #1
def _find_best_matching_rows(strings, right, right_on, na_ratio, two_na_ratio,
                             case_sensitivity, score_name, num_threads,
                             similarity_function, weights, num_results, echo):
    """
	:param strings:
	:param right:
	:param right_on:
	:param na_ratio:
	:param two_na_ratio:
	:param case_sensitivity:
	:param score_name:
	:param num_threads:
	:param num_results:
	:param echo:
	:rtype: DataFrame
	"""
    right = right.copy()

    if num_threads == 1:

        right[score_name] = ProgressBar.apply(
            data=right,
            function=lambda row: _get_similarity_between_strings_and_row(
                strings=strings,
                row=row,
                right_on=right_on,
                na_ratio=na_ratio,
                two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function,
                weights=weights),
            echo=echo)

    else:

        parallel = Parallel(n_jobs=num_threads,
                            backend='threading',
                            require='sharedmem')
        progress_bar = ProgressBar(total=len(right) + 1, echo=echo)
        right[score_name] = parallel(
            delayed(_get_similarity_between_strings_and_row)(
                strings=strings,
                row=row,
                right_on=right_on,
                na_ratio=na_ratio,
                two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity,
                similarity_function=similarity_function,
                weights=weights)
            for index, row in iterate(right.iterrows(),
                                      progress_bar=progress_bar))
        progress_bar.show(amount=len(right) + 1)

    right = right.sort_values(by=score_name, ascending=False)
    return right.iloc[0:num_results]
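The single-threaded/parallel split above is a standard joblib idiom. A minimal self-contained sketch of the same pattern (the scoring function and data here are illustrative, not from the project):

import pandas as pd
from joblib import Parallel, delayed

def score(row):
    return row['a'] + row['b']

frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
# 'threading' with shared memory mirrors the call above; rows are scored
# concurrently and the results come back in order
parallel = Parallel(n_jobs=2, backend='threading', require='sharedmem')
frame['score'] = parallel(delayed(score)(row) for _, row in frame.iterrows())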
Example #2
File: zip.py Project: idin/disk
import os
from zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED
# ProgressBar comes from the project's own utilities (not shown here)

def zip_directory(path, zip_path, compression=ZIP_DEFLATED, echo=0):
	echo = max(0, echo)
	progress_bar = ProgressBar(echo=echo, total=None)

	compression = compression or ZIP_STORED
	amount = 0
	with ZipFile(file=zip_path, mode='w', compression=compression) as zip_file:
		for root, dirs, files in os.walk(path):
			for file in files:
				# store paths relative to the zipped directory, not the full on-disk path
				file_path = os.path.join(root, file)
				zip_file.write(file_path, arcname=os.path.relpath(file_path, start=path))
				progress_bar.show(amount=amount, text=f'"{file}" zipped into {zip_path}')
				amount += 1
	progress_bar.show(amount=amount, text=f'{zip_path} complete!')
	return zip_path
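A usage sketch; the import path is a guess based on the file and project names above:

# hypothetical import path -- adjust to the actual layout of idin/disk
from disk.zip import zip_directory

zip_directory(path='my_folder', zip_path='my_folder.zip', echo=1)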
Example #3
	def save(self, path, echo=None):
		"""
		:type path: str or Path
		:type echo: bool
		"""
		if echo is None:
			echo = self._echo

		progress_bar = ProgressBar(total=len(self.memories_dictionary)+2, echo=echo)
		progress_amount = 0

		path = Path(path=path)
		path.make_dir()

		progress_bar.show(amount=progress_amount, text='saving parameters')
		(path + 'parameters.pensieve').save(obj=self.parameters)
		progress_amount += 1

		memory_keys = []

		for key, memory in self.memories_dictionary.items():
			progress_bar.show(amount=progress_amount, text=f'saving "{key}" memory')
			memory.save(path=path + key)
			progress_amount += 1
			memory_keys.append(key)

		progress_bar.show(amount=progress_amount, text='saving memory keys')
		(path + 'memory_keys.pensieve').save(obj=memory_keys)
		progress_amount += 1

		progress_bar.show(amount=progress_amount)
Example #4
def perturb_data(data, perturbation, echo=0):
    # NOTE: 'echo' was a free variable in the original snippet; it is made an
    # explicit parameter here so the function is self-contained
    list_of_data = []
    progress_bar = ProgressBar(total=len(perturbation), echo=echo)
    progress = 0
    for column, perturbation_list in perturbation.items():
        progress_bar.show(amount=progress, text=column)
        for delta in perturbation_list:
            list_of_data.append(
                perturb_numeric_column(data=data, column=column, delta=delta))
        progress += 1
    data = concat(list_of_data)  # pandas.concat
    progress_bar.show(amount=progress)
    return data
Example #5
def get_function_influence(function, data, x_columns, num_deltas=200, num_threads=1, echo=1):
	"""
	:param callable function: function that gives a y for a given data
	:param pd.DataFrame data:
	:param list[str] or str x_columns:
	:param int num_deltas:
	:param int num_threads:
	:param int echo:
	:rtype: pd.DataFrame
	"""
	if isinstance(x_columns, str):
		x_columns = [x_columns]
	elif not isinstance(x_columns, list):
		raise TypeError(f'x_columns should be either a string or a list, not {type(x_columns)}')

	def get_influence_data(the_tuple):
		with warnings.catch_warnings():
			warnings.simplefilter('ignore')
			x_column, this_data, this_num_deltas, this_function = the_tuple
			record = get_single_column_influence(
				function=this_function, data=this_data, x_column=x_column, num_deltas=this_num_deltas
			)
		return record

	tuples = [
		(x_column, data, num_deltas, function)
		for x_column in iterate(x_columns, echo=echo, text='preparing data ...')
	]

	progress_bar = ProgressBar(total=len(x_columns) + 1, echo=echo)
	progress_bar.show(amount=len(x_columns), text='data prepared.')

	if num_threads == 1:
		dataframes = [
			get_influence_data(t)
			for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
		]
	else:
		dataframes = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')(
			delayed(get_influence_data)(t)
			for t in iterate(iterable=tuples, progress_bar=progress_bar, text='measuring influence ...')
		)

	result = pd.concat(dataframes).sort_values(by=['variable', 'x_delta'])
	progress_bar.show(amount=len(dataframes) + 1, text='influence measured.')
	return result
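A hedged usage sketch: any callable that maps a DataFrame to y values can be probed (the frame and function are illustrative; import get_function_influence from wherever it is defined):

import pandas as pd

data = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
influence = get_function_influence(
    function=lambda d: 2 * d['a'] + d['b'],
    data=data, x_columns=['a', 'b'], num_deltas=10, num_threads=1, echo=0)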
Example #6
	@classmethod  # implied by the 'cls' first argument; likely dropped in extraction
	def load(cls, path, echo=True):
		path = Path(path=path)
		parameters = (path + 'parameters.pensieve').load()
		pensieve = cls(safe=parameters['safe'])
		for name, value in parameters.items():
			setattr(pensieve, f'_{name}', value)
		memory_keys = (path + 'memory_keys.pensieve').load()
		progress_bar = ProgressBar(total=len(memory_keys))
		progress_amount = 0
		pensieve._memories_dictionary = {}
		for key in memory_keys:
			if echo:
				progress_bar.show(amount=progress_amount, text=f'loading "{key}" memory')
			memory = Memory.load(path=path + key, pensieve=pensieve)
			pensieve._memories_dictionary[key] = memory
			progress_amount += 1
		if echo:
			progress_bar.show(amount=progress_amount)
		return pensieve
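Together with Example #3, this gives a save/load round trip. A hedged sketch, assuming the class is the project's Pensieve and 'pensieve' is an existing instance:

pensieve.save(path='pensieve_backup', echo=True)
restored = Pensieve.load(path='pensieve_backup', echo=True)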
Example #7
    def encode(self, data, drop_encoded=True, echo=0):
        echo = max(0, echo)
        result = data.copy()
        progress_bar = ProgressBar(total=len(self._column_values))
        progress_amount = 0
        for col_name, only_include in self._column_values.items():
            if echo:
                progress_bar.show(amount=progress_amount,
                                  text=f'DM creating dummies for {col_name}')
            progress_amount += 1
            temp_data = result[[col_name]].copy()
            if self._lowercase:
                temp_data[col_name] = temp_data[col_name].str.lower()
            temp_data[col_name] = np.where(
                temp_data[col_name].isin(only_include), temp_data[col_name],
                self._replacement)
            dummies = pd.get_dummies(data=temp_data[[col_name]],
                                     prefix=col_name,
                                     prefix_sep='_',
                                     dummy_na=self._encode_na,
                                     sparse=True)
            result = pd.concat([result, dummies], axis=1)
        for col_name in self._one_hot_columns:
            if col_name not in result.columns:
                result[col_name] = 0
        if echo:
            progress_bar.show(
                amount=progress_amount,
                text=f'DM created dummies for {self._encoded_columns}')

        if drop_encoded:
            result = result.drop(columns=self.encoded_columns)

        extra_columns = [
            x for x in result.columns
            if x not in self.encoded_columns + self.one_hot_columns
            and x not in data.columns
        ]
        if len(extra_columns) > 0:
            result = result.drop(columns=extra_columns)

        return result
Example #8
    def simulate(self, echo=1):
        """
        :type echo: int
        :rtype: list[dict]
        """
        def _get_influence(column, perturbation, function):
            influence_result = self.get_single_influence(
                column=column, perturbation=perturbation, function=function)
            influence_result['column'] = column
            influence_result['perturbation'] = perturbation
            return influence_result

        progress_bar = ProgressBar(total=2 + len(self._influencers), echo=echo)

        if self._num_threads == 1:
            result = [
                _get_influence(column=column,
                               perturbation=perturbation,
                               function=self._function) for column, influences
                in iterate(self._influencers.items(),
                           progress_bar=progress_bar,
                           text='measuring influence ...')
                for perturbation in influences
            ]
        else:
            result = self.parallel_process(
                delayed(_get_influence)(column=column,
                                        perturbation=perturbation,
                                        function=deepcopy(self._function))
                for column, influences in iterate(
                    self._influencers.items(),
                    progress_bar=progress_bar,
                    text='measuring influence in parallel ...')
                for perturbation in influences)

        progress_bar._total = 2 + len(self._influencers)
        progress_bar.show(amount=1 + len(self._influencers))

        result_data = pd.DataFrame.from_records(result)
        progress_bar.show(amount=2 + len(self._influencers))
        return result_data
Example #9
	def evaluate(self, keys=None, output=False):
		"""
		evaluates multiple memories, in parallel if n_jobs != 1
		:type keys: list[str] or NoneType or str
		:type output: bool
		:rtype: list or NoneType
		"""
		if keys is None:
			keys = list(self.memories_dictionary.keys())
		elif isinstance(keys, str):
			keys = [keys]

		if self._n_jobs == 1:
			if output:
				return [self[key] for key in keys]
			else:
				for key in keys:
					self.memories_dictionary[key].evaluate()
		else:
			def get_content(p):
				return p.content

			memories = [self.memories_dictionary[key] for key in keys]
			schedule = self.get_update_schedule(keys=keys)

			progress_bar = ProgressBar(
				total=sum([len(schedule_round) for schedule_round in schedule]),
				echo=self._echo
			)

			progress_amount = 0
			for schedule_round in schedule:
				progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
				self.processor(delayed(get_content)(job) for job in schedule_round)
				progress_amount += len(schedule_round)
			if progress_amount > 0:
				progress_bar.show(amount=progress_amount, text=f'{self.key} updated!')

			contents = self.processor(delayed(get_content)(p) for p in memories)
			if output:
				return list(contents)
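A short usage sketch (the memory keys are hypothetical):

# evaluate two memories and collect their contents in order
contents = pensieve.evaluate(keys=['raw_data', 'model'], output=True)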
Example #10
    def train(self, data, echo=0):
        echo = max(0, echo)
        result = data.copy()
        one_hot_columns = []
        non_numeric_cols = data.select_dtypes(
            exclude=['bool', 'number', 'datetime64', 'datetime']).columns

        if self._include is not None:
            non_numeric_cols = [
                col for col in non_numeric_cols if col in self._include
            ]
        if self._exclude is not None:
            non_numeric_cols = [
                col for col in non_numeric_cols if col not in self._exclude
            ]

        progress_bar = ProgressBar(total=len(non_numeric_cols))
        progress_amount = 0
        self._encoded_columns = []
        for col_name in non_numeric_cols:
            if echo:
                progress_bar.show(amount=progress_amount,
                                  text=f'DM training dummies for {col_name}')
            progress_amount += 1
            try:
                temp_data = data[[col_name]].copy()
                if self._lowercase:
                    temp_data[col_name] = temp_data[col_name].str.lower()
                temp_data['count'] = 1
                counts = temp_data.groupby(col_name).sum().reset_index(
                    drop=False)

                if self._top is not None:
                    # keep the most frequent values: sort by the 'count' column,
                    # not by the column's own values
                    top_counts = counts.sort_values(
                        by='count', ascending=False).head(self._top)
                    only_include = set(top_counts[col_name])
                    temp_data[col_name] = np.where(
                        temp_data[col_name].isin(only_include),
                        temp_data[col_name], self._replacement)

                else:
                    only_include = set(counts[col_name])

                dummies = pd.get_dummies(data=temp_data[[col_name]],
                                         prefix=col_name,
                                         prefix_sep='_',
                                         dummy_na=self._encode_na,
                                         sparse=False)

                dummies = dummies[[
                    col for col in dummies if not dummies[col].nunique() == 1
                ]]

                result = pd.concat([result, dummies], axis=1)
                one_hot_columns += list(dummies.columns)

                self._column_values[col_name] = only_include
                self._encoded_columns.append(col_name)
            except Exception as e:
                print(f'exception being raised for column: {col_name}')
                raise e
        self._one_hot_columns = one_hot_columns
        if echo:
            progress_bar.show(
                amount=progress_amount,
                text=f'DM trained dummies for {self._encoded_columns}')
        return result
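Examples #7 and #10 are the two halves of a fit/transform pair. A hedged usage sketch, assuming the class is something like DummyMaker (the 'DM' prefix in the progress text suggests it) and that its constructor defaults are usable as-is:

import pandas as pd

data = pd.DataFrame({'color': ['red', 'blue', 'red', 'green']})
encoder = DummyMaker()                # hypothetical constructor
trained = encoder.train(data=data)    # learns which values get dummy columns
encoded = encoder.encode(data=data)   # reuses the learned columns on new data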
Example #11
    def __init__(self,
                 data,
                 x_columns,
                 y_column,
                 id_columns=None,
                 holdout_ratio=0.2,
                 num_validation_splits=5,
                 random_state=None,
                 transformer=None,
                 num_threads=-1,
                 echo=1,
                 cross_validation=None):
        self._echo = echo
        self._num_threads = num_threads
        self._num_validation_splits = num_validation_splits
        self._random_state = random_state
        progress_bar = ProgressBar(total=num_validation_splits + 1,
                                   echo=self._echo)
        progress_amount = 0
        progress_bar.show(amount=progress_amount,
                          text='preparing validation Xy')

        if isinstance(id_columns, str):
            id_columns = [id_columns]

        if cross_validation is None:
            cross_validation = get_cross_validation_by_group(
                data=data,
                id_columns=id_columns,
                num_splits=num_validation_splits,
                holdout_ratio=holdout_ratio,
                random_state=random_state)
        self._cross_validation = cross_validation

        super().__init__(data=data,
                         x_columns=x_columns,
                         y_column=y_column,
                         id_columns=id_columns,
                         training_rows=cross_validation['validation'],
                         training_name='validation',
                         test_rows=cross_validation['holdout'],
                         test_name='holdout',
                         transformer=deepcopy(transformer))
        progress_amount += 1
        progress_bar.show(amount=progress_amount, text='preparing folds')

        def transform_training_test_xy(fold):
            return TransformedTrainingTestXy(data=data,
                                             x_columns=x_columns,
                                             y_column=y_column,
                                             id_columns=id_columns,
                                             training_rows=fold['training'],
                                             test_rows=fold['test'],
                                             transformer=deepcopy(transformer))

        if self.num_threads == 1:
            self._folds = [
                transform_training_test_xy(fold=fold)
                for fold in iterate(cross_validation['folds'],
                                    text='preparing folds (single-threaded)')
            ]
        else:
            processor = Parallel(n_jobs=self.num_threads,
                                 backend='threading',
                                 require='sharedmem')
            self._folds = processor(
                delayed(transform_training_test_xy)(fold=fold)
                for fold in iterate(cross_validation['folds'],
                                    text='preparing folds (multi-threaded)'))
Example #12
	def get_content_and_reference(self):
		if self.n_jobs == 1:
			precursor_keys_to_contents = {p.key: p.content for p in self.precursors}
		else:
			def get_content(p):
				return p.content

			precursors = self.precursors

			schedule = self.get_update_schedule()

			progress_bar = ProgressBar(
				total=sum([len(schedule_round) for schedule_round in schedule]),
				echo=self.pensieve._echo
			)

			progress_amount = 0
			for schedule_round in schedule:
				progress_bar.show(amount=progress_amount, text=f'updating {len(schedule_round)} memories')
				self.pensieve.processor(delayed(get_content)(job) for job in schedule_round)
				progress_amount += len(schedule_round)
			if progress_amount > 0:
				progress_bar.show(amount=progress_amount, text=f'{self.key} updated!')

			contents = self.pensieve.processor(delayed(get_content)(p) for p in precursors)
			keys = [precursor.key for precursor in precursors]
			precursor_keys_to_contents = {key: content for key, content in zip(keys, contents)}

		if len(self.precursor_keys) == 0:
			new_reference = get_source(self._original_function)

			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content

			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content

			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function()
				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		elif len(self.precursor_keys) == 1:
			precursor_content = list(precursor_keys_to_contents.values())[0]
			new_reference = (get_source(self._original_function), precursor_keys_to_contents)

			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content

			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content

			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function(precursor_content)
				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		else:
			inputs = EvaluationInput(inputs=precursor_keys_to_contents)
			new_reference = (get_source(self._original_function), precursor_keys_to_contents)
			if new_reference == self._precursors_reference and not self.is_stale:
				new_content = self._content
			elif self.backup_directory and new_reference == self.backup_precursors_reference and self.backup_content_exists():
				new_content = self.backup_content
			else:
				timer = Timer(start_now=True, unit='timedelta')
				new_content = self._function(inputs.originals)
				timer.stop()
				self.pensieve.function_durations.add_measurement(name=self.key, timer=timer)

		self._content_type = get_type(new_content)

		self._content_access_count += 1
		if self.backup_directory and new_reference != self.backup_precursors_reference:
			self.backup_content = new_content
			self.backup_precursors_reference = new_reference
		return new_content, new_reference
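The method above implements recompute-on-change memoization: cached content is reused only while the pair (function source, precursor contents) still equals the reference captured at the last evaluation. A minimal self-contained sketch of that idea:

import inspect

class CachedNode:
    def __init__(self, function):
        self._function = function
        self._reference = None  # (source, inputs) that produced the cached content
        self._content = None

    def get(self, **inputs):
        reference = (inspect.getsource(self._function), tuple(sorted(inputs.items())))
        if reference != self._reference:  # function or inputs changed: recompute
            self._content = self._function(**inputs)
            self._reference = reference
        return self._content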
Example #13
	def eliminate(self, num_rounds=None, echo=1):
		"""
		:type num_rounds: int or NoneType
		:type echo: bool or int
		:rtype: Regression
		"""
		model = self
		if isinstance(model.fit, BrokenModel):
			return self

		num_rounds = num_rounds or len(self.variables)
		progress_bar = ProgressBar(total=num_rounds, echo=echo)
		round = 0
		previous_lse = None
		previous_formula = None
		for round in range(num_rounds):

			if model.num_insignificant_effects < 1:
				break

			else:
				y = model.formula.dependent_variable
				lse = model.least_significant_insignificant_effect
				xs = model.all_effects_but_least_significant
				progress_bar.show(
					amount=round,
					text=f'effects: {len(xs)+1}, eliminating {lse} with p={lse.p}'
				)
				new_formula = Formula.from_variables(
					independent_variables=xs,
					dependent_variable=y
				)

				if previous_lse == lse:
					print('eliminating', lse, '\n\n')
					print('is the main effect significant?', model.is_main_effect_significant(lse), '\n\n')
					for interaction in model.significant_interaction_effects:
						print(interaction, 'contains?', interaction.contains(lse))

					for interaction in model.insignificant_interaction_effects:
						print(interaction, interaction.p)

					print(model.least_significant_insignificant_effect, lse)
					print('\n\ninsignificant effects\n', model.insignificant_effects)
					print(new_formula.display())

					for x in xs:
						print(x, x.num_interactions, x.p)
					raise RuntimeError(f'repeating the elimination of {lse}')

				previous_formula = new_formula
				previous_lse = lse
				new_model = model.__class__(
					data=model.data, formula=new_formula, significance_level=model.significance_level,
					groups=model.groups, family=model.family, model_builder=model._model_builder,
					parent=model
				)
				if isinstance(new_model.fit, BrokenModel):
					break

				else:
					model = new_model

		progress_bar.show(
			amount=round + 1,
			text=f'effects: {len(model.formula.independent_variables)}'
		)
		return model
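The loop above is backward elimination: refit, drop the least significant insignificant effect, and repeat until everything left is significant or the model breaks. A generic self-contained version using statsmodels (not the project's Regression/Formula classes):

import statsmodels.api as sm

def backward_eliminate(data, y_column, x_columns, significance_level=0.05):
    xs = list(x_columns)
    while len(xs) > 1:
        fit = sm.OLS(data[y_column], sm.add_constant(data[xs])).fit()
        p_values = fit.pvalues.drop('const')
        worst = p_values.idxmax()
        if p_values[worst] <= significance_level:  # every effect is significant
            break
        xs.remove(worst)  # eliminate the least significant effect and refit
    return xs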
Example #14
def fuzzy_left_merge(left,
                     right,
                     left_on=None,
                     right_on=None,
                     on=None,
                     suffixes=('_x', '_y'),
                     score_name='match_ratio',
                     na_ratio=0.5,
                     two_na_ratio=0.75,
                     similarity_function=None,
                     weights=None,
                     case_sensitivity=0.5,
                     num_results=1,
                     num_threads=-1,
                     echo=1):
    """
	:type left: DataFrame
	:type right: DataFrame
	:type left_on: list[str] or str or NoneType
	:type right_on: list[str] or str or NoneType
	:type on: list[str] or str or NoneType
	:type how: str or NoneType
	:type case_sensitivity: float
	:type num_results: int
	:type similarity_function: callable
	:type echo: int or bool or ProgressBar
	:type num_threads: int
	:rtype: DataFrame
	"""
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')

    data1 = left.copy()
    data2 = right.copy()

    if on is None:
        on = list(data1.columns.intersection(data2.columns))  # '&' on Index is deprecated

    if left_on is None:
        left_on = on
    if right_on is None:
        right_on = on

    missing_left = [col for col in left_on if col not in data1.columns]
    if len(missing_left) > 0:
        raise KeyError(f'missing columns on left: {missing_left}')
    missing_right = [col for col in right_on if col not in data2.columns]
    if len(missing_right) > 0:
        raise KeyError(f'missing columns on right: {missing_right}')

    data1['fuzzy_id'] = range(len(data1))

    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1,
            echo=echo,
            function=lambda row: _match_rows(row=row,
                                             right=data2,
                                             left_on=left_on,
                                             right_on=right_on,
                                             na_ratio=na_ratio,
                                             two_na_ratio=two_na_ratio,
                                             case_sensitivity=case_sensitivity,
                                             score_name=score_name,
                                             num_results=num_results,
                                             similarity_function=similarity_function,
                                             weights=weights,
                                             num_threads=1,
                                             echo=echo - 1))

    else:
        parallel = Parallel(n_jobs=num_threads,
                            backend='threading',
                            require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)

        results = parallel(
            delayed(_match_rows)(row=row,
                                 right=data2,
                                 left_on=left_on,
                                 right_on=right_on,
                                 na_ratio=na_ratio,
                                 two_na_ratio=two_na_ratio,
                                 case_sensitivity=case_sensitivity,
                                 score_name=score_name,
                                 num_results=num_results,
                                 similarity_function=similarity_function,
                                 weights=weights,
                                 num_threads=1,
                                 echo=echo - 1)
            for index, row in iterate(data1.iterrows(),
                                      progress_bar=progress_bar))
        progress_bar.show(amount=len(data1) + 1)

    data2 = concat(results).reset_index(drop=True)

    return data1.merge(right=data2,
                       on='fuzzy_id',
                       how='left',
                       suffixes=suffixes).drop(columns='fuzzy_id')
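A hedged usage sketch with two toy frames (import fuzzy_left_merge from wherever it is defined):

import pandas as pd

left = pd.DataFrame({'name': ['Jon Smith', 'Jane Doe']})
right = pd.DataFrame({'name': ['John Smith', 'Jane Do'], 'age': [40, 35]})
merged = fuzzy_left_merge(left=left, right=right, on=['name'],
                          num_threads=1, echo=0)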