def test_copy_not_deep(self):
    """A shallow copy is a new container but shares its frame objects."""
    source_frame = pd.DataFrame(
        [['John', 1, 5.0],
         ['Mary', 2, 4.0],
         ['Sally', 6, np.nan],
         ['Jeff', 3, 9.0],
         ['Edwin', 9, 1.0]],
        columns=['string', 'numeric', 'numeric_missing'])
    container = DataContainer([{'frame': source_frame,
                                'name': 'test',
                                'path': 'foo'}])

    shallow = container.copy(deep=False)

    # the container wrapper itself must be a distinct object ...
    assert_not_equal(id(shallow), id(container))

    # ... but every contained frame must be the very same object,
    # and the paths must carry over unchanged
    for name in shallow.keys():
        eq_(shallow.get_path(name), container.get_path(name))
        assert_frame_equal(shallow.get_frame(name), container.get_frame(name))
        assert_equal(id(shallow.get_frame(name)), id(container.get_frame(name)))
def test_copy_not_deep(self):
    """Verify that ``copy(deep=False)`` shares frames with the original."""
    rows = [['John', 1, 5.0],
            ['Mary', 2, 4.0],
            ['Sally', 6, np.nan],
            ['Jeff', 3, 9.0],
            ['Edwin', 9, 1.0]]
    expected = pd.DataFrame(rows,
                            columns=['string', 'numeric', 'numeric_missing'])
    container = DataContainer([{'frame': expected,
                                'name': 'test',
                                'path': 'foo'}])

    duplicate = container.copy(deep=False)

    # a new top-level object must have been created
    assert_not_equal(id(duplicate), id(container))

    for name in duplicate.keys():
        original_frame = container.get_frame(name)
        copied_frame = duplicate.get_frame(name)

        # paths are carried over, frames are equal AND identical
        # (identity is the point of a shallow copy)
        eq_(duplicate.get_path(name), container.get_path(name))
        assert_frame_equal(copied_frame, original_frame)
        assert_equal(id(copied_frame), id(original_frame))
def test_drop_warning(self):
    """Dropping a frame that does not exist should emit a warning."""
    # NOTE(review): the warning is promoted to an error here, so the
    # expected failure is presumably asserted by an out-of-view
    # decorator on this test — confirm against the full file.
    container = DataContainer([{'frame': pd.DataFrame(), 'name': 'test'}])
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        container.drop('flab')
def test_get_frames_no_match(self):
    """``get_frames`` returns an empty dict when no name matches the suffix."""
    names = ['include_this_one', 'include_this_one_not', 'we_want_this_one']
    container = DataContainer([{'frame': pd.DataFrame(), 'name': name}
                               for name in names])

    # no dataset name ends with 'foo'
    eq_(container.get_frames(suffix='foo'), {})
def test_get_frames_by_suffix(self):
    """``get_frames`` selects exactly the datasets whose names end in the suffix."""
    names = ['include_this_one', 'include_this_one_not', 'we_want_this_one']
    container = DataContainer([{'frame': pd.DataFrame(), 'name': name}
                               for name in names])

    selected = container.get_frames(suffix='one')

    # 'include_this_one_not' must be excluded — it merely contains 'one'
    eq_(sorted(selected.keys()),
        sorted(['include_this_one', 'we_want_this_one']))
def test_get_frames_by_prefix(self):
    """``get_frames`` selects exactly the datasets whose names start with the prefix."""
    names = ['test_two', 'test_three', 'exclude']
    container = DataContainer([{'frame': pd.DataFrame(), 'name': name}
                               for name in names])

    selected = container.get_frames(prefix='test')

    eq_(sorted(selected.keys()), sorted(['test_two', 'test_three']))
def test_get_frames_both_suffix_and_prefix(self):
    """When both are given, a name must match the prefix AND the suffix."""
    names = ['include_this_frame',
             'include_it',
             'exclude_frame',
             'include_this_other_frame']
    container = DataContainer([{'frame': pd.DataFrame(), 'name': name}
                               for name in names])

    selected = container.get_frames(prefix='include', suffix='frame')

    # 'include_it' fails the suffix; 'exclude_frame' fails the prefix
    eq_(sorted(selected.keys()),
        sorted(['include_this_frame', 'include_this_other_frame']))
def test_rename(self):
    """Renaming a dataset makes its frame reachable under the new name."""
    expected = pd.DataFrame({
        'string': ['John', 'Mary', 'Sally', 'Jeff', 'Edwin'],
        'numeric': [1, 2, 6, 3, 9],
        'numeric_missing': [5.0, 4.0, np.nan, 9.0, 1.0],
    })
    container = DataContainer([{'frame': expected, 'name': 'test'}])

    container.rename('test', 'flerf')

    # the frame is now available as an attribute under the new name
    assert_frame_equal(container.flerf, expected)
def test_rename_with_path(self):
    """Renaming a dataset must preserve its associated path."""
    expected = pd.DataFrame({
        'string': ['John', 'Mary', 'Sally', 'Jeff', 'Edwin'],
        'numeric': [1, 2, 6, 3, 9],
        'numeric_missing': [5.0, 4.0, np.nan, 9.0, 1.0],
    })
    container = DataContainer([{'frame': expected,
                                'name': 'test',
                                'path': 'foo'}])

    container.rename('test', 'flerf')

    # the old path now answers to the new name
    eq_(container.get_path('flerf'), 'foo')
def test_rename(self):
    """After ``rename``, the frame is accessible via the new name."""
    frame_rows = [['John', 1, 5.0],
                  ['Mary', 2, 4.0],
                  ['Sally', 6, np.nan],
                  ['Jeff', 3, 9.0],
                  ['Edwin', 9, 1.0]]
    column_names = ['string', 'numeric', 'numeric_missing']
    expected = pd.DataFrame(frame_rows, columns=column_names)

    container = DataContainer([{'frame': expected, 'name': 'test'}])
    container.rename('test', 'flerf')

    assert_frame_equal(container.flerf, expected)
def test_data_container_save_files_with_id(self):
    """Write one dataset in three formats, renaming it only for JSON."""
    container = DataContainer([
        {'name': 'dataset1',
         'frame': pd.DataFrame(np.random.normal(size=(100, 2)),
                               columns=['A', 'B'])},
        {'name': 'dataset2',
         'frame': pd.DataFrame(np.random.normal(size=(120, 3)),
                               columns=['A', 'B', 'C'])},
    ])

    directory = 'temp_directory_save_files_with_id_xyz'
    os.makedirs(directory, exist_ok=True)

    writer = DataWriter('test')
    for file_type in ['json', 'csv', 'xlsx']:
        # only the JSON output is written under a new name ('aaa')
        extra_kwargs = ({'new_names_dict': {'dataset1': 'aaa'}}
                        if file_type == 'json' else {})
        writer.write_experiment_output(directory,
                                       container,
                                       dataframe_names=['dataset1'],
                                       file_format=file_type,
                                       **extra_kwargs)

    loaded_frames = [
        pd.read_json(os.path.join(directory, 'test_aaa.json')),
        pd.read_csv(os.path.join(directory, 'test_dataset1.csv')),
        pd.read_excel(os.path.join(directory, 'test_dataset1.xlsx')),
    ]
    written_files = os.listdir(directory)

    # clean up the scratch directory before asserting
    rmtree(directory)

    assert sorted(written_files) == sorted(['test_aaa.json',
                                            'test_dataset1.csv',
                                            'test_dataset1.xlsx'])
    for loaded in loaded_frames:
        assert_frame_equal(container.dataset1, loaded)
def test_data_container_save_wrong_format(self):
    """Writing with an unsupported file format should fail."""
    # NOTE(review): the failure is presumably asserted by an
    # out-of-view decorator on this test — confirm against the full file.
    container = DataContainer([
        {'name': 'dataset1',
         'frame': pd.DataFrame(np.random.normal(size=(100, 2)),
                               columns=['A', 'B'])},
        {'name': 'dataset2',
         'frame': pd.DataFrame(np.random.normal(size=(120, 3)),
                               columns=['A', 'B', 'C'])},
    ])
    directory = 'temp_directory_container_save_wrong_format_xyz'

    writer = DataWriter()
    # 'html' is not a supported output format
    writer.write_experiment_output(directory,
                                   container,
                                   dataframe_names=['dataset1'],
                                   file_format='html')
def read(self, kwargs_dict=None):
    """
    Read all files passed to the constructor.

    Parameters
    ----------
    kwargs_dict : dict of dicts, optional
        Any additional keyword arguments to pass to a particular
        DataFrame. These arguments will be passed to the `pandas` IO
        reader function.
        Defaults to None.

    Returns
    -------
    datacontainer : DataContainer
        A DataContainer object.

    Raises
    ------
    FileNotFoundError
        If any of the dataset paths passed to the constructor
        does not exist.
    """
    # normalize once so the loop does not re-check for None
    kwargs_dict = {} if kwargs_dict is None else kwargs_dict

    # names and paths were supplied pairwise to the constructor
    for name, set_path in zip(self.dataset_names, self.dataset_paths):
        converter = self.file_converters.get(name, None)

        if not exists(set_path):
            raise FileNotFoundError(
                'The file {} does not exist'.format(set_path))

        # per-dataset reader arguments, if any were supplied
        kwargs = kwargs_dict.get(name, {})

        dataframe = self.read_from_file(set_path, converter, **kwargs)

        # add to list of datasets
        self.datasets.append({'name': name.strip(),
                              'path': set_path,
                              'frame': dataframe})

    return DataContainer(self.datasets)
def test_drop(self):
    """A dropped dataset must no longer be present in the container."""
    container = DataContainer([{'frame': pd.DataFrame(), 'name': 'test'}])

    container.drop('test')

    assert_false('test' in container)
def get_fairness_analyses(df,
                          group,
                          system_score_column,
                          human_score_column='sc1',
                          base_group=None):
    """
    Compute fairness analyses described in
    `Loukina et al. 2019 <https://www.aclweb.org/anthology/W19-4401/>`_.

    The function computes how much variance group membership explains in
    overall score accuracy (osa), overall score difference (osd), and
    conditional score difference (csd). See the paper for more details.

    Parameters
    ----------
    df: pandas DataFrame
        A dataframe containing columns with numeric human scores, columns
        with numeric system scores and a column with group membership.
    group: str
        Name of the column containing group membership.
    system_score_column: str
        Name of the column containing system scores.
    human_score_column: str
        Name of the column containing human scores.
        Defaults to 'sc1'.
    base_group: str, optional
        Name of the group to use as the reference category. Defaults to
        ``None`` in which case the group with the largest number of cases
        will be used as the reference category. Ties are broken
        alphabetically.

    Returns
    -------
    model_dict: dictionary
        A dictionary with different proposed metrics as keys and fitted
        models as values.
    fairness_container: DataContainer
        A datacontainer with the following datasets:

        - "estimates_<METRIC>_by_<GROUP>" where "<GROUP>" corresponds to
          the given group and "<METRIC>" can be "osa", "osd" and "csd"
          estimates for each group computed by the respective models.
        - "fairness_metrics_by_<GROUP>" - a summary of model fits
          (R2 and p values).
    """
    # local import keeps this fix self-contained; `np.warnings` (used
    # previously) was only an alias and was removed in NumPy 1.25
    import warnings

    # work on a copy so that the helper columns added below
    # ('error', 'SE', 'group', 'sc1_cat') do not leak into the
    # caller's dataframe
    df = df.copy()

    # compute error and squared error
    df['error'] = df[system_score_column] - df[human_score_column]
    df['SE'] = df['error'] ** 2

    # convert group values to category and reorder them using
    # the largest category as reference
    df['group'] = convert_to_ordered_category(df[group],
                                              base_group=base_group)
    base_group = df['group'].cat.categories[0]

    df['sc1_cat'] = convert_to_ordered_category(df[human_score_column])

    # Overall score accuracy (OSA):
    # variance in squared error explained by group membership

    # fit the model
    osa_model = smf.ols(formula='SE ~ group', data=df)
    osa_fit = osa_model.fit()

    # collect the results
    osa_dict = {'R2': osa_fit.rsquared_adj, 'sig': osa_fit.f_pvalue}
    osa_results = pd.Series(osa_dict, name='Overall score accuracy')
    df_coefficients_osa = get_coefficients(osa_fit, base_group)

    # Overall score difference (OSD):
    # variance in signed residuals (raw error) explained by group membership

    # fit the model
    osd_model = smf.ols(formula='error ~ group', data=df)
    osd_fit = osd_model.fit()

    # collect the results
    osd_dict = {'R2': osd_fit.rsquared_adj, 'sig': osd_fit.f_pvalue}
    osd_results = pd.Series(osd_dict, name='Overall score difference')
    df_coefficients_osd = get_coefficients(osd_fit, base_group)

    # Conditional score difference (CSD):
    # variance in score difference conditioned on human score

    # fit "null" model with human score only
    csd_null_mod = smf.ols(formula='error ~ sc1_cat', data=df)
    csd_null_fit = csd_null_mod.fit()

    # fit model with both human score and group
    csd_mod = smf.ols(formula='error ~ group + sc1_cat', data=df)
    csd_fit = csd_mod.fit()

    # compare the two models using anova_lm; we suppress warnings for
    # this call because we get runtime warnings due to NaNs in the data.
    # These seem to be by design:
    # https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
    # Using catch_warnings() scopes the filter to this call only and
    # restores the previous warning state automatically.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        anova_results = anova_lm(csd_null_fit, csd_fit)

    # collect the results. Note that R2 in this case is a difference
    # in R2 between the two models and significance is obtained from anova
    csd_dict = {'R2': csd_fit.rsquared_adj - csd_null_fit.rsquared_adj,
                'sig': anova_results.values[1][-1]}
    csd_results = pd.Series(csd_dict, name='Conditional score difference')
    df_coefficients_csd = get_coefficients(csd_fit, base_group)

    # create a summary table
    df_r2_all = pd.concat([osa_results, osd_results, csd_results],
                          axis=1,
                          sort=True)
    df_r2_all['base_category'] = base_group

    # assemble all datasets into a DataContainer
    datasets = [{'name': 'estimates_osa_by_{}'.format(group),
                 'frame': df_coefficients_osa},
                {'name': 'estimates_osd_by_{}'.format(group),
                 'frame': df_coefficients_osd},
                {'name': 'estimates_csd_by_{}'.format(group),
                 'frame': df_coefficients_csd},
                {'name': 'fairness_metrics_by_{}'.format(group),
                 'frame': df_r2_all}]

    # assemble all models into a dictionary
    model_dict = {'osa': osa_fit, 'osd': osd_fit, 'csd': csd_fit}

    return model_dict, DataContainer(datasets=datasets)