def test_vmerge_blind_append(self):
    """Vertically merge two subsets without naming any key column.

    Two subsets of example dataset A are built (row slices ``[:10]`` and
    ``[5:15]``, with partly different column selections) and passed to
    ``vmerge`` with no ``on``/``left_on``/``right_on`` argument. The result
    is handed to ``verify_vmerge_data`` with ``blind_append=True``.
    """
    source_meta = self.example_data_A_meta
    source_data = self.example_data_A_data

    # Left-hand dataset: slice [:10] of the example data.
    left_columns = [
        'unique_id', 'gender', 'locality', 'ethnicity', 'q2', 'q3'
    ]
    meta_l, data_l = subset_dataset(
        source_meta, source_data[:10], columns=left_columns)

    # Right-hand dataset: slice [5:15], sharing only some columns
    # ('unique_id', 'gender', 'q2') with the left-hand selection.
    right_columns = [
        'unique_id', 'gender', 'religion', 'q1', 'q2', 'q8', 'q9'
    ]
    meta_r, data_r = subset_dataset(
        source_meta, source_data[5:15], columns=right_columns)

    # vmerge is called without any key arguments here.
    meta_vm, data_vm = vmerge(
        (meta_l, data_l),
        (meta_r, data_r),
        verbose=False)

    # Delegate the checks on the merged frame/meta to the shared helper.
    verify_vmerge_data(
        self, data_l, data_r, data_vm, meta_vm,
        blind_append=True)
def test_subset_dataset(self):
    """Check that ``subset_dataset`` returns consistent meta and data.

    For each of two (row slice, column list) selections taken from example
    dataset A, the test asserts that:

    * the keys of ``meta['columns']`` are exactly the requested columns,
    * the column names referenced by ``meta['sets']['data file']['items']``
      (the part after the last ``'@'``) match both the meta columns and
      the data columns,
    * the data columns are exactly the requested columns,
    * the data has the expected ``(rows, columns)`` shape.
    """
    meta = self.example_data_A_meta
    data = self.example_data_A_data

    # Each case: (row slice, requested columns, expected number of rows).
    # slice(None, 10) is what ``data[:10]`` passes to __getitem__, and
    # slice(5, 15) is what ``data[5:15]`` passes.
    cases = [
        (
            slice(None, 10),
            ['unique_id', 'gender', 'locality', 'ethnicity', 'q2', 'q3'],
            10,
        ),
        (
            slice(5, 15),
            ['unique_id', 'gender', 'religion', 'q1', 'q2', 'q8', 'q9'],
            10,
        ),
    ]

    for row_slice, subset_columns, expected_rows in cases:
        sub_meta, sub_data = subset_dataset(
            meta, data[row_slice], columns=subset_columns)

        # Meta must describe exactly the requested columns.
        self.assertCountEqual(sub_meta['columns'].keys(), subset_columns)

        # 'data file' items are references; the column name is the text
        # after the last '@'.
        datafile_items = sub_meta['sets']['data file']['items']
        datafile_columns = [item.split('@')[-1] for item in datafile_items]
        self.assertCountEqual(
            sub_meta['columns'].keys(), datafile_columns)
        self.assertCountEqual(
            sub_data.columns.tolist(), datafile_columns)

        # Data must carry exactly the requested columns and rows.
        self.assertCountEqual(
            sub_data.columns.tolist(), subset_columns)
        self.assertEqual(
            sub_data.shape, (expected_rows, len(subset_columns)))
def test_hmerge_vmerge_basic(self):
    """Run an hmerge followed by a vmerge on subsets of example dataset A.

    Both subsets use the row slice ``[:10]`` with different column
    selections. They are first merged horizontally on ``'unique_id'``;
    the hmerge result is then merged vertically with the right-hand
    subset, again keyed on ``'unique_id'``. Each result is checked by the
    matching ``verify_*_data`` helper.
    """
    meta, data = self.example_data_A_meta, self.example_data_A_data
    key = 'unique_id'

    # Left-hand subset.
    cols_left = [
        'unique_id', 'gender', 'locality', 'ethnicity', 'q2', 'q3'
    ]
    meta_l, data_l = subset_dataset(meta, data[:10], columns=cols_left)
    left = (meta_l, data_l)

    # Right-hand subset. It uses the same rows as the left-hand one; the
    # original carried an inline "#5:15" note next to this slice,
    # presumably recording an earlier or alternative row window.
    cols_right = [
        'unique_id', 'gender', 'religion', 'q1', 'q2', 'q8', 'q9'
    ]
    meta_r, data_r = subset_dataset(meta, data[:10], columns=cols_right)
    right = (meta_r, data_r)

    # Step 1: horizontal merge keyed on unique_id, then verify.
    meta_hm, data_hm = hmerge(
        left, right,
        left_on=key, right_on=key,
        verbose=False)
    verify_hmerge_data(self, data_l, data_r, data_hm, meta_hm)

    # Step 2: vertical merge of the hmerge result with the right-hand
    # subset, then verify against the hmerge data as the "left" input.
    hmerged = (meta_hm, data_hm)
    meta_vm, data_vm = vmerge(
        hmerged, right,
        left_on=key, right_on=key,
        verbose=False)
    verify_vmerge_data(self, data_hm, data_r, data_vm, meta_vm)
def test_vmerge_row_id(self):
    """Check the row-id column that ``vmerge`` adds via ``row_id_name``.

    Two subsets of example dataset A (row slices ``[:10]`` and ``[5:15]``)
    are merged three times on ``'unique_id'`` with
    ``row_id_name='DataSource'`` and different ``left_id``/``right_id``
    values. For every run the test asserts:

    * the meta entry ``meta['columns']['DataSource']`` has the expected
      ``'type'`` (``'int'``, ``'float'`` or ``'str'``),
    * the ``DataSource`` column in the merged data has the matching dtype,
    * ``verify_vmerge_data`` accepts the merged result.
    """
    meta = self.example_data_A_meta
    data = self.example_data_A_data

    # Left-hand subset.
    subset_columns_l = [
        'unique_id', 'gender', 'locality', 'ethnicity', 'q2', 'q3'
    ]
    meta_l, data_l = subset_dataset(
        meta, data[:10], columns=subset_columns_l)
    dataset_left = (meta_l, data_l)

    # Right-hand subset.
    subset_columns_r = [
        'unique_id', 'gender', 'religion', 'q1', 'q2', 'q8', 'q9'
    ]
    meta_r, data_r = subset_dataset(
        meta, data[5:15], columns=subset_columns_r)
    dataset_right = (meta_r, data_r)

    # Each case:
    #   (left_id, right_id passed to vmerge,
    #    expected meta 'type',
    #    expected dtype name, or None when a string dtype is expected,
    #    right_id passed to verify_vmerge_data)
    # In the mixed str/float case the verification helper is given the
    # right id as the string '2.0' rather than the float 2.0 -- the test
    # expects the ids to end up as strings there (presumably coerced by
    # vmerge; confirm against its implementation).
    cases = [
        (1, 2, 'int', 'int64', 2),
        (1, 2.0, 'float', 'float64', 2.0),
        ('W1', 2.0, 'str', None, '2.0'),
    ]

    for left_id, right_id, meta_type, dtype_name, verify_right_id in cases:
        meta_vm, data_vm = vmerge(
            dataset_left, dataset_right,
            on='unique_id',
            row_id_name='DataSource',
            left_id=left_id, right_id=right_id,
            verbose=False)

        # Meta entry created for the row-id column.
        expected = {
            'text': {'en-GB': 'vmerge row id'},
            'type': meta_type,
            'name': 'DataSource'
        }
        actual = meta_vm['columns']['DataSource']
        self.assertEqual(actual, expected)

        # dtype of the row-id column; assertEqual (rather than
        # assertTrue(a == b)) reports both values on failure.
        if dtype_name is None:
            # String ids are checked with is_string_dtype rather than an
            # equality against a dtype name.
            self.assertTrue(is_string_dtype(data_vm['DataSource']))
        else:
            self.assertEqual(data_vm['DataSource'].dtype, dtype_name)

        # Shared checks on the merged dataframe and meta.
        verify_vmerge_data(
            self, data_l, data_r, data_vm, meta_vm,
            row_id_name='DataSource',
            left_id=left_id, right_id=verify_right_id)