def resort(self):
    """Sort the key table by its key columns and rebuild the position index.

    After the in-place sort, every key hash in map_dict is repointed at
    the row's new position in col_map.
    """
    self.col_map.sort_values(by=self.columns, inplace=True, ignore_index=True)
    for position, entry in self.col_map.iterrows():
        self.map_dict[get_row_hash(entry, self.columns)] = position
def test_lookup_cols(self):
    """Keys from the map file and from a second file should both resolve to matching rows."""
    key_map = KeyMap(self.key_cols, self.target_cols)
    map_frame = pd.read_csv(self.stern_map_path, delimiter='\t', header=0,
                            keep_default_na=False, na_values=",null")
    key_map.update(map_frame)
    mapped = key_map.col_map
    # Every row of the file the map was built from must be found at its recorded position.
    for _, entry in map_frame.iterrows():
        entry_hash = get_row_hash(entry, self.key_cols)
        position = key_map.map_dict[entry_hash]
        self.assertEqual(mapped.iloc[position]['type'], entry['type'],
                         "The key should be looked up for same map")
    # A different file with the same key columns must resolve through the same map.
    other_frame = pd.read_csv(self.stern_test1_path, delimiter='\t', header=0)
    for _, entry in other_frame.iterrows():
        entry_hash = get_row_hash(entry, self.key_cols)
        position = key_map.map_dict[entry_hash]
        self.assertEqual(mapped.iloc[position]['type'], entry['type'],
                         "The key should be looked up for other file")
def test_get_row_hash(self):
    """get_row_hash should produce a distinct hash for every key combination in the map."""
    # Consistency fix: sibling tests use pd.read_csv; a bare read_csv is not
    # otherwise referenced in this file and would raise NameError unless
    # separately imported.
    stern_df = pd.read_csv(self.stern_map_path, delimiter='\t', header=0,
                           keep_default_na=False, na_values=",null")
    key_columns = ['type', 'event_type']
    my_map = {}
    for index, row in stern_df.iterrows():
        key = get_row_hash(row, key_columns)
        my_map[key] = index
    # One dict entry per row iff every row hashed uniquely (len(d) == len(d.keys())).
    self.assertEqual(len(my_map), len(stern_df),
                     "get_row_hash should uniquely hash all of the keys in stern map")
def _update(self, base_df):
    """ Takes DataFrame objects containing keys and counts every occurrence.

    Args:
        base_df (DataFrame): DataFrame of consisting of the columns in the KeyMap

    Unseen keys are appended to col_map and indexed in map_dict; every row
    (new or repeated) increments count_dict for its key.
    """
    for index, row in base_df.iterrows():
        key = get_row_hash(row, self.columns)
        if key not in self.map_dict:
            # New key: its position is the current end of the key table.
            self.map_dict[key] = len(self.col_map)
            # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
            # concatenate a one-row frame instead (same ignore_index semantics).
            self.col_map = pd.concat([self.col_map, row[self.columns].to_frame().T],
                                     ignore_index=True)
            self.count_dict[key] = 0
        self.count_dict[key] += 1
def _update(self, base_df):
    """ Takes DataFrame objects containing keys and reports rows whose keys already exist.

    Args:
        base_df (DataFrame): DataFrame of consisting of the columns in the KeyMap

    Returns:
        duplicate_indices (list): List of key positions that were duplicated

    Note: rows with an already-registered key are NOT overwritten here; they
    are skipped and their positions returned so the caller can decide.
    """
    duplicate_indices = []
    for index, row in base_df.iterrows():
        key = get_row_hash(row, self.key_cols)
        if key not in self.map_dict:
            self.map_dict[key] = len(self.col_map)
            # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
            # concatenate a one-row frame instead (same ignore_index semantics).
            self.col_map = pd.concat([self.col_map, row.to_frame().T],
                                     ignore_index=True)
        else:
            duplicate_indices.append(index)
    return duplicate_indices
def _remap(self, df):
    """ Utility method that iterates through df to do the replacements

    Args:
        df (DataFrame): DataFrame in which to perform the mapping

    Returns:
        list List of row numbers that had no correspondence in the mapping
    """
    missing_indices = []
    for index, row in df.iterrows():
        key = get_row_hash(row, self.key_cols)
        key_value = self.map_dict.get(key, None)
        # Bug fix: position 0 is falsy, so a plain truthiness test wrongly
        # treated the first mapped key as missing. Test for None explicitly.
        if key_value is not None:
            result = self.col_map.iloc[key_value]
            row[self.target_cols] = result[self.target_cols].values
            # NOTE(review): df.iloc[index] pairs a positional setter with the
            # iterrows label — correct only for a default RangeIndex; confirm
            # callers reset_index before remapping.
            df.iloc[index] = row
        else:
            missing_indices.append(index)
    return missing_indices
def print(self, file=None):
    """Write the occurrence count of every key combination to file.

    Args:
        file: Open file-like destination; None writes to stdout.
    """
    print(f"Counts for key [{str(self.columns)}]:", file=file)
    for _, entry in self.col_map.iterrows():
        entry_hash = get_row_hash(entry, self.columns)
        print(f"{str(list(entry.values))}\t{self.count_dict[entry_hash]}", file=file)