def create_filter_list(self, filter_list):
    """
    Create a list of filter expressions.

    Each filter dictionary is rendered with ``create_on_string``.  When the
    field it names is declared with mode ``nullable`` in the field
    definitions returned by ``fields_for(self.tablename)``, the rendered
    condition is wrapped in an ``exists`` sub-select.  The ``:idataset`` and
    ``:join_tablename`` placeholders are left untouched in the returned
    strings.

    Side effect: for nullable fields the ``qualifier`` entry of the filter
    dictionary is upper-cased in place before the expression is rebuilt.

    :param filter_list:  a list of dictionaries representing the filter
        expression to be used as part of an SQL WHERE clause
    :return:  a list of strings representing the expressions.
    :raises ValueError:  if a filter names a field that has no definition,
        or whose definition has no ``mode``, for this table.
    """
    field_definitions = {
        field_def.get('name'): field_def
        for field_def in fields_for(self.tablename)
    }

    # Loop-invariant template used to wrap conditions on nullable fields.
    nullable_template = (
        ' exists (SELECT * FROM `:idataset.observation` AS record2 '
        'WHERE :join_tablename.observation_id = record2.observation_id '
        'AND {conditional})')

    expressions = []
    for item in filter_list:
        expression, field = create_on_string(item)

        field_definition = field_definitions.get(field)
        mode = field_definition.get('mode') if field_definition else None
        if mode is None:
            # Fail with an actionable message instead of an AttributeError
            # raised by dereferencing None.
            raise ValueError(
                "no field mode defined for field '{}' of table '{}'".format(
                    field, self.tablename))

        # if based on a nullable field, make sure to use the exists function
        if mode.lower() == 'nullable':
            item['qualifier'] = item['qualifier'].upper()
            expression, _ = create_on_string(item)
            expression = nullable_template.format(conditional=expression)

        expressions.append(expression)

    return expressions
def do(self):
    """
    Run de-identification for this table.

    Steps, in order:

    1. Refresh the rules and apply them through ``Deid`` to obtain the
       per-column specifications.
    2. Build the SQL.  If no specification carries an ``'on'`` key, a single
       statement is produced.  Otherwise (a "meta" table) the specifications
       are grouped by their rendered ``'on'`` filter, one statement is built
       per group, one more is added for the unfiltered records, and
       additional DML cleaning statements are gathered.
    3. Substitute the ``:idataset`` and ``:join_tablename`` placeholders.
    4. Depending on ``self.action``: debug, or write the SQL to
       ``<logpath>/<idataset>/<tablename>.sql``; then optionally submit
       and/or simulate.
    """
    self.update_rules()
    deid = Deid(pipeline=self.pipeline, rules=self.deid_rules, parent=self)
    columns = deid.apply(self.table_info, self.store, self.get_tablename())

    is_meta = any('on' in column for column in columns)
    LOGGER.info('table:\t%s\t\tis a meta table:\t%s', self.get_tablename(),
                is_meta)

    if not is_meta:
        sql = [self.to_sql(columns)]
        dml_sql = []
    else:
        #
        # Processing meta tables
        relational_cols = [col for col in columns if 'on' not in col]
        meta_cols = [col for col in columns if 'on' in col]

        # Group the meta columns by their rendered filter expression; dict
        # insertion order keeps the groups in first-seen order.
        groups = {}
        for col in meta_cols:
            on_string, _ = create_on_string(col['on'])
            group = groups.setdefault(on_string, {
                'specification': [],
                'on': {}
            })
            group['specification'].append(col)
            group['on'] = col['on']

        sql = []
        filters = []
        for on_string, group in groups.items():
            filters.append(group['on'])
            sql.append(
                self.to_sql(group['specification'] + relational_cols) +
                ' AND ' + on_string)

        sql.append(self.gather_unfiltered_records(relational_cols, filters))

        # create additional SQL cleaning statements
        dml_sql = self.gather_dml_queries(columns)

    # Resolve the placeholders left in the generated statements.
    sql = [
        statement.replace(':idataset',
                          self.idataset).replace(':join_tablename',
                                                 self.tablename)
        for statement in sql
    ]

    if 'debug' in self.action:
        self.debug(columns)
    else:
        # write SQL to file
        sql_filepath = os.path.join(self.logpath, self.idataset,
                                    self.tablename + '.sql')
        with open(sql_filepath, 'w') as sql_file:
            sql_file.write(
                "\n\nAppend these results to previous results\n\n".join(sql))

            if dml_sql:
                sql_file.write(
                    '\n\nDML SQL statements to execute on de-identified table data\n\n'
                )
                sql_file.write(
                    '\n\n ----------------------------------\n\n'.join(
                        dml_sql))

    # NOTE(review): the original indentation was lost; the submit and
    # simulate steps are treated as independent of the debug branch above.
    # Confirm against the upstream file.
    if 'submit' in self.action:
        for index, statement in enumerate(sql):
            # second argument is True only for the first statement
            self.submit(statement, not index)

        for statement in dml_sql:
            self.submit(statement, False, dml=True)

    if 'simulate' in self.action:
        #
        # Make this threaded if there is a submit action that is associated with it
        self.simulate(columns)

    LOGGER.info('FINISHED de-identification on table:\t%s', self.tablename)