def update_final_subject_identifier_from_cdc(self, df_currentstudyparticipant):
    """Fill missing ``final_subject_identifier`` values from CDC data.

    Two passes are made over the CDC participant rows: the first uses the
    ``htcid`` column as the candidate identifier, the second uses ``ssid``.
    In each pass the rows of ``self.results`` whose
    ``final_subject_identifier`` is still null are inner-joined to the CDC
    rows on ``subject_identifier``. Matches supply
    ``final_subject_identifier`` and set ``final_subject_identifier_source``
    to ``'cdc (htc)'`` or ``'cdc (ccc)'`` respectively, and are folded back
    into ``self.results`` with ``combine_first``. The matched rows of each
    pass are kept on ``self.df_htc`` and ``self.df_ccc``.

    Nothing happens when ``df_currentstudyparticipant`` is empty.

    Side effect: the ``htcid`` and ``ssid`` columns of the frame passed in
    are overwritten with their undashed values.
    """
    if df_currentstudyparticipant.empty:
        return

    df = df_currentstudyparticipant
    for column in ('htcid', 'ssid'):
        # Assign the result back rather than using a chained
        # ``inplace=True`` fillna, which is not guaranteed to write
        # through to the parent frame.
        df[column] = df[column].fillna(value=np.nan).map(undash)
    df = df.replace('unk', np.nan)
    df = df[pd.notnull(df['omangnumber'])]

    def match_unresolved(id_column, source):
        """Resolve still-missing identifiers against one CDC id column."""
        cdc = df[[id_column]].rename(
            columns={id_column: 'subject_identifier_cdc'})
        # pandas matches null merge keys against each other; drop them so
        # a missing CDC id can never "match" a missing result identifier.
        cdc = cdc[pd.notnull(cdc['subject_identifier_cdc'])]
        # Only the key column is taken from results so that no other
        # results column can collide with the CDC column names.
        unresolved = self.results.loc[
            pd.isnull(self.results['final_subject_identifier']),
            ['subject_identifier']]
        matched = pd.merge(
            unresolved, cdc,
            left_on='subject_identifier',
            right_on='subject_identifier_cdc')
        # The matched CDC identifier is the final identifier. (This was
        # previously renamed from a non-existent 'subject_identifier_edc'
        # column, so final_subject_identifier was never populated.)
        matched = matched.rename(
            columns={'subject_identifier_cdc': 'final_subject_identifier'})
        matched['final_subject_identifier_source'] = source
        matched = matched.drop_duplicates().set_index('subject_identifier')
        # combine_first only fills values that are null in self.results.
        self.results = (
            self.results
            .set_index('subject_identifier')
            .combine_first(matched)
            .reset_index())
        return matched.reset_index()

    self.df_htc = match_unresolved('htcid', 'cdc (htc)')
    self.df_ccc = match_unresolved('ssid', 'cdc (ccc)')
def update_final_subject_identifier_from_cdc(self, df_currentstudyparticipant):
    """Fill missing ``final_subject_identifier`` values from CDC data.

    The CDC participant rows are tried twice, first with ``htcid`` and
    then with ``ssid`` as the candidate identifier. For each, rows of
    ``self.results`` that still have a null ``final_subject_identifier``
    are inner-joined to the CDC rows on ``subject_identifier``; matched
    rows receive ``final_subject_identifier`` and a
    ``final_subject_identifier_source`` of ``'cdc (htc)'`` or
    ``'cdc (ccc)'``, and are merged into ``self.results`` with
    ``combine_first``. The matches of each pass are stored on
    ``self.df_htc`` and ``self.df_ccc``.

    Returns without doing anything when ``df_currentstudyparticipant`` is
    empty.

    Side effect: the ``htcid`` and ``ssid`` columns of the frame passed in
    are overwritten with their undashed values.
    """
    if df_currentstudyparticipant.empty:
        return

    df = df_currentstudyparticipant
    for column in ('htcid', 'ssid'):
        # A chained ``df[col].fillna(..., inplace=True)`` is not guaranteed
        # to modify ``df``; assign the filled column back explicitly.
        df[column] = df[column].fillna(value=np.nan)
        df[column] = df[column].map(undash)
    df = df.replace('unk', np.nan)
    df = df[pd.notnull(df['omangnumber'])]

    passes = (
        ('htcid', 'cdc (htc)', 'df_htc'),
        ('ssid', 'cdc (ccc)', 'df_ccc'),
    )
    for id_column, source, attr in passes:
        cdc_ids = df[[id_column]].rename(
            columns={id_column: 'subject_identifier_cdc'})
        # Null keys are matched to each other by pandas merge; exclude
        # them so missing CDC ids never pair with missing result ids.
        cdc_ids = cdc_ids[pd.notnull(cdc_ids['subject_identifier_cdc'])]

        # Take only the key column from results to avoid column-name
        # collisions with the CDC side of the merge.
        is_unresolved = pd.isnull(self.results['final_subject_identifier'])
        unresolved = self.results.loc[is_unresolved, ['subject_identifier']]

        matched = pd.merge(
            unresolved, cdc_ids,
            left_on='subject_identifier',
            right_on='subject_identifier_cdc')
        # Previously the rename looked for 'subject_identifier_edc', which
        # this merge never produces, so final_subject_identifier was left
        # null even though the source column was stamped.
        matched = matched.rename(
            columns={'subject_identifier_cdc': 'final_subject_identifier'})
        matched['final_subject_identifier_source'] = source
        matched = matched.drop_duplicates()

        matched = matched.set_index('subject_identifier')
        results = self.results.set_index('subject_identifier')
        # combine_first keeps existing non-null values in results.
        self.results = results.combine_first(matched).reset_index()
        setattr(self, attr, matched.reset_index())
def fetch_results_as_dataframe(self, edc_panels=None):
    """Run ``self.sql_results`` and return the cleaned rows as a DataFrame.

    The query is executed on a connection from ``self.engine``. The
    returned frame is then normalised:

    * ``result``: the characters ``<``, ``>``, ``*`` and ``=`` are
      removed; an empty string becomes NaN.
    * UTC-aware datetime columns are made timezone-naive.
    * ``result_datetime``, ``received_datetime`` and ``drawn_datetime``
      are parsed as datetimes; ``drawn_datetime`` is truncated to its date.
    * ``specimen_identifier``: the literal ``'NA'`` becomes NaN.
    * ``aliquot_identifier`` and ``edc_specimen_identifier`` are computed
      per row by the methods of the same name.
    * ``subject_identifier`` is passed through ``undash`` with the
      pattern ``'^<protocol_prefix>-'``.
    * ``final_subject_identifier`` is the ``subject_identifier`` where it
      starts with ``'<protocol_prefix>-'`` (NaN otherwise), and
      ``final_subject_identifier_source`` is ``'lis'`` for those rows.

    ``edc_panels`` is accepted but not used here.
    """
    with self.engine.connect() as conn, conn.begin():
        df = pd.read_sql_query(self.sql_results, conn)

    df.fillna(value=np.nan, inplace=True)

    # regex=False keeps '*' a literal on every pandas version (the default
    # for ``regex`` changed between pandas 1.x and 2.x).
    for char in '<>*=':
        df['result'] = df['result'].str.replace(char, '', regex=False)
    df['result'] = df['result'].mask(df['result'] == '', np.nan)

    for column in list(
            df.select_dtypes(include=['datetime64[ns, UTC]']).columns):
        # ``astype('datetime64[ns]')`` on a tz-aware column raises in
        # pandas 2.x; dropping the UTC zone yields the same naive values.
        df[column] = df[column].dt.tz_localize(None)

    df['result_datetime'] = pd.to_datetime(df['result_datetime'])
    df['received_datetime'] = pd.to_datetime(df['received_datetime'])
    df['drawn_datetime'] = pd.to_datetime(df['drawn_datetime'])
    df['drawn_datetime'] = pd.to_datetime(df['drawn_datetime'].dt.date)

    df['specimen_identifier'] = df['specimen_identifier'].mask(
        df['specimen_identifier'] == 'NA', np.nan)

    # Iterate rows explicitly: ``df.apply(..., axis=1)`` returns a
    # DataFrame (not a Series) for an empty frame, which cannot be
    # assigned to a single column.
    df['aliquot_identifier'] = [
        self.aliquot_identifier(row) for _, row in df.iterrows()]
    df['edc_specimen_identifier'] = [
        self.edc_specimen_identifier(row, self.protocol_prefix)
        for _, row in df.iterrows()]

    pattern = '^{}-'.format(self.protocol_prefix)
    df['subject_identifier'] = df['subject_identifier'].map(
        lambda value: undash(value, pattern))

    # na=False: a missing identifier is simply "no match" instead of
    # producing a NaN in the boolean mask.
    has_prefix = df['subject_identifier'].str.startswith(
        '{}-'.format(self.protocol_prefix), na=False)
    df['final_subject_identifier'] = df['subject_identifier'].where(
        has_prefix)
    df['final_subject_identifier_source'] = (
        df['final_subject_identifier'].map(
            lambda value: np.nan if pd.isnull(value) else 'lis'))
    return df
def fetch_results_as_dataframe(self, edc_panels=None):
    """Query the results and return them as a cleaned DataFrame.

    ``self.sql_results`` is executed on a connection obtained from
    ``self.engine``; the rows are then tidied as follows:

    * ``result`` has ``<``, ``>``, ``*`` and ``=`` stripped, and an empty
      string is replaced by NaN.
    * Columns of dtype ``datetime64[ns, UTC]`` are made timezone-naive.
    * ``result_datetime``, ``received_datetime`` and ``drawn_datetime``
      are converted with ``pd.to_datetime``; ``drawn_datetime`` keeps only
      its date part.
    * ``specimen_identifier`` equal to ``'NA'`` is replaced by NaN.
    * ``aliquot_identifier`` and ``edc_specimen_identifier`` are derived
      per row via ``self.aliquot_identifier`` and
      ``self.edc_specimen_identifier``.
    * ``subject_identifier`` is run through ``undash`` using the pattern
      ``'^<protocol_prefix>-'``.
    * ``final_subject_identifier`` copies ``subject_identifier`` where it
      starts with ``'<protocol_prefix>-'`` and is NaN elsewhere;
      ``final_subject_identifier_source`` is ``'lis'`` on those rows.

    The ``edc_panels`` argument is not used by this method.
    """
    with self.engine.connect() as conn, conn.begin():
        df = pd.read_sql_query(self.sql_results, conn)

    df.fillna(value=np.nan, inplace=True)

    # Literal (non-regex) removal so '*' is never parsed as a regex
    # quantifier, whatever the installed pandas default for ``regex`` is.
    cleaned = df['result']
    for char in ('<', '>', '*', '='):
        cleaned = cleaned.str.replace(char, '', regex=False)
    df['result'] = cleaned.mask(cleaned == '', np.nan)

    utc_columns = list(
        df.select_dtypes(include=['datetime64[ns, UTC]']).columns)
    for column in utc_columns:
        # tz-aware -> naive via ``astype`` is rejected by pandas 2.x;
        # tz_localize(None) drops the UTC zone and keeps the same values.
        df[column] = df[column].dt.tz_localize(None)

    for column in ('result_datetime', 'received_datetime', 'drawn_datetime'):
        df[column] = pd.to_datetime(df[column])
    df['drawn_datetime'] = pd.to_datetime(df['drawn_datetime'].dt.date)

    specimen = df['specimen_identifier']
    df['specimen_identifier'] = specimen.mask(specimen == 'NA', np.nan)

    # Explicit row iteration instead of ``df.apply(..., axis=1)``: on an
    # empty frame ``apply`` hands back a DataFrame, which cannot be stored
    # in a single column.
    df['aliquot_identifier'] = [
        self.aliquot_identifier(row) for _, row in df.iterrows()]
    df['edc_specimen_identifier'] = [
        self.edc_specimen_identifier(row, self.protocol_prefix)
        for _, row in df.iterrows()]

    prefix = '{}-'.format(self.protocol_prefix)
    undash_pattern = '^{}-'.format(self.protocol_prefix)
    df['subject_identifier'] = df['subject_identifier'].map(
        lambda value: undash(value, undash_pattern))

    # na=False treats missing identifiers as non-matching so the mask is
    # always strictly boolean.
    is_protocol_id = df['subject_identifier'].str.startswith(prefix, na=False)
    df['final_subject_identifier'] = df['subject_identifier'].where(
        is_protocol_id)
    df['final_subject_identifier_source'] = (
        df['final_subject_identifier'].map(
            lambda value: np.nan if pd.isnull(value) else 'lis'))
    return df