Exemple #1
0
    def update_final_subject_identifier_from_cdc(self,
                                                 df_currentstudyparticipant):
        """Fill unresolved final subject identifiers from CDC data.

        Rows of ``self.results`` whose ``final_subject_identifier`` is
        null are matched, by ``subject_identifier``, first against the
        CDC ``htcid`` column and then against the CDC ``ssid`` column.
        Matched rows get ``final_subject_identifier`` set to the matching
        CDC identifier and ``final_subject_identifier_source`` set to
        ``'cdc (htc)'`` or ``'cdc (ccc)'`` respectively.

        The matches of each pass are kept on ``self.df_htc`` and
        ``self.df_ccc``. Nothing happens when the given frame is empty.

        Note: the ``htcid`` and ``ssid`` columns of the passed frame are
        normalised (``undash``) in place, as before.

        :param df_currentstudyparticipant: CDC frame with at least the
            columns ``htcid``, ``ssid`` and ``omangnumber``.
        """
        if df_currentstudyparticipant.empty:
            return
        df = df_currentstudyparticipant
        for column in ('htcid', 'ssid'):
            # Assign the result back: a chained ``inplace`` fillna on a
            # column selection is not guaranteed to update the frame.
            df[column] = df[column].fillna(value=np.nan)
            df[column] = df.apply(
                lambda row, column=column: undash(row[column]), axis=1)
        df = df.replace('unk', np.nan)
        # Only CDC rows that carry an omang number are considered.
        df = df[pd.notnull(df['omangnumber'])]
        # Order matters: the CCC pass only sees rows the HTC pass left
        # unresolved.
        self.df_htc = self._match_cdc_identifiers(df, 'htcid', 'cdc (htc)')
        self.df_ccc = self._match_cdc_identifiers(df, 'ssid', 'cdc (ccc)')

    def _match_cdc_identifiers(self, df, column, source):
        """Resolve unresolved ``self.results`` rows found in ``df[column]``.

        Updates ``self.results`` and returns the frame of matches with the
        columns ``subject_identifier``, ``final_subject_identifier`` and
        ``final_subject_identifier_source``.
        """
        cdc_identifiers = df[[column]].rename(
            columns={column: 'subject_identifier_cdc'})
        unresolved = self.results[pd.isnull(
            self.results['final_subject_identifier'])]
        # Merge on the identifier column only, so columns that happen to
        # share a name on both sides cannot shadow the CDC identifier.
        matched = pd.merge(
            unresolved[['subject_identifier']],
            cdc_identifiers,
            left_on='subject_identifier',
            right_on='subject_identifier_cdc')
        # The merged column is ``subject_identifier_cdc``; renaming a
        # non-existent ``subject_identifier_edc`` was a silent no-op that
        # left ``final_subject_identifier`` empty.
        matched = matched.rename(
            columns={'subject_identifier_cdc': 'final_subject_identifier'})
        matched['final_subject_identifier_source'] = source
        matched = matched.drop_duplicates().set_index('subject_identifier')
        # combine_first only fills values that are null in self.results.
        self.results = (
            self.results.set_index('subject_identifier')
            .combine_first(matched)
            .reset_index())
        return matched.reset_index()
Exemple #2
0
    def update_final_subject_identifier_from_cdc(self, df_currentstudyparticipant):
        """Resolve missing final subject identifiers using CDC HTC/CCC data.

        For every row of ``self.results`` that has no
        ``final_subject_identifier`` yet, look for its
        ``subject_identifier`` among the CDC ``htcid`` values and, for
        rows still unresolved afterwards, among the CDC ``ssid`` values.
        A match sets ``final_subject_identifier`` to the CDC identifier
        and ``final_subject_identifier_source`` to ``'cdc (htc)'`` or
        ``'cdc (ccc)'``.

        Side effects: ``self.results`` is replaced by the updated frame,
        the per-pass matches are stored on ``self.df_htc`` and
        ``self.df_ccc``, and the ``htcid``/``ssid`` columns of the passed
        frame are normalised with ``undash`` in place. An empty input
        frame is ignored.

        :param df_currentstudyparticipant: CDC frame providing the
            columns ``htcid``, ``ssid`` and ``omangnumber``.
        """
        if df_currentstudyparticipant.empty:
            return

        df = df_currentstudyparticipant
        # Plain assignment instead of a chained ``inplace`` fillna, which
        # may operate on a temporary copy of the column.
        df['htcid'] = df['htcid'].fillna(value=np.nan)
        df['htcid'] = df.apply(lambda row: undash(row['htcid']), axis=1)
        df['ssid'] = df['ssid'].fillna(value=np.nan)
        df['ssid'] = df.apply(lambda row: undash(row['ssid']), axis=1)
        df = df.replace('unk', np.nan)
        # CDC rows without an omang number are not used for matching.
        df = df[pd.notnull(df['omangnumber'])]

        def resolve(id_column, source_label):
            """Fill unresolved results from ``df[id_column]``; return the matches."""
            candidates = df[[id_column]].rename(columns={id_column: 'subject_identifier_cdc'})
            pending = self.results[pd.isnull(self.results['final_subject_identifier'])]
            # Only the key column is taken from the left side so that no
            # same-named column in self.results can replace the CDC one.
            hits = pd.merge(
                pending[['subject_identifier']],
                candidates,
                left_on='subject_identifier',
                right_on='subject_identifier_cdc')
            # Previously the rename targeted 'subject_identifier_edc', a
            # column that does not exist here, so nothing was renamed and
            # final_subject_identifier stayed null.
            hits = hits.rename(columns={'subject_identifier_cdc': 'final_subject_identifier'})
            hits['final_subject_identifier_source'] = source_label
            hits = hits.drop_duplicates().set_index('subject_identifier')
            # combine_first keeps existing non-null values in self.results.
            indexed_results = self.results.set_index('subject_identifier')
            self.results = indexed_results.combine_first(hits).reset_index()
            return hits.reset_index()

        # HTC first; the CCC pass then only considers what is still unresolved.
        self.df_htc = resolve('htcid', 'cdc (htc)')
        self.df_ccc = resolve('ssid', 'cdc (ccc)')
Exemple #3
0
 def fetch_results_as_dataframe(self, edc_panels=None):
     """Run ``self.sql_results`` and return the cleaned result frame.

     The query is executed on a connection from ``self.engine``. The
     returned frame is then normalised:

     * ``result``: the characters ``<``, ``>``, ``*`` and ``=`` are
       removed and empty strings become NaN.
     * UTC-aware datetime columns are made timezone-naive; the
       ``result_datetime``, ``received_datetime`` and ``drawn_datetime``
       columns are parsed as datetimes and ``drawn_datetime`` is
       truncated to its date.
     * ``specimen_identifier``: the literal ``'NA'`` becomes NaN.
     * ``aliquot_identifier`` and ``edc_specimen_identifier`` are
       computed per row by the methods of the same name.
     * ``subject_identifier`` is passed through ``undash``.
     * ``final_subject_identifier`` is the subject identifier where it
       starts with ``'<protocol_prefix>-'`` (NaN otherwise), and
       ``final_subject_identifier_source`` is ``'lis'`` for those rows.

     :param edc_panels: not used by this method; kept so existing
         callers that pass it keep working.
     :return: the cleaned ``pandas.DataFrame``.
     """
     with self.engine.connect() as conn, conn.begin():
         df = pd.read_sql_query(self.sql_results, conn)
     df.fillna(value=np.nan, inplace=True)

     # One explicit regex pass instead of four replaces whose literal vs
     # regex interpretation depends on the pandas version default.
     df['result'] = df['result'].str.replace(r'[<>*=]', '', regex=True)
     df['result'] = df['result'].mask(df['result'] == '')

     for column in df.select_dtypes(
             include=['datetime64[ns, UTC]']).columns:
         # astype() to a naive dtype is rejected for tz-aware data by
         # pandas 2.x; tz_localize(None) drops the UTC zone instead.
         df[column] = df[column].dt.tz_localize(None)
     for column in ('result_datetime', 'received_datetime',
                    'drawn_datetime'):
         df[column] = pd.to_datetime(df[column])
     # Keep only the date part of the draw time.
     df['drawn_datetime'] = pd.to_datetime(df['drawn_datetime'].dt.date)

     df['specimen_identifier'] = df['specimen_identifier'].mask(
         df['specimen_identifier'] == 'NA')
     df['aliquot_identifier'] = df.apply(self.aliquot_identifier, axis=1)
     df['edc_specimen_identifier'] = df.apply(
         lambda row: self.edc_specimen_identifier(row, self.protocol_prefix),
         axis=1)
     df['subject_identifier'] = df.apply(lambda row: undash(
         row['subject_identifier'], '^{}-'.format(self.protocol_prefix)),
                                         axis=1)

     # na=False: a null identifier must count as "no prefix"; otherwise
     # the mask holds NaN and cannot be used for selection.
     has_prefix = df['subject_identifier'].str.startswith(
         '{}-'.format(self.protocol_prefix), na=False)
     df['final_subject_identifier'] = df['subject_identifier'].where(
         has_prefix)
     df['final_subject_identifier_source'] = pd.Series(
         'lis', index=df.index).where(
             df['final_subject_identifier'].notnull())
     return df
Exemple #4
0
 def fetch_results_as_dataframe(self, edc_panels=None):
     """Query the LIS results and return them as a tidy DataFrame.

     Executes ``self.sql_results`` through ``self.engine`` and cleans
     the frame before returning it:

     * strips ``<``, ``>``, ``*`` and ``=`` from ``result`` and turns
       empty results into NaN;
     * converts UTC-aware datetime columns to timezone-naive ones,
       parses the result/received/drawn datetime columns and reduces
       ``drawn_datetime`` to its date;
     * replaces the ``'NA'`` placeholder in ``specimen_identifier``
       with NaN;
     * derives ``aliquot_identifier`` and ``edc_specimen_identifier``
       row by row via the methods of the same name;
     * normalises ``subject_identifier`` with ``undash``;
     * sets ``final_subject_identifier`` for identifiers that start with
       ``'<protocol_prefix>-'`` and tags those rows with the source
       ``'lis'``.

     :param edc_panels: unused here; retained for backward compatibility
         of the signature.
     :return: the cleaned ``pandas.DataFrame``.
     """
     with self.engine.connect() as conn, conn.begin():
         df = pd.read_sql_query(self.sql_results, conn)
     df.fillna(value=np.nan, inplace=True)

     # Explicit regex=True so the character class behaves the same on
     # every pandas version (the default for ``regex`` has changed).
     cleaned = df['result'].str.replace(r'[<>*=]', '', regex=True)
     df['result'] = cleaned.mask(cleaned == '')

     utc_columns = df.select_dtypes(include=['datetime64[ns, UTC]']).columns
     for name in utc_columns:
         # pandas 2.x refuses astype('datetime64[ns]') on tz-aware data;
         # dropping the zone explicitly works on old and new versions.
         df[name] = df[name].dt.tz_localize(None)

     df['result_datetime'] = pd.to_datetime(df['result_datetime'])
     df['received_datetime'] = pd.to_datetime(df['received_datetime'])
     drawn = pd.to_datetime(df['drawn_datetime'])
     # Only the date of the draw is kept.
     df['drawn_datetime'] = pd.to_datetime(drawn.dt.date)

     specimen = df['specimen_identifier']
     df['specimen_identifier'] = specimen.where(specimen != 'NA')
     df['aliquot_identifier'] = df.apply(self.aliquot_identifier, axis=1)
     df['edc_specimen_identifier'] = df.apply(
         lambda row: self.edc_specimen_identifier(row, self.protocol_prefix), axis=1)

     pattern = '^{}-'.format(self.protocol_prefix)
     df['subject_identifier'] = df.apply(
         lambda row: undash(row['subject_identifier'], pattern), axis=1)

     # na=False treats null identifiers as non-matching; without it the
     # mask contains NaN and boolean selection fails.
     prefixed = df['subject_identifier'].str.startswith(
         '{}-'.format(self.protocol_prefix), na=False)
     final = df['subject_identifier'].where(prefixed)
     df['final_subject_identifier'] = final
     # Null stays null; every resolved row is labelled 'lis'.
     df['final_subject_identifier_source'] = final.where(final.isnull(), 'lis')
     return df