def fetch_and_fold(table, engine, targets, n_reports):
    """ Fetch reports from the vcare database and fold them.

    Parameters
    ----------
    For the definition of parameters, see the arguments of `main_fetch_and_fold`.
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # data used to train the model
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d', unit='D')

    df_reports = sql2df(engine, table) \
        .loc[:, ['original_date', 'patient_id', 'report']]

    # drop rows with no report text
    mask = [report is not None for report in df_reports['report']]
    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
    df_reports = df_reports.loc[mask]

    # joining features df with complete patient information
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
                                  right_on='id').drop('id', axis=1)

    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]

    # folding frames so that they have the same columns
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
    reports_folded = folder.fold(df_reports)

    reports_folded.dropna(inplace=True)
    reports_folded.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first `n_reports` reports
    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    reports_folded = reports_folded.groupby(key1, as_index=False) \
        .agg(group_dict)

    # parsing and vectorising text reports
    sections = ['examens complementaire', 'hopital de jour',
                'examen du patient']
    # NB: `sections` is defined but not passed to the parser (sections=None)
    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
                           col_name='value')

    reports_folded['value'] = parser.transform(reports_folded)

    return reports_folded
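# Minimal usage sketch for the function above. `get_engine` and the
# credentials mirror the script further down; the reports table name
# ('text_reports') is purely illustrative, not a confirmed table of the
# vcare database.
engine = get_engine('vcare', 'srt-ap-92', 'vcare')
reports_folded = fetch_and_fold('text_reports', engine,
                                targets='patient_target', n_reports=1)
print(reports_folded.shape)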
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, 'patient_target_simbad')
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # fetching features
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)

    # parse dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename({'Date prelvt': date,
                   'Analyse': 'feature',
                   'Resultat': 'value'},
                  inplace=True, axis=1)

    # joining with targets
    df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)
    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)

    # keep only numeric values
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]

    # df_bio already folded
    print('done')

    return df_bio
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d', unit='D')

    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)

    # parse dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           format='%Y%m%d').dt.date

    df_bio.rename({'Date prelvt': date,
                   'Analyse': 'feature',
                   'Resultat': 'value'},
                  inplace=True, axis=1)

    # joining with targets
    # df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
    #                       right_on='nip').drop('NIP', axis=1)
    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
    df_bio.drop('NIP', axis=1, inplace=True)

    # keep only results dated up to C1J1 (?)
    df_bio['C1J1'] = pd.to_datetime(df_bio['C1J1'], format='%Y%m%d').dt.date
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio.drop('C1J1', axis=1, inplace=True)

    # removing non-float values
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio already folded
    print('done')

    return df_bio
def fetch_and_fold(path, engine, targets, n_reports):
    """ Fetch reports from the simbad data (Excel export).

    Parameters
    ----------
    For the definition of parameters, see the arguments of `main_fetch_and_fold`.
    """
    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    df = pd.read_excel(path)

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)

    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the date columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # taking only consultation reports
    df = df[df['CR NAT'] == 'CC']

    # mask to get only the first one
    mask = (df['date'] == df['DATE_SIGN_OK'])
    df = df[mask]
    # df_rh = df[df['CR NAT'] == 'RH']

    # taking only the first for each patient
    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first reports
    group_dict = {'date': 'first',
                  'DATE_SIGN_OK': 'last',
                  'value': lambda g: ' '.join(g)}
    df = df.groupby('nip', as_index=False).agg(group_dict)

    # # filter uninformative reports and taking the first
    # df_rh = df_rh[~(df_rh['value'].str.match('Examen du', na=False))]
    # df_rh.dropna(inplace=True)
    # df_rh.drop_duplicates(subset=['value'], inplace=True)
    #
    # # taking only the first reports
    # df_rh = df_rh.groupby('nip', as_index=False).agg(group_dict)
    #
    # df = pd.concat([df_cc, df_rh], ignore_index=True)

    # removing useless tags (blocks parsing)
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    # filter date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    df = df.merge(df_targets, on='nip')

    parser = ReportsParser(headers='b', is_html=False, norm=False,
                           n_jobs=-1, col_name='value')

    df['value'] = parser.transform(df)

    df['feature'] = ['report'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    df = df[df['value'] != '']
    df.drop_duplicates(inplace=True)

    return df
# instantiation of logger
stdout = logging.StreamHandler(sys.stdout)
stdout.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger = logging.getLogger('default')
logger.setLevel(logging.INFO)
logger.addHandler(stdout)

## Fetching data
logger.info("Loading data from Simbad and VCare")

engine = get_engine('vcare', 'srt-ap-92', 'vcare')

target_vcare = sql2df(engine, 'patient_target')
target_simbad = sql2df(engine, 'patient_target_simbad')

target_vcare = target_vcare.loc[:, ['nip', 'id', 'screenfail']]
target_simbad.drop(['prescreen', 'index'], axis=1, inplace=True)

target = pd.concat([target_vcare, target_simbad], axis=0, ignore_index=True)

logger.info("Target shape and head:")
print(target.shape)
target.head()

df_rep_vc = rep_vc(table, engine, 'patient_target', 1)
df_rep_sb = rep_sb(path + '/cr_sfditep_2012.xlsx', engine,
                   'patient_target_simbad', 1)
df_rep_sb.rename({'id': 'patient_id'}, axis=1, inplace=True)
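# A plausible follow-up step (an assumption, not part of the original
# script): after renaming 'id' to 'patient_id', the VCare and Simbad report
# frames share the same columns and can be stacked the same way the target
# tables were concatenated above.
df_rep = pd.concat([df_rep_vc, df_rep_sb], axis=0, ignore_index=True)
logger.info("Reports shape: %s", df_rep.shape)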
def fetch_and_fold(path, engine, targets, n_reports):
    """ Fetch radiology reports from the vcare database.

    Parameters
    ----------
    For the definition of parameters, see the arguments of `main_fetch_and_fold`.
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, [key2, 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d', unit='D')

    # NB: the original call also passed sep=';', which pd.read_excel does not
    # accept; it is dropped here
    df_rad = pd.read_excel(path, usecols=7, parse_dates=[1, 2, 5])

    # filter SESSION and keep prescreen values
    mask = df_rad['SESSION'].isin(['SCA ', 'IRM '])
    # mask_prescreen = df_rad['OBSERVATION DATE'] == 'Before Date'
    df_rad = df_rad[mask]  # [mask_prescreen]

    df_rad['CR_DATE'] = pd.to_datetime(df_rad['CR_DATE'], format='%Y%m%d')

    # remove useless rows
    df_rad = df_rad[~(df_rad['CONTENU_CR'].str.match('Examen du', na=False))]

    df_rad.rename({'CR_DATE': date}, axis=1, inplace=True)

    df_rad = df_rad.merge(df_targets, on=None, left_on='Nip',
                          right_on='nip').drop('Nip', axis=1)
    df_rad['patient_id'] = df_rad['id']
    df_rad.drop('id', axis=1, inplace=True)

    folder = Folder(key1, key2, ['CONTENU_CR'], date, n_jobs=-1)
    rad_folded = folder.fold(df_rad)

    rad_folded.dropna(inplace=True)
    rad_folded.drop_duplicates(subset=['value'], inplace=True)

    # concatenating n_reports
    group_dict = {key2: 'first',
                  'feature': 'first',
                  date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    rad_folded = rad_folded.groupby(key1, as_index=False).agg(group_dict)

    # removing useless tags (blocks parsing)
    rad_folded['value'] = rad_folded.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesion(s) non cible(s)', 'nouvelles(s) lesion(s)',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :')

    parser = ReportsParser(headers='b', is_html=False, col_name='value',
                           sections=sections, n_jobs=-1)

    rad_folded['value'] = parser.transform(rad_folded)

    rad_folded = rad_folded[rad_folded['value'] != '']

    rad_folded['feature'] = ['rad'] * rad_folded.shape[0]

    return rad_folded
def parse_cr(path, engine, targets, n_reports):
    PATH = path

    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    # PATH = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(PATH)
    df = df[df['CR NAT'] == 'CR']

    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the date columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # remove useless reports
    df = df[~(df['value'].str.match('Examen du', na=False))]

    # filter by date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop(['N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
             'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'],
            axis=1, inplace=True)

    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first reports
    group_dict = {'date': 'first',
                  'DATE_SIGN_OK': 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    df = df.groupby('nip', as_index=False).agg(group_dict)

    df = df.merge(df_targets, on='nip')

    # removing useless tags (blocks parsing)
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesions non cibles', 'nouvelles lesions',
                'lesion s non cible s', 'nouvelle s lesion s',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :', '')

    parser = ReportsParser(headers='b', is_html=False, col_name='value',
                           sections=sections, n_jobs=-1)

    df['value'] = parser.transform(df)

    # dropping empty rows
    df = df[df['value'] != '']

    df['feature'] = ['rad'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    return df
def fetch(url, header_path, id, ip, dbase, targets_table):
    """ Fetch biology tables one patient at a time; it is enough to
    concatenate all the extracted tables and then fold them.

    url : str
        url to the location of biology files
    header_path : str
        path to the csv file containing the header
    id : str
        login to the sql database
    ip : str
        ip address of the sql server
    dbase : str
        name of the database on the given server
    targets_table : str
        name of the table containing targets information

    @TODO no need to fetch targets_table from sql since it is already loaded
    @TODO by the main function

    Returns
    -------
    pd.DataFrame
        raw biology results (NIP, Analyse, Resultat, Date prelvt)
    """
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'

    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)
    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row[KEY2].replace(' ', '')
        # patient_id = row['patient_id']
        # c1j1_date = row[C1J1].date()
        # start_date = c1j1_date - timedelta(weeks=8)
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # normalize nip
        new_df[KEY2] = row[KEY2]
        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df], axis=0, sort=False,
                           ignore_index=True)

    return df_res
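# Minimal sketch of calling `fetch` above, reusing the URL template and
# header path that appear commented out in the function body; the
# login/ip/database triple matches the one used with `get_engine` elsewhere
# in these scripts, and 'patient_target_simbad' provides the 'prescreen'
# dates the loop relies on.
url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}'
       '&STARTDATE={}&ENDDATE={}')
header_path = '~/workspace/data/biology/header.csv'
df_bio = fetch(url, header_path, 'vcare', 'srt-ap-92', 'vcare',
               'patient_target_simbad')
print(df_bio.shape)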