def main_fetch_and_fold():
    description = 'Folding text reports from Ventura Care'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--reports', '-r',
                        help='name of the table containing the reports')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n', '--nb', help='number of reports to fetch',
                        type=int)

    args = parser.parse_args()

    # getting variables from args
    engine = get_engine(args.id, args.ip, args.db)
    reports_folded = fetch_and_fold(args.reports, engine, args.targets,
                                    args.nb)

    output = args.output
    reports_folded.to_csv(output, encoding='utf-8', sep=';')

    print('done')
    return reports_folded
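# Example invocation; the script name and all argument values below are
# illustrative only (the connection values mirror the ones used elsewhere
# in this project):
#
#   python fold_reports.py --reports text_reports -I vcare -a srt-ap-92 \
#       -d vcare -t patient_target -o reports_folded.csv -n 100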
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets)
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # example values:
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #     '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'
    header_path = header

    # fetching features
    df_bio = fetch(url, header_path, id, ip, db, targets)

    # parse dates; rows with unparseable dates become NaT and are dropped
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with targets
    df_bio = df_bio.merge(df_targets, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)
    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)

    # coerce lab results to float; non-numeric values become NaN
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]

    # df_bio already folded
    print('done')
    return df_bio
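# A minimal sketch of the merge + rename + coercion steps above on toy
# frames (all values are made up), showing how the SIMBAD 'NIP' key is
# matched to the targets' 'nip' and how string lab results become floats:
#
#   import pandas as pd
#
#   bio = pd.DataFrame({'NIP': ['1234-567 89'], 'feature': ['Creatinine'],
#                       'value': ['0.9'], 'date': ['2012-05-02']})
#   targets = pd.DataFrame({'nip': ['1234-567 89'], 'id': [42]})
#
#   folded = bio.merge(targets, left_on='NIP',
#                      right_on='nip').drop('NIP', axis=1)
#   folded = folded.rename({'id': 'patient_id'}, axis=1)
#   folded['value'] = pd.to_numeric(folded['value'], errors='coerce',
#                                   downcast='float')
#   # -> one row: patient_id=42, nip='1234-567 89', feature='Creatinine',
#   #    value=0.9, date='2012-05-02'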
def load():
    description = 'Load targets from file into the sql server'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-p', '--path',
                        help='path to file that contains the patients info')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--output', '-o',
                        help='name of the new table on the server')
    # PATH = "/home/v_charvet/workspace/data/cr/cr_rad_tronc.xlsx"
    args = parser.parse_args()

    # patient ids assigned so far stop here; new ids start from LAST_ID
    LAST_ID = 3300

    engine = get_engine(args.id, args.ip, args.db)
    path = args.path

    # input = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(path).loc[:, ['N° Dossier patient IGR', 'LC',
                                     'DATE_SIGN_OK']]
    df.drop_duplicates(inplace=True)

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip']\
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    new_ids = [LAST_ID + i for i in range(df.shape[0])]
    screenfail = [1] * df.shape[0]

    target = {'nip': df.loc[:, 'nip'], 'id': new_ids,
              'prescreen': df['DATE_SIGN_OK'], 'screenfail': screenfail}
    df_target = pd.DataFrame(target)

    df_target.to_sql(args.output, engine, if_exists='replace')

    print('done')
    return df_target
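# The nip normalization above, shown on a made-up dossier number + LC code:
# four leading digits, a dash, the middle digits, a space, then the
# two-character suffix. A quick sanity check with illustrative values:
#
#   normalize = lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:]
#   assert normalize('0123456LC') == '0123-456 LC'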
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'

    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    # example value:
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #     '&STARTDATE={}&ENDDATE={}'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)

    # parse dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           format='%Y%m%d').dt.date
    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with targets
    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
    df_bio.drop('NIP', axis=1, inplace=True)

    # keep only measurements taken on or before C1J1
    df_bio['C1J1'] = df_bio['C1J1'].dt.date
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio.drop('C1J1', axis=1, inplace=True)

    # removing non float values; non-numeric results become NaN
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio already folded
    print('done')
    return df_bio
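# A minimal sketch of the C1J1 cutoff used above, on made-up dates: only
# measurements dated on or before the patient's C1J1 date survive the
# filter.
#
#   import pandas as pd
#
#   toy = pd.DataFrame({
#       'date': pd.to_datetime(['2012-01-02', '2012-03-01']).date,
#       'C1J1': pd.to_datetime(['2012-02-01', '2012-02-01']).date,
#   })
#   kept = toy[toy['date'] <= toy['C1J1']]  # only the 2012-01-02 row remains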
def main_parse_cr():
    description = 'Folding radiology reports from Simbad'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-p', '--path',
                        help='path to file that contains the reports')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n', '--nb', help='number of reports to fetch',
                        type=int)

    args = parser.parse_args()

    engine = get_engine(args.id, args.ip, args.db)

    # getting variables from args
    df = parse_cr(args.path, engine, args.targets, args.nb)

    # output = '/home/v_charvet/workspace/data/features/simbad/radiology_v{}.csv'
    output = args.output
    df.to_csv(output, sep=';', encoding='utf-8')

    print('done')
    return 0
def main_fetch_and_fold():
    description = 'Folding radiology reports from Ventura Care'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-p', '--path',
                        help='path to file that contains the reports')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n', '--nb', help='number of reports to fetch',
                        type=int)
    # PATH = "/home/v_charvet/workspace/data/cr/cr_rad_tronc.xlsx"
    args = parser.parse_args()

    engine = get_engine(args.id, args.ip, args.db)
    rad_folded = fetch_and_fold(args.path, engine, args.targets, args.nb)

    output = args.output
    rad_folded.to_csv(output, encoding='utf-8', sep=';')

    print('done')
    return rad_folded
# expected in path : cr_sfditep_2012.xlsx, ditep_inclus.xlsx, ditep_sf.xlsx
table, path = 'event', '/home/v_charvet/workspace/data/'

# instantiation of logger
stdout = logging.StreamHandler(sys.stdout)
stdout.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
))
logger = logging.getLogger('default')
logger.setLevel(logging.INFO)
logger.addHandler(stdout)

## Fetching data
logger.info("Loading data from Simbad and VCare")

engine = get_engine('vcare', 'srt-ap-92', 'vcare')

target_vcare = sql2df(engine, 'patient_target')
target_simbad = sql2df(engine, 'patient_target_simbad')

target_vcare = target_vcare.loc[:, ['nip', 'id', 'screenfail']]
target_simbad.drop(['prescreen', 'index'], axis=1, inplace=True)

target = pd.concat([target_vcare, target_simbad], axis=0, ignore_index=True)

logger.info("Target shape and head:")
print(target.shape)
print(target.head())

df_rep_vc = rep_vc(table, engine, 'patient_target', 1)
df_rep_sb = rep_sb(path + 'cr_sfditep_2012.xlsx', engine,
                   'patient_target_simbad', 1)
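# get_engine and sql2df are helpers imported from elsewhere in the project;
# a minimal sketch of what they might look like, assuming SQLAlchemy with
# the pymssql driver (the driver choice and URL layout are assumptions, not
# the project's actual implementation):
#
#   from sqlalchemy import create_engine
#   import pandas as pd
#
#   def get_engine(user, host, db):
#       # credential handling (password, trusted connection) omitted here
#       return create_engine('mssql+pymssql://{}@{}/{}'.format(user, host, db))
#
#   def sql2df(engine, table):
#       # load an entire table from the server into a DataFrame
#       return pd.read_sql_table(table, engine)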
from datetime import timedelta
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch(url, header_path, id, ip, dbase, targets_table):
    """Fetch biology results patient by patient and concatenate the
    extracted tables so they can then be folded.

    Parameters
    ----------
    url : str
        url to the location of biology files
    header_path : str
        path to csv file containing the header
    id : str
        login to the sql database
    ip : str
        ip address of the sql server
    dbase : str
        name of the database on the given server
    targets_table : str
        name of the table containing targets information

    @TODO no need to fetch targets_table from sql since it is already
    @TODO loaded by the main function

    Returns
    -------
    pd.DataFrame
        concatenated biology results for all patients
    """
    # example values:
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #     '&STARTDATE={}&ENDDATE={}'
    # header_path = '~/workspace/data/biology/header.csv'

    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)

    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row[KEY2].replace(' ', '')
        # patient_id = row['patient_id']
        # c1j1_date = row[C1J1].date()
        # start_date = c1j1_date - timedelta(weeks=8)
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values), sep=';', header=None,
                             index_col=False, names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # normalize nip
        new_df[KEY2] = row[KEY2]
        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df], axis=0, sort=False,
                           ignore_index=True)

    return df_res
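# Example call, combining the URL template and header path from the
# comments above with the connection values used elsewhere in this project;
# all of these are site-specific, so treat this as a sketch:
#
#   example_url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep'
#                  '&NIP={}&STARTDATE={}&ENDDATE={}')
#   df_bio = fetch(example_url, '~/workspace/data/biology/header.csv',
#                  'vcare', 'srt-ap-92', 'vcare', 'patient_target_simbad')
#   print(df_bio.shape)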