Example #1
def main_fetch_and_fold():
    description = 'Folding text reports from Ventura Care'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('--reports', '-r',
                        help='name of the table containing the reports')
    parser.add_argument('--id', '-I',
                        help='id to connect to sql server')
    parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets', '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output', '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n', '--nb',
                        help='number of reports to fetch', type=int)
    args = parser.parse_args()

    # getting variables from args
    engine = get_engine(args.id, args.ip, args.db)

    reports_folded = fetch_and_fold(args.reports, engine, args.targets, args.nb)

    output = args.output
    reports_folded.to_csv(output, encoding='utf-8', sep=';')
    print('done')

    return reports_folded
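All of these examples rely on a `get_engine` helper that is never shown. A minimal sketch of what it presumably does, assuming SQLAlchemy with a pymssql driver (the connection-string details are an assumption, not confirmed by the source):

from sqlalchemy import create_engine

def get_engine(user, ip, db):
    # hypothetical connection string for the SQL Server instance at `ip`;
    # adjust the driver and credentials to match the actual setup
    return create_engine('mssql+pymssql://{}@{}/{}'.format(user, ip, db))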
Example #2
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'
    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, 'patient_target_simbad')
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # fetching features
    # example values for these arguments:
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    # header = '~/workspace/data/biology/header.csv'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)
    # parse_dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename(
        {
            'Date prelvt': date,
            'Analyse': 'feature',
            'Resultat': 'value'
        },
        inplace=True,
        axis=1)

    # joining with targets
    df_bio = df_bio.merge(df_targets, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'],
                                    errors='coerce',
                                    downcast='float')

    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]
    # df_bio already folded

    print('done')

    return df_bio
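The `sql2df` helper used above is likewise not shown. A minimal sketch under the assumption that it simply loads a whole table into a DataFrame through the SQLAlchemy engine:

import pandas as pd

def sql2df(engine, table):
    # assumed behaviour: read the full table; the real helper may also
    # set an index or parse dates
    return pd.read_sql('SELECT * FROM {}'.format(table), engine)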
Example #3
def load():
    # patient_id to start iterate
    description = 'Load targets from file into the sql server'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-p', '--path',
                        help='path to file that contains the patients info')
    parser.add_argument('--id', '-I',
                        help='id to connect to sql server')
    parser.add_argument('--ip', '-a',
                        help='ip address of the sql server')
    parser.add_argument('--db', '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--output', '-o',
                        help='name of the new table on the server')
    # PATH = "/home/v_charvet/workspace/data/cr/cr_rad_tronc.xlsx"

    args = parser.parse_args()
    LAST_ID = 3300

    engine = get_engine(args.id, args.ip, args.db)

    path = args.path
    # input = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(path).loc[:, ['N° Dossier patient IGR', 'LC',
                                     'DATE_SIGN_OK']]

    df.drop_duplicates(inplace=True)

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip']\
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    new_ids = [LAST_ID + i for i in range(df.shape[0])]
    screenfail = [1] * df.shape[0]

    target = {'nip': df.loc[:, 'nip'],
              'id': new_ids,
              'prescreen': df['DATE_SIGN_OK'],
              'screenfail': screenfail}

    df_target = pd.DataFrame(target)

    df_target.to_sql(args.output, engine, if_exists='replace')

    print('done')

    return df_target
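For reference, the `nip` normalization above rewrites the concatenated dossier number plus `LC` code into the hyphen-and-space format used by the target tables. A quick illustration with a fabricated value:

# fabricated dossier number '1234567' followed by a two-letter LC code
s = '1234567' + 'AB'
print(s[:4] + '-' + s[4:-2] + ' ' + s[-2:])  # -> 1234-567 AB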
Example #4
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'
    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d')

    # example values for these arguments:
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)
    # parse_dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           format='%Y%m%d').dt.date

    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with targets
    # df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
    #                       right_on='nip').drop('NIP', axis=1)
    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
    df_bio.drop('NIP', axis=1, inplace=True)

    # keep only measurements taken on or before the C1J1 date
    df_bio['C1J1'] = pd.to_datetime(df_bio['C1J1']).dt.date
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio.drop('C1J1', axis=1, inplace=True)

    # removing non-float values
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio already folded

    print('done')

    return df_bio
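The "folded" layout returned by these functions is the long (tidy) form: one row per (patient, feature, date) measurement. A sketch of unfolding it back into a wide table with pandas, using the column names from the examples and made-up values:

import pandas as pd

folded = pd.DataFrame({
    'patient_id': [1, 1, 2],
    'feature': ['Hb', 'Na', 'Hb'],
    'value': [13.2, 140.0, 11.8],
})
# one column per feature, one row per patient
wide = folded.pivot_table(index='patient_id', columns='feature', values='value')
print(wide)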
Example #5
def main_parse_cr():
    description = 'Folding radiology reports from Simbad'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-p',
                        '--path',
                        help='path to file that contains the reports')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db',
                        '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets',
                        '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output',
                        '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n',
                        '--nb',
                        help='number of reports to fetch',
                        type=int)

    args = parser.parse_args()

    engine = get_engine(args.id, args.ip, args.db)

    # getting variables from args
    df = parse_cr(args.path, engine, args.targets, args.nb)

    # output = '/home/v_charvet/workspace/data/features/simbad/radiology_v{}.csv'
    output = args.output
    df.to_csv(output, sep=';', encoding='utf-8')

    print('done')

    return 0
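Note that argparse stores each value under the long option name, so flags declared as '--id', '-I' come back as args.id. A quick in-process check with placeholder values:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--id', '-I')
parser.add_argument('--ip', '-a')
args = parser.parse_args(['--id', 'vcare', '--ip', 'srt-ap-92'])
print(args.id, args.ip)  # -> vcare srt-ap-92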
Example #6
def main_fetch_and_fold():
    description = 'Folding radiology reports from Ventura Care'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-p',
                        '--path',
                        help='path to file that contains the reports')
    parser.add_argument('--id', '-I', help='id to connect to sql server')
    parser.add_argument('--ip', '-a', help='ip address of the sql server')
    parser.add_argument('--db',
                        '-d',
                        help='name of the database on the sql server')
    parser.add_argument('--targets',
                        '-t',
                        help='name of the table containing targets on the db')
    parser.add_argument('--output',
                        '-o',
                        help='output path to write the folded result')
    parser.add_argument('-n',
                        '--nb',
                        help='number of reports to fetch',
                        type=int)
    # PATH = "/home/v_charvet/workspace/data/cr/cr_rad_tronc.xlsx"

    args = parser.parse_args()

    engine = get_engine(args.id, args.ip, args.db)

    rad_folded = fetch_and_fold(args.path, engine, args.targets, args.nb)

    output = args.output
    rad_folded.to_csv(output, encoding='utf-8', sep=';')

    print('done')

    return rad_folded
Example #7
# expected in path: cr_sfditep_2012.xlsx, ditep_inclus.xlsx, ditep_sf.xlsx
table, path = 'event', '/home/v_charvet/workspace/data/'

# instantiation of logger
stdout = logging.StreamHandler(sys.stdout)
stdout.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
))
logger = logging.getLogger('default')
logger.setLevel(logging.INFO)
logger.addHandler(stdout)

## Fetching data
logger.info("Loading data from Simbad and VCare")

engine = get_engine('vcare', 'srt-ap-92', 'vcare')

target_vcare = sql2df(engine, 'patient_target')
target_simbad = sql2df(engine, 'patient_target_simbad')

target_vcare = target_vcare.loc[:, ['nip', 'id', 'screenfail']]
target_simbad.drop(['prescreen', 'index'], axis=1, inplace=True)

target = pd.concat([target_vcare, target_simbad], axis=0, ignore_index=True)
logger.info("Target shape and head:")
print(target.shape)
target.head()

df_rep_vc = rep_vc(table, engine, 'patient_target', 1)
df_rep_sb = rep_sb(path + 'cr_sfditep_2012.xlsx', engine, 'patient_target_simbad', 1)
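Assuming the two folded report frames share the same column layout, they can be stacked the same way as the targets; a sketch continuing from the script above:

# df_rep_vc and df_rep_sb come from the script above and are assumed aligned
df_reports = pd.concat([df_rep_vc, df_rep_sb], axis=0, ignore_index=True, sort=False)
logger.info("Folded reports shape: %s", df_reports.shape)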
Example #8
def fetch(url, header_path, id, ip, dbase, targets_table):
    """
    Concatenate all the extracted tables so that they can then be folded.

    url : str
        url to the location of biology files

    header_path : str
        path to csv file containing header

    id : str
        login to the sql database

    ip : str
        ip address to the sql server

    dbase : str
        name of the database on the given server

    targets_table : str
        name of the table containing targets information

    @TODO no need to fetch targets_table from sql since it is already loaded
    @TODO by the main function

    Returns
    -------
    pd.DataFrame
        raw biology results concatenated over every patient in targets_table
    """
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'

    # header_path = '~/workspace/data/biology/header.csv'
    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)

    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row[KEY2].replace(' ', '')
        # patient_id = row['patient_id']
        # c1j1_date = row[C1J1].date()
        # start_date = c1j1_date - timedelta(weeks=8)
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values),
                             sep=';',
                             header=None,
                             index_col=False,
                             names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # restore the formatted nip from the targets table
        new_df[KEY2] = row[KEY2]

        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df],
                           axis=0,
                           sort=False,
                           ignore_index=True)

    return df_res
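A usage sketch for `fetch`, filling in the template URL and header path from the commented-out examples in the body (the credentials and table name are placeholders taken from elsewhere in these examples):

url = ('http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}'
       '&STARTDATE={}&ENDDATE={}')
header_path = '~/workspace/data/biology/header.csv'

# placeholder credentials; the table must provide 'nip', 'id' and 'prescreen'
df_bio = fetch(url, header_path, 'vcare', 'srt-ap-92', 'vcare',
               'patient_target_simbad')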