Example #1
def fetch_and_fold(table, engine, targets, n_reports):
    """ function to fetch reports from vcare database

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # data used to train the model
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d',
                                               unit='D')

    df_reports = sql2df(engine, table)\
        .loc[:, ['original_date', 'patient_id', 'report']]

    mask = [report is not None for report in df_reports['report']]

    df_reports.rename(columns={'original_date': 'date'}, inplace=True)
    df_reports = df_reports.loc[mask]

    # joining features df with complete patient information
    df_reports = df_reports.merge(df_targets, on=None, left_on='patient_id',
                                  right_on='id').drop('id', axis=1)
    # df_reports = df_reports[df_reports[date] <= df_reports['C1J1']]

    # folding frames so that they have the same columns
    folder = Folder(key1, key2, ['report'], date, n_jobs=-1)
    reports_folded = folder.fold(df_reports)

    reports_folded.dropna(inplace=True)
    reports_folded.drop_duplicates(subset=['value'], inplace=True)

    # taking only first `n_reports` reports
    group_dict = {key2: 'first', 'feature': 'first', date: 'last',
                  'value': lambda g: ' '.join(g[:n_reports])}
    reports_folded = reports_folded.groupby(key1, as_index=False)\
        .agg(group_dict)

    # parsing and vectorising text reports
    sections = ['examens complementaire', 'hopital de jour',
                'examen du patient']

    parser = ReportsParser(sections=None, n_jobs=-1, norm=False,
                           col_name='value')

    reports_folded['value'] = parser.transform(reports_folded)

    return reports_folded
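A minimal usage sketch, assuming the same engine credentials used in the script further down; the 'report' table name and the n_reports value are placeholders, not the project's real configuration.

# hypothetical call: the 'report' table name is a placeholder
engine = get_engine('vcare', 'srt-ap-92', 'vcare')
reports_folded = fetch_and_fold('report', engine, 'patient_target', n_reports=1)
# one row per patient, with the first report(s) concatenated into 'value'
print(reports_folded.columns.tolist())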
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'
    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, 'patient_target_simbad')
    df_targets['prescreen'] = df_targets.loc[:, 'prescreen'].dt.date

    # fetching features
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    #
    # header_path = '~/workspace/data/biology/header.csv'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)
    # parse_dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           errors='coerce',
                                           format='%Y%m%d').dt.date
    df_bio.dropna(inplace=True)

    df_bio.rename(
        {
            'Date prelvt': date,
            'Analyse': 'feature',
            'Resultat': 'value'
        },
        inplace=True,
        axis=1)

    # joining with targets
    df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
                          right_on='nip').drop('NIP', axis=1)

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'],
                                    errors='coerce',
                                    downcast='float')

    df_bio = df_bio.loc[:, [key1, key2, 'feature', 'value', date]]
    # df_bio already folded

    print('done')

    return df_bio
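The returned frame is already in the long ("folded") layout used throughout these examples: one row per patient, feature and measurement date. A hedged sketch of a possible downstream step, not part of the original code, that pivots it into one column per lab feature:

# hypothetical downstream step: pivot the folded biology frame to a wide matrix
wide = df_bio.pivot_table(index='patient_id', columns='feature',
                          values='value', aggfunc='mean')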
Example #3
def fetch_and_fold(url, header, id, ip, db, targets):
    key1, key2, date = 'patient_id', 'nip', 'date'
    # engine for sql connection
    engine = get_engine(id, ip, db)

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, ['nip', 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d',
                                               unit='D')

    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'
    header_path = header

    df_bio = fetch(url, header_path, id, ip, db, targets)
    # parse_dates
    df_bio['Date prelvt'] = pd.to_datetime(df_bio['Date prelvt'],
                                           format='%Y%m%d').dt.date

    df_bio.rename({'Date prelvt': date, 'Analyse': 'feature',
                   'Resultat': 'value'}, inplace=True, axis=1)

    # joining with targets
    # df_bio = df_bio.merge(df_targets, on=None, left_on='NIP',
    #                       right_on='nip').drop('NIP', axis=1)
    df_bio = df_targets.merge(df_bio, left_on='nip', right_on='NIP')
    df_bio.drop('NIP', axis=1, inplace=True)

    df_bio['C1J1'] = pd.to_datetime(df_bio['C1J1'],
                                    format='%Y%m%d').dt.date
    df_bio = df_bio[df_bio[date] <= df_bio['C1J1']]  # keep only values dated on or before C1J1

    df_bio.rename({'id': 'patient_id'}, axis=1, inplace=True)
    df_bio.drop('C1J1', axis=1, inplace=True)

    # removing non-float values
    df_bio['value'] = pd.to_numeric(df_bio.loc[:, 'value'], errors='coerce',
                                    downcast='float')

    # df_bio already folded

    print('done')

    return df_bio
def fetch_and_fold(path, engine, targets, n_reports):
    """ function to fetch reports from simbad data

    Parameters
    ----------
    For definition of parameters, see arguments in `main_fetch_and_fold`
    """

    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    df = pd.read_excel(path)

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop([
        'N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
        'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'
    ],
            axis=1,
            inplace=True)

    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # taking only consultation reports
    df = df[df['CR NAT'] == 'CC']

    # keep only reports whose date matches the sign-off date
    mask = (df['date'] == df['DATE_SIGN_OK'])
    df = df[mask]

    # df_rh = df[df['CR NAT'] == 'RH']

    # drop missing values and duplicate reports
    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first reports
    group_dict = {
        'date': 'first',
        'DATE_SIGN_OK': 'last',
        'value': lambda g: ' '.join(g)
    }
    df = df.groupby('nip', as_index=False).agg(group_dict)

    # # filter uninformative reports and taking the first
    # df_rh = df_rh[~(df_rh['value'].str.match('Examen du', na=False))]
    # df_rh.dropna(inplace=True)
    # df_rh.drop_duplicates(subset=['value'], inplace=True)
    #
    # # taking only the first reports
    # df_rh = df_rh.groupby('nip', as_index=False).agg(group_dict)
    #
    # df = pd.concat([df_cc, df_rh], ignore_index=True)

    # removing tags that block parsing
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    # filter date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    df = df.merge(df_targets, on='nip')

    parser = ReportsParser(headers='b',
                           is_html=False,
                           norm=False,
                           n_jobs=-1,
                           col_name='value')

    df['value'] = parser.transform(df)

    df['feature'] = ['report'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]
    df = df[df['value'] != '']
    df.drop_duplicates(inplace=True)

    return df
# instantiation of logger
stdout = logging.StreamHandler(sys.stdout)
stdout.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
))
logger = logging.getLogger('default')
logger.setLevel(logging.INFO)
logger.addHandler(stdout)

## Fetching data
logger.info("Loading data from Simbad and VCare")

engine = get_engine('vcare', 'srt-ap-92', 'vcare')

target_vcare = sql2df(engine, 'patient_target')
target_simbad = sql2df(engine, 'patient_target_simbad')

target_vcare = target_vcare.loc[:, ['nip', 'id', 'screenfail']]
target_simbad.drop(['prescreen', 'index'], axis=1, inplace=True)

target = pd.concat([target_vcare, target_simbad], axis=0, ignore_index=True)
logger.info("Target shape and head:")
print(target.shape)
target.head()

df_rep_vc = rep_vc(table, engine, 'patient_target', 1)
df_rep_sb = rep_sb(path + '/cr_sfditep_2012.xlsx', engine, 'patient_target_simbad', 1)

df_rep_sb.rename({'id': 'patient_id'}, axis=1, inplace=True)
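rep_vc and rep_sb appear to be aliases for the VCare and Simbad fetch_and_fold functions shown above, and `table` and `path` are assumed to be defined earlier in the script. A plausible continuation, mirroring the way the targets were concatenated above, would be (an assumption, not shown in the original script):

# hypothetical continuation: stack the two report frames like the targets above
df_rep = pd.concat([df_rep_vc, df_rep_sb], axis=0, ignore_index=True)
logger.info("Reports shape: %s", df_rep.shape)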
def fetch_and_fold(path, engine, targets, n_reports):
    """ function to fetch radiology reports from vcare database

       Parameters
       ----------
       For definition of parameters, see arguments in `main_fetch_and_fold`
    """
    key1, key2, date = 'patient_id', 'nip', 'date'

    # fetching targets table
    df_targets = sql2df(engine, targets).loc[:, [key2, 'id', 'C1J1']]
    df_targets.loc[:, 'C1J1'] = pd.to_datetime(df_targets['C1J1'],
                                               format='%Y-%m-%d',
                                               unit='D')

    df_rad = pd.read_excel(path, usecols=7, parse_dates=[1, 2, 5])

    # filter SESSION and keep prescreen values
    mask = df_rad['SESSION'].isin(['SCA ', 'IRM '])

    # mask_prescreen = df_rad['OBSERVATION DATE'] == 'Before Date'
    df_rad = df_rad[mask]  # [mask_prescreen]

    df_rad['CR_DATE'] = pd.to_datetime(df_rad['CR_DATE'], format='%Y%m%d')

    # remove useless rows
    df_rad = df_rad[~(df_rad['CONTENU_CR'].str.match('Examen du', na=False))]

    df_rad.rename({'CR_DATE': date}, axis=1, inplace=True)

    df_rad = df_rad.merge(df_targets, on=None, left_on='Nip',
                          right_on='nip').drop('Nip', axis=1)

    df_rad['patient_id'] = df_rad['id']
    df_rad.drop('id', axis=1, inplace=True)

    folder = Folder(key1, key2, ['CONTENU_CR'], date, n_jobs=-1)

    rad_folded = folder.fold(df_rad)

    rad_folded.dropna(inplace=True)
    rad_folded.drop_duplicates(subset=['value'], inplace=True)
    # concatenating n_reports
    group_dict = {
        key2: 'first',
        'feature': 'first',
        date: 'last',
        'value': lambda g: ' '.join(g[:n_reports])
    }

    rad_folded = rad_folded.groupby(key1, as_index=False).agg(group_dict)

    # removing tags that block parsing
    rad_folded['value'] = rad_folded.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesion(s) non cible(s)', 'nouvelles(s) lesion(s)',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :')

    parser = ReportsParser(headers='b',
                           is_html=False,
                           col_name='value',
                           sections=sections,
                           n_jobs=-1)

    rad_folded['value'] = parser.transform(rad_folded)

    rad_folded = rad_folded[rad_folded['value'] != '']
    rad_folded['feature'] = ['rad'] * rad_folded.shape[0]

    return rad_folded
def parse_cr(path, engine, targets, n_reports):
    PATH = path

    # fetching targets
    df_targets = sql2df(engine, targets)

    # fetching reports
    # PATH = 'data/cr_sfditep_2012.xlsx'
    df = pd.read_excel(PATH)

    df = df[df['CR NAT'] == 'CR']
    df.rename(columns={'CR DATE': 'date', 'text CR': 'value'}, inplace=True)

    # keep only the date part of the datetime columns
    df['date'] = df.loc[:, 'date'].dt.date
    df['DATE_SIGN_OK'] = df.loc[:, 'DATE_SIGN_OK'].dt.date

    # remove useless reports
    df = df[~(df['value'].str.match('Examen du', na=False))]

    # filter by date
    # df = df[df['date'] <= (df['DATE_SIGN_OK'] + datetime.timedelta(weeks=8))]

    # normalize nip
    df['nip'] = df['N° Dossier patient IGR'].astype(str) + df['LC']
    df['nip'] = df.loc[:, 'nip'] \
        .apply(lambda s: s[:4] + '-' + s[4:-2] + ' ' + s[-2:])

    df.drop([
        'N° Dossier patient IGR', 'LC', 'NOCET', 'SIGLE_ETUDE',
        'LIBELLE_TYPE_ETUDE', 'NUM CR', 'CR RESP'
    ],
            axis=1,
            inplace=True)

    df.dropna(inplace=True)
    df.drop_duplicates(subset=['value'], inplace=True)

    # taking only the first reports
    group_dict = {
        'date': 'first',
        'DATE_SIGN_OK': 'last',
        'value': lambda g: ' '.join(g[:n_reports])
    }
    df = df.groupby('nip', as_index=False).agg(group_dict)

    df = df.merge(df_targets, on='nip')

    # removing tags that block parsing
    df['value'] = df.loc[:, 'value'].apply(
        lambda s: str(s).replace('<u>', '').replace('</u>', ''))

    sections = ('resultats', 'resultat', 'evaluation des cibles', 'conclusion',
                'lesions non cibles', 'nouvelles lesions',
                'lesion s non cible s', 'nouvelle s lesion s',
                'resultats a l etage thoracique', 'en fenetre osseuse',
                'a l etage abdomino plevien', 'conclusion :', '')
    parser = ReportsParser(headers='b',
                           is_html=False,
                           col_name='value',
                           sections=sections,
                           n_jobs=-1)

    df['value'] = parser.transform(df)

    # dropping empty rows
    df = df[df['value'] != '']

    df['feature'] = ['rad'] * df.shape[0]

    df = df.loc[:, ['nip', 'id', 'feature', 'value', 'date']]

    return df
def fetch(url, header_path, id, ip, dbase, targets_table):
    """
    fetch biology results; it is enough to concatenate all the extracted
    tables and then fold them

    url : str
        url to the location of biology files

    header_path : str
        path to csv file containing header

    id : str
        login to the sql database

    ip : str
        ip address of the sql server

    dbase : str
        name of the database on the given server

    targets_table : str
        name of the table containing targets information

    @TODO no need to fetch targets_table from sql since it is already
    @TODO loaded by the main function

    Returns
    -------
    pandas.DataFrame
        concatenated biology results with columns NIP, Analyse, Resultat
        and Date prelvt
    """
    # url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
    #       '&STARTDATE={}&ENDDATE={}'

    # header_path = '~/workspace/data/biology/header.csv'
    # constant names specific to our database
    KEY1 = 'id'
    KEY2 = 'NIP'

    header = pd.read_csv(header_path, sep=';', encoding='latin1').columns

    engine = get_engine(id, ip, dbase)

    df_ids = sql2df(engine, targets_table)
    df_ids.rename({'nip': KEY2}, inplace=True, axis=1)
    df_ids['patient_id'] = df_ids[KEY1]

    cols = [KEY2, 'Analyse', 'Resultat', 'Date prelvt']
    df_res = pd.DataFrame(data=None, columns=cols)

    for index, row in df_ids.iterrows():
        nip = row[KEY2].replace(' ', '')
        # patient_id = row['patient_id']
        # c1j1_date = row[C1J1].date()
        # start_date = c1j1_date - timedelta(weeks=8)
        start_date = row['prescreen']
        end_date = start_date + timedelta(weeks=4)

        start = str(start_date).replace('-', '')
        stop = str(end_date).replace('-', '')

        req = requests.get(url.format(nip, start, stop))
        values = BeautifulSoup(req.content, 'html.parser').body.text

        new_df = pd.read_csv(StringIO(values),
                             sep=';',
                             header=None,
                             index_col=False,
                             names=header)
        new_df = new_df.loc[:, cols + ['LC']]

        # use the formatted NIP from the targets table
        new_df[KEY2] = row[KEY2]

        new_df.drop('LC', axis=1, inplace=True)

        df_res = pd.concat([df_res, new_df],
                           axis=0,
                           sort=False,
                           ignore_index=True)

    return df_res
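A minimal call sketch for fetch, reusing the commented-out URL template and header path shown above and the credentials from the script section; all of these values are placeholders here, not a verified configuration.

# hypothetical values taken from the commented-out examples above
url = 'http://esimbad/testGSAV7/reslabo?FENID=resLaboPatDitep&NIP={}' \
      '&STARTDATE={}&ENDDATE={}'
header_path = '~/workspace/data/biology/header.csv'
df_res = fetch(url, header_path, 'vcare', 'srt-ap-92', 'vcare',
               'patient_target_simbad')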