Python generate_unit_map Exemples, hgc.ner.generate_unit_map Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : %test_example1.py Projet : KWR-Water/hgc

def test_exm_wide():
    """
    ===================
    Example: import wide data
    ===================

    Next, we will import the same data, but from a ‘wide’ shaped file.
    Note that it is also possible to use a dataframe instead of excel or csv as input
    for hgc.io.import_file(). This requires using the argument “dataframe” instead of “file_name”.
    An advantage of this approach is to prevent repeatedly reading the input file .
    """

    df_temp = pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                            sheet_name='wide',
                            header=None)  # ignore headers!

    # step 1: generate feature map
    feature_map2, feature_unmapped2, df_feature_map2 = ner.generate_feature_map(
        entity_orig=list(df_temp.iloc[2, 5:]))
    assert (feature_map2 == {
        'chloride': 'Cl',
        'manganese': 'Mn',
        'nietrate': 'NO3',
        'nitrate (filtered)': 'NO3'
    })

    # step 2: generate unit map
    unit_map2, unit_unmapped2, df_unit_map2 = ner.generate_unit_map(
        entity_orig=list(df_temp.iloc[3, 5:]))
    assert (unit_map2 == {
        'mg-N/L': 'mg/L N',
        'mg/L': 'mg/L',
        'ug/L': 'μg/L',
        'μS/cm': 'μS/cm'
    })

    # step 3: import file
    df2 = io.import_file(dataframe=df_temp,
                         shape='wide',
                         slice_header=[3, slice(2, 5)],
                         slice_feature=[2, slice(5, None)],
                         slice_unit=[3, slice(5, None)],
                         slice_data=[slice(4, None)],
                         map_header={
                             **io.default_map_header(), 'loc.': 'LocationID',
                             'date': 'Datetime',
                             'sample': 'SampleID'
                         },
                         map_features={
                             **feature_map2, 'EC new sensor': 'ec_field'
                         },
                         map_units=unit_map2)[0]

    # step 4: convert to wide format
    df2_hgc = io.stack_to_hgc(df2)

Exemple #2

0

Afficher le fichier

Fichier : %test_prov.py Projet : KWR-Water/hgc

def test_VitensOMIVE():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/preprocessed/Vitens_PP_WP_OMIVE_2009_2020.xlsx'
    df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1')
    # define the nrow here 
    n_row = None    
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[1, slice(8, 819)].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[2, slice(8, 819)].dropna()))

    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose()
    if not not feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if not not unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Sheet1',
        'shape': 'wide',
        'slice_header': [1, slice(1, 8)],
        'slice_feature': [1, slice(8, None)],
        'slice_unit': [2, slice(8, None)],
        'slice_data': [slice(3, n_row), slice(1, None)],

        'map_header': {
            **io.default_map_header(),
            # 'Monsterpuntcode': 'LocationID',
            # 'Omschrijving (Parameter)':'Feature',
            # 'Eenheid (Parameter)': 'Unit',
            # 'Waarde numeriek': 'Value', # Gerapporteerde waarde, right?!
            'Datum': 'Datetime',
            'Naam': 'SampleID',  # Analyse !?
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_VitenOMIVE')
        # df2.to_excel(writer, sheet_name='VitensOMIVE')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')  
    df2.to_csv(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE_ref.csv')

Exemple #3

0

Afficher le fichier

Fichier : %test_prov.py Projet : KWR-Water/hgc

def test_LIMS_Ruw_2017_2019():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_preprocessed.xlsx'
    df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1')
    # define the nrow here 
    n_row = None    
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 6].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 9].dropna()))

    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose()
    if not not feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if not not unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Export Worksheet',
        'shape': 'stacked',
        'slice_header': [1, slice(1, 10)],
        'slice_data': [slice(1, n_row), slice(1, 10)],

        'map_header': {
            **io.default_map_header(),
            'POINTDESCR': 'LocationID',
            'ANALYTE':'Feature',
            'UNITS': 'Unit',
            'FINAL': 'Value', # Gerapporteerde waarde, right?!
            'SAMPDATE': 'Datetime',
            'TESTNO': 'SampleID',  # Analyse !?
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'µg/l Hxdcn-d34':'µg/l'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_processed.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_LIMS_Ruw_2017_2019')
        df2.to_excel(writer, sheet_name='LIMS_Ruw_2017_2019')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')

Exemple #4

0

Afficher le fichier

Fichier : %test_prov.py Projet : KWR-Water/hgc

def test_province():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse'+'/provincie_data_long_preprocessed.csv'
    df_temp = pd.read_csv(WD, encoding='ISO-8859-1', header=None)

    # define the nrow here 
    n_row = None    
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 25].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 26].dropna()))
 
    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose()
    if not not feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if not not unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'stacked',
        'shape': 'stacked',
        'slice_header': [1, slice(1, None)],
        'slice_data': [slice(1, n_row), slice(1, None)],
        'map_header': {
            **io.default_map_header(),
            'MeetpuntId': 'LocationID',
            'parameter':'Feature',
            'eenheid': 'Unit',
            'waarde': 'Value',
            'Opgegeven bemonstering datum': 'Datetime',
            'Monsternummer': 'SampleID',  # "SampleID" already exists as header, but contains wrong date. Use "Sample number" as "SampleID"
            # 'SampleID': None  # otherwise exists twice in output file
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'oC':'°C'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse'+r'/provincie_processed.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_prov')
        df2.to_excel(writer, sheet_name='df_prov')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')

Exemple #5

0

Afficher le fichier

Fichier : %test_prov.py Projet : KWR-Water/hgc

def test_KIWKVenloschol():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_preprocessed.xlsx'
    df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1')
    # define the nrow here 
    n_row = None    
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 20].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 21].dropna()))
 
    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose()
    if not not feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if not not unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Export KoW 2.0',
        'shape': 'stacked',
        'slice_header': [1, slice(1, 24)],
        'slice_data': [slice(1, n_row), slice(1, 24)],
        'map_header': {
            **io.default_map_header(),
            'Monsterpunt': 'LocationID',
            'Parameter omschrijving':'Feature',
            'Eenheid': 'Unit',
            'Gerapporteerde waarde': 'Value', # Gerapporteerde waarde, right?!
            'Monstername datum': 'Datetime',
            'Analyse': 'SampleID',  # Analyse !?
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'µg/l atrazine-D5':'µg/l'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_processed.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_KIWK Venloschol')
        df2.to_excel(writer, sheet_name='KIWK Venloschol')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')

Exemple #6

0

Afficher le fichier

Fichier : %test_example1.py Projet : KWR-Water/hgc

def test_exm_stacked():
    """
    ----------------------
    Step 1: ner.generate_feature_map()
    ----------------------
    First map the features in the original file, to feature names recognized by HGC.
    """

    # compile a list of features by slicing the original file
    lst_features = list(
        pd.read_excel('./tests/example1.xlsx',
                      sheet_name='stacked')['Feature'])

    # automatically detect features using text recognition
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(
        entity_orig=lst_features)

    # check if features are correctly mapped
    assert (feature_map == {
        'chloride': 'Cl',
        'nitrate (filtered)': 'NO3',
        'manganese': 'Mn',
        'nietrate': 'NO3'
    })

    # check for which features the algorithm was not able to find a match that met the minimum resemblance.
    assert (feature_unmapped == ['EC new sensor'])

    # The dataframe can be used to check in more detail the scores how well the original features were matched to HGC features.
    # This can be handy if you want to identify common errors and update the underlying database.
    df_feature_map.head(5)
    """
    In this case we find that the algorithm was not able to find a match for one 
    of the features ('EC new sensor'). Hence, we need to adjust the mapping manually.
    """

    # manually adjust the mapping by merging with a user defined dictionary (optional)
    feature_map2 = {**feature_map, 'EC new sensor': 'ec_field'}
    """
    ----------------------
    Step 2: hgc.io. generate_unit_map()
    ----------------------
    Next, we need to make a mapping for the units, using the same approach as for the features. 
    """
    lst_units = list(
        pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                      sheet_name='stacked')['Unit'])
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(
        entity_orig=lst_units)
    assert (unit_map == {
        'mg-N/L': 'mg/L N',
        'mg/L': 'mg/L',
        'ug/L': 'μg/L',
        'μS/cm': 'μS/cm'
    })
    """
    ----------------------
    Step 3: hgc.io.import_file()
    ----------------------
    The third step is to read the original file and and convert the data to the desired 
    datamodel. This requires that we first indicate where to find the data and how to 
    convert it.
    """

    # Arguments defining where to find data
    slice_header = [0, slice(0, 6)]  # row 0
    slice_data = [slice(1, None)
                  ]  # row 1 till end of file. "None" indicates "end" here.

    # Arguments how to convert the data

    # map_header -->  mapping how to adjust headers name
    # Note: The headers 'Value', 'Unit' and 'SampleID' are compulsory. Other headers can be any string
    map_header = {
        **io.default_map_header(), 'loc.': 'LocationID',
        'date': 'Datetime',
        'sample': 'SampleID'
    }

    # map_features --> see step 1

    # map_units --> see step 2

    # feature_units -->  mapping of the desired units for each feature
    # For instance, we can inspect the default units for Cl, NO3 and ec_field
    assert (io.default_feature_units()['Cl'] == 'mg/L')
    assert (io.default_feature_units()['NO3'] == 'mg/L')
    assert (io.default_feature_units()['ec_field'] == 'mS/m')

    # column_dtype --> desired dtypefor columns
    # we will use the default dtype
    print(io.default_column_dtype())  # use default values
    """
    Now the we have defined all the arguments, let's import the data
    """

    df = io.import_file(file_path=str(
        Path(__file__).cwd() / 'tests/example1.xlsx'),
                        sheet_name='stacked',
                        shape='stacked',
                        slice_header=slice_header,
                        slice_data=slice_data,
                        map_header=map_header,
                        map_features=feature_map2,
                        map_units=unit_map)[0]
    df.head(3)  # imported data
    df_1 = io.import_file(file_path=str(
        Path(__file__).cwd() / 'tests/example1.xlsx'),
                          sheet_name='stacked',
                          shape='stacked',
                          slice_header=slice_header,
                          slice_data=slice_data,
                          map_header=map_header,
                          map_features=feature_map2,
                          map_units=unit_map)[1]
    df_1.head(3)  # duplication
    df_2 = io.import_file(file_path=str(
        Path(__file__).cwd() / 'tests/example1.xlsx'),
                          sheet_name='stacked',
                          shape='stacked',
                          slice_header=slice_header,
                          slice_data=slice_data,
                          map_header=map_header,
                          map_features=feature_map2,
                          map_units=unit_map)[2]
    df_2.head(3)  # nan values
    """
    ----------------------
    Step 4: hgc.io.to_hgc()
    ----------------------
    Finally, we need to pivot the stacked data to the wide format used by HGC.
    The default is to use 'LocationID', 'Datetime' and 'SampleID' as index.
    """
    df_hgc = io.stack_to_hgc(df)

Exemple #7

0

Afficher le fichier

Fichier : test_bas_HWL.py Projet : KWR-Water/hgc

def test_hwl():
    """
    Created on Wed Sep 23 15:14:48 2020

    @author: griftba
    """
    import pandas as pd
    import hgc
    from hgc import ner 
    from hgc import io
    from pathlib import Path

    #pip install -U git+https://github.com/KWR-Water/[email protected]

    #    df_temp = pd.read_excel(r'./Chem_Data_bas.xlsx', sheet_name='Raai 2019-2020', header=None) # ignore headers!

    # step 1: generate feature map
    file_name = r'D:\DBOX\Dropbox\008KWR\0081Projects\QSAR_NER\testing_bas\Chem_Data_bas.xlsx'
    lst_features = list(pd.read_excel(file_name, sheet_name='Raai 2019-2020')['Parameter'])
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=lst_features)
    lst_units = list(pd.read_excel(file_name, sheet_name='Raai 2019-2020')['Unit'])
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=lst_units)
    slice_header = [0, slice(0, 8)]  # row 0
    slice_data = [slice(1, None), slice(0, 8)]
    dct2_arguments = {
        'file_path': file_name,
        'sheet_name': 'Raai 2019-2020',
        'shape': 'stacked',
        'slice_header': slice_header,
        'slice_data': slice_data,
        'map_header': {
            **io.default_map_header(), 'Date': 'Datetime', 'Sample': 'SampleID', 'Parameter':'Feature',
        },
        'map_features': feature_map,
        'map_units': unit_map,
    }
    df2 = io.import_file(**dct2_arguments)[0]


    dct2_arguments = {
        'file_path': file_name,
        'sheet_name': 'Raai 2019-2020',
        'shape': 'stacked',
        'slice_header': slice_header,
        'slice_data': slice_data,
        'map_header': {
            **io.default_map_header(), 'Date': 'Datetime', 'Sample': 'SampleID', 'Parameter':'Feature',
        },
        'map_features': feature_map,
        'map_units': unit_map,
    }
    df2_2 = io.import_file(**dct2_arguments)[2] 

    df2_DLV = hgc.io.stack_to_hgc(df2)

    df2_DLV.to_csv('DLV_raai2019_2020.csv')


    df2_DLV.hgc.make_valid()
    is_valid = df2_DLV.hgc.is_valid
    is_valid

    # Recognized HGC columns
    hgc_cols = df2_DLV.hgc.hgc_cols
    print(hgc_cols)

    sum_anions = df2_DLV.hgc.get_sum_anions_stuyfzand()

    water_types = df2_DLV.hgc.get_stuyfzand_water_type()
    print(water_types)
    bex = df2_DLV.hgc.get_bex()
    bex