def test_exm_wide(): """ =================== Example: import wide data =================== Next, we will import the same data, but from a ‘wide’ shaped file. Note that it is also possible to use a dataframe instead of excel or csv as input for hgc.io.import_file(). This requires using the argument “dataframe” instead of “file_name”. An advantage of this approach is to prevent repeatedly reading the input file . """ df_temp = pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx', sheet_name='wide', header=None) # ignore headers! # step 1: generate feature map feature_map2, feature_unmapped2, df_feature_map2 = ner.generate_feature_map( entity_orig=list(df_temp.iloc[2, 5:])) assert (feature_map2 == { 'chloride': 'Cl', 'manganese': 'Mn', 'nietrate': 'NO3', 'nitrate (filtered)': 'NO3' }) # step 2: generate unit map unit_map2, unit_unmapped2, df_unit_map2 = ner.generate_unit_map( entity_orig=list(df_temp.iloc[3, 5:])) assert (unit_map2 == { 'mg-N/L': 'mg/L N', 'mg/L': 'mg/L', 'ug/L': 'μg/L', 'μS/cm': 'μS/cm' }) # step 3: import file df2 = io.import_file(dataframe=df_temp, shape='wide', slice_header=[3, slice(2, 5)], slice_feature=[2, slice(5, None)], slice_unit=[3, slice(5, None)], slice_data=[slice(4, None)], map_header={ **io.default_map_header(), 'loc.': 'LocationID', 'date': 'Datetime', 'sample': 'SampleID' }, map_features={ **feature_map2, 'EC new sensor': 'ec_field' }, map_units=unit_map2)[0] # step 4: convert to wide format df2_hgc = io.stack_to_hgc(df2)
def test_VitensOMIVE(): # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv' # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv' WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/preprocessed/Vitens_PP_WP_OMIVE_2009_2020.xlsx' df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1') # define the nrow here n_row = None feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[1, slice(8, 819)].dropna())) unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[2, slice(8, 819)].dropna())) # create a df to record what has been mapped and what has not df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose() if not not feature_unmapped: df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature'])) if not not unit_unmapped: df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit'])) dct2_arguments = { 'file_path': WD, 'sheet_name': 'Sheet1', 'shape': 'wide', 'slice_header': [1, slice(1, 8)], 'slice_feature': [1, slice(8, None)], 'slice_unit': [2, slice(8, None)], 'slice_data': [slice(3, n_row), slice(1, None)], 'map_header': { **io.default_map_header(), # 'Monsterpuntcode': 'LocationID', # 'Omschrijving (Parameter)':'Feature', # 'Eenheid (Parameter)': 'Unit', # 'Waarde numeriek': 'Value', # Gerapporteerde waarde, right?! 'Datum': 'Datetime', 'Naam': 'SampleID', # Analyse !? }, 'map_features': {**feature_map,'pH(1)':'pH'}, 'map_units': {**unit_map}, } df2 = io.import_file(**dct2_arguments)[0] df2_hgc = io.stack_to_hgc(df2) # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer: with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE.xlsx') as writer: df2_hgc.to_excel(writer, sheet_name='hgc_VitenOMIVE') # df2.to_excel(writer, sheet_name='VitensOMIVE') df_map.to_excel(writer, sheet_name='mapAndUnmap') df2.to_csv(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE_ref.csv')
def test_LIMS_Ruw_2017_2019(): # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv' # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv' WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_preprocessed.xlsx' df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1') # define the nrow here n_row = None feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 6].dropna())) unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 9].dropna())) # create a df to record what has been mapped and what has not df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose() if not not feature_unmapped: df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature'])) if not not unit_unmapped: df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit'])) dct2_arguments = { 'file_path': WD, 'sheet_name': 'Export Worksheet', 'shape': 'stacked', 'slice_header': [1, slice(1, 10)], 'slice_data': [slice(1, n_row), slice(1, 10)], 'map_header': { **io.default_map_header(), 'POINTDESCR': 'LocationID', 'ANALYTE':'Feature', 'UNITS': 'Unit', 'FINAL': 'Value', # Gerapporteerde waarde, right?! 'SAMPDATE': 'Datetime', 'TESTNO': 'SampleID', # Analyse !? }, 'map_features': {**feature_map,'pH(1)':'pH'}, 'map_units': {**unit_map, 'µg/l Hxdcn-d34':'µg/l'}, } df2 = io.import_file(**dct2_arguments)[0] df2_hgc = io.stack_to_hgc(df2) # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer: with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_processed.xlsx') as writer: df2_hgc.to_excel(writer, sheet_name='hgc_LIMS_Ruw_2017_2019') df2.to_excel(writer, sheet_name='LIMS_Ruw_2017_2019') df_map.to_excel(writer, sheet_name='mapAndUnmap')
def test_province(): # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv' WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse'+'/provincie_data_long_preprocessed.csv' df_temp = pd.read_csv(WD, encoding='ISO-8859-1', header=None) # define the nrow here n_row = None feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 25].dropna())) unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 26].dropna())) # create a df to record what has been mapped and what has not df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose() if not not feature_unmapped: df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature'])) if not not unit_unmapped: df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit'])) dct2_arguments = { 'file_path': WD, 'sheet_name': 'stacked', 'shape': 'stacked', 'slice_header': [1, slice(1, None)], 'slice_data': [slice(1, n_row), slice(1, None)], 'map_header': { **io.default_map_header(), 'MeetpuntId': 'LocationID', 'parameter':'Feature', 'eenheid': 'Unit', 'waarde': 'Value', 'Opgegeven bemonstering datum': 'Datetime', 'Monsternummer': 'SampleID', # "SampleID" already exists as header, but contains wrong date. Use "Sample number" as "SampleID" # 'SampleID': None # otherwise exists twice in output file }, 'map_features': {**feature_map,'pH(1)':'pH'}, 'map_units': {**unit_map, 'oC':'°C'}, } df2 = io.import_file(**dct2_arguments)[0] df2_hgc = io.stack_to_hgc(df2) with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse'+r'/provincie_processed.xlsx') as writer: df2_hgc.to_excel(writer, sheet_name='hgc_prov') df2.to_excel(writer, sheet_name='df_prov') df_map.to_excel(writer, sheet_name='mapAndUnmap')
def test_KIWKVenloschol(): # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv' # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv' WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_preprocessed.xlsx' df_temp = pd.read_excel(WD, header=None, encoding='ISO-8859-1') # define the nrow here n_row = None feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 20].dropna())) unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 21].dropna())) # create a df to record what has been mapped and what has not df_map = pd.DataFrame((feature_map.keys(),feature_map.values(),unit_map.keys(),unit_map.values()), index=['Feature','Mapped feature','Unit','Mapped unit']).transpose() if not not feature_unmapped: df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature'])) if not not unit_unmapped: df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit'])) dct2_arguments = { 'file_path': WD, 'sheet_name': 'Export KoW 2.0', 'shape': 'stacked', 'slice_header': [1, slice(1, 24)], 'slice_data': [slice(1, n_row), slice(1, 24)], 'map_header': { **io.default_map_header(), 'Monsterpunt': 'LocationID', 'Parameter omschrijving':'Feature', 'Eenheid': 'Unit', 'Gerapporteerde waarde': 'Value', # Gerapporteerde waarde, right?! 'Monstername datum': 'Datetime', 'Analyse': 'SampleID', # Analyse !? }, 'map_features': {**feature_map,'pH(1)':'pH'}, 'map_units': {**unit_map, 'µg/l atrazine-D5':'µg/l'}, } df2 = io.import_file(**dct2_arguments)[0] df2_hgc = io.stack_to_hgc(df2) # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer: with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_processed.xlsx') as writer: df2_hgc.to_excel(writer, sheet_name='hgc_KIWK Venloschol') df2.to_excel(writer, sheet_name='KIWK Venloschol') df_map.to_excel(writer, sheet_name='mapAndUnmap')
def test_exm_stacked(): """ ---------------------- Step 1: ner.generate_feature_map() ---------------------- First map the features in the original file, to feature names recognized by HGC. """ # compile a list of features by slicing the original file lst_features = list( pd.read_excel('./tests/example1.xlsx', sheet_name='stacked')['Feature']) # automatically detect features using text recognition feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map( entity_orig=lst_features) # check if features are correctly mapped assert (feature_map == { 'chloride': 'Cl', 'nitrate (filtered)': 'NO3', 'manganese': 'Mn', 'nietrate': 'NO3' }) # check for which features the algorithm was not able to find a match that met the minimum resemblance. assert (feature_unmapped == ['EC new sensor']) # The dataframe can be used to check in more detail the scores how well the original features were matched to HGC features. # This can be handy if you want to identify common errors and update the underlying database. df_feature_map.head(5) """ In this case we find that the algorithm was not able to find a match for one of the features ('EC new sensor'). Hence, we need to adjust the mapping manually. """ # manually adjust the mapping by merging with a user defined dictionary (optional) feature_map2 = {**feature_map, 'EC new sensor': 'ec_field'} """ ---------------------- Step 2: hgc.io. generate_unit_map() ---------------------- Next, we need to make a mapping for the units, using the same approach as for the features. """ lst_units = list( pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx', sheet_name='stacked')['Unit']) unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map( entity_orig=lst_units) assert (unit_map == { 'mg-N/L': 'mg/L N', 'mg/L': 'mg/L', 'ug/L': 'μg/L', 'μS/cm': 'μS/cm' }) """ ---------------------- Step 3: hgc.io.import_file() ---------------------- The third step is to read the original file and and convert the data to the desired datamodel. This requires that we first indicate where to find the data and how to convert it. """ # Arguments defining where to find data slice_header = [0, slice(0, 6)] # row 0 slice_data = [slice(1, None) ] # row 1 till end of file. "None" indicates "end" here. # Arguments how to convert the data # map_header --> mapping how to adjust headers name # Note: The headers 'Value', 'Unit' and 'SampleID' are compulsory. Other headers can be any string map_header = { **io.default_map_header(), 'loc.': 'LocationID', 'date': 'Datetime', 'sample': 'SampleID' } # map_features --> see step 1 # map_units --> see step 2 # feature_units --> mapping of the desired units for each feature # For instance, we can inspect the default units for Cl, NO3 and ec_field assert (io.default_feature_units()['Cl'] == 'mg/L') assert (io.default_feature_units()['NO3'] == 'mg/L') assert (io.default_feature_units()['ec_field'] == 'mS/m') # column_dtype --> desired dtypefor columns # we will use the default dtype print(io.default_column_dtype()) # use default values """ Now the we have defined all the arguments, let's import the data """ df = io.import_file(file_path=str( Path(__file__).cwd() / 'tests/example1.xlsx'), sheet_name='stacked', shape='stacked', slice_header=slice_header, slice_data=slice_data, map_header=map_header, map_features=feature_map2, map_units=unit_map)[0] df.head(3) # imported data df_1 = io.import_file(file_path=str( Path(__file__).cwd() / 'tests/example1.xlsx'), sheet_name='stacked', shape='stacked', slice_header=slice_header, slice_data=slice_data, map_header=map_header, map_features=feature_map2, map_units=unit_map)[1] df_1.head(3) # duplication df_2 = io.import_file(file_path=str( Path(__file__).cwd() / 'tests/example1.xlsx'), sheet_name='stacked', shape='stacked', slice_header=slice_header, slice_data=slice_data, map_header=map_header, map_features=feature_map2, map_units=unit_map)[2] df_2.head(3) # nan values """ ---------------------- Step 4: hgc.io.to_hgc() ---------------------- Finally, we need to pivot the stacked data to the wide format used by HGC. The default is to use 'LocationID', 'Datetime' and 'SampleID' as index. """ df_hgc = io.stack_to_hgc(df)