Esempio n. 1
0
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'RemoveDuplicates',
                                              removeDuplicates)

#%%
dataFlowInventoryAll
        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'AutoDetectTypes',
                                              automaticallyDetectColumnTypes)

#%%
dataFlowInventoryAll
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'UPMDataFlowMapping',
                                              createUPMDataflow)

#%%
dataFlowInventoryAll
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'MapLookups', mapLookups)

#%%
dataFlowInventoryAll
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'CleanUpAddress', cleanUpAddress)

#%%
dataFlowInventoryAll
Esempio n. 6
0
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'RenameColumns', renameColumns)

#%%
dataFlowInventoryAll
        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'ParseNullString', parseNulls)

#%%
dataFlowInventoryAll
Esempio n. 8
0
#%% [markdown]
# # Join Tables - Pass 2
# To join two tables:
# - Define left and right dataflows
# - Define left and right join keys
# NOTE - future enhancement would be to quarantine orphaned rows from left and right dataflows

#%%
# Import all of the libraries we need to use...
import pandas as pd
import azureml.dataprep as dprep
import os as os
import re as re
import collections
from azureml.dataprep import col
from azureml.dataprep import Dataflow
from commonDataFlowProcessingLoop import dataFlowProcessingLoop
from commonInventoryCreation import getColumnStats, getDataFlowStats
from commonPackageHandling import openDataFlowPackage, saveDataFlowPackage
from commonJoinHandling import joinTables

# Let's also set up global variables and common functions...
previousStageNumber = '40'
thisStageNumber = '41'

#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'JoinTablesPass2', joinTables)

#%%
dataFlowInventoryAll
Esempio n. 9
0
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components badk to the main loop...
        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'SplitTable',
                                              splitTableBasedOnSingleColumn)

#%%
dataFlowInventoryAll
        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components badk to the main loop...
        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'QuarantineExtraColumns',
                                              quarantineRows)

#%%
dataFlowInventoryAll
        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components badk to the main loop...
        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None


#%%
dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber,
                                              thisStageNumber, 'A',
                                              'RemoveRowsFromTop',
                                              removeRowsFromTop)

#%%
dataFlowInventoryAll