def cleanUpAddress(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Clean up a free-text address column in the named data flow.

    Replaces the embedded escape sequence with ', ' and pads empty
    addresses, then regenerates the column/data-flow inventories and
    saves the package for the next stage.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used when opening the package
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: column holding the address; '' skips the clean-up
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    flow, packagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    # Guard clause: bail out early when there is nothing to process.
    if not flow:
        print('{0}: no package file found at location {1}'.format(
            dataName, packagePath))
        return None, None, None

    print('{0}: loaded package from path {1}'.format(dataName, packagePath))

    if operationFlag != '':
        # First replace the escape character with comma + space,
        # then mark any completely empty addresses.
        flow = flow.str_replace(operationFlag, r'\0d0a', ', ')
        flow = flow.replace(operationFlag, r', , , ', None)
        print('{0}: cleaned up addresses in column {1}'.format(
            dataName, operationFlag))

    # NOTE: an earlier experiment split the address into component columns
    # via split_column_by_example / split_column_by_delimiters; it is not
    # part of the current behaviour.

    profile = flow.get_profile()

    # Build the column-level and data-flow-level inventories.
    columnInventory = getColumnStats(profile, dataName, thisStageNumber,
                                     operatorToUse, operationFlag)
    dataFlowInventory = getDataFlowStats(flow, profile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)

    # Persist the result so the next stage can pick it up.
    # NOTE(review): saved with hard-coded quality 'A' rather than
    # qualityFlag, matching the original behaviour — confirm intended.
    targetPackagePath = saveDataFlowPackage(flow, dataName, thisStageNumber, 'A')
    print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

    return flow, columnInventory, dataFlowInventory
def removeDuplicates(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Remove duplicate rows from the named data flow.

    Applies ``distinct`` over the column selector given in
    ``operationFlag``, rebuilds the inventories and saves the package
    for the next stage.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used for load and save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: column-selector pattern; '' skips de-duplication
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':
            columnsToKeep = operationFlag
            # Capture the row count before de-duplication for the log line.
            numberOfRowsBefore = dataFlow.row_count
            dataFlow = dataFlow.distinct(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=False))
            # BUG FIX: log message previously read "rows afer".
            print(
                '{0}: removed duplicates from column {1} rows before {2} rows after {3}'
                .format(dataName, operationFlag, numberOfRowsBefore,
                        dataFlow.row_count))
        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def automaticallyDetectColumnTypes(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Learn and apply column type conversions for the named data flow.

    When ``operationFlag`` is 'Yes' the dataprep set_column_types builder
    learns candidate conversions (keeping month/day order for ambiguous
    dates) and applies them; inventories are then rebuilt and the package
    saved for the next stage.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used when opening the package
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: 'Yes' enables type detection; anything else skips it
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    flow, packagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    # Guard clause: nothing to do when the package is missing.
    if not flow:
        print('{0}: no package file found at location {1}'.format(
            dataName, packagePath))
        return None, None, None

    print('{0}: loaded package from path {1}'.format(dataName, packagePath))

    if operationFlag == 'Yes':
        # Detect candidate conversions, then apply them to the flow.
        typeBuilder = flow.builders.set_column_types()
        typeBuilder.learn()
        print('{0}: candidates detected {1}'.format(
            dataName, typeBuilder.conversion_candidates))
        # Resolve ambiguous dates as month-then-day.
        typeBuilder.ambiguous_date_conversions_keep_month_day()
        flow = typeBuilder.to_dataflow()

    profile = flow.get_profile()

    # Build the column-level and data-flow-level inventories.
    columnInventory = getColumnStats(profile, dataName, thisStageNumber,
                                     operatorToUse, operationFlag)
    dataFlowInventory = getDataFlowStats(flow, profile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)

    # Persist the result so the next stage can pick it up.
    # NOTE(review): saved with hard-coded quality 'A', as in the original.
    targetPackagePath = saveDataFlowPackage(flow, dataName, thisStageNumber, 'A')
    print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

    return flow, columnInventory, dataFlowInventory
def parseNulls(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Normalise null-like values across every column of the data flow.

    Runs ``replace_na`` over all columns, treating ``operationFlag`` as an
    additional custom null marker, then rebuilds the inventories and saves
    the package for the next stage.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used when opening the package
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: custom null string; '' skips null parsing
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    flow, packagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    # Guard clause: nothing to do when the package is missing.
    if not flow:
        print('{0}: no package file found at location {1}'.format(
            dataName, packagePath))
        return None, None, None

    print('{0}: loaded package from path {1}'.format(dataName, packagePath))

    if operationFlag != '':
        # Enumerate every column so the replacement covers the whole flow.
        allColumns = list(flow.get_profile().columns.keys())
        # Replace standard and custom null markers in one pass.
        flow = flow.replace_na(allColumns, custom_na_list=operationFlag)
        print(
            '{0}: parsed nulls including custom string {1} from {2} columns'
            .format(dataName, operationFlag, len(allColumns)))

    profile = flow.get_profile()

    # Build the column-level and data-flow-level inventories.
    columnInventory = getColumnStats(profile, dataName, thisStageNumber,
                                     operatorToUse, operationFlag)
    dataFlowInventory = getDataFlowStats(flow, profile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)

    # Persist the result so the next stage can pick it up.
    # NOTE(review): saved with hard-coded quality 'A', as in the original.
    targetPackagePath = saveDataFlowPackage(flow, dataName, thisStageNumber, 'A')
    print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

    return flow, columnInventory, dataFlowInventory
def removeRowsFromTop(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Skip a fixed number of rows from the top of the data flow.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used for load and save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: number of rows to remove, as a string; '' skips
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # ROBUSTNESS FIX: guard against an empty operation flag, consistent
        # with the other stage functions — int('') would raise ValueError.
        if operationFlag != '':
            # Now perform the operation on the dataFlow : ie remove the
            # number of rows specified from the top
            numberOfRowsToRemove = int(operationFlag)
            dataFlow = dataFlow.skip(numberOfRowsToRemove)
            print('{0}: removed first {1} row(s)'.format(dataName,
                                                         numberOfRowsToRemove))
        else:
            print('{0}: no rows to remove from the top'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def renameColumns(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Stage intended to rename columns in the named data flow.

    NOTE(review): no rename is actually performed — when operationFlag is
    set, only a log message is printed and the data flow passes through
    unchanged. Confirm whether the rename logic is still to be implemented.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used when opening the package
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: rename specification; '' means no operation
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))
        if operationFlag != '':
            # Do the operation on columns to rename them...
            # NOTE(review): the operation itself is missing here; only the
            # confirmation message below is emitted.
            print('{0}: renamed {1} columns'.format(dataName, operationFlag))
        else:
            print('{0}: no operation to perform'.format(dataName))
        dataProfile = dataFlow.get_profile()
        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        # Finally save the data flow so it can be passed onto the next stage of the process...
        # NOTE(review): saved with hard-coded quality 'A' rather than qualityFlag.
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))
        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def mapLookups(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Map coded lookup values to their descriptions via a config file.

    Loads the transformation configuration named by ``operationFlag`` from
    ./Config, and for each configured lookup column casts it to string and
    maps its values into a new destination column. Inventories are then
    rebuilt and the package saved for the next stage.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed package as
    :param qualityFlag: quality marker used for load and save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: transformation config file name; '' skips mapping
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, packagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    # Guard clause: bail out early when the package is missing.
    if not dataFlow:
        print('{0}: no package file found at location {1}'.format(
            dataName, packagePath))
        return None, None, None

    print('{0}: loaded package from path {1}'.format(dataName, packagePath))

    if operationFlag != '':
        transforms = load_transformation_configuration('./Config/' + operationFlag)
        # Only apply lookups when the config holds more than one entry.
        if len(transforms) > 1:
            for sourceColumn, lookupDictionary in get_lookups_from_transforms(transforms).items():
                # Force the source column to string so the map keys match.
                dataFlow = dataFlow.set_column_types(
                    {sourceColumn: dprep.FieldType.STRING})
                replacements = [
                    ReplacementsValue(code, lookupDictionary[code])
                    for code in lookupDictionary
                ]
                destination_column = get_destination_column_name(
                    sourceColumn, transforms)
                dataFlow = dataFlow.map_column(sourceColumn,
                                               destination_column,
                                               replacements)
                print(dataName + ': Transformed lookups for column - ' +
                      sourceColumn + '. Added new column ' + destination_column)
    else:
        print('{0}: no look-up processing required'.format(dataName))

    profile = dataFlow.get_profile()

    # Build the column-level and data-flow-level inventories.
    columnInventory = getColumnStats(profile, dataName, thisStageNumber,
                                     operatorToUse, operationFlag)
    dataFlowInventory = getDataFlowStats(dataFlow, profile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)

    # Persist the result so the next stage can pick it up.
    targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                            thisStageNumber, qualityFlag)
    print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

    return dataFlow, columnInventory, dataFlowInventory
def createUPMDataflow(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Project the source data flow into a UPM-shaped target data flow.

    Reads a column-mapping CSV from ./Config, renames each mapped source
    column to its target name, drops all unmapped columns, and saves the
    result under the configured target table name. The original source
    data flow is also saved and returned unchanged.

    :param dataName: name of the source data flow package
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed packages as
    :param qualityFlag: quality marker used for load and save of the source
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: mapping config CSV file name; '' skips mapping
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':
            mappingConfig = dprep.read_csv(
                './Config/' + operationFlag).to_pandas_dataframe()

            targetDataFlow = dataFlow
            columnsToKeep = ''

            for sourceTable in mappingConfig[mappingConfig.SourceTable ==
                                             dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][[
                            'SourceColumn', 'TargetColumn'
                        ]].values:
                    # BUG FIX: was `columnsToKeep is ''` — an identity
                    # comparison against a string literal, which is fragile
                    # and a SyntaxWarning on modern CPython. Use equality.
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        # Build a |-separated selector of target columns.
                        columnsToKeep = columnsToKeep + '|' + targetColumn
                    targetDataFlow = targetDataFlow.rename_columns(
                        {sourceColumn: targetColumn})

            # Drop everything that is not a mapped target column.
            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))

            # Save under the configured target table name.
            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]
                     ['TargetTable'].unique()))
            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName,
                                thisStageNumber, 'A')
        else:
            # BUG FIX: message previously said "no duplicate processing
            # required" (copy-paste from removeDuplicates).
            print('{0}: no column mapping processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
# Extract a stage-60 data flow to a CSV for data-analytics consumption.
import os

from azureml.dataprep import Dataflow
from commonDataFlowProcessingLoop import dataFlowProcessingLoop
from commonInventoryCreation import getColumnStats, getDataFlowStats
from commonPackageHandling import openDataFlowPackage, saveDataFlowPackage, createNewPackageDirectory
from mappingCode import load_transformation_configuration, get_lookups_from_transforms, get_destination_column_name

# Let's also set up global variables and common functions...
previousStageNumber = '60'
thisStageNumber = '61'
qualityFlag = 'A'
dataName = 'UPMFOLDER_UPMPERSON'
dataAnalyticsPath = './dataAnalytics'

#%%
# Open the data flow package that has been prepared...
dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
print('{0}: loaded package from path {1}'.format(dataName, fullPackagePath))

# Now convert it to a pandas dataframe...
dataFrame = dataFlow.to_pandas_dataframe()


def createFilePath(dataAnalyticsPath, dataName, stage, qualityFlag):
    """Ensure the analytics directory exists and return the extract path.

    BUG FIX: this function used `os` without it being imported anywhere in
    this script, which raised NameError at runtime — `import os` added above.

    NOTE(review): the `stage` parameter is accepted but never used in the
    generated file name — confirm whether the stage should be embedded.
    """
    if not os.path.isdir(dataAnalyticsPath):
        os.mkdir(dataAnalyticsPath)
    return dataAnalyticsPath + '/dataAnalyticsExtract_' + dataName + '_' + qualityFlag + '.csv'


fileName = createFilePath(dataAnalyticsPath, dataName, thisStageNumber, qualityFlag)
dataFrame.to_csv(fileName, index=None)
# Exploratory notebook: investigate splitting the ADDRESS column of the
# stage-60 UPMFOLDER_UPMPERSON package by example.
import pandas as pd
import azureml.dataprep as dprep
import os as os
import re as re
import collections
import seaborn as sns
import pandas_profiling as pp
import datetime
# NOTE(review): this shadows the `datetime` module imported just above —
# after this line `datetime` refers to the class, not the module.
from datetime import datetime
from azureml.dataprep import value
from azureml.dataprep import col
from azureml.dataprep import Dataflow
from commonPackageHandling import openDataFlowPackage

#%%
# Load the stage-60 'A'-quality package for UPMFOLDER_UPMPERSON.
dataFlow, fullPackagePath = openDataFlowPackage('UPMFOLDER_UPMPERSON', '60', 'A')

#%%
# Peek at the first ten rows.
dataFlow.head(10)

#%%
# Build a split-by-example transform for the ADDRESS column.
builder = dataFlow.builders.split_column_by_example('ADDRESS')

#%%
builder.preview()

#%%
# Drop the delimiters from the split output.
builder.keep_delimiters = False

#%%
# Split on the embedded escape sequence used in the address data.
builder.delimiters = r'\0d0a'
def splitTableBasedOnSingleColumn(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Fork the data flow into one new data flow per distinct column value.

    For each unique value in the column named by ``operationFlag``, filters
    the source flow to that value, saves the fork as a new package named
    ``<dataName>_<column>_<value>``, and accumulates inventories for every
    fork plus the source flow.

    :param dataName: name of the source data flow package
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save all packages as
    :param qualityFlag: quality marker used for load and save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: column to split on; '' means no split
    :return: (dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate)
             or (None, None, None) when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        dataProfile = dataFlow.get_profile()

        # Set up empty intermediate dataframes that we will use to build up
        # inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':
            # First, grab the unique set of values in the column
            valuesInColumn = dataProfile.columns[operationFlag].value_counts

            # Now filter the original data flow based on each unique value
            # in turn and fork a new data flow!
            for valueToSplitOn in valuesInColumn:
                newDataFlow = dataFlow.filter(
                    dataFlow[operationFlag] == valueToSplitOn.value)

                # Create a new name for this data flow based on concatenation
                # of source dataflow, column name and value used for filter
                newDataName = dataName + '_' + operationFlag + '_' + valueToSplitOn.value

                newDataProfile = newDataFlow.get_profile()

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the
                # next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, qualityFlag)
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))
        else:
            print('{0}: no operation required'.format(dataName))

        # Generate column and data flow inventories for the source dataflow
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
# Exploratory notebook: inspect the stage-22 PEOPLE package and experiment
# with column type detection.
import azureml.dataprep as dprep
import os as os
import re as re
import collections
import seaborn as sns
import pandas_profiling as pp
import datetime
from datetime import datetime
from azureml.dataprep import value
from azureml.dataprep import col
from azureml.dataprep import Dataflow
from commonCode import savePackage, openPackage, createFullPackagePath
from commonPackageHandling import openDataFlowPackage

#%%
# BUG FIX: openDataFlowPackage returns a (dataFlow, fullPackagePath) tuple
# everywhere else in this project; binding it to a single name made the
# subsequent dataFlow.head(10) call fail. Unpack the tuple instead.
dataFlow, fullPackagePath = openDataFlowPackage('PEOPLE', '22', 'A')

#%%
# Peek at the first ten rows.
dataFlow.head(10)

#%%
# Inspect the full column profile.
dataProfile = dataFlow.get_profile()
dataProfile

#%%
dataFlow.row_count

#%%
# Start a column-type detection builder for experimentation.
builder = dataFlow.builders.set_column_types()

#%%
def quarantineRows(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Quarantine rows that have data in unexpected extra columns.

    Compares the number of columns found in the flow with the expected
    header count (``operationFlag``). When extra columns exist, rows with
    values in them are saved as a 'B'-quality quarantine package, removed
    from the main flow, and the extra columns dropped.

    NOTE: assumes all unwanted columns are on the far right of the table.

    :param dataName: name of the data flow package to process
    :param previousStageNumber: stage to load the package from
    :param thisStageNumber: stage to save the processed packages as
    :param qualityFlag: quality marker used for load and for the clean save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: expected column count, as a string
    :return: (dataFlow, columnInventory, dataFlowInventory) or (None, None, None)
             when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # First count the number of columns found
        dataFlowColumns = list(dataFlow.get_profile().columns.keys())
        numberOfColumnsFound = len(dataFlowColumns)

        # Now convert the operationFlag to an integer
        headerCount = int(operationFlag)

        # If we have more columns than expected, we quarantine rows which
        # have values in the extra columns
        if numberOfColumnsFound > headerCount:
            # NOTE - this logic assumes that all unwanted columns are on the
            # far right, this could be improved!
            # Fork a new data flow with rows that have data in the
            # un-expected columns
            print(
                '{0}: we have found {1} columns, expected {2} so will now quarantine any rows with data in them '
                .format(dataName, numberOfColumnsFound, headerCount))
            # drop_nulls over the extra columns keeps only rows that HAVE
            # values there — i.e. the rows to quarantine.
            quarantinedDataFlow = dataFlow.drop_nulls(
                dataFlowColumns[headerCount:])

            # There is a chance we have an extra column but no rows to
            # quarantine, so check this first
            if quarantinedDataFlow.row_count is None:
                quarantinedRowCount = 0
                print('{0}: no rows to quarantine'.format(dataName))
            else:
                quarantinedRowCount = quarantinedDataFlow.row_count

            # Finally save the data flow so it can be used later
            # ('B' quality marks the quarantine package).
            fullPackagePath = saveDataFlowPackage(quarantinedDataFlow,
                                                  dataName, thisStageNumber,
                                                  'B')
            print('{0}: quarantined {1} rows of data to {2}'.format(
                dataName, quarantinedRowCount, fullPackagePath))

            # Now filter out the quarantined rows from the main data set
            # NOTE : can't figure out a better way of doing this for now -
            # see note below...
            for columnToCheck in dataFlowColumns[headerCount:]:
                # NOTE - don't know why commented line of code below doesn't work!
                # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
                # Workaround: mark non-empty values as errors, then keep
                # only the rows where the assertion fired... inverted by
                # is_error() — i.e. keep rows that are empty in this column.
                dataFlow = dataFlow.assert_value(columnToCheck,
                                                 value != '',
                                                 error_code='ShouldBeNone')
                dataFlow = dataFlow.filter(col(columnToCheck).is_error())
                print('{0}: filtered column {1}, row count now {2}'.format(
                    dataName, columnToCheck, dataFlow.row_count))

            # Finally drop the extra columns
            dataFlow = dataFlow.drop_columns(dataFlowColumns[headerCount:])
            print('{0}: dropped {1} unwanted columns'.format(
                dataName, len(dataFlowColumns[headerCount:])))
        else:
            print(
                '{0}: we have found {1} columns, expected {2} so not going to do anything'
                .format(dataName, numberOfColumnsFound, headerCount))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
    # NOTE(review): this `return` is the tail of a function definition that
    # begins before this chunk (presumably return_day, given the vectorised
    # call below) — the full definition is not visible here.
    return x.day


def return_date(x):
    """Return x unchanged if it is a datetime with a day set, else ''."""
    if not isinstance(x, datetime.datetime):
        return ''
    if x.day is None:
        return ''
    return x


#%%
# Load the stage-50 PEOPLE package and sample the first 1000 rows.
dataFlow, dataFlowPath = openDataFlowPackage('PEOPLE', '50', 'A')
df = dataFlow.to_pandas_dataframe().head(1000)
profile = dataFlow.get_profile()
dob = df['DOB']

# Vectorised wrappers over the date-component helpers (return_day,
# return_month and return_year are defined outside this chunk).
to_days = np.vectorize(lambda x: return_day(x))
to_months = np.vectorize(lambda x: return_month(x))
to_years = np.vectorize(lambda x: return_year(x))
to_date = np.vectorize(lambda x: return_date(x))

#%%
histograms = []

#%%
def joinTables(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    """Join the source data flow to other flows per a join-config CSV.

    For each row of the config, loads the right-hand data flow and performs
    an inner join; for joinType 'LEFT', the unmatched left rows are appended
    to emulate a left-outer join. Each joined flow is saved as a new package
    named ``<left>_<right>`` and its inventories accumulated.

    :param dataName: name of the left/source data flow package
    :param previousStageNumber: stage to load packages from
    :param thisStageNumber: stage to save packages as
    :param qualityFlag: quality marker used for loads and the source save
    :param operatorToUse: operator name recorded in the inventories
    :param operationFlag: join config CSV file name; '' skips joining
    :return: (dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate)
             or (None, None, None) when no package is found.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Set up empty intermediate dataframes that we will use to build up
        # inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':
            # Load config file
            joinConfig = dprep.read_csv('./Config/' +
                                        operationFlag).to_pandas_dataframe()

            # For each config in the file...
            for index, row in joinConfig.iterrows():
                leftDataName = row['LeftDataName']
                leftDataFlowJoinColumn = row['LeftDataFlowJoinColumn']
                rightDataName = row['RightDataName']
                rightDataFlowJoinColumn = row['RightDataFlowJoinColumn']
                joinType = row['JoinType']

                print(
                    '{0}: ready to join {1} {2} -> {3} {4} using jointype {5}'.
                    format(dataName, leftDataName, leftDataFlowJoinColumn,
                           rightDataName, rightDataFlowJoinColumn, joinType))

                # Load right hand data flow
                rightDataFlow, fullPackagePath = openDataFlowPackage(
                    rightDataName, previousStageNumber, qualityFlag)
                print('{0}: loaded package from path {1}'.format(
                    rightDataName, fullPackagePath))

                # We always perform the inner "MATCH" style join first
                join_builder = dataFlow.builders.join(
                    right_dataflow=rightDataFlow,
                    left_column_prefix=dataName + '_',
                    right_column_prefix=rightDataName + '_')
                join_builder.detect_column_info()
                join_builder.join_key_pairs = [(leftDataFlowJoinColumn,
                                                rightDataFlowJoinColumn)]
                # Setting up join type:
                # NONE = 0
                # MATCH = 2
                # UNMATCHLEFT = 4
                # UNMATCHRIGHT = 8
                join_builder.join_type = 2
                innerDataFlow = join_builder.to_dataflow()
                print('{0} created inner dataflow : Columns : {1}, Rows : {2}'.
                      format(dataName, len(innerDataFlow.get_profile().columns),
                             innerDataFlow.row_count))

                if joinType == "LEFT":
                    # Use the "UNMATCHLEFT" setting to grab the rows that
                    # haven't been joined from the left data flow
                    join_builder.join_type = 4
                    leftUnmatchedDataFlow = join_builder.to_dataflow()
                    print(
                        '{0} created left unmatched dataflow : Columns : {1}, Rows : {2}'
                        .format(
                            dataName,
                            len(leftUnmatchedDataFlow.get_profile().columns),
                            leftUnmatchedDataFlow.row_count))
                    # Now append this dataflow to the original inner join
                    # dataflow, to create a "left outer join"
                    newDataFlow = innerDataFlow.append_rows(
                        [leftUnmatchedDataFlow])
                else:
                    newDataFlow = innerDataFlow

                # Create a new name for this data flow based on concatenation
                # of left dataflow and right
                newDataName = dataName + '_' + rightDataName

                # Output key stats
                # NOTE(review): these format strings use {0} twice with three
                # arguments, so the table name prints twice and the row count
                # argument is unused in the first placeholder pair — probably
                # meant '{0} ... : Columns : {1}, Rows : {2}' only; confirm.
                print('{0} left table : {0}, Columns : {1}, Rows : {2}'.format(
                    leftDataName, len(dataFlow.get_profile().columns),
                    dataFlow.row_count))
                print(
                    '{0} right table : {0}, Columns : {1}, Rows : {2}'.format(
                        rightDataName,
                        len(rightDataFlow.get_profile().columns),
                        rightDataFlow.row_count))
                newDataProfile = newDataFlow.get_profile()
                print(
                    '{0} joined table : {0}, Columns : {1}, Rows : {2}'.format(
                        newDataName, len(newDataProfile.columns),
                        newDataFlow.row_count))

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the
                # next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, 'A')
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))
        else:
            print('{0}: no joining of tables required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved source package to {1}'.format(
            dataName, targetPackagePath))

        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None