Code example #1
File: data_summary.py  Project: vhcg77/pydqc
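These snippets are excerpts from the pydqc package, so they rely on module-level imports and on private helpers (`_check_string`, `_check_numeric`, `_check_date`, `_insert_string_results`, `_insert_numeric_results`, `_insert_df`, `_adjust_ws`, ...) defined elsewhere in the package. A plausible minimal import block for the excerpts, assuming joblib supplies `Parallel`/`delayed`, would be:

# assumed module-level imports for the excerpts below
import os
import shutil
import datetime

import numpy as np
import pandas as pd
import openpyxl
from joblib import Parallel, delayed
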
def data_summary(table_schema,
                 _table,
                 fname,
                 sample_size=1.0,
                 feature_colname='column',
                 dtype_colname='type',
                 output_root='',
                 keep_images=False,
                 n_jobs=1):
    """
	Summarize basic information of all columns in a data table
	based on the provided data schema

	Parameters
	----------
	table_schema: pandas DataFrame
		schema of the table, should contain data types of each column
	_table: pandas DataFrame
		the data table
	fname: string
		the output file name
	sample_size: integer or float(<=1.0), default=1.0
		int: number of sample rows to do the summary (useful for large tables)
		float: sample size in percentage
	feature_colname: string
		name of the column for feature
	dtype_colname: string
		name of the column for data type
	output_root: string
		the root directory for the output file
	keep_images: boolean
		whether to keep all generated images
	n_jobs: int
		the number of jobs to run in parall
	"""

    # make a copy of the raw table
    table = _table.copy()

    # calculate the sample size
    if sample_size <= 1.0:
        sample_size = int(table.shape[0] * sample_size)

    if sample_size < table.shape[0]:
        table = table.sample(sample_size).reset_index(drop=True)

    # classify features based on data type
    key_features = table_schema[table_schema[dtype_colname] ==
                                'key'][feature_colname].values
    numeric_features = table_schema[table_schema[dtype_colname] ==
                                    'numeric'][feature_colname].values
    string_features = table_schema[table_schema[dtype_colname] ==
                                   'str'][feature_colname].values
    date_features = table_schema[table_schema[dtype_colname] ==
                                 'date'][feature_colname].values

    # features with wrong types
    type_correct_features = list(key_features) + list(numeric_features) + list(
        string_features) + list(date_features)
    type_error_features = list(
        set(table_schema[feature_colname].values) - set(type_correct_features))

    # features not in table
    null_features = []

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # for key features
    # record missing features first, then only check features present in the table
    null_features += [
        feat for feat in key_features if feat not in table.columns.values
    ]
    key_features = [
        feat for feat in key_features if feat in table.columns.values
    ]
    if len(key_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in key_features)
        ws = wb.create_sheet(title='key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # for numeric features
    # record missing features first, then only check features present in the table
    null_features += [
        feat for feat in numeric_features if feat not in table.columns.values
    ]
    numeric_features = [
        feat for feat in numeric_features if feat in table.columns.values
    ]
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir)
            for col in numeric_features)
        ws = wb.create_sheet(title='numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # for string features
    # record missing features first, then only check features present in the table
    null_features += [
        feat for feat in string_features if feat not in table.columns.values
    ]
    string_features = [
        feat for feat in string_features if feat in table.columns.values
    ]
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]])
            for col in string_features)
        ws = wb.create_sheet(title='string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # for date features
    # record missing features first, then only check features present in the table
    null_features += [
        feat for feat in date_features if feat not in table.columns.values
    ]
    date_features = [
        feat for feat in date_features if feat in table.columns.values
    ]
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            # time since the snapshot date, cast to elapsed months
            # (the 'timedelta64[M]' cast is a legacy pandas idiom,
            # no longer supported in pandas 2.x)
            table['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table[col], errors='coerce')).astype(
                    'timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' %
                                 (col), table[['%s_numeric' %
                                               (col), col]], img_dir)
            for col in date_features)

        ws = wb.create_sheet(title='date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema
    ws = wb['Sheet']
    ws.title = 'schema'
    out_schema = table_schema[[feature_colname, dtype_colname]].copy()
    out_schema['check'] = 'Ok'

    error_indices = []
    if len(type_error_features) > 0:
        out_schema.loc[out_schema[feature_colname].isin(type_error_features),
                       'check'] = 'type error'
        error_indices += list(out_schema[out_schema[feature_colname].isin(
            type_error_features)].index.values)
    if len(null_features) > 0:
        out_schema.loc[out_schema[feature_colname].isin(null_features),
                       'check'] = 'not in table'
        error_indices += list(out_schema[out_schema[feature_colname].isin(
            null_features)].index.values)

    _ = _insert_df(out_schema, ws, header=True)
    if len(error_indices) > 0:
        for idx in error_indices:
            ws['C%d' % (idx + 2)].style = 'Bad'

    _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' %
                                  (fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)
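
For illustration, a hypothetical call with toy inputs (the schema format, a 'column'/'type' DataFrame with types in {'key', 'numeric', 'str', 'date'}, follows the function's own docstring):

# hypothetical toy inputs, for illustration only
df = pd.DataFrame({'user_id': range(1000),
                   'amount': np.random.rand(1000)})
schema = pd.DataFrame({'column': ['user_id', 'amount'],
                       'type': ['key', 'numeric']})

# expected to write data_summary_demo.xlsx with 'schema', 'key' and 'numeric' sheets
data_summary(schema, df, fname='demo', n_jobs=2)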
Code example #2
def data_consist(table1,
                 table2,
                 key1,
                 key2,
                 schema1,
                 schema2,
                 fname,
                 sample_size=1.0,
                 output_root='',
                 keep_images=False,
                 n_jobs=1):
    """
    Check consistency between two tables

    Parameters
    ----------
    table1: pandas DataFrame
        one of the two tables to compare
    table2: pandas DataFrame
        one of the two tables to compare
    key1: string
        key for table1
    key2: string
        key for table2
    schema1: pandas DataFrame
        data schema (contains column names and corresponding data types) for table1
    schema2: pandas DataFrame
        data schema (contains column names and corresponding data types) for table2
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the comparison (useful for large tables)
        float: sample size in percentage
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """

    # check whether keys are valid
    if key1 not in table1.columns.values:
        raise ValueError('key1: does not exist in table1')
    if key2 not in table2.columns.values:
        raise ValueError('key2: does not exist in table2')

    # check whether two tables are unique in key level
    if table1[key1].nunique() != table1.shape[0]:
        raise ValueError('table1: should be unique in %s level' % (key1))
    if table2[key2].nunique() != table2.shape[0]:
        raise ValueError('table2: should be unique in %s level' % (key2))

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError(
                'sample_size: only accept integer when it is > 1.0')
        if (sample_size > table1.shape[0]) or (sample_size > table2.shape[0]):
            print('sample_size: %d is larger than table1 (%d rows) or table2 (%d rows)...' %
                  (sample_size, table1.shape[0], table2.shape[0]))

    # check output_root
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: directory does not exist')

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # restrict both tables to the keys they share, then sample
    both_keys = list(
        set(table1[key1].values).intersection(set(table2[key2].values)))
    if sample_size <= 1.0:
        sample_size = np.min([
            int(table1.shape[0] * sample_size),
            int(table2.shape[0] * sample_size)
        ])
    # the sample can never exceed the number of shared keys
    sample_size = int(np.min([sample_size, len(both_keys)]))
    sample_keys = np.random.choice(both_keys, sample_size, replace=False)
    table1 = table1[table1[key1].isin(sample_keys)].reset_index(drop=True)
    table2 = table2[table2[key2].isin(sample_keys)].reset_index(drop=True)

    schema, check_features = _check_features(schema1, schema2)
    corr_results = []

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)

        for key_result in key_results:
            if 'corr' in key_result.keys():
                corr_results.append(key_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title=u'key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(delayed(_consist_numeric)(
            col, table1[[key1, col]], table2[[key2, col]], key1, key2, img_dir)
                                                   for col in numeric_features)

        for numeric_result in numeric_results:
            if 'corr' in numeric_result.keys():
                corr_results.append(numeric_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title=u'numeric')
        _insert_numeric_results(numeric_results, ws, 45, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(delayed(_consist_string)(
            col, table1[[key1, col]], table2[[key2, col]], key1, key2)
                                                  for col in string_features)

        for string_result in string_results:
            if 'corr' in string_result.keys():
                corr_results.append(string_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title=u'string')
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            # convert both date columns to elapsed months from today
            # (legacy 'timedelta64[M]' cast, removed in pandas 2.x)
            table1[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table1[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
            table2[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table2[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col,
                                      table1[[key1, col]],
                                      table2[[key2, col]],
                                      key1,
                                      key2,
                                      img_dir,
                                      date_flag=True) for col in date_features)

        for date_result in date_results:
            if 'corr' in date_result.keys():
                corr_results.append(date_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title=u'date')
        _insert_numeric_results(date_results, ws, 45, img_dir, date_flag=True)

    # insert the summary
    _insert_summary(wb, schema, corr_results)

    wb.save(filename=os.path.join(output_root, 'data_consist_%s.xlsx' %
                                  (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
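
A hypothetical call with two toy tables keyed on 'id', assuming `_check_features` accepts schemas in the same 'column'/'type' format:

# hypothetical toy tables, for illustration only
t1 = pd.DataFrame({'id': range(500), 'amount': np.random.rand(500)})
t2 = t1.copy()
t2['amount'] += np.random.normal(0, 0.01, 500)
schema = pd.DataFrame({'column': ['id', 'amount'],
                       'type': ['key', 'numeric']})

# expected to write data_consist_demo.xlsx with per-feature consistency checks
data_consist(t1, t2, key1='id', key2='id',
             schema1=schema, schema2=schema, fname='demo')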
Code example #3
File: data_compare.py  Project: vhcg77/pydqc
def data_compare(_table1,
                 _table2,
                 _schema1,
                 _schema2,
                 fname,
                 sample_size=1.0,
                 feature_colname1='column',
                 feature_colname2='column',
                 dtype_colname1='type',
                 dtype_colname2='type',
                 output_root='',
                 keep_images=False,
                 n_jobs=1):
    """
	Compare columns between two tables

	Parameters
	----------
	_table1: pandas DataFrame
		one of the two tables to compare
	_table2: pandas DataFrame
		one of the two tables to compare
	_schema1: pandas DataFrame
		data schema (contains column names and corresponding data types) for _table1
	_schema2: pandas DataFrame
		data schema (contains column names and corresponding data types) for _table2
	fname: string
		the output file name
	sample_size: integer or float(<=1.0), default=1.0
		int: number of sample rows to do the comparison (useful for large tables)
		float: sample size in percentage
	feature_colname1: string, default='column'
		name of the column for feature of _table1
	feature_colname2: string, default='column'
		name of the column for feature of _table2
	dtype_colname1: string, default='type'
		name of the column for data type of _table1
	dtype_colname2: string, default='type'
		name of the column for data type of _table2
	output_root: string, default=''
		the root directory for the output file
	keep_images: boolean, default=False
		whether to keep all generated images
	n_jobs: int, default=1
		the number of jobs to run in parallel
	"""

    # start to compare with correct schemas
    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # copy data tables
    table1 = _table1.copy()
    table2 = _table2.copy()

    # calculate the sample size
    if sample_size <= 1.0:
        sample_size1 = int(table1.shape[0] * sample_size)
        sample_size2 = int(table2.shape[0] * sample_size)
        sample_size = np.min([sample_size1, sample_size2])

    # copy both schema
    schema1 = _schema1.copy()[[feature_colname1,
                               dtype_colname1]].rename(columns={
                                   feature_colname1: 'column_1',
                                   dtype_colname1: 'type_1'
                               })
    schema2 = _schema2.copy()[[feature_colname2,
                               dtype_colname2]].rename(columns={
                                   feature_colname2: 'column_2',
                                   dtype_colname2: 'type_2'
                               })

    # merge two schemas
    schema = schema1.merge(schema2,
                           left_on='column_1',
                           right_on='column_2',
                           how='outer')

    # if data types are different in schema1 and schema2, move to error
    schema_error = schema[schema['type_1'] != schema['type_2']].reset_index(
        drop=True)
    schema_error['error'] = "inconsistent data types"
    schema_error.loc[schema_error['column_1'].isnull(),
                     'error'] = "column not in table1"
    schema_error.loc[schema_error['column_2'].isnull(),
                     'error'] = "column not in table2"
    schema_correct = schema[schema['type_1'] == schema['type_2']].reset_index(
        drop=True)

    # classify the features to compare
    key_features = schema_correct[schema_correct['type_1'] ==
                                  'key']['column_1'].values
    numeric_features = schema_correct[schema_correct['type_1'] ==
                                      'numeric']['column_1'].values
    string_features = schema_correct[schema_correct['type_1'] ==
                                     'str']['column_1'].values
    date_features = schema_correct[schema_correct['type_1'] ==
                                   'date']['column_1'].values

    corr_results = []

    # for key features
    # only check features in both tables
    key_features = [
        feat for feat in key_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)

        for key_result in key_results:
            if 'corr' in key_result.keys():
                corr_results.append(key_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # sample only after comparing keys, so key checks see the full tables
    if sample_size < table1.shape[0]:
        table1 = table1.sample(sample_size).reset_index(drop=True)
    if sample_size < table2.shape[0]:
        table2 = table2.sample(sample_size).reset_index(drop=True)

    # for numeric features
    # only check features in both tables
    numeric_features = [
        feat for feat in numeric_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(delayed(_compare_numeric)(
            col, table1[[col]], table2[[col]], img_dir)
                                                   for col in numeric_features)

        for numeric_result in numeric_results:
            if 'corr' in numeric_result.keys():
                corr_results.append(numeric_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='numeric')
        _insert_numeric_results(numeric_results, ws, 40, img_dir)

    # for string features
    # only check features in both tables
    string_features = [
        feat for feat in string_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_string)(col, table1[[col]], table2[[col]])
            for col in string_features)

        for string_result in string_results:
            if 'corr' in string_result.keys():
                corr_results.append(string_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='string')
        _insert_compare_string_results(string_results, ws, 40)

    # for date features
    # only check features in both tables
    date_features = [
        feat for feat in date_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            # elapsed months from today (legacy 'timedelta64[M]' cast)
            table1['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table1[col], errors='coerce')).astype(
                    'timedelta64[M]', errors='ignore')
            table2['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table2[col], errors='coerce')).astype(
                    'timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_date)('%s_numeric' % (col), table1[
                ['%s_numeric' %
                 (col), col]], table2[['%s_numeric' % (col), col]], img_dir)
            for col in date_features)

        for date_result in date_results:
            if 'corr' in date_result.keys():
                corr_results.append(date_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='date')
        _insert_numeric_results(date_results, ws, 40, img_dir, date_flag=True)

    # insert the summary
    ws = wb['Sheet']
    ws.title = 'summary'
    summary_df = schema_correct[['column_1', 'type_1']].rename(columns={
        'column_1': 'column',
        'type_1': 'type'
    })
    corr_df = pd.DataFrame(corr_results)
    if corr_df.empty:
        # no feature produced a correlation; avoid a KeyError on the merge
        corr_df = pd.DataFrame(columns=['column', 'corr'])
    summary_df = summary_df.merge(corr_df, on='column', how='left')
    summary_df['corr'] = summary_df['corr'].fillna('error')
    summary_df['error_flg'] = summary_df['corr'].apply(lambda x: 1
                                                       if x == 'error' else 0)
    error_rows = summary_df[summary_df['error_flg'] == 1].index.values

    _ = _insert_df(summary_df[['column', 'type', 'corr']], ws, header=True)

    for r_idx in error_rows:
        ws['C%d' % (r_idx + 2)].style = 'Bad'

    _adjust_ws(ws=ws, row_height=25)

    # if there are some errors
    if len(schema_error) > 0:
        ws = wb.create_sheet(title='error')
        _ = _insert_df(schema_error, ws, header=True)
        _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root, 'data_compare_%s.xlsx' %
                                  (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
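
A hypothetical call; the feature_colname*/dtype_colname* parameters only matter when the two schemas use different header names:

# hypothetical toy tables with partially overlapping keys
t1 = pd.DataFrame({'id': range(300), 'price': np.random.rand(300)})
t2 = pd.DataFrame({'id': range(150, 450), 'price': np.random.rand(300)})
schema = pd.DataFrame({'column': ['id', 'price'],
                       'type': ['key', 'numeric']})

# expected to write data_compare_demo.xlsx with 'summary', 'key' and 'numeric' sheets
data_compare(t1, t2, schema, schema, fname='demo')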
Code example #4
def data_summary(table_schema,
                 table,
                 fname,
                 sample_size=1.0,
                 sample_rows=100,
                 output_root='',
                 keep_images=False,
                 n_jobs=1):
    """
    Summarize basic information of all columns in a data table
    based on the provided data schema

    Parameters
    ----------
    table_schema: pandas DataFrame
        schema of the table, should contain data types of each column
    table: pandas DataFrame
        the data table
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of rows to sample for the summary (useful for large tables)
        float: fraction of rows to sample
    sample_rows: integer, default=100
        number of raw rows to show in the data sample sheet
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
        if sample_size > table.shape[0]:
            print("sample_size: %d is larger than the data size: %d" % (sample_size, table.shape[0]))

    # check output_root
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: directory does not exist')

    # grab raw sample rows before any down-sampling
    data_sample = table.sample(
        np.min([sample_rows, table.shape[0]])).reset_index(drop=True)

    # calculate the sample size
    if sample_size <= 1.0:
        sample_size = int(table.shape[0] * sample_size)
    if sample_size < table.shape[0]:
        table = table.sample(sample_size).reset_index(drop=True)

    exclude_features, check_features = _check_features(table_schema)

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(delayed(_check_string)(col, table[[col]]) for col in key_features)
        ws = wb.create_sheet(title=u'key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir) for col in numeric_features)
        ws = wb.create_sheet(title=u'numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(delayed(_check_string)(col, table[[col]]) for col in string_features)
        ws = wb.create_sheet(title=u'string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            # elapsed months from today (legacy 'timedelta64[M]' cast)
            table['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table[col], errors='coerce')).astype(
                    'timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' % (col),
                                 table[['%s_numeric' % (col), col]], img_dir)
            for col in date_features)

        ws = wb.create_sheet(title=u'date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema
    ws = wb['Sheet']
    ws.title = 'schema'
    out_schema = table_schema[['column', 'type']].copy()
    out_schema['check'] = 'Ok'

    # output error features
    error_indices = []
    if len(exclude_features) > 0:
        out_schema['check'] = out_schema['column'].apply(
            lambda x: 'exclude' if x in exclude_features else 'Ok')
        error_indices += list(
            out_schema[out_schema['column'].isin(exclude_features)].index.values)

    _ = _insert_df(out_schema, ws, header=True)
    if len(error_indices) > 0:
        for idx in error_indices:
            ws['C%d' % (idx + 2)].style = 'Bad'

    _adjust_ws(ws=ws, row_height=25)


    # write data samples
    ws = wb.create_sheet(title=u'sample')
    _ = _insert_df(data_sample, ws, header=True, head_color=False)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' % (fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)
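
Unlike example #1, this variant also emits a 'sample' sheet of raw rows; a hypothetical call:

# hypothetical toy inputs, for illustration only
df = pd.DataFrame({'id': range(1000), 'amount': np.random.rand(1000)})
schema = pd.DataFrame({'column': ['id', 'amount'],
                       'type': ['key', 'numeric']})

# expected to write data_summary_demo.xlsx, including a 'sample' sheet of 50 raw rows
data_summary(schema, df, fname='demo', sample_rows=50)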