import os
from os import path
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from fuzzywuzzy import fuzz
from pandas.api.types import is_numeric_dtype

from CommonScripts.LogExceptions import logException


def checkTableWithColumnNames():
    """
    .. module:: compare two tables with column names
        :platform: Hadoop
        :synopsis: Using the Levenshtein distance algorithm, the function measures the similarity between two strings of column names

    """
    try:
        # read the input files, collected from the existing Hadoop environment, which contain the column names for each table
        projPath = Path(__file__).parent
        yamlPath = (projPath / "../CommonScripts/config.yaml").resolve()
        if path.exists(yamlPath):
            with open(yamlPath, 'r') as f:
                config = yaml.safe_load(f)
                # input table location
                pathdir = config['tablefilepath']
                # output location; a file listing the table names that match on column names is written here
                pathmatching = config['tablefilepathcoulmnmatch']
                # sample test data, validated against the collected tables
                sampleData = config['inputtablecolumns']
    except Exception as e:
        logException(
            "Exception occured while reading YAML file in CheckTablesWithColumnNames.py"
            + " " + str(e))


    # iterate over every file in the stored location; each file holds the column names for one table
    for filename in os.listdir(pathdir):
        with open(os.path.join(pathdir, filename)) as f:
            # creating an array for all the column names present in the file for the given table
            try:
                lines = f.readlines()
                # join the lines into one comma-separated string for fuzzy matching
                arrElements = ",".join(line.strip() for line in lines)

                # measure the approximate match against the historical data using the
                # Levenshtein distance algorithm as implemented in the fuzzywuzzy package;
                # the closer the match, the higher the ratio (0-100)
                ratio = fuzz.token_set_ratio(arrElements, sampleData)
                # consider only the tables where the ratio exceeds 75
                if ratio > 75:
                    print(sampleData, arrElements, ratio)
                    print(filename)
                    # append the matching table name to the preconfigured path;
                    # append mode so earlier matches are not overwritten
                    with open(pathmatching + "colmatchingtables.txt", "a") as filetowrite:
                        filetowrite.write(filename + "\n")
            except Exception as e:
                logException(
                    "Exception while comparing with column names in CheckTablesWithColumnNames.py"
                    + " " + str(e))


def rmseofarrays(prediction, target):
    """
    .. module:: determine root mean square error between two numerical arrays
        :platform: Hadoop
        :synopsis: RMSE measures how spread out the data points are from the regression line
        :prediction: first argument. 2D array of the existing table's mean and std
        :target: second argument. 2D array of the input table's mean and std
        :returns: the mean(RMSE) of the two inputs

    """
    try:
        return np.sqrt(((prediction - target)**2).mean())

    except Exception as e:
        logException(
            "Exception raised while calculating RMSE in RMSEofArrays.py" +
            " " + str(e))


def inputtablevalues():
    """
    .. module:: extract information from input table
        :platform: Hadoop
        :synopsis: This function will create a 2D array, which will have mean and standard deviation for each column from the input table

    """
    try:
        projPath = Path(__file__).parent
        yamlPath = (projPath / "../CommonScripts/config.yaml").resolve()
        if path.exists(yamlPath):
            with open(yamlPath, 'r') as f:
                config = yaml.safe_load(f)
                # Read the input file from the directory path. File should be in csv format
                pathdir = config['inputtablelocation']
                columnnames = config['inputtablecolumns']
    except Exception as e:
        logException(
            "Exception while reading YAML file in ExtractInputsFromToBeTestFile.py"
            + " " + str(e))

    inputvaluesarray = []
    # create the dataframe that will be used for feature extraction
    inputdata = pd.read_csv(pathdir)
    # consider only the columns listed in the config file for the test and
    # calculate the mean and standard deviation for each of them
    columnarray = [columnnames]
    try:
        for m in list(inputdata):
            for n in columnarray:
                if fuzz.token_set_ratio(m, n) > 50:
                    # the column name matches; record [mean, std] for this column
                    inputvaluesarraytemp = []
                    inputvaluesarraytemp.append(inputdata[m].mean())
                    inputvaluesarraytemp.append(inputdata[m].std())
                    inputvaluesarray.append(inputvaluesarraytemp)
        return inputvaluesarray

    except Exception as e:
        logException(
            "Exception while extracting mean and std in ExtractInputsFromToBeTestFile.py"
            + " " + str(e))


def outliersiqr(dset):
    """
    .. module:: remove outliers
        :platform: Hadoop
        :synopsis: Using 5th and 95th percentile fences, this function removes the records below the 5th percentile and above the 95th percentile of each numeric column

    """
    low = .05
    high = .95
    # per-column 5th and 95th percentile fences
    quantiles = dset.quantile([low, high])
    try:
        for name in list(dset.columns):
            if is_numeric_dtype(dset[name]):
                # for numeric columns, keep only the rows that fall inside the 5%-95% distribution
                dset = dset[(dset[name] > quantiles.loc[low, name])
                            & (dset[name] < quantiles.loc[high, name])]
        return dset
    except Exception as e:
        logException("Exception while removing outliers" + " " + str(e))
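
# A small demonstration of the quantile trim (hypothetical data; both tails are
# cut, so the minimum and the extreme value are dropped):
if __name__ == "__main__":
    demo = pd.DataFrame({"val": [1, 2, 3, 4, 5, 6, 7, 8, 9, 1000]})
    print(outliersiqr(demo))  # keeps 2..9; the 1 and the 1000 fall outside the fences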


from os import path
from pathlib import Path

import yaml
from pyspark.sql import HiveContext

from CommonScripts.LogExceptions import logException

# Creating an sql context for reading data from Hive; `sc` is the SparkContext
# provided by the pyspark shell / spark-submit driver
sqlContext = HiveContext(sc)
# Read the config file to get the path where table names are stored along with the fields
try:
    projPath = Path(__file__).parent
    yamlPath = (projPath / "../CommonScripts/config.yaml").resolve()
    if path.exists(yamlPath):
        with open(yamlPath, 'r') as f:
            configfile = yaml.safe_load(f)
            filepath = configfile['tablefilepathcoulmnmatch']
            filenameContent = configfile['filenamecontent']
            filecolumnstomatch = configfile['inputtablecolumns']
except Exception as e:
    logException("Error in reading YAML file in TableDataAnalysisForDuplicacy.py"+" "+str(e))

# Create a 2D array for the features from the input dataframe, which has to be tested
input_features_arr = inputtablevalues()
# Create the array of input columns from the file to be tested
columnarray = [filecolumnstomatch]

try:
    # Read each table name from the file created after column matching
    with open(filepath + "colmatchingtables.txt") as readfile:
        for line in readfile:
            tableName = line.replace(filenameContent, '').strip()
            # Create the sql dataframe from the Hive table
            data = sqlContext.table(tableName)
except Exception as e:
    logException(
        "Exception while reading matched tables in TableDataAnalysisForDuplicacy.py"
        + " " + str(e))
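
# The loop above stops at building the dataframe. A sketch of a plausible next
# step inside that loop: extract [mean, std] for each (numeric) column and
# compare against input_features_arr with rmseofarrays. The describe()-based
# extraction and the 0.5 threshold are assumptions, not part of the script:
#
#     feature_arr = []
#     for colname in data.columns:
#         stats = data.select(colname).describe().collect()
#         # describe() rows: count, mean, stddev, min, max (values are strings)
#         feature_arr.append([float(stats[1][1]), float(stats[2][1])])
#     score = rmseofarrays(np.array(feature_arr), np.array(input_features_arr))
#     if score < 0.5:
#         print(tableName, "looks like a duplicate of the input table")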
import yaml
from CommonScripts.LogExceptions import logException
import os
from pathlib import Path
from os import path

# this unit test exercises the common error-logging functionality
try:
    projPath = Path(__file__).parent
    yamlPath = (projPath / "../CommonScripts/config.yaml").resolve()
    if path.exists(yamlPath):
        with open(yamlPath, 'r') as f:
            config = yaml.safe_load(f)
            # input table location
            pathdir = config['tablefilepath']
    else:
        print("path does not exist:" + yamlPath)
except Exception as e:
    logException("From Exception test script" + str(e))