def multiThreadGetAllNumericalRangeDiffScore(self, threadNum, numericalColumnSmallRange, primaryKeysSet):
    """Compute range-difference scores for all numerical field pairs with a thread pool.

    Fields whose (90th - 20th) percentile span is at most numericalColumnSmallRange
    are dropped from self.allNumericalfieldRangeMap before scoring.  When
    self.outRangeFileGenerateFlag is set, the scores are also written (sorted)
    to Intermediate_files[1] under a TSV header.
    """
    fileHelper = commonReadFile()
    if self.outRangeFileGenerateFlag:
        # reset the intermediate score file and emit its TSV header row
        fileHelper.clearFileContent(Intermediate_files[1])
        header = blist()
        for title in ('table.field A', 'table.field B', 'range difference score'):
            header.append(title)
        fileHelper.writeListRowToFileTsv(Intermediate_files[1], header)
    # prune fields with a small percentile range; iterate over a copy so the
    # deletions do not disturb the iteration
    for tbField, percentiles in self.allNumericalfieldRangeMap.copy().items():
        # percentiles layout: [tb.field, 20%, 30%, 80%, 90%] -> [4]-[1] is the 90%-20% span
        if int(percentiles[4]) - int(percentiles[1]) <= numericalColumnSmallRange:
            del self.allNumericalfieldRangeMap[tbField]
    print('len primaryKeysSet, allNumericalfieldRangeMap ', len(primaryKeysSet),
          len(self.allNumericalfieldRangeMap))
    workers = ThreadPool(threadNum)
    workers.map(self.getAllNumericalRangeDiffScoreMap, primaryKeysSet)
    workers.close()
    workers.join()
    if self.outRangeFileGenerateFlag:
        # write the accumulated score triples, sorted, to the intermediate file
        commonReadFile().sortAndWritetoFile(
            self.allNumericalRangeDifferenceScoreTripleLst, Intermediate_files[1])
def multiThreadsGetAllNumericalBucketdotProductsScore(
        self, threadNum, allNumericalPairsRangeDifferenceScoreMap, finalNumericalOutputFile):
    """Score surviving numerical pairs by bucket dot product, in parallel.

    Pairs whose range-difference score exceeds self.rangeDiffThd are removed
    from the map first; the remaining pairs are scored by the thread pool and
    the resulting triples are written, sorted, to finalNumericalOutputFile.
    """
    fileHelper = commonReadFile()
    fileHelper.clearFileContent(finalNumericalOutputFile)
    header = blist()
    for title in ('table.field A', 'table.field B', 'bucket dot product score'):
        header.append(title)
    fileHelper.writeListRowToFileTsv(finalNumericalOutputFile, header)
    # keep only the pairs whose range-difference score is within the threshold;
    # iterate over a copy so deletion is safe
    for pairKey, rdScore in allNumericalPairsRangeDifferenceScoreMap.copy().items():
        if rdScore > float(self.rangeDiffThd):
            del allNumericalPairsRangeDifferenceScoreMap[pairKey]
    print('allNumericalPairsRangeDifferenceScoreMap below threshold len : ',
          len(allNumericalPairsRangeDifferenceScoreMap))
    workers = ThreadPool(threadNum)
    workers.map(self.getAllNumericalBucketdotProductsScoreMap,
                allNumericalPairsRangeDifferenceScoreMap)
    workers.close()
    workers.join()
    fileHelper.sortAndWritetoFile(self.allNumericalBucketDPScoreTripleLst,
                                  finalNumericalOutputFile)
    print('allNumericalBucketDPScoreTripleLst len : ',
          len(self.allNumericalBucketDPScoreTripleLst))
def multithreadgetAllNonNumericalCosinSimi(self, sampleFlag, pairsAllTop,
                                           outFileNonNumericalRatioScoreAll):
    """Run cosine-similarity matching over all given pairs with a thread pool.

    sampleFlag chooses the sampled mapper versus the scaling-method mapper.
    Results accumulate in self.lstTopPairsTobeAllMatched and are written,
    sorted, to outFileNonNumericalRatioScoreAll.
    """
    self.recordPartialResultTimeStart = time.time()  # start the partial-result timer
    fileHelper = commonReadFile()  # clear only matching ratio output file
    fileHelper.clearFileContent(outFileNonNumericalRatioScoreAll)
    self.lstTopPairsTobeAllMatched = blist()  # reset the accumulator up front
    workers = ThreadPool(self.threadNum)
    # pick the per-pair worker based on whether sampling is requested
    mapper = (self.getNonumericalCosSimiRecordWiseSampleMap
              if sampleFlag else self.getNonumericalCosSimiRecordWiseScalingMethodMap)
    workers.map(mapper, pairsAllTop)
    workers.close()
    workers.join()
    # write all matching ratio results, sorted
    fileHelper.sortAndWritetoFile(self.lstTopPairsTobeAllMatched,
                                  outFileNonNumericalRatioScoreAll)
def readSamplesResultTopKMatchingRatio(self, outFileNonNumericalRatioScoreSample,
                                       outFileNonNumericalRatioScoreAll):
    """Load the sampled top-K pair list from file and rerun full matching on it.

    Reads two-column TSV pairs from outFileNonNumericalRatioScoreSample and
    feeds them to multithreadgetAllNonNumericalCosinSimi with sampling off.
    """
    samplePairs = commonReadFile().readTwoColumnTsvFileToList(
        outFileNonNumericalRatioScoreSample)
    self.multithreadgetAllNonNumericalCosinSimi(False, samplePairs,
                                                outFileNonNumericalRatioScoreAll)
def combineFinalResultFromMultipleMachine(self, inputDirPath, outFile):
    """Merge per-machine result files into one sorted output file.

    Reads the last 50 lines (three-column TSV triples) of every TSV file under
    inputDirPath, concatenates them, writes a header row to outFile, then
    appends the sorted triples.
    """
    readLastNNum = 50  # only the tail of each machine's file holds final results
    comRdFileObj = commonReadFile()
    comRdFileObj.clearFileContent(outFile)  # clear file
    lstAllTriples = blist()
    for fileName in commonReadFile.yieldEveryFileIterativeInDirectoryTsv(inputDirPath):
        print('fileNamexxxxxx ', fileName)
        lstAllTriples = lstAllTriples + comRdFileObj.readLastNLineFileTsvThreeColumnToLst(
            readLastNNum, fileName)
    strVar = 'TABLE.field A' + '\t' + 'TABLE.fieldB' + '\t' + 'matchingRatio' + '\t' + '\n'
    # FIX: use a context manager so the handle is closed even if the write raises
    # (original used bare open()/close(), leaking the handle on error)
    with open(outFile, 'w') as fd:
        comRdFileObj.writeStrRowToFileAppendWriter(fd, strVar)
    comRdFileObj.sortAndWritetoFile(lstAllTriples, outFile)
def partitionHashFile(self, machineNo, machineNums, topNonNumericalTobePairFile,
                      allNonNumericalColumnTbFieldValueFile):
    """Return this machine's share of the pair list plus the full value map.

    Pairs are distributed round-robin: pair index i belongs to machine
    i % machineNums.  Also loads the tb.field -> values map used for matching.
    """
    fileHelper = commonReadFile()
    allPairs = fileHelper.readTwoColumnTsvFileToList(topNonNumericalTobePairFile)
    print('len(pairsTupleTobeMatched ', len(allPairs))
    # hash partition: keep every pair whose index maps to this machine number
    myPairs = blist()
    for idx, pairTuple in enumerate(allPairs):
        if idx % machineNums == machineNo:
            myPairs.append(pairTuple)
    valuesMap = fileHelper.readFileTbFieldValueIntoMapTsv(
        allNonNumericalColumnTbFieldValueFile)
    return myPairs, valuesMap
def numericalFieldValuesWriteFile(self, gtNumericalFieldsPairLst,
                                  tbFieldAllNumericalValuesMap, outFile):
    """Append the numerical values of every field in the given pairs to outFile.

    For each (fieldA, fieldB) pair, each field that exists in
    tbFieldAllNumericalValuesMap gets one TSV row: the field name followed by
    its values.  Values are cached locally so repeated fields reuse one lookup.
    """
    cachedVals = {}
    fileHelper = commonReadFile()
    with open(outFile, "a") as fd:
        for pair in gtNumericalFieldsPairLst:
            # write fieldA's row first, then fieldB's, matching pair order
            for tblfd in (pair[0], pair[1]):
                if tblfd not in tbFieldAllNumericalValuesMap:
                    continue
                if tblfd not in cachedVals:
                    cachedVals[tblfd] = tbFieldAllNumericalValuesMap[tblfd]
                fileHelper.writeListRowToFileWriterTsv(fd, [tblfd] + cachedVals[tblfd])
def getPercentilesAllNumerical(self, allNumericalValuesMap, outRangeFileFlag):
    """Record 20/30/80/90 percentile strings for every numerical field.

    Populates self.allNumericalfieldRangeMap[tbField] with
    [tbField, p20, p30, p80, p90] (percentiles stringified); when
    outRangeFileFlag is set, the rows are also written to Intermediate_files[0]
    under a TSV header.  Fields with no values or all-'Nan' stats are skipped.
    """
    fileHelper = commonReadFile()
    if outRangeFileFlag:
        fileHelper.clearFileContent(Intermediate_files[0])  # clear file
        header = blist()
        for title in ('table.field', '20% percentile', '30% percentile',
                      '80% percentile', '90% percentile'):
            header.append(title)
        fileHelper.writeListRowToFileTsv(Intermediate_files[0], header)
    print('len allNumericalValuesMap ', len(allNumericalValuesMap))
    for tbField, valsSet in allNumericalValuesMap.items():
        values = valsSet[1:]  # entry 0 is the tb.field name itself, not a value
        if len(values) == 0:
            continue
        # stats layout: [min, max, p10, p20, p30, p40, p50, p60, p70, p80, p90]
        stats = self.getRangePercentiles(values, 10, 20, 30, 40, 50, 60, 70, 80, 90)
        if list(stats) == ['Nan'] * 11:
            continue  # no usable numeric data for this field
        percentLst = blist()
        percentLst.append(tbField)
        # keep 20%, 30%, 80%, 90% (indices 3, 4, 9, 10 of the stats list)
        for perc in (stats[3], stats[4], stats[9], stats[10]):
            percentLst.append(str(perc))
        self.allNumericalfieldRangeMap[tbField] = percentLst
        if outRangeFileFlag:
            fileHelper.writeListRowToFileTsv(Intermediate_files[0], percentLst)
def multithreadgetNonNumericalCosinSimi(self, pairsTupleTobeMatched,
                                        outFileNonNumericalRatioScoreAll):
    """Run the sampled cosine-similarity pass over the given pairs.

    Clears the sample output file (Intermediate_DirFiles[1]), maps
    getSamplesNonumericalCosSimiRecordWiseMap over pairsTupleTobeMatched with
    a thread pool, then writes the accumulated ratio triples, sorted, to the
    sample output file.  outFileNonNumericalRatioScoreAll is currently unused
    here except by the commented-out follow-up calls below.
    """
    # print ('prrrrrr ', pr)
    #profile begin
    # pr = cProfile.Profile()
    # pr.enable()
    outFileNonNumericalRatioScoreSample = Intermediate_DirFiles[1]
    print('multithreadgetNonNumericalCosinSimi to be paired len ',
          len(pairsTupleTobeMatched))
    comRdFileObj = commonReadFile()  # clear only matching ratio output file
    comRdFileObj.clearFileContent(outFileNonNumericalRatioScoreSample)  #clear file
    self.lstTopPairsTobeAllMatched = blist()  #clear at the beginning
    pool = ThreadPool(self.threadNum)
    pool.map(self.getSamplesNonumericalCosSimiRecordWiseMap, pairsTupleTobeMatched)
    pool.close()
    pool.join()
    #write samples matching ratio result
    print('multithreadgetNonNumericalCosinSimi pairs result len ',
          len(self.lstTopPairsTobeAllMatched), outFileNonNumericalRatioScoreSample)
    comRdFileObj.sortAndWritetoFile(self.lstTopPairsTobeAllMatched,
                                    outFileNonNumericalRatioScoreSample)
    #running all sample result, two ways. one way is to read file, the other way is to read from self.lstTopPairsTobeAllMatched
    #self.readSamplesResultTopKMatchingRatio(outFileNonNumericalRatioScoreSample, outFileNonNumericalRatioScoreAll)
    # the second way to do all mathcing
    # self.multithreadgetAllNonNumericalCosinSimi(False, self.lstTopPairsTobeAllMatched, outFileNonNumericalRatioScoreAll)
    # NOTE(review): stray triple-quote below appears to open a commented-out
    # region; verify it is matched by a closing ''' later in the file.
    '''
def getAllTablesDividedPrimary(self, inputDataDir, nonNumericalColumnSmallRange,
                               InterMediateFileFlag):
    """Scan every table, classify columns as numerical/non-numerical, detect primary keys.

    For each table under inputDataDir: writes a per-table desc_accurate.txt
    with (col_name, data_type, comment) rows; records primary keys (unique,
    non-empty columns) in self.primaryKeysSet; stores numerical column values
    in self.tbFieldAllNumericalValuesMap and non-numerical ones in
    self.tbFieldAllNonNumericalValuesMap.  Date-formatted, time-formatted,
    and small-range columns are skipped.  When InterMediateFileFlag is set,
    values are also dumped to Intermediate_files[0] / Intermediate_files[1].
    """
    preproc = preprocess()
    comRdFileObj = commonReadFile()
    if (InterMediateFileFlag):  #have intermediate file or not
        comRdFileObj.clearFileContent(Intermediate_files[0])
        comRdFileObj.clearFileContent(Intermediate_files[1])
    self.getTableNames(inputDataDir)
    for tbName in self.tablesNameList:
        print('database table name: ', tbName)
        dataList = self.readOneTable(inputDataDir, tbName)
        # column data type and primary key file for this table
        typeFileOut = inputDataDir + tbName.upper() + '/desc_accurate.txt'
        strWrtRow = 'col_name' + '\t' + 'data_type' + '\t' + 'comment' + '\t' + '\n'
        comRdFileObj.writeStrRowToFileAppend(typeFileOut, strWrtRow)
        #[row, fieldOutList] = self.getFields(tbName)
        # get time-format type table.fields
        tbFieldNonNumericalTimeFormatType = self.getNonNumericalFieldsTimeFormatType(
            inputDataDir, tbName)
        for data in dataList:
            # remove date format column, not matching requirement
            splitNameSet = set(data.fieldA.split('_'))
            if (dateFormatNotationSet & splitNameSet):
                continue
            newcolNumericalList = blist([])
            newcolNonNumericalList = blist([])
            setVals = set(data.value.unique().flatten())
            # set of unique, non-null, lowercased values
            colValSet = preproc.filterNullValueAndLowercaseSet(setVals)
            # remove time format and small range columns
            if data.fieldA in tbFieldNonNumericalTimeFormatType or len(
                    setVals) <= nonNumericalColumnSmallRange:
                continue
            if preproc.judgeListAllNumbers(colValSet):
                # judge the primary key and write into files: a column is a
                # primary key when every row value is unique and non-empty
                if ((len(setVals) == len(data.value)) and len(setVals) != 0):
                    # write field type and primary-key marker into the per-table file
                    strWrtRow = data.fieldA + '\t' + 'numerical' + '\t' + 'primaryKey' + '\t' + '\n'
                    # store primary keys, generally for numerical only
                    self.primaryKeysSet.add(data.tableA + '.' + data.fieldA)
                else:
                    strWrtRow = data.fieldA + '\t' + 'numerical' + '\t' + 'otherKey' + '\t' + '\n'
                comRdFileObj.writeStrRowToFileAppend(typeFileOut,
                                                     strWrtRow)  #store numerical keys
                strHead = data.tableA + '.' + data.fieldA
                #newcolNumericalList.append(strHead)
                # keep only non-negative numeric values, truncated to int
                for val in colValSet:
                    if preproc.is_number(val):
                        fval = float(val)
                        if fval >= 0:  #need to consider positive for matching
                            newcolNumericalList.append(int(fval))
                self.tbFieldAllNumericalValuesMap[strHead] = newcolNumericalList
                if (InterMediateFileFlag):
                    comRdFileObj.writeListRowToFileTsv(
                        Intermediate_files[0],
                        [strHead] + newcolNumericalList)  #write to numerical file
            else:
                #elif preproc.judgeListAllNonNumerical(set(colValList)):
                # judge the primary key and write into files
                if ((len(setVals) == len(data.value)) and len(setVals) != 0):
                    # write field type and primary-key marker into the per-table file
                    strWrtRow = data.fieldA + '\t' + 'non-numerical' + '\t' + 'primaryKey' + '\t' + '\n'
                else:
                    strWrtRow = data.fieldA + '\t' + 'non-numerical' + '\t' + 'otherKey' + '\t' + '\n'
                comRdFileObj.writeStrRowToFileAppend(
                    typeFileOut, strWrtRow)  #store non-numerical keys
                strHead = data.tableA + '.' + data.fieldA
                #newcolNonNumericalList.append(strHead)
                newcolNonNumericalList += blist(colValSet)
                self.tbFieldAllNonNumericalValuesMap[strHead] = newcolNonNumericalList
                if (InterMediateFileFlag):
                    comRdFileObj.writeListRowToFileTsv(
                        Intermediate_files[1],
                        [strHead] + newcolNonNumericalList)  # write to non-numerical file
def getNonumericalCosSimiRecordWiseScalingMethod(
        self, pair, tbFieldAllNonNumericalValuesMap, prefixLength, partFetchNum,
        ratioPruning, recordPrSimiThreshold, finalNonNumericalOutputDir):
    """Score one column pair record-wise with the partitioned ("scaling") method.

    Splits both columns into partitions of at most partFetchNum values and
    compares them partition-by-partition via filterColumnsWithSample.  If any
    partition's above-threshold match ratio falls below ratioPruning, the pair
    is pruned (logged to pruneResults/prunePairs.tsv) and skipped.  Surviving
    pairs get a matching ratio 0.5*(laMatch/laTotal + lbMatch/lbTotal), are
    appended to self.lstTopPairsTobeAllMatched, and their matched records are
    written to a per-pair TSV.  Partial results are flushed every 30 pairs or
    once per day.
    """
    comRdFileObj = commonReadFile()
    if not os.path.exists(finalNonNumericalOutputDir + '/' + 'pruneResults'):
        os.makedirs(finalNonNumericalOutputDir + '/' + 'pruneResults')
    i = 0
    # get field names: pair is "tb.fieldA-tb.fieldB"
    prA = pair.strip().split('-')[0].lower()  #tb.field A
    prB = pair.strip().split('-')[1].lower()
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lstValA = tbFieldAllNonNumericalValuesMap[prA]  #get values
        lstValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  #write rows lists
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        countTrueComparePairs = 0
        # partition A and B columns into chunks of at most partFetchNum values
        lenA = len(lstValA)
        lenB = len(lstValB)
        lenAPartition = min(partFetchNum, lenA)
        numAPartition = ceil(lenA / lenAPartition)
        lenBPartition = min(partFetchNum, lenB)
        numBPartition = ceil(lenB / lenBPartition)
        Bexit = False  # set when the pair is pruned mid-way
        laAllLenWhole = 0  # total compared length of column A across partitions
        lbAlllenWhole = 0  # total compared length of column B across partitions
        for i in range(0, numAPartition):
            if Bexit:
                break
            if (i + 1) * partFetchNum <= lenA:
                lsValACur = lstValA[i * partFetchNum:(i + 1) * partFetchNum]
            else:
                lsValACur = lstValA[i * partFetchNum:lenA]
            for j in range(0, numBPartition):
                if (j + 1) * partFetchNum <= lenB:
                    lsValBCur = lstValB[j * partFetchNum:(j + 1) * partFetchNum]
                else:
                    lsValBCur = lstValB[j * partFetchNum:lenB]
                [countTrueComparePairsEvery, countAboveThreholdEvery, laAllLen,
                 lbAlllen] = self.filterColumnsWithSample(
                     False, lsValACur, lsValBCur, prefixLength, partFetchNum,
                     recordPrSimiThreshold, pairsNameLstA, pairsNameLstB, cosResLst)
                laAllLenWhole += laAllLen
                # FIX: accumulator was misspelled "lbAllLenWhole", an undefined
                # name, which raised UnboundLocalError on the first partition
                lbAlllenWhole += lbAlllen
                countTrueComparePairs += countTrueComparePairsEvery
                # prune the pair when this partition's above-threshold ratio is
                # below ratioPruning (relative to the mean partition size)
                if (countAboveThreholdEvery /
                        ((len(lsValACur) + len(lsValBCur)) / 2) < ratioPruning):
                    Bexit = True
                    filetmp = finalNonNumericalOutputDir + '/' + 'pruneResults' + '/' + 'prunePairs.tsv'
                    rowStr = prA + '\t' + prB + '\t' + '\n'
                    comRdFileObj.writeStrRowToFileAppend(filetmp, rowStr)
                    break
        # consider this column pair only if it was not pruned and at least one
        # record pair qualified (index 0 of each list is the header name)
        if (not Bexit) and (len(pairsNameLstA) > 1):
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # NOTE(review): [0]/[1] index single header strings, so set() counts
            # distinct characters — possibly intended set(pairsNameLstA) /
            # set(pairsNameLstB) to count distinct matched records; confirm.
            laMatchLen = len(set(pairsNameLstA[0])) - 1
            lbMatchLen = len(set(pairsNameLstB[1])) - 1
            # FIX: matching ratio now follows the documented form
            # 0.5*(la/l_allA + lb/l_allB) over the WHOLE columns; the original
            # mixed a partition-local length with a ratio of the two totals.
            matchingRatio = 0.5 * (laMatchLen / laAllLenWhole +
                                   lbMatchLen / lbAlllenWhole)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if (len(writeWholeLst) >= 3):
                # per-pair output file named TBA__fdA-TBB__fdB.tsv
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(
                    tbB).upper() + '__' + str(fdB)
                # FIX: context manager so the handle is closed even on error
                with open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                          'w') as fd:
                    comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                        fd, writeWholeLst)
                writeWholeLst = blist([])
        # flush partial results every 30 pairs or once per day (86400 s),
        # because full runs are slow
        self.recordPartialResultTimeEnd = time.time()
        if ((len(self.lstTopPairsTobeAllMatched) != 0
             and len(self.lstTopPairsTobeAllMatched) % 30 == 0)
                or ((self.recordPartialResultTimeEnd -
                     self.recordPartialResultTimeStart) >= 86400)):
            self.recordPartialResultTimeStart = time.time()
            if not os.path.exists(finalNonNumericalOutputDir + '/' + 'partResultOutput'):
                os.makedirs(finalNonNumericalOutputDir + '/' + 'partResultOutput')
            comRdFileObj = commonReadFile()
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                finalNonNumericalOutputDir + '/' + 'partResultOutput' + '/' +
                'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')
def getSamplesNonumericalCosSimiRecordWise(self, pair, samplesFlag,
                                           tbFieldAllNonNumericalValuesMap,
                                           prefixLength, sampleRecordsNum,
                                           recordPrSimiThreshold,
                                           finalNonNumericalOutputDir):
    """Score one column pair record-wise (sampled or full, per samplesFlag).

    Compares the two columns' values via filterColumnsWithSample; if any
    record pair qualifies, computes matchingRatio = 0.5*(la/l_allA + lb/l_allB),
    appends a fieldPairSim to self.lstTopPairsTobeAllMatched, and writes the
    matched records to a per-pair TSV (into Intermediate_DirFiles[0] when
    sampling).  Partial results are flushed every 200 accumulated pairs.
    """
    comRdFileObj = commonReadFile()
    i = 0
    # get field names: sampled input is a "tb.fieldA-tb.fieldB" string, the
    # full run receives an object with fieldA/fieldB attributes
    if (samplesFlag):
        prA = pair.strip().split('-')[0].lower()  #tb.field A
        prB = pair.strip().split('-')[1].lower()
    else:
        prA = pair.fieldA.strip()
        prB = pair.fieldB.strip()
    # proceed only when both fields have recorded values
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lsValA = tbFieldAllNonNumericalValuesMap[prA]  #get values
        lsValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  #write rows lists
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        [countTruePairs, countAboveThrehold, laAllLen,
         lbAlllen] = self.filterColumnsWithSample(
             samplesFlag, lsValA, lsValB, prefixLength, sampleRecordsNum,
             recordPrSimiThreshold, pairsNameLstA, pairsNameLstB, cosResLst)
        # if any record pair similarity is above threshold (index 0 of each
        # list is the header name), record the column pair
        if (len(pairsNameLstA) > 1):
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # matching ratio score calculation: 1/2*(la/l_allA + lb/l_allB)
            # NOTE(review): [0]/[1] index single header strings, so set()
            # counts distinct characters — possibly intended
            # set(pairsNameLstA)/set(pairsNameLstB); confirm against callers.
            laMatchLen = len(set(pairsNameLstA[0])) - 1
            lbMatchLen = len(set(pairsNameLstB[1])) - 1
            matchingRatio = 0.5 * (laMatchLen / laAllLen + lbMatchLen / lbAlllen)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if (len(writeWholeLst) >= 3):
                # per-pair output file named TBA__fdA-TBB__fdB.tsv
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(
                    tbB).upper() + '__' + str(fdB)
                if (samplesFlag):
                    # sample result output dir overrides the argument
                    finalNonNumericalOutputDir = Intermediate_DirFiles[0]
                fd = open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                          'w')
                comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                    fd, writeWholeLst)
                i = i + 1
                fd.close()
                writeWholeLst = blist([])
        # flush partial results every 200 accumulated pairs, because full
        # runs are slow
        if (len(self.lstTopPairsTobeAllMatched) != 0
                and len(self.lstTopPairsTobeAllMatched) % 200 == 0):
            if not os.path.exists(
                    'intermediateOutput/nonNumericalInterOutput/second'):
                os.makedirs(
                    'intermediateOutput/nonNumericalInterOutput/second')
            if not os.path.exists(
                    'intermediateOutput/nonNumericalInterOutput/second/partResultOutput'
            ):
                os.makedirs(
                    'intermediateOutput/nonNumericalInterOutput/second/partResultOutput'
                )
            comRdFileObj = commonReadFile()  # clear only matching ratio output file
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                Intermediate_DirFiles[2] + '/' + 'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')