def multiThreadGetAllNumericalRangeDiffScore(self, threadNum, numericalColumnSmallRange, primaryKeysSet):
    """Compute range-difference scores for all numerical field pairs with a thread pool.

    Fields whose (90th - 20th) percentile span is at most numericalColumnSmallRange
    are dropped from self.allNumericalfieldRangeMap before scoring.  When
    self.outRangeFileGenerateFlag is set, the scores are also written (sorted)
    to Intermediate_files[1] under a TSV header.
    """
    fileHelper = commonReadFile()
    if self.outRangeFileGenerateFlag:
        # reset the intermediate score file and emit its TSV header row
        fileHelper.clearFileContent(Intermediate_files[1])
        header = blist()
        for title in ('table.field A', 'table.field B', 'range difference score'):
            header.append(title)
        fileHelper.writeListRowToFileTsv(Intermediate_files[1], header)
    # prune fields with a small percentile range; iterate over a copy so the
    # deletions do not disturb the iteration
    for tbField, percentiles in self.allNumericalfieldRangeMap.copy().items():
        # percentiles layout: [tb.field, 20%, 30%, 80%, 90%] -> [4]-[1] is the 90%-20% span
        if int(percentiles[4]) - int(percentiles[1]) <= numericalColumnSmallRange:
            del self.allNumericalfieldRangeMap[tbField]
    print('len primaryKeysSet, allNumericalfieldRangeMap ', len(primaryKeysSet),
          len(self.allNumericalfieldRangeMap))
    workers = ThreadPool(threadNum)
    workers.map(self.getAllNumericalRangeDiffScoreMap, primaryKeysSet)
    workers.close()
    workers.join()
    if self.outRangeFileGenerateFlag:
        # write the accumulated score triples, sorted, to the intermediate file
        commonReadFile().sortAndWritetoFile(
            self.allNumericalRangeDifferenceScoreTripleLst, Intermediate_files[1])
def multiThreadsGetAllNumericalBucketdotProductsScore(
        self, threadNum, allNumericalPairsRangeDifferenceScoreMap, finalNumericalOutputFile):
    """Score surviving numerical pairs by bucket dot product, in parallel.

    Pairs whose range-difference score exceeds self.rangeDiffThd are removed
    from the map first; the remaining pairs are scored by the thread pool and
    the resulting triples are written, sorted, to finalNumericalOutputFile.
    """
    fileHelper = commonReadFile()
    fileHelper.clearFileContent(finalNumericalOutputFile)
    header = blist()
    for title in ('table.field A', 'table.field B', 'bucket dot product score'):
        header.append(title)
    fileHelper.writeListRowToFileTsv(finalNumericalOutputFile, header)
    # keep only the pairs whose range-difference score is within the threshold;
    # iterate over a copy so deletion is safe
    for pairKey, rdScore in allNumericalPairsRangeDifferenceScoreMap.copy().items():
        if rdScore > float(self.rangeDiffThd):
            del allNumericalPairsRangeDifferenceScoreMap[pairKey]
    print('allNumericalPairsRangeDifferenceScoreMap below threshold len : ',
          len(allNumericalPairsRangeDifferenceScoreMap))
    workers = ThreadPool(threadNum)
    workers.map(self.getAllNumericalBucketdotProductsScoreMap,
                allNumericalPairsRangeDifferenceScoreMap)
    workers.close()
    workers.join()
    fileHelper.sortAndWritetoFile(self.allNumericalBucketDPScoreTripleLst,
                                  finalNumericalOutputFile)
    print('allNumericalBucketDPScoreTripleLst len : ',
          len(self.allNumericalBucketDPScoreTripleLst))
def multithreadgetAllNonNumericalCosinSimi(self, sampleFlag, pairsAllTop,
                                           outFileNonNumericalRatioScoreAll):
    """Run cosine-similarity matching over all given pairs with a thread pool.

    sampleFlag chooses the sampled mapper versus the scaling-method mapper.
    Results accumulate in self.lstTopPairsTobeAllMatched and are written,
    sorted, to outFileNonNumericalRatioScoreAll.
    """
    self.recordPartialResultTimeStart = time.time()  # start the partial-result timer
    fileHelper = commonReadFile()  # clear only matching ratio output file
    fileHelper.clearFileContent(outFileNonNumericalRatioScoreAll)
    self.lstTopPairsTobeAllMatched = blist()  # reset the accumulator up front
    workers = ThreadPool(self.threadNum)
    # pick the per-pair worker based on whether sampling is requested
    mapper = (self.getNonumericalCosSimiRecordWiseSampleMap
              if sampleFlag else self.getNonumericalCosSimiRecordWiseScalingMethodMap)
    workers.map(mapper, pairsAllTop)
    workers.close()
    workers.join()
    # write all matching ratio results, sorted
    fileHelper.sortAndWritetoFile(self.lstTopPairsTobeAllMatched,
                                  outFileNonNumericalRatioScoreAll)
def readSamplesResultTopKMatchingRatio(self, outFileNonNumericalRatioScoreSample,
                                       outFileNonNumericalRatioScoreAll):
    """Load the sampled top-K pair list from file and rerun full matching on it.

    Reads two-column TSV pairs from outFileNonNumericalRatioScoreSample and
    feeds them to multithreadgetAllNonNumericalCosinSimi with sampling off.
    """
    samplePairs = commonReadFile().readTwoColumnTsvFileToList(
        outFileNonNumericalRatioScoreSample)
    self.multithreadgetAllNonNumericalCosinSimi(False, samplePairs,
                                                outFileNonNumericalRatioScoreAll)
def combineFinalResultFromMultipleMachine(self, inputDirPath, outFile):
    """Merge per-machine result files into one sorted output file.

    Reads the last 50 lines (three-column TSV triples) of every TSV file under
    inputDirPath, concatenates them, writes a header row to outFile, then
    appends the sorted triples.
    """
    readLastNNum = 50  # only the tail of each machine's file holds final results
    comRdFileObj = commonReadFile()
    comRdFileObj.clearFileContent(outFile)  # clear file
    lstAllTriples = blist()
    for fileName in commonReadFile.yieldEveryFileIterativeInDirectoryTsv(inputDirPath):
        print('fileNamexxxxxx ', fileName)
        lstAllTriples = lstAllTriples + comRdFileObj.readLastNLineFileTsvThreeColumnToLst(
            readLastNNum, fileName)
    strVar = 'TABLE.field A' + '\t' + 'TABLE.fieldB' + '\t' + 'matchingRatio' + '\t' + '\n'
    # FIX: use a context manager so the handle is closed even if the write raises
    # (original used bare open()/close(), leaking the handle on error)
    with open(outFile, 'w') as fd:
        comRdFileObj.writeStrRowToFileAppendWriter(fd, strVar)
    comRdFileObj.sortAndWritetoFile(lstAllTriples, outFile)
def partitionHashFile(self, machineNo, machineNums, topNonNumericalTobePairFile,
                      allNonNumericalColumnTbFieldValueFile):
    """Return this machine's share of the pair list plus the full value map.

    Pairs are distributed round-robin: pair index i belongs to machine
    i % machineNums.  Also loads the tb.field -> values map used for matching.
    """
    fileHelper = commonReadFile()
    allPairs = fileHelper.readTwoColumnTsvFileToList(topNonNumericalTobePairFile)
    print('len(pairsTupleTobeMatched ', len(allPairs))
    # hash partition: keep every pair whose index maps to this machine number
    myPairs = blist()
    for idx, pairTuple in enumerate(allPairs):
        if idx % machineNums == machineNo:
            myPairs.append(pairTuple)
    valuesMap = fileHelper.readFileTbFieldValueIntoMapTsv(
        allNonNumericalColumnTbFieldValueFile)
    return myPairs, valuesMap
def numericalFieldValuesWriteFile(self, gtNumericalFieldsPairLst,
                                  tbFieldAllNumericalValuesMap, outFile):
    """Append the numerical values of every field in the given pairs to outFile.

    For each (fieldA, fieldB) pair, each field that exists in
    tbFieldAllNumericalValuesMap gets one TSV row: the field name followed by
    its values.  Values are cached locally so repeated fields reuse one lookup.
    """
    cachedVals = {}
    fileHelper = commonReadFile()
    with open(outFile, "a") as fd:
        for pair in gtNumericalFieldsPairLst:
            # write fieldA's row first, then fieldB's, matching pair order
            for tblfd in (pair[0], pair[1]):
                if tblfd not in tbFieldAllNumericalValuesMap:
                    continue
                if tblfd not in cachedVals:
                    cachedVals[tblfd] = tbFieldAllNumericalValuesMap[tblfd]
                fileHelper.writeListRowToFileWriterTsv(fd, [tblfd] + cachedVals[tblfd])
def getPercentilesAllNumerical(self, allNumericalValuesMap, outRangeFileFlag):
    """Record 20/30/80/90 percentile strings for every numerical field.

    Populates self.allNumericalfieldRangeMap[tbField] with
    [tbField, p20, p30, p80, p90] (percentiles stringified); when
    outRangeFileFlag is set, the rows are also written to Intermediate_files[0]
    under a TSV header.  Fields with no values or all-'Nan' stats are skipped.
    """
    fileHelper = commonReadFile()
    if outRangeFileFlag:
        fileHelper.clearFileContent(Intermediate_files[0])  # clear file
        header = blist()
        for title in ('table.field', '20% percentile', '30% percentile',
                      '80% percentile', '90% percentile'):
            header.append(title)
        fileHelper.writeListRowToFileTsv(Intermediate_files[0], header)
    print('len allNumericalValuesMap ', len(allNumericalValuesMap))
    for tbField, valsSet in allNumericalValuesMap.items():
        values = valsSet[1:]  # entry 0 is the tb.field name itself, not a value
        if len(values) == 0:
            continue
        # stats layout: [min, max, p10, p20, p30, p40, p50, p60, p70, p80, p90]
        stats = self.getRangePercentiles(values, 10, 20, 30, 40, 50, 60, 70, 80, 90)
        if list(stats) == ['Nan'] * 11:
            continue  # no usable numeric data for this field
        percentLst = blist()
        percentLst.append(tbField)
        # keep 20%, 30%, 80%, 90% (indices 3, 4, 9, 10 of the stats list)
        for perc in (stats[3], stats[4], stats[9], stats[10]):
            percentLst.append(str(perc))
        self.allNumericalfieldRangeMap[tbField] = percentLst
        if outRangeFileFlag:
            fileHelper.writeListRowToFileTsv(Intermediate_files[0], percentLst)
def multithreadgetNonNumericalCosinSimi(self, pairsTupleTobeMatched,
                                        outFileNonNumericalRatioScoreAll):
    """Run the sampled cosine-similarity pass over the given pairs.

    Clears the sample output file (Intermediate_DirFiles[1]), maps
    getSamplesNonumericalCosSimiRecordWiseMap over pairsTupleTobeMatched with
    a thread pool, then writes the accumulated ratio triples, sorted, to the
    sample output file.  outFileNonNumericalRatioScoreAll is currently unused
    here except by the commented-out follow-up calls below.
    """
    # print ('prrrrrr ', pr)
    #profile begin
    # pr = cProfile.Profile()
    # pr.enable()
    outFileNonNumericalRatioScoreSample = Intermediate_DirFiles[1]
    print('multithreadgetNonNumericalCosinSimi to be paired len ',
          len(pairsTupleTobeMatched))
    comRdFileObj = commonReadFile()  # clear only matching ratio output file
    comRdFileObj.clearFileContent(outFileNonNumericalRatioScoreSample)  #clear file
    self.lstTopPairsTobeAllMatched = blist()  #clear at the beginning
    pool = ThreadPool(self.threadNum)
    pool.map(self.getSamplesNonumericalCosSimiRecordWiseMap, pairsTupleTobeMatched)
    pool.close()
    pool.join()
    #write samples matching ratio result
    print('multithreadgetNonNumericalCosinSimi pairs result len ',
          len(self.lstTopPairsTobeAllMatched), outFileNonNumericalRatioScoreSample)
    comRdFileObj.sortAndWritetoFile(self.lstTopPairsTobeAllMatched,
                                    outFileNonNumericalRatioScoreSample)
    #running all sample result, two ways. one way is to read file, the other way is to read from self.lstTopPairsTobeAllMatched
    #self.readSamplesResultTopKMatchingRatio(outFileNonNumericalRatioScoreSample, outFileNonNumericalRatioScoreAll)
    # the second way to do all mathcing
    # self.multithreadgetAllNonNumericalCosinSimi(False, self.lstTopPairsTobeAllMatched, outFileNonNumericalRatioScoreAll)
    # NOTE(review): stray triple-quote below appears to open a commented-out
    # region; verify it is matched by a closing ''' later in the file.
    '''
def getAllTablesDividedPrimary(self, inputDataDir, nonNumericalColumnSmallRange,
                               InterMediateFileFlag):
    """Scan every table, classify columns as numerical/non-numerical, detect primary keys.

    For each table under inputDataDir: writes a per-table desc_accurate.txt
    with (col_name, data_type, comment) rows; records primary keys (unique,
    non-empty columns) in self.primaryKeysSet; stores numerical column values
    in self.tbFieldAllNumericalValuesMap and non-numerical ones in
    self.tbFieldAllNonNumericalValuesMap.  Date-formatted, time-formatted,
    and small-range columns are skipped.  When InterMediateFileFlag is set,
    values are also dumped to Intermediate_files[0] / Intermediate_files[1].
    """
    preproc = preprocess()
    comRdFileObj = commonReadFile()
    if (InterMediateFileFlag):  #have intermediate file or not
        comRdFileObj.clearFileContent(Intermediate_files[0])
        comRdFileObj.clearFileContent(Intermediate_files[1])
    self.getTableNames(inputDataDir)
    for tbName in self.tablesNameList:
        print('database table name: ', tbName)
        dataList = self.readOneTable(inputDataDir, tbName)
        # column data type and primary key file for this table
        typeFileOut = inputDataDir + tbName.upper() + '/desc_accurate.txt'
        strWrtRow = 'col_name' + '\t' + 'data_type' + '\t' + 'comment' + '\t' + '\n'
        comRdFileObj.writeStrRowToFileAppend(typeFileOut, strWrtRow)
        #[row, fieldOutList] = self.getFields(tbName)
        # get time-format type table.fields
        tbFieldNonNumericalTimeFormatType = self.getNonNumericalFieldsTimeFormatType(
            inputDataDir, tbName)
        for data in dataList:
            # remove date format column, not matching requirement
            splitNameSet = set(data.fieldA.split('_'))
            if (dateFormatNotationSet & splitNameSet):
                continue
            newcolNumericalList = blist([])
            newcolNonNumericalList = blist([])
            setVals = set(data.value.unique().flatten())
            # set of unique, non-null, lowercased values
            colValSet = preproc.filterNullValueAndLowercaseSet(setVals)
            # remove time format and small range columns
            if data.fieldA in tbFieldNonNumericalTimeFormatType or len(
                    setVals) <= nonNumericalColumnSmallRange:
                continue
            if preproc.judgeListAllNumbers(colValSet):
                # judge the primary key and write into files: a column is a
                # primary key when every row value is unique and non-empty
                if ((len(setVals) == len(data.value)) and len(setVals) != 0):
                    # write field type and primary-key marker into the per-table file
                    strWrtRow = data.fieldA + '\t' + 'numerical' + '\t' + 'primaryKey' + '\t' + '\n'
                    # store primary keys, generally for numerical only
                    self.primaryKeysSet.add(data.tableA + '.' + data.fieldA)
                else:
                    strWrtRow = data.fieldA + '\t' + 'numerical' + '\t' + 'otherKey' + '\t' + '\n'
                comRdFileObj.writeStrRowToFileAppend(typeFileOut,
                                                     strWrtRow)  #store numerical keys
                strHead = data.tableA + '.' + data.fieldA
                #newcolNumericalList.append(strHead)
                # keep only non-negative numeric values, truncated to int
                for val in colValSet:
                    if preproc.is_number(val):
                        fval = float(val)
                        if fval >= 0:  #need to consider positive for matching
                            newcolNumericalList.append(int(fval))
                self.tbFieldAllNumericalValuesMap[strHead] = newcolNumericalList
                if (InterMediateFileFlag):
                    comRdFileObj.writeListRowToFileTsv(
                        Intermediate_files[0],
                        [strHead] + newcolNumericalList)  #write to numerical file
            else:
                #elif preproc.judgeListAllNonNumerical(set(colValList)):
                # judge the primary key and write into files
                if ((len(setVals) == len(data.value)) and len(setVals) != 0):
                    # write field type and primary-key marker into the per-table file
                    strWrtRow = data.fieldA + '\t' + 'non-numerical' + '\t' + 'primaryKey' + '\t' + '\n'
                else:
                    strWrtRow = data.fieldA + '\t' + 'non-numerical' + '\t' + 'otherKey' + '\t' + '\n'
                comRdFileObj.writeStrRowToFileAppend(
                    typeFileOut, strWrtRow)  #store non-numerical keys
                strHead = data.tableA + '.' + data.fieldA
                #newcolNonNumericalList.append(strHead)
                newcolNonNumericalList += blist(colValSet)
                self.tbFieldAllNonNumericalValuesMap[strHead] = newcolNonNumericalList
                if (InterMediateFileFlag):
                    comRdFileObj.writeListRowToFileTsv(
                        Intermediate_files[1],
                        [strHead] + newcolNonNumericalList)  # write to non-numerical file
def getNonumericalCosSimiRecordWiseScalingMethod(
        self, pair, tbFieldAllNonNumericalValuesMap, prefixLength, partFetchNum,
        ratioPruning, recordPrSimiThreshold, finalNonNumericalOutputDir):
    """Score one column pair record-wise with the partitioned ("scaling") method.

    Splits both columns into partitions of at most partFetchNum values and
    compares them partition-by-partition via filterColumnsWithSample.  If any
    partition's above-threshold match ratio falls below ratioPruning, the pair
    is pruned (logged to pruneResults/prunePairs.tsv) and skipped.  Surviving
    pairs get a matching ratio 0.5*(laMatch/laTotal + lbMatch/lbTotal), are
    appended to self.lstTopPairsTobeAllMatched, and their matched records are
    written to a per-pair TSV.  Partial results are flushed every 30 pairs or
    once per day.
    """
    comRdFileObj = commonReadFile()
    if not os.path.exists(finalNonNumericalOutputDir + '/' + 'pruneResults'):
        os.makedirs(finalNonNumericalOutputDir + '/' + 'pruneResults')
    i = 0
    # get field names: pair is "tb.fieldA-tb.fieldB"
    prA = pair.strip().split('-')[0].lower()  #tb.field A
    prB = pair.strip().split('-')[1].lower()
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lstValA = tbFieldAllNonNumericalValuesMap[prA]  #get values
        lstValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  #write rows lists
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        countTrueComparePairs = 0
        # partition A and B columns into chunks of at most partFetchNum values
        lenA = len(lstValA)
        lenB = len(lstValB)
        lenAPartition = min(partFetchNum, lenA)
        numAPartition = ceil(lenA / lenAPartition)
        lenBPartition = min(partFetchNum, lenB)
        numBPartition = ceil(lenB / lenBPartition)
        Bexit = False  # set when the pair is pruned mid-way
        laAllLenWhole = 0  # total compared length of column A across partitions
        lbAlllenWhole = 0  # total compared length of column B across partitions
        for i in range(0, numAPartition):
            if Bexit:
                break
            if (i + 1) * partFetchNum <= lenA:
                lsValACur = lstValA[i * partFetchNum:(i + 1) * partFetchNum]
            else:
                lsValACur = lstValA[i * partFetchNum:lenA]
            for j in range(0, numBPartition):
                if (j + 1) * partFetchNum <= lenB:
                    lsValBCur = lstValB[j * partFetchNum:(j + 1) * partFetchNum]
                else:
                    lsValBCur = lstValB[j * partFetchNum:lenB]
                [countTrueComparePairsEvery, countAboveThreholdEvery, laAllLen,
                 lbAlllen] = self.filterColumnsWithSample(
                     False, lsValACur, lsValBCur, prefixLength, partFetchNum,
                     recordPrSimiThreshold, pairsNameLstA, pairsNameLstB, cosResLst)
                laAllLenWhole += laAllLen
                # FIX: accumulator was misspelled "lbAllLenWhole", an undefined
                # name, which raised UnboundLocalError on the first partition
                lbAlllenWhole += lbAlllen
                countTrueComparePairs += countTrueComparePairsEvery
                # prune the pair when this partition's above-threshold ratio is
                # below ratioPruning (relative to the mean partition size)
                if (countAboveThreholdEvery /
                        ((len(lsValACur) + len(lsValBCur)) / 2) < ratioPruning):
                    Bexit = True
                    filetmp = finalNonNumericalOutputDir + '/' + 'pruneResults' + '/' + 'prunePairs.tsv'
                    rowStr = prA + '\t' + prB + '\t' + '\n'
                    comRdFileObj.writeStrRowToFileAppend(filetmp, rowStr)
                    break
        # consider this column pair only if it was not pruned and at least one
        # record pair qualified (index 0 of each list is the header name)
        if (not Bexit) and (len(pairsNameLstA) > 1):
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # NOTE(review): [0]/[1] index single header strings, so set() counts
            # distinct characters — possibly intended set(pairsNameLstA) /
            # set(pairsNameLstB) to count distinct matched records; confirm.
            laMatchLen = len(set(pairsNameLstA[0])) - 1
            lbMatchLen = len(set(pairsNameLstB[1])) - 1
            # FIX: matching ratio now follows the documented form
            # 0.5*(la/l_allA + lb/l_allB) over the WHOLE columns; the original
            # mixed a partition-local length with a ratio of the two totals.
            matchingRatio = 0.5 * (laMatchLen / laAllLenWhole +
                                   lbMatchLen / lbAlllenWhole)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if (len(writeWholeLst) >= 3):
                # per-pair output file named TBA__fdA-TBB__fdB.tsv
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(
                    tbB).upper() + '__' + str(fdB)
                # FIX: context manager so the handle is closed even on error
                with open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                          'w') as fd:
                    comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                        fd, writeWholeLst)
                writeWholeLst = blist([])
        # flush partial results every 30 pairs or once per day (86400 s),
        # because full runs are slow
        self.recordPartialResultTimeEnd = time.time()
        if ((len(self.lstTopPairsTobeAllMatched) != 0
             and len(self.lstTopPairsTobeAllMatched) % 30 == 0)
                or ((self.recordPartialResultTimeEnd -
                     self.recordPartialResultTimeStart) >= 86400)):
            self.recordPartialResultTimeStart = time.time()
            if not os.path.exists(finalNonNumericalOutputDir + '/' + 'partResultOutput'):
                os.makedirs(finalNonNumericalOutputDir + '/' + 'partResultOutput')
            comRdFileObj = commonReadFile()
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                finalNonNumericalOutputDir + '/' + 'partResultOutput' + '/' +
                'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')
def getSamplesNonumericalCosSimiRecordWise(self, pair, samplesFlag,
                                           tbFieldAllNonNumericalValuesMap,
                                           prefixLength, sampleRecordsNum,
                                           recordPrSimiThreshold,
                                           finalNonNumericalOutputDir):
    """Score one column pair record-wise (sampled or full, per samplesFlag).

    Compares the two columns' values via filterColumnsWithSample; if any
    record pair qualifies, computes matchingRatio = 0.5*(la/l_allA + lb/l_allB),
    appends a fieldPairSim to self.lstTopPairsTobeAllMatched, and writes the
    matched records to a per-pair TSV (into Intermediate_DirFiles[0] when
    sampling).  Partial results are flushed every 200 accumulated pairs.
    """
    comRdFileObj = commonReadFile()
    i = 0
    # get field names: sampled input is a "tb.fieldA-tb.fieldB" string, the
    # full run receives an object with fieldA/fieldB attributes
    if (samplesFlag):
        prA = pair.strip().split('-')[0].lower()  #tb.field A
        prB = pair.strip().split('-')[1].lower()
    else:
        prA = pair.fieldA.strip()
        prB = pair.fieldB.strip()
    # proceed only when both fields have recorded values
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lsValA = tbFieldAllNonNumericalValuesMap[prA]  #get values
        lsValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  #write rows lists
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        [countTruePairs, countAboveThrehold, laAllLen,
         lbAlllen] = self.filterColumnsWithSample(
             samplesFlag, lsValA, lsValB, prefixLength, sampleRecordsNum,
             recordPrSimiThreshold, pairsNameLstA, pairsNameLstB, cosResLst)
        # if any record pair similarity is above threshold (index 0 of each
        # list is the header name), record the column pair
        if (len(pairsNameLstA) > 1):
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # matching ratio score calculation: 1/2*(la/l_allA + lb/l_allB)
            # NOTE(review): [0]/[1] index single header strings, so set()
            # counts distinct characters — possibly intended
            # set(pairsNameLstA)/set(pairsNameLstB); confirm against callers.
            laMatchLen = len(set(pairsNameLstA[0])) - 1
            lbMatchLen = len(set(pairsNameLstB[1])) - 1
            matchingRatio = 0.5 * (laMatchLen / laAllLen + lbMatchLen / lbAlllen)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if (len(writeWholeLst) >= 3):
                # per-pair output file named TBA__fdA-TBB__fdB.tsv
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(
                    tbB).upper() + '__' + str(fdB)
                if (samplesFlag):
                    # sample result output dir overrides the argument
                    finalNonNumericalOutputDir = Intermediate_DirFiles[0]
                fd = open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                          'w')
                comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                    fd, writeWholeLst)
                i = i + 1
                fd.close()
                writeWholeLst = blist([])
        # flush partial results every 200 accumulated pairs, because full
        # runs are slow
        if (len(self.lstTopPairsTobeAllMatched) != 0
                and len(self.lstTopPairsTobeAllMatched) % 200 == 0):
            if not os.path.exists(
                    'intermediateOutput/nonNumericalInterOutput/second'):
                os.makedirs(
                    'intermediateOutput/nonNumericalInterOutput/second')
            if not os.path.exists(
                    'intermediateOutput/nonNumericalInterOutput/second/partResultOutput'
            ):
                os.makedirs(
                    'intermediateOutput/nonNumericalInterOutput/second/partResultOutput'
                )
            comRdFileObj = commonReadFile()  # clear only matching ratio output file
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                Intermediate_DirFiles[2] + '/' + 'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')