def getAllNumericalRangeDiffScore(self, onePrimaryKey, allNumericalfieldRangeMap):
    # Match the primary key against every other numerical field and compute a range-difference
    # score per pair; pairs already scored and pairs from the same table are skipped.
    if onePrimaryKey in allNumericalfieldRangeMap:  # primary key is numerical, so the numerical algorithm applies
        for tbFd in allNumericalfieldRangeMap:
            if (onePrimaryKey, tbFd) in self.allNumericalPairsRangeDifferenceScoreMap \
                    or (tbFd, onePrimaryKey) in self.allNumericalPairsRangeDifferenceScoreMap:
                continue
            tableNm1 = onePrimaryKey.split('.')[0]
            tableNm2 = tbFd.split('.')[0]
            if onePrimaryKey != tbFd and tableNm1 != tableNm2:  # skip self-pairs and same-table pairs
                percentileListA = allNumericalfieldRangeMap[onePrimaryKey][1:]
                percentileListB = allNumericalfieldRangeMap[tbFd][1:]
                rangeDiffScore = self.getRangeMetricNumericalUsed(
                    int(percentileListA[0]), int(percentileListB[0]),
                    int(percentileListA[1]), int(percentileListB[1]),
                    int(percentileListA[2]), int(percentileListB[2]),
                    int(percentileListA[3]), int(percentileListB[3]))
                rdScoreLst = blist()
                rdScoreLst.append(onePrimaryKey)
                rdScoreLst.append(tbFd)
                rdScoreLst.append(rangeDiffScore)
                self.allNumericalPairsRangeDifferenceScoreMap[(onePrimaryKey, tbFd)] = rangeDiffScore
                fdprsObj = fieldPairSim(onePrimaryKey, tbFd, rangeDiffScore)
                self.allNumericalRangeDifferenceScoreTripleLst.append(fdprsObj)
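# getRangeMetricNumericalUsed is defined elsewhere in this class. As a reference only, a minimal
# sketch is given below; it assumes the four values per field are ordered percentile summaries
# (e.g. min, 25th, 75th, max) and scores how closely the two summaries agree, normalized to [0, 1].
# The actual implementation may differ.
def _rangeDiffScoreSketch(self, a0, b0, a1, b1, a2, b2, a3, b3):
    pairsAB = [(a0, b0), (a1, b1), (a2, b2), (a3, b3)]
    span = max(a3, b3) - min(a0, b0) + 1            # overall span covered by both fields
    meanDiff = sum(abs(a - b) for a, b in pairsAB) / len(pairsAB)
    return 1.0 - meanDiff / span                    # 1.0 means identical percentile summaries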
def readLastNLineFileTsvThreeColumnToLst(self, lastNLine, inputFile):
    # Read the last N rows of a three-column TSV file (fieldA, fieldB, matching score)
    # into a list of fieldPairSim triples.
    indexLastN = -1 * lastNLine
    lstTriples = blist()
    f = open(inputFile)
    lineList = f.readlines()
    f.close()
    print("Total number of lines:", len(lineList))
    for row in lineList[indexLastN:-1]:  # note: the very last line is excluded by the slice
        prA = row.split('\t')[0].strip().lower()             # strip and lowercase
        prB = row.split('\t')[1].strip().lower()             # strip and lowercase
        matchingRatio = row.split('\t')[2].strip().lower()   # strip and lowercase
        fdprsObj = fieldPairSim(prA, prB, matchingRatio)
        lstTriples.append(fdprsObj)
    return lstTriples
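# Example usage (illustrative only): assuming a three-column TSV of "fieldA<TAB>fieldB<TAB>score"
# rows and that this class is instantiated as `matcher`, the 100 most recent triples can be read
# back with:
#
#     triples = matcher.readLastNLineFileTsvThreeColumnToLst(100, 'partRatioScoreAllResult.tsv')
#
# Each element is a fieldPairSim object built from (fieldA, fieldB, score); the attribute names
# exposed by fieldPairSim are not shown here, so iterate it according to that class's definition.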
def getAllNumericalBucketdotProductsScore(self, rangeDiffThd, inputBucketSizeNum,
                                          oneRangeDiffResPair, allNumericalValuesMap):
    # Compute the normalized bucket dot-product score for one field pair that passed the
    # range-difference step. (rangeDiffThd is currently unused in this method.)
    preproc = preprocess()
    pair = oneRangeDiffResPair
    setX = set()
    setY = set()
    fieldX = str(pair[0])
    fieldXVal = allNumericalValuesMap[fieldX]
    for val in set(fieldXVal):  # unique values only
        if preproc.is_number(val) and int(float(val)) >= 0 and int(float(val)) < 200000000000:
            setX.add(int(float(val)))
    fieldY = str(pair[1])
    fieldYVal = allNumericalValuesMap[fieldY]
    for val in set(fieldYVal):
        if preproc.is_number(val) and int(float(val)) >= 0 and int(float(val)) < 200000000000:
            setY.add(int(float(val)))
    # The union of the two value ranges decides the overall range used to size the buckets.
    rangeXY = int(max(max(setX), max(setY)) - min(min(setX), min(setY)) + 1)
    bdpRes = self.bucketDotProduct(setX, setY, int(inputBucketSizeNum), rangeXY, True)  # normalized bucket dot-product score
    bdpScoreLst = blist()
    bdpScoreLst.append(fieldX)
    bdpScoreLst.append(fieldY)
    bdpScoreLst.append(bdpRes)
    fdprsObj = fieldPairSim(fieldX, fieldY, bdpRes)
    self.allNumericalBucketDPScoreTripleLst.append(fdprsObj)
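# bucketDotProduct is implemented elsewhere in this class. A minimal sketch follows, under the
# assumption that it histograms both value sets into equal-width buckets over the shared range and
# returns the cosine (normalized dot product) of the two bucket-count vectors; the real method may
# bucket or normalize differently.
def _bucketDotProductSketch(self, setX, setY, bucketSizeNum, rangeXY, normalized):
    from math import sqrt
    lowest = min(min(setX), min(setY))
    width = max(1, rangeXY // bucketSizeNum)          # equal-width buckets over the union range
    bucketsX = [0] * (bucketSizeNum + 1)
    bucketsY = [0] * (bucketSizeNum + 1)
    for v in setX:
        bucketsX[min((v - lowest) // width, bucketSizeNum)] += 1
    for v in setY:
        bucketsY[min((v - lowest) // width, bucketSizeNum)] += 1
    dot = sum(x * y for x, y in zip(bucketsX, bucketsY))
    if not normalized:
        return dot
    normX = sqrt(sum(x * x for x in bucketsX))
    normY = sqrt(sum(y * y for y in bucketsY))
    return dot / (normX * normY) if normX and normY else 0.0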
def getNonumericalCosSimiRecordWiseScalingMethod(self, pair, tbFieldAllNonNumericalValuesMap,
                                                 prefixLength, partFetchNum, ratioPruning,
                                                 recordPrSimiThreshold, finalNonNumericalOutputDir):
    comRdFileObj = commonReadFile()
    if not os.path.exists(finalNonNumericalOutputDir + '/' + 'pruneResults'):
        os.makedirs(finalNonNumericalOutputDir + '/' + 'pruneResults')
    # get field names ("table.field" on each side of the '-')
    prA = pair.strip().split('-')[0].lower()  # tb.field A
    prB = pair.strip().split('-')[1].lower()  # tb.field B
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lstValA = tbFieldAllNonNumericalValuesMap[prA]  # field values
        lstValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  # rows to write out
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        countTrueComparePairs = 0
        # partition the A and B columns into chunks of at most partFetchNum values
        lenA = len(lstValA)
        lenB = len(lstValB)
        lenAPartition = min(partFetchNum, lenA)
        numAPartition = ceil(lenA / lenAPartition)
        lenBPartition = min(partFetchNum, lenB)
        numBPartition = ceil(lenB / lenBPartition)
        # Scalable method: compare the partitions pairwise; if, for a partition pair, the ratio of
        # record pairs above the similarity threshold drops below ratioPruning, prune this column
        # pair and move on to the next one.
        Bexit = False
        laAllLenWhole = 0
        lbAllLenWhole = 0
        for i in range(0, numAPartition):
            if Bexit:
                break
            if (i + 1) * partFetchNum <= lenA:
                lsValACur = lstValA[i * partFetchNum:(i + 1) * partFetchNum]
            else:
                lsValACur = lstValA[i * partFetchNum:lenA]
            for j in range(0, numBPartition):
                if (j + 1) * partFetchNum <= lenB:
                    lsValBCur = lstValB[j * partFetchNum:(j + 1) * partFetchNum]
                else:
                    lsValBCur = lstValB[j * partFetchNum:lenB]
                [countTrueComparePairsEvery, countAboveThreholdEvery, laAllLen, lbAlllen] = \
                    self.filterColumnsWithSample(False, lsValACur, lsValBCur, prefixLength,
                                                 partFetchNum, recordPrSimiThreshold,
                                                 pairsNameLstA, pairsNameLstB, cosResLst)
                laAllLenWhole += laAllLen
                lbAllLenWhole += lbAlllen
                countTrueComparePairs += countTrueComparePairsEvery
                # If too few record pairs in this partition pair exceed the threshold, prune the
                # column pair, log it, and exit both loops.
                if (countAboveThreholdEvery / ((len(lsValACur) + len(lsValBCur)) / 2) < ratioPruning):
                    Bexit = True
                    filetmp = finalNonNumericalOutputDir + '/' + 'pruneResults' + '/' + 'prunePairs.tsv'
                    rowStr = prA + '\t' + prB + '\t' + '\n'
                    comRdFileObj.writeStrRowToFileAppend(filetmp, rowStr)
                    break
        # If the column pair survived pruning and at least one record pair qualified,
        # record its matching ratio and write out the matched records.
        if (not Bexit) and (len(pairsNameLstA) > 1):
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # matching ratio = 1/2 * (matched A / all A + matched B / all B)
            laMatchLen = len(set(pairsNameLstA)) - 1  # unique matched A values (minus the header entry)
            lbMatchLen = len(set(pairsNameLstB)) - 1  # unique matched B values (minus the header entry)
            matchingRatio = 0.5 * (laMatchLen / laAllLenWhole + lbMatchLen / lbAllLenWhole)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if len(writeWholeLst) >= 3:
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(tbB).upper() + '__' + str(fdB)
                fd = open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv', 'w')
                comRdFileObj.writeListsColumnsToFileAppendWriterTsv(fd, writeWholeLst)
                fd.close()
                writeWholeLst = blist([])
        # Because of speed and runtime concerns, periodically write out partial results.
        self.recordPartialResultTimeEnd = time.time()
        if ((len(self.lstTopPairsTobeAllMatched) != 0 and len(self.lstTopPairsTobeAllMatched) % 30 == 0)
                or ((self.recordPartialResultTimeEnd - self.recordPartialResultTimeStart) >= 86400)):  # 86400 seconds = 1 day
            self.recordPartialResultTimeStart = time.time()
            if not os.path.exists(finalNonNumericalOutputDir + '/' + 'partResultOutput'):
                os.makedirs(finalNonNumericalOutputDir + '/' + 'partResultOutput')
            comRdFileObj = commonReadFile()  # only the matching-ratio output file is written here
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                finalNonNumericalOutputDir + '/' + 'partResultOutput' + '/' +
                'partRatioScoreAllResult00' + str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')
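# Example usage of the partitioned (scaling) method above, assuming candidate column pairs are
# strings of the form "table1.fieldA-table2.fieldB" and this class is instantiated as `matcher`;
# the parameter values below are illustrative only, not recommended defaults:
#
#     for pr in candidatePairs:
#         matcher.getNonumericalCosSimiRecordWiseScalingMethod(
#             pr, tbFieldAllNonNumericalValuesMap,
#             prefixLength=3, partFetchNum=1000, ratioPruning=0.01,
#             recordPrSimiThreshold=0.8,
#             finalNonNumericalOutputDir='finalNonNumericalOutput')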
def getSamplesNonumericalCosSimiRecordWise(self, pair, samplesFlag, tbFieldAllNonNumericalValuesMap,
                                           prefixLength, sampleRecordsNum, recordPrSimiThreshold,
                                           finalNonNumericalOutputDir):
    comRdFileObj = commonReadFile()
    # get field names; in sample mode the pair is a "tb.fieldA-tb.fieldB" string,
    # otherwise it is a fieldPairSim-like object
    if samplesFlag:
        prA = pair.strip().split('-')[0].lower()  # tb.field A
        prB = pair.strip().split('-')[1].lower()
    else:
        prA = pair.fieldA.strip()
        prB = pair.fieldB.strip()
    if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
        lsValA = tbFieldAllNonNumericalValuesMap[prA]  # field values
        lsValB = tbFieldAllNonNumericalValuesMap[prB]
        writeWholeLst = blist([])  # rows to write out
        cosResLst = blist([])
        pairsNameLstA = blist([])
        pairsNameLstB = blist([])
        pairsNameLstA.append(str(prA))
        pairsNameLstB.append(str(prB))
        cosResLst.append('Cosine Similarity')
        [countTruePairs, countAboveThrehold, laAllLen, lbAlllen] = self.filterColumnsWithSample(
            samplesFlag, lsValA, lsValB, prefixLength, sampleRecordsNum,
            recordPrSimiThreshold, pairsNameLstA, pairsNameLstB, cosResLst)
        # If any record-pair similarity is above the threshold, keep the column pair and record
        # its matching ratio; a sample run can then be followed by a full record-wise run.
        if len(pairsNameLstA) > 1:
            writeWholeLst.append(pairsNameLstA)
            writeWholeLst.append(pairsNameLstB)
            writeWholeLst.append(cosResLst)
            # matching ratio = 1/2 * (matched A / all A + matched B / all B)
            laMatchLen = len(set(pairsNameLstA)) - 1  # unique matched A values (minus the header entry)
            lbMatchLen = len(set(pairsNameLstB)) - 1  # unique matched B values (minus the header entry)
            matchingRatio = 0.5 * (laMatchLen / laAllLen + lbMatchLen / lbAlllen)
            fdprsObj = fieldPairSim(prA, prB, matchingRatio)
            self.lstTopPairsTobeAllMatched.append(fdprsObj)
            if len(writeWholeLst) >= 3:
                tbA = prA.split('.')[0]
                fdA = prA.split('.')[1]
                tbB = prB.split('.')[0]
                fdB = prB.split('.')[1]
                outFile2 = str(tbA).upper() + '__' + str(fdA) + '-' + str(tbB).upper() + '__' + str(fdB)
                if samplesFlag:  # sample results go to the sample output dir
                    finalNonNumericalOutputDir = Intermediate_DirFiles[0]
                fd = open(finalNonNumericalOutputDir + '/' + outFile2 + '.tsv', 'w')
                comRdFileObj.writeListsColumnsToFileAppendWriterTsv(fd, writeWholeLst)
                fd.close()
                writeWholeLst = blist([])
        # Because of speed and runtime concerns, periodically write out partial results.
        if (len(self.lstTopPairsTobeAllMatched) != 0 and len(self.lstTopPairsTobeAllMatched) % 200 == 0):
            if not os.path.exists('intermediateOutput/nonNumericalInterOutput/second'):
                os.makedirs('intermediateOutput/nonNumericalInterOutput/second')
            if not os.path.exists('intermediateOutput/nonNumericalInterOutput/second/partResultOutput'):
                os.makedirs('intermediateOutput/nonNumericalInterOutput/second/partResultOutput')
            comRdFileObj = commonReadFile()  # only the matching-ratio output file is written here
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                Intermediate_DirFiles[2] + '/' + 'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')
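# filterColumnsWithSample is implemented elsewhere in this class. The sketch below only illustrates
# its assumed contract: compare the (optionally sampled) record values pairwise, append the values
# of each qualifying pair to pairsNameLstA / pairsNameLstB and its similarity to cosResLst, and
# return [comparedPairCount, aboveThresholdCount, lenA, lenB]. The token-level cosine used here is
# a stand-in for the real record similarity, and the prefixLength-based prefix filtering of the
# real method is omitted.
def _filterColumnsWithSampleSketch(self, samplesFlag, lsValA, lsValB, prefixLength, sampleNum,
                                   threshold, pairsNameLstA, pairsNameLstB, cosResLst):
    from math import sqrt
    valA = lsValA[:sampleNum] if samplesFlag else lsValA
    valB = lsValB[:sampleNum] if samplesFlag else lsValB
    compared = 0
    above = 0
    for a in valA:
        for b in valB:
            compared += 1
            tokA, tokB = set(str(a).split()), set(str(b).split())
            if not tokA or not tokB:
                continue
            cos = len(tokA & tokB) / sqrt(len(tokA) * len(tokB))
            if cos >= threshold:
                above += 1
                pairsNameLstA.append(a)
                pairsNameLstB.append(b)
                cosResLst.append(cos)
    return [compared, above, len(valA), len(valB)]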