def remove_duplicates(self, filename):
    """Sort *filename* on columns 0 and 1 (via util.sort_csv_file, using
    self.separator), then remove consecutive duplicate lines in place.

    Sorting first makes equal lines adjacent, so a single prev-line
    comparison is enough to deduplicate the whole file.
    """
    util.sort_csv_file(filename, [0, 1], self.separator)
    tmp_name = filename + ".tmp"
    # Context managers guarantee both handles are flushed and closed BEFORE
    # the remove/rename below.  The original left them open, so buffered
    # writes to the tmp file could be lost (truncated output) and
    # os.remove() would fail on platforms that lock open files.
    with open(filename, 'r') as src, open(tmp_name, 'w') as dst:
        prev_line = None
        for line in src:
            # Keep the first occurrence of each run of identical lines.
            if prev_line is None or line != prev_line:
                dst.write(line)
            prev_line = line
    os.remove(filename)
    os.rename(tmp_name, filename)
#print "Data:" + data tokens = data.split(separator) if len(tokens) > 0: #print "Adding tokens: " + str(tokens) minHashSig = None if inputType == "tokens": if dataType == "integer": tokens = util.get_int_list(tokens) #print "Sign", tokens if len(tokens) > 0: minHashSig = signer.sign(tokens) else: minHashSig = tokens #print "Key: ", key + ", minHash", minHashSig if minHashSig is not None: lshSig = list(hasher.hash(minHashSig)) minOut = "" if outputMinhash: minOut = separator + util.write_tokens(minHashSig, separator) for i in range(0, numBands): wFile.write(str(i).zfill(3) + ":" + lshSig[i] + separator + keyPrefix + key + minOut + "\n") file.close() wFile.close() if sortOutput: print "Sorting output on LSH Keys.." util.sort_csv_file(outputFilename, [0], separator)
def run(self, inputFilename, baseFilename, outputFilename, separator, computeSimilarity, scoreThreshold):
    """Concatenate the input and base LSH-key files, sort the combined file
    on the LSH key column, then stream it grouping consecutive rows that
    share the same LSH key into clusters.  Each cluster is either scored
    (__computeSimilarity) or written out verbatim (__writeClusters).

    NOTE(review): indentation reconstructed from a collapsed source line;
    verify nesting against the original file.
    """
    self.separator = separator
    # NOTE(review): attribute is spelled lowercase 'scorethreshold' here --
    # confirm readers elsewhere use the same spelling.
    self.scorethreshold = scoreThreshold
    lsh_key = None
    lsh_band = None
    prev_lsh_key = None
    prev_lsh_band = None
    currentClusterInput = []   # rows from the input file for the current LSH key
    currentClusterBase = []    # rows from the base file for the current LSH key
    file_list = [inputFilename, baseFilename]
    tmp_combined_file = inputFilename + "tmp" + ".csv"
    # Concatenate both files into one temporary file.
    # NOTE(review): the fileinput.input(...) stream is never closed, and the
    # temporary file is never deleted afterwards.
    with open(tmp_combined_file, 'w') as file:
        input_lines = fileinput.input(file_list)
        file.writelines(input_lines)
    #sort the combined file and then compute similarity between keys in the same band having same lsh keys
    util.sort_csv_file(tmp_combined_file, [0], '\t')
    file = open(tmp_combined_file,'r')
    #file = open(inputFilename, 'r')
    del self.outputFilenames[:]
    out = self.__openFileForWrite(outputFilename)
    for line in file:
        line = line.strip()
        if len(line) > 0:
            lineTokens = line.split(separator)
            # Column 0 is the band-prefixed LSH key; its first 3 chars are
            # the zero-padded band index.  Column 1 is the item id.
            lsh_key = lineTokens[0]
            lsh_band = lsh_key[0:3]
            itemId = lineTokens[1]
            if prev_lsh_key is None:
                # First row: initialise the running key/band.
                prev_lsh_key = lsh_key
                prev_lsh_band = lsh_band
                print "Start clustering for Band:", lsh_band
                sys.stdout.flush()
            if prev_lsh_key != lsh_key:
                # Key boundary: flush the cluster accumulated so far.
                if len(currentClusterInput) > 0:
                    if computeSimilarity:
                        self.__computeSimilarity(currentClusterInput,currentClusterBase,out,lsh_band)
                    else:
                        self.__writeClusters(currentClusterInput,currentClusterBase,out)
                del currentClusterInput[:]
                del currentClusterBase[:]
                if prev_lsh_band != lsh_band:
                    # Band boundary: log progress.
                    print "Start clustering for Band:", lsh_band
                    sys.stdout.flush()
                prev_lsh_key = lsh_key
                prev_lsh_band = lsh_band
            # NOTE(review): basePrefix is neither a parameter nor a local --
            # presumably a module-level global; verify it is defined before
            # run() is called.  Rows whose item id carries the base prefix go
            # into the base-side cluster, everything else into the input side.
            if basePrefix is not None and itemId.startswith(basePrefix):
                currentClusterBase.append(lineTokens[1:])
            else:
                currentClusterInput.append(lineTokens[1:])
    # Flush the final cluster left pending after the loop ends.
    if len(currentClusterInput) > 0:
        if computeSimilarity:
            self.__computeSimilarity(currentClusterInput,currentClusterBase,out,lsh_band)
        else:
            self.__writeClusters(currentClusterInput,currentClusterBase,out)
    file.close()
    out.close()
    print "Done computing similarities"
    sys.stdout.flush()
def die():
    """Print the usage message and terminate the script with exit status 1."""
    print "Please input the required parameters"
    print "Usage: findSimilarity.py --input <input filename> [--inputPrefix <prefix value> --base <base filename> --basePrefix <prefix value>]--output <output filename> [--separator <sep=\\t>] "
    exit(1)

# --- Script entry code.  NOTE(review): this chunk appears truncated -- the
# for-loop at the bottom is cut off mid-body.  Indentation reconstructed from
# a collapsed source line; verify against the original file.
# NOTE(review): parse_args() presumably populates inputFilename /
# outputFilename / baseFilename as module globals; they are read below but
# never assigned in this chunk -- confirm.
args = parse_args()
if inputFilename is None or outputFilename is None:
    die()
if baseFilename is not None:
    # Concatenate the input and base files, then sort the result on column 0
    # (the LSH key) so equal keys become adjacent.
    file_list = [inputFilename,baseFilename]
    tmp_combined_file = inputFilename + ".tmp"
    with open(tmp_combined_file, 'w') as file:
        input_lines = fileinput.input(file_list)
        file.writelines(input_lines)
    util.sort_csv_file(tmp_combined_file, [0], '\t')
    file=open(tmp_combined_file,'r')
else:
    file=open(inputFilename,'r')
previousLSHKey = None
lshKey = None
keysInput=[]
keysBase=[]
# Stream the (sorted) file; each row is <lsh key>\t<item id>...
for line in file:
    lineParts = line.strip().split("\t")
    lshKey=lineParts[0]
    itemId=lineParts[1]
    # (chunk ends here -- remainder of the loop body is outside this view)
    idx = line.find("\t")