def remove_duplicates(self, filename):
        """Sort ``filename`` in place and drop consecutive duplicate lines.

        The file is first sorted via ``util.sort_csv_file`` on columns 0 and 1
        using ``self.separator``. The de-duplicated content is written to
        ``filename + ".tmp"``, which then replaces the original file.

        Only lines identical to the line immediately before them are dropped,
        so the result depends on the preceding sort placing duplicates next to
        each other.
        """
        util.sort_csv_file(filename, [0, 1], self.separator)

        tmp_filename = filename + ".tmp"
        # Both handles must be closed before the swap below: an unclosed
        # writer may still hold buffered lines, and removing an open file
        # fails on Windows.
        with open(filename, 'r') as source, open(tmp_filename, 'w') as deduped:
            prev_line = None
            for line in source:
                # Lines read from a file are never None, so this also covers
                # the first line.
                if line != prev_line:
                    deduped.write(line)
                prev_line = line

        os.remove(filename)
        os.rename(tmp_filename, filename)
Ejemplo n.º 2
0
            # Fragment: the enclosing loop and the definitions of data, key,
            # signer, hasher, wFile, etc. are outside this view.
            #
            # Turn one record into LSH output lines: split it into tokens,
            # obtain a signature for it, hash the signature, and write one
            # line per band.
            #print "Data:" + data
            tokens = data.split(separator)
            if len(tokens) > 0:
                #print "Adding tokens: " + str(tokens)
                minHashSig = None
                if inputType == "tokens":
                    # Raw tokens: optionally convert to ints, then sign them.
                    if dataType == "integer":
                        tokens = util.get_int_list(tokens)
                    #print "Sign", tokens
                    # The conversion above may have left nothing to sign.
                    if len(tokens) > 0:
                        minHashSig = signer.sign(tokens)
                else:
                    # Any other inputType: the tokens are used as the
                    # signature as-is (presumably a precomputed minhash -- TODO confirm).
                    minHashSig = tokens

                #print "Key: ", key + ", minHash", minHashSig
                if minHashSig is not None:
                    lshSig = list(hasher.hash(minHashSig))
                    # Optional suffix carrying the signature itself.
                    minOut = ""
                    if outputMinhash:
                        minOut = separator + util.write_tokens(minHashSig, separator)

                    # One output line per band:
                    #   <3-digit band index>:<band hash><sep><keyPrefix+key>[<sep><minhash>]
                    # NOTE(review): the concatenation requires lshSig[i] to be
                    # a string -- confirm against hasher.hash.
                    for i in range(0, numBands):
                        wFile.write(str(i).zfill(3) + ":" + lshSig[i] + separator + keyPrefix + key + minOut + "\n")

# Close both files before the optional sort below touches the output file.
file.close()
wFile.close()

# Optionally sort the written output on its first column (the band:hash key).
if sortOutput:
    print "Sorting output on LSH Keys.."
    util.sort_csv_file(outputFilename, [0], separator)
    def run(self, inputFilename, baseFilename, outputFilename, separator, computeSimilarity, scoreThreshold):
        """Cluster items that share an LSH key and emit each cluster.

        The input and base files are concatenated into
        ``inputFilename + "tmp.csv"``, which is sorted on its first column so
        that lines with the same LSH key are adjacent. Each line is expected
        to look like ``<lsh key><separator><item id>[<separator>...]``; the
        first three characters of the LSH key are taken as the band.

        For every run of lines sharing an LSH key, the items are split into a
        "base" list (item id starts with ``basePrefix``) and an "input" list
        (everything else). Runs with at least one input item are handed to
        ``__computeSimilarity`` or ``__writeClusters``.

        Args:
            inputFilename: path of the LSH file holding the input items.
            baseFilename: path of the LSH file holding the base items.
            outputFilename: passed to ``__openFileForWrite`` to obtain the
                output handle.
            separator: column separator used for sorting and splitting lines.
            computeSimilarity: if true, clusters go to ``__computeSimilarity``
                (together with their band); otherwise to ``__writeClusters``.
            scoreThreshold: stored on ``self.scorethreshold``.
        """
        self.separator = separator
        self.scorethreshold = scoreThreshold
        prev_lsh_key = None
        prev_lsh_band = None
        lsh_band = None
        currentClusterInput = []
        currentClusterBase = []

        def emit_cluster(band):
            # Hand the current cluster to the configured sink. Clusters
            # without any input item are dropped.
            if len(currentClusterInput) > 0:
                if computeSimilarity:
                    self.__computeSimilarity(currentClusterInput, currentClusterBase, out, band)
                else:
                    self.__writeClusters(currentClusterInput, currentClusterBase, out)

        file_list = [inputFilename, baseFilename]
        tmp_combined_file = inputFilename + "tmp" + ".csv"
        with open(tmp_combined_file, 'w') as combined_out:
            try:
                combined_out.writelines(fileinput.input(file_list))
            finally:
                # Reset the module-level fileinput state so that a failure
                # here does not make the next fileinput.input() call raise.
                fileinput.close()

        # Sort the combined file so lines with the same LSH key are adjacent.
        # The sort uses the same separator as the split below; a hard-coded
        # tab would mis-sort files that use any other separator.
        util.sort_csv_file(tmp_combined_file, [0], separator)

        with open(tmp_combined_file, 'r') as combined_in:
            del self.outputFilenames[:]
            out = self.__openFileForWrite(outputFilename)
            try:
                for line in combined_in:
                    line = line.strip()
                    if not line:
                        continue

                    lineTokens = line.split(separator)
                    lsh_key = lineTokens[0]
                    lsh_band = lsh_key[0:3]
                    itemId = lineTokens[1]

                    if lsh_key != prev_lsh_key:
                        if prev_lsh_key is not None:
                            # The finished cluster belongs to the previous
                            # key, so report it under the previous band, not
                            # the band of the line just read.
                            emit_cluster(prev_lsh_band)
                            del currentClusterInput[:]
                            del currentClusterBase[:]

                        # The band is a prefix of the key, so it can only
                        # change when the key changes. This also covers the
                        # very first line.
                        if lsh_band != prev_lsh_band:
                            print("Start clustering for Band: %s" % lsh_band)
                            sys.stdout.flush()

                    prev_lsh_key = lsh_key
                    prev_lsh_band = lsh_band

                    # NOTE(review): basePrefix is not defined in this method;
                    # presumably a module-level global -- verify.
                    if basePrefix is not None and itemId.startswith(basePrefix):
                        currentClusterBase.append(lineTokens[1:])
                    else:
                        currentClusterInput.append(lineTokens[1:])

                # Emit the last cluster; no key change follows it.
                emit_cluster(lsh_band)
            finally:
                out.close()

        print("Done computing similarities")
        sys.stdout.flush()
Ejemplo n.º 4
0
def die():
    """Print the usage message to stdout and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import sys

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("Please input the required parameters")
    print("Usage: findSimilarity.py --input <input filename> [--inputPrefix <prefix value> --base <base filename> --basePrefix <prefix value>]--output <output filename> [--separator <sep=\\t>] ")
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # site module for interactive use and is missing under `python -S`.
    sys.exit(1)

# Parse the command line and bail out with the usage text if either required
# filename is missing.
# NOTE(review): inputFilename, outputFilename and baseFilename are not assigned
# in this section; presumably parse_args() sets them as module globals -- confirm.
args = parse_args()
if inputFilename is None or outputFilename is None:
    die()

# With a base file: concatenate input and base into "<input>.tmp", sort that
# file (column 0, tab separator passed to util.sort_csv_file) and read from
# the sorted copy. Without one: read the input file directly.
if baseFilename is not None:
    file_list = [inputFilename,baseFilename]
    tmp_combined_file =  inputFilename + ".tmp"
    with open(tmp_combined_file, 'w') as file:
        input_lines = fileinput.input(file_list)
        file.writelines(input_lines)
    util.sort_csv_file(tmp_combined_file, [0], '\t')
    # Rebinds `file` (closed by the `with` above) to a fresh read handle.
    file=open(tmp_combined_file,'r')
else:
    file=open(inputFilename,'r')

# State for the loop below: the LSH key of the previous and current line, and
# the item lists being collected (presumably per LSH key -- the code that
# fills them is outside this view).
previousLSHKey = None
lshKey = None
keysInput=[]
keysBase=[]


for line in file:
    lineParts = line.strip().split("\t")
    lshKey=lineParts[0]
    itemId=lineParts[1]
    idx = line.find("\t")