def make_lsh(filename, out_filename, nperm=N_PERM, thresh=0.5, blocksize=1000000):
    """Build LSH index blocks over a whole file sequentially (deprecated).

    Non-parallel variant of the LSH caller. Each line is split on ':'; the
    first field is parsed as an integer line number and the second field is
    the text to index. Text with more than three space-separated tokens is
    inserted under the key ``"<lnum>:<text>"``. Whenever the running line
    counter exceeds ``blocksize`` the current index is dumped to
    ``<out_filename>_<batch id>.obj`` and a fresh index is started; whatever
    is left at end of file is dumped as a final block.

    :param filename: Input file, read as UTF-8 with undecodable bytes ignored.
    :param out_filename: Output file prefix; ``_<batch id>.obj`` is appended
        for each dumped block.
    :param nperm: number of permutations in the Min-Hash index.
    :param thresh: threshold passed to MinHashLSH.
    :param blocksize: a block is dumped once the counter exceeds this value.
    :return: list of the dumped block filenames, in the order written.
    :raises ValueError: if the field before the first ':' is not an integer.
    """
    lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
    batch_id = 0
    lsh_filenames = []
    current_batch = 0
    with io.open(filename, 'r', encoding='utf-8', errors='ignore') as fhandle:
        for line in tqdm(fhandle, total=get_line_number(filename)):
            lsplit = line.split(':')
            if len(lsplit) > 1:
                lnum = int(lsplit[0])
                # NOTE(review): only the text between the first and second
                # ':' is used; anything after a second ':' is dropped.
                # Confirm this matches the input file format.
                line_sub = lsplit[1]
                wordlist = line_sub.split(' ')
                if len(wordlist) > 3:
                    key = str(lnum) + ':' + line_sub
                    # Test the key that is actually inserted. The previous
                    # check probed the bare text, which can never equal a
                    # "<lnum>:<text>" key, so it never skipped anything.
                    if key not in lsh:
                        lsh.insert(key, make_hash(wordlist, nperm))
            # NOTE(review): the counter is assumed to count every line read
            # (indexed or skipped); confirm against the intended block size.
            current_batch += 1
            if current_batch > blocksize:
                outfile = out_filename + '_' + str(batch_id) + '.obj'
                dump_lsh(lsh, outfile)
                lsh_filenames.append(outfile)
                # Start a fresh index for the next block.
                lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
                batch_id += 1
                current_batch = 0
    # Dump whatever was accumulated since the last full block.
    if current_batch > 0:
        outfile = out_filename + '_' + str(batch_id) + '.obj'
        dump_lsh(lsh, outfile)
        lsh_filenames.append(outfile)
    return lsh_filenames
def make_lsh_partial(batch_id, batch_size, filename, out_filename, byte_start, nperm=N_PERM, thresh=0.5):
    """
    Generate the LSH index over a subset of the data.

    Starting at ``byte_start``, each line is split on ':'; the first field is
    kept as the line number and the second field is the text to index. Text
    with more than three space-separated tokens is inserted under the UTF-8
    encoded key ``"<lnum>:<text>"``. Reading stops after ``batch_size`` lines
    or at end of file, and the index is dumped to
    ``<out_filename>_<batch_id>.obj``.

    :param batch_id: Batch id, used to determine output filename
    :param batch_size: Specifies number of lines of the file to read
    :param filename: Input file, generated using the make_lsh_file family of functions
    :param out_filename: Output file prefix, batch_id is appended to distinguish each block.
    :param byte_start: Byte offset for the partial file - this allows make_lsh_partial to read the middle
        sections of a file using the seek() command.
    :param nperm: number of permutations in the Min-Hash index.
    :param thresh: Jaccard index threshold to return
    :return: filename of the dumped LSH file.
    """
    lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
    current_batch = 0
    with open(filename, 'r', encoding='utf-8', errors='ignore') as fhandle:
        # NOTE(review): the file is opened in text mode; the io docs only
        # define seek() on text files for offsets obtained from tell() (or
        # zero). Confirm byte_start values are produced accordingly.
        fhandle.seek(byte_start)
        for line in fhandle:
            lsplit = line.split(':')
            if len(lsplit) > 1:
                lnum = lsplit[0]
                # NOTE(review): only the text between the first and second
                # ':' is used; anything after a second ':' is dropped.
                # Confirm this matches the input file format.
                line_sub = lsplit[1]
                wordlist = line_sub.split(' ')
                if len(wordlist) > 3:
                    key = (lnum + ':' + line_sub).encode('utf-8')
                    # Test the key that is actually inserted. The previous
                    # check probed the bare text (a str), which can never
                    # equal the encoded "<lnum>:<text>" key, so it never
                    # skipped anything.
                    if key not in lsh:
                        lsh.insert(key, make_hash(wordlist, nperm))
            # NOTE(review): the counter is assumed to count every line read
            # (indexed or skipped), matching the batch_size description;
            # confirm against how the caller computes byte_start.
            current_batch += 1
            if current_batch >= batch_size:
                break
    outfile = out_filename + '_' + str(batch_id) + '.obj'
    dump_lsh(lsh, outfile)
    return outfile