Ejemplo n.º 1
0
def make_lsh(filename,
             out_filename,
             nperm=N_PERM,
             thresh=0.5,
             blocksize=1000000):
    """Build LSH indexes over a whole file, block by block (deprecated).

    Non-parallel variant of the LSH caller. Every input line is expected to
    look like ``<line number>:<text>``. Lines without a ':' are skipped, and
    so are lines whose text has three or fewer space-separated tokens.

    :param filename: Input file, read as UTF-8 with undecodable bytes ignored.
    :param out_filename: Output file prefix; ``_<batch id>.obj`` is appended
        for every block that gets dumped.
    :param nperm: Number of permutations, passed to MinHashLSH and make_hash.
    :param thresh: Threshold passed to MinHashLSH.
    :param blocksize: A block is dumped (and a fresh index started) once it
        holds more than this many entries.
    :return: List of the dumped LSH filenames, in batch order.
    :raises ValueError: If the field before the first ':' is not an integer.
    """
    lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
    batch_id = 0
    lsh_filenames = []
    current_batch = 0
    with io.open(filename, 'r', encoding='utf-8', errors='ignore') as fhandle:
        for line in tqdm(fhandle, total=get_line_number(filename)):
            lsplit = line.split(':')
            if len(lsplit) < 2:
                continue
            lnum = int(lsplit[0])
            # NOTE(review): only the second ':'-separated field is used, so
            # any text following a further ':' on the line is dropped.
            line_sub = lsplit[1]
            wordlist = line_sub.split(' ')
            key = str(lnum) + ':' + line_sub
            # Test the exact key that is about to be stored. The bare text
            # contains no ':' and therefore can never equal a stored
            # '<num>:<text>' key, so checking it could not detect a duplicate.
            # (Presumably insert() rejects an already-present key - verify
            # against the MinHashLSH implementation in use.)
            if len(wordlist) > 3 and key not in lsh:
                lsh.insert(key, make_hash(wordlist, nperm))
                current_batch += 1
            # Strictly greater-than: a flushed block holds blocksize + 1 entries.
            if current_batch > blocksize:
                outfile = out_filename + '_' + str(batch_id) + '.obj'
                dump_lsh(lsh, outfile)
                lsh_filenames.append(outfile)
                lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
                batch_id += 1
                current_batch = 0
    # Dump whatever is left in the last, partially filled block.
    if current_batch > 0:
        outfile = out_filename + '_' + str(batch_id) + '.obj'
        dump_lsh(lsh, outfile)
        lsh_filenames.append(outfile)
    return lsh_filenames
Ejemplo n.º 2
0
def make_lsh_partial(batch_id, batch_size, filename, out_filename, byte_start, nperm=N_PERM, thresh=0.5):
    """
    Generate the LSH index over a subset of the data.

    Every input line is expected to look like ``<line number>:<text>``. Lines
    without a ':' are not indexed, and neither are lines whose text has three
    or fewer space-separated tokens; both still count towards batch_size.
    The index is dumped even if nothing was inserted into it.

    :param batch_id: Batch id, used to determine output filename
    :param batch_size: Specifies number of lines of the file to read
    :param filename: Input file, generated using the make_lsh_file family of functions
    :param out_filename: Output file prefix, batch_id is appended to distinguish each block.
    :param byte_start: Byte offset for the partial file - this allows make_lsh_partial to read the middle sections of
    a file using the seek() command.
    :param nperm: number of permutations in the Min-Hash index.
    :param thresh: Jaccard index threshold to return
    :return: filename of the dumped LSH file.
    """
    lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
    with open(filename, 'r', encoding='utf-8', errors='ignore') as fhandle:
        # NOTE(review): the handle is in text mode; Python only documents
        # seek() offsets obtained from tell() (or zero) for text files, so
        # confirm byte_start always lands on the start of a line.
        fhandle.seek(byte_start)
        for lines_read, line in enumerate(fhandle, start=1):
            lsplit = line.split(':')
            if len(lsplit) > 1:
                lnum = lsplit[0]
                # NOTE(review): only the second ':'-separated field is used,
                # so text following a further ':' on the line is dropped.
                line_sub = lsplit[1]
                wordlist = line_sub.split(' ')
                key = (lnum + ':' + line_sub).encode('utf-8')
                # Test the exact (bytes) key that is about to be stored. The
                # bare str text can never equal a stored bytes key, so
                # checking it could not detect a duplicate. (Presumably
                # insert() rejects an already-present key - verify against
                # the MinHashLSH implementation in use.)
                if len(wordlist) > 3 and key not in lsh:
                    lsh.insert(key, make_hash(wordlist, nperm))
            # Every line read counts towards the batch, indexed or not.
            if lines_read >= batch_size:
                break
    outfile = out_filename + '_' + str(batch_id) + '.obj'
    dump_lsh(lsh, outfile)
    return outfile