def create(cls, blob_key, **kwargs):
    # Classmethod on the BlobDataset model (the decorator is not part
    # of this excerpt): create the dataset entity for blob_key, or
    # refresh its filename if it already exists.
    filename = kwargs.get('filename')
    rows = kwargs.get('rows', 5)
    bands = kwargs.get('bands', 40)
    buckets_per_band = kwargs.get('buckets_per_band', 100)
    shingle_type = kwargs.get('shingle_type', 'c4')
    minhash_modulo = kwargs.get('minhash_modulo', 5000)

    # One minhash function per (row, band) slot in the LSH signature.
    max_hashes = calculate_max_hashes(rows, bands)
    dataset = cls.get(blob_key)

    if not dataset:
        dataset = BlobDataset(filename=filename,
                              blob_key=blob_key,
                              random_seeds=get_random_bits(max_hashes),
                              rows=rows,
                              bands=bands,
                              buckets_per_band=buckets_per_band,
                              shingle_type=shingle_type,
                              minhash_modulo=minhash_modulo)
    else:
        # Entity already exists: only the filename is refreshed; the
        # stored LSH parameters are left untouched.
        dataset.filename = filename

    return dataset.put()
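# The helper functions referenced above are not part of this excerpt.
# What follows is a minimal, hypothetical sketch of what create()
# appears to assume, not the actual OpenLSH implementations: the
# signature needs rows * bands minhash functions, with one random seed
# per function.
import random

def calculate_max_hashes(rows, bands):
    # Total number of minhash values in one LSH signature.
    return rows * bands

def get_random_bits(count):
    # One random 32-bit seed per minhash function.
    return [random.getrandbits(32) for _ in xrange(count)]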
import datetime
import logging

def map(dataset, text, id=None):
    logging.info("OpenLSH > map() called.")
    start = datetime.datetime.utcnow()

    hashes = calculate_max_hashes(dataset.rows, dataset.bands)

    # Ensure there is one stored random seed per minhash function.
    if len(dataset.random_seeds) < hashes:
        dataset.random_seeds = get_random_bits(hashes)
        dataset.put()

    sh_type = dataset.shingle_type
    modulo = dataset.minhash_modulo
    seeds = list(dataset.random_seeds)

    minhashes = calc_minhashes(text, sh_type, hashes, seeds, modulo)

    buckets = []
    buckets_per_band = dataset.buckets_per_band

    for band in xrange(dataset.bands):
        # Slice this band's rows out of the flat minhash signature.
        minhashes_in_band = [minhashes[band * dataset.rows + row]
                             for row in xrange(dataset.rows)]
        # Emit a bucket only when every row in the band agrees.
        if len(set(minhashes_in_band)) <= 1:
            buckets.append((band * buckets_per_band) +
                           hash(minhashes_in_band[0]) % buckets_per_band)

    end = datetime.datetime.utcnow()

    # Throttle timing logs: only calls that start at second 0, 20, or
    # 40 of a minute are logged.
    if start.second % 20 == 0:
        logging.info('id %s, length %d, time %d', id, len(text),
                     int((end - start).total_seconds()))

    for bkt in buckets:
        yield (bkt, '/view/%s/%s' % (dataset.filename, id))
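# Worked example with the defaults above (rows=5, bands=40,
# buckets_per_band=100): band b owns bucket ids in the range
# [b * 100, b * 100 + 99], so a document whose five rows all agree in
# band 7 is emitted under bucket 700 + (hash(value) % 100). The
# per-band offset keeps collisions within a band; two documents can
# only become candidates by colliding in the same band, never across
# bands.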
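# calc_minhashes() is also not shown in this excerpt. A minimal sketch,
# assuming 'c4' means overlapping 4-character shingles and that each
# seed is XORed into a base hash (standard minhashing; the real OpenLSH
# helper may differ):

def shingles(text, sh_type):
    # 'c4' => the set of overlapping 4-character shingles of the text;
    # anything else falls back to word shingles.
    if sh_type == 'c4':
        return set(text[i:i + 4] for i in xrange(max(len(text) - 3, 1)))
    return set(text.split())

def calc_minhashes(text, sh_type, hashes, seeds, modulo):
    shingle_set = shingles(text, sh_type)
    if not shingle_set:
        return [0] * hashes
    # One hash-family member per seed: perturb hash() with the seed and
    # keep the minimum over all shingles.
    return [min((hash(s) ^ seeds[i]) % modulo for s in shingle_set)
            for i in xrange(hashes)]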