コード例 #1
def dissimilarity_obj_fun(prioritized_tcs, tcs_minhashes):
    """Score an ordering by the mean estimated distance between its tests.

    Test cases are popped from the END of ``prioritized_tcs`` one at a time.
    Each popped test case is compared, through ``lsh.jDistanceEstimate``,
    with every test case popped before it; the mean of those distances is
    added to a running total, which is finally divided by the number of test
    cases.

    Args:
        prioritized_tcs: non-empty list of test-case ids. NOTE: the list is
            consumed in place and is empty when the function returns; pass a
            copy if the ordering is still needed.
        tcs_minhashes: mapping from test-case id to the signature handed to
            ``lsh.jDistanceEstimate``.

    Returns:
        The accumulated per-test mean distance divided by the number of test
        cases (0 when there is a single test case).

    Raises:
        IndexError: if ``prioritized_tcs`` is empty.
    """
    # Get first TC
    checked_tcs = [prioritized_tcs.pop()]

    total_dist = 0

    while prioritized_tcs:
        # Get next TC
        current_tc = prioritized_tcs.pop()
        current_sig = tcs_minhashes[current_tc]

        # Get average distance from the TCs checked so far
        acc_dist = 0
        for tc in checked_tcs:
            acc_dist += lsh.jDistanceEstimate(current_sig, tcs_minhashes[tc])

        total_dist += acc_dist / len(checked_tcs)

        # Add to checked TCs to compare with the next
        checked_tcs.append(current_tc)

    # Return average distance
    return total_dist / len(checked_tcs)
コード例 #2
    def pw_fn(candidates, tcs_minhashes, selected_tcs_minhash, prioritized_tcs,
              tcs, n, times):
        """Select the candidate farthest from the tests selected so far.

        A random member of ``candidates`` is the fallback choice. Every key of
        ``tcs_minhashes`` that is also in ``candidates`` is scored with
        ``lsh.jDistanceEstimate`` against ``selected_tcs_minhash``; the
        highest score wins, and on ties the first one encountered is kept.

        All arguments are updated in place:

        * ``selected_tcs_minhash`` becomes the element-wise minimum of itself
          and the winner's signature over the first ``n`` positions;
        * the winner is appended to ``prioritized_tcs``;
        * the winner is removed from ``tcs`` and from ``tcs_minhashes``.

        ``times`` is not used by this body.
        NOTE(review): the source this was recovered from had lines missing;
        confirm with the caller whether ``times`` was meant to be updated.
        """
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate in tcs_minhashes:
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                             tcs_minhashes[candidate])
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        # Fold the winner into the cumulative signature (element-wise min).
        winner_sig = tcs_minhashes[selected_tc]
        for i in range(n):
            if winner_sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = winner_sig[i]

        # Record the selection; without this the caller never learns which
        # test case was picked.
        prioritized_tcs.append(selected_tc)
        tcs -= {selected_tc}
        del tcs_minhashes[selected_tc]
コード例 #3
def fast_pw(input_file, r, b, bbox=False, k=5, memory=False, B=0):
    """Prioritize a test suite using minhash signatures and LSH candidates.

    Starting from a random test case, each round asks the LSH bucket for the
    tests similar to the cumulative signature of everything selected so far,
    treats the remaining tests as candidates, and selects the candidate with
    the largest ``lsh.jDistanceEstimate`` to that cumulative signature.

    INPUT
    (str)input_file: path of input file
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: True if BB prioritization
    (int)k: k-shingle size (for BB prioritization)
    (bool)memory: if True keep signature in memory and do not store them to file
    (int)B: budget, i.e. how many test cases to select; 0 selects all of them

    OUTPUT
    (float)mh_time: time spent generating the signatures (read back from the
        "*_sigtime.txt" file when the signature file already exists)
    (float)ptime: time spent in the prioritization itself
    (list)P: prioritized test suite
    """
    n = r * b  # number of hash functions

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes)
            for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            # Persist the timing so later runs that reuse the signature file
            # can still report it.
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # NOTE(review): eval() executes whatever the file contains.
                # Kept for compatibility with existing timing files; if they
                # only ever hold a float, float() is the safe replacement.
                mh_time = eval(fin.read().replace("\n", ""))

        tcs_minhashes, _load_time = loadSignatures(sigfile)

    ptime_start = time.perf_counter()
    tcs = set(tcs_minhashes.keys())

    # budget B modification
    if B == 0:
        B = len(tcs)

    BASE = 0.5
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Index 0 is a placeholder that is sliced off before returning.
    prioritized_tcs = [0]

    # First TC

    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    tcs.discard(first_tc)
    del tcs_minhashes[first_tc]

    iteration, total = 0, float(len(tcs_minhashes))
    # select budget B: the placeholder at index 0 means B selected tests
    # correspond to B + 1 list entries. Checking before each round (rather
    # than after the append) also keeps B == 1 from selecting two tests.
    while tcs_minhashes and len(prioritized_tcs) < B + 1:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write("  Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            # The line ends in "\r", not "\n", so flush to make it visible.
            sys.stdout.flush()

        # Rebuild the bucket each time the remaining suite shrinks below the
        # current threshold, then lower the threshold by the factor BASE.
        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash), b, r,
                                     n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if not candidates:
            # Every remaining test looks similar to the selected set: reset
            # the cumulative signature and query again.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash), b,
                                         r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if not candidates:
                candidates = tcs_minhashes.keys()

        # Random fallback, replaced by the farthest candidate found.
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate in tcs_minhashes:
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                             tcs_minhashes[candidate])
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        # Fold the winner into the cumulative signature (element-wise min).
        selected_sig = tcs_minhashes[selected_tc]
        for i in range(n):
            if selected_sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = selected_sig[i]

        prioritized_tcs.append(selected_tc)

        tcs.discard(selected_tc)
        del tcs_minhashes[selected_tc]

    ptime = time.perf_counter() - ptime_start

    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    # NOTE(review): with one test per input line and the placeholder at index
    # 0, this slice keeps at most max_ts_size - 1 tests -- confirm whether
    # dropping the last one is intended.
    return mh_time, ptime, prioritized_tcs[1:max_ts_size]
コード例 #4
def fast_pw(input_file, wBoxFile, r, b, bbox=False, k=5, memory=False):
    """Select tests by dissimilarity until the suite's full coverage is reached.

    Starting from a random test case, each round selects the candidate with
    the largest ``lsh.jDistanceEstimate`` to the cumulative signature of the
    tests selected so far. After every selection the covered elements are
    subtracted from each test's coverage, and tests left with nothing new to
    cover are dropped. Selection stops once the accumulated coverage equals
    the union of all coverage loaded from ``wBoxFile``.

    INPUT
    (str)input_file: path of input file
    (str)wBoxFile: path handed to ``loadCoverage``; the result is used as a
        mapping from test-case id to a set of covered elements
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: True if BB prioritization
    (int)k: k-shingle size (for BB prioritization)
    (bool)memory: if True keep signature in memory and do not store them to file

    OUTPUT
    (float)mh_time: time spent generating the signatures (read back from the
        "*_sigtime.txt" file when the signature file already exists)
    (float)coverage load time: time spent in ``loadCoverage``
    (float)ptime: time spent loading signatures (file mode) and selecting
    (list)P: selected test cases, in selection order
    """
    # reduce() is a builtin only on Python 2; this import works on both.
    from functools import reduce

    n = r * b  # number of hash functions

    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    tC0 = time.perf_counter()
    C = loadCoverage(wBoxFile)
    tC1 = time.perf_counter()
    maxCov = reduce(lambda x, y: x | y, C.values())

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes)
            for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
        ptime_start = time.perf_counter()
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            # Persist the timing so later runs that reuse the signature file
            # can still report it.
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # NOTE(review): eval() executes whatever the file contains.
                # Kept for compatibility with existing timing files; if they
                # only ever hold a float, float() is the safe replacement.
                mh_time = eval(fin.read().replace("\n", ""))

        ptime_start = time.perf_counter()
        tcs_minhashes, _load_time = loadSignatures(sigfile)

    tcs = set(tcs_minhashes.keys())

    BASE = 0.5
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Index 0 is a placeholder that is sliced off before returning.
    prioritized_tcs = [0]

    # First TC

    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))

    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]

    prioritized_tcs.append(first_tc)

    # Subtract what is covered so far from every test; tests with nothing new
    # left (the selected one included) leave the pool.
    cov = C[first_tc]
    for tc in C:
        remaining = C[tc] - cov
        C[tc] = remaining
        if tc in tcs and not remaining:
            tcs.discard(tc)
            del tcs_minhashes[tc]

    iteration, total = 0, float(len(tcs_minhashes))
    # The second condition stops cleanly if the signatures run out before the
    # coverage goal is met (e.g. coverage entries without a signature),
    # instead of failing in random.choice on an empty sequence.
    while cov != maxCov and tcs_minhashes:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write("  Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            # The line ends in "\r", not "\n", so flush to make it visible.
            sys.stdout.flush()

        # Rebuild the bucket each time the remaining pool shrinks below the
        # current threshold, then lower the threshold by the factor BASE.
        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash), b, r,
                                     n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if not candidates:
            # Every remaining test looks similar to the selected set: reset
            # the cumulative signature and query again.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash), b,
                                         r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if not candidates:
                candidates = tcs_minhashes.keys()

        # Random fallback, replaced by the farthest candidate found.
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate in tcs_minhashes:
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(selected_tcs_minhash,
                                             tcs_minhashes[candidate])
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        # Fold the winner into the cumulative signature (element-wise min).
        selected_sig = tcs_minhashes[selected_tc]
        for i in range(n):
            if selected_sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = selected_sig[i]

        prioritized_tcs.append(selected_tc)

        cov = cov | C[selected_tc]
        for tc in C:
            remaining = C[tc] - cov
            C[tc] = remaining
            if tc in tcs and not remaining:
                tcs.discard(tc)
                del tcs_minhashes[tc]

    ptime = time.perf_counter() - ptime_start

    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    return mh_time, tC1 - tC0, ptime, prioritized_tcs[1:max_ts_size]