Example #1
def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold, ab_map):
    """
    Corrects cell barcodes.
    
    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of UMIs per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance between a cell barcode and a whitelist barcode.
        ab_map (OrderedDict): Tags in an ordered dict.

    Returns:
        final_results (dict): Same as input but with corrected cell barcodes.
        umis_per_cell (Counter): Updated UMI counts after correction.
        corrected_barcodes (int): How many cell barcodes have been corrected.
    """
    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')
    cell_barcodes = list(final_results.keys())
    n_barcodes = len(cell_barcodes)
    print('Finding reference candidates')
    print('Processing {:,} cell barcodes'.format(n_barcodes))

    #Run with one process
    true_to_false = find_true_to_false_map(
        barcode_tree=barcode_tree,
        cell_barcodes=cell_barcodes,
        whitelist=whitelist,
        collapsing_threshold=collapsing_threshold)
    (umis_per_cell, final_results,
     corrected_barcodes) = collapse_cells(true_to_false, umis_per_cell,
                                          final_results, ab_map)
    return (final_results, umis_per_cell, corrected_barcodes)
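Every example on this page drives the same small pybktree surface. A minimal, self-contained sketch of those calls (the integer items below are arbitrary stand-ins for real hashes):

# Minimal pybktree usage sketch: build a tree, add an item, query it.
import pybktree

tree = pybktree.BKTree(pybktree.hamming_distance, [0b1000, 0b1110, 0b0111])
tree.add(0b1111)             # items can also be added one at a time
print(tree.find(0b1100, 2))  # list of (distance, item) tuples, sorted by distance
print(sorted(tree))          # iterating the tree yields the stored items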
def main():
    distance = 2
    #hashFunction = imagehash.average_hash
    hashFunction = imagehash.dhash
    #hashFunction = imagehash.phash
    #hashFunction = imagehash.whash

    # Image file extensions, more can be added if necessary
    ext = ('.jpg', '.jpeg', '.gif', '.png')

    tree = pybktree.BKTree(hdist, [])
    potentialDuplicates = []

    for filename in os.listdir('.'):
        if filename.endswith(ext):
            # Calculates the hash and adds it to the tree
            with Image.open(filename) as img:
                hashval = hashFunction(img)

                # Checks for potentially duplicate images
                for pd in tree.find((hashval, filename), distance):
                    potentialDuplicates.append((filename, pd[1]))

                tree.add((hashval, filename))

    print(f'Found {len(potentialDuplicates)} potential duplicate images.')

    with open('foundchanges.txt', 'w') as out:
        csv_out = csv.writer(out)
        for row in potentialDuplicates:
            csv_out.writerow(row)
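The hdist distance function is defined elsewhere in that script. A plausible sketch, assuming the tree items are (hash, filename) tuples as above and that imagehash values support subtraction, which yields the number of differing bits (this hdist is a hypothetical reconstruction, not the original code):

def hdist(a, b):
    # items are (imagehash, filename) tuples, so compare only the hashes
    return a[0] - b[0]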
def get_image_simhash_bktree():
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        simhash_to_archive_id_set = db_interface.all_ad_creative_image_simhashes(
        )

    total_sim_hashes = len(simhash_to_archive_id_set)
    logging.info('Got %d image simhashes to process.', total_sim_hashes)

    # Create BKTree with dhash bit difference function as distance_function, used to find similar
    # hashes
    image_simhash_tree = pybktree.BKTree(get_num_bits_different)

    sim_hashes_added_to_tree = 0
    tree_construction_start_time = time.time()
    for sim_hash, archive_id_set in simhash_to_archive_id_set.items():
        # Add single entry in BK tree for simhash with lowest archive_id.
        image_simhash_tree.add(
            ArchiveIDAndSimHash(sim_hash=sim_hash,
                                archive_id=min(archive_id_set)))
        sim_hashes_added_to_tree += 1
        if sim_hashes_added_to_tree % 1000 == 0:
            logging.debug('Added %d/%d simhashes to BKtree.',
                          sim_hashes_added_to_tree, total_sim_hashes)
    logging.info('Constructed BKTree in %s seconds',
                 (time.time() - tree_construction_start_time))
    return image_simhash_tree
Example #4
    def __init__(self, V=None, model=None):
        """Constructor method to load external probMaker class, load dictionary and word counts."""
        self.vocab = self.load_vocab()
        self.counts = self.load_counts()
        self.trie = pybktree.BKTree(distance, self.vocab)
        self.error_df = self.load_error_df()
        self.pm = probMaker(self.error_df, self.counts)
        self.V = V
        self.model = model
    def fit(self, words_list):
        """
            Подгонка спеллера
        """

        checkpoint = time.time()
        self.words_list = pybktree.BKTree(editdistance.eval, words_list)
        print("Speller fitted in", time.time() - checkpoint)

        return self
def v_cut_detector(img_path, v_cut_path):
    img = cv2.imread(img_path)
    o_img = img.copy()
    key_text_loc = ()
    # has_v_cut = False
    dominate_color = get_dominant_color(img)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if dominate_color[0] > 127:
        res, bin_img = cv2.threshold(gray_img, 45, 255, cv2.THRESH_BINARY_INV)
    else:
        res, bin_img = cv2.threshold(gray_img, 45, 255, cv2.THRESH_BINARY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (24, 6))
    dilate = cv2.dilate(bin_img, kernel, iterations=5)
    close_img = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, kernel)
    res1, contours, h = cv2.findContours(close_img, cv2.RETR_TREE,
                                         cv2.CHAIN_APPROX_SIMPLE)
    object_region = {}
    max_area = np.dot(img.shape[0], img.shape[1])
    # print(max_area)
    key_list = []
    for contour_num in range(len(contours)):
        key = []
        cnt = contours[contour_num]
        area = cv2.contourArea(cnt)
        if area < max_area / 3000 or area > 3 * max_area / 4:
            continue
        x, y, w, h = cv2.boundingRect(cnt)  # convert the contour into an (x, y) anchor plus the rectangle's width and height
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 0), 2)  # draw the rectangle
        # print(x, y+h)
        cut_img = o_img[y:y + h, x:x + w]
        key.append(x)
        key.append(y + h)
        key_list.append(tuple(key))
        object_region[tuple(key)] = cut_img
        text = pytesseract.image_to_string(cut_img,
                                           lang='eng')  # 'chi_sim+eng'
        if 'v-cut' in text.lower():
            print(img_path)
            key_text_loc = tuple(key)
            print(key_text_loc)
            # has_v_cut = True
    bk_tree = pybktree.BKTree(manhattan_distance, key_list)
    if key_text_loc:
        v_cut_key = bk_tree.find(key_text_loc, 1000)
        print(v_cut_key)
        if len(v_cut_key) > 1:
            v_cut_img = object_region[v_cut_key[1][1]]
        else:
            print('no 1000' + img_path)
            v_cut_img = object_region[v_cut_key[0][1]]

        cv2.imwrite(v_cut_path, v_cut_img)

    cv2.imwrite('v-cut.png', img)
Example #7
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    ''' Find the mappings between true and false cell barcodes based
    on an edit distance threshold.
    Any cell barcode within the threshold to more than one whitelist
    barcode will be excluded'''

    true_to_false = collections.defaultdict(set)

    # Unexpected results with the cythonised hamming distance, so redefine it in Python here
    def hamming_distance(first, second):
        ''' returns the edit distance/hamming distance between
        its two arguments '''

        # We only want to define hamming distance for barcodes with the same length
        if len(first) != len(second):
            return np.inf

        dist = sum([not a == b for a, b in zip(first, second)])
        return dist

    whitelist = set([str(x) for x in whitelist])

    U.info('building bktree')
    tree2 = pybktree.BKTree(hamming_distance, whitelist)
    U.info('done building bktree')

    for cell_barcode in cell_barcodes:

        if cell_barcode in whitelist:
            # if the barcode is already whitelisted, no need to add
            continue

        # get all members of whitelist that are at distance 1
        candidates = [
            white_cell for d, white_cell in tree2.find(cell_barcode, threshold)
            if d > 0
        ]

        if len(candidates) == 0:
            # the cell doesn't match any whitelisted barcode,
            # hence we have to drop it
            # (as it cannot be associated with any frequent barcode)
            continue

        elif len(candidates) == 1:
            white_cell_str = candidates[0]
            true_to_false[white_cell_str].add(cell_barcode)

        else:
            # more than one whitelisted candidate:
            # we drop it as it's not uniquely assignable
            continue
    return true_to_false
Example #8
def benchmark_pybktree(element_counts, repeat_count=10):
    """
    Returns a list of triples:
      - elements
      - tree creation time in seconds
      - lookup time for one element in seconds
    """
    timings = []
    for element_count in tqdm.tqdm(element_counts):
        timing = [element_count]

        runtimes = []
        for i in range(repeat_count):
            elements = np.random.randint(np.iinfo(np.uint64).max,
                                         size=element_count,
                                         dtype=np.uint64)
            t0 = time.time()
            tree = pybktree.BKTree(pybktree.hamming_distance, elements)
            t1 = time.time()
            runtimes.append(t1 - t0)
        timing += [np.mean(runtimes), np.std(runtimes)]

        for distance in [0, 1, 2, 4, 8, 16]:
            runtimes = []
            for i in range(repeat_count):
                elements = np.random.randint(np.iinfo(np.uint64).max,
                                             size=element_count,
                                             dtype=np.uint64)
                tree = pybktree.BKTree(pybktree.hamming_distance, elements)
                t0 = time.time()
                results = tree.find(item=np.uint64(0), n=distance)
                t1 = time.time()
                runtimes.append(t1 - t0)
            timing += [distance, np.mean(runtimes), np.std(runtimes)]

        timings.append(timing)

    return timings
Example #9
def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold):
    """
    Corrects cell barcodes.
    
    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of number of umis per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance between a cell barcode and a whitelist barcode.

    Returns:
        final_results (dict): Same as input but with corrected cell barcodes.
        umis_per_cell (Counter): Counter of umis per cell after cell barcode correction.
        corrected_barcodes (int): How many cell barcodes have been corrected.
    """
    true_to_false = defaultdict(set)
    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')
    cell_barcodes = list(final_results.keys())
    print('Finding reference candidates')
    for i, cell_barcode in enumerate(cell_barcodes):
        if cell_barcode in whitelist:
            # if the barcode is already whitelisted, no need to add
            continue
        # get all members of whitelist that are within collapsing_threshold of the cell barcode
        candidates = [
            white_cell for d, white_cell in barcode_tree.find(
                cell_barcode, collapsing_threshold) if d > 0
        ]

        if len(candidates) == 0:
            # the cell doesn't match any whitelisted barcode,
            # hence we have to drop it
            # (as it cannot be associated with any frequent barcode)
            continue
        elif len(candidates) == 1:
            white_cell_str = candidates[0]
            true_to_false[white_cell_str].add(cell_barcode)
        else:
            # more than one whitelisted candidate:
            # we drop it as it's not uniquely assignable
            continue
    (umis_per_cell, final_results,
     corrected_barcodes) = collapse_cells(true_to_false=true_to_false,
                                          umis_per_cell=umis_per_cell,
                                          final_results=final_results)
    return (final_results, umis_per_cell, corrected_barcodes)
Example #10
    def load(self, idx_dir, force=False):
        if self.loaded and not force:
            return

        self.idx_dir = idx_dir
        self.file_path = os.path.join(idx_dir, 'idx.pk')
        if os.path.exists(self.file_path):
            with open(self.file_path, 'rb') as file:
                self.tree = dill.load(file)
        else:
            self.tree = pybktree.BKTree(
                lambda x, y: editdistance.eval(x['sequence'], y['sequence']))

        self.loaded = True
        self.tree.distance_func = lambda x, y: editdistance.eval(
            x['sequence'], y['sequence'])
Example #11
def main():
    dhash_json = pathlib.Path(os.getenv('DHASH_FILE')).resolve()
    dhash_tree = pybktree.BKTree(diff)
    with dhash_json.open() as f:
        data = json.load(f)
    for path, dhash in data.items():
        dhash_tree.add(Image(path, dhash))

    match_distance = int(os.getenv('MATCH_DISTANCE', 5))
    for image in dhash_tree:
        matches = dhash_tree.find(image, match_distance)
        if len(matches) > 1:
            print(image.path)
            for match in matches:
                print(f'{match[0]} {match[1].path}')
            input()
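Image and diff are defined elsewhere in that script. A plausible sketch, assuming the JSON file maps image paths to integer dhashes (both names are hypothetical reconstructions):

import collections

# record type stored in the tree, and a distance that counts differing bits
Image = collections.namedtuple('Image', ['path', 'dhash'])

def diff(a, b):
    return bin(a.dhash ^ b.dhash).count('1')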
def pare_matches_and_download(thumbnail_urls, person):
    urls = set()
    directory = '../../common/images/' + person
    if not os.path.exists(directory) and len(thumbnail_urls) > 10:
        # Make sure all the matches are of the same person
        try:
            identifier = face.Identifier(threshold=1.0)
            images = map(identifier.download_image, thumbnail_urls)
            urls_and_embeddings = identifier.detect_encode_all(
                images, thumbnail_urls, True)
            anchor_embedding = urls_and_embeddings[0].embedding
            # Assume first image is of the right person and check other images are of the same person
            for other in urls_and_embeddings:
                is_match, distance = identifier.compare_embedding(
                    anchor_embedding, other.embedding)
                # print('dist: {} between {} and {}'.format(distance,urls_and_embeddings[0].url, other.url))
                if is_match:
                    urls.add(other.url)
            del identifier
        except Exception as e:
            print(e)

        # Make sure there are no duplicate images
        image_hashes = [HASH_URL(url_to_img_hash(url), url) for url in urls]
        tree = pybktree.BKTree(image_distance, image_hashes)
        # this makes images saved in order of similarity so we can spot duplicates easier
        sorted_image_hashes = sorted(tree)
        to_discard = []
        urls_to_keep = set()
        for image_hash in sorted_image_hashes:
            if image_hash not in to_discard:
                # gets pictures within a hamming distance of 3
                matches = tree.find(image_hash, 3)
                for match in matches:
                    if match[1].url != image_hash.url:
                        to_discard.append(match[1])
                urls_to_keep.add(image_hash.url)

        # Download the images
        download_urls(person, list(urls_to_keep))

    # Update counter
    try:
        increment()
        timer.update(int(counter.value))
    except Exception as e:
        print(e)
def find_threshold(db, threshold=1):
    dups = []
    # Build a tree
    cursor = db.find()
    tree = pybktree.BKTree(pybktree.hamming_distance)

    cprint('Finding fuzzy duplicates, it might take a while...')
    cnt = 0
    for document in db.find():
        int_hash = int(document['hash'], 16)
        tree.add(int_hash)
        cnt = cnt + 1

    deduplicated = set()

    scanned = 0
    for document in db.find():
        cprint("\r%d%%" % (scanned * 100 / (cnt - 1)), end='')
        scanned = scanned + 1
        if document['hash'] in deduplicated:
            continue
        deduplicated.add(document['hash'])
        hash_len = len(document['hash'])
        int_hash = int(document['hash'], 16)
        similar = tree.find(int_hash, threshold)
        if len(similar) > 1:
            similar = list(set(similar))

            similars = []
            for (distance, item_hash) in similar:
                item_hash = format(item_hash, '0' + str(hash_len) + 'x')
                if distance > 0:
                    deduplicated.add(item_hash)

                for item in db.find({'hash': item_hash}):
                    item['file_name'] = item['_id']
                    similars.append(item)
            if len(similars) > 0:
                dups.append(
                    {
                        '_id': document['hash'],
                        'total': len(similars),
                        'items': similars
                    }
                )

    return dups
def build_dict_tree(dict_path):
    hash_list = []
    chr_name = []
    bk_tree = None
    for f in get_all_files(dict_path):
        f_path = f
        if f_path[-3:] == 'png':
            chr_image = Image.open(f_path)
            chr_image = chr_image.convert('L')
            represent_hash = dhash.dhash_int(chr_image)
            if not represent_hash:
                continue
            hash_list.append(represent_hash)
            chr_name.append(f_path.split('/')[-1][:-4])

    # build the tree once, after all hashes have been collected
    bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
    return chr_name, hash_list, bk_tree
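get_all_files is not part of the snippet above. A minimal sketch, assuming it simply walks dict_path and yields full file paths (a hypothetical helper, not the original):

import os

def get_all_files(dict_path):
    # yield the full path of every file below dict_path
    for root, _dirs, files in os.walk(dict_path):
        for name in files:
            yield os.path.join(root, name)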
def main():
    averageTime = []
    for i in range(55):
        start = time.time()

        distance = i // 5
        #hashFunction = imagehash.average_hash
        #hashFunction = imagehash.dhash
        #hashFunction = imagehash.phash
        hashFunction = imagehash.whash

        # Image file extensions, more can be added if necessary
        ext = ('.jpg', '.jpeg', '.gif', '.png')

        tree = pybktree.BKTree(hdist, [])
        potentialDuplicates = []

        for filename in os.listdir('.'):
            if filename.endswith(ext):
                # Calculates the hash and adds it to the tree
                with Image.open(filename) as img:
                    hashval = hashFunction(img)

                    # Checks for potentially duplicate images
                    for pd in tree.find((hashval, filename), distance):
                        potentialDuplicates.append((filename, pd[1][1]))
                    tree.add((hashval, filename))

        print(i)

        f = 'foundchanges' + str(i) + '.txt'
        with open(f, 'w') as out:
            writer = csv.writer(out, delimiter=',')
            writer.writerows(potentialDuplicates)

        end = time.time()
        averageTime.append((i, end - start))

    with open('whash.txt', 'w') as out:
        writer = csv.writer(out, delimiter=',')
        writer.writerows(averageTime)
Example #16
def get_most_common_true_sequences(read_counter, topN:int):
    """
    get the most abundant sequences, but also make sure that shadows dont sneak in.
    e.g. a VERY abundant true sequence might be ~100000reads, and 1% (1000)
    will result in shadows. these shadows might end up in the top100 itself
    """
    assert isinstance(read_counter, collections.Counter)
    from rnaseqtools.seqerrors.CB_errors import hamming_distance
    bktree = pybktree.BKTree(hamming_distance)
    DISTANCE = 2

    most_common = set()
    for seq, freq in tqdm.tqdm(read_counter.most_common(topN), desc='finding most common seqs'):
        # if the sequence is close to an already accepted true seq
        if len(bktree.find(seq, DISTANCE)) > 0:
            continue
        else:
            bktree.add(seq)
            most_common.add(seq)

    return most_common
Example #17
def image_grouping(images: Collection['Image'], sensitivity: Sensitivity) \
    -> Generator[Tuple[GroupIndex, Group], None, None]:
    '''Find similar images and group them. Yield a tuple with the group
    index and image group when a new group has been added or an existing
    one has been modified (a new image has been added to the group)

    :param images:      images to process,
    :param sensitivity: maximal difference between hashes of 2 images
                        when they are considered similar,
    :yield:             tuple with the group index and list with grouped
                        similar images,
    :raise TypeError:   any of the hashes is not integer
    '''

    image_groups: List[Group] = []

    try:
        bktree = pybktree.BKTree(Image.hamming, images)
    except TypeError:
        raise TypeError('Hashes must be integers')

    checked: Dict['Image', GroupIndex] = dict()

    for image in images:
        distance, closest = _closest(bktree, image, sensitivity)
        if closest is None:
            continue

        # 'closest' goes to the same group as 'image'
        if image in checked and closest not in checked:
            yield _add_img_to_existing_group(image, closest, checked,
                                             image_groups)
        # and vice versa
        if image not in checked and closest in checked:
            yield _add_img_to_existing_group(closest, image, checked,
                                             image_groups)
        # create a new group with 'image' and 'closest' in it
        if image not in checked and closest not in checked:
            yield _add_new_group(image, closest, checked, image_groups,
                                 distance)
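The _closest helper is not included in this snippet. A sketch of what it might look like, assuming it returns the nearest image other than the query itself within sensitivity, or (None, None) when nothing qualifies (a hypothetical reconstruction):

def _closest(bktree, image, sensitivity):
    # find() results are sorted by distance, so the first non-identical hit is the closest
    for distance, match in bktree.find(image, sensitivity):
        if match is not image:
            return distance, match
    return None, None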
Example #18
def get_multitree(voc_fd, lang_id):
    """Get a multitree for the given language."""
    # Get a dict() where each key is a letter and each value
    # is a BK tree of the words that start with that letter
    dst = distance.Distance()
    ed = EditDistanceWrapper(lang_id, dst, phonemise)
    distractors = {}
    for line in voc_fd.readlines():
        (f, w) = line.strip("\n").split("\t")
        first_letter = w[0].lower()
        if first_letter not in distractors:
            distractors[first_letter] = []
        distractors[first_letter].append(w.lower())

    distractors_tree = {}
    for letter in distractors:
        distractors_tree[letter] = pybktree.BKTree(
            ed.edit_distance,
            distractors[letter],
        )

    return distractors_tree
Example #19
def dedupe_images(matched_urls: List[str], person: str) \
        -> Tuple[List[str], str]:

    image_hashes = [IMAGE_HASH(url_to_img_hash(url), url)
                    for url in matched_urls]
    tree = pybktree.BKTree(image_distance, image_hashes)
    # this makes images saved in order of similarity so we can spot duplicates
    # easier
    sorted_image_hashes = sorted(tree)
    to_discard: List[str] = []
    urls_to_keep = set()
    for image_hash in sorted_image_hashes:
        if image_hash not in to_discard:
            # gets pictures within a hamming distance of 5
            matches = tree.find(image_hash, 5)
            for match in matches:
                if match[1].url != image_hash.url:
                    to_discard.append(match[1])
            urls_to_keep.add(image_hash.url)
    # Update counter
    increment()
    TIMER.update(int(COUNTER.value))
    return list(urls_to_keep), person
Example #20
def create_PUG_umi_based(cb_records:list, ec_dict):

    umi_dict = collections.defaultdict(list)
    for record in cb_records:
        umi_dict[record.UMI].append(record)

    # a BKTree of all UMIs in that cell
    tree = pybktree.BKTree(hamming_distance, list(umi_dict.keys()))

    nodes = set()
    edges = []
    for record in cb_records:

        nodes.add(record)
        # any sequence neighbours?
        for distance, umi_neighbor in tree.find(record.UMI, 1):
            # this particular UMI might have multiple records:
            for neighbor_record in umi_dict[umi_neighbor]:
                if record == neighbor_record:
                    continue  # due to d==0 this can be the same record
                # check EC overlap
                T1 = set(ec_dict[record.EC])
                T2 = set(ec_dict[neighbor_record.EC])
                e1 = (record, neighbor_record)
                e2 = (neighbor_record, record)
                if len(T1 & T2) > 0:
                    if record.COUNT > 2 * neighbor_record.COUNT - 1:
                        edges.append(e1)
                    elif neighbor_record.COUNT > 2 * record.COUNT - 1:
                        edges.append(e2)
                    else:
                        edges.append(e1)
                        edges.append(e2)
    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G
Example #21
 def find_su_number(self):
     self.get_iso_object()
     file = os.listdir('su_RAM/')
     hash_list = []
     su_list = []
     img_name = []
     for img_f in file:
         img_path = 'su_RAM/' + img_f
         if img_f[-3:] == 'png':
             # print(img_path)
             sub_iso_img = Image.open(img_path)
             represent_hash = dhash.dhash_int(sub_iso_img)
             # os.remove(img_path)
             if not represent_hash:
                 continue
             # print(represent_hash)
             hash_list.append(represent_hash)
             img_name.append(img_path)
     bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
     for hash_code in hash_list:
         find_result = bk_tree.find(hash_code, 3)
         similar_number = len(find_result)
         su_list.append(similar_number)
         # print(find_result)
     # os.removedirs('su_RAM/')
     su_number = max(su_list)
     su_index = [i for i, v in enumerate(su_list) if v == su_number]
     # bin_img, contours, hierarchy = cv2.findContours(self.binary_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
     # for i in su_index:
     #     cv2.drawContours(self.o_image, contours, i, (0, 255, 255), thickness=-1)
     # print(su_index)
     for i in su_index:
         print(img_name[i])
     print(img_name)
     # cv2.imwrite('su_RAM/su_image.png', self.o_image)
     return su_number
Example #22
# count_threads=2
# part_data=len(data)//count_threads
# threads = []
# for i in range(1,count_threads+1):
#     print(i*part_data)
#     if i*part_data-part_data<0:
#         start=0
#     else:
#         start=i*part_data-part_data
#
#     end=i*part_data
#     t = threading.Thread(target=worker, args=(start,end,data,hash))
#     threads.append(t)
#     t.start()
# print(list(data))
tree = pybktree.BKTree(fuzzy_distance, np_data)

print(tree.find(last_foto, 10))

print(tree)

# for foto in data:
# tree.add(convert_base(last_foto,from_base=16,to_base=16))
# print(foto)
# print(hash)
# print(imagehash.hex_to_hash(foto)-hash)
# hash=imagehash.hex_to_hash(foto)

print("--- %s seconds main ---" % (time.time() - start_time))

name = "not_aneta"
Example #23
class PHashStore:
    tree = pybktree.BKTree(pybktree.hamming_distance, [])

    def add(self, phash):
        if not self.exists(phash):
            self.tree.add(phash)

    def find(self, phash, distance=15):
        return self.tree.find(phash, distance)

    def exists(self, phash):
        return len(self.find(phash, 0)) > 0

    def load(self, io):
        data = json.load(io)
        for r in data:
            self.add(r)

    def dump(self):
        return json.dumps(sorted(self.tree))

    def phash_for(self, image, algorithm='dhash'):
        if algorithm == 'phash':
            return self.phash(image)
        else:
            return self.dhash(image)

    def phash(self, image):
        r = self.__ndarray_for(image, size="32x32!").astype(np.float64)
        h = fft.dctn(r, norm="ortho")[0:8, 0:8]
        avg = np.average(h.reshape(64, )[1:])
        mask = (h <= avg)
        h = mask.reshape(64, ).dot(2**np.arange(mask.size)[::-1])
        return int(h)

    def dhash(self, image):
        r = self.__ndarray_for(image)

        h = 0
        try:
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[i][j] >= r[i][j - 1] else 0)
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[j][i] >= r[j - 1][i] else 0)
            return h
        except IndexError as e:
            pdb.set_trace()
            return -1
        except ValueError as e:
            pdb.set_trace()
            return -2

    def __ndarray_for(self, image, size="9x9!"):
        image.alpha_channel = False
        image.format = 'gray'
        image.type = 'grayscale'
        image.depth = 8
        image.transform(resize=size)
        result = np.asarray(bytearray(image.make_blob()),
                            dtype=np.uint8).reshape(image.size)
        image.close()
        return result

    def hamming2(self, s1, s2):
        assert len(s1) == len(s2)
        return sum(c1 != c2 for c1, c2 in zip(s1, s2))
Example #24
def makeBkTree(func, addr="../data/DICT.txt"):
    return pybktree.BKTree(func, __readFilesAsList(addr))
Example #25
def makeBkTree(func, addr):
    return pybktree.BKTree(func, __readFilesAsList(addr))
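__readFilesAsList is defined elsewhere in both variants above. A plausible sketch, assuming the dictionary file holds one word per line (a hypothetical helper):

def __readFilesAsList(addr):
    # read the word list, skipping empty lines
    with open(addr, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]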
Example #26
            else:
                word2wiki_entity[word] = [entity]
    entity_word_set = set(word2wiki_entity)

    print("entity_word_set DONE")

    entity_totally_match = all_entity & entity_in_wiki      # entity totally matched in wiki

    with open("enwiki_match.txt", "w") as f:         # add totally matched entity to file
        for entity in entity_totally_match:
            num = entity2vec[entity]
            f.write("{},,,{},,,{},,,Total_Match\n".format(entity.lower(), entity, num))

    Levenshtein_tree = pybktree.BKTree(distance, entity_in_wiki)
    print("Levenshtein_bktree Done")

    Word_tree = pybktree.BKTree(distance, entity_word_set)
    print("Word_tree Done")

    for entity_to_be_match in (all_entity - entity_totally_match):
        candidates = []
        with open("enwiki_match.txt", "a") as f:
            for long_entity in entity_totally_match:
                if partof(entity_to_be_match, long_entity):
                    candidates.append(long_entity)
            if len(candidates)!=0:
                entity_matched = process.extractOne(entity_to_be_match, candidates)
                num = entity2vec[entity_matched[0]]
                f.write("{},,,{},,,{},,,Abbreviation\n".format(entity_to_be_match.lower(), entity_matched, num))
Example #27
import pybktree
import pandas as pd
import time
import geopandas
from shapely.geometry import Point

t1 = time.time()

df = pd.read_csv("/home/bigdata/Downloads/Data/miniNSPL.csv")
tree = pybktree.BKTree(pybktree.hamming_distance, [0, 4, 5, 14,65,4,76,4,35,63,23])
print(sorted(tree))

print(df.head())

print(df.shape)

Southampton = df[df.pcds.str.contains("SO15")]

print(Southampton.shape)


t2 = time.time()
print(t2 - t1)
Example #28
else:
    threshold = int(input(
        "Enter threshold (e.g. '40' means the dhashes are "
        "40% different and 60% similar): "))

hashDict = {}
hashList = []
files = os.listdir(filePath)
for file in files:
    image = Image.open(filePath + '/' + file)
    imageDhash = dhash.dhash_int(image)
    hashDict[imageDhash] = file
    hashList.append(imageDhash)

f = csv.writer(open('dhashNearMatches.csv', 'w'))
f.writerow(['percentage'] + ['dhash1'] + ['dhash2'])
completeNearMatches = []
tree = pybktree.BKTree(pybktree.hamming_distance, hashList)
for hash in hashList:
    nearMatches = tree.find(hash, threshold)
    for nearMatch in nearMatches:
        if hashDict[hash] != hashDict[nearMatch[1]]:
            print(nearMatch[0], hashDict[hash], hashDict[nearMatch[1]])
            hashTuple = (nearMatch[0], hashDict[hash], hashDict[nearMatch[1]])
            hashTupleReversed = (nearMatch[0], hashDict[nearMatch[1]],
                                 hashDict[hash])
            if hashTupleReversed not in completeNearMatches:
                completeNearMatches.append(hashTuple)
for hashTuple in completeNearMatches:
    f.writerow([hashTuple[0]] + [hashTuple[1]] + [hashTuple[2]])
Example #29
    prob_factor = 1 / sum(occurrences)
    for pred in predecessors:
        successor[pred] *= prob_factor
# sort inverse lookup
for successor in words_inverse.keys():
    pred_and_probs = words_inverse[successor].items()
    pred_and_probs = sorted(pred_and_probs, key=lambda x: x[1], reverse=True)
    words_inverse[successor] = dict()
    for (pred, probability) in pred_and_probs:
        words_inverse[successor][pred] = probability

print("Normalizing word frequencies...")
for word in words.values():
    successors = word.keys()
    occurrences = word.values()
    prob_factor = 1 / sum(occurrences)
    for successor in successors:
        word[successor] *= prob_factor

print("Building BKTree...")
tree = pybktree.BKTree(editdistance.eval)
for word in words:
    tree.add(word)

print("Dumping to file...")
model = dict()
model['words'] = words
model['words_inverse'] = words_inverse
model['tree'] = tree

dill.dump(model, open(f"{config.MODEL}/model.dill", 'wb'))
Example #30
 def __init__(self, match_threshold=lambda s: 1 + 0.3 * len(s)):
     self.match_tree = pybktree.BKTree(
         jellyfish.damerau_levenshtein_distance)
     self.match_map = {}
     self.max_query_len = 0
     self.get_match_threshold = match_threshold