Example #1
def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold, ab_map):
    """
    Corrects cell barcodes.
    
    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of UMIs per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance between a cell barcode and a whitelist barcode.
        ab_map (OrderedDict): Tags in an ordered dict.

    Returns:
        final_results (dict): Same as input but with corrected cell barcodes.
        umis_per_cell (Counter): Updated UMI counts after correction.
        corrected_barcodes (int): How many cell barcodes have been corrected.
    """
    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')
    cell_barcodes = list(final_results.keys())
    n_barcodes = len(cell_barcodes)
    print('Finding reference candidates')
    print('Processing {:,} cell barcodes'.format(n_barcodes))

    #Run with one process
    true_to_false = find_true_to_false_map(
        barcode_tree=barcode_tree,
        cell_barcodes=cell_barcodes,
        whitelist=whitelist,
        collapsing_threshold=collapsing_threshold)
    (umis_per_cell, final_results,
     corrected_barcodes) = collapse_cells(true_to_false, umis_per_cell,
                                          final_results, ab_map)
    return (final_results, umis_per_cell, corrected_barcodes)
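Every example on this page drives the same small pybktree surface. A minimal, self-contained sketch of those calls (the integer items below are arbitrary stand-ins for real hashes):

# Minimal pybktree usage sketch: build a tree, add an item, query it.
import pybktree

tree = pybktree.BKTree(pybktree.hamming_distance, [0b1000, 0b1110, 0b0111])
tree.add(0b1111)             # items can also be added one at a time
print(tree.find(0b1100, 2))  # list of (distance, item) tuples, sorted by distance
print(sorted(tree))          # iterating the tree yields the stored items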
def main():
    distance = 2
    #hashFunction = imagehash.average_hash
    hashFunction = imagehash.dhash
    #hashFunction = imagehash.phash
    #hashFunction = imagehash.whash

    # Image file extensions, more can be added if necessary
    ext = ('.jpg', '.jpeg', '.gif', '.png')

    tree = pybktree.BKTree(hdist, [])
    potentialDuplicates = []

    for filename in os.listdir('.'):
        if filename.endswith(ext):
            # Calculates the hash and adds it to the tree
            with Image.open(filename) as img:
                hashval = hashFunction(img)

                # Checks for potentially duplicate images
                for pd in tree.find((hashval, filename), distance):
                    potentialDuplicates.append((filename, pd[1]))

                tree.add((hashval, filename))

    print(f'Found {len(potentialDuplicates)} potential duplicate images.')

    with open('foundchanges.txt', 'w') as out:
        csv_out = csv.writer(out)
        for row in potentialDuplicates:
            csv_out.writerow(row)
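The hdist distance function is defined elsewhere in that script. A plausible sketch, assuming the tree items are (hash, filename) tuples as above and that imagehash values support subtraction, which yields the number of differing bits (this hdist is a hypothetical reconstruction, not the original code):

def hdist(a, b):
    # items are (imagehash, filename) tuples, so compare only the hashes
    return a[0] - b[0]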
def get_image_simhash_bktree():
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        simhash_to_archive_id_set = db_interface.all_ad_creative_image_simhashes(
        )

    total_sim_hashes = len(simhash_to_archive_id_set)
    logging.info('Got %d image simhashes to process.', total_sim_hashes)

    # Create BKTree with dhash bit difference function as distance_function, used to find similar
    # hashes
    image_simhash_tree = pybktree.BKTree(get_num_bits_different)

    sim_hashes_added_to_tree = 0
    tree_construction_start_time = time.time()
    for sim_hash, archive_id_set in simhash_to_archive_id_set.items():
        # Add single entry in BK tree for simhash with lowest archive_id.
        image_simhash_tree.add(
            ArchiveIDAndSimHash(sim_hash=sim_hash,
                                archive_id=min(archive_id_set)))
        sim_hashes_added_to_tree += 1
        if sim_hashes_added_to_tree % 1000 == 0:
            logging.debug('Added %d/%d simhashes to BKtree.',
                          sim_hashes_added_to_tree, total_sim_hashes)
    logging.info('Constructed BKTree in %s seconds',
                 (time.time() - tree_construction_start_time))
    return image_simhash_tree
Example #4
    def __init__(self, V=None, model=None):
        """Constructor method to load external probMaker class, load dictionary and word counts."""
        self.vocab = self.load_vocab()
        self.counts = self.load_counts()
        self.trie = pybktree.BKTree(distance, self.vocab)
        self.error_df = self.load_error_df()
        self.pm = probMaker(self.error_df, self.counts)
        self.V = V
        self.model = model
    def fit(self, words_list):
        """
            Подгонка спеллера
        """

        checkpoint = time.time()
        self.words_list = pybktree.BKTree(editdistance.eval, words_list)
        print("Speller fitted in", time.time() - checkpoint)

        return self
def v_cut_detector(img_path, v_cut_path):
    img = cv2.imread(img_path)
    o_img = img.copy()
    key_text_loc = ()
    # has_v_cut = False
    dominate_color = get_dominant_color(img)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if dominate_color[0] > 127:
        res, bin_img = cv2.threshold(gray_img, 45, 255, cv2.THRESH_BINARY_INV)
    else:
        res, bin_img = cv2.threshold(gray_img, 45, 255, cv2.THRESH_BINARY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (24, 6))
    dilate = cv2.dilate(bin_img, kernel, iterations=5)
    close_img = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, kernel)
    res1, contours, h = cv2.findContours(close_img, cv2.RETR_TREE,
                                         cv2.CHAIN_APPROX_SIMPLE)
    object_region = {}
    max_area = np.dot(img.shape[0], img.shape[1])
    # print(max_area)
    key_list = []
    for contour_num in range(len(contours)):
        key = []
        cnt = contours[contour_num]
        area = cv2.contourArea(cnt)
        if area < max_area / 3000 or area > 3 * max_area / 4:
            continue
        x, y, w, h = cv2.boundingRect(cnt)  # convert the contour into an (x, y) anchor plus the rectangle's width and height
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 0), 2)  # draw the rectangle
        # print(x, y+h)
        cut_img = o_img[y:y + h, x:x + w]
        key.append(x)
        key.append(y + h)
        key_list.append(tuple(key))
        object_region[tuple(key)] = cut_img
        text = pytesseract.image_to_string(cut_img,
                                           lang='eng')  # 'chi_sim+eng'
        if 'v-cut' in text.lower():
            print(img_path)
            key_text_loc = tuple(key)
            print(key_text_loc)
            # has_v_cut = True
    bk_tree = pybktree.BKTree(manhattan_distance, key_list)
    if key_text_loc:
        v_cut_key = bk_tree.find(key_text_loc, 1000)
        print(v_cut_key)
        if len(v_cut_key) > 1:
            v_cut_img = object_region[v_cut_key[1][1]]
        else:
            print('no 1000' + img_path)
            v_cut_img = object_region[v_cut_key[0][1]]

        cv2.imwrite(v_cut_path, v_cut_img)

    cv2.imwrite('v-cut.png', img)
Example #7
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    ''' Find the mappings between true and false cell barcodes based
    on an edit distance threshold.
    Any cell barcode within the threshold to more than one whitelist
    barcode will be excluded'''

    true_to_false = collections.defaultdict(set)

    # Unexpected results with the cythonised hamming distance, so redefine it in Python here
    def hamming_distance(first, second):
        ''' returns the edit distance/hamming distance between
        its two arguments '''

        # We only want to define hamming distance for barcodes with the same length
        if len(first) != len(second):
            return np.inf

        dist = sum([not a == b for a, b in zip(first, second)])
        return dist

    whitelist = set([str(x) for x in whitelist])

    U.info('building bktree')
    tree2 = pybktree.BKTree(hamming_distance, whitelist)
    U.info('done building bktree')

    for cell_barcode in cell_barcodes:

        if cell_barcode in whitelist:
            # if the barcode is already whitelisted, no need to add
            continue

        # get all members of whitelist that are at distance 1
        candidates = [
            white_cell for d, white_cell in tree2.find(cell_barcode, threshold)
            if d > 0
        ]

        if len(candidates) == 0:
            # the cell doesn't match any whitelisted barcode,
            # hence we have to drop it
            # (as it cannot be associated with any frequent barcode)
            continue

        elif len(candidates) == 1:
            white_cell_str = candidates[0]
            true_to_false[white_cell_str].add(cell_barcode)

        else:
            # more than one whitelisted candidate:
            # we drop it as it's not uniquely assignable
            continue
    return true_to_false
Example #8
def benchmark_pybktree(element_counts, repeat_count=10):
    """
    Returns a list of triples:
      - elements
      - tree creation time in seconds
      - lookup time for one element in seconds
    """
    timings = []
    for element_count in tqdm.tqdm(element_counts):
        timing = [element_count]

        runtimes = []
        for i in range(repeat_count):
            elements = np.random.randint(np.iinfo(np.uint64).max,
                                         size=element_count,
                                         dtype=np.uint64)
            t0 = time.time()
            tree = pybktree.BKTree(pybktree.hamming_distance, elements)
            t1 = time.time()
            runtimes.append(t1 - t0)
        timing += [np.mean(runtimes), np.std(runtimes)]

        for distance in [0, 1, 2, 4, 8, 16]:
            runtimes = []
            for i in range(repeat_count):
                elements = np.random.randint(np.iinfo(np.uint64).max,
                                             size=element_count,
                                             dtype=np.uint64)
                tree = pybktree.BKTree(pybktree.hamming_distance, elements)
                t0 = time.time()
                results = tree.find(item=np.uint64(0), n=distance)
                t1 = time.time()
                runtimes.append(t1 - t0)
            timing += [distance, np.mean(runtimes), np.std(runtimes)]

        timings.append(timing)

    return timings
Example #9
def correct_cells_whitelist(final_results, umis_per_cell, whitelist,
                            collapsing_threshold):
    """
    Corrects cell barcodes.
    
    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of number of umis per cell.
        whitelist (set): The whitelist reference given by the user.
        collapsing_threshold (int): Max distance between a cell barcode and a whitelist barcode.

    Returns:
        final_results (dict): Same as input but with corrected cell barcodes.
        umis_per_cell (Counter): Counter of umis per cell after cell barcode correction.
        corrected_barcodes (int): How many cell barcodes have been corrected.
    """
    true_to_false = defaultdict(set)
    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
    print('Generated barcode tree from whitelist')
    cell_barcodes = list(final_results.keys())
    print('Finding reference candidates')
    for i, cell_barcode in enumerate(cell_barcodes):
        if cell_barcode in whitelist:
            # if the barcode is already whitelisted, no need to add
            continue
        # get all members of whitelist that are within collapsing_threshold of the cell barcode
        candidates = [
            white_cell for d, white_cell in barcode_tree.find(
                cell_barcode, collapsing_threshold) if d > 0
        ]

        if len(candidates) == 0:
            # the cell doesn't match any whitelisted barcode,
            # hence we have to drop it
            # (as it cannot be associated with any frequent barcode)
            continue
        elif len(candidates) == 1:
            white_cell_str = candidates[0]
            true_to_false[white_cell_str].add(cell_barcode)
        else:
            # more than one whitelisted candidate:
            # we drop it as it's not uniquely assignable
            continue
    (umis_per_cell, final_results,
     corrected_barcodes) = collapse_cells(true_to_false=true_to_false,
                                          umis_per_cell=umis_per_cell,
                                          final_results=final_results)
    return (final_results, umis_per_cell, corrected_barcodes)
Example #10
    def load(self, idx_dir, force=False):
        if self.loaded and not force:
            return

        self.idx_dir = idx_dir
        self.file_path = os.path.join(idx_dir, 'idx.pk')
        if os.path.exists(self.file_path):
            with open(self.file_path, 'rb') as file:
                self.tree = dill.load(file)
        else:
            self.tree = pybktree.BKTree(
                lambda x, y: editdistance.eval(x['sequence'], y['sequence']))

        self.loaded = True
        self.tree.distance_func = lambda x, y: editdistance.eval(
            x['sequence'], y['sequence'])
Example #11
def main():
    dhash_json = pathlib.Path(os.getenv('DHASH_FILE')).resolve()
    dhash_tree = pybktree.BKTree(diff)
    with dhash_json.open() as f:
        data = json.load(f)
    for path, dhash in data.items():
        dhash_tree.add(Image(path, dhash))

    match_distance = int(os.getenv('MATCH_DISTANCE', 5))
    for image in dhash_tree:
        matches = dhash_tree.find(image, match_distance)
        if len(matches) > 1:
            print(image.path)
            for match in matches:
                print(f'{match[0]} {match[1].path}')
            input()
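Image and diff are defined elsewhere in that script. A plausible sketch, assuming the JSON file maps image paths to integer dhashes (both names are hypothetical reconstructions):

import collections

# record type stored in the tree, and a distance that counts differing bits
Image = collections.namedtuple('Image', ['path', 'dhash'])

def diff(a, b):
    return bin(a.dhash ^ b.dhash).count('1')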
def pare_matches_and_download(thumbnail_urls, person):
    urls = set()
    directory = '../../common/images/' + person
    if not os.path.exists(directory) and len(thumbnail_urls) > 10:
        # Make sure all the matches are of the same person
        try:
            identifier = face.Identifier(threshold=1.0)
            images = map(identifier.download_image, thumbnail_urls)
            urls_and_embeddings = identifier.detect_encode_all(
                images, thumbnail_urls, True)
            anchor_embedding = urls_and_embeddings[0].embedding
            # Assume first image is of the right person and check other images are of the same person
            for other in urls_and_embeddings:
                is_match, distance = identifier.compare_embedding(
                    anchor_embedding, other.embedding)
                # print('dist: {} between {} and {}'.format(distance,urls_and_embeddings[0].url, other.url))
                if is_match:
                    urls.add(other.url)
            del identifier
        except Exception as e:
            print(e)

        # Make sure there are no duplicate images
        image_hashes = [HASH_URL(url_to_img_hash(url), url) for url in urls]
        tree = pybktree.BKTree(image_distance, image_hashes)
        # this makes images saved in order of similarity so we can spot duplicates easier
        sorted_image_hashes = sorted(tree)
        to_discard = []
        urls_to_keep = set()
        for image_hash in sorted_image_hashes:
            if image_hash not in to_discard:
                # gets pictures within a hamming distance of 3
                matches = tree.find(image_hash, 3)
                for match in matches:
                    if match[1].url != image_hash.url:
                        to_discard.append(match[1])
                urls_to_keep.add(image_hash.url)

        # Download the images
        download_urls(person, list(urls_to_keep))

    # Update counter
    try:
        increment()
        timer.update(int(counter.value))
    except Exception as e:
        print(e)
def find_threshold(db, threshold=1):
    dups = []
    # Build a tree
    cursor = db.find()
    tree = pybktree.BKTree(pybktree.hamming_distance)

    cprint('Finding fuzzy duplicates, it might take a while...')
    cnt = 0
    for document in db.find():
        int_hash = int(document['hash'], 16)
        tree.add(int_hash)
        cnt = cnt + 1

    deduplicated = set()

    scanned = 0
    for document in db.find():
        cprint("\r%d%%" % (scanned * 100 / (cnt - 1)), end='')
        scanned = scanned + 1
        if document['hash'] in deduplicated:
            continue
        deduplicated.add(document['hash'])
        hash_len = len(document['hash'])
        int_hash = int(document['hash'], 16)
        similar = tree.find(int_hash, threshold)
        if len(similar) > 1:
            similar = list(set(similar))

            similars = []
            for (distance, item_hash) in similar:
                item_hash = format(item_hash, '0' + str(hash_len) + 'x')
                if distance > 0:
                    deduplicated.add(item_hash)

                for item in db.find({'hash': item_hash}):
                    item['file_name'] = item['_id']
                    similars.append(item)
            if len(similars) > 0:
                dups.append(
                    {
                        '_id': document['hash'],
                        'total': len(similars),
                        'items': similars
                    }
                )

    return dups
def build_dict_tree(dict_path):
    hash_list = []
    chr_name = []
    bk_tree = None
    for f in get_all_files(dict_path):
        f_path = f
        if f_path[-3:] == 'png':
            chr_image = Image.open(f_path)
            chr_image = chr_image.convert('L')
            represent_hash = dhash.dhash_int(chr_image)
            if not represent_hash:
                continue
            hash_list.append(represent_hash)
            chr_name.append(f_path.split('/')[-1][:-4])

    # build the tree once, after all hashes have been collected
    bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
    return chr_name, hash_list, bk_tree
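get_all_files is not part of the snippet above. A minimal sketch, assuming it simply walks dict_path and yields full file paths (a hypothetical helper, not the original):

import os

def get_all_files(dict_path):
    # yield the full path of every file below dict_path
    for root, _dirs, files in os.walk(dict_path):
        for name in files:
            yield os.path.join(root, name)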
def main():
    averageTime = []
    for i in range(55):
        start = time.time()

        distance = i // 5
        #hashFunction = imagehash.average_hash
        #hashFunction = imagehash.dhash
        #hashFunction = imagehash.phash
        hashFunction = imagehash.whash

        # Image file extensions, more can be added if necessary
        ext = ('.jpg', '.jpeg', '.gif', '.png')

        tree = pybktree.BKTree(hdist, [])
        potentialDuplicates = []

        for filename in os.listdir('.'):
            if filename.endswith(ext):
                # Calculates the hash and adds it to the tree
                with Image.open(filename) as img:
                    hashval = hashFunction(img)

                    # Checks for potentially duplicate images
                    for pd in tree.find((hashval, filename), distance):
                        potentialDuplicates.append((filename, pd[1][1]))
                    tree.add((hashval, filename))

        print(i)

        f = 'foundchanges' + str(i) + '.txt'
        with open(f, 'w') as out:
            writer = csv.writer(out, delimiter=',')
            writer.writerows(potentialDuplicates)

        end = time.time()
        averageTime.append((i, end - start))

    with open('whash.txt', 'w') as out:
        writer = csv.writer(out, delimiter=',')
        writer.writerows(averageTime)
Example #16
def get_most_common_true_sequences(read_counter, topN:int):
    """
    get the most abundant sequences, but also make sure that shadows dont sneak in.
    e.g. a VERY abundant true sequence might be ~100000reads, and 1% (1000)
    will result in shadows. these shadows might end up in the top100 itself
    """
    assert isinstance(read_counter, collections.Counter)
    from rnaseqtools.seqerrors.CB_errors import hamming_distance
    bktree = pybktree.BKTree(hamming_distance)
    DISTANCE = 2

    most_common = set()
    for seq, freq in tqdm.tqdm(read_counter.most_common(topN), desc='finding most common seqs'):
        # if the sequence is close to an already accepted true seq
        if len(bktree.find(seq, DISTANCE)) > 0:
            continue
        else:
            bktree.add(seq)
            most_common.add(seq)

    return most_common
Example #17
def image_grouping(images: Collection['Image'], sensitivity: Sensitivity) \
    -> Generator[Tuple[GroupIndex, Group], None, None]:
    '''Find similar images and group them. Yield a tuple with the group
    index and image group when a new group has been added or an existing
    one has been modified (a new image has been added to the group)

    :param images:      images to process,
    :param sensitivity: maximal difference between hashes of 2 images
                        when they are considered similar,
    :yield:             tuple with the group index and list with grouped
                        similar images,
    :raise TypeError:   any of the hashes is not integer
    '''

    image_groups: List[Group] = []

    try:
        bktree = pybktree.BKTree(Image.hamming, images)
    except TypeError:
        raise TypeError('Hashes must be integers')

    checked: Dict['Image', GroupIndex] = dict()

    for image in images:
        distance, closest = _closest(bktree, image, sensitivity)
        if closest is None:
            continue

        # 'closest' goes to the same group as 'image'
        if image in checked and closest not in checked:
            yield _add_img_to_existing_group(image, closest, checked,
                                             image_groups)
        # and vice versa
        if image not in checked and closest in checked:
            yield _add_img_to_existing_group(closest, image, checked,
                                             image_groups)
        # create a new group with 'image' and 'closest' in it
        if image not in checked and closest not in checked:
            yield _add_new_group(image, closest, checked, image_groups,
                                 distance)
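The _closest helper is not included in this snippet. A sketch of what it might look like, assuming it returns the nearest image other than the query itself within sensitivity, or (None, None) when nothing qualifies (a hypothetical reconstruction):

def _closest(bktree, image, sensitivity):
    # find() results are sorted by distance, so the first non-identical hit is the closest
    for distance, match in bktree.find(image, sensitivity):
        if match is not image:
            return distance, match
    return None, None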
Example #18
def get_multitree(voc_fd, lang_id):
    """Get a multitree for the given language."""
    # Get a dict() where each key is a letter and each value
    # is a BK tree of the words that start with that letter
    dst = distance.Distance()
    ed = EditDistanceWrapper(lang_id, dst, phonemise)
    distractors = {}
    for line in voc_fd.readlines():
        (f, w) = line.strip("\n").split("\t")
        first_letter = w[0].lower()
        if first_letter not in distractors:
            distractors[first_letter] = []
        distractors[first_letter].append(w.lower())

    distractors_tree = {}
    for letter in distractors:
        distractors_tree[letter] = pybktree.BKTree(
            ed.edit_distance,
            distractors[letter],
        )

    return distractors_tree
Example #19
def dedupe_images(matched_urls: List[str], person: str) \
        -> Tuple[List[str], str]:

    image_hashes = [IMAGE_HASH(url_to_img_hash(url), url)
                    for url in matched_urls]
    tree = pybktree.BKTree(image_distance, image_hashes)
    # this makes images saved in order of similarity so we can spot duplicates
    # easier
    sorted_image_hashes = sorted(tree)
    to_discard: List[str] = []
    urls_to_keep = set()
    for image_hash in sorted_image_hashes:
        if image_hash not in to_discard:
            # gets pictures within a hamming distance of 5
            matches = tree.find(image_hash, 5)
            for match in matches:
                if match[1].url != image_hash.url:
                    to_discard.append(match[1])
            urls_to_keep.add(image_hash.url)
    # Update counter
    increment()
    TIMER.update(int(COUNTER.value))
    return list(urls_to_keep), person
Example #20
def create_PUG_umi_based(cb_records:list, ec_dict):

    umi_dict = collections.defaultdict(list)
    for record in cb_records:
        umi_dict[record.UMI].append(record)

    # a BKTree of all UMIs in that cell
    tree = pybktree.BKTree(hamming_distance, list(umi_dict.keys()))

    nodes = set()
    edges = []
    for record in cb_records:

        nodes.add(record)
        # any sequence neighbours?
        for distance, umi_neighbor in tree.find(record.UMI, 1):
            # this particular UMI might have multiple records:
            for neighbor_record in umi_dict[umi_neighbor]:
                if record == neighbor_record:
                    continue  # due to d==0 this can be the same record
                # check EC overlap
                T1 = set(ec_dict[record.EC])
                T2 = set(ec_dict[neighbor_record.EC])
                e1 = (record, neighbor_record)
                e2 = (neighbor_record, record)
                if len(T1 & T2) > 0:
                    if record.COUNT > 2 * neighbor_record.COUNT - 1:
                        edges.append(e1)
                    elif neighbor_record.COUNT > 2 * record.COUNT - 1:
                        edges.append(e2)
                    else:
                        edges.append(e1)
                        edges.append(e2)
    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G
Example #21
 def find_su_number(self):
     self.get_iso_object()
     file = os.listdir('su_RAM/')
     hash_list = []
     su_list = []
     img_name = []
     for img_f in file:
         img_path = 'su_RAM/' + img_f
         if img_f[-3:] == 'png':
             # print(img_path)
             sub_iso_img = Image.open(img_path)
             represent_hash = dhash.dhash_int(sub_iso_img)
             # os.remove(img_path)
             if not represent_hash:
                 continue
             # print(represent_hash)
             hash_list.append(represent_hash)
             img_name.append(img_path)
     bk_tree = pybktree.BKTree(pybktree.hamming_distance, hash_list)
     for hash_code in hash_list:
         find_result = bk_tree.find(hash_code, 3)
         similar_number = len(find_result)
         su_list.append(similar_number)
         # print(find_result)
     # os.removedirs('su_RAM/')
     su_number = max(su_list)
     su_index = [i for i, v in enumerate(su_list) if v == su_number]
     # bin_img, contours, hierarchy = cv2.findContours(self.binary_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
     # for i in su_index:
     #     cv2.drawContours(self.o_image, contours, i, (0, 255, 255), thickness=-1)
     # print(su_index)
     for i in su_index:
         print(img_name[i])
     print(img_name)
     # cv2.imwrite('su_RAM/su_image.png', self.o_image)
     return su_number
Example #22
# count_threads=2
# part_data=len(data)//count_threads
# threads = []
# for i in range(1,count_threads+1):
#     print(i*part_data)
#     if i*part_data-part_data<0:
#         start=0
#     else:
#         start=i*part_data-part_data
#
#     end=i*part_data
#     t = threading.Thread(target=worker, args=(start,end,data,hash))
#     threads.append(t)
#     t.start()
# print(list(data))
tree = pybktree.BKTree(fuzzy_distance, np_data)

print(tree.find(last_foto, 10))

print(tree)

# for foto in data:
# tree.add(convert_base(last_foto,from_base=16,to_base=16))
# print(foto)
# print(hash)
# print(imagehash.hex_to_hash(foto)-hash)
# hash=imagehash.hex_to_hash(foto)

print("--- %s seconds main ---" % (time.time() - start_time))

name = "not_aneta"
Example #23
class PHashStore:
    tree = pybktree.BKTree(pybktree.hamming_distance, [])

    def add(self, phash):
        if not self.exists(phash):
            self.tree.add(phash)

    def find(self, phash, distance=15):
        return self.tree.find(phash, distance)

    def exists(self, phash):
        return len(self.find(phash, 0)) > 0

    def load(self, io):
        data = json.load(io)
        for r in data:
            self.add(r)

    def dump(self):
        return json.dumps(sorted(self.tree))

    def phash_for(self, image, algorithm='dhash'):
        if algorithm == 'phash':
            return self.phash(image)
        else:
            return self.dhash(image)

    def phash(self, image):
        r = self.__ndarray_for(image, size="32x32!").astype(np.float64)
        h = fft.dctn(r, norm="ortho")[0:8, 0:8]
        avg = np.average(h.reshape(64, )[1:])
        mask = (h <= avg)
        h = mask.reshape(64, ).dot(2**np.arange(mask.size)[::-1])
        return int(h)

    def dhash(self, image):
        r = self.__ndarray_for(image)

        h = 0
        try:
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[i][j] >= r[i][j - 1] else 0)
            for i in range(1, 9):
                for j in range(1, 9):
                    h = h << 1 | (1 if r[j][i] >= r[j - 1][i] else 0)
            return h
        except IndexError as e:
            pdb.set_trace()
            return -1
        except ValueError as e:
            pdb.set_trace()
            return -2

    def __ndarray_for(self, image, size="9x9!"):
        image.alpha_channel = False
        image.format = 'gray'
        image.type = 'grayscale'
        image.depth = 8
        image.transform(resize=size)
        result = np.asarray(bytearray(image.make_blob()),
                            dtype=np.uint8).reshape(image.size)
        image.close()
        return result

    def hamming2(self, s1, s2):
        assert len(s1) == len(s2)
        return sum(c1 != c2 for c1, c2 in zip(s1, s2))
Example #24
def makeBkTree(func, addr="../data/DICT.txt"):
    return pybktree.BKTree(func, __readFilesAsList(addr))
Example #25
def makeBkTree(func, addr):
    return pybktree.BKTree(func, __readFilesAsList(addr))
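__readFilesAsList is defined elsewhere in both variants above. A plausible sketch, assuming the dictionary file holds one word per line (a hypothetical helper):

def __readFilesAsList(addr):
    # read the word list, skipping empty lines
    with open(addr, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]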
Example #26
            else:
                word2wiki_entity[word] = [entity]
    entity_word_set = set(word2wiki_entity)

    print("entity_word_set DONE")

    entity_totally_match = all_entity & entity_in_wiki      # entity totally matched in wiki

    with open("enwiki_match.txt", "w") as f:         # add totally matched entity to file
        for entity in entity_totally_match:
            num = entity2vec[entity]
            f.write("{},,,{},,,{},,,Total_Match\n".format(entity.lower(), entity, num))

    Levenshtein_tree = pybktree.BKTree(distance, entity_in_wiki)
    print("Levenshtein_bktree Done")

    Word_tree = pybktree.BKTree(distance, entity_word_set)
    print("Word_tree Done")

    for entity_to_be_match in (all_entity - entity_totally_match):
        candidates = []
        with open("enwiki_match.txt", "a") as f:
            for long_entity in entity_totally_match:
                if partof(entity_to_be_match, long_entity):
                    candidates.append(long_entity)
            if len(candidates)!=0:
                entity_matched = process.extractOne(entity_to_be_match, candidates)
                num = entity2vec[entity_matched[0]]
                f.write("{},,,{},,,{},,,Abbreviation\n".format(entity_to_be_match.lower(), entity_matched, num))
Example #27
import pybktree
import pandas as pd
import time
import geopandas
from shapely.geometry import Point

t1 = time.time()

df = pd.read_csv("/home/bigdata/Downloads/Data/miniNSPL.csv")
tree = pybktree.BKTree(pybktree.hamming_distance, [0, 4, 5, 14,65,4,76,4,35,63,23])
print(sorted(tree))

print(df.head())

print(df.shape)

Southampton = df[df.pcds.str.contains("SO15")]

print(Southampton.shape)


t2 = time.time()
print(t2 - t1)
Example #28
else:
    threshold = int(input(
        "Enter threshold (e.g. '40' means the dhashes are "
        "40% different and 60% similar): "))

hashDict = {}
hashList = []
files = os.listdir(filePath)
for file in files:
    image = Image.open(filePath + '/' + file)
    imageDhash = dhash.dhash_int(image)
    hashDict[imageDhash] = file
    hashList.append(imageDhash)

f = csv.writer(open('dhashNearMatches.csv', 'w'))
f.writerow(['percentage'] + ['dhash1'] + ['dhash2'])
completeNearMatches = []
tree = pybktree.BKTree(pybktree.hamming_distance, hashList)
for hash in hashList:
    nearMatches = tree.find(hash, threshold)
    for nearMatch in nearMatches:
        if hashDict[hash] != hashDict[nearMatch[1]]:
            print(nearMatch[0], hashDict[hash], hashDict[nearMatch[1]])
            hashTuple = (nearMatch[0], hashDict[hash], hashDict[nearMatch[1]])
            hashTupleReversed = (nearMatch[0], hashDict[nearMatch[1]],
                                 hashDict[hash])
            if hashTupleReversed not in completeNearMatches:
                completeNearMatches.append(hashTuple)
for hashTuple in completeNearMatches:
    f.writerow([hashTuple[0]] + [hashTuple[1]] + [hashTuple[2]])
Example #29
    prob_factor = 1 / sum(occurrences)
    for pred in predecessors:
        successor[pred] *= prob_factor
# sort inverse lookup
for successor in words_inverse.keys():
    pred_and_probs = words_inverse[successor].items()
    pred_and_probs = sorted(pred_and_probs, key=lambda x: x[1], reverse=True)
    words_inverse[successor] = dict()
    for (pred, probability) in pred_and_probs:
        words_inverse[successor][pred] = probability

print("Normalizing word frequencies...")
for word in words.values():
    successors = word.keys()
    occurrences = word.values()
    prob_factor = 1 / sum(occurrences)
    for successor in successors:
        word[successor] *= prob_factor

print("Building BKTree...")
tree = pybktree.BKTree(editdistance.eval)
for word in words:
    tree.add(word)

print("Dumping to file...")
model = dict()
model['words'] = words
model['words_inverse'] = words_inverse
model['tree'] = tree

dill.dump(model, open(f"{config.MODEL}/model.dill", 'wb'))
Example #30
 def __init__(self, match_threshold=lambda s: 1 + 0.3 * len(s)):
     self.match_tree = pybktree.BKTree(
         jellyfish.damerau_levenshtein_distance)
     self.match_map = {}
     self.max_query_len = 0
     self.get_match_threshold = match_threshold