def test_lookup_max_edit_distance_too_large(self):
    """A lookup distance of 3 on a ``SymSpell(2, 7, 10)`` must raise ValueError."""
    speller = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    # The return value is irrelevant; only the raised error is inspected.
    with pytest.raises(ValueError) as raised:
        speller.lookup("flam", Verbosity.TOP, 3)
    self.assertEqual("Distance too large", str(raised.value))
def test_create_dictionary_entry_negative_count(self):
    """Counts of 0 and -1 are refused; 0 is accepted with ``count_threshold=0``."""
    strict = SymSpell(1, 3)
    for refused_count in (0, -1):
        added = strict.create_dictionary_entry("pipe", refused_count)
        self.assertEqual(False, added)

    lenient = SymSpell(1, 3, count_threshold=0)
    added = lenient.create_dictionary_entry("pipe", 0)
    self.assertEqual(True, added)
def test_lookup_should_not_return_non_word_delete(self):
    """Fragments of "pawn" that were never added must not come back at distance 0."""
    speller = SymSpell(2, 7, 10)
    speller.create_dictionary_entry("pawn", 10)

    for fragment in ("paw", "awn"):
        matches = speller.lookup(fragment, Verbosity.TOP, 0)
        self.assertEqual(0, len(matches))
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    """ "flam" (count 1) must not be suggested although "flame" (count 20) is stored."""
    speller = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    matches = speller.lookup("flam", Verbosity.TOP, 0)
    self.assertEqual(0, len(matches))
def test_lookup_include_unknown(self):
    """With the fourth lookup argument set to True, "flam" itself is returned."""
    speller = SymSpell(2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    matches = speller.lookup("flam", Verbosity.TOP, 0, True)
    self.assertEqual(1, len(matches))
    only_match = matches[0]
    self.assertEqual("flam", only_match.term)
def test_words_with_shared_prefix_should_retain_counts(self):
    """ "pipe" and "pips" keep their own counts whichever spelling is looked up."""
    speller = SymSpell(1, 3)
    speller.create_dictionary_entry("pipe", 5)
    speller.create_dictionary_entry("pips", 10)

    # query -> expected suggestions, in the order lookup must return them
    expectations = (
        ("pipe", (("pipe", 5), ("pips", 10))),
        ("pips", (("pips", 10), ("pipe", 5))),
        ("pip", (("pips", 10), ("pipe", 5))),
    )
    for query, ranked in expectations:
        suggestions = speller.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(len(ranked), len(suggestions))
        for position, (term, count) in enumerate(ranked):
            self.assertEqual(term, suggestions[position].term)
            self.assertEqual(count, suggestions[position].count)
def test_lookup_should_find_exact_match(self):
    """Looking up "streama" with Verbosity.TOP must yield exactly "steama"."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("streama", Verbosity.TOP, 2)
    self.assertEqual(1, len(suggestions))
    best = suggestions[0]
    self.assertEqual("steama", best.term)
def test_add_additional_counts_should_not_add_word_again(self):
    """Adding the same word twice leaves ``word_count`` at 1."""
    speller = SymSpell()
    word = "hello"

    for extra_count in (11, 3):
        speller.create_dictionary_entry(word, extra_count)
        self.assertEqual(1, speller.word_count)
def test_lookup_include_unknown(self):
    """With the fourth lookup argument set to True, the unknown "qwer" is echoed back."""
    # Must stay in this frame: stack()[0] is the currently running function.
    current_test = inspect.stack()[0][3]
    print(' - %s' % current_test)

    speller = SymSpell(16, 2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    matches = speller.lookup("qwer", Verbosity.TOP, 0, True)
    self.assertEqual(1, len(matches))
    self.assertEqual("qwer", matches[0].term)
def test_lookup_should_not_return_non_word_delete(self):
    """Fragments of "pawn" that were never added must not come back at distance 0."""
    # Must stay in this frame: stack()[0] is the currently running function.
    current_test = inspect.stack()[0][3]
    print(' - %s' % current_test)

    speller = SymSpell(16, 2, 7, 10)
    speller.create_dictionary_entry("pawn", 10)

    for fragment in ("paw", "awn"):
        matches = speller.lookup(fragment, Verbosity.TOP, 0)
        self.assertEqual(0, len(matches))
def test_create_dictionary_entry_negative_count(self):
    """Counts of 0 and -1 are refused; 0 is accepted with ``count_threshold=0``."""
    # Must stay in this frame: stack()[0] is the currently running function.
    current_test = inspect.stack()[0][3]
    print(' - %s' % current_test)

    strict = SymSpell(16, 1, 3)
    for refused_count in (0, -1):
        added = strict.create_dictionary_entry("pipe", refused_count)
        self.assertEqual(False, added)

    lenient = SymSpell(16, 1, 3, count_threshold=0)
    added = lenient.create_dictionary_entry("pipe", 0)
    self.assertEqual(True, added)
def test_lookup_transfer_casing_symspellpy(benchmark):
    """Run ``lookup`` through the ``benchmark`` fixture and check casing transfer.

    The query "StreaM" must be corrected to "SteaM" when ``transfer_casing``
    is enabled.
    """
    speller = SymSpellPy(max_dictionary_edit_distance=2, prefix_length=7)
    speller.create_dictionary_entry("steam", 4)

    suggestions = benchmark(
        speller.lookup,
        "StreaM",
        VerbosityPy.TOP,
        2,
        transfer_casing=True,
    )
    top_term = suggestions[0].term
    assert top_term == "SteaM"
def test_add_additional_counts_should_not_add_word_again(self):
    """Adding the same word twice leaves ``word_count`` at 1."""
    # Must stay in this frame: stack()[0] is the currently running function.
    current_test = inspect.stack()[0][3]
    print(' - %s' % current_test)

    speller = SymSpell()
    word = "hello"

    for extra_count in (11, 3):
        speller.create_dictionary_entry(word, extra_count)
        self.assertEqual(1, speller.word_count)
def test_lookup_should_return_most_frequent(self):
    """Of three stored candidates, "steamb" (count 6) must be the TOP result for "stream"."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("stream", Verbosity.TOP, 2)
    self.assertEqual(1, len(suggestions))
    winner = suggestions[0]
    self.assertEqual("steamb", winner.term)
    self.assertEqual(6, winner.count)
def test_lookup_avoid_exact_match_early_exit(self):
    """A query matching ``ignore_token`` ("24th") is returned unchanged."""
    max_distance = 2
    speller = SymSpell(max_distance, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup(
        "24th",
        Verbosity.ALL,
        max_distance,
        ignore_token=r"\d{2}\w*\b",
    )
    self.assertEqual(1, len(suggestions))
    self.assertEqual("24th", suggestions[0].term)
def test_deletes(self):
    """After adding entries and looking one up, ``deletes`` must be non-empty."""
    speller = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        speller.create_dictionary_entry(term, count)

    suggestions = speller.lookup("stream", Verbosity.TOP, 2)
    self.assertEqual(1, len(suggestions))
    winner = suggestions[0]
    self.assertEqual("steamb", winner.term)
    self.assertEqual(6, winner.count)

    delete_count = len(speller.deletes)
    self.assertTrue(delete_count)
def test_lookup_compound_no_suggestion(self):
    """``lookup_compound`` must return the input unchanged when nothing matches."""
    max_distance, prefix_len = 2, 7
    speller = SymSpell(max_distance, prefix_len)
    for term in ("steam", "machine"):
        speller.create_dictionary_entry(term, 1)

    typo = "qwer erty ytui a"
    compound = speller.lookup_compound(typo, max_distance)
    self.assertEqual(1, len(compound))
    self.assertEqual(typo, compound[0].term)
def test_add_additional_counts_should_not_overflow(self):
    """Adding 11 to a count of ``sys.maxsize - 10`` must yield ``sys.maxsize``."""
    speller = SymSpell()
    word = "hello"

    def stored_count():
        # Count of the single TOP suggestion, or 0 when there is not exactly one.
        hits = speller.lookup(word, Verbosity.TOP)
        return hits[0].count if len(hits) == 1 else 0

    speller.create_dictionary_entry(word, sys.maxsize - 10)
    self.assertEqual(sys.maxsize - 10, stored_count())

    speller.create_dictionary_entry(word, 11)
    self.assertEqual(sys.maxsize, stored_count())
def test_add_additional_counts_should_increase_count(self):
    """Re-adding a word sums its counts (11, then 11 + 3)."""
    speller = SymSpell()
    word = "hello"

    def stored_count():
        # Count of the single TOP suggestion, or 0 when there is not exactly one.
        hits = speller.lookup(word, Verbosity.TOP)
        return hits[0].count if len(hits) == 1 else 0

    speller.create_dictionary_entry(word, 11)
    self.assertEqual(11, stored_count())

    speller.create_dictionary_entry(word, 3)
    self.assertEqual(11 + 3, stored_count())
def test_lookup_compound_only_combi(self):
    """``lookup_compound`` must turn "ste am machie" into "steam machine"."""
    max_distance, prefix_len = 2, 7
    speller = SymSpell(max_distance, prefix_len)
    for term in ("steam", "machine"):
        speller.create_dictionary_entry(term, 1)

    typo, correction = "ste am machie", "steam machine"
    compound = speller.lookup_compound(typo, max_distance)
    self.assertEqual(1, len(compound))
    self.assertEqual(correction, compound[0].term)
def test_lookup_transfer_casing(self):
    """With ``transfer_casing=True`` the suggestion takes on the query's casing."""
    # (dictionary word, query, expected suggestion)
    cases = (
        ("steam", "Stream", "Steam"),
        ("steam", "StreaM", "SteaM"),
        ("steam", "STREAM", "STEAM"),
        ("i", "I", "I"),
    )
    for known_word, query, expected in cases:
        # A fresh instance per case, so entries never leak between cases.
        speller = SymSpell()
        speller.create_dictionary_entry(known_word, 4)
        suggestions = speller.lookup(query, Verbosity.TOP, 2,
                                     transfer_casing=True)
        self.assertEqual(expected, suggestions[0].term)
class NameChecker(object):
    """Map a possibly misspelled name onto the closest known name."""

    def __init__(self, name_list):
        """Build a SymSpell dictionary from ``name_list``.

        Each name is stored with a count equal to its number of
        space-separated parts.
        """
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        for known_name in name_list:
            part_count = len(known_name.split(' '))
            self.sym_spell.create_dictionary_entry(known_name, part_count)

    def get_name(self, name):
        """Return the first CLOSEST suggestion for ``name``, or ``name`` itself if none."""
        candidates = self.sym_spell.lookup(name,
                                           Verbosity.CLOSEST,
                                           max_edit_distance=2,
                                           transfer_casing=True)
        if candidates is None or len(candidates) == 0:
            return name
        return candidates[0].term
def test_create_dictionary_entry_below_threshold(self):
    """Counts pile up in ``below_threshold_words`` until the threshold (10) is reached."""
    speller = SymSpell(1, 3, count_threshold=10)

    # Two additions of 4 stay below the threshold: running totals 4, then 8.
    for running_total in (4, 8):
        speller.create_dictionary_entry("pipe", 4)
        self.assertEqual(1, len(speller.below_threshold_words))
        self.assertEqual(running_total, speller.below_threshold_words["pipe"])

    # The third addition must remove the word from the pending map.
    speller.create_dictionary_entry("pipe", 4)
    self.assertEqual(0, len(speller.below_threshold_words))
def test_verbosity_should_control_lookup_results(self):
    """TOP, CLOSEST and ALL must return 1, 2 and 3 suggestions for "steems"."""
    speller = SymSpell()
    for term, count in (("steam", 1), ("steams", 2), ("steem", 3)):
        speller.create_dictionary_entry(term, count)

    expected_sizes = (
        (Verbosity.TOP, 1),
        (Verbosity.CLOSEST, 2),
        (Verbosity.ALL, 3),
    )
    for verbosity, size in expected_sizes:
        suggestions = speller.lookup("steems", verbosity, 2)
        self.assertEqual(size, len(suggestions))
def test_create_dictionary_entry_below_threshold(self):
    """Counts pile up in ``below_threshold_words`` until the threshold (10) is reached."""
    # Must stay in this frame: stack()[0] is the currently running function.
    current_test = inspect.stack()[0][3]
    print(' - %s' % current_test)

    speller = SymSpell(16, 1, 3, count_threshold=10)

    # Two additions of 4 stay below the threshold: running totals 4, then 8.
    for running_total in (4, 8):
        speller.create_dictionary_entry("pipe", 4)
        self.assertEqual(1, len(speller.below_threshold_words))
        self.assertEqual(running_total, speller.below_threshold_words["pipe"])

    # The third addition must remove the word from the pending map.
    speller.create_dictionary_entry("pipe", 4)
    self.assertEqual(0, len(speller.below_threshold_words))
def test_delete_dictionary_entry_invalid_word(self):
    """Deleting the unknown word "steamab" returns a falsy value and changes nothing."""
    speller = SymSpell()
    for term, count in (("stea", 1), ("steama", 2), ("steem", 3)):
        speller.create_dictionary_entry(term, count)

    def assert_steama_intact():
        # "steama" is still the TOP hit and still defines the longest length.
        suggestions = speller.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(suggestions))
        self.assertEqual("steama", suggestions[0].term)
        self.assertEqual(len("steama"), speller._max_length)

    assert_steama_intact()
    self.assertFalse(speller.delete_dictionary_entry("steamab"))
    assert_steama_intact()
def spell_checker(corr_list):
    """Return a SymSpell instance holding every word of ``corr_list`` with count 1."""
    checker = SymSpell(max_dictionary_edit_distance=2)
    for entry in corr_list:
        checker.create_dictionary_entry(entry, 1)
    return checker
class MagicRecognition:
    """Recognize Magic card names in OCR text with SymSpell dictionaries."""

    def __init__(self,
                 file_all_cards: str,
                 file_keywords: str,
                 languages=("English",),
                 max_ratio_diff=0.3,
                 max_ratio_diff_keyword=0.2) -> None:
        """Load dictionaries of cards and keywords

        Parameters
        ----------
        file_all_cards: str
            Path to the file containing all cards. If the file does not exist, it is downloaded from mtgjson.
        file_keywords: str
            Path to the file containing all keywords. If the file does not exist, it is downloaded from mtgjson.
        languages : iterable of str, optional
            Languages whose card names are written to `file_all_cards` when that file has to be generated,
            by default ("English",). A bare string is treated as a single language.
        max_ratio_diff : float, optional
            Maximum ratio (distance/length) for a text to be considered as a card name, by default 0.3
        max_ratio_diff_keyword : float, optional
            Maximum ratio (distance/length) for a text to be considered as a (ignored) keyword, by default 0.2
        """
        self.max_ratio_diff = max_ratio_diff
        self.max_ratio_diff_keyword = max_ratio_diff_keyword

        # A bare string would turn the `in languages` tests below into
        # substring checks; wrap it so they are membership tests.
        if isinstance(languages, str):
            languages = (languages,)

        if not Path(file_all_cards).is_file():
            def write_card(out, card):
                # Keep only the part of the name before " //".
                cut = card.find(" //")
                if cut != -1:
                    card = card[:cut]
                out.write(card + "$1\n")  # required for SymSpell

            all_cards_json = load_json(URL_ALL_CARDS)
            with Path(file_all_cards).open("a") as out:
                for card, faces in all_cards_json["data"].items():
                    if "English" in languages:
                        write_card(out, card)
                    for foreign in faces[0]["foreignData"]:
                        if foreign["language"] in languages:
                            write_card(out, foreign["name"])

        self.sym_all_cards = SymSpell(max_dictionary_edit_distance=6)
        self.sym_all_cards._distance_algorithm = editdistance.DistanceAlgorithm.LEVENSHTEIN
        self.sym_all_cards.load_dictionary(file_all_cards, 0, 1, separator="$")
        self.all_cards = self.sym_all_cards._words
        print(f"Loaded {file_all_cards}: {len(self.all_cards)} cards")
        self.edit_dist = editdistance.EditDistance(editdistance.DistanceAlgorithm.LEVENSHTEIN)

        if not Path(file_keywords).is_file():
            keywords = load_json(URL_KEYWORDS)
            # Close (and flush) the file before it is read back just below.
            with Path(file_keywords).open("w") as out:
                json.dump(keywords, out)

        with Path(file_keywords).open() as src:
            keywords_json = json.load(src)
        keywords = []
        for group in keywords_json["data"].values():
            keywords.extend(group)
        keywords.extend(["Display", "Land", "Search", "Profile"])
        self.sym_keywords = SymSpell(max_dictionary_edit_distance=3)
        for k in keywords:
            self.sym_keywords.create_dictionary_entry(k, 1)
        print(f"Loaded {file_keywords}: {len(keywords)} cards")

    def _preprocess(self, text: str) -> str:
        """Remove characters which can't appear on a Magic card (OCR error)"""
        return re.sub("[^a-zA-Z',. ]", '', text).rstrip(' ')

    def _preprocess_texts(self, box_texts: BoxTextList) -> None:
        """Apply `preprocess` on each text"""
        for box_text in box_texts:
            box_text.text = self._preprocess(box_text.text)

    def box_texts_to_cards(self, box_texts: BoxTextList) -> BoxTextList:
        """Recognize cards from raw texts

        `box_texts` is sorted in place. Texts close to a keyword are dropped;
        every other text is preprocessed and searched among the card names.
        """
        box_texts.sort()
        box_cards = BoxTextList()
        for box, text, _ in box_texts:
            max_dist = min(3, int(self.max_ratio_diff_keyword * len(text)))
            sug = self.sym_keywords.lookup(text, Verbosity.CLOSEST, max_edit_distance=max_dist)
            if sug:
                logging.info(f"Keyword rejected: {text} {sug[0].distance/len(text)} {sug[0].term}")
            else:
                card = self._search(self._preprocess(text))
                if card is not None:
                    box_cards.add(box, card)
        return box_cards

    def _assign_stacked(self, box_texts: BoxTextList, box_cards: BoxTextList) -> None:
        """Set multipliers (e.g. x4) for each (stacked) card in `box_cards`

        Parameters
        ----------
        box_texts : BoxTextList
            BoxTextList containing potential multipliers
        box_cards : BoxTextList
            BoxTextList containing recognized cards
        """
        def _assign_stacked_one(box_cards: BoxTextList, m: int, comp) -> None:
            # Pick the card preferred by `comp`; do nothing when there is no
            # card at all (indexing an empty list used to raise IndexError).
            i_min = None
            for i, box_card in enumerate(box_cards):
                if i_min is None or comp(box_card.box, box_cards[i_min].box):
                    i_min = i
            if i_min is None:
                return
            box_cards[i_min].n = m
            logging.info(f"{box_cards[i_min].text} assigned to x{m}")

        def dist(p: tuple, q: tuple) -> float:
            # Squared distance between the first two coordinates of p and q.
            return (p[0] - q[0])**2 + (p[1] - q[1])**2

        def comp_md(box1: tuple, box2: tuple, box: tuple) -> bool:
            # Used when the multiplier sign comes first in the text: box1
            # never wins if it exceeds `box` on either of its first two coordinates.
            if box1[0] > box[0] or box1[1] > box[1]:
                return False
            return dist(box, box1) < dist(box, box2)

        def comp_sb(box1: tuple, box2: tuple, box: tuple) -> bool:
            # Used when the digit comes first: plain nearest-box comparison.
            return dist(box, box1) < dist(box, box2)

        comp = (comp_md, comp_sb)
        for box, text, _ in box_texts:
            if len(text) == 2:
                for i in [0, 1]:
                    # isdecimal(), not isnumeric(): characters such as '²' or
                    # '½' are numeric but make int() raise ValueError.
                    if text[i] in '×xX' and text[1 - i].isdecimal():
                        _assign_stacked_one(box_cards, int(text[1 - i]), partial(comp[i], box=box))

    def _box_cards_to_deck(self, box_cards: BoxTextList) -> Deck:
        """Convert recognized cards to decklist

        Cards fill the maindeck, in order, up to max(60, total - 15) copies;
        the remaining copies go to the sideboard.
        """
        def add_cards(c, deck, p):
            if c in deck.cards:
                deck.cards[c] += p
            elif p > 0:
                deck.cards[c] = p

        maindeck, sideboard = Pile(), Pile()
        n_cards = sum(c.n for c in box_cards)
        n_added = 0
        last_main_card = max(60, n_cards - 15)
        for _, card, n in box_cards:
            n_added_main = max(min(n, last_main_card - n_added), 0)
            add_cards(card, maindeck, n_added_main)
            add_cards(card, sideboard, n - n_added_main)
            n_added += n
        deck = Deck()
        deck.maindeck = maindeck
        deck.sideboard = sideboard
        return deck

    def box_texts_to_deck(self, box_texts: BoxTextList) -> Deck:
        """Convert raw texts to decklist

        Parameters
        ----------
        box_texts : BoxTextList
            Raw texts given by an OCR

        Returns
        -------
        Deck
            Decklist obtained from `box_texts`
        """
        box_cards = self.box_texts_to_cards(box_texts)
        self._assign_stacked(box_texts, box_cards)
        return self._box_cards_to_deck(box_cards)

    def _search(self, text):
        """If `text` can be recognized as a Magic card, return that card. Otherwise, return None."""
        if len(text) < 3:  # a card name is never that short
            return None
        if len(text) > 30:  # a card name is never that long
            logging.info(f"Too long: {text}")
            return None
        if text in self.all_cards:
            return text
        i = text.find("..")  # search for truncated card name
        if i != -1:
            # Compare only the first i characters of the text and of each card,
            # keeping the card with the strictly smallest distance.
            dist = int(self.max_ratio_diff * i)
            card = None
            for c in self.all_cards:
                d = self.edit_dist.compare(text[:i], c[:i], dist)
                if d != -1 and d < dist:
                    card = c
                    dist = d
            if card is None:
                logging.info(f"Not prefix: {text}")
            else:
                logging.info(f"Found prefix: {text} {dist/i} {card}")
                return card
        else:
            text = text.replace('.', '').rstrip(' ')
            max_dist = min(6, int(self.max_ratio_diff * len(text)))
            sug = self.sym_all_cards.lookup(text, Verbosity.CLOSEST, max_edit_distance=max_dist)
            if sug:
                card = sug[0].term
                ratio = sug[0].distance / len(text)
                if len(text) < len(card) + 7:
                    logging.info(f"Corrected: {text} {ratio} {card}")
                    return card
                logging.info(f"Not corrected (too long): {text} {ratio} {card}")
            else:
                logging.info(f"Not found: {text}")
        return None
def _create_spell_checker_dict(ents_counter, thres):
    """Build a SymSpell dictionary from the entries counted more than ``thres`` times.

    ``ents_counter`` is a mapping of entry to count. Entries whose count is
    ``thres`` or lower are left out.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    for entity, occurrences in ents_counter.items():
        if occurrences <= thres:
            continue
        checker.create_dictionary_entry(entity, occurrences)
    return checker
def test_lookup_should_not_return_low_count_word(self):
    """ "pawn" added with count 1 must not be returned by an exact lookup."""
    speller = SymSpell(16, 2, 7, 10)
    speller.create_dictionary_entry("pawn", 1)

    matches = speller.lookup("pawn", Verbosity.TOP, 0)
    self.assertEqual(0, len(matches))