def test_custom_handler():
    def custom_handler(text, start, stop, norm):
        return start, stop, text[start:stop] + " is OK"

    ts = TextSearch("ignore", "norm", handlers=[("HI", True, custom_handler)])
    ts.add("hi", "HI")
    assert ts.findall("hi HI") == ["hi is OK", "HI is OK"]
class ContractText:
    def __init__(self):
        with open("svo_extraction/contract_dict.json", mode="r", encoding="utf-8") as json_file:
            self.contractdict = json.load(json_file)
        self.searching = TextSearch("ignore", "norm")
        self.searching.add(self.contractdict)

    def uncontract(self, text: str):
        return self.searching.replace(text)
def ts_replacer(a, b):
    # lowercase letters before are allowed, how about a second one...
    # ts = TextSearch("sensitive", "norm", ALPHANUM - ALPHA_LOWER, ALPHANUM)
    ts = TextSearch("sensitive", "norm", BOUNDS, BOUNDS)
    found_sep_in_b = ""
    for x in [" ", "-", "_", "."]:
        if x in b:
            found_sep_in_b = x
            break
    # a = ["some", "thing"]
    # b = ["another", "thing"]
    aa = normalize(a)
    bb = normalize(b)
    # questions like:
    # - prefer camelCase for word2 when word1 is lowercase
    # - prefer Halftitle, PascalCase/Titlecase for word2 when word1 is titlecase
    # etc
    # below... lower order means higher prio
    for s in [".", "_", "-", found_sep_in_b, ""]:
        # halftitle
        x = aa[0][0].title() + s.join(aa)[1:]
        y = bb[0][0].title() + s.join(bb)[1:]
        ts.add(x, y)
        # camelCase
        x = s.join([aa[0].lower()] + [x.title() for x in aa[1:]])
        y = s.join([bb[0].lower()] + [x.title() for x in bb[1:]])
        ts.add(x, y)
        # easy cases
        for c in [str.upper, str.title, str.lower]:
            x = s.join([c(x) for x in aa])
            y = s.join([c(x) for x in bb])
            ts.add(x, y)
    # ts.add("SomeThing", "AnotherThing")
    ts.add(a, b)
    return ts
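# Hedged usage sketch for ts_replacer (illustrative; assumes that `normalize` splits its
# input into words and that BOUNDS is the bounds set passed to TextSearch above):
#
#   ts = ts_replacer("some thing", "another thing")
#   ts.replace("someThing, some_thing and SOME THING")
#   # expected (illustrative): "anotherThing, another_thing and ANOTHER THING"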
def test_foreign_chars():
    ts = TextSearch("ignore", "norm", replace_foreign_chars=True)
    ts.add("á", "A")
    assert "a" in ts
    assert "á" in ts
    assert ts.contains("a")
    assert ts.contains("á")
    assert ts.findall("a")
    assert ts.findall("á")
    assert ts.find_overlapping("a")
    assert ts.find_overlapping("á")
    assert ts.replace("a") == "A"
    assert ts.replace("á") == "A"
def test_serializable():
    ts = TextSearch("sensitive", dict)
    ts.add("hi")
    result = ts.findall("hi")
    assert result
    assert json.dumps(result)
def test_replace():
    ts = TextSearch("sensitive", "norm")
    ts.add("hi", "HI")
    assert ts.replace("test hi test") == "test HI test"
def test_replace_insensitive_keep_casing():
    ts = TextSearch("insensitive", "norm")
    ts.add("hi", "bye")
    assert ts.replace("test Hi test") == "test Bye test"
    assert ts.replace("test HI test") == "test BYE test"
def test_not_overlap_3():
    ts = TextSearch("ignore", "norm")
    ts.add("a")
    ts.add("a a")
    assert ts.findall("a a a") == ["a a", "a"]
def test_add_dict():
    ts = TextSearch("smart", "norm")
    ts.add({"hi": "greeting", "bye": "bye", "goodbye": "bye"})
    assert ts.findall("hi bye goodbye") == ["greeting", "bye", "bye"]
def test_twitter():
    ts = TextSearch("ignore", "norm")
    ts.add_twitter_handler(keep_result=True)
    assert ts.findall("@hello") == ["@hello"]
    assert ts.findall("#hello") == ["#hello"]
class Tokenizer:
    def __init__(
        self,
        handle_http=False,
        handle_domains=False,
        numbers=True,
        combine_punctuation=True,
        eol="\n",
        currencies=("$",),
        protected_words=None,
        contractions=True,
        language="en",
        abbrevs=ABBREVS,
    ):
        # set() set() should fallback to just using __iter__ of automaton for a speedboost
        if language != "en" and contractions:
            raise ValueError("No contractions known for languages other than English.")
        self.contractions = contractions
        self.tokenizer = None
        self.handle_http = handle_http
        self.handle_domains = handle_domains
        self.combine_punctuation = combine_punctuation
        self.numbers = numbers
        self.eol = eol
        self.currencies = currencies or []
        self.protected_words = protected_words or []
        self.abbrevs = abbrevs
        self.explain_dict = {}
        self.setup()

    def setup(self):
        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
        self.add_base_cases()
        self.add_currencies()
        self.add_words(self.protected_words)
        if self.handle_http:
            self.tokenizer.add_http_handler(keep_result=True)
            for word in ["http://", "https://", "www."]:
                self.explain_dict[word] = (
                    "regex: when it finds '{}' it will stop after it finds a space.".format(word)
                )
        if self.handle_domains:
            self.add_domain_handler()
        if self.contractions:
            if self.contractions == True:
                self.contractions = {}
                self.contractions.update(contractions_dict)
                self.contractions.update(leftovers_dict)
            self.add_words(self.contractions)
        if self.abbrevs:
            self.add_words(self.abbrevs)

    def add_words(self, words):
        words = words.items() if isinstance(words, dict) else words
        if words and isinstance(words, (list, set, tuple)) and isinstance(words[0], str):
            words = [(x, x) for x in words]
        for x, y in words:
            REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
            REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
            REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
            self.add(x, y, REASON_AS_IS)
            self.add(x.upper(), y.upper(), REASON_UPPER)
            if y:
                self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)

    def add_domain_handler(self):
        import re

        from tldextract.tldextract import TLD_EXTRACTOR

        valid_re = re.compile("^[a-zA-Z.]+$")
        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
        for x in tlds:
            self.add(x, x, "Added by domain handler, keeps the token existing.")

    def add_base_cases(self):
        if self.numbers:
            for x in "0123456789":
                self.keep(x + ",")
                self.keep(x + ".")

        # self.tokenizer.add(" !", " ! ")

        if self.combine_punctuation:
            # combine multiples
            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
            for s in "!.?-":
                for i in range(2, 10):
                    # one of these is a splitting char
                    if i == 1 and s == "-":
                        continue
                    c = s * i
                    e = s * 3 if i > 1 else s
                    # end = "$<EOS>$" if i == 1 or s != "-" else " "
                    end = " \n" if i == 1 or s != "-" else " "
                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))

            for i in range(2, 10):
                # self.tokenizer.add("\n" * i, "$<EOS>$")
                self.add("\n" * i, " \n ", "merges newlines")

        for s in "!.?-\n":
            self.add(s, " " + s + "\n", "Splits on '{}' and creates a new sentence.".format(s))

        self.split("- ")
        self.split("...")

        # does not work
        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? \n")
        self.split("!?")
        self.split("!?!")
        self.split("!!?")
        self.split("!??")
        self.split("?!!")
        self.split("?!?")
        self.split("??!")

        for x in string.ascii_letters:
            self.keep(" " + x + ".")

        # for x in string.ascii_letters:
        #     self.tokenizer.add("\n" + x, "\n" + x)

        for s in ":;,":
            self.split(s, "Splits on '{}' (punctuation)".format(s))

        # quotes (make sure we add all the exceptions)
        self.split("'")
        self.split('"')

    def keep(self, x, reason=None):
        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
        self.tokenizer.add(x, x)
        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()

    def split(self, x, reason=None):
        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
        self.tokenizer.add(x, " {} ".format(x))
        self.explain_dict[x] = (
            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
        )

    def drop(self, x, reason=None):
        """ Whenever it finds x, it will remove it but add a split. """
        self.tokenizer.add(x, " ")
        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()

    def strip(self, x, reason=None):
        """ Whenever it finds x, it will remove it without splitting. """
        self.tokenizer.add(x, "")
        self.explain_dict[x] = (
            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
        )

    def add(self, x, y, reason):
        self.tokenizer.add(x, y)
        self.explain_dict[x] = reason

    def explain(self, char_or_chars):
        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
        if not keys:
            return {
                "explanation": "No explanation, meaning there is nothing specified for the input"
            }
        return [
            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
            for x in keys
        ]

    def remove(self, x):
        if x in self.tokenizer:
            self.tokenizer.remove(x)
            del self.explain_dict[x]

    def add_currencies(self):
        for currency in self.currencies:
            self.split(currency)
            for num in "0123456789":
                # to prevent the . and , from being treated as punct
                for punc in ",.":
                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
                    self.add(s, r, "protecting currency from being seen as a number.")

    def word_tokenize(self, z, return_entities=False, to_lower=False):
        if return_entities:
            a, b = self.tokenizer.replace(" " + z, return_entities=True)
            return a.split(), b
        res = self.tokenizer.replace(" " + z).split()
        if to_lower:
            res = [x.lower() for x in res]
        return res

    def word_newlined_tokenize(self, z):
        sentences = self.sent_tokenize(z)
        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]

    def sent_tokenize(self, z):
        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
def test_regex_overlap():
    ts = TextSearch("insensitive", "object")
    ts.add_regex_handler(["last "], r"\d", keep_result=True)
    ts.add("last")
    assert ts.findall("last 5")[0].norm == "last 5"
    return jsonify(res)
    # used for debug
    # return jsonify({"uuid": jsonfile['0']})


if __name__ == '__main__':
    """
    The TextSearch class instance is loaded when the Flask API starts, for performance
    reasons: all the input matrices are kept as in-memory objects. On each API call,
    the searchTop method uses these in-memory matrices to compute the proximity scores.
    The largest file, the document matrix, is (n documents) x (m vector components) in
    64-bit floats; 64 bits is not necessary, so this can be optimized.
    """
    with open("params.json", 'r') as stream:
        params = json.load(stream)
    modelfile = params['modelfile']
    docmatrixfile = params['docmatrixfile']
    textfile = params['textfile']
    textSearch = TextSearch(modelfile, docmatrixfile, textfile)
    app.run(debug=True)
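# Hedged sketch of the float64 -> float32 optimization mentioned in the docstring above
# (assumes the document matrix is stored as a NumPy array on disk; names are illustrative):
#
#   import numpy as np
#   doc_matrix = np.load(docmatrixfile).astype(np.float32)  # roughly halves memory use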
def test_insensitive_object():
    ts = TextSearch("insensitive", "object")
    ts.add("hi")
    assert ts.findall("HI")[0].end == 2
def test_regex_norm():
    ts = TextSearch("insensitive", "norm")
    ts.add_regex_handler(["last "], r"\d", keep_result=True)
    assert ts.findall("last 5") == ["last 5"]
def test_not_overlap():
    ts = TextSearch("ignore", "norm")
    ts.add("http://")
    ts.add_http_handler(True)
    assert len(ts.findall("https://vks.ai")) == 1
def test_postfix_regex():
    ts = TextSearch("ignore", "norm")
    ts.add_regex_handler(["products"], r"\d+ ", keep_result=True, prefix=False)
    assert ts.findall("90 products") == ["90 products"]
def test_overlap():
    ts = TextSearch("ignore", "norm")
    ts.add("hi")
    ts.add("hi hi")
    assert len(ts.find_overlapping("hi hi")) == 3
def test_http():
    ts = TextSearch("ignore", "norm")
    ts.add_http_handler(keep_result=True)
    assert ts.findall("http://google.com") == ["http://google.com"]
def test_repr():
    assert repr(TextSearch("ignore", "match"))
    assert repr(TextSearch("ignore", "match", set(), set()))
def test_http_no_keep():
    ts = TextSearch("ignore", "norm")
    ts.add_http_handler(keep_result=False)
    ts.add("google")
    assert ts.findall("http://google.com") == []
def test_ignore_match():
    ts = TextSearch("ignore", "match")
    ts.add("hi")
    assert ts.findall("hi") == ["hi"]
    assert ts.findall("HI") == ["hi"]
    assert ts.findall("asdf") == []
def get_ts():
    ts = TextSearch("insensitive", "object")
    ts.add(nlp_registry)
    return ts
def test_add_list():
    ts = TextSearch("smart", "match")
    ts.add(["hi", "bye", "hello"])
    assert ts.findall("hi bye hello") == ["hi", "bye", "hello"]
def test_sensitive_match():
    ts = TextSearch("sensitive", "object")
    ts.add("hi")
    assert ts.findall("hi")
    assert not ts.findall("HI")
def test_not_overlap_2():
    ts = TextSearch("ignore", "norm")
    ts.add("hi", "HI")
    ts.add("hi hi", "h h")
    assert ts.replace("hi hi") == "h h"
slang_dict = {
    "ima": "I am going to",
    "gonna": "going to",
    "gotta": "got to",
    "wanna": "want to",
    "woulda": "would have",
    "gimme": "give me",
    "asap": "as soon as possible",
    "u": "you",
    "r ": "are ",
}
slang_dict.update(unsafe_dict)

ts_leftovers = TextSearch("ignore", "norm")
ts_leftovers.add(contractions_dict)
ts_leftovers.add(leftovers_dict)

ts_leftovers_slang = TextSearch("ignore", "norm")
ts_leftovers_slang.add(contractions_dict)
ts_leftovers_slang.add(leftovers_dict)
ts_leftovers_slang.add(slang_dict)

ts_slang = TextSearch("ignore", "norm")
ts_slang.add(contractions_dict)
ts_slang.add(slang_dict)

ts_basic = TextSearch("ignore", "norm")
ts_basic.add(contractions_dict)
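# Hedged usage sketch (illustrative; assumes the TextSearch.replace API used in the tests
# above, with contractions_dict, leftovers_dict and unsafe_dict defined elsewhere):
#
#   ts_leftovers_slang.replace("ima go asap")
#   # -> "I am going to go as soon as possible"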
def test_smart_match():
    ts = TextSearch("smart", "object")
    ts.add("hi")
    assert ts.findall("hi")[0].case == "lower"
    assert ts.findall("hi")[0].is_exact
    assert ts.findall("HI")[0].case == "upper"
    assert not ts.findall("HI")[0].is_exact
    assert ts.findall("Hi")[0].case == "title"
    assert not ts.findall("Hi")[0].is_exact
    ts.add("hI")
    assert ts.findall("hI")[0].case == "mixed"
    assert ts.findall("hI")[0].is_exact