def test_merge():
    """Check the length of the sum of two case-sensitive "match" searchers.

    With "hi" in both operands the sum has length 1; after removing "hi"
    from the first and adding "bye" to the second, the sum has length 2.
    """
    left, right = (TextSearch("sensitive", "match") for _ in range(2))
    left.add("hi")
    right.add("hi")
    merged = left + right
    assert len(merged) == 1

    left.remove("hi")
    right.add("bye")
    merged = left + right
    assert len(merged) == 2
def __init__(self):
    """Load ``contract_dict.json`` and register its content in a TextSearch.

    The parsed JSON is kept on ``self.contractdict``; the searcher built
    from it (case mode "ignore", returning "norm") is kept on
    ``self.searching``.
    """
    dict_path = 'svo_extraction/contract_dict.json'
    with open(dict_path, mode="r", encoding="utf-8") as handle:
        self.contractdict = json.load(handle)

    searcher = TextSearch("ignore", "norm")
    self.searching = searcher
    searcher.add(self.contractdict)
def test_custom_handler():
    """A handler registered for the norm "HI" rewrites every match of "hi"."""

    def custom_handler(text, start, stop, norm):
        # Return the matched span unchanged, with " is OK" appended to the text.
        matched = text[start:stop]
        return start, stop, matched + " is OK"

    handler_spec = ("HI", True, custom_handler)
    searcher = TextSearch("ignore", "norm", handlers=[handler_spec])
    searcher.add("hi", "HI")

    found = searcher.findall("hi HI")
    assert found == ['hi is OK', 'HI is OK']
def test_smart_match():
    """In "smart" mode, matches report their casing and whether they are exact."""
    searcher = TextSearch("smart", "object")
    searcher.add("hi")

    # (text to search, expected .case, expected truthiness of .is_exact)
    expectations = (
        ("hi", "lower", True),
        ("HI", "upper", False),
        ("Hi", "title", False),
    )
    for text, expected_case, expected_exact in expectations:
        assert searcher.findall(text)[0].case == expected_case
        assert bool(searcher.findall(text)[0].is_exact) is expected_exact

    # Once the mixed-case spelling is registered itself, it matches exactly.
    searcher.add("hI")
    assert searcher.findall("hI")[0].case == "mixed"
    assert searcher.findall("hI")[0].is_exact
def test_foreign_chars():
    """With ``replace_foreign_chars=True``, "a" and "á" are interchangeable."""
    searcher = TextSearch("ignore", "norm", replace_foreign_chars=True)
    searcher.add("á", "A")

    variants = ("a", "á")
    # Each lookup style must give a truthy result for both spellings.
    probes = (
        lambda text: text in searcher,
        searcher.contains,
        searcher.findall,
        searcher.find_overlapping,
    )
    for probe in probes:
        for variant in variants:
            assert probe(variant)

    for variant in variants:
        assert searcher.replace(variant) == "A"
def ts_replacer(a, b):
    """Build a case-sensitive TextSearch mapping spellings of ``a`` to ``b``.

    Both arguments are passed through ``normalize``; the results are
    indexed, sliced and joined here, so they are expected to be sequences
    of strings. For every separator candidate the half-title, camelCase,
    UPPER, Title and lower spellings of ``a`` are registered with the
    matching spelling of ``b``; finally the literal pair ``a`` -> ``b``
    is registered.

    NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
    original -- verify against the upstream source.
    """
    # Alternative bounds once considered: ALPHANUM - ALPHA_LOWER before, ALPHANUM after.
    replacer = TextSearch("sensitive", "norm", BOUNDS, BOUNDS)

    # First separator (in this priority order) that occurs in b, else "".
    sep_in_b = next((sep for sep in (" ", "-", "_", ".") if sep in b), "")

    src_parts = normalize(a)
    dst_parts = normalize(b)

    def halftitle(parts, sep):
        # Title-case only the very first character of the joined string.
        return parts[0][0].title() + sep.join(parts)[1:]

    def camel(parts, sep):
        # First part lowercased, every following part title-cased.
        return sep.join([parts[0].lower()] + [part.title() for part in parts[1:]])

    # Open questions from the original author: which casing of word2 to
    # prefer depending on the casing of word1. Entries are added in this order.
    for sep in (".", "_", "-", sep_in_b, ""):
        replacer.add(halftitle(src_parts, sep), halftitle(dst_parts, sep))
        replacer.add(camel(src_parts, sep), camel(dst_parts, sep))
        # easy cases: one uniform casing applied to every part
        for recase in (str.upper, str.title, str.lower):
            src_text = sep.join([recase(part) for part in src_parts])
            dst_text = sep.join([recase(part) for part in dst_parts])
            replacer.add(src_text, dst_text)

    replacer.add(a, b)
    return replacer
def setup(self):
    """(Re)build ``self.tokenizer`` and register all configured word sources.

    Registers base cases, currencies and protected words, then optionally
    the HTTP handler, the domain handler, contractions and abbreviations,
    depending on the corresponding instance attributes.

    NOTE(review): the nesting of the contractions branch was reconstructed
    from a whitespace-collapsed original -- verify against upstream.
    """
    self.tokenizer = TextSearch("sensitive", "norm", set(), set())
    self.add_base_cases()
    self.add_currencies()
    self.add_words(self.protected_words)

    if self.handle_http:
        self.tokenizer.add_http_handler(keep_result=True)
        explanation = "regex: when it finds '{}' it will stop after it finds a space."
        for prefix in ("http://", "https://", "www."):
            self.explain_dict[prefix] = explanation.format(prefix)

    if self.handle_domains:
        self.add_domain_handler()

    if self.contractions:
        # ``contractions == True`` selects the built-in dictionaries.
        if self.contractions == True:
            defaults = {}
            self.contractions = defaults
            defaults.update(contractions_dict)
            defaults.update(leftovers_dict)
        self.add_words(self.contractions)

    if self.abbrevs:
        self.add_words(self.abbrevs)
def test_replace():
    """``replace`` substitutes a registered keyword with its norm."""
    searcher = TextSearch("sensitive", "norm")
    searcher.add("hi", "HI")

    replaced = searcher.replace("test hi test")
    assert replaced == "test HI test"
def test_replace_insensitive_keep_casing():
    """In "insensitive" mode the replacement follows the casing of the match."""
    searcher = TextSearch("insensitive", "norm")
    searcher.add("hi", "bye")

    cases = (
        ("test Hi test", "test Bye test"),
        ("test HI test", "test BYE test"),
    )
    for text, expected in cases:
        assert searcher.replace(text) == expected
def test_add_list():
    """``add`` accepts a list of keywords; each one is then found."""
    searcher = TextSearch("smart", "match")
    keywords = ["hi", "bye", "hello"]
    searcher.add(keywords)

    found = searcher.findall("hi bye hello")
    assert found == ["hi", "bye", "hello"]
def test_add_dict():
    """``add`` accepts a keyword -> norm dict; ``findall`` returns the norms."""
    searcher = TextSearch("smart", "norm")
    keyword_to_norm = {"hi": "greeting", "bye": "bye", "goodbye": "bye"}
    searcher.add(keyword_to_norm)

    found = searcher.findall("hi bye goodbye")
    assert found == ["greeting", "bye", "bye"]
def get_ts():
    """Return a case-insensitive, object-returning TextSearch filled with ``nlp_registry``."""
    searcher = TextSearch("insensitive", "object")
    searcher.add(nlp_registry)
    return searcher
def test_not_overlap_3():
    """``findall`` on "a a a" yields "a a" followed by the remaining "a"."""
    searcher = TextSearch("ignore", "norm")
    for keyword in ("a", "a a"):
        searcher.add(keyword)

    found = searcher.findall("a a a")
    assert found == ["a a", "a"]
return jsonify(res) # used for debug # return jsonify({"uuid":jsonfile['0']}) if __name__ == '__main__': """ The TextSearch class instance is loaded at the init of the flask API for performance reasons, meaning that all the inputs matrices are stored as in-memory objects Then at each call of the API, the searchTop method is called and uses the in-memory matrices to make the proximity calculation The largest file - document matrix - is (n documents) x (m vect components) 64 bit float 64 bits is not necessary, this can be optimized. """ with open("params.json", 'r') as stream: params = json.load(stream) modelfile = params['modelfile'] docmatrixfile = params['docmatrixfile'] textfile = params['textfile'] textSearch = TextSearch(modelfile, docmatrixfile, textfile) app.run(debug=True)
def test_http_no_keep():
    """With ``keep_result=False`` the HTTP handler yields no results for a URL."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_http_handler(keep_result=False)
    searcher.add("google")

    found = searcher.findall("http://google.com")
    assert found == []
def test_insensitive_object():
    """The first match object for "HI" reports ``end == 2``."""
    searcher = TextSearch("insensitive", "object")
    searcher.add("hi")

    first_match = searcher.findall("HI")[0]
    assert first_match.end == 2
def test_regex_norm():
    """A regex handler on the prefix "last " extends the match over one digit."""
    searcher = TextSearch("insensitive", "norm")
    prefixes = ["last "]
    searcher.add_regex_handler(prefixes, r"\d", keep_result=True)

    found = searcher.findall("last 5")
    assert found == ["last 5"]
def test_not_overlap():
    """A URL yields exactly one result when "http://" is also a plain keyword."""
    searcher = TextSearch("ignore", "norm")
    searcher.add("http://")
    searcher.add_http_handler(True)

    found = searcher.findall("https://vks.ai")
    assert len(found) == 1
def test_postfix_regex():
    """With ``prefix=False`` the regex is matched in front of the keyword."""
    searcher = TextSearch("ignore", "norm")
    keywords = ["products"]
    searcher.add_regex_handler(keywords, r"\d+ ", keep_result=True, prefix=False)

    found = searcher.findall("90 products")
    assert found == ["90 products"]
def test_overlap():
    """``find_overlapping`` reports all three matches within "hi hi"."""
    searcher = TextSearch("ignore", "norm")
    for keyword in ("hi", "hi hi"):
        searcher.add(keyword)

    overlapping = searcher.find_overlapping("hi hi")
    assert len(overlapping) == 3
def test_serializable():
    """Results produced with ``dict`` as return type can be dumped to JSON."""
    searcher = TextSearch("sensitive", dict)
    searcher.add("hi")

    matches = searcher.findall("hi")
    assert matches
    serialized = json.dumps(matches)
    assert serialized
def test_repr():
    """``repr`` is non-empty both with default and with explicit empty bounds."""
    constructor_args = (
        ("ignore", "match"),
        ("ignore", "match", set(), set()),
    )
    for args in constructor_args:
        assert repr(TextSearch(*args))
def test_http():
    """With ``keep_result=True`` the HTTP handler returns the whole URL."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_http_handler(keep_result=True)

    url = "http://google.com"
    assert searcher.findall(url) == ["http://google.com"]
def test_merge_handler():
    """The sum of two searchers has handlers when the left operand has one."""
    with_handler = TextSearch("sensitive", "norm")
    without_handler = TextSearch("sensitive", "norm")
    with_handler.add_http_handler(True)

    merged = with_handler + without_handler
    assert merged.handlers
def test_twitter():
    """The twitter handler returns "@..." and "#..." tokens whole."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_twitter_handler(keep_result=True)

    for token in ("@hello", "#hello"):
        assert searcher.findall(token) == [token]
def test_ignore_match():
    """In "ignore" mode the registered spelling is returned for any casing."""
    searcher = TextSearch("ignore", "match")
    searcher.add("hi")

    cases = (
        ("hi", ["hi"]),
        ("HI", ["hi"]),
        ("asdf", []),
    )
    for text, expected in cases:
        assert searcher.findall(text) == expected
def test_regex_overlap():
    """The regex-handler match comes first even when "last" is also a keyword."""
    searcher = TextSearch("insensitive", "object")
    searcher.add_regex_handler(["last "], r"\d", keep_result=True)
    searcher.add("last")

    first_match = searcher.findall("last 5")[0]
    assert first_match.norm == "last 5"
def test_not_overlap_2():
    """``replace`` on "hi hi" uses the norm of "hi hi", not of "hi" twice."""
    searcher = TextSearch("ignore", "norm")
    for keyword, norm in (("hi", "HI"), ("hi hi", "h h")):
        searcher.add(keyword, norm)

    replaced = searcher.replace("hi hi")
    assert replaced == "h h"
# Slang expansions; entries from ``unsafe_dict`` are merged in afterwards and
# overwrite any identical key defined here.
slang_dict = {
    "ima": "I am going to",
    "gonna": "going to",
    "gotta": "got to",
    "wanna": "want to",
    "woulda": "would have",
    "gimme": "give me",
    "asap": "as soon as possible",
    "u": "you",
    "r ": "are ",
}
slang_dict.update(unsafe_dict)

# Four pre-built searchers (case mode "ignore", returning "norm"), one per
# combination of the optional dictionaries on top of the contractions.

# contractions only
ts_basic = TextSearch("ignore", "norm")
ts_basic.add(contractions_dict)

# contractions + slang
ts_slang = TextSearch("ignore", "norm")
ts_slang.add(contractions_dict)
ts_slang.add(slang_dict)

# contractions + leftovers
ts_leftovers = TextSearch("ignore", "norm")
ts_leftovers.add(contractions_dict)
ts_leftovers.add(leftovers_dict)

# contractions + leftovers + slang
ts_leftovers_slang = TextSearch("ignore", "norm")
ts_leftovers_slang.add(contractions_dict)
ts_leftovers_slang.add(leftovers_dict)
ts_leftovers_slang.add(slang_dict)
def test_sensitive_match():
    """In "sensitive" mode only the exact casing of the keyword matches."""
    searcher = TextSearch("sensitive", "object")
    searcher.add("hi")

    same_case = searcher.findall("hi")
    assert same_case
    other_case = searcher.findall("HI")
    assert not other_case