def normalize_list(inpath, outpath, separator):
    """Turn a list of terms into normalized form.

    For instance::

        123&&Term1
        124&&Term Number 2

    will become::

        123&&term1&&Term1
        124&&term number 2&&Term Number 2

    etc. Here the `separator` string is "&&" in both, input and
    output file.
    """
    print("Opening %s for reading..." % inpath)
    # gzip.open(..., "r") yields *bytes* lines, so the separator must
    # be bytes as well for the split below.
    sep_bytes = separator.encode("utf-8")
    with gzip.open(inpath, "r") as infile:
        # We write UTF-8 encoded bytes, so the output file must be
        # opened in binary mode ("w" text mode would raise TypeError).
        with open(outpath, "wb") as outfile:
            # Iterate the file lazily instead of readlines() so memory
            # usage stays flat for large inputs.
            for cnt, line in enumerate(infile):
                if cnt % 100000 == 0:
                    # ping back every 100,000th entry
                    print("Done: %s entries" % cnt)
                id_num, term = line.strip().split(sep_bytes, 1)
                # filter_term() appears to work on (and return) raw
                # bytes; we decode afterwards — TODO confirm contract.
                term = filter_term(term)
                term = term.decode("utf-8")
                normalized = dinsort.normalize(term)
                # decode id_num so %s does not render a b'...' repr
                out_term = "%s%s%s%s%s\n" % (
                    id_num.decode("utf-8"), separator, normalized,
                    separator, term)
                outfile.write(out_term.encode("utf-8"))
    print("Done. Written results to %s" % outpath)
def search(self, query_string):
    """Return an iterable of ITerms matching `query_string`.

    A term matches if its normalized title starts with
    `query_string`, where "normalized" means whatever `dinsort`
    defines as normalizing.

    At most 10 entries are delivered. Why so few? The autocomplete
    widget normally used displays only 10 items and applies its own
    sorting. With more results, a term containing umlauts that the
    widget sorts relatively late might not appear among the first
    ten displayed items and would thus be unpickable. Yielding 10
    entries guarantees that the (in *our* ordering) first matching
    term is shown by the autocomplete widget.
    """
    normalized = normalize(query_string)
    # "(" makes the lower bound of the lexicographic range query
    lower_bound = "(%s" % to_string(normalized)
    matches = self._get_client().zrangebylex(
        self.zset_name, lower_bound, "+", 0, 10)
    for raw_entry in matches:
        _normalized, token = self._split_entry(raw_entry)
        yield self.getTerm(token)
def test_sort_func_variant(self):
    # the variant passed to sort_func is forwarded to normalize()
    expected = normalize("Öre", variant=VARIANT2)
    key_func = sort_func(variant=VARIANT2)
    assert key_func("Öre") == expected
def test_sort_func_case_sensitive(self):
    # case_sensitive is honored by the returned sort key function
    expected = normalize("UpperCase", case_sensitive=True)
    key_func = sort_func(case_sensitive=True)
    assert key_func("UpperCase") == expected
def generate_output_filename(data):
    """Return a filename of the form "<track_number>-<normalized-title>".

    Spaces in the title become dashes and commas are dropped before
    the title is normalized with dinsort VARIANT2 (e.g. ä -> ae).
    """
    slug = data['title'].replace(' ', '-').replace(',', '')
    normalized = dinsort.normalize(slug, variant=dinsort.VARIANT2)
    return "{}-{}".format(data['track_number'], normalized)
def test_sort_func(self):
    # sort_func() with no arguments behaves like plain normalize()
    expected = normalize("Öre")
    key_func = sort_func()
    assert key_func("Öre") == expected
def test_lower_case(self):
    # normalized terms come out lowercase, whatever the variant
    default_result = normalize("FÖÖbar")
    variant2_result = normalize("FÖÖbar", variant=VARIANT2)
    assert default_result == "foobar"
    assert variant2_result == "foeoebar"
def test_normalize(self):
    # normalizing a plain string yields a usable (non-None) result
    result = normalize("string")
    assert result is not None
def test_case_sensitive_variant2(self):
    # variant2 expansion (Ö -> OE) also applies to upper-case chars
    result = normalize("Öse", variant=VARIANT2, case_sensitive=True)
    assert result == "OEse"
def test_umlaut(self):
    # by default umlauts collapse to the bare vowel (ä -> a);
    # requesting VARIANT1 explicitly gives the same result
    implicit = normalize("ä")
    explicit = normalize("ä", variant=VARIANT1)
    assert implicit == "a"
    assert explicit == "a"
def test_case_sensitive(self):
    # with case_sensitive=True the original casing is preserved
    result = normalize("FÖöbar", case_sensitive=True)
    assert result == "FOobar"
def test_umlaut_variant2(self):
    # variant 2 expands umlauts to two-letter transcriptions (ä -> ae)
    result = normalize("ä", variant=VARIANT2)
    assert result == "ae"
def test_sharp_s(self):
    # the German sharp s is transcribed as a double 's'
    result = normalize("ß")
    assert result == "ss"
def test_diacritic_chars_are_removed_from_diacritic(self):
    # chars with diacritics (acutes, graves, tildas, etc.) are
    # stripped, for lowercase and uppercase input, in both variants
    lower_default = normalize("Čéñâça")
    lower_v1 = normalize("Čéñâça", variant=VARIANT1)
    upper_v2 = normalize("ČÉÑÂÇA", variant=VARIANT2)
    upper_default = normalize("ČÉÑÂÇA")
    assert lower_default == "cenaca"
    assert lower_v1 == "cenaca"
    assert upper_v2 == "cenaca"
    assert upper_default == "cenaca"
def test_case_sensitive_variant2(self):
    # variant2 normalizations take place also with upper case chars
    # NOTE(review): an identically named test method appears earlier
    # in this source; if both live in the same class, the later
    # definition shadows the earlier one — consider renaming.
    result = normalize("Öse", variant=VARIANT2, case_sensitive=True)
    assert result == "OEse"