Exemple #1
0
def normalize_list(inpath, outpath, separator):
    """Turn a gzipped list of terms into normalized form.

    Each input line is expected to look like ``<id><separator><term>``.
    For instance::

        123&&Term1
        124&&Term Number 2

    will become::

        123&&term1&&Term1
        124&&term number 2&&Term Number 2

    etc. Here the `separator` string is "&&" in both, input and
    output file.

    `inpath` is opened with `gzip`, `outpath` is written as a regular
    (uncompressed) file. Each term is passed through `filter_term`
    before it is normalized with `dinsort.normalize`.

    Lines are split at the first occurrence of `separator` only, so
    the term itself may contain the separator. A line that does not
    contain `separator` at all raises `ValueError`.

    Progress messages are printed to stdout.
    """
    # NOTE(review): the decode()/encode() calls below look like they
    # rely on Python 2 byte-string semantics -- confirm before running
    # this under Python 3.
    print("Opening %s for reading..." % inpath)
    with gzip.open(inpath, "r") as infile:
        with open(outpath, "w") as outfile:
            # Iterate over the file object itself instead of calling
            # readlines(): lines are streamed one by one and the
            # (possibly huge) decompressed input is never held in
            # memory as a whole.
            for cnt, line in enumerate(infile):
                if cnt % 100000 == 0:
                    # ping back every 10**5th entry
                    print("Done: %s entries" % cnt)
                # split only once: everything after the first
                # separator belongs to the term
                id_num, term = line.strip().split(separator, 1)
                term = filter_term(term)
                term = term.decode("utf-8")
                normalized = dinsort.normalize(term)
                out_term = "%s%s%s%s%s\n" % (
                    id_num, separator, normalized, separator, term)
                outfile.write(out_term.encode("utf-8"))
    print("Done. Written results to %s" % outpath)
Exemple #2
0
    def search(self, query_string):
        """Yield the ITerms found for `query_string`.

        A term is meant to match if its normalized title starts with
        `query_string`, where "normalized" means what `dinsort`
        defines as normalizing. The query string is normalized the
        same way before it is looked up.

        At most 10 entries are delivered. Why so few? Because the
        normally used autocomplete widget only displays 10 items and
        does its own sorting. If more entries were delivered, a term
        containing umlauts might be sorted relatively late by the
        widget, not show up among the ten displayed items and thus
        not be pickable at all. Delivering 10 entries only, we can be
        sure that the term picked first in *our* ordering is
        displayed by the autocomplete widget.
        """
        # "(" marks the lower bound of the lexicographical range as
        # exclusive.
        lower_bound = "(%s" % to_string(normalize(query_string))
        # NOTE(review): the upper bound "+" is open, so it looks like
        # only the limit of 10 (not the query prefix) ends the result
        # set -- verify that entries not starting with the query
        # cannot slip in.
        entries = self._get_client().zrangebylex(
            self.zset_name, lower_bound, "+", 0, 10)
        for raw_entry in entries:
            _normalized, token = self._split_entry(raw_entry)
            yield self.getTerm(token)
Exemple #3
0
 def test_sort_func_variant(self):
     # a sort func created with a variant normalizes with that variant
     key = sort_func(variant=VARIANT2)
     actual = key("Öre")
     expected = normalize("Öre", variant=VARIANT2)
     assert actual == expected
Exemple #4
0
 def test_sort_func_case_sensitive(self):
     # a sort func can be requested to work case sensitive
     key = sort_func(case_sensitive=True)
     actual = key("UpperCase")
     expected = normalize("UpperCase", case_sensitive=True)
     assert actual == expected
Exemple #5
0
def generate_output_filename(data):
    """Build an output name of the form ``<track_number>-<title>``.

    Spaces in ``data['title']`` are turned into dashes and commas are
    dropped, then the result is normalized with `dinsort.normalize`
    using `dinsort.VARIANT2`.
    """
    track_number = data['track_number']
    raw_title = data['title']
    dashed_title = raw_title.replace(' ', '-')
    cleaned_title = dashed_title.replace(',', '')
    normalized_title = dinsort.normalize(
        cleaned_title, variant=dinsort.VARIANT2)
    return "{}-{}".format(track_number, normalized_title)
Exemple #6
0
 def test_sort_func(self):
     # the default sort func gives the same result as plain normalize()
     key = sort_func()
     actual = key("Öre")
     expected = normalize("Öre")
     assert actual == expected
Exemple #7
0
 def test_lower_case(self):
     # normalizing lowercases terms, by default and with VARIANT2
     term = "FÖÖbar"
     assert normalize(term) == "foobar"
     assert normalize(term, variant=VARIANT2) == "foeoebar"
Exemple #8
0
 def test_normalize(self):
     # normalizing a plain string gives us some result
     result = normalize("string")
     assert result is not None
Exemple #9
0
 def test_case_sensitive_variant2(self):
     # VARIANT2 replacements are applied to upper case chars as well
     result = normalize("Öse", variant=VARIANT2, case_sensitive=True)
     assert result == "OEse"
Exemple #10
0
 def test_umlaut(self):
     # ä becomes a, both by default and with VARIANT1 requested
     umlaut = "ä"
     assert normalize(umlaut) == "a"
     assert normalize(umlaut, variant=VARIANT1) == "a"
Exemple #11
0
 def test_lower_case(self):
     # upper case input comes out lowercased (default and VARIANT2)
     default_result = normalize("FÖÖbar")
     assert default_result == "foobar"
     variant2_result = normalize("FÖÖbar", variant=VARIANT2)
     assert variant2_result == "foeoebar"
Exemple #12
0
 def test_case_sensitive(self):
     # with case_sensitive=True the case of the input is kept
     result = normalize("FÖöbar", case_sensitive=True)
     assert result == "FOobar"
Exemple #13
0
 def test_umlaut_variant2(self):
     # VARIANT2 turns ä into ae
     result = normalize("ä", variant=VARIANT2)
     assert result == "ae"
Exemple #14
0
 def test_umlaut(self):
     # the umlaut ä is reduced to a (default and explicit VARIANT1)
     default_result = normalize("ä")
     assert default_result == "a"
     variant1_result = normalize("ä", variant=VARIANT1)
     assert variant1_result == "a"
Exemple #15
0
 def test_sharp_s(self):
     # the sharp s is normalized to 'ss'
     result = normalize("ß")
     assert result == "ss"
Exemple #16
0
 def test_normalize(self):
     # normalize() returns something for a simple string
     normalized = normalize("string")
     assert normalized is not None
Exemple #17
0
 def test_diacritic_chars_are_removed_from_diacritic(self):
     # chars carrying diacritics (acutes, graves, tildes, etc.) are
     # reduced to their plain base chars
     mixed_case = "Čéñâça"
     upper_case = "ČÉÑÂÇA"
     assert normalize(mixed_case) == "cenaca"
     assert normalize(mixed_case, variant=VARIANT1) == "cenaca"
     assert normalize(upper_case, variant=VARIANT2) == "cenaca"
     assert normalize(upper_case) == "cenaca"
Exemple #18
0
 def test_sharp_s(self):
     # 'ß' and 'ss' are equal after normalizing
     expected = "ss"
     assert normalize("ß") == expected
Exemple #19
0
 def test_case_sensitive_variant2(self):
     # upper case umlauts are expanded by VARIANT2, too
     options = dict(variant=VARIANT2, case_sensitive=True)
     assert normalize("Öse", **options) == "OEse"
Exemple #20
0
 def test_umlaut_variant2(self):
     # with variant 2, ä is expanded to ae
     expected = "ae"
     assert normalize("ä", variant=VARIANT2) == expected
Exemple #21
0
 def test_sort_func_variant(self):
     # the variant passed to sort_func() is used by the returned func
     term = "Öre"
     variant2_key = sort_func(variant=VARIANT2)
     assert variant2_key(term) == normalize(term, variant=VARIANT2)
Exemple #22
0
 def test_case_sensitive(self):
     # case sensitiveness can be switched on
     expected = "FOobar"
     assert normalize("FÖöbar", case_sensitive=True) == expected
Exemple #23
0
 def test_sort_func_case_sensitive(self):
     # case sensitiveness is handed over to the returned sort func
     term = "UpperCase"
     case_key = sort_func(case_sensitive=True)
     assert case_key(term) == normalize(term, case_sensitive=True)
Exemple #24
0
 def test_diacritic_chars_are_removed_from_diacritic(self):
     # diacritics (acutes, graves, tildes, etc.) are stripped off and
     # only the plain chars remain
     expected = "cenaca"
     assert normalize("Čéñâça") == expected
     assert normalize("Čéñâça", variant=VARIANT1) == expected
     assert normalize("ČÉÑÂÇA", variant=VARIANT2) == expected
     assert normalize("ČÉÑÂÇA") == expected
Exemple #25
0
 def test_sort_func(self):
     # sort_func() without args behaves like normalize() without args
     term = "Öre"
     default_key = sort_func()
     assert default_key(term) == normalize(term)