Exemple #1
0
 def test_empty(self):
     """Every normalisation helper should map None to None."""
     for func in (slugify, ascii_text, latinize_text, normalize):
         self.assertEqual(None, func(None))
     # Whitespace-only input normalises away entirely.
     self.assertEqual(None, normalize(''))
     self.assertEqual(None, normalize(' '))
Exemple #2
0
def index_form(texts):
    """Turn a set of strings into the appropriate form for indexing."""
    results = []
    total_len = 0

    for raw in texts:
        # Cap the amount of text stored per document at INDEX_MAX_LEN.
        if total_len > INDEX_MAX_LEN:
            # TODO: there might be nicer techniques for dealing with overly
            # long text buffers?
            results = list(set(results))
            total_len = sum(len(item) for item in results)
            if total_len > INDEX_MAX_LEN:
                break

        cleaned = stringify(raw)
        if cleaned is None:
            continue
        cleaned = collapse_spaces(cleaned)
        results.append(cleaned)
        total_len += len(cleaned)

        # Also index a latinised version when it differs from the text.
        latin = stringify(latinize_text(cleaned))
        if latin is None or latin == cleaned:
            continue
        results.append(latin)
        total_len += len(latin)
    return results
Exemple #3
0
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index.

    Copies schema information onto the document, collects indexable text
    from the free-text properties, generates name fingerprints and
    latinised name variants, and back-fills the creation timestamp.
    """
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        # Structured value types carry no useful free text for indexing.
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)

    data = schema.invert(data)
    data['text'] = index_form(texts)

    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Add latinised names. latinize_text can return None (it does for
    # empty input); guard so None does not leak into data['names'].
    for name in list(names):
        latin = latinize_text(name)
        if latin is not None:
            names.append(latin)
    data['names'] = list(set(names))

    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
Exemple #4
0
def normalize_strong(text):
    """Perform heavy normalisation of a given text.

    The goal of this function is not to retain a readable version of the given
    string, but rather to yield a normalised version suitable for comparisons
    and machine analysis.
    """
    latin = latinize_text(string_value(text))
    if latin is None:
        return None
    replaced = category_replace(latin.lower())
    return collapse_spaces(replaced)
Exemple #5
0
def search_term(term):
    """Normalise a query term for search; return None when unusable."""
    if term is None:
        return None
    latin = latinize_text(term)
    if latin is None:
        return None
    # Strip quoting, surrounding whitespace and case before matching.
    cleaned = latin.replace('"', ' ').strip().lower()
    for stopword in STOPWORDS:
        if cleaned.startswith(stopword):
            cleaned = cleaned[len(stopword):]
    # Very short terms are not worth searching on.
    return cleaned if len(cleaned) >= 4 else None
Exemple #6
0
def pick_name(names: Tuple[str, ...], all_names: Tuple[str, ...]) -> Optional[str]:
    """Pick the most representative spelling out of *names*.

    Builds a candidate pool from *all_names* plus the title-cased latin
    transliteration of each, scores every candidate by the sum of its
    Levenshtein distances to all other candidates, and returns the member
    of *names* with the lowest aggregate distance.

    NOTE(review): when fewer than two candidates exist, no pairs are
    scored and this returns None even if *names* is non-empty — confirm
    that is intended.
    """
    candidates: List[str] = []
    for name in all_names:
        candidates.append(name)
        latin = latinize_text(name)
        if latin is not None:
            candidates.append(latin.title())

    # Accumulate each candidate's total edit distance to every other
    # candidate; inputs are truncated to 128 chars to bound the cost.
    scores: Dict[str, int] = defaultdict(int)
    for pair in combinations(candidates, 2):
        left, right = sorted(pair)
        dist = Levenshtein.distance(left[:128], right[:128])
        scores[left] += dist
        scores[right] += dist

    # Lowest total distance first; return the first candidate that is
    # one of the requested names.
    for cand, _ in sorted(scores.items(), key=lambda x: x[1]):
        if cand in names:
            return cand
    return None
Exemple #7
0
 def test_petro(self):
     """Ukrainian Cyrillic is transliterated consistently by each helper."""
     text = u'Порошенко Петро Олексійович'
     expectations = [
         (slugify, 'porosenko-petro-oleksijovic'),
         (ascii_text, 'Porosenko Petro Oleksijovic'),
         (latinize_text, u'Porošenko Petro Oleksíjovič'),
         (normalize, u'порошенко петро олексіиович'),
     ]
     for func, expected in expectations:
         self.assertEqual(expected, func(text))
Exemple #8
0
 def test_petro(self):
     """Ukrainian Cyrillic is transliterated consistently by each helper."""
     text = u"Порошенко Петро Олексійович"
     checks = (
         (slugify, "porosenko-petro-oleksijovic"),
         (ascii_text, "Porosenko Petro Oleksijovic"),
         (latinize_text, u"Porošenko Petro Oleksíjovič"),
         (normalize, u"порошенко петро олексіиович"),
     )
     for func, expected in checks:
         self.assertEqual(expected, func(text))
Exemple #9
0
def latin_alt(value):
    """Make a latin version of a string and return it if it differs
    (case-insensitively) from the input.

    Returns None when transliteration fails or yields the same text.
    """
    trans_value = latinize_text(value)
    # latinize_text returns None for None/empty input; previously this
    # crashed with AttributeError on .lower().
    if trans_value is None:
        return None
    if trans_value.lower() != value.lower():
        return trans_value
Exemple #10
0
# coding: utf-8
"""Demo script: print each normalisation of a few multi-script names."""
from normality import normalize, latinize_text, ascii_text, slugify

SAMPLES = [
    u'Порошенко Петро Олексійович',
    u'FUAD ALIYEV ƏHMƏD OĞLU',
    u'Häschen Spaß',
    u'ავლაბრის ფონდი',
]

for sample in SAMPLES:
    # Python 2 print statements are syntax errors under Python 3;
    # use the print() function instead.
    print('SAMPLE :', sample)
    print('  NORM :', normalize(sample))
    print('  SLUG :', slugify(sample))
    print('  LATIN:', latinize_text(sample))
    print('  ASCII:', ascii_text(sample))