def orthography(args):  # pragma: no cover
    """Write an initial orthography profile for a dataset to `orthography.tsv`."""
    ds = get_dataset(args)
    out = ds.dir.joinpath('orthography.tsv')
    if out.exists():
        if not confirm(
                'There already is an orthography profile for this dataset. Overwrite?',
                default=False):
            return
    # Count Unicode grapheme clusters across all raw lexemes.
    graphemes = Counter()
    for line in ds.iter_raw_lexemes():
        graphemes.update(grapheme_pattern.findall(line))
    # Write one row per grapheme, most frequent first, initially mapped to itself.
    with UnicodeWriter(out, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        for grapheme, frequency in graphemes.most_common():
            writer.writerow([grapheme, '{0}'.format(frequency), grapheme])
    log_dump(out, log=args.log)
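# Sketch of the assumption behind `grapheme_pattern`, which all functions in
# this section rely on: the "\X" (extended grapheme cluster) pattern is only
# supported by the third-party `regex` module, not by stdlib `re`. A minimal,
# hedged definition would be:
#
#     import regex
#     grapheme_pattern = regex.compile(r'\X', regex.UNICODE)
#
#     grapheme_pattern.findall('ba\u0301')  # 'a' + combining acute accent
#     # -> ['b', 'á']  (the combining mark stays attached to its base letter)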
def grapheme_clusters(self, word):
    r"""
    Tokenize a string into a list of Unicode grapheme clusters, using the
    "\X" regular expression.

    See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
    http://www.unicode.org/reports/tr29/

    Parameters
    ----------
    word : str
        A Unicode string to be tokenized into graphemes.

    Returns
    -------
    result : list
        List of Unicode graphemes in NFD.
    """
    # Match Unicode grapheme clusters with the precompiled "\X" pattern.
    return grapheme_pattern.findall(word)
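# Hypothetical usage of grapheme_clusters(), assuming the `grapheme_pattern`
# sketched above and a `tokenizer` instance of the enclosing class. Base
# letters and combining diacritics in NFD input stay together in one cluster:
#
#     import unicodedata
#     word = unicodedata.normalize('NFD', 'čáp')  # 5 code points
#     tokenizer.grapheme_clusters(word)
#     # -> ['č', 'á', 'p']  (3 grapheme clusters)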
@classmethod
def from_text(cls, text, mapping='mapping'):
    """
    Create a Profile instance from the Unicode graphemes found in `text`.

    Parameters
    ----------
    text : str
        The text to be scanned for grapheme clusters.
    mapping : str
        Name of the column holding the initial mapping, in which each
        grapheme is mapped to itself.

    Returns
    -------
    A Profile instance.
    """
    graphemes = Counter(grapheme_pattern.findall(text))
    specs = [
        OrderedDict([
            (cls.GRAPHEME_COL, grapheme),
            ('frequency', frequency),
            (mapping, grapheme)])
        for grapheme, frequency in graphemes.most_common()
    ]
    return cls(*specs)
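# Hypothetical usage of Profile.from_text(), assuming cls.GRAPHEME_COL names
# the grapheme column. Graphemes are listed by descending frequency, each
# initially mapped to itself:
#
#     profile = Profile.from_text('aab')
#     # built from specs like:
#     #   {GRAPHEME_COL: 'a', 'frequency': 2, 'mapping': 'a'}
#     #   {GRAPHEME_COL: 'b', 'frequency': 1, 'mapping': 'b'}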