def _get_process_pron(
    self,
    stress: bool,
    syllable_boundaries: bool,
    segment: bool,
    tone: bool,
) -> Callable[[Pron], Pron]:
    processors = []
    if not stress:
        processors.append(functools.partial(re.sub, r"[ˈˌ]", ""))
    if not syllable_boundaries:
        processors.append(functools.partial(re.sub, r"\.", ""))
    if not tone:
        processors.append(functools.partial(re.sub, _PARENS_REGEX, ""))
        processors.append(functools.partial(re.sub, _TONES_REGEX, ""))
    if segment:
        processors.append(functools.partial(segments.Tokenizer(), ipa=True))
    prosodic_markers = frozenset(["ˈ", "ˌ", "."])

    def wrapper(pron):
        for processor in processors:
            pron = processor(pron)
        # GH-59: Skip prons that are empty, or have only stress marks or
        # syllable boundaries.
        if any(ch not in prosodic_markers for ch in pron):
            return pron

    return wrapper
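# A minimal, standalone sketch of the processor-chain pattern used above,
# outside the class; the sample pronunciation is invented and only the
# `segments` package is assumed.
import functools
import re

import segments

processors = [
    functools.partial(re.sub, r"[ˈˌ]", ""),  # strip stress marks
    functools.partial(re.sub, r"\.", ""),  # strip syllable boundaries
    functools.partial(segments.Tokenizer(), ipa=True),  # space-separate segments
]

pron = "ˈkʰæt.nɪp"
for processor in processors:
    pron = processor(pron)
print(pron)  # expected: something like "kʰ æ t n ɪ p"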
def __init__(self, language, logger=get_logger()):
    self.logger = logger
    self.logger.info(
        'initializing backend %s-%s', self.name(), self.version())
    profile = self._load_g2p_profile(language)
    self.tokenizer = segments.Tokenizer(profile=profile)
def _init_language(self, language):
    # load the grapheme to phoneme mapping
    profile = self._load_g2p_profile(language)
    self._tokenizer = segments.Tokenizer(profile=profile)
    # this is the language code
    return pathlib.Path(language).stem
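# Hedged sketch of what the loaded tokenizer does with a grapheme-to-phoneme
# profile; the three-rule profile below is an invented stand-in for whatever
# _load_g2p_profile returns, not part of phonemizer itself.
import segments

profile = segments.Profile(
    {"Grapheme": "ch", "IPA": "ʃ"},
    {"Grapheme": "a", "IPA": "a"},
    {"Grapheme": "t", "IPA": "t"},
)
tokenizer = segments.Tokenizer(profile=profile)
# Map graphemes to the IPA column, as the backend does for each word.
print(tokenizer("chat", column="IPA"))  # expected: "ʃ a t"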
def _get_process_pron(
    self,
    no_stress: bool,
    no_syllable_boundaries: bool,
    no_segment: bool,
) -> Callable[[Pron], Pron]:
    # segments v2.1.2 oddly sets a global logging configuration
    # that interferes with downstream logging.
    # See: https://github.com/cldf/segments/issues/47
    import segments

    processors = []
    if no_stress:
        processors.append(functools.partial(re.sub, r"[ˈˌ]", ""))
    if no_syllable_boundaries:
        processors.append(functools.partial(re.sub, r"\.", ""))
    if not no_segment:
        processors.append(functools.partial(segments.Tokenizer(), ipa=True))
    prosodic_markers = frozenset(["ˈ", "ˌ", "."])

    def wrapper(pron):
        for processor in processors:
            pron = processor(pron)
        # GH-59: Skip prons that are empty, or have only stress marks or
        # syllable boundaries.
        if any(ch not in prosodic_markers for ch in pron):
            return pron

    return wrapper
def trim(self, ipa_col=IPA_COLUMN):
    # Make a copy of the profile (so we don't change it in place)
    new_profile = collections.OrderedDict()
    for g, entry in self.graphemes.items():
        spec = copy.copy(entry)
        spec[self.GRAPHEME_COL] = g
        new_profile[g] = spec

    # Collect all keys, so that we can gradually remove them; those with
    # ^ and $ go first
    graphemes = list(new_profile.keys())
    bound_graphemes = [
        grapheme for grapheme in graphemes
        if grapheme[0] == "^" and grapheme[-1] == "$"
    ]
    bound_graphemes += [
        grapheme for grapheme in graphemes
        if grapheme[0] == "^" and grapheme[-1] != "$"
    ]
    bound_graphemes += [
        grapheme for grapheme in graphemes
        if grapheme[0] != "^" and grapheme[-1] == "$"
    ]
    check_graphemes = bound_graphemes + sorted(
        [
            grapheme for grapheme in graphemes
            if len(grapheme) > 1 and grapheme not in bound_graphemes
        ],
        key=len,
        reverse=True,
    )

    # For each candidate, apply a profile built without its rule; if the
    # tokenization is unchanged, the rule is redundant and can be dropped
    # (still expensive, but orders of magnitude less expensive than making
    # a full copy of the profile at each iteration)
    removed = 0
    for grapheme in check_graphemes:
        if grapheme in new_profile:
            ipa = new_profile[grapheme][ipa_col]
            # Obtain the segments without the current rule
            t = segments.Tokenizer(profile=Profile(*[
                copy.copy(s) for g, s in new_profile.items() if g != grapheme
            ]))
            if t(grapheme, column=ipa_col) == ipa:
                # If the resulting segments match the `ipa` reference,
                # we can delete the rule
                removed += 1
                del new_profile[grapheme]

    for g in set(self.graphemes.keys()) - set(new_profile.keys()):
        del self.graphemes[g]
    self.recreate_tree()

    return removed
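# Small illustration of the redundancy trim() removes, with an invented
# profile: the multi-character "an" rule is derivable from the
# single-character rules, so tokenization is identical with or without it.
import segments

profile = segments.Profile(
    {"Grapheme": "a", "IPA": "a"},
    {"Grapheme": "n", "IPA": "n"},
    {"Grapheme": "an", "IPA": "a n"},  # redundant; trim() would delete this rule
)
t = segments.Tokenizer(profile=profile)
print(t("an", column="IPA"))  # "a n" either way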
def __init__(self, language,
             punctuation_marks=Punctuation.default_marks(),
             preserve_punctuation=False,
             logger=get_logger()):
    self.logger = logger
    self.logger.info(
        'initializing backend %s-%s', self.name(), self.version())

    # load the grapheme to phoneme mapping
    profile = self._load_g2p_profile(language)
    self.tokenizer = segments.Tokenizer(profile=profile)

    # setup punctuation processing
    self.preserve_punctuation = preserve_punctuation
    self._punctuator = Punctuation(punctuation_marks)
def augment(self, forms, clts=None, ipa_col=IPA_COLUMN):
    """
    Applies the profile to a wordlist, adding frequency and example
    columns (and SCA classes, if a CLTS reference is given) to each
    grapheme entry in place.
    """
    self.column_labels.add('FREQUENCY')
    if clts:
        self.column_labels.add('SCA')
    self.column_labels.add('EXAMPLES')

    freqs = collections.Counter()
    ex = collections.defaultdict(list)
    t = segments.Tokenizer(profile=self)
    for form in forms:
        graphemes = t(self.segmentable_form(form)).split()
        freqs.update(graphemes)
        for g in graphemes:
            ex[g].append(form[1:-1])

    for g, spec in self.graphemes.items():
        spec['FREQUENCY'] = freqs.get(g, 0)
        spec['EXAMPLES'] = ";".join(ex.get(g, [])[:5])
        if clts:
            spec['SCA'] = ipa2sca(spec[ipa_col], clts)
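# Hedged recreation of the counting core of augment(), with invented forms;
# segmentable_form and the surrounding Profile class are not reproduced, and
# a default (profile-less) tokenizer stands in for the real one.
import collections

import segments

t = segments.Tokenizer()
forms = ["^kat$", "^tak$"]
freqs = collections.Counter()
ex = collections.defaultdict(list)
for form in forms:
    graphemes = t(form[1:-1], ipa=True).split()
    freqs.update(graphemes)
    for g in graphemes:
        ex[g].append(form[1:-1])
print(freqs["a"], ex["a"][:5])  # 2 ['kat', 'tak']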
import hashlib
import typing as t
from pathlib import Path

import pycldf
import pyclts
import segments

import cldfbench
import cldfcatalog
import lingpy
import lingpy.compare.partial

clts_path = cldfcatalog.Config.from_file().get_clone("clts")
clts = cldfbench.catalogs.CLTS(clts_path)
bipa = clts.api.bipa

tokenizer = segments.Tokenizer()


def sha1(path):
    return hashlib.sha1(str(path).encode("utf-8")).hexdigest()[:12]


def clean_segments(segment_string: t.List[str]) -> t.Iterable[pyclts.models.Symbol]:
    """Reduce the row's segments to not contain empty morphemes.

    This function removes all unknown sound segments (/0/) from the segments
    string it is passed, and removes empty morphemes by collapsing subsequent
    morpheme boundary markers (_#◦+→←) into one.

    >>> segments = "+ _ t a + 0 + a t"
    >>> c = clean_segments(segments)
def get_wordlist(
        self,
        doculect='base',
        profile=False,
        ref='crossid',
        lexstat=True,
        threshold=0.4):
    """
    Return a classical wordlist from the data.
    """
    if profile:
        profile = segments.Tokenizer(profile)
        tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
    else:
        tokenize = lingpy.ipa2tokens

    D = {
        0: [
            'doculect', 'concept', 'concept_in_source', 'concept_type',
            'form', 'tokens', 'occurrences', 'word_forms', 'gloss_forms',
            'phrase_example', 'gloss_example', 'references',
        ]
    }
    idx = 1
    for ctype in ['lexicon', 'grammar']:
        concepts = self.get_concepts(ctype=ctype)
        concordance = self._concordances[ctype]
        for concept, entries in concepts.items():
            for form, lid, cis, freq in entries:
                # retrieve the concordance
                pidx, sA, sB = concordance[form, concept, cis, lid][0]
                txt = self[pidx].phrase
                gls = self[pidx].gloss
                word, fgls = self[pidx, sA]
                tokens = tokenize(form)
                references = ' '.join(
                    '{0}:{1}:{2}'.format(a, b, c)
                    for a, b, c in concordance[form, concept, cis, lid])
                # check tokens
                try:
                    lingpy.tokens2class(tokens, 'sca')
                    check = True
                except:  # noqa: E722, pragma: no cover
                    check = False
                if concept.strip() and check:
                    D[idx] = [
                        doculect if self.monolingual else lid,
                        concept, cis, ctype, form, tokens, freq,
                        word, fgls, txt, gls, references]
                    idx += 1
                else:
                    print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                        concept, form, tokens, pidx, sA, sB))
    wl = lingpy.Wordlist(D)
    if lexstat:
        wl = lingpy.LexStat(D)
        wl.cluster(method='sca', threshold=threshold, ref=ref)
    else:
        wl.add_entries(
            'cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
        wl.renumber('cog', ref)
    return wl
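# Hypothetical sketch of the fallback cognate-ID path (lexstat=False) on a
# tiny hand-built lingpy wordlist; the data rows are invented.
import lingpy

D = {
    0: ['doculect', 'concept', 'form'],
    1: ['base', 'hand', 'mano'],
    2: ['other', 'hand', 'mano'],
}
wl = lingpy.Wordlist(D)
wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
wl.renumber('cog', 'crossid')
print(wl[1, 'crossid'] == wl[2, 'crossid'])  # True: same concept and form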