Example #1
from clldutils.text import split_text, strip_brackets


def my_tokenizer(form, prf):
    # Split multi-variant entries at '/', ',' and '~'; note that only the
    # last variant survives this loop, since `form` is reassigned each time.
    value = form.strip()
    for form in split_text(value, separators='/,~', strip=True):
        value = form.strip()
        # Mark word boundaries; internal spaces get a '{}' marker in front.
        form = "^%s$" % form.replace(" ", "{} ")
        # Drop any content in square brackets.
        form = strip_brackets(form, brackets={'[': ']'})

    # Greedy longest-match tokenization against the profile `prf`.
    i = 0
    tokens = []
    while True:
        added = False
        for length in range(len(form[i:]), 0, -1):
            needle = form[i:i + length]
            if needle in prf:
                tokens.append(prf[needle])
                i += length
                added = True
                break

        if not added:
            # No profile match: spaces become '#', unknown chars are wrapped.
            if form[i] == ' ':
                tokens.append("#")
            else:
                tokens.append('<%s>' % form[i])
            i += 1

        if i == len(form):
            break

    # Remove NULLs
    tokens = [token for token in tokens if token != "NULL"]

    return ' '.join(tokens)
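
A usage sketch with a made-up orthography profile; prf maps substrings to output tokens, and "NULL" entries are dropped from the result:

prf = {'^': 'NULL', '$': 'NULL', 'ts': 'ts', 'a': 'a'}
print(my_tokenizer('tsa', prf))  # -> 'ts a'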
Example #2
from collections import defaultdict

from lingpy import basictypes


def cogids2cogid(wordlist,
                 ref="cogids",
                 cognates="cogid",
                 morphemes="morphemes"):
    # C maps word indices to the new whole-word cognate IDs,
    # M maps word indices to morpheme glosses.
    C, M = {}, {}
    current = 1
    for concept in wordlist.rows:
        base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_")
        idxs = wordlist.get_list(row=concept, flat=True)
        cogids = defaultdict(list)
        for idx in idxs:
            M[idx] = [c for c in wordlist[idx, ref]]
            for cogid in basictypes.ints(wordlist[idx, ref]):
                cogids[cogid] += [idx]
        for i, (cogid, idxs) in enumerate(
                sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True)):
            for idx in idxs:
                if idx not in C:
                    C[idx] = current
                    M[idx][M[idx].index(cogid)] = base
                else:
                    M[idx][M[idx].index(cogid)] = "_" + base.lower()
            current += 1
    wordlist.add_entries(cognates, C, lambda x: x)
    if morphemes:
        wordlist.add_entries(morphemes, M, lambda x: x)
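
A usage sketch; 'partial-cognates.tsv' is a hypothetical input file that must contain the partial-cognate column named by ref ("cogids"):

from lingpy import Wordlist

wl = Wordlist('partial-cognates.tsv')
cogids2cogid(wl)  # adds "cogid" and "morphemes" entries in place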
Example #3

from collections import defaultdict

import sinopy
from clldutils.text import strip_brackets


def wikibooks():

    with open('wikibooks.txt') as f:
        data = f.readlines()
    out = []
    gsr = defaultdict(dict)
    for i, line in enumerate(data):
        line = strip_brackets(line.strip().replace('\t', ' '),
                              brackets={'(': ')'})
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = ''
                    karlgren = kgsc[0]
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                    continue  # skip malformed entries rather than reuse stale values

                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''

                # Strip trailing commas from the Middle Chinese readings.
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                if len(karlgren) not in [4, 5, 6]:
                    print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
                elif not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                             karlgren, '', 'Karlgren1954')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016a')]
                        gsr[char][reading] = [pinyin, reading, karlgren]

    with open('karlgren.tsv', 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')

    return gsr
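
The parser above implies roughly the following input format, reconstructed from the code rather than from the actual Wikibooks dump:

# Sketch of the assumed 'wikibooks.txt' line format:
#
#   * schuessler/0001a 人|人 reading1, reading2
#
# i.e. '*', an optional Schuessler ID plus a Karlgren ID (4-6 characters),
# the character (possibly as 'variant|character'), and comma-separated
# Middle Chinese readings.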
Example #4
import unicodedata

from clldutils.text import strip_brackets


def normalize_grapheme(text):
    """
    Apply simple, non-CLTS, normalization.
    """
    new_text = unicodedata.normalize("NFD", text)

    # Strip a single pair of enclosing parentheses, if present.
    if new_text and new_text[0] == "(" and new_text[-1] == ")":
        new_text = new_text[1:-1]

    new_text = strip_brackets(new_text)
    if new_text:
        return new_text
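
Doctest-style sketches of the resulting behavior, assuming clldutils' strip_brackets:

normalize_grapheme('(a)')   # -> 'a'   (outer parentheses stripped)
normalize_grapheme('a(ː)')  # -> 'a'   (bracketed content removed)
normalize_grapheme('( )')   # -> None  (nothing left after stripping)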
Example #5
File: forms.py Project: liualg/pylexibank
    def clean(self, form, item=None):
        """
        Called when a row is added to a CLDF dataset.

        :param form: the raw form value taken from the source row.
        :return: None to skip the form, or the cleaned form as string.
        """
        if form not in self.missing_data:
            if self.normalize_whitespace:
                form = re.sub(r'\s+', ' ', form.strip())
            for source, target in self.replacements:
                form = form.replace(source, target)
            if self.strip_inside_brackets:
                return text.strip_brackets(form, brackets=self.brackets)
            return form
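
A usage sketch, assuming pylexibank's FormSpec (the attribute names match the method above; the argument values are illustrative):

from pylexibank import FormSpec

spec = FormSpec(replacements=[('_', ' ')], missing_data=('?',))
spec.clean('fo_rm')  # -> 'fo rm'
spec.clean('?')      # -> None, missing data is skipped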
Example #6
import re

from clldutils.text import strip_brackets
from sinopy import is_chinese  # assuming sinopy exposes is_chinese at top level


def clean_entry(string):
    """simple replacements to enhance segmentation"""
    st = [(' (GEM)', ''), (' (NEP)', ''), (' LO$', ''), (' L$', ''),
          (' C$', ''), (' ING$', ''), (' TIB', ''), (' NEP', ''), ('?', ''),
          (':', 'ː'), ('.', ''), ('=', ''), (' ', '_'), ('_LF', ''), ('*', ''),
          ('^', ''), ('$', '')]
    # Map ASCII digits to superscript tone numbers.
    mapper = dict(zip('0123456789', '⁰¹²³⁴⁵⁶⁷⁸⁹'))
    mapper.update(dict(st))
    string = re.split(';|/|,|>', string)[0]
    string = '^' + string + '$'
    for s, t in st:
        string = string.replace(s, t)
    for s in [' ', '_', '-']:
        string = string.strip(s)
    string = ''.join([x for x in string if not is_chinese(x)])
    return strip_brackets(''.join([mapper.get(s, s) for s in string.strip()]))
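
Tracing the replacement table above, for example:

clean_entry('ka:55 (GEM)')  # -> 'kaː⁵⁵'
clean_entry('a; b')         # -> 'a', everything after the first ';' is cut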
Example #7
def clean_form(self, item, form):
    if form not in ["*", "---", ""]:
        return split_text(strip_brackets(form), ",;/")[0]
Example #8
import re
import unicodedata

from clldutils.text import split_text, strip_brackets
from lingpy import ipa2tokens


def clean_string(
        sequence, semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
        segmentized=False, rules=None, ignore_brackets=True, brackets=None,
        split_entries=True, splitters='/,;~', preparse=None,
        merge_geminates=True, normalization_form="NFC"):
    """
    Function exhaustively checks how well a sequence is understood by LingPy.

    Parameters
    ----------
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    segmentized : bool (default=False)
        Indicate whether the input string is already segmentized or not. If set
        to True, items in brackets can no longer be ignored.
    rules : dict
        Replacement rules to be applied to a segmentized string.
    ignore_brackets : bool
        If set to True, ignore all content within a given bracket.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    split_entries : bool (default=True)
        Indicate whether multiple entries (with a comma etc.) should be split
        into separate entries.
    splitters : str
        The characters which force the automatic splitting of an entry.
    preparse : list
        List of tuples, giving simple replacement patterns (source and target),
        which are applied before any other processing starts.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    normalization_form : str (default="NFC")
        The unicode normalization form applied to the input sequence.

    Returns
    -------
    cleaned_strings : list
        A list of cleaned strings which are segmented by space characters. If
        splitters are encountered, indicating that the entry contains two
        variants, the list will contain one for each element in a separate
        entry. If there are no splitters, the list has only size one.
    """
    sequence = unicodedata.normalize(normalization_form, sequence)
    rules = rules or {}
    preparse = preparse or []

    # replace white space if not indicated otherwise
    if segmentized:
        segment_list = [
            sequence.split(' ') if not isinstance(sequence, (list, tuple))
            else sequence]
    else:
        for s, t in preparse:
            sequence = sequence.replace(s, t)
        segment_list = []
        if ignore_brackets:
            new_sequence = strip_brackets(sequence, brackets=brackets)
        else:
            new_sequence = sequence

        # splitting needs to be done afterwards
        if split_entries:
            new_sequences = split_text(
                new_sequence, splitters,
                brackets='' if not ignore_brackets else brackets)
        else:
            new_sequences = [new_sequence]

        for new_sequence in new_sequences:
            segments = ipa2tokens(
                    re.sub(r'\s+', '_', new_sequence.strip()),
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    merge_geminates=merge_geminates)
            segment_list += [segments]
    out = []
    for segments in segment_list:
        segments = [rules.get(s, s) for s in segments]
        out += [' '.join(segments)]
    return out
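
A usage sketch; the exact segmentation comes from lingpy's ipa2tokens and depends on its sound-class models:

variants = clean_string('kito, kitu (arch.)')
assert len(variants) == 2  # split at ',' into two entries
# each entry is a space-separated segment string, e.g. 'k i t o'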
Example #9
from clldutils import text


def test_strip_brackets():
    strings = ['arm((h)an[d])', '(hand)arm', 'a(hand)r(hand)m(hand)', 'arm⁽hand⁾']
    for string in strings:
        assert text.strip_brackets(string) == 'arm'
    assert text.strip_brackets('arm<hand>', brackets={"<": ">"}) == 'arm'
Example #10
query = r"""
select ?lexemeId ?lemma WHERE {{
  ?lexemeId <http://purl.org/dc/terms/language> wd:Q1860;
            wikibase:lemma ?lemma.
  FILTER (regex(?lemma, '^{0}$'))
}}
"""

findings = defaultdict(list)
mappings, concepticon = get_mappings()
visited = set()

words = defaultdict(list)
for w, vals in mappings['en'].items():
    w2 = strip_brackets(w.lower().strip())
    if w2 not in words:
        words[w2] = vals[0][0]

with open('wikidata.tsv', 'r') as f:
    for line in f:
        if line.strip():
            cid = line.split('\t')[0]
            visited.add(cid)

print(len(words))
with open('fails.tsv', 'r') as f:
    for line in f:
        visited.add(line.strip())

current = ''
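
The doubled braces make the query a format template; a sketch of filling the {0} placeholder for one gloss:

gloss = strip_brackets('arm (body part)').lower()
sparql = query.format(gloss)  # the regex now matches lemmas equal to 'arm'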
Example #11
def name(self):
    # Extract the name from a 'tree NAME = ...' statement, ignoring [...] comments.
    m = re.search(r'tree\s+(?P<name>[^=]+)\s*=', strip_brackets(self, {'[': ']'}))
    if m:
        return m.group('name').strip()
Example #12
def clean_form(self, item, form):
    if form not in ["*", "---", "-"]:
        form = strip_brackets(split_text(form, separators=";,/")[0])
        return form.replace(" ", "_")
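
A sketch of the resulting cleanup (item is unused by the logic above):

# 'arm (body part); hand' -> first variant 'arm (body part)'
#                         -> brackets stripped -> 'arm'
# 'big dog'               -> 'big_dog'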