import re

from clldutils import text


def test_split_text():
    assert text.split_text('arm han( )d')[1] == 'hand'
    assert text.split_text('arm han( )d', brackets={})[1] == 'han('
    assert text.split_text('arm h[\t]and foot')[2] == 'foot'
    assert text.split_text('arm \t\n hand')[1] == 'hand'
    assert text.split_text('arm ')[0] == 'arm'
    assert text.split_text('a(b)c d[e]f', brackets={'(': ')'}) == ['ac', 'd[e]f']
    assert text.split_text('a b c') == ['a', 'b', 'c']
    assert text.split_text('a/b/c', separators=re.compile('/b/')) == ['a', 'c']
    assert text.split_text('a/b/c', separators='/') == ['a', 'b', 'c']
    assert text.split_text('a , b\t; c;', separators=',;', strip=True) == ['a', 'b', 'c']
def my_tokenizer(form, prf):
    value = form.strip()
    # Only the first variant (split at '/', ',' or '~') is tokenized.
    for form in split_text(value, separators='/,~', strip=True):
        value = form.strip()
        form = "^%s$" % form.replace(" ", "{} ")
        form = strip_brackets(form, brackets={'[': ']'})
        i = 0
        tokens = []
        while True:
            added = False
            # Greedy longest-match lookup against the orthography profile.
            for length in range(len(form[i:]), 0, -1):
                needle = form[i:i + length]
                if needle in prf:
                    tokens.append(prf[needle])
                    i += length
                    added = True
                    break
            if not added:
                if form[i] == ' ':
                    tokens.append("#")
                else:
                    # Flag characters missing from the profile.
                    tokens.append('<%s>' % form[i])
                i += 1
            if i == len(form):
                break
        # Remove NULLs
        tokens = [token for token in tokens if token != "NULL"]
        return ' '.join(tokens)
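# Minimal usage sketch for my_tokenizer. The profile dict `prf` below is
# hypothetical; real profiles map orthography substrings to sound segments,
# and '^'/'$' are the word boundaries the tokenizer adds itself:
prf = {'^': 'NULL', '$': 'NULL', 'kh': 'kʰ', 'a': 'a'}
assert my_tokenizer('kha', prf) == 'kʰ a'  # greedy match: 'kh' wins over 'k'
assert my_tokenizer('xa', prf) == '<x> a'  # unmatched characters are flagged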
def cogids2cogid(wordlist, ref="cogids", cognates="cogid", morphemes="morphemes"):
    C, M = {}, {}
    current = 1
    for concept in wordlist.rows:
        base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_")
        idxs = wordlist.get_list(row=concept, flat=True)
        # Map each partial (morpheme-level) cognate id to the word indices it occurs in.
        cogids = defaultdict(list)
        for idx in idxs:
            M[idx] = [c for c in wordlist[idx, ref]]
            for cogid in basictypes.ints(wordlist[idx, ref]):
                cogids[cogid] += [idx]
        # Assign word-level cognate ids, most frequent partial cognate sets first.
        for i, (cogid, idxs) in enumerate(
                sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True)):
            for idx in idxs:
                if idx not in C:
                    C[idx] = current
                    M[idx][M[idx].index(cogid)] = base
                else:
                    M[idx][M[idx].index(cogid)] = "_" + base.lower()
            current += 1
    wordlist.add_entries(cognates, C, lambda x: x)
    if morphemes:
        wordlist.add_entries(morphemes, M, lambda x: x)
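# Usage sketch (the input file name is hypothetical): cogids2cogid collapses
# the partial, morpheme-level cognate sets in the "cogids" column into
# word-level cognate ids, labelling the morpheme that determined the word's
# cognate id with the concept's name.
from lingpy import Wordlist

wl = Wordlist('partial-cognates.tsv')  # needs a "cogids" annotation column
cogids2cogid(wl, ref='cogids', cognates='cogid', morphemes='morphemes')
wl.output('tsv', filename='word-cognates')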
def bibkeys(s):
    s = re.sub(r', (?P<year>[0-9]{4})', lambda m: ' ' + m.group('year'), s)
    s = s.replace('Sammallahti1998Lehtiranta1989', 'Sammallahti1998, Lehtiranta1989')
    res = [slug(rid, lowercase=False) for rid in split_text(s, ",", strip=True)]
    return [BIBKEYS.get(k, k) for k in res]
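# Worked example (assuming an empty BIBKEYS mapping): the regex first turns
# "Author, 1998" into "Author 1998" so the comma no longer acts as a key
# separator, and slug() then yields bibtex-style keys:
#
#   bibkeys('Sammallahti, 1998, Lehtiranta, 1989')
#   -> ['Sammallahti1998', 'Lehtiranta1989']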
def doctypes(self, hhtypes):
    res = []
    if 'hhtype' in self.fields:
        for ss in split_text(self.fields['hhtype'], separators=',;'):
            ss = ss.split('(')[0].strip()
            if ss in hhtypes:
                res.append(hhtypes[ss])
    return res, self.parse_ca(self.fields.get('hhtype'))
def add_identifiers(data, dblang, items, name_type=False):
    for prov, names in items.items():
        if not isinstance(names, (list, tuple)):
            names = split_text(names, separators=',;')
        for name in sorted(set(names)):
            lang = 'en'
            if name_type:
                if '[' in name and name.endswith(']'):
                    name, lang = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(
                dblang, data, name,
                'name' if name_type else prov,
                prov if name_type else None,
                lang)
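# Minimal, self-contained sketch of the "Name [lang]" convention handled above:
# a trailing bracketed token is read as the language of the name itself.
name, lang = 'Deutsch [de]', 'en'
if '[' in name and name.endswith(']'):
    name, lang = [s.strip() for s in name[:-1].split('[', 1)]
assert (name, lang) == ('Deutsch', 'de')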
def doctypes(self, hhtypes): """Ordered doctypes assigned to this entry. :param hhtypes: `OrderedDict` mapping doctype names to doctypes :return: `list` of values of `hhtypes` which apply to the entry, ordered by occurrence in\ `hhtypes`. """ res = set() if 'hhtype' in self.fields: for ss in split_text(self.fields['hhtype'], separators=',;'): ss = ss.split('(')[0].strip() if ss in hhtypes: res.add(ss) return [v for k, v in hhtypes.items() if k in res], self.parse_ca(self.fields.get('hhtype'))
def load_references(repos):
    keys = set()
    for ds in repos.datasets:
        for r in ds.references:
            if ':' in r.key:  # skip keys with page numbers.
                continue
            # key is in the format Author, Year
            try:
                author, year = split_text(r.key, separators=',', strip=True)
                if (author, year) not in keys:
                    keys.add((author, year))
                    reference = Source.objects.create(
                        author=author, year=year, reference=r.citation)
                    logging.info("Saved new reference %s (%s)" % (
                        reference.author, reference.year))
            except Exception as e:  # pragma: no cover
                logging.warning("Could not save reference for row %s: %s" % (str(r), e))
    return len(keys)
def load_references(repos):
    keys = set()
    for ds in repos.datasets:
        for r in ds.references:
            if ':' in r.key:  # skip keys with page numbers.
                continue
            # key is in the format Author, Year
            try:
                author, year = split_text(r.key, separators=',', strip=True, brackets={})
                if (author, year) not in keys:
                    keys.add((author, year))
                    reference = Source.objects.create(
                        author=author, year=year, reference=r.citation)
                    logging.info("Saved new reference %s (%s)" % (
                        reference.author, reference.year))
            except Exception as e:  # pragma: no cover
                logging.warning("Could not save reference for row %s: %s" % (str(r), e))
    return len(keys)
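# Why brackets={} in this variant: by default split_text drops bracketed
# material, so a key like "Smith (ed.), 1998" would lose "(ed.)", and the
# two-way unpacking could even fail on keys whose comma hides inside brackets.
# Passing brackets={} disables bracket handling altogether:
author, year = split_text('Smith (ed.), 1998', separators=',', strip=True, brackets={})
assert (author, year) == ('Smith (ed.)', '1998')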
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)
    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)
    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)
    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))
    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))
    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def clean_form(self, item, form):
    if form not in ["*", "---", ""]:
        return split_text(strip_brackets(form), ",;/")[0]
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={'license_icon': 'cc-by.png',
                  'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            number=de.rank,
            jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that
    # top-level nodes will precede nested nodes. This order must be preserved using an
    # `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
def clean_string(
        sequence,
        semi_diacritics='hsʃ̢ɕʂʐʑʒw',
        merge_vowels=False,
        segmentized=False,
        rules=None,
        ignore_brackets=True,
        brackets=None,
        split_entries=True,
        splitters='/,;~',
        preparse=None,
        merge_geminates=True,
        normalization_form="NFC"):
    """
    Function exhaustively checks how well a sequence is understood by LingPy.

    Parameters
    ----------
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    segmentized : bool (default=False)
        Indicate whether the input string is already segmentized or not. If set
        to True, items in brackets can no longer be ignored.
    rules : dict
        Replacement rules to be applied to a segmentized string.
    ignore_brackets : bool
        If set to True, ignore all content within a given bracket.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring brackets.
    split_entries : bool (default=True)
        Indicate whether multiple entries (with a comma etc.) should be split
        into separate entries.
    splitters : str
        The characters which force the automatic splitting of an entry.
    preparse : list
        List of tuples, giving simple replacement patterns (source and target),
        which are applied before every processing starts.
    merge_geminates : bool (default=True)
        Indicate whether identical adjacent sounds should be merged.
    normalization_form : str (default="NFC")
        The unicode normalization form applied to the input sequence.

    Returns
    -------
    cleaned_strings : list
        A list of cleaned strings which are segmented by space characters. If
        splitters are encountered, indicating that the entry contains two
        variants, the list will contain one for each element in a separate
        entry. If there are no splitters, the list has only size one.
    """
    sequence = unicodedata.normalize(normalization_form, sequence)
    rules = rules or {}
    preparse = preparse or []

    # replace white space if not indicated otherwise
    if segmentized:
        segment_list = [sequence.split(' ')
                        if not isinstance(sequence, (list, tuple)) else sequence]
    else:
        for s, t in preparse:
            sequence = sequence.replace(s, t)
        segment_list = []
        if ignore_brackets:
            new_sequence = strip_brackets(sequence, brackets=brackets)
        else:
            new_sequence = sequence

        # splitting needs to be done afterwards
        if split_entries:
            new_sequences = split_text(
                new_sequence, splitters,
                brackets='' if not ignore_brackets else brackets)
        else:
            new_sequences = [new_sequence]

        for new_sequence in new_sequences:
            segments = ipa2tokens(
                re.sub(r'\s+', '_', new_sequence.strip()),
                semi_diacritics=semi_diacritics,
                merge_vowels=merge_vowels,
                merge_geminates=merge_geminates)
            segment_list += [segments]

    out = []
    for segments in segment_list:
        segments = [rules.get(s, s) for s in segments]
        out += [' '.join(segments)]
    return out
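# Usage sketch for clean_string: variants are split at the splitters, bracketed
# comments dropped, and segments joined by spaces. The first output depends on
# LingPy's ipa2tokens and is illustrative only; the second follows the pure
# segmentized/rules path and is exact:
#
#   clean_string('tʰa / tata (old)')  ->  ['tʰ a', 't a t a']
#   clean_string('t a t a', segmentized=True, rules={'t': 'd'})  ->  ['d a d a']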
def cmd_makecldf(self, args):
    args.writer.add_sources(self.raw_dir.read("Citations.bib"))
    for src in self._read("Citation_codes"):
        if src["type"] == "E":
            args.writer.add_sources(
                Source("misc", src["ref_abbr"], author=src["original_reference"]))

    glottocodes = {language["ID"]: language["Glottocode"] for language in self.languages}
    for language in self._read("Languages"):
        glottocode = glottocodes.get(language["lgid3"])
        if not glottocode:
            glottocode = self.glottolog.glottocode_by_iso.get(language["ISO-639-3"])
        args.writer.add_language(
            ID=language["lgid3"],
            Name=language["language"],
            Glottocode=glottocode,
            Description=language["Description"],
            Subgroup=language["Subgroup"],
            ISO639P3code=language["ISO-639-3"],
        )

    for concept in self.concepts:
        args.writer.add_concept(**concept)

    for (cid, cogid), ll in itertools.groupby(
            sorted(self._read("Data"), key=lambda i: (i["mng_item"], i["cogn_set"])),
            lambda i: (i["mng_item"], i["cogn_set"]),
    ):
        for language in ll:
            kw = dict(
                Value=language["item"],
                Language_ID=language["lgid3"],
                Parameter_ID=cid,
                Comment=language["general_notes"],
                Source=[
                    slug(rid, lowercase=False)
                    for rid in split_text(language["ref_abbr"], ",", strip=True)
                ],
            )
            kw.update({
                k: language[k]
                for k in [
                    "item_UPA",
                    "item_IPA",
                    "form_set",
                    "age_term_pq",
                    "age_term_aq",
                    "borr_source",
                    "borr_qual",
                    "etym_notes",
                    "glossing_notes",
                ]
            })
            for lex in args.writer.add_lexemes(**kw):
                if cogid != "?":
                    args.writer.add_cognate(
                        lexeme=lex, Cognateset_ID="{0}-{1}".format(cid, cogid))
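# groupby only merges *adjacent* items, which is why the rows above are sorted
# by the very same (mng_item, cogn_set) key before grouping:
import itertools

pairs = [(1, 'a'), (2, 'x'), (1, 'b')]
grouped = {k: [v for _, v in g]
           for k, g in itertools.groupby(sorted(pairs), key=lambda p: p[0])}
assert grouped == {1: ['a', 'b'], 2: ['x']}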
def split(s, sep=';'):
    return split_text(s, separators=sep, brackets={}, strip=True)
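# With brackets={}, separators inside brackets are not protected and bracketed
# material is kept verbatim; the library default would drop it instead:
assert split('a; (b; c)') == ['a', '(b', 'c)']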
def cmd_makecldf(self, args):
    args.writer.add_sources(self.raw_dir.read("Citations.bib"))
    bib = parse_string(self.raw_dir.read('Borrowing_references.bib'), 'bibtex')
    for k, v in bib.entries.items():
        args.writer.add_sources(Source.from_entry(slug(k, lowercase=False), v))

    args.writer.cldf.add_component(
        'BorrowingTable',
        {
            'name': 'Likelihood',
            'dc:description': 'Likelihood of borrowing (*possible*, *probable* or *clear*).',
            'datatype': {'base': 'string', 'format': 'possible|clear|probable'},
        },
        {
            'name': 'SourceLanguoid',
            'dc:description': 'Borrowing source of lexeme.',
        })
    args.writer.cldf['FormTable', 'form'].required = False
    args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS
    args.writer.cldf['FormTable', 'value'].required = False
    args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \
        "Lexeme data. Contains a lexeme, '[No equivalent]' (no suitable equivalent for a " \
        "meaning exists), '[Form not found]' (no suitable equivalent was found), or " \
        "'[Not reconstructable]' (non-reconstructable meanings in Proto-Uralic)."

    for src in self._read("Citation_codes"):
        if src["type"] == "E":
            args.writer.add_sources(
                Source("misc", src["ref_abbr"], author=src["original_reference"]))

    glottocodes = {language["ID"]: language["Glottocode"] for language in self.languages}
    for language in self._read("Languages"):
        glottocode = glottocodes.get(language["lgid3"])
        if not glottocode:
            glottocode = self.glottolog.glottocode_by_iso.get(language["ISO-639-3"])
        args.writer.add_language(
            ID=language["lgid3"],
            Name=language["language"],
            Glottocode=glottocode,
            Description=language["Description"],
            Subgroup=language["Subgroup"],
            ISO639P3code=language["ISO-639-3"],
        )

    inlists = {r['mng_item']: r for r in self._read('Meaning_lists')}
    attrs = [k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank']
    for concept in self.concepts:
        if concept['ID'] in inlists:
            memberships = {
                k.replace('-', '_'): v == '1'
                for k, v in inlists[concept['ID']].items()
                if k.replace('-', '_') in attrs}
            concept.update(memberships)
        args.writer.add_concept(**concept)

    for (cid, cogid), ll in itertools.groupby(
            sorted(self._read("Data"), key=lambda i: (i["mng_item"], i["cogn_set"])),
            lambda i: (i["mng_item"], i["cogn_set"]),
    ):
        for language in ll:
            if language['item'] in NULL_ITEMS:
                language['etym_notes'] = language['etym_notes'] + language['item']
            kw = dict(
                Value=language["item"],
                Language_ID=language["lgid3"],
                Parameter_ID=cid,
                Comment=language["general_notes"],
                Source=[
                    slug(rid, lowercase=False)
                    for rid in split_text(language["ref_abbr"], ",", strip=True)
                ],
            )
            kw.update({
                k: language[k]
                for k in ["item_UPA", "item_IPA", "form_set", "etym_notes", "glossing_notes"]
            })
            for i, lex in enumerate(args.writer.add_lexemes(**kw)):
                lex['Form'] = None if lex['Form'] in NULL_ITEMS else lex['Form']
                if cogid not in ["?", "0"]:
                    args.writer.add_cognate(
                        lexeme=lex, Cognateset_ID="{0}-{1}".format(cid, cogid))
                if language['borr_qual']:
                    c = ': borrowed to Pre-Permic'
                    ref = language['ref_borr']
                    if c in ref:
                        comment = c[1:].strip()
                        ref = ref.replace(c, '')
                    else:
                        comment = None
                    args.writer.objects['BorrowingTable'].append(
                        dict(
                            ID=lex['ID'],
                            Target_Form_ID=lex['ID'],
                            SourceLanguoid=language['borr_source'],
                            Likelihood=language['borr_qual'],
                            Source=bibkeys(ref),
                            Comment=comment,
                        ))
def load(args):
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(args.args[0]),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data = Data()

    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('bank', 'Sebastian Bank'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    glottolog = args.repos
    for ma in Macroarea:
        data.add(
            models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
def clean_form(self, item, form):
    if form not in ["*", "---", "-"]:
        form = strip_brackets(split_text(form, separators=";,/")[0])
        return form.replace(" ", "_")
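# Worked example: only the first variant survives, bracketed comments are
# dropped, and internal spaces become underscores:
#
#   clean_form(None, 'kere; kere-ku (dial.)')  ->  'kere'
#   clean_form(None, 'si au')                  ->  'si_au'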
def cmd_makecldf(self, args):
    args.writer.cldf.add_component('ParameterTable')
    args.writer.cldf.add_component(
        'LanguageTable',
        'Continent', 'Genus', 'WALSCode',  # we add more language metadata
    )
    args.writer.cldf.add_component('CodeTable')
    args.writer.objects['ParameterTable'] = [
        {
            'ID': 'sortalclassifier',
            'Name': 'sortal classifier',
            'Description':
                'Does the language have sortal classifiers, '
                'regardless of whether they are optional or obligatory?'
        },
        {
            'ID': 'morphosyntacticplural',
            'Name': 'morphosyntactic plural',
            'Description': 'Does the language have morphosyntactic plural markers?'
        },
    ]
    args.writer.objects['CodeTable'] = [
        {'ID': 'sortalclassifier-1', 'Parameter_ID': 'sortalclassifier', 'Name': 'yes'},
        {'ID': 'sortalclassifier-0', 'Parameter_ID': 'sortalclassifier', 'Name': 'no'},
        {'ID': 'morphosyntacticplural-1', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'yes'},
        {'ID': 'morphosyntacticplural-0', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'no'},
    ]

    l2s = collections.defaultdict(list)
    sources = []
    for src in sorted(Sources.from_file(self.raw_dir / 'sources.bib').items(),
                      key=lambda i: i.id):
        if src.get('Wals_code'):
            for code in split_text(src['Wals_code'], ';', strip=True):
                l2s[code].append(src.id)
        sources += [src]
    args.writer.cldf.add_sources(*sources)

    for row in self.raw_dir.read_csv('GSSG_ListOfLanguages.csv', delimiter=';', dicts=True):
        lidx = slug(row['language_name'], lowercase=False)
        args.writer.objects['LanguageTable'].append({
            'ID': lidx,
            'Name': row['language_name'],
            'Latitude': row['latitude'],
            'Longitude': row['longitude'],
            'Glottocode': row['glottocode'],
            'ISO639P3code': row['iso_code'],
            'Continent': row['continent'],
            'Genus': row['genus'],
            'WALSCode': row['wals_code'],
        })
        for param in ['sortal_classifier', 'morphosyntactic_plural']:
            pid = param.replace('_', '')
            args.writer.objects['ValueTable'].append({
                "ID": '{}-{}'.format(lidx, pid),
                "Value": row[param],
                "Language_ID": lidx,
                "Parameter_ID": pid,
                "Code_ID": '{}-{}'.format(pid, '1' if row[param] == 'yes' else '0'),
                "Source": l2s.get(row['wals_code'], []),
            })
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        #
        # TODO: switch to using the AES labels, i.e. lang.endangerment.description!
        #
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)
    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)
    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)
    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if attr == 'family' and not val:
                    # Handle cases with familyrefs but no family comment.
                    val = getattr(clf, 'familyrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))
    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))
    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(version),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data = Data()

    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    for ma in Macroarea:
        data.add(
            models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id, id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))