コード例 #1
0
ファイル: scrape_verb_prefixes.py プロジェクト: crokobit/data
def scrape(xml_path):
    """Scrape verb prefixes from the MW dictionary.

    Looks at entries whose `key1` ends in 'kf' or 'BU' and whose `key2`
    contains a <root> element, strips the markup from `key2`, and prints a
    CSV of (name, prefix_type) rows. Rows are de-duplicated by name with
    `util.unique` and sorted with `util.key_fn`.

    :param xml_path: path to the MW XML data; handed to `util.iter_mw_xml`.
    """

    upasargas = set(UPASARGAS.splitlines())
    labels = ['name', 'prefix_type']
    regexp = 'root'

    rows = []
    for xml in util.iter_mw_xml(xml_path, regexp=regexp):
        entry = xml.find('h/key1').text
        if not entry.endswith(('kf', 'BU')):
            continue

        # A root is prefixed iff it has a <root> element. Any matches without
        # one are almost certainly nominals, which we can disregard.
        key2 = xml.find('h/key2')
        root = key2.find('.//root')
        if root is None:
            continue

        # Remove lingering XML: empty out <root> and drop the <key2> wrapper
        # tag (assuming the ElementTree API), then strip whatever tags and
        # slashes remain in the serialized text.
        root.clear()
        key2.tag = None
        name = re.sub('(<.*?>)|/', '', ET.tostring(key2))

        # Remove groups ending in upasargas
        splits = [x for x in name.split('-') if x]
        if not splits:
            # Nothing is left once the markup is stripped, so there is no
            # prefix to record (and `splits[-1]` would raise IndexError).
            continue
        last = splits[-1]
        if last in upasargas or make_tidy(last) in upasargas:
            continue

        name = ''.join(splits)

        # 'sampra' is suggested as a prefix. This is wrong.
        if name == 'sampra':
            continue

        # Add prefixes to the proper category
        if name.endswith(('I', 'U')):
            prefix_type = 'cvi'
        elif name.endswith('A'):
            prefix_type = 'DAc'
        else:
            prefix_type = 'other'

        rows.append((name, prefix_type))

    rows = util.unique(rows, lambda x: x[0])
    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Single-argument parenthesized form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print(util.make_csv_string(labels, rows))
コード例 #2
0
ファイル: scrape_indeclinables.py プロジェクト: crokobit/data
def scrape(xml_path):
    """Scrape indeclinables from the MW dictionary.

    Prints a single-column CSV ('name') holding the `key1` text of every
    entry that `util.iter_mw_xml` yields for the indeclinable pattern,
    sorted with `util.key_fn`.

    :param xml_path: path to the MW XML data; handed to `util.iter_mw_xml`.
    """

    labels = ['name']
    # Raw string: '\s' is meant as a regex escape, not a string escape.
    regexp = r'body>\s*<lex>ind'

    rows = [[xml.find('h/key1').text]
            for xml in util.iter_mw_xml(xml_path, regexp=regexp)]

    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Single-argument parenthesized form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print(util.make_csv_string(labels, rows))
コード例 #3
0
ファイル: scrape_nominals.py プロジェクト: crokobit/data
def scrape(xml_path):
    """Scrape nouns and adjectives from the MW dictionary.

    For each entry, the first <lex> element found under `body` (or under
    `body/p`) is reduced to its lowercase letters and looked up as a gender
    code. Entries with a recognized code contribute one (stem, genders)
    row; duplicate pairs are dropped. The rows are printed as CSV, sorted
    with `util.key_fn`.

    :param xml_path: path to the MW XML data; handed to `util.iter_mw_xml`.
    """

    # Cleaned <lex> value -> normalized gender string. The first eight
    # entries are noun genders; 'mfn' is the adjective marker.
    gender_of = {
        'm': 'm',
        'f': 'f',
        'n': 'n',
        'mf': 'mf',
        'fn': 'fn',
        'nf': 'fn',
        'mn': 'mn',
        'nm': 'mn',
        'mfn': 'mfn',
    }
    labels = ['stem', 'stem_genders']
    regexp = '(<lex>[^i].*?</lex>)'

    rows = []
    seen = set()
    for xml in util.iter_mw_xml(xml_path, regexp=regexp):
        # Genders
        lex_elem = xml.find('body/lex')
        if lex_elem is None:
            lex_elem = xml.find('body/p/lex')
        if lex_elem is None:
            continue

        # Serialize only the element's content: no wrapper tag, no tail.
        lex_elem.tag = None
        lex_elem.tail = None
        markup = ET.tostring(lex_elem)
        markup = re.sub('<.*>', '', markup)
        code = re.sub('[^a-z]', '', markup)

        genders = gender_of.get(code)
        if genders is None:
            continue

        # Stem
        entry = (xml.find('h/key1').text, genders)
        if entry not in seen:
            seen.add(entry)
            rows.append(entry)

    rows.sort(key=lambda x: util.key_fn(x[0]))
    print(util.make_csv_string(labels, rows))
コード例 #4
0
def scrape(xml_path):
    """Scrape prefixed roots from the MW dictionary.

    Prints a CSV of (prefixed_root, unprefixed_root, hom) rows, sorted by
    prefixed root with `util.key_fn`.

    This function doesn't scrape everything, but it's good enough.

    :param xml_path: path to the MW XML data; handed to `util.iter_mw_xml`.
    """

    labels = ["prefixed_root", "unprefixed_root", "hom"]
    rows = []

    for xml in util.iter_mw_xml(xml_path, "vlex"):
        if not has_prefix(xml):
            continue

        prefixed_root = xml.find("h/key1").text

        # Skip any entries without a <root> element. This element wraps the
        # unprefixed root. If <root> is absent, this probably isn't a prefixed
        # root.
        #
        # TODO: The following prefixed roots have no <root> element:
        # - gavez
        # - pAWAntaraya
        # - sampalAy
        # - samprAv
        root_elem = xml.find(".//root")
        if root_elem is None:
            continue

        # Prefer the element's own text; fall back to the text that follows
        # the element when it has none.
        unprefixed_root = root_elem.text
        if not unprefixed_root and root_elem.tail:
            unprefixed_root = root_elem.tail.strip()

        if not unprefixed_root or "~" in unprefixed_root:
            continue

        # Some roots are homonymous. The MW <hom> element distinguishes one
        # root sense from another.
        hom = xml.find(".//root/hom")
        hom_value = None if hom is None else hom.text

        rows.append((prefixed_root, unprefixed_root, hom_value))

    rows.sort(key=lambda x: util.key_fn(x[0]))
    print(util.make_csv_string(labels, rows))
コード例 #5
0
def scrape(xml_path):
    """Scrape unprefixed roots from the MW dictionary.

    Prints a CSV with one (root, hom, class, voice) row per paradigm found
    for each entry that `has_prefix` rejects, sorted by root with
    `util.key_fn`. Entries for which no paradigm can be built are skipped.

    :param xml_path: path to the MW XML data; handed to `util.iter_mw_xml`.
    """

    labels = ['root', 'hom', 'class', 'voice']
    rows = []

    all_vclasses = set('1 2 3 4 5 6 7 8 9 10 denom'.split())
    all_voices = set('para atma'.split())
    voice_translator = {'p': 'para', 'a': 'atma', 'a1': 'atma'}

    # Word-final endings used to infer a missing voice. The alternation is
    # grouped so that the trailing delimiter applies to every ending, not
    # only to the last alternative.
    para_ending = re.compile('ti[,. <]')
    atma_ending = re.compile('(te|mAna|mARa)[,. <]')

    for xml in util.iter_mw_xml(xml_path):
        if has_prefix(xml):
            continue

        root = xml.find('h/key1').text

        paradigms = []
        vclasses = []
        voice = None

        # To make a paradigm, we need a class and voice. Viable roots come in
        # three flavors:
        #
        # - class and voice: gam
        # - class, no voice: patAkaya
        # - voice, no class: candrikAya
        #
        # Some roots have neither class nor voice. These are currently
        # ignored.
        for token in tokenized_vlexes(xml):
            if token in all_vclasses:
                vclasses.append(token)
            elif token in voice_translator:
                # A voice closes the run of classes collected so far.
                voice = voice_translator[token]
                paradigms.extend((vclass, voice) for vclass in vclasses)
                vclasses = []

        # If the voice is not specified, search Sanskrit strings in the entry
        # to infer it.
        if vclasses and not paradigms:
            body = ET.tostring(xml.find('body'))

            inferred = None
            # 'ti' at the end of a word
            if para_ending.search(body):
                inferred = voice_translator['p']
            # 'te', 'mAna' or 'mARa' at the end of a word
            elif atma_ending.search(body):
                inferred = voice_translator['a']

            # Leave `voice` untouched when nothing could be inferred.
            if inferred is not None:
                voice = inferred
                paradigms.extend((vclass, voice) for vclass in vclasses)

        # If the class is not specified, make some high-precision assumptions
        # about it.
        if voice and not paradigms:
            if root.endswith(('Aya', 'aya', 'Iya')):
                paradigms.append(('denom', voice))

        paradigms = [list(x) for x in util.unique(paradigms)]
        if not paradigms:
            continue

        # Some roots are homonymous. The MW <hom> element distinguishes one
        # root sense from another.
        hom = xml.find('h/hom')
        hom_value = hom.text if hom is not None else None

        for vclass, voice in paradigms:
            # Internal sanity checks on the values collected above.
            assert vclass in all_vclasses
            assert voice in all_voices
            rows.append((root, hom_value, vclass, voice))

    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Single-argument parenthesized form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print(util.make_csv_string(labels, rows))