Beispiel #1
0
def verb_extractor(node):
    """Extract a verb entry from ``node``, preferring conjugation-table forms.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` where definitions
    containing 'Compound of ' are removed and ``variants`` is replaced by the
    conjugations parsed from the tables whenever at least one was found.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    # Drop definitions that only describe compound variants of the verb.
    definitions = [d for d in definitions if 'Compound of ' not in d]

    conjugations = []
    for head, table in extract_tables(node):
        # Reflexive when the lemma itself ends in 'se' or the table heading
        # mentions the lemma with 'se' appended.
        is_reflexive = form.endswith('se') or form + 'se' in head

        if 'Conjugation of ' + form in head:
            parsed = parse_conjugation_table(table, is_reflexive)
        elif not is_reflexive and 'Selected combined forms of ' + form in head:
            # The combined-forms table of a reflexive verb is skipped because
            # it is the same as the non-reflexive version.
            parsed = parse_combined_forms_table(table)
        else:
            parsed = []

        for conj_type, conj_form in parsed:
            if is_reflexive:
                conj_type = [REFLEXIVE, *conj_type]
            conjugations.append((conj_type, conj_form))

    # Fall back to the default variants only when no table produced anything.
    return form, attrs, conjugations or variants, definitions
Beispiel #2
0
def verb_extractor(node):
    """Extract a verb entry and normalise its principal-part variants.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` where ``variants`` has
    been passed through ``filter_variants`` and every combined
    "simple past and past participle" entry is split into two entries.
    """
    # NOTE(review): the original comment here said variants are not parsed
    # "since 'form-of' class is missing", yet the variants returned by
    # default_extractor are used below -- confirm which is intended.
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    label_to_type = {
        'third-person singular simple present': [PERSON_3_SINGULAR],
        'present participle': [PRESENT_PARTICIPLE],
        'simple past': [SIMPLE_PAST],
        'past participle': [PAST_PARTICIPLE],
        'simple past and past participle': [SIMPLE_PAST_AND_PAST_PARTICIPLE],
    }
    variants = filter_variants(variants, label_to_type)

    # Decompose SIMPLE_PAST_AND_PAST_PARTICIPLE into its two components.
    combined_type = [SIMPLE_PAST_AND_PAST_PARTICIPLE]
    expanded = []
    for conj_type, conj_form in variants:
        if conj_type == combined_type:
            expanded.extend([
                ([SIMPLE_PAST], conj_form),
                ([PAST_PARTICIPLE], conj_form),
            ])
        else:
            expanded.append((conj_type, conj_form))

    return form, attrs, expanded, definitions
Beispiel #3
0
def noun_extractor(node):
    """Extract a noun entry from ``node``.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` with ``variants``
    narrowed by ``filter_variants`` to plural / gendered forms.
    """
    def extract_attrs(p):
        """Return ``[UNCOUNTABLE]`` entries for matching glossary links in ``p``.

        Only ``<a>`` tags nested inside ``<i>`` tags are inspected; a link
        counts when its title mentions 'Appendix:Glossary' and its text equals
        ``UNCOUNTABLE``.
        """
        new_attrs = []
        for i in p.find_all("i"):
            for a in i.find_all('a'):
                # Use .get() so anchors without a title attribute are skipped
                # instead of raising KeyError from a['title'].
                title = a.get('title')
                if title and 'Appendix:Glossary' in title:
                    if a.text == UNCOUNTABLE:
                        new_attrs.append(a.text)
        return new_attrs

    obj = default_extractor(node, True, extract_attrs)
    if not obj:
        return None
    form, attrs, variants, definitions = obj

    variants = filter_variants(
        variants, {
            "plural": [PLURAL],
            "feminine plural": [FEMININE, PLURAL],
            "feminine": [FEMININE],
            "masculine plural": [MASCULINE, PLURAL],
            "masculine": [MASCULINE],
        })

    return form, attrs, variants, definitions
Beispiel #4
0
def verb_extractor(node):
    """Extract a verb entry whose variants come from its conjugation tables.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, conjugations, definitions)``.  Tables are parsed
    only when the entry has definitions; the auxiliary reported by each
    conjugation table is appended to ``attrs`` and ``form`` is replaced by the
    separable form when the table is flagged as separable.
    """
    # Note: variants are not parsed here, since 'form-of' class is missing
    # (the variants returned by default_extractor are discarded).
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, _, definitions = extracted

    conjugations = []
    if not definitions:
        return form, attrs, conjugations, definitions

    for head, table in extract_tables(node):
        if head.lower().startswith('conjugation of'):
            # Lower-case match because {{de-conj-auto}} uses a lower-cased
            # title.
            parsed, auxiliary, separable = parse_conjugation_table(table)
            if separable:
                form = get_separable_form(parsed)
            attrs.append(auxiliary)
        elif head.startswith('Subordinate-clause forms of'):
            parsed = parse_subordinate_conjugation_table(table)
        else:
            parsed = []
        conjugations.extend(parsed)

    # Add the declensions of the past participle, then the present participle.
    presp, pp = get_participles(conjugations)
    conjugations.extend(get_declension_of_participle(pp, PAST_PARTICIPLE))
    conjugations.extend(get_declension_of_participle(presp, PRESENT_PARTICIPLE))

    return form, attrs, conjugations, definitions
Beispiel #5
0
def adjective_extractor(node):
    """Extract an adjective entry from ``node``.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` with ``variants``
    narrowed by ``filter_variants`` using the label mapping below.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    # Variant label -> grammatical tags; entry order is kept as in the
    # original mapping.
    label_to_type = {
        "plural": [PLURAL],
        "feminine singular": [FEMININE],
        "feminine plural": [FEMININE, PLURAL],
        "feminine": [FEMININE],
        "masculine plural": [MASCULINE, PLURAL],
        "superlative": [SUPERLATIVE],
    }
    return form, attrs, filter_variants(variants, label_to_type), definitions
Beispiel #6
0
def adjective_extractor(node):
    """Extract an adjective entry together with its declension-table cells.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` where ``variants`` holds
    the filtered superlative/comparative variants, the bare ``form`` itself,
    and every cell of the positive/comparative/superlative tables.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    label_to_type = {
        'superlative': [SUPERLATIVE],
        'comparative': [COMPARATIVE],
    }
    variants = filter_variants(variants, label_to_type)
    variants.append(([], form))

    # Heading prefix -> tags passed to get_all_cells for that table.
    table_kinds = (
        ('Positive forms of', ()),
        ('Comparative forms of', (COMPARATIVE,)),
        ('Superlative forms of', (SUPERLATIVE,)),
    )
    for head, table in extract_tables(node):
        for prefix, base_type in table_kinds:
            if head.startswith(prefix):
                # A fresh list per call, as in the original literals.
                variants.extend(get_all_cells(list(base_type), table))
                break

    return form, attrs, variants, definitions
Beispiel #7
0
def verb_extractor(node):
    """Extract a verb/adjective entry with forms from its conjugation NavFrame.

    Returns ``None`` when ``default_extractor`` yields nothing.  When the
    parsed conjugation types contain 'si-irregular', the unmodified result of
    ``default_extractor`` is returned.  Otherwise returns
    ``(form, conj_types, conjugations, definitions)``.

    Raises:
        AssertionError: if the NavHead text of the conjugation table starts
            with neither of the two known headings.
    """
    # Note: variants are not parsed here, since 'form-of' class is missing
    obj = default_extractor(node, True)
    if not obj:
        return None
    form, _, _, definitions = obj

    conjugations = []
    conj_types = []
    headline = node.find_next('span', {'class': 'mw-headline'},
                              text=['Conjugation'])
    if headline is not None:
        prev = headline.find_previous('span', {'class': 'mw-headline'},
                                      text=['Verb', 'Adjective'])
        if prev == node.span:
            # conjugation really belongs to the current entry
            root = headline.find_next('div', {'class': 'NavFrame'})

            p = root.find_previous('p')
            north_korea = p and 'North Korea' in p.text

            if north_korea:
                # The entry contains two conjugation tables (North Korean and
                # South Korean), so use only the latter.
                root = root.find_next('div', {'class': 'NavFrame'})

            head_text = root.find('div', {'class': 'NavHead'}).text.strip()
            if head_text.startswith('Selected forms of the adjective'):
                is_adj = True
            elif head_text.startswith('Selected forms of the verb'):
                is_adj = False
            else:
                # Raise explicitly: an ``assert`` is stripped under
                # ``python -O``, which would leave ``is_adj`` unbound below.
                raise AssertionError(('unknown NavHead', head_text))
            conjugations, conj_types = parse_conjugation_table(
                root.find('div', {'class': 'NavContent'}), is_adj)

    if 'si-irregular' in conj_types:
        # do not register honorific form as lemma
        return obj

    return form, conj_types, conjugations, definitions
Beispiel #8
0
def noun_extractor(node):
    """Extract a noun entry together with the cells of its declension tables.

    Returns ``None`` when ``default_extractor`` yields nothing.  Otherwise
    returns ``(form, attrs, variants, definitions)`` where ``variants`` holds
    the filtered genitive/plural/diminutive variants, every cell of each
    'Declension of' table, and finally the bare ``form`` itself.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    # The "feminine" label is left unmapped here; its entry was disabled
    # (commented out) in the original mapping.
    label_to_type = {
        "genitive": [],
        "plural": [],
        "diminutive": [DIMINUTIVE],
    }
    variants = filter_variants(variants, label_to_type)

    declension_tables = (
        table for head, table in extract_tables(node)
        if head.startswith('Declension of')
    )
    for table in declension_tables:
        variants.extend(get_all_cells([], table, True))

    variants.append(([], form))

    return form, attrs, variants, definitions