def verb_extractor(node):
    """Extract a verb entry, preferring table conjugations over variants.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple. Definitions containing
    ``'Compound of '`` are removed. When any conjugation is found in the
    entry's tables, the conjugations replace the default variants.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    # Filter out compound variants of verbs.
    definitions = [d for d in definitions if 'Compound of ' not in d]

    conjugation_title = 'Conjugation of ' + form
    combined_title = 'Selected combined forms of ' + form
    reflexive_form = form + 'se'

    # Parse the conjugation tables.
    conjugations = []
    for head, table in extract_tables(node):
        is_reflexive = reflexive_form in head or form.endswith('se')
        if conjugation_title in head:
            parsed = parse_conjugation_table(table, is_reflexive)
        elif not is_reflexive and combined_title in head:
            # Don't parse the combined-forms table of a reflexive verb
            # because it's the same as the non-reflexive version.
            parsed = parse_combined_forms_table(table)
        else:
            parsed = []
        for conj_type, conj_form in parsed:
            tags = [REFLEXIVE, *conj_type] if is_reflexive else conj_type
            conjugations.append((tags, conj_form))

    return form, attrs, conjugations or variants, definitions
def verb_extractor(node):
    """Extract a verb entry and normalise its principal-part variants.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple in which every variant
    tagged ``[SIMPLE_PAST_AND_PAST_PARTICIPLE]`` has been split into one
    ``[SIMPLE_PAST]`` entry and one ``[PAST_PARTICIPLE]`` entry.
    """
    # Note: variants are not parsed here, since 'form-of' class is missing
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    label_tags = {
        'third-person singular simple present': [PERSON_3_SINGULAR],
        'present participle': [PRESENT_PARTICIPLE],
        'simple past': [SIMPLE_PAST],
        'past participle': [PAST_PARTICIPLE],
        'simple past and past participle': [SIMPLE_PAST_AND_PAST_PARTICIPLE],
    }
    variants = filter_variants(variants, label_tags)

    # Decompose SIMPLE_PAST_AND_PAST_PARTICIPLE into its two components.
    combined = [SIMPLE_PAST_AND_PAST_PARTICIPLE]
    expanded = []
    for tags, value in variants:
        if tags == combined:
            expanded.extend((([SIMPLE_PAST], value), ([PAST_PARTICIPLE], value)))
        else:
            expanded.append((tags, value))
    return form, attrs, expanded, definitions
def noun_extractor(node):
    """Extract a noun entry with its gender/number variants.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple whose variants have been
    passed through ``filter_variants`` with the plural/feminine/masculine
    label mapping below.
    """

    def extract_attrs(p):
        """Return the text of glossary-linked ``UNCOUNTABLE`` anchors in ``p``.

        Only ``<a>`` elements nested inside ``<i>`` elements are considered,
        and only when their ``title`` contains ``'Appendix:Glossary'``.
        """
        new_attrs = []
        for i in p.find_all("i"):
            for a in i.find_all('a'):
                # Use .get(): anchors without a title attribute would make
                # a['title'] raise KeyError and abort the whole extraction.
                title = a.get('title')
                if title and 'Appendix:Glossary' in title:
                    if a.text == UNCOUNTABLE:
                        new_attrs.append(a.text)
        return new_attrs

    obj = default_extractor(node, True, extract_attrs)
    if not obj:
        return None
    form, attrs, variants, definitions = obj
    variants = filter_variants(
        variants, {
            "plural": [PLURAL],
            "feminine plural": [FEMININE, PLURAL],
            "feminine": [FEMININE],
            "masculine plural": [MASCULINE, PLURAL],
            "masculine": [MASCULINE],
        })
    return form, attrs, variants, definitions
def verb_extractor(node):
    """Extract a verb entry with conjugated forms and participle declensions.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, conjugations, definitions)`` tuple. The variants returned
    by ``default_extractor`` are discarded; ``conjugations`` holds what is
    collected from the entry's tables and stays empty when the entry has no
    definitions.
    """
    # Note: variants are not parsed here, since 'form-of' class is missing
    obj = default_extractor(node, True)
    if not obj:
        return None
    form, attrs, _, definitions = obj
    conjugations = []
    if definitions:
        # parse conjugation table
        for head, table in extract_tables(node):
            new_conjugations = []
            if head.lower().startswith('conjugation of'):
                # lower-case match because {{de-conj-auto}} uses lower-cased title
                new_conjugations, auxiliary, separable = parse_conjugation_table(
                    table)
                if separable:
                    # the headword is replaced by the form derived from the table
                    form = get_separable_form(new_conjugations)
                # NOTE(review): nesting assumed -- the auxiliary is recorded for
                # every conjugation table, not only for separable verbs; confirm.
                attrs.append(auxiliary)
            elif head.startswith('Subordinate-clause forms of'):
                new_conjugations = parse_subordinate_conjugation_table(table)
            conjugations.extend(new_conjugations)
        # add declension of the past and present participles
        # NOTE(review): assumed to run only when there are definitions; confirm.
        presp, pp = get_participles(conjugations)
        conjugations.extend(get_declension_of_participle(pp, PAST_PARTICIPLE))
        conjugations.extend(
            get_declension_of_participle(presp, PRESENT_PARTICIPLE))
    return form, attrs, conjugations, definitions
def adjective_extractor(node):
    """Extract an adjective entry with its gender/number/degree variants.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple whose variants have been
    passed through ``filter_variants`` with the label mapping below.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    label_tags = {
        "plural": [PLURAL],
        "feminine singular": [FEMININE],
        "feminine plural": [FEMININE, PLURAL],
        "feminine": [FEMININE],
        "masculine plural": [MASCULINE, PLURAL],
        "superlative": [SUPERLATIVE],
    }
    return form, attrs, filter_variants(variants, label_tags), definitions
def adjective_extractor(node):
    """Extract an adjective entry with every form found in its degree tables.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple. The variants consist of
    the filtered superlative/comparative variants, the headword itself with
    an empty tag list, and all cells of the positive, comparative and
    superlative tables.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    degree_labels = {
        'superlative': [SUPERLATIVE],
        'comparative': [COMPARATIVE],
    }
    variants = filter_variants(variants, degree_labels)
    variants.append(([], form))

    # Table-title prefix -> tags applied to every cell of that table.
    table_tags = (
        ('Positive forms of', ()),
        ('Comparative forms of', (COMPARATIVE,)),
        ('Superlative forms of', (SUPERLATIVE,)),
    )
    for head, table in extract_tables(node):
        for prefix, tags in table_tags:
            if head.startswith(prefix):
                # A fresh list per call, as get_all_cells may keep a reference.
                variants.extend(get_all_cells(list(tags), table))
                break
    return form, attrs, variants, definitions
def verb_extractor(node):
    """Extract a verb/adjective entry with forms from its conjugation table.

    Returns ``None`` when ``default_extractor`` yields nothing. When the
    parsed conjugation types contain ``'si-irregular'`` the unmodified result
    of ``default_extractor`` is returned. Otherwise the result is a
    ``(form, conj_types, conjugations, definitions)`` tuple; both lists stay
    empty when no conjugation section belonging to this entry is found.

    Raises:
        AssertionError: if the conjugation table's ``NavHead`` text starts
            with neither of the two recognised titles.
    """
    # Note: variants are not parsed here, since 'form-of' class is missing
    obj = default_extractor(node, True)
    if not obj:
        return None
    form, _, _, definitions = obj
    conjugations = []
    conj_types = []
    headline = node.find_next('span', {'class': 'mw-headline'},
                              text=['Conjugation'])
    if headline is not None:
        prev = headline.find_previous('span', {'class': 'mw-headline'},
                                      text=['Verb', 'Adjective'])
        if prev == node.span:
            # conjugation really belongs to the current entry
            root = headline.find_next('div', {'class': 'NavFrame'})
            p = root.find_previous('p')
            north_korea = p and 'North Korea' in p.text
            if north_korea:
                # contains two conjugation tables (north korean and south
                # korean), so use only the latter
                root = root.find_next('div', {'class': 'NavFrame'})
            head_text = root.find('div', {'class': 'NavHead'}).text.strip()
            if head_text.startswith('Selected forms of the adjective'):
                is_adj = True
            elif head_text.startswith('Selected forms of the verb'):
                is_adj = False
            else:
                # Raise explicitly: an `assert` statement is stripped under
                # `python -O`, which would leave `is_adj` unbound below.
                raise AssertionError(('unknown NavHead', head_text))
            conjugations, conj_types = parse_conjugation_table(
                root.find('div', {'class': 'NavContent'}), is_adj)
            if 'si-irregular' in conj_types:
                # do not register honorific form as lemma
                return obj
    return form, conj_types, conjugations, definitions
def noun_extractor(node):
    """Extract a noun entry with every form found in its declension tables.

    Returns ``None`` when ``default_extractor`` yields nothing; otherwise a
    ``(form, attrs, variants, definitions)`` tuple. The variants consist of
    the filtered genitive/plural/diminutive variants, all cells of each
    ``'Declension of'`` table, and finally the headword itself with an empty
    tag list.
    """
    extracted = default_extractor(node, True)
    if not extracted:
        return None
    form, attrs, variants, definitions = extracted

    # The "feminine" label is currently not mapped (it was disabled in the
    # original mapping).
    label_tags = {
        "genitive": [],
        "plural": [],
        "diminutive": [DIMINUTIVE],
    }
    variants = filter_variants(variants, label_tags)

    declension_tables = (
        table for head, table in extract_tables(node)
        if head.startswith('Declension of')
    )
    for table in declension_tables:
        variants.extend(get_all_cells([], table, True))

    variants.append(([], form))
    return form, attrs, variants, definitions