def clean_wikicode(wc: Wikicode):
    """Normalize an etymology section's wikicode in place.

    First strips every top-level node that is neither Text, Wikilink nor
    Template (whitespace-only Text nodes are stripped as well), then runs
    the natural-language combination passes that merge and create new
    templates, and finally removes bare links.
    """
    def _is_noise(node):
        # Whitespace-only text is noise; so is any node type other than
        # text, link or template.
        if isinstance(node, Text):
            return not node.value.strip()
        return not isinstance(node, (Wikilink, Template))

    # filter() returns a fresh list, so removing while looping is safe.
    for noise in wc.filter(recursive=False, matches=_is_noise):
        wc.remove(noise)

    merge_etyl_templates(wc)
    get_plus_combos(wc)
    get_comma_combos(wc)
    get_from_chains(wc)
    remove_links(wc)
def parse_section(self, code: Wikicode, section: Wikicode) -> None:
    """Walk the top-level nodes of *section*, harvesting template data.

    Templates whose normalized name matches a known "root" pattern are
    recorded via ``self.state.add_result`` and removed from *code*;
    Heading nodes keep ``self.state.header`` in sync with the page
    outline.  Returns early (None) when a template listed in
    ``ignore_pages_if_template`` appears, signalling that the whole page
    should be skipped.

    Raises:
        ValueError: when a well-known parameter contains a node type that
            cannot be parsed, or mixes free text with root templates in a
            way the ``re_allowed_extras`` pattern does not permit.
    """
    for arg in section.filter(recursive=False):
        typ = type(arg)
        if typ in ignore_types:
            continue
        elif typ == Template:
            # Resolve anything nested inside the template name before matching.
            self.apply_wikitext(arg.name)
            name = str(arg.name).strip()
            if name in self.state.page_parser.ignore_templates:
                continue
            # Normalization may map the raw name onto an ignored alias,
            # so the ignore list is consulted a second time.
            name = self.to_template_name(name)
            if name in self.state.page_parser.ignore_templates:
                continue
            if name in self.state.page_parser.ignore_pages_if_template:
                return None  # ignore these pages
            root_match = self.state.page_parser.re_root_templates_full_str.match(
                name)
            if root_match or (
                    self.state.page_parser.re_template_names and
                    self.state.page_parser.re_template_names.match(name)):
                # Remove well-known params
                # (iterate over a copy because params are removed below)
                for param in list(arg.params):
                    param_name = str(param.name)
                    for re_param in self.state.page_parser.re_well_known_parameters:
                        m = re_param.match(param_name)
                        if not m:
                            continue
                        # Accumulate plain text found in the parameter value
                        # and note whether it also carries root templates.
                        extras = ''
                        has_templates = False
                        for arg2 in param.value.filter(recursive=False):
                            arg2type = type(arg2)
                            if arg2type == Text:
                                extras += arg2.value
                            elif arg2type == Template and str(
                                    arg2.name
                            ) in self.state.page_parser.root_templates:
                                has_templates = True
                            elif arg2type == Wikilink:
                                # Prefer the link's display text over its title.
                                extras += str(
                                    arg2.text) if arg2.text else str(
                                        arg2.title)
                            elif arg2type != Comment:
                                raise ValueError(
                                    f"cannot parse well known param {str(param).strip()}"
                                )
                        extras = extras.strip()
                        if has_templates and extras != '':
                            # Mixed text + templates is only tolerated when
                            # the text matches the configured allow pattern.
                            allowed_extras = self.state.page_parser.re_allowed_extras
                            if not allowed_extras or not allowed_extras.match(
                                    extras):
                                raise ValueError(
                                    f"well known param '{str(param).strip()}' has text and templates"
                                )
                        if has_templates:
                            # Recurse into the parameter value to record
                            # the root templates it contains.
                            self.parse_section(code, param.value)
                        elif extras:
                            # Plain-text value: stored under a '_'-prefixed
                            # key derived from the matched parameter name.
                            self.state.add_result('_' + m.group(1),
                                                  param.value.strip())
                        arg.remove(param)
                if root_match:
                    self.state.add_result(name, params_to_dict(arg.params))
                    code.remove(arg)
                else:
                    new_arg = self.apply_value(code, arg)
                    if new_arg:
                        self.parse_section(code, new_arg)
            elif not self.state.page_parser.re_ignore_template_prefixes.match(
                    name):
                self.warn(
                    f"{self.state.header} {self.word}: Unknown template {arg}, "
                    f"consider adding it to ignore_templates")
        elif typ == Heading:
            # Keep self.state.header sized to (level - 2): pad with None for
            # skipped levels, or truncate when stepping back up the outline.
            if len(self.state.header) < arg.level - 2:
                self.state.header += [None] * (arg.level - 2 -
                                               len(self.state.header))
            else:
                self.state.header = self.state.header[:arg.level - 2]
            self.apply_wikitext(arg.title)
            template = None
            templates = arg.title.filter_templates(recursive=False)
            if len(templates) == 1:
                name = str(templates[0].name).strip()
                if name in self.state.page_parser.meaning_headers:
                    # A single recognized "meaning" template becomes a dict
                    # entry and is stripped out of the wikicode.
                    template = {name: params_to_dict(templates[0].params)}
                    code.remove(templates[0])
            if templates and not template:
                print(
                    f"{self.state.header} {self.word} unrecognized header template in {arg.title}"
                )
            text = str(arg.title).strip()
            if template:
                if text:
                    print(
                        f"{self.state.header} {self.word} has text '{text}' in addition to template {template}"
                    )
                    template['text'] = text
                self.state.header.append(template)
            else:
                self.state.header.append(text)
        else:
            self.warn(f"{self.state.header} {self.word}: Ha? {typ} {arg}")
def apply_wikitext(self, code: Wikicode):
    """Run ``apply_value`` over every top-level node of *code*.

    No-op when *code* is falsy (None or empty wikicode).
    """
    if not code:
        return
    for node in code.filter(recursive=False):
        self.apply_value(code, node)