def make_index(pages):
    """Group man page names by their upper-cased first letter.

    Each page in *pages* is parsed with ``xml_parse`` and passed to
    ``check_id`` together with its path (presumably a validation step;
    see that helper for its exact contract).

    Returns a ``collections.defaultdict(list)`` mapping a single upper-case
    character to a list of ``(name, section, purpose, first_refname)``
    tuples, one tuple per ``<refname>`` found in the page's ``<refnamediv>``.
    """
    index = collections.defaultdict(list)
    for p in pages:
        t = xml_parse(p)
        check_id(p, t)
        section = t.find('./refmeta/manvolnum').text
        refname = t.find('./refnamediv/refname').text
        # Use itertext() rather than .text: .text only holds the characters
        # before the first child element, so a purpose containing inline
        # markup would be truncated, and one that *starts* with markup would
        # be None and crash on .split().
        purpose_text = ''.join(t.find('./refnamediv/refpurpose').itertext())
        # Collapse runs of whitespace (including newlines) to single spaces.
        purpose = ' '.join(purpose_text.split())
        for f in t.findall('./refnamediv/refname'):
            infos = (f.text, section, purpose, refname)
            index[f.text[0].upper()].append(infos)
    return index
def add_rules(rules, name):
    """Register the man page aliases declared by the XML file *name*.

    The file is parsed with ``xml_parse``; files whose root element is not
    ``<refentry>`` are ignored.  Every ``<refname>`` of the page is turned
    into an alias via ``man(refname, section)`` and stored in
    ``rules[conditional]`` (keyed by the root element's ``conditional``
    attribute, or ``''`` when absent), mapping alias -> target, where the
    target is the alias built from the first ``<refname>``.

    NOTE(review): ``rules[conditional]`` is indexed without a prior
    insertion, so *rules* is presumably a defaultdict of dicts - confirm
    against the caller.

    Raises:
        ValueError: if ``<refentrytitle>`` differs from the first
            ``<refname>``, or if an alias is already present in any group.
    """
    xml = xml_parse(name)
    root = xml.getroot()
    if root.tag != 'refentry':
        return
    conditional = root.get('conditional') or ''
    rulegroup = rules[conditional]
    refmeta = xml.find('./refmeta')
    title = refmeta.find('./refentrytitle').text
    number = refmeta.find('./manvolnum').text
    refnames = xml.findall('./refnamediv/refname')
    target = man(refnames[0].text, number)
    if title != refnames[0].text:
        raise ValueError('refmeta and refnamediv disagree: ' + name)
    for refname in refnames:
        alias = man(refname.text, number)
        # The groups are keyed by alias, so the duplicate check has to
        # compare the alias - testing the XML element itself can never
        # match.  Raised explicitly so the check survives ``python -O``.
        if any(alias in group for group in rules.values()):
            raise ValueError(
                'duplicate page name: {} (in {})'.format(alias, name))
        rulegroup[alias] = target
def _extract_directives(page, names):
    """Collect the index entries found in a single man page.

    *page* is parsed with ``xml_parse``.  *names* lists the group names; one
    ``defaultdict(set)`` is created per name.  The groups ``'options'``,
    ``'filenames'``, ``'constants'`` and ``'specifiers'`` are always looked
    up, and each ``<variablelist>`` looks up its ``class`` attribute (or
    ``'miscellaneous'``), so those names must be present in *names*.

    Returns ``(directive_groups, formatting)``: ``directive_groups`` maps
    group name -> entry text -> set of ``(pagename, section)`` tuples, and
    ``formatting`` maps entry text -> the ``xml_print`` output of the
    element used to display it.

    Note that the parsed elements are modified in place (text trimmed,
    tails dropped) while they are being collected.
    """
    directive_groups = {group: collections.defaultdict(set) for group in names}

    doc = xml_parse(page)
    section = doc.find('./refmeta/manvolnum').text
    pagename = doc.find('./refmeta/refentrytitle').text
    here = (pagename, section)

    formatting = {}

    # --- <variablelist> entries: variables and options -------------------
    default_option_store = directive_groups['options']
    for varlist in doc.iterfind('.//variablelist'):
        klass = varlist.attrib.get('class')
        varname_xpath = varlist.attrib.get('xpath', './varlistentry/term/varname')
        var_store = directive_groups[klass or 'miscellaneous']
        # <option>s go in OPTIONS, unless class is specified
        opt_store = var_store if klass else default_option_store
        searches = (
            (varname_xpath, var_store),
            ('./varlistentry/term/option', opt_store),
        )
        for xpath, store in searches:
            for elem in varlist.iterfind(xpath):
                raw = elem.text
                # Keep everything up to and including the first '=' or ' '.
                key = re.sub(r'([= ]).*', r'\1', raw).rstrip()
                if key.startswith('-'):
                    # for options, merge options with and without mandatory arg
                    key = key.partition('=')[0]
                store[key].add(here)
                if key in formatting:
                    continue
                # use element as formatted display
                if raw[-1] in "= '":
                    elem.clear()
                else:
                    elem.tail = ''
                elem.text = key
                formatting[key] = elem

        # An extra reference is filed in the same store as the <option>s.
        extra = varlist.attrib.get('extra-ref')
        if extra:
            opt_store[extra].add(here)
            if extra not in formatting:
                placeholder = tree.Element("varname")
                placeholder.text = extra
                formatting[extra] = placeholder

    # --- file names and commands ------------------------------------------
    file_store = directive_groups['filenames']
    file_searches = (
        ('.//refsynopsisdiv//filename', False),
        ('.//refsynopsisdiv//command', False),
        ('.//filename', True),
    )
    for xpath, absolute_only in file_searches:
        for elem in doc.iterfind(xpath):
            if absolute_only and not (elem.text and elem.text.startswith('/')):
                continue
            if elem.attrib.get('index') == 'false':
                continue
            elem.tail = ''

            if not elem.text:
                # No leading text: index by the text of the whole subtree.
                key = ' '.join(elem.itertext())
                file_store[key].add(here)
                formatting[key] = elem
                continue

            if elem.text.endswith('*'):
                elem.text = elem.text[:-1]
            if elem.text.startswith('.'):
                continue

            key = elem.text.partition(' ')[0]
            if key != elem.text:
                elem.clear()
                elem.text = key
            if key.endswith('/'):
                key = key[:-1]
            file_store[key].add(here)
            # use element as formatted display
            formatting.setdefault(key, elem)

    # --- constants ----------------------------------------------------------
    const_store = directive_groups['constants']
    for elem in doc.iterfind('.//constant'):
        if elem.attrib.get('index') == 'false':
            continue
        elem.tail = ''
        if elem.text.startswith('('):
            # a cast, strip it
            elem.text = elem.text.partition(' ')[2]
        const_store[elem.text].add(here)
        formatting[elem.text] = elem

    # --- specifiers ---------------------------------------------------------
    spec_store = directive_groups['specifiers']
    for elem in doc.iterfind(".//table[@class='specifiers']//entry/literal"):
        is_percent = elem.text[0] == '%'
        if not is_percent or elem.getparent().text is not None:
            continue
        if elem.attrib.get('index') == 'false':
            continue
        spec_store[elem.text].add(here)
        formatting[elem.text] = elem
    for elem in doc.iterfind(".//literal[@class='specifiers']"):
        spec_store[elem.text].add(here)
        formatting[elem.text] = elem

    # Serialize to allow pickling
    serialized = {key: xml_print(elem) for key, elem in formatting.items()}
    return directive_groups, serialized
def _extract_directives(directive_groups, formatting, page):
    """Add the index entries found in *page* to the given collections.

    *page* is parsed with ``xml_parse``.  Both containers are updated in
    place and nothing is returned:

    * ``directive_groups[group][text]`` gets ``(pagename, section)``
      appended for every entry found.  The groups ``'options'``,
      ``'filenames'`` and ``'constants'`` are always looked up; each
      ``<variablelist>`` additionally looks up its ``class`` attribute
      (or ``'miscellaneous'``).
    * ``formatting[text]`` is set to the XML element used to display the
      entry.

    Note that the parsed elements are modified in place (text trimmed,
    tails dropped) while they are being collected.
    """
    doc = xml_parse(page)
    section = doc.find('./refmeta/manvolnum').text
    pagename = doc.find('./refmeta/refentrytitle').text
    here = (pagename, section)

    # --- <variablelist> entries: variables and options -------------------
    default_option_store = directive_groups['options']
    for varlist in doc.iterfind('.//variablelist'):
        klass = varlist.attrib.get('class')
        var_store = directive_groups[klass or 'miscellaneous']
        # <option>s go in OPTIONS, unless class is specified
        opt_store = var_store if klass else default_option_store
        searches = (
            ('./varlistentry/term/varname', var_store),
            ('./varlistentry/term/option', opt_store),
        )
        for xpath, store in searches:
            for elem in varlist.iterfind(xpath):
                raw = elem.text
                # Keep everything up to and including the first '=' or ' '.
                key = re.sub(r'([= ]).*', r'\1', raw).rstrip()
                store[key].append(here)
                if key in formatting:
                    continue
                # use element as formatted display
                if raw[-1] in '= ':
                    elem.clear()
                else:
                    elem.tail = ''
                elem.text = key
                formatting[key] = elem

    # --- file names and commands ------------------------------------------
    file_store = directive_groups['filenames']
    file_searches = (
        ('.//refsynopsisdiv//filename', False),
        ('.//refsynopsisdiv//command', False),
        ('.//filename', True),
    )
    for xpath, absolute_only in file_searches:
        for elem in doc.iterfind(xpath):
            if absolute_only and not (elem.text and elem.text.startswith('/')):
                continue
            if elem.attrib.get('noindex'):
                continue
            elem.tail = ''

            if not elem.text:
                # No leading text: index by the text of the whole subtree.
                key = ' '.join(elem.itertext())
                file_store[key].append(here)
                formatting[key] = elem
                continue

            if elem.text.endswith('*'):
                elem.text = elem.text[:-1]
            if elem.text.startswith('.'):
                continue

            key = elem.text.partition(' ')[0]
            if key != elem.text:
                elem.clear()
                elem.text = key
            if key.endswith('/'):
                key = key[:-1]
            file_store[key].append(here)
            # use element as formatted display
            formatting.setdefault(key, elem)

    # --- constants ----------------------------------------------------------
    const_store = directive_groups['constants']
    for elem in doc.iterfind('.//constant'):
        if elem.attrib.get('noindex'):
            continue
        elem.tail = ''
        if elem.text.startswith('('):
            # a cast, strip it
            elem.text = elem.text.partition(' ')[2]
        const_store[elem.text].append(here)
        formatting[elem.text] = elem