class TocChapNum(RegexMatchable):
    """Arabic-numeral (sub)chapter numbering found at the start of a ToC entry title.

    The title is stored as `substr` and its numbering, parsed into a tuple of
    ints, as `numeric`. Matching is delegated to `match`, which is not defined
    in this class (presumably inherited from `RegexMatchable` and driven by
    `_re` -- confirm against the base class).
    """

    def __init__(self, ch_num_substr):
        """Store the title string and parse its chapter numbering.

        Args:
            ch_num_substr: the ToC entry title expected to begin with numbering.

        Raises:
            ValueError: if the numbering pattern does not match the title.
        """
        self.substr = ch_num_substr  # Store input string (ToC entry title) in a property
        # Complain if the (sub)chapter numbering regex doesn't match the title string.
        # An explicit raise (rather than `assert`) keeps the check alive under
        # `python -O` and gives the same exception type as `TocChapRomNum`.
        if not self.match(self.substr):
            raise ValueError("No chapter number in this string")
        self.numeric = self.get_numbering_tuple(self.substr)

    add_props_to_ns(["numeric", "substr"])

    # set inherited read-only `RegexMatchable.re` property
    _re = r"^(Chapter |CHAPTER |§ ?)?(\d+\.)+"

    @classmethod
    def get_numbering_tuple(cls, target_str):
        """Parse the leading chapter numbering of `target_str` into a tuple of ints.

        Args:
            target_str: string to parse, e.g. a title starting with "1.2.".

        Returns:
            A tuple of ints (one per dot-terminated number), or the falsy result
            of `cls.match` unchanged when the pattern does not match.

        Raises:
            ValueError: if any dot-separated part is not numeric.
        """
        m = cls.match(target_str)
        if not m:
            return m
        mg_ch = m.group(1)  # optional "Chapter "/"CHAPTER "/"§" prefix
        num_group = m.group()
        if mg_ch:
            num_group = num_group[len(mg_ch):]  # left-strip the chapter substring
        # Every number is dot-terminated, so splitting leaves a trailing empty part
        num_tup = tuple(n for n in num_group.split(".") if n)
        if not all(map(str.isnumeric, num_tup)):
            raise ValueError(f"Non-numeric chapter numbering: {num_tup}")
        return tuple(map(int, num_tup))
class TocChapRomNum(RegexMatchable):
    """Roman-numeral (sub)chapter numbering found at the start of a ToC entry title.

    The title is kept as `substr`; the numbering, converted to a tuple via
    `roman2int`, is kept as `numeric`. `match` is not defined here (presumably
    inherited from `RegexMatchable` and driven by `_re` -- confirm).
    """

    def __init__(self, ch_num_substr):
        """Store the title and parse its numbering; raise ValueError on no match."""
        self.substr = ch_num_substr  # the ToC entry title
        matched = self.match(self.substr)
        if not matched:
            # The numbering regex found nothing at the start of the title
            raise ValueError("No chapter number in this string")
        self.numeric = self.get_numbering_tuple(self.substr)

    add_props_to_ns(["numeric", "substr"])

    # set inherited read-only `RegexMatchable.re` property
    # NOTE(review): every numeral component in this pattern may be empty, so a
    # bare "." appears to match and would yield an empty tuple -- confirm intent.
    _re = r"^(Chapter |CHAPTER |§ ?)?((M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.)+"

    @classmethod
    def get_numbering_tuple(cls, target_str):
        """Parse the leading Roman numbering of `target_str`.

        Returns a tuple built by applying `roman2int` to each dot-terminated
        numeral, or the falsy result of `cls.match` unchanged when there is no
        match. Raises ValueError when `validate_roman_numeral` rejects a part.
        """
        found = cls.match(target_str)
        if not found:
            return found
        # Only the two outermost groups matter: optional prefix and last numeral
        prefix, _ = found.groups()[:2]
        numbering = found.group()
        if prefix:
            # Drop the leading "Chapter "/"CHAPTER "/"§" text
            numbering = numbering[len(prefix):]
        num_tup = tuple(filter(None, numbering.split(".")))
        if not all(validate_roman_numeral(part) for part in num_tup):
            raise ValueError(
                f"Non-Roman numeric chapter numbering: {num_tup}")
        return tuple(roman2int(part) for part in num_tup)
class HTMLSection:
    """Base for page sections whose attributes are scraped from an HTML tag.

    A subclass may define `_prop_dict`, mapping each attribute name to a tuple
    `(css_selector, select_all, [callback])`. On construction every entry is
    selected from the root tag, optionally passed through its callback, and set
    as an instance attribute; the root is then set to None.
    """

    def __init__(self, html_tag):
        """Parse `html_tag` into attributes according to `_prop_dict` (if any)."""
        self.root = html_tag
        self._set_up_props()  # parse the root then annul the root once parsed

    # Populate namespace of class definition with properties and class properties
    add_props_to_ns(["root"])
    add_classprops_to_ns(["root_subselector"])

    def _attr_tups_from_prop_dict(self):
        """Return a list of `(attribute_name, value)` pairs built from `_prop_dict`.

        Each value is the result of `select` (when `select_all` is truthy) or
        `select_one` on the root, passed through the entry's callback when one
        is supplied.
        """
        attr_tuples = []
        for prop_key, (css_sel, sel_all, *extras) in self._prop_dict.items():
            # Use the first callable among the optional trailing elements; the
            # previous logic always took element 0 even if it was not callable.
            callback = next((c for c in extras if callable(c)), None)
            sel_func = self._selAll if sel_all else self._sel
            selected = sel_func(css_sel)
            value = selected if callback is None else callback(selected)
            attr_tuples.append((prop_key, value))
        return attr_tuples

    def _set_up_props(self):
        """Populate attributes from `_prop_dict`, then clear the root."""
        if hasattr(self, "_prop_dict"):
            # Set up properties on a subclass calling `super().__init__`
            self._set_attrs(self._attr_tups_from_prop_dict())
            self.root = None  # annul the root now its content is parsed into attribs

    def _set_attrs(self, attr_tuple_list):
        """Set each `(name, value)` pair in `attr_tuple_list` as an attribute."""
        for attr, val in attr_tuple_list:
            setattr(self, attr, val)

    def _sel(self, css_selector):
        """Return the first element under the root matching `css_selector`."""
        return self.root.select_one(css_selector)

    def _selAll(self, css_selector):
        """Return all elements under the root matching `css_selector`."""
        return self.root.select(css_selector)
class AMSBookInfoPage:
    """A book info page split into a `content` section and a `metadata` section.

    Both sections are built from the same sub-tree of the page, selected with
    `div.productPage div.bounds`.
    """

    def __init__(self, soup):
        """Select the product sub-tree of `soup` and parse both sections from it."""
        # All info is below this
        subsoup = soup.select_one("div.productPage div.bounds")
        self.content = ContentSection(subsoup)
        self.metadata = TextInfoSection(subsoup)

    def _df_repr(self, as_dict=False):
        """
        Flatten the tree of section properties into one panel of data.

        Every name in each section's `_prop_dict` becomes a column holding a
        one-item list, unless the value itself implements `_df_repr`, in which
        case its own columns (requested with `as_dict=True`) are merged in.
        Columns are sorted by name. Returns the dict when `as_dict` is true,
        otherwise a pandas DataFrame built from it.
        """
        columns = {}
        for section_name in self._properties:
            section = getattr(self, section_name)
            # Snapshot the `_prop_dict` keys before reading the attributes
            for field_name in list(section._prop_dict):
                field_val = getattr(section, field_name)
                if hasattr(field_val, "_df_repr"):
                    # Recurse: merge the nested value's own columns
                    columns.update(field_val._df_repr(as_dict=True))
                else:
                    columns[field_name] = [field_val]
        # Sort the keys before returning
        columns = dict(sorted(columns.items()))
        if as_dict:
            return columns
        # Import late else it adds start lag to every module implementing _df_repr
        from pandas import DataFrame

        return DataFrame.from_dict(columns)

    _properties = ["content", "metadata"]
    add_props_to_ns(_properties)
class SymbolGroup(RegexMultiMatchable):
    """A symbol with a `[sub(...)]` or `[sup(...)]` clause, wrapped as a `Formula`."""

    def __init__(self, val):
        """Build a `Formula` from the full text of `val`.

        Args:
            val: object exposing `.group()`; presumably a regex match produced
                with `_re` -- confirm against `RegexMultiMatchable`.
        """
        self.formula = Formula(val.group())

    # `su[bp]` matches exactly "sub" or "sup". The former `su[b|p]` also accepted
    # a literal "|" because "|" is not an alternation operator inside a set.
    _re = r"\S+\[su[bp]\(+[^\]]+?\)\]\)?"
    add_props_to_ns(["matched", "formula"])
class FormulaTree:
    """Splits and inner terms of a formula, plus the statement derived from them.

    `splits` and `inners` are stored as given; `statement` is computed once at
    construction by `as_statement`. The public names are registered through
    `add_props_to_ns(..., read_only=True)` (presumably read-only views of the
    underscore-prefixed attributes -- confirm against that helper).
    """

    def __init__(self, splits, inners):
        """Store the parsed parts and derive the statement.

        Args:
            splits: indexable sequence of objects with `info` and `split_index`.
            inners: iterable of objects with `info` and `parent_split_index`.

        Raises:
            ValueError: propagated from `as_statement` when a subject is unknown.
        """
        self._splits = splits
        self._inners = inners
        self._statement = self.as_statement()  # dict keyed by Split index of subjects

    def __repr__(self):
        """Return the reprs of all splits, a blank line, then those of all inners."""
        split_reprs = "\n".join([f"{s!r}" for s in self.splits])
        inners_reprs = "\n".join([f"{p!r}" for p in self.inners])
        return f"{split_reprs}\n\n{inners_reprs}"

    add_props_to_ns(["splits", "inners", "statement"], read_only=True)

    # TODO: parse the tree in such a way that an `[`-opener's preceder element is the
    # subject of the `sub`/`sup` inside the `[]` clause object, and also that the
    # 'clopening' aspect will permit multiple such `[]` clause objects to be associated
    # to a single subject. E.g. in `(A[sub(5)])[sup(⊥)]`, `A` is the subject of both
    # `sub` and `sup` clause objects (with different inner terms: `5` and `⊥`).

    # e.g. a simple example with one inner term and two splits:
    # 'A[sub(2)]'
    #
    # Split: < self.info='A' ~ self.open_index=0 ~ self.split_index=None >
    # Split: < self.info='sub' ~ self.open_index=1 ~ self.split_index=0 >
    # InnerTerm: < self.info='2' ~ self.parent_split_index=1 >
    #
    # e.g. a less simple example with two inner terms and five splits:
    # '(A[sub(5)])[sup(⊥)]'
    #
    # Split: < self.info='' ~ self.open_index=0 ~ self.split_index=None >
    # Split: < self.info='A' ~ self.open_index=1 ~ self.split_index=0 >
    # Split: < self.info='sub' ~ self.open_index=2 ~ self.split_index=1 >
    # Split: < self.info='' ~ self.open_index=3 ~ self.split_index=0 >
    # Split: < self.info='sup' ~ self.open_index=4 ~ self.split_index=3 >
    # InnerTerm: < self.info='5' ~ self.parent_split_index=2 >
    # InnerTerm: < self.info='⊥' ~ self.parent_split_index=4 >

    def as_statement(self):
        """Group every inner term under the split index of the subject it applies to.

        Returns:
            dict mapping a subject's index in `splits` to a list of
            `(command, inner_info)` tuples, in the order the inners are visited.

        Raises:
            ValueError: if the subject of a command cannot be resolved, i.e. the
                command's split, or the unnamed split it points at, has no
                `split_index`.
        """
        statement_subjects = {}  # Store each subject term as index of unique Split obj
        for inner in self.inners:
            # e.g. '(A[sub(5)])[sup(⊥)]' --> first: `inner='5'`
            parent_split = self.splits[inner.parent_split_index]  # --> index 2
            caller_command = parent_split.info  # --> `Split.info='sub'`
            caller_split_i = parent_split.split_index  # --> `Split.split_index=1`
            # Check before indexing: previously a None index was used to look up
            # `splits` first, so the intended ValueError could never be raised.
            if caller_split_i is None:
                raise ValueError(f"{caller_command}'s subject is unknown")
            caller_subject_name = self.splits[caller_split_i].info  # --> 'A'
            if caller_subject_name == "":  # e.g. for `inner='⊥'` in example above
                # The unnamed split points at the opener it is linked to
                clopen_i = self.splits[caller_split_i].split_index  # e.g. 3 --> 0
                if clopen_i is None:
                    # No linked opener, so there is nothing to redirect to
                    raise ValueError(f"{caller_command}'s subject is unknown")
                caller_split_i = clopen_i + 1  # inside linked bracket
                # Resolved name ('A' in the example); not used further below
                caller_subject_name = self.splits[caller_split_i].info
            # Now know the subject to which `inner` is applied by `caller_command`
            command_object = (caller_command, inner.info)
            statement_subjects.setdefault(caller_split_i, []).append(command_object)
        return statement_subjects