Ejemplo n.º 1
0
class TocChapNum(RegexMatchable):
    """Arabic-numeral (sub)chapter numbering found at the start of a ToC title.

    The title must match the class pattern `_re`: an optional "Chapter ",
    "CHAPTER " or "§" prefix followed by one or more dot-terminated digit
    groups (e.g. "Chapter 1.2."). `match` is inherited from `RegexMatchable`
    (defined elsewhere; presumably it applies `_re` to the given string).
    """

    def __init__(self, ch_num_substr):
        """Store the title and parse its numbering.

        Args:
            ch_num_substr: ToC entry title expected to start with a numbering.

        Raises:
            ValueError: if the numbering pattern does not match the title.
        """
        self.substr = ch_num_substr  # Store input string (ToC entry title) in a property
        # Complain if the (sub)chapter numbering regex doesn't match the title string.
        # An explicit raise is used rather than `assert`, which would surface as
        # AssertionError and would be skipped entirely when running with `-O`.
        if not self.match(self.substr):
            raise ValueError("No chapter number in this string")
        self.numeric = self.get_numbering_tuple(self.substr)

    add_props_to_ns(["numeric", "substr"])
    _re = r"^(Chapter |CHAPTER |§ ?)?(\d+\.)+"  # set inherited read-only `RegexMatchable.re` property

    @classmethod
    def get_numbering_tuple(cls, target_str):
        """Return the numbering of `target_str` as a tuple of ints.

        For example a title matched as "Chapter 1.2." gives `(1, 2)`.

        Args:
            target_str: string to match against the class pattern.

        Returns:
            A tuple of ints, one per dot-separated number, or the falsy
            result of `cls.match` unchanged when there is no match.

        Raises:
            ValueError: if any dot-separated part is not numeric.
        """
        m = cls.match(target_str)
        if not m:
            return m
        mg_ch = m.groups()[0]  # optional "Chapter "/"§" prefix, None if absent
        num_group = m.group()
        if mg_ch:
            num_group = num_group[len(mg_ch):]  # left-strip the chapter substring
        # Splitting on "." leaves an empty trailing piece, so drop empty strings
        num_tup = tuple(n for n in num_group.split(".") if n)
        if not all(map(str.isnumeric, num_tup)):
            raise ValueError(f"Non-numeric chapter numbering: {num_tup}")
        return tuple(map(int, num_tup))
Ejemplo n.º 2
0
class TocChapRomNum(RegexMatchable):
    """Roman-numeral (sub)chapter numbering found at the start of a ToC title.

    The class pattern `_re` accepts an optional "Chapter ", "CHAPTER " or "§"
    prefix followed by one or more dot-terminated Roman numeral groups.
    """

    def __init__(self, ch_num_substr):
        """Keep the title string and parse its numbering.

        Raises:
            ValueError: if the numbering pattern does not match the title.
        """
        # The input string (ToC entry title) is kept in a property
        self.substr = ch_num_substr
        # Refuse titles that the (sub)chapter numbering regex doesn't match
        found = self.match(self.substr)
        if not found:
            raise ValueError("No chapter number in this string")
        self.numeric = self.get_numbering_tuple(self.substr)

    add_props_to_ns(["numeric", "substr"])
    # set inherited read-only `RegexMatchable.re` property
    _re = r"^(Chapter |CHAPTER |§ ?)?((M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.)+"

    @classmethod
    def get_numbering_tuple(cls, target_str):
        """Convert the Roman numbering of `target_str` into a tuple.

        Each dot-separated numeral is checked with `validate_roman_numeral`
        and converted with `roman2int` (both defined elsewhere).

        Returns:
            A tuple with one converted value per numeral, or the falsy result
            of `cls.match` unchanged when the pattern does not match.

        Raises:
            ValueError: if any part fails `validate_roman_numeral`.
        """
        found = cls.match(target_str)
        if not found:
            return found
        # Only the first capture group (the optional chapter prefix) is needed
        prefix = found.groups()[0]
        numbering = found.group()
        if prefix:
            # Drop the chapter prefix from the left of the matched text
            numbering = numbering[len(prefix):]
        # Discard the empty pieces produced by splitting on the dots
        num_tup = tuple(filter(None, numbering.split(".")))
        for numeral in num_tup:
            if not validate_roman_numeral(numeral):
                raise ValueError(
                    f"Non-Roman numeric chapter numbering: {num_tup}")
        return tuple(roman2int(numeral) for numeral in num_tup)
Ejemplo n.º 3
0
class HTMLSection:
    """Section of an HTML document whose attributes are filled by CSS selectors.

    When a `_prop_dict` attribute is present, it maps attribute names to
    `(css_selector, select_all, [callback])` entries. Each entry is resolved
    against the root tag at construction time and stored as an attribute.
    """

    def __init__(self, html_tag):
        """Keep `html_tag` as the root, then parse it into attributes."""
        self.root = html_tag
        # The root is parsed and then cleared once parsing is done
        self._set_up_props()

    # Properties and class properties are added to the class namespace here
    add_props_to_ns(["root"])
    add_classprops_to_ns(["root_subselector"])

    def _attr_tups_from_prop_dict(self):
        """Return `(attribute name, value)` pairs built from `_prop_dict`."""
        pairs = []
        for name, (selector, select_many, *extras) in self._prop_dict.items():
            if any(callable(extra) for extra in extras):
                # The first extra item is used as the post-processing callback
                transform = extras[0]
            else:
                # No callable supplied: keep the selection result as it is
                transform = lambda x: x
            picker = self._selAll if select_many else self._sel
            pairs.append((name, transform(picker(selector))))
        return pairs

    def _set_up_props(self):
        """Populate attributes from `_prop_dict`, if any, then clear the root."""
        if not hasattr(self, "_prop_dict"):
            return
        # Reached when a subclass defining `_prop_dict` calls `super().__init__`
        parsed = self._attr_tups_from_prop_dict()
        self._set_attrs(parsed)
        # The content now lives in the attributes, so the root is released
        self.root = None

    def _set_attrs(self, attr_tuple_list):
        """Set every `(name, value)` pair of `attr_tuple_list` on this instance."""
        for pair in attr_tuple_list:
            name, value = pair
            setattr(self, name, value)

    def _sel(self, css_selector):
        """Return the result of `select_one(css_selector)` on the root."""
        root = self.root
        return root.select_one(css_selector)

    def _selAll(self, css_selector):
        """Return the result of `select(css_selector)` on the root."""
        root = self.root
        return root.select(css_selector)
Ejemplo n.º 4
0
class AMSBookInfoPage:
    """Book info page split into a content section and a metadata section."""

    def __init__(self, soup):
        """Select the page's info container and build both sections from it."""
        # Everything of interest sits below this selector
        info_root = soup.select_one("div.productPage div.bounds")
        self.content = ContentSection(info_root)
        self.metadata = TextInfoSection(info_root)

    def _df_repr(self, as_dict=False):
        """
        Flatten the tree of properties into one panel of data.

        Every name in `_properties` is visited and, for each key of that
        property's `_prop_dict`, the corresponding sub-property is read. A
        sub-property implementing `_df_repr` itself contributes its own
        flattened dict (recursive step); any other value becomes a
        single-item list stored under the sub-property name.

        Returns the key-sorted dict when `as_dict` is true, otherwise a
        pandas DataFrame built from that dict.
        """
        # One flat dict holds everything, i.e. the columns of a single DataFrame
        columns = {}
        for section_name in self._properties:
            section = getattr(self, section_name)
            # Taking `_prop_dict` as a list yields its keys
            for field_name in list(section._prop_dict):
                field_value = getattr(section, field_name)
                if hasattr(field_value, "_df_repr"):
                    # Recursive step into the nested representation
                    columns.update(field_value._df_repr(as_dict=True))
                else:
                    columns[field_name] = [field_value]
        # Keys are put in sorted order before anything is returned
        ordered = {key: columns[key] for key in sorted(columns)}
        if as_dict:
            return ordered
        # Late import, otherwise every module implementing _df_repr pays the start lag
        from pandas import DataFrame
        return DataFrame.from_dict(ordered)

    _properties = ["content", "metadata"]
    add_props_to_ns(_properties)
Ejemplo n.º 5
0
class SymbolGroup(RegexMultiMatchable):
    """A symbol together with its `[sub(...)]` / `[sup(...)]` clause.

    Built from a match of `_re`; the matched text is wrapped in a `Formula`
    (defined elsewhere).
    """

    def __init__(self, val):
        """Wrap the text matched by `val` in a `Formula`.

        Args:
            val: object exposing `.group()` returning the matched text
                (presumably an `re.Match` — confirm against the base class).
        """
        self.formula = Formula(val.group())

    # `su[bp]` matches exactly "sub" or "sup". Inside a character class `|` is a
    # literal, so the previous `[b|p]` also accepted the bogus command "su|".
    _re = r"\S+\[su[bp]\(+[^\]]+?\)\]\)?"
    # NOTE(review): `matched` is not assigned in this class; presumably the
    # `RegexMultiMatchable` base provides it — confirm.
    add_props_to_ns(["matched", "formula"])
Ejemplo n.º 6
0
class FormulaTree:
    """Parsed formula held as a list of splits and a list of inner terms.

    On construction the tree is also reduced to a "statement": a dict mapping
    the index of each subject split to the `(command, inner term)` pairs
    applied to that subject (see `as_statement`).
    """

    def __init__(self, splits, inners):
        """Store the parsed parts and derive the statement from them.

        Args:
            splits: indexable collection of Split objects; the code reads
                their `info` and `split_index` attributes.
            inners: iterable of InnerTerm objects; the code reads their
                `info` and `parent_split_index` attributes.
        """
        self._splits = splits
        self._inners = inners
        # dict keyed by Split index of subjects
        self._statement = self.as_statement()

    def __repr__(self):
        """Return the reprs of all splits, a blank line, then all inner terms."""
        split_reprs = "\n".join(map(repr, self.splits))
        inners_reprs = "\n".join(map(repr, self.inners))
        return f"{split_reprs}\n\n{inners_reprs}"

    add_props_to_ns(["splits", "inners", "statement"], read_only=True)

    # TODO: parse the tree in such a way that an `[`-opener's preceder element is the
    # subject of the `sub`/`sup` inside the `[]` clause object, and also that the
    # 'clopening' aspect will permit multiple such `[]` clause objects to be associated
    # to a single subject. E.g. in `(A[sub(5)])[sup(⊥)]`, `A` is the subject of both
    # `sub` and `sup` clause objects (with different inner terms: `5` and `⊥`).

    # e.g. a simple example with one inner term and two splits:
    # 'A[sub(2)]'
    #
    # Split: < self.info='A' ~ self.open_index=0 ~ #self.split_index=None >
    # Split: < self.info='sub' ~ self.open_index=1 ~ #self.split_index=0 >
    # InnerTerm: < self.info='2' ~ #self.parent_split_index=1 >
    #
    # e.g. a less simple example with two inner terms and five splits:
    # '(A[sub(5)])[sup(⊥)]'
    #
    # Split: < self.info='' ~ self.open_index=0 ~ #self.split_index=None >
    # Split: < self.info='A' ~ self.open_index=1 ~ #self.split_index=0 >
    # Split: < self.info='sub' ~ self.open_index=2 ~ #self.split_index=1 >
    # Split: < self.info='' ~ self.open_index=3 ~ #self.split_index=0 >
    # Split: < self.info='sup' ~ self.open_index=4 ~ #self.split_index=3 >
    # InnerTerm: < self.info='5' ~ #self.parent_split_index=2 >
    # InnerTerm: < self.info='⊥' ~ #self.parent_split_index=4 >

    def as_statement(self):
        """Group every inner term under the split index of its subject.

        Returns:
            A dict mapping a subject's index in `splits` to the list of
            `(command, inner_info)` tuples applied to it, in the order the
            inner terms are visited.

        Raises:
            ValueError: if a command's split has no caller (its `split_index`
                is None), or an unnamed caller has no linked opening split.
        """
        # Store each subject term as index of unique Split obj
        statement_subjects = {}
        for inner in self.inners:
            # e.g. '(A[sub(5)])[sup(⊥)]' --> first: `inner='5'`
            parent_split = self.splits[inner.parent_split_index]  # --> 0-index 2
            caller_command = parent_split.info  # --> `Split.info='sub'`
            caller_split_i = parent_split.split_index  # --> `Split.split_index=1`
            # Check for a missing caller BEFORE using it as an index: indexing a
            # list with None raises TypeError and would mask this error.
            if caller_split_i is None:
                raise ValueError(f"{caller_command}'s subject is unknown")
            caller_subject = self.splits[caller_split_i]  # --> 0-index 1
            if caller_subject.info == "":  # e.g. for `inner='⊥'` in example above
                # The unnamed caller is a closing bracket: follow its link to the
                # matching opener, e.g. 3 --> 0
                clopen_i = caller_subject.split_index
                if clopen_i is None:
                    raise ValueError(f"{caller_command}'s subject is unknown")
                caller_split_i = clopen_i + 1  # inside linked bracket
                # Look up the redirected subject (--> 'A' is clopen name); this
                # also rejects an out-of-range index, as the lookup always did
                self.splits[caller_split_i].info
            # Now know the subject to which `inner` is applied by `caller_command`
            command_object = (caller_command, inner.info)
            # subject index in splits --> list of commands applied to it
            statement_subjects.setdefault(caller_split_i, []).append(command_object)
        return statement_subjects