Exemple #1
0
 def unfold_section(self, obj, level):
     """Group the content of `obj` under its headers of depth `level`.

     Every header node whose ``depth`` equals `level` becomes a
     ``section`` node whose title is the header's content and whose
     content is the matching chunk returned by ``self.split_list``.
     Content found before the first header is kept in front of the
     generated sections.  The regrouped content is passed through
     ``self(...)`` and merged back into `obj`.

     If the number of headers and chunks do not line up, a single
     ``error`` node replaces the content and the problem is logged.
     """

     def is_level_header(node):
         return node["type"] == "header" and node["depth"] == level

     def as_section(header, body):
         return {
             "type": "section",
             "title": header["content"],
             "depth": header["depth"],
             "content": body,
         }

     children = obj["content"]
     headers = [node for node in children if is_level_header(node)]
     # NOTE(review): split_list presumably cuts the list at every element
     # matching the predicate and drops the separators -- confirm.
     chunks = self.split_list(is_level_header, children)
     surplus = len(chunks) - len(headers)

     if not headers and len(chunks) == 1:
         # No header of this depth: keep the content as it is.
         subsections = children
     elif surplus == 0:
         # Content starts directly with a header.
         subsections = [as_section(h, c) for h, c in zip(headers, chunks)]
     elif surplus == 1:
         # One leading chunk precedes the first header.
         sections = [as_section(h, c) for h, c in zip(headers, chunks[1:])]
         subsections = chunks[0] + sections
     else:
         # Header and chunk counts cannot be reconciled.
         message = "ill-formed structure in article"
         subsections = [{"type": "error", "message": message}]
         log_parser_error(message, obj, position=self.current_position)

     return merge(obj, {"content": self(subsections)})
Exemple #2
0
        def transform_chapter(self, obj):
            """Sum up the per-author values of all children of a chapter.

            Each child in ``obj["children"]`` carries an ``authors``
            mapping; the values for equal keys are added together and the
            totals are merged into `obj` under the key ``authors``.
            """
            totals = defaultdict(int)

            for child in obj["children"]:
                for author, amount in child["authors"].items():
                    totals[author] += amount

            return merge(obj, {"authors": totals})
Exemple #3
0
        def transform_header(self, obj):
            """Split the trailing anchor template off a header's content.

            The last content element supplies the anchor (its parameter
            ``"1"``); everything before it, passed through ``text_rstrip``,
            becomes the new header content.
            """
            # NOTE(review): these bare comparisons are kept on purpose --
            # `check(...)` presumably returns an object whose `__eq__`
            # raises when the looked-up value differs; confirm in its
            # definition.
            check(obj, "content", -1, "type") == "template"
            check(obj, "content", -1, "name") == "anker"

            content = obj["content"]
            title = text_rstrip(content[:-1])
            anchor_template = content[-1]

            return merge(obj, {
                "content": title,
                "anchor": anchor_template["params"]["1"],
            })
Exemple #4
0
 def transform_article(self, obj):
     """Inline the content of every ``included_section`` child.

     When `obj` has no child of type ``included_section`` it is returned
     unchanged (the very same object).  Otherwise each such child is
     replaced, in place and in order, by the elements of its own
     ``content`` and the flattened list is merged back into `obj`.
     """
     nodes = obj["content"]

     if not any(node["type"] == "included_section" for node in nodes):
         return obj

     flattened = []
     for node in nodes:
         if node["type"] == "included_section":
             flattened.extend(node["content"])
         else:
             flattened.append(node)

     return merge(obj, {"content": flattened})
Exemple #5
0
    def test_merge(self):
        """Check merge() on None, lists and dicts, and that it is pure."""
        # Merging into None yields the second argument.
        self.assertEqual(merge(None, "a"), "a")
        self.assertListEqual(merge(None, [1, 2]), [1, 2])
        self.assertIsNone(merge(None, None))

        # (first operand, second operand, expected result)
        cases = [
            # lists are concatenated
            ([1, 2], [3, 4], [1, 2, 3, 4]),
            ([], [3, 4], [3, 4]),
            ([1, 2], [], [1, 2]),
            (["a"], ["b"], ["a", "b"]),
            # dicts are united, the second operand wins on key clashes
            ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
            ({"a": 1}, {"a": 2}, {"a": 2}),
            ({}, {"a": 2}, {"a": 2}),
            ({"a": 2}, {}, {"a": 2}),
        ]

        for left, right, expected in cases:
            left_snapshot = left.copy()
            right_snapshot = right.copy()

            if isinstance(left, dict):
                assert_same = self.assertDictEqual
            else:
                assert_same = self.assertListEqual

            assert_same(merge(left, right), expected)

            # merge() must leave both of its operands untouched
            assert_same(left, left_snapshot)
            assert_same(right, right_snapshot)
0
        def transform_article(self, article):
            """Attach parsed content and authors to an article node.

            Logs a heading with a link to the article, parses the content
            fetched through ``self.api`` with an ``ArticleContentParser``
            and merges the result, together with the article's authors,
            into `article`.
            """
            title = article["title"]
            parser = ArticleContentParser(api=self.api, title=title)

            # Link target used in the report: spaces become "+".
            link = self.api._index_url + "?title=" + title.replace(" ", "+")
            heading = "== Parsing of Article [{} {}] ==".format(link, title)
            report_logger.info(heading)

            parsed = parser(self.api.get_content(title))
            contributors = self.get_article_authors(title)

            return merge(article, {"content": parsed, "authors": contributors})
Exemple #7
0
        def normalize(self, obj, mode):
            """Replace ``obj["formula"]`` by its normalized form.

            The formula is normalized through ``self.api.normalize_formula``
            for the given `mode`.  In mode ``"tex"`` the wrapping
            ``{\\begin{aligned}`` / ``\\end{aligned}}`` is stripped from the
            result.  A ``ValueError`` raised along the way is logged and
            turned into an ``error`` node instead of propagating.
            """
            try:
                normalized = self.api.normalize_formula(obj["formula"], mode)

                if mode == "tex":
                    normalized = remove_suffix(
                        remove_prefix(normalized, "{\\begin{aligned}"),
                        "\\end{aligned}}")
            except ValueError:
                error_text = "Wrong formatted formula"
                # TODO: current_section was not set for this class
                log_parser_error(error_text, obj)
                return {"type": "error", "message": error_text}
            else:
                return merge(obj, {"formula": normalized})
Exemple #8
0
        def transform_template(self, obj):
            """Collect numbered template parameters into ``*_list`` entries.

            For templates named in ``TEMPLATE_LIST_PARAMS`` and for each of
            their parameter prefixes, the parameters ``<prefix>1``,
            ``<prefix>2``, ... (consecutive, starting at 1) are removed from
            a copy of the parameters and stored, in order, as a list under
            ``<prefix>_list``.

            Raises ``NotInterested`` for every other template.
            """
            if obj["name"] not in TEMPLATE_LIST_PARAMS:
                raise NotInterested()

            # Work on a copy so that obj["params"] itself stays untouched.
            params = obj["params"].copy()

            for prefix in TEMPLATE_LIST_PARAMS[obj["name"]]:
                collected = []

                for number in count(1):
                    key = prefix + str(number)
                    if key not in params:
                        # First gap in the numbering ends the list.
                        break
                    collected.append(params.pop(key))

                params[prefix + "_list"] = collected

            return merge(obj, {"params": params})
Exemple #9
0
        def change_inline(self, obj, i, n):
            """Normalize whitespace of the `i`-th of `n` inline elements.

            Non-text elements are handed on to ``self(obj)``.  For text
            elements every run of whitespace is collapsed to one space;
            the first element (``i == 0``) additionally loses leading and
            the last one (``i == n - 1``) trailing whitespace.

            Returns `obj` merged with the cleaned ``data``, or ``None``
            when no text is left.
            """
            if lookup(obj, "type") != "text":
                return self(obj)

            # One pass: a whitespace run of any length (newlines included)
            # becomes a single space.
            data = re.sub(r"\s+", " ", obj["data"])

            if i == 0:
                data = data.lstrip()

            if i == n - 1:
                data = data.rstrip()

            if not data:
                return None

            return merge(obj, {"data": data})
Exemple #10
0
    def query(self, params, path_to_result):
        """Run an ``action=query`` API call and follow all continuations.

        `params` are the request parameters (``format`` and ``action`` are
        set by this method), `path_to_result` is the path below the
        top-level ``query`` key at which the wanted data is found.  The
        partial results of all continuation requests are combined with
        ``merge`` and returned.

        The caller's `params` dict is not modified.

        Raises ``ConnectionError`` when the API answers with an error; the
        message is the server's ``info`` text when one is available.
        """
        # Work on a copy: writing the continuation tokens into the caller's
        # dict would make a reused dict resume in the middle of a result.
        request_params = dict(params)
        request_params["format"] = "json"
        request_params["action"] = "query"
        result_path = ["query"] + path_to_result
        result = None

        while True:
            response = self.req.get(self._api_url, params=request_params)
            api_result = response.json()

            if "error" in api_result:
                message = "Error while making API call."
                error = api_result["error"]
                # The description normally sits inside the error object;
                # the top-level lookup is kept as a fallback.
                info = error.get("info") if isinstance(error, dict) else None

                raise ConnectionError(info or api_result.get("info", message))

            result = merge(result, query_path(api_result, result_path))

            if "continue" not in api_result:
                return result

            request_params.update(api_result["continue"])
Exemple #11
0
        def transform_dict(self, obj):
            """Merge the transformed node onto the defaults of its type.

            The node is first transformed by the parent implementation
            (``act_on_dict``); the result is then merged onto the entry of
            ``DEFAULT_VALUES`` for ``obj["type"]``.
            """
            # NOTE(review): presumably fails when obj["type"] is not a key
            # of DEFAULT_VALUES -- confirm against `check`.
            check(obj, "type").of(DEFAULT_VALUES)

            defaults = DEFAULT_VALUES[obj["type"]]
            transformed = super(NodeTransformation, self).act_on_dict(obj)

            return merge(defaults, transformed)
Exemple #12
0
        def transform_template(self, obj):
            """Translate a template node into its dedicated node type.

            Handled, in this order:

            * templates listed in ``BOXSPEC`` -> node of the box type with
              the mapped (and transformed) parameters,
            * ``liste`` -> ``list`` node,
            * ``formel`` -> ``equation`` node wrapped in an ``align``
              environment, or an ``error`` node when the parameter is not
              exactly one ``inlinemath`` element,
            * ``(!``, ``noprint`` and ``#invoke:...`` -> ``None``,
            * ``todo`` -> ``error`` node,
            * anything else -> ``notimplemented`` node.

            The error and fallback cases are reported through
            ``log_parser_error``.
            """
            name = obj["name"]

            for box_type, template_name, param_names in BOXSPEC:
                if name == template_name:
                    box = {
                        key: self(obj["params"].get(source))
                        for key, source in param_names.items()
                    }

                    return merge(box, {"type": box_type})

            if name == "liste":
                list_params = obj["params"]

                if "liste" in list_params:
                    # The list is given as one parsed list node.
                    sublist = list_params["liste"][0]

                    assert sublist["type"] == "list"

                    items, ordered = sublist["items"], sublist["ordered"]
                else:
                    # The list is given item by item.
                    items = [{
                        "type": "listitem",
                        "content": self(entry)
                    } for entry in list_params["item_list"]]
                    ordered = list_params.get("type", "") == "ol"

                return {
                    "type": "list",
                    "items": items,
                    "ordered": ordered,
                    "spacing": list_params.get("abstand")
                }

            if name == "formel":
                nodes = obj["params"].get("1", [])
                is_single_math = len(nodes) == 1 and \
                    lookup(nodes, 0, "type") == "inlinemath"

                if not is_single_math:
                    message = "Wrong formatted equation"
                    details = "Equation source code must be completely contained in just one <math></math>.\n (use \\text{this is not math} macro instead)"

                    log_parser_error(message, obj, details,
                                     self.current_section)

                    return {"type": "error", "message": message}

                begin = "\\begin{align}"
                end = "\\end{align}"
                formula = nodes[0]["formula"]

                # Strip an existing align wrapper first, so the result is
                # wrapped exactly once.
                if formula.startswith(begin) and formula.endswith(end):
                    formula = remove_suffix(remove_prefix(formula, begin), end)

                return {"type": "equation", "formula": begin + formula + end}

            # "#invoke:" templates are headers or footers; like "(!" and
            # "noprint" they produce no output.
            if name == "(!" or name.startswith("#invoke:") \
                    or name == "noprint":
                return None

            if name == "todo":
                message = "Todo-Message in MediaWiki code."
                details = "Check if this TODO shoud be completed for a book release."
                log_parser_error(message, obj, details, self.current_section)

                return {"type": "error", "message": message}

            message = "Parsing of template `{}`".format(name)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "target": obj,
                "message": message
            }