コード例 #1
0
    def test_text_acquisition_methods(self):
        # These methods are intended for use against Tag, but they
        # work on NavigableString as well.

        s = NavigableString("fee ")
        cdata = CData("fie ")
        comment = Comment("foe ")

        # A plain NavigableString yields its own text.
        assert s.get_text() == "fee "
        assert s.get_text(strip=True) == "fee"
        assert list(s.strings) == ["fee "]
        assert list(s.stripped_strings) == ["fee"]
        assert list(s._all_strings()) == ["fee "]

        # CData behaves exactly the same way.
        assert cdata.get_text() == "fie "
        assert cdata.get_text(strip=True) == "fie"
        assert list(cdata.strings) == ["fie "]
        assert list(cdata.stripped_strings) == ["fie"]
        assert list(cdata._all_strings()) == ["fie "]

        # Since a Comment isn't normally considered 'text',
        # these methods generally do nothing.
        assert comment.get_text() == ""
        assert list(comment.strings) == []
        assert list(comment.stripped_strings) == []
        assert list(comment._all_strings()) == []

        # Unless you specifically say that comments are okay.
        assert comment.get_text(strip=True, types=Comment) == "foe"
        assert comment.get_text(types=(Comment, NavigableString)) == "foe "
コード例 #2
0
ファイル: base.py プロジェクト: ZhymabekRoman/translate
 def _translate(node: NavigableString):
     """Replace *node*'s text in place with its translation.

     Relies on ``self``, ``dest_lang`` and ``source_lang`` captured from
     the enclosing scope (this is a nested helper).
     """
     try:
         node.replace_with(
             self.translate(str(node),
                            destination_language=dest_lang,
                            source_language=source_lang).result)
     except Exception:  # best-effort: ignore when no result is found or an error occurred
         pass
コード例 #3
0
ファイル: pidgin.py プロジェクト: spadev/chatlogsync
    def _write_title(self, file_object, conversation):
        """Format the conversation title and write it to *file_object*."""
        raw_title = self.fill_pattern(conversation, self.TITLE_PATTERN,
                                      self.TIME_FMT_TITLE, untransform=True)
        elem = NavigableString(raw_title)
        elem.setup()  # workaround for BeautifulSoup issue
        title = elem.output_ready()
        line = self.TITLE_LINE_FMT % (title, title) + '\n'
        file_object.write(line)
コード例 #4
0
 def add_link_to_ndpname(tag, href):
     """Wrap the middle part of *tag*'s string in an <a> pointing at *href*.

     break_string() splits the text into (initial, middle, final); only the
     middle piece becomes the link text.
     """
     before, link_text, after = break_string(tag.string)
     tag.string = ''
     anchor = Tag(name="a", attrs={'class': 'link-to-model',
                                   'href': href,
                                   'target': '_blank'})
     anchor.string = link_text
     tag.append(NavigableString(before))
     tag.append(anchor)
     tag.append(NavigableString(after))
コード例 #5
0
    def _write_title(self, file_object, conversation):
        """Render the conversation title and emit it as a single line."""
        filled = self.fill_pattern(conversation,
                                   self.TITLE_PATTERN,
                                   self.TIME_FMT_TITLE,
                                   untransform=True)
        node = NavigableString(filled)
        node.setup()  # workaround for BeautifulSoup issue
        ready = node.output_ready()
        file_object.write(self.TITLE_LINE_FMT % (ready, ready) + '\n')
コード例 #6
0
    def insert_escaped_tags(self, tags):
        """Insert escaped open/close markers (e.g. "<p>" ... "</p>") around
        the text of each tag in *tags* so the markers survive when HTML
        markup is stripped out.

        Returns True if at least one tag received markers.
        """
        inserted_any = False
        for tag in tags:
            strings = list(tag.strings)
            if not strings:
                continue
            tag_name = tag.name
            strings[0].parent.insert(0, NavigableString("<" + tag_name + ">"))
            strings[-1].parent.append(NavigableString("</" + tag_name + ">"))
            inserted_any = True
        return inserted_any
コード例 #7
0
ファイル: html_writer.py プロジェクト: chris-carlson/python
 def _convert_element(self,
                      wrapper_element: HtmlTag,
                      parent: Tag = None) -> Tag:
     """Recursively convert an HtmlTag wrapper into a native bs4 Tag.

     Children are converted first and appended to ``contents``; when the
     wrapper also carries text, that text REPLACES the converted children
     (see the reassignment below) -- presumably a wrapper holds either
     text or children, never both.  TODO(review): confirm.
     """
     native_element: Tag = Tag(name=wrapper_element.name,
                               attrs=wrapper_element.attributes,
                               parent=parent,
                               previous=NavigableString('\n'),
                               builder=BUILDER)
     for child in wrapper_element.children:
         native_element.contents.append(
             self._convert_element(child, native_element))
     # A non-empty text payload overwrites any children appended above.
     if len(wrapper_element.text) > 0:
         native_element.contents = [NavigableString(wrapper_element.text)]
     return native_element
コード例 #8
0
    def __init__(self, **kwargs):
        """Initialize the entry from keyword arguments.

        Every keyword ``k`` is stored on the instance as ``_k``,
        overriding the defaults below; the _validate_argument calls at
        the end enforce types and the sender/alias invariants.
        """
        # Defaults for the recognized fields.
        self._alias = ''
        self._sender = ''
        self._text = ''
        self._time = ''
        self._delayed = False
        self._alternate = False
        self._html = []
        self._isuser = False

        # Store every provided keyword under a leading-underscore name.
        for k, v in iter(kwargs.items()):
            setattr(self, '_' + k, v)

        self._system = True if kwargs.get('system', None) else False
        # Derive html from the plain text when no markup was supplied.
        if self._text and not self._html:
            self._html = [NavigableString(self._text)]

        # NOTE: ``basestring`` implies this snippet targets Python 2.
        for argname in ('alias', 'sender', 'text'):
            _validate_argument(getattr(self, '_' + argname), argname,
                               basestring)

        if self._system:
            # System entries carry neither sender nor alias.
            self._alias = ''
            self._sender = ''
        elif not self._alias and not self._sender:
            raise ArgumentError('non-system Entry must have sender or alias')
        elif self._alias == self._sender:
            # Alias identical to sender is redundant; drop it.
            self._alias = ''
        _validate_argument(self._time, 'time', datetime.datetime)
        _validate_argument(self._html, 'html', list)
        for e in self._html:
            _validate_argument(e, 'html', PageElement)
コード例 #9
0
    def _parse_status(self, comment, info, conversation):
        """Fill status fields of *info* either from a "type|system|sender"
        *comment* string or, failing that, by pattern-matching the
        rendered text of ``info['html']``.
        """
        if comment:
            # Comment encodes "type|system|sender" (sender may contain '|').
            info['type'], info['system'], info['sender'] = \
                comment.split("|", 2)
            info['type'] = int(info['type'])
            if info['type'] in Status.USER_TYPES:
                # Split "sender: message" to recover the message body.
                l = info['html'][0].split(': ', 1)
                if len(l) == 2:
                    info['msg_html'] = [NavigableString(l[1])
                                        ] + info['html'][1:]

                info['html'] = []
            return

        # No comment: flatten the html fragments to one plain string.
        s = ''.join(
            [x.text if isinstance(x, Tag) else x.string for x in info['html']])
        info['sender'] = conversation.source \
            if s.startswith(_("You")) else None

        if not info['type']:
            # Group chats get the extra CHAT_STATUS_TYPEMAP patterns.
            typemap = dict(self.STATUS_TYPEMAP, **self.CHAT_STATUS_TYPEMAP) if \
                conversation.isgroup else self.STATUS_TYPEMAP
            for pattern, t in iter(typemap.items()):
                i = util.parse_string(s, pattern)
                if i is not None:
                    for k, v in iter(i.items()):
                        info[k] = v
                    # special case for 'is no longer <type>'
                    typestr = i.get('type')
                    if typestr:
                        info['type'] = \
                            Status.OPPOSITES[Status.PAM_EPYT[typestr]]
                    else:
                        info['type'] = t
                    break
コード例 #10
0
    def _parse(self, course: NavigableString) -> ParseType:
        """Parses course to get its link & icon url, title, description
        counts and stores.
        :course: BeautifulSoup Object

        NOTE(review): despite the annotation, *course* is accessed like a
        Tag (``.a``, ``.find`` ...), not a NavigableString -- confirm and
        fix the annotations if Tag is importable here.
        """

        info = {
            "link": "",
            "icon": "",
            "title": "",
            "description": "",
            "counts": {}
        }

        info["link"] = course.a["href"]
        info["icon"] = course.a.img["src"]

        # The first <div> inside the anchor holds title and description.
        description: NavigableString = course.a.div
        info["title"] = description.div.get_text()
        info["description"] = description.p.get_text()

        # Each <li> under .courseCounts looks like "<span>name</span><p>1,234</p>".
        counts: NavigableString = course.find("div", {"class": "courseCounts"})
        counts_data: ResultSet = counts.find_all("li")
        for data in counts_data:
            name: str = data.span.get_text().lower()
            val: str = data.find("p").get_text()
            # Strip thousands separators before converting.
            info["counts"][name] = int(val.replace(",", ""))

        return info
コード例 #11
0
 def _populate_fields(self, form_element: Tag, field_values: Dict[str, List[str]]) -> None:
     """
     Write *field_values* into the matching form controls (inputs,
     selects and textareas) found under *form_element*.
     """
     for raw in form_element.find_all(['input', 'select', 'textarea']):
         field = cast(Tag, raw)
         name = cast(str, field.get('name'))
         if name not in field_values:
             continue
         values = field_values[name]
         kind = field.name
         if kind == 'input':
             if field.get('type') in ('radio', 'checkbox'):
                 # Toggle the checked flag according to the wanted values.
                 if field.get('value') in values:
                     field['checked'] = ''
                 else:
                     del field['checked']
             else:
                 field['value'] = values[0]
         elif kind == 'select':
             for raw_option in field.find_all('option'):
                 option = cast(Tag, raw_option)
                 if option.get('value') in values:
                     option['selected'] = ''
                 else:
                     del option['selected']
         elif kind == 'textarea':
             # Replace the textarea body with the first provided value.
             field.contents = [NavigableString(values[0])]
コード例 #12
0
def recursive_traversal(element, data):
    """Walk the tree under *element*, replacing localizable text with
    ``{{localize('<md5>')}}`` placeholders and recording the original
    strings in *data* keyed by their MD5 hash.
    """
    if element.name in EXCLUDE_BLOCKS:
        return

    # Partition the direct children by how they must be handled.
    block_children = []
    non_block = []
    non_block_with_attr = []
    navigable_strings = []
    for child in element.children:
        if isinstance(child, NavigableString):
            # Ignore empty or whitespace-only text nodes.
            if str(child) != '' and not child.isspace():
                navigable_strings.append(child)
        elif child.name not in NON_BLOCK_ELEMENTS:
            block_children.append(child)
        elif any(child.has_attr(attr) for attr in TEXT_ATTRS):
            non_block_with_attr.append(child)
        else:
            non_block.append(child)

    if block_children:
        for child in non_block_with_attr:
            replace_attrs(child, data)
        for child in block_children:
            replace_attrs(child, data)
            recursive_traversal(child, data)
        # Swap each text node for a localize() placeholder keyed by hash.
        for navigable_str in navigable_strings:
            string_hash = hashlib.md5(navigable_str.encode()).hexdigest()
            data[string_hash] = str(navigable_str)
            navigable_str.replace_with(
                NavigableString(f"{{{{localize('{string_hash}')}}}}"))
        for block in non_block:
            hash_element_content(block, data)

    else:
        # No block-level children: hash this element's content wholesale.
        hash_element_content(element, data)
コード例 #13
0
 def recursive_replace(tag):
     """Depth-first walk replacing every <code> element under *tag* with a
     placeholder string produced by the enclosing ``self.store`` (nested
     helper: relies on ``self`` from the surrounding scope).
     """
     if hasattr(tag, "contents"):
         for i in range(len(tag.contents)):
             child = tag.contents[i]
             # NavigableStrings have name == None, so only real <code>
             # tags match this branch.
             if child.name == "code":
                 tag.contents[i] = NavigableString(self.store(str(child)))
             else:
                 recursive_replace(child)
コード例 #14
0
ファイル: highlight.py プロジェクト: rusi/mcdp
    def make_tag(tag0, klass, data, ndp=None, template=None, poset=None):
        """Build the output element for a rendered diagram.

        Rescales the SVG in ``data['svg']``, copies style/id from *tag0*,
        and (when ``generate_pdf`` is set in the enclosing scope) wraps the
        SVG together with a PDF download link in a <div>.
        """
        svg = data['svg']

        tag_svg = BeautifulSoup(svg, 'lxml', from_encoding='utf-8').svg

        assert tag_svg.name == 'svg'
        if tag_svg.has_attr('width'):
            # Width/height come as "NNNpt"; strip the unit and rescale.
            ws = tag_svg['width']
            hs = tag_svg['height']
            assert 'pt' in ws
            w = float(ws.replace('pt',''))
            h = float(hs.replace('pt',''))
            scale = MCDPConstants.scale_svg

            w2 = w * scale
            h2 = h * scale
            tag_svg['width'] = w2
            tag_svg['height'] = h2
            tag_svg['rescaled'] = 'Rescaled from %s %s, scale = %s' % (ws, hs, scale)
        else:
            print('no width in SVG tag: %s' % tag_svg)

        tag_svg['class'] = klass

        # Carry over presentation attributes from the source tag.
        if tag0.has_attr('style'):
            tag_svg['style'] = tag0['style']
        if tag0.has_attr('id'):
            tag_svg['id'] = tag0['id']

        if generate_pdf:
            pdf0 = data['pdf']
            pdf = crop_pdf(pdf0, margins=0)

            div = Tag(name='div')

            # Pick a basename for the download: explicit id, then the
            # load-name attribute of ndp/template/poset, else a hash.
            att = MCDPConstants.ATTR_LOAD_NAME
            if tag0.has_attr('id'):
                basename = tag0['id']
            elif ndp is not None and hasattr(ndp, att):
                basename = getattr(ndp, att)
            elif template is not None and hasattr(template, att):
                basename = getattr(template, att)
            elif poset is not None and hasattr(poset, att):
                basename = getattr(poset, att)
            else:
                # NOTE(review): sha224 of a str only works on Python 2;
                # Python 3 would require encoding first.
                hashcode = hashlib.sha224(tag0.string).hexdigest()[-8:]
                basename = 'code-%s' % (hashcode)

            docname = os.path.splitext(os.path.basename(realpath))[0]
            download = docname + "." + basename + "." + klass + '.pdf'
            a = create_a_to_data(download=download, data_format='pdf', data=pdf)
            a['class'] = 'pdf_data'
            a.append(NavigableString(download))
            div.append(tag_svg)
            div.append(a)
            return div
        else:
            return tag_svg
コード例 #15
0
 def html(self):
     """Lazily build and cache the HTML fragments for this status."""
     if self._html:
         return self._html
     self._html = []
     if self.type in self.USER_TYPES:
         who = self.alias if self.alias else self.sender
         sep = ': ' if self.msg_html else ''
         text = self.STATUS_STRING_FMT % (who, self.typestr, sep)
         self._html.append(NavigableString(text))
         self._html.extend(self.msg_html)
     else:
         self._html.append(self.typestr)
     return self._html
コード例 #16
0
ファイル: adium.py プロジェクト: spadev/chatlogsync
    def _parse_line(self, line, conversation, source, transformed_source):
        """Return (cons, attrs)

        Parses one XML log line into the entry constructor to use
        (Status/Event/Message) plus its attribute dict.  A leading XML
        comment, when present, carries "alternate|status_html".
        """
        status_html = []
        attrs = {}
        cons = None

        for elem in BeautifulSoup(line, ['lxml', 'xml']).children:
            if isinstance(elem, Comment):
                # Comment payload: "<alternate-flag>|<status html>".
                alternate, status_html = elem.split('|', 1)
                attrs['alternate'] = True if alternate else False
                status_html = [NavigableString(status_html)]
                continue

            for key in ('alias', 'sender', 'auto', 'time'):
                attrs[key] = elem.get(key, '')

            if attrs['sender'] == source:
                attrs['sender'] = transformed_source
                attrs['isuser'] = True
            else:
                attrs['isuser'] = False

            attrs['auto'] = bool(attrs['auto'])
            if attrs['time']:
                fmt = self.STRPTIME_FMT_CONVERSATION
                attrs['time'] = self._parse_time(attrs['time'], fmt)

            attrs['html'] = list(elem.children)

            # Dispatch on element name to choose the entry constructor.
            if elem.name == 'status':
                cons = Status
                attrs['type'] = self.STATUS_TYPEMAP.get(elem.get('type'), None)
                if attrs['type'] in Status.USER_TYPES:
                    # User statuses keep their body as msg_html and use the
                    # html captured from the comment instead.
                    attrs['msg_html'] = attrs['html']
                    attrs['html'] = status_html
            elif elem.name == 'event':
                cons = Event
                attrs['type'] = self.EVENT_TYPEMAP.get(elem.get('type'), None)
            elif elem.name == 'message':
                cons = Message
            else:
                raise TypeError("unknown type '%s' for entry" % elem.name)

            if not attrs['sender'] and not attrs['alias']:
                print_d("%s is a system entry" % elem)
                attrs['system'] = True

        if not cons:
            raise (ParseError("could not parse line: '%s'" % line))

        return cons, attrs
コード例 #17
0
ファイル: discuss.py プロジェクト: Ali-Hady/pkg_sololearnlib
    def _parse_details(self, code: NavigableString) -> ParseType:
        """Parses a codeContainer and extracts all the info.

        NOTE(review): despite the annotation, *code* is used like a Tag
        (``.find`` ...), not a NavigableString.
        """

        # Format of details ->
        # {votes: 1184, answers: 24077, post_link: <PostLink> title: <Title>,
        #  tags: [<Tags>, ...], author_name: <AuthorName>, author_link: <Link>,
        #  data_date: <DateTime>, avatar_link: <Link>}
        details: ParseType = {}
        post_stats: NavigableString = code.find("div", {"class": "postStats"})
        # Children alternate with whitespace nodes, hence the odd indices.
        post_stats_children = list(post_stats.children)

        details["votes"] = post_stats_children[1].p.string
        # Note the spelling of <a class='postAnsewers'.
        details["answers"] = post_stats_children[3].p.string

        post_details: NavigableString = code.find("div",
                                                  {"class": "postDetails"})

        details["post_link"] = post_details.p.a["href"]
        details["title"] = post_details.p.a.string

        # Fourth child of postDetails wraps the tag <span>s.
        tags_wrapper: NavigableString = list(post_details.children)[3]
        tags: ResultSet = tags_wrapper.find_all("span")
        tag_list: List[str] = []
        for tag in tags:
            tag_list.append(tag.string)

        details["tags"] = tag_list

        author_details: NavigableString = code.find("div",
                                                    {"class": "authorDetails"})
        details["author_name"] = author_details.div.a.string
        details["author_link"] = author_details.div.a["href"]
        details["data_date"] = author_details.p["data-date"]
        details["avatar_link"] = list(author_details.children)[3].img["src"]

        return details
コード例 #18
0
def get_bibliography(bibfile):
    """Parse an HTML bibliography file into a <div> of <cite> elements.

    Each <dt>/<dd> pair in *bibfile* becomes a <cite id="bib:NAME">
    element, separated by newline text nodes.

    NOTE: the disabled cleanup branch uses ``unicode`` and byte-level
    ``.replace`` -- this snippet targets Python 2.
    """
    # Fix: close the file deterministically instead of leaking the handle.
    with open(bibfile) as f:
        data = f.read()
    frag = bs(data)
    res = Tag(name='div')

    ids = []
    for dt in frag.select('dt'):
        assert dt.name == 'dt'
        # The <dt>'s anchor name becomes the cite id, prefixed "bib:".
        name = dt.a.attrs['name']
        name = 'bib:' + name
        ids.append(name)
        dd = dt.findNext('dd')
        assert dd.name == 'dd'
        # Copy the <dd> and turn it into a <cite>.
        entry = dd.__copy__()
        entry.name = 'cite'
        entry.attrs['id'] = name

        # Disabled cleanup pass that would strip newlines and [|] chars.
        try_to_replace_stuff = False
        if try_to_replace_stuff:
            for x in entry.descendants:
                #print('child', x)
                if isinstance(x, NavigableString):
                    s = x.string.encode('utf-8')
                    s = s.replace('\n', ' ')
                    s = s.replace('[', '')
                    s = s.replace('|', '')
                    s = s.replace(']', '')
                    y = NavigableString(unicode(s, 'utf-8'))
                    x.replace_with(y)
                    #print('string %r' % x.string)
                if isinstance(x, Tag) and x.name == 'a' and x.string == 'bib':
                    x.extract()
        res.append(NavigableString('\n'))
        res.append(entry)
        res.append(NavigableString('\n'))
    print('Found %d bib entries.' % len(ids))
    return res
コード例 #19
0
    def __init__(self, **kwargs):
        """Initialize a Status entry.

        Raises:
            TypeError: when 'type' is missing or outside [_MIN, _MAX].
        """
        self._msg_text = ''
        self._msg_html = []
        atype = kwargs.get('type', None)
        # Fix: reject a missing type explicitly.  Under Python 3, comparing
        # None with an int raises an unrelated TypeError instead of this
        # message (Python 2 happened to take the same branch via None < int).
        if atype is None or atype < self._MIN or atype > self._MAX:
            raise TypeError("unknown type %r for status" % atype)
        self._type = atype
        # NOTE(review): _msg_text was just set to '' above, so this branch
        # can never fire; kept for parity with the sibling Entry class.
        if self._msg_text and not self._msg_html:
            self._msg_html = [NavigableString(self._msg_text)]

        if self._type in self.SYSTEM_STATUSES:
            kwargs['system'] = True

        super(Status, self).__init__(**kwargs)
        self._has_other_html = bool(self._html)
コード例 #20
0
    def _addEndDot(self, node, soup):
        """Make *node*'s last piece of content end with '. '.

        Trailing ' .:;)' characters and newlines are stripped first, then
        '. ' is appended.  Does nothing for an empty node.
        """
        if not node.contents:
            return

        last_content = node.contents[-1]

        is_navigable = isinstance(last_content, NavigableString)
        # For a Tag, collapse its text; for a string node, use it directly.
        text = last_content if is_navigable else last_content.get_text(
            separator=' ', strip=True, types=[NavigableString])
        text = text.strip(' .:;)\n\r') + '. '
        if is_navigable:
            node.contents[-1].replace_with(NavigableString(text))
            #print node.contents[-1]
        else:
            # NOTE(review): assigning .string replaces the tag's children
            # with this single string -- any nested markup is lost; confirm
            # that is intended for the non-navigable case.
            last_content.string = text
コード例 #21
0
def substitute_task_marker_p(p, sub, klass):
    """If any text node inside *p* contains the marker *sub*, add CSS class
    *klass* to *p* and remove the marker from that text node.
    """
    try:
        for element in p.descendants:
            if not isinstance(element, NavigableString):
                continue

            s = element.string
            if sub in s:
                add_class(p, klass)
                s2 = s.replace(sub, '')
                ns = NavigableString(s2)
                element.replaceWith(ns)
    except AttributeError as e: # a bug with bs4: mutating while iterating
        msg = 'Bug with descendants: %s' % e
        logger.debug(msg)
        pass
コード例 #22
0
    def insertBefore(self, node, refNode):
        """Insert *node* before *refNode*, merging adjacent text nodes.

        When the new node is a NavigableString and the element just before
        the insertion point is also a string, the two are concatenated into
        a single NavigableString (mirrors appendChild).
        """
        index = self._nodeIndex(node, refNode)
        if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[index - 1].__class__
                == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index - 1] +
                                     node.element)
            oldNode = self.element.contents[index - 1]
            del self.element.contents[index - 1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index - 1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self
コード例 #23
0
ファイル: modifiers.py プロジェクト: qdev90/habr_proxy
    def modify_html(html, request):
        """Append the TM sign to every six-letter word found in text nodes
        under <body> and return the serialized document."""
        soup = BeautifulSoup(html, 'html5lib')
        pattern = r'\b(\w{6})\b'

        def has_content(tag):
            # True when any direct string child contains a six-letter word.
            for piece in tag.contents:
                if isinstance(piece, NavigableString) and re.search(pattern,
                                                                    piece):
                    return True
            return False

        # Rewrite each matching text node in place.
        for tag in soup.body.find_all(has_content):
            for piece in tag.contents:
                if not isinstance(piece, NavigableString):
                    continue
                piece.replace_with(
                    NavigableString(re.sub(pattern, r'\1™', piece)))

        return str(soup)
コード例 #24
0
    def appendChild(self, node):
        """Append *node* to this element, concatenating adjacent text nodes
        into a single NavigableString.
        """
        if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1] + node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self
コード例 #25
0
def substitute_special_paragraph(soup, prefix, klass):
    """ 
        Looks for paragraphs that start with a simple string with the given prefix. 
    
        From:
        
            <p>prefix contents</p>
            
        Creates:
        
            <div class='klass-wrap'><p class='klass'>contents</p></div>
    """
    ps = list(soup.select('p'))
    for p in ps:
        # Get first child
        contents = list(p.contents)
        if not contents:
            continue
        c = contents[0]
        if not isinstance(c, NavigableString):
            continue

        # Case-insensitive prefix match on the leading text node.
        s = c.string
        starts = s.lower().startswith(prefix.lower())
        if not starts:
            continue

        # Drop the prefix from the text.
        without = s[len(prefix):]
        ns = NavigableString(without)
        c.replaceWith(ns)

        # Wrap the paragraph in a <div class='klass-wrap'> at its old slot.
        div = Tag(name='div')
        add_class(div, klass + '-wrap')
        add_class(p, klass)
        parent = p.parent
        i = parent.index(p)
        p.extract()
        div.append(p)
        parent.insert(i, div)
コード例 #26
0
def substituting_empty_links(soup, raise_errors=False):
    '''
    
    
        default style is [](#sec:systems)  "Chapter 10"
        
        the name is [](#sec:systems?only_name) "My title"
        
        the number is [](#sec:systems?only_number) "10"
        
        and full is [](#sec:systems?toc_link) "Chapter 10 - My title"
    
    
        You can also use "class":
        
            <a href='#sec:name' class='only_number'></a>
            
            or
            
            <a href='#sec:name?only_number'></a>
    

    '''
    CLASS_ONLY_NUMBER = MCDPManualConstants.CLASS_ONLY_NUMBER
    CLASS_NUMBER_NAME = MCDPManualConstants.CLASS_NUMBER_NAME
    CLASS_ONLY_NAME = MCDPManualConstants.CLASS_ONLY_NAME

    logger.debug('substituting_empty_links')

    # Counters for the summary log at the end.
    n = 0
    nerrors = 0
    for le in get_empty_links_to_fragment(soup):

        a = le.linker
        element_id = le.eid
        element = le.linked

        n += 1
        if not element:
            msg = ('Cannot find %s' % element_id)
            note_error_msg(a, msg)
            nerrors += 1
            if raise_errors:
                raise ValueError(msg)
            continue
        # if there is a query, remove it
        if le.query is not None:
            new_href = '#' + le.eid
            a.attrs['href'] = new_href
            logger.info('setting new href= %s' % (new_href))

        # Target must carry the label attributes to build link text from.
        if (not LABEL_WHAT_NUMBER  in element.attrs) or \
                (not LABEL_NAME in element.attrs):
            msg = (
                'substituting_empty_links: Could not find attributes %s or %s in %s'
                % (LABEL_NAME, LABEL_WHAT_NUMBER, element))
            if True:
                logger.warning(msg)
            else:
                note_error_msg(a, msg)
                nerrors += 1
                if raise_errors:
                    raise ValueError(msg)
            continue

        label_what_number = element.attrs[LABEL_WHAT_NUMBER]
        label_number = element.attrs[LABEL_NUMBER]
        label_what = element.attrs[LABEL_WHAT]
        label_name = element.attrs[LABEL_NAME]

        classes = list(a.attrs.get('class', []))  # bug: I was modifying

        # A query (e.g. "?only_number") acts like an extra class.
        if le.query is not None:
            classes.append(le.query)

        if 'toc_link' in classes:
            # Full form: "<what> <number> - <name>" as separate spans.
            s = Tag(name='span')
            s.string = label_what
            add_class(s, 'toc_what')
            a.append(s)

            a.append(' ')

            s = Tag(name='span')
            s.string = label_number
            add_class(s, 'toc_number')
            a.append(s)

            s = Tag(name='span')
            s.string = ' - '
            add_class(s, 'toc_sep')
            a.append(s)

            if label_name is not None and '<' in label_name:
                contents = bs(label_name)
                # sanitize the label name
                for br in contents.findAll('br'):
                    br.replaceWith(NavigableString(' '))
                for _ in contents.findAll('a'):
                    _.extract()

                a.append(contents)
                #logger.debug('From label_name = %r to a = %r' % (label_name, a))
            else:
                s = Tag(name='span')
                if label_name is None:
                    s.string = '(unnamed)'  # XXX
                else:
                    s.string = label_name
                add_class(s, 'toc_name')
                a.append(s)

        else:

            # Short forms: pick the label text based on the class.
            if CLASS_ONLY_NUMBER in classes:
                label = label_number
            elif CLASS_NUMBER_NAME in classes:
                if label_name is None:
                    label = label_what_number + \
                        ' - ' + '(unnamed)'  # warning
                else:
                    label = label_what_number + ' - ' + label_name
            elif CLASS_ONLY_NAME in classes:
                if label_name is None:
                    label = '(unnamed)'  # warning
                else:
                    label = label_name
            else:
                label = label_what_number

            span1 = Tag(name='span')
            add_class(span1, 'reflabel')
            span1.string = label
            a.append(span1)

    logger.debug('substituting_empty_links: %d total, %d errors' %
                 (n, nerrors))
コード例 #27
0
ファイル: webtoons.py プロジェクト: Chr0nos/jarvis
 def unwrap_ul(li: element.NavigableString) -> WebToonChapter:
     """Build a WebToonChapter from a chapter <li> element."""
     anchor = li.find('a')
     thumbnail = li.find('img')
     return WebToonChapter.from_url(anchor['href'], thumbnail['alt'])
コード例 #28
0
ファイル: manual_join_imp.py プロジェクト: kannode/mcdp
def do_bib(soup, bibhere):
    """ find used bibliography entries put them there

    Collects every "#bib:*" reference in *soup*, numbers the cited
    entries, synthesizes error <cite>s for missing ones, and moves all
    used <cite> elements into *bibhere*.
    """
    used = []
    unused = set()
    for a in soup.find_all('a'):
        href = a.attrs.get('href', '')
        if href.startswith('#bib:'):
            used.append(href[1:])  # no "#"
    logger.debug('I found %d references, to these: %s' % (len(used), used))

    # collect all the <cite>
    id2cite = {}
    for c in soup.find_all('cite'):
        ID = c.attrs.get('id', None)
        id2cite[ID] = c
        if ID in used:
            add_class(c, 'used')
        else:
            unused.add(ID)
            add_class(c, 'unused')

    # divide in found and not found
    found = []
    notfound = []
    for ID in used:
        if not ID in id2cite:
            if not ID in notfound:
                notfound.append(ID)
        else:
            found.append(ID)

    # now create additional <cite> for the ones that are not found
    for ID in notfound:
        cite = Tag(name='cite')
        s = 'Reference %s not found.' % ID
        cite.append(NavigableString(s))
        cite.attrs['class'] = ['errored', 'error']  # XXX
        soup.append(cite)
        id2cite[ID] = cite

    # now number the cites
    # NOTE(review): n increments even for repeated IDs, so the assigned
    # numbers may have gaps -- confirm whether that is intended.
    n = 1
    id2number = {}
    for ID in used:
        if not ID in id2number:
            id2number[ID] = n
        n += 1

    # now add the attributes for cross-referencing
    for ID in used:
        number = id2number[ID]
        cite = id2cite[ID]

        cite.attrs[LABEL_NAME] = '[%s]' % number
        cite.attrs[LABEL_SELF] = '[%s]' % number
        cite.attrs[LABEL_NUMBER] = number
        cite.attrs[LABEL_WHAT] = 'Reference'
        cite.attrs[LABEL_WHAT_NUMBER_NAME] = '[%s]' % number
        cite.attrs[LABEL_WHAT_NUMBER] = '[%s]' % number

    # now put the cites at the end of the document
    for ID in used:
        c = id2cite[ID]
        # remove it from parent
        c.extract()
        #         logger.debug('Extracting cite for %r: %s' % (ID, c))
        # add to bibliography
        bibhere.append(c)

    s = ("Bib cites: %d\nBib used: %s\nfound: %s\nnot found: %s\nunused: %d" %
         (len(id2cite), len(used), len(found), len(notfound), len(unused)))
    logger.info(s)
コード例 #29
0
ファイル: minimal_doc.py プロジェクト: afcarl/mcdp
def get_minimal_document(body_contents, title=None,
                         add_markdown_css=False, add_manual_css=False, stylesheet=None, extra_css=None):
    """
    Wrap an HTML fragment in a complete, minimal XHTML document with
    the MCDPL CSS.

    body_contents: an HTML fragment (validated via check_html_fragment)
    title: document title; None is treated as the empty string
    add_markdown_css: link the compiled stylesheet (language + markdown)
    add_manual_css: link the compiled stylesheet (language + markdown + manual*)
    stylesheet: stylesheet basename; defaults to 'v_mcdp_render_default'
    extra_css: if not None, additional CSS contents to embed

    Returns the serialized document as a string, prefixed with the
    XHTML + MathML + SVG DOCTYPE.
    """
    check_html_fragment(body_contents)

    document = bs("")
    assert document.name == 'fragment'

    effective_title = '' if title is None else title

    root = Tag(name='html')
    head = Tag(name='head')
    body = Tag(name='body')

    meta = Tag(name='meta', attrs={'http-equiv': "Content-Type",
                                   'content': "application/xhtml+xml; charset=utf-8"})
    head.append(meta)

    effective_stylesheet = 'v_mcdp_render_default' if stylesheet is None else stylesheet

    if add_markdown_css or add_manual_css:
        css_link = Tag(name='link')
        css_link['rel'] = 'stylesheet'
        css_link['type'] = 'text/css'
        css_link['href'] = get_css_filename('compiled/%s' % effective_stylesheet)
        head.append(css_link)

    title_tag = Tag(name='title')
    title_tag.append(NavigableString(effective_title))
    head.append(title_tag)

    parsed_fragment = bs(body_contents)
    assert parsed_fragment.name == 'fragment'
    # Relabel the parser's <fragment> wrapper as a plain <div>.
    parsed_fragment.name = 'div'
    body.append(parsed_fragment)

    root.append(head)
    root.append(body)
    document.append(root)

    if extra_css is not None:
        add_extra_css(document, extra_css)

    serialized = to_html_stripping_fragment_document(document)
    assert not 'DOCTYPE' in serialized
    # Note: prettify() is deliberately not used: it removes empty text nodes.

    doctype = """<!DOCTYPE html PUBLIC
    "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
    "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">"""
    res = doctype + '\n' + serialized

    # Remove a stray DOCTYPE that may end up inside the wrapper div.
    res = res.replace('<div><!DOCTYPE html>', '<div>')

    return res
Code example #30
0
File: make_console_pre.py  Project: kannode/mcdp
def mark_console_pres_highlight(soup, res, location):
    """ Marks up console-session code blocks for syntax highlighting.

        Scans every <pre><code> in `soup`; when the text is recognized as a
        console line (via is_console_line()), the <pre> gets the class
        "console" (plus "on-<hostname>" if a hostname was detected), and
        the code contents are rebuilt token by token with classed <span>s
        for prompts ('$'), hostnames, known programs, program commands and
        options. Modifies `soup` in place.

        res, location: not used in this pass; part of the common
        processing-pass signature.
    """
    for code in soup.select('pre code'):
        pre = code.parent
        # code.string is None when the tag has mixed content (more than a
        # single text node); such blocks are skipped.
        if code.string is None:
            continue
        s0 = code.string

        # Undo HTML entity escaping so token matching sees literal text.
        from HTMLParser import HTMLParser
        h = HTMLParser()
        s = h.unescape(s0)
        if s != s0:
            #             print('decoded %r -> %r' % (s0, s))
            pass

        beg = s.strip()

        # is it a console line?
        ct = is_console_line(beg)

        if ct is None:
            continue

        add_class(pre, 'console')

        # add class "on-hostname"
        if ct.hostname is not None:
            cn = 'on-%s' % str(ct.hostname)
            add_class(pre, cn)

        # Clear the original text; it is re-appended below as spans.
        code.string = ''

        lines = s.split('\n')

        def is_program(x, l):
            # Special case: "git" on an apt(-get) line is a package name,
            # not the program being invoked.
            if x == 'git' and 'apt' in l:
                return False
            return x in programs

        for j, line in enumerate(lines):
            tokens = line.split(' ')
            for i, token in enumerate(tokens):
                previous_is_sudo_or_dollar = i >= 1 and tokens[i - 1] in [
                    '$', 'sudo'
                ]

                if token in ['$', 'DOLLAR']:
                    # add <span class=console_sign>$</span>
                    # (the literal 'DOLLAR' placeholder is rendered as '$')
                    e = Tag(name='span')
                    e['class'] = 'console_sign'
                    e.string = '$'
                    code.append(e)
                elif i == 0 and token == ct.hostname:
                    # it's the hostname
                    e = Tag(name='span')
                    e['class'] = 'hostname'
                    e.string = token
                    code.append(e)
                elif is_program(token, line) and previous_is_sudo_or_dollar:
                    # Program name directly following a prompt or sudo.
                    e = Tag(name='span')
                    e['class'] = '%s program' % token
                    e.string = token
                    code.append(e)
                elif token in program_commands:
                    e = Tag(name='span')
                    e['class'] = '%s program_command' % token
                    e.string = token
                    code.append(e)
                elif token and token[0] == '-':
                    # Command-line option such as "-v" or "--help".
                    e = Tag(name='span')
                    e['class'] = 'program_option'
                    e.string = token
                    code.append(e)
                else:
                    # Plain token, appended as bare text.
                    code.append(NavigableString(token))

                # Re-insert the separating space between tokens: normally a
                # <span class="space">, but plain text once an image markup
                # '![' has appeared earlier in the line.
                is_last = i == len(tokens) - 1
                if not is_last:
                    before = '![' in ' '.join(tokens[:i + 1])
                    if not before:
                        # XXX: this is a bug
                        space = Tag(name='span')
                        space.append(' ')
                        space['class'] = 'space'
                        code.append(space)
                    else:
                        code.append(' ')

            is_last_line = j == len(lines) - 1
            if not is_last_line:
                code.append(NavigableString('\n'))
Code example #31
0
File: manual_join_imp.py  Project: kannode/mcdp
def manual_join(template,
                files_contents,
                stylesheet,
                remove=None,
                extra_css=None,
                remove_selectors=None,
                hook_before_toc=None,
                references=None,
                resolve_references=True,
                hook_before_final_pass=None,
                require_toc_placeholder=False,
                permalink_prefix=None,
                crossrefs_aug=None,
                aug0=None):
    """
        Joins individually rendered documents into one complete HTML manual.

        template: an HTML document (string) used as the skeleton;
            must contain <html> and <head>.
        files_contents: a list of tuples that can be cast to DocToJoin:
            where the string is a unique one to be used for job naming.
        stylesheet: if not None, basename of the compiled stylesheet to link.
        remove, remove_selectors: list of selectors to remove (e.g. ".draft").
        extra_css: if not None, a string of more CSS to be added.
        hook_before_toc: if not None is called with hook_before_toc(soup=soup)
            just before generating the toc.
        hook_before_final_pass: if not None is called with soup=soup just
            before the pre-toc final pass.
        references: optional dict href -> object with .url/.title, used to
            rewrite <a> elements at the end.
        require_toc_placeholder: if True, a missing toc placeholder is an
            error (noted in the result when aug0 is given, raised otherwise).
        crossrefs_aug, aug0: optional AugmentedResult objects whose notes
            are merged into the returned result.

        Returns an AugmentedResult whose result is the joined document
        encoded as utf-8 bytes.
    """
    result = AugmentedResult()

    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    # Cross-references: parse the provided ones, or use an empty marker tag.
    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)
    if aug0 is not None:
        result.merge(aug0)

    # No-op timer: the `with timeit(...)` blocks below label the phases of
    # the join without adding any timing overhead.
    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):

        files_contents = [DocToJoin(*_) for _ in files_contents]

        # cannot use bs because entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template,
                                          'lxml',
                                          from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            # Copy the standard manual CSS fragment into <head>.
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        # Parse each input document, merging any AugmentedResult notes.
        with timeit('making basename2soup'):
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')

                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):
        # XXX
        # fix_duplicated_ids(basename2soup)

        # Move each fragment's children into the template's <body>.
        with timeit('copy contents'):
            body = d.find('body')
            add_comments = False

            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(
                        Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

                try_faster = True
                if try_faster:
                    # Moving nodes with extract() avoids copying the trees.
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)

                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' %
                                        docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):

            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE

            # The bibliography is collected under #ID_PUT_BIB_HERE; create
            # the container at the end of the document if it is missing.
            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)

            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors, result,
                                           location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d,
                crossrefs=crossrefs,
                resolve_references=resolve_references,
                res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        # Rewrite known references: point <a> elements at their final URL,
        # and give links with no content the reference's title as text.
        with timeit('another A pass'):
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    # Fix: Tag.children is an iterator and is always truthy,
                    # so the original `if not a.children:` never fired; test
                    # the .contents list instead to detect an empty link.
                    if not a.contents:  # empty
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))

        result.set_result(res)
        return result