Ejemplo n.º 1
0
def note_error_msg(tag0, msg):
    """
        Flags ``tag0`` as errored and displays ``msg`` right after it.

        ``msg`` is checked to be a ``bytes`` instance; it becomes the content
        of a ``<pre class="error">`` element inserted after ``tag0``.
    """
    check_isinstance(msg, bytes)
    add_class(tag0, 'errored')

    error_box = Tag(name='pre', attrs={'class': 'error'})
    error_box.string = msg
    tag0.insert_after(error_box)
Ejemplo n.º 2
0
def note_error(tag0, e):
    """
        Flags ``tag0`` as errored and displays the traceback of ``e`` after it.

        ``e`` is checked to be a ``BaseException`` instance. Its formatted
        traceback becomes the content of a ``<pre>`` element, with classes
        ``error`` and the exception type name, inserted after ``tag0``.
    """
    check_isinstance(e, BaseException)
    add_class(tag0, 'errored')
    t = Tag(name='pre', attrs={'class': 'error %s' % type(e).__name__})
    # traceback.format_exc() takes a *limit*, not an exception: passing ``e``
    # to it only worked by accident on Python 2 and raises TypeError on
    # Python 3.
    if hasattr(e, '__traceback__'):
        # Python 3: format the given exception itself, so the result does not
        # depend on which exception (if any) is currently being handled.
        lines = traceback.format_exception(type(e), e, e.__traceback__)
        t.string = ''.join(lines)
    else:
        # Python 2: exceptions carry no traceback; format the one being
        # handled, which is what the original call effectively did.
        t.string = traceback.format_exc()
    tag0.insert_after(t)
Ejemplo n.º 3
0
def get_id2element(soup, att):
    """
        Builds a map from the values of attribute ``att`` to their elements.

        Values found on an ``<svg>`` element, on anything below an ``<svg>``,
        or starting with "MathJax" are ignored.

        When a value is used by more than one element, the elements involved
        are marked with class 'errored' and followed by an explanatory
        ``<span class="duplicated-id">``; the map keeps the last element seen.

        Returns a tuple ``(id2element, duplicates)``: the dictionary and the
        set of values that were found more than once.
    """
    # Selectors for the values to ignore: nodes below an SVG, the SVG itself,
    # and the stuff created by MathJax.
    ignored_patterns = ('svg [%s]', 'svg[%s]', '[%s^="MathJax"]')
    skipped = set()
    for pattern in ignored_patterns:
        for node in soup.select(pattern % att):
            skipped.add(node[att])

    found = {}
    repeated = set()
    for node in soup.select('[%s]' % att):
        value = node[att]
        if value in skipped:
            continue

        if value in found:
            repeated.add(value)
            # Flag both the current element and the one recorded earlier,
            # unless they are already marked as errored.
            for culprit in (node, found[value]):
                if 'errored' in culprit.attrs.get('class', ''):
                    continue
                add_class(culprit, 'errored')
                note = Tag(name='span', attrs={'class':'duplicated-id'})
                note.string = 'More than one element with id %r.' % value
                culprit.insert_after(note)

        found[value] = node

    if repeated:
        listing = ", ".join(sorted(repeated))
        logger.error('%d duplicated %s found (not errored): %s'
                     % (len(repeated), att, listing))

    return found, repeated
Ejemplo n.º 4
0
def substitute_task_marker_p(p, sub, klass):
    """
        Removes the marker ``sub`` from the text below ``p``.

        Every string descendant of ``p`` that contains ``sub`` is replaced by
        a copy without it, and ``p`` gets the class ``klass``. Nothing changes
        if the marker does not occur.

        An AttributeError raised while walking the tree is logged at debug
        level and otherwise ignored.
    """
    try:
        # Take a snapshot of the descendants before changing anything.
        # Replacing a node while the live ``p.descendants`` generator is
        # running detaches the node the generator is standing on; in bs4
        # versions that follow ``next_element`` after yielding, the walk then
        # stops early (later strings are skipped) or fails with the
        # AttributeError handled below.
        for element in list(p.descendants):
            if not isinstance(element, NavigableString):
                continue

            s = element.string
            if sub in s:
                add_class(p, klass)
                element.replaceWith(NavigableString(s.replace(sub, '')))
    except AttributeError as e:
        # Kept as a safety net for the failure originally seen with bs4.
        msg = 'Bug with descendants: %s' % e
        logger.debug(msg)
Ejemplo n.º 5
0
def substitute_special_paragraph(soup, prefix, klass):
    """
        Wraps the paragraphs whose text starts with the given prefix.

        Only paragraphs whose first child is a plain string are considered;
        the prefix is compared ignoring case. A paragraph like:

            <p>prefix contents</p>

        is turned into:

            <div class='klass-wrap'><p class='klass'>contents</p></div>
    """
    for paragraph in list(soup.select('p')):
        children = list(paragraph.contents)
        if not children:
            continue

        head = children[0]
        if not isinstance(head, NavigableString):
            continue

        text = head.string
        if not text.lower().startswith(prefix.lower()):
            continue

        # Drop the prefix from the leading string.
        head.replaceWith(NavigableString(text[len(prefix):]))

        wrapper = Tag(name='div')
        add_class(wrapper, klass + '-wrap')
        add_class(paragraph, klass)

        # Put the wrapper where the paragraph was, with the paragraph inside.
        container = paragraph.parent
        position = container.index(paragraph)
        paragraph.extract()
        wrapper.append(paragraph)
        container.insert(position, wrapper)
Ejemplo n.º 6
0
 def mark_not_found(tag):
     """ Marks ``tag`` by passing it to ``add_class`` with 'library-not-found'. """
     add_class(tag, 'library-not-found')
Ejemplo n.º 7
0
def col_macro_(e, ncols):
    """
        Turns a ``<div make-colN>`` element into a table with ``ncols`` columns.

        ``e`` must be a ``div`` carrying the attribute ``make-col<ncols>``.
        Its non-string children become the cells of the table, filled row by
        row; cells left over in the last row hold a comment. The element
        itself is renamed to ``table`` (so it keeps its attributes), gets the
        classes ``col<ncols>`` and ``colN``, and is put back at its original
        position inside a new ``div`` with classes ``col<ncols>-wrap`` and
        ``colN-wrap``.

        If ``e`` has the class ``labels-row1``, the first row is placed in a
        ``thead`` instead of the ``tbody``.

        Errors are reported through ``raise_desc`` with ``ValueError``
        (presumably raising it -- confirm against ``raise_desc``) when there
        are fewer than ``ncols`` children, or when a string that is not just
        whitespace is found among the children.

        Bug: for some reason bs4 removes the whitespace I use for indentation.
    """
    assert e.name == 'div'
    assert e.has_attr('make-col%d' % ncols)

    children = list(e.children)
    # remove strings from this
    is_string = lambda x: isinstance(x, NavigableString)
    strings = [_ for _ in children if is_string(_)]
    children = [_ for _ in children if not is_string(_)]

    if len(children) < ncols:
        msg = ('Cannot create table with %r cols with only %d children' %
               (ncols, len(children)))
        raise_desc(ValueError, msg, tag=describe_tag(e))

    # Detach the future cells; they are re-attached inside the <td> elements.
    for c in children:
        c.extract()

    # Only whitespace is allowed between the children. Note that the children
    # have already been detached when this check fails.
    for s in strings:
        ss = str(s)
        empty = not ss.strip()
        if not empty:
            msg = 'Found nonempty string %r between children.' % ss
            raise_desc(ValueError, msg, tag=describe_tag(e))
        # remove it
        s.extract()

    nchildren = len(children)
    # Number of rows needed, rounding up for a partially filled last row.
    nrows = int(math.ceil(nchildren / float(ncols)))

    # Remember where the element was, so the wrapper can take its place.
    parent = e.parent
    original_position = parent.index(e)
    e.extract()
    # The element is reused as the table: 'table' and 'e' are the same object.
    table = e
    e.name = 'table'
    add_class(table, 'col%d' % ncols)
    add_class(table, 'colN')

    wrapper = Tag(name='div')
    add_class(wrapper, 'col%d-wrap' % ncols)
    add_class(wrapper, 'colN-wrap')

    # Newline and indentation unit used to lay out the generated markup.
    NL = '\n'
    # S = '-' * 4
    # XXX: change to above to see the problem with indentation
    S = ' ' * 4
    tbody = Tag(name='tbody')
    for row in range(nrows):
        tbody.append(NavigableString(NL))
        tbody.append(NavigableString(S+S))
        tr = Tag(name='tr')
        tr.append(NavigableString(NL))
        for col in range(ncols):
            td = Tag(name='td')
            # Index of the child that belongs in this cell (row-major order).
            i = col + row * ncols
            if i < len(children):
                child = children[i]
                td.append(child)
            else:
                # No child left for this cell: leave a comment as placeholder.
                td.append(Comment('empty row %d col %d' % (row, col)))
            tr.append(NavigableString(S+S+S))
            tr.append(td)
            tr.append(NavigableString(NL))
        tr.append(S+S)
        # 'e' is the table itself, so this reads the classes of the table.
        if row == 0 and ('labels-row1' in e.attrs.get('class', '')):
            thead = Tag(name='thead')
            thead.append(tr)
            table.append(thead) # add in table, not tbody
        else:
            tbody.append(tr)   # add in tbody
        # The trailing whitespace goes in tbody even for the header row.
        tbody.append(NavigableString(NL+S))
    table.append(tbody)

    wrapper.append(NavigableString(NL + S))
    wrapper.append(table)
    wrapper.append(NavigableString(NL))

    parent.insert(original_position, wrapper)
    
    
Ejemplo n.º 8
0
def check_if_any_href_is_invalid(soup):
    '''
        Checks if internal references are invalid and tries to correct them.

        Every element whose ``href`` starts with "#" is examined. If the
        reference is of the form "#frag?query", the query is stripped out
        before the fragment is looked up.

        A fragment that matches no ID is retried with each of the known
        prefixes ("sec:", "fig:", ...), after removing the prefix it may
        already have:

        - exactly one candidate exists: the ``href`` is rewritten to point to
          it and the element gets the class 'warning';
        - several candidates exist: the element is marked as errored and the
          problem is logged (it is not added to the returned errors);
        - no candidate exists: the element is marked as errored and the
          problem is added to the returned errors.

        A reference to an ID used by more than one element is reported as an
        error too. References with the class "mjx-svg-href" are only
        reported, in the separate list of math errors.

        Returns a tuple ``(errors, math_errors)`` of lists of messages.
    '''
    errors = []
    math_errors = []

    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    # The result for 'name' is not used; the call is kept because
    # get_id2element also marks and logs the duplicated values it finds.
    get_id2element(soup, 'name')

    # Prefixes tried when a reference does not match any ID.
    possible = ['sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app', 'appsub',
                'appsubsub',
                'def', 'eq', 'rem', 'lem', 'prob', 'prop', 'exa', 'thm']

    for a in soup.select('[href^="#"]'):
        href = a['href']
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.error(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]

        if ID not in id2element:
            # Try to fix it. If there is already a prefix, remove it. This
            # has to look at ID rather than href: otherwise the query would
            # be carried into the candidates (which then never match), and a
            # ':' appearing only in the query would be taken for a prefix.
            if ':' in ID:
                core = ID[ID.index(':') + 1:]
            else:
                core = ID
            candidates = [prefix + ':' + core for prefix in possible]
            matches = [c for c in candidates if c in id2element]

            if len(matches) > 1:
                msg = '%s not found, and multiple matches for heuristics (%s)' % (href, matches)
                logger.error(msg)
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                w.string = msg
                a.insert_after(w)
            elif len(matches) == 1:
                msg = '%s not found, but corrected in %s' % (href, matches[0])
                logger.debug(msg)

                add_class(a, 'warning')
                w = Tag(name='span', attrs={'class':'href-replaced'})
                w.string = msg
                a['href'] = '#' + matches[0]
                a.insert_after(w)
            else:
                errors.append('Not found %r' % (href))
                if 'errored' not in a.attrs.get('class', ''):
                    add_class(a, 'errored')
                    w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                    w.string = 'Not found %r' % (href)
                    a.insert_after(w)

        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            logger.error(msg)
            if 'errored' not in a.attrs.get('class', ''):
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-multiple'})
                w.string = msg
                a.insert_after(w)

            errors.append(msg)

    return errors, math_errors