Example #1
0
def searchable_text_for_name(name):
    """Return the flattened body text of the item stored under *name*.

    The first element returned by ``get_data(name)`` is parsed as JSON.
    Every top-level child of its ``'tree'`` whose ``'n'`` value is
    ``'body'`` is walked depth-first: an element's own text (``'x'``) is
    emitted first, then its children (``'c'``) in order, then its tail
    text (``'l'``). The ``'x'`` text of script, style and title elements
    is left out; their children and tails are still visited.
    """
    tree = json.loads(get_data(name)[0])['tree']
    # Seed the work stack with the body elements; because it is a stack,
    # the last body found is the first one walked.
    pending = [top for top in tree['c'] if top.get('n') == 'body']
    skipped_tags = {'script', 'style', 'title'}
    pieces = []
    while pending:
        current = pending.pop()
        if isinstance(current, unicode_type):
            # A bare string on the stack is a tail pushed earlier.
            pieces.append(current)
            continue
        tag = current.get('n')
        own_text = current.get('x')
        if own_text and tag and tag not in skipped_tags:
            pieces.append(own_text)
        trailing = current.get('l')
        if trailing:
            # Pushed before the children so it is popped after all of them.
            pending.append(trailing)
        kids = current.get('c')
        if kids:
            # Reversed so that the first child ends up on top of the stack.
            pending.extend(reversed(kids))
    return ''.join(pieces)
Example #2
0
def searchable_text_for_name(name):
    """Return the whitespace-normalized body text of the item *name*.

    The first element returned by ``get_data(name)`` is parsed as JSON and
    each top-level ``'body'`` child of its ``'tree'`` is walked
    depth-first, emitting an element's text (``'x'``), then its children
    (``'c'``), then its tail (``'l'``). The ``'x'`` text of script, style
    and title elements is omitted. Finally every run of whitespace in the
    joined text is replaced by a single space.
    """
    document = json.loads(get_data(name)[0])
    work = []
    for top_level in document['tree']['c']:
        if top_level.get('n') != 'body':
            continue
        work.append(top_level)
    no_text_tags = {'script', 'style', 'title'}
    chunks = []
    while work:
        item = work.pop()
        if isinstance(item, unicode_type):
            # Plain strings on the stack are tails queued by an element.
            chunks.append(item)
            continue
        lookup = item.get
        tag, own_text = lookup('n'), lookup('x')
        after, kids = lookup('l'), lookup('c')
        if tag and own_text and tag not in no_text_tags:
            chunks.append(own_text)
        if after:
            # The tail goes below the children so it is emitted after them.
            work.append(after)
        if kids:
            work.extend(reversed(kids))
    flat_text = ''.join(chunks)
    # Collapse all whitespace runs to one space. Searches across spaces
    # inside pre nodes will fail because of this, but that is the lesser
    # evil since the DOM turns \n, \t etc. into a single space anyway.
    return regex.sub(r'\s+', ' ', flat_text)
Example #3
0
def searchable_text_for_name(name):
    """Return ``(text, anchor_offset_map)`` for the item stored under *name*.

    The first element returned by ``get_data(name)`` is parsed as JSON and
    each top-level ``'body'`` child of its ``'tree'`` is walked
    depth-first, emitting an element's text (``'x'``), then its children
    (``'c'``), then its tail (``'l'``). The ``'x'`` text of script, style
    and title elements is omitted, and the tail of the body elements
    themselves is excluded.

    ``anchor_offset_map`` is an ``OrderedDict`` mapping each ``id``
    attribute value to the length of the text collected at the moment the
    element carrying it was reached; only the first occurrence of an id is
    recorded.
    """
    tree = json.loads(get_data(name)[0])['tree']
    pending = []
    detached_tails = []
    for top in tree['c']:
        if top.get('n') != 'body':
            continue
        pending.append(top)
        # The JS code does not add the tail of body tags to the flat text,
        # so take it off for the duration of the walk.
        detached_tails.append((top.pop('l', None), top))
    skipped_tags = {'script', 'style', 'title'}
    pieces = []
    offset = 0
    anchor_offset_map = OrderedDict()
    while pending:
        current = pending.pop()
        if isinstance(current, str):
            # A bare string on the stack is a tail pushed earlier.
            pieces.append(current)
            offset += len(current)
            continue
        fetch = current.get
        tag = fetch('n')
        own_text = fetch('x')
        trailing = fetch('l')
        kids = fetch('c')
        attrs = fetch('a')
        if attrs:
            for attr in attrs:
                if attr[0] == 'id':
                    # setdefault keeps the offset of the first occurrence.
                    anchor_offset_map.setdefault(attr[1], offset)
        if tag and own_text and tag not in skipped_tags:
            pieces.append(own_text)
            offset += len(own_text)
        if trailing:
            # Pushed before the children so it is popped after all of them.
            pending.append(trailing)
        if kids:
            pending.extend(reversed(kids))
    # Put the detached body tails back where they were.
    for body_tail, body in detached_tails:
        if body_tail is not None:
            body['l'] = body_tail
    return ''.join(pieces), anchor_offset_map
Example #4
0
def searchable_text_for_name(name):
    """Return ``(text, anchor_offset_map)`` for the item stored under *name*.

    The first element returned by ``get_data(name)`` is parsed as JSON and
    each top-level ``'body'`` child of its ``'tree'`` is walked
    depth-first, emitting an element's text (``'x'``), then its children
    (``'c'``), then its tail (``'l'``). The ``'x'`` text of script, style
    and title elements is omitted.

    ``anchor_offset_map`` is an ``OrderedDict`` mapping each ``id``
    attribute value to the length of the text collected at the moment the
    element carrying it was reached; only the first occurrence of an id is
    recorded.
    """
    document = json.loads(get_data(name)[0])
    work = [top for top in document['tree']['c'] if top.get('n') == 'body']
    no_text_tags = {'script', 'style', 'title'}
    chunks = []
    consumed = 0
    anchor_offset_map = OrderedDict()
    while work:
        item = work.pop()
        if isinstance(item, unicode_type):
            # Plain strings on the stack are tails queued by an element.
            chunks.append(item)
            consumed += len(item)
            continue
        lookup = item.get
        tag, own_text = lookup('n'), lookup('x')
        after, kids = lookup('l'), lookup('c')
        attrs = lookup('a')
        # Record anchors before adding this element's own text, so the
        # offset points at the start of the element.
        for attr in attrs or ():
            if attr[0] != 'id':
                continue
            anchor_id = attr[1]
            if anchor_id not in anchor_offset_map:
                anchor_offset_map[anchor_id] = consumed
        if tag and own_text and tag not in no_text_tags:
            chunks.append(own_text)
            consumed += len(own_text)
        if after:
            # The tail goes below the children so it is emitted after them.
            work.append(after)
        if kids:
            work.extend(reversed(kids))
    return ''.join(chunks), anchor_offset_map