def searchable_text_for_name(name):
    """Collect the text found under the ``body`` nodes of ``name``.

    The first element of ``get_data(name)`` is parsed as JSON. Every entry
    of its ``['tree']['c']`` list whose ``'n'`` value is ``'body'`` is used
    as a root for a depth-first walk. A node is either a plain string, which
    is emitted unchanged, or a mapping that is read through the keys ``'n'``
    (name), ``'x'`` (text), ``'l'`` (tail) and ``'c'`` (children).

    Returns the collected fragments joined into one string.
    """
    tree = json.loads(get_data(name)[0])['tree']
    # Seed the walk with the top-level body nodes, in document order.
    pending = [top for top in tree['c'] if top.get('n') == 'body']
    skipped_names = {'script', 'style', 'title'}
    fragments = []
    while pending:
        item = pending.pop()
        if isinstance(item, unicode_type):
            fragments.append(item)
            continue
        tag, own_text, tail, kids = (
            item.get(key) for key in ('n', 'x', 'l', 'c')
        )
        # Only the node's own text is suppressed for skipped names; its
        # tail and children are still visited below.
        if tag and own_text:
            if tag not in skipped_names:
                fragments.append(own_text)
        # The stack is LIFO: the tail goes on first so that it comes off
        # after the whole subtree, and the children go on reversed so that
        # the first child comes off first.
        if tail:
            pending.append(tail)
        if kids:
            pending.extend(reversed(kids))
    return ''.join(fragments)
def searchable_text_for_name(name):
    """Collect whitespace-normalised text under the ``body`` nodes of ``name``.

    The first element of ``get_data(name)`` is parsed as JSON. Every entry
    of its ``['tree']['c']`` list whose ``'n'`` value is ``'body'`` is used
    as a root for a depth-first walk. A node is either a plain string, which
    is emitted unchanged, or a mapping that is read through the keys ``'n'``
    (name), ``'x'`` (text), ``'l'`` (tail) and ``'c'`` (children).

    Returns the joined fragments with every run of whitespace replaced by a
    single space.
    """
    tree = json.loads(get_data(name)[0])['tree']
    # Seed the walk with the top-level body nodes, in document order.
    pending = [top for top in tree['c'] if top.get('n') == 'body']
    skipped_names = {'script', 'style', 'title'}
    fragments = []
    while pending:
        item = pending.pop()
        if isinstance(item, unicode_type):
            fragments.append(item)
            continue
        fetch = item.get
        tag, own_text = fetch('n'), fetch('x')
        tail, kids = fetch('l'), fetch('c')
        # Only the node's own text is suppressed for skipped names; its
        # tail and children are still visited below.
        if tag and own_text:
            if tag not in skipped_names:
                fragments.append(own_text)
        # The stack is LIFO: the tail goes on first so that it comes off
        # after the whole subtree, and the children go on reversed so that
        # the first child comes off first.
        if tail:
            pending.append(tail)
        if kids:
            pending.extend(reversed(kids))
    flat_text = ''.join(fragments)
    # Normalize whitespace to a single space, this will cause failures
    # when searching over spaces in pre nodes, but that is a lesser evil
    # since the DOM converts \n, \t etc to a single space
    return regex.sub(r'\s+', ' ', flat_text)
def searchable_text_for_name(name):
    """Collect body text for ``name`` together with offsets of ``id`` anchors.

    The first element of ``get_data(name)`` is parsed as JSON. Every entry
    of its ``['tree']['c']`` list whose ``'n'`` value is ``'body'`` is used
    as a root for a depth-first walk. A node is either a plain string, which
    is emitted unchanged, or a mapping that is read through the keys ``'n'``
    (name), ``'x'`` (text), ``'l'`` (tail), ``'c'`` (children) and ``'a'``
    (attributes, iterated as indexable pairs).

    The ``'l'`` entry of each body node is removed before the walk and put
    back afterwards, so body tails never reach the output.

    Returns a ``(text, anchor_offset_map)`` tuple. ``anchor_offset_map`` is
    an ``OrderedDict`` mapping each ``id`` attribute value to the length of
    the text collected before that node's own text. Only the first
    occurrence of an id is recorded.
    """
    tree = json.loads(get_data(name)[0])['tree']
    pending = []
    detached_tails = []
    for top in tree['c']:
        if top.get('n') != 'body':
            continue
        pending.append(top)
        # the JS code does not add the tail of body tags to flat text
        detached_tails.append((top.pop('l', None), top))

    skipped_names = {'script', 'style', 'title'}
    fragments = []
    offset = 0
    anchors = OrderedDict()
    while pending:
        item = pending.pop()
        if isinstance(item, str):
            fragments.append(item)
            offset += len(item)
            continue
        fetch = item.get
        tag, own_text = fetch('n'), fetch('x')
        tail, kids = fetch('l'), fetch('c')
        # Anchors are recorded before the node's own text is appended, so
        # the offset points at the start of that text.
        for attr in fetch('a') or ():
            if attr[0] == 'id':
                anchors.setdefault(attr[1], offset)
        # Only the node's own text is suppressed for skipped names; its
        # tail and children are still visited below.
        if tag and own_text:
            if tag not in skipped_names:
                fragments.append(own_text)
                offset += len(own_text)
        # The stack is LIFO: the tail goes on first so that it comes off
        # after the whole subtree, and the children go on reversed so that
        # the first child comes off first.
        if tail:
            pending.append(tail)
        if kids:
            pending.extend(reversed(kids))

    # Put back the body tails that were detached before the walk.
    for saved_tail, body in detached_tails:
        if saved_tail is None:
            continue
        body['l'] = saved_tail
    return ''.join(fragments), anchors
def searchable_text_for_name(name):
    """Collect body text for ``name`` together with offsets of ``id`` anchors.

    The first element of ``get_data(name)`` is parsed as JSON. Every entry
    of its ``['tree']['c']`` list whose ``'n'`` value is ``'body'`` is used
    as a root for a depth-first walk. A node is either a plain string, which
    is emitted unchanged, or a mapping that is read through the keys ``'n'``
    (name), ``'x'`` (text), ``'l'`` (tail), ``'c'`` (children) and ``'a'``
    (attributes, iterated as indexable pairs).

    Returns a ``(text, anchor_offset_map)`` tuple. ``anchor_offset_map`` is
    an ``OrderedDict`` mapping each ``id`` attribute value to the length of
    the text collected before that node's own text. Only the first
    occurrence of an id is recorded.
    """
    tree = json.loads(get_data(name)[0])['tree']
    # Seed the walk with the top-level body nodes, in document order.
    pending = [top for top in tree['c'] if top.get('n') == 'body']
    skipped_names = {'script', 'style', 'title'}
    fragments = []
    offset = 0
    anchors = OrderedDict()
    while pending:
        item = pending.pop()
        if isinstance(item, unicode_type):
            fragments.append(item)
            offset += len(item)
            continue
        fetch = item.get
        tag, own_text = fetch('n'), fetch('x')
        tail, kids = fetch('l'), fetch('c')
        # Anchors are recorded before the node's own text is appended, so
        # the offset points at the start of that text.
        for attr in fetch('a') or ():
            if attr[0] == 'id':
                anchors.setdefault(attr[1], offset)
        # Only the node's own text is suppressed for skipped names; its
        # tail and children are still visited below.
        if tag and own_text:
            if tag not in skipped_names:
                fragments.append(own_text)
                offset += len(own_text)
        # The stack is LIFO: the tail goes on first so that it comes off
        # after the whole subtree, and the children go on reversed so that
        # the first child comes off first.
        if tail:
            pending.append(tail)
        if kids:
            pending.extend(reversed(kids))
    return ''.join(fragments), anchors