def parse(self): results = [] for x, tag in enumerate(self.html.descendants): if str(type(tag)) == "<class 'bs4.element.Tag'>": if tag.name == 'script': continue # Find tags with no children (base tags) if tag.contents: if sum(1 for _ in tag.descendants) == 1: t = Tag(tag.name.lower()) # Because it might be None (<i class="fa fa-icon"></i>) if tag.string: t.add_content(tag.string) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) # Self enclosed tags (hr, meta, img, etc...) else: t = Tag(tag.name.lower()) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) return results
def parse(self): results = [] for x, tag in enumerate(self.html.descendants): if str(type(tag)) == "<class 'bs4.element.Tag'>": #look for global.document.metadata if tag.name == 'script': if tag.string: position = tag.string.find('global.document.metadata=') if position == -1: continue else: a = 'global.document.metadata=' t = Tag('global.document.metadata') s = tag.string[position + len(a):] s = s[:s.find('\n') - 1] t.add_content(s) results.append(t.get_data()) # Find tags with no children (base tags) if tag.contents: if sum(1 for _ in tag.descendants) == 1: t = Tag(tag.name.lower()) # Because it might be None (<i class="fa fa-icon"></i>) if tag.string: t.add_content(tag.string) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) # Self enclosed tags (hr, meta, img, etc...) else: t = Tag(tag.name.lower()) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) return results
def parse(self) -> List[dict]: """ :return: list of dictionaries with all the tags """ results: List[dict] = [] for tag in self.html.descendants: if isinstance(tag, bs4.element.Tag): # look for global.document.metadata if tag.name == 'script' and tag.string: position = tag.string.find('global.document.metadata=') if position == -1: continue else: a = 'global.document.metadata=' t = Tag('global.document.metadata') s = tag.string[position + len(a):] s = s[:s.find('\n') - 1] t.add_content(s) results.append(t.get_data()) t = Tag(tag.name.lower()) # Find tags with no children (base tags) if tag.contents and ilen(tag.descendants) == 1: # Because it might be None (<i class="fa fa-icon"></i>) if tag.string: t.add_content(tag.string) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) else: # Self enclosed tags (hr, meta, img, etc...) if tag.attrs: for a in tag.attrs: t.add_attribute(a, tag[a]) results.append(t.get_data()) return results