def iter_links_object_element(cls, element):
    """Iterate ``object`` and ``embed`` elements.

    This function also looks at ``codebase`` and ``archive`` attributes.

    Links are produced in this order: the ``codebase`` value itself (when
    it is non-empty), then each of ``code``, ``src``, ``classid`` and
    ``data`` that is present, then every space-separated entry of
    ``archive``. Every link is reported as inline and not linked. The
    ``codebase`` value is passed as ``base_link`` for all links except
    the ``codebase`` link itself.

    Args:
        element: Object exposing ``tag`` and a mapping-like ``attrib``.

    Yields:
        LinkInfo: One record per discovered link.
    """
    attributes = element.attrib
    base_link = attributes.get("codebase", None)

    if base_link:
        # lxml returns codebase as inline
        # Classify the codebase URL itself, like every other link below,
        # rather than looking up an attribute named after the URL.
        yield LinkInfo(
            element=element,
            tag=element.tag,
            attrib="codebase",
            link=base_link,
            inline=True,
            linked=False,
            base_link=None,
            value_type="plain",
            link_type=identify_link_type(base_link),
        )

    for attribute in ("code", "src", "classid", "data"):
        if attribute not in attributes:
            continue

        link = attributes.get(attribute)
        yield LinkInfo(
            element=element,
            tag=element.tag,
            attrib=attribute,
            link=link,
            inline=True,
            linked=False,
            base_link=base_link,
            value_type="plain",
            link_type=identify_link_type(link),
        )

    if "archive" in attributes:
        # ``archive`` holds a list of values; each run of non-space
        # characters is treated as one link.
        for match in re.finditer(r"[^ ]+", attributes.get("archive")):
            value = match.group(0)
            yield LinkInfo(
                element=element,
                tag=element.tag,
                attrib="archive",
                link=value,
                inline=True,
                linked=False,
                base_link=base_link,
                value_type="list",
                link_type=identify_link_type(value),
            )
def iter_processed_text(self, file, encoding=None, base_url=None):
    '''Iterate the text fragments of a file, resolving the link candidates.

    Each ``(text, is_link)`` pair from :meth:`iter_text` is re-emitted.
    Fragments that are not flagged as links, cannot be decoded as the
    body of a JSON string, are rejected by the link heuristics, or
    resolve to an empty link are yielded unchanged as ``(text, False)``.
    Accepted links are yielded as ``(link, link_type)``, where
    ``link_type`` falls back to ``True`` when no type is identified.

    Args:
        file: Passed through to :meth:`iter_text`.
        encoding: Passed through to :meth:`iter_text`.
        base_url: When truthy, decoded links are joined against it.

    Yields:
        tuple: ``(text_or_link, False | True | link type)``.
    '''
    for text, is_link in self.iter_text(file, encoding):
        resolved = None

        if is_link:
            try:
                # Wrap the fragment in quotes so the JSON parser undoes
                # any string escapes it contains.
                candidate = json.loads('"{0}"'.format(text))
            except ValueError:
                pass
            else:
                if not is_unlikely_link(candidate) \
                        and is_likely_link(candidate):
                    if base_url:
                        resolved = urljoin_safe(
                            base_url, candidate, allow_fragments=False
                        )
                    else:
                        resolved = candidate

        if resolved:
            yield (resolved, identify_link_type(resolved) or True)
        else:
            yield (text, False)
def iter_links_object_element(cls, element):
    '''Iterate ``object`` and ``embed`` elements.

    This function also looks at ``codebase`` and ``archive`` attributes.

    Yield order: the ``codebase`` value (if non-empty), then whichever of
    ``code``, ``src``, ``classid`` and ``data`` are present, then each
    space-separated entry of ``archive``. All links are marked inline and
    not linked; ``codebase`` is supplied as ``base_link`` for every link
    other than the ``codebase`` link itself.

    Args:
        element: Object exposing ``tag`` and a mapping-like ``attrib``.

    Yields:
        LinkInfo: One record per discovered link.
    '''
    attrib = element.attrib
    base_link = attrib.get('codebase', None)

    if base_link:
        # lxml returns codebase as inline
        # The type is derived from the codebase URL itself; looking up an
        # attribute named after the URL would not classify the link.
        link_type = identify_link_type(base_link)
        yield LinkInfo(element=element, tag=element.tag, attrib='codebase',
                       link=base_link, inline=True, linked=False,
                       base_link=None, value_type='plain',
                       link_type=link_type)

    for attribute in ('code', 'src', 'classid', 'data'):
        if attribute in attrib:
            link = attrib.get(attribute)
            link_type = identify_link_type(link)
            yield LinkInfo(element=element, tag=element.tag,
                           attrib=attribute, link=link, inline=True,
                           linked=False, base_link=base_link,
                           value_type='plain', link_type=link_type)

    if 'archive' in attrib:
        # Each run of non-space characters in ``archive`` is one link.
        for match in re.finditer(r'[^ ]+', attrib.get('archive')):
            value = match.group(0)
            link_type = identify_link_type(value)
            yield LinkInfo(element=element, tag=element.tag,
                           attrib='archive', link=value, inline=True,
                           linked=False, base_link=base_link,
                           value_type='list', link_type=link_type)
def test_identifiy_link_type(self):
    '''Check the link type identified for a set of sample file names.'''
    expected_types = (
        ('hello.js', LinkType.javascript),
        ('hello.css', LinkType.css),
        ('hello.html', LinkType.html),
        ('hello.mp3', LinkType.media),
        ('hello.png', LinkType.media),
        ('hello.flv', LinkType.media),
    )

    for filename, expected in expected_types:
        self.assertEqual(expected, identify_link_type(filename))

    # A name with an ``.exe`` suffix must produce a falsy result.
    self.assertFalse(identify_link_type('hello.exe'))
def iter_links_object_element(cls, element):
    '''Iterate ``object`` and ``embed`` elements.

    This function also looks at ``codebase`` and ``archive`` attributes.

    The ``codebase`` value, when non-empty, is yielded first with no
    ``base_link`` of its own. It is then used as the ``base_link`` of the
    links found in ``code``, ``src``, ``classid``, ``data`` (in that
    order, each only if present) and of every space-separated entry in
    ``archive``. All links are marked inline and not linked.

    Args:
        element: Object exposing ``tag`` and a mapping-like ``attrib``.

    Yields:
        LinkInfo: One record per discovered link.
    '''
    attributes = element.attrib
    base_link = attributes.get('codebase', None)

    if base_link:
        # lxml returns codebase as inline
        # Identify the type from the codebase URL itself instead of
        # fetching an attribute whose name is that URL.
        yield LinkInfo(
            element=element, tag=element.tag, attrib='codebase',
            link=base_link,
            inline=True, linked=False,
            base_link=None,
            value_type='plain',
            link_type=identify_link_type(base_link)
        )

    for attribute in ('code', 'src', 'classid', 'data'):
        if attribute not in attributes:
            continue

        link = attributes.get(attribute)
        yield LinkInfo(
            element=element, tag=element.tag, attrib=attribute,
            link=link,
            inline=True, linked=False,
            base_link=base_link,
            value_type='plain',
            link_type=identify_link_type(link)
        )

    if 'archive' in attributes:
        # Every run of non-space characters is one entry of the list.
        for match in re.finditer(r'[^ ]+', attributes.get('archive')):
            value = match.group(0)
            yield LinkInfo(
                element=element, tag=element.tag, attrib='archive',
                link=value,
                inline=True, linked=False,
                base_link=base_link,
                value_type='list',
                link_type=identify_link_type(value)
            )
def iter_links_element_text(cls, element):
    '''Yield the text of an element as a single link.

    Nothing is yielded when the element text is empty or ``None``.
    The link is reported as linked (not inline) and has no attribute
    name or base link.

    Args:
        element: Object exposing ``tag`` and ``text``.

    Yields:
        LinkInfo: At most one record, for the element text.
    '''
    text = element.text

    if not text:
        return

    text_link_type = identify_link_type(text)

    yield LinkInfo(
        element=element, tag=element.tag, attrib=None,
        link=text,
        inline=False, linked=True,
        base_link=None,
        value_type='plain',
        link_type=text_link_type
    )
def iter_links_param_element(cls, element):
    '''Yield the link held by a ``param`` element, if it declares one.

    A link is produced only when the ``valuetype`` attribute equals
    ``ref`` (compared case-insensitively) and a ``value`` attribute is
    present. The link is reported as inline and not linked.

    Args:
        element: Object exposing ``tag`` and a mapping-like ``attrib``.

    Yields:
        LinkInfo: At most one record, for the ``value`` attribute.
    '''
    attributes = element.attrib
    is_reference = attributes.get('valuetype', '').lower() == 'ref'

    if not (is_reference and 'value' in attributes):
        return

    target = attributes.get('value')
    target_type = identify_link_type(target)

    yield LinkInfo(
        element=element, tag=element.tag, attrib='value',
        link=target,
        inline=True, linked=False,
        base_link=None,
        value_type='plain',
        link_type=target_type
    )
def iter_links_param_element(cls, element):
    """Yield the link referenced by a ``param`` element.

    The element contributes a link only if its ``valuetype`` attribute
    is ``ref`` in any letter case and it carries a ``value`` attribute.
    The link is reported as inline and not linked.

    Args:
        element: Object exposing ``tag`` and a mapping-like ``attrib``.

    Yields:
        LinkInfo: At most one record, for the ``value`` attribute.
    """
    attrs = element.attrib

    if attrs.get("valuetype", "").lower() != "ref":
        return

    if "value" not in attrs:
        return

    value = attrs.get("value")
    value_link_type = identify_link_type(value)

    yield LinkInfo(
        element=element,
        tag=element.tag,
        attrib="value",
        link=value,
        inline=True,
        linked=False,
        base_link=None,
        value_type="plain",
        link_type=value_link_type,
    )
def iter_links_plain_element(self, element):
    '''Yield the links of any element using the generic rules.

    Every ``(attribute name, link)`` pair from
    :meth:`iter_links_by_attrib` becomes one record. For attribute
    names listed in ``LINK_ATTRIBUTES`` the inline/linked flags come from
    :meth:`is_link_inline` and :meth:`is_html_link`. For any other
    attribute, ``inline`` is decided by ``is_likely_inline`` on the link
    and ``linked`` is its negation.

    Args:
        element: Object exposing ``tag``; also passed to
            :meth:`iter_links_by_attrib`.

    Yields:
        LinkInfo: One record per attribute link.
    '''
    for name, url in self.iter_links_by_attrib(element):
        if name not in self.LINK_ATTRIBUTES:
            # Unknown attribute: guess from the link itself.
            is_inline = is_likely_inline(url)
            is_linked = not is_inline
        else:
            is_inline = self.is_link_inline(element.tag, name)
            is_linked = self.is_html_link(element.tag, name)

        url_type = identify_link_type(url)

        yield LinkInfo(
            element=element, tag=element.tag, attrib=name,
            link=url,
            inline=is_inline, linked=is_linked,
            base_link=None,
            value_type='plain',
            link_type=url_type
        )