Ejemplo n.º 1
0
def parse_html_tag(html: str, start_index: int) -> HTMLTag:
    """Parse the HTML tag that begins at ``start_index`` in ``html``.

    A leading ``<`` at ``start_index`` is skipped, then the tag name is read
    up to the first space or ``>``. The text between the name and the next
    ``>`` is scanned for ``key="value"`` / ``key='value'`` pairs. Only keys
    made of ASCII letters and values made of ASCII letters, digits, ``_``
    and ``-`` match the pattern; anything else is left out.

    Args:
        html: Text containing the tag.
        start_index: Index in ``html`` where the tag starts.

    Returns:
        An ``HTMLTag`` built from the tag name (without a leading ``/``),
        the matched attributes, ``start_index``, the index of the closing
        ``>`` and the detected tag type.

    Raises:
        IndexError: If ``start_index`` is outside ``html``.
        ValueError: If no ``>`` follows the tag name.
    """
    cursor = start_index
    if html[cursor] == '<':
        cursor += 1

    # Gather the tag name character by character until a delimiter is hit.
    total = len(html)
    name_chars = []
    while cursor < total and html[cursor] not in (' ', '>'):
        name_chars.append(html[cursor])
        cursor += 1
    tag = ''.join(name_chars)

    end_index = html.index('>', cursor)
    attribute_text = html[cursor:end_index]

    if tag.startswith('/'):
        # Closing tag: the slash is not part of the name.
        tag_type = HTMLTagType.END
        tag = tag[1:]
    elif ends_with_skip(tag, '/', skip=[' ', '>']):
        tag_type = HTMLTagType.SELF_CLOSING
    else:
        tag_type = HTMLTagType.START

    attributes: Dict[str, HTMLAttribute] = {}
    if not is_blank(attribute_text):
        # findall yields (key, value) tuples; a later duplicate key wins.
        attributes.update(
            re.findall(r' *([a-zA-Z]+)=[\'"]([a-zA-Z0-9_-]+)[\'"]',
                       attribute_text))

    return HTMLTag(tag,
                   attributes=attributes,
                   start_index=start_index,
                   end_index=end_index,
                   tag_type=tag_type)
Ejemplo n.º 2
0
    def get_element_by_end_str(parent: Element,
                               val: str,
                               default=None) -> Optional[str]:
        """Return the text of the element found for ``val`` under ``parent``.

        The element is looked up with ``SitemapXml.get_element_by_end``.

        Args:
            parent: Element to search in.
            val: Value forwarded to ``SitemapXml.get_element_by_end``.
            default: Value returned when no usable text is found.

        Returns:
            The element's text, or ``default`` when no element is found or
            its text is blank.
        """
        found = SitemapXml.get_element_by_end(parent, val)

        if found is not None and not is_blank(found.text):
            return found.text

        return default
Ejemplo n.º 3
0
 def parse(self, line: str):
     """Dispatch one line to the handler for its directive.

     Lines starting with ``#`` are skipped. Lines starting with
     ``User-agent:``, ``Disallow:`` or ``Sitemap:`` are passed unchanged to
     the matching ``parse_*`` method. Matching is case-sensitive and
     anchored at the first character. A blank line triggers
     ``hit_blank()``; any other line is ignored.

     Args:
         line: The line to process.
     """
     if line.startswith('#'):
         return

     # Handlers are resolved lazily so only the matching one is looked up.
     prefix_handlers = (
         ('User-agent:', 'parse_user_agent'),
         ('Disallow:', 'parse_disallowed_url'),
         ('Sitemap:', 'parse_sitemap_url'),
     )
     for prefix, handler_name in prefix_handlers:
         if line.startswith(prefix):
             getattr(self, handler_name)(line)
             return

     if is_blank(line):
         self.hit_blank()
Ejemplo n.º 4
0
def handle_extension_period(ext: str, include_ext_period: bool = False) -> Optional[str]:
    """Normalise the leading period of a file extension.

    Args:
        ext: The extension, with or without leading periods.
        include_ext_period: When true, the result starts with a period (one
            is added only if ``ext`` does not already start with one). When
            false, every leading period is removed.

    Returns:
        The adjusted extension, or ``None`` when ``ext`` is blank.
    """
    if is_blank(ext):
        return None

    if not include_ext_period:
        # Drops all leading periods, not just the first.
        return ext.lstrip('.')

    return ext if ext.startswith('.') else f'.{ext}'
Ejemplo n.º 5
0
def validate_path(directory: str, default_path: str = join_path(os.getcwd(), '/out'), fatal: bool = False) -> str:
    """Return a usable directory path, creating the directory if needed.

    A blank ``directory`` is replaced by ``default_path``. Backslashes in
    the chosen path are converted to forward slashes, and the directory is
    created when it does not exist yet.

    Args:
        directory: Requested directory path.
        default_path: Fallback used when ``directory`` is blank. The default
            value is computed once, when the function is defined.
        fatal: When true, a blank ``directory`` is reported through ``log``
            with ``LogType.ERROR``. This function does not itself return
            early or raise after logging.

    Returns:
        The forward-slash form of the path that was used.
    """
    directory_missing = is_blank(directory)
    if directory_missing and fatal:
        log(f'Path {directory} does not exist.', log_type=LogType.ERROR)

    chosen = default_path if directory_missing else directory
    normalized = chosen.replace('\\', '/')

    # The existence check is kept: makedirs(exist_ok=True) still raises if
    # the path exists but is not a directory.
    if not os.path.exists(normalized):
        os.makedirs(normalized, exist_ok=True)

    return normalized
Ejemplo n.º 6
0
def ignorable_content_type(ignored_content_types: List[str], content_type: str, attempt_download_on_fail: bool = True) -> bool:
    """Report whether ``content_type`` is listed in ``ignored_content_types``.

    When ``content_type`` contains a ``;``, only the part before the first
    ``;`` is compared, with surrounding whitespace removed. Without a ``;``
    the value is compared exactly as given.

    Args:
        ignored_content_types: Content types to ignore. An empty or missing
            list means nothing is ignorable.
        content_type: The content type value to check.
        attempt_download_on_fail: Returned unchanged when ``content_type``
            is blank.

    Returns:
        ``False`` when there is nothing to ignore,
        ``attempt_download_on_fail`` when ``content_type`` is blank, and
        otherwise whether the compared value is in the ignore list.
    """
    if not ignored_content_types:
        return False

    if is_blank(content_type):
        # NOTE(review): a true flag makes a blank content type count as
        # ignorable here - confirm this is the intended direction.
        return attempt_download_on_fail

    media_type, separator, _parameters = content_type.partition(';')
    candidate = media_type.strip() if separator else content_type

    return candidate in ignored_content_types
Ejemplo n.º 7
0
    def parse_sitemap(data: str):
        """Build a ``SitemapXml`` from the XML text of a sitemap.

        Each element returned by
        ``SitemapXml.get_elements_by_end(tree, 'url')`` contributes one
        ``SitemapXmlURL`` holding its ``loc`` text and, when a ``lastmod``
        value is present, the parsed timestamp.

        Args:
            data: The sitemap XML document as a string.

        Returns:
            The populated ``SitemapXml``.

        Raises:
            ValueError: If a non-blank ``lastmod`` value is not a timestamp
                that ``datetime.fromisoformat`` accepts.

        Errors raised by ``ElementTree.fromstring`` for malformed XML
        propagate unchanged.
        """
        tree = ElementTree.fromstring(data)
        urls = SitemapXml.get_elements_by_end(tree, 'url')

        sitemap = SitemapXml()
        for url in urls:
            loc: Optional[str] = SitemapXml.get_element_by_end_str(url, 'loc')
            last_modified_str: Optional[
                str] = SitemapXml.get_element_by_end_str(url, 'lastmod')

            last_modified = None
            if not is_blank(last_modified_str):
                # Element text may carry surrounding whitespace or newlines,
                # which fromisoformat() does not accept.
                timestamp = last_modified_str.strip()
                # Sitemap timestamps use W3C Datetime, where UTC is commonly
                # written as a trailing "Z". fromisoformat() only accepts
                # that suffix from Python 3.11, so spell it as an offset.
                if timestamp.endswith(('Z', 'z')):
                    timestamp = timestamp[:-1] + '+00:00'
                last_modified = datetime.fromisoformat(timestamp)

            sitemap_url = SitemapXmlURL(url=loc, last_modified=last_modified)
            sitemap.add_url(sitemap_url)

        return sitemap
Ejemplo n.º 8
0
def url_is_relative(url) -> bool:
    """Report whether ``url`` matches ``RELATIVE_URL_REGEX``.

    Args:
        url: The URL to test.

    Returns:
        ``False`` for a blank ``url``; otherwise ``True`` exactly when
        ``RELATIVE_URL_REGEX.search`` finds a match in it.
    """
    # Short-circuit keeps the regex away from blank input.
    return (not is_blank(url)) and bool(RELATIVE_URL_REGEX.search(url))
Ejemplo n.º 9
0
    def set_sitemap(self, url: str):
        """Store ``url`` as this object's sitemap.

        A blank ``url`` is ignored and leaves the current value untouched.

        Args:
            url: The sitemap URL to store.
        """
        if not is_blank(url):
            self.sitemap = url