    def get_format_html(cls, html, final_url):
        """Return the normalized HTML."""
        final_protocol = UrlUtil.get_protocol(final_url)
        final_domain = UrlUtil.get_domain(final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol, final_domain)
        return format_html
    def get_unsafe_out_chains(cls, format_html, top_domain):
        out_chains = HtmlUtil.get_out_chains(format_html, top_domain)
        unsafe_out_chains = set()

        # Filter out the safe (whitelisted) out-chains
        safe_chains = set()
        connection = pymysql.connect(**projectconfig.mysql_config)
        # Fetch the top domains on the public safe-out-chain whitelist
        with connection.cursor() as cursor:
            sql = 'SELECT mydomain FROM public_safe_out_chains;'
            cursor.execute(sql)
            pubsocs = cursor.fetchall()
            for pubsoc in pubsocs:
                safe_chains.add(pubsoc["mydomain"])
        # Fetch the top domains on the private safe-out-chain whitelist for this owner
        with connection.cursor() as cursor:
            sql = 'SELECT mydomain FROM private_safe_out_chains WHERE owner=%s;'
            cursor.execute(sql, (top_domain,))
            pubsocs = cursor.fetchall()
            for pubsoc in pubsocs:
                safe_chains.add(pubsoc["mydomain"])
        for out_chain in out_chains:
            if UrlUtil.get_top_domain(out_chain) not in safe_chains and not UrlUtil.is_gov_or_edu(out_chain):
                # Top domain is not on the whitelist and the site is not a gov/edu site
                unsafe_out_chains.add(out_chain)

        return unsafe_out_chains
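
A minimal usage sketch of the two helpers above, with an assumed sample page and URL; HtmlUtil, UrlUtil, and the MySQL whitelist tables are taken as given:

# Hedged sketch: normalize a downloaded page, then collect out-chains whose
# top domain is neither whitelisted nor a gov/edu site.
html = "<a href='//cdn.thirdparty.example/x.js'></a>"   # assumed sample HTML
final_url = "https://www.mysite.example/index.html"     # assumed final URL after redirects
format_html = HtmlUtil.get_format_html(html, final_url)
unsafe = HtmlUtil.get_unsafe_out_chains(format_html, UrlUtil.get_top_domain(final_url))
for chain in unsafe:
    print(chain)
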
Example #3
    def __init__(self, start_url, exist_time=600, deepth=2, max_num=200):
        """
        :param start_url: starting URL
        :param exist_time: maximum duration of the task
        :param deepth: page collection depth
        :param max_num: maximum number of pages to collect
        """
        self.main_item = MainItem(start_url)
        self.main_item.task_id = str(uuid.uuid1())  # identifier for a single collection task
        self.main_item.refer = ""
        self.main_item.deepth = 1  # the starting depth is always 1
        self.start_url = start_url
        self.top_domain = UrlUtil.get_top_domain(start_url)
        self.exist_time = int(exist_time)
        self.deepth = int(deepth)
        self.max_num = int(max_num)
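
A minimal construction sketch; the enclosing class is not shown in this snippet, so CollectTask is a hypothetical name:

# Hypothetical class name for the __init__ above.
task = CollectTask("https://www.example.com/", exist_time=600, deepth=2, max_num=200)
print(task.main_item.task_id, task.top_domain)
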
Example #4
def href_clean(hrefs):
    """Clean the href attributes extracted from <a> tags, dropping entries that are not web pages."""
    result = list()
    not_web_page = ("ico", "svg", "css", "xml", "png", "jpg", "jpeg",
                    "gif")  # these extensions are not web pages; drop them
    if isinstance(hrefs, list):
        for href in hrefs:
            # it is a URL and it points to a web page
            if re.match(r'[a-zA-Z]+://[^\s]*', href) \
                    and UrlUtil.get_url_suffix(href) not in not_web_page:
                # convert the lxml.etree._ElementUnicodeResult object to str and strip surrounding whitespace
                href = str(href).strip()
                result.append(href)
    else:
        logging.error("Param type error, it should be a list.")
    return result
    def get_out_chains(cls, format_html, top_domain):
        """Get the out-chains (external links) in a page."""
        tree = etree.HTML(format_html)
        hrefs = tree.xpath("//@href")  # collect the href attributes of all tags
        iframes = tree.xpath("//iframe/@src")  # collect the source links of all iframes
        jss = tree.xpath("//script/@src")  # collect all JS script links
        hrefs.extend(iframes)
        hrefs.extend(jss)
        if hrefs:
            hrefs = href_clean(hrefs)
        else:
            hrefs = list()
        out_chains = set()
        for href in hrefs:
            if UrlUtil.get_top_domain(href) != top_domain:
                out_chains.add(href)
        return out_chains
    def diff_out_chains(cls, htmls, urls):
        """
        Unsafe out-chains that differ across multiple pages.
        htmls: list of HTML documents
        urls: list of URLs corresponding to the HTML documents
        return: the out-chains unique to each HTML document; the last item is the difference of all the out-chain sets
        """
        out_chainss = list()
        for html, url in zip(htmls, urls):
            if html is None:
                logging.error("None object has no out chains!")
                return []
            format_html = HtmlUtil.get_format_html(html, url)
            out_chains = HtmlUtil.get_unsafe_out_chains(format_html, UrlUtil.get_top_domain(url))
            out_chainss.append(out_chains)
        # union minus intersection gives the difference (links not shared by every page)
        diff = get_union(out_chainss) - get_intersection(out_chainss)
        result = list()
        for out_chains in out_chainss:
            result.append(list(out_chains & diff))
        result.append(list(diff))
        return result
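
A tiny worked example of the set logic above, using plain sets instead of real pages (the values are illustrative only):

# Out-chains seen on three downloads of the same page.
a = {"http://x.example/1", "http://y.example/2"}
b = {"http://x.example/1", "http://z.example/3"}
c = {"http://x.example/1"}
union = a | b | c                   # all out-chains seen anywhere
inter = a & b & c                   # out-chains common to every download
diff = union - inter                # {"http://y.example/2", "http://z.example/3"}
per_page = [list(s & diff) for s in (a, b, c)]  # [[".../2"], [".../3"], []], with diff appended last
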
Example #7
    def __init__(self, downloader_item):
        self.downloader_item = downloader_item
        self.connection = pymysql.connect(
            **projectconfig.mysql_config)  # open the database connection
        self.redis_conn = redis.Redis.from_url(projectconfig.REDIS_URL)
        self.safe_chains = set()
        # Fetch the top domains on the public safe-out-chain whitelist
        with self.connection.cursor() as cursor:
            sql = 'SELECT mydomain FROM public_safe_out_chains;'
            cursor.execute(sql)
            pubsocs = cursor.fetchall()
            for pubsoc in pubsocs:
                self.safe_chains.add(pubsoc["mydomain"])
        # Fetch the top domains on the private safe-out-chain whitelist for this owner
        with self.connection.cursor() as cursor:
            sql = 'SELECT mydomain FROM private_safe_out_chains WHERE owner=%s;'
            request_top_domain = UrlUtil.get_top_domain(
                downloader_item.request_url)
            cursor.execute(sql, (request_top_domain, ))
            pubsocs = cursor.fetchall()
            for pubsoc in pubsocs:
                self.safe_chains.add(pubsoc["mydomain"])
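
Note that the rows above are indexed by column name (pubsoc["mydomain"]), which only works if projectconfig.mysql_config requests a dict cursor. A plausible shape for that config; the concrete values here are assumptions, not the project's real settings:

import pymysql.cursors

# Assumed shape of projectconfig.mysql_config.
mysql_config = {
    "host": "127.0.0.1",          # assumed host
    "user": "snapshot",           # assumed user
    "password": "********",
    "db": "internet_snapshot",    # assumed database name
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,  # makes fetchall() return dicts, so row["mydomain"] works
}
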
Example #8
    def download(self, main_item, after_scroll_time=1):
        """下载一个web页面"""
        if not isinstance(main_item, MainItem):
            logging.error("Received param must items.MainItem, but get " +
                          str(type(main_item)))
            return None
        start_time = time.time()
        try:
            self.driver.get(main_item.request_url)  # request the page
            # TODO: store the screenshot: screenshot_base64 = self.driver.get_screenshot_as_base64()
        except TimeoutException as e:
            logging.info("Get url:" + main_item.request_url + ", msg: " +
                         e.msg)
            self.driver.execute_script("window.stop()")
        except WebDriverException as e:
            logging.error("When download page, error class: %s, message: %s." %
                          (e.__class__, e.msg))
            options = Options()
            options.add_argument('-headless')
            self.driver = Firefox(firefox_options=options)
            logging.info("Webdriver reinit")
        finally:
            load_time = time.time() - start_time
            logging.info("Get url:" + main_item.request_url + " spend " +
                         str(load_time) + "s.")
        server_ip = socket.gethostbyname(
            UrlUtil.get_domain(main_item.request_url))
        js_scroll = """
                    function go_down() {
                        var h = document.documentElement.scrollHeight || document.body.scrollHeight;
                        window.scroll(h, h);
                    }
                    go_down()
                """  # 翻页JS
        try:
            self.driver.execute_script(js_scroll)  # execute the scroll
            time.sleep(after_scroll_time)  # wait n seconds for the page to load after scrolling
        except WebDriverException as e:
            logging.error(
                "When scroll page, error class: %s, error message: %s" %
                (e.__class__, e.msg))
        current_url = None
        page_source = None
        try:
            current_url = self.driver.current_url
            page_source = self.driver.page_source
        except UnexpectedAlertPresentException as e:
            logging.info("点击弹出框")
            try:
                self.driver.switch_to.alert.accept()
            except NoAlertPresentException:
                # raised if the alert has already been dismissed; an odd bit of API design
                pass
            try:
                current_url = self.driver.current_url
                page_source = self.driver.page_source
            except WebDriverException as e:
                logging.error(e.msg)
        except WebDriverException as e:
            logging.error(e.msg)
        finally:
            if not current_url:
                current_url = "Something error occurred, please check the error log."
            if not page_source:
                page_source = "Something error occurred, please check the error log."

        # fill in the information collected so far
        main_item.final_url = current_url
        # download_item.screen_shot = screenshot_base64
        main_item.load_time = load_time
        main_item.html = page_source
        time_array = time.localtime(int(time.time()))
        main_item.get_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        with open("/etc/internet-snapshot.conf", "r") as f:
            ext_conf = json.load(f)
            main_item.send_ip = ext_conf["ip"]
        main_item.server_ip = server_ip

        return main_item
def get_union(sets):
    """Return the union of multiple sets."""
    result = sets[0].copy()
    for s in sets[1:]:
        result |= s
    return result


def get_intersection(sets):
    """Return the intersection of multiple sets."""
    result = sets[0].copy()
    for s in sets[1:]:
        result &= s
    return result
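
The two helpers simply fold | and & over the list of sets; a minimal equivalent sketch with functools.reduce (illustrative values only):

from functools import reduce

sets = [{1, 2}, {2, 3}, {2, 4}]
union = reduce(lambda x, y: x | y, sets)          # {1, 2, 3, 4}
intersection = reduce(lambda x, y: x & y, sets)   # {2}
diff = union - intersection                       # {1, 3, 4}, as used in diff_out_chains
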


if __name__ == "__main__":
    # html1, url1 = get_html_from_mysql(464)
    # html2, url2 = get_html_from_mysql(465)
    # html3, url3 = get_html_from_mysql(466)
    # import json
    # print(json.dumps(HtmlUtil.diff_out_chains(htmls=[html1, html2, html3], urls=[url1, url2, url3]), indent=4))
    html, url = get_html_from_mysql(504)
    format_html = HtmlUtil.get_format_html(html, url)
    my_out_chains = HtmlUtil.get_out_chains(format_html, UrlUtil.get_top_domain(url))
    with open("check504_ex.txt", "w", encoding="utf-8") as f:
        for my_out_chain in my_out_chains:
            f.write(my_out_chain + " ==> " + UrlUtil.get_top_domain(my_out_chain) + "\n")
Example #10
    def parse(self):
        if not isinstance(self.downloader_item, MainItem):
            logging.error("The param type is: " +
                          str(type(self.downloader_item)) +
                          ", but it should be MainItem.")
            return None
        html = self.downloader_item.html

        # store downloader_item in the database
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
                  ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
                  '%s, %s);'
            result = cursor.execute(sql, self.downloader_item.save_tuple())
            if result != 1:
                logging.error("snapshot插入记录" +
                              self.downloader_item.save_tuple() + "失败!")

        # get the id of the record just inserted
        with self.connection.cursor() as cursor:
            sql = 'SELECT last_insert_id() as ss_id;'
            cursor.execute(sql)
            result = cursor.fetchone()
            ss_id = result["ss_id"]

        # store the page content in the database
        ss_html = SsHtmlItem(ss_id=ss_id, html=html)
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
            result = cursor.execute(sql, ss_html.save_tuple())
            if result != 1:
                logging.error("ss_html插入记录" + ss_html.save_tuple() + "失败!")

        # normalize the links inside the page
        final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
        final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol,
                                            final_domain)

        tree = etree.HTML(format_html)
        hrefs = tree.xpath("//@href")  # 拿到所有a标签中的链接对象
        iframes = tree.xpath("//iframe/@src")  # 拿到所有iframe的源链接
        jss = tree.xpath("//script/@src")  # 拿到所有的js链接
        hrefs.extend(iframes)
        hrefs.extend(jss)
        if hrefs:
            hrefs = href_clean(hrefs)
        else:
            hrefs = list()
        inner_chains = set()  # inner chains, returned to the engine for further crawling
        unknown_domains = set()  # top domains of suspicious out-chains, stored for manual review
        request_top_domain = UrlUtil.get_top_domain(
            self.downloader_item.request_url)
        for href in hrefs:
            this_top_domain = UrlUtil.get_top_domain(href)
            if request_top_domain == this_top_domain and UrlUtil.get_url_suffix(
                    href) != "js":
                inner_chains.add(href)
            elif this_top_domain not in self.safe_chains and not UrlUtil.is_gov_or_edu(
                    href):
                # Top domain is not on the whitelist and the site is not a gov/edu site
                unknown_domains.add(this_top_domain)

        # push wrapper objects for the inner chains to be crawled into Redis
        logging.info("Length of inner_chains is " + str(len(inner_chains)))
        dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
        queue_name = "engine:queue:" + str(self.downloader_item.task_id)
        for inner_chain in inner_chains:
            if isinstance(self.redis_conn.ttl(dup_set_name), int):
                sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
                if sadd_re == 1:  # 1 means the member was new (no duplicate), saving a separate dedup query
                    new_main_item = MainItem(
                        inner_chain,
                        refer=self.downloader_item.final_url,
                        task_id=self.downloader_item.task_id,
                        deepth=self.downloader_item.deepth + 1)
                    self.redis_conn.lpush(
                        queue_name,
                        json.dumps(new_main_item, default=main_item_to_json))
        # store the suspicious out-chains in the database
        for unknown_domain in unknown_domains:
            with self.connection.cursor() as cursor:
                sql = "SELECT mydomain FROM malicious_domains;"
                cursor.execute(sql)
                malicious_records = cursor.fetchall()
            malicious_domains = set([
                malicious_record["mydomain"]
                for malicious_record in malicious_records
            ])
            if unknown_domain in malicious_domains:
                suspicious_item = SuspiciousItem(
                    ss_id, unknown_domain, 1, 1,
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())))
            else:
                suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1,
                                                 None)
            with self.connection.cursor() as cursor:
                sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                      'check_time) VALUES (%s, %s, %s, %s, %s)'
                result = cursor.execute(sql, suspicious_item.save_tuple())
                if result != 1:
                    logging.error("suspicious_records插入记录" +
                                  suspicious_item.save_tuple() + "失败!")

        self.connection.commit()
        self.connection.close()
        logging.info(self.downloader_item.request_url + " parse over.")
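
The inner-chain loop above leans on SADD returning 1 only when the member was not already in the set, so the dedup check and the enqueue happen in one round trip. A standalone sketch of that pattern, with an assumed Redis URL and task id:

import redis

conn = redis.Redis.from_url("redis://localhost:6379/0")   # assumed Redis URL
dup_set = "engine:dup_set:demo-task"                       # assumed task id
queue = "engine:queue:demo-task"
for url in ["http://a.example/", "http://b.example/", "http://a.example/"]:
    if conn.sadd(dup_set, url) == 1:   # 1 => url was new, so enqueue it exactly once
        conn.lpush(queue, url)
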
Example #11
    def parse_by_task_id(cls, task_id):
        connection = pymysql.connect(**projectconfig.mysql_config)  # open the database connection
        # read the blacklist
        sql = "SELECT mydomain FROM malicious_domains;"
        with connection.cursor() as cursor:
            cursor.execute(sql)
            malicious_records = cursor.fetchall()
        malicious_domains = set([
            malicious_record["mydomain"]
            for malicious_record in malicious_records
        ])
        sql = "SELECT id,request_url FROM snapshot WHERE task_id=%s;"
        with connection.cursor() as cursor:
            cursor.execute(sql, (task_id, ))
            items = cursor.fetchall()
        urls = dict()  # keyed by URL: group the download-result ids of the same URL from different regions
        for item in items:
            id = item["id"]
            request_url = item["request_url"]
            if request_url not in urls:
                urls[request_url] = [id]
            else:
                urls[request_url].append(id)
        for url in urls.keys():
            htmls = list()
            for html_id in urls.get(url):
                html, final_url = get_html_from_mysql(html_id=html_id)
                format_html = HtmlUtil.get_format_html(html=html,
                                                       final_url=final_url)
                htmls.append(format_html)
            diff_out_chains = HtmlUtil.diff_out_chains_from_same_url(
                htmls=htmls, url=url)

            for i in range(0, len(urls.get(url))):
                sql = "INSERT INTO private_out_chain_records (ss_id, out_chain, checked, result, check_time) " \
                      "VALUES (%s, %s, %s, %s, %s)"
                for diff_out_chain in diff_out_chains[i]:
                    with connection.cursor() as cursor:
                        if UrlUtil.get_top_domain(
                                diff_out_chain) in malicious_domains:
                            private_out_chain_record_item = PrivateOutChainRecordItem(
                                urls.get(url)[i], diff_out_chain, 1, 1, None)
                        else:
                            private_out_chain_record_item = PrivateOutChainRecordItem(
                                urls.get(url)[i], diff_out_chain, 0, -1, None)
                        result = cursor.execute(
                            sql, private_out_chain_record_item.save_tuple())
                        if result != 1:
                            logging.error(
                                "Failed to insert record " +
                                str(private_out_chain_record_item.save_tuple()) +
                                " into private_out_chain_records!")
            logging.info("url: " + url + " compare over.")
        with connection.cursor() as cursor:
            sql = "UPDATE download_tasks SET compared=1 WHERE task_id=%s;"
            re = cursor.execute(sql, (task_id, ))
            if re != 1:
                logging.error("Update table download_tasks failed!")
        connection.commit()
        connection.close()
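
A hedged driver sketch for the method above, assuming download_tasks marks uncompared tasks with compared=0 (the same column that is set to 1 above) and that the class owning parse_by_task_id is called something like Comparer (the real class name is not shown):

# Run the comparison for every task that has not been compared yet.
connection = pymysql.connect(**projectconfig.mysql_config)
with connection.cursor() as cursor:
    cursor.execute("SELECT task_id FROM download_tasks WHERE compared=0;")
    tasks = cursor.fetchall()
connection.close()
for task in tasks:
    Comparer.parse_by_task_id(task["task_id"])   # hypothetical class name
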