Example #1
def initialization_new_listpage_url_cache(database_config):
    # Initialize new_listpage_url_cache:
    # read the website table, delete the target website_no records from
    # new_listpage_url_cache, then re-insert each site's main page URL.

    # Clear previously collected rows from new_listpage_url_cache.
    delete_cache_sql = "delete from new_listpage_url_cache where website_no in (select website_no from website);"
    # common.query_mysql(database_config, delete_cache_sql)  # deletion currently disabled; insert ignore below leaves existing rows untouched
    website_sql = 'select * from website;'
    website_result = common.query_mysql(database_config, website_sql)
    for website in website_result:
        Website_No = website['Website_No']
        Listpage_Title = website['Website_Name']
        Listpage_URL = website['Website_Main_Page_URL']
        Is_Need_VPN = website['Is_Need_VPN']
        # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
        md5_source = Listpage_URL
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        Listpage_URL_MD5 = common.get_token(md5_source)
        Domain_Code = common.get_domain_code(Listpage_URL)
        Host_Code = common.get_host_code(Listpage_URL)
        Score_Detail = '{"status": True, "message": "root page"}'
        # Insert the main page into new_listpage_url_cache. Values are interpolated
        # directly into the SQL, so titles containing quotes would break the statement.
        insert_url_to_cache = f"""
                insert ignore into new_listpage_url_cache(Column_Extraction_Deep, Listpage_URL, 
                Listpage_Title, Domain_Code, Host_Code, Listpage_URL_MD5, Level_Score, Score_Detail, Website_No, Is_Need_VPN) 
                value(1, '{Listpage_URL}', '{Listpage_Title}', '{Domain_Code}', '{Host_Code}', '{Listpage_URL_MD5}', 
                100, '{Score_Detail}', '{Website_No}', {Is_Need_VPN});
                """
        print(insert_url_to_cache)
        common.query_mysql(database_config, insert_url_to_cache)
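The scheme-and-slash normalization above is repeated verbatim in every example below. A minimal sketch that factors it out; normalize_url_for_md5 is a hypothetical helper name, and common.get_token is assumed to be the project's MD5/token function:

def normalize_url_for_md5(url):
    # Strip the http(s) scheme and any trailing slash before hashing.
    source = url.replace('http://', '').replace('https://', '')
    return source.rstrip('/')

# Hypothetical usage: record_md5_id = common.get_token(normalize_url_for_md5(listpage_url))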
Example #2
def main():
    # Load settings from the parameter file.
    host = conf.get("database", "host")
    port = conf.get("database", "port")
    user = conf.get("database", "user")
    passwd = conf.get("database", "passwd")
    db = conf.get("database", "db")
    table = conf.get("database", "table")
    reject_domain_file = conf.get("reject_domain", "file_name")

    # Database configuration
    # database_config = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    database_config = {
        'host': host,
        'port': int(port),
        'user': user,
        'passwd': passwd,
        'db': db
    }

    # Read the reject_domain list
    reject_domain_list = get_reject_domain_list(reject_domain_file)

    # Run the deletes, one domain at a time
    for str_domain in reject_domain_list:
        logger.info(str_domain)
        domain = common.get_domain_code(str_domain)
        sql = f'''delete from {table} where domain_code='{domain}';'''
        result = query_mysql(database_config, sql)
        if result > 0:
            logger.info(sql)
            logger.info('delete count: ' + str(result))
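The conf.get calls above assume a configparser-style settings file. A sketch of the expected layout; the file name and the values shown are assumptions, with section and key names taken from the code:

import configparser

conf = configparser.ConfigParser()
conf.read('settings.ini', encoding='utf-8')  # 'settings.ini' is a placeholder name

# settings.ini:
# [database]
# host = 192.168.1.118
# port = 3306
# user = ...
# passwd = ...
# db = datasource
# table = column_link
#
# [reject_domain]
# file_name = reject_domain.txt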
def main():
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f"""select ad.Article_URL, ad.Domain_Code, ad.Article_Source from article_detail ad
                    where 1=1 and ad.Website_No in (select Website_No from website where Website_Name like'%百度新闻%')
                    and ad.Extracted_Time>'2020-10-01' and ad.Extracted_Time<'2020-11-25'
                    and Article_Source is not NULL and Article_Source !='' GROUP BY Domain_Code;"""

    try:
        results = common.query_mysql(extractor_116, select_column)
    except Exception as e:
        results = []

    column_list = []
    for i in results:
        title = i['Article_Source']
        listpage_url = i['Article_URL']

        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'

        # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)

        level_score = '100'
        Score_Detail = '{"status": True, "message": "root page"}'
        website_no = 'BAIDU_NEWS'
        column_extraction_deep = 0
        column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        # column_list.append(column)
        print(column)
        # Insert rows one at a time (the batch path via column_list is disabled).
        values = column
        insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        # print(insert_column)
        try:
            common.query_mysql(extractor_118, insert_column)
        except Exception as e:
            pass  # swallow insert errors (e.g. malformed titles or duplicates)
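Several of these scripts label a row-at-a-time loop "batch insert". A sketch of a true batched, parameterized insert; it assumes the underlying driver is pymysql (common.query_mysql's internals are not shown), and batch_insert_columns is a hypothetical helper:

import pymysql

def batch_insert_columns(cfg, rows):
    # rows: iterable of 9-tuples matching the column_link columns below.
    sql = ("insert ignore into column_link(Title, URL, record_md5_id, website_no, "
           "column_extraction_deep, domain_code, host_code, level_score, Score_Detail) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    conn = pymysql.connect(**cfg)  # cfg uses pymysql's 'passwd'/'db' aliases
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, rows)  # the driver escapes each value
        conn.commit()
    finally:
        conn.close()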
Example #4
def main(start, end):
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f"select Website_No,ListPage_URL,ListPage_Title from listpage_url where ListPage_URL_ID IN " \
                    f"(select ListPage_URL_ID from cloud_listpage_url where cloud_listpage_url_id BETWEEN {start} AND {end});"

    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        for item in results:
            URL = item['ListPage_URL']
            Title = item['ListPage_Title']
            Website_No = item['Website_No']
            Domain_Code = common.get_domain_code(URL)
            Host_Code = common.get_host_code(URL)
            # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score, Score_Detail = common.is_need_filter(Title, URL)
            Column_Extraction_Deep = 1

            column = f"({Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}')"
            if Level_Score > 20:
                column_list.append(column)

        # Batch insert; skip when nothing passed the score filter, otherwise
        # the generated SQL would end in "values;" and fail.
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into column_link(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{values};"
            # print(insert_column)
            common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
def main():
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }
    select_column = f"select * from website where ID>1577;"

    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        for item in results:
            ID = item['ID']
            Column_Extraction_Deep = '0'
            URL = item['URL']
            Title = item['Title']
            Domain_Code = common.get_domain_code(URL)
            Host_Code = common.get_host_code(URL)
            # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score = '100'
            Score_Detail = '{"status": True, "message": "root page"}'
            Website_No = 'OVERSEA'
            column = f"({Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}')"
            column_list.append(column)
            # Update this website row's record_md5_id
            update_website = f"update website set record_md5_id='{Record_MD5_ID}' where ID={ID}"
            common.query_mysql(extractor_118, update_website)
            insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{column};"
            print(insert_column)
            common.query_mysql(extractor_118, insert_column)

        # # Batch insert (disabled; rows are inserted one at a time above)
        # values = ",".join(column_list)
        # insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{values};"
        # print(insert_column)
        # common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
def parse_html_to_database(database_config, url, column_extraction_deep, domain_code_source, website_no, Is_Need_VPN, text):
    try:
        root = etree.HTML(text, parser=etree.HTMLParser(encoding='utf-8'))
        column_extraction_deep = 0  # overrides the incoming parameter: every link here is recorded at depth 0
        items = root.xpath('//a')
        column_list = []
        for num, item in enumerate(items):
            title = "".join(item.xpath('.//text()'))
            listpage_url = "".join(item.xpath('./@href'))
            if (len(title) > 0) and ('go.php?' in listpage_url):
                listpage_url = listpage_url.replace('/go.php?', '')
                listpage_url = listpage_url.replace('http://www.01-114.com', '')
                print(title, listpage_url)
                # Strip punctuation (not applied in this variant)
                # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
                md5_source = listpage_url
                md5_source = md5_source.replace('http://', '')
                md5_source = md5_source.replace('https://', '')
                md5_source = md5_source.rstrip('/')
                record_md5_id = common.get_token(md5_source)
                domain_code = common.get_domain_code(listpage_url)
                host_code = common.get_host_code(listpage_url)

                host_code_index = listpage_url.index(host_code) + len(host_code)
                listpage_url = listpage_url[0:host_code_index] + '/'

                # Spam-word and spam-domain filtering (hard-coded here: every link scores 100)
                level_score = 100
                score_detail = '{"status": True, "message": "root page"}'

                if level_score > 20:
                    column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', {level_score}, '{score_detail}')"
                    column_list.append(column)

        # Batch insert; skip when no links matched so the SQL stays well-formed.
        print(len(column_list))
        if column_list:
            values = ",".join(column_list)
            insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, score_detail) values{values};"
            # print(insert_column)
            query_mysql(database_config, insert_column)
        return True
    except Exception as e:
        return False  # swallow parse errors but signal failure to the caller
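The examples repeatedly reduce listpage_url to its site root by slicing at the end of the host. A roughly equivalent sketch with urllib.parse; site_root is a hypothetical name, and it assumes the URL carries an http(s) scheme:

from urllib.parse import urlsplit

def site_root(url):
    # Like listpage_url[0:host_code_index] + '/': keep scheme://host/ only.
    parts = urlsplit(url)
    return f"{parts.scheme}://{parts.netloc}/"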
def main():
    extractor_118 = {
        'host': '192.168.1.133',  # note: the variable is named extractor_118 but points at .133
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    # select_column = f"select Article_Current_Node_HTML, Website_URL from column_root_source where 1=1 " \
    #                 f"and Source='baidu_news';"
    select_column = f"select Website_URL,Website_Title,Website_Description,Website_Keywords from column_root_source " \
                    f"where 1=1 and Source='baidu_web' " \
                    f"and Website_Title not like '%新闻%' and Website_Title not like'%资讯%' and Website_Title not like'%论坛%' and Website_Title not like'%社区%'" \
                    f"and Website_Keywords like '%新闻%' and Website_Keywords like'%资讯%' and Website_Keywords like'%论坛%' and Website_Keywords like'%社区%';"

    try:
        results = common.query_mysql(extractor_118, select_column)
    except Exception as e:
        results = []

    column_list = []
    for i in results:
        title = i['Website_Title']
        listpage_url = i['Website_URL']

        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'
        # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)

        # Keep only the first segment of the title. str.split never raises,
        # so the original chain of try/except blocks around each split was
        # unnecessary; the two comma splits presumably covered the full-width
        # ',' and the ASCII ','.
        for sep in ('-', '_', ',', ',', '|', ' '):
            title = title.split(sep)[0].strip()

        level_score = '100'
        Score_Detail = '{"status": True, "message": "root page"}'

        # website_no = 'BAIDU_NEWS'
        website_no = 'BAIDU_WEB'
        column_extraction_deep = 0
        column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        # column_list.append(column)
        print(column)
        # Insert rows one at a time (the batch path via column_list is disabled).
        values = column
        # insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        # print(insert_column)
        try:
            common.query_mysql(extractor_118, insert_column)
        except Exception as e:
            pass  # swallow insert errors (e.g. malformed titles or duplicates)
def main():
    fn = open('listpage.txt', 'r', encoding='UTF-8')  # open the input file
    column_list = []
    for i in fn:
        try:
            title = i.split('=')[0].strip()
            listpage_url = i.split('=')[1].strip()

            domain_code = common.get_domain_code(listpage_url)
            host_code = common.get_host_code(listpage_url)
            host_code_index = listpage_url.index(host_code) + len(host_code)
            listpage_url = listpage_url[0:host_code_index] + '/'
            # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
            md5_source = listpage_url
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            record_md5_id = common.get_token(md5_source)

            if len(title) < 1:
                try:
                    response = requests.get(listpage_url, timeout=5)
                    # print(response.status_code)
                    if response.status_code == 200:
                        encoding = get_encoding(response)
                        # print(encoding)
                        response.encoding = encoding
                        soup = BeautifulSoup(response.text, 'lxml')
                        # print(soup.title.text)
                        title = soup.title.text
                    else:
                        continue
                except:
                    pass
            # Keep only the first segment of the title; these splits never
            # raise, and the original split on '-' twice, so once is enough.
            for sep in ('-', '_'):
                title = title.split(sep)[0].strip()

            level_score, Score_Detail = common.is_need_filter(
                title, listpage_url, True)
            # print(level_score, Score_Detail, title, listpage_url)
            # The -100 threshold accepts nearly everything; the score and
            # detail are then overridden with fixed root-page values.
            if level_score > -100:
                level_score = '100'
                Score_Detail = '{"status": True, "message": "root page"}'

                website_no = 'AD_SELECTED'
                column_extraction_deep = 0
                column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
                # column_list.append(column)
                print(column)
                # Insert the row (the config dict below could be hoisted out of the loop)
                extractor_118 = {
                    'host': '192.168.1.118',
                    'port': 3306,
                    'user': '******',
                    'passwd': 'poms@db',
                    'db': 'mymonitor'
                }
                values = column
                insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
                # print(insert_column)
                try:
                    common.query_mysql(extractor_118, insert_column)
                except Exception as e:
                    pass  # swallow insert errors

        except Exception as e:
            pass  # skip malformed lines

    fn.close()  # close the input file
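The fallback title fetch above calls a get_encoding helper that is not shown in these examples. A minimal sketch of what it plausibly does, using requests' built-in detection as a stand-in:

def get_encoding(response):
    # Best-effort charset pick for a requests.Response (sketch).
    # requests defaults to ISO-8859-1 for text/* without a declared charset,
    # so fall back to apparent_encoding (chardet-based) in that case.
    if response.encoding and response.encoding.lower() != 'iso-8859-1':
        return response.encoding
    return response.apparent_encoding or 'utf-8'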
def parse_html_to_database(database_config, url, column_extraction_deep,
                           domain_code_source, website_no, Is_Need_VPN, text):
    try:
        root = etree.HTML(text, parser=etree.HTMLParser(encoding='utf-8'))
        column_extraction_deep = int(column_extraction_deep) + 1

        items = root.xpath('//a')
        column_list = []
        for num, item in enumerate(items):

            title = "".join(item.xpath('.//text()'))
            listpage_url = "".join(item.xpath('./@href'))
            listpage_url = urljoin(url, listpage_url)
            # Strip punctuation
            title = common.filter_punctuation(title)
            listpage_url = common.match_url(listpage_url)
            # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
            md5_source = listpage_url
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            record_md5_id = common.get_token(md5_source)
            domain_code = common.get_domain_code(listpage_url)
            host_code = common.get_host_code(listpage_url)
            # The link's domain must match the source domain
            if domain_code_source != domain_code:
                continue

            # Keep only links near the top of the page, by this node's position
            # among all <a> nodes: the first 50% when there are fewer than 50
            # nodes, the first 30% between 50 and 200, the first 20% above 200.
            len_items = len(items)
            node_percent = num / len_items
            # print(num, 'percent:{:.0%}'.format(node_percent), title)
            logger.debug(
                str(num) + ' percent:{:.0%} '.format(node_percent) + title)
            if len_items < 50:
                if node_percent > 0.5:
                    continue
            if (len_items >= 50) and (len_items <= 200):
                if node_percent > 0.3:
                    continue
            if len_items > 200:
                if node_percent > 0.2:
                    continue

            # Spam-word and spam-domain filtering
            level_score, score_detail = common.is_need_filter(
                title, listpage_url, True)
            # print(level_score, score_detail)
            logger.debug(str(level_score) + '=' + score_detail)

            # Insertion threshold: news needs a score >= 20, forums >= 10.
            valid_score = 20
            media_type = common.get_media_type(listpage_url)
            if media_type == 'forum':
                valid_score = 10
            if level_score >= valid_score:
                column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{score_detail}')"
                column_list.append(column)

        # Batch insert; skip when nothing passed the score filter, otherwise
        # the generated SQL would end in "values;" and fail.
        logger.info('column count: ' + str(len(column_list)))
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, score_detail) values{values};"
            # print(insert_column)
            count = common.query_mysql(database_config, insert_column)
            # logger.info('insert count: ' + str(count))

    except Exception as e:
        logger.error(str(e))
async def get_response(semaphore,
                       url,
                       column_extraction_deep=1,
                       domain_code_source=None,
                       website_no=None):
    try:
        async with semaphore:
            timeout = aiohttp.ClientTimeout(total=20)
            # Fix for "ValueError: too many file descriptors in select()".
            # Usually this means too many concurrent requests and is solved by
            # lowering the concurrency. It can also appear at modest
            # concurrency after running for a while: some https servers never
            # properly complete the SSL shutdown, so asyncio leaks SSL
            # connections. Per the aiohttp docs, setting the TCPConnector's
            # enable_cleanup_closed=True makes aiohttp additionally abort the
            # underlying transport after 2 seconds (off by default):
            #
            #   session = aiohttp.ClientSession(
            #       connector=aiohttp.TCPConnector(enable_cleanup_closed=True))
            #
            # (Adapted from Ovie, https://www.jianshu.com/p/f7af4466f346)
            connector = aiohttp.TCPConnector(
                limit=60, verify_ssl=False,
                enable_cleanup_closed=True)  # 60 stays below select()'s 64-fd cap; other values work too
            async with aiohttp.ClientSession(timeout=timeout,
                                             connector=connector) as session:
                headers = {
                    "user-agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    root = etree.HTML(
                        text, parser=etree.HTMLParser(encoding='utf-8'))
                    # print(url)
                    logging.info(url)
                    column_extraction_deep = int(column_extraction_deep) + 1

                    items = root.xpath('//a')
                    column_list = []
                    # print(len(items))
                    for item in items:
                        title = "".join(item.xpath('.//text()'))
                        listpage_url = "".join(item.xpath('./@href'))
                        listpage_url = urljoin(url, listpage_url)
                        # Strip punctuation
                        title = common.filter_punctuation(title)
                        listpage_url = common.match_url(listpage_url)
                        # Compute the URL MD5; strip the http(s) scheme and trailing slash first.
                        md5_source = listpage_url
                        md5_source = md5_source.replace('http://', '')
                        md5_source = md5_source.replace('https://', '')
                        md5_source = md5_source.rstrip('/')
                        record_md5_id = common.get_token(md5_source)
                        domain_code = common.get_domain_code(listpage_url)
                        host_code = common.get_host_code(listpage_url)
                        # The link's domain must match the source domain
                        if domain_code_source != domain_code:
                            continue

                        # Spam-word and spam-domain filtering
                        level_score, score_detail = common.is_need_filter(
                            title, listpage_url, False)
                        # print(level_score, score_detail)
                        logging.info(str(level_score) + '=' + score_detail)

                        if level_score > 20:
                            column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{score_detail}')"
                            column_list.append(column)

                    # Batch insert; skip when nothing passed the score filter.
                    # extractor_118 is assumed to be a module-level config dict.
                    if column_list:
                        values = ",".join(column_list)
                        insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, score_detail) values{values};"
                        # print(insert_column)
                        common.query_mysql(extractor_118, insert_column)
                    return True

    except Exception as e:
        if str(e):
            logging.error(str(e))
        return False
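get_response expects a shared semaphore to cap concurrency. A minimal driver sketch; the seed list, semaphore size, and website_no are assumptions:

import asyncio

async def crawl(urls):
    semaphore = asyncio.Semaphore(50)  # cap concurrent requests (assumed value)
    tasks = [
        get_response(semaphore, url,
                     column_extraction_deep=1,
                     domain_code_source=common.get_domain_code(url),
                     website_no='DEMO')  # 'DEMO' is a placeholder
        for url in urls
    ]
    return await asyncio.gather(*tasks)

if __name__ == '__main__':
    seed_urls = ['http://example.com/']  # placeholder seeds
    asyncio.run(crawl(seed_urls))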