コード例 #1
0
async def get_response(i, semaphore):
    """Fetch Sohu-hao author page *i* and record the author in MySQL.

    Relies on module-level ``url_template``, ``common`` and
    ``database_config``. Best-effort: failures are printed and otherwise
    ignored (previously the outer handler swallowed them silently, unlike
    the sibling crawlers which print).
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                url = url_template.format(i)
                async with session.get(url) as response:
                    text = await response.text()
                    text_json = json.loads(text)
                    print(url)
                    if text_json["code"] == 200:
                        try:
                            # Hoist the article list instead of re-indexing
                            # text_json twice.
                            articles = text_json["data"]["pcArticleVOS"]
                            source_name = articles[0]["userId"]
                            last_article_time = articles[0]["publicTime"]
                            print(url, source_name)

                            # NOTE(review): f-string SQL is injection-prone;
                            # kept because common.query_mysql takes raw SQL.
                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","sohuhao");'
                            common.query_mysql(database_config, insert_sql)

                        except Exception as e:
                            print(e)

            except Exception as e:
                # Was a silent `pass`; print like the sibling crawlers do.
                print(e)
コード例 #2
0
async def get_response(i, semaphore):
    """Probe Beijing-shijian-hao feed *i* and store its author row in MySQL.

    Uses module-level ``url_template``, ``common`` and ``database_config``;
    all failures are printed and swallowed.
    """
    ua = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
    }
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                url = url_template.format(i)
                async with session.get(url, headers=ua) as response:
                    payload = json.loads(await response.text())
                    if len(payload["data"]) > 0:
                        try:
                            entry = payload["data"][0]["data"]
                            source_name = entry["source"]
                            last_article_time = entry["pdate"]
                            # Widen the stored URL to a 100-row page.
                            url = url.replace("&pageRow=1&", "&pageRow=100&")
                            print(url, source_name)

                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","beijingshijianhao");'
                            common.query_mysql(database_config, insert_sql)

                        except Exception as e:
                            print(e)

            except Exception as e:
                print(e)
コード例 #3
0
def initialization_new_listpage_url_cache(database_config):
    """Re-seed new_listpage_url_cache with each website's main-page URL.

    Reads every row from ``website`` and inserts its main page as a depth-1,
    score-100 root entry. The purge of previously collected cache rows is
    currently disabled (commented out below).

    :param database_config: connection dict passed through to common.query_mysql
    """
    # Cache-purge statement; its execution is disabled on the next line.
    delete_cache_sql = f"delete from new_listpage_url_cache where website_no in (select website_no from website);"
    # common.query_mysql(database_config, delete_cache_sql)
    website_sql = 'select * from website;'
    website_result = common.query_mysql(database_config, website_sql)
    for website in website_result:
        Website_No = website['Website_No']
        Listpage_Title = website['Website_Name']
        Listpage_URL = website['Website_Main_Page_URL']
        Is_Need_VPN = website['Is_Need_VPN']
        # URL MD5 is computed with the scheme and trailing slash removed.
        md5_source = Listpage_URL
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        Listpage_URL_MD5 = common.get_token(md5_source)
        Domain_Code = common.get_domain_code(Listpage_URL)
        Host_Code = common.get_host_code(Listpage_URL)
        Score_Detail = '{"status": True, "message": "root page"}'
        # Insert the main page as a root entry of the cache.
        insert_url_to_cache = f"""
                insert ignore into new_listpage_url_cache(Column_Extraction_Deep, Listpage_URL, 
                Listpage_Title, Domain_Code, Host_Code, Listpage_URL_MD5, Level_Score, Score_Detail, Website_No, Is_Need_VPN) 
                value(1, '{Listpage_URL}', '{Listpage_Title}', '{Domain_Code}', '{Host_Code}', '{Listpage_URL_MD5}', 
                100, '{Score_Detail}', '{Website_No}', {Is_Need_VPN});
                """
        print(insert_url_to_cache)
        common.query_mysql(database_config, insert_url_to_cache)
コード例 #4
0
async def get_response(i, semaphore):
    """Scrape a Yidian-hao channel page *i* and upsert the author record.

    Parses the embedded ``yidian.docinfo`` JSON blob for the channel name and
    follower count; best-effort, so network/parse errors are swallowed.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    body = await response.text()
                    # Pull the inline JSON document out of the page source.
                    json_text = re.findall('yidian.docinfo = ({.*});</script>',
                                           body)[0]
                    doc = json.loads(json_text)
                    source_name = doc["channel_name"]
                    follower_count = doc["bookcount"].split('人订阅')[0]
                    # "1.2万" style counts are scaled to a plain number.
                    if '万' in follower_count:
                        follower_count = float(
                            follower_count.replace('万', '')) * 10000
                    print(json_text)

                    if len(source_name) > 0:
                        try:
                            print(url, source_name)
                            insert_sql = f'replace into author_other(author_name,author_url,author_id,follower_count,website) VALUES("{source_name}","{url}","{i}",{follower_count},"yidianhao");'
                            common.query_mysql(database_config, insert_sql)

                        except Exception as e:
                            print(e)

            except Exception as e:
                # Best-effort crawl: swallow network/parse errors.
                pass
コード例 #5
0
def main():
    """Seed column_link on 118 with one root listpage per Baidu-News domain.

    Reads distinct article sources from article_detail on 116, truncates each
    article URL to its host root, and replace-inserts one column_link row per
    domain. Dead locals from the original (an unused accumulator list and a
    redundant alias) were removed, and per-row insert failures are now printed
    instead of silently swallowed.
    """
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f"""select ad.Article_URL, ad.Domain_Code, ad.Article_Source from article_detail ad
                    where 1=1 and ad.Website_No in (select Website_No from website where Website_Name like'%百度新闻%')
                    and ad.Extracted_Time>'2020-10-01' and ad.Extracted_Time<'2020-11-25'
                    and Article_Source is not NULL and Article_Source !='' GROUP BY Domain_Code;"""

    try:
        results = common.query_mysql(extractor_116, select_column)
    except Exception:
        results = []  # best-effort: treat a failed query as "nothing to do"

    for i in results:
        title = i['Article_Source']
        listpage_url = i['Article_URL']

        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        # Truncate the article URL down to "<scheme>://<host>/".
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'

        # URL MD5 is computed with the scheme and trailing slash removed.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)

        level_score = '100'
        Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'
        website_no = 'BAIDU_NEWS'
        column_extraction_deep = 0
        values = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        print(values)
        insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        try:
            common.query_mysql(extractor_118, insert_column)
        except Exception as e:
            print(e)  # was silently swallowed; surface per-row insert failures
コード例 #6
0
def main(start, end):
    """Copy column_link rows [start, end] from 118 into cloud_listpage_url on
    116, assigning each URL a random cloud server ID and inserting in batches.

    Fixes from the original: batches are now exactly 1000 rows (the old
    count-based check flushed 1001 rows the first time), and an empty final
    batch no longer produces an invalid ``values;`` statement.

    :param start: first Column_Link_ID (inclusive)
    :param end: last Column_Link_ID (inclusive)
    """
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    # Read one ID range per call; callers slice the table so that rows with
    # the same domain tend to be scheduled together.
    select_column = f"select title,url,domain_code,record_md5_id from column_link where Column_Link_ID BETWEEN {start} AND {end};"
    insert_pattern = ("insert ignore into cloud_listpage_url(Cloud_Server_ID, "
                     "Listpage_URL, ListPage_Title, Domain_Code, Record_MD5_ID) "
                     "values{};")

    def _flush(batch):
        # Insert one batch; an empty batch would build invalid SQL, so skip it.
        if batch:
            common.query_mysql(extractor_116,
                               insert_pattern.format(",".join(batch)))

    whitespace = re.compile(r'\s+')
    try:
        results = common.query_mysql(extractor_118, select_column)
        print('len(results): ', len(results))
        column_list = []
        for item in results:
            # Spread URLs randomly across server IDs 10000-10062.
            Cloud_Server_ID = random.randint(10000, 10062)
            # Strip all whitespace/newlines from the URL.
            Listpage_URL = whitespace.sub('', item['url'])
            column_list.append(
                f"({Cloud_Server_ID}, '{Listpage_URL}', '{item['title']}', "
                f"'{item['domain_code']}', '{item['record_md5_id']}')")

            # Flush every 1000 rows.
            if len(column_list) >= 1000:
                print(len(column_list))
                _flush(column_list)
                column_list = []

        print('len(column_list): ', len(column_list))
        _flush(column_list)  # remainder; no-op when empty

    except Exception as e:
        print(e)
コード例 #7
0
def update_score():
    """Score up to 1000 unscored column_link rows in one CASE-based UPDATE.

    Fetches rows whose Level_Score is NULL, computes a score/status per row
    via common.is_need_filter, then issues a single batched UPDATE using
    CASE ... WHEN so only one write round-trip is needed.

    Fixes from the original: the IN clause was built with ``str(tuple)``,
    which renders a single ID as ``(5,)`` — invalid SQL — and an empty result
    set produced ``IN ()``; IDs are now joined manually and the empty case
    returns early.
    """
    config = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }

    select_sql = "select Column_Link_ID,URL,Title from column_link where Level_Score is null limit 1000;"
    # {2} is a comma-separated ID list (no tuple repr).
    update_sql_pattern = "UPDATE column_link SET " \
                         "Level_Score = CASE Column_Link_ID {0} END, " \
                         "Score_Detail = CASE Column_Link_ID {1} END  " \
                         "WHERE Column_Link_ID IN ({2});"
    when_then_score_pattern = " WHEN {} THEN {} "
    when_then_detail_pattern = " WHEN {} THEN '{}' "
    id_list = []
    try:
        results = common.query_mysql(config, select_sql)
        print(results)

        when_then_score = ""
        when_then_detail = ""
        for row in results:
            Column_Link_ID = row['Column_Link_ID']
            level_score, status = common.is_need_filter(
                row['Title'], row['URL'], False)
            when_then_score += when_then_score_pattern.format(
                Column_Link_ID, level_score)
            when_then_detail += when_then_detail_pattern.format(
                Column_Link_ID, status)
            id_list.append(Column_Link_ID)

        if not id_list:
            return  # nothing to score; "IN ()" would be invalid SQL

        id_csv = ",".join(str(link_id) for link_id in id_list)
        sql = update_sql_pattern.format(when_then_score, when_then_detail,
                                        id_csv)
        print(sql)
        try:
            common.query_mysql(config, sql)
        except Exception as e:
            print(e)

    except Exception as e:
        print(e)
コード例 #8
0
def main(start, end):
    """Copy column_link_2 rows [start, end] on 118 into column_link.

    Recomputes each URL's MD5 (scheme and trailing slash stripped) and batch
    inserts the whole range in one statement. An empty ID range no longer
    builds an invalid ``values;`` statement.

    :param start: first Column_Link_ID (inclusive)
    :param end: last Column_Link_ID (inclusive)
    """
    extractor_133 = {
        'host': '192.168.1.133',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f"select * from column_link_2 where Column_Link_ID BETWEEN {start} AND {end};"

    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        for item in results:
            URL = item['URL']
            # URL MD5 is computed with the scheme and trailing slash removed.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            column = (
                f"('{item['Extracted_flag']}', {item['Column_Extraction_Deep']}, "
                f"'{URL}', '{item['Title']}', '{item['Domain_Code']}', "
                f"'{item['Host_Code']}', '{Record_MD5_ID}', {item['Level_Score']}, "
                f"'{item['Score_Detail']}', '{item['Website_No']}', {item['Is_User_Added']})")
            column_list.append(column)

        # Batch insert; skip entirely when the range selected nothing.
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into column_link(Extracted_flag, Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No, Is_User_Added) values{values};"
            common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
コード例 #9
0
def main(start, end):
    """Promote cloud listpage URLs in [start, end] into column_link on 118,
    keeping only rows whose filter score exceeds 20."""
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f"select Website_No,ListPage_URL,ListPage_Title from listpage_url where ListPage_URL_ID IN " \
                    f"(select ListPage_URL_ID from cloud_listpage_url where cloud_listpage_url_id BETWEEN {start} AND {end});"

    try:
        rows = common.query_mysql(extractor_116, select_column)
        kept = []
        for row in rows:
            page_url = row['ListPage_URL']
            page_title = row['ListPage_Title']
            site_no = row['Website_No']
            domain = common.get_domain_code(page_url)
            host = common.get_host_code(page_url)
            # MD5 source drops the scheme and any trailing slash.
            stripped = page_url.replace('http://', '').replace(
                'https://', '').rstrip('/')
            md5_id = common.get_token(stripped)
            score, detail = common.is_need_filter(page_title, page_url)
            deep = 1

            record = f"({deep}, '{page_url}', '{page_title}', '{domain}', '{host}', '{md5_id}', {score}, '{detail}', '{site_no}')"
            # Only rows that pass the score threshold are kept.
            if score > 20:
                kept.append(record)

        # Single batched insert for everything that survived the filter.
        joined = ",".join(kept)
        insert_column = f"insert ignore into column_link(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{joined};"
        common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
コード例 #10
0
def insert_into_center_database(input_data_list):
    """Batch-insert listpage rows into listpage_url on the 116 center DB.

    :param input_data_list: list of dicts keyed by listpage_url column names,
        e.g. [{"ListPage_Title": "1234", "ListPage_URL": "https://v2.sohu.com/1", ...},
              {"ListPage_Title": "2345", "ListPage_URL": "https://weibo.com/p/aj/v6/mblog/", ...}]
        Each dict must provide ListPage_URL, ListPage_Title, Domain_Code,
        Host_Code and Last_Check_Score_Text.
    :return: None

    Fix from the original: an empty input list produced an invalid
    ``values;`` statement; it now returns early.
    """
    center_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }

    if not input_data_list:
        return  # nothing to insert; empty VALUES would be invalid SQL

    column_list = []
    for i in input_data_list:

        website_no = 'S18605'
        listpage_url = i['ListPage_URL']
        listpage_title = i['ListPage_Title']
        domain_code = i['Domain_Code']
        host_code = i['Host_Code']
        last_check_score_text = i['Last_Check_Score_Text']
        listpage_save_rule = 3
        is_enabled = 1
        linkurl_min_length = 10
        linktext_min_length = 4

        # URL MD5 source: drop the scheme and trailing slash, then lowercase.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        md5_source = md5_source.lower()
        listpage_url_dm5 = common.get_token(md5_source)

        column = f"('{website_no}', '{listpage_url}', '{listpage_url_dm5}','{listpage_title}','{domain_code}','{host_code}',{last_check_score_text},{listpage_save_rule},{is_enabled},{linkurl_min_length},{linktext_min_length})"
        column_list.append(column)

    values = ",".join(column_list)
    insert_column = f"insert ignore into listpage_url(Website_No,ListPage_URL,ListPage_URL_MD5,ListPage_Title,Domain_Code,Host_Code,Last_Check_Score_Text,ListPage_Save_Rule,Is_Enabled,LinkURL_Min_Length,LinkText_Min_Length) values{values};"
    try:
        common.query_mysql(center_116, insert_column)
    except Exception as e:
        print(e)
コード例 #11
0
def main(start_server_id, end_server_id, count):
    """Assign up to *count* unassigned schedules to each server in the range.

    Iterates server IDs [start_server_id, end_server_id) and claims the
    lowest-numbered unassigned cloud_task_schedule rows for each one.
    """
    db_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}

    for server_id in range(start_server_id, end_server_id):
        print(server_id)
        update_website = f'''
            update cloud_task_schedule set cloud_server_id={server_id} where 1=1 
            and schedule_id in (select * from 
            (select Schedule_ID from cloud_task_schedule where 1=1 and Cloud_Server_ID is null order by Schedule_ID limit {count})aa
            );
        '''
        print(update_website)
        try:
            common.query_mysql(db_116, update_website)
        except Exception as e:
            print(e)
コード例 #12
0
def main():
    """Seed column_link_oversea from website rows with ID > 1577.

    For each website: recompute the URL MD5, write it back to the website
    row, and insert a root (depth 0, score 100) entry. The batch-insert
    variant is kept below, commented out; rows are inserted one by one.
    """
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }
    select_column = f"select * from website where ID>1577;"

    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        for item in results:
            ID = item['ID']
            Column_Extraction_Deep = '0'
            URL = item['URL']
            Title = item['Title']
            Domain_Code = common.get_domain_code(URL)
            Host_Code = common.get_host_code(URL)
            # URL MD5 is computed with the scheme and trailing slash removed.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score = '100'
            Score_Detail = '{"status": True, "message": "root page"}'
            Website_No = 'OVERSEA'
            column = f"({Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}')"
            column_list.append(column)
            # Write the recomputed MD5 back to the website row.
            update_website = f"update website set record_md5_id='{Record_MD5_ID}' where ID={ID}"
            common.query_mysql(extractor_118, update_website)
            insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{column};"
            print(insert_column)
            common.query_mysql(extractor_118, insert_column)

        # # Batch insert (disabled; rows are inserted one by one above)
        # values = ",".join(column_list)
        # insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{values};"
        # print(insert_column)
        # common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
コード例 #13
0
def update_host_code():
    """Backfill host_code for up to 1000 column_link rows in one UPDATE.

    Selects rows whose host_code is NULL, derives the host from the URL, and
    issues a single CASE-based UPDATE. Hosts longer than 50 characters are
    skipped (presumably the column width limit — TODO confirm).

    Fixes from the original: the IN clause used ``str(tuple)``, which
    renders a single ID as ``(5,)`` — invalid SQL — and an empty result set
    produced ``IN ()``; IDs are now joined manually and the empty case
    returns early.
    """
    config = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }

    select_sql = "select Column_Link_ID,URL from column_link where host_code is null limit 1000;"
    # Second placeholder is a comma-separated ID list (no tuple repr).
    update_sql_pattern = "UPDATE column_link SET host_code = CASE Column_Link_ID {} END WHERE Column_Link_ID IN ({});"
    when_then_pattern = " WHEN {} THEN '{}' "
    id_list = []
    try:
        results = common.query_mysql(config, select_sql)

        when_then = ""
        for row in results:
            Column_Link_ID = row['Column_Link_ID']
            host_code = common.get_host_code(row['URL'])
            if len(host_code) > 50:
                continue
            when_then += when_then_pattern.format(Column_Link_ID, host_code)
            id_list.append(Column_Link_ID)

        if not id_list:
            return  # nothing to update; "IN ()" would be invalid SQL

        id_csv = ",".join(str(link_id) for link_id in id_list)
        sql = update_sql_pattern.format(when_then, id_csv)
        print(sql)
        try:
            common.query_mysql(config, sql)
        except Exception as e:
            print(e)

    except Exception as e:
        print(e)
コード例 #14
0
async def get_response(i, semaphore):
    """Crawl Qilu-yidian author page *i*; insert the author if the newest
    article ID is above the watermark.

    Best-effort: any failure (missing xpath match, network error) is
    swallowed by the outer handler.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    "user-agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    root = etree.HTML(
                        text, parser=etree.HTMLParser(encoding='utf-8'))

                    article_url = "".join(
                        root.xpath(
                            '//*[@id="content-list"]/div[2]/div/div[1]/a/@href'
                        ))
                    # Extract the numeric ID from ".../news/show/id/<id>.html".
                    # The original used str.strip(), which strips a *character
                    # set* (not a prefix/suffix) and only worked by accident.
                    # On no match, the AttributeError is swallowed below, just
                    # as int('') used to be.
                    article_url_id = int(
                        re.search(r'/news/show/id/(\d+)\.html',
                                  article_url).group(1))
                    print(article_url_id)

                    # Watermark: http://www.ql1d.com/news/show/id/11503356.html
                    if article_url_id > 11503356:
                        try:
                            source_name = "".join(
                                root.xpath(
                                    '//*[@id="content-list"]/div[1]/div/div/div[2]/div[1]/text()'
                                ))
                            print(url, source_name)

                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","qiluyidianhao");'
                            common.query_mysql(database_config, insert_sql)

                        except Exception as e:
                            print(e)

            except Exception as e:
                pass
コード例 #15
0
def create_task(loop, database_config):
    """Fetch up to 1000 pending column_link rows, crawl them concurrently,
    then flag them as processed.

    :param loop: asyncio event loop used to drive the gathered tasks
    :param database_config: connection dict for common.query_mysql
    :return: number of gathered crawl results, or None on failure
    """
    semaphore = asyncio.Semaphore(50)  # cap concurrency at 50
    try:
        tasks = []
        # Select the pending crawl targets (Extracted_flag still NULL).
        select_column = f"""
            select Column_Link_ID, Column_Extraction_Deep, URL, Domain_Code, Website_No, Extracted_flag 
            from column_link where Extracted_flag is null
            ORDER BY Column_Extraction_Deep limit 1000;
            """
        print('=====query new tasks=====')
        target_items = common.query_mysql(database_config, select_column)
        print('=====start tasks=====')
        id_list = [0]
        for item in target_items:
            id_list.append(item["Column_Link_ID"])
            url = item["URL"]
            domain_code = item["Domain_Code"]
            website_no = item["Website_No"]
            column_extraction_deep = item["Column_Extraction_Deep"]
            # Crawl at most 3 levels deep.
            if column_extraction_deep <= 3:
                task = asyncio.ensure_future(
                    get_response(database_config, semaphore, url,
                                 column_extraction_deep, domain_code,
                                 website_no))
                tasks.append(task)

        results = loop.run_until_complete(asyncio.gather(*tasks))
        # print(results)
        print('=====finish tasks=====')
        # Flag every selected row as processed. NOTE(review): when no rows
        # were selected, id_list is just (0,) and str(tuple) renders "(0,)",
        # which MySQL rejects — the error lands in the except below.
        id_list = tuple(id_list)
        update_flag = f"update column_link set Extracted_flag='S' where Column_Link_ID in {id_list};"
        print('=====update flag=====')
        common.query_mysql(database_config, update_flag)
        return len(results)

    except Exception as e:
        logger.error(str(e))
        return None
コード例 #16
0
def get_url_list():
    """Return the domains and listpage URLs that need checking.

    Selects every listpage_url row whose website carries tag 105 and whose
    domain_code is set.

    :return: result rows from query_mysql
    """
    sql = """
            select Domain_Code,listpage_url from listpage_url where website_no in
            (select website_no from website_tags where Website_Tag_ID=105)
            and domain_code is not null
        """
    return query_mysql(config_116, sql)
コード例 #17
0
def main():
    """Build cloud_task_schedule rows from per-server URL counts on 116."""
    db_config = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }

    # One schedule row per cloud server, carrying its listpage-URL count.
    insert_website = '''
        insert into cloud_task_schedule(Cloud_Server_Count,cloud_server_id,ListPage_URL_Count)
        select 63,cloud_server_id,count(1) from cloud_listpage_url
        GROUP BY cloud_server_id order by cloud_server_id;
    '''
    print(insert_website)

    try:
        common.query_mysql(db_config, insert_website)
    except Exception as e:
        print(e)
コード例 #18
0
def create_task(loop, semaphore, database_config):
    """Fetch up to 200 pending cache rows, crawl them concurrently, then flag
    them as processed.

    :param loop: asyncio event loop used to drive the gathered tasks
    :param semaphore: shared asyncio.Semaphore limiting crawl concurrency
    :param database_config: connection dict for common.query_mysql
    :return: number of selected rows, or None on failure
    """
    try:
        tasks = []
        # Select the pending crawl targets (Extracted_flag still NULL).
        select_column = f"""
            select New_Listpage_URL_ID, Column_Extraction_Deep, Listpage_URL, Domain_Code, Website_No, Extracted_flag, Is_Need_VPN 
            from new_listpage_url_cache
            where Extracted_flag is null 
            ORDER BY Column_Extraction_Deep limit 200;
            """
        target_items = common.query_mysql(database_config, select_column)
        id_list = [0]
        for item in target_items:
            id_list.append(item["New_Listpage_URL_ID"])
            url = item["Listpage_URL"]
            domain_code = item["Domain_Code"]
            website_no = item["Website_No"]
            column_extraction_deep = item["Column_Extraction_Deep"]
            Is_Need_VPN = item["Is_Need_VPN"]
            # Crawl at most 3 levels deep.
            if column_extraction_deep <= 3:
                task = asyncio.ensure_future(
                    get_response(database_config, semaphore, url,
                                 column_extraction_deep, domain_code,
                                 website_no, Is_Need_VPN))
                tasks.append(task)

        results = loop.run_until_complete(asyncio.gather(*tasks))
        print(results)
        # Flag every selected row as processed. NOTE(review): when no rows
        # were selected, id_list is just (0,) and str(tuple) renders "(0,)",
        # which MySQL rejects — the error lands in the except below.
        id_list = tuple(id_list)
        update_flag = f"update new_listpage_url_cache set Extracted_flag='S' where New_Listpage_URL_ID in {id_list};"
        common.query_mysql(database_config, update_flag)
        return len(target_items)

    except Exception as e:
        if len(str(e)) > 0:
            logger.error(str(e))
        return None
コード例 #19
0
def main():
    """Propagate each schedule's server assignment to its cloud URLs."""
    db_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }

    select_sql = 'select cloud_server_id,website_no from cloud_task_schedule;'

    try:
        # For every schedule row, stamp its server ID onto the matching URLs.
        for row in common.query_mysql(db_116, select_sql):
            server_id = row['cloud_server_id']
            site_no = row['website_no']
            print(server_id, site_no)
            update_cloud = f"update cloud_listpage_url set Cloud_Server_ID={server_id} where website_no='{site_no}';"
            print(update_cloud)
            common.query_mysql(db_116, update_cloud)
    except Exception as e:
        print(e)
def main():
    """Delete rejected URLs from the configured table, sparing any URL whose
    host has a PR value above 3 on the 116 monitor DB."""
    # Load connection settings from the config file.
    host = conf.get("database", "host")
    port = conf.get("database", "port")
    user = conf.get("database", "user")
    passwd = conf.get("database", "passwd")
    db = conf.get("database", "db")
    table = conf.get("database", "table")
    reject_url_file = 'reject_url.txt'

    # Database configurations.
    database_config_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    database_config = {'host': host, 'port': int(port), 'user': user, 'passwd': passwd, 'db': db}

    # Collect domain codes (actually hosts) whose PR value on 116 exceeds 3.
    domain_sql = 'select Domain_Code from domain where PR_Value is not null and PR_Value>3'
    host_with_pr_value = [
        row['Domain_Code']
        for row in common.query_mysql(database_config_116, domain_sql)
    ]

    # Walk the reject list and delete everything not protected by PR value.
    reject_url_list = get_reject_domain_list(reject_url_file)
    for url in reject_url_list:
        host = common.get_host_code(url)
        if host in host_with_pr_value:
            print('-------------------------url didnt deleted:', host, url)
            logger.info('-------------------------url didnt deleted: ' + url)
        else:
            url_md5 = common.get_token(common.get_url_remove_http(url))
            sql = f'''delete from {table} where Record_MD5_ID='{url_md5}';'''
            result = common.query_mysql(database_config, sql)
            print(url, 'was deleted', result)
コード例 #21
0
ファイル: qiehao.py プロジェクト: wschxida/datasource_manager
async def get_response(i, semaphore):
    """Fetch qiehao author page *i* and, when it has articles, record the
    author into author_qiehao.

    Concurrency is bounded by *semaphore*; relies on module-level
    `url_template`, `database_config`, `common` and `json`/`aiohttp` imports.
    Failures for a single id are ignored (best-effort crawl).
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    "user-agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    text_json = json.loads(text)
                    newslist = text_json["newslist"]
                    if len(newslist) > 0:
                        try:
                            try:
                                source_name = text_json["newslist"][0][
                                    "source"]
                            except (KeyError, IndexError, TypeError):
                                # was a bare `except:` — narrowed so KeyboardInterrupt/
                                # SystemExit/CancelledError are no longer swallowed
                                source_name = text_json["newslist"][0][
                                    "media_id"]

                            # Kept although unused: a record without "timestamp"
                            # raises here, which skips the insert below — this
                            # preserves the original behavior as a validity check.
                            last_article_time = text_json["newslist"][0][
                                "timestamp"]
                            print(url, source_name)

                            # NOTE(review): values are interpolated straight into SQL;
                            # acceptable only while source_name comes from this API.
                            # Prefer a parameterized query if common.query_mysql allows.
                            insert_sql = f'insert ignore into author_qiehao(author_name,author_url,author_id) VALUES("{source_name}","{url}","{i}");'
                            common.query_mysql(database_config, insert_sql)

                        except Exception as e:
                            print(e)

            except Exception:
                # best-effort: network / JSON / schema failures for one id are ignored
                pass
コード例 #22
0
def main():
    """Copy freshly collected listpage URLs from the cache table into
    new_listpage_url, using DB settings from the parameter file."""
    # 导入参数文件 — build the connection config straight from the [database] section
    database_config = {
        'host': conf.get("database", "host"),
        'port': int(conf.get("database", "port")),
        'user': conf.get("database", "user"),
        'passwd': conf.get("database", "passwd"),
        'db': conf.get("database", "db"),
    }

    # 插入新采集的栏目到new_listpage_url — dedup handled by `insert ignore`
    insert_url_to_new = """
                insert ignore into new_listpage_url
                select * from new_listpage_url_cache;
                """
    common.query_mysql(database_config, insert_url_to_new)
コード例 #23
0
def main(website_no, start, end):
    """Stamp cloud_listpage_url rows whose ID lies in [start, end] with the
    given website number on the 116 database."""
    db_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }

    # ID每增长1万个,编入同一个网站编号 — every 10k-ID band shares one website No.
    update_website = f"update cloud_listpage_url set Website_No='{website_no}' where Cloud_Listpage_URL_ID BETWEEN {start} AND {end};"
    print(update_website)

    try:
        print(common.query_mysql(db_116, update_website))
    except Exception as e:
        print(e)
コード例 #24
0
def input_from_column_link():
    """Page through the GUOWAI rows of column_link on 192.168.1.133 in
    1000-row batches and forward each batch to the center database."""
    source_db = {
        'host': '192.168.1.133',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    select_column_pattern = "select URL as ListPage_URL,Title as ListPage_Title,Domain_Code,Host_Code,Level_Score as Last_Check_Score_Text " \
                            "from column_link where Website_No='GUOWAI' ORDER BY Host_Code limit {},{};"

    # 130 batches x 1000 rows — offsets 0, 1000, 2000, ...
    for batch in range(130):
        select_column = select_column_pattern.format(batch * 1000, 1000)
        print(select_column)
        try:
            rows = common.query_mysql(source_db, select_column)
            insert_into_center_database(rows)
        except Exception as e:
            print(e)
コード例 #25
0
def parse_html_to_database(database_config, url, column_extraction_deep,
                           domain_code_source, website_no, Is_Need_VPN, text):
    """Extract candidate column (listpage) links from the HTML in *text* and
    bulk-insert the ones that pass position and spam-score filters into
    column_link.

    Only anchors on the same domain as the source page are considered, and
    only those in the leading portion of the page (navigation area). Errors
    are logged, never raised.
    """
    try:
        root = etree.HTML(text, parser=etree.HTMLParser(encoding='utf-8'))
        column_extraction_deep = int(column_extraction_deep) + 1

        items = root.xpath('//a')
        len_items = len(items)  # hoisted: loop-invariant, was recomputed per anchor
        column_list = []
        for num, item in enumerate(items):

            title = "".join(item.xpath('.//text()'))
            listpage_url = "".join(item.xpath('./@href'))
            listpage_url = urljoin(url, listpage_url)
            # 去掉标点符号 — strip punctuation from the title, normalize the URL
            title = common.filter_punctuation(title)
            listpage_url = common.match_url(listpage_url)
            # 计算url MD5 — MD5 over the URL without scheme and trailing slash
            md5_source = listpage_url
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            record_md5_id = common.get_token(md5_source)
            domain_code = common.get_domain_code(listpage_url)
            host_code = common.get_host_code(listpage_url)
            # domain 要与源域名一致 — keep only same-domain links
            if domain_code_source != domain_code:
                continue

            # Positional filter: keep the first 50% of anchors on small pages
            # (<50), first 30% on medium (50-200), first 20% on large (>200).
            node_percent = num / len_items
            logger.debug(
                str(num) + 'percent:{:.0%}'.format(node_percent) + title)
            if len_items < 50 and node_percent > 0.5:
                continue
            if 50 <= len_items <= 200 and node_percent > 0.3:
                continue
            if len_items > 200 and node_percent > 0.2:
                continue

            # 垃圾词、垃圾域名过滤 — spam word / spam domain scoring
            level_score, score_detail = common.is_need_filter(
                title, listpage_url, True)
            logger.debug(str(level_score) + '=' + score_detail)

            # 入库分值 — insertion threshold: news >= 20, forum >= 10
            valid_score = 20
            media_type = common.get_media_type(listpage_url)
            if media_type == 'forum':
                valid_score = 10
            if level_score >= valid_score:
                column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{score_detail}')"
                column_list.append(column)

        # 批量插入 — bulk insert; skip when nothing qualified, which fixes the
        # original's malformed "... values;" statement sent to MySQL on empty pages
        logger.info('column count: ' + str(len(column_list)))
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, score_detail) values{values};"
            count = common.query_mysql(database_config, insert_column)

    except Exception as e:
        logger.error(str(e))
コード例 #26
0
def main():
    """Select baidu_web root sources from column_root_source on 192.168.1.133,
    normalize each site's URL/title and insert it as a root page into
    column_link."""
    extractor_118 = {
        'host': '192.168.1.133',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    # Sites whose title does NOT look like a news/forum portal but whose
    # keywords DO — candidates mislabelled by their own title.
    select_column = f"select Website_URL,Website_Title,Website_Description,Website_Keywords from column_root_source " \
                    f"where 1=1 and Source='baidu_web' " \
                    f"and Website_Title not like '%新闻%' and Website_Title not like'%资讯%' and Website_Title not like'%论坛%' and Website_Title not like'%社区%'" \
                    f"and Website_Keywords like '%新闻%' and Website_Keywords like'%资讯%' and Website_Keywords like'%论坛%' and Website_Keywords like'%社区%';"

    try:
        results = common.query_mysql(extractor_118, select_column)
    except Exception:
        results = []

    for row in results:
        title = row['Website_Title']
        listpage_url = row['Website_URL']

        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        # truncate the URL right after the host and force a trailing slash
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'
        # 计算url MD5 — MD5 over the URL without scheme and trailing slash
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)

        # Keep only the site name: cut the title at each separator in the
        # original order. The guard is kept because Website_Title can be
        # NULL (None), in which case .split raises and the title is left as-is.
        for sep in ('-', '_', ',', ',', '|', ' '):
            try:
                title = title.split(sep)[0].strip()
            except Exception:
                pass

        level_score = '100'
        Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'

        website_no = 'BAIDU_WEB'
        column_extraction_deep = 0
        column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        print(column)
        # single-row insert; dedup via `insert ignore`
        values = column
        insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        try:
            common.query_mysql(extractor_118, insert_column)
        except Exception:
            pass
コード例 #27
0
def main(start, end):
    """Copy Sina Weibo author-list listpages with IDs in [start, end] from
    the 116 monitor DB into author_weibo on the 118 datasource DB,
    extracting the author id from each URL. Inserts are batched 10k rows."""
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = f'''
        SELECT
            lu.listpage_url,
            lu.listpage_title,
            w.Website_Important_Level
        FROM
            listpage_url lu
        LEFT JOIN website w ON lu.website_no = w.website_no
        WHERE
            w.Website_Name like'%新浪微博%' 
            and w.Website_Name like'%作者列表%'
            and lu.ListPage_URL_ID BETWEEN {start} AND {end};
    '''

    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        count = 0
        print(len(results))
        for item in results:
            author_name = item['listpage_title']
            author_url = item['listpage_url']
            # Derive the author id by trying each known URL shape in turn
            # (EAFP: split raises IndexError when the marker is absent).
            try:
                author_id = author_url.split('&id=100505')[1].split(
                    '&feed_type')[0]
                author_type = 'user'
            except Exception:
                try:
                    author_id = author_url.split('weibo.com/u/')[1].replace(
                        '?is_all=1', '')
                    author_type = 'user'
                except Exception:
                    try:
                        author_id = author_url.split(
                            'weibo.com/p/')[1].replace('/wenzhang', '')
                        author_type = 'page'
                    except Exception:
                        # unknown shape: fall back to the title as the id
                        author_id = author_name
                        author_type = ''

            is_added = 1
            column = f"('{author_name}', '{author_url}', '{author_id}', '{author_type}', {is_added})"
            column_list.append(column)

            # 每1万条插入一次,分批 — flush every 10000 rows
            if count > 0 and not count % 10000:
                values = ",".join(column_list)
                insert_column = f"insert ignore into author_weibo(author_name, author_url, author_id, author_type, is_added) values{values};"
                common.query_mysql(extractor_118, insert_column)
                column_list = []
                print(count)

            count += 1

        # flush the remainder; skip when empty — the original unconditionally
        # executed a malformed "... values;" statement in that case
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into author_weibo(author_name, author_url, author_id, author_type, is_added) values{values};"
            common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)
コード例 #28
0
# NOTE(review): script fragment — `query`, `results`, `total`, `es`,
# `extractor_118` and `common` are defined earlier in the original script,
# outside this excerpt; the final insert of the scroll loop is also cut off.
scroll_id = query['_scroll_id']  # scroll cursor used to page through every ES hit


# First page of hits: build one SQL VALUES tuple per baijiahao author.
column_list = []
for item in results:
    author_name = item['_source']['title'].strip()
    author_url = item['_source']['listpage_url']
    # assumes every listpage_url contains 'app_id=' — TODO confirm upstream query
    author_id = author_url.split('app_id=')[1]
    is_added = 1
    column = f"('{author_name}', '{author_url}', '{author_id}', {is_added})"
    column_list.append(column)

# Bulk insert of the first page; dedup via `insert ignore`.
values = ",".join(column_list)
insert_column = f"insert ignore into author_baijiahao(author_name, author_url, author_id, is_added) values{values};"
print(insert_column)
common.query_mysql(extractor_118, insert_column)


# Remaining pages: 100 hits per scroll call, hence total/100 (+1) iterations.
for i in range(0, int(total/100)+1):
    column_list = []
    # the `scroll` keep-alive must be given, otherwise ES raises an error
    query_scroll = es.scroll(scroll_id=scroll_id, scroll='5m')['hits']['hits']
    for item in query_scroll:
        author_name = item['_source']['title'].strip()
        author_url = item['_source']['listpage_url']
        author_id = author_url.split('app_id=')[1]
        is_added = 1
        column = f"('{author_name}', '{author_url}', '{author_id}', {is_added})"
        column_list.append(column)

    values = ",".join(column_list)
コード例 #29
0
def main():
    """Read 'title=url' lines from listpage.txt, fill in a missing title by
    fetching the page's <title>, score each URL, and insert qualifying root
    pages into column_link on 192.168.1.118."""
    # hoisted out of the per-line loop: the config never changes
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    # `with` replaces manual open/close so the file is closed even on error
    with open('listpage.txt', 'r', encoding='UTF-8') as fn:
        for line in fn:
            try:
                title = line.split('=')[0].strip()
                listpage_url = line.split('=')[1].strip()

                domain_code = common.get_domain_code(listpage_url)
                host_code = common.get_host_code(listpage_url)
                # truncate the URL right after the host and force a trailing slash
                host_code_index = listpage_url.index(host_code) + len(host_code)
                listpage_url = listpage_url[0:host_code_index] + '/'
                # 计算url MD5 — MD5 over the URL without scheme and trailing slash
                md5_source = listpage_url
                md5_source = md5_source.replace('http://', '')
                md5_source = md5_source.replace('https://', '')
                md5_source = md5_source.rstrip('/')
                record_md5_id = common.get_token(md5_source)

                if len(title) < 1:
                    # no title in the file: fetch the page and use its <title>
                    try:
                        response = requests.get(listpage_url, timeout=5)
                        if response.status_code == 200:
                            encoding = get_encoding(response)
                            response.encoding = encoding
                            soup = BeautifulSoup(response.text, 'lxml')
                            title = soup.title.text
                        else:
                            continue
                    except Exception:
                        # was a bare `except:`; fetch failure leaves title empty
                        pass

                # Keep only the site name: cut the title at each separator in
                # the original order. Guard kept in case title is not a str.
                for sep in ('-', '_', '-'):
                    try:
                        title = title.split(sep)[0].strip()
                    except Exception:
                        pass

                level_score, Score_Detail = common.is_need_filter(
                    title, listpage_url, True)
                if level_score > -100:
                    # not rejected by the filter: store as a root page
                    level_score = '100'
                    Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'

                    website_no = 'AD_SELECTED'
                    column_extraction_deep = 0
                    column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
                    print(column)
                    # single-row insert; dedup via `insert ignore`
                    values = column
                    insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
                    try:
                        common.query_mysql(extractor_118, insert_column)
                    except Exception:
                        pass

            except Exception:
                # malformed line (no '='), unreachable host, etc. — skip it
                pass
コード例 #30
0
def main():
    """Copy sogou-weixin author listpages (found via two named schedules)
    from the 116 monitor DB into author_weixin on the 118 datasource DB.
    Inserts are batched 10k rows at a time."""
    extractor_116 = {
        'host': '192.168.1.116',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor'
    }
    extractor_118 = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'datasource'
    }

    select_column = '''
        SELECT
        lu.listpage_url,
        lu.listpage_title,
        w.Website_Important_Level
        FROM
            listpage_url lu
        LEFT JOIN task_schedule ts ON lu.website_no = ts.website_no
        LEFT JOIN website w ON lu.website_no = w.website_no
        WHERE
        ts.schedule_name IN (
            '搜狗微信_采集作者列表_Python脚本',
            '搜狗微信_列表_Python脚本_无数据')
    '''

    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        count = 0
        print(len(results))
        for item in results:
            author_name = item['listpage_title']
            # the account is the URL with its scheme stripped; reused as the id
            author_account = item['listpage_url'].replace('http://', '')
            author_id = author_account
            author_type = item['Website_Important_Level']
            is_added = 1
            column = f"('{author_name}', '{author_account}', '{author_id}', '{author_type}', {is_added})"
            column_list.append(column)

            # 每1万条插入一次,分批 — flush every 10000 rows
            if count > 0 and not count % 10000:
                values = ",".join(column_list)
                insert_column = f"insert ignore into author_weixin(author_name, author_account, author_id, author_type, is_added) values{values};"
                common.query_mysql(extractor_118, insert_column)
                column_list = []
                print(count)

            count += 1

        # flush the remainder; skip when empty — the original unconditionally
        # executed a malformed "... values;" statement in that case
        if column_list:
            values = ",".join(column_list)
            insert_column = f"insert ignore into author_weixin(author_name, author_account, author_id, author_type, is_added) values{values};"
            common.query_mysql(extractor_118, insert_column)

    except Exception as e:
        print(e)