async def get_response(i, semaphore):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                url = url_template.format(i)
                async with session.get(url) as response:
                    text = await response.text()
                    text_json = json.loads(text)
                    code = text_json["code"]
                    print(url)
                    if code == 200:
                        try:
                            source_name = text_json["data"]["pcArticleVOS"][0]["userId"]
                            last_article_time = text_json["data"]["pcArticleVOS"][0]["publicTime"]
                            print(url, source_name)
                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","sohuhao");'
                            # print(insert_sql)
                            common.query_mysql(database_config, insert_sql)
                        except Exception as e:
                            print(e)
            except Exception:
                pass
async def get_response(i, semaphore):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    text_json = json.loads(text)
                    data = text_json["data"]
                    # print(text_json)
                    if len(data) > 0:
                        try:
                            source_name = text_json["data"][0]["data"]["source"]
                            last_article_time = text_json["data"][0]["data"]["pdate"]
                            # store the 100-rows-per-page variant of the url
                            url = url.replace("&pageRow=1&", "&pageRow=100&")
                            print(url, source_name)
                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","beijingshijianhao");'
                            # print(insert_sql)
                            common.query_mysql(database_config, insert_sql)
                        except Exception as e:
                            print(e)
            except Exception as e:
                print(e)
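# Every get_response(i, semaphore) coroutine in this section assumes a driver
# that fans the ids out under a shared semaphore, plus module-level
# url_template / database_config / common globals. A minimal sketch of such a
# driver; the id range and concurrency cap are assumptions, not taken from
# the source:
import asyncio

async def run_batch(start_id, end_id, concurrency=50):
    semaphore = asyncio.Semaphore(concurrency)  # cap concurrent requests
    tasks = [get_response(i, semaphore) for i in range(start_id, end_id)]
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(run_batch(1, 10000))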
def initialization_new_listpage_url_cache(database_config):
    # Initialize new_listpage_url_cache: read website, delete the cached rows
    # for the target website_no, then re-insert each site's main page URL.
    # Clear already-collected rows from new_listpage_url_cache.
    delete_cache_sql = "delete from new_listpage_url_cache where website_no in (select website_no from website);"
    # common.query_mysql(database_config, delete_cache_sql)
    website_sql = 'select * from website;'
    website_result = common.query_mysql(database_config, website_sql)
    for website in website_result:
        Website_No = website['Website_No']
        Listpage_Title = website['Website_Name']
        Listpage_URL = website['Website_Main_Page_URL']
        Is_Need_VPN = website['Is_Need_VPN']
        # Compute the URL MD5 after stripping the scheme and trailing slash.
        md5_source = Listpage_URL
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        Listpage_URL_MD5 = common.get_token(md5_source)
        Domain_Code = common.get_domain_code(Listpage_URL)
        Host_Code = common.get_host_code(Listpage_URL)
        Score_Detail = '{"status": True, "message": "root page"}'
        # Insert the main page into new_listpage_url_cache.
        insert_url_to_cache = f"""
            insert ignore into new_listpage_url_cache(Column_Extraction_Deep, Listpage_URL, Listpage_Title, Domain_Code,
            Host_Code, Listpage_URL_MD5, Level_Score, Score_Detail, Website_No, Is_Need_VPN)
            value(1, '{Listpage_URL}', '{Listpage_Title}', '{Domain_Code}', '{Host_Code}', '{Listpage_URL_MD5}', 100,
            '{Score_Detail}', '{Website_No}', {Is_Need_VPN});
        """
        print(insert_url_to_cache)
        common.query_mysql(database_config, insert_url_to_cache)
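# The "strip http(s):// and the trailing slash, then hash" normalization above
# recurs in nearly every script in this section. A sketch of a shared helper,
# assuming common.get_token returns an MD5 hex digest (its implementation is
# not shown in the source):
import hashlib

def url_md5(url):
    source = url.replace('http://', '').replace('https://', '').rstrip('/')
    return hashlib.md5(source.encode('utf-8')).hexdigest()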
async def get_response(i, semaphore):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    json_text = re.findall('yidian.docinfo = ({.*});</script>', text)[0]
                    result = json.loads(json_text)
                    source_name = result["channel_name"]
                    follower_count = result["bookcount"]
                    # bookcount looks like "1.2万人订阅" ("... subscribers");
                    # strip the suffix and expand 万 (10,000).
                    follower_count = follower_count.split('人订阅')[0]
                    if '万' in follower_count:
                        follower_count = follower_count.replace('万', '')
                        follower_count = float(follower_count) * 10000
                    print(json_text)
                    if len(source_name) > 0:
                        try:
                            print(url, source_name)
                            insert_sql = f'replace into author_other(author_name,author_url,author_id,follower_count,website) VALUES("{source_name}","{url}","{i}",{follower_count},"yidianhao");'
                            # print(insert_sql)
                            common.query_mysql(database_config, insert_sql)
                        except Exception as e:
                            print(e)
            except Exception:
                pass
def main():
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = f"""select ad.Article_URL, ad.Domain_Code, ad.Article_Source from article_detail ad
        where 1=1
        and ad.Website_No in (select Website_No from website where Website_Name like'%百度新闻%')
        and ad.Extracted_Time>'2020-10-01' and ad.Extracted_Time<'2020-11-25'
        and Article_Source is not NULL and Article_Source !=''
        GROUP BY Domain_Code;"""
    try:
        results = common.query_mysql(extractor_116, select_column)
    except Exception as e:
        results = []
    column_list = []
    for i in results:
        title = i['Article_Source']
        listpage_url = i['Article_URL']
        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        # Truncate the article URL down to the site root.
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'
        # Compute the URL MD5 after stripping the scheme and trailing slash.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)
        level_score = '100'
        Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'
        website_no = 'BAIDU_NEWS'
        column_extraction_deep = 0
        column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        # column_list.append(column)
        print(column)
        # Insert row by row (the batch path is disabled).
        values = column
        insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        # print(insert_column)
        try:
            common.query_mysql(extractor_118, insert_column)
        except Exception:
            pass
def main(start, end):
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    # Read up to a million rows per run; ordering by domain_code would keep
    # same-domain URLs in the same group when assigning nodes.
    select_column = f"select title,url,domain_code,record_md5_id from column_link where Column_Link_ID BETWEEN {start} AND {end};"
    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        count = 0
        print('len(results): ', len(results))
        for item in results:
            Cloud_Server_ID = random.randint(10000, 10062)
            Listpage_URL = item['url']
            # Strip whitespace and line breaks from the URL.
            p = re.compile(r'\s+')
            Listpage_URL = re.sub(p, '', Listpage_URL)
            ListPage_Title = item['title']
            Domain_Code = item['domain_code']
            Record_MD5_ID = item['record_md5_id']
            column = f"({Cloud_Server_ID}, '{Listpage_URL}', '{ListPage_Title}', '{Domain_Code}', '{Record_MD5_ID}')"
            column_list.append(column)
            # Insert in batches of 1000.
            if count > 0 and not count % 1000:
                print(count)
                values = ",".join(column_list)
                insert_column = f"insert ignore into cloud_listpage_url(Cloud_Server_ID, Listpage_URL, ListPage_Title, Domain_Code, Record_MD5_ID) values{values};"
                # print(insert_column)
                common.query_mysql(extractor_116, insert_column)
                column_list = []
            count += 1
        print('len(column_list): ', len(column_list))
        # Insert the remaining rows.
        values = ",".join(column_list)
        insert_column = f"insert ignore into cloud_listpage_url(Cloud_Server_ID, Listpage_URL, ListPage_Title, Domain_Code, Record_MD5_ID) values{values};"
        # print(insert_column)
        common.query_mysql(extractor_116, insert_column)
    except Exception as e:
        print(e)
def update_score():
    # MySQL connection settings.
    config = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }
    # Fetch rows that have not been scored yet.
    select_sql = "select Column_Link_ID,URL,Title from column_link where Level_Score is null limit 1000;"
    update_sql_pattern = "UPDATE column_link SET " \
                         "Level_Score = CASE Column_Link_ID {0} END, " \
                         "Score_Detail = CASE Column_Link_ID {1} END " \
                         "WHERE Column_Link_ID IN {2};"
    when_then_score_pattern = " WHEN {} THEN {} "
    when_then_detail_pattern = " WHEN {} THEN '{}' "
    id_list = []
    try:
        results = common.query_mysql(config, select_sql)
        print(results)
        when_then_score = ""
        when_then_detail = ""
        for row in results:
            Column_Link_ID = row['Column_Link_ID']
            url = row['URL']
            title = row['Title']
            level_score, status = common.is_need_filter(title, url, False)
            # print(level_score, status)
            when_then_score = when_then_score + when_then_score_pattern.format(Column_Link_ID, level_score)
            when_then_detail = when_then_detail + when_then_detail_pattern.format(Column_Link_ID, status)
            id_list.append(Column_Link_ID)
        if not id_list:
            return
        # Note: a single-element tuple renders as "(5,)", which MySQL rejects;
        # this assumes each batch holds at least two rows.
        id_tuple = tuple(id_list)
        sql = update_sql_pattern.format(when_then_score, when_then_detail, id_tuple)
        print(sql)
        try:
            common.query_mysql(config, sql)
        except Exception as e:
            print(e)
    except Exception as e:
        print(e)
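# With two scored rows, update_score() emits one batch statement of this
# shape (ids and values are illustrative only):
#
#   UPDATE column_link SET
#     Level_Score  = CASE Column_Link_ID WHEN 101 THEN 100 WHEN 102 THEN -10 END,
#     Score_Detail = CASE Column_Link_ID WHEN 101 THEN 'ok' WHEN 102 THEN 'spam word' END
#   WHERE Column_Link_ID IN (101, 102);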
def main(start, end):
    extractor_133 = {'host': '192.168.1.133', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = f"select * from column_link_2 where Column_Link_ID BETWEEN {start} AND {end};"
    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        for item in results:
            Extracted_flag = item['Extracted_flag']
            Column_Extraction_Deep = item['Column_Extraction_Deep']
            URL = item['URL']
            Title = item['Title']
            Domain_Code = item['Domain_Code']
            Host_Code = item['Host_Code']
            # Compute the URL MD5 after stripping the scheme and trailing slash.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score = item['Level_Score']
            Score_Detail = item['Score_Detail']
            Website_No = item['Website_No']
            Is_User_Added = item['Is_User_Added']
            column = f"('{Extracted_flag}', {Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}', {Is_User_Added})"
            column_list.append(column)
        # Batch insert.
        values = ",".join(column_list)
        insert_column = f"insert ignore into column_link(Extracted_flag, Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No, Is_User_Added) values{values};"
        # print(insert_column)
        common.query_mysql(extractor_118, insert_column)
    except Exception as e:
        print(e)
def main(start, end):
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = f"select Website_No,ListPage_URL,ListPage_Title from listpage_url where ListPage_URL_ID IN " \
                    f"(select ListPage_URL_ID from cloud_listpage_url where cloud_listpage_url_id BETWEEN {start} AND {end});"
    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        for item in results:
            URL = item['ListPage_URL']
            Title = item['ListPage_Title']
            Website_No = item['Website_No']
            Domain_Code = common.get_domain_code(URL)
            Host_Code = common.get_host_code(URL)
            # Compute the URL MD5 after stripping the scheme and trailing slash.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score, Score_Detail = common.is_need_filter(Title, URL)
            Column_Extraction_Deep = 1
            column = f"({Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}')"
            if Level_Score > 20:
                column_list.append(column)
        # Batch insert.
        values = ",".join(column_list)
        insert_column = f"insert ignore into column_link(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{values};"
        # print(insert_column)
        common.query_mysql(extractor_118, insert_column)
    except Exception as e:
        print(e)
def insert_into_center_database(input_data_list):
    """
    Parse the input rows and batch-insert them into the matching MySQL table.
    :param input_data_list: list of dicts; every key must be named after a
        listpage_url table column, e.g.
        [{"ListPage_Title": "1234", "ListPage_URL": "https://v2.sohu.com/1", ...},
         {"ListPage_Title": "2345", "ListPage_URL": "https://weibo.com/p/aj/v6/mblog/", ...}]
    :return:
    """
    center_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    column_list = []
    for i in input_data_list:
        website_no = 'S18605'
        listpage_url = i['ListPage_URL']
        listpage_title = i['ListPage_Title']
        domain_code = i['Domain_Code']
        host_code = i['Host_Code']
        last_check_score_text = i['Last_Check_Score_Text']
        listpage_save_rule = 3
        is_enabled = 1
        linkurl_min_length = 10
        linktext_min_length = 4
        # Compute the URL MD5 after lower-casing and stripping the scheme
        # and trailing slash.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        md5_source = md5_source.lower()
        listpage_url_md5 = common.get_token(md5_source)
        column = f"('{website_no}', '{listpage_url}', '{listpage_url_md5}','{listpage_title}','{domain_code}','{host_code}',{last_check_score_text},{listpage_save_rule},{is_enabled},{linkurl_min_length},{linktext_min_length})"
        column_list.append(column)
    values = ",".join(column_list)
    insert_column = f"insert ignore into listpage_url(Website_No,ListPage_URL,ListPage_URL_MD5,ListPage_Title,Domain_Code,Host_Code,Last_Check_Score_Text,ListPage_Save_Rule,Is_Enabled,LinkURL_Min_Length,LinkText_Min_Length) values{values};"
    # print(insert_column)
    try:
        common.query_mysql(center_116, insert_column)
    except Exception as e:
        print(e)
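# An example call mirroring the docstring above; the field values are
# illustrative only:
insert_into_center_database([
    {
        'ListPage_URL': 'https://v2.sohu.com/1',
        'ListPage_Title': '1234',
        'Domain_Code': 'sohu.com',
        'Host_Code': 'v2.sohu.com',
        'Last_Check_Score_Text': 100,
    },
])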
def main(start_server_id, end_server_id, count):
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    for i in range(start_server_id, end_server_id):
        print(i)
        update_website = f'''
            update cloud_task_schedule set cloud_server_id={i}
            where 1=1
            and schedule_id in
            (select * from
            (select Schedule_ID from cloud_task_schedule where 1=1 and Cloud_Server_ID is null order by Schedule_ID limit {count})aa
            );
        '''
        print(update_website)
        try:
            common.query_mysql(extractor_116, update_website)
        except Exception as e:
            print(e)
def main():
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = f"select * from website where ID>1577;"
    try:
        results = common.query_mysql(extractor_118, select_column)
        column_list = []
        for item in results:
            ID = item['ID']
            Column_Extraction_Deep = '0'
            URL = item['URL']
            Title = item['Title']
            Domain_Code = common.get_domain_code(URL)
            Host_Code = common.get_host_code(URL)
            # Compute the URL MD5 after stripping the scheme and trailing slash.
            md5_source = URL
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            Record_MD5_ID = common.get_token(md5_source)
            Level_Score = '100'
            Score_Detail = '{"status": True, "message": "root page"}'
            Website_No = 'OVERSEA'
            column = f"({Column_Extraction_Deep}, '{URL}', '{Title}', '{Domain_Code}', '{Host_Code}', '{Record_MD5_ID}', {Level_Score}, '{Score_Detail}', '{Website_No}')"
            column_list.append(column)
            # Update the md5 on the website row.
            update_website = f"update website set record_md5_id='{Record_MD5_ID}' where ID={ID}"
            common.query_mysql(extractor_118, update_website)
            # Insert row by row.
            insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{column};"
            print(insert_column)
            common.query_mysql(extractor_118, insert_column)
        # Batch insert (disabled; rows are inserted one at a time above).
        # values = ",".join(column_list)
        # insert_column = f"insert ignore into column_link_oversea(Column_Extraction_Deep, URL, Title, Domain_Code, Host_Code, Record_MD5_ID, Level_Score, Score_Detail, Website_No) values{values};"
        # print(insert_column)
        # common.query_mysql(extractor_118, insert_column)
    except Exception as e:
        print(e)
def update_host_code():
    # MySQL connection settings.
    config = {
        'host': '192.168.1.118',
        'port': 3306,
        'user': '******',
        'passwd': 'poms@db',
        'db': 'mymonitor',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }
    # Fetch rows whose host_code has not been filled in yet.
    select_sql = "select Column_Link_ID,URL from column_link where host_code is null limit 1000;"
    # update_sql = "update column_link set host_code='{}' where Column_Link_ID={};"
    update_sql_pattern = "UPDATE column_link SET host_code = CASE Column_Link_ID {} END WHERE Column_Link_ID IN {};"
    when_then_pattern = " WHEN {} THEN '{}' "
    id_list = []
    try:
        results = common.query_mysql(config, select_sql)
        when_then = ""
        for row in results:
            Column_Link_ID = row['Column_Link_ID']
            url = row['URL']
            host_code = common.get_host_code(url)
            if len(host_code) > 50:
                continue
            # sql = update_sql.format(host_code, Column_Link_ID)
            when_then = when_then + when_then_pattern.format(Column_Link_ID, host_code)
            id_list.append(Column_Link_ID)
        if not id_list:
            return
        # Note: a single-element tuple renders as "(5,)", which MySQL rejects;
        # this assumes each batch holds at least two rows.
        id_tuple = tuple(id_list)
        sql = update_sql_pattern.format(when_then, id_tuple)
        print(sql)
        try:
            common.query_mysql(config, sql)
        except Exception as e:
            print(e)
    except Exception as e:
        print(e)
async def get_response(i, semaphore):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    # response.encoding = "utf-8"
                    root = etree.HTML(text, parser=etree.HTMLParser(encoding='utf-8'))
                    article_url = "".join(root.xpath('//*[@id="content-list"]/div[2]/div/div[1]/a/@href'))
                    # str.strip() removes characters rather than substrings,
                    # so extract the numeric id with a regex instead.
                    article_url_id = int(re.search(r'/news/show/id/(\d+)', article_url).group(1))
                    print(article_url_id)
                    # Only keep articles newer than
                    # http://www.ql1d.com/news/show/id/11503356.html
                    if article_url_id > 11503356:
                        try:
                            source_name = "".join(root.xpath('//*[@id="content-list"]/div[1]/div/div/div[2]/div[1]/text()'))
                            print(url, source_name)
                            insert_sql = f'insert ignore into author_other(author_name,author_url,author_id,website) VALUES("{source_name}","{url}","{i}","qiluyidianhao");'
                            # print(insert_sql)
                            common.query_mysql(database_config, insert_sql)
                        except Exception as e:
                            print(e)
            except Exception:
                pass
def create_task(loop, database_config):
    semaphore = asyncio.Semaphore(50)  # cap concurrency at 50
    try:
        tasks = []
        # Fetch targets that still need collecting.
        select_column = """
            select Column_Link_ID, Column_Extraction_Deep, URL, Domain_Code, Website_No, Extracted_flag
            from column_link where Extracted_flag is null ORDER BY Column_Extraction_Deep limit 1000;
        """
        print('=====query new tasks=====')
        target_items = common.query_mysql(database_config, select_column)
        print('=====start tasks=====')
        # Seed with 0 so the IN (...) tuple below is never single-element.
        id_list = [0]
        for item in target_items:
            id_list.append(item["Column_Link_ID"])
            url = item["URL"]
            domain_code = item["Domain_Code"]
            website_no = item["Website_No"]
            column_extraction_deep = item["Column_Extraction_Deep"]
            # Crawl at most 3 levels deep.
            if column_extraction_deep <= 3:
                task = asyncio.ensure_future(
                    get_response(database_config, semaphore, url, column_extraction_deep, domain_code, website_no))
                tasks.append(task)
        results = loop.run_until_complete(asyncio.gather(*tasks))
        # print(results)
        print('=====finish tasks=====')
        # Mark the claimed rows as extracted.
        id_list = tuple(id_list)
        update_flag = f"update column_link set Extracted_flag='S' where Column_Link_ID in {id_list};"
        print('=====update flag=====')
        common.query_mysql(database_config, update_flag)
        return len(results)
    except Exception as e:
        logger.error(str(e))
        return None
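# create_task() returns the number of rows it claimed (or None on error).
# A sketch of the outer polling loop such scripts typically run; the config
# values here are placeholders, not taken from the source:
import asyncio
import time

if __name__ == '__main__':
    database_config = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    loop = asyncio.get_event_loop()
    while True:
        handled = create_task(loop, database_config)
        if not handled:
            time.sleep(60)  # nothing pending or an error; back off before polling again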
def get_url_list():
    """
    Fetch the domains and URLs that need checking.
    :return:
    """
    sql = """
        select Domain_Code,listpage_url from listpage_url
        where website_no in (select website_no from website_tags where Website_Tag_ID=105)
        and domain_code is not null
    """
    query_list = query_mysql(config_116, sql)
    return query_list
def main():
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    # Seed cloud_task_schedule with the URL count per cloud server
    # (63 servers in total).
    insert_website = '''
        insert into cloud_task_schedule(Cloud_Server_Count,cloud_server_id,ListPage_URL_Count)
        select 63,cloud_server_id,count(1) from cloud_listpage_url GROUP BY cloud_server_id order by cloud_server_id;
    '''
    print(insert_website)
    try:
        common.query_mysql(extractor_116, insert_website)
    except Exception as e:
        print(e)
def create_task(loop, semaphore, database_config):
    try:
        tasks = []
        # Fetch targets that still need collecting.
        select_column = """
            select New_Listpage_URL_ID, Column_Extraction_Deep, Listpage_URL, Domain_Code, Website_No, Extracted_flag, Is_Need_VPN
            from new_listpage_url_cache where Extracted_flag is null ORDER BY Column_Extraction_Deep limit 200;
        """
        target_items = common.query_mysql(database_config, select_column)
        # Seed with 0 so the IN (...) tuple below is never single-element.
        id_list = [0]
        for item in target_items:
            id_list.append(item["New_Listpage_URL_ID"])
            url = item["Listpage_URL"]
            domain_code = item["Domain_Code"]
            website_no = item["Website_No"]
            column_extraction_deep = item["Column_Extraction_Deep"]
            Is_Need_VPN = item["Is_Need_VPN"]
            # Crawl at most 3 levels deep.
            if column_extraction_deep <= 3:
                task = asyncio.ensure_future(
                    get_response(database_config, semaphore, url, column_extraction_deep, domain_code, website_no, Is_Need_VPN))
                tasks.append(task)
        results = loop.run_until_complete(asyncio.gather(*tasks))
        print(results)
        # Mark the claimed rows as extracted.
        id_list = tuple(id_list)
        update_flag = f"update new_listpage_url_cache set Extracted_flag='S' where New_Listpage_URL_ID in {id_list};"
        common.query_mysql(database_config, update_flag)
        return len(target_items)
    except Exception as e:
        if len(str(e)) > 0:
            logger.error(str(e))
        return None
def main():
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    select_sql = 'select cloud_server_id,website_no from cloud_task_schedule;'
    try:
        results = common.query_mysql(extractor_116, select_sql)
        for item in results:
            cloud_server_id = item['cloud_server_id']
            website_no = item['website_no']
            print(cloud_server_id, website_no)
            update_cloud = f"update cloud_listpage_url set Cloud_Server_ID={cloud_server_id} where website_no='{website_no}';"
            print(update_cloud)
            common.query_mysql(extractor_116, update_cloud)
    except Exception as e:
        print(e)
def main():
    # Read settings from the config file.
    host = conf.get("database", "host")
    port = conf.get("database", "port")
    user = conf.get("database", "user")
    passwd = conf.get("database", "passwd")
    db = conf.get("database", "db")
    table = conf.get("database", "table")
    reject_url_file = 'reject_url.txt'
    # Database configs.
    database_config_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    database_config = {'host': host, 'port': int(port), 'user': user, 'passwd': passwd, 'db': db}
    # From the domain table on 116, collect the Domain_Code values
    # (which are actually hosts) whose PR value is above 3.
    host_with_pr_value = []
    domain_sql = 'select Domain_Code from domain where PR_Value is not null and PR_Value>3'
    domain_pr_value = common.query_mysql(database_config_116, domain_sql)
    for item in domain_pr_value:
        host_with_pr_value.append(item['Domain_Code'])
    # print(host_with_pr_value)
    # Load the reject URLs.
    reject_url_list = get_reject_domain_list(reject_url_file)
    # Delete each rejected URL unless its host carries a high PR value.
    for url in reject_url_list:
        # logger.info(url)
        host = common.get_host_code(url)
        # print(host)
        if host in host_with_pr_value:
            print('-------------------------url was not deleted:', host, url)
            logger.info('-------------------------url was not deleted: ' + url)
        else:
            url_md5 = common.get_token(common.get_url_remove_http(url))
            # print(url_md5)
            sql = f'''delete from {table} where Record_MD5_ID='{url_md5}';'''
            result = common.query_mysql(database_config, sql)
            print(url, 'was deleted', result)
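# get_reject_domain_list() is not shown in this section. A plausible sketch,
# assuming reject_url.txt holds one URL per line (hypothetical helper, not
# the confirmed implementation):
def get_reject_domain_list(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]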
async def get_response(i, semaphore):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                headers = {
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
                }
                url = url_template.format(i)
                async with session.get(url, headers=headers) as response:
                    text = await response.text()
                    text_json = json.loads(text)
                    newslist = text_json["newslist"]
                    # print(text_json)
                    if len(newslist) > 0:
                        try:
                            # Some feeds expose "source", others only "media_id".
                            try:
                                source_name = text_json["newslist"][0]["source"]
                            except Exception:
                                source_name = text_json["newslist"][0]["media_id"]
                            last_article_time = text_json["newslist"][0]["timestamp"]
                            print(url, source_name)
                            insert_sql = f'insert ignore into author_qiehao(author_name,author_url,author_id) VALUES("{source_name}","{url}","{i}");'
                            # print(insert_sql)
                            common.query_mysql(database_config, insert_sql)
                        except Exception as e:
                            print(e)
            except Exception:
                pass
def main():
    # Read settings from the config file.
    host = conf.get("database", "host")
    port = conf.get("database", "port")
    user = conf.get("database", "user")
    passwd = conf.get("database", "passwd")
    db = conf.get("database", "db")
    # Database config.
    # database_config = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    database_config = {'host': host, 'port': int(port), 'user': user, 'passwd': passwd, 'db': db}
    # Move the newly collected columns from the cache into new_listpage_url.
    insert_url_to_new = """
        insert ignore into new_listpage_url
        select * from new_listpage_url_cache;
    """
    common.query_mysql(database_config, insert_url_to_new)
def main(website_no, start, end):
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    # Assign one website number per block of 10,000 IDs.
    update_website = f"update cloud_listpage_url set Website_No='{website_no}' where Cloud_Listpage_URL_ID BETWEEN {start} AND {end};"
    print(update_website)
    try:
        result = common.query_mysql(extractor_116, update_website)
        print(result)
    except Exception as e:
        print(e)
def input_from_column_link():
    # Note: this config points at 192.168.1.133.
    extractor_133 = {'host': '192.168.1.133', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    select_column_pattern = "select URL as ListPage_URL,Title as ListPage_Title,Domain_Code,Host_Code,Level_Score as Last_Check_Score_Text " \
                            "from column_link where Website_No='GUOWAI' ORDER BY Host_Code limit {},{};"
    # Page through the table 1000 rows at a time.
    for i in range(130):
        select_column = select_column_pattern.format(i * 1000, 1000)
        print(select_column)
        try:
            results = common.query_mysql(extractor_133, select_column)
            # print(results)
            insert_into_center_database(results)
        except Exception as e:
            print(e)
def parse_html_to_database(database_config, url, column_extraction_deep, domain_code_source, website_no, Is_Need_VPN, text):
    try:
        root = etree.HTML(text, parser=etree.HTMLParser(encoding='utf-8'))
        column_extraction_deep = int(column_extraction_deep) + 1
        items = root.xpath('//a')
        column_list = []
        for num, item in enumerate(items):
            title = "".join(item.xpath('.//text()'))
            listpage_url = "".join(item.xpath('./@href'))
            listpage_url = urljoin(url, listpage_url)
            # Strip punctuation from the title and normalize the URL.
            title = common.filter_punctuation(title)
            listpage_url = common.match_url(listpage_url)
            # Compute the URL MD5 after stripping the scheme and trailing slash.
            md5_source = listpage_url
            md5_source = md5_source.replace('http://', '')
            md5_source = md5_source.replace('https://', '')
            md5_source = md5_source.rstrip('/')
            record_md5_id = common.get_token(md5_source)
            domain_code = common.get_domain_code(listpage_url)
            host_code = common.get_host_code(listpage_url)
            # The link's domain must match the source domain.
            if domain_code_source != domain_code:
                continue
            # Position of this <a> among all <a> nodes: keep the first 50%
            # when there are fewer than 50 nodes, the first 30% for 50-200,
            # and the first 20% above 200.
            len_items = len(items)
            node_percent = num / len(items)
            # print(num, 'percent:{:.0%}'.format(node_percent), title)
            logger.debug(str(num) + 'percent:{:.0%}'.format(node_percent) + title)
            if len_items < 50:
                if node_percent > 0.5:
                    continue
            if (len_items >= 50) and (len_items <= 200):
                if node_percent > 0.3:
                    continue
            if len_items > 200:
                if node_percent > 0.2:
                    continue
            # Filter spam words and spam domains.
            level_score, score_detail = common.is_need_filter(title, listpage_url, True)
            # print(level_score, score_detail)
            logger.debug(str(level_score) + '=' + score_detail)
            # Minimum score to store: news needs >= 20, forums >= 10.
            valid_score = 20
            media_type = common.get_media_type(listpage_url)
            if media_type == 'forum':
                valid_score = 10
            if level_score >= valid_score:
                column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{score_detail}')"
                column_list.append(column)
        if not column_list:
            return
        # Batch insert.
        values = ",".join(column_list)
        insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, score_detail) values{values};"
        # print(insert_column)
        logger.info('column count: ' + str(len(column_list)))
        count = common.query_mysql(database_config, insert_column)
        # logger.info('insert count: ' + str(count))
    except Exception as e:
        logger.error(str(e))
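# The inserts in this section interpolate scraped text straight into SQL,
# which breaks on embedded quotes and invites injection. A safer variant of
# the batch insert using pymysql placeholders; this is a sketch, not the
# project's actual common.query_mysql helper:
import pymysql

def insert_columns(db_config, rows):
    # rows: list of 9-tuples matching the column order below
    sql = ("insert ignore into column_link(Title, URL, record_md5_id, website_no, "
           "column_extraction_deep, domain_code, host_code, level_score, score_detail) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    conn = pymysql.connect(**db_config)
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)  # one round trip per batch
        conn.commit()
    finally:
        conn.close()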
def main():
    # Note: this config points at 192.168.1.133.
    extractor_133 = {'host': '192.168.1.133', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    # select_column = f"select Article_Current_Node_HTML, Website_URL from column_root_source where 1=1 " \
    #                 f"and Source='baidu_news';"
    select_column = f"select Website_URL,Website_Title,Website_Description,Website_Keywords from column_root_source " \
                    f"where 1=1 and Source='baidu_web' " \
                    f"and Website_Title not like '%新闻%' and Website_Title not like'%资讯%' and Website_Title not like'%论坛%' and Website_Title not like'%社区%' " \
                    f"and Website_Keywords like '%新闻%' and Website_Keywords like'%资讯%' and Website_Keywords like'%论坛%' and Website_Keywords like'%社区%';"
    try:
        results = common.query_mysql(extractor_133, select_column)
    except Exception as e:
        results = []
    column_list = []
    for i in results:
        title = i['Website_Title']
        listpage_url = i['Website_URL']
        domain_code = common.get_domain_code(listpage_url)
        host_code = common.get_host_code(listpage_url)
        # Truncate the URL down to the site root.
        host_code_index = listpage_url.index(host_code) + len(host_code)
        listpage_url = listpage_url[0:host_code_index] + '/'
        # Compute the URL MD5 after stripping the scheme and trailing slash.
        md5_source = listpage_url
        md5_source = md5_source.replace('http://', '')
        md5_source = md5_source.replace('https://', '')
        md5_source = md5_source.rstrip('/')
        record_md5_id = common.get_token(md5_source)
        # Keep only the site name: cut the title at each separator in turn.
        for sep in ('-', '_', ',', ',', '|', ' '):
            title = title.split(sep)[0].strip()
        level_score = '100'
        Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'
        # website_no = 'BAIDU_NEWS'
        website_no = 'BAIDU_WEB'
        column_extraction_deep = 0
        column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
        # column_list.append(column)
        print(column)
        # Insert row by row (the batch path is disabled).
        values = column
        # insert_column = f"replace into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
        # print(insert_column)
        try:
            common.query_mysql(extractor_133, insert_column)
        except Exception:
            pass
def main(start, end):
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = f'''
        SELECT lu.listpage_url, lu.listpage_title, w.Website_Important_Level
        FROM listpage_url lu
        LEFT JOIN website w ON lu.website_no = w.website_no
        WHERE w.Website_Name like'%新浪微博%' and w.Website_Name like'%作者列表%'
        and lu.ListPage_URL_ID BETWEEN {start} AND {end};
    '''
    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        count = 0
        print(len(results))
        for item in results:
            author_name = item['listpage_title']
            # author_account = item['listpage_url'].replace('http://', '')
            author_url = item['listpage_url']
            # Extract the author id; the URL can take any of three shapes.
            try:
                author_id = author_url.split('&id=100505')[1].split('&feed_type')[0]
                author_type = 'user'
            except Exception:
                try:
                    author_id = author_url.split('weibo.com/u/')[1].replace('?is_all=1', '')
                    author_type = 'user'
                except Exception:
                    try:
                        author_id = author_url.split('weibo.com/p/')[1].replace('/wenzhang', '')
                        author_type = 'page'
                    except Exception:
                        author_id = author_name
                        author_type = ''
            is_added = 1
            column = f"('{author_name}', '{author_url}', '{author_id}', '{author_type}', {is_added})"
            column_list.append(column)
            # Insert in batches of 10,000.
            if count > 0 and not count % 10000:
                values = ",".join(column_list)
                insert_column = f"insert ignore into author_weibo(author_name, author_url, author_id, author_type, is_added) values{values};"
                # print(insert_column)
                common.query_mysql(extractor_118, insert_column)
                column_list = []
                print(count)
            count += 1
        # Insert the remaining rows.
        values = ",".join(column_list)
        insert_column = f"insert ignore into author_weibo(author_name, author_url, author_id, author_type, is_added) values{values};"
        # print(insert_column)
        common.query_mysql(extractor_118, insert_column)
    except Exception as e:
        print(e)
scroll_id = query['_scroll_id']  # the scroll cursor pages through every ES hit
column_list = []
for item in results:
    author_name = item['_source']['title'].strip()
    author_url = item['_source']['listpage_url']
    author_id = author_url.split('app_id=')[1]
    is_added = 1
    column = f"('{author_name}', '{author_url}', '{author_id}', {is_added})"
    column_list.append(column)
values = ",".join(column_list)
insert_column = f"insert ignore into author_baijiahao(author_name, author_url, author_id, is_added) values{values};"
print(insert_column)
common.query_mysql(extractor_118, insert_column)

for i in range(0, int(total / 100) + 1):
    column_list = []
    # The scroll parameter must be passed again or the call raises an error.
    query_scroll = es.scroll(scroll_id=scroll_id, scroll='5m')['hits']['hits']
    for item in query_scroll:
        author_name = item['_source']['title'].strip()
        author_url = item['_source']['listpage_url']
        author_id = author_url.split('app_id=')[1]
        is_added = 1
        column = f"('{author_name}', '{author_url}', '{author_id}', {is_added})"
        column_list.append(column)
    values = ",".join(column_list)
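# The scroll fragment above starts mid-script; the initial search it relies
# on would look roughly like this. The host, index name and query body are
# assumptions, and the arithmetic on total implies a pre-7.x elasticsearch
# client where hits.total is an int:
from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.1.118:9200'])
query = es.search(index='listpage_url', scroll='5m', size=100,
                  body={'query': {'match_all': {}}})
results = query['hits']['hits']
total = query['hits']['total']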
def main():
    column_list = []
    with open('listpage.txt', 'r', encoding='UTF-8') as fn:
        for i in fn:
            try:
                # Each line looks like "title=url".
                title = i.split('=')[0].strip()
                listpage_url = i.split('=')[1].strip()
                domain_code = common.get_domain_code(listpage_url)
                host_code = common.get_host_code(listpage_url)
                # Truncate the URL down to the site root.
                host_code_index = listpage_url.index(host_code) + len(host_code)
                listpage_url = listpage_url[0:host_code_index] + '/'
                # Compute the URL MD5 after stripping the scheme and trailing slash.
                md5_source = listpage_url
                md5_source = md5_source.replace('http://', '')
                md5_source = md5_source.replace('https://', '')
                md5_source = md5_source.rstrip('/')
                record_md5_id = common.get_token(md5_source)
                # If the line had no title, fetch the page and use its <title>.
                if len(title) < 1:
                    try:
                        response = requests.get(listpage_url, timeout=5)
                        # print(response.status_code)
                        if response.status_code == 200:
                            # get_encoding: helper defined elsewhere in this script.
                            encoding = get_encoding(response)
                            # print(encoding)
                            response.encoding = encoding
                            soup = BeautifulSoup(response.text, 'lxml')
                            # print(soup.title.text)
                            title = soup.title.text
                        else:
                            continue
                    except Exception:
                        pass
                # Keep only the site name: cut the title at each separator in turn.
                for sep in ('-', '_'):
                    title = title.split(sep)[0].strip()
                level_score, Score_Detail = common.is_need_filter(title, listpage_url, True)
                # print(level_score, Score_Detail, title, listpage_url)
                if level_score > -100:
                    level_score = '100'
                    Score_Detail = '"{\"status\": True, \"message\": \"root page\"}"'
                    website_no = 'AD_SELECTED'
                    column_extraction_deep = 0
                    column = f"('{title}', '{listpage_url}', '{record_md5_id}', '{website_no}', {column_extraction_deep}, '{domain_code}', '{host_code}', '{level_score}', '{Score_Detail}')"
                    # column_list.append(column)
                    print(column)
                    # Insert row by row (the batch path is disabled).
                    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
                    values = column
                    insert_column = f"insert ignore into column_link(Title, URL, record_md5_id, website_no, column_extraction_deep, domain_code, host_code, level_score, Score_Detail) values{values};"
                    # print(insert_column)
                    try:
                        common.query_mysql(extractor_118, insert_column)
                    except Exception:
                        pass
            except Exception:
                pass
def main():
    extractor_116 = {'host': '192.168.1.116', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'mymonitor'}
    extractor_118 = {'host': '192.168.1.118', 'port': 3306, 'user': '******', 'passwd': 'poms@db', 'db': 'datasource'}
    select_column = '''
        SELECT lu.listpage_url, lu.listpage_title, w.Website_Important_Level
        FROM listpage_url lu
        LEFT JOIN task_schedule ts ON lu.website_no = ts.website_no
        LEFT JOIN website w ON lu.website_no = w.website_no
        WHERE ts.schedule_name IN ('搜狗微信_采集作者列表_Python脚本', '搜狗微信_列表_Python脚本_无数据')
    '''
    try:
        results = common.query_mysql(extractor_116, select_column)
        column_list = []
        count = 0
        print(len(results))
        for item in results:
            author_name = item['listpage_title']
            author_account = item['listpage_url'].replace('http://', '')
            author_id = author_account
            author_type = item['Website_Important_Level']
            is_added = 1
            column = f"('{author_name}', '{author_account}', '{author_id}', '{author_type}', {is_added})"
            column_list.append(column)
            # Insert in batches of 10,000.
            if count > 0 and not count % 10000:
                values = ",".join(column_list)
                insert_column = f"insert ignore into author_weixin(author_name, author_account, author_id, author_type, is_added) values{values};"
                # print(insert_column)
                common.query_mysql(extractor_118, insert_column)
                column_list = []
                print(count)
            count += 1
        # Insert the remaining rows.
        values = ",".join(column_list)
        insert_column = f"insert ignore into author_weixin(author_name, author_account, author_id, author_type, is_added) values{values};"
        # print(insert_column)
        common.query_mysql(extractor_118, insert_column)
    except Exception as e:
        print(e)