def iptosql(rows):
    """Store proxy records in `t_proxy_info`, one REPLACE statement per row.

    Args:
        rows: Iterable of indexable records. Items 0-6 of each record are
            written, in order, to the ip, port, anonymous, type, location,
            speed and last_verify_time columns.
    """
    db = sql_operation.getcon()
    try:
        for row in rows:
            # NOTE(review): values are interpolated into the SQL text
            # unescaped, so a value containing a single quote breaks the
            # statement. Use bound parameters if sql_operation supports
            # them -- TODO confirm.
            replace_sql = (
                "REPLACE INTO `t_proxy_info` "
                "(`ip`, `port`, `anonymous`, `type`, `location`, `speed`, `last_verify_time`) "
                "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s')"
            ) % (row[0], row[1], row[2], row[3], row[4], row[5], row[6])
            sql_operation.baseoperation(db, replace_sql)
    finally:
        # Release the connection even when a statement raises.
        sql_operation.closecon(db)
def tosql(page):
    """Store question summaries in `t_stackoverflow_question`.

    One REPLACE statement is issued per item, with `flag` set to '0'.

    Args:
        page: Iterable of indexable records. Items 0-3 of each record are
            written, in order, to the question_id, title, url and votes
            columns.
    """
    db = sql_operation.getcon()
    try:
        for item in page:
            # NOTE(review): values are interpolated unescaped; a single quote
            # in a title breaks the statement. Prefer bound parameters if
            # sql_operation supports them -- TODO confirm.
            replace_sql = (
                "Replace INTO `t_stackoverflow_question` "
                "(`question_id`,`title`, `url`, `votes`, `flag`) "
                "VALUES ('%s','%s', '%s', '%s','0')"
            ) % (item[0], item[1], item[2], item[3])
            sql_operation.baseoperation(db, replace_sql)
    finally:
        # Release the connection even when a statement raises.
        sql_operation.closecon(db)
def tosql(keyword, relevant_search):
    """Store a keyword and its related searches in `t_relevant_search`.

    The keyword is passed through `participle`; the resulting pieces are
    joined with "---" into `key_list`. The related searches are joined with
    "-----" into `relevant_search`. The row is written with `flag` '0' and
    the generated statement is printed before it is executed.

    Args:
        keyword: The search keyword.
        relevant_search: Iterable of strings related to the keyword.
    """
    db = sql_operation.getcon()
    try:
        key_str = "---".join(participle(keyword))
        relevant_str = "-----".join(relevant_search)
        # NOTE(review): values are interpolated unescaped; prefer bound
        # parameters if sql_operation supports them -- TODO confirm.
        replace_sql = (
            "REPLACE INTO `t_relevant_search` (`keyword`, `key_list`, `relevant_search`, `flag`) "
            "VALUES ('%s','%s', '%s', '0')"
        ) % (keyword, key_str, relevant_str)
        print(replace_sql)
        sql_operation.baseoperation(db, replace_sql)
    finally:
        # Release the connection even when participle() or the statement raises.
        sql_operation.closecon(db)
def title():
    """Set `title_flag` on every `original_qdfuns_article` row where it is NULL.

    Each title is passed to `baidudetector.url`; a truthy result stores
    title_flag '0', a falsy result stores '1'.
    """
    db = sql_operation.getcon()
    try:
        selectsql = "SELECT article_id,title FROM original_qdfuns_article WHERE title_flag IS NULL"
        results = sql_operation.baseselect(db, selectsql)
        for row in results:
            # Local names avoid shadowing the builtin `id` and this function.
            article_id = row[0]
            article_title = row[1]
            flag = '0' if baidudetector.url(article_title) else '1'
            updatesql = (
                "UPDATE original_qdfuns_article SET title_flag = '%s' WHERE (`article_id`='%d')"
                % (flag, article_id)
            )
            sql_operation.baseoperation(db, updatesql)
    finally:
        # Release the connection even when the detector or a statement raises.
        sql_operation.closecon(db)
def tosql(infoLists):
    """Insert scraped articles into `original_qdfuns_article`.

    All rows inserted by one call share the same `stock_time`, which is the
    local time at the start of the call.

    Args:
        infoLists: Indexable container of three parallel sequences:
            titles (index 0), urls (index 1) and submit times (index 2).
            One row is inserted per title.
    """
    db = sql_operation.getcon()
    try:
        titles = infoLists[0]
        urls = infoLists[1]
        subtimes = infoLists[2]
        stock_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Indexing (not zip) keeps the IndexError when urls or subtimes is
        # shorter than titles, instead of silently dropping rows.
        for i, article_title in enumerate(titles):
            # NOTE(review): values are interpolated unescaped; prefer bound
            # parameters if sql_operation supports them -- TODO confirm.
            insertsql = (
                "INSERT INTO `original_qdfuns_article` (`title`, `url`, `submit_time`, `stock_time`) "
                "VALUES ('%s', '%s', '%s', '%s')"
            ) % (article_title, urls[i], subtimes[i], stock_time)
            sql_operation.baseoperation(db, insertsql)
    finally:
        # Release the connection even when a statement raises.
        sql_operation.closecon(db)
def key_extract():
    """Append selected keywords to ``keyword_relevant.txt`` and print them.

    Selects rows from `t_relevant_search` whose keyword matches both
    LIKE '%么%' and LIKE '%php%'. Each keyword is written on its own line
    (UTF-8, append mode) and echoed to stdout.
    """
    db = sql_operation.getcon()
    try:
        # `with` closes the file even if the query or a write raises.
        with open("keyword_relevant.txt", "a", encoding='utf-8') as key_file:
            select_sql = "SELECT keyword,key_list FROM t_relevant_search WHERE keyword LIKE '%么%' and keyword LIKE '%php%'"
            results = sql_operation.baseselect(db, select_sql)
            for row in results:
                # key_list (row[1]) is selected but not used here.
                keyword = row[0]
                key_file.write(keyword + '\n')
                print(keyword)
    finally:
        # Release the connection even when the query or file I/O raises.
        sql_operation.closecon(db)
def sqltofile():
    """Load up to 5000 random HTTPS proxies and hand them to `thread`.

    Each selected row (ip, port, type) is converted to a list. The list of
    lists is printed and then passed to `thread` after the connection has
    been closed.
    """
    db = sql_operation.getcon()
    try:
        select_sql = "SELECT ip,`port`,type FROM t_proxy_info Where type = 'HTTPS' Order By rand() Limit 5000"
        results = sql_operation.baseselect(db, select_sql)
        # Copy each row into a plain list.
        proxys = [list(tur) for tur in results]
    finally:
        # Release the connection even when the query raises.
        sql_operation.closecon(db)
    print(proxys)
    thread(proxys)
def updatetosql():
    """Fill in details for up to 10 random unprocessed questions.

    Picks rows of `t_stackoverflow_question` with flag '0', calls
    `item_html` on each row's url, and writes the result back with flag '1'.

    The record returned by `item_html` is read by index: 0 title (used in
    the WHERE clause), 1 views, 2 answers_num, 3 asked_time, 4 tags,
    5 last_active_time, 6 question_content, and 7 an iterable of answer
    strings that is joined with "[split]".
    """
    db = sql_operation.getcon()
    try:
        select_sql = "SELECT title,url FROM t_stackoverflow_question WHERE flag = '0' Order By rand() limit 10"
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            # The selected title (row[0]) is unused; the row is matched on
            # the title returned by item_html instead.
            item = item_html(row[1])
            answers_text = "[split]".join(item[7])
            # NOTE(review): values are interpolated unescaped; page content
            # containing a single quote breaks the statement. Prefer bound
            # parameters if sql_operation supports them -- TODO confirm.
            updatesql = (
                "UPDATE `t_stackoverflow_question` "
                "SET `tags`='%s', `views`='%s', `answers_num`='%s', `asked_time`='%s', `last_active_time`='%s', `question_content`='%s', `answers_contetnt`='%s' , `flag` = '1'"
                "WHERE (`title`='%s') "
            ) % (item[4], item[1], item[2], item[3], item[5], item[6], answers_text, item[0])
            sql_operation.baseoperation(db, updatesql)
    finally:
        # Release the connection even when item_html or a statement raises.
        sql_operation.closecon(db)
def content():
    """Set `content_flag` on pending `original_python_article` rows.

    Rows are selected where content_flag is NULL or 0 and title_flag is 0.
    Each url is printed and passed to `circleCheck`; a truthy result stores
    content_flag '0', a falsy result stores '1'.
    """
    db = sql_operation.getcon()
    try:
        selectsql = "SELECT article_id,url FROM original_python_article WHERE (content_flag IS NULL or content_flag = 0) and title_flag = 0"
        results = sql_operation.baseselect(db, selectsql)
        for row in results:
            article_id = row[0]
            url = row[1]
            print('当前所处理的文章url: ' + url)
            flag = '0' if circleCheck(url) else '1'
            updatesql = (
                "UPDATE `original_python_article` SET `content_flag`='%s' WHERE (`article_id`='%d')"
                % (flag, article_id)
            )
            sql_operation.baseoperation(db, updatesql)
    finally:
        # Release the connection even when circleCheck or a statement raises.
        sql_operation.closecon(db)
def cnblogsSpider(index):
    """Fetch one cnblogs listing and insert it into `original_python_article`.

    `cnblogs.getDoc(index)` is read as four parallel sequences: titles
    (index 0), intros (index 1), urls (index 2) and "other" values
    (index 3). One row is inserted per title, with `from` set to
    '博客园首页' and a shared `stock_time` taken before the inserts start.

    Args:
        index: Passed unchanged to `cnblogs.getDoc`.
    """
    infoLists = cnblogs.getDoc(index)
    titles = infoLists[0]
    intros = infoLists[1]
    urls = infoLists[2]
    article_from = '博客园首页'
    others = infoLists[3]
    stock_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    db = sql_operation.getcon()
    try:
        # Indexing (not zip) keeps the IndexError when a parallel sequence
        # is shorter than titles, instead of silently dropping rows.
        for i, article_title in enumerate(titles):
            # NOTE(review): values are interpolated unescaped; prefer bound
            # parameters if sql_operation supports them -- TODO confirm.
            insertsql = (
                "INSERT INTO `original_python_article` (`title`, `intro`, `url`, `from`, `other`, `stock_time`) "
                "VALUES ('%s', '%s', '%s', '%s', '%s', '%s')"
            ) % (article_title, intros[i], urls[i], article_from, others[i], stock_time)
            sql_operation.baseoperation(db, insertsql)
    finally:
        # Release the connection even when a statement raises.
        sql_operation.closecon(db)
def sql_search():
    """Run `page_html` in a thread for each related search of pending keywords.

    Fetches up to 50 rows of `t_relevant_search` with flag '0', marks up to
    50 flag-'0' rows as '1', then splits each fetched `relevant_search`
    value on "-----" and starts one thread per resulting keyword. The call
    returns after every started thread has been joined.

    NOTE(review): the SELECT and the UPDATE each apply their own `Limit 50`
    with no ORDER BY, so the rows flagged may not be exactly the rows
    fetched -- confirm this is acceptable.
    """
    db = sql_operation.getcon()
    try:
        select_sql = "select keyword,relevant_search from t_relevant_search where flag = '0' Limit 50"
        update_sql = "update t_relevant_search set flag = '1' where flag = '0' Limit 50"
        results = sql_operation.baseselect(db, select_sql)
        sql_operation.baseoperation(db, update_sql)
        threads = []
        for row in results:
            for keyword in row[1].split("-----"):
                worker = threading.Thread(target=page_html, args=[keyword])
                threads.append(worker)
                worker.start()
        # Block until all worker threads have finished.
        for worker in threads:
            worker.join()
    finally:
        # Release the connection even when a query or thread start raises.
        sql_operation.closecon(db)
def titleandintro():
    """Set `title_flag` on `original_python_article` rows where it is NULL or 0.

    A row gets title_flag '0' when `baidudetector.url` is truthy for both
    the title and the intro; otherwise it gets '1'. The intro is only
    checked when the title check is truthy.
    """
    db = sql_operation.getcon()
    try:
        selectsql = "SELECT article_id,title,intro FROM original_python_article WHERE title_flag IS NULL or title_flag = 0"
        results = sql_operation.baseselect(db, selectsql)
        for row in results:
            article_id = row[0]
            article_title = row[1]
            intro = row[2]
            both_pass = baidudetector.url(article_title) and baidudetector.url(intro)
            flag = '0' if both_pass else '1'
            updatesql = (
                "UPDATE `original_python_article` SET `title_flag`='%s' WHERE (`article_id`='%d')"
                % (flag, article_id)
            )
            sql_operation.baseoperation(db, updatesql)
    finally:
        # Release the connection even when the detector or a statement raises.
        sql_operation.closecon(db)
def sqlETL():
    """Build the key -> keyword-id index in `t_relevant_search_key`.

    For every row of `t_relevant_search`, `key_list` is split on "---".
    For each resulting key:

    * if the key already exists in `t_relevant_search_key`, the row id is
      appended to its `keyword_id_list` as "@<id>" and a separator line is
      printed;
    * otherwise a new row is inserted with the row id as the initial
      `keyword_id_list`.

    The set of existing keys is re-read once per source row, so keys
    inserted while handling one row are seen when handling the next.
    """
    db = sql_operation.getcon()
    try:
        select_sql = "SELECT id,key_list from t_relevant_search"
        select_key_sql = "SELECT `key` from t_relevant_search_key"
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            key_list = row[1].split('---')
            key_results = sql_operation.baseselect(db, select_key_sql)
            # A set makes the membership test below O(1) instead of O(n).
            known_keys = {temp[0] for temp in key_results}
            for key in key_list:
                # NOTE(review): values are interpolated unescaped; prefer
                # bound parameters if sql_operation supports them -- TODO
                # confirm.
                if key in known_keys:
                    update_sql = (
                        "update t_relevant_search_key "
                        "set keyword_id_list = CONCAT(keyword_id_list,'@','%s') where `key` = '%s'"
                    ) % (row[0], key)
                    print('-----------------')
                    sql_operation.baseoperation(db, update_sql)
                else:
                    insert_sql = (
                        "insert into t_relevant_search_key (`key`,keyword_id_list) value ('%s','%s')"
                        % (key, row[0])
                    )
                    sql_operation.baseoperation(db, insert_sql)
    finally:
        # Release the connection even when a query raises.
        sql_operation.closecon(db)