def loaddata(c_thread, thread_num, interval):
    """Crawl queued main URLs and enqueue the article URLs found under them.

    Each iteration pops one URL from the ``science_main_url`` redis list,
    resolves it to volume URLs with ``get_volume_url`` and then to article
    URLs with ``get_article_url``, and pushes every article URL onto the
    ``science_article_url_tmp`` list.  A result of ``0`` from either helper
    is treated as a failed request: the main URL is pushed back onto its
    queue and the webdriver is restarted.

    The loop ends when ``c_thread.thread_stop`` becomes true, the queue
    yields ``None``, MySQL cannot be reached, or the webdriver cannot be
    restarted.

    Args:
        c_thread: Controlling thread object; only ``thread_stop`` is read.
        thread_num: Worker identifier, used in console output only.
        interval: Not used by this function; kept so callers need no change.

    Returns:
        ``0`` if the webdriver or the redis connection could not be
        created, otherwise ``None``.
    """
    # Every print takes one pre-formatted string so the output is identical
    # under the Python 2 print statement and the Python 3 print function.
    print("run......")
    driver = get_webdriver()
    if driver is None:
        return 0
    redis_conn = redis_connect()
    if redis_conn is None:
        # The webdriver is already running; do not leave it behind.
        driver.quit()
        return 0

    try:
        while not c_thread.thread_stop:
            print("%s spider url" % (thread_num,))
            time.sleep(3)

            # Dequeue the next main URL.
            main_url = pop_redis_list(redis_conn, 'science_main_url')
            print("main_url: %s" % (main_url,))
            if main_url is None:
                print("redis connect error or queue is null")
                break

            mysql_conn = mysql_connect_localhost()
            if mysql_conn is None:
                print("mysql connect error")
                # The URL was already popped; requeue it so it is not lost.
                push_redis_list(redis_conn, 'science_main_url', main_url)
                break

            request_failed = False
            try:
                volume_url_list = get_volume_url(driver, main_url)
                if volume_url_list == 0:
                    request_failed = True
                elif volume_url_list:
                    # Resolve the final article URLs; the return value is
                    # only used for enqueueing below.
                    article_url_list_all = get_article_url(
                        driver, mysql_conn, main_url, volume_url_list)
                    if article_url_list_all == 0:
                        # Timeout handling: retry this main URL later.
                        request_failed = True
                    else:
                        print("push article redis")
                        for article_url in article_url_list_all:
                            push_redis_list(redis_conn,
                                            'science_article_url_tmp',
                                            article_url)
                # An empty volume list needs no further work.
            finally:
                # Runs on every path, including the empty-list case that
                # previously skipped the close via `continue`.
                mysql_conn.close()

            if request_failed:
                push_redis_list(redis_conn, 'science_main_url', main_url)
                driver.quit()
                time.sleep(2)
                print('restart webdriver')
                driver = get_webdriver()
                if driver is None:
                    # Nothing to crawl with; stop instead of crashing on the
                    # next iteration.
                    break
    finally:
        print("%s exit" % (thread_num,))
        if driver is not None:
            # quit() ends the whole browser session; close() would only
            # close the current window.
            driver.quit()
def loaddata(c_thread, thread_num, interval):
    """Crawl queued article URLs and hand each one to ``get_periodical``.

    Each iteration pops one URL from the ``science_article_url`` redis list
    and passes it, with the webdriver and a fresh MySQL connection, to
    ``get_periodical``.  A return value of ``0`` is treated as a failed
    request: the URL is pushed back onto the queue and the webdriver is
    restarted.

    The loop ends when ``c_thread.thread_stop`` becomes true, the queue
    yields ``None``, MySQL cannot be reached, or the webdriver cannot be
    restarted.

    Args:
        c_thread: Controlling thread object; only ``thread_stop`` is read.
        thread_num: Worker identifier, used in console output only.
        interval: Not used by this function; kept so callers need no change.

    Returns:
        ``0`` if the webdriver or the redis connection could not be
        created, otherwise ``None``.
    """
    # Every print takes one pre-formatted string so the output is identical
    # under the Python 2 print statement and the Python 3 print function.
    print("run......")
    driver = get_webdriver()
    if driver is None:
        return 0
    redis_conn = redis_connect()
    if redis_conn is None:
        # The webdriver is already running; do not leave it behind.
        driver.quit()
        return 0

    try:
        while not c_thread.thread_stop:
            print("%s spider article" % (thread_num,))
            url = pop_redis_list(redis_conn, 'science_article_url')
            if url is None:
                print("url queue is null")
                break
            print("%s =====================================" % (url,))

            mysql_conn = mysql_connect_localhost()
            if mysql_conn is None:
                print("mysql connect error!")
                # The URL was already popped; requeue it so it is not lost.
                push_redis_list(redis_conn, 'science_article_url', url)
                break

            try:
                time.sleep(2)
                request_flag = get_periodical(driver, url, mysql_conn)
            finally:
                # Close the connection even if get_periodical raises.
                mysql_conn.close()

            if request_flag == 0:
                push_redis_list(redis_conn, 'science_article_url', url)
                driver.quit()
                time.sleep(2)
                print('restart webdriver')
                driver = get_webdriver()
                if driver is None:
                    # Nothing to crawl with; stop instead of crashing on the
                    # next iteration.
                    break
    finally:
        # Shut down on exit.  Two spaces reproduce the original
        # `print thread_num, " quit"` output.
        print("%s  quit" % (thread_num,))
        if driver is not None:
            # quit() ends the whole browser session; close() would only
            # close the current window.
            driver.quit()
def loaddata(c_thread, thread_num, interval):
    """Fetch Qzone profile info for queued QQ ids and store it in mongodb.

    Each iteration pops a QQ id from the ``tencent_qzone_qq_info`` redis
    list and calls ``get_info`` on ``http://user.qzone.qq.com/<qq>/profile``:

    * result ``0``: the id is pushed to ``tencent_qzone_forbid_qq`` and
      ``qzone_login`` is called again for a fresh driver;
    * result ``1``: the id is pushed to ``tencent_qzone_forbid_qq``;
    * anything else is passed to ``load_mongodb_qzone_info``.

    The log file name carries the current date and is re-opened when the
    date changes.  On exit the driver is quit, ``rtx`` is called with a
    stop message, and the ``ThreadQzoneInfo`` row whose ``thread_name``
    equals ``thread_num`` gets ``thread_status = 0``.

    Args:
        c_thread: Controlling thread object; only ``thread_stop`` is read.
        thread_num: Worker name; used in log text and as the
            ``thread_name`` lookup key.
        interval: Not used by this function; kept so callers need no change.

    Returns:
        ``0`` if the initial ``qzone_login`` yields no driver, otherwise
        ``None``.
    """
    # Coerce once: the log-file name already used str(thread_num), while the
    # log messages concatenated the raw value.
    thread_label = str(thread_num)
    log_name_title = thread_label + "_tencent_qzone_info_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_label + "run......")

    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Names of the redis lists to pop from / push rejected ids to.
    redis_list_pop_name = "tencent_qzone_qq_info"
    redis_list_push_qzone_forbid_name = "tencent_qzone_forbid_qq"

    try:
        conn_redis = redis_connect()
        conn_mongo = connect_mongodb()
        # Compare explicitly instead of relying on truthiness.  None is
        # accepted as a failure marker in addition to the original 0.
        # NOTE(review): confirm which value the connect helpers return.
        if any(conn is None or conn == 0
               for conn in (conn_redis, conn_mongo)):
            log.info("redis or mongodb connect error")
        else:
            log.info("connect redis ok")
            log.info("connect mongodb ok")
            while not c_thread.thread_stop:
                # Switch to a new log file when the date rolls over.
                current_date = time.strftime("%Y%m%d", time.localtime())
                if current_date != base_date:
                    base_date = current_date
                    log = log_setting(log_name_title + base_date + ".log")

                print('Thread:(%s) Time:%s\n' % (thread_num, time.ctime()))
                qq = pop_redis_list(conn_redis, redis_list_pop_name)
                log.info('Thread:(%s) QQ:%s' % (thread_num, qq))
                # Stop when the queue is empty.
                if qq is None:
                    log.info("queue is NULL")
                    break

                # Fetch the detailed profile info.
                url = "http://user.qzone.qq.com/" + str(qq) + "/profile"
                info_list = get_info(driver, url, log)

                if info_list == 0:
                    # Request failed: park the id on the forbid queue.
                    push_redis_list_tmp(
                        conn_redis, redis_list_push_qzone_forbid_name, qq)
                    log.info(str(qq) + "请求失败,入队禁止访问消息队列")
                    # Release the old browser before logging in again so
                    # sessions do not pile up.
                    # NOTE(review): assumes qzone_login() returns a new
                    # driver on every call -- confirm.
                    try:
                        driver.quit()
                    except Exception as exc:
                        log.info("phantomjs quit failed: %r" % (exc,))
                    driver = qzone_login()
                    if driver is None:
                        log.info("phantomjs error!quit")
                        break
                elif info_list == 1:
                    push_redis_list_tmp(
                        conn_redis, redis_list_push_qzone_forbid_name, qq)
                    log.info(str(qq) + "入队禁止访问消息队列")
                else:
                    # Store the profile in mongodb.
                    log.info("load to mongodb")
                    try:
                        load_mongodb_qzone_info(conn_mongo, qq, info_list)
                    except Exception as exc:
                        rtx('ip', ip + "机器mongodb失败")
                        log.info('ip' + ip + "机器mongodb失败")
                        log.info("mongodb error")
                        log.info("mongodb error detail: %r" % (exc,))
                        break
    finally:
        # Shutdown runs even when the loop raises, so the thread is never
        # left marked as running.
        log.info(thread_label + "quit phantomjs")
        try:
            if driver is not None:
                driver.quit()
        finally:
            # rtx notification.
            rtx('ip', ip + "机器" + thread_label + "停止运行")
            log.info('ip' + ip + "机器" + thread_label + "停止运行")
            # Update the thread status row, looked up by thread name.
            log.info("更新数据库线程状态")
            thread = ThreadQzoneInfo.objects.get(thread_name=thread_num)
            thread.thread_status = 0
            thread.save()
def _friend_qqs_from_mood_page(driver, qq):
    """Collect commenter QQ ids from the page currently loaded in *driver*.

    Returns ``['-1']`` when the ``app_canvas_frame`` frame cannot be located
    or entered, ``['0']`` when the comment blocks cannot be located or
    parsed, and otherwise the distinct ids (excluding *qq*) taken from the
    first link of every ``comments_content`` element.
    """
    try:
        # Wait for the page to finish loading.
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.ID, "app_canvas_frame")))
        print("find frame id")
        driver.switch_to.frame('app_canvas_frame')
    except Exception:
        print("没有权限访问")
        return ['-1']

    try:
        # Wait for the comment elements inside the frame.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "comments_content")))
        print("find conment")
        soup = BeautifulSoup(driver.page_source)
        print("======")
        friend_qqs = set()
        for comment in soup.find_all(class_='comments_content'):
            # Drops the first 25 characters (the length of
            # "http://user.qzone.qq.com/") and the last 6 of the href.
            # NOTE(review): presumably what remains is the QQ id -- confirm.
            friend_qq = str(comment.find('a')['href'])[25:-6]
            print(friend_qq)
            if friend_qq != qq:
                friend_qqs.add(friend_qq)
        print(friend_qqs)
        friend_qq_list = list(friend_qqs)
        print(friend_qq_list)
        return friend_qq_list
    except Exception:
        print("not found conment")
        return ['0']


def loaddata(c_thread, thread_num, interval):
    """Crawl Qzone mood pages for queued QQ ids and record commenter ids.

    Each iteration pops a QQ id from the ``tencent_qzone_qq_test`` redis
    list, loads ``http://user.qzone.qq.com/<qq>/mood``, extracts commenter
    ids, inserts the ``get_tuple(qq, friends)`` rows through
    ``insert_mysql_qq``, and pushes the id and every list entry onto
    ``tencent_qzone_qq_tmp_test``.

    On exit the driver is quit, ``rtx`` is called with a stop message, and
    the ``Thread_qq_friend`` row whose ``thread_name`` equals ``thread_num``
    gets ``thread_status = 0``.

    Args:
        c_thread: Controlling thread object; only ``thread_stop`` is read.
        thread_num: Worker name; used in output, in the rtx message and as
            the ``thread_name`` lookup key.
        interval: Not used by this function; kept so callers need no change.

    Returns:
        ``0`` if ``qzone_login`` yields no driver, otherwise ``None``.
    """
    # Every print takes one pre-formatted string so the output is identical
    # under the Python 2 print statement and the Python 3 print function.
    print("run......")
    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        # The original had this message as a bare string expression, so it
        # was never printed.
        print("phantomjs error!quit")
        return 0

    redis_list_name_pop = "tencent_qzone_qq_test"
    redis_list_name_push = "tencent_qzone_qq_tmp_test"

    try:
        # Connect to redis.
        conn_redis = redis_connect()
        print("conn_redis %s" % (conn_redis,))
        if conn_redis is None:
            print("redis connect error")
        else:
            while not c_thread.thread_stop:
                print('qzone_qq_friend Thread:(%s) Time:%s\n'
                      % (thread_num, time.ctime()))
                qq = pop_redis_list(conn_redis, redis_list_name_pop)
                if qq is None:
                    print("queue is NULL")
                    break

                url = "http://user.qzone.qq.com/" + qq + "/mood"
                print("url %s" % (url,))
                driver.get(url)
                friend_qq_list = _friend_qqs_from_mood_page(driver, qq)
                print(friend_qq_list)

                # Store in MySQL: one (qq, friend_qq) tuple per friend.
                print("insert mysql")
                tmp_tuple = get_tuple(qq, friend_qq_list)
                print("insert into table ")
                mysql_conn = mysql_connect_local_qq()
                try:
                    insert_mysql_qq(mysql_conn, tmp_tuple)
                finally:
                    # Close the connection even if the insert raises.
                    if mysql_conn is not None:
                        mysql_conn.close()

                # Store in the temporary redis list.
                # NOTE(review): the '0' / '-1' placeholders are pushed as
                # well -- confirm consumers expect them.
                print("put mid redis")
                push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
                print("put auditor mid redis")
                for friend_qq in friend_qq_list:
                    push_redis_list_tmp(
                        conn_redis, redis_list_name_push, friend_qq)
    finally:
        # Shutdown runs even when the loop raises, so the thread is never
        # left marked as running.
        print("%s quit phantomjs" % (thread_num,))
        try:
            driver.quit()
        finally:
            # rtx notification.
            ip = get_ip()
            rtx('ip', ip + "机器" + thread_num + "停止运行")
            # Update the thread status row, looked up by thread name.
            print("更新数据库线程状态")
            thread = Thread_qq_friend.objects.get(thread_name=thread_num)
            thread.thread_status = 0
            thread.save()
def _friend_qqs_from_mood_page_logged(driver, qq, log):
    """Collect commenter QQ ids from the page currently loaded in *driver*.

    Returns ``['-1']`` when the ``app_canvas_frame`` frame cannot be located
    or entered, ``['0']`` when the comment blocks cannot be located or
    parsed, and otherwise the distinct ids (excluding *qq*) taken from the
    first link of every ``comments_content`` element.  Progress is written
    to *log*.
    """
    try:
        # Wait for the page to finish loading.
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.ID, "app_canvas_frame")))
        log.info("find frame id")
        driver.switch_to.frame('app_canvas_frame')
    except Exception as exc:
        log.info("没有权限访问")
        log.info("frame lookup error detail: %r" % (exc,))
        return ['-1']

    try:
        # Wait for the comment elements inside the frame.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "comments_content")))
        log.info("find conment")
        soup = BeautifulSoup(driver.page_source)
        print("======")
        friend_qqs = set()
        for comment in soup.find_all(class_='comments_content'):
            # Drops the first 25 characters (the length of
            # "http://user.qzone.qq.com/") and the last 6 of the href.
            # NOTE(review): presumably what remains is the QQ id -- confirm.
            friend_qq = str(comment.find('a')['href'])[25:-6]
            print(friend_qq)
            if friend_qq != qq:
                friend_qqs.add(friend_qq)
        print(friend_qqs)
        friend_qq_list = list(friend_qqs)
        print(friend_qq_list)
        return friend_qq_list
    except Exception as exc:
        log.info("not found conment")
        log.info("comment lookup error detail: %r" % (exc,))
        return ['0']


def loaddata(c_thread, thread_num, interval):
    """Crawl Qzone mood pages for queued QQ ids and record commenter ids.

    Each iteration pops a QQ id from the ``tencent_qzone_qq`` redis list,
    loads ``http://user.qzone.qq.com/<qq>/mood``, extracts commenter ids,
    inserts the ``get_tuple(qq, friends)`` rows through ``insert_mysql_qq``,
    and pushes the id and every list entry onto
    ``tencent_qzone_qq_transfer``.  MySQL and redis-push failures are
    reported through ``rtx`` and the log, and the loop continues.

    On exit the driver is quit, ``rtx`` is called with a stop message, and
    the ``Thread_qq_friend`` row whose ``thread_name`` equals ``thread_num``
    gets ``thread_status = 0``.

    Args:
        c_thread: Controlling thread object; only ``thread_stop`` is read.
        thread_num: Worker name; used in log text, in the rtx message and
            as the ``thread_name`` lookup key.
        interval: Not used by this function; kept so callers need no change.

    Returns:
        ``0`` if ``qzone_login`` yields no driver, otherwise ``None``.
    """
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")

    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    redis_list_name_pop = "tencent_qzone_qq"
    redis_list_name_push = "tencent_qzone_qq_transfer"
    # Resolved before connecting: the shutdown section below needs `ip` even
    # when redis is unavailable.
    ip = get_ip()

    try:
        # Connect to redis.
        conn_redis = redis_connect()
        print("conn_redis %s" % (conn_redis,))
        if conn_redis is None:
            log.info("redis connect error")
        else:
            log.info("redis connect ok")
            while not c_thread.thread_stop:
                log.info('qzone_qq_friend Thread:(%s) Time:%s'
                         % (thread_num, time.ctime()))
                qq = pop_redis_list(conn_redis, redis_list_name_pop)
                if qq is None:
                    log.info("queue is NULL")
                    break

                url = "http://user.qzone.qq.com/" + qq + "/mood"
                log.info("url" + url)
                driver.get(url)
                friend_qq_list = _friend_qqs_from_mood_page_logged(
                    driver, qq, log)
                print(friend_qq_list)

                # Store in MySQL: one (qq, friend_qq) tuple per friend.
                try:
                    log.info("insert mysql")
                    tmp_tuple = get_tuple(qq, friend_qq_list)
                    print("insert into table ")
                    mysql_conn = mysql_connect_qq()
                    try:
                        insert_mysql_qq(mysql_conn, tmp_tuple)
                    finally:
                        # Close the connection even if the insert raises.
                        if mysql_conn is not None:
                            mysql_conn.close()
                except Exception as exc:
                    rtx('ip', ip + "机器QQ空间关系链采集mysql出错")
                    log.info('ip' + ip + "机器QQ空间关系链采集mysql出错")
                    log.info("mysql error detail: %r" % (exc,))

                # Store in the temporary redis list.
                # NOTE(review): the '0' / '-1' placeholders are pushed as
                # well -- confirm consumers expect them.
                try:
                    log.info("put mid redis")
                    push_redis_list_tmp(
                        conn_redis, redis_list_name_push, qq)
                    log.info("put auditor mid redis")
                    for friend_qq in friend_qq_list:
                        push_redis_list_tmp(
                            conn_redis, redis_list_name_push, friend_qq)
                except Exception as exc:
                    rtx('ip', ip + "机器QQ空间关系链采集redis入队出错")
                    log.info('ip' + ip + "机器QQ空间关系链采集redis入队出错")
                    log.info("redis push error detail: %r" % (exc,))
    finally:
        # Shutdown runs even when the loop raises, so the thread is never
        # left marked as running.
        log.info(thread_num + "quit phantomjs")
        try:
            driver.quit()
        finally:
            # rtx notification.
            rtx('ip', ip + "机器" + thread_num + "停止运行")
            log.info('ip' + ip + "机器" + thread_num + "停止运行")
            # Update the thread status row, looked up by thread name.
            log.info("更新数据库线程状态")
            thread = Thread_qq_friend.objects.get(thread_name=thread_num)
            thread.thread_status = 0
            thread.save()