def loaddata(c_thread, thread_num, interval):
    """Worker loop: expand queued main URLs into article URLs.

    Each iteration pops one main URL from the ``science_main_url`` redis
    list, resolves its volume URLs with ``get_volume_url`` and then its
    article URLs with ``get_article_url``, and pushes every article URL onto
    the ``science_article_url_tmp`` redis list. When either helper returns
    ``0`` (handled here as a timeout) the main URL is pushed back onto its
    queue and the webdriver is restarted.

    Args:
        c_thread: Controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: Identifier of this worker, used only in console output.
        interval: Unused by this function.

    Returns:
        ``0`` when the webdriver or the redis connection cannot be created
        before the loop starts; otherwise ``None``.
    """
    print("run......")
    driver = get_webdriver()
    if driver is None:
        return 0
    redis_conn = redis_connect()
    if redis_conn is None:
        # Do not leave the browser process running when we bail out early.
        driver.quit()
        return 0

    try:
        while not c_thread.thread_stop:
            print("%s spider url" % (thread_num,))
            time.sleep(3)
            # Dequeue the next main URL.
            main_url = pop_redis_list(redis_conn, 'science_main_url')

            print("main_url: %s" % (main_url,))
            if main_url is None:
                print("redis connect error or queue is null")
                break

            mysql_conn = mysql_connect_localhost()
            if mysql_conn is None:
                print("mysql connect error")
                # The URL was already popped; put it back so it is not lost.
                push_redis_list(redis_conn, 'science_main_url', main_url)
                break

            restart_driver = False
            try:
                volume_url_list = get_volume_url(driver, main_url)
                if volume_url_list == 0:
                    restart_driver = True
                elif volume_url_list:
                    # Fetch the final article URLs; the return value is only
                    # used for enqueueing below.
                    article_url_list_all = get_article_url(
                        driver, mysql_conn, main_url, volume_url_list)
                    if article_url_list_all == 0:
                        # Timeout handling.
                        restart_driver = True
                    else:
                        # Push the results onto the message queue.
                        print("push article redis")
                        for article_url in article_url_list_all:
                            push_redis_list(redis_conn,
                                            'science_article_url_tmp',
                                            article_url)
                # An empty volume list simply falls through to the next URL.
            finally:
                # Always release the connection, including on the empty-list
                # path and when a helper raises.
                mysql_conn.close()

            if restart_driver:
                push_redis_list(redis_conn, 'science_main_url', main_url)
                # Drop our reference first so the outer ``finally`` never
                # quits the same driver twice.
                stale_driver, driver = driver, None
                stale_driver.quit()
                time.sleep(2)
                print('restart webdriver')
                driver = get_webdriver()
                if driver is None:
                    print("restart webdriver failed")
                    break
    finally:
        print("%s exit" % (thread_num,))
        if driver is not None:
            # quit() ends the whole browser session; close() would only close
            # the current window and leave the driver process behind.
            driver.quit()
Beispiel #2
0
def loaddata(c_thread, thread_num, interval):
    """Worker loop: scrape queued article URLs.

    Each iteration pops one URL from the ``science_article_url`` redis list
    and hands it to ``get_periodical`` together with the webdriver and a
    fresh MySQL connection. When ``get_periodical`` returns ``0`` the URL is
    pushed back onto the queue and the webdriver is restarted.

    Args:
        c_thread: Controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: Identifier of this worker, used only in console output.
        interval: Unused by this function.

    Returns:
        ``0`` when the webdriver or the redis connection cannot be created
        before the loop starts; otherwise ``None``.
    """
    print("run......")

    driver = get_webdriver()
    if driver is None:
        return 0
    redis_conn = redis_connect()
    if redis_conn is None:
        # Do not leave the browser process running when we bail out early.
        driver.quit()
        return 0

    try:
        while not c_thread.thread_stop:
            print("%s spider article" % (thread_num,))
            url = pop_redis_list(redis_conn, 'science_article_url')
            if url is None:
                print("url queue is null")
                break

            print("%s =====================================" % (url,))
            mysql_conn = mysql_connect_localhost()
            if mysql_conn is None:
                print("mysql connect error!")
                # The URL was already popped; put it back so it is not lost.
                push_redis_list(redis_conn, 'science_article_url', url)
                break

            try:
                time.sleep(2)
                request_flag = get_periodical(driver, url, mysql_conn)
            finally:
                # Release the connection even when the scrape raises.
                mysql_conn.close()

            if request_flag == 0:
                push_redis_list(redis_conn, 'science_article_url', url)
                # Drop our reference first so the outer ``finally`` never
                # quits the same driver twice.
                stale_driver, driver = driver, None
                stale_driver.quit()
                time.sleep(2)
                print('restart webdriver')
                driver = get_webdriver()
                if driver is None:
                    print("restart webdriver failed")
                    break
    finally:
        # Finished: report and shut the browser down.
        print("%s  quit" % (thread_num,))
        if driver is not None:
            # quit() ends the whole browser session; close() would only close
            # the current window and leave the driver process behind.
            driver.quit()
def loaddata(c_thread, thread_num, interval):
    """Worker loop: fetch Qzone profile info for queued QQ numbers.

    Each iteration pops a QQ number from the ``tencent_qzone_qq_info`` redis
    list, loads ``http://user.qzone.qq.com/<qq>/profile`` through
    ``get_info`` and acts on the result:

    * ``0`` - logged as a failed request; the QQ is pushed onto the
      ``tencent_qzone_forbid_qq`` list and the browser logs in again.
    * ``1`` - the QQ is pushed onto the ``tencent_qzone_forbid_qq`` list.
    * anything else - stored with ``load_mongodb_qzone_info``.

    The log file name carries the current date and is re-created when the
    date changes. After the loop ends normally an ``rtx`` notification is
    sent and the matching ``ThreadQzoneInfo`` row gets ``thread_status = 0``.

    Args:
        c_thread: Controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: Identifier of this worker; used in the log file name, in
            messages and as the ``thread_name`` lookup key.
        interval: Unused by this function.

    Returns:
        ``0`` when the initial login yields no driver; otherwise ``None``.
    """
    # thread_num is only guaranteed to be printable, so normalise it once for
    # every string concatenation below.
    thread_label = str(thread_num)
    log_name_title = thread_label + "_tencent_qzone_info_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_label + "run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    conn_redis = redis_connect()
    conn_mongo = connect_mongodb()
    # Names of the redis lists to pop from / push to.
    redis_list_pop_name = "tencent_qzone_qq_info"
    redis_list_push_qzone_forbid_name = "tencent_qzone_forbid_qq"

    # Accept both 0 and None as "connection failed". Explicit comparisons are
    # used instead of truthiness because connection objects may not support
    # boolean testing.
    connect_failed = any(conn is None or conn == 0
                         for conn in (conn_redis, conn_mongo))
    if connect_failed:
        log.info("redis or mongodb connect error")
        # Do not leave the browser process running when we bail out.
        driver.quit()
        return

    log.info("connect redis ok")
    log.info("connect mongodb ok")
    try:
        while not c_thread.thread_stop:
            # Switch to a new log file when the date rolls over.
            current_date = time.strftime("%Y%m%d", time.localtime())
            if current_date != base_date:
                base_date = current_date
                log = log_setting(log_name_title + base_date + ".log")

            print('Thread:(%s) Time:%s\n' % (thread_num, time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_pop_name)
            log.info('Thread:(%s) QQ:%s' % (thread_num, qq))
            # Stop when the queue is empty.
            if qq is None:
                log.info("queue is NULL")
                break

            # Fetch the detailed profile info.
            url = "http://user.qzone.qq.com/" + str(qq) + "/profile"
            info_list = get_info(driver, url, log)

            if info_list == 0:
                # Put the QQ onto the forbidden-access redis queue.
                push_redis_list_tmp(conn_redis,
                                    redis_list_push_qzone_forbid_name, qq)
                log.info(str(qq) + "请求失败,入队禁止访问消息队列")
                # Shut the old browser down before logging in again so it is
                # not left running without an owner.
                stale_driver, driver = driver, None
                try:
                    stale_driver.quit()
                except Exception:
                    log.info("quit phantomjs failed")
                driver = qzone_login()
                if driver is None:
                    log.info("phantomjs error!quit")
                    break
            elif info_list == 1:
                # Put the QQ onto the forbidden-access redis queue.
                push_redis_list_tmp(conn_redis,
                                    redis_list_push_qzone_forbid_name, qq)
                log.info(str(qq) + "入队禁止访问消息队列")
            else:
                # Store the result in mongodb.
                log.info("load to mongodb")
                try:
                    load_mongodb_qzone_info(conn_mongo, qq, info_list)
                except Exception:
                    rtx('ip', ip + "机器mongodb失败")
                    log.info('ip' + ip + "机器mongodb失败")
                    log.info("mongodb error")
                    break
    finally:
        log.info(thread_label + "quit phantomjs")
        if driver is not None:
            driver.quit()

    # rtx notification.
    rtx('ip', ip + "机器" + thread_label + "停止运行")
    log.info('ip' + ip + "机器" + thread_label + "停止运行")
    # Update the thread status in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = ThreadQzoneInfo.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect the QQ numbers that commented on a Qzone page.

    Each iteration pops a QQ number from the ``tencent_qzone_qq_test`` redis
    list, opens ``http://user.qzone.qq.com/<qq>/mood`` and reads the
    ``comments_content`` elements inside the ``app_canvas_frame`` frame. The
    resulting friend list is:

    * the distinct commenter QQ numbers other than ``qq`` itself,
    * ``['0']`` when the comment elements cannot be read,
    * ``['-1']`` when the frame cannot be found or entered.

    The (qq, friend) pairs are inserted into MySQL, and ``qq`` plus every
    friend entry are pushed onto the ``tencent_qzone_qq_tmp_test`` redis
    list. After the loop ends normally an ``rtx`` notification is sent and
    the matching ``Thread_qq_friend`` row gets ``thread_status = 0``.

    Args:
        c_thread: Controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: Identifier of this worker; used in messages and as the
            ``thread_name`` lookup key.
        interval: Unused by this function.

    Returns:
        ``0`` when the initial login yields no driver; otherwise ``None``.
    """
    print("run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        # The original had this message as a bare string expression, so it
        # was never shown.
        print("phantomjs error!quit")
        return 0

    # Connect to redis.
    conn_redis = redis_connect()
    redis_list_name_pop = "tencent_qzone_qq_test"
    redis_list_name_push = "tencent_qzone_qq_tmp_test"
    print("conn_redis %s" % (conn_redis,))
    if conn_redis is None:
        print("redis connect error")
        # Do not leave the browser process running when we bail out.
        driver.quit()
        return

    try:
        while not c_thread.thread_stop:
            print('qzone_qq_friend Thread:(%s) Time:%s\n' % (thread_num,
                                                             time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_name_pop)
            if qq is None:
                print("queue is NULL")
                break

            url = "http://user.qzone.qq.com/" + qq + "/mood"
            print("url %s" % (url,))
            driver.get(url)
            try:
                # Wait for the page to finish loading.
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, "app_canvas_frame")))
                print("find frame id")
                driver.switch_to.frame('app_canvas_frame')
            except Exception:
                print("没有权限访问")
                friend_qq_list = ['-1']
            else:
                try:
                    # Wait for the element inside the frame to exist.
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "comments_content")))
                    print("find conment")
                    html = driver.page_source
                    soup = BeautifulSoup(html)
                    print("======")
                    my_set = set()
                    for i in soup.find_all(class_='comments_content'):
                        # Drops the first 25 and last 6 characters of the
                        # link - presumably the fixed URL text around the QQ
                        # number; TODO confirm against a live page.
                        friend_qq = str(i.find('a')['href'])[25:-6]
                        print(friend_qq)
                        if friend_qq != qq:
                            my_set.add(friend_qq)
                    print(my_set)
                    friend_qq_list = list(my_set)
                    print(friend_qq_list)
                except Exception:
                    print("not found conment")
                    friend_qq_list = ['0']

            print(friend_qq_list)
            # ---- store in mysql ----
            print("insert mysql")
            # Build the (qq, friend_qq) tuples, one per friend.
            tmp_tuple = get_tuple(qq, friend_qq_list)
            # Insert into the mysql database.
            print("insert into table ")
            mysql_conn = mysql_connect_local_qq()
            try:
                insert_mysql_qq(mysql_conn, tmp_tuple)
            finally:
                # Close the database connection even when the insert raises.
                mysql_conn.close()

            # ---- store in the temporary redis list ----
            print("put mid redis")
            push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
            print("put auditor mid redis")
            for friend_qq in friend_qq_list:
                push_redis_list_tmp(conn_redis, redis_list_name_push,
                                    friend_qq)
    finally:
        print("%s quit phantomjs" % (thread_num,))
        driver.quit()

    # rtx notification.
    ip = get_ip()
    rtx('ip', ip + "机器" + str(thread_num) + "停止运行")
    # Update the thread status in the database, looked up by thread name.
    print("更新数据库线程状态")
    thread = Thread_qq_friend.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect commenter QQ numbers from Qzone, with logging.

    Each iteration pops a QQ number from the ``tencent_qzone_qq`` redis list,
    opens ``http://user.qzone.qq.com/<qq>/mood`` and reads the
    ``comments_content`` elements inside the ``app_canvas_frame`` frame. The
    resulting friend list is:

    * the distinct commenter QQ numbers other than ``qq`` itself,
    * ``['0']`` when the comment elements cannot be read,
    * ``['-1']`` when the frame cannot be found or entered.

    The (qq, friend) pairs are inserted into MySQL, and ``qq`` plus every
    friend entry are pushed onto the ``tencent_qzone_qq_transfer`` redis
    list. A failure in either storage step is reported through ``rtx`` and
    the log, and the loop carries on. After the loop ends normally an ``rtx``
    notification is sent and the matching ``Thread_qq_friend`` row gets
    ``thread_status = 0``.

    Args:
        c_thread: Controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: Identifier of this worker; used in messages and as the
            ``thread_name`` lookup key.
        interval: Unused by this function.

    Returns:
        ``0`` when the initial login yields no driver; otherwise ``None``.
    """
    # thread_num is only guaranteed to be printable, so normalise it once for
    # every string concatenation below.
    thread_label = str(thread_num)
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_label + "run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Connect to redis.
    conn_redis = redis_connect()
    redis_list_name_pop = "tencent_qzone_qq"
    redis_list_name_push = "tencent_qzone_qq_transfer"
    print("conn_redis %s" % (conn_redis,))
    if conn_redis is None:
        log.info("redis connect error")
        # Do not leave the browser process running when we bail out.
        driver.quit()
        return

    log.info("redis connect ok")
    ip = get_ip()
    try:
        while not c_thread.thread_stop:
            log.info('qzone_qq_friend Thread:(%s) Time:%s' %
                     (thread_num, time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_name_pop)
            if qq is None:
                log.info("queue is NULL")
                break

            url = "http://user.qzone.qq.com/" + qq + "/mood"
            log.info("url" + url)
            driver.get(url)
            try:
                # Wait for the page to finish loading.
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, "app_canvas_frame")))
                log.info("find frame id")
                driver.switch_to.frame('app_canvas_frame')
            except Exception:
                log.info("没有权限访问")
                friend_qq_list = ['-1']
            else:
                try:
                    # Wait for the element inside the frame to exist.
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "comments_content")))
                    log.info("find conment")
                    html = driver.page_source
                    soup = BeautifulSoup(html)
                    print("======")
                    my_set = set()
                    for i in soup.find_all(class_='comments_content'):
                        # Drops the first 25 and last 6 characters of the
                        # link - presumably the fixed URL text around the QQ
                        # number; TODO confirm against a live page.
                        friend_qq = str(i.find('a')['href'])[25:-6]
                        print(friend_qq)
                        if friend_qq != qq:
                            my_set.add(friend_qq)
                    print(my_set)
                    friend_qq_list = list(my_set)
                    print(friend_qq_list)
                except Exception:
                    log.info("not found conment")
                    friend_qq_list = ['0']

            print(friend_qq_list)
            # ---- store in mysql ----
            try:
                log.info("insert mysql")
                # Build the (qq, friend_qq) tuples, one per friend.
                tmp_tuple = get_tuple(qq, friend_qq_list)
                # Insert into the mysql database.
                print("insert into table ")
                mysql_conn = mysql_connect_qq()
                try:
                    insert_mysql_qq(mysql_conn, tmp_tuple)
                finally:
                    # Close the connection even when the insert raises.
                    mysql_conn.close()
            except Exception:
                rtx('ip', ip + "机器QQ空间关系链采集mysql出错")
                log.info('ip' + ip + "机器QQ空间关系链采集mysql出错")

            # ---- store in the temporary redis list ----
            try:
                log.info("put mid redis")
                push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
                log.info("put auditor mid redis")
                for friend_qq in friend_qq_list:
                    push_redis_list_tmp(conn_redis, redis_list_name_push,
                                        friend_qq)
            except Exception:
                rtx('ip', ip + "机器QQ空间关系链采集redis入队出错")
                log.info('ip' + ip + "机器QQ空间关系链采集redis入队出错")
    finally:
        log.info(thread_label + "quit phantomjs")
        driver.quit()

    # rtx notification.
    rtx('ip', ip + "机器" + thread_label + "停止运行")
    log.info('ip' + ip + "机器" + thread_label + "停止运行")
    # Update the thread status in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = Thread_qq_friend.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()