Exemple #1
0
def loaddata(c_thread, thread_num, interval):
    """Worker loop: pop message URLs from redis, scrape them, store in MongoDB.

    Logs in with ``qq_login()``, then repeatedly pops a URL with
    ``pop_redis_list``, fetches it with ``get_msg`` and stores the result
    with ``load_mongodb`` until ``c_thread.thread_stop`` is set, the queue is
    empty, or a MongoDB write fails.  Afterwards the browser is closed, an
    rtx notification is sent and the ``ThreadMsg`` row whose ``thread_name``
    equals ``thread_num`` gets ``thread_status = 0``.

    Args:
        c_thread: controller object; only its ``thread_stop`` flag is read.
        thread_num: thread name used in log lines, the rtx message and the
            ``ThreadMsg`` lookup.
        interval: not used by this function.

    Returns:
        0 when the browser could not be started, otherwise None.
    """
    log_name_title = "tencent_wb_msg_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info("run......")
    driver = qq_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    conn_redis = redis_connect()
    conn_mongo = connect_mongodb()

    # The connection helpers do not agree on a failure sentinel (None vs 0),
    # so accept either one from either helper.
    redis_failed = conn_redis is None or conn_redis == 0
    mongo_failed = conn_mongo is None or conn_mongo == 0
    if redis_failed or mongo_failed:
        log.info("redis or mongodb connect error")
        # Do not leave the browser process running when we cannot work.
        driver.quit()
        return None

    log.info("connect redis ok")
    log.info("connect mongodb ok")
    try:
        while not c_thread.thread_stop:
            # Switch to a new log file when the calendar day changes.
            current_date = time.strftime("%Y%m%d", time.localtime())
            if current_date != base_date:
                base_date = current_date
                log = log_setting(log_name_title + base_date + ".log")
            log.info('Thread:(%s)' % (thread_num))
            url = pop_redis_list(conn_redis)
            # An empty queue ends the worker.
            if url is None:
                log.info("msg queue is NULL")
                break
            # Fetch the message details for this URL.
            msg = get_msg(driver, url, log)
            try:
                load_mongodb(conn_mongo, url, msg)
            except Exception:
                rtx('ip', ip + "机器mongodb失败")
                log.info('ip' + ip + "机器mongodb失败")
                log.info("mongodb error")
                break
    finally:
        # Always release the browser, even if scraping raised unexpectedly.
        log.info(thread_num + "quit phantomjs")
        driver.quit()

    # Notify via rtx that this worker has stopped.
    rtx('ip', ip + "机器" + thread_num + "停止运行")
    log.info('ip' + ip + "机器" + thread_num + "停止运行")
    # Mark the thread as stopped in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = ThreadMsg.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
Exemple #2
0
def mysql_connect():
    """Open a connection to the ``db_tencent_wb`` MySQL database.

    Returns:
        The ``MySQLdb`` connection, or None when connecting fails; in that
        case an error line is printed and an rtx alert is sent.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving
    # them into configuration.
    try:
        mysql_conn = MySQLdb.connect("192.168.8.25", "qzone_spider",
                                     "qzone_spider", "db_tencent_wb")
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        print("connect mysql error")
        rtx('IP', 'mysql连接异常')
        return None
    return mysql_conn
Exemple #3
0
def mysql_connect_local_qq():
    """Open a connection to the local ``db_tencent_qzone`` MySQL database.

    Returns:
        The ``MySQLdb`` connection, or None when connecting fails; in that
        case an error line is printed and an rtx alert is sent.
    """
    # NOTE(review): credentials are hard-coded; consider moving them into
    # configuration.
    try:
        mysql_conn = MySQLdb.connect("localhost", "qzone_spider",
                                     "qzone_spider", "db_tencent_qzone")
    except Exception:
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        print("connect mysql error")
        rtx('IP', 'mysql连接异常')
        return None
    return mysql_conn
Exemple #4
0
def redis_connect():
    """Create a redis client for db 0 and verify the server is reachable.

    Returns:
        The ``redis.Redis`` client, or None when the server cannot be
        reached; in that case an rtx alert is sent and an error is printed.
    """
    # To connect with a password instead:
    # r = redis.StrictRedis(host='localhost', port=6379, password='******')
    try:
        redis_conn = redis.Redis(host='192.168.15.111', port=6379, db=0)
        # Constructing the client does not open a socket; issue a PING so an
        # unreachable server is detected here instead of at first use.
        redis_conn.ping()
    except Exception:
        rtx('IP', 'redis连接异常')
        print("connect redis error")
        return None
    return redis_conn
Exemple #5
0
def connect_mongodb():
    """Create a MongoDB client and verify the server is reachable.

    Returns:
        The ``MongoClient`` instance, or 0 when the server cannot be reached
        (callers compare the result against 0); an rtx alert is sent in that
        case.
    """
    # Current client API (the legacy one was
    # pymongo.Connection("192.168.15.111", 27017)).
    try:
        conn = MongoClient("192.168.15.111", 27017)
        # Recent PyMongo versions connect in the background, so the
        # constructor alone does not fail for an unreachable server; a cheap
        # ``ping`` forces a round trip (bounded by the server selection
        # timeout).
        conn.admin.command('ping')
    except Exception:
        conn = 0
        rtx('IP','mongodb连接异常')
    return conn
Exemple #6
0
def control_thread(request):
    """Handle a POST request that starts or stops a collector thread.

    Reads ``id`` (thread name) and ``control`` (``'start'`` or ``'stop'``)
    from ``request.POST``, drives ``ThreadControl`` accordingly, records the
    new ``thread_status`` on the matching ``ThreadMsg`` row and renders
    ``index.html`` with the threads registered for this machine's IP.

    Args:
        request: the incoming request; ``request.POST`` must contain ``id``
            and ``control``.

    Returns:
        The response produced by ``render_to_response('index.html', ...)``.
    """
    th_name = request.POST['id']
    control = request.POST['control']
    print("thread_name is  %s" % (th_name,))
    # Flag passed to the template as ``msg_active``.
    msg_active = True
    thread = ThreadMsg.objects.get(thread_name=th_name)
    if control == 'start':
        rtx('ip', '进程' + str(th_name) + '  开始采集标签信息')
        c = ThreadControl()
        # An error from is_alive() is taken to mean the thread does not
        # exist yet, so it is started in the handler.
        try:
            status = c.is_alive(th_name)
            print("thread is alive?  %s" % (status,))
            if status:
                print("thread is alive,caonot start twice!")
            else:
                print("start ..........thread1")
                c.start(th_name, 1)
        except Exception:
            print("thread is not alive start!!!")
            c.start(th_name, 1)
        thread.thread_status = 1
        thread.save()
    elif control == 'stop':
        rtx('ip', '进程' + str(th_name) + '  采集标签信息即将停止')
        c = ThreadControl()
        try:
            c.stop(th_name)
            thread.thread_status = 0
            thread.save()
        except Exception:
            print("not thread alive")

    IP = get_ip()
    thread_list = ThreadMsg.objects.filter(thread_ip=IP)
    return render_to_response(
        'index.html', {
            "thread_name": th_name,
            "control": control,
            "thread_list": thread_list,
            "msg_active": msg_active
        })
def loaddata(c_thread, thread_num, interval):
    """Worker loop: pop QQ numbers from redis and store their Qzone profiles.

    Logs in with ``qzone_login()``, then repeatedly pops a QQ number from the
    ``tencent_qzone_qq_info`` list, scrapes its profile page with
    ``get_info`` and either stores the result in MongoDB or, when
    ``get_info`` returns 0 or 1, pushes the QQ number onto the
    ``tencent_qzone_forbid_qq`` list.  A result of 0 additionally triggers a
    fresh login.  The loop ends when ``c_thread.thread_stop`` is set, the
    queue is empty, a re-login fails, or a MongoDB write fails.  Afterwards
    the browser is closed, an rtx notification is sent and the
    ``ThreadQzoneInfo`` row named ``thread_num`` gets ``thread_status = 0``.

    Args:
        c_thread: controller object; only its ``thread_stop`` flag is read.
        thread_num: thread name used for the log file name, log lines, the
            rtx message and the ``ThreadQzoneInfo`` lookup.
        interval: not used by this function.

    Returns:
        0 when the browser could not be started, otherwise None.
    """
    log_name_title = str(thread_num) + "_tencent_qzone_info_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    conn_redis = redis_connect()
    conn_mongo = connect_mongodb()
    # Redis list names: source queue and the "forbidden" queue.
    redis_list_pop_name = "tencent_qzone_qq_info"
    redis_list_push_qzone_forbid_name = "tencent_qzone_forbid_qq"

    # The connection helpers do not agree on a failure sentinel (None vs 0),
    # so accept either one from either helper.
    redis_failed = conn_redis is None or conn_redis == 0
    mongo_failed = conn_mongo is None or conn_mongo == 0
    if redis_failed or mongo_failed:
        log.info("redis or mongodb connect error")
        # Do not leave the browser process running when we cannot work.
        driver.quit()
        return None

    log.info("connect redis ok")
    log.info("connect mongodb ok")
    try:
        while not c_thread.thread_stop:
            # Switch to a new log file when the calendar day changes.
            current_date = time.strftime("%Y%m%d", time.localtime())
            if current_date != base_date:
                base_date = current_date
                log = log_setting(log_name_title + base_date + ".log")

            print('Thread:(%s) Time:%s\n' % (thread_num, time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_pop_name)
            log.info('Thread:(%s) QQ:%s' % (thread_num, qq))
            # An empty queue ends the worker.
            if qq is None:
                log.info("queue is NULL")
                break

            url = "http://user.qzone.qq.com/" + str(qq) + "/profile"
            info_list = get_info(driver, url, log)

            if info_list == 0:
                # Request failed: park the QQ number and log in again.
                push_redis_list_tmp(conn_redis,
                                    redis_list_push_qzone_forbid_name, qq)
                log.info(qq + "请求失败,入队禁止访问消息队列")
                # Close the old browser first so its process is not leaked.
                try:
                    driver.quit()
                except Exception:
                    # A broken browser may refuse to quit; keep going.
                    pass
                driver = None
                driver = qzone_login()
                if driver is None:
                    log.info("phantomjs error!quit")
                    break
            elif info_list == 1:
                # Park the QQ number on the forbidden queue.
                push_redis_list_tmp(conn_redis,
                                    redis_list_push_qzone_forbid_name, qq)
                log.info(qq + "入队禁止访问消息队列")
            else:
                log.info("load to mongodb")
                try:
                    load_mongodb_qzone_info(conn_mongo, qq, info_list)
                except Exception:
                    rtx('ip', ip + "机器mongodb失败")
                    log.info('ip' + ip + "机器mongodb失败")
                    log.info("mongodb error")
                    break
    finally:
        # Always release the browser; it is None after a failed re-login.
        log.info(thread_num + "quit phantomjs")
        if driver is not None:
            driver.quit()

    # Notify via rtx that this worker has stopped.
    rtx('ip', ip + "机器" + thread_num + "停止运行")
    log.info('ip' + ip + "机器" + thread_num + "停止运行")
    # Mark the thread as stopped in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = ThreadQzoneInfo.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect the QQ numbers of commenters on Qzone mood pages.

    Logs in with ``qzone_login()``, then repeatedly pops a QQ number from the
    ``tencent_qzone_qq_test`` redis list, opens its ``/mood`` page, extracts
    commenter QQ numbers from ``comments_content`` elements, writes the
    (qq, friend) pairs to MySQL and pushes the QQ number and its friends
    onto ``tencent_qzone_qq_tmp_test``.  The friend list is ``['-1']`` when
    the page frame cannot be found and ``['0']`` when no comments are found.
    The loop ends when ``c_thread.thread_stop`` is set or the queue is empty.
    Afterwards the browser is closed, an rtx notification is sent and the
    ``Thread_qq_friend`` row named ``thread_num`` gets ``thread_status = 0``.

    Args:
        c_thread: controller object; only its ``thread_stop`` flag is read.
        thread_num: thread name used in printed lines, the rtx message and
            the ``Thread_qq_friend`` lookup.
        interval: not used by this function.

    Returns:
        0 when the browser could not be started, otherwise None.
    """
    print("run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        # The original had this message as a bare string expression, so it
        # was never shown.
        print("phantomjs error!quit")
        return 0

    # Connect to redis.
    conn_redis = redis_connect()
    redis_list_name_pop = "tencent_qzone_qq_test"
    redis_list_name_push = "tencent_qzone_qq_tmp_test"
    print("conn_redis %s" % (conn_redis,))
    if conn_redis is None:
        print("redis connect error")
        # Do not leave the browser process running when we cannot work.
        driver.quit()
        return None

    try:
        while not c_thread.thread_stop:
            print('qzone_qq_friend Thread:(%s) Time:%s\n' % (thread_num,
                                                             time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_name_pop)
            if qq is None:
                print("queue is NULL")
                break

            url = "http://user.qzone.qq.com/" + qq + "/mood"
            print("url %s" % (url,))
            driver.get(url)
            try:
                # Wait for the page frame to be present.
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, "app_canvas_frame")))
                print("find frame id")
                driver.switch_to.frame('app_canvas_frame')
                try:
                    # Wait for comment elements inside the frame.
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "comments_content")))
                    print("find conment")
                    html = driver.page_source
                    soup = BeautifulSoup(html)
                    print("======")
                    my_set = set()
                    for i in soup.find_all(class_='comments_content'):
                        # The QQ number is sliced out of the link's href.
                        friend_qq = str(i.find('a')['href'])[25:-6]
                        print(friend_qq)
                        if friend_qq != qq:
                            my_set.add(friend_qq)
                    print(my_set)
                    friend_qq_list = list(my_set)
                    print(friend_qq_list)
                except Exception:
                    print("not found conment")
                    friend_qq_list = ['0']
            except Exception:
                print("没有权限访问")
                friend_qq_list = ['-1']

            print(friend_qq_list)
            # ---- store in MySQL ----
            print("insert mysql")
            # Build the (qq, friend_qq) tuples.
            tmp_tuple = get_tuple(qq, friend_qq_list)
            print("insert into table ")
            mysql_conn = mysql_connect_local_qq()
            if mysql_conn is None:
                # mysql_connect_local_qq() has already reported the failure;
                # skip the insert instead of crashing on None.
                print("insert mysql error")
            else:
                try:
                    insert_mysql_qq(mysql_conn, tmp_tuple)
                finally:
                    # Close the connection even if the insert raised.
                    mysql_conn.close()

            # ---- push to the temporary redis list ----
            print("put mid redis")
            push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
            print("put auditor mid redis")
            for friend_qq in friend_qq_list:
                push_redis_list_tmp(conn_redis, redis_list_name_push,
                                    friend_qq)
    finally:
        # Always release the browser, even if scraping raised unexpectedly.
        print("%s quit phantomjs" % (thread_num,))
        driver.quit()

    # Notify via rtx that this worker has stopped.
    ip = get_ip()
    rtx('ip', ip + "机器" + thread_num + "停止运行")
    # Mark the thread as stopped in the database, looked up by thread name.
    print("更新数据库线程状态")
    thread = Thread_qq_friend.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
Exemple #9
0
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect the accounts each weibo user listens to.

    Logs in with ``qq_login()``, then repeatedly pops a user id (``mid``)
    from redis, resolves the user's listener pages with
    ``get_auditor_page_url_via_url`` and the listened-to ids with
    ``get_auditor_main_url``, writes the (mid, auditor_mid) pairs to MySQL
    and pushes ``mid`` and every auditor id back to redis with
    ``push_redis_list_tmp``.  When the listener pages cannot be resolved the
    browser is restarted with a fresh login.  The loop ends when
    ``c_thread.thread_stop`` is set, the queue is empty or a re-login fails.
    Afterwards the browser is closed, an rtx notification is sent and the
    ``Threadauditor`` row named ``thread_num`` gets ``thread_status = 0``.

    Args:
        c_thread: controller object; only its ``thread_stop`` flag is read.
        thread_num: thread name used in log lines, the rtx message and the
            ``Threadauditor`` lookup.
        interval: not used by this function.

    Returns:
        0 when the browser could not be started, otherwise None.
    """
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qq_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Connect to redis.
    conn_redis = redis_connect()
    if conn_redis is None:
        log.info("redis connect error")
        # Do not leave the browser process running when we cannot work.
        driver.quit()
        return None

    log.info("connect redis ok")
    ip = get_ip()
    try:
        while not c_thread.thread_stop:
            # Switch to a new log file when the calendar day changes.
            current_date = time.strftime("%Y%m%d", time.localtime())
            if current_date != base_date:
                base_date = current_date
                log = log_setting(log_name_title + base_date + ".log")
            log.info('Thread:(%s)' % (thread_num))
            mid = pop_redis_list(conn_redis)
            if mid is None:
                log.info("queue is NULL")
                break

            url = "http://t.qq.com/" + str(mid)
            log.info("url is: " + url)
            time.sleep(3)
            # Resolve every listener page from the user's home page URL.
            auditor_page_url_list = get_auditor_page_url_via_url(
                driver, url)
            if auditor_page_url_list is None:
                log.info("page is not personal,login again")
                driver.quit()
                # Cleared so the cleanup below never re-quits this browser.
                driver = None
                driver = qq_login()
                if driver is None:
                    break
                continue

            # Resolve the listened-to user ids from those pages.
            mid_list = get_auditor_main_url(driver, auditor_page_url_list)
            if mid_list is None:
                continue

            # ---- store in MySQL ----
            try:
                log.info("insert mysql")
                # Build the (mid, auditor_mid) tuples.
                tmp_tuple = get_tuple(mid, mid_list)
                print("insert into table ")
                mysql_conn = mysql_connect()
                try:
                    insert_mysql(mysql_conn, tmp_tuple)
                finally:
                    # Close the connection even if the insert raised;
                    # mysql_connect() returns None on failure.
                    if mysql_conn is not None:
                        mysql_conn.close()
            except Exception:
                rtx('ip', ip + "机器mysql出错")
                log.info('ip' + ip + "机器mysql出错")
                log.info("insert mysql error")

            # ---- push to the temporary redis list ----
            try:
                log.info("put mid redis")
                push_redis_list_tmp(conn_redis, mid)
                log.info("put auditor mid redis")
                for auditor_mid in mid_list:
                    push_redis_list_tmp(conn_redis, auditor_mid)
            except Exception:
                rtx('ip', ip + "机器redis出错")
                log.info('ip' + ip + "机器redis出错")
                log.info("insert redis error")
    finally:
        # Always release the browser; it is None after a failed re-login.
        log.info(thread_num + "quit phantomjs")
        if driver is not None:
            driver.quit()

    # Notify via rtx that this worker has stopped.
    rtx('ip', ip + "机器" + thread_num + "停止运行")
    log.info('ip' + ip + "机器" + thread_num + "停止运行")
    # Mark the thread as stopped in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = Threadauditor.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()
Exemple #10
0
def qq_login(_attempt=1):
    """Start PhantomJS and log a randomly chosen account in to t.qq.com.

    A ``TencentUser`` row is picked by a random ``user_id`` in
    ``1..count()``.  Login is considered successful when the browser ends up
    on ``http://t.qq.com/<tencent_wb_name>``.  On failure the browser is
    closed and the whole procedure is retried with a new random account, up
    to a fixed number of attempts.

    Args:
        _attempt: internal retry counter; callers should not pass it.

    Returns:
        The logged-in PhantomJS driver, or None when PhantomJS cannot be
        started or every login attempt failed.
    """
    max_attempts = 10  # same retry budget as qzone_login()

    USER_COUNT = TencentUser.objects.count()
    PROXY_COUNT = TencentProxy.objects.count()
    print('USER_COUNT %s' % (USER_COUNT,))
    print('PROXY_COUNT %s' % (PROXY_COUNT,))
    # NOTE(review): assumes user_id / proxy_id values are contiguous from 1.
    user_number = random.randint(1, USER_COUNT)
    # Pick a proxy when any are configured.
    # NOTE(review): the proxy is looked up and printed but is not passed to
    # PhantomJS here (the only code that used it was a commented-out Windows
    # variant) - confirm whether it should be applied via service_args.
    if PROXY_COUNT == 0:
        proxy_status = False
    else:
        proxy_number = random.randint(1, PROXY_COUNT)
        print("proxy_number %s" % (proxy_number,))
        proxy_object = TencentProxy.objects.get(proxy_id=proxy_number)
        proxy = '--proxy=' + proxy_object.proxy_ip
        proxy_status = True
        print("proxy %s" % (proxy,))

    print('user_number is %s' % (user_number,))
    print("proxy_status %s" % (proxy_status,))

    # Fetch the login credentials of the chosen account.
    user = TencentUser.objects.get(user_id=user_number)
    login_name = user.login_name
    login_pwd = user.login_password
    tencent_wb_name = user.tencent_wb_name

    # Try to start PhantomJS; give up after six consecutive failures.
    driver = None
    for _ in range(6):
        try:
            driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
            break
        except Exception:
            print("PhantomJS error,wait a moment!")
            time.sleep(2)
    if driver is None:
        rtx('IP','连接phantomjs失败,检查phantomjs是否可用')
        return None

    logged_in = False
    try:
        print("start get main")
        driver.get("http://t.qq.com/")
        print("get over")
        time.sleep(3)
        driver.switch_to.frame("login_div")
        driver.find_element_by_id("switcher_plogin").click()
        driver.find_element_by_id("u").send_keys(login_name)
        driver.find_element_by_id("p").send_keys(login_pwd)
        driver.find_element_by_id("login_button").click()
        time.sleep(10)
        print("driver.current_url is %s" % (driver.current_url,))
        # Login succeeded when we landed on the account's home page.
        logged_in = driver.current_url == "http://t.qq.com/" + tencent_wb_name
        if not logged_in:
            print("url not match!")
    except Exception:
        print("login error!")
        rtx('IP','登陆异常,检查帐密或者代理是否可用')

    if logged_in:
        return driver

    # Failed: close this browser and retry with another random account.
    try:
        driver.quit()
    except Exception:
        # A broken browser may refuse to quit; nothing more to do with it.
        pass
    if _attempt >= max_attempts:
        rtx('ip', 'qq login error')
        return None
    # Return the retry's driver; the original dropped it and handed back the
    # browser that had just been quit.
    return qq_login(_attempt + 1)
Exemple #11
0
def qzone_login():
    """Start PhantomJS and log a randomly chosen account in to Qzone.

    A ``TencentUser`` row is picked by a random ``user_id`` in
    ``1..count()``; its stored password is base64-decoded before use.  Login
    is considered successful when the browser ends up on
    ``http(s)://user.qzone.qq.com/<qq_qzone_name>``.  Up to ten login
    attempts are made, each with a freshly started browser.

    Returns:
        The logged-in PhantomJS driver, or None when PhantomJS cannot be
        started or all login attempts failed (an rtx alert is sent).
    """
    max_login_attempts = 10
    max_driver_attempts = 5

    USER_COUNT = TencentUser.objects.count()
    # NOTE(review): assumes user_id values are contiguous from 1.
    user_number = random.randint(1, USER_COUNT)

    # Fetch the login credentials of the chosen account.
    user = TencentUser.objects.get(user_id=user_number)
    login_name = user.login_name
    # The password is stored base64-encoded.
    login_pwd = base64.decodestring(user.login_password)
    qq_qzone_name = user.qq_qzone_name
    expected_urls = (
        "http://user.qzone.qq.com/" + str(qq_qzone_name),
        "https://user.qzone.qq.com/" + str(qq_qzone_name),
    )

    for _login_try in range(max_login_attempts):
        # Start a fresh browser for this attempt.
        driver = None
        for _driver_try in range(max_driver_attempts):
            try:
                driver = webdriver.PhantomJS(
                    executable_path='/usr/local/phantomjs/bin/phantomjs',
                    service_log_path='/data/tmp/ghostdriver.log')
                break
            except Exception:
                print("PhantomJS error,wait a moment!")
                time.sleep(10)
        if driver is None:
            # PhantomJS would not start at all; retrying the login is
            # pointless (the original crashed here on an unbound ``driver``).
            rtx('IP', '连接phantomjs失败,检查phantomjs是否可用')
            return None

        try:
            driver.get("http://i.qq.com/")
            time.sleep(3)
            driver.switch_to.frame("login_frame")
            driver.find_element_by_id("switcher_plogin").click()
            driver.find_element_by_id("u").send_keys(login_name)
            driver.find_element_by_id("p").send_keys(login_pwd)
            driver.find_element_by_id("login_button").click()
            time.sleep(10)
            print("driver.current_url is %s" % (driver.current_url,))
            print("match is :  %s" % (expected_urls[0],))
            if driver.current_url in expected_urls:
                # Success on any attempt, including the last one, returns
                # the live driver (the original discarded a success on the
                # final attempt).
                return driver
            print("url 不一致!")
        except Exception:
            print("login error!")

        # Failed attempt: close this browser before trying again.
        try:
            driver.quit()
        except Exception:
            # A broken browser may refuse to quit; nothing more to do.
            pass

    rtx('ip', 'qq login error')
    return None
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect the QQ numbers of commenters on Qzone mood pages.

    Logs in with ``qzone_login()``, then repeatedly pops a QQ number from the
    ``tencent_qzone_qq`` redis list, opens its ``/mood`` page, extracts
    commenter QQ numbers from ``comments_content`` elements, writes the
    (qq, friend) pairs to MySQL and pushes the QQ number and its friends
    onto ``tencent_qzone_qq_transfer``.  The friend list is ``['-1']`` when
    the page frame cannot be found and ``['0']`` when no comments are found.
    MySQL and redis-push failures are reported via rtx and the loop carries
    on.  The loop ends when ``c_thread.thread_stop`` is set or the queue is
    empty.  Afterwards the browser is closed, an rtx notification is sent
    and the ``Thread_qq_friend`` row named ``thread_num`` gets
    ``thread_status = 0``.

    Args:
        c_thread: controller object; only its ``thread_stop`` flag is read.
        thread_num: thread name used in log lines, the rtx message and the
            ``Thread_qq_friend`` lookup.
        interval: not used by this function.

    Returns:
        0 when the browser could not be started, otherwise None.
    """
    # NOTE(review): the log prefix says "wb_auditor" although this worker
    # scrapes Qzone - confirm whether that is intended.
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qzone_login()
    time.sleep(3)

    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Connect to redis.
    conn_redis = redis_connect()
    redis_list_name_pop = "tencent_qzone_qq"
    redis_list_name_push = "tencent_qzone_qq_transfer"
    print("conn_redis %s" % (conn_redis,))
    if conn_redis is None:
        log.info("redis connect error")
        # Do not leave the browser process running when we cannot work.
        driver.quit()
        return None

    log.info("redis connect ok")
    ip = get_ip()
    try:
        while not c_thread.thread_stop:
            log.info('qzone_qq_friend Thread:(%s) Time:%s' %
                     (thread_num, time.ctime()))
            qq = pop_redis_list(conn_redis, redis_list_name_pop)
            if qq is None:
                log.info("queue is NULL")
                break

            url = "http://user.qzone.qq.com/" + qq + "/mood"
            log.info("url" + url)
            driver.get(url)
            try:
                # Wait for the page frame to be present.
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, "app_canvas_frame")))
                log.info("find frame id")
                driver.switch_to.frame('app_canvas_frame')
                try:
                    # Wait for comment elements inside the frame.
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "comments_content")))
                    log.info("find conment")
                    html = driver.page_source
                    soup = BeautifulSoup(html)
                    print("======")
                    my_set = set()
                    for i in soup.find_all(class_='comments_content'):
                        # The QQ number is sliced out of the link's href.
                        friend_qq = str(i.find('a')['href'])[25:-6]
                        print(friend_qq)
                        if friend_qq != qq:
                            my_set.add(friend_qq)
                    print(my_set)
                    friend_qq_list = list(my_set)
                    print(friend_qq_list)
                except Exception:
                    log.info("not found conment")
                    friend_qq_list = ['0']
            except Exception:
                log.info("没有权限访问")
                friend_qq_list = ['-1']

            print(friend_qq_list)
            # ---- store in MySQL ----
            try:
                log.info("insert mysql")
                # Build the (qq, friend_qq) tuples.
                tmp_tuple = get_tuple(qq, friend_qq_list)
                print("insert into table ")
                mysql_conn = mysql_connect_qq()
                try:
                    insert_mysql_qq(mysql_conn, tmp_tuple)
                finally:
                    # Close the connection even if the insert raised.
                    if mysql_conn is not None:
                        mysql_conn.close()
            except Exception:
                rtx('ip', ip + "机器QQ空间关系链采集mysql出错")
                log.info('ip' + ip + "机器QQ空间关系链采集mysql出错")

            # ---- push to the temporary redis list ----
            try:
                log.info("put mid redis")
                push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
                log.info("put auditor mid redis")
                for friend_qq in friend_qq_list:
                    push_redis_list_tmp(conn_redis, redis_list_name_push,
                                        friend_qq)
            except Exception:
                rtx('ip', ip + "机器QQ空间关系链采集redis入队出错")
                log.info('ip' + ip + "机器QQ空间关系链采集redis入队出错")
    finally:
        # Always release the browser, even if scraping raised unexpectedly.
        log.info(thread_num + "quit phantomjs")
        driver.quit()

    # Notify via rtx that this worker has stopped.
    rtx('ip', ip + "机器" + thread_num + "停止运行")
    log.info('ip' + ip + "机器" + thread_num + "停止运行")
    # Mark the thread as stopped in the database, looked up by thread name.
    log.info("更新数据库线程状态")
    thread = Thread_qq_friend.objects.get(thread_name=thread_num)
    thread.thread_status = 0
    thread.save()