Ejemplo n.º 1
0
def main():
    fbuserhelper = FBUserHelper.FBUserHelper()
    # 登录
    # fbAccount = ourFBAccount('*****@*****.**', '1qaz@WSX3edc', None)
    fbAccount = OurFBAccount.getAccount('ourFBAccount.txt')
    logName = time.strftime('%Y-%m-%d', time.localtime(time.time())) + "--"
    myargs = {'fName': logName, 'fLevel': logging.DEBUG}
    logger = logHelper.getLogger('myLog', logging.INFO, **myargs)
    # 初始化
    logger.info("============================================================")
    browser, isLogin = FBUserCrawler.initCrawler("https://www.facebook.com/",
                                                 fbAccount)
    if not isLogin:
        logHelper.getLogger().info("login error.please check the log file!")
        time.sleep(10)
        exit(1)

    hn = socket.gethostname()
    ip = socket.gethostbyname(hn)
    spidername = '[{}]/{}'.format(ip, hn)

    while True:
        try:
            # 注意休眠
            while True:
                h = datetime.datetime.now().hour
                # print(datetime.datetime.now())
                if h < 7 or h >= 22:
                    time.sleep(random.randint(300, 1200))  # 随机睡眠 5分钟到20分钟
                else:
                    if h == 7:
                        # 早8点到早10点之间随机选择开始爬取
                        time.sleep(random.randint(3601,
                                                  7201))  # 随机一小时到俩小时分钟后开始爬取
                        logHelper.getLogger().info(
                            'Crawle begin at : ' +
                            datetime.datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S"))
                        if browser == None:
                            browser, isLogin = FBUserCrawler.initCrawler(
                                "https://www.facebook.com/", fbAccount)
                            if not isLogin:
                                logHelper.getLogger().info(
                                    "login error.please check the log file!")
                                time.sleep(10)
                                exit(1)
                        break
                    elif h >= 20:
                        # 晚8点到晚9点随机结束
                        rndTime = random.randint(0, 7201)
                        tmpTime = datetime.datetime.now() + datetime.timedelta(
                            seconds=rndTime)
                        if tmpTime.hour >= 22:
                            logHelper.getLogger().info(
                                'Crawle end at : ' +
                                datetime.datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S"))
                            time.sleep(rndTime)  # 睡到22点后
                            if browser != None:
                                FBUserCrawler.LogOut(browser)
                                browser.close()
                                browser = None

                        else:
                            break
                    else:
                        break

            if common.testDispatchServer() == 'Disconnected!':  #1.测试调度服务器连接
                logHelper.getLogger().info('Dispatch server is disconnected!')
                time.sleep(5)
                continue
            logHelper.getLogger().info('Dispatch server is connected!')

            logHelper.getLogger().debug('connecting database server ...')
            if common.testDatabaseServer() == 'Disconnected!':  #2.测试服务器连接
                logHelper.getLogger().info(
                    'Database server is disconnected!,trying again later.')
                time.sleep(5)
                continue
            print('Database server is connected!')

            spiderType = 'Facebook.userInfo'  #对应任务类型
            task = getFBUserTask(spiderType)

            taskid = task['id']
            if int(taskid) == -1:
                time.sleep(5)
                continue

            fbid = task['fbid']
            originalfbid = task['originalfbid']
            deep = int(task['deep'])  #当前任务深度,在保存tb_user_friends时+1保存入库
            name = task['name']
            priority = task['priority']

            fbUser = FBBaseUser(fbid, name, deep, priority)

            # logHelper.getLogger().debug(task)
            logHelper.getLogger().info(
                'Spider [{0}] have got a task:{1}/{2}/{3}. spider is working...'
                .format(spiderType, taskid, fbid, originalfbid))
            time.sleep(1)
            #开始单线程爬取工作#####################################################################################################
            try:
                # 填写分配任务时间和承担的spider=''
                fbuserhelper.UpdateTaskDispatch(int(taskid),
                                                spider=spidername[:50])

                resCrawled = FBUserCrawler.crawleInfo(browser, fbUser,
                                                      fbAccount)
                if resCrawled == 1:
                    #完成工作,入库
                    print('Spider have done the job. Saving the results...')

                    #告知调度端完成情况
                    print(
                        'Spider have saved the results,and reporting the job to dispatch server...'
                    )
                    k = reportFBTaskUserComplete(taskid, 2,
                                                 'completed normally.')
                    print('task id:{0},fbid:{1} has reported the job status.'.
                          format(taskid, fbid))
                elif resCrawled == 0:
                    logHelper.getLogger().warning(
                        'network error or unknown error! wait and try...')
                    k = reportFBTaskUserComplete(
                        taskid, 3,
                        'completed abnormally.Reason:network error or unknown error!'
                    )
                    time.sleep(50)
                elif resCrawled == 2:
                    print(
                        'our facebook account is forbiddened. Stop and change the account.'
                    )
                    k = reportFBTaskUserComplete(
                        taskid, 3,
                        'completed abnormally.Reason:our facebook account {} is forbiddened'
                        .format(fbAccount.u))
                    break
                elif resCrawled == 3:
                    print(
                        'the target facebook account is invalid. task completed.'
                    )
                    k = reportFBTaskUserComplete(
                        taskid, 2,
                        'completed normal.Reason:the target facebook account {} is invalid'
                        .format(fbUser.fbid))
                    print('task id:{0},fbid:{1} has reported the job status.'.
                          format(taskid, fbid))

            except Exception as e1:
                logHelper.getLogger().error('Scrawling error!', e1)
                reportFBTaskUserComplete(
                    taskid, 3, 'completed abnormally.Error:{0}'.format(e1))
                #raise  e1

            time.sleep(5)
            # 结束单线程爬取工作#####################################################################################################
        except Exception as e:
            logHelper.getLogger().error(e)
            logHelper.getLogger().error('the main loop error,restart it!')
            time.sleep(50)
Ejemplo n.º 2
0
def main():
    fbuserhelper = FBUserHelper.FBUserHelper()

    # # 登录
    # # fbAccount = ourFBAccount('*****@*****.**', '1qaz@WSX3edc', None)
    # fbAccount = OurFBAccount.getAccount('ourFBAccount.txt')
    # logName = time.strftime('%Y-%m-%d', time.localtime(time.time())) + ".log"
    # myargs = {'fName': logName, 'fLevel': logging.DEBUG}
    # logger = logHelper.getLogger('myLog', logging.INFO, **myargs)
    # # 初始化
    # logger.info("============================================================")
    # browser, isLogin = FBUserCrawler.initCrawler("https://www.facebook.com/", fbAccount)
    # if not isLogin:
    #     logHelper.getLogger().info("login error.please check the log file!")
    #     time.sleep(10)
    #     exit(1)

    while True:
        try:
            if common.testDispatchServer() == 'Disconnected!':  #1.测试调度服务器连接
                logHelper.getLogger().info('Dispatch server is disconnected!')
                time.sleep(5)
                continue
            logHelper.getLogger().info('Dispatch server is connected!')

            logHelper.getLogger().debug('connecting database server ...')
            if common.testDatabaseServer() == 'Disconnected!':  #2.测试服务器连接
                logHelper.getLogger().info(
                    'Database server is disconnected!,trying again later.')
                time.sleep(5)
                continue
            print('Database server is connected!')

            spiderType = 'Facebook.groupInfo'  #对应任务类型
            task = getFBUserTask(spiderType)

            taskid = task['id']
            if int(taskid) == -1:
                time.sleep(5)
                continue

            fbid = task['fbid']
            originalfbid = task['originalfbid']
            deep = int(task['deep'])  #当前任务深度,在保存tb_user_friends时+1保存入库
            name = task['name']
            priority = task['priority']

            # logHelper.getLogger().debug(task)
            logHelper.getLogger().info(
                'Spider [{0}] have got a task:{1}/{2}/{3}. spider is working...'
                .format(spiderType, taskid, fbid, originalfbid))
            time.sleep(1)
            #开始单线程爬取工作#####################################################################################################
            try:
                #if FBUserCrawler.crawleInfo(browser, fbUser, fbAccount) == 1:
                if 1 == 1:
                    #完成工作,入库
                    print('Spider have done the job. Saving the results...')
                    #填写分配任务时间和承担的spider=''
                    fbuserhelper.UpdateTaskDispatch(int(taskid))

                    #告知调度端完成情况
                    print(
                        'Spider have saved the results,and reporting the job to dispatch server...'
                    )
                    k = reportFBTaskUserComplete(taskid, 2,
                                                 'completed normally.')
                    print('task id:{0},fbid:{1} has reported the job status.'.
                          format(taskid, fbid))

            except Exception as e1:
                logHelper.getLogger().error(e1)
                reportFBTaskUserComplete(
                    taskid, 3, 'completed abnormally.Error:{0}'.format(e1))
                #raise  e1

            time.sleep(5)
            # 结束单线程爬取工作#####################################################################################################
        except Exception as e:
            logHelper.getLogger().error(e)
            logHelper.getLogger().error('the main loop error,restart it!')
            time.sleep(50)
Ejemplo n.º 3
0
def main():
    fbuserhelper = FBUserHelper.FBUserHelper()
    while True:
        try:
            if common.testDispatchServer() == 'Disconnected!':  #1.测试调度服务器连接
                logHelper.getLogger().info('Dispatch server is disconnected!')
                time.sleep(5)
                continue
            logHelper.getLogger().info('Dispatch server is connected!')

            time.sleep(random.randint(1,2))

            logHelper.getLogger().debug('connecting database server ...')
            if common.testDatabaseServer() == 'Disconnected!':  #2.测试服务器连接
                logHelper.getLogger().info('Database server is disconnected!,trying again later.')
                time.sleep(5)
                continue
            print('Database server is connected!')

            spiderType = 'Facebook.userInfo'    #对应任务类型
            task = getFBUserTask(spiderType)

            taskid = task['id']
            if int(taskid) == -1:
                time.sleep(5)
                continue

            fbid = task['fbid']
            originalfbid = task['originalfbid']
            deep = int(task['deep'])     #当前任务深度,在保存tb_user_friends时+1保存入库
            name = task['name']
            priority = task['priority']

            fbUser = FBBaseUser(fbid,name,deep,priority)

            # logHelper.getLogger().debug(task)
            logHelper.getLogger().info('Spider [{0}] have got a task:{1}/{2}/{3}. spider is working...'.format(spiderType,taskid,fbid,originalfbid))
            time.sleep(3)
            #开始单线程爬取工作#####################################################################################################
            try:
                # 填写分配任务时间和承担的spider=''
                fbuserhelper.UpdateTaskDispatch(int(taskid))

                #完成工作,入库
                print('Spider have done the job. Saving the results...')

                #告知调度端完成情况
                print('Spider have saved the results,and reporting the job to dispatch server...')
                k = reportFBTaskUserComplete(taskid, 2, 'completed normally.')
                print('task id:{0},fbid:{1} has reported the job status.'.format(taskid,fbid))

            except Exception as e1:
                logHelper.getLogger().error('Scrawling error!',e1)
                reportFBTaskUserComplete(taskid, 3, 'completed abnormally.Error:{0}'.format(e1))

            time.sleep(5)
            # 结束单线程爬取工作#####################################################################################################
        except Exception as e:
            logHelper.getLogger().error(e)
            logHelper.getLogger().error('the main loop error,restart it!')
            time.sleep(50)
Ejemplo n.º 4
0
def main(spiderType):
    # spiderType = 'Twitter.userTimeline'

    twuserhelper = TWUserHelper()

    logName = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    myargs = {'fName': logName, 'fLevel': logging.DEBUG}
    logger = logHelper.getLogger('myLog', logging.INFO, **myargs)
    #
    logger.info("============================================================")

    hn = socket.gethostname()
    ip = socket.gethostbyname(hn)
    spidername = '[{}]/{}'.format(ip, hn)

    while True:
        try:
            if common.testDispatchServer() == 'Disconnected!':
                logHelper.getLogger().info('Dispatch server is disconnected!')
                time.sleep(5)
                continue
            logHelper.getLogger().info('Dispatch server is connected!')

            logHelper.getLogger().debug('connecting database server ...')
            if common.testDispatchServer() == 'Disconnected!':
                logHelper.getLogger().info(
                    'Database server is disconnected!,trying again later.')
                time.sleep(5)
                continue
            print('Database server is connected!')

            task = getTWUserTask(spiderType)

            taskid = task['id']
            if int(taskid) == -1:
                logHelper.getLogger().info(
                    'No [{0}] task to do,sleep a while.'.format(spiderType))
                time.sleep(5)
                continue

            fbid = task['fbid']
            originalfbid = task['originalfbid']
            deep = int(task['deep'])
            name = task['name']
            priority = task['priority']

            # logHelper.getLogger().debug(task)
            logHelper.getLogger().info(
                'Spider [{0}] have got a task:TASKID:{1}/TWID:{2}/PID:{3}. spider is working...'
                .format(spiderType, taskid, fbid, originalfbid))
            time.sleep(1)
            ######################################################################################################
            try:

                twuserhelper.UpdateTaskDispatch(int(taskid),
                                                spider=spidername[:50])

                # Do the job here.
                resCrawled = twusercrawler.dospider(task)

                if resCrawled == 1:

                    print(
                        'Spider have done the job,save the results,and reporting to dispatch server.'
                    )
                    k = reportTWTaskUserComplete(taskid, 2,
                                                 'completed normally.')
                    print('TASK id:{0},fbid:{1} has reported the job status.'.
                          format(taskid, fbid))

                elif resCrawled == 0:
                    logHelper.getLogger().warning(
                        'network error or unknown error! wait and try...')
                    print('network error or unknown error! wait and try...')
                    k = reportTWTaskUserComplete(
                        taskid, 3,
                        'completed abnormally.Reason:network error or unknown error!'
                    )
                    time.sleep(50)

            except Exception as e1:
                logHelper.getLogger().error('Scrawling error!', e1)
                reportTWTaskUserComplete(
                    taskid, 3, 'completed abnormally.Error:{0}'.format(e1))
                raise e1

            time.sleep(1)
        except Exception as e:
            logHelper.getLogger().error(e)
            logHelper.getLogger().error('the main loop error,restart it!')
            time.sleep(50)