Example #1
    def __init__(self, params=None, subConfigDict=None):
        """
        :param params: highest-priority config; it can override everything and is usually passed in from the command line
        :param subConfigDict: second-highest-priority config; it overrides the parent class and usually holds fixed subclass settings, or is used for ...
        :return:
        """

        newCfg = params or subConfigDict
        if newCfg:
            # unified config access point: merges the global, class, and input
            # configs so everything can be read through gConfig
            # parseParams converts the incoming str-typed config into a dict
            configIn = self.parseParams(params)  # input first
            if subConfigDict:
                # if subConfigDict is non-empty, merge the parsed params into it
                subConfigDict.update(configIn)  # subClass second
            else:
                subConfigDict = configIn
            # record the input config in the global single-point config store
            gTop.get(GD_CFG_IN).update(configIn)
            # merge the subclass config into the global config
            gConfig.update(subConfigDict)
            # working environment: normalize to uppercase
            gConfig.set("env", gConfig.get("env").upper())  # make sure it is capitalized

        # create the logger
        createLogger(gConfig)
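
parseParams is referenced above but not shown in these examples. A minimal sketch of what such a string-to-dict converter might look like, assuming command-line params arrive as space-separated key=value pairs (the env=ONLINE job.id=18230 style seen in later docstrings); the parsing rules here are an assumption, not the library's actual implementation:

    def parseParams(self, params):
        """Hypothetical sketch: turn "env=ONLINE job.id=18230" into a dict.

        Only illustrates the str-to-dict conversion described in the comments
        above; the real parseParams is not shown in these examples.
        """
        config = {}
        for pair in str(params or "").split():
            if "=" in pair:
                key, _, value = pair.partition("=")
                config[key.strip()] = value.strip()
        return config

    # parseParams("env=ONLINE job.id=18230")
    # -> {"env": "ONLINE", "job.id": "18230"}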
Example #2
 def testBrowser(self, browser="chrome", headless="no"):
     logInfo("test-browser-begin")
     from spiderx.common.constant import CFG_HTTP_BROWSER, CFG_HTTP_BROWSERMODE
     gConfig.set(CFG_HTTP_BROWSER, browser)
     gConfig.set(CFG_HTTP_BROWSERMODE, headless)
     http = SeleniumAgent()
     http.get("http://www.baidu.com")
     #showProcess()
     time.sleep(5)
     logInfo("test-browser-done")
Example #3
 def __init__(self):
     # jobEnable is only on when a jobId is present
     gConfig.set(CFG_JOB_ENABLE, gConfig.get(CFG_JOB_ID, 0))
     if gConfig.get(CFG_JOB_ENABLE, 0):
         # job manager
         from jobManager.job import Job
         self.job = Job()
     else:
         self.job = None
     self.jobError = None
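
Every example here leans on gConfig as a process-wide key-value store with dotted string keys. A minimal stand-in matching the get/set/update calls used throughout (an illustration of the observed interface, not the real class):

class _GConfigSketch(object):
    """Hypothetical stand-in for gConfig: a flat dict keyed by dotted strings."""

    def __init__(self):
        self._data = {}

    def get(self, key, default=None):
        return self._data.get(key, default)

    def set(self, key, value):
        self._data[key] = value

    def update(self, other):
        self._data.update(other)

# usage mirroring the examples:
# gConfig.set("env", "ONLINE")
# gConfig.get("job.id", 0)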
Example #4
 def downFile(self):
     """
     down file,
     :return:
     """
     gConfig.set(CFG_DOWN_INDEX,
                 "pinyin_sogou_com/dict/437")  # 索引需要统一设计,多级,便于存储检索
     gConfig.set(CFG_HTTP_OUTFORMAT, "file")  #
     url = "http://download.pinyin.sogou.com/dict/download_cell.php?id=20614&name=dota%20DOTA%E3%80%90%E5%AE%98%E6%96%B9%E6%8E%A8%E8%8D%90%E3%80%91"
     self.downOneDetail(url, None, FileSaveHandler(fileName="dota"))
Example #5
    def detailTest(self):
        """
        get familiar with the flexible config, including deep nesting
        http://www.jobui.com/company/10375749/
        :return:
        """

        # global spider params: in a real project, set these in __init__
        gConfig.set(CFG_DOWN_INDEX,
                    "www_jobui_com/company")  # indexes need a unified multi-level design for storage and retrieval

        url = "http://www.jobui.com/company/10375749/"
        company_detail_conf = {
            "logo":
            CssElement("div.company-logo > a > img", "src"),
            "name":
            CssElement("#companyH1 > a"),
            "type":
            CssElement("#cmp-intro > div > div > dl > dd:nth-child(2)"),
            "industry":
            CssElement("#cmp-intro > div > div > dl > dd.comInd > a"),
            "short_name":
            CssElement("#cmp-intro > div > div > dl > dd.gray3"),
            "info":
            CssElement("#textShowMore", text=False),
            # the key part to note: ListElement
            "rank":
            ListElement("div.swf-contA > ul.swf-gnfo > li",
                        itemCssElement={
                            "name": CssElement("dfn"),
                            "stars": CssElement(".f60"),
                        }),
            # the key part to note: EmmbedElement
            "others":
            EmmbedElement(
                embedDict={
                    "address":
                    CssElement(
                        "div.cfix > div.s-wrapper > dl.fs16 > dd:nth-child(2)"
                    ),
                    "website":
                    CssElement(
                        "div.cfix > div.s-wrapper > dl.fs16 > dd:nth-child(4)")
                }),
        }

        self.downOneDetail(url, company_detail_conf, HourlySaveLocalHandler())

        # when the task completes, jobDone must be called
        self.jobDone()
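
For orientation, the nested config above would plausibly produce a result shaped like this; the values are made up, only the structure follows from the CssElement/ListElement/EmmbedElement nesting:

# hypothetical extraction result for company_detail_conf (values invented):
company_detail = {
    "logo": "http://.../logo.png",
    "name": "...",
    "type": "...",
    "industry": "...",
    "short_name": "...",
    "info": '<div id="textShowMore">...</div>',  # text=False keeps the markup
    "rank": [                                    # ListElement -> list of dicts
        {"name": "...", "stars": "..."},
    ],
    "others": {                                  # EmmbedElement -> nested dict
        "address": "...",
        "website": "...",
    },
}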
Example #6
    def testJob(self):
        """
        env=ONLINE,job.id=18230 testJob
        :return:
        """
        proc = psutil.Process(os.getpid())
        info = "\nkill Job %s--%s-%s" % (proc.pid, " ".join(
            proc.cmdline()), getTimestampBySec(proc.create_time()))
        logInfo(info)
        return  # NOTE: early return; the DB statements below are currently disabled

        db = gTop.get(CFG_DB_MONITOR)
        gConfig.set("debug.sql", 1)
        db.insert("block", {"jobName": "ttest"})
        db.update("block", {"jobName": "ttest2"}, "where id=1")
        JobUtil().jobStatusInfo({"account": 100, "ip": 300, "num": 3450})
Example #7
 def testIP(self, selenium=0, browser='chrome'):
     selenium = int(selenium)
     from spiderx.common.utility.httpUtil import RequestsAgent
     from spiderx.common.constant import CFG_HTTP_BROWSER
     gConfig.set(CFG_HTTP_BROWSER, browser)
     http = RequestsAgent() if not selenium else SeleniumAgent()
     parser = etree.HTMLParser(encoding="utf-8")
     extractor = Extractor(http, parser)
     ips = []
     for i in range(2):
         ip = getExtIP(extractor)
         logInfo("%s:%s" % (i, ip))
         http.setNewSession()
         ips.append(ip)
     logInfo("~~~~~~!!!!!!!!!!!!!the unique ip is ---------------%s" %
             (len(set(ips))))
Example #8
    def __init__(self, params, subConfig=None):
        self.basicConfig = {
            # http download related
            CFG_HTTP_INTERVAL: 0.01,  # request interval
            CFG_HTTP_TIMEOUT: 10,
            CFG_HTTP_OUTFORMAT: 'html',  # or json
            CFG_HTTP_ENCODING: 'utf-8',  # or gbk
            CFG_HTTP_UNESCAPE: 0,  # remove special character quoting
            CFG_HTTP_ENGINE: 'requests',  # or selenium
            CFG_HTTP_UA: 'windows',  # or mac, ios, android
            CFG_HTTP_BROWSERMODE: 'headless',
            CFG_HTTP_BROWSER: BROWSER_TPE_PHANTOMJS,
            # max requests per session; 0 means unlimited, otherwise the
            # session is restarted once the count is exceeded
            CFG_HTTP_MAXREQUEST: 0,
            CFG_JOB_RUNTIME: 0,  # spider run time in seconds; 0 means unlimited
            CFG_JOB_HEARTBEAT: 60,  # job heartbeat interval in seconds
            CFG_DOWN_MAXNUM: 0,  # max downloads per run; 0 means unlimited
            CFG_DOWN_MAXPAGENUM: 0,  # max pages per run; 0 means unlimited
            # anti-block element check: more than 100 failures (the default)
            # counts as blocked, though the page structure may simply have changed
            CFG_BLOCK_MAXCHECK: 100,
            CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
        }

        if subConfig:
            # if subConfig is provided, overlay it on the basic config
            self.basicConfig.update(subConfig)
        # merge the basic config into the global config and write it to the log
        BaseClass.__init__(self, params, self.basicConfig)
        # dependent config must come right after BaseClass.__init__
        if not gConfig.get(CFG_DOWN_ROOT, None):
            gConfig.set(
                CFG_DOWN_ROOT, "d:/" if gConfig.get("env") == "ONLINE"
                and not isLinux() else PROJECT_ROOT)
        SpiderJobUtil.__init__(self)

        # use RequestsAgent unless the configured engine is selenium
        self.http = RequestsAgent() if gConfig.get(
            CFG_HTTP_ENGINE, "requests") != "selenium" else SeleniumAgent()
        # build an lxml parser with the globally configured encoding
        self.parser = etree.HTMLParser(encoding=gConfig.get(CFG_HTTP_ENCODING))
        # wire the HTTP agent and parser into the content extractor
        self.extractor = Extractor(self.http, self.parser)
        self.antiBlock = None

        # the job begins
        self.syncPoint = self.jobBegin()
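
To make the three config layers concrete: the defaults in basicConfig are overridden by subConfig, which in turn is overridden by the params string (see Example #1). A hypothetical subclass wiring, with the class names assumed for illustration:

# sketch only: priority order is defaults < subConfig < params
class JobuiSpider(SpiderBase):  # SpiderBase = the base class defined above (name assumed)
    def __init__(self, params=None):
        subConfig = {
            CFG_HTTP_ENCODING: 'utf-8',              # subclass overrides a default
            CFG_DOWN_INDEX: 'www_jobui_com/company',
        }
        SpiderBase.__init__(self, params, subConfig)

# a command-line string overrides both lower layers:
# spider = JobuiSpider("env=ONLINE")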
Example #9
    def testAccount(self, acType="webSite"):
        gConfig.set(CFG_DB_BUSINESS, "loginHelperDb")
        source = "www_tianyancha_com"  #"""""test_cxx_com"
        if acType == "cookie":
            ids = (465, 466, 467, 468, 469, 470)
            ac = CookieAccounter()
            for i in ids:
                ac.addAccount(i, "test-cookie-%s" % i)
        else:
            ac = WebAccounter()
            a = ac.getAccounts(source="www_tianyancha_com")
            #ac.updateAccount(a['id'])
            for i in range(2):
                ac.addAccount(phone="1380000000%s" % i,
                              pwd="test",
                              source=source)

        workAccounts = ac.getMoreAccounts(source,
                                          status=ACCOUNT_WORK,
                                          maxNum=5)
        for idx, ac2 in enumerate(workAccounts):
            ac.updateAccount(ac2['id'], ACCOUNT_UNWORK)
            if idx > 1: break

        unworkAccounts = ac.getAccounts(source,
                                        status=ACCOUNT_UNWORK,
                                        maxNum=5)
        assert2(len(unworkAccounts) == 3)
        occuAccounts = ac.getMoreAccounts(source,
                                          status=ACCOUNT_OCCUPIED,
                                          maxNum=5)
        assert2(len(occuAccounts) == 2)

        for ac2 in unworkAccounts:
            ac.updateAccount(ac2['id'], ACCOUNT_WORK)
        workAccounts = ac.getMoreAccounts(source, maxNum=5)
        assert2(len(workAccounts) == 1)
        print("done")
Example #10
    def backup(self, beginId, endId=286):
        """
        python spiderx\testx.py env=ONLINE backup 105 199
        python spiderx\testx.py env=ONLINE backup 21 99

        :param beginId:
        :param endId:
        :return:
        """
        dest = "e:/company"
        gConfig.set(CFG_DB_BUSINESS, "TYC")

        beginId = int(beginId)
        endId = int(endId)
        while beginId <= endId:
            try:
                db = gTop.get(CFG_DB_BUSINESS)
                company = "company_58_%s" % beginId

                fname = os.path.join(dest, "%s.txt" % company)
                offset1 = 0
                limit1 = 1000
                with codecs.open(fname, "w", encoding="utf-8") as f:
                    while True:
                        rows = db.query(
                            "select name,url from %s limit %s offset %s" %
                            (company, limit1, offset1))
                        if not rows:
                            break
                        offset1 += limit1
                        for name, url in rows:
                            f.write("%s##%s\n" % (name, url))

            except Exception:
                logException()
            else:
                beginId += 1
                logInfo("backup-%s" % company)
Example #11
 def test(self, idx):
     """
     1. optional user; avoid one with too many pages (1840488466, 4691977921, 4131145503)
     2. idx=1: set a begin point, crawl from it, then interrupt manually
     3. idx=2: resume from the break point, interrupt manually again
     4. idx=3: force incremental mode, crawling from the top down to the previous start point
     :return:
     """
     gConfig.set(CFG_JOB_ENABLE, 1)
     gConfig.set(CFG_DOWN_ROOT, PROJECT_ROOT)
     user = '******'
     idx = int(idx)
     if idx == 1:
         gConfig.set(CFG_DOWN_SYNCBEGIN,
                     (time.time() - 180 * 24 * 3600) * 1000)  # use a point 180 days (about half a year) ago as the backlog start
         self.post(user)
     elif idx == 2:
         logInfo("break point mode")
         self.post(user)
     else:
         gConfig.set(CFG_DOWN_INCMODE, 1)
         logInfo("normal inc mode")
         self.post(user)
Example #12
    def antiBlockTest2(self):
        """
        crawl jobui; if correct info cannot be extracted more than MAX times, the page structure may have changed or we may be blocked
        :return:
        """
        from spiderx.common.utility.antiBlockUtil import AntiBlockStrategy
        gConfig.set(CFG_JOB_EMAIL, "*****@*****.**")  # for testing only; no need to set this online
        strategy = gConfig.get(CFG_AB_STRATEGY, None) or "postpone 10"

        class MyAntiBlockStrategy(AntiBlockStrategy):
            def __init__(self, strategy):
                AntiBlockStrategy.__init__(self, strategy)

            def changeAccount(self):
                logInfo("blockElement strategy:change account,relogin,...")

        antiBlocksConf = [
            # test blockElement; the expected result is a report that the page looks wrong
            {
                'key': ['jobui.com/company/'],
                'blockInfo': None,  # if blockInfo is present, use it directly
                # when blockInfo is uncertain, judge by elements; several
                # elements may be checked, but a blockElement failure can also
                # mean the page structure changed
                # strategy is optional
                "blockElement": {
                    "elements": [
                        (
                            {
                                "name": CssElement("#navTab > a:first", None,
                                                   None)
                            },
                            u"公司介绍",  # expected element text
                        ),
                    ],
                    # strategy may be None; the default handling simply exits the spider
                    "strategy": MyAntiBlockStrategy(strategy),
                    "maxCheckNum": 2,  # two occurrences in a row count as blocked
                },
            },
        ]

        # step 1
        self.addAntiBlock(antiBlocksConf)
        # this conf is just an arbitrary example
        conf1 = {
            "logo": CssElement("div.company-logo > a > img", "src"),
            "name": CssElement("#companyH1 > a")
        }

        # step 2: test blockElement; expect a report that the page looks wrong
        for i in range(gConfig.get(CFG_BLOCK_MAXCHECK) + 4):
            self.downOne("http://www.jobui.com/company/13132726/", conf1)
            err = self.checkDownStatus(i + 1)
            if err:
                logInfo("sampletest:bocked,exit!")
                break
        if self.antiBlock.isNeedExit():
            logInfo(
                u"blocked by element, please check the content to identify the block info!\n"
                u"cannot continue; manual intervention required"
            )
            return
Example #13
    def listTest(self):
        """
        demonstrates downloading 51job's list of software development engineer jobs in Shanghai's internet industry, plus detailed company info
        url pattern:jobarea=020000&funtype=0100&industrytype=32&curr_page=1
        http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=020000&funtype=0100&industrytype=32&keywordtype=2&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9
        :return:
        """

        # global spider params: in a real project, set these in __init__
        gConfig.set(CFG_HTTP_ENCODING, "gbk")
        gConfig.set(CFG_DOWN_MAXPAGENUM, 2)  # test
        gConfig.set(CFG_DOWN_MAXNUM, 10)  # test
        gConfig.set(CFG_DOWN_ROOT, PROJECT_ROOT)
        gConfig.set(
            CFG_DOWN_INDEX,
            "www_51job_com/jd/shanghai/internet/se")  # 索引需要统一设计,多级,便于存储检索

        # list config with a few fixed keys
        JdListConf = {
            TAG_LIST_PAGE_PATTERN:
            "&curr_page=",
            TAG_LIST_ITEMS:
            "#resultList>div.el",
            TAG_LIST_NO_RESULT:
            u"对不起,没有该查询结果",  # 可以不需要
            TAG_LIST_TOTAL_PAGE_NUM:
            CssElement("#resultList > div.dw_tlc > div:nth-child(5)",
                       handler=RegexHandler(r".*?/(.*)")),
        }
        # http://jobs.51job.com/all/co3197016.html
        # company detail page config
        companyDetailConf = {
            "id":
            CssElement("#hidCOID", "value"),  # 直接取属性
            "name":
            CssElement("div.tHeader.tHCop > div > h1"),  # 直接取元素
            "key_attr":
            CssElement("div.tHeader.tHCop > div > p.ltype"),  # 直接取元素
            "info":
            CssElement(
                "div.tCompany_full > div.tBorderTop_box.bt > div > div > div.in > p"
            ),  # take the element text
            "address":
            CssElement("div.tCompany_full > div:nth-child(2) > div > p",
                       handler=RegexHandler(ur"公司地址:(.*)")),  # 用正则取值
        }

        def normalizeSalary(parent, css, attr, callBackParams):
            """
            这里做了一步清洗,只是为了做specialHandler的说明
            specialHanlder主要是为了处理难提取的内容
            :param parent:
            :param css:
            :param attr:
            :param callBackParams:
            :return:
            """
            value = Extractor.getValue(parent, css, attr)
            # todo: normalize the salary into a monthly number
            print(callBackParams)
            return value

        def getCompanyDetail(url, conf, callBackParams):
            return self.downOne(url, conf)

        JdItemConf = {
            "title":
            CssElement(".t1"),  # take the element text
            "title_url":
            CssElement(".t1>span>a", "href"),  # take the attribute directly
            "company":
            CssElement(".t2", handler=RegexHandler(ur"(.*)有限公司")),  # extract via regex
            "company_url":
            CssElement(
                ".t2>a",
                "href",
                handler=NextLevelHandler(companyDetailConf, getCompanyDetail,
                                         {"key": "testNextLevelHandler"})),
            # handles the next-level url
            "salary":
            CssElement(
                ".t4",
                handler=SpecialHandler(normalizeSalary,
                                       {"key": "testSpecialHandler"})),
            "issue_date":
            CssElement(".t5"),  # take the element text
        }
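
normalizeSalary above leaves the actual normalization as a todo. A minimal sketch of what it might do, assuming 51job-style salary strings such as u"0.8-1万/月" (both the input format and the unit table are assumptions):

import re

def toMonthlySalary(raw):
    """Hypothetical sketch: parse u"0.8-1万/月" into (8000.0, 10000.0)."""
    m = re.match(ur"([\d.]+)-([\d.]+)(万|千)/(年|月)", raw or u"")
    if not m:
        return None
    low, high = float(m.group(1)), float(m.group(2))
    unit = 10000.0 if m.group(3) == u"万" else 1000.0
    months = 12.0 if m.group(4) == u"年" else 1.0
    return (low * unit / months, high * unit / months)

# toMonthlySalary(u"0.8-1万/月") -> (8000.0, 10000.0)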
Example #14
    def post(self, userId='9650668145'):
        """
        fetch a user's posts: 9650668145 1761234358
        {"count":20,"statuses":[...],"total":474,"page":2,"maxPage":24}
        https://xueqiu.com/v4/statuses/user_timeline.json?user_id=9650668145&page=12&type=&_=1508802861537
        https://xueqiu.com/v4/statuses/user_timeline.json?user_id=1761234358&page=12&type=&_=1508802861537
        :param userId:
        :return:
        """
        # each user gets its own index
        newIndex = gConfig.get(CFG_DOWN_INDEX) + "/%s" % userId
        gConfig.set(CFG_DOWN_INDEX, newIndex)
        urlTemp = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={userId}&page={pageIdx}&type=&_={ts}"
        syncPoint = SyncPoint()  # the first record is the sync point, since it is the newest post
        # fetch the existing sync point info first:
        sp = syncPoint.getSyncPoint()
        oldSyncInfo = sp.get("syncInfo", None) if sp else None
        syncInfo = syncPoint.getNewSyncInfoByDesc(oldSyncInfo)
        saveHandler = HourlySaveLocalHandler(syncPoint=syncPoint)

        def save(result):
            posts = result["statuses"]
            if len(posts) == 0:
                syncInfo[CFG_DOWN_SYNCEND] = syncInfo[CFG_DOWN_SYNCCURR]
                syncPoint.saveLastSyncInfo({CFG_DOWN_SYNCINFO: syncInfo})
                return 0
            saveNum = 0
            for post in posts:

                result = {
                    "xid":
                    post["id"],
                    "userId":
                    post["user_id"],
                    "commentId":
                    post.get("commentId", 0),
                    "topicId":
                    post["retweet_status_id"],
                    "topicUser":
                    post["retweeted_status"]["user_id"]
                    if post["retweet_status_id"] else post["user_id"],
                    "replyCount":
                    post["reply_count"],
                    "favCount":
                    post["fav_count"],
                    "likeCount":
                    post["like_count"],
                    "retweetCount":
                    post["retweet_count"],
                    "viewCount":
                    post["view_count"],
                    "inTime":
                    post["created_at"],
                }

                test = True  # testing: do not save the content
                if not test:
                    text = dehtml(post["text"])
                    idx = text.find("//@")
                    if idx > 0:
                        text = text[:idx]
                    wordCount = len(text)
                    if wordCount > gConfig.get("xueqiu.postSaveMinWord",
                                               200):  # save posts longer than 200 chars
                        result["info"] = text
                    result["wordCount"] = wordCount
                    if post["reward_count"] > 0:
                        reward = {
                            "count": post["reward_count"],
                            "users": post["reward_user_count"],
                            "amount": post["reward_amount"],
                        }
                        result["reward"] = json.dumps(reward)

                if int(result["inTime"]) < int(syncInfo[CFG_DOWN_SYNCBEGIN]):
                    syncInfo[CFG_DOWN_SYNCCURR] = result["inTime"]
                    result[CFG_DOWN_SYNCINFO] = syncInfo
                    saveHandler.handle(result)
                    saveNum += 1

            return saveNum

        downNum = 0
        curPage = 0  # if gConfig.get(CFG_DOWN_INCMODE,1) else hasDowned/20+1; usable as a break point
        while int(syncInfo[CFG_DOWN_SYNCCURR]) > int(
                syncInfo[CFG_DOWN_SYNCEND]):
            url = urlTemp.format(userId=userId,
                                 pageIdx=curPage,
                                 ts=int(time.time()))
            result = self.downOne(url, None)
            downNum += save(result)
            curPage += 1
            logInfo("user=%s,curPage=%s,downNum=%s" %
                    (userId, curPage, downNum))

        self.jobDone()
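
The loop above keeps paging until the current sync cursor crosses the end point. The syncInfo dict plausibly carries millisecond timestamps under the three CFG_DOWN_SYNC* keys (the concrete values here are invented; the key semantics follow from the code):

# hypothetical shape of syncInfo during an incremental run:
syncInfo = {
    CFG_DOWN_SYNCBEGIN: 1508802861537,  # newest post time at job start (ms)
    CFG_DOWN_SYNCCURR: 1508802861537,   # moves backwards as older posts are saved
    CFG_DOWN_SYNCEND: 1500000000000,    # the loop stops once SYNCCURR <= SYNCEND
}
# save() lowers SYNCCURR to each saved post's inTime; when a page comes back
# empty, SYNCEND is set to SYNCCURR and the sync point is persisted.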
Example #15
    def __init__(self, params=""):

        BaseClass.__init__(self, params)
        gConfig.set(CFG_ACCOUNT_PROVIDER, "spideraccount.samanager")
Example #16
 def testDb(self):
     gConfig.set("env", "ONLINE")
     name = gTop.get(CFG_DB_MONITOR).getOne("select name from job limit 1")
     logInfo("testing-name-%s" % name)
     from superbase.utility.aliyun import AliYun
     AliYun().upFile(os.path.join(PROJECT_ROOT, "log/spiderx.log"))