Exemple #1
0
        files = self.listDir(aliDir, timeRange)
        for fileName in files:
            try:
                fileName2 = os.path.join(dest, fileName)
                if not os.path.exists(fileName2):
                    self.downFile(fileName, dest)
                with gzip.open(fileName2, 'rb') as f:
                    for line in f:
                        yield json.loads(line)

            except Exception, e:
                logException()

    def help(self):
        """Print the command-line usage text listing the supported commands.

        The text is written to stdout exactly as laid out in the literal
        below, including its leading indentation.

        :return: None
        """
        # Parenthesised single-argument form: produces the same output under
        # the Python 2 print statement and the Python 3 print function, so the
        # method no longer fails to compile on Python 3.
        print("""
            usage: python aliyun.py cmd params
            commands:
            upFile filename
            downFile src [dest] #if dest is none,dest path is same as src
            upDir dirname
            downDir src,[dest] #if dest is none,dest path is same as src
            listDir dirname
            checkFile filename
            """)


# Script entry point: hand the AliYun class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the command/method name — confirm.
if __name__ == '__main__':
    callMethod(AliYun, sys.argv[1:])
Exemple #2
0
        :param needLog: 0,no log,1,send job logFile,2,send node log,3 both
        :return:
        """
        attaches = []
        if needLog:
            try:
                jobLog, nodeLog = getLogFileName(batch, jobName)
                logInfo("jobLog=%s\nnodeLog=%s" % (jobLog, nodeLog))

                def getOneAttach(log):
                    if os.path.exists(log):
                        size = os.path.getsize(log)
                        if size > 1024 * 500:  # >500k,gizp it
                            log = gzipOneFile(log)
                        fname = os.path.split(log)[1]
                        attaches.append((fname, log))
                        logInfo("add one attach%s,size=%s" % (log, size))

                if needLog & 1:
                    getOneAttach(jobLog)
                if needLog & 2:
                    getOneAttach(nodeLog)
            except Exception:
                logException()
        # 发送邮件
        Mail.sendEmail(title, content, t_address, attaches=attaches)


# Script entry point: hand the Mail class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(Mail, sys.argv[1:])
Exemple #3
0
        if self.antiBlock.isNeedExit():
            logInfo(
                u"block by element,pls check the content to identify the block info! \n 无法继续处理,人工干预"
            )
            return

    def failTest(self, str, str2):
        """
        Test case that exercises the job-failure path.

        Logs both arguments in one message and then calls ``self.jobFail()``.

        :param str: first value interpolated into the log message
        :param str2: second value interpolated into the log message
        :return: None
        """
        message = "this is a fail test: %s %s" % (str, str2)
        logInfo(message)
        self.jobFail()

    def downFile(self):
        """
        Sample file download.

        Sets the download index and the HTTP output format in the global
        config, then passes a fixed URL and a ``FileSaveHandler`` named
        "dota" to ``self.downOneDetail``.

        :return: None
        """
        # Original author's note: the index needs a unified, multi-level
        # design so that storage and retrieval stay easy.
        gConfig.set(CFG_DOWN_INDEX, "pinyin_sogou_com/dict/437")
        gConfig.set(CFG_HTTP_OUTFORMAT, "file")

        # Adjacent literals are joined at compile time; the URL is unchanged.
        url = ("http://download.pinyin.sogou.com/dict/download_cell.php"
               "?id=20614&name=dota%20DOTA%E3%80%90%E5%AE%98%E6%96%B9"
               "%E6%8E%A8%E8%8D%90%E3%80%91")
        handler = FileSaveHandler(fileName="dota")
        self.downOneDetail(url, None, handler)


# Script entry point: hand the SampleSpider class and the command-line
# arguments (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(SampleSpider, sys.argv[1:])
Exemple #4
0
                "has_tianyan_risk":has_tianyan_risk,
                "company_name_len":company_name_len
            }
            return result#(annualReport_Num,has_tianyan_risk,company_name_len)
        #TODO:series->list->df,应该可以直接优化成series->df
        data = df.apply(lambda x:stat(x.values[0][0]),axis=1).tolist()
        df2 = pd.DataFrame.from_dict(data)
        info = df2.describe()
        logDebug("""
        #####################\n
        this is the stat sample,you can try it and save the resulut to db for further visualization\n
        #####################\n
        %s\n
        #####################\n
        """%info)

        result = info.to_dict()
        #TODO,save the result










# Script entry point: hand the MyPanda class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(MyPanda, sys.argv[1:])
Exemple #5
0
    """
    def __init__(self, params=None):
        """
        Initialise the sample with a small debugging configuration.

        :param params: passed through unchanged to ``BaseClass.__init__``
        """
        # These overrides exist only to make debugging easier. The job
        # batch/name entries below are intentionally left disabled:
        #   CFG_JOB_BATCH: "split_test20140717"
        #   CFG_JOB_NAME: "split"
        debug_cfg = dict(env="DEV")
        BaseClass.__init__(self, params, debug_cfg)

    def test1(self, val1, val2):
        """
        Demonstrates:
        1. reading the global configuration parameter ``test.size``
        2. invoking a method from the command line, e.g.
        python superbase/sample1.py "env=DEV,test.size=101"  test1 hello world
        python superbase/sample1.py "env=DEV,test.size=99"  test1 hello world

        :param val1: value logged when ``test.size`` is greater than 100
        :param val2: value logged otherwise
        :return: None
        """
        size = gConfig.get("test.size", 0)
        # NOTE(review): the comparison assumes gConfig hands back a number for
        # test.size rather than the raw command-line string — confirm.
        chosen = val1 if size > 100 else val2
        logInfo("size=%s-%s" % (size, chosen))


# Script entry point: hand the Sample3 class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(Sample3, sys.argv[1:])
Exemple #6
0
        self.jobDone()

    def test(self, idx):
        """
        Manual crawl test; every mode ends by calling ``self.post(user)``.

        1. Pick a user without too many pages
           (1840488466, 4691977921, 4131145503).
        2. idx=1: set the sync begin point, crawl from it, interrupt by hand.
        3. idx=2: resume from the break point, again interrupt by hand.
        4. idx=3 (in the code: any other value): force incremental mode and
           crawl from the start up to the previous starting point.

        :param idx: mode selector; converted with ``int()``
        :return: None
        """
        gConfig.set(CFG_JOB_ENABLE, 1)
        gConfig.set(CFG_DOWN_ROOT, PROJECT_ROOT)
        user = '******'
        mode = int(idx)

        if mode == 1:
            # Use the moment half a year (180 days) ago, scaled by 1000, as
            # the starting point of the backlog crawl.
            begin = (time.time() - 180 * 24 * 3600) * 1000
            gConfig.set(CFG_DOWN_SYNCBEGIN, begin)
        elif mode == 2:
            logInfo("break point mode")
        else:
            gConfig.set(CFG_DOWN_INCMODE, 1)
            logInfo("normal inc mode")

        # Common tail of all three modes.
        self.post(user)


# Script entry point: hand the Xueqiu class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(Xueqiu, sys.argv[1:])
Exemple #7
0
            # execute 执行sql命令
            self.sqlite.execute("drop table %s" % name)
        if not sql:
            sql = tables[name]
        self.sqlite.execute(sql)

    def createLocalDb(self):
        """
        Create the local sqlite database.

        The database file is ``PROJECT_ROOT/<workDir>/local.db``, where
        ``<workDir>`` is "spider/localDb" when ``CFG_JOB_ENABLE`` reads as 0
        (the default) and "sites/localDb" otherwise. After the sqlite wrapper
        is reset onto that file, every statement stored in ``tables`` is
        executed.

        :return: None
        """
        # Pick the working directory from the job-enable flag.
        if gConfig.get(CFG_JOB_ENABLE, 0) == 0:
            work_dir = "spider/localDb"
        else:
            work_dir = "sites/localDb"

        # Full directory = project root + working directory; create it.
        db_dir = os.path.join(PROJECT_ROOT, work_dir)
        mkdir(db_dir)

        # Point the sqlite wrapper at the database file.
        db_file = os.path.join(db_dir, "local.db")
        self.sqlite.reset(db_file)

        for statement in tables.values():
            self.sqlite.execute(statement)


# Script entry point: hand the LocalDb class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(LocalDb, sys.argv[1:])
Exemple #8
0
        with codecs.open("%s/shanghailiepin.txt" % root2,
                         "w",
                         encoding="utf-8") as f2:
            f2.write("%s##%s##%s##%s##%s##%s\n" %
                     ("name", "address", "website", "type", "name2", "info"))
            files = glob.glob("%s/*/*/*/*/*/*/*/*.gz" % root)
            total = 0
            for idx, fileName2 in enumerate(files):
                num = 0
                with gzip.open(fileName2, 'rb') as f:
                    for line in f:
                        try:
                            d = json.loads(line)
                            if u"上海" in d.get("name", ""):
                                f2.write(
                                    u"%s##%s##%s##%s##%s##%s\n" %
                                    (d.get("name", ""), d.get("address", ""),
                                     d.get("website", ""), d.get("type", ""),
                                     d.get("businessLicense",
                                           ""), str2line(d.get("info", ""))))
                                num += 1
                        except Exception:
                            logException()
                total += num
                logInfo("%s:num=%s,total=%s" % (idx, num, total))


# Script entry point: hand the TestX class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(TestX, sys.argv[1:])
Exemple #9
0
        driver.find_element_by_xpath(
            '/html/body/section/div[1]/div[1]/ul/li[1]').click()
        driver.find_element_by_xpath(
            '/html/body/section/div[1]/div[2]/form/div[1]/input').send_keys(
                "*****@*****.**")
        driver.find_element_by_xpath(
            '/html/body/section/div[1]/div[2]/form/div[2]/input').send_keys(
                "pwdh8f_lagou")
        driver.find_element_by_xpath(
            '/html/body/section/div[1]/div[2]/form/div[5]/input').click()

        time.sleep(10)
        num = 0
        while num < 300000:
            driver.find_element_by_xpath(
                '//*[@id="lg_tbar"]/div/ul/li[2]/a').click()  #简历
            time.sleep(5)
            driver.find_element_by_xpath(
                '//*[@id="workExperience"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/div/em'
            ).click()  #edit
            time.sleep(5)
            driver.find_element_by_xpath(
                '//*[@id="currentUpJobForm"]/div/div[6]/input').click()  #edit
            time.sleep(gConfig.get("update.time", 300))
            logDebug("update %s" % num)
            num += 1


# Script entry point: hand the Fanli class and the command-line arguments
# (script name stripped) to callMethod.
# NOTE(review): callMethod is defined elsewhere; presumably it treats the
# first argument as the method name — confirm.
if __name__ == '__main__':
    callMethod(Fanli, sys.argv[1:])