Example #1

import datetime
import queue
import time

from util import configutil
from util.LoggerClass import Logger

# The original snippet omits its setup; the logger name and queue below are
# assumptions, following the pattern in Example #4.
logger = Logger(logname='newspaper', logger='task').getlog()
q = queue.Queue()


def test():
    print('Current time: %s' % datetime.datetime.now().strftime("%Y-%m-%d %X"))
    print('8888888888888888888888')

# Task entry point
def entrance():
    # Initialize the task queue
    global options, q
    options = configutil.getoptions('urls')
    flag = True
    endTime = '10:00:00'
    i = 0
    while flag:
        i += 1
        for opt in options:
            q.put(opt)
        logger.info('----------------- Crawl round %s starting; %s sites still queued for processing -----------------' % (i, q.qsize()))
        open_thread()
        nowTime = datetime.datetime.now().strftime('%X')
        # 'HH:MM:SS' strings compare correctly in lexicographic order
        if endTime < nowTime or len(options) == 0:
            flag = False
            logger.info("================================ Today's tasks are complete; the crawler is entering sleep ================================")
        else:
            time.sleep(1800)  # wait 30 minutes before the next round

if __name__ == '__main__':
    logger.info('================================ Daily-report crawler starting work ================================')
    entrance()
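
open_thread() is called above but not shown. A minimal sketch of what it
might look like, assuming a fixed-size pool of worker threads draining the
shared queue and a hypothetical per-site crawl() helper:

import threading

def worker():
    # Pull sites off the shared queue until it is empty.
    while True:
        try:
            opt = q.get_nowait()
        except queue.Empty:
            break
        try:
            crawl(opt)  # hypothetical per-site crawl routine
        finally:
            q.task_done()

def open_thread(num_threads=5):
    # Start the workers and wait for all of them to finish.
    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()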
Example #2
import random
import time

import execjs  # PyExecJS; getconfig, grab, es_operate and logger are project helpers not shown


def get_pageid():
    # NOTE: the opening lines of this snippet are missing from the source; the
    # JS function header and loop are an assumed reconstruction (32 hex chars,
    # matching the commented-out GUID dash positions 8/12/16/20).
    js = """function happy() {
        var guid = "";
        for (var i = 1; i <= 32; i++) {
            var n = Math.floor(Math.random() * 16.0).toString(16);
            guid += n;
            // if ((i == 8) || (i == 12) || (i == 16) || (i == 20)) guid +=
            // "-";
        }
        return guid;
    }"""
    ctx = execjs.compile(js)
    pageid = ctx.call("happy")
    return pageid


if __name__ == '__main__':
    servers = ['47.111.24.165:5000', '47.94.209.31:5000', '47.105.61.16:5000']
    ser = random.choice(servers)
    logger.info('Working IP for this run: %s' % ser)
    logger.info('========================= Starting to crawl government-site cases =========================')
    logger.info('~~~~~~~~~~~~~~~~~~~ Securities section ~~~~~~~~~~~~~~~')
    zq = getconfig('pjws', 'address1').format(get_pageid())
    result = grab(zq, ser)
    # enumerate avoids the duplicate-item pitfall of result.index(i)
    for idx, item in enumerate(result, 1):
        logger.info('%s. %s' % (idx, item))
    es_operate(result)
    time.sleep(random.randint(100, 300))
    logger.info('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    logger.info('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
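
For reference, the pageid produced by the JavaScript above can be generated
in plain Python without the execjs dependency; a minimal equivalent sketch:

import random

def get_pageid_py():
    # Same output shape as the JS version: 32 random lowercase hex characters.
    return ''.join(random.choice('0123456789abcdef') for _ in range(32))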
Example #3
    def detail_page(self, docid):
        """文书详情页"""
        url = "http://wenshu.court.gov.cn/website/parse/rest.q4w"
        data = {
            "docId": "%s" % docid,
            "ciphertext": get_cipher(),
            "cfg":
            "com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch",
            "__RequestVerificationToken": "%s" % get_token(),
        }

        response = self.session.post(url, data=data, headers=self.headers)
        json_value = json.loads(response.text)
        secretKey = json_value["secretKey"]
        result = json_value["result"]
        # The result payload is encrypted; get_result() decrypts it with the
        # returned secretKey and today's date.
        data = json.loads(get_result(result, secretKey, time.strftime("%Y%m%d")))
        print(data)
        return data


if __name__ == '__main__':
    demo = wenshu()
    logger.info('================== Starting to crawl the securities section ==================')
    demo.get_docid('%E9%93%B6%E8%A1%8C')  # URL-encoded keyword; decodes to '银行'
    time.sleep(random.randint(30, 120))
    logger.info('++++++++++++++++++ Starting to crawl the banking section ++++++++++++++++++')
    demo.get_docid('银行')
    time.sleep(random.randint(30, 120))
    logger.info('~~~~~~~~~~~~~~~~~~ Starting to crawl the trust section ~~~~~~~~~~~~~~~~~~')
    demo.get_docid('信托')
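
The wenshu class constructor is not shown, but detail_page assumes at least a
shared requests session and browser-like headers. A minimal sketch of that
assumed setup:

import requests

class wenshu:
    def __init__(self):
        # One session keeps cookies across requests; headers mimic a browser.
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Origin': 'http://wenshu.court.gov.cn',
        }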
Example #4
import hashlib
import time

from elasticsearch import Elasticsearch

from util import configutil
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='esutil').getlog()
try:
    host = configutil.getconfig('eshost', 'host')
    port = int(configutil.getconfig('eshost', 'port'))  # config values come back as strings
    es = Elasticsearch([{'host': host, 'port': port}])
except Exception as ex:
    logger.error(ex)


def insert_single_data(index_name, doc_type, data, esid):
    try:
        res = es.index(index=index_name, doc_type=doc_type, body=data, id=esid)
        return res
    except Exception as e:
        logger.error(e)
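
# Illustrative usage of insert_single_data (index and field names are
# assumptions): hashlib is imported above, which suggests document ids are
# content hashes, keeping repeated inserts idempotent.
def _demo_insert():
    doc = {'title': 'sample headline', 'date': time.strftime('%Y-%m-%d')}
    esid = hashlib.md5(doc['title'].encode('utf-8')).hexdigest()
    return insert_single_data('newspaper', '_doc', doc, esid)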


def insert_datas(index_name, doc_type, datas):
    try:
        res = es.bulk(index=index_name, doc_type=doc_type, body=datas)
        return res
    except Exception as e:
        logger.error(e)
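
es.bulk expects action/metadata entries interleaved with document sources. A
minimal sketch of building such a payload for insert_datas (index name, id
scheme, and field names are assumptions):

def build_bulk_body(docs, index_name):
    # The elasticsearch-py client serializes a list of dicts into the
    # newline-delimited bulk format: one action line before each document.
    body = []
    for doc in docs:
        esid = hashlib.md5(doc['title'].encode('utf-8')).hexdigest()
        body.append({'index': {'_index': index_name, '_id': esid}})
        body.append(doc)
    return body

# res = insert_datas('newspaper', '_doc', build_bulk_body(docs, 'newspaper'))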