Beispiel #1
0
def job_gdeegd():
    """Queue unseen article links from the gdee.gd.gov.cn notice list pages.

    Builds the 63 list-page URLs (``index.html`` followed by
    ``index_2.html`` .. ``index_63.html``), downloads each page, extracts
    hrefs with a fixed XPath and pushes a ``Seed`` with downloader
    ``gdeegd.crawl0`` for every href that ``FILTER`` does not contain yet.
    The elapsed time is logged once per list page.

    Raises:
        requests.exceptions.RequestException: when a list page cannot be
            fetched (including the request timeout). Pages after the failing
            one are not processed.
    """
    # Article list pages.
    list_pages = [
        "http://gdee.gd.gov.cn/ggtz3126/index.html" if i == 1 else
        "http://gdee.gd.gov.cn/ggtz3126/index_%s.html" % str(i)
        for i in range(1, 64)
    ]

    # The headers never change between requests, so build them once.
    headers = {
        'Connection':
        'keep-alive',
        'Pragma':
        'no-cache',
        'Cache-Control':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer':
        'http://gdee.gd.gov.cn/ggtz3126/index_3.html',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cookie':
        'm_bt=yes; openstack_cookie_insert=62355311; _gscu_1815356153=89127015q6kzl720; _gscbrs_1815356153=1; UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; CNZZDATA3588456=cnzz_eid%3D214537553-1589123201-http%253A%252F%252Ftest.gzjirui.com%252F%26ntime%3D1589123201; _gscs_1815356153=89127015ev2u6d20|pv:2'
    }

    # Article detail links found on each list page.
    for page_url in list_pages:
        t1 = time.time()
        # Without a timeout a stalled connection would block the job forever.
        response = requests.request("GET", page_url, headers=headers,
                                    timeout=10)
        selector = Selector(text=response.text)
        # NOTE(review): `li[3]` selects only the third list item of each
        # page -- confirm this is intended rather than every `li`.
        article_urls = selector.xpath(
            "/html/body/div/div[3]/div[2]/div/div[2]/ul/li[3]/div/a/@href"
        ).extract()
        for u in article_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='gdeegd.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdeegd (Used: %s)' %
                 (time.time() - t1))
Beispiel #2
0
def failure_dog():
    """Hourly watchdog that re-queues failed seeds stored under ``Failure:*``.

    For every Redis list whose key matches ``Failure:*`` the entries present
    at the start of the pass are popped one by one. An entry whose
    ``failureCount`` is below 4 is pushed back onto
    ``settings.REDIS_KEYNAME``; every other entry (missing counter, counter
    too high, or unparsable payload) is dropped. A summary is written to the
    log and sent through ``sendMsg`` per key. The function never returns.
    """
    while True:
        for key in RCONN.keys('Failure:*'):
            length = RCONN.llen(key)
            length_drop = 0
            for _ in range(length):
                txt = RCONN.rpop(key)
                if txt is None:
                    # The list was drained by someone else since llen().
                    break
                try:
                    # SECURITY(review): eval() executes whatever is stored in
                    # Redis. If entries are plain literals, consider
                    # ast.literal_eval or JSON for the payload format.
                    js = eval(txt)
                    retry = 'failureCount' in js and js['failureCount'] < 4
                except Exception as e:
                    # One malformed entry must not kill the watchdog loop.
                    writeLog('failure_dog: drop bad entry of %s: %s' %
                             (key, e))
                    retry = False
                if retry:
                    RCONN.lpush(settings.REDIS_KEYNAME, txt)
                else:
                    length_drop += 1
            summary = 'Roll Failure_%s back to Seeds: %s (drop %s)' % (
                length, key, length_drop)
            writeLog(summary)
            sendMsg(summary)
        time.sleep(3600)
Beispiel #3
0
def job_scjgjjs():
    """Queue unseen article links from the scjgj.jiangsu.gov.cn list pages.

    Builds 20 list-page URLs (``index.html`` followed by the same path with
    ``?uid=277431&pageNum=2`` .. ``pageNum=20``), downloads each page,
    extracts hrefs with a fixed XPath and pushes a ``Seed`` for every href
    that ``FILTER`` does not contain yet. The elapsed time is logged once per
    list page.

    Raises:
        requests.exceptions.RequestException: when a list page cannot be
            fetched (including the request timeout). Pages after the failing
            one are not processed.
    """
    # Article list pages.
    list_pages = [
        "http://scjgj.jiangsu.gov.cn/col/col70311/index.html" if i == 1 else
        "http://scjgj.jiangsu.gov.cn/col/col70311/index.html?uid=277431&pageNum=%s" % str(i)
        for i in range(1, 21)
    ]

    # The headers never change between requests, so build them once.
    headers = {
        'Connection':
        'keep-alive',
        'Pragma':
        'no-cache',
        'Cache-Control':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer':
        'http://test.gzjirui.com/magicflu/html/form/records2.jsp?spaceId=02393294-327d-43ed-835e-d8fe778772a8&formId=-1',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cookie':
        '__jsluid_h=8011b3a4cb561d1de121a1fa390ab4df; _gscu_1226760719=8861650310idmp17; _gscbrs_1226760719=1; yunsuo_session_verify=75a060942bec9e14902b3b5453719ad1; _gscs_1226760719=t89123468mg3q2f70|pv:3'
    }

    # Article detail links found on each list page.
    for page_url in list_pages:
        t1 = time.time()
        # Without a timeout a stalled connection would block the job forever.
        response = requests.request("GET", page_url, headers=headers,
                                    timeout=10)
        selector = Selector(text=response.text)
        # NOTE(review): `li[1]` selects only the first list item of each
        # page -- confirm this is intended rather than every `li`.
        article_urls = selector.xpath(
            '//*[@id="277431"]/div/li[1]/a/@href').extract()
        for u in article_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                # NOTE(review): the downloader name and the log text below
                # say "gdstc" although this job crawls scjgj.jiangsu.gov.cn;
                # this looks copy-pasted -- confirm before changing.
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdstc (Used: %s)' %
                 (time.time() - t1))
Beispiel #4
0
def job_comgdgov():
    """Queue unseen article links from the com.gd.gov.cn notice list pages.

    Builds 15 list-page URLs (``index.html`` followed by ``index_2.html`` ..
    ``index_15.html``), downloads each page, extracts hrefs with a fixed
    XPath and pushes a ``Seed`` with downloader ``comgdgov.crawl0`` for every
    href that ``FILTER`` does not contain yet. The elapsed time is logged
    once per list page.

    Raises:
        requests.exceptions.RequestException: when a list page cannot be
            fetched (including the request timeout). Pages after the failing
            one are not processed.
    """
    # Article list pages.
    list_pages = [
        "http://com.gd.gov.cn/zwgk/gggs/index.html" if i == 1 else
        "http://com.gd.gov.cn/zwgk/gggs/index_%s.html" % str(i)
        for i in range(1, 16)
    ]

    # The headers never change between requests, so build them once.
    headers = {
        'Connection':
        'keep-alive',
        'Pragma':
        'no-cache',
        'Cache-Control':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer':
        'http://com.gd.gov.cn/zwgk/gggs/index_16.html',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cookie':
        'UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; openstack_cookie_insert=81202878'
    }

    # Article detail links found on each list page.
    for page_url in list_pages:
        t1 = time.time()
        # Without a timeout a stalled connection would block the job forever.
        response = requests.request("GET", page_url, headers=headers,
                                    timeout=10)
        selector = Selector(text=response.text)
        # NOTE(review): `li[4]` selects only the fourth list item of each
        # page -- confirm this is intended rather than every `li`.
        article_urls = selector.xpath(
            "/html/body/div[2]/div/div[2]/ul/li[4]/a/@href").extract()
        for u in article_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='comgdgov.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of comgdgov (Used: %s)' %
                 (time.time() - t1))
Beispiel #5
0
def proxies_dog():
    """Watchdog that refills the per-spider proxy lists in Redis every minute.

    For each plan in ``settings.PLANS`` whose ``isProxy`` is ``True``, the
    Redis list ``Proxies:<spider>`` is checked; when it holds 3 entries or
    fewer it is deleted and refilled from the proxy provider API. Every
    returned address is stored as the ``str()`` of a
    ``{'http': ..., 'https': ...}`` dict. The fetch is attempted up to three
    times. The function never returns.
    """
    # NOTE(review): the provider order id is hard-coded in this URL.
    api_url = 'http://dps.kuaidaili.com/api/getdps/?orderid=969999783818434&num=200&sep=2'
    while True:
        for plan in settings.PLANS:
            if plan.get('isProxy') is not True:
                continue
            key = 'Proxies:%s' % plan['spider']
            if RCONN.llen(key) > 3:
                continue

            writeLog('Proxies:%s need IP ... ' % plan['spider'])
            try:
                RCONN.delete(key)
            except Exception as e:
                # Best effort: keep going, but leave a trace in the log.
                writeLog('Proxies:%s delete failed: %s' % (plan['spider'], e))

            failure = 0
            num = 0
            while failure < 3:
                try:
                    r = requests.get(api_url, timeout=10)
                    if r.status_code == 200:
                        body = r.content
                        if not isinstance(body, str):
                            # Python 3: the payload is bytes and must be
                            # decoded before splitting on a text separator.
                            body = body.decode('utf-8')
                        for line in body.split('\n'):
                            addr = line.strip()
                            if not addr:
                                # A trailing newline would otherwise yield
                                # a bogus "http://" proxy entry.
                                continue
                            js = {'http': 'http://%s' % addr, 'https': 'http://%s' % addr}
                            RCONN.lpush(key, str(js))
                            num += 1
                        break
                    # A non-200 answer is a failed attempt and is retried.
                    failure += 1
                except Exception as e:
                    failure += 1
                    writeLog('Proxies:%s fetch failed: %s' % (plan['spider'], e))
            if num:
                writeLog('Successful Proxies:%s (%s)' % (plan['spider'], num))
            else:
                writeLog('Failed Proxies:%s (no proxy fetched)' % plan['spider'])
        writeLog('proxies_dog sleeping...')
        time.sleep(60)
Beispiel #6
0
def job_gdstc():
    """Queue unseen article links from the gdstc.gd.gov.cn notice list pages.

    Builds 20 list-page URLs (``index.html`` followed by ``index_2.html`` ..
    ``index_20.html``), downloads each page, extracts the hrefs of the list
    items with a fixed XPath and pushes a ``Seed`` with downloader
    ``gdstc.crawl0`` for every href that ``FILTER`` does not contain yet. The
    elapsed time is logged once per list page.

    Raises:
        requests.exceptions.RequestException: when a list page cannot be
            fetched (including the request timeout). Pages after the failing
            one are not processed.
    """
    # Article list pages.
    list_pages = [
        "http://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html" if i == 1 else
        "http://gdstc.gd.gov.cn/zwgk_n/tzgg/index_%s.html" % str(i)
        for i in range(1, 21)
    ]

    # The headers never change between requests, so build them once.
    headers = {
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        'Accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Referer': "http://gdstc.gd.gov.cn/zwgk_n/",
        'Cookie':
        "zh_choose=s; zh_choose=s; openstack_cookie_insert=76667651",
        'Connection': "keep-alive"
    }

    # Article detail links found on each list page.
    for page_url in list_pages:
        t1 = time.time()
        # Without a timeout a stalled connection would block the job forever.
        response = requests.request("GET", page_url, headers=headers,
                                    timeout=10)
        selector = Selector(text=response.text)
        article_urls = selector.xpath(
            "/html/body/div[2]/div[2]/div[2]/ul/li/a/@href").extract()
        for u in article_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finish add the seeds of gdstc (Used: %s)' %
                 (time.time() - t1))
Beispiel #7
0
def job_bbsp2peye0():
    """Push the fixed p2peye entry pages as seeds for ``Spider_bbsp2peye``.

    The time window comes from ``readTime(spiderName='bbsp2peye0')``. One
    seed dict (url, spider, category, start, end) is pushed per entry page;
    a failure while pushing one seed is logged and does not stop the others.
    The total elapsed time is logged at the end.
    """
    window_start, window_end = readTime(spiderName='bbsp2peye0')
    began = time.time()

    # Entry page URL -> category label attached to the seed.
    categories = {
        'http://news.p2peye.com/ptdt/': '平台动态',
        'http://news.p2peye.com/wdzl/': '网贷专栏',
        'http://news.p2peye.com/wdxw/': '网贷新闻',
        'http://www.p2peye.com/forum-60-1.html': '曝光台',
    }

    for page_url, category in categories.items():
        payload = {
            'url': page_url,
            'spider': 'Spider_bbsp2peye',
            'category': category,
            'start': window_start,
            'end': window_end,
        }
        try:
            push_seed(payload)
        except Exception as err:
            writeLog(str(err))

    elapsed = time.time() - began
    writeLog('Finish add the seeds of bbsp2peye0 (Used: %s)' % elapsed)
Beispiel #8
0
def _plan_is_due(plan, now, start):
    """Return True when ``plan`` matches the current minute ``now``.

    ``start`` is the scheduler start time used for the ``*step`` rules and
    as the default hour/minute. The plan dict may be mutated: a default
    ``hour`` / ``minute`` is filled in from ``start`` when a coarser rule is
    present without a finer one, and ``times`` is advanced by the
    ``minutestep`` catch-up.
    """
    # isContinue(...) == 1 is treated as "current value does not match".
    if 'weekday' in plan and isContinue(plan['weekday'], now.weekday() + 1) == 1:
        return False
    if 'day' in plan and isContinue(plan['day'], now.day) == 1:
        return False
    if 'daystep' in plan and isinstance(plan['daystep'], int):
        if plan['daystep'] == 0 or (now - start).days % plan['daystep'] != 0:
            return False

    # A day-level rule without an hour-level rule runs at the start hour.
    has_day_rule = 'weekday' in plan or 'day' in plan or 'daystep' in plan
    if has_day_rule and 'hour' not in plan and 'hourstep' not in plan:
        plan['hour'] = start.hour
    if 'hour' in plan and isContinue(plan['hour'], now.hour) == 1:
        return False

    # total_seconds() (not .seconds, which wraps every 24 h) so that step
    # rules keep working after the scheduler has been up for more than a day.
    elapsed = int((now - start).total_seconds())

    if 'hourstep' in plan and isinstance(plan['hourstep'], int):
        step = plan['hourstep']
        hours = elapsed // 3600  # floor division on Python 2 and 3 alike
        if step == 0 or hours % step != 0 or hours // step != plan['times']:
            return False

    # An hour-level rule without a minute-level rule runs at the start minute.
    has_hour_rule = 'hour' in plan or 'hourstep' in plan
    if has_hour_rule and 'minute' not in plan and 'minutestep' not in plan:
        plan['minute'] = start.minute
    if 'minute' in plan and isContinue(plan['minute'], now.minute) == 1:
        return False

    if 'minutestep' in plan and isinstance(plan['minutestep'], int):
        step = plan['minutestep']
        if step == 0:
            # Checked before dividing: a zero step disables the plan.
            return False
        minutes = elapsed // 60  # minutes elapsed since start
        if minutes // step > plan['times']:
            # Catch up when one or more slots were missed.
            plan['times'] = minutes // step
        if minutes % step != 0 or minutes // step != plan['times']:
            return False
    return True


def tasks_dog():
    """Run the jobs listed in ``settings.PLANS`` on their schedule, forever.

    Seven scheduling keys are recognised: weekday, day, daystep, hour,
    hourstep, minute, minutestep.

    * weekday, day, hour and minute accept an int, a list or a tuple:
      ``hour=3`` runs at 3 o'clock every day, ``hour=[3, 4, 5]`` runs at 3,
      4 and 5 o'clock, ``hour=(3, 6)`` runs every hour from 3 to 6.
    * daystep, hourstep and minutestep are ints: ``hourstep=3`` runs once
      every 3 hours.

    Once per minute every plan is checked; for each matching plan the
    function ``jobs.<plan['name']>`` is called. An exception raised by a job
    is logged with its traceback and does not stop the scheduler.
    """
    # Truncate to the minute: the loop wakes on minute boundaries, so with
    # seconds kept in `start` the elapsed whole hours/days would still be one
    # short at the minute a step rule is supposed to fire.
    start = datetime.datetime.now().replace(second=0, microsecond=0)
    while True:
        now = datetime.datetime.now()
        tasks = []
        for plan in settings.PLANS:
            if 'times' not in plan:
                plan['times'] = 0
            if not _plan_is_due(plan, now, start):
                continue
            plan['times'] += 1
            tasks.append(getattr(jobs, plan['name']))

        writeLog("Total task to run : {}".format(len(tasks)))
        for task in tasks:
            try:
                task()
            except Exception:
                # A failing job must not stop the scheduler.
                msg = traceback.format_exc()
                writeLog("run task exception: %s" % msg)

        # Sleep until the start of the next minute.
        sleep_time = 60 - datetime.datetime.now().second
        writeLog('tasks_dog sleeping...{}'.format(sleep_time))
        time.sleep(sleep_time)