Example no. 1
def get_brief_data():
    configmap = toolkit.readconfig('./job.xml')
    p = pool.Pool(1024)

    for item, value in configmap.items():
        for job in value:
            mkdir_if_need('./data/brief', job.parameter)

            print('start crawl ' + str(job.parameter) + ' ...')
            p.spawn(lagouspider.scrapy, job.parameter)

    p.join()
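
The helper names in this first variant are not shown in the excerpt. A minimal sketch of what they might look like, assuming `pool` is gevent's pool module (its Pool/spawn/join match the calls above) and `mkdir_if_need` only has to create the per-job output directory; both names here are reconstructions, not the project's actual code:

import os

from gevent import pool  # assumption: matches the pool.Pool / spawn / join calls above


def mkdir_if_need(base_dir, job_parameter):
    # Hypothetical helper: create base_dir/<job_parameter> if it is missing,
    # so the crawler can write its per-job files there.
    target = os.path.join(base_dir, str(job_parameter))
    os.makedirs(target, exist_ok=True)
    return target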
Example no. 2
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['result']
            print(job_json)

            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) +
                      '.json',
                      'wt',
                      encoding='utf-8') as f:
                # the with-statement flushes and closes the file automatically
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/PythonProject/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
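
From this example onward, the excerpts come from inside a `scrapy(jobname)` paging loop, so the setup of `req_url`, `headers`, `num`, `flag` and `maxpagenum` is not shown. A rough, hypothetical reconstruction of that prologue; the endpoint URL, headers, page limit and output path are assumptions, not taken from the project:

import os

import requests


def scrapy(jobname, maxpagenum=30):
    # Assumed Lagou search endpoint; the real URL is defined elsewhere in the project.
    req_url = 'https://www.lagou.com/jobs/positionAjax.json'
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'https://www.lagou.com/jobs/list_' + jobname,
    }
    # Mirror the output path used by the write calls in the excerpt above.
    os.makedirs('D:/LagouJobInfo/lagou/' + jobname, exist_ok=True)

    num = 1
    flag = True
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        # ...the status check and file writing shown in the excerpt go here...
        num += 1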
Example no. 3
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')  # i.e. "crawling data for page <num> ..."
            print(job_json)

            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) +
                      '.json',
                      'wt',
                      encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        'D:/Users/Administrator/PycharmProjects/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 4
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)
            with open(filedir + '/' + jobname + '_' + str(num) + '.json',
                      'wt',
                      encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)

        num += 1
        time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        '/Users/iceke/PycharmProjects/LagouJob/job.xml')
    cookie = toolkit.readCookie(
        '/Users/iceke/PycharmProjects/LagouJob/cookie.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter, cookie)

    excelhelper.process_json('/Users/iceke/PycharmProjects/LagouJob/data/',
                             '/Users/iceke/PycharmProjects/LagouJob/data/')
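
One caveat shared by every variant: `f.write(str(job_json))` writes the Python repr of the parsed response (single quotes, True/None), not valid JSON. A hypothetical alternative writer (`save_page` is not part of the project) that keeps the file parseable:

import json
import os


def save_page(filedir, jobname, num, job_json):
    # json.dump keeps the output valid JSON; ensure_ascii=False preserves the
    # Chinese text in the Lagou payload instead of escaping it.
    os.makedirs(filedir, exist_ok=True)
    path = os.path.join(filedir, jobname + '_' + str(num) + '.json')
    with open(path, 'wt', encoding='utf-8') as f:
        json.dump(job_json, f, ensure_ascii=False, indent=2)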
Example no. 5
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}

        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)

            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json', 'wt', encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('C:/Users/XuLu/PycharmProjects/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 6
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1
        time.sleep(.5)

        global request_count, request_batch_count
        request_count += 1
        if request_count >= request_batch_count:
            request_count = 0
            urlproxy.switch()
            # print('sleep')
            # time.sleep(60)


if __name__ == '__main__':
    configmap = toolkit.readconfig('../job.xml')

    datapath = '../data'
    if not os.path.exists(datapath):
        os.mkdir(datapath)

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
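
This variant rotates proxies by calling `urlproxy.switch()` every `request_batch_count` requests. The `urlproxy` module itself is not shown; a minimal sketch of a rotator with that kind of interface, assuming a plain list of proxy URLs (class and method names here are hypothetical):

import itertools


class UrlProxy:
    # Hypothetical stand-in for the project's urlproxy module: cycle through a
    # fixed list of proxies and expose the current one in requests' format.
    def __init__(self, proxies):
        self._cycle = itertools.cycle(proxies)
        self.current = next(self._cycle)

    def switch(self):
        self.current = next(self._cycle)
        return self.current

    def as_requests_proxies(self):
        # requests expects a mapping like {'http': ..., 'https': ...}
        return {'http': self.current, 'https': self.current}

A `requests.post(..., proxies=urlproxy.as_requests_proxies())` call would then pick up whichever proxy is current after the last `switch()`.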
Example no. 7
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)

            with open('D:/LagouJobInfo/' + jobname + os.path.sep + str(num) + '.json', 'wt',
                      encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1
        # time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/29140/PycharmProjects/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 8
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)

            with open('D:/LagouJobInfo/' + jobname + os.path.sep + str(num) +
                      '.json',
                      'wt',
                      encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1
        # time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        'D:/Users/29140/PycharmProjects/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 9
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}

        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['result']
            print(job_json)

            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json', 'wt', encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/PythonProject/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 10
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}

        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)

            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json', 'wt', encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/Administrator/PycharmProjects/LagouJob/job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
Example no. 11
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}

        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False

        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('正在爬取第 ' + str(num) + ' 页的数据...')
            print(job_json)

            with open('./data/brief/' + jobname + '/' + str(num) + '.json', 'wt', encoding='utf-8') as f:
                f.write(str(job_json))

        else:
            print('connect error! url = ' + req_url)

        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('../job.xml')

    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
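
The last variant writes to the relative path './data/brief/<jobname>/<num>.json', but unlike the `get_brief_data` driver in the first example, nothing in this excerpt creates that directory first (it may happen elsewhere in the project). A small hypothetical helper that would avoid the resulting FileNotFoundError:

import os


def ensure_outdir(jobname, base='./data/brief'):
    # Create ./data/brief/<jobname> before the paging loop starts writing into it.
    path = os.path.join(base, jobname)
    os.makedirs(path, exist_ok=True)
    return path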