def get_brief_data():
    configmap = toolkit.readconfig('./job.xml')
    p = pool.Pool(1024)
    for item, value in configmap.items():
        for job in value:
            # one output directory and one greenlet per job keyword
            mkdir_if_need('./data/brief', job.parameter)
            print('start crawl ' + str(job.parameter) + ' ...')
            p.spawn(lagouspider.scrapy, job.parameter)
    p.join()
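# mkdir_if_need is called above but not defined in this section. Below is a
# minimal sketch of what such a helper might look like, reconstructed from the
# call site mkdir_if_need('./data/brief', job.parameter); its body is an
# assumption, not code taken from the project.
import os

def mkdir_if_need(base_dir, job_name):
    # Join base_dir with the job keyword and create the directory if absent.
    target = os.path.join(base_dir, str(job_name))
    if not os.path.exists(target):
        os.makedirs(target)
    return target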
        # body of the paging loop inside scrapy()
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['result']
            print(job_json)
            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/PythonProject/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        'D:/Users/Administrator/PycharmProjects/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open(filedir + '/' + jobname + '_' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1
        time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        '/Users/iceke/PycharmProjects/LagouJob/job.xml')
    cookie = toolkit.readCookie(
        '/Users/iceke/PycharmProjects/LagouJob/cookie.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter, cookie)
    # post-process the crawled JSON under data/ once all jobs are done
    excelhelper.process_json('/Users/iceke/PycharmProjects/LagouJob/data/',
                             '/Users/iceke/PycharmProjects/LagouJob/data/')
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('C:/Users/XuLu/PycharmProjects/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
                f.write(str(job_json))
                f.flush()
                f.close()
        else:
            print('connect error! url = ' + req_url)
        num += 1
        time.sleep(.5)

        # switch to a fresh proxy every request_batch_count requests
        global request_count, request_batch_count
        request_count += 1
        if request_count >= request_batch_count:
            request_count = 0
            urlproxy.switch()
            # print('sleep')
            # time.sleep(60)


if __name__ == '__main__':
    configmap = toolkit.readconfig('../job.xml')
    datapath = '../data'
    if not os.path.exists(datapath):
        os.mkdir(datapath)
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
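# urlproxy is not shown in this section; the sketch below is a purely
# hypothetical illustration of the round-robin proxy switching that the
# urlproxy.switch() call above appears to perform. The proxy addresses and
# every name here are assumptions, not taken from the project.
import itertools

_proxies = itertools.cycle([
    {'https': 'https://127.0.0.1:8001'},
    {'https': 'https://127.0.0.1:8002'},
])
current_proxy = next(_proxies)

def switch():
    # Advance to the next proxy in the pool.
    global current_proxy
    current_proxy = next(_proxies)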
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('D:/LagouJobInfo/' + jobname + os.path.sep + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1
        # time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/29140/PycharmProjects/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('D:/LagouJobInfo/' + jobname + os.path.sep + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1
        # time.sleep(2)


if __name__ == '__main__':
    configmap = toolkit.readconfig(
        'D:/Users/29140/PycharmProjects/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['result']
            print(job_json)
            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/PythonProject/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('D:/LagouJobInfo/lagou/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('D:/Users/Administrator/PycharmProjects/LagouJob/job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        response = requests.post(req_url, params=payload, headers=headers)
        if num > maxpagenum:
            flag = False
        if response.status_code == 200:
            job_json = response.json()['content']['positionResult']['result']
            print('crawling page ' + str(num) + ' ...')
            print(job_json)
            with open('./data/brief/' + jobname + '/' + str(num) + '.json',
                      'wt', encoding='utf-8') as f:
                f.write(str(job_json))
        else:
            print('connect error! url = ' + req_url)
        num += 1


if __name__ == '__main__':
    configmap = toolkit.readconfig('../job.xml')
    for item, value in configmap.items():
        for job in value:
            print('start crawl ' + str(job.parameter) + ' ...')
            scrapy(job.parameter)
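# Every loop fragment above assumes that scrapy() has already set up req_url,
# headers and the paging state (num, maxpagenum, flag) before entering the
# `while flag:` loop. A minimal sketch of that setup follows; the endpoint URL,
# header values and page limit are assumptions, not values taken from the
# fragments in this section.
def scrapy(jobname):
    req_url = 'https://www.lagou.com/jobs/positionAjax.json'      # assumed endpoint
    headers = {
        'User-Agent': 'Mozilla/5.0',                               # assumed
        'Referer': 'https://www.lagou.com/jobs/list_' + jobname,   # assumed
    }
    num = 1
    maxpagenum = 30  # assumed page limit
    flag = True
    while flag:
        payload = {'first': 'false', 'pn': num, 'kd': jobname}
        # ... request, JSON extraction and file writing as in the fragments above ...
        if num > maxpagenum:
            flag = False
        num += 1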