def run_work(url):
    """Crawl every job of a lagou company page and store new postings.

    Builds the job-detail URL for each job id returned by the company
    listing, skips ids already present in the 'lagou' table, fetches the
    rest and writes the extracted fields via common.sql_main.

    Changes from the original: removed the commented-out dead code (it
    duplicated the live tail plus an unused local-HTML-archive branch)
    and the cwd_abs/cwd locals that only the dead code referenced.
    """
    payload = company_payload(url)
    job_list = get_job_list(payload)
    for job_id in job_list:
        job_url = 'http://www.lagou.com/jobs/' + str(job_id) + '.html'
        print(job_url)
        # Only fetch jobs not yet recorded in the database.
        if not common.sql_select('lagou', job_id):
            r = common.get_request(job_url)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                job_dict = liblagoucompany.extract2(r.text)
                common.sql_main('lagou', job_dict, job_url, job_id)
def run_work(keyword='python'):
    """Crawl 51job search results for *keyword* and store new postings.

    For each result URL, derives the job id from the digits preceding
    the '.html' suffix, skips ids already in the 'job51' table, then
    fetches, extracts and persists the rest.

    Fix: the original pattern '[0-9]+.html' left the dot unescaped
    (matching any character) and recovered the id with a fragile
    .group()[:-5] slice; a raw-string pattern with a capture group
    yields the same id on well-formed URLs and cannot mis-slice.
    """
    url_list = get_url_list(keyword)
    for url_get in url_list:
        print(url_get)
        job_id = re.search(r'([0-9]+)\.html', url_get).group(1)
        print(job_id)
        if not common.sql_select('job51', job_id):
            r = common.get_request(url_get)
            r.encoding = 'gb2312'  # 51job pages are GBK/GB2312-encoded
            job_dict = extract2(r.text)
            common.sql_main('job51', job_dict, url_get, job_id)
def run_work(url):
    """Walk every job of a cjol company and persist postings not yet stored.

    Resolves the company id from *url*, iterates its job ids, and for each
    id not already in the 'cjol' table fetches the detail page, extracts
    the fields and writes them via common.sql_main.
    """
    cid = company_id(url)
    for job_id in job_list(cid):
        job_url = 'http://www.cjol.com/jobs/job-' + job_id
        print(job_url)
        print(job_id)
        if common.sql_select('cjol', job_id):
            continue  # already recorded — skip the fetch entirely
        resp = common.get_request(job_url)
        resp.encoding = 'utf-8'
        job_dict = libcjolcompany.extract2(resp.text)
        common.sql_main('cjol', job_dict, job_url, job_id)
def run_work(curl):
    """Crawl all zhilian postings for company URL *curl* and store new ones.

    BUG FIX: the original extracted the job id with
    re.search('[0-9]+.htm', url_get).group()[:-5] — but the zhilian
    suffix '.htm' is only 4 characters, so [:-5] silently dropped the
    LAST DIGIT of every job id (a copy-paste from the '.html'/[:-5]
    variants elsewhere in this file). The capture group below returns
    the digit run intact, and the raw string escapes the dot.
    """
    url_all = get_url_all(curl)
    for url_get in url_all:
        print(url_get)
        job_id = re.search(r'([0-9]+)\.htm', url_get).group(1)
        print(job_id)
        if not common.sql_select('zhilian', job_id):
            # Debug trace kept from the original (always prints a falsy value here).
            print(common.sql_select('zhilian', job_id))
            r = common.get_request(url_get)
            r.encoding = 'utf-8'
            job_dict = libzlcompany.extract(r.text)
            common.sql_main('zhilian', job_dict, url_get, job_id)
def main(job_list, option=0):
    """Fetch 51job postings from *job_list* URLs and write them to the DB.

    option=0: re-fetch every URL, updating rows that already exist.
    option=1: fetch only URLs whose job id is not yet in the database.

    (Docstring translated from the original Chinese; commented-out
    duplicate lines removed; regex made a raw string with the dot escaped.)
    """
    for url in job_list:
        # Id = digits before '.html'; the [:-5] slice strips that 5-char suffix.
        job_id = re.search(r'[0-9]+\.html', url).group()[:-5]
        if option == 0:
            r1 = common.get_request(url)
            r1.encoding = 'gb2312'  # 51job pages are GBK/GB2312-encoded
            # NOTE(review): option 0 uses html_extract.extract_51 while
            # option 1 uses lib51company.extract2 — confirm this asymmetry
            # is intentional.
            job_dict = html_extract.extract_51(r1.text)
            common.sql_main(source='job51', job_dict=job_dict, url=url,
                            job_id=job_id)
        if option == 1:
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                common.sql_main(source='job51', job_dict=job_dict, url=url,
                                job_id=job_id)