Example #1
0
def run_work(url):
    """Scrape and store every Lagou job posting for one company page.

    For each job id returned by get_job_list, builds the job-detail URL,
    skips ids already recorded in the 'lagou' table, fetches the page,
    and stores the extracted fields via common.sql_main.

    :param url: company page URL, passed through to company_payload.
    """
    # NOTE(review): removed an unused cwd computation and a large
    # commented-out HTML-archiving variant that duplicated the live code.
    payload = company_payload(url)
    job_list = get_job_list(payload)
    for job_id in job_list:
        job_url = 'http://www.lagou.com/jobs/' + str(job_id) + '.html'
        print(job_url)
        # Only fetch postings that are not already in the database.
        if not common.sql_select('lagou', job_id):
            r = common.get_request(job_url)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                job_dict = liblagoucompany.extract2(r.text)
                common.sql_main('lagou', job_dict, job_url, job_id)
def run_work(keyword='python'):
    """Scrape 51job search results for *keyword* and store new postings.

    Extracts the numeric job id from each result URL, skips ids already
    present in the 'job51' table, and stores newly fetched postings.

    :param keyword: search keyword passed to get_url_list.
    """
    url_list = get_url_list(keyword)
    for url_get in url_list:
        print(url_get)
        # Capture group with an escaped dot replaces the old
        # `group()[:-5]` slice: the bare '.' could swallow a digit and
        # a non-matching URL raised AttributeError on .group().
        match = re.search(r'([0-9]+)\.html', url_get)
        if match is None:
            continue  # URL carries no recognizable job id — skip it
        job_id = match.group(1)
        print(job_id)
        if not common.sql_select('job51', job_id):
            r = common.get_request(url_get)
            # 51job pages are served in gb2312.
            r.encoding = 'gb2312'
            job_dict = extract2(r.text)
            common.sql_main('job51', job_dict, url_get, job_id)
Example #3
0
def run_work(url):
    """Fetch and store every CJOL job posting for the company at *url*."""
    cid = company_id(url)
    for job_id in job_list(cid):
        job_url = 'http://www.cjol.com/jobs/job-' + job_id
        print(job_url)
        print(job_id)
        # Guard clause: skip postings already present in the 'cjol' table.
        if common.sql_select('cjol', job_id):
            continue
        resp = common.get_request(job_url)
        resp.encoding = 'utf-8'
        parsed = libcjolcompany.extract2(resp.text)
        common.sql_main('cjol', parsed, job_url, job_id)
Example #4
0
def run_work(curl):
    """Scrape Zhilian job postings reachable from company URL *curl*.

    Bug fix: the original `re.search('[0-9]+.htm', ...).group()[:-5]`
    stripped 5 characters although the '.htm' suffix is only 4, silently
    dropping the last digit of every job id. A capture group with an
    escaped dot extracts the id exactly.

    :param curl: company URL passed through to get_url_all.
    """
    url_all = get_url_all(curl)
    for url_get in url_all:
        print(url_get)
        match = re.search(r'([0-9]+)\.htm', url_get)
        if match is None:
            continue  # no numeric job id in this URL — skip it
        job_id = match.group(1)
        print(job_id)
        if not common.sql_select('zhilian', job_id):
            print(common.sql_select('zhilian', job_id))
            r = common.get_request(url_get)
            r.encoding = 'utf-8'
            job_dict = libzlcompany.extract(r.text)
            common.sql_main('zhilian', job_dict, url_get, job_id)
def run_work(curl):
    """Scrape Zhilian job postings reachable from company URL *curl*.

    NOTE(review): this redefines run_work above; only this definition
    survives module import — confirm the duplicate is intentional.

    Bug fix: the old `group()[:-5]` slice removed 5 characters although
    '.htm' is only 4 long, dropping the final digit of every job id.
    A capture group with an escaped dot extracts the id exactly.

    :param curl: company URL passed through to get_url_all.
    """
    url_all = get_url_all(curl)
    for url_get in url_all:
        print(url_get)
        match = re.search(r'([0-9]+)\.htm', url_get)
        if match is None:
            continue  # no numeric job id in this URL — skip it
        job_id = match.group(1)
        print(job_id)
        if not common.sql_select('zhilian', job_id):
            print(common.sql_select('zhilian', job_id))
            r = common.get_request(url_get)
            r.encoding = 'utf-8'
            job_dict = libzlcompany.extract(r.text)
            common.sql_main('zhilian', job_dict, url_get, job_id)
def main(job_list, option=0):
    """Scrape 51job postings from the URLs in *job_list*.

    option=0: re-fetch every URL, updating rows that already exist.
    option=1: fetch only job ids not yet present in the 'job51' table.

    :param job_list: iterable of 51job posting URLs.
    :param option: 0 to update existing rows, 1 to fetch new ids only.
    """
    for url in job_list:
        # Capture group with an escaped dot replaces `group()[:-5]`:
        # the bare '.' could swallow a digit, and a non-matching URL
        # raised AttributeError on .group().
        match = re.search(r'([0-9]+)\.html', url)
        if match is None:
            continue  # URL carries no job id — skip it
        job_id = match.group(1)
        if option == 0:
            r1 = common.get_request(url)
            # 51job pages are served in gb2312.
            r1.encoding = 'gb2312'
            job_dict = html_extract.extract_51(r1.text)
            common.sql_main(source='job51', job_dict=job_dict, url=url, job_id=job_id)
        elif option == 1:  # options are mutually exclusive: elif, not if
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                common.sql_main(source='job51', job_dict=job_dict, url=url, job_id=job_id)
Example #7
0
def main(job_list, option=0):
    """Scrape 51job postings from the URLs in *job_list*.

    NOTE(review): this redefines main above; only this definition
    survives module import — confirm the duplicate is intentional.

    option=0: re-fetch every URL, updating rows that already exist.
    option=1: fetch only job ids not yet present in the 'job51' table.

    :param job_list: iterable of 51job posting URLs.
    :param option: 0 to update existing rows, 1 to fetch new ids only.
    """
    for url in job_list:
        # Escaped dot + capture group: safer than `group()[:-5]`, which
        # crashed with AttributeError on URLs without a job id.
        match = re.search(r'([0-9]+)\.html', url)
        if match is None:
            continue  # URL carries no job id — skip it
        job_id = match.group(1)
        if option == 0:
            r1 = common.get_request(url)
            # 51job pages are served in gb2312.
            r1.encoding = 'gb2312'
            job_dict = html_extract.extract_51(r1.text)
            common.sql_main(source='job51',
                            job_dict=job_dict,
                            url=url,
                            job_id=job_id)
        elif option == 1:  # options are mutually exclusive: elif, not if
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                common.sql_main(source='job51',
                                job_dict=job_dict,
                                url=url,
                                job_id=job_id)