Example 1
def run_company(url, tp):  # written separately to work around itjuzi's crawl limits on company pages
    url2 = url
    page = 10
    flag = True
    keyword_all = []
    l_list = range(2, 2427)
    l_list2 = copy.copy(l_list)
    for i in l_list2:
        store_path1 = os.path.join(common.root_path, 'juzi', str(i) + '.html')
        if os.path.isfile(store_path1):
            l_list.remove(i)
        else:
            print store_path1
    print len(l_list), 999999999
    print l_list
    aa = common.get_request(url, timeout=18)
    headers2 = {
        'Origin': 'http://www.itjuzi.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Accept': '*/*',
        'Referer': 'http://www.itjuzi.com/company?page=2410',
        'Cookie':
        'grwng_uid=2e824b08-70ef-41f7-a9c8-62ff25d8f920; AWSELB=258D9D590E00B3DE939BD2301A2166BB8314D5BFDDA88D29F0E3F22E0935E83EF1C408A6B613204775BA26EA9BE8555ABB5A13289EDD9FCE01B44987A799A50A15E49578ED9A0D7D28BE3696012F59FED65EA97193',
        'Connection': 'keep-alive'
    }
    url_p = 'https://api.growingio.com/v2/eee5a46c52000d401f969f4535bdaa78/web/pv?stm=1459928218615'
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        headers['refer'] = url
        url = url2 + "?page={}".format(page)
        print url
        headers2['refer'] = url
        aa = common.get_request(url, timeout=18)
        # bb = common.post_request(url_p, headers=headers2, data='6\x86\xf0D\x08`\x96`\\`S$\x15\x82\x01`\x1b\x01\x8d\x90&\x002\x10\x09\x9a\xf8\x08\xc0\x19\x80\x9c\x19QZ\xc8\x0c\xcc\x80FDA\x00\xec\x00q\x80\r$\x00np\xc3\xe0\x07H\xd7\x80\x96"\x03\xa8!\x90 +\x88\xeeX+w\xcb\x8b.\x00\xb4dru\xd6\x84\xd1]\x11\x91`\x8b\xa7\x91\n\x17\xf0\xf5\xcd\xca\xbf0\x01\x9dUc)\xd7\'df\xdchDd\xc6\xa4\x8c\xba\xdc\x08&\xba\x92\x08DATT\xf8\xc8\xf8R`\x00."\x82\xeeY\x02\x19\x00\xb6pd\x0cI\xce\xb8d\xdc\xc8<\x02\xee\x00\x16p\x9c\x8c\x18\xb5\x00\xeeph%\x02D"\xad}bP\x19\x00VJ\x00^PbX\x00\xf6E\x02\x00\x0e"\x00\xf4\xd3\x05s\x10\x00v\x00\x9en\x00N\x14"u\x19\x19s\xb0\x8b\x8b}\xad\x03\xc3c\x13+\xcb3k[\x00\xfck\x00\xe6\x08\x00\xbc\xb8\x9dhn\x006"Q\x9dWE\x87Y\xb8\x00\x8e"7\xa7\xdb\xe6G\xc1\x80\x00\xbe|p\n\x9e\x06\xa0\xd1ht\xfaC\x18X.d\xb3Yl\xf6T\x93\x85\xc6\xe4\xf1\xa3\xbc\xbe\x7f X*\x113\xe0"Q\x18\x9cA&\x82H\xa4\xd2n\x0c\x80>\x08\x01\xb4T\x00\x7f*\x01\xd0\x95\x00\x16\x11\x80W\x0c\x80\x01\x00\x07\xc2P\x04\x90\x00\xaa\x00T-\x00\n\xda\xdc\xa8D\x1d\xed\xca*\xc0J\xc82\xb7\x02\xa5Q\xa9\x80\xe6Yx!\xd8\xe6\xe1\xeb\xc0.W\x11\xb8\xd2c3p-\xe0\xf7U\x86\xdb`#\xd8\x1c\x8e\'3\xad\xb0om\xb8\xcc\xdd\x8fM\x8b\xc3S\x09\xf8C\xd5\xef/\xa7^\x10\x88\x02\xe9\x00')
        # print bb.content
        common.get_request('http://www.itjuzi.com/company/{}'.format(
            random.choice(range(1, 500))),
                           timeout=10)
        store_path = os.path.join(common.root_path, 'juzi',
                                  str(page) + '.html')
        with open(store_path, 'w+') as f:
            f.write(aa.text.encode('utf8'))
        common.rand_sleep(30, 15)
        if len(l_list) == 0:
            flag = False
        else:
            page = random.choice(l_list)
            l_list.remove(page)
    return keyword_all
Example 2
def get_page_one(url):
    r2 = common.get_request(url)
    cc = content(r2.text)
    # print cc
    if morepage(r2.text):
        common.rand_sleep(3, 2)
        r3 = common.get_request(url + '?p=1')  # in case there is a second page
        logger.info('{} has two page, try to get page one'.format(url))
        cc += content(r3.text)
    dd = common.re_email(cc)
    print dd
    ee = list(set(dd))
    ff = ','.join(ee)
    return ff
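Note: get_page_one above leans on common.re_email, which is not part of this listing. A minimal sketch of such a helper, assuming it is just a regex scan over the page text (the real implementation may well be stricter), could look like:

import re

def re_email(text):
    # naive address pattern; only meant to show what get_page_one expects back (a list of matches)
    return re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text)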
Example 3
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    print company_id
    # if not sql_lg('lagou', company_id):
    if True:
        url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
        print url
        r = common.get_request(url, headers=header)
        # print r.url
        if r.status_code == 200:
            print url, '------------------' * 5
            #store_path = os.path.join(cwd,keyword,fname)
            # gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
            # if not os.path.exists(gs_fp):
            #     os.makedirs(gs_fp)
            # # fname = str(company_id) + '.html'
            # job_id = str(company_id)
            # job_id = job_id.rjust(8, '0')
            # store_path = os.path.join(gs_fp,job_id[0:3], job_id[3:6], job_id +'.html')
            # father_dir=os.path.dirname(store_path)
            # if not os.path.exists(father_dir):
            #     os.makedirs(father_dir)
            # with open(store_path, 'w+') as f:
            #     f.write(r.text)
            company_dict = company_parse(r.text)
            sql_lg_main('lagou',
                        job_dict=company_dict,
                        url=url,
                        company_id=company_id)
Example 4
def get_page(url):
    # s = requests.session()
    # r1 = s.get(url)
    r1 = common.get_request(url)
    r1.encoding = 'gb2312'
    soup = BeautifulSoup(r1.text, 'html.parser')
    job_num = soup.find('input', {'name': 'hidTotal'}).get('value')
    job_list = []
    job_1 = soup.find_all('p', {'class': 't1'})
    for i in job_1:
        job_list.append(i.a.get('href'))

    # build the payload once so that pageno actually advances; rebuilding it
    # inside the loop kept requesting page 2 over and over
    payload = {'pageno': 2, 'hidTotal': job_num}
    while len(job_list) < int(job_num):
        r2 = common.post_request(url, data=payload)
        r2.encoding = 'gb2312'
        soup2 = BeautifulSoup(r2.text, 'html.parser')
        job_2 = soup2.find_all('p', {'class': 't1'})
        for i2 in job_2:
            job_list.append(i2.a.get('href'))
        payload['pageno'] += 1

    print job_list

    print len(job_list), job_num
    return job_list
Example 5
def get_url_all(url):
    r = common.get_request(url, headers)
    url_first = libzlcompany.get_url_list(r.text)
    url_all = url_first
    flag = 1
    while flag:
        url_next = libzlcompany.find_next(r.text)
        # print url_next, 22222222222222
        if url_next:
            # fetch the next page before extracting; extracting from the current
            # r.text again would duplicate the first page and drop the last one
            r = common.get_request(url_next)
            url_next_list = libzlcompany.get_url_list(r.text)
            url_all.extend(url_next_list)
        else:
            flag = 0
    # print url_all, len(url_all), 111111111111
    return url_all
Example 6
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    print company_id
    if not sql_lg('lagou', company_id):
        url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
        print url
        r = common.get_request(url, headers=header)
        # print r.url
        if r.status_code == 200:
            print url, '------------------' * 5
            #store_path = os.path.join(cwd,keyword,fname)
            gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
            if not os.path.exists(gs_fp):
                os.makedirs(gs_fp)
            # fname = str(company_id) + '.html'
            job_id = str(company_id)
            job_id = job_id.rjust(8, '0')
            store_path = os.path.join(gs_fp, job_id[0:3], job_id[3:6], job_id + '.html')
            father_dir = os.path.dirname(store_path)
            if not os.path.exists(father_dir):
                os.makedirs(father_dir)
            with open(store_path, 'w+') as f:
                f.write(r.text)
            company_dict = company_parse(r.text)
            sql_lg_main('lagou', job_dict=company_dict, url=url, company_id=company_id)
Example 8
def kword(blog_id, blog_app, post_id):
    par = {'blogApp': blog_app,
     'blogId': blog_id,
     'postId': post_id}
    url = 'http://www.cnblogs.com/mvc/blog/CategoriesTags.aspx'
    keyword = ''
    try:
        ab = common.get_request(url, params=par)
        result = ab.json()
        tag = result['Tags']
        tag = tag[tag.find(':')+2:]
        cate = result['Categories']
        cate = cate[cate.find(':')+2:]
        so1 = ''
        so2 = ''
        try:
            so1 = BeautifulSoup(tag, 'html.parser').get_text()
        except:
            pass
        try:
            so2 = BeautifulSoup(cate, 'html.parser').get_text()
        except:
            pass
        keyword = so1 + ',' + so2
        # print keyword, 999999999999999999999
    except:
        pass
    return keyword
Example 9
def kword(blog_id, blog_app, post_id):
    par = {'blogApp': blog_app, 'blogId': blog_id, 'postId': post_id}
    url = 'http://www.cnblogs.com/mvc/blog/CategoriesTags.aspx'
    keyword = ''
    try:
        ab = common.get_request(url, params=par)
        result = ab.json()
        tag = result['Tags']
        tag = tag[tag.find(':') + 2:]
        cate = result['Categories']
        cate = cate[cate.find(':') + 2:]
        so1 = ''
        so2 = ''
        try:
            so1 = BeautifulSoup(tag, 'html.parser').get_text()
        except:
            pass
        try:
            so2 = BeautifulSoup(cate, 'html.parser').get_text()
        except:
            pass
        keyword = so1 + ',' + so2
        # print keyword, 999999999999999999999
    except:
        pass
    return keyword
Example 11
def get_destination_url_path(request=None):
    """Get the (effective, sans any "traversal namespace notation" components
    and other such "traversal processing instruction" url components) target 
    URL path of the (current) request.
    """
    if request is None:
        request = common.get_request()
    #_url = request.URL
    #_url = request.getURL(level=0, path_only=True)
    # NOTE: both URL and getURL() depend on where we are in the traversal 
    # process i.e. they return the *currently* traversed URL path and not 
    # the full requested path. 
    # 
    # So, we use the request's PATH_INFO but as this may contain:
    # - (++) any number of Zope "traversal namespace notation" url components
    # - (@@/) to indicate that the URL is for an object that is a resource
    # - (@@) to indicate a view name
    # we need to get rid of them:
    _url = "/".join([ url_component 
                      for url_component in request.get("PATH_INFO").split("/")
                      if not url_component.startswith("++") and
                         not url_component.startswith("@@") ])
    log.debug(" [get_destination_url_path] %s " % _url)
    return _url
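For illustration, the filtering rule above can be exercised on its own with a hypothetical PATH_INFO value; the helper below simply mirrors the list comprehension:

def strip_traversal_components(path_info):
    # drop "++namespace++" and "@@view"-style components, keep everything else
    return "/".join(c for c in path_info.split("/")
                    if not c.startswith("++") and not c.startswith("@@"))

assert strip_traversal_components("/site/++skin++custom/docs/@@index.html") == "/site/docs"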
Example 12
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
    print url
    r = common.get_request(url, headers=lagouall.header)
    # print r.url
    if r.status_code == 200:
        print url, '------------------' * 5
        #store_path = os.path.join(cwd,keyword,fname)
        # gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
        # if not os.path.exists(gs_fp):
        #     os.makedirs(gs_fp)
        # # fname = str(company_id) + '.html'
        # job_id = str(company_id)
        # job_id = job_id.rjust(8, '0')
        # store_path = os.path.join(gs_fp,job_id[0:3], job_id[3:6], job_id +'.html')
        # father_dir=os.path.dirname(store_path)
        # if not os.path.exists(father_dir):
        #     os.makedirs(father_dir)
        # with open(store_path, 'w+') as f:
        #     f.write(r.text)
        company_dict = lagouall.company_parse(r.text)
        return company_dict
Example 13
def run_work(url):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    payload = company_payload(url)
    job_list = get_job_list(payload)
    for job_id in job_list:
        job_url = 'http://www.lagou.com/jobs/' + str(job_id) + '.html'
        print job_url
        if not common.sql_select('lagou', job_id):
            r = common.get_request(job_url)
            ##            if r.status_code == 200:
            ##                r.encoding = 'utf-8'
            ##                job_dict = liblagoucompany.extract2(r.text)
            ##                common.sql_main('lagou', job_dict, job_url, job_id)
            ##                gs_fp = os.path.join(cwd, 'jobs', 'lagou')
            ##                if not os.path.exists(gs_fp):
            ##                    os.makedirs(gs_fp)
            ##                job_id = str(job_id).rjust(9, '0')
            ##                store_path = os.path.join(gs_fp,job_id[0:3], job_id[3:6], job_id +'.html')
            ##                father_dir=os.path.dirname(store_path)
            ##                if not os.path.exists(father_dir):
            ##                    os.makedirs(father_dir)
            ##                with open(store_path, 'w+') as f:
            ##                    f.write(r.text)
            ##                common.rand_sleep(1)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                job_dict = liblagoucompany.extract2(r.text)
                common.sql_main('lagou', job_dict, job_url, job_id)
Example 14
def main(source):
    url = url_cr(source)
    logging.debug('chuansong url is {}'.format(url))
    r = common.get_request(url)
    if r:
        html = r.text
        one_page(html, source)
        try:
            url2 = next_page(html)
            logging.debug('page 2 url is {}'.format(url2))
            while url2:
                r2 = common.get_request(url2)
                html2 = r2.text
                one_page(html2, source)
                url2 = next_page(html2)
        except Exception as e:
            logging.error('err get next page msg is {}'.format(e), exc_info=True)
Example 15
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            b2 = soup_2.find_all(
                'a',
                {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})  # article links on the current page
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Example 16
def main(job_list, option=0):
    """会更新旧的岗位信息 option=0
    只抓取新增加的 option=1"""
    for url in job_list:
        job_id = re.search('[0-9]+.html', url).group()[:-5]
        if option == 0:
            r1 = common.get_request(url)
            r1.encoding = 'gb2312'
            job_dict = html_extract.extract_51(r1.text)
            # job_id = re.search('[0-9]+.html', url).group()[:-5]
            common.sql_main(source='job51', job_dict=job_dict, url=url, job_id=job_id)
        if option == 1:
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                # job_id = re.search('[0-9]+.html', url).group()[:-5]
                common.sql_main(source='job51', job_dict=job_dict, url=url, job_id=job_id)
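A hypothetical call for each mode (the URL below is made up; any job URL ending in digits plus .html satisfies the job_id regex):

job_urls = ['http://jobs.51job.com/shanghai/71234567.html']  # hypothetical
main(job_urls, option=0)  # re-parse and update postings that are already stored
main(job_urls, option=1)  # only fetch postings not yet present in the job51 table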
Example 17
def main():
    source = 'mux'
    page = 1
    flag = True
    url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
    while flag:
        try:
            print url
            res = common.get_request(url)
            logging.info('return url {} success'.format(res.url))
            print res.url
            soup = BeautifulSoup(res.text, 'html.parser')
            with open('temp.html', 'w+') as f:
                f.write(res.text.encode('utf8'))
            articles = soup.find_all('div', class_='artical_inner')
            for item in articles:
                contents = item.contents
                article_url = contents[9].a.get('href')
                article_title = str(contents[3].a.get('title')).strip()
                if not common.select(article_url, source):
                    pub_time = time.strftime('%Y-%m-%d',\
                        time.strptime(str(contents[5].get_text()).split('|')[-1].strip(), '%Y年%m月%d日'))
                    keyword = str(
                        contents[5].get_text()).split('|')[-2].strip()
                    content = get_content(
                        common.get_request(article_url).text)
                    print article_title
                    common.sql_insert(source, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = r'http://mux.baidu.com/\?page_id=10\S+paged={}'.format(
                page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                flag = False
            else:
                url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Example 18
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            b2 = soup_2.find_all('a', {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})  # article links on the current page
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title, content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Example 19
def one_page(html, source):
    ll = link_list(html)
    common.rand_sleep(3, 1)
    for i in ll:
        url, title = i.split(',')
        logging.debug('next url is {}'.format(url))
        if not sql_se(source, title):
            r2 = common.get_request(url)
            title2, content, pub_time = page_parse(r2.text)
            common.sql_insert(source, url, title, content, pub_time, '')
        common.rand_sleep(6, 2)
Example 20
def get_first(keyword='android', payload=payload1):
    url_list = []
    payload['keyword'] = keyword
    r = common.get_request(url, params=payload, cookies=cookies_dict)

    # save the current page
    #print r.text
    url_list = find_url(r.text)
    with open('company.html', 'w+') as f:
        f.write(r.text.encode('ISO-8859-1'))
    return r.text, url_list
Example 21
def run_work(keyword='python'):
    url_list = get_url_list(keyword)
    for url_get in url_list:
        print url_get
        job_id = re.search('[0-9]+.html', url_get).group()[:-5]
        print job_id
        if not common.sql_select('job51', job_id):
            r = common.get_request(url_get)
            r.encoding = 'gb2312'
            job_dict = extract2(r.text)
            common.sql_main('job51', job_dict, url_get, job_id)
Example 22
def run_work(url):
    cid = company_id(url)
    job_l = job_list(cid)
    for job_id in job_l:
        job_url = 'http://www.cjol.com/jobs/job-' + job_id
        print job_url
        print job_id
        if not common.sql_select('cjol', job_id):
            r = common.get_request(job_url)
            r.encoding = 'utf-8'
            job_dict = libcjolcompany.extract2(r.text)
            common.sql_main('cjol', job_dict, job_url, job_id)
Example 23
def run_work(curl):
    url_all = get_url_all(curl)
    for url_get in url_all:
        print url_get
        job_id = re.search('[0-9]+.htm', url_get).group()[:-5]
        print job_id
        if not common.sql_select('zhilian', job_id):
            print common.sql_select('zhilian', job_id)
            r = common.get_request(url_get)
            r.encoding = 'utf-8'
            job_dict = libzlcompany.extract(r.text)
            common.sql_main('zhilian', job_dict, url_get, job_id)
Example 25
def all_blog():
    url_cnblog = 'http://www.cnblogs.com/AllBloggers.aspx'
    aa = common.get_request(url_cnblog)
    soup_a = BeautifulSoup(aa.text, 'html.parser')
    aa = soup_a.find_all('td')
    aa = aa[1:]
    a_dict = dict()
    for i in aa:
        blog_url = i.a.get('href')
        blog_name = blog_url[blog_url.find('com/') + 4:-1]
        blog_cnname = i.a.get_text().strip()
        a_dict.update({blog_name: blog_cnname})
    return a_dict
Example 27
def run1():
    db = MySQLdb.connect(**common.sql_config)
    cursor = db.cursor(MySQLdb.cursors.SSCursor)
    sql_1 = """select id, url, content from news """
    cursor.execute(sql_1)
    print cursor.rowcount
    i = 0
    row = True  # any truthy value just to enter the loop; fetchmany() below does the fetching

    while row:  # an empty batch from fetchmany() ends the loop
        i += 1
        if i % 100 == 0:
            print i, 666666666666666
        row = cursor.fetchmany(size=500)
        # print row
        for row_id, url, content in row:
            # print row_id
            if comb(content, 250) and 'v2ex.com' not in url:
                # print content, 111111111111111111111
                r = common.get_request(url)
                if r.url.startswith('http://mp.weixin.qq.com/'):
                    soup2 = BeautifulSoup(r.text, 'html.parser')
                    title = soup2.find('title').get_text().encode('utf8')
                    content = soup2.find('div',
                                         {'class': 'rich_media_content'})
                    content = unicode(content).encode('utf8')
                else:
                    content = Document(r.text.encode(
                        r.encoding, 'ignore')).summary().encode('utf-8')
                    title = Document(r.text.encode(
                        r.encoding)).short_title().encode('utf-8')
                db2 = MySQLdb.connect(**common.sql_config)
                cursor2 = db2.cursor()
                if not comb(content, 250) and 'mp.weixin.qq.com' in url:
                    sql = """update news set rating = 0, content = '{}' where id = '{}'""".format(
                        db2.escape_string(content), row_id)
                    print 2222222222
                else:
                    sql = """update news set rating = -1, content = '{}' where id = '{}' """.format(
                        db2.escape_string(content), row_id)
                try:
                    cursor2.execute(sql)
                    db2.commit()
                except Exception, e:
                    print e
                    db2.rollback()
                db.ping(True)
                db2.close()
                print row_id, 777777777777777777777
                print url
Example 28
def run(url, tp):
    aa = common.get_request(url, timeout=8)
    url2 = url
    page = 1
    flag = True
    keyword_all = []
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        if aa.text.find(u'下一页') < 0:  # u'下一页' is the "next page" link text
            flag = False
        else:
            page += 1
            headers['refer'] = url
            url = url2 + "?page={}".format(page)
            print url
            aa = common.get_request(url, headers=headers, timeout=8)
        common.rand_sleep(9, 4)
    return keyword_all
Example 29
def main(job_list, option=0):
    """会更新旧的岗位信息 option=0
    只抓取新增加的 option=1"""
    for url in job_list:
        job_id = re.search('[0-9]+.html', url).group()[:-5]
        if option == 0:
            r1 = common.get_request(url)
            r1.encoding = 'gb2312'
            job_dict = html_extract.extract_51(r1.text)
            # job_id = re.search('[0-9]+.html', url).group()[:-5]
            common.sql_main(source='job51',
                            job_dict=job_dict,
                            url=url,
                            job_id=job_id)
        if option == 1:
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                # job_id = re.search('[0-9]+.html', url).group()[:-5]
                common.sql_main(source='job51',
                                job_dict=job_dict,
                                url=url,
                                job_id=job_id)
Example 30
def get_image_address(url):
    newString = url_to_string(url)

    image = get_request(url)
    if str(image).isdigit():
        pass
    else:
        soup_image = BeautifulSoup(image)
        address = []

        img = soup_image.findAll("a", {'target':'_blank'})
        if len(img) :
            for val in img:
                address.append(val.get("href"))
            for val in address:
                imagesAddress.append(newString+val)
Example 31
def job_list(c_id):
    param = {
        'CompanyID': c_id,
        'PageNo': '1',
        'PageSize': '100',
    }
    header = {
        'Host': 'www.cjol.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML'
                      ', like Gecko) Chrome/47.0.2526.111 Safari/537.36'
    }
    curl = 'http://www.cjol.com/jobs/company/joblist'
    r = common.get_request(curl, params=param, headers=header)
    job_l = []
    soup = BeautifulSoup(r.text, 'html.parser')
    soupa = soup.find_all('a')
    for a in soupa:
        job_id = company_id(a.get('href'))
        job_l.append(job_id)
    return job_l
Example 32
def main():
    rootdir = os.getcwd()
    print rootdir
    try:
        company_list_dir = os.path.join(rootdir, 'juzi')
        # for subdir, dirs, files in os.walk(company_list_dir):
        #     for file in files:
        #         logger.info('current file is {}'.format(file))
        #         fff = os.path.join(subdir, file)
        #         with open(fff) as f:
        #             ff = f.read()
        #         url_list = find_all_link(ff)
        #         print url_list
        # for i in url_list:
        for num in xrange(35770, 36000):
            i = 'http://www.itjuzi.com/company/' + str(num)
            try:
                logger.info('current url is {}'.format(i))
                juzi_id = i.replace('http://www.itjuzi.com/company/', '')
                if not sql_sel(juzi_id):
                    logger.info('try to insert {} into mysql'.format(juzi_id))
                    gs_fp = os.path.join(rootdir, 'juzicompany')
                    if not os.path.exists(gs_fp):
                        os.makedirs(gs_fp)
                    job_id = str(juzi_id)
                    job_id = job_id.rjust(5, '0')
                    store_path = os.path.join(gs_fp, job_id[0:3], job_id + '.html')
                    father_dir = os.path.dirname(store_path)
                    if not os.path.exists(father_dir):
                        os.makedirs(father_dir)
                    r = common.get_request(i)
                    if r:
                        with open(store_path, 'w+') as f:
                            f.write(r.text)
                        ll = parse_page(r.text)
                        sql_in(juzi_id, ll)
                        common.rand_sleep(5, 2)
            except:
                logger.error('something wrong ', exc_info=True)
    except:
        logger.error('something wrong ', exc_info=True)
Example 33
def get_url_list(keyword):
    url_list = []
    for industry in industry_list:
        payload1['industrytype'] = industry
        s = requests.Session()
        first_result = get_first(keyword, payload1)
        first_page = first_result[0]
        n = 1
        next_url = get_next(first_page)
        url_list.extend(first_result[1])
        while next_url:
            #print next_url
            n += 1
            fname = 'company'+str(n) + '.html'
            r = common.get_request(next_url, payload1)
            #r = s.get(next_url, params = payload1, cookies=cookies_dict)
            url_list2 = find_url(r.text)
            url_list.extend(url_list2)

            next_url = get_next(r.text)

    url_list = list(set(url_list))
    print len(url_list)
    return url_list
Example 34
def getpage(page):
    url = 'http://www.v2ex.com/go/cv?p={}'.format(page)
    r = common.get_request(url)
    return r
Example 35
 print title
 content = aa.description.encode('utf8')
 if url_list[i]['full'] == 1:
     if aa.has_key('content'):
         content = aa.content[0]['value'].encode('utf8')
         logging.info('{} has full context output'.format(i))
 pub_time = aa.published_parsed
 pub_time = datetime.datetime.fromtimestamp(mktime(pub_time))
 print pub_time
 if not select(url, i):
     i_num2 += 1
     if url_list[i]['full'] != 1:
         try:
             if i == 'oschina blog':
                 url_2 = url + '?fromerr=dy4SuBAE'
                 r = common.get_request(url_2)
             else:
                 r = common.get_request(url)
             print r.url
             print r.encoding
             soup = BeautifulSoup(r.text.encode(r.encoding), 'html.parser')
             keyword = soup.find('meta', {'name': 'keywords'})
             print r.encoding
             if keyword:
                 keyword = keyword.get('content')
                 keyword = keyword.encode('utf8', 'ignore')
             else:
                 keyword = ''
             try:
                 if i == 'phphub':
                     keyword = soup.find('div', {'class': 'meta inline-block'}).a.get_text()
Example 36
async def verify(repos):
    requests = [get_request(URL_FORMAT.format(r)) for r in repos]
    # gather() no longer accepts a loop argument (deprecated in 3.8, removed in 3.10);
    # with return_exceptions=True failed requests come back as exception objects,
    # so skip those before looking for 'full_name'
    responses = await asyncio.gather(*requests, return_exceptions=True)
    return [x['full_name'] for x in responses
            if not isinstance(x, Exception) and 'full_name' in x]
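A hypothetical driver for the coroutine above, assuming get_request is an async helper that resolves to parsed JSON and URL_FORMAT is a repository-lookup URL template:

import asyncio

repos = ['octocat/Hello-World']     # hypothetical input
found = asyncio.run(verify(repos))  # asyncio.run sets up and tears down the event loop
print(found)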
Example 37
def get_spotify_track_data(title, data, token):
    """
    Try to match tracks with their spotify id.

    json.dumps gets a better match in cases where the artist name or track name
    includes non-alphanumeric characters.
    """

    search_endpoint = 'https://api.spotify.com/v1/search?'
    tracks_data = {}
    not_found = []
    if data:
        for i in data:
            artist = json.dumps(urllib.parse.quote_plus(i[0]))
            track = json.dumps(urllib.parse.quote_plus(i[1]))
            query = ''.join([
                'q=', 'artist:"', artist, '"+', 'track:"', track,
                '"&type=track&limit=1'
            ])
            url = search_endpoint + query
            r = common.get_request(url, {'Authorization': 'Bearer ' + token})
            if r.status_code == 200:
                d = r.json()
                if d:
                    if d['tracks']['items'] and d['tracks']['total'] > 0:
                        album_info = d['tracks']['items'][0]
                        album_name = album_info['album']['name']
                        album_id = album_info['album']['id']
                        album_url = album_info['album']['external_urls'][
                            'spotify']

                        artist_info = album_info['artists'][0]
                        artist_name = artist_info['name']
                        artist_id = artist_info['id']
                        artist_url = artist_info['external_urls']['spotify']

                        track_name = album_info['name']
                        track_id = album_info['id']
                        track_uri = album_info['uri']
                        track_url = album_info['external_urls']['spotify']

                        tracks_data[track_id] = {
                            'track_name': track_name,
                            'track_url': track_url,
                            'track_uri': track_uri,
                            'album': {
                                'album_id': album_id,
                                'album_name': album_name,
                                'album_url': album_url
                            },
                            'artist': {
                                'artist_id': artist_id,
                                'artist_name': artist_name,
                                'artist_url': artist_url
                            }
                        }
                        # # debugging block
                        # if i[1] not in track_name.lower():
                        #     print('track name difference')
                        #     print(i[0], i[1])
                        #     print(url)
                        #     # pprint(d)
                        #     pprint(tracks_data[track_id])
                        #     print()
                        # if i[0] not in artist_name.lower():
                        #     print('artist name difference')
                        #     print(i[0], i[1])
                        #     print(url)
                        #     pprint(tracks_data[track_id])
                        #     print()
                    else:
                        print('There was a problem matching track')
                        print(i)
                        print(d)
                        print()
                        not_found.append(i)
                else:
                    print('There was a problem with the request')
                    print(r)
                    not_found.append(i)

    if tracks_data:
        common.save_to_json(tracks_data, './json/' + title + '_data.json')
        print(len(tracks_data), 'tracks identified')
        print()
    if not_found:
        common.save_to_json(not_found, './json/' + title + '_not_found.json')
        print(len(not_found), 'unidentified tracks')
        pprint(not_found)
        print()
    return tracks_data
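To illustrate the docstring's point about non-alphanumeric characters, this is roughly what the query fragment looks like for a hypothetical artist/track pair; the doubled quotes come from wrapping the already URL-encoded value in json.dumps:

import json
import urllib.parse

artist = json.dumps(urllib.parse.quote_plus('sigur rós'))
track = json.dumps(urllib.parse.quote_plus('svefn-g-englar'))
query = ''.join(['q=', 'artist:"', artist, '"+', 'track:"', track, '"&type=track&limit=1'])
# -> q=artist:""sigur+r%C3%B3s""+track:""svefn-g-englar""&type=track&limit=1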
Example 38
def returnUrlAddress():
    html = get_request(domain)
    urlList = detail_url_list(html)
    image_address(urlList)
    return imagesAddress
Example 39
def company_parse(html):
    # parse a Lagou company profile page
    soup = BeautifulSoup(html, 'html.parser')
    base_info = soup.find('div', {'id': 'basic_container'})
    aa = base_info.find_all('li')
    company_type, company_process, company_size, company_city, company_product, job_num = '', '', '', '', '', ''
    company_name, company_url, company_word = '', '', ''
    company_main = soup.find('a', {'class': 'hovertips'})
    company_short_name = ''
    try:
        company_name = company_main.get('title').strip()
        company_short_name = company_main.get_text().strip()
        print company_short_name
    except:
        pass
    try:
        company_url = company_main.get('href')
    except:
        pass
    try:
        company_word = soup.find('div', {
            'class': 'company_word'
        }).get_text().strip()
    except:
        pass
    # print company_name, company_url, company_word
    company_leader = ''
    soup3 = soup.find_all('p', {'class': 'item_manager_name'})
    for i3 in soup3:
        company_leader += i3.span.text + ','

    for i1 in aa:
        # print i1.i.get('class')
        if 'type' in i1.i.get('class'):
            company_type = i1.span.text
        if 'process' in i1.i.get('class'):
            company_process = i1.span.text
        if 'number' in i1.i.get('class'):
            company_size = i1.span.text
        if 'address' in i1.i.get('class'):
            company_city = i1.span.text
    company_product_soup = soup.find_all('div', {'class': 'product_url'})
    for i2 in company_product_soup:
        company_product += i2.a.text.strip() + ','
    soup2 = soup.find('div', {'class': 'company_data'}).find_all('li')
    job_num, job_percent, job_day, job_feedback, last_login = '', '', '', '', ''
    logo = ''
    try:
        logo_url = soup.find('img', {'alt': u'公司Logo'}).get('src')
        print logo_url
        r_img = common.get_request(logo_url)
        logo = r_img.content.encode('base64').replace('\n', '')
    except:
        pass
    company_tag = soup.find_all('li', {'class': 'con_ul_li'})
    tag_str = ''
    try:
        tag_str = ','.join([i.get_text().strip() for i in company_tag])
        print tag_str
    except:
        pass
    try:
        company_desc = soup.find('div', {
            'class': 'company_intro_text'
        }).span.get_text().strip()
    except:
        company_desc = ''
    try:
        job_num = soup2[0].strong.text.strip()
    except:
        pass
    try:
        job_percent = soup2[1].strong.text.strip()
    except:
        pass
    try:
        job_day = soup2[2].strong.text.strip()
    except:
        pass
    try:
        job_feedback = soup2[3].strong.text.strip()
    except:
        pass
    try:
        last_login = soup2[4].strong.text.strip()
    except:
        pass
    if soup.find('a', {'class': 'identification'}):
        company_verify = '1'
    else:
        company_verify = '0'
    # print company_type, company_process, company_size, company_city, company_product, company_verify
    # print job_num, job_percent, job_day, job_feedback, last_login
    # print company_leader, company_name, company_url, company_word
    company_dict = {
        'company_type': company_type,
        'company_process': company_process,
        'company_size': company_size,
        'company_city': company_city,
        'company_product': company_product,
        'company_verify': company_verify,
        'job_num': job_num,
        'job_percent': job_percent,
        'job_day': job_day,
        'job_feedback': job_feedback,
        'last_login': last_login,
        'company_leader': company_leader,
        'company_name': company_name,
        'company_url': company_url,
        'company_word': company_word,
        'company_tag': tag_str,
        'company_short_name': company_short_name,
        'company_desc': company_desc,
        'logo': logo
    }
    # print len(company_dict)
    return company_dict