def FiveEightJobs(assignPage='1', totalPages=1, nextUrl=''):
    '''
    # Function: Submit the search to 58.com and collect every job entry on the
    #           search-result list page.
    # Params  : assignPage = page number, totalPages = how many pages to crawl,
    #           nextUrl = explicit URL of the next page (optional)
    '''
    print 'Trying to process General Search List Page %s ===========' % (assignPage if assignPage else '1')

    # === Fetch the page source ===
    url = nextUrl if nextUrl else gen58JobUrl(pn=assignPage)
    webTarget = webPageSourceCode(url)
    if not webTarget:
        return ''  # Bail out if nothing came back from the network.
                   # (Whether this is really the right behaviour is still an open question. -_-!)

    # === Parse with BeautifulSoup. This is the slowest step: the wrong parser costs ~7s per page ===
    soup = BeautifulSoup(webTarget['html'], 'html5lib')

    # === Check whether this page has any results at all ===
    with open('log.html', 'w') as f:
        f.write(soup.prettify('utf-8'))  # dump the parsed page for debugging
    if bsGet(soup, css='#searchTip', withTxt='抱歉') or \
       bsGet(soup, css='h1[class="item"]', withTxt='抱歉'):
        print 'No results found, or you have been blocked. -_-!'
        return ''  # Nothing shown on the page, so give up here.

    # === Read the real page number ===
    truePage = bsGet(soup, css='div[class="pagerout"] div[class="pager"] strong')
    truePage = int(truePage) if truePage else 1  # With less than one page of results there is no pager at all.

    # === Get the next-page link ===
    # 58's next-page link is incomplete -_-! Building it ourselves is easier than completing it.
    try:
        nextUrl = bsGet(soup, css='div[class="pagerout"] a[class="next"]', attri='href')
    except:
        print 'No link of next-page found.'

    # === Collect the job entries ===
    blocks = soup.select('[logr$="ses^composite^0"]')
    print '=== Detected %d job entries on this page.' % len(blocks)

    if len(blocks):
        titles = 'jobName, jobLink, cmpName, cmpLink, cmpLoc, jobUpdate'
        values = []
        for row in blocks:
            if bsGet(row, css='div[class="tuiguang"]'):
                continue  # Skip promoted (ad) entries.
            values.append([
                bsGet(row, css='a[_t="common"]'),                # jobName
                bsGet(row, css='a[_t="common"]', attri='href'),  # jobLink
                bsGet(row, css='div[class="titbar"] h2'),        # cmpName
                '',                                              # cmpLink: no selector in the original; empty placeholder keeps the row aligned with `titles`
                bsGet(row, css='dd[class="w96"]'),               # cmpLoc
                bsGet(row, css='dd[class="w68"]')                # jobUpdate
            ])
        # Output: write the rows as a MySQL .sql file.
        sqlfile = './data/INSERT_INTO_TEMP_SEARCHRESULTS_FiveEight.sql'
        fback = sqlInsert('TEMP_SEARCHRESULTS_FiveEight', titles, values, sqlfile=sqlfile)
        # print fback
    else:
        print 'No records found on this page.'

    # === Recurse into the next page until totalPages is reached ===
    if int(truePage) < int(totalPages):
        if not nextUrl and truePage < int(assignPage):
            FiveEightJobs(assignPage='%d' % (truePage + 1), totalPages=totalPages)
        else:
            FiveEightJobs(assignPage='%d' % (truePage + 1), nextUrl=nextUrl, totalPages=totalPages)
    else:
        print '-' * 50 + 'Reached the end of records. truePage[%s], assignPage[%s], totalPages[%s].' % (truePage, assignPage, totalPages)
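# A minimal sketch of how the 58 job crawl above could be started: one call at page 1, and the
# function recurses through the following pages itself. The page count is an illustrative value
# and `crawl58Jobs` is a hypothetical driver name, not part of the original code.
def crawl58Jobs(pages=3):
    '''Crawl the first `pages` result pages of the 58 job search (sketch).'''
    FiveEightJobs(assignPage='1', totalPages=pages)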
def FiveEightRoster(nextUrl='', assignPage=1, city='', industry=''):
    '''
    # Function: Crawl 58.com's company roster pages. Only the company name and its link are collected.
    # Notes   : 1. For efficiency this is a "triple recursion". On the first run it walks the city links
    #              one by one, opens a city, then walks that city's industry links and reads every roster
    #              page in a loop before moving on to the next city.
    #           2. It turned out to be... too efficient! 58 blocked the IP in under two minutes. -_-! Now what...
    '''
    # === First run: collect the roster pages for every city and industry from the main page ===
    if not nextUrl:
        print '=' * 80 + 'First Run.'
        webTarget = webPageSourceCode('http://qy.58.com/citylist/')  # Start from the page listing all cities.
        if not webTarget:
            return ''
        soup = BeautifulSoup(webTarget['html'], 'html5lib')
        if not city:
            ctLinks = soup.select('#clist a[href^="http://qy.58.com/"]')
            for ct in ctLinks:
                FiveEightRoster(ct['href'], city=ct.get_text(strip=True))
        else:
            indLinks = soup.select('[class^="indCateList"] a[href^="http://qy.58.com/"]')
            for link in indLinks:
                FiveEightRoster(link['href'], city=city, industry=link.get_text(strip=True))
        return ''

    # === Read every page of one category. Works fine on its own. ===
    print 'Trying to process list-page %d of the Firm Roster for city [%s] ===========' % (assignPage, city)
    url = 'http://qy.58.com/%s/pn%d' % (city, assignPage) if not nextUrl else nextUrl
    # url = './templates/58Firm-Roster.html'
    webTarget = webPageSourceCode(url)
    if not webTarget:
        return ''
    soup = BeautifulSoup(webTarget['html'], 'html5lib')

    firms = soup.select('[class="compList"] a[href^="http://qy.58.com/"]')
    if not len(firms):  # No entries means we have reached the end.
        print 'You have reached the end of records, or maybe you have been blocked.'
        return ''

    titles = 'cmpName, cmpLink_58, cmpCity, industry'
    values = [[tag.get_text(strip=True), tag['href'], city, industry] for tag in firms]
    print '=== Detected %d firms on this page.' % len(values)

    subpath = '_'.join(urlAnalyse(url)['path'].split('/'))
    sqlfile = './data/INSERT_INTO_FIRMS%spn%d.sql' % (subpath, assignPage)
    sqls = sqlInsert(table='FIRMS', titles=titles, values=values, sqlfile=sqlfile)
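# The docstring above notes that 58 blocked the IP within two minutes, which points at the
# recursive calls firing back to back. A minimal throttling sketch, assuming a random pause of a
# few seconds is enough; `politePause` and the delay bounds are assumptions, not part of the
# original code. One place it could be called is right before each webPageSourceCode() fetch.
import random
import time

def politePause(minDelay=2.0, maxDelay=6.0):
    '''Sleep for a random interval so consecutive requests are spaced out (sketch).'''
    time.sleep(random.uniform(minDelay, maxDelay))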
def ZhilianSearchList(keyword='数据', assignPage=1, totalPages=1, scope=0, nextUrl=''):
    '''
    # Function: Submit a search to Zhilian (zhaopin.com) and collect every job entry on the result list page.
    # Params  : keyword = search keyword, assignPage = page number
    '''
    if scope == 0:
        print 'Trying to process General Search List Page %d ===========' % assignPage
    elif scope == 1:
        print 'Trying to process Company Search List Page %d -----------' % assignPage
    elif scope == 2:
        print 'Trying to process Position Search List Page %d -----------' % assignPage

    # === Build the URL parameters ===
    urlParams = {
        'kw': keyword,      # Search keyword.
        'sm': 0,            # Display mode: list '0' | detail '1'. The source differs; list mode is easier to parse.
        'jl': '北京',        # City to search in; join multiple cities with '+' (URL-encoded as %2B).
        # 'bj': '',         # Job-category code, e.g. '160200' for 互联网产品/运营管理; join multiple with '%3B'.
        # 'in': '',         # Industry code; join multiple with ';' (URL-encoded as %3B).
        'kt': scope,        # Keyword scope: full text '0' | company name '1' | job title '2'.
        'isadv': 0,         # Quick search '0' | advanced search '1'.
        # 'isfilter': 1,    # Whether this is a filter request: '0' | '1'.
        # 'ispts': '',      # Usually '1'.
        # 'sj': '',         # Job sub-category code.
        # 'gc': '5号',       # Subway line, e.g. '5号'.
        # 'ga': '立水桥',    # Place or subway-station name, e.g. '天通苑南', '小汤山'.
        # 'sb': 0,          # Sort order: default '0' | by relevance '1' | by first-publish date '2'.
        # 'fjt': '10000',   # Job tags: 五险一金 '10000', 年底双薪 '10001', 绩效奖金 '10002', etc.
        # 'sf': -1,         # Minimum monthly pay, e.g. '8001'; '-1' = no limit.
        # 'st': -1,         # Maximum monthly pay, e.g. '10000'; '-1' = no limit.
        # 'ct': -1,         # Company-type code.
        # 'el': -1,         # Education-level code.
        # 'we': -1,         # Work-experience code.
        # 'et': -1,         # Job-type code: part-time '1' | full-time '2' | internship '4'.
        # 'pd': -1,         # Publish time in days: '7' = one week, '30' = one month, '-1' = no limit.
        'p': assignPage,    # Page number; anything past the last page just shows the last page.
        # 'gr': '',         #
        # 're': '2015',     # Limits the number of results, though it is not really a by-year filter.
        'sg': '',           # The site-wide unique identifier (GUID).
    }

    # === Fetch the page source ===
    # A timeout would help here: if no result comes back for too long, spoof the IP and try again.
    # The same goes for a failed fetch.
    if nextUrl:
        webTarget = webPageSourceCode(nextUrl)
    else:
        webTarget = webPageSourceCode('http://sou.zhaopin.com/jobs/searchresult.ashx', urlParams)
    if not webTarget:
        return ''  # Bail out if nothing came back from the network.
                   # (Whether this is really the right behaviour is still an open question. -_-!)

    # === Parse with BeautifulSoup. This is the slowest step: the wrong parser costs ~7s per page ===
    soup = BeautifulSoup(webTarget['html'], 'html5lib')

    # === Read the total number of search results ===
    total_results = bsGet(soup, css='[class$=search_yx_tj] em')
    print 'There are %s results found in total.' % total_results
    if total_results == '0':
        return ''  # Nothing on this page, so stop here.

    '''
    # === Read the site-wide unique ID, i.e. the `sg` URL parameter ===
    tags = soup.select('#guid')
    guid = tags[0]['value'] if len(tags) else ''
    print 'The "guid" is %s.' % guid
    '''

    # === Read the real page number ===
    truePage = bsGet(soup, css='[class*="pagesDown"] a[class*="current"]')
    truePage = int(truePage) if truePage else 1  # With less than one page of results there is no pager at all.

    # === Get the next-page link ===
    try:
        nextUrl = soup.select('a[class*=next-page]')[0]['href']
    except:
        print 'No link of next-page found.'

    # === Collect the job entries ===
    blocks = soup.select('[class$=newlist]')
    print '=== Detected %d job entries on this page.' % len(blocks)

    if len(blocks):
        titles = 'jobName,cmpName,feedback,workingAge,eduReq,cmpType,cmpSize,jobDescri,jobLink,cmpLink,payMonthly,cmpLoc,jobUpdate'
        values = []
        for row in blocks:
            values.append([
                bsGet(row, css='[class$=zwmc]'),                                # job name
                bsGet(row, css='[class$=gsmc]'),                                # company name
                bsGet(row, css='[class$=fk_lv]'),                               # feedback rate
                bsGet(row, withTxt='经验:'),                                     # work experience
                bsGet(row, withTxt='学历:'),                                     # education background
                bsGet(row, withTxt='公司性质:'),                                 # company type
                bsGet(row, withTxt='公司规模:'),                                 # company size
                bsGet(row, withTxt='岗位职责:'),                                 # job description
                bsGet(row, css='[class$=zwmc] a[href^="http"]', attri='href'),  # job link
                bsGet(row, css='[class$=gsmc] a[href^="http"]', attri='href'),  # company link
                bsGet(row, css='[class$=zwyx]', withTxt='职位月薪:'),            # monthly pay
                bsGet(row, css='[class$=gzdd]', withTxt='地点:'),                # work location
                bsGet(row, css=['[class$=gxsj]', 'dl p']),                      # update time
            ])
        # print 'withTxt is an unicode string:', type(values[0][4]) == type(u'')        # True
        # print 'attri is an unicode string:', type(values[0][8]) == type(u'')          # True
        # print 'multi-search got an unicode string:', type(values[0][8]) == type(u'')  # True

        '''
        # === Sub-link crawling: newer plan ===
        # Do not parse the sub-pages here, so that one failure cannot break the whole run.
        # Collect all search results first, then crawl the collected sub-links outside this function.

        # === Open and parse the job-detail page ===
        # jobUrl = bsGet(row, css='[class$=zwmc] a[href^="http"]', attri='href')
        # if jobUrl : ZhilianJobPage(jobUrl)
        # else : print 'Failed on retrieving URL of the job: %s' % values[0]

        # === Open and parse the company page ===
        # Option 1
        # The risk: without proper duplicate detection this becomes an infinite loop.
        # The crude fix is to collect all company names and links first, then read them back
        # with a separate function and loop over them.
        # publicJobs = ZhilianFirmPage(values[-1])
        # print 'This company is recruiting %d jobs now.' % len(publicJobs)
        # Option 2
        # Recurse into this function and search for all postings under the company name.
        # ZhilianSearchList(values[1].encode('utf-8'), 1, scope=3)
        '''

        # Output: write the rows as a MySQL .sql file.
        sqlfile = './data/INSERT_INTO_TEMP_SEARCHRESULTS_ZHILIAN.sql'
        fback = sqlInsert('TEMP_SEARCHRESULTS_ZHILIAN', titles, values, sqlfile=sqlfile)
        # print fback

    # === Recurse into this function to read the next page ===
    # Zhilian usually has about 100,000 postings site-wide, but a search never exceeds
    # 90 pages of 40 entries each, so at most 3,600 can be collected.
    # Asking for a page past the last one still shows postings, but they are duplicates;
    # the only difference is the header saying "共0个职位满足条件" (0 matching positions).
    # If the real page number is lower than the requested one, we have reached the end.
    if truePage < totalPages:
        if not nextUrl and truePage < assignPage:
            ZhilianSearchList(keyword, assignPage=truePage + 1, totalPages=totalPages)
        else:
            ZhilianSearchList(keyword, assignPage=truePage + 1, nextUrl=nextUrl, totalPages=totalPages)
    else:
        print '-' * 50 + 'Reached the end of records. truePage[%d], assignPage[%d], totalPages[%d].' % (truePage, assignPage, totalPages)
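# The comment inside ZhilianSearchList suggests a timer: if the fetch hangs or fails, change
# identity and try again. A minimal retry sketch under that idea; `fetchWithRetry`, the retry
# count, and the pause are assumptions, and the hook for switching IP/proxy is left as a plain
# placeholder because the original code does not show one. It assumes the project's own
# webPageSourceCode() helper is in scope, as in the functions above.
import time

def fetchWithRetry(url, params=None, retries=3, pause=5):
    '''Call webPageSourceCode up to `retries` times, pausing between attempts (sketch).'''
    for attempt in range(retries):
        webTarget = webPageSourceCode(url, params) if params else webPageSourceCode(url)
        if webTarget:
            return webTarget
        # Placeholder: this is where a proxy / IP change would go before the next attempt.
        print 'Fetch failed (attempt %d of %d), retrying...' % (attempt + 1, retries)
        time.sleep(pause)
    return ''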