def FiveEightJobs(assignPage='1', totalPages=1, nextUrl=''):
	'''
	# Function: Submit the search request and collect all job postings from a 58.com job-search result page.
	# Params  : assignPage = page number, totalPages = number of pages to crawl, nextUrl = URL of the next page (if already known)
	# Notes   : a hedged usage sketch follows after this function.
	'''
	print 'Trying to process General  Search List Page %s ==========='%(assignPage if assignPage else '1')
	# === Fetch the page source ===
	url = nextUrl if nextUrl else gen58JobUrl(pn=assignPage)
	webTarget = webPageSourceCode( url )
	if not webTarget : return '' # Exit if the page could not be fetched # (still not sure whether this early return is quite right -_-!)
	# === Parse the source with BeautifulSoup; this is the most time-consuming step -- the wrong parser can cost ~7s per page ===
	soup = BeautifulSoup(webTarget['html'], 'html5lib')
	# === Check whether the current page has any results ===
	with open('log.html', 'w') as f:
		f.write(soup.prettify('utf-8'))
	if bsGet(soup, css='#searchTip', withTxt='抱歉') or bsGet(soup, css='h1[class="item"]', withTxt='抱歉'): 
		print 'No results, or you have been blocked.-_-!' # Exit if the page shows no results
		return ''
	# === Get the actual page number ===
	truePage = bsGet(soup, css='div[class="pagerout"] div[class="pager"] strong')
	truePage = int(truePage) if truePage else 1 # Absent when there is less than one page of results; default to 1
	# === Get the next-page link === # 58's next-page link is incomplete -_-! Rebuilding it ourselves beats trying to complete it
	try: nextUrl = bsGet(soup,css='div[class="pagerout"] a[class="next"]',attri='href')
	except: print 'No next-page link found.'
	# === Collect the result entries ===
	blocks = soup.select('[logr$="ses^composite^0"]')
	print '=== Detected %d job postings on this page.' %len(blocks)
	if len(blocks):
		titles = 'jobName,jobLink,cmpName, cmpLink, cmpLoc, jobUpdate'
		values = []
		for row in blocks:
			if bsGet(row, css='div[class="tuiguang"]'): continue # Skip promoted/sponsored entries
			values.append([
				bsGet(row, css='a[_t="common"]'),
				bsGet(row, css='a[_t="common"]',attri='href'),
				bsGet(row, css='div[class="titbar"] h2'),
				bsGet(row, css='dd[class="w96"]'),
				bsGet(row, css='dd[class="w68"]')
			])
		# Write the results out as a MySQL .sql file
		sqlfile = './data/INSERT_INTO_TEMP_SEARCHRESULTS_FiveEight.sql'
		fback = sqlInsert('TEMP_SEARCHRESULTS_FiveEight', titles, values, sqlfile=sqlfile)
		# print fback
	else: 
		print 'No records found on this page.'
	if int(truePage) < int(totalPages):
		if not nextUrl and truePage < int(assignPage): FiveEightJobs(assignPage='%d'%(truePage+1), totalPages=totalPages)
		else: FiveEightJobs(assignPage='%d'%(truePage+1), nextUrl=nextUrl, totalPages=totalPages)
	else: print '-'*50 + 'Reached the end of records. truePage[%s], assignPage[%s], totalPages[%s].' %(truePage,assignPage,totalPages)
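# A minimal usage sketch (hedged: the page counts are illustrative, and gen58JobUrl,
# webPageSourceCode, bsGet and sqlInsert are assumed to be the project's own helpers
# defined elsewhere in this module):
#
# 	FiveEightJobs(assignPage='1', totalPages=5)   # recurses page by page until totalPages or the last real page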
def FiveEightRoster(nextUrl='', assignPage=1, city='', industry=''):
	'''
	# Function: Crawl 58.com's "company roster" (企业名录) pages. Only the company name and link are collected.
	# Notes   : 1. For efficiency this is a "triple-recursion" function. The logic: on the first run, walk through
	#              the city links, open one city link, then open each of its industry links in turn and read all
	#              roster pages before moving on to the next city link.
	#           2. After running it... it turned out to be too efficient! 58 blocked the IP in under two minutes -_-!
	#              What now... (a hedged throttling sketch follows after this function)
	'''
	# === First, collect the roster pages for every sub-city and industry category from the index page ===
	if not nextUrl:
		print '='*80 + 'First Run.'
		webTarget = webPageSourceCode('http://qy.58.com/citylist/') # Start from the page listing all cities
		if not webTarget: return ''
		soup = BeautifulSoup(webTarget['html'], 'html5lib')
		if not city:
			ctLinks = soup.select('#clist a[href^="http://qy.58.com/"]')
			for ct in ctLinks: 
				FiveEightRoster(ct['href'], city=ct.get_text(strip=True))
		else:
			indLinks = soup.select('[class^="indCateList"] a[href^="http://qy.58.com/"]')
			for link in indLinks: FiveEightRoster(link['href'], city=city, industry=link.get_text(strip=True))
		return ''

	# === Read all pages of data for one category === OK, this part can run on its own
	print 'Trying to process list-page %d of the Firm Roster in the city [%s] ==========='%(assignPage, city)
	url = 'http://qy.58.com/%s/pn%d'%(city,assignPage) if not nextUrl else nextUrl
	# url = './templates/58Firm-Roster.html'
	webTarget = webPageSourceCode(url)
	if not webTarget: return ''
	soup = BeautifulSoup(webTarget['html'], 'html5lib')
	firms = soup.select('[class="compList"] a[href^="http://qy.58.com/"]')
	if not len(firms): # We have reached the end of the listings
		print 'You have reached the end of records, or maybe you have been blocked.'
		return ''
	titles  = 'cmpName, cmpLink_58, cmpCity, industry'
	values  = [[tag.get_text(strip=True), tag['href'], city, industry] for tag in firms]
	print '=== Detected %d firms on this page.' %len(values)
	subpath = '_'.join(urlAnalyse(url)['path'].split('/'))
	sqlfile = './data/INSERT_INTO_FIRMS%spn%d.sql'%(subpath, assignPage)
	sqls = sqlInsert(table='FIRMS',titles=titles, values=values, sqlfile=sqlfile)
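# The note above mentions getting IP-blocked within a couple of minutes. A minimal, hedged
# mitigation sketch: wrap the project's webPageSourceCode helper (assumed to be defined or
# imported elsewhere in this module) with a random delay between requests. politeFetch is a
# hypothetical name, not part of the original code.
import time, random

def politeFetch(url, minDelay=2.0, maxDelay=6.0):
	'''Fetch a page via webPageSourceCode, sleeping a random interval first to slow the crawl down.'''
	time.sleep(random.uniform(minDelay, maxDelay))
	return webPageSourceCode(url)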
def ZhilianSearchList(keyword='数据', assignPage=1, totalPages=1, scope=0, nextUrl=''):
	'''
	# Function: Submit a search to Zhilian Zhaopin and collect all job postings from its search-result pages.
	# Params : keyword = search keyword, assignPage = page number, totalPages = number of pages to crawl,
	#          scope = keyword scope (0 full text, 1 company name, 2 job title), nextUrl = next-page URL if known
	'''
	if   scope==0 : print 'Trying to process General  Search List Page %d ==========='%assignPage
	elif scope==1 : print 'Trying to process Company  Search List Page %d -----------'%assignPage
	elif scope==2 : print 'Trying to process Position Search List Page %d -----------'%assignPage
	# === Build the URL parameters ===
	urlParams = {
		'kw' : keyword, # Search keyword
		'sm' : 0, # Display-mode code: list is '0', detailed is '1'. The page source differs by mode; prefer list mode, its source is easier to parse.
		'jl' : '北京', # Search city: '北京'; join multiple values with '+' (URL-encoded as %2B)
		#'bj' : '', # Job-category code: e.g. 互联网产品/运营管理 is '160200'; join multiple values with '%3B' (URL-encoded ';')
		#'in' : '', # Industry code: join multiple values with ';' (URL-encoded as %3B)
		'kt' : scope, # Keyword scope: full text '0' | company name '1' | job title '2'
		'isadv' : 0, # Advanced search: quick search '0' | advanced search '1'
		# 'isfilter' : 1, # Whether this is a filtered search: '0' | '1'
		# 'ispts' : '', # Usually '1'
		#'sj' : '', # Job sub-category code
		# 'gc' : '5号', # Metro line, e.g. '5号'
		# 'ga' : '立水桥', # Place or metro-station name, e.g. '天通苑南', '小汤山'
		# 'sb' : 0, # Sort order: default '0', by relevance '1', by first-posted date '2'
		#'fjt' : '10000', # Job tags: 五险一金 '10000', 年底双薪 '10001', 绩效奖金 '10002', etc.
		# 'sf' : -1, # Monthly-salary lower bound: e.g. '8001'; unlimited is '-1'
		# 'st' : -1, # Monthly-salary upper bound: e.g. '10000'; unlimited is '-1'
		# 'ct' : -1, # Company-type code
		# 'el' : -1, # Education-level code
		# 'we' : -1, # Work-experience code
		# 'et' : -1, # Job-type code: part-time '1', full-time '2', internship '4'
		# 'pd' : -1, # Posting age in days: one week '7', one month '30', unlimited '-1'
		'p' : assignPage, # Page number; beyond the last page the site keeps showing the final page
		#'gr' : '', # 
		# 're' : '2015', # Limits the number of results, but it is not really a search-by-year filter
		'sg' : '', # The site-wide unique identifier, i.e. the GUID
		#'' : '' #
	}
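	# Illustration only (an assumption about how webPageSourceCode encodes the params dict,
	# e.g. via urllib.urlencode over UTF-8 bytes): with kw='数据', jl='北京', p=2 the request
	# URL would look roughly like
	#   http://sou.zhaopin.com/jobs/searchresult.ashx?kw=%E6%95%B0%E6%8D%AE&sm=0&jl=%E5%8C%97%E4%BA%AC&kt=0&isadv=0&p=2&sg=
	# (parameter order is not guaranteed; commented-out keys above are simply omitted).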
	# === Fetch the page source ===
	'''
	# A timer really belongs here: if no response comes back for too long, spoof the IP and try again.
	# Likewise, if fetching the source fails, spoof the IP (etc.) and retry.
	'''
	if nextUrl : webTarget = webPageSourceCode(nextUrl)
	else:        webTarget = webPageSourceCode('http://sou.zhaopin.com/jobs/searchresult.ashx', urlParams)
	if not webTarget : return '' # Exit if the page could not be fetched # (still not sure whether this early return is quite right -_-!)
	# === Parse the source with BeautifulSoup; this is the most time-consuming step -- the wrong parser can cost ~7s per page ===
	soup = BeautifulSoup(webTarget['html'], 'html5lib')
	# === Read the total number of search results and act on it ===
	total_results = bsGet(soup, css='[class$=search_yx_tj] em')
	print 'There are %s results found in total.' %total_results
	if total_results == '0': return '' # Nothing to process if this page has no results
	'''
	# === Get the site-wide unique ID, i.e. the sg parameter of the URL ===
	tags = soup.select('#guid')
	guid = tags[0]['value'] if len(tags)  else ''
	print 'The "guid" is %s.' %guid
	'''
	# === Get the actual page number ===
	truePage = bsGet(soup, css='[class*="pagesDown"] a[class*="current"]')
	truePage = int(truePage) if truePage else 1 # Absent when there is less than one page of results; default to 1
	# === Get the next-page link ===
	try: nextUrl = soup.select('a[class*=next-page]')[0]['href']
	except: print 'No next-page link found.'
	# === Collect the result entries ===
	blocks = soup.select('[class$=newlist]')
	print '=== Detected %d job postings on this page.' %len(blocks)
	if len(blocks):
		titles = 'jobName,cmpName,feedback,workingAge,eduReq,cmpType,cmpSize,jobDescri,jobLink,cmpLink,payMonthly,cmpLoc,jobUpdate'
		values = []
		for row in blocks:
			values.append([
				bsGet(row, css='[class$=zwmc]'),  # Job title
				bsGet(row, css='[class$=gsmc]'),  # Company name
				bsGet(row, css='[class$=fk_lv]'), # Feedback rate
				bsGet(row, withTxt='经验:'),     # Work experience
				bsGet(row, withTxt='学历:'),     # Education requirement
				bsGet(row, withTxt='公司性质:'), # Company type
				bsGet(row, withTxt='公司规模:'), # Company size
				bsGet(row, withTxt='岗位职责:'), # Job description
				bsGet(row, css='[class$=zwmc] a[href^="http"]', attri='href'), # Job posting URL
				bsGet(row, css='[class$=gsmc] a[href^="http"]', attri='href'), # Company page URL
				bsGet(row, css='[class$=zwyx]', withTxt='职位月薪:'),         # Monthly salary
				bsGet(row, css='[class$=gzdd]', withTxt='地点:'),             # Work location
				bsGet(row, css=['[class$=gxsj]', 'dl p']),                     # Last updated
			])
			# print 'withTxt is a unicode string:',type(values[0][4]) == type(u'') # True
			# print 'attri is a unicode string:', type(values[0][8]) == type(u'') # True
			# print 'multi-search got a unicode string:', type(values[0][8]) == type(u'') # True
			'''
			# === Sub-link crawling: the newer approach ===
			# Do not parse the sub-pages here, so that one failure cannot break the whole run;
			# collect all search results first, then crawl the gathered sub-links outside this function.
				# === Open and parse the job-detail page ===
				# jobUrl = bsGet(row,css='[class$=zwmc] a[href^="http"]', attri='href')
				# if jobUrl : ZhilianJobPage(jobUrl)
				# else      : print 'Failed on retrieving URL of the job: %s' %values[0]
				# === Open and parse the company page ===
				# Option 1
				# The catch: without proper duplicate detection this turns into an infinite loop.
				# A crude fix: gather all the relevant company names and links first, then read them back and loop over them.
				# publicJobs = ZhilianFirmPage(values[-1])
				# print 'This company is recruiting %d jobs now.' %len(publicJobs)
				# Option 2
				# Recurse into this function and search for every posting under the company name.
				# ZhilianSearchList(values[1].encode('utf-8'), 1, scope=3) 
			'''
		# Write the results out as a MySQL .sql file
		sqlfile = './data/INSERT_INTO_TEMP_SEARCHRESULTS_ZHILIAN.sql'
		fback = sqlInsert('TEMP_SEARCHRESULTS_ZHILIAN', titles, values, sqlfile=sqlfile)
		# print fback
	'''
		# === Recurse into this function to read the next page ===
		# Loop over every result page.
		# Zhilian usually has around 100,000 postings online at any one time,
		# but a search never exceeds 90 pages at 40 postings per page, so at most ~3,600 can be collected.
		# Also, if the requested page is past the last real page, the site still shows postings, but they are duplicates;
		# the only difference is the banner "共0个职位满足条件" ("0 positions match") at the top.
		# If the actual page number is lower than the requested one, the search has reached its end.
		# >>>
	'''
	if truePage < totalPages:
		if not nextUrl and truePage < assignPage: ZhilianSearchList(keyword, assignPage=truePage+1, totalPages=totalPages, scope=scope) # carry the keyword scope through to subsequent pages
		else: ZhilianSearchList(keyword, assignPage=truePage+1, nextUrl=nextUrl, totalPages=totalPages, scope=scope)
	else: print '-'*50 + 'Reached the end of records. truePage[%d], assignPage[%d], totalPages[%d].' %(truePage,assignPage,totalPages)
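# A minimal end-to-end sketch (hedged: the keyword and page counts are illustrative, and the
# helper functions used above -- webPageSourceCode, bsGet, sqlInsert, gen58JobUrl -- are assumed
# to be defined or imported elsewhere in this module):
if __name__ == '__main__':
	ZhilianSearchList(keyword='数据', assignPage=1, totalPages=3)  # crawl the first few Zhilian result pages
	FiveEightJobs(assignPage='1', totalPages=3)                    # then the 58.com job-search pages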