def getCompany(request):
    '''
    Show job details, company reviews, and interview experiences.
    :param request: carries the company name and the job's jobId
    :return: company reviews, interview experiences, and the job JD
    '''
    logger.info('Start rendering company details')
    jobId = request.GET.get('jobId')
    company = request.GET.get('company')
    logger.info('Received parameters: ' + str(jobId) + str(company))
    data_boss = list(db_boss.connM.find({'jobId': jobId}))
    kanzhun_coll = DataClass.DataClass('kanzhunDB').connM1[company]
    company_interview = list(kanzhun_coll.find({'title': 'interview'}, {'_id': 0, 'title': 0}))
    company_review = list(kanzhun_coll.find({'title': 'review'}, {'_id': 0, 'title': 0}))
    # Read the tags, interview difficulty, and company score. find_one returns
    # None instead of raising IndexError (as the old list(...)[0] access did)
    # when the company document is missing.
    company_tags = kanzhun_coll.find_one({'title': 'company'}, {'companyTags': 1, '_id': 0})
    interview_degree = kanzhun_coll.find_one({'title': 'company'}, {'_id': 0, 'interviewDegree': 1})
    companyScore = kanzhun_coll.find_one({'title': 'company'}, {'_id': 0, 'companyScore': 1})
    logger.info('Fetched from DB: ' + str(len(data_boss)) + ' jobs, '
                + str(len(company_review)) + ' company reviews, '
                + str(len(company_interview)) + ' interview experiences')
    # url = request.get_full_path()
    # if not company_review:
    #     return HttpResponse('No data for this company yet')
    context = {
        'jobs': data_boss,
        'interviews': company_interview,
        'reviews': company_review,
        'company_tags': company_tags,
        'interview_degree': interview_degree,
        'company_score': companyScore,
    }
    return render(request, 'company.html', context)
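A quick manual check of this view with Django's test client (a sketch; the '/getCompany' URL path and the parameter values are assumptions, not confirmed by this project's urls.py):

from django.test import Client

client = Client()
# Hypothetical jobId and company values for illustration only
resp = client.get('/getCompany', {'jobId': '123', 'company': '荔枝微课'})
print(resp.status_code)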
def __init__(self):
    self.manager = URLManager('boss')
    self.download = HTMLDownload()
    self.job_analysis = JobHTMLAnalysis()
    self.detail_analysis = DetailHTMLAnalysis()
    self.db = DataClass.DataClass('bossDB', 'bossDB')
    # self.query = 'python'
    self.page = 1
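Usage inferred from start_boss in views.py: the Boss Zhipin Control is constructed without arguments and driven through main(job, city_code). A sketch with example values:

spider = Control()
spider.main('python', 'c101280600')  # 'c101280600' is the Shenzhen city code used in views.py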
def __init__(self, company):
    # Initialisation: one analyser per page type on kanzhun.com
    logger.info('Received parameter: ' + company)
    self.manager = URLManager('kanzhun')
    self.HTMLDownload = HTMLDownload()
    self.companyAnal = CompanyHTMLAnalysis()
    self.interview = InterviewHTMLAnalysis()
    self.interviewDetail = InterviewDetailHTMLAnalysis()
    self.review = ReviewHTMLAnalysis()
    self.reviewDetail = ReviewDetailHTMLAnalysis()
    self.salary = SalaryHTMLAnalysis()
    self.company = company
    self.connM = DataClass.DataClass('kanzhunDB', self.company).connM
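Every module in this section depends on the DataClass wrapper, which is not shown here. A minimal sketch of what it appears to provide, inferred only from how it is used (connM is a pymongo collection, connM1 the database handle, connR a redis connection); the host, port, and defaults are assumptions:

from pymongo import MongoClient
import redis

class DataClass:
    def __init__(self, db_name='bossDB', coll_name=None):
        client = MongoClient('localhost', 27017)  # placeholder host/port, not confirmed by the source
        self.connM1 = client[db_name]             # database handle, indexed by collection name
        if coll_name is not None:
            self.connM = self.connM1[coll_name]   # direct collection handle
        self.connR = redis.StrictRedis()          # redis connection for the crawler URL queues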
#encoding=utf-8
from django.test import TestCase

# Create your tests here.
from db import DataClass
from db.redisPool import getRedis

db_kanzhun = DataClass.DataClass('kanzhunDB')
company = '荔枝微课'
db = getRedis()
# tags = db.smembers('boss_old_urls')
# for i in tags:
#     print(i)
db_boss = DataClass.DataClass('bossDB', 'bossDB')
data_boss = list(
    DataClass.DataClass('kanzhunDB').connM1[company].find(
        {'title': 'company'}, {'interviewDegree': 1}))
print('Result list:', data_boss)
for i in data_boss:
    print(i)
# dblist = db_kanzhun.connM1.collection_names()
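tests.py imports TestCase but defines no test. A minimal sketch of a real test for the index_data view, assuming it is wired up at '/index_data' (a hypothetical path, adjust to the project's urls.py):

class IndexDataTest(TestCase):
    def test_first_page_with_defaults(self):
        # With no job/city parameters the view should fall back to the
        # session defaults and still render page 1 successfully.
        resp = self.client.get('/index_data', {'page': 1})
        self.assertEqual(resp.status_code, 200)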
def index_data(request):
    '''
    Return the search results after the search button is clicked.
    :param request:
    :return:
    '''
    logger.info('index_data: look up data by city and job, paginate and return')
    # Handle the city and job parameters
    job = request.GET.get('job')
    city = request.GET.get('city')
    city_code = 'c101280600'  # default city is currently Shenzhen
    # This session-caching step may no longer be needed
    if not all([job, city]):
        logger.info('First visit, no city or job given; using cached/default values')
        # Note: the session keys were swapped here in the original ('job' read
        # into city and vice versa); fixed to read the matching keys.
        city = request.session.get('city', '深圳')
        job = request.session.get('job', 'python')
    else:
        # Both values are present, so cache them in the session
        request.session['job'] = job
        request.session['city'] = city
    logger.info('Search parameters: ' + str(city) + str(job))
    page = int(request.GET.get('page', 1))
    logger.info('Paginating the results, current page: ' + str(page))
    count = 7
    skip_count = (page - 1) * count  # number of documents to skip for this page
    page_count = math.ceil(len(list(db_boss.connM.find(
        {'job': {'$regex': job, '$options': 'i'}, 'city': city}))) / count)  # total number of pages
    # Build a five-slot window of page numbers around the current page
    if page_count < 5:
        page_counts = range(1, page_count + 1)
    elif page <= 3:
        page_counts = range(1, 6)
    elif page_count - page <= 2:
        page_counts = range(page_count - 4, page_count + 1)
    else:
        page_counts = range(page - 2, page + 3)
    # Fetch the current page of jobs matching job and city
    data_boss = list(db_boss.connM.find(
        {'job': {'$regex': job, '$options': 'i'}, 'city': city}).limit(count).skip(skip_count))
    logger.info('Fetched ' + str(len(data_boss)) + ' records')
    # if len(data_boss) == 0:
    #     # No data in the database: start the Boss Zhipin crawler
    #     logger.info('No data in the database, starting the Boss Zhipin crawler')
    #     bc().main(job, city_code)
    # else:
    #     # TODO: verify the accuracy of the existing data here
    #     pass
    # Connect to the kanzhun.com data in MongoDB
    db_kanzhun = DataClass.DataClass('kanzhunDB')
    # Enrich each job with kanzhun.com data, matched by company name
    for i in data_boss:
        company = i['content'][2]
        # if company not in db_kanzhun.connM1.collection_names():
        #     # No collection named after this company: start the kanzhun crawler
        #     logger.info('Starting the kanzhun crawler for ' + company)
        #     kanzhunC(company).main(company)
        logger.info(company)
        # The data should now exist; pull the company document from kanzhun
        data_kanzhun = db_kanzhun.connM1[company].find({'title': 'company'})
        logger.info('Fetched the related kanzhun.com data')
        if data_kanzhun.count() > 0:
            # The database has data for this company
            if 'companyScore' not in data_kanzhun[0]:
                company_score = None
            else:
                company_score = data_kanzhun[0]['companyScore']
            if 'interviewDegree' not in data_kanzhun[0]:
                interview_degree = None
            else:
                interview_degree = data_kanzhun[0]['interviewDegree']
            jd = i['content'][-1]  # the job JD (used later by the analysis step)
            i['company_score'] = company_score
            i['interview'] = interview_degree
        else:
            # The collection exists but is empty: kanzhun.com has no data for this company
            i['company_score'] = 'No kanzhun.com data'
            i['interview'] = 'No kanzhun.com data'
    context = {
        'data_boss': data_boss,      # job data
        'page': page,                # current page number
        'page_counts': page_counts,  # window of page numbers
        'page_count': page_count,    # total number of pages
    }
    return render(request, 'jobs.html', context)
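The five-slot page-number window is computed inline here and duplicated verbatim in analysis below; it could be extracted into a helper. A sketch (not in the source) that mirrors the inline logic exactly:

def page_window(page, page_count):
    '''Five-slot window of page numbers centred on the current page.'''
    if page_count < 5:
        return range(1, page_count + 1)  # fewer than five pages: show them all
    if page <= 3:
        return range(1, 6)               # near the start: pin the window to page 1
    if page_count - page <= 2:
        return range(page_count - 4, page_count + 1)  # near the end: pin to the last page
    return range(page - 2, page + 3)     # otherwise centre on the current page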
def analysis(request):
    '''
    Ajax backend: compute the match score and recommendation score and return them.
    :return:
    '''
    logger.info('analysis: starting')
    conn = get_redis_connection('default')  # connect to redis using the configured pool
    tags = request.POST.get('tags')
    logger.info('Tags from the request (not from cache): ' + str(tags))
    if not tags:
        tags = conn.smembers('tags')
        logger.info('Tags from the redis cache: ' + str(tags))
    else:
        logger.info('Got data from the frontend: ' + str(type(tags)))
        tag = tags.split(',')
        # Add each parsed tag to the redis set
        for i in tag:
            conn.sadd('tags', i)
    db_kanzhun = DataClass.DataClass('kanzhunDB')
    # Handle the city and job parameters
    job = request.GET.get('job')
    city = request.GET.get('city')
    city_code = 'c101280600'  # default city is currently Shenzhen
    if not all([job, city]):
        logger.info('No city or job given; reading cached values')
        job = request.session.get('job', 'python')
        city = request.session.get('city', '深圳')
    else:
        # Both values are present, so cache them in the session
        request.session['job'] = job
        request.session['city'] = city
    logger.info('Search parameters: ' + str(city) + str(job))
    page = int(request.GET.get('page', 1))
    logger.info('Paginating the results, current page: ' + str(page))
    count = 7
    skip_count = (page - 1) * count  # number of documents to skip for this page
    page_count = math.ceil(len(list(db_boss.connM.find(
        {'job': {'$regex': job, '$options': 'i'}, 'city': city}))) / count)  # total number of pages
    # Five-slot window of page numbers, same logic as in index_data
    if page_count < 5:
        page_counts = range(1, page_count + 1)
    elif page <= 3:
        page_counts = range(1, 6)
    elif page_count - page <= 2:
        page_counts = range(page_count - 4, page_count + 1)
    else:
        page_counts = range(page - 2, page + 3)
    # Fetch the current page of jobs matching job and city
    data_boss = list(db_boss.connM.find(
        {'job': {'$regex': job, '$options': 'i'}, 'city': city}).limit(count).skip(skip_count))
    logger.info('Fetched ' + str(len(data_boss)) + ' records')
    for i in data_boss:
        company = i['content'][2]
        data_kanzhun = db_kanzhun.connM1[company].find({'title': 'company'})
        if data_kanzhun.count() > 0:
            if 'companyScore' not in data_kanzhun[0]:
                company_score = 0
            else:
                company_score = data_kanzhun[0]['companyScore']
            if 'interviewDegree' in data_kanzhun[0]:
                interview_degree = data_kanzhun[0]['interviewDegree']
            else:
                interview_degree = 0
            # Match the job description against the user's tags
            jd = i['content'][-1]
            match_score = match(jd, tags)
            logger.info('Match score: ' + str(match_score))
            logger.info('Current company: ' + str(company) + ': ' + str(company_score)
                        + str(interview_degree) + str(match_score))
            recommended = recommend(match_score, interview_degree, company_score)
            i['match'] = match_score
            i['recommend'] = recommended
        else:
            i['match'] = 'No kanzhun.com data'
            i['recommend'] = 'No kanzhun.com data'
    logger.info('analysis: returning data')
    context = {
        'page': page,                # current page number
        'page_count': page_count,    # total number of pages
        'page_counts': page_counts,  # window of page numbers
        'data_boss': data_boss,      # job data
    }
    return render(request, 'jobs1.html', context)
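match() and recommend() are called above but not defined in this section. A plausible sketch, under the assumption that match() measures how many of the user's skill tags appear in the job description and recommend() blends the three signals with fixed weights; the real formulas and weights are not confirmed by the source:

def match(jd, tags):
    # tags arrives either as a comma-separated string (from POST) or as a set
    # of bytes from redis smembers; normalise both cases (sketch).
    if isinstance(tags, str):
        tags = tags.split(',')
    decoded = [t.decode() if isinstance(t, bytes) else t for t in tags]
    if not decoded:
        return 0
    hits = sum(1 for t in decoded if t.lower() in jd.lower())
    return round(hits / len(decoded), 2)

def recommend(match_score, interview_degree, company_score):
    # Weighted blend of the three signals (the weights are an assumption).
    return round(0.5 * match_score
                 + 0.2 * float(interview_degree or 0)
                 + 0.3 * float(company_score or 0), 2)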
from django.shortcuts import render, redirect, HttpResponse
from django.http import JsonResponse
from django.views.generic import View
from db import DataClass
import logging
import math
from django_redis import get_redis_connection
from django.core.paginator import Paginator
import multiprocessing, threading
from .spider.kanzhunSpider.Control import Control as kanzhunC
from .spider.bossSpider.Control import Control as bc

# Create your views here.
logger = logging.getLogger('django_console')
db_boss = DataClass.DataClass('bossDB', 'bossDB')
# db_kanzhun = DataClass.DataClass('kanzhunDB', 'kanzhunDB')

def start_kanzhun(company):
    print('kanzhun.com crawler thread:', company)
    kc = kanzhunC(company)
    kc.main(company)

def start_boss(job, city_code):
    bc().main(job, city_code)

def index(request):
    '''
    All requests use GET.
    :param request:
    :return:
    '''
    logger.info('Returning the home page')
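The commented-out blocks in index_data and analysis suggest start_kanzhun and start_boss were meant to run in background threads when data is missing, so the request is not blocked by a crawl. A usage sketch with an example company name:

t = threading.Thread(target=start_kanzhun, args=('荔枝微课',))  # company name is an example
t.start()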
#coding=utf-8
'''
*************************
file: AnalysisJobs start_test
author: gongyi
date: 2019/7/17 18:13
****************************
change activity: 2019/7/17 18:13
'''
# Wipe the existing data so the crawler can be restarted from scratch
from db import DataClass
from pymongo import MongoClient

# Delete the redis URL queues
db = DataClass.DataClass('kanzhunDB', 'kanzhunDB')
db.connR.delete('kanzhun_new_urls', 'kanzhun_old_urls')
# Drop the MongoDB database
conn = MongoClient('192.168.0.106', 27017)
conn.drop_database('kanzhunDB')
print('Data deleted')
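An optional verification step after the wipe, reusing the same connections (a sketch; list_database_names requires pymongo >= 3.6, and connR being a redis client is inferred from its usage above):

print('kanzhunDB' in conn.list_database_names())  # expect False after the drop
print(db.connR.exists('kanzhun_new_urls'))        # expect 0: the key was deleted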
#coding=utf-8
'''
*************************
file: AnalysisJobs spider_test
author: gongyi
date: 2019/7/15 14:48
****************************
change activity: 2019/7/15 14:48
'''
# Wipe the spider's redis queues and MongoDB data, then optionally rerun it
from Control import Control
from db import DataClass

db = DataClass.DataClass()
db.connR.delete('new_urls', 'old_urls')
db.connM.remove({})
print('Data deleted')
# spider = Control()
# spider.main()