def main():
    """Crawl Lagou position listings for a slice of the known companies.

    Reads the ``companyId`` of every document returned by
    ``connect('lagou', 'companys')``, then, for entries 10000-19999 of that
    list, requests result pages 1-9 for every label in ``labels`` and inserts
    one document per company through ``connect('lagou', 'positions')``.

    The inserted document maps ``labels[label]`` to ``get_jobs(items, label)``
    and carries the company's ``companyId``.
    """
    c = connect('lagou', 'companys')
    companyIds = [result.get('companyId') for result in c.find()]
    UA = get_ua()
    url = 'https://www.lagou.com/gongsi/searchPosition.json'
    cursor = connect('lagou', 'positions')
    for companyId in companyIds[10000:20000]:
        company = {}
        for label, label_name in labels.items():
            items = []
            for i in range(1, 10):
                # presumably (company, label, page number, page size) -- TODO confirm
                payload = get_payload(companyId, label, i, 10)
                response = requests.get(url, params=payload, headers=UA).json()
                # Pause after every request (presumably to avoid being rate limited).
                time.sleep(0.7)
                results = get_results(response)
                if not results:
                    # An empty page means there is nothing more to fetch.
                    break
                items += results
            # Store the jobs after the page loop so that a label whose nine
            # pages were all non-empty is kept too; previously it was only
            # stored when an empty page was encountered.
            company[label_name] = get_jobs(items, label)
        company['companyId'] = companyId
        print("Get ", companyId)
        cursor.insert(company)
def main():
    """Copy every job whose ID is not already categorised into 'uncate'.

    Iterates the documents from ``connect('lagou', 'jobs')`` and, for each
    one whose ``ID`` is not among the IDs returned by ``used()``, inserts an
    ``{'ID': ..., 'name': ...}`` document through
    ``connect('lagou', 'uncate')``.
    """
    # A set makes each membership test O(1) instead of scanning a list for
    # every job. Assumes the IDs are hashable (e.g. int or str).
    useds = set(used())
    cursor = connect('lagou', 'jobs')
    cl = connect('lagou', 'uncate')
    for result in cursor.find():
        ID = result.get('ID')
        if ID in useds:
            continue
        cl.insert({'ID': ID, 'name': result.get('name')})
def used():
    """Collect the IDs of all documents from ``connect('lagou', 'categories')``.

    @return -> list, the ``ID`` field of every document, in cursor order
    """
    categorized = connect('lagou', 'categories')
    return [doc.get('ID') for doc in categorized.find()]
def main():
    """Tag jobs with a category by keyword match on the job name.

    For every category in ``categories`` all documents from
    ``connect('lagou', 'jobs')`` are scanned; when any keyword of the
    category occurs in the job name, a ``name``/``ID``/``category`` document
    is inserted through ``connect('lagou', 'categories')``.
    """
    job_source = connect('lagou', 'jobs')
    category_sink = connect('lagou', 'categories')
    for category in categories.keys():
        for doc in job_source.find():
            job_name = doc.get('name')
            job_id = doc.get('ID')
            matched = any(
                keyword in job_name for keyword in categories.get(category)
            )
            if not matched:
                continue
            category_sink.insert(
                {'name': job_name, 'ID': job_id, 'category': category}
            )
def load_data():
    """Load the job names.

    @return -> list, the ``name`` field of every document returned by
               ``connect('lagou', 'jobs')``
    """
    job_docs = connect('lagou', 'jobs')
    return [doc.get('name') for doc in job_docs.find()]
def main():
    """Categorise the still-uncategorised jobs by keyword, at most once each.

    For every category in ``categories`` the documents from
    ``connect('lagou', 'uncate')`` are scanned. The first category with a
    keyword that occurs in the job name wins: a ``name``/``ID``/``category``
    document is inserted through ``connect('lagou', 'categories')`` and the
    job's ID is remembered so later categories skip it.
    """
    # A set keeps the "already categorised" check O(1) per job; the original
    # list made it linear in the number of IDs seen so far.
    # Assumes the IDs are hashable (e.g. int or str).
    usedID = set()
    cursor = connect('lagou', 'uncate')
    cl = connect('lagou', 'categories')
    for category, keywords in categories.items():
        for result in cursor.find():
            name = result.get('name')
            ID = result.get('ID')
            if ID in usedID:
                continue
            if any(c in name for c in keywords):
                usedID.add(ID)
                cl.insert({'name': name, 'ID': ID, 'category': category})
def main():
    """Predict a value for every location row and store the result.

    Loads the training set with ``loadData()``, reads every row from the
    ``MySql`` helper, and for each row takes the single nearest neighbour of
    its ``[Longitude, latitude]`` pair via ``get_neighbors(..., 1)``. The
    output of ``predict`` is inserted, together with the row's province,
    city, county and coordinates, through ``connect('lagou', 'predicts')``.

    The MySql handle is closed even when processing a row raises.
    """
    trainSet = loadData()
    # NOTE(review): the connection arguments (presumably database, user and
    # password) are hard-coded; consider moving them to configuration.
    mysql = MySql('locations', 'leo', 'mm123456', collection='new_provices')
    try:
        cursor = connect('lagou', 'predicts')
        for result in mysql.find():
            lng = float(result.get('Longitude'))
            lat = float(result.get('latitude'))
            neighbors = get_neighbors(trainSet, [lng, lat], 1)
            assume = predict(neighbors)
            cursor.insert({
                'province': result.get('Province'),
                'city': result.get('city'),
                'county': result.get('county'),
                'lng': lng,
                'lat': lat,
                'value': assume,
            })
    finally:
        # Always release the MySql handle, including on errors.
        mysql.close()
# Count, per city, how many distinct companies are at each development stage.
from utils.mongo import connect

jobs = connect('lagou', 'jobs')
# Alternate city set: cities = ('北京', '上海', '广州', '深圳', '杭州')
cities = ('成都', '武汉', '南京', '西安', '长沙')  # cities to include in the statistics
counter = {}  # maps city -> {development stage: number of companies}
company_ids = set()  # company IDs already seen, used to skip duplicate companies
for job in jobs.find():
    company_id = job.get('companyId')
    if company_id in company_ids:
        continue
    company_ids.add(company_id)
    company = job.get('company')
    city = job.get('location').replace(' ', '')
    dev = company.get('发展阶段').replace('发展阶段', '')
    if city not in cities:
        continue
    # Create the per-city dict on first sight AND count the current company;
    # previously the first company seen for each city only created the empty
    # dict and was never counted.
    stage_counts = counter.setdefault(city, {})
    stage_counts[dev] = stage_counts.get(dev, 0) + 1
"""Get the number of internet companies in each province.
"""
from utils.mongo import connect

counter = {}
cursor = connect('lagou', 'location')
for result in cursor.find():
    province = result.get('province')
    nums = result.get('company_nums')
    # Start a province at its first value, otherwise add to the running total.
    counter[province] = counter[province] + nums if province in counter else nums

# Print one "{name: ..., value: ...}," entry per province.
for key in counter:
    value = counter[key]
    print('{name:', "'", key, "'", ',', 'value:', value, '},')
"""Count how many documents fall under each category.
"""
from utils.mongo import connect

cursor = connect('lagou', 'categories')
counter = {}
for result in cursor.find():
    category = result.get('category')
    # A missing key counts as zero, so the first occurrence becomes 1.
    counter[category] = counter.get(category, 0) + 1
print(counter)
#互联网公司工作年限与对应年薪 from utils.mongo import connect categories = connect('lagou', 'categories') jobs = connect('lagou', 'jobs') salarys = connect('lagou', 'salarys') product = {} design = {} function = {} market = {} finance = {} maintain = {} tech = {} def count(d, ID): """统计计数不同分类对经验的要求与年薪 @param d 7种字典 @param ID 职位的ID @postcondition 对应的字典包含不同年限的计数,年薪,平均年薪 """ year = jobs.find_one({'ID': ID}).get('experience') #获取工作经验 salary = salarys.find_one({'ID': ID}).get('salary_avg') #获取平均月薪 annual_salary = salary * 12 #得到年薪 if year in d.keys(): d[year]['count'] += 1
"""Collect the company IDs and write them to the database.
"""
from utils import mongo

cursor = mongo.connect('lagou', 'jobs')
results = cursor.find()
# A set drops the duplicates of companies that have several jobs.
companys = {result.get('companyId') for result in results}
print(len(companys))

c = mongo.connect('lagou', 'companys')
for company in companys:
    c.insert({'companyId': company})
# Fill in the 'province' field of every location document by looking its
# city up through the MySql helper.
from utils.mongo import connect
from utils.mysql import MySql

# NOTE(review): the connection arguments (presumably database, user and
# password) are hard-coded; consider moving them to configuration.
mysql = MySql('locations', 'leo', 'mm123456', collection='new_provices')
locations = connect('lagou', 'location')
cities = []
try:
    for location in locations.find():
        city = location.get('city')
        # presumably a prefix pattern for a fuzzy (LIKE-style) match -- TODO confirm
        new_city = city + '%'
        result = mysql.find(mohu=True, city=new_city)
        try:
            province = result.get('Province')
        except Exception:
            # The lookup gave nothing usable: report the city and skip it.
            # Previously execution fell through to update_one with the
            # province left over from the previous city (or raised NameError
            # on the first iteration).
            print(city)
            continue
        locations.update_one({'city': city}, {'$set': {'province': province}})
finally:
    # Always release the MySql handle, including on errors.
    mysql.close()
"""This is to discover the connection between diploma and job
"""
from utils import mongo

jobs = mongo.connect('lagou', 'jobs')
backgrounds = mongo.connect('lagou', 'backgrounds')
items = {}  # data meant to be inserted into backgrounds
count = 0  # counter
diplomas = []  # how many diploma requirements there are in total


def main():
    """Tally how many jobs ask for each ``background`` value and print it.

    The tallies accumulate in the module-level ``items`` dict.
    """
    for record in jobs.find():
        diploma = record.get('background')
        # A missing key counts as zero, so the first occurrence becomes 1.
        items[diploma] = items.get(diploma, 0) + 1
    print(items)


if __name__ == '__main__':
    main()