Example #1
0
class HandleLagouData(object):
    def __init__(self):
        self.mysql_session = Session()
        self.date = time.strftime("%Y-%m-%d", time.localtime())

    def insert_item(self, item):
        date = time.strftime("%Y-%m-%d", time.localtime())
        data = Lagoutables(
            positionID=item['positionId'],
            longitude=item['longitude'],
            latitude=item['latitude'],
            positionName=item['positionName'],
            workYear=item['workYear'],
            education=item['education'],
            jobNature=item['jobNature'],
            financeStage=item['financeStage'],
            companySize=item['companySize'],
            industryField=item['industryField'],
            city=item['city'],
            positionAdvantage=item['positionAdvantage'],
            companyShortName=item['companyShortName'],
            companyFullName=item['companyFullName'],
            district=item['district'],
            companyLabelList=','.join(item['companyLabelList']),
            salary=item['salary'],
            crawl_date=date
        )
        query_result = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date == date,
                                                                    Lagoutables.positionID == item[
                                                                        "positionId"]).first()
        if query_result:
            print("该岗位信息已存在%s:%s:%s" % (item["positionId"], item["city"], item["positionName"]))
        else:
            self.mysql_session.add(data)
            self.mysql_session.commit()
            print("新增岗位信息%s" % item["positionId"])

    def
Example #2
0
class HandleLagouData(object):
    def __init__(self):
        self.mysql_session = Session()
        self.date = time.strftime("%Y-%m-%d", time.localtime())

    #数据的存储方法
    def insert_item(self, item):

        data = Lagoutables(
            # 岗位ID,非空字段
            positionId=item['positionId'],
            # 经度
            longitude=item['longitude'],
            # 纬度
            latitude=item['latitude'],
            # 岗位名称
            positionName=item['positionName'],
            # 工作年限
            workYear=item['workYear'],
            # 学历
            education=item['education'],
            # 岗位性质
            jobNature=item['jobNature'],
            # 业务方向
            industryField=item['industryField'],
            # 公司类型
            financeStage=item['financeStage'],
            # 公司规模
            companySize=item['companySize'],
            # 所在城市
            city=item['city'],
            # 岗位标签
            positionAdvantage=item['positionAdvantage'],
            # 公司简称
            companyShortName=item['companyShortName'],
            # 公司全称
            companyFullName=item['companyFullName'],
            # 所在区
            district=item['district'],
            # 公司福利标签
            companyLabelList=','.join(item['companyLabelList']),
            # 工资
            salary=item['salary'],
            # 抓取日期
            crawl_date=self.date)

        #在存储数据之前,先来查询一下表里是否有这条岗位信息
        query_result = self.mysql_session.query(Lagoutables).filter(
            Lagoutables.crawl_date == date,
            Lagoutables.positionId == item['positionId']).first()
        if query_result:
            print('该岗位信息已存在%s:%s:%s' %
                  (item['positionId'], item['city'], item['positionName']))
        else:
            #插入数据
            self.mysql_session.add(data)
            #提交数据到数据库
            self.mysql_session.commit()
            print('新增岗位信息%s' % item['positionId'])

    def query_industryfield_result(self):
        info = {}
        result = self.mysql_session.query(Lagoutables.industryField).filter(
            Lagoutables.crawl_date == self.date).all()

        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        data = [{
            "name": x[0],
            "value": x[1]
        } for x in result_list2 if x[1] > 140]
        name_list = [name['name'] for name in data]
        info['x_name'] = name_list
        info['data'] = data
        return info

    # 查询薪资情况
    def query_salary_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.salary).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items() if x[1] > 100]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 查询工作年限情况
    def query_workyear_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.workYear).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{
            "name": x[0],
            "value": x[1]
        } for x in result_list2 if x[1] > 15]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 查询学历信息
    def query_education_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.education).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 岗位发布数量,折线图
    def query_job_result(self):
        info = {}
        result = self.mysql_session.query(Lagoutables.crawl_date,
                                          func.count('*').label('c')).group_by(
                                              Lagoutables.crawl_date).all()
        result1 = [{"name": x[0], "value": x[1]} for x in result]
        name_list = [name['name'] for name in result1]
        info['x_name'] = name_list
        info['data'] = result1
        return info

    # 根据城市计数
    def query_city_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(
            Lagoutables.city,
            func.count('*').label('c')).filter(
                Lagoutables.crawl_date == self.date).group_by(
                    Lagoutables.city).all()
        result1 = [{"name": x[0], "value": x[1]} for x in result]
        name_list = [name['name'] for name in result1]
        info['x_name'] = name_list
        info['data'] = result1
        return info

    #融资情况
    def query_financestage_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.financeStage).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 公司规模
    def query_companysize_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.companySize).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 任职情况
    def query_jobNature_result(self):
        info = {}
        # 查询今日抓取到的薪资数据
        result = self.mysql_session.query(Lagoutables.jobNature).filter(
            Lagoutables.crawl_date == self.date).all()
        # 处理原始数据
        result_list1 = [x[0] for x in result]
        # 计数,并返回
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # 抓取数量
    def count_result(self):
        info = {}
        info['all_count'] = self.mysql_session.query(Lagoutables).count()
        info['today_count'] = self.mysql_session.query(Lagoutables).filter(
            Lagoutables.crawl_date == self.date).count()
        return info
Example #3
0
 def __init__(self):
     self.mysql_session = Session()
     self.date = time.strftime("%Y-%m-%d", time.localtime())
Example #4
0
 def __init__(self):
     # 实例化session信息
     self.mysql_session = Session()
     # self.date = time.strftime("%Y-%m-%d",time.localtime())
     self.date = '2019-06-24'
Example #5
0
class HandleLagouData(object):
    def __init__(self):
        # 实例化Session信息
        self.mysql_session = Session()
        self.date = time.strftime("%Y-%m-%d", time.localtime())

    # 数据的存储方法
    def insert_item(self, item):
        # 今天
        date = time.strftime("%Y-%m-%d", time.localtime())
        # 存储数据结构
        data = Lagoutables(
            positionId=item['positionId'],
            longitude=item['longitude'],
            latitude=item['latitude'],
            positionName=item['positionName'],
            workYear=item['workYear'],
            education=item['education'],
            jobNature=item['jobNature'],
            financeStage=item['financeStage'],
            companySize=item['companySize'],
            industryField=item['industryField'],
            city=item['city'],
            positionAdvantage=item['positionAdvantage'],
            companyShortName=item['companyShortName'],
            companyFullName=item['companyFullName'],
            district=item['district'],
            companyLabelList=','.join(item['companyLabelList']),
            salary=item['salary'],
            crawl_date=date,
            tag=KD,
        )

        # 在存储数据之前。先来查询是否有这条岗位信息
        query_result = self.mysql_session.query(Lagoutables).filter(
            Lagoutables.crawl_date == date, Lagoutables.tag == KD,
            Lagoutables.positionId == item['positionId']).first()
        if query_result:
            print('该岗位信息已存在%s:%s:%s' %
                  (item['positionId'], item['city'], item['positionName']))
        else:
            # 插入数据
            self.mysql_session.add(data)
            # 提交数据到数据库
            self.mysql_session.commit()
            print('新增岗位信息%s' % item['positionId'])

    # 行业信息
    def query_industryfield_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.industryField).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items() if x[1] > 100]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        return info

    # 查询薪资情况
    def query_salary_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.salary).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items() if x[1] > 70]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        return info

    # 查询工作年限情况
    def query_workyear_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.workYear).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        return info

    # 查询学历信息
    def query_education_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.education).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data'] = data
        info['data_list'] = data_list
        return info

    # 岗位发布数量,折线图
    def query_job_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(
            Lagoutables.crawl_date, func.count(Lagoutables.id),
            Lagoutables.tag).filter(Lagoutables.tag == 'python').group_by(
                Lagoutables.crawl_date).all()
        result_go = self.mysql_session.query(
            Lagoutables.crawl_date, func.count(Lagoutables.id),
            Lagoutables.tag).filter(Lagoutables.tag == 'go').group_by(
                Lagoutables.crawl_date).all()

        name_list = [name[0] for name in result]
        data_list = [name[1] for name in result]
        data_list_go = [name[1] for name in result_go]
        info['x_name'] = name_list
        info['data_list'] = data_list
        info['data_list_go'] = [0, 0] + data_list_go
        return info

    # 根据城市计数
    def query_city_result(self):
        info = {}
        result = self.mysql_session.query(
            Lagoutables.city, func.count(Lagoutables.id)).filter(
                Lagoutables.crawl_date == self.date,
                Lagoutables.tag == KD).group_by(Lagoutables.city).all()
        name_list = [name[0] for name in result]
        data_list = [name[1] for name in result]
        data = [{"name": x[0], "value": x[1]} for x in result]
        info['data'] = data
        info['x_name'] = name_list
        info['data_list'] = data_list
        return info

    # 融资情况
    def query_financestage_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.financeStage).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        info['data'] = data
        return info

    # 公司规模
    def query_companysize_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.companySize).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        info['data'] = data
        return info

    # 任职情况
    def query_jobNature_result(self):
        info = {}
        # 查询今天抓取的数据
        result = self.mysql_session.query(Lagoutables.jobNature).filter(
            Lagoutables.crawl_date == self.date, Lagoutables.tag == KD).all()
        result_list1 = [x[0].split(',')[0] for x in result]
        result_list2 = [x for x in Counter(result_list1).items()]
        # 填充的是series里面的data
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        data_list = [name['value'] for name in data]
        info['x_name'] = name_list
        info['data_list'] = data_list
        info['data'] = data
        return info

    # 抓取数量
    def count_result(self):
        info = {}
        all_count = self.mysql_session.query(Lagoutables.id,
                                             Lagoutables.tag == KD).count()
        today_count = self.mysql_session.query(Lagoutables.id).filter(
            Lagoutables.crawl_date == self.date,
            Lagoutables.tag == KD).count()
        return all_count, today_count