コード例 #1
0
ファイル: vacancy.py プロジェクト: kai-railg/LagouSpider
 def __init__(self):
     """Initialise crawl state and the company-page URLs, then call depend().

     The parent initialiser runs first; every URL below is built by
     appending a fixed path to ``self.domain``.
     """
     super(LagouCompanySummary, self).__init__()

     # Read the site root once; all endpoints hang off it.
     base = self.domain

     # Listing page and the JSON endpoint that sits next to it.
     self.company_index_url = base + "/gongsi/0-0-0-6"
     self.company_index_ajax_url = base + "/gongsi/0-0-0-6.json"

     # Per-company pages; "{}" is a placeholder left unformatted here.
     self.company_detail_index_url = base + "/gongsi/{}.html"
     self.company_detail_job_url = base + "/gongsi/j{}.html"
     self.company_detail_ajax_url = base + "/gongsi/searchPosition.json"

     # Bookkeeping that starts out empty.
     self.company_ids = []
     self.exists_company = 0

     # File-backed store named "company_summary".
     self.f_data = FileSteam("company_summary")

     self.depend()
コード例 #2
0
    def __init__(self, data):
        """Keep *data*, open an HTTP session, set endpoints, then log in.

        :param data: payload stored unchanged on ``self.data``.
        """
        self.data = data
        self._session = requests.session()

        # Every endpoint is the class-level domain plus a (blank) suffix.
        root = self.domain
        self.channel_url = root + ""
        self.vacancy_url = root + ""
        self.company_url = root + ""
        self.login_url = root + ""

        # Credentials; the password branch depends on the Debug flag.
        if Debug:
            self.password = ""
        else:
            self.password = ""
        self.username = ""

        # File-backed store named "company_exist".
        self.company_file_stream = FileSteam("company_exist")

        self.login()
コード例 #3
0
ファイル: vacancy.py プロジェクト: kai-railg/LagouSpider
class LagouCompanySummary(LagouSpider):
    """Fetch the company listing pages and persist a mapped summary.

    ``main`` posts to the listing JSON endpoint for pages 1 and 2, collects
    the ``result`` entries of each response, runs them through
    ``CompanyMapping`` and merges the outcome into the "company_summary"
    store.
    """

    def __init__(self):
        """Initialise URLs and crawl state, then call ``depend``."""
        super(LagouCompanySummary, self).__init__()
        self.company_index_url = self.domain + "/gongsi/0-0-0-6"
        self.company_index_ajax_url = self.domain + "/gongsi/0-0-0-6.json"
        # "{}" placeholders are left unformatted in this class.
        self.company_detail_index_url = self.domain + "/gongsi/{}.html"
        self.company_detail_job_url = self.domain + "/gongsi/j{}.html"
        self.company_detail_ajax_url = self.domain + "/gongsi/searchPosition.json"
        self.company_ids = []
        self.exists_company = 0
        self.f_data = FileSteam("company_summary")
        self.depend()

    def update_referer(self, referer):
        """Return the request headers with ``Referer`` set to *referer*.

        :param referer: value stored under the "Referer" key.
        :return: the object read from ``self.get_headers``, updated in place.
        """
        # NOTE(review): get_headers is read without being called, so it is
        # presumably a property that yields a dict - confirm in the base class.
        headers = self.get_headers
        headers.update({
            "Referer": referer
        })
        return headers

    def depend(self):
        """Issue a request to the listing page using the user-agent header."""
        self.requests(url=self.domain + "/gongsi/0-0-0-6",
                      headers=self.user_agent_header)

    def main(self):
        """Fetch listing pages 1 and 2 and hand the results to ``mapping``."""
        headers = self.update_referer(self.company_index_url)
        resp = []
        for i in range(1, 3):
            # self.sid is re-read on every page because filter() refreshes it.
            data = {u"first": u"false", u"pn": i, u"sid": self.sid,
                    u"sortField": 0, u"havemark": 0}
            response = self.requests(method="POST",
                                     url=self.company_index_ajax_url,
                                     headers=headers,
                                     data=data)
            res = self.parse(response)
            if isinstance(res, list):
                resp.extend(res)
            # Pause between 1 and 5 seconds before the next request.
            time.sleep(random() + randint(1, 4))

        self.mapping(resp)

    def parse(self, response):
        """Decode a listing response and return its filtered entries.

        :param response: object exposing ``status_code`` and ``content``.
        :return: whatever ``filter`` returns, or ``None`` when the body is
            empty or not a byte/text string.
        :raises AssertionError: when the status code is not 200.
        """
        status_code = response.status_code
        if status_code != 200:
            # Raised explicitly (same exception type as the former ``assert``)
            # so the check is not stripped when Python runs with -O.
            raise AssertionError(
                self.set_tip(u"{}公司主页响应状态码异常:{}".format(self.name, status_code)))
        content = response.content
        # On Python 2 ``bytes`` is ``str``; accepting both keeps the body from
        # being silently discarded where ``content`` is a bytes object.
        if content and isinstance(content, (bytes, str)):
            content = json.loads(content)
            return self.filter(content)
        return None

    def filter(self, data):
        """Return the ``result`` entries of *data* and remember its showId.

        :param data: decoded JSON object of a listing response.
        :return: the value under "result", or ``None`` when it is missing
            or empty.
        """
        objs = data and data.get("result")  # type:list
        if not objs:
            return None
        # Stored so that the next page request in main() sends it as "sid".
        self.sid = data["showId"]
        return objs

    def mapping(self, data):
        """Map raw entries through ``CompanyMapping`` and persist the result.

        :param data: list of entries collected by ``main``.
        """
        company_map = CompanyMapping(data).main()
        if company_map:
            self.write_company(company_map)
        # A falsy result (for example None) is reported as zero instead of
        # failing inside len().
        count = len(company_map) if company_map else 0
        # Single-argument call form prints identically on Python 2 and 3.
        print(u"爬取公司数量:{}".format(count))

    def write_company(self, data):
        """Merge *data* into the "company_summary" store.

        :param data: mapping written as-is when the store is empty, otherwise
            merged over the stored content with ``update``.
        """
        f_data = self.f_data.read()  # type:dict
        if not f_data:
            self.f_data.write(data)
            return
        f_data.update(data)
        self.f_data.write(f_data)
コード例 #4
0
ファイル: vacancy.py プロジェクト: kai-railg/LagouSpider
 def logger(self):
     """Append ``self.tip`` with a timestamp prefix to the "logger" store.

     The line is the current local time formatted to the minute, followed
     directly by the tip and a newline, encoded as UTF-8.
     """
     stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
     line = stamp + self.tip + u"\n"
     sink = FileSteam("logger")
     sink.add(line.encode("utf-8"))
コード例 #5
0
ファイル: vacancy.py プロジェクト: kai-railg/LagouSpider
 def write_vacancy_ids(self):
     """Rewrite the "vacancy_exist" store as its content plus vacancy_temp.

     The stored value is read first, then ``self.vacancy_temp`` is joined
     onto it with ``+`` and the result is written back.
     """
     known = FileSteam("vacancy_exist").read()
     # A second handle is opened for the write, as a separate object.
     sink = FileSteam("vacancy_exist")
     sink.write(known + self.vacancy_temp)
コード例 #6
0
ファイル: vacancy.py プロジェクト: kai-railg/LagouSpider
 def vacancy_exist(self):
     """Return whatever the "vacancy_exist" store currently holds."""
     store = FileSteam("vacancy_exist")
     return store.read()
コード例 #7
0
class CohirerBase(object):
    """Session-backed client that logs in and creates companies remotely.

    Company ids returned by the service are cached by company name in the
    "company_exist" store so a company is only created once.
    """

    # Base URL; both branches are blank in this source.
    domain = "" if not Debug else ""

    def __init__(self, data):
        """Keep *data*, open an HTTP session, set endpoints, then log in.

        :param data: payload stored unchanged on ``self.data``.
        """
        self.data = data
        self._session = requests.session()
        self.channel_url = self.domain + ""
        self.vacancy_url = self.domain + ""
        self.company_url = self.domain + ""
        self.login_url = self.domain + ""
        self.password = "" if Debug else ""
        self.username = ""
        self.company_file_stream = FileSteam("company_exist")

        self.login()

    @property
    def session(self):
        """The HTTP session created in ``__init__``."""
        return self._session

    def requests(self, url, data, method="POST"):
        """Send *data* as a JSON body to *url* using the shared session.

        :param url: target URL.
        :param data: object passed as the ``json`` argument of the request.
        :param method: HTTP method name, "POST" by default.
        :return: the response returned by ``session.request``.
        """
        return self.session.request(method=method, url=url, json=data, headers=self.headers)

    @property
    def headers(self):
        """A freshly built dict of the fixed headers sent with each request."""
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,zh-TW;q=0.6,es;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        }

    def login(self):
        """Post the stored username and password to ``login_url``."""
        data = dict(username=self.username,
                    password=self.password,
                    remember_me=True)
        self.requests(url=self.login_url, data=data)

    def random_datetime(self):
        """Return today's date with a random time of day.

        The hour is drawn from 8-20 and minute and second from 1-59 (all
        inclusive), so a zero minute or second is never produced.
        """
        dt = datetime.now()
        return datetime(dt.year, dt.month, dt.day,
                        randint(8, 20), randint(1, 59), randint(1, 59))

    def create_company(self, data):
        """Return the id for the company in *data*, creating it if needed.

        :param data: company payload; its name is looked up in the cache
            first and posted to ``company_url`` on a miss.
        :return: the cached id, the "id" field of a 201 response, or
            ``None`` when the service answers with any other status.
        """
        company_id = self.filter_company(data)
        if company_id:
            return company_id
        res = self.requests(url=self.company_url, data=data)
        if res.status_code != 201:
            return None
        content = json.loads(res.content)
        new_id = content.get("id")
        # Remember the id so the next call for this name skips the request.
        self.add_filter({self.get_company_name(data): new_id})
        return new_id

    def filter_company(self, data):
        """Look up the cached id for the company named in *data*.

        :return: the cached id, ``None`` when the name is not cached, or the
            falsy name itself when *data* carries no usable name.
        """
        # An empty store is treated as an empty mapping instead of failing
        # on a falsy read() result.
        cxr = self.company_file_stream.read() or {}
        name = self.get_company_name(data)
        return name and cxr.get(name)

    def add_filter(self, dic):
        """Merge *dic* (company name -> id) into the "company_exist" store."""
        # Start from an empty mapping when nothing has been stored yet.
        cxr = self.company_file_stream.read() or {}  # type:dict
        cxr.update(dic)
        self.company_file_stream.write(cxr)

    def get_company_name(self, data):
        """Return ``data["name"]["c_name"]`` when both levels are present.

        :return: the value under "c_name", the falsy value stored under
            "name" when that is empty, or ``None`` when *data* is empty or
            not a dict.
        """
        if data and isinstance(data, dict):
            name = data.get("name")
            return name and name.get("c_name")
        return None