def __init__(self):
    """Initialise crawl state and the company endpoints, then call ``depend``.

    Every URL is derived from ``self.domain``. Crawl results are persisted
    through a ``FileSteam`` store named ``"company_summary"``.
    """
    super(LagouCompanySummary, self).__init__()

    # Per-run bookkeeping and the store the results are written to.
    self.company_ids = []
    self.exists_company = 0
    self.f_data = FileSteam("company_summary")

    # Listing page, its AJAX twin, and the per-company detail endpoints.
    domain = self.domain
    self.company_index_url = domain + "/gongsi/0-0-0-6"
    self.company_index_ajax_url = domain + "/gongsi/0-0-0-6.json"
    self.company_detail_index_url = domain + "/gongsi/{}.html"
    self.company_detail_job_url = domain + "/gongsi/j{}.html"
    self.company_detail_ajax_url = domain + "/gongsi/searchPosition.json"

    # Must run last: it is invoked only once all attributes above exist.
    self.depend()
def __init__(self, data):
    """Store *data*, open an HTTP session, set the endpoints and log in.

    :param data: payload kept on the instance as ``self.data``.
    """
    self.data = data
    self._session = requests.session()

    # Each endpoint is the class-level domain plus a path suffix
    # (the suffixes are empty strings in this source).
    base = self.domain
    self.channel_url = base + ""
    self.vacancy_url = base + ""
    self.company_url = base + ""
    self.login_url = base + ""

    # Credentials; the password is selected by the ``Debug`` flag.
    self.username = ""
    self.password = "" if Debug else ""

    self.company_file_stream = FileSteam("company_exist")

    # Logging in happens as part of construction.
    self.login()
class LagouCompanySummary(LagouSpider):
    """Fetch pages of the company listing and persist the mapped result.

    ``main`` posts to the listing AJAX endpoint for pages 1 and 2, collects
    the ``result`` lists, maps them through ``CompanyMapping`` and merges the
    outcome into the ``"company_summary"`` ``FileSteam`` store.
    """

    def __init__(self):
        """Set up endpoints and per-run state, then call ``depend``."""
        super(LagouCompanySummary, self).__init__()
        self.company_index_url = self.domain + "/gongsi/0-0-0-6"
        self.company_index_ajax_url = self.domain + "/gongsi/0-0-0-6.json"
        self.company_detail_index_url = self.domain + "/gongsi/{}.html"
        self.company_detail_job_url = self.domain + "/gongsi/j{}.html"
        self.company_detail_ajax_url = self.domain + "/gongsi/searchPosition.json"
        self.company_ids = []
        self.exists_company = 0
        self.f_data = FileSteam("company_summary")
        self.depend()

    def update_referer(self, referer):
        """Return ``self.get_headers`` with its ``Referer`` set to *referer*.

        NOTE(review): the object returned by ``self.get_headers`` is updated
        in place; whether that is a fresh dict per access is defined outside
        this class -- confirm.
        """
        headers = self.get_headers
        headers.update({
            "Referer": referer
        })
        return headers

    def depend(self):
        """Request the company index page once with the user-agent headers."""
        # Reuse the attribute rather than rebuilding the same URL literal, so
        # the index URL is defined in exactly one place.
        self.requests(url=self.company_index_url, headers=self.user_agent_header)

    def main(self):
        """Fetch listing pages 1 and 2, then map and persist the results."""
        headers = self.update_referer(self.company_index_url)
        resp = []
        for i in range(1, 3):
            # NOTE(review): ``self.sid`` is only assigned in ``filter``; the
            # first request presumably relies on the base class providing an
            # initial value -- confirm.
            data = {u"first": u"false", u"pn": i, u"sid": self.sid, u"sortField": 0, u"havemark": 0}
            response = self.requests(method="POST", url=self.company_index_ajax_url, headers=headers, data=data)
            res = self.parse(response)
            if isinstance(res, list):
                resp += res
            # Random pause of 1 to 5 seconds after every request.
            time.sleep(random() + randint(1, 4))
        self.mapping(resp)

    def parse(self, response):
        """Validate *response* and return the filtered company list.

        :param response: object exposing ``status_code`` and ``content``.
        :returns: whatever ``filter`` returns for the decoded payload.
        :raises AssertionError: if the status code is not 200.
        """
        status_code = response.status_code
        if status_code != 200:
            # Raised explicitly (same exception type as before) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                self.set_tip(u"{}公司主页响应状态码异常:{}".format(self.name, status_code))
            )
        content = response.content
        # ``bytes`` is ``str`` on Python 2; on Python 3 the body is bytes,
        # so accept both instead of silently skipping the decode.
        if content and isinstance(content, (bytes, str)):
            content = json.loads(content)
        return self.filter(content)

    def filter(self, data):
        """Return ``data["result"]`` and remember ``data["showId"]`` as ``sid``.

        :returns: the result list, or ``None`` when *data* or its ``result``
            entry is empty (``sid`` is left untouched in that case).
        """
        objs = data and data.get("result")  # type:list
        if not objs:
            return
        self.sid = data["showId"]
        return objs

    def mapping(self, data):
        """Map raw company entries via ``CompanyMapping`` and persist them."""
        company_map = CompanyMapping(data).main()
        if company_map:
            self.write_company(company_map)
            # Single-argument call form prints identically on Python 2 and 3.
            print(u"爬取公司数量:{}".format(len(company_map)))

    def write_company(self, data):
        """Merge *data* into the stored company summary and write it back."""
        f_data = self.f_data.read()  # type:dict
        if not f_data:
            # Nothing stored yet: the new data becomes the whole store.
            self.f_data.write(data)
            return
        f_data.update(data)
        self.f_data.write(f_data)
def logger(self):
    """Append ``self.tip``, prefixed by a minute-resolution timestamp, to the "logger" store.

    The entry is the timestamp immediately followed by the tip and a
    newline, UTF-8 encoded before being handed to ``FileSteam.add``.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    line = timestamp + self.tip + u"\n"
    log_store = FileSteam("logger")
    log_store.add(line.encode("utf-8"))
def write_vacancy_ids(self):
    """Append ``self.vacancy_temp`` to the contents of the "vacancy_exist" store.

    The store is read through one ``FileSteam`` object and the concatenated
    value is written back through a second one.
    """
    stored_ids = FileSteam("vacancy_exist").read()
    target = FileSteam("vacancy_exist")
    target.write(stored_ids + self.vacancy_temp)
def vacancy_exist(self):
    """Return the current contents of the "vacancy_exist" store."""
    store = FileSteam("vacancy_exist")
    stored_ids = store.read()
    return stored_ids
class CohirerBase(object):
    """HTTP client base: logs in on construction and creates companies.

    Created companies are remembered as a name -> id mapping in the
    ``"company_exist"`` ``FileSteam`` store, so a company with a known name
    is not created twice.
    """

    domain = "" if not Debug else ""

    def __init__(self, data):
        """Store *data*, open a session, set the endpoints and log in."""
        self.data = data
        self._session = requests.session()
        self.channel_url = self.domain + ""
        self.vacancy_url = self.domain + ""
        self.company_url = self.domain + ""
        self.login_url = self.domain + ""
        self.password = "" if Debug else ""
        self.username = ""
        self.company_file_stream = FileSteam("company_exist")
        self.login()

    @property
    def session(self):
        """The session object created in ``__init__``."""
        return self._session

    def requests(self, url, data, method="POST"):
        """Send *data* as a JSON body to *url* and return the response."""
        return self.session.request(method=method, url=url, json=data, headers=self.headers)

    @property
    def headers(self):
        """Request headers; a new dict is built on every access."""
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,zh-TW;q=0.6,es;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        }

    def login(self):
        """Post the stored credentials to ``login_url``."""
        data = dict(username=self.username, password=self.password, remember_me=True)
        self.requests(url=self.login_url, data=data)

    def random_datetime(self):
        """Return today's date with a random time of day.

        The hour is drawn from 8..20 and the minute and second from 1..59
        (both bounds inclusive).
        """
        dt = datetime.now()
        return datetime(dt.year, dt.month, dt.day, randint(8, 20), randint(1, 59), randint(1, 59))

    def create_company(self, data):
        """Return the id for the company in *data*, creating it if unknown.

        :param data: company payload; its name is read via ``get_company_name``.
        :returns: the cached id when the name is already known, the new id
            when the server answers 201, otherwise ``None``.
        """
        company_id = self.filter_company(data)
        if company_id:
            return company_id
        res = self.requests(url=self.company_url, data=data)
        if res.status_code != 201:
            return None
        content = json.loads(res.content)
        new_id = content.get("id")
        name = self.get_company_name(data)
        # Only remember the id when there is a name to key it by; caching
        # under a ``None`` key would add a useless entry to the store.
        if name:
            self.add_filter({name: new_id})
        return new_id

    def _read_company_cache(self):
        """Return the stored name -> id mapping, or ``{}`` when the store is empty."""
        return self.company_file_stream.read() or {}

    def filter_company(self, data):
        """Return the cached id for the company named in *data*, if any."""
        cxr = self._read_company_cache()
        name = self.get_company_name(data)
        return name and cxr.get(name)

    def add_filter(self, dic):
        """Merge *dic* (name -> id) into the cache and write it back."""
        cxr = self._read_company_cache()  # type:dict
        cxr.update(dic)
        self.company_file_stream.write(cxr)

    def get_company_name(self, data):
        """Return ``data["name"]["c_name"]``, or a falsy value when unavailable.

        Returns ``None`` when *data* is empty or not a dict.
        """
        if data and isinstance(data, dict):
            name = data.get("name")
            return name and name.get("c_name")