def parse_detail(self, response):
    """Parse a type.jp job-detail page and yield a TypeItem.

    Item fields:
        company_name         会社名 (company name)
        job_name             ポジション (position)
        link_url             募集詳細link, absolute under https://type.jp
        nearest_station      住所 (address)
        longitude / latitude 経度 / 緯度 (via get_coordinate)
        source               出所 ("type")
        occupation           職種 (hard-coded "営業")
        annual_income_min/max 年収 range
        published_time       サイト内での掲載時間 (posted time on site)
        create_data          クロリングした時間 (crawl timestamp)
    """
    company_name = response.meta.get("company_name", "")
    company_name = re.sub("【(.*?)】", "", company_name)  # drop 【...】 tags
    company_name = re.sub("\s*", "", company_name)  # remove all whitespace
    link_url = "https://type.jp" + response.meta.get("link_url", "")
    job_name = response.meta.get("job_name", "")
    nearest_station = response.meta.get("nearest_station", "")
    nearest_station = re.sub("\r\n", "", nearest_station)
    # The salary badge is missing on some pages, so this may be None.
    annual_income = response.xpath("//span[@class='ico_salary']/text()").get()
    annual_income_min = ""
    annual_income_max = ""
    if annual_income is not None:
        # BUG FIX: the original searched a hard-coded sample string
        # ("年収:400~700万円") instead of the scraped text, so every item
        # carried the same 400/700 salary range. Search the actual value
        # and guard against a non-match instead of calling .group() on None.
        match = re.search("\d*~\d*", annual_income)
        if match is not None:
            annual_income_min, annual_income_max = match.group().split("~")
    published_time = response.meta.get("published_time", "")
    published_time = published_time.strip()
    longitude, latitude = get_coordinate(company_name)
    type_item = TypeItem()
    type_item["company_name"] = company_name
    type_item["link_url"] = link_url
    type_item["nearest_station"] = nearest_station
    type_item["job_name"] = job_name
    type_item["annual_income_min"] = annual_income_min
    type_item["annual_income_max"] = annual_income_max
    type_item["longitude"] = longitude
    type_item["latitude"] = latitude
    type_item["occupation"] = "営業"
    type_item["source"] = "type"
    type_item["published_time"] = published_time
    type_item["create_data"] = datetime.now()
    yield type_item
def parse_detail(self, response):
    """Parse a Wantedly job-detail page and yield a WantedlyItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度, source 出所,
    annual_income_min/max 年収 (not exposed by this site — always 0),
    occupation 職種, published_time サイト内での掲載時間,
    create_data クロリングした時間.
    """
    meta = response.meta
    company = meta.get("company_name", "")
    detail_url = "https://www.wantedly.com" + meta.get("link_url", "")
    position = meta.get("job_name", "")

    # Clean the on-site publication date: trim, drop any embedded <...> tag.
    posted = meta.get("published_time", "").strip()
    posted = re.sub("<(.*)>", "", posted).strip()

    # The last company-description line carries the address.
    descriptions = response.xpath(
        "//li/div[@class='company-description']/text()").getall()
    address = descriptions[-1].strip()

    # Geocode from the address.
    longitude, latitude = get_coordinate(address)

    item = WantedlyItem()
    item["company_name"] = company
    item["link_url"] = detail_url
    item["job_name"] = position
    item["nearest_station"] = address
    item["longitude"] = longitude
    item["latitude"] = latitude
    item["annual_income_min"] = 0
    item["annual_income_max"] = 0
    item["occupation"] = "営業"
    item["source"] = "wantedly"
    item["published_time"] = posted
    item["create_data"] = datetime.now()
    yield item
def parse_detail(self, response):
    """Parse a Green (green-japan.com) job-detail page and yield a GreenItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度, source 出所 ("green"),
    occupation 職種, annual_income_min/max 年収, published_time 掲載時間,
    create_data クロリングした時間.
    """
    green_item = GreenItem()
    company_name = response.meta.get("company_name", "")
    job_name = response.meta.get("job_name", "")
    link_url = 'https://www.green-japan.com' + response.meta.get("link_url", "")

    # --- work location ---
    # The location cell is in the 2nd detail table on most pages, in the
    # 1st on others; fall back when the 2nd table is missing/empty.
    cells = response.xpath(
        "//table[@class='detail-content-table js-impression'][2]/tr/td").getall()
    try:
        nearest_station = cells[0]
    except IndexError:  # narrowed from a bare except; only indexing can fail
        cells = response.xpath(
            "//table[@class='detail-content-table js-impression'][1]/tr/td").getall()
        nearest_station = cells[0]
    regex = re.compile(r"勤務地詳細】<br>(.*?)<br>")
    y = regex.search(nearest_station)
    # BUG FIX: y.group(1) raised AttributeError whenever the cell lacked
    # the "勤務地詳細】" marker; fall back to an empty address instead.
    nearest_station = y.group(1) if y is not None else ""
    nearest_station = nearest_station.replace(' ', '')
    nearest_station = re.sub(
        "[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?/【】本社:“”■!,。?、~@#¥%……&*()]+",
        "", nearest_station)

    # On-site publication time, passed through from the listing page.
    published_time = response.meta.get("published_time", "")

    # --- salary ---
    annual_income = response.meta.get("annual_income", "")
    pattern = re.compile(r'<[^>]+>', re.S)
    annual_income = pattern.sub('', annual_income)
    annual_income = annual_income.strip()
    re_text = re.compile("(\d{3,4})万円")
    annual_income = re.findall(re_text, annual_income)
    # BUG FIX: the original branch "len < 2" also covered the empty list,
    # so annual_income[0] raised IndexError when no salary was found.
    # Default to 0/0 like the project's other spiders.
    if not annual_income:
        annual_income_min = 0
        annual_income_max = 0
    elif len(annual_income) == 1:
        annual_income_min = annual_income[0]
        annual_income_max = annual_income[0]
    else:
        annual_income_min = annual_income[0]
        annual_income_max = annual_income[1]

    # Geocode from the cleaned address.
    longitude, latitude = get_coordinate(nearest_station)

    green_item["company_name"] = company_name
    green_item["job_name"] = job_name
    green_item["link_url"] = link_url
    green_item["nearest_station"] = nearest_station
    green_item["longitude"] = longitude
    green_item["latitude"] = latitude
    green_item["source"] = "green"
    green_item["occupation"] = "営業"
    green_item["annual_income_min"] = annual_income_min
    green_item["annual_income_max"] = annual_income_max
    green_item["published_time"] = published_time
    green_item["create_data"] = datetime.now()
    yield green_item
def parse_detail(self, response):
    """Parse a Mynavi agent-search job page and yield a MynaviItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度, source 出所 ("マイナビ"),
    occupation 職種, annual_income_min/max 年収, published_time 掲載時間,
    create_data クロリングした時間.
    """
    company_name = response.meta.get("company_name", "")
    company_name = company_name.strip()
    link_url = "https://mynavi.agentsearch.jp" + response.meta.get("link_url", "")

    # Strip HTML tags from the job title.
    tag_pattern = re.compile(r'<[^>]+>', re.S)
    job_name = response.meta.get("job_name", "")
    job_name = tag_pattern.sub('', job_name)
    job_name = job_name.strip()

    # --- address ---
    nearest_station = response.meta.get("nearest_station", "")
    nearest_station = tag_pattern.sub('', str(nearest_station))
    nearest_station = nearest_station.strip()
    nearest_station = nearest_station.replace(" ", "")
    nearest_station = re.sub("[<>…'>]", "", nearest_station)

    # --- salary ---
    annual_income = str(response.meta.get("annual_income", ""))
    annual_income = re.findall(re.compile(r'\d{3,4}', re.S), annual_income)
    # BUG FIX: the original ran a single adjacent-swap pass, which only
    # bubbles the maximum to the end and leaves the rest unsorted, so the
    # [1]/[-1] picks below could be wrong. Sort numerically instead
    # (the Doda spider in this project already does a full sort).
    annual_income.sort(key=int)
    try:
        if len(annual_income) == 2:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[1]
        elif len(annual_income) > 2:
            # Skips the smallest figure — presumably a non-salary number
            # caught by the loose \d{3,4} pattern; TODO confirm on live data.
            annual_income_min = annual_income[1]
            annual_income_max = annual_income[-1]
        else:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[0]
    except IndexError:  # narrowed from a bare except; no figures found
        annual_income_min = 0
        annual_income_max = 0

    # --- on-site publication time ---
    published_time = response.xpath("//div[@class='information cf']/span").getall()
    # BUG FIX: unguarded [1]/[2] indexing crashed on pages with fewer than
    # three <span>s in the information bar; default to "" instead.
    try:
        published_time = published_time[1] + published_time[2]
    except IndexError:
        published_time = ""
    published_time = re.sub("<.*?>", "", published_time)

    # Geocode from the company name.
    longitude, latitude = get_coordinate(company_name)

    mynav_item = MynaviItem()
    mynav_item["company_name"] = company_name
    mynav_item["job_name"] = job_name
    mynav_item["link_url"] = link_url
    mynav_item["nearest_station"] = nearest_station
    mynav_item["longitude"] = longitude
    mynav_item["latitude"] = latitude
    mynav_item["source"] = "マイナビ"
    mynav_item["occupation"] = "営業"
    mynav_item["annual_income_min"] = annual_income_min
    mynav_item["annual_income_max"] = annual_income_max
    mynav_item["published_time"] = published_time
    mynav_item["create_data"] = datetime.now()
    yield mynav_item
def parse_detail(self, response):
    """Parse a doda job-detail page and yield a DodaItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度, source 出所 ("doda"),
    occupation 職種, annual_income_min/max 年収, published_time 掲載時間,
    create_data クロリングした時間.
    """
    meta = response.meta
    # Drop 【...】 and (...) annotations from the company name.
    company = re.sub("【(.*?)】|\((.*?)\)", "", meta.get("company_name", ""))
    position = meta.get("job_name", "")
    detail_url = meta.get("link_url", "")

    # Keep only the first listed station; fall back to "東京" when no
    # station name ("駅") appears.
    stations = meta.get("nearest_station", "").split("、")
    try:
        station = stations[0]
        if "駅" not in station:
            station = "東京"
    except:
        station = "東京"

    # On-site publication time.
    posted = response.xpath("//p[@class='meta_text']/text()").get()

    # Salary: strip tags, collect every "NNN万円" figure, order numerically.
    raw_income = meta.get("annual_income", "")
    raw_income = re.compile(r'<[^>]+>', re.S).sub('', raw_income)
    figures = re.findall(re.compile("(\d{3,4})万円"), raw_income)
    figures.sort(key=int)  # same ascending order the swap passes produced
    if len(figures) >= 2:
        income_min, income_max = figures[0], figures[-1]
    elif len(figures) == 1:
        income_min = income_max = figures[0]
    else:
        income_min = income_max = 0

    longitude, latitude = get_coordinate(station)

    item = DodaItem()
    item["company_name"] = company
    item["job_name"] = position
    item["link_url"] = detail_url
    item["nearest_station"] = station
    item["longitude"] = longitude
    item["latitude"] = latitude
    item["annual_income_min"] = income_min
    item["annual_income_max"] = income_max
    item["occupation"] = "営業"
    item["source"] = "doda"
    item["published_time"] = posted
    item["create_data"] = datetime.now()
    yield item
def parse_detail(self, response):
    """Parse an en-japan job-detail page and yield an EnItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度, source 出所 ("エン転職"),
    occupation 職種, annual_income_min/max 年収, published_time 掲載時間,
    create_data クロリングした時間.
    """
    en_item = EnItem()
    company_name = response.meta.get("company_name", "")
    # BUG FIX: the original pattern "((.*?))" left the parentheses
    # unescaped, so it matched only the empty string and removed nothing.
    # Escape them (and accept the full-width variant) so parenthesized
    # annotations are actually stripped, matching the Doda spider's intent.
    company_name = re.sub("（(.*?)）|\((.*?)\)", "", company_name)
    link_url = "https://employment.en-japan.com" + response.meta.get("link_url", "")
    job_name = response.meta.get("job_name", "")
    nearest_station = response.meta.get("nearest_station", "")

    # --- on-site publication time ---
    published_time = response.meta.get("published_time", "")
    tag_pattern = re.compile(r'<[^>]+>', re.S)
    parts = tag_pattern.sub('', published_time).split()
    # BUG FIX: on an empty string both parts[1] and the bare-except
    # fallback parts[0] raised IndexError; default to "" instead.
    if len(parts) >= 2:
        published_time = parts[1]
    elif parts:
        published_time = parts[0]
    else:
        published_time = ""

    # --- salary: only present when the money category icon exists ---
    categoryIcon_money = response.xpath("//div[@class='categoryIcon money']")
    annual_income_min = 0
    annual_income_max = 0
    if categoryIcon_money:
        annual_income = response.xpath(
            "//div[@class='categorySet moneyCategorySet']/div[@class='categoryData']").get()
        annual_income = tag_pattern.sub('', annual_income or "")
        figures = re.findall(re.compile("(\d{3,4})万円"), annual_income)
        # BUG FIX: figures[0] raised IndexError when no "NNN万円" figure
        # was present; keep the 0/0 default in that case.
        if figures:
            figures.sort(key=int)  # full numeric sort (replaces bubble sort)
            annual_income_min = figures[0]
            annual_income_max = figures[-1]

    # Geocode from the company name.
    longitude, latitude = get_coordinate(company_name)

    en_item["company_name"] = company_name
    en_item["link_url"] = link_url
    en_item["job_name"] = job_name
    en_item["nearest_station"] = nearest_station
    en_item["longitude"] = longitude
    en_item["latitude"] = latitude
    en_item["source"] = "エン転職"
    en_item["occupation"] = "営業"
    en_item["annual_income_min"] = annual_income_min
    en_item["annual_income_max"] = annual_income_max
    en_item["published_time"] = published_time
    en_item["create_data"] = datetime.now()
    yield en_item
def parse_detail(self, response):
    """Parse a Rikunabi NEXT job-detail page and yield a NextRikuabiItem.

    Item fields: company_name 会社名, job_name ポジション, link_url 募集詳細link,
    nearest_station 住所, longitude 経度, latitude 緯度,
    source 出所 ("next_rikunabi"), occupation 職種,
    annual_income_min/max 年収, published_time 掲載時間,
    create_data クロリングした時間.
    """
    company_name = response.meta.get("company_name", "")
    link_url = "https://next.rikunabi.com" + response.meta.get("link_url", "")
    job_name = response.meta.get("job_name", "")
    annual_income = response.meta.get("annual_income", "")
    nearest_station = response.meta.get("nearest_station", "")

    # --- company name: keep only the "...株式会社..." portion ---
    match = re.search(re.compile("(\w*)株式会社(\w*)"), company_name)
    # BUG FIX: .group() on a failed search raised AttributeError for
    # companies whose name lacks "株式会社"; keep the raw name in that case.
    if match is not None:
        company_name = match.group()

    # --- salary range ---
    # BUG FIX: the original "<.*>" is greedy — with two tags on one line it
    # swallowed everything between them, salary text included. Use the same
    # non-greedy tag stripper as the project's other spiders.
    annual_income = re.sub(r"<[^>]+>", "", annual_income)
    figures = re.findall(re.compile("(\d{3,4})万円/"), annual_income)
    # BUG FIX: one adjacent-swap pass does not sort the list; sort
    # numerically so the [1]/[-1] picks below are meaningful.
    figures.sort(key=int)
    try:
        if len(figures) == 2:
            annual_income_min = figures[0]
            annual_income_max = figures[1]
        elif len(figures) > 2:
            # Skips the smallest figure — presumably a non-salary number;
            # TODO confirm against live pages.
            annual_income_min = figures[1]
            annual_income_max = figures[-1]
        else:
            annual_income_min = figures[0]
            annual_income_max = figures[0]
    except IndexError:  # narrowed from a bare except; no figures found
        annual_income_min = 0
        annual_income_max = 0

    # --- work location ---
    nearest_station = nearest_station.strip()

    # Geocode from the company name.
    longitude, latitude = get_coordinate(company_name)

    # On-site publication time.
    published_time = response.xpath(
        "//p[@class='rnn-inlineBlock rnn-offerInfoHeader__date rnn-textM']/text()"
    ).get()

    next_rikunabi_item = NextRikuabiItem()
    next_rikunabi_item["company_name"] = company_name
    next_rikunabi_item['job_name'] = job_name
    next_rikunabi_item["link_url"] = link_url
    next_rikunabi_item["nearest_station"] = nearest_station
    next_rikunabi_item["longitude"] = longitude
    next_rikunabi_item["latitude"] = latitude
    next_rikunabi_item["source"] = "next_rikunabi"
    next_rikunabi_item["occupation"] = "営業"
    next_rikunabi_item["annual_income_min"] = annual_income_min
    next_rikunabi_item["annual_income_max"] = annual_income_max
    next_rikunabi_item["published_time"] = published_time
    next_rikunabi_item["create_data"] = datetime.now()
    yield next_rikunabi_item