Example #1
    def parse_detail(self, response):
        """
                      company_name             会社名
                      job_name                ポジション 
                      link_url                募集詳細link   https://type.jp
                      nearest_station         住所
                      longitude                 経度
                      latitude                  緯度
                      source                    出所
                      occupation                職種
                      annual_income_min         年収min
                      annual_income_max         年収max
                      published_time            サイト内での掲載時間
                      create_data              クロリングした時間 

              """
        company_name = response.meta.get("company_name", "")
        company_name = re.sub("【(.*?)】", "", company_name)
        company_name = re.sub("\s*", "", company_name)
        link_url = "https://type.jp" + response.meta.get("link_url", "")
        job_name = response.meta.get("job_name", "")
        nearest_station = response.meta.get("nearest_station", "")
        nearest_station = re.sub("\r\n", "", nearest_station)
        # annual_income may be None when the salary icon is missing
        annual_income = response.xpath("//span[@class='ico_salary']/text()").get()
        if annual_income is not None:
            # e.g. "年収:400~700万円" -> min 400, max 700 (tilde may be half- or full-width)
            match = re.search(r"(\d+)[~~〜](\d+)", annual_income)
            if match:
                annual_income_min = match.group(1)
                annual_income_max = match.group(2)
            else:
                annual_income_min = ""
                annual_income_max = ""
        else:
            annual_income_min = ""
            annual_income_max = ""
        published_time = response.meta.get("published_time", "")
        published_time = published_time.strip()

        # print(company_name, link_url, job_name, nearest_station, annual_income_min, annual_income_max)

        longitude, latitude = get_coordinate(company_name)

        type_item = TypeItem()
        type_item["company_name"] = company_name
        type_item["link_url"] = link_url
        type_item["nearest_station"] = nearest_station
        type_item["job_name"] = job_name
        type_item["annual_income_min"] = annual_income_min
        type_item["annual_income_max"] = annual_income_max
        type_item["longitude"] = longitude
        type_item["latitude"] = latitude
        type_item["occupation"] = "営業"
        type_item["source"] = "type"
        type_item["published_time"] = published_time
        type_item["create_data"] = datetime.now()
        yield type_item
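
The docstring above describes the item schema shared by all of these spiders. A minimal sketch of what TypeItem could look like in the project's items.py, assuming plain Scrapy fields (the real class definition is not shown in these examples):

import scrapy


class TypeItem(scrapy.Item):
    # Hypothetical sketch; field names follow the docstring and the assignments above.
    company_name = scrapy.Field()       # company name
    job_name = scrapy.Field()           # position title
    link_url = scrapy.Field()           # posting detail link
    nearest_station = scrapy.Field()    # address
    longitude = scrapy.Field()
    latitude = scrapy.Field()
    source = scrapy.Field()             # data source
    occupation = scrapy.Field()         # job category
    annual_income_min = scrapy.Field()
    annual_income_max = scrapy.Field()
    published_time = scrapy.Field()     # posting time on the site
    create_data = scrapy.Field()        # crawl timestamp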
Example #2
    def parse_detail(self, response):
        """
                      company_name             会社名
                      job_name                ポジション 
                      link_url                募集詳細link
                      nearest_station         住所
                      longitude                 経度
                      latitude                  緯度
                      source                    出所
                      annual_income_min         年収min
                      annual_income_max         年収max
                      occupation                職種
                      published_time            サイト内での掲載時間
                      create_data              クロリングした時間 

              """
        wantedly_item = WantedlyItem()
        company_name = response.meta.get("company_name", "")
        link_url = "https://www.wantedly.com" + response.meta.get(
            "link_url", "")
        job_name = response.meta.get("job_name", "")
        published_time = response.meta.get("published_time", "")
        """サイト内での掲載時間整理"""
        published_time = published_time.strip()
        published_time = re.sub("<(.*)>", "", published_time)
        published_time = published_time.strip()
        """住所の整理"""
        nearest_station = response.xpath(
            "//li/div[@class='company-description']/text()").getall()
        nearest_station = nearest_station[-1]
        nearest_station = nearest_station.strip()

        # print("会社名:", company_name, "url:", link_url, "仕事内容:", job_name, "場所:", nearest_station, "掲載時間:",
        #       published_time)
        """経度緯度の取得"""
        longitude, latitude = get_coordinate(nearest_station)
        """年収(現在は取得出来ない)"""
        annual_income_min = 0
        annual_income_max = 0

        wantedly_item["company_name"] = company_name
        wantedly_item["link_url"] = link_url
        wantedly_item["job_name"] = job_name
        wantedly_item["nearest_station"] = nearest_station
        wantedly_item["longitude"] = longitude
        wantedly_item["latitude"] = latitude
        wantedly_item["annual_income_min"] = annual_income_min
        wantedly_item["annual_income_max"] = annual_income_max
        wantedly_item["occupation"] = "営業"
        wantedly_item["source"] = "wantedly"
        wantedly_item["published_time"] = published_time
        wantedly_item["create_data"] = datetime.now()
        yield wantedly_item
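
Both examples call get_coordinate() but its implementation is not included here. A minimal sketch of what such a helper might look like, assuming it geocodes a company name or address with geopy's Nominatim client and returns a (longitude, latitude) pair; the real helper may use a different geocoding service and return format:

from geopy.geocoders import Nominatim

_geolocator = Nominatim(user_agent="job-crawler")  # hypothetical user agent


def get_coordinate(query):
    """Return (longitude, latitude) for a place name or address, or ("", "") on failure."""
    try:
        location = _geolocator.geocode(query)
    except Exception:
        return "", ""
    if location is None:
        return "", ""
    return location.longitude, location.latitude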
Example #3
    def parse_detail(self, response):
        """
                      company_name             会社名
                      job_name                ポジション 
                      link_url                募集詳細link   https://type.jp
                      nearest_station         住所
                      longitude                 経度
                      latitude                  緯度
                      source                    出所
                      occupation                職種
                      annual_income_min         年収min
                      annual_income_max         年収max
                      published_time            サイト内での掲載時間
                      create_data              クロリングした時間 

         """
        green_item = GreenItem()

        company_name = response.meta.get("company_name", "")
        job_name = response.meta.get("job_name", "")
        link_url = 'https://www.green-japan.com' + response.meta.get("link_url", "")

        """勤務地の整理"""
        nearest_station = response.xpath("//table[@class='detail-content-table js-impression'][2]/tr/td").getall()
        try:
            nearest_station = nearest_station[0]
        except:
            nearest_station = response.xpath("//table[@class='detail-content-table js-impression'][1]/tr/td").getall()
            nearest_station = nearest_station[0]
        regex = re.compile(r"勤務地詳細】<br>(.*?)<br>")
        y = regex.search(nearest_station)
        nearest_station = y.group(1)
        nearest_station = nearest_station.replace(' ', '')
        nearest_station = re.sub("[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?/【】本社:“”■!,。?、~@#¥%……&*()]+", "", nearest_station)

        """サイト内での掲載時間"""
        published_time = response.meta.get("published_time", "")

        """年収の整理"""
        annual_income = response.meta.get("annual_income", "")
        pattern = re.compile(r'<[^>]+>', re.S)
        annual_income = pattern.sub('', annual_income)
        annual_income = annual_income.strip()
        re_text = re.compile("(\d{3,4})万円")

        annual_income = re.findall(re_text, annual_income)
        if len(annual_income) < 2:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[0]
        else:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[1]

        """内容チェック用"""
        # print(f"company_name:{company_name} \n link_url:{link_url}"
        #       f" \n job_name:{job_name} \n nearest_station:{nearest_station}"
        #       f"\n annual_income_min:{annual_income_min} \n annual_income_max:{annual_income_max} \n"
        #       f"published_time:{published_time}")

        """経緯度取得"""
        longitude, latitude = get_coordinate(nearest_station)

        green_item["company_name"] = company_name
        green_item["job_name"] = job_name
        green_item["link_url"] = link_url
        green_item["nearest_station"] = nearest_station
        green_item["longitude"] = longitude
        green_item["latitude"] = latitude
        green_item["source"] = "green"
        green_item["occupation"] = "営業"
        green_item["annual_income_min"] = annual_income_min
        green_item["annual_income_max"] = annual_income_max
        green_item["published_time"] = published_time
        green_item["create_data"] = datetime.now()
        yield green_item
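
Examples #3, #5, #6, and #7 all repeat the same steps for salaries: strip HTML tags, pull out the 3- or 4-digit 万円 figures, sort them, and take the smallest and largest. A sketch of a shared helper that would consolidate that logic (this function does not exist in the examples; it only restates the pattern they inline):

import re

_INCOME_RE = re.compile(r"(\d{3,4})万円")


def parse_income_range(text):
    """Return (annual_income_min, annual_income_max) in 万円, or (0, 0) if no figure is found."""
    figures = [int(value) for value in _INCOME_RE.findall(text or "")]
    if not figures:
        return 0, 0
    return min(figures), max(figures)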
Example #4
    def parse_detail(self, response):
        company_name = response.meta.get("company_name", "")
        company_name = company_name.strip()
        link_url = "https://mynavi.agentsearch.jp" + response.meta.get("link_url", "")
        job_name = response.meta.get("job_name", "")
        pattern = re.compile(r'<[^>]+>', re.S)
        job_name = pattern.sub('', job_name)
        job_name = job_name.strip()

        """住所の整形"""
        nearest_station = response.meta.get("nearest_station", "")
        pattern = re.compile(r'<[^>]+>', re.S)
        nearest_station = pattern.sub('', str(nearest_station))
        nearest_station = nearest_station.strip()
        nearest_station = nearest_station.replace(" ", "")
        nearest_station = re.sub("[<>…'>]", "", nearest_station)

        """年収の整形"""
        annual_income = response.meta.get("annual_income", "")
        annual_income = str(annual_income)
        pattern = re.compile(r'\d{3,4}', re.S)
        annual_income = re.findall(pattern, annual_income)
        n = len(annual_income)
        for i in range(n - 1):
            if int(annual_income[i]) > int(annual_income[i + 1]):
                annual_income[i], annual_income[i + 1] = annual_income[i + 1], annual_income[i]
        try:
            if len(annual_income) == 2:
                annual_income_min = annual_income[0]
                annual_income_max = annual_income[1]
            elif len(annual_income) > 2:
                annual_income_min = annual_income[1]
                annual_income_max = annual_income[-1]
            else:
                annual_income_min = annual_income[0]
                annual_income_max = annual_income[0]
        except:
            annual_income_min = 0
            annual_income_max = 0

        """サイト内での掲載時間の整形"""
        published_time = response.xpath("//div[@class='information cf']/span").getall()
        published_time = published_time[1] + published_time[2]
        published_time = re.sub("<.*?>", "", published_time)

        """検証用"""
        # print(f"company_name:{company_name} \n link_url:{link_url}"
        #       f" \n job_name:{job_name} \n nearest_station:{nearest_station}"
        #       f"\n annual_income_min:{annual_income_min} \n annual_income_max:{annual_income_max} \n"
        #       f"published_time:{published_time}")

        """経度緯度の整理"""
        longitude, latitude = get_coordinate(company_name)

        """
                     company_name             会社名
                     job_name                ポジション 
                     link_url                募集詳細link   https://type.jp
                     nearest_station         住所
                     longitude                 経度
                     latitude                  緯度
                     source                    出所
                     occupation                職種
                     annual_income_min         年収min
                     annual_income_max         年収max
                     published_time            サイト内での掲載時間
                     create_data              クロリングした時間 

        """

        mynav_item = MynaviItem()
        mynav_item["company_name"] = company_name
        mynav_item["job_name"] = job_name
        mynav_item["link_url"] = link_url
        mynav_item["nearest_station"] = nearest_station
        mynav_item["longitude"] = longitude
        mynav_item["latitude"] = latitude
        mynav_item["source"] = "マイナビ"
        mynav_item["occupation"] = "営業"
        mynav_item["annual_income_min"] = annual_income_min
        mynav_item["annual_income_max"] = annual_income_max
        mynav_item["published_time"] = published_time
        mynav_item["create_data"] = datetime.now()
        yield mynav_item
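
The tag-stripping regex re.compile(r'<[^>]+>', re.S) is re-created in several of these callbacks. Scrapy projects already depend on w3lib, whose remove_tags() does the same job; a sketch of the equivalent cleanup for Example #4's fields, assuming the same response.meta inputs:

from w3lib.html import remove_tags

# Inside a callback such as parse_detail above; roughly equivalent to pattern.sub('', ...) plus strip().
job_name = remove_tags(response.meta.get("job_name", "")).strip()
nearest_station = remove_tags(str(response.meta.get("nearest_station", ""))).strip()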
Example #5
    def parse_detail(self, response):
        """
                      company_name             会社名
                      job_name                ポジション 
                      link_url                募集詳細link   https://type.jp
                      nearest_station         住所
                      longitude                 経度
                      latitude                  緯度
                      source                    出所
                      occupation                職種
                      annual_income_min         年収min
                      annual_income_max         年収max
                      published_time            サイト内での掲載時間
                      create_data              クロリングした時間 

              """
        dota_item = DodaItem()

        company_name = response.meta.get("company_name", "")
        company_name = re.sub("【(.*?)】|\((.*?)\)", "", company_name)
        job_name = response.meta.get("job_name", "")
        link_url = response.meta.get("link_url", "")
        nearest_station = response.meta.get("nearest_station", "")
        nearest_station = nearest_station.split("、")
        try:
            nearest_station = nearest_station[0]
            if "駅" not in nearest_station:
                nearest_station = "東京"
        except:
            nearest_station = "東京"
        """サイト内での掲載時間"""
        published_time = response.xpath("//p[@class='meta_text']/text()").get()
        """年収の整理"""
        annual_income = response.meta.get("annual_income", "")
        pattern = re.compile(r'<[^>]+>', re.S)
        annual_income = pattern.sub('', annual_income)
        re_text = re.compile("(\d{3,4})万円")
        annual_income = re.findall(re_text, annual_income)
        # Sort ascending so min/max can be read from the ends
        annual_income.sort(key=int)
        if len(annual_income) >= 2:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[-1]
        elif len(annual_income) == 1:
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[0]
        else:
            annual_income_min = 0
            annual_income_max = 0

        # print(f"company_name:{company_name} \n link_url:{link_url}"
        #       f" \n job_name:{job_name} \n nearest_station:{nearest_station}"
        #       f"\n annual_income_min:{annual_income_min} \n annual_income_max:{annual_income_max} \n"
        #       f"published_time:{published_time}")

        longitude, latitude = get_coordinate(nearest_station)

        dota_item["company_name"] = company_name
        dota_item["job_name"] = job_name
        dota_item["link_url"] = link_url
        dota_item["nearest_station"] = nearest_station
        dota_item["longitude"] = longitude
        dota_item["latitude"] = latitude
        dota_item["annual_income_min"] = annual_income_min
        dota_item["annual_income_max"] = annual_income_max
        dota_item["occupation"] = "営業"
        dota_item["source"] = "doda"
        dota_item["published_time"] = published_time
        dota_item["create_data"] = datetime.now()
        yield dota_item
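
Each parse_detail above reads its inputs from response.meta, which implies a listing-page callback collects those values and passes them along when following each detail link. A minimal sketch of that hand-off for a spider like the doda one; the CSS selectors and field names below are illustrative assumptions, not taken from the real listing parser:

    def parse(self, response):
        # Hypothetical listing parser: the selectors are placeholders.
        for card in response.css("div.job-card"):
            meta = {
                "company_name": card.css("p.company::text").get(""),
                "job_name": card.css("h2.title::text").get(""),
                "link_url": card.css("a::attr(href)").get(""),
                "nearest_station": card.css("span.place::text").get(""),
                "annual_income": card.css("span.salary").get(""),
            }
            yield response.follow(meta["link_url"], callback=self.parse_detail, meta=meta)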
Example #6
    def parse_detail(self, response):
        """
                     company_name             会社名
                     job_name                ポジション 
                     link_url                募集詳細link   https://type.jp
                     nearest_station         住所
                     longitude                 経度
                     latitude                  緯度
                     source                    出所
                     occupation                職種
                     annual_income_min         年収min
                     annual_income_max         年収max
                     published_time            サイト内での掲載時間
                     create_data              クロリングした時間 

        """
        en_item = EnItem()
        company_name = response.meta.get("company_name", "")
        company_name = re.sub("((.*?))", "", company_name)
        link_url = "https://employment.en-japan.com" + response.meta.get("link_url", "")
        job_name = response.meta.get("job_name", "")
        nearest_station = response.meta.get("nearest_station", "")

        """サイト内での掲載時間の整理"""
        published_time = response.meta.get("published_time", "")
        pattern = re.compile(r'<[^>]+>', re.S)
        published_time = pattern.sub('', published_time)
        published_time = published_time.split()
        try:
            published_time = published_time[1]
        except:
            published_time = published_time[0]

        """年収の取得および整理"""
        categoryIcon_money = response.xpath("//div[@class='categoryIcon money']")
        if categoryIcon_money:
            annual_income = response.xpath("//div[@class='categorySet moneyCategorySet']/div[@class='categoryData']").get()
            pattern = re.compile(r'<[^>]+>', re.S)
            annual_income = pattern.sub('', annual_income)
            re_text = re.compile("(\d{3,4})万円")

            annual_income = re.findall(re_text, annual_income)

            n = len(annual_income)
            for i in range(n):
                for j in range(0, n - i - 1):
                    if int(annual_income[j]) > int(annual_income[j + 1]):
                        annual_income[j], annual_income[j + 1] = annual_income[j + 1], annual_income[j]
            annual_income_min = annual_income[0]
            annual_income_max = annual_income[-1]
        else:
            annual_income_min = 0
            annual_income_max = 0

        """内容チェック用"""
        # print(f"company_name:{company_name} \n link_url:{link_url}"
        #       f" \n job_name:{job_name} \n nearest_station:{nearest_station}"
        #       f"\n annual_income_min:{annual_income_min} \n annual_income_max:{annual_income_max} \n"
        #       f"published_time:{published_time}")

        """経緯度の取得"""
        longitude, latitude = get_coordinate(company_name)

        en_item["company_name"] = company_name
        en_item["link_url"] = link_url
        en_item["job_name"] = job_name
        en_item["nearest_station"] = nearest_station
        en_item["longitude"] = longitude
        en_item["latitude"] = latitude
        en_item["source"] = "エン転職"
        en_item["occupation"] = "営業"
        en_item["annual_income_min"] = annual_income_min
        en_item["annual_income_max"] = annual_income_max
        en_item["published_time"] = published_time
        en_item["create_data"] = datetime.now()
        yield en_item
Example #7
    def parse_detail(self, response):

        company_name = response.meta.get("company_name", "")
        link_url = "https://next.rikunabi.com" + response.meta.get(
            "link_url", "")
        job_name = response.meta.get("job_name", "")
        annual_income = response.meta.get("annual_income", "")
        nearest_station = response.meta.get("nearest_station", "")
        """会社名の整理"""
        re_text = re.compile("(\w*)株式会社(\w*)")
        company_name = re.search(re_text, company_name)
        company_name = company_name.group()
        """最高と最低年収の整理"""
        annual_income = re.sub("<.*>", "", annual_income)
        re_text = re.compile("(\d{3,4})万円/")
        annual_income = re.findall(re_text, annual_income)
        n = len(annual_income)
        for i in range(n - 1):
            if int(annual_income[i]) > int(annual_income[i + 1]):
                annual_income[i], annual_income[i + 1] = annual_income[
                    i + 1], annual_income[i]
        try:
            if len(annual_income) == 2:
                annual_income_min = annual_income[0]
                annual_income_max = annual_income[1]
            elif len(annual_income) > 2:
                annual_income_min = annual_income[1]
                annual_income_max = annual_income[-1]
            else:
                annual_income_min = annual_income[0]
                annual_income_max = annual_income[0]
        except IndexError:
            annual_income_min = 0
            annual_income_max = 0
        """勤務地の整理"""
        nearest_station = nearest_station.strip()
        """経度緯度の整理"""
        longitude, latitude = get_coordinate(company_name)
        """サイト内での掲載時間"""
        published_time = response.xpath(
            "//p[@class='rnn-inlineBlock rnn-offerInfoHeader__date rnn-textM']/text()"
        ).get()
        """検証用"""
        # print(f"company_name:{company_name} \n link_url:{link_url}"
        #       f" \n job_name:{job_name} \n nearest_station:{nearest_station}"
        #       f"\n annual_income_min:{annual_income_min} \n annual_income_max:{annual_income_max} \n"
        #       f"published_time:{published_time}")
        """
                     company_name             会社名
                     job_name                ポジション 
                     link_url                募集詳細link   https://type.jp
                     nearest_station         住所
                     longitude                 経度
                     latitude                  緯度
                     source                    出所
                     occupation                職種
                     annual_income_min         年収min
                     annual_income_max         年収max
                     published_time            サイト内での掲載時間
                     create_data              クロリングした時間 

        """
        next_rikunabi_item = NextRikuabiItem()
        next_rikunabi_item["company_name"] = company_name
        next_rikunabi_item['job_name'] = job_name
        next_rikunabi_item["link_url"] = link_url
        next_rikunabi_item["nearest_station"] = nearest_station
        next_rikunabi_item["longitude"] = longitude
        next_rikunabi_item["latitude"] = latitude
        next_rikunabi_item["source"] = "next_rikunabi"
        next_rikunabi_item["occupation"] = "営業"
        next_rikunabi_item["annual_income_min"] = annual_income_min
        next_rikunabi_item["annual_income_max"] = annual_income_max
        next_rikunabi_item["published_time"] = published_time
        next_rikunabi_item["create_data"] = datetime.now()
        yield next_rikunabi_item
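
All seven callbacks assume the same module-level imports in each spider file. A header that would make the snippets self-contained might look like the following; the item classes and get_coordinate are assumed to live in the project's items and utils modules, and those module paths are guesses rather than something shown in the examples:

import re
from datetime import datetime

from ..items import NextRikuabiItem   # or TypeItem, WantedlyItem, GreenItem, MynaviItem, DodaItem, EnItem
from ..utils import get_coordinate    # hypothetical location of the geocoding helper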