def Vacancy_info(url):
    """Scrape a jobs24.ge vacancy page and return its description and e-mail.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_ka", "description_ru", "description_en"
        and "email" (any value may be an empty string).
    """
    print(url)
    page = requests.get(url)

    # Description: fixed position in the page layout; any parsing failure
    # degrades to an empty description instead of crashing.
    try:
        description = Selector(response=page).xpath(
            '/html/body/table[2]/tr/td[2]/div/table/tr[2]/td[2]/table/tr/td/div[6]'
        ).get()
        description = remove_tags(description).strip()
    except Exception:
        description = ""
    if description is None:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty/undetectable input, so a
    # vacancy without a parsable description used to crash the whole function.
    # Detect once (langdetect is not deterministic across calls) and treat
    # failures as "unknown language".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: first address-like token inside the description, if any.
    matches = re.findall(r'[\w\.-]+@[\w\.-]+', description)
    email = matches[0] if matches else ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    print("Info Scraped Successfully")
    return data
def Vacancy_info(url):
    """Scrape a jobs.ge vacancy page and return its description and e-mail.

    The URL is rewritten from the English ("/en/") to the Georgian ("/ge/")
    version of the page before fetching.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_ka", "description_ru", "description_en"
        and "email" (any value may be an empty string).
    """
    url = url.replace("/en/", "/ge/")
    print(url)
    page = requests.get(url)

    # Description: fixed-position table in the page layout; any parsing
    # failure degrades to an empty description instead of crashing.
    try:
        description = Selector(response=page).xpath(
            '//*[@id="job"]/table/tr[1]/td/table[2]').get()
        description = remove_tags(description).strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
        print(description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so any vacancy whose description failed to parse crashed
    # the whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: first address-like token inside the description, if any.
    matches = re.findall(r'[\w\.-]+@[\w\.-]+', description)
    email = matches[0] if matches else ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    return data
def Vacancy_info(url):
    """Scrape a vacancy page and return description, contact and meta data.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_en", "description_ka", "description_ru",
        "email", "location" (list of {"city", "id"} dicts), "category" and
        "stack".
    """
    print(url)
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[4]').get()
        description = remove_tags(description).strip()
        description = re.sub(r"\s+", " ", description)
        print(description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so a vacancy without a parsable description crashed the
    # whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: taken from a mailto: link rather than the description text.
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div/a/@href'
        ).get()
        email = email.replace("mailto:", "")
    except Exception:
        email = ""

    # Location: resolve the city name to a GeoNames id; "611717" is used as
    # the fallback id (presumably Tbilisi — confirm) when lookup fails.
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[1]/div[2]/span/text()'
        ).get()
        location_id = []
        try:
            location_id.append({
                "city": f"{location}",
                "id": f"{Geonames(location)}"
            })
        except Exception:
            location_id.append({"city": f"{location}", "id": "611717"})
    except Exception:
        location_id = [{"city": "Tbilisi", "id": "611717"}]

    # Category
    try:
        category = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/span[1]/text()'
        ).get()
    except Exception:
        category = ""

    # Stack: the Georgian phrase for "full time" maps to "Full-Stack"; any
    # other text passes through unchanged (a None value falls to the except).
    try:
        stack = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[4]/div[2]/text()'
        ).get()
        if "სრული განაკვეთი" in stack:
            stack = "Full-Stack"
    except Exception:
        stack = ""

    data = {
        "description_en": description_en,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "email": email,
        "location": location_id,
        "category": category,
        "stack": stack
    }
    print("Vacancy Scraped Succesfully")
    return data
# NOTE(review): fragment — the enclosing function's `def` header and its
# opening statements live outside this chunk, and the original line breaks
# were stripped, so the code below is left byte-identical. It appears to
# finish a language-branch assignment, then scrape "Location" and "Activity"
# tags; "611717" is presumably the GeoNames fallback city id — confirm.
description_ka = "" elif detect(description) == "et": description_ru = "" description_en = Translate(description) description_ka = description else: description_ru = "" description_en = description description_ka = "" try: location = Selector(response=page).xpath('/html/body/div/div[2]/main/div[4]/div[2]/a[contains(.,"Location")]').get() location = location.split('"tagDesc">') location = location[1].split('</') location = location[0].rstrip() location = location.lstrip() location_id = [] try: location_id.append({ "city" : f"{location}", "id" : f"{Geonames(location)}" } ) except: location_id.append({ "city" : f"{location}", "id" : "611717" }) except: location_id = [{"city" : "Tbilisi", "id" : "611717"}] try: activity = Selector(response=page).xpath('/html/body/div/div[2]/main/div[4]/div[2]/a[contains(.,"Activity")]').get() activity = activity.split('"tagDesc">') activity = activity[1].split('</') activity = activity[0].rstrip() activity = activity.lstrip()
# NOTE(review): fragment — continuation of a phone/web scraping section whose
# `try:` opener and enclosing `def` are outside this chunk; left byte-identical.
# Parses "+<code> <number>" phone entries (comma-separated list or a single
# entry) into {"country_code", "number"} dicts, then begins the website field.
'//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get( ) phone = remove_tags(phone) if "," in phone: array = phone.split(",") phone = [] for each in array: each = each.lstrip() each = each.rstrip() each = each.split(" ", 1) code = each[0] code = code.replace("+", "") number = each[1] phone.append({"country_code": code, "number": number}) else: phone = phone.lstrip() add = phone.rstrip() add = add.split(" ", 1) code = add[0] code = code.replace("+", "") number = add[1] phone = [{"country_code": code, "number": number}] except: phone = [] # Web try: web = Selector(response=page).xpath( '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get( ) web = remove_tags(web)
def BiaFunction(company):
    """Look up *company* on bia.ge and scrape its public company profile.

    Uses the module-level Selenium `driver` to run the search, then fetches
    the linked company page over plain HTTP and parses it with Selector.

    Parameters:
        company: company name typed into the bia.ge search box.

    Returns:
        dict with keys "name", "vat", "addresses", "business_hours",
        "phones", "websites", "emails" and "foundation_date", or the string
        "No info" when the search yields nothing usable.
    """
    driver.get("https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(f"{company}")
    time.sleep(3)  # give the search results time to render

    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)

        # Company name
        name = Selector(response=page).xpath(
            '//*[@id="TrademarksListBox"]/li/text()').get()

        # VAT number
        vat_number = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address: expected layout "postal_code, city, region, appartment";
        # any deviation degrades to an empty dict (the error is printed).
        try:
            address = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get()
            raw = address.split(",")
            postal_code = raw[0]
            location = raw[1].lstrip()
            region = raw[2]
            appartment = raw[3]
            city_id = Geonames(location)
            address = {
                "location": {
                    "country": "GE",
                    "city": {
                        "id": f"{city_id}",
                        "city": location
                    }
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region
            }
        except Exception as e:
            print(e)
            address = {}

        # Working hours: e.g. "monday-friday: 09:00 - 18:00". Collect the
        # weekdays up to and including the closing day of the range.
        try:
            working_hours = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get()
            raw = working_hours.split(":", 1)
            till = raw[0].split("-")[1].lstrip().lower()
            days = []
            for day in weekdays:
                days.append(day)
                if day == till:
                    break
            hours = raw[1].split("-")
            business_hours = {
                "week_days": days,
                "hour_from": hours[0].strip(),
                "hour_to": hours[1].strip()
            }
        except Exception:
            business_hours = {}

        # Foundation date
        foundation_date = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phones: "+<code> <number>[, ...]" — split on commas, then split each
        # entry into a country code and a space-stripped number.
        try:
            phone = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get()
            phone = remove_tags(phone)
            entries = phone.split(",") if "," in phone else [phone]
            phone = []
            for entry in entries:
                code, number = entry.strip().split(" ", 1)
                phone.append({
                    "country_code": code.replace("+", ""),
                    "number": number.replace(" ", "")
                })
        except Exception:
            phone = []

        # Websites: comma-separated list inside a single span.
        try:
            web = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get()
            web = remove_tags(web)
            if "," in web:
                web = [part.strip() for part in web.split(",")]
            else:
                web = [web.strip()]
        except Exception:
            web = []

        # Emails: scrape the whole tab panel, drop the masked placeholder,
        # then pick out every address-like token.
        try:
            email = Selector(response=page).xpath('//*[@id="TabPanelBox"]').get()
            email = email.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
        except Exception:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        return info
    except Exception:
        print("No info")
        return "No info"

# NOTE(review): a large block of commented-out Selenium exploration code (and
# one stray assignment produced by stripped line breaks) previously followed
# this function; it was dead code and has been removed.
# NOTE(review): fragment — starts mid-`try` of a deadline parser and ends
# mid-`if`; the enclosing function is outside this chunk, so the code below is
# left byte-identical.
# BUG(review): `lcoation = location.lstrip()` is a typo — the lstrip() result
# is assigned to a dead variable, so `location` keeps any leading whitespace.
# Also note the pointless f-string prefixes on the constant xpath literals.
deadline_month = int(months[f"{ends.split(' ')[1]}"]) deadline_year = year except: deadline_day = 0 deadline_month = 0 deadline_year = 0 user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" page = requests.get(url, headers={"User-Agent": user_agent}) try: location = Selector(response=page).xpath( f'//*[@id="main_content"]/div[3]/form/div/div[2]/table/tr[contains(.,"Location")]/td[2]/span[1]/text()' ).get() location = location.rstrip() lcoation = location.lstrip() except: location = "" try: salary = Selector(response=page).xpath( f'//*[@id="main_content"]/div[3]/form/div/div[2]/table/tr[contains(.,"Salary")]/td[2]/span[1]/text()' ).get() salary = salary.rstrip() salary = salary.lstrip() if salary == "": salary = 0 print(salary) except: salary = 0 if "-" in salary:
# NOTE(review): fragment — publish-date and location section of a scraper
# whose `def` header is outside this chunk; left byte-identical. Extracts the
# city from a Georgian-labelled ("მდებარეობა:" = "location") div and resolves
# it via Geonames; "611717" is presumably the GeoNames fallback id for
# Tbilisi — confirm. Note `if "-" in salary:` at the end of the previous
# fragment suggests these chunks belong to the same mangled file.
publish_month = int(published[1]) publish_year = int(published[2].split("/")[0]) except: # print(e) publish_day = "" publish_month = "" publish_year = "" # Location try: location = Selector(response=page).xpath( '//*[@id="main-body"]/div[2]/div/div[1]/div[1]/div[1]/div/div[2]/div/div/div[1]/div[contains(.,"მდებარეობა:")]' ).get() location = location.split('<span>') location = location[1].split('</span>')[0] location = location.lstrip() location = location.rstrip() location_id = [] try: location_id.append({ "city": f"{location}", "id": f"{Geonames(location)}" }) except: location_id.append({ "city": f"{location}", "id": "611717" }) except: location_id = [{"city": "Tbilisi", "id": "611717"}]
def Vacancy(link, cookies):
    """Scrape a cv.ge vacancy page and return its details.

    Parameters:
        link: absolute URL of the vacancy page.
        cookies: raw Cookie header value for the scraping session.

    Returns:
        dict with keys "stack", "education", "languages", "email", "logo",
        "description", "description_ka", "description_ru" and
        "description_en" (values may be empty strings).
    """
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    cookies = {
        "Cookie": cookies
    }
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36"}
    # BUG FIX: `headers` was built but never passed to requests.get(), so the
    # request went out with the default python-requests User-Agent.
    page = requests.get(url, cookies=cookies, headers=headers)

    # Stack
    try:
        stack = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[1]/span[2]/text()').get()
    except Exception:
        stack = ""

    # Education: text between the "Education:" label and the closing span.
    try:
        education = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Education:")]').get()
        education = education.split("</strong>")[1]
        education = education.split("</span>")[0].strip()
    except Exception:
        education = ""

    # Languages: same label/markup pattern as education.
    try:
        languages = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Languages:")]').get()
        languages = languages.split("</strong>")[1]
        languages = languages.split("</span>")[0].strip()
    except Exception:
        languages = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[2]/div/div/span/a/text()').get()
    except Exception:
        email = ""
    if email is None:
        email = ""

    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[3]/div/div/figure/img/@src').get()
    except Exception:
        logo = ""
    if logo is None:
        logo = ""

    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[1]/div[1]/article/div').get()
        description = remove_tags(description).strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so a vacancy without a parsable description crashed the
    # whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    data = {
        "stack": stack,
        "education": education,
        "languages": languages,
        "email": email,
        "logo": logo,
        "description": description,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en
    }
    print("Vacancy scraped succesfully")
    return data