def _bia_parse_address(raw_address):
    """Turn BIA's comma-separated legal address string into the address dict.

    The first four comma-separated fields are used, in order, as postal
    code, city, region and apartment.  Raises (AttributeError / IndexError)
    when the value is missing or has fewer than four fields; the caller maps
    that to an empty dict.
    """
    fields = raw_address.split(",")
    postal_code = fields[0]
    city = fields[1].lstrip()
    region = fields[2]
    appartment = fields[3]
    # Geonames is only consulted once all four fields are known to exist.
    city_id = Geonames(city)
    return {
        "location": {
            "country": "GE",
            "city": {
                "id": str(city_id),
                "city": city,
            },
        },
        "postal_code": postal_code,
        "appartament": appartment,
        "region": region,
    }


def _bia_parse_business_hours(raw_hours):
    """Parse a "<day> - <day>: <from> - <to>" string into the hours dict.

    Raises when the value is missing or lacks the ":" / "-" separators; the
    caller maps that to an empty dict.
    """
    day_part, hour_part = raw_hours.split(":", 1)
    last_day = day_part.split("-")[1].lstrip().lower()

    # Collect every entry of the module-level ``weekdays`` up to and including
    # the closing day (all of them when the closing day is not in the list).
    # NOTE(review): the opening day is ignored, so the range always starts at
    # the first entry of ``weekdays`` -- confirm this is intended.
    days = []
    for day in weekdays:
        days.append(day)
        if day == last_day:
            break

    hours = hour_part.split("-")
    return {
        "week_days": days,
        "hour_from": hours[0].strip(),
        "hour_to": hours[1].strip(),
    }


def _bia_parse_phones(raw_phones):
    """Split "+<code> <number>, +<code> <number>, ..." into phone dicts.

    Raises when any entry has no space separating the code from the number;
    the caller maps that to an empty list.
    """
    phones = []
    for entry in raw_phones.split(","):
        code, number = entry.strip().split(" ", 1)
        phones.append({
            "country_code": code.replace("+", ""),
            "number": number.replace(" ", ""),
        })
    return phones


def _bia_parse_websites(raw_websites):
    """Split a comma-separated website string into a list of trimmed entries."""
    return [site.strip() for site in raw_websites.split(",")]


def _bia_parse_emails(panel_html):
    """Return every e-mail-looking token in *panel_html*, skipping the mask."""
    # The literal "*****@*****.**" placeholder would otherwise match the
    # pattern below, so it is removed first.
    unmasked = panel_html.replace("*****@*****.**", "")
    return re.findall(r'[\w\.-]+@[\w\.-]+', unmasked)


def BiaFunction(company):
    """Search bia.ge for *company* and scrape the resulting company page.

    The module-level ``driver`` opens https://www.bia.ge/EN and types the
    company into the ``Filter_Query`` box; after a fixed 3 second pause the
    ``data-url`` attribute of the element at ``/html/body/div[8]/div[2]`` is
    fetched with ``requests`` and parsed with XPath.

    Args:
        company: Value typed into the search box (converted with ``str``).

    Returns:
        A dict with the keys ``name``, ``vat``, ``addresses``,
        ``business_hours``, ``phones``, ``websites``, ``emails`` and
        ``foundation_date``.  Address and business hours fall back to ``{}``
        and phones/websites/emails to ``[]`` when they cannot be parsed.
        The string ``"No info"`` is returned instead when the result element
        cannot be found, the download fails, or any other step outside those
        per-field fallbacks raises.

    Raises:
        Whatever ``driver`` raises while opening the site or typing the
        query; those calls are deliberately outside the "No info" handler,
        as in the original code.
    """
    # NOTE(review): find_element_by_xpath was removed in Selenium 4.3, so this
    # relies on an older Selenium being installed -- confirm the pinned version.
    driver.get("https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(
        str(company))
    # Fixed pause before reading the result element (presumably to let the
    # search suggestions render -- confirm).
    time.sleep(3)

    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)
        # Parse the response once and reuse the selector for every field.
        sel = Selector(response=page)

        # Company name
        # NOTE(review): the name is read from the trademarks list, not from
        # the "CompanyNameBox" element -- confirm this is intended.
        name = sel.xpath('//*[@id="TrademarksListBox"]/li/text()').get()

        # Vat number
        vat_number = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address
        try:
            address = _bia_parse_address(sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get())
        except Exception as e:
            print(e)
            address = {}

        # Working hours
        try:
            business_hours = _bia_parse_business_hours(sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get())
        except Exception:
            business_hours = {}

        # Foundation Date
        foundation_date = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phone
        try:
            phone = _bia_parse_phones(remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get()))
        except Exception:
            phone = []

        # Web
        try:
            web = _bia_parse_websites(remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get()))
        except Exception:
            web = []

        # Email
        try:
            email = _bia_parse_emails(
                sel.xpath('//*[@id="TabPanelBox"]').get())
        except Exception:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date,
        }
        print("Bia Scraped Successfully")
        return info
    except Exception:
        # Deliberate best-effort: any scraping failure is reported to the
        # caller as the sentinel string.  ``Exception`` (not a bare except)
        # so that Ctrl+C / SystemExit still propagate.
        print("No info")
        return "No info"


# Fields that an earlier, commented-out Selenium-only version of this scraper
# also read from the company page but that BiaFunction does not return:
#   logo                    element id "LogoImageUploaderBox" ("style" attribute)
#   name                    element id "CompanyNameBox"
#   legal_form              //*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[1]/span[2]
#   registration_number     //*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[1]/span[2]
#   registration_authority  //*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[1]/span[2]
#   status                  //*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[1]/span[2]
#   brands                  //*[@id="tpAboutCompany"]/table/tbody/tr[1]/td[2]/span[2]
# That version stored its result with mycol.insert_one(...).
def Vacancy(link):
    """Download one vacancy page and extract its details.

    Args:
        link: URL of the vacancy page; fetched with ``requests.get``.

    Returns:
        A dict with the keys:

        * ``location`` -- one-element list ``[{'city': ..., 'id': ...}]``
          built from the "Location:" table row and ``Geonames``; falls back
          to the hard-coded Yerevan entry when the row cannot be parsed.
        * ``c_link`` -- first ``href`` found in the "Company:" table row, or
          ``""``.
        * ``description_am`` / ``description_en`` -- see the language
          handling below; both ``""`` when detection fails.
        * ``email`` -- one-element list with the ``mailto:`` target of the
          link under ``#job``, or ``[]``.

    Raises:
        Whatever ``requests.get`` raises; the download itself is not guarded.
    """
    print("request sent for Vacancy succesfully")
    page = requests.get(link)

    # Parse the page once.  If parsing itself fails, ``sel`` stays None so
    # that every lookup below raises and lands in its own fallback, which is
    # what re-parsing inside each ``try`` used to achieve.
    try:
        sel = Selector(response=page)
    except Exception:
        sel = None

    # Location
    try:
        row = sel.xpath(
            '/html/body/div[2]/table/tr[contains(., "Location:")]').get()
        # The value is cut out of the row's serialized HTML: text between
        # the first "<td>" and the following "</td>".
        cell = row.split("<td>")[1].split("</td>")[0].replace("&nbsp", " ")
        city = cell.split(",")[0]
        location = [{'city': city, 'id': Geonames(city)}]
    except Exception:
        location = [{'city': 'Yerevan', 'id': '616052'}]

    # Company url
    try:
        row = sel.xpath(
            '/html/body/div[2]/table/tr[contains(., "Company:")]').get()
        c_url = row.split('href="')[1].split('">')[0]
    except Exception:
        c_url = ""

    # Vacancy Description
    try:
        description = remove_tags(
            sel.xpath('/html/body/div[4]').get()).strip().replace(
                '&nbsp', " ")
    except Exception:
        description = ""

    # NOTE(review): "et" is the language code that selects the
    # translate-to-English branch (original text kept in description_am) --
    # presumably what the detector reports for this site's Armenian text;
    # confirm.
    try:
        needs_translation = detect(description) == "et"
    except Exception:
        description_en = ""
        description_am = ""
    else:
        if needs_translation:
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""

    # Email
    try:
        href = sel.xpath('//*[@id="job"]/a/@href').get()
        email = [href.replace('mailto:', "")]
    except Exception:
        email = []

    return {
        "location": location,
        "c_link": c_url,
        "description_am": description_am,
        "description_en": description_en,
        "email": email,
    }
def Vacancy(link):
    """Download one vacancy page and extract its details.

    Static fields are read from the ``requests`` response with XPath; the
    e-mail addresses are read from the same page loaded in the module-level
    ``driver`` (presumably because they are only present after rendering --
    confirm).

    Args:
        link: URL of the vacancy page.

    Returns:
        A dict with the keys:

        * ``location`` -- one-element list ``[{"city": ..., "id": ...}]``;
          falls back to the hard-coded Yerevan entry.
        * ``website`` -- one-element list with the link's ``href``, or ``[]``.
        * ``job_type``, ``publish_day``, ``gender`` -- stripped text, or
          ``""`` when the node is missing.
        * ``salary`` -- integer with "֏", commas and spaces removed, or ``0``.
        * ``description_am`` / ``description_en`` -- see the language
          handling below; both ``""`` when detection fails.
        * ``email`` -- list of all e-mail-looking tokens in the description
          paragraph, or ``[]``.

    Raises:
        Whatever ``requests.get`` raises; the download itself is not guarded.
    """
    print("request sent for Vacancy succesfully")
    print(link)
    page = requests.get(link)

    # Parse the page once.  If parsing itself fails, ``sel`` stays None so
    # that every lookup below raises and lands in its own fallback, which is
    # what re-parsing inside each ``try`` used to achieve.
    try:
        sel = Selector(response=page)
    except Exception:
        sel = None

    def stripped_text(xpath):
        """Return the stripped first match of *xpath*, or "" if unavailable."""
        try:
            return sel.xpath(xpath).get().strip()
        except Exception:
            return ""

    # Location
    try:
        city = sel.xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[3]/a/text()'
        ).get().strip().split(",")[0]
        location = [{"city": city, "id": Geonames(city)}]
    except Exception:
        location = [{"city": "Yerevan", "id": "616052"}]

    # Website
    try:
        href = sel.xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[4]/a/@href'
        ).get()
        website = [] if href is None else [href]
    except Exception:
        website = []

    # Job Type
    # The original handler was the bare expression ``job_type``, which left
    # the value as None (or raised UnboundLocalError); it now falls back to
    # "" like the sibling text fields.
    job_type = stripped_text(
        '/html/body/main/section/div/div[2]/div/ul/li[3]/text()')

    # Published
    published = stripped_text(
        '/html/body/main/section/div/div[2]/div/ul/li[7]/text()')

    # Salary
    try:
        salary_text = sel.xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[2]/text()'
        ).get().strip()
        for noise in ("֏", ",", " "):
            salary_text = salary_text.replace(noise, "")
        salary = int(salary_text)
    except Exception:
        salary = 0

    # Gender
    gender = stripped_text(
        '/html/body/main/section/div/div[2]/div/ul/li[4]/text()[2]')

    # Description
    try:
        description = remove_tags(sel.xpath(
            '/html/body/main/section/div/div[2]/div/p').get()).strip()
    except Exception:
        description = ""

    # NOTE(review): "et" is the language code that selects the
    # translate-to-English branch (original text kept in description_am) --
    # presumably what the detector reports for this site's Armenian text;
    # confirm.
    try:
        needs_translation = detect(description) == "et"
    except Exception:
        description_en = ""
        description_am = ""
    else:
        if needs_translation:
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""

    # Email
    # NOTE(review): find_element_by_xpath was removed in Selenium 4.3, so this
    # relies on an older Selenium being installed -- confirm the pinned version.
    try:
        driver.get(link)
        rendered = driver.find_element_by_xpath(
            '/html/body/main/section/div/div[2]/div/p').text
        email = re.findall(r'[\w\.-]+@[\w\.-]+', rendered)
    except Exception:
        email = []

    return {
        "location": location,
        "website": website,
        "job_type": job_type,
        "publish_day": published,
        "salary": salary,
        "gender": gender,
        "description_am": description_am,
        "description_en": description_en,
        "email": email,
    }


# Manual smoke test:
# Vacancy("https://www.worknet.am/en/job/%D5%A2%D5%A1%D5%B6%D5%BE%D5%B8%D6%80-%D5%BA%D5%A1%D5%B0%D5%A5%D5%BD%D5%BF%D5%AB-%D5%A1%D5%B7%D5%AD%D5%A1%D5%BF%D5%A1%D5%AF%D5%AB%D6%81-4656")
region = raw[2] else: location = raw[2].strip() region = raw[1] appartment = raw[3] address = { "location": location, "postal_code": postal_code, "appartament": appartment, "region": region } except: print("OOOps") address = [] try: location_id = Geonames(location) except: location_id = None # Splitting house number from a street possibilities1 = [ "str.", "Str.", "Ave.", "ave.", "ln.", "Ln.", "Plateau", "settlement", "mass.", "sq.", "Sq.", "In.", "In ", "Highway", "Alley", "cul-de-sac", "(Temka)", "Plot.", "Range", "Ascent", "Embankment", "Q.", "Dead-end", "Descent", "city", "Khevi", "passage", "Sector", "Lane", "m/d", "cul-de-ac", "str", "Plot", "Mount", "square", "zone" ] possibilities2 = [ "Vazha-Pshavela", "Varketili",