コード例 #1
0
ファイル: bia.py プロジェクト: Caravan2/scripts
def _bia_address(selector):
    """Return the company's legal address as a structured dict.

    The address cell is split on commas and its first four parts are used
    as postal code, city, region and apartment, in that order. The city is
    resolved to an id through ``Geonames``. Any failure (missing cell,
    fewer than four parts, lookup error) is printed and ``{}`` is returned.
    """
    try:
        text = selector.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
        ).get()
        # Unpacking fails before the Geonames lookup when parts are missing.
        postal_code, city, region, appartment = text.split(",")[:4]
        city = city.lstrip()
        city_id = Geonames(city)
        return {
            "location": {
                "country": "GE",
                "city": {
                    "id": f"{city_id}",
                    "city": city
                }
            },
            "postal_code": postal_code,
            # NOTE: the misspelled key is part of the emitted schema.
            "appartament": appartment,
            "region": region
        }
    except Exception as e:
        print(e)
        return {}


def _bia_business_hours(selector):
    """Return working days and hours, or ``{}`` if they cannot be parsed.

    The text is split once on ":" into a day range and an hour range, each
    of which is then split on "-". ``week_days`` holds every entry of the
    module-level ``weekdays`` sequence from its first element up to and
    including the closing day of the range.
    """
    try:
        text = selector.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
        ).get()
        day_range, hour_range = text.split(":", 1)
        last_day = day_range.split("-")[1].lstrip().lower()

        # NOTE(review): the opening day of the range is ignored, so the
        # list always starts at weekdays[0]; if ``last_day`` never matches,
        # every entry of ``weekdays`` is included.
        days = []
        for day in weekdays:
            days.append(day)
            if day == last_day:
                break

        hours = hour_range.split("-")
        return {
            "week_days": days,
            "hour_from": hours[0].strip(),
            "hour_to": hours[1].strip()
        }
    except Exception:
        return {}


def _bia_phones(selector):
    """Return phone numbers as ``{"country_code", "number"}`` dicts.

    The contact cell may hold several comma-separated numbers. Each one is
    split at its first space into a country code (leading "+" removed) and
    the remaining number (inner spaces removed). A single malformed entry
    discards the whole list and ``[]`` is returned.
    """
    try:
        text = remove_tags(
            selector.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get())
        phones = []
        # split(",") on a comma-free string yields one element, so a single
        # number and a list of numbers share the same code path.
        for entry in text.split(","):
            code, number = entry.strip().split(" ", 1)
            phones.append({
                "country_code": code.replace("+", ""),
                "number": number.replace(" ", "")
            })
        return phones
    except Exception:
        return []


def _bia_websites(selector):
    """Return the comma-separated websites as a list of trimmed strings."""
    try:
        text = remove_tags(
            selector.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get())
        return [site.strip() for site in text.split(",")]
    except Exception:
        return []


def _bia_emails(selector):
    """Return every e-mail-looking token found in the tab panel markup."""
    try:
        markup = selector.xpath('//*[@id="TabPanelBox"]').get()
        # Drop the masked placeholder address before matching.
        markup = markup.replace("*****@*****.**", "")
        return re.findall(r'[\w\.-]+@[\w\.-]+', markup)
    except Exception:
        return []


def BiaFunction(company):
    """Look up ``company`` on bia.ge and scrape its public profile.

    The company name is typed into the site's search box through the
    module-level Selenium ``driver``; the ``data-url`` attribute of the
    element at ``/html/body/div[8]/div[2]`` is then fetched with
    ``requests`` and parsed once.

    Args:
        company: Value typed into the search box (formatted with ``str``).

    Returns:
        A dict with the keys ``name``, ``vat``, ``addresses``,
        ``business_hours``, ``phones``, ``websites``, ``emails`` and
        ``foundation_date``. Optional sections that cannot be parsed are
        returned empty (``{}`` or ``[]``). If the lookup itself fails, the
        string ``"No info"`` is returned instead of a dict.
    """
    driver.get("https://www.bia.ge/EN")

    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(
        f"{company}")
    # Presumably gives the suggestion list time to render - TODO confirm.
    time.sleep(3)
    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')

        page = requests.get(link)
        # Parse the response once and reuse it for every field.
        selector = Selector(response=page)

        name = selector.xpath(
            '//*[@id="TrademarksListBox"]/li/text()').get()
        vat_number = selector.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()
        address = _bia_address(selector)
        business_hours = _bia_business_hours(selector)
        foundation_date = selector.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()
        phone = _bia_phones(selector)
        web = _bia_websites(selector)
        email = _bia_emails(selector)

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        return info
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate instead of being reported as a missing company.
        print("No info")
        return "No info"

    # driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(Keys.RETURN)

    # try:
    #     logo = driver.find_element_by_id('LogoImageUploaderBox').get_attribute("style")
    # except:
    #     logo = ""
    # print(logo)

    # try:
    #     name = driver.find_element_by_id('CompanyNameBox').text
    # except:
    #     name = ""
    # print(name)

    # try:
    #     trademarks = driver.find_element_by_xpath('//*[@id="TrademarksListBox"]/li').text
    # except:
    #     trademarks = ""
    # print(trademarks)

    # try:
    #     legal_form = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[1]/span[2]').text
    # except:
    #     legal_form = ""
    # print(legal_form)

    # try:
    #     registration_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[1]/span[2]').text
    # except:
    #     registration_number = ""
    # print(registration_number)

    # try:
    #     registration_authority = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[1]/span[2]').text
    # except:
    #     registration_authority = ""
    # print(registration_authority)

    # try:
    #     status = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[1]/span[2]').text
    # except:
    #     status = ""
    # print(status)

    # try:
    #     brands = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[1]/td[2]/span[2]').text
    # except:
    #     brands = ""
    # print(brands)

    # try:
    #     vat_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]').text
    # except:
    #     vat_number = ""
    # print(vat_number)

    # try:
    #     registration_date = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]').text
    # except:
    #     registration_date = ""
    # print(registration_date)

    # try:
    #     legal_address = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]').text
    # except:
    #     legal_address = ""
    # print(legal_address)

    # try:
    #     working_hours = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li').text
    # except:
    #     working_hours = ""
    # print(working_hours)

    # try:
    #     phone = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').text
    # except:
    #     phone = ""
    # print(phone)

    # try:
    #     website = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').text
    # except:
    #     website = ""
    # print(website)

    # x = mycol.insert_one({
    #     "Name": name,
    #     "Logo": logo,
    #     "Trademarks": trademarks,
    #     "Legal_Form": legal_form,
    #     "Registration_Number": registration_number,
    #     "Registration_Authority": registration_authority,
    #     "Status": status,
    #     "Brands": brands,
    #     "VAT_Number": vat_number,
    #     "Registration_Date": registration_date,
    #     "Legal_Address": legal_address,
    #     "Working_Hours": working_hours,
    #     "Phone": phone,
    #     "Website": website
    # })

    # driver.find_element_by_xpath('').text


# driver.find_element_by_xpath('').text

# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span/a
# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span
コード例 #2
0
ファイル: vacancy.py プロジェクト: Caravan2/scripts
def Vacancy(link):
    """Fetch a vacancy page and extract its details.

    Every field is best-effort: when a lookup or parse step fails, that
    field falls back to its default instead of aborting the scrape.

    Args:
        link: URL of the vacancy page, fetched with ``requests.get``.

    Returns:
        A dict with the keys:
          * ``location`` - one-element list ``[{"city", "id"}]``; defaults
            to Yerevan (id ``"616052"``).
          * ``c_link`` - company URL taken from the "Company:" row, or "".
          * ``description_am`` / ``description_en`` - the description in
            the detected language and its counterpart, or "".
          * ``email`` - list with the address from the ``mailto:`` link,
            or ``[]``.
    """
    print("request sent for Vacancy succesfully")
    page = requests.get(link)

    # Parse the response once. If parsing fails, ``selector`` stays None,
    # so every lookup below raises and falls back to its default.
    try:
        selector = Selector(response=page)
    except Exception:
        selector = None

    # Location
    try:
        row = selector.xpath(
            '/html/body/div[2]/table/tr[contains(., "Location:")]').get()
        cell = row.split("<td>")[1].split("</td>")[0].replace(
            "&amp;nbsp", " ")
        city = cell.split(",")[0]
        location = [{'city': city, 'id': Geonames(city)}]
    except Exception:
        location = [{'city': 'Yerevan', 'id': '616052'}]

    # Company url
    try:
        row = selector.xpath(
            '/html/body/div[2]/table/tr[contains(., "Company:")]').get()
        c_url = row.split('href="')[1].split('">')[0]
    except Exception:
        c_url = ""

    # Vacancy Description
    try:
        description = remove_tags(selector.xpath('/html/body/div[4]').get())
        description = description.strip().replace('&amp;nbsp', " ")
    except Exception:
        description = ""

    # NOTE(review): "et" is the ISO 639-1 code for Estonian; the check is
    # presumably how the detector labels Armenian text - confirm.
    try:
        is_local_language = detect(description) == "et"
    except Exception:
        description_en = ""
        description_am = ""
    else:
        if is_local_language:
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""

    # Email
    try:
        href = selector.xpath('//*[@id="job"]/a/@href').get()
        email = [href.replace('mailto:', "")]
    except Exception:
        email = []

    return {
        "location": location,
        "c_link": c_url,
        "description_am": description_am,
        "description_en": description_en,
        "email": email
    }
コード例 #3
0
ファイル: vacancy.py プロジェクト: Caravan2/scripts
def Vacancy(link):
    """Fetch a vacancy page and extract its details.

    Static fields are read from the HTML returned by ``requests``; the
    e-mail addresses are read from the page as rendered by the module-level
    Selenium ``driver``. Every field is best-effort: when a lookup or parse
    step fails, that field falls back to its default.

    Args:
        link: URL of the vacancy page.

    Returns:
        A dict with the keys:
          * ``location`` - one-element list ``[{"city", "id"}]``; defaults
            to Yerevan (id ``"616052"``).
          * ``website`` - list with the company link, or ``[]``.
          * ``job_type``, ``publish_day``, ``gender`` - stripped text, or "".
          * ``salary`` - integer amount, or 0.
          * ``description_am`` / ``description_en`` - the description in
            the detected language and its counterpart, or "".
          * ``email`` - list of addresses found in the description.
    """
    print("request sent for Vacancy succesfully")
    print(link)
    page = requests.get(link)

    # Parse the response once. If parsing fails, ``selector`` stays None,
    # so every lookup below raises and falls back to its default.
    try:
        selector = Selector(response=page)
    except Exception:
        selector = None

    # Location
    try:
        city = selector.xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[3]/a/text()'
        ).get()
        city = city.strip().split(",")[0]
        location = [{"city": city, "id": Geonames(city)}]
    except Exception:
        location = [{"city": "Yerevan", "id": "616052"}]

    # Website
    try:
        href = selector.xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[4]/a/@href'
        ).get()
        website = [] if href is None else [href]
    except Exception:
        website = []

    # Job Type
    try:
        job_type = selector.xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[3]/text()').get()
        job_type = job_type.strip()
    except Exception:
        # The handler used to be the bare expression ``job_type``, which
        # assigned nothing: a NameError if the lookup itself failed, and
        # None for a missing node. Default to "" like the sibling fields.
        job_type = ""

    # Published
    try:
        published = selector.xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[7]/text()').get()
        published = published.strip()
    except Exception:
        published = ""

    # Salary
    try:
        salary = selector.xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[2]/text()').get()
        salary = salary.strip()
        # Remove the currency sign, thousands separators and spaces.
        for junk in ("֏", ",", " "):
            salary = salary.replace(junk, "")
        salary = int(salary)
    except Exception:
        salary = 0

    # Gender
    try:
        gender = selector.xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[4]/text()[2]').get()
        gender = gender.strip()
    except Exception:
        gender = ""

    # Description
    try:
        description = selector.xpath(
            '/html/body/main/section/div/div[2]/div/p').get()
        description = remove_tags(description).strip()
    except Exception:
        description = ""

    # NOTE(review): "et" is the ISO 639-1 code for Estonian; the check is
    # presumably how the detector labels Armenian text - confirm.
    try:
        is_local_language = detect(description) == "et"
    except Exception:
        description_en = ""
        description_am = ""
    else:
        if is_local_language:
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""

    # Email (taken from the browser-rendered page rather than the raw HTML)
    try:
        driver.get(link)
        rendered = driver.find_element_by_xpath(
            '/html/body/main/section/div/div[2]/div/p').text
        email = re.findall(r'[\w\.-]+@[\w\.-]+', rendered)
    except Exception:
        email = []

    return {
        "location": location,
        "website": website,
        "job_type": job_type,
        "publish_day": published,
        "salary": salary,
        "gender": gender,
        "description_am": description_am,
        "description_en": description_en,
        "email": email
    }


# Vacancy("https://www.worknet.am/en/job/%D5%A2%D5%A1%D5%B6%D5%BE%D5%B8%D6%80-%D5%BA%D5%A1%D5%B0%D5%A5%D5%BD%D5%BF%D5%AB-%D5%A1%D5%B7%D5%AD%D5%A1%D5%BF%D5%A1%D5%AF%D5%AB%D6%81-4656")
コード例 #4
0
ファイル: insert.py プロジェクト: Caravan2/scripts
                    region = raw[2]
                else:
                    location = raw[2].strip()
                    region = raw[1]
                appartment = raw[3]
                address = {
                    "location": location,
                    "postal_code": postal_code,
                    "appartament": appartment,
                    "region": region
                }
            except:
                print("OOOps")
                address = []
            try:
                location_id = Geonames(location)
            except:
                location_id = None

            # Splitting house number from a street
            possibilities1 = [
                "str.", "Str.", "Ave.", "ave.", "ln.", "Ln.", "Plateau",
                "settlement", "mass.", "sq.", "Sq.", "In.", "In ", "Highway",
                "Alley", "cul-de-sac", "(Temka)", "Plot.", "Range", "Ascent",
                "Embankment", "Q.", "Dead-end", "Descent", "city", "Khevi",
                "passage", "Sector", "Lane", "m/d", "cul-de-ac", "str", "Plot",
                "Mount", "square", "zone"
            ]
            possibilities2 = [
                "Vazha-Pshavela",
                "Varketili",