def Vacancy_info(url):
    """Scrape a jobs24.ge vacancy page and return its description and e-mail.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_ka", "description_ru", "description_en"
        and "email" (any value may be an empty string).
    """
    print(url)
    page = requests.get(url)

    # Description: fixed position in the page layout; any parsing failure
    # degrades to an empty description instead of crashing.
    try:
        description = Selector(response=page).xpath(
            '/html/body/table[2]/tr/td[2]/div/table/tr[2]/td[2]/table/tr/td/div[6]'
        ).get()
        description = remove_tags(description).strip()
    except Exception:
        description = ""
    if description is None:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty/undetectable input, so a
    # vacancy without a parsable description used to crash the whole function.
    # Detect once (langdetect is not deterministic across calls) and treat
    # failures as "unknown language".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: first address-like token inside the description, if any.
    matches = re.findall(r'[\w\.-]+@[\w\.-]+', description)
    email = matches[0] if matches else ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    print("Info Scraped Successfully")
    return data
def Vacancy_info(url):
    """Scrape a jobs.ge vacancy page and return its description and e-mail.

    The URL is rewritten from the English ("/en/") to the Georgian ("/ge/")
    version of the page before fetching.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_ka", "description_ru", "description_en"
        and "email" (any value may be an empty string).
    """
    url = url.replace("/en/", "/ge/")
    print(url)
    page = requests.get(url)

    # Description: fixed-position table in the page layout; any parsing
    # failure degrades to an empty description instead of crashing.
    try:
        description = Selector(response=page).xpath(
            '//*[@id="job"]/table/tr[1]/td/table[2]').get()
        description = remove_tags(description).strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
        print(description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so any vacancy whose description failed to parse crashed
    # the whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: first address-like token inside the description, if any.
    matches = re.findall(r'[\w\.-]+@[\w\.-]+', description)
    email = matches[0] if matches else ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    return data
def Vacancy_info(url):
    """Scrape a vacancy page and return description, contact and meta data.

    Parameters:
        url: absolute URL of the vacancy page.

    Returns:
        dict with keys "description_en", "description_ka", "description_ru",
        "email", "location" (list of {"city", "id"} dicts), "category" and
        "stack".
    """
    print(url)
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[4]').get()
        description = remove_tags(description).strip()
        description = re.sub(r"\s+", " ", description)
        print(description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so a vacancy without a parsable description crashed the
    # whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email: taken from a mailto: link rather than the description text.
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div/a/@href'
        ).get()
        email = email.replace("mailto:", "")
    except Exception:
        email = ""

    # Location: resolve the city name to a GeoNames id; "611717" is used as
    # the fallback id (presumably Tbilisi — confirm) when lookup fails.
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[1]/div[2]/span/text()'
        ).get()
        location_id = []
        try:
            location_id.append({
                "city": f"{location}",
                "id": f"{Geonames(location)}"
            })
        except Exception:
            location_id.append({"city": f"{location}", "id": "611717"})
    except Exception:
        location_id = [{"city": "Tbilisi", "id": "611717"}]

    # Category
    try:
        category = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/span[1]/text()'
        ).get()
    except Exception:
        category = ""

    # Stack: the Georgian phrase for "full time" maps to "Full-Stack"; any
    # other text passes through unchanged (a None value falls to the except).
    try:
        stack = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[4]/div[2]/text()'
        ).get()
        if "სრული განაკვეთი" in stack:
            stack = "Full-Stack"
    except Exception:
        stack = ""

    data = {
        "description_en": description_en,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "email": email,
        "location": location_id,
        "category": category,
        "stack": stack
    }
    print("Vacancy Scraped Succesfully")
    return data
# NOTE(review): fragment — the enclosing function's `def` header and its
# opening statements live outside this chunk, and the original line breaks
# were stripped, so the code below is left byte-identical. It appears to
# finish a language-branch assignment, then scrape "Location" and "Activity"
# tags; "611717" is presumably the GeoNames fallback city id — confirm.
description_ka = "" elif detect(description) == "et": description_ru = "" description_en = Translate(description) description_ka = description else: description_ru = "" description_en = description description_ka = "" try: location = Selector(response=page).xpath('/html/body/div/div[2]/main/div[4]/div[2]/a[contains(.,"Location")]').get() location = location.split('"tagDesc">') location = location[1].split('</') location = location[0].rstrip() location = location.lstrip() location_id = [] try: location_id.append({ "city" : f"{location}", "id" : f"{Geonames(location)}" } ) except: location_id.append({ "city" : f"{location}", "id" : "611717" }) except: location_id = [{"city" : "Tbilisi", "id" : "611717"}] try: activity = Selector(response=page).xpath('/html/body/div/div[2]/main/div[4]/div[2]/a[contains(.,"Activity")]').get() activity = activity.split('"tagDesc">') activity = activity[1].split('</') activity = activity[0].rstrip() activity = activity.lstrip()
# NOTE(review): fragment — continuation of a phone/web scraping section whose
# `try:` opener and enclosing `def` are outside this chunk; left byte-identical.
# Parses "+<code> <number>" phone entries (comma-separated list or a single
# entry) into {"country_code", "number"} dicts, then begins the website field.
'//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get( ) phone = remove_tags(phone) if "," in phone: array = phone.split(",") phone = [] for each in array: each = each.lstrip() each = each.rstrip() each = each.split(" ", 1) code = each[0] code = code.replace("+", "") number = each[1] phone.append({"country_code": code, "number": number}) else: phone = phone.lstrip() add = phone.rstrip() add = add.split(" ", 1) code = add[0] code = code.replace("+", "") number = add[1] phone = [{"country_code": code, "number": number}] except: phone = [] # Web try: web = Selector(response=page).xpath( '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get( ) web = remove_tags(web)
def BiaFunction(company):
    """Look up *company* on bia.ge and scrape its public company profile.

    Uses the module-level Selenium `driver` to run the search, then fetches
    the linked company page over plain HTTP and parses it with Selector.

    Parameters:
        company: company name typed into the bia.ge search box.

    Returns:
        dict with keys "name", "vat", "addresses", "business_hours",
        "phones", "websites", "emails" and "foundation_date", or the string
        "No info" when the search yields nothing usable.
    """
    driver.get("https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(f"{company}")
    time.sleep(3)  # give the search results time to render

    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)

        # Company name
        name = Selector(response=page).xpath(
            '//*[@id="TrademarksListBox"]/li/text()').get()

        # VAT number
        vat_number = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address: expected layout "postal_code, city, region, appartment";
        # any deviation degrades to an empty dict (the error is printed).
        try:
            address = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get()
            raw = address.split(",")
            postal_code = raw[0]
            location = raw[1].lstrip()
            region = raw[2]
            appartment = raw[3]
            city_id = Geonames(location)
            address = {
                "location": {
                    "country": "GE",
                    "city": {
                        "id": f"{city_id}",
                        "city": location
                    }
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region
            }
        except Exception as e:
            print(e)
            address = {}

        # Working hours: e.g. "monday-friday: 09:00 - 18:00". Collect the
        # weekdays up to and including the closing day of the range.
        try:
            working_hours = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get()
            raw = working_hours.split(":", 1)
            till = raw[0].split("-")[1].lstrip().lower()
            days = []
            for day in weekdays:
                days.append(day)
                if day == till:
                    break
            hours = raw[1].split("-")
            business_hours = {
                "week_days": days,
                "hour_from": hours[0].strip(),
                "hour_to": hours[1].strip()
            }
        except Exception:
            business_hours = {}

        # Foundation date
        foundation_date = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phones: "+<code> <number>[, ...]" — split on commas, then split each
        # entry into a country code and a space-stripped number.
        try:
            phone = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get()
            phone = remove_tags(phone)
            entries = phone.split(",") if "," in phone else [phone]
            phone = []
            for entry in entries:
                code, number = entry.strip().split(" ", 1)
                phone.append({
                    "country_code": code.replace("+", ""),
                    "number": number.replace(" ", "")
                })
        except Exception:
            phone = []

        # Websites: comma-separated list inside a single span.
        try:
            web = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get()
            web = remove_tags(web)
            if "," in web:
                web = [part.strip() for part in web.split(",")]
            else:
                web = [web.strip()]
        except Exception:
            web = []

        # Emails: scrape the whole tab panel, drop the masked placeholder,
        # then pick out every address-like token.
        try:
            email = Selector(response=page).xpath('//*[@id="TabPanelBox"]').get()
            email = email.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
        except Exception:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        return info
    except Exception:
        print("No info")
        return "No info"

# NOTE(review): a large block of commented-out Selenium exploration code (and
# one stray assignment produced by stripped line breaks) previously followed
# this function; it was dead code and has been removed.
# NOTE(review): fragment — starts mid-`try` of a deadline parser and ends
# mid-`if`; the enclosing function is outside this chunk, so the code below is
# left byte-identical.
# BUG(review): `lcoation = location.lstrip()` is a typo — the lstrip() result
# is assigned to a dead variable, so `location` keeps any leading whitespace.
# Also note the pointless f-string prefixes on the constant xpath literals.
deadline_month = int(months[f"{ends.split(' ')[1]}"]) deadline_year = year except: deadline_day = 0 deadline_month = 0 deadline_year = 0 user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" page = requests.get(url, headers={"User-Agent": user_agent}) try: location = Selector(response=page).xpath( f'//*[@id="main_content"]/div[3]/form/div/div[2]/table/tr[contains(.,"Location")]/td[2]/span[1]/text()' ).get() location = location.rstrip() lcoation = location.lstrip() except: location = "" try: salary = Selector(response=page).xpath( f'//*[@id="main_content"]/div[3]/form/div/div[2]/table/tr[contains(.,"Salary")]/td[2]/span[1]/text()' ).get() salary = salary.rstrip() salary = salary.lstrip() if salary == "": salary = 0 print(salary) except: salary = 0 if "-" in salary:
# NOTE(review): fragment — publish-date and location section of a scraper
# whose `def` header is outside this chunk; left byte-identical. Extracts the
# city from a Georgian-labelled ("მდებარეობა:" = "location") div and resolves
# it via Geonames; "611717" is presumably the GeoNames fallback id for
# Tbilisi — confirm. Note `if "-" in salary:` at the end of the previous
# fragment suggests these chunks belong to the same mangled file.
publish_month = int(published[1]) publish_year = int(published[2].split("/")[0]) except: # print(e) publish_day = "" publish_month = "" publish_year = "" # Location try: location = Selector(response=page).xpath( '//*[@id="main-body"]/div[2]/div/div[1]/div[1]/div[1]/div/div[2]/div/div/div[1]/div[contains(.,"მდებარეობა:")]' ).get() location = location.split('<span>') location = location[1].split('</span>')[0] location = location.lstrip() location = location.rstrip() location_id = [] try: location_id.append({ "city": f"{location}", "id": f"{Geonames(location)}" }) except: location_id.append({ "city": f"{location}", "id": "611717" }) except: location_id = [{"city": "Tbilisi", "id": "611717"}]
def Vacancy(link, cookies):
    """Scrape a cv.ge vacancy page and return its details.

    Parameters:
        link: absolute URL of the vacancy page.
        cookies: raw Cookie header value for the scraping session.

    Returns:
        dict with keys "stack", "education", "languages", "email", "logo",
        "description", "description_ka", "description_ru" and
        "description_en" (values may be empty strings).
    """
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    cookies = {
        "Cookie": cookies
    }
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36"}
    # BUG FIX: `headers` was built but never passed to requests.get(), so the
    # request went out with the default python-requests User-Agent.
    page = requests.get(url, cookies=cookies, headers=headers)

    # Stack
    try:
        stack = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[1]/span[2]/text()').get()
    except Exception:
        stack = ""

    # Education: text between the "Education:" label and the closing span.
    try:
        education = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Education:")]').get()
        education = education.split("</strong>")[1]
        education = education.split("</span>")[0].strip()
    except Exception:
        education = ""

    # Languages: same label/markup pattern as education.
    try:
        languages = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Languages:")]').get()
        languages = languages.split("</strong>")[1]
        languages = languages.split("</span>")[0].strip()
    except Exception:
        languages = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[2]/div/div/span/a/text()').get()
    except Exception:
        email = ""
    if email is None:
        email = ""

    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[3]/div/div/figure/img/@src').get()
    except Exception:
        logo = ""
    if logo is None:
        logo = ""

    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[1]/div[1]/article/div').get()
        description = remove_tags(description).strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
    except Exception:
        description = ""

    # BUG FIX: langdetect's detect() raises on empty input; the original code
    # had no guard, so a vacancy without a parsable description crashed the
    # whole function. Treat empty/undetectable input as "unknown".
    # NOTE(review): "et" (Estonian) appears to be what langdetect reports for
    # Georgian text here — preserved as-is; confirm against live pages.
    try:
        lang = detect(description) if description else ""
    except Exception:
        lang = ""

    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    data = {
        "stack": stack,
        "education": education,
        "languages": languages,
        "email": email,
        "logo": logo,
        "description": description,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en
    }
    print("Vacancy scraped succesfully")
    return data