Beispiel #1
0
    def _extract_external_link_requests(self, response, tag, counter):
        """Extract every ``<link href>`` URL from *response*, log per-page
        link statistics through the module-level csv DictWriter ``logwr``
        and return follow-up ``Request`` objects for each absolute
        http(s) URL.

        ``tag`` is always overridden with 'L' (this extractor handles the
        <link> tag); ``counter`` is propagated into each request's meta and
        into the log row.  Returns an empty list for non-HTML responses.
        """
        r = []
        siteList = []
        if isinstance(response, HtmlResponse):
            tag = 'L'  # this extractor is dedicated to <link> elements
            counterValueLink = counter
            sites = Selector(response).xpath("//link/@href").extract()
            for item in sites:
                # Python 2: normalise unicode hrefs to utf-8 byte strings
                # so counting and logging see a uniform type.
                if isinstance(item, unicode):
                    item = item.encode('utf-8')
                siteList.append(item)
            # NOTE(review): _extract_object_count is a module-level helper
            # defined elsewhere in this project.
            externallinkCount, InternallinkCount, uniqueExternalSites, \
                externalSites, secondlevelurl = _extract_object_count(siteList)
            logwr.writerow({
                'url': response.url,
                'counter': counterValueLink,
                'InternallinkCount': InternallinkCount,
                'ExternallinkCount': externallinkCount,
                'UniqueExternalSites': uniqueExternalSites,
                'ExternalSites': externalSites,
                'secondlevelurl': secondlevelurl
            })
            # Follow only absolute http(s) links.
            r.extend(
                Request(site,
                        callback=self.parse,
                        meta={
                            'tagType': tag,
                            'counter': counterValueLink
                        }) for site in siteList
                if site.startswith(("http://", "https://")))
        return r
Beispiel #2
0
 def parse(self, response):
     """Parse one weibo search-result page.

     Pulls the JSON payload embedded in the page, reads its rendered
     'html' fragment, collects the pagination hrefs and yields one
     Request per page.  Raises LoginFailed when the payload is absent
     (session expired) and IgnoreRequest when it carries no html.
     """
     matches = Selector(response).re(weibos_re)
     if not matches:
         raise LoginFailed()
     page_html = json.loads(matches[0]).get('html')
     if not page_html:
         raise IgnoreRequest()
     pagination_xpath = ('.//a[contains(@suda-data,"key=tblog_search_weibo'
                         '&value=weibo_page")]/@href')
     hrefs = Selector(text=page_html).xpath(pagination_xpath).extract()
     hrefs.pop()  # drop the trailing pagination link
     hrefs.append(self.search_url.format(1))  # re-add page 1 explicitly
     for href in hrefs:
         yield Request(url=self.url_prefix + href,
                       meta={'cookiejar': 1},
                       cookies=self.cookies,
                       callback=self.parse_weibo)
Beispiel #3
0
    def _extract_external_link_requests(self, response, tag, counter):
        """Extract every ``<link href>`` URL from *response*, log per-page
        link statistics through the module-level csv DictWriter ``logwr``
        and return follow-up ``Request`` objects for each absolute
        http(s) URL.

        ``tag`` is always overridden with 'L' (this extractor handles the
        <link> tag); ``counter`` is propagated into each request's meta and
        into the log row.  Returns an empty list for non-HTML responses.
        """
        r = []
        siteList = []
        if isinstance(response, HtmlResponse):
            tag = 'L'  # this extractor is dedicated to <link> elements
            counterValueLink = counter
            sites = Selector(response).xpath("//link/@href").extract()
            for item in sites:
                # Python 2: normalise unicode hrefs to utf-8 byte strings
                # so counting and logging see a uniform type.
                if isinstance(item, unicode):
                    item = item.encode('utf-8')
                siteList.append(item)
            # NOTE(review): _extract_object_count is a module-level helper
            # defined elsewhere in this project.
            externallinkCount, InternallinkCount, uniqueExternalSites, \
                externalSites, secondlevelurl = _extract_object_count(siteList)
            logwr.writerow({
                'url': response.url,
                'counter': counterValueLink,
                'InternallinkCount': InternallinkCount,
                'ExternallinkCount': externallinkCount,
                'UniqueExternalSites': uniqueExternalSites,
                'ExternalSites': externalSites,
                'secondlevelurl': secondlevelurl
            })
            # Follow only absolute http(s) links.
            r.extend(
                Request(site,
                        callback=self.parse,
                        meta={
                            'tagType': tag,
                            'counter': counterValueLink
                        }) for site in siteList
                if site.startswith(("http://", "https://")))
        return r
Beispiel #4
0
def _clean_number(raw):
    """Remove dashes and spaces from a raw phone-number string."""
    return raw.replace("-", "").replace(" ", "")


def _parse_phone_token(token):
    """Parse one raw phone string into ``{"country_code", "number"}``.

    Numbers without a recognised ``+<code>`` prefix default to Armenia
    ("374"), matching the site's local audience.
    """
    token = token.strip()
    if "+" in token and " " in token:
        # e.g. "+374 93 123456": code before the first space, rest is number.
        prefix, rest = token.split(" ", 1)
        return {"country_code": prefix.replace("+", ""),
                "number": _clean_number(rest)}
    if "+" in token:
        if "+374" in token:
            return {"country_code": "374", "number": token.replace("+374", "")}
        if "+1" in token:
            return {"country_code": "1", "number": token.replace("+1", "")}
        # Unknown prefix: keep the token unchanged and assume Armenia.
        return {"country_code": "374", "number": token}
    return {"country_code": "374", "number": _clean_number(token)}


def _parse_iso_date(raw):
    """Parse a 'YYYY-MM-DD' string into (year, month, day).

    Returns (0, 0, 0) on any failure (missing node -> raw is None,
    unexpected format, non-numeric parts).
    """
    try:
        parts = raw.strip().split("-")
        return int(parts[0]), int(parts[1]), int(parts[2])
    except Exception:
        return 0, 0, 0


def _localize(text):
    """Split *text* into (armenian, english) variants.

    NOTE(review): ``detect`` appears to report the Armenian text on this
    site as "et" — confirm against the language-detection library used.
    Armenian text is machine translated to English via ``Translate``.
    Returns ("", "") when detection fails (e.g. empty text).
    """
    try:
        if detect(text) == "et":
            try:
                english = Translate(text)
            except Exception:
                english = " "
            return text, english
        return "", text
    except Exception:
        return "", ""


def Vacancy(link):
    """Scrape a single rezume.am vacancy page and return its fields.

    Every field is parsed best-effort: a failure leaves that field at an
    empty default instead of aborting the whole scrape.  The resulting
    dict is printed for debugging and returned.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(link, headers=headers)
    sel = Selector(response=page)  # one parsed document for every XPath below

    # Company (may legitimately stay None when the node is missing)
    try:
        company = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except Exception:
        company = ""

    # Position
    try:
        position = sel.xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except Exception:
        position = ""

    # Logo URL
    try:
        logo = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except Exception:
        logo = ""

    # Job type (a missing node makes .get() return None and .strip() raise)
    try:
        job_type = sel.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get().strip()
    except Exception:
        job_type = ""

    # Contact person
    try:
        person = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]'
        ).get().strip()
    except Exception:
        person = ""

    # Email (single address wrapped in a list)
    try:
        email = [sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]'
        ).get().strip()]
    except Exception:
        email = []

    # Phone(s): comma-separated list or a single number
    try:
        raw_phone = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]'
        ).get().strip()
        tokens = raw_phone.split(",") if "," in raw_phone else [raw_phone]
        phone = [_parse_phone_token(t) for t in tokens]
    except Exception:
        phone = []

    # Website ("not specified"-style placeholders become an empty list)
    try:
        website = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]'
        ).get().strip()
        website = [] if "not" in website else [website]
    except Exception:
        website = []

    # Published / deadline dates ("YYYY-MM-DD")
    publish_year, publish_month, publish_day = _parse_iso_date(
        sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get())
    deadline_year, deadline_month, deadline_day = _parse_iso_date(
        sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get())

    # Career level / education / experience — plain spans, "" when missing
    career_level = sel.xpath(
        '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()'
    ).get() or ""
    education = sel.xpath(
        '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()'
    ).get() or ""
    experience = sel.xpath(
        '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()'
    ).get() or ""

    # Salary: "min - max", a single figure, or "N/A"
    try:
        salary = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get()
        if "-" in salary:
            low, high = salary.split("-")[0], salary.split("-")[1]
            min_salary = int(low.strip().replace(".", ""))
            max_salary = int(high.strip().replace(".", ""))
        elif salary != "N/A":
            # BUG FIX: the original called salary.replace(".") with a single
            # argument, which raises TypeError and silently zeroed every
            # single-figure salary via the bare except.
            min_salary = max_salary = int(salary.replace(".", ""))
        else:
            min_salary = 0
            max_salary = 0
    except Exception:
        min_salary = 0
        max_salary = 0

    # Vacancy description (strip HTML tags and non-breaking spaces)
    try:
        v_description = remove_tags(
            sel.xpath('//*[@id="loyal"]/div[2]/div/div[1]').get()).strip()
        v_description = v_description.replace('\xa0', " ")
    except Exception:
        v_description = ""
    v_description_am, v_description_en = _localize(v_description)

    # Company description
    try:
        c_description = sel.xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()'
        ).get().strip()
    except Exception:
        c_description = ""
    c_description_am, c_description_en = _localize(c_description)

    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }

    print(data)
    return data


# Vacancy("https://rezume.am/job/2184")
Beispiel #5
0
            try:
                phone = Selector(response=page).xpath(
                    '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get(
                    )
                phone = remove_tags(phone)
                if "," in phone:
                    array = phone.split(",")
                    phone = []
                    for each in array:
                        each = each.lstrip()
                        each = each.rstrip()
                        each = each.split(" ", 1)
                        code = each[0]
                        code = code.replace("+", "")
                        number = each[1]
                        phone.append({"country_code": code, "number": number})
                else:
                    phone = phone.lstrip()
                    add = phone.rstrip()
                    add = add.split(" ", 1)
                    code = add[0]
                    code = code.replace("+", "")
                    number = add[1]
                    phone = [{"country_code": code, "number": number}]
            except:
                phone = []

            # Web
            try:
                web = Selector(response=page).xpath(
                    '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get(
Beispiel #6
0
def BiaFunction(company):
    """Look up *company* on bia.ge (EN) and return a dict of its details,
    or the string "No info" when the lookup fails.

    The search box is driven through the module-level selenium ``driver``
    (results are rendered client-side); the detail page itself is fetched
    with plain ``requests``.  Individual sections (address, hours, phones,
    web, email) degrade to empty defaults on parse failure.
    """
    driver.get("https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(
        f"{company}")
    time.sleep(3)  # let the autocomplete dropdown render before reading it
    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)
        sel = Selector(response=page)  # one parsed document for all XPaths

        # Company name
        name = sel.xpath('//*[@id="TrademarksListBox"]/li/text()').get()

        # VAT number
        vat_number = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address: expected shape "postal_code, city, region, appartment"
        try:
            raw = sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get()
            parts = raw.split(",")
            postal_code = parts[0]
            location = parts[1].lstrip()
            region = parts[2]
            appartment = parts[3]
            city_id = Geonames(location)  # external city-id lookup helper
            address = {
                "location": {
                    "country": "GE",
                    "city": {
                        "id": f"{city_id}",
                        "city": location
                    }
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region
            }
        except Exception as e:
            print(e)
            address = {}

        # Working hours, e.g. "Mon-Fri: 10:00 - 18:00"
        try:
            working_hours = sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get()
            day_range, hour_range = working_hours.split(":", 1)
            till = day_range.split("-")[1].lstrip().lower()
            # Collect weekdays up to and including the closing day.
            days = []
            for day in weekdays:
                days.append(day)
                if day == till:
                    break
            hourfrom = hour_range.split("-")[0].strip()
            hourto = hour_range.split("-")[1].strip()
            business_hours = {
                "week_days": days,
                "hour_from": hourfrom,
                "hour_to": hourto
            }
        except Exception:
            business_hours = {}

        # Foundation date
        foundation_date = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phone(s): "+<code> <number>[, ...]"; any malformed token drops all
        try:
            raw = remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get())
            tokens = raw.split(",") if "," in raw else [raw]
            phone = []
            for token in tokens:
                code, number = token.strip().split(" ", 1)
                phone.append({
                    "country_code": code.replace("+", ""),
                    "number": number.replace(" ", "")
                })
        except Exception:
            phone = []

        # Website(s)
        try:
            raw = remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get())
            if "," in raw:
                web = [part.strip() for part in raw.split(",")]
            else:
                web = [raw.strip()]
        except Exception:
            web = []

        # Emails: scrape the whole tab panel, drop the masked placeholder
        try:
            blob = sel.xpath('//*[@id="TabPanelBox"]').get()
            blob = blob.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', blob)
        except Exception:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        return info
    except Exception:
        # Any failure in the lookup chain (no search hit, network error,
        # layout change) falls through to the sentinel string.
        print("No info")
        return "No info"


# driver.find_element_by_xpath('').text

# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span/a
# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span