def _extract_external_link_requests(self, response, tag, counter):
    """Extract ``<link href>`` URLs from an HTML response, log link
    statistics, and return follow-up Requests for absolute http(s) URLs.

    Parameters:
        response: downloaded response; only HtmlResponse instances are
            processed — anything else returns an empty list.
        tag: tag-type marker for request meta (overwritten with 'L' to
            mark <link>-tag extraction).
        counter: crawl counter propagated unchanged to child requests.

    Returns:
        list of Request objects, one per absolute http(s) URL found.
    """
    follow_requests = []
    # Guard: original code ran unconditionally and would NameError on
    # non-HTML responses; all work now happens inside the guard.
    if not isinstance(response, HtmlResponse):
        return follow_requests

    tag = 'L'  # requests produced here originate from <link> tags
    counter_value = counter
    hrefs = Selector(response).xpath("//link/@href").extract()

    # Normalise unicode values to UTF-8 byte strings (Python 2 idiom);
    # the original duplicated the append in both branches.
    site_list = []
    for item in hrefs:
        if isinstance(item, unicode):
            item = item.encode('utf-8')
        site_list.append(item)

    (externallink_count, internallink_count, unique_external_sites,
     external_sites, secondlevel_url) = _extract_object_count(site_list)

    # Record per-page link statistics via the module-level CSV DictWriter.
    # NOTE(review): logwr is defined elsewhere in this module — assumed to
    # be a csv.DictWriter with these field names; confirm at module top.
    logwr.writerow({
        'url': response.url,
        'counter': counter_value,
        'InternallinkCount': internallink_count,
        'ExternallinkCount': externallink_count,
        'UniqueExternalSites': unique_external_sites,
        'ExternalSites': external_sites,
        'secondlevelurl': secondlevel_url
    })

    # Follow only absolute http(s) links.
    follow_requests.extend(
        Request(site, callback=self.parse,
                meta={'tagType': tag, 'counter': counter_value})
        for site in site_list
        if site.startswith(("http://", "https://")))
    return follow_requests
def parse(self, response):
    """Parse a weibo search-result page: pull the embedded JSON payload,
    extract pagination hrefs from its HTML, and yield one Request per
    result page (page 1 is re-appended explicitly).

    Raises:
        LoginFailed: when the expected JSON blob is absent (not logged in).
        IgnoreRequest: when the JSON carries no 'html' payload.
    """
    matches = Selector(response).re(weibos_re)
    if not matches:
        # No embedded payload means the session is not authenticated.
        raise LoginFailed()

    embedded_html = json.loads(matches[0]).get('html')
    if not embedded_html:
        raise IgnoreRequest()

    pagination_xpath = ('.//a[contains(@suda-data,'
                        '"key=tblog_search_weibo&value=weibo_page")]/@href')
    hrefs = Selector(text=embedded_html).xpath(pagination_xpath).extract()

    # Drop the trailing pager link and make sure page 1 is covered.
    del hrefs[-1]
    hrefs.append(self.search_url.format(1))

    for href in hrefs:
        yield Request(url=self.url_prefix + href,
                      meta={'cookiejar': 1},
                      cookies=self.cookies,
                      callback=self.parse_weibo)
def _extract_external_link_requests(self, response, tag, counter):
    """Collect ``<link href>`` URLs from an HTML page, write link-count
    statistics to the shared CSV log, and build follow-up Requests for
    every absolute http(s) URL found."""
    out = []
    collected = []
    if isinstance(response, HtmlResponse):
        tag = 'L'
        ctr = counter
        raw = Selector(response).xpath("//link/@href").extract()
        # Encode unicode entries to UTF-8 byte strings (Python 2 idiom).
        for entry in raw:
            if isinstance(entry, unicode):
                collected.append(entry.encode('utf-8'))
            else:
                collected.append(entry)
        raw.append(ctr)
        (ext_count, int_count, uniq_ext, ext_sites,
         second_level) = _extract_object_count(collected)
        linkcount = len(collected)
        # Log per-page statistics through the module-level DictWriter.
        logwr.writerow({
            'url': response.url,
            'counter': ctr,
            'InternallinkCount': int_count,
            'ExternallinkCount': ext_count,
            'UniqueExternalSites': uniq_ext,
            'ExternalSites': ext_sites,
            'secondlevelurl': second_level
        })
        for site in collected:
            if site.startswith("http://") or site.startswith("https://"):
                out.append(Request(site, callback=self.parse,
                                   meta={'tagType': tag, 'counter': ctr}))
    return out
def _parse_phone_token(token):
    """Split one phone string into ``{"country_code", "number"}``.

    Best-effort parse: defaults the country code to "374" (Armenia) when
    it cannot be determined. Replaces logic that was duplicated between
    the single-phone and comma-separated paths of the original code.
    """
    if "+" in token and " " in token:
        prefix, rest = token.split(" ", 1)
        return {"country_code": prefix.replace("+", ""),
                "number": rest.replace("-", "").replace(" ", "")}
    if "+" in token:
        if "+374" in token:
            return {"country_code": "374", "number": token.replace("+374", "")}
        if "+1" in token:
            return {"country_code": "1", "number": token.replace("+1", "")}
        # Unknown prefix with no space: keep the raw token as the number.
        return {"country_code": "374", "number": token}
    # No "+" at all: strip separators and assume an Armenian number.
    return {"country_code": "374",
            "number": token.replace("-", "").replace(" ", "")}


def Vacancy(link):
    """Scrape a rezume.am vacancy page and return its fields as a dict.

    Parameters:
        link: absolute URL of the vacancy page.

    Returns:
        dict of vacancy details (company, position, contacts, dates,
        salary, descriptions). Fields that cannot be scraped default to
        "", [], or 0 as appropriate — every section is wrapped in its
        own best-effort try/except.
    """
    url = link
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(url, headers=headers)

    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except Exception:
        company = ""
    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except Exception:
        position = ""
    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except Exception:
        logo = ""
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get()
        job_type = job_type.strip()
    except Exception:
        job_type = ""
    # Contact person
    try:
        person = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get()
        person = person.strip()
    except Exception:
        person = ""
    # Email (single address, wrapped in a list for the output schema)
    try:
        email = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get()
        email = [email.strip()]
    except Exception:
        email = []
    # Phone(s): either one number or a comma-separated list.
    try:
        phone = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get()
        phone = phone.strip()
        if "," in phone:
            phone = [_parse_phone_token(p.strip()) for p in phone.split(",")]
        else:
            phone = [_parse_phone_token(phone)]
    except Exception:
        phone = []
    # Website ("not ..." placeholder means no website)
    try:
        website = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get()
        website = website.strip()
        if "not" in website:
            website = []
        else:
            website = [website]
    except Exception:
        website = []
    # Published date, formatted YYYY-MM-DD
    try:
        published = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get()
        parts = published.strip().split("-")
        publish_day = int(parts[2])
        publish_month = int(parts[1])
        publish_year = int(parts[0])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    # Deadline date, formatted YYYY-MM-DD
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get()
        parts = ends.strip().split("-")
        deadline_day = int(parts[2])
        deadline_month = int(parts[1])
        deadline_year = int(parts[0])
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Career level
    try:
        career_level = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get()
        if career_level is None:
            career_level = ""
    except Exception:
        career_level = ""
    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get()
        if education is None:
            education = ""
    except Exception:
        education = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get()
        if experience is None:
            experience = ""
    except Exception:
        experience = ""
    # Salary: "min - max" range, a single figure, or "N/A".
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get()
        if "-" in salary:
            bounds = salary.split("-")
            min_salary = int(bounds[0].strip().replace(".", ""))
            max_salary = int(bounds[1].strip().replace(".", ""))
        elif salary != "N/A":
            # BUG FIX: the original called salary.replace(".") with one
            # argument, which raises TypeError and silently zeroed the
            # salary through the bare except below.
            min_salary = int(salary.replace(".", ""))
            max_salary = min_salary
        else:
            min_salary = 0
            max_salary = 0
    except Exception:
        min_salary = 0
        max_salary = 0
    # Vacancy description
    try:
        v_description = Selector(
            response=page).xpath('//*[@id="loyal"]/div[2]/div/div[1]').get()
        v_description = remove_tags(v_description).strip()
        v_description = v_description.replace('\xa0', " ")
    except Exception:
        v_description = ""
    # NOTE(review): detect() == "et" is treated as the non-English
    # (local-language) branch — confirm this against langdetect's output
    # for these pages.
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except Exception:
                v_description_en = " "
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except Exception:
        v_description_am = ""
        v_description_en = ""
    # Company description
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get()
        c_description = c_description.strip()
    except Exception:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except Exception:
                c_description_en = " "
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except Exception:
        c_description_am = ""
        c_description_en = ""

    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }
    print(data)
    return data


# Vacancy("https://rezume.am/job/2184")
try: phone = Selector(response=page).xpath( '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get( ) phone = remove_tags(phone) if "," in phone: array = phone.split(",") phone = [] for each in array: each = each.lstrip() each = each.rstrip() each = each.split(" ", 1) code = each[0] code = code.replace("+", "") number = each[1] phone.append({"country_code": code, "number": number}) else: phone = phone.lstrip() add = phone.rstrip() add = add.split(" ", 1) code = add[0] code = code.replace("+", "") number = add[1] phone = [{"country_code": code, "number": number}] except: phone = [] # Web try: web = Selector(response=page).xpath( '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get(
def BiaFunction(company):
    """Look up *company* on bia.ge and scrape its profile details.

    Drives the shared selenium ``driver`` to run the search, fetches the
    matched profile page with ``requests``, and parses it field by field.
    Returns a dict of company details, or the string "No info" when the
    search result cannot be read.
    """
    driver.get(f"https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(
        f"{company}")
    time.sleep(3)  # let the search suggestions render
    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)
        sel = Selector(response=page)

        # Company name
        name = sel.xpath('//*[@id="TrademarksListBox"]/li/text()').get()

        # VAT number
        vat_number = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address: "postal, city, region, apartment"
        try:
            raw_addr = sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get()
            pieces = raw_addr.split(",")
            postal_code = pieces[0]
            location = pieces[1].lstrip()
            region = pieces[2]
            appartment = pieces[3]
            city_id = Geonames(location)
            address = {
                "location": {
                    "country": "GE",
                    "city": {
                        "id": f"{city_id}",
                        "city": location
                    }
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region
            }
        except Exception as e:
            print(e)
            address = {}

        # Working hours, e.g. "Monday-Friday: 10:00 - 19:00"
        try:
            working_hours = sel.xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get()
            day_span, hour_span = working_hours.split(":", 1)
            till = day_span.split("-")[1].lstrip().lower()
            # Collect weekdays up to and including the closing day.
            days = []
            for day in weekdays:
                days.append(day)
                if day == till:
                    break
            hours = hour_span.split("-")
            business_hours = {
                "week_days": days,
                "hour_from": hours[0].strip(),
                "hour_to": hours[1].strip()
            }
        except:
            business_hours = {}

        # Foundation date (unguarded: a missing node just yields None)
        foundation_date = sel.xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phone numbers: "+<code> <number>[, ...]"
        try:
            phone = remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get())
            if "," in phone:
                entries = phone.split(",")
                phone = []
                for entry in entries:
                    prefix, rest = entry.strip().split(" ", 1)
                    phone.append({
                        "country_code": prefix.replace("+", ""),
                        "number": rest.replace(" ", "")
                    })
            else:
                prefix, rest = phone.strip().split(" ", 1)
                phone = [{
                    "country_code": prefix.replace("+", ""),
                    "number": rest.replace(" ", "")
                }]
        except:
            phone = []

        # Website(s), possibly comma-separated
        try:
            web = remove_tags(sel.xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get())
            if "," in web:
                web = [part.strip() for part in web.split(",")]
            else:
                web = [web.strip()]
        except:
            web = []

        # Emails: scan the whole tab panel, dropping the masked placeholder
        try:
            blob = sel.xpath('//*[@id="TabPanelBox"]').get()
            blob = blob.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', blob)
        except:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        # print(info)
        return info
    except:
        print("No info")
        return "No info"


# driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(Keys.RETURN)
# try:
#     logo = driver.find_element_by_id('LogoImageUploaderBox').get_attribute("style")
# except:
#     logo = ""
# print(logo)
# try:
#     name = driver.find_element_by_id('CompanyNameBox').text
# except:
#     name = ""
# print(name)
# try:
#     trademarks = driver.find_element_by_xpath('//*[@id="TrademarksListBox"]/li').text
# except:
#     trademarks = ""
# print(trademarks)
# try:
#     legal_form = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[1]/span[2]').text
# except:
#     legal_form = ""
# print(legal_form)
# try:
#     registration_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[1]/span[2]').text
# except:
#
registration_number = "" # print(registration_number) # try: # registration_authority = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[1]/span[2]').text # except: # registration_authority = "" # print(registration_authority) # try: # status = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[1]/span[2]').text # except: # status = "" # print(status) # try: # brands = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[1]/td[2]/span[2]').text # except: # brands = "" # print(brands) # try: # vat_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]').text # except: # vat_number = "" # print(vat_number) # try: # registration_date = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]').text # except: # registration_date = "" # print(registration_date) # try: # legal_address = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]').text # except: # legal_address = "" # print(legal_address) # try: # working_hours = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li').text # except: # working_hours = "" # print(working_hours) # try: # phone = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').text # except: # phone = "" # print(phone) # try: # website = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').text # except: # website = "" # print(website) # x = mycol.insert_one({ # "Name": name, # "Logo": logo, # "Trademarks": trademarks, # "Legal_Form": legal_form, # "Registration_Number": registration_number, # "Registration_Authority": registration_authority, # "Status": status, # "Brands": brands, # "VAT_Number": vat_number, # "Registration_Date": registration_date, # "Legal_Address": legal_address, # "Working_Hours": working_hours, # "Phone": phone, # "Website": website # }) # 
driver.find_element_by_xpath('').text # driver.find_element_by_xpath('').text # //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span/a # //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span