def parse_item(self, response):
    item = Sun0769Item()
    question = Selector(response).xpath(
        '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
    title = question.strip().split(u'编号:')[0]  # .strip().split(' ')[0]  # .split(r':')[-1]
    number = question.strip().split(' ')[-1].split(':')[-1]
    content = Selector(response).xpath(
        "//div[@class='pagecenter p3']//div[@class='contentext']/text()"
    ).extract()
    # When the post contains images
    if len(content) == 0:
        content = Selector(response).xpath(
            "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
        ).extract()
        item["content"] = "".join(content).strip()
    else:
        img = Selector(response).xpath(
            "//div[@class='pagecenter p3']//img/@src").extract()
        item["img"] = img
        item["content"] = "".join(content).strip()
    item["title"] = title
    item["number"] = number
    item["url"] = response.url
    yield item
def parse_comment(self, response):
    try:
        worth = Selector(response=response).xpath('//span[@id="rating_worthy_num"]/text()').get()
        worthless = Selector(response=response).xpath('//span[@id="rating_unworthy_num"]/text()').get()
        price = Selector(response=response).xpath('//div[@class="price"]/span/text()').get()
        url = Selector(response=response).xpath('//a[@class="img-box"]/img[@class="main-img"]/@src').get()
        if not price:
            price = Selector(response=response).xpath('//div[@class="old-price-box"]/p/span[2]/text()').get()
        goods_item = response.meta['item']
        goods_item['visible_price'] = price
        goods_item['worth'] = worth
        goods_item['worthless'] = worthless
        goods_item['url'] = url
        yield goods_item  # yield the goods item

        comment_list = Selector(response=response).xpath(
            '//ul[@class="comment_listBox"]/li[@class="comment_list"]')
        for comment_content in comment_list:
            item = CommentItem()
            item['goods_id'] = goods_item['goods_id']
            item['comment_id'] = comment_content.xpath('./@id').get().strip().split("_")[-1]
            display_time = comment_content.xpath(
                './div[@class="comment_conBox"]/div[@class="comment_avatar_time "]/div[@class="time"]/text()').get().strip()
            item['time'] = convert_time(display_time)
            item['text'] = comment_content.xpath(
                './div[@class="comment_conBox"]/div[@class="comment_conWrap"]/div[@class="comment_con"]/p/span/text()').get()
            if item['text'] and item['text'] != " ":
                yield item  # yield the comment item

        # If the comments have a next page, keep crawling it
        next_page = Selector(response=response).xpath(
            '//*[@class="pagination"]/li[@class="pagedown"]/a/@href').get()
        if next_page:
            yield scrapy.Request(url=next_page.strip(), meta={'item': goods_item}, callback=self.parse_comment)
    except Exception as e:
        print(f"Error while scraping a comment page: {str(e)}")
def parse_ranking(self, response):
    try:
        # If there is a next page, keep crawling it
        next_page = Selector(response=response).xpath('//li[@class="page-turn next-page"]/a/@href').get()
        if next_page:
            yield scrapy.Request(url=next_page.strip(), callback=self.parse_ranking)

        # Loop over each goods entry and scrape its info
        goods_list = Selector(response=response).xpath('//ul[@class="feed-list-hits"]/li')
        for goods_content in goods_list:
            item = GoodsItem()
            goods_info = goods_content.xpath(
                './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-r"]/div/div/a[@class="z-btn z-btn-red"]/@onclick').get().strip()
            # Temporarily store the detail-page link in the url field
            item['url'] = goods_content.xpath(
                './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-l"]/a[2]/@href').get().strip()
            display_time = goods_content.xpath(
                './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-r"]/span[@class="feed-block-extras"]/text()').get().strip()
            item['time'] = convert_time(display_time)
            pattern = r".*dataLayer.push.*gtmAddToCart\((.*?)\)$"
            info_text = re.search(pattern, goods_info)
            if info_text:
                goods_json = json.loads(info_text.group(1).replace("'", '"'))
                item['name'] = goods_json["name"]
                item['goods_id'] = goods_json['id']
                item['brand'] = goods_json['brand']
                item['category'] = goods_json['category']
                item['price'] = goods_json['price']
                yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse_comment)
    except Exception as e:
        print(f"Error while scraping a ranking page: {str(e)}")
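# A minimal, self-contained sketch of the gtmAddToCart parsing step above; the onclick
# string here is hypothetical but mirrors the payload shape the regex expects.
import json
import re

onclick = "dataLayer.push({}); gtmAddToCart({'name': 'Sample goods', 'id': '123', 'brand': 'Acme', 'category': 'Home', 'price': '9.9'})"
match = re.search(r".*dataLayer.push.*gtmAddToCart\((.*?)\)$", onclick)
if match:
    payload = json.loads(match.group(1).replace("'", '"'))
    print(payload["name"], payload["id"])  # Sample goods 123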
def collectSubscribe(self, p_content, p_definition, p_seqno):
    s = Selector(text=p_content)
    selectors = p_definition["selectors"]
    rooturl = p_definition["addr"]
    limit = -1
    if "limit" in p_definition:
        limit = p_definition["limit"]
    data = []
    for sel in selectors:
        xpath = sel["xpath"]
        for idx, item in enumerate(s.xpath(xpath)):
            if limit >= 0 and idx >= limit:
                break
            kv = {}
            kv["no"] = idx + 1
            # Use names other than `s` here so the Selector above is not clobbered
            # before the next entry in `selectors` is processed.
            texts = item.xpath(".//text()").getall()
            strings = ""
            if texts:
                for text in texts:
                    strings = strings + text.strip()
            kv["value"] = strings
            kv["hashcode"] = hash(strings)
            if item.xpath("@href"):
                href = item.xpath("@href").get()
                kv["href"] = geturl(rooturl, href)
            data.append(kv)
    return data
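# A minimal sketch of the inputs collectSubscribe expects; the HTML, the definition dict
# and the collector instance name are hypothetical, and geturl() is assumed to join the
# root address with a relative href as used above.
sample_definition = {
    "addr": "https://example.com/news",
    "limit": 10,
    "selectors": [{"xpath": '//ul[@class="list"]/li/a'}],
}
sample_html = '<ul class="list"><li><a href="/post/1">First post</a></li></ul>'
# rows = collector.collectSubscribe(sample_html, sample_definition, p_seqno=1)
# -> [{'no': 1, 'value': 'First post', 'hashcode': ..., 'href': 'https://example.com/post/1'}]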
def parseWaterBill(self, response):
    # Check if we found the water bill; if not, write to the failed CSV and return.
    if (len(response.xpath(
            "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblCurrentBalance']"
    )) == 0):
        print("Couldn't find a water bill for account " + response.meta['account_or_address'])
        self.writeFailedCSV(response.meta['account_or_address'])
        return None
    # I use the item feature in scrapy to store the items.
    wateritem = WaterbillItem()
    wateritem['Searched_Address'] = response.meta['search_type']  # This is a relic of when I searched by addresses.
    table = response.xpath('//table[@class="dataTable"]//tr')
    headers = [
        'Account Number', 'Service Address', 'Current Read Date',
        'Current Bill Date', 'Penalty Date', 'Current Bill Amount',
        'Previous Balance', 'Current Balance', 'Previous Read Date',
        'Last Pay Date', 'Last Pay Amount', 'TimeStamp'
    ]
    # I can't determine if this actually works because I can't find an address with a shut-off notice.
    if (len(response.xpath(
            "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']"
    )) != 0):
        wateritem['TurnOffDate'] = "Yes"
        # wateritem['TurnOffDate'] = Selector(text=row.extract()).xpath("//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']").extract_first()
    else:
        wateritem['TurnOffDate'] = 'No'
    for row in table:
        header = Selector(text=row.extract()).xpath('//th/text()').extract_first()
        value = Selector(text=row.extract()).xpath('//td/descendant::*/text()').extract_first()
        if value is None:
            value = ''  # So it populates the excel sheet with a blank spot
        if (header is not None and header.strip().replace(':', "") in headers):
            value = value.replace('$', '').replace(",", '')
            if ("Date" in header and value != ''):
                # Convert to SQL datetime format
                value = datetime.strptime(value.strip(), '%m/%d/%Y').strftime('%Y-%m-%d')
            wateritem[header.strip().replace(':', "").replace(' ', '_')] = value.strip()
    wateritem['Timestamp'] = datetime.today().strftime('%Y-%m-%d')
    return wateritem
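# A quick check of the date normalisation used above; the input string is hypothetical.
from datetime import datetime

print(datetime.strptime("07/04/2023", "%m/%d/%Y").strftime("%Y-%m-%d"))  # 2023-07-04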
def getText(response, xpaz):
    """This function reads arbitrary text at the given XPath."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        selector = ''
    return selector
def getIntObj(response, xpaz):
    """This function reads the floor and the number of rooms."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        return ''
    selector = float(selector[:3].strip())
    return selector
def obtain_rate(self, date_str):
    html_str = self._fetch_from_site(date_str)
    ratePath = Selector(text=html_str).xpath(
        "/html/body/table/tr[2]/td[1]/table/tr[2]/td[4]/table/tr/td/text()").get()
    if ratePath is None:
        return 'n/a'
    return ratePath.strip()
def parse(self, response):
    rows = response.xpath(
        '//*[@id="mainlayout"]/div[2]/div[3]/div[2]/div').extract()
    # Inspect the html and you will see our elements are all <div>
    for row in rows[1:]:  # rows[0] holds only the column names
        Dat = Selector(text=row).xpath('//div[1]/text()').extract()[1].encode('utf-8', 'ignore')
        Dat = Dat.strip()[3:5] + '-' + Dat.strip()[0:2] + '-' + Dat.strip()[6:8]
        Date = datetime.strptime(Dat, '%d-%m-%y').date()
        # The next line doesn't work because this xpath address contains ctl01, which changes for every row:
        # Symbol = Selector(text=row).xpath('//*[@id="ctl00_CPI_rptAnnouncement_ctl01_dvSymItem"]/a/text()').extract()[0].encode('utf-8', 'ignore')
        # Solution: use xpath "contains" to leave out the reference numbers:
        Symbol = Selector(text=row).xpath(
            '//*[contains(@id,"dvSymItem")]/a/text()').extract()[0].encode('utf-8', 'ignore')
        # Company has the same syntax problem, so we use "contains" again:
        Company = Selector(text=row).xpath(
            '//*[contains(@id,"dvCompItem")]/a/text()').extract()[0].encode('utf-8', 'ignore').strip()
        CurrR = Selector(text=row).xpath('//div[4]/text()').extract()[0].encode('utf-8', 'ignore')
        NewR = Selector(text=row).xpath('//div[5]/text()').extract()[0].encode('utf-8', 'ignore')
        Period = Selector(text=row).xpath('//div[6]/text()').extract()[0].encode('utf-8', 'ignore')
        item = PwItem()
        item['Date'] = Date
        item['Symbol'] = Symbol
        item['Company'] = Company
        item['CurrR'] = CurrR
        item['NewR'] = NewR
        item['Period'] = Period
        yield item
def parse_basic_info(self, response): basic_info = response.xpath( '//div[@class="zong"]//div[@class="wai"]').extract() id = 2 for info in basic_info: type = Selector( text=info).xpath('//div[@class="zi"]/text()').extract()[0] type = type.strip() name_info = Selector(text=info).xpath('//div[@id="b ' + str(id) + '"]/ul/li/ul/li').extract() id = id + 1 for fake_name in name_info: name = None url = None has_a = Selector(text=fake_name).xpath('//li/a').extract() if len(has_a) == 0: name = Selector( text=fake_name).xpath('//li/text()').extract()[0] url = None else: name = Selector( text=fake_name).xpath('//li/a/text()').extract()[0] url = Selector( text=fake_name).xpath('//li/a/@href').extract()[0] if name != None: name = name.strip() if url != None: url = url.strip() if len(url) <= 7: url = None item = SpiderLoaderItem(item=BankListItem(), response=response) item.add_value('type', type) item.add_value('name', name) item.add_value('url', url) item.add_value('longitude', '') item.add_value('latitude', '') item.add_value('address', '') item.add_value('tel', '') item.add_value('workday', '') item.add_value('table_name', 'CBRCBANK_BANK_LIST') yield item.load_item()
def parse(self, response):
    productList = Selector(text=response.body).xpath(
        '//li[contains(@class, "gl-item")]').extract()
    # $object = UPLOAD_PATH.$new_path.md5(time().mt_rand(100, 999999999)).
    #           '.'.pathinfo($file->getInfo('name'), PATHINFO_EXTENSION);
    # $new_path = 'goods'.date('Y').'/'.date('m-d').'/';
    Class = Selector(text=response.body).xpath(
        '//div[contains(@class, "p-name p-name-type-2")]//em[not(i)]').extract()
    print(Class)
    for item in productList:
        if self.num > self.getNum:
            break
        name = Selector(text=item).xpath(
            '//div[contains(@class, "p-name")]/a/em').extract()[0]
        name = filterStr.filter_tags(name)
        skuid = Selector(text=item).xpath('//li/@data-sku').extract()[0]
        price = Selector(text=item).xpath(
            '//div[contains(@class, "p-price")]/strong/i').extract()[0]
        price = filterStr.filter_tags(price)
        imgsrc = Selector(text=item).xpath(
            '//li[contains(@class, "gl-item")]//img/@src').extract()[0]
        imgsrc = imgsrc.replace('//', '')
        # Strip marketing prefixes/suffixes such as "京东超市" (JD Supermarket) from the name,
        # e.g. '京东超市金龙鱼 食用油 葵花籽清香型 食用植物调和油5L(新老包装随机发货)'
        name = name.replace("京东超市", "")
        name = name.replace("(京东定制)", "")
        name = name.replace("(京东定制装)", "")
        name = name.replace("京东自营", "")
        name = name.replace("(新老包装随机发货)", "")
        name = name.replace("新旧包装随机配送", "")
        name = name.replace("新老包装随机发放", "")
        name = name.replace("(新老包装随机发放,数量有限,赠完为止)", "")
        name = name.replace("中粮出品", "")
        name = name.replace("(中粮出品)", "")
        if "【沃尔玛】" in name:
            continue
        name = name.replace("【沃尔玛】", "")
        self.item['name'] = name.strip()
        self.item['price'] = price
        self.item['skuid'] = skuid
        # self.item['Class'] = Class
        self.item['imgsrc'] = imgsrc
        self.item['sourceType'] = SOURCE_TYPE_JD
        self.item['goods_id'] = self.insertGoods(self.item)
        self.num = self.num + 1
        yield self.item
def parse(self, response): # get all the listing blocks listings = response.xpath('//a[@class="col-xs-12 profitem"]').getall() # within each listing block get the details for i in listings: # there is more than 1 heading or suburb, just get the first one suburb = Selector(text=i).xpath( '//h4[@class="mat-header"]/text()').get().strip() # new or updated listing status = Selector(text=i).xpath( '//span[@class="mat-text-span text-uppercase mat-new hidden-xs"]/text()' ).get() # price price = Selector( text=i).xpath('//h4[@class="mat-header mat-price"]').get() # some regex to extract the price loc = re.search("</sup>", price) price = price[loc.span()[1]:] price = price.replace('<sup>', '') price = price.replace('</sup>', '') price = price.replace('</h4>', '') price = re.sub('\xa0', ' ', price) price = price.strip() # get all feature details in a list details = Selector(text=i).xpath( '//ul[@class="mat-feture"]/li/div[@class="mat-fetaure-avl"]/text()' ).getall() # listing details home_type = details[0].strip() available = details[1].strip() occupants = details[2].strip() # get description desc = Selector(text=i).xpath( '//div[@class="col-sm-4 col-md-6 hidden-xs hidden-sm mathes-list"]/p/text()' ).get().strip() desc = desc.replace('\r', '') desc = desc.replace('\n', '') listing = { 'suburb': suburb, 'status': status, 'price': price, 'home_type': home_type, 'available': available, 'occupants': occupants, 'description': desc, } yield (listing)
def getObj(response, xpaz):
    """This function reads the price."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        return ''
    price = ''
    for char in selector:
        if char != ' ' and char != '$':
            price += char
    price = int(price)
    return price
def parse_comments(self, response):
    item = CommentsItem()
    item['id'] = response.meta['id']
    item['flag'] = response.meta['flag']
    item['author'] = []
    item['author_comment'] = []
    item['time'] = []
    text = response.text
    restojson = json.loads(text)
    html = restojson['data']['html']
    html = html.split('\\n')
    html = ''.join(html)
    author_comments = Selector(text=html).xpath('//*[@class="WB_text"]').extract()
    for author_comment in author_comments:
        item['author'].append(
            Selector(text=author_comment).xpath('//a/text()').extract()[0])
        remove_author = Selector(text=author_comment).xpath('//a/text()').extract()[0]
        author_comment = dealcontent(author_comment)
        comment = Selector(text=author_comment).xpath('//text()').extract()
        comment.remove(remove_author)
        comment = ''.join(comment)
        while re.match(r'^ ', comment):
            comment = comment.strip(' ')
        item['author_comment'].append(comment)
    if item['flag'] == 'forwarded':
        item['time'] = Selector(text=html).xpath(
            '//*[@class="WB_from S_txt2"]/a/@title').extract()
    if item['flag'] == 'comment':
        item['time'] = Selector(text=html).xpath(
            '//*[@class="WB_from S_txt2"]/text()').extract()
    item['like_count'] = Selector(text=html).xpath(
        '//span[@node-type="like_status"]/em[2]/text()').extract()
    lens = len(item['like_count'])
    for i in range(0, lens):
        item['like_count'][i] = item['like_count'][i].replace('赞', '0')
    yield copy.deepcopy(item)
def parse(self, response):
    folders = response.xpath('//div[@class="AccordionPanel"]').extract()
    for folder in folders:
        year = Selector(text=folder).xpath('//div[@class="AccordionPanelTab"]/text()').extract_first()
        os.makedirs('./newyork/' + year)
        rows = Selector(text=folder).xpath('//table[@class="listingTable"]/tbody/tr').extract()
        for row in rows:
            name = Selector(text=row).xpath('//td[@headers="Name"]/text()').extract_first()
            name = name.strip() if name is not None else ''
            length = len(name)
            name = name if length < 235 else name[:235]
            date = Selector(text=row).xpath('//td[contains(@headers, "Date")]/text()').extract_first()
            href = Selector(text=row).xpath(
                '//td/a[contains(text(), "Transcript") or contains(text(), "Transcript and Testimony")]/@href').extract_first()
            t_type = Selector(text=row).xpath(
                '//td/a[contains(text(), "Transcript") or contains(text(), "Transcript and Testimony")]/text()').extract_first()
            if href is not None and 'Transcript' in t_type:
                yield Request(href, callback=self.parsetranscript,
                              meta={'year': year, 'name': name, 'date': date, 'download_timeout': 3500},
                              dont_filter=True)
            else:
                continue
def Company_Info(link):
    print("Trying to get Identification Number")
    url = link
    page = requests.get(url)
    vat = Selector(response=page).xpath(
        '/html/body/div[4]/div/div[2]/div/div[1]/div[1]/div[contains(., "Identification")]/span[2]/text()').get()
    try:
        vat = vat.strip()
    except:
        vat = None
    data = {
        "vat": vat
    }
    # print(data)
    return data


# Company_Info("https://www.yell.ge/company.php?lan=eng&id=139568")
# print(main)
def getSpace(response, xpaz):
    """This function returns the sizes of an apartment."""
    spaces = Selector(response=response).xpath(xpaz).get()
    if isinstance(spaces, str):
        spaces = spaces.strip()
    else:
        return ''
    total_split = [i for i in re.split(r'(\d+.\d+|\W+)', spaces) if i]
    total_space = 0
    live_space = 0
    kitchen_space = 0
    sqear = [None] * 3
    index = 0
    for string in total_split:
        if string.replace('.', '', 1).isdigit():
            sqear[index] = string
            index += 1
    if sqear[2] is None:
        if sqear[0]:
            total_space = float(sqear[0])
        else:
            total_space = 0  # was "total_split = 0", which never affected the result
        if sqear[1]:
            kitchen_space = float(sqear[1])
        else:
            kitchen_space = 0  # was "live_space = 0"; both already default to 0
    else:
        total_space = float(sqear[0])
        live_space = float(sqear[1])
        kitchen_space = float(sqear[2])
    return total_space, live_space, kitchen_space
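# A minimal sketch of how the helpers above (getText/getObj/getSpace) can be exercised
# against a canned response; the HTML, URL and XPaths are hypothetical, and the helpers
# are assumed to live in this module.
from scrapy.http import HtmlResponse

fake = HtmlResponse(
    url="https://example.com/flat/1",
    body=b'<div class="area">54.2 / 30.1 / 9.5 m2</div><div class="price">74 000 $</div>',
    encoding="utf-8",
)
total_space, live_space, kitchen_space = getSpace(fake, '//div[@class="area"]/text()')
price = getObj(fake, '//div[@class="price"]/text()')
print(total_space, live_space, kitchen_space, price)  # 54.2 30.1 9.5 74000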
position = "" # Company try: company = Selector(response=page).xpath( f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[1]/a/text()' ).get() except: company = "" # Published try: published = Selector(response=page).xpath( f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[1]/text()' ).get() published = published.strip().split(",") publish_year = int(published[1].strip()) publish_day = int(published[0].split(" ")[1]) publish_month = int(months[published[0].split(" ")[0]]) except: publish_year = 0 publish_day = 0 publish_month = 0 if yesterday_day != publish_day or yesterday_month != publish_month: print("Not published yesterday") continue # Ends try: ends = Selector(response=page).xpath( f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[2]/text()'
candidates = []
for county, tid in ref.items():
    print(county)
    r = requests.get('https://kmt2018.com/candidate_json.asp?tid=%s&cid=2' % tid)
    if r.status_code == 500:
        continue
    r.encoding = 'utf-8'
    cs = r.json()
    cs = [x for x in cs if x['name'] != u'陸續更新中']
    for candidate in cs:
        print(candidate['name'])
        rd = requests.get('https://kmt2018.com/read_candidate.asp?ids=%s' % candidate['uid'])
        rd.encoding = 'utf-8'
        x = Selector(text=rd.text, type='html')
        for desc in x.css('.desc .title'):
            content = '\n'.join([t.strip() for t in desc.xpath('following-sibling::div[1]//text()').extract() if t.strip()])
            if desc.xpath('text()').extract_first() == u'競選口號':
                candidate['slogan'] = content
            elif desc.xpath('text()').extract_first() == u'經歷':
                candidate['experience'] = content
            elif desc.xpath('text()').extract_first() == u'學歷':
                candidate['education'] = content
        candidate['name'] = re.sub(r'\s', '', candidate['name'])
        candidate['county'] = county
        candidate['constituency'] = normalize_constituency(candidate['desc'])
        img_link = candidate['picture']
        f_name = '%s_%d_%s.%s' % (candidate['county'], candidate['constituency'], candidate['name'],
                                  img_link.split('.')[-1].split('?')[0])
        f = '%s/%s' % (path, f_name)
        cmd = 'wget -N --no-check-certificate "%s" -O %s' % (img_link, f)
        subprocess.call(cmd, shell=True)
        candidate['image'] = u'%s/%s/%s/%s/%s' % (common.storage_domain(), 'councilors', '2018', u'中國國民黨', f_name)
def Vacancy(link): print("request sent for Vacancy succesfully") url = link # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Location try: location = Selector(response=page).xpath( '/html/body/div[2]/table/tr[contains(., "Location:")]').get() location = location.split("<td>")[1].split("</td>")[0].replace( "&nbsp", " ") location = location.split(",")[0] location = [{'city': location, 'id': Geonames(location)}] except: location = [{'city': 'Yerevan', 'id': '616052'}] # Company url try: c_url = Selector(response=page).xpath( '/html/body/div[2]/table/tr[contains(., "Company:")]').get() c_url = c_url.split('href="')[1].split('">')[0] except: c_url = "" # Vacancy Description try: description = Selector(response=page).xpath('/html/body/div[4]').get() description = remove_tags(description) description = description.strip() description = description.replace('&nbsp', " ") except: description = "" try: if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Email try: email = Selector(response=page).xpath('//*[@id="job"]/a/@href').get() email = email.replace('mailto:', "") email = [email] except: email = [] data = { "location": location, "c_link": c_url, "description_am": description_am, "description_en": description_en, "email": email } # print(data) return data
deadline_year = 0 # Email try: email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0] except Exception as e: email = [] # Publication stuff v_page = requests.get(v_link) try: published = Selector(response=v_page).xpath( '//*[@id="ContentplaceholderMain_T7553F19B005_Col00"]/div[2]/div[2]/div[1]/div[1]/text()' ).get() published = published.strip() published = published.split(" ") publish_day = published[1].replace(",", "") publish_day = int(publish_day) publish_month = int(months[f"{published[0]}"]) publish_year = int(published[2]) except: published = 0 publish_month = 0 publish_year = 0 if publish_day != yesterday_day: print("Not published Yesterday") continue data = { "company": company,
def parse(self, response):
    regions = response.xpath('//h2').xpath('@id').extract()
    non_us_cities = ['phnom penh', 'shanghai', 'hangzhou']
    for region in regions:
        region_body = response.xpath(
            '//h2[@id = "{region}"]/following-sibling::ul[@class = "reglist"]'.format(region=region)).get()
        pois = Selector(text=region_body).css('.loc').extract()
        for poi in pois:
            ref = Selector(text=poi).xpath('//a/@href').extract()[0].split('/')[-1]
            name = Selector(text=poi).xpath('//h4/text()').extract()
            if name == []:
                name = Selector(text=poi).xpath('//h3/text()').extract()
            name = ''.join(name)
            map_link = Selector(text=poi).xpath('//div[@class = "locaddress"]/a').xpath('@href').extract_first()
            lat, long = None, None
            if 'daddr' in map_link:
                coords = map_link.split('daddr=')[1].split(',')
                lat = coords[0]
                long = coords[1]
            addr = Selector(text=poi).xpath('//div[@class = "locaddress"]/a').extract_first()
            addr = Selector(text=addr).xpath('//a/text()').extract()
            addr = [a.strip() for a in addr]
            addr_full = ', '.join(addr)
            street = ', '.join(addr[:-1])
            city, state, postcode = None, None, None
            if region in ['cambodia', 'china']:
                for c in non_us_cities:
                    if c in poi.lower():
                        city = c.capitalize()
                country = region.capitalize()
            else:
                city = addr[-1].split(', ')[0]
                state_postcode = addr[-1].split(', ')[1].split(' ')
                if len(state_postcode) > 1:
                    state = state_postcode[0]
                    postcode = state_postcode[1]
                country = "US"
            phone = Selector(text=poi).xpath('//div[@class="locinfo"]/text()').get()
            phone = phone.strip() if phone else None
            opening_hours = Selector(text=poi).xpath('//div[@class="lochrs"]/text()').extract()
            opening_hours = ', '.join([hours.strip() for hours in opening_hours]) if opening_hours else None
            properties = {
                'ref': ref,
                'website': 'https://www.habitburger.com/locations/' + ref,
                'name': name,
                'addr_full': addr_full,
                'street': street,
                'city': city,
                'state': state,
                'postcode': postcode,
                'country': country,
                'phone': phone,
                'opening_hours': opening_hours,
                'lat': lat,
                'lon': long,
            }
            yield GeojsonPointItem(**properties)
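# A small, self-contained check of the "daddr" coordinate parsing used above; the map
# link is hypothetical.
map_link = "https://maps.google.com/?saddr=&daddr=34.0522,-118.2437"
lat = lon = None
if 'daddr' in map_link:
    coords = map_link.split('daddr=')[1].split(',')
    lat, lon = coords[0], coords[1]
print(lat, lon)  # 34.0522 -118.2437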
def parse(self, response): NATS_SERVER_HOME = os.environ['NATS_SERVER_HOME'] WEATHER_DIR_PATH = NATS_SERVER_HOME + "/share/tg/weather" page = response.url.split("/")[-2] htmlDoc = response.body # We check the URL and distinguish the work flow # Different URL page contains different info. The page formats are different. We have to handle them individually. # If URL contains "sigmet" if ('sigmet' in response.url): filename_SIGMET = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + '.sigmet' file_SIGMET = open(WEATHER_DIR_PATH + "/" + filename_SIGMET, 'wb') title = Selector(text=htmlDoc).xpath( '//div[@id="awc_main_content"]/div[@id="title"]/text()' ).extract_first() if (len(title.strip()) > 0): file_SIGMET.write(title + "\n\n") array_children = Selector( text=htmlDoc).xpath('//div[@id="awc_main_content"]/*') for i in range(0, len(array_children)): node_layer_1 = array_children[i] if ('p' == node_layer_1.xpath('name()').extract()[0]): node_layer_2 = node_layer_1.xpath('//b') if not (node_layer_2 is None): file_SIGMET.write( node_layer_2.xpath('text()').extract()[0] + '\n') elif ('b' == node_layer_1.xpath('name()').extract()[0]): file_SIGMET.write( node_layer_1.xpath('text()').extract()[0] + '\n') elif ('pre' == node_layer_1.xpath('name()').extract()[0]): file_SIGMET.write( node_layer_1.xpath('text()').extract()[0] + '\n\n') file_SIGMET.close() elif ('metar' in response.url): # If URL contains "metar" filename_METAR = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + '.metar' file_METAR = open(WEATHER_DIR_PATH + "/" + filename_METAR, 'wb') title = Selector(text=htmlDoc).xpath( '//div[@id="awc_main_content"]/div[@id="title"]/text()' ).extract_first() if (len(title.strip()) > 0): file_METAR.write(title + "\n\n") array_children = Selector( text=htmlDoc).xpath('//div[@id="awc_main_content"]/*') for i in range(0, len(array_children)): node_layer_1 = array_children[i] if ('p' == node_layer_1.xpath('name()').extract()[0]): node_layer_2 = node_layer_1.xpath('//b') if not (node_layer_2 is None): file_METAR.write( node_layer_2.xpath('text()').extract()[0] + '\n') elif ('b' == node_layer_1.xpath('name()').extract()[0]): file_METAR.write( node_layer_1.xpath('text()').extract()[0] + '\n') array_data = Selector( text=htmlDoc).xpath('//div[@id="awc_main_content"]/text()') for i in range(0, len(array_data)): if (len(array_data[i].extract().strip()) > 0): file_METAR.write("\n" + array_data[i].extract()) file_METAR.close() elif ('airep' in response.url): # If URL contains "airep" filename_AIREP = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + '.airep' file_AIREP = open(WEATHER_DIR_PATH + "/" + filename_AIREP, 'wb') title = Selector(text=htmlDoc).xpath( '//div[@id="awc_main_content"]/div[@id="title"]/text()' ).extract_first() if (len(title.strip()) > 0): file_AIREP.write(title + "\n\n") array_children = Selector( text=htmlDoc).xpath('//div[@id="awc_main_content"]/div/*') for i in range(0, len(array_children)): node_layer_1 = array_children[i] if ('p' == node_layer_1.xpath('name()').extract()[0]): node_layer_2 = node_layer_1.xpath('//b') if not (node_layer_2 is None): file_AIREP.write( node_layer_2.xpath('text()').extract()[0] + '\n\n') elif ('code' == node_layer_1.xpath('name()').extract()[0]): file_AIREP.write( node_layer_1.xpath('text()').extract()[0] + '\n') file_AIREP.close()
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Location try: location = Selector(response=page).xpath( '/html/body/main/section/div/div[1]/div[3]/ul/li[3]/a/text()').get( ) location = location.strip() location = location.split(",")[0] location = [{"city": location, "id": Geonames(location)}] except: location = [{"city": "Yerevan", "id": "616052"}] # Website try: website = Selector(response=page).xpath( '/html/body/main/section/div/div[1]/div[3]/ul/li[4]/a/@href').get( ) if website is None: website = [] else: website = [website] except: website = [] # Job Type try: job_type = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[3]/text()').get() job_type = job_type.strip() except: job_type # Published try: published = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[7]/text()').get() published = published.strip() except: published = "" # Salary try: salary = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[2]/text()').get() salary = salary.strip() salary = salary.replace("֏", "") salary = salary.replace(",", "") salary = salary.replace(" ", "") salary = int(salary) except: salary = 0 # Gender try: gender = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[4]/text()[2]').get() gender = gender.strip() except: gender = "" # Description try: description = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/p').get() description = remove_tags(description).strip() except: description = "" try: if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Email try: driver.get(link) email = driver.find_element_by_xpath( '/html/body/main/section/div/div[2]/div/p').text email = re.findall(r'[\w\.-]+@[\w\.-]+', email) except Exception as e: email = [] data = { "location": location, "website": website, "job_type": job_type, "publish_day": published, "salary": salary, "gender": gender, "description_am": description_am, "description_en": description_en, "email": email } # print(data) return data # Vacancy("https://www.worknet.am/en/job/%D5%A2%D5%A1%D5%B6%D5%BE%D5%B8%D6%80-%D5%BA%D5%A1%D5%B0%D5%A5%D5%BD%D5%BF%D5%AB-%D5%A1%D5%B7%D5%AD%D5%A1%D5%BF%D5%A1%D5%AF%D5%AB%D6%81-4656")
def parse_detail(self, response): loan_name = response.xpath('//h1/text()').extract() if not loan_name: loan_name = "" else: loan_name = loan_name[0].strip().replace(" ", "") mortgage_info = response.xpath( '//span[@class="item doc-color-red"]/span/text()').extract() if not mortgage_info: mortgage_info = "" else: mortgage_info = mortgage_info[0].strip().replace(" ", "") identity_limit = response.xpath( '//span[@class="spec can-reg"]/text()').extract() if not identity_limit: identity_limit = "" else: identity_limit = identity_limit[0].strip().replace(" ", "") lending_time_info = response.xpath( '//span[@class="spec fangkuan"]/text()').extract() if not lending_time_info: lending_time_info = "" else: lending_time_info = lending_time_info[0].strip() prepayment_requirement = response.xpath( '//span[@class="doc-color-tail"]/*/@hover-tip').extract() if not prepayment_requirement: prepayment_requirement = response.xpath( '//span[@class="doc-color-tail"]/span/text()').extract() if prepayment_requirement: prepayment_requirement = prepayment_requirement[0].strip() else: prepayment_requirement = "" else: prepayment_requirement = Selector( text=prepayment_requirement[0]).xpath( "//span/text()").extract()[0] prepayment_requirement = prepayment_requirement.strip() extra_info = response.xpath( '//meta[@name="description"]/@content').extract() if extra_info: extra_info = extra_info[0].strip() else: extra_info = "" detail = response.xpath( '//div[@class="pd_other_item_content"]/text()').extract() item = response.meta['item'] item['loan_name'] = loan_name item['mortgage_info'] = mortgage_info item['identity_limit_info'] = identity_limit item['lending_time_info'] = lending_time_info item['extra_info'] = extra_info item['prepayment_requirement'] = prepayment_requirement requrement_detail = "" if detail: for dl in detail: requrement_detail += dl.strip() item['requirement_detail'] = requrement_detail tmp_cookie = _cookie tmp_cookie['cityDomain'] = item['city'] tmp_cookie['my_city'] = item['city'] referer = "http://www.rong360.com/p_" + item['loan_id'] tmp_header = _headers tmp_header['Referer'] = referer if item['loan_type'] == self.household_loan: for loan_amt in range(self.loan_amt_min, self.loan_amt_max + 1, self.loan_amt_gap): for loan_duration in range(self.loan_duration_min, self.loan_duration_max + 1, self.loan_duration_gap): tmp_form_data = _form_d tmp_form_data['loan_limit'] = str(loan_amt) tmp_form_data['loan_term'] = str(loan_duration) yield scrapy.FormRequest( self.interest_url, formdata=tmp_form_data, cookies=tmp_cookie, headers=tmp_header, method=_method, meta={'item': item}, callback=self.parse_interest, dont_filter=True, ) elif item['loan_type'] == self.zero_payment_loan: tmp_form_data = _form_d tmp_form_data['loan_limit'] = str(item['loan_amt']) tmp_form_data['loan_term'] = str(item['loan_duration']) yield scrapy.FormRequest( self.interest_url, formdata=tmp_form_data, cookies=tmp_cookie, headers=tmp_header, method=_method, meta={'item': item}, callback=self.parse_interest, dont_filter=True, ) else: pass pass
async def get_links_from_url(definition):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been made
    absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    url = definition["addr"]
    selectors = definition["selectors"]
    level = definition["level"]
    parent = ""
    if "parent" in definition:
        parent = definition["parent"]
    contents = []
    urls = []
    # response = await httpclient.AsyncHTTPClient().fetch(url)
    try:
        response = await http_client.fetch(url, method='GET', headers=http_header, validate_cert=False)
    except Exception as e:
        print("Error: %s" % e)
    else:
        print("fetched %s" % url)
        # print(response.body)
        # html = response.body.decode(errors="ignore")
        s = Selector(text=response.body)
        print(s)
        for selector in selectors:
            xpath = selector["xpath"]
            wrap = definition["wrap"]
            kv = {}
            if wrap == 1:
                strings = ""
                kv["url"] = url
                kv["no"] = 1
                # Use names other than `s` below so the Selector is not clobbered
                # before the next entry in `selectors` is processed.
                for text in s.xpath(xpath).xpath(".//text()").getall():
                    strings = strings + text.strip()
                kv["value"] = strings
                kv["hashcode"] = hash(strings)
                contents.append(kv)
            else:
                # content.append(s.xpath(xpath).xpath(".//text()").getall())
                for idx, item in enumerate(s.xpath(xpath)):
                    # contents.append({"parent": parent, "rownum": level + "-" + str(idx),
                    #                  "data": {"value": item.xpath(".//text()").getall()}})
                    kv = {}
                    kv["url"] = url
                    kv["no"] = idx + 1
                    texts = item.xpath(".//text()").getall()
                    if texts:
                        strings = ""
                        for text in texts:
                            strings = strings + text.strip()
                        kv["value"] = strings
                        kv["hashcode"] = hash(strings)
                    else:
                        kv["value"] = ""
                        kv["hashcode"] = ""
                    if selector["extract"] == 1:
                        href = item.xpath("@href")
                        if href:
                            kv["href"] = geturl(url, href.get())
                            # urls.append({"addr": href.get(), "parent": level + "-" + str(idx)})
                            urls.append(kv["href"])
                    contents.append(kv)
    # return [urljoin(url, remove_fragment(new_url)) for new_url in get_links(html)]
    return contents, urls
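# A minimal sketch of a `definition` dict for get_links_from_url and how it might be
# driven from Tornado's IOLoop; the URL, xpath and field values are hypothetical, and
# http_client/http_header/geturl are assumed to be defined elsewhere in this module.
from tornado.ioloop import IOLoop

definition = {
    "addr": "https://example.com/index.html",
    "level": "0",
    "wrap": 0,
    "selectors": [{"xpath": '//a[@class="entry"]', "extract": 1}],
}
# contents, urls = IOLoop.current().run_sync(lambda: get_links_from_url(definition))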
try: name = Selector(response=page).xpath( '//*[@id="person-profile"]/div/h2/text()').get() except: name = "" if name is None or name == "": f = open("check.txt", "a") f.write(f"id: {i} was Not captured\n") continue # ID try: _id = Selector(response=page).xpath( '//*[@id="person-attributes"]/tbody/tr/td[2]/text()').get() _id = _id.strip() except: _id = "" whatsthere = people.find_one({"identification_number": _id}) if whatsthere is None: # Listed Affiliations affiliations = [] num_1 = len( Selector(response=page).xpath( '//*[@id="affiliations-list"]/tbody/tr').getall()) + 1 for tr in range(1, num_1): # Company try:
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Published try: published = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]' ).get() published = published.strip().split(" ") publish_day = int(published[0].split("/")[0]) publish_month = int(published[0].split("/")[1]) publish_year = int(published[0].split("/")[2]) except Exception as e: publish_day = 0 publish_month = 0 publish_year = 0 if yesterday_day != publish_day or yesterday_month != publish_month: print("Not published yesterday") return # Location # try: location = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()' ).get() location = location.strip() location_id = [] location = {"city": f"{location}", "id": f"{Geonames(location)}"} location_id.append(location) except: location_id = [{'city': 'Yerevan', 'id': '616052'}] # Posted by try: posted_by = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()' ).get() posted_by = posted_by.strip() except: posted_by = "" # Email try: email = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()' ).get() email = email.strip() if email == "": email = [] else: email = [email] except: email = [] # Workspace try: workspace = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()' ).get() workspace = workspace.strip() except: workspace = "" # Job_type try: job_type = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()' ).get() job_type = job_type.strip() except: job_type = "" # Salary try: salary = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()' ).get() salary = salary.strip().replace("Until ", "") if "-" in salary: salary = salary.split("-") min_salary = int(salary[0].strip()) max_salary = int(salary[1].strip()) elif "-" not in salary and salary != '': min_salary = int(salary) max_salary = int(salary) else: min_salary = 0 max_salary = 0 except: min_salary = 0 max_salary = 0 # Education try: education = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()' ).get() education = education.strip() except: education = "" # Experience try: experience = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()' ).get() experience = experience.strip() except: experience = "" # Gender try: gender = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class' ).get() if "female" in gender: gender = "female" elif "male" in gender: gender = "male" else: gender = '' except: gender = "" # Age try: age = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()' ).get() age = age.strip() except: age = "" print(1) # Description try: description = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()' ).get() description = description.strip() except: description = "" description_en = "" description_am = "" try: if detect(description) == "et": try: 
description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Phone try: phone = Selector(response=page).css( '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details' ).extract() phones = [] for phone in phone: phone = remove_tags(phone).strip() area_code = "374" number = phone.replace(" ", "") number = number.replace("-", "") number = number.replace("(", "") number = number.replace(")", "") phones.append({'country_code': area_code, "number": number}) except: phone = [] # Username try: username = Selector(response=page).xpath( '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()' ).get() username = username.strip() except: username = "" data = { "publish_day": publish_day, "publish_month": publish_month, "publish_year": publish_year, "location_id": location_id, "posted_by": posted_by, "email": email, "workspace": workspace, "job_type": job_type, "min_salary": min_salary, "max_salary": max_salary, "education": education, "experience": experience, "gender": gender, "age": age, "description_am": description_am, "description_en": description_en, "phone": phones, "username": username } print(data) return data # Vacancy("https://full.am/en/job/public/view/1163") # https://full.am/en/job/public/view/12067 # https://full.am/en/job/public/view/1163
def parse(self, response): item = {} # get company name: comp_name = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/h3/text()' ).extract() if len(comp_name) > 0: comp_name = " ".join(comp_name[0].split()) item['company name'] = comp_name else: comp_name2 = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/text()' ).extract() if len(comp_name2) > 0: comp_name2 = " ".join(comp_name2[0].split()) item['company name'] = comp_name2 # get company url: comp_url = response.url item['company_url'] = comp_url # get company address: address = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/p[1]/text()' ).extract() #extract the data list address join_address = " ".join("".join(address).split()) item['company address'] = join_address # get company country: item['country'] = "Singapore" #as default # get company phone & fax: phone = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuephone"]/a/text()' ).extract()[0].strip() if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuefax"]/a/text()' ).extract()) > 0: fax = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuefax"]/a/text()' ).extract()[0].strip() item['company phone number'] = [ " ".join(phone.split()), " ".join(fax.split()) ] else: item['company phone number'] = [" ".join(phone.split())] # get company email: if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/a[@id="textemail"]/@onclick' ).extract()) > 0: email = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/a[@id="textemail"]/@onclick' ).extract()[0] item['company email'] = email.split("'")[1] # get company website: if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuewebsite"]/a/@href' ).extract()) > 0: web = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuewebsite"]/a/@href' ).extract()[0] item['company website'] = web.strip() # get company description: if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="company-description"]/text()' ).extract()) > 0: comp_description = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="company-description"]/text()' ).extract() comp_description = "".join(comp_description) if comp_description.strip() != "": item['company description'] = comp_description.strip() # get company product & services: if len( Selector(response).xpath( 
'//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="owl-carousel-container"]/div[1]/div[@class="item"]/a/img/@title' ).extract()) > 0: comp_ps = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="owl-carousel-container"]/div[1]/div[@class="item"]/a/img/@title' ).extract() item['products and services'] = comp_ps # get company categories: if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="company-description"]/ul/ul/li/a/text()' ).extract()) > 0: comp_cat = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-12"]/div[@class="company-description"]/ul/ul/li/a/text()' ).extract() item['category'] = comp_cat # get company contacts: if len( Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/p[2]/text()' ).extract()) > 0: contacts_raw = Selector(response).xpath( '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/p[2]/text()' ).extract() contacts = [] for elem in contacts_raw: elem = elem.strip() if elem != "Contact": if "Tel" not in elem: if "Mobile" not in elem: if "mail" not in elem: elem = elem.split(",") el = {} if len(elem) > 1: el['job_title'] = elem[1] el['name'] = elem[0] contacts.append(el) item['contacts'] = contacts elif len(elem) == 1: if elem[0] != "": el['name'] = elem[0] contacts.append(el) item['contacts'] = contacts # still can't handle for email's contact # emails_raw = Selector(response).xpath('//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/p[2]/a/text()').extract() yield item
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Company try: company = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/text()' ).get() except: company = "" # Website try: website = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/@href' ).get() website = [website] except: website = [] # Position try: position = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobPostTitle"]/text()' ).get() except: position = "" # logo try: logo = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_imgCompanyLogoLink"]/@src' ).get() logo = "http://jobfinder.am/" + logo except: logo = '' # Job_type try: job_type = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblPositionType"]/text()' ).get() except: job_type = "" # Category try: category = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblCategory"]/text()' ).get() except: category = "" # Experience try: experience = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblExperience"]/text()' ).get() except: experience = "" # Education try: education = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblEducation"]/text()' ).get() except: education = "" # Location try: location = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblLocation"]/text()' ).get() except: location = "" # Published try: published = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()' ).get() published = published.split(" ") published = published[0].split("-") publish_day = int(published[0]) publish_month = int(published[1]) publish_year = int("20" + published[2]) except: publish_day = 0 publish_month = 0 publish_year = 0 if yesterday_day != publish_day or yesterday_month != publish_month: print("Not published yesterday") return # Ends try: ends = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()' ).get() ends = ends.split(" ") ends = ends[0].split("-") deadline_day = int(ends[0]) deadline_month = int(ends[1]) deadline_year = int("20" + ends[2]) except: deadline_day = 0 deadline_month = 0 deadline_year = 0 # Salary try: salary = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblSalary"]/text()' ).get() salary = int(salary) except: salary = 0 # Age try: age = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAge"]/text()' ).get() if "--------" in age: age = "" except: age = "" # Gender try: gender = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblGender"]/text()' ).get() if "--------" in gender: gender = "" except: gender = "" # Job Description try: j_description = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobDescription"]/text()' ).get() except: j_description = "" # Job Responsibilities try: j_responsibilities = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobResponsibilities"]/text()' ).get() except: j_responsibilities = "" # Required 
Qualifications try: r_qualifications = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblRequiredQualifications"]' ).get() r_qualifications = remove_tags(r_qualifications) except: r_qualifications = "" # Application Procedure try: a_procedure = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]' ).get() a_procedure = remove_tags(a_procedure) except: a_procedure = remove_tags(a_procedure) v_description = j_description + "\n" + j_responsibilities + "\n" + r_qualifications + "\n" + a_procedure try: if detect(v_description) == "et": try: v_description_en = Translate(v_description) except: v_description_en = "" v_description_am = v_description else: v_description_en = v_description v_description_am = "" except: v_description_en = "" v_description_am = "" # About Company try: c_description = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAboutCompany"]' ).get() c_description = remove_tags(c_description) except: c_description = "" try: if detect(c_description) == "et": try: c_description_en = Translate(c_description) except: c_description_en = "" c_description_am = c_description else: c_description_en = c_description c_description_am = "" except: c_description_en = "" c_description_am = "" # Email try: email = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]/a/text()' ).get() email = email.strip() email = [email] except: email = [] # Phone try: phone = re.search(r"\d{9}", v_description_en).group() phone = [{"country_code": "374", "number": phone}] except: phone = [] data = { "company": company, "position": position, "website": website, "logo": logo, "job_type": job_type, "category": category, "experience": experience, "education": education, "location": location, "publish_day": publish_day, "publish_month": publish_month, "publish_year": publish_year, "deadline_day": deadline_day, "deadline_month": deadline_month, "deadline_year": deadline_year, "salary": salary, "age": age, "gender": gender, "v_description_am": v_description_am, "v_description_en": v_description_en, "c_description_am": c_description_am, "c_description_en": c_description_en, "email": email, "phone": phone, } # print(data) return data # Vacancy('http://jobfinder.am/ViewJob.aspx?JobPostingID=49217')
def Vacancy(link): url = link headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36", "Accept-Language": "en-US,en;q=0.9,ru;q=0.8" } page = requests.get(url, headers=headers) # Company try: company = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get() except: company = "" # position try: position = Selector(response=page).xpath( '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get() except: position = "" # logo try: logo = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get() except: logo = "" # Job_type try: job_type = Selector(response=page).xpath( '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]' ).get() job_type = job_type.strip() except: job_type = "" # Contact Person try: person = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get( ) person = person.strip() except: person = "" # Email try: email = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get( ) email = email.strip() email = [email] except: email = [] # Phone try: phone = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get( ) phone = phone.strip() if "," in phone: phones = phone.split(",") phone = [] for each in phones: each = each.strip() if "+" in each and " " in each: number = each.split(" ", 1)[1].replace('-', "").replace(" ", "") country_code = each.split(" ", 1)[0].replace('+', "") phone.append({ "country_code": country_code, "number": number }) elif "+" in each and " " not in each: if "+374" in each: country_code = "374" number = each.replace("+374", "") phone.append({ "country_code": country_code, "number": number }) elif "+1" in each: country_code = "1" number = each.replace("+1", "") phone.append({ "country_code": country_code, "number": number }) else: country_code = "374" number = each phone.append({ "country_code": country_code, "number": number }) elif "+" not in each: number = each.replace('-', "").replace(" ", "") country_code = "374" phone.append({ "country_code": country_code, "number": number }) else: if "+" in phone and " " in phone: number = phone.split(" ", 1)[1].replace('-', "").replace(" ", "") country_code = phone.split(" ", 1)[0].replace('+', "") phone = [{"country_code": country_code, "number": number}] elif "+" in phone and " " not in phone: if "+374" in phone: country_code = "374" number = phone.replace("+374", "") phone = [{"country_code": country_code, "number": number}] elif "+1" in phone: country_code = "1" number = phone.replace("+1", "") phone = [{"country_code": country_code, "number": number}] else: country_code = "374" number = phone phone = [{"country_code": country_code, "number": number}] elif "+" not in phone: number = phone.replace('-', "").replace(" ", "") country_code = "374" phone = [{"country_code": country_code, "number": number}] except Exception as e: phone = [] # Website try: website = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get( ) website = website.strip() if "not" in website: website = [] else: website = [website] except: website = [] # Published try: published = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get() published = published.strip() publish_day = int(published.split("-")[2]) publish_month = int(published.split("-")[1]) publish_year = 
int(published.split("-")[0]) except: publish_day = 0 publish_month = 0 publish_year = 0 # Ends try: ends = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get() ends = ends.strip() deadline_day = int(ends.split("-")[2]) deadline_month = int(ends.split("-")[1]) deadline_year = int(ends.split("-")[0]) except: deadline_day = 0 deadline_month = 0 deadline_year = 0 # Career Level try: career_level = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get( ) if career_level == None: career_level = "" except: career_level = "" # Education try: education = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get( ) if education == None: education = "" except: education = "" # Experience try: experience = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get( ) if experience == None: experience = "" except: experience = "" # Salary try: salary = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get( ) if "-" in salary: salary = salary.split("-") min_salary = salary[0].strip() min_salary = int(min_salary.replace(".", "")) max_salary = salary[1].strip() max_salary = int(max_salary.replace('.', "")) elif "-" not in salary and salary != "N/A": min_salary = int(salary.replace(".")) max_salary = int(salary.replace(".")) else: min_salary = 0 max_salary = 0 except: min_salary = 0 max_salary = 0 # Vacancy Description try: v_description = Selector( response=page).xpath('//*[@id="loyal"]/div[2]/div/div[1]').get() v_description = remove_tags(v_description).strip() v_description = v_description.replace('\xa0', " ") except: v_description = "" try: if detect(v_description) == "et": try: v_description_en = Translate(v_description) except: v_description_en = " " v_description_am = v_description else: v_description_en = v_description v_description_am = "" except: v_description_am = "" v_description_en = "" # Company Description try: c_description = Selector(response=page).xpath( '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get() c_description = c_description.strip() except: c_description = "" try: if detect(c_description) == "et": try: c_description_en = Translate(c_description) except: c_description_en = " " c_description_am = c_description else: c_description_en = c_description c_description_am = "" except: c_description_am = "" c_description_en = "" # c_descrip ; //*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text() data = { "company": company, "position": position, "logo": logo, "person": person, "job_type": job_type, "email": email, "phone": phone, "website": website, "publish_day": publish_day, "publish_month": publish_month, "publish_year": publish_year, "deadline_day": deadline_day, "deadline_month": deadline_month, "deadline_year": deadline_year, "career_level": career_level, "education": education, "experience": experience, "min_salary": min_salary, "max_salary": max_salary, "v_description_am": v_description_am, "v_description_en": v_description_en, "c_description_am": c_description_am, "c_description_en": c_description_en, } print(data) return data # Vacancy("https://rezume.am/job/2184")