Code Example #1
File: dong.py Project: MrWgEvan/python2_7_test
    def parse_item(self, response):

        item = Sun0769Item()

        question = Selector(response).xpath(
            '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
        title = question.strip().split(u'编号:')[0]
        #.strip().split(' ')[0]
        #.split(r':')[-1]
        number = question.strip().split(' ')[-1].split(':')[-1]

        content = Selector(response).xpath(
            "//div[@class='pagecenter p3']//div[@class='contentext']/text()"
        ).extract()
        # when there are images
        if len(content) == 0:
            content = Selector(response).xpath(
                " //div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
            ).extract()
            item["content"] = "".join(content).strip()
        else:
            img = Selector(response).xpath(
                "//div[@class='pagecenter p3']//img/@src").extract()
            item["img"] = img
            item["content"] = "".join(content).strip()

        item["title"] = title
        item["number"] = number
        item["url"] = response.url

        yield item
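
The .extract()[0] calls above raise IndexError whenever an XPath matches nothing. A minimal, self-contained sketch of the same extraction (the HTML is invented) using .get(), which returns None instead:

from scrapy.selector import Selector

html = '<div class="pagecenter p3"><strong>Some complaint 编号:12345</strong></div>'
question = Selector(text=html).xpath(
    '//div[@class="pagecenter p3"]//strong/text()').get()
if question is not None:
    title = question.strip().split(u'编号:')[0]
    number = question.strip().split(':')[-1]
    print(title, number)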
Code Example #2
 def parse_comment(self, response):
     try:
         worth = Selector(response=response).xpath('//span[@id="rating_worthy_num"]/text()').get()
         worthless = Selector(response=response).xpath('//span[@id="rating_unworthy_num"]/text()').get()
         price = Selector(response=response).xpath('//div[@class="price"]/span/text()').get()
         url = Selector(response=response).xpath('//a[@class="img-box"]/img[@class="main-img"]/@src').get()
         if not price:
             price = Selector(response=response).xpath('//div[@class="old-price-box"]/p/span[2]/text()').get()
         goods_item = response.meta['item']
         goods_item['visible_price'] = price
         goods_item['worth'] = worth
         goods_item['worthless'] = worthless
         goods_item['url'] = url
         yield goods_item  # yield the goods item
         comment_list = Selector(response=response).xpath('//ul[@class="comment_listBox"]/li[@class="comment_list"]')
         for comment_content in comment_list:
             item = CommentItem()
             item['goods_id'] = goods_item['goods_id']
             item['comment_id'] = comment_content.xpath('./@id').get().strip().split("_")[-1]
             display_time = comment_content.xpath(
                 './div[@class="comment_conBox"]/div[@class="comment_avatar_time "]/div[@class="time"]/text()').get().strip()
             item['time'] = convert_time(display_time)
             item['text'] = comment_content.xpath(
                 './div[@class="comment_conBox"]/div[@class="comment_conWrap"]/div[@class="comment_con"]/p/span/text()').get()
             if item['text']:
                 if item['text'] != " ":
                     yield item  # yield the comment item
         # check whether the comments have a next page; keep crawling if so
         next_page = Selector(response=response).xpath(
             '//*[@class="pagination"]/li[@class="pagedown"]/a/@href').get()
         if next_page:
             yield scrapy.Request(url=next_page.strip(), meta={'item': goods_item}, callback=self.parse_comment)
     except Exception as e:
         print(f"抓取评论页面时出现错误:{str(e)}")
Code Example #3
 def parse_ranking(self, response):
     try:
         # if there is a next page, keep crawling
         next_page = Selector(response=response).xpath('//li[@class="page-turn  next-page"]/a/@href').get()
         if next_page:
             yield scrapy.Request(url=next_page.strip(), callback=self.parse_ranking)
         # crawl each goods entry in the list
         goods_list = Selector(response=response).xpath('//ul[@class="feed-list-hits"]/li')
         for goods_content in goods_list:
             item = GoodsItem()
             goods_info = goods_content.xpath(
                 './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-r"]/div/div/a[@class="z-btn z-btn-red"]/@onclick').get().strip()
             # temporarily store the detail page in the url field
             item['url'] = goods_content.xpath(
                 './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-l"]/a[2]/@href').get().strip()
             display_time = goods_content.xpath(
                 './div/div[@class="z-feed-content "]/div[@class="z-feed-foot"]/div[@class="z-feed-foot-r"]/span[@class="feed-block-extras"]/text()').get().strip()
             item['time'] = convert_time(display_time)
             pattern = r".*dataLayer.push.*gtmAddToCart\((.*?)\)$"
             info_text = re.search(pattern, goods_info)
             if info_text:
                 goods_json = json.loads(info_text.group(1).replace("'", '"'))
                 item['name'] = goods_json["name"]
                 item['goods_id'] = goods_json['id']
                 item['brand'] = goods_json['brand']
                 item['category'] = goods_json['category']
                 item['price'] = goods_json['price']
                 yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse_comment)
     except Exception as e:
         print(f"抓取榜单页面时出现错误:{str(e)}")
Code Example #4
 def collectSubscribe(self, p_content, p_definition, p_seqno):
     s = Selector(text=p_content)
     selectors = p_definition["selectors"]
     rooturl = p_definition["addr"]
     limit = -1
     if "limit" in p_definition:
         limit = p_definition["limit"]
     data = []
     for sel in selectors:
         xpath = sel["xpath"]
         for idx, item in enumerate(s.xpath(xpath)):
             if limit >= 0 and idx >= limit:
                 break
             kv = {}
             kv["no"] = idx + 1
             string = item.xpath(".//text()").getall()
             strings = ""
             if string:
                 for text in string:  # not `s`, which would shadow the outer Selector
                     strings = strings + text.strip()
             kv["value"] = strings
             kv["hashcode"] = hash(strings)
             if item.xpath("@href"):
                 href = item.xpath("@href").get()
                 kv["href"] = geturl(rooturl, href)
             data.append(kv)
     return data
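
For reference, a hypothetical p_definition for the method above; the key names (addr, selectors, xpath, limit) come from the code, while the URL and XPath values are invented:

definition = {
    "addr": "https://example.com/news",  # root URL used to absolutize hrefs
    "limit": 5,                          # keep at most 5 matches per selector
    "selectors": [
        {"xpath": '//ul[@class="headlines"]/li/a'},
    ],
}
# data = collector.collectSubscribe(html_text, definition, seqno)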
Code Example #5
File: main.py Project: tdiffendal/WaterBill
 def parseWaterBill(self, response):
     #Check if we found the water bill if not then write to the failed CSV and return.
     if (len(
             response.xpath(
                 "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblCurrentBalance']"
             )) == 0):
         print("Couldn't find a water bill for account " +
               response.meta['account_or_address'])
         self.writeFailedCSV(response.meta['account_or_address'])
         return None
     #I use the item feature in scrapy to store the items.
     wateritem = WaterbillItem()
     wateritem['Searched_Address'] = response.meta[
         'search_type']  #This is a relic of when I searched by addresses.
     table = response.xpath('//table[@class="dataTable"]//tr')
     headers = [
         'Account Number', 'Service Address', 'Current Read Date',
         'Current Bill Date', 'Penalty Date', 'Current Bill Amount',
         'Previous Balance', 'Current Balance', 'Previous Read Date',
         'Last Pay Date', 'Last Pay Amount', 'TimeStamp'
     ]
     #I can't determine if this actually works because I can't find an address with a shut off notice.
     if (len(
             response.xpath(
                 "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']"
             )) != 0):
         wateritem['TurnOffDate'] = "Yes"
         #wateritem['TurnOffDate'] = Selector(text=row.extract()).xpath("//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']").extract_first()
     else:
         wateritem['TurnOffDate'] = 'No'
     for row in table:
         header = Selector(
             text=row.extract()).xpath('//th/text()').extract_first()
         value = Selector(text=row.extract()).xpath(
             '//td/descendant::*/text()').extract_first()
         if value is None:
             value = ''  # so it populates the Excel sheet with a blank spot
         if header is not None and header.strip().replace(':', "") in headers:
             value = value.replace('$', '').replace(",", '')
             if ("Date" in header and value != ''):
                 #Convert to SQL Datetime Format
                 value = datetime.strptime(value.strip(),
                                           '%m/%d/%Y').strftime('%Y-%m-%d')
             wateritem[header.strip().replace(':', "").replace(
                 ' ', '_')] = value.strip()
     wateritem['Timestamp'] = datetime.today().strftime('%Y-%m-%d')
     return wateritem
Code Example #6
File: spider.py Project: shchedrovmaxim/Parser
def getText(response, xpaz):
    """Read the text node at the given XPath and return it stripped."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        selector = ''
    return selector
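
A usage sketch: getText expects a Scrapy response, which a standalone test can fake with scrapy.http.HtmlResponse (URL and markup invented):

from scrapy.http import HtmlResponse

page = HtmlResponse(url='https://example.com',
                    body=b'<h1>  Hello  </h1>', encoding='utf-8')
print(getText(page, '//h1/text()'))  # -> 'Hello'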
Code Example #7
File: spider.py Project: shchedrovmaxim/Parser
def getIntObj(response, xpaz):
    """Read the floor number or the number of rooms as a float."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        return ''
    selector = float(selector[:3].strip())
    return selector
Code Example #8
  def obtain_rate(self, date_str):
    html_str = self._fetch_from_site(date_str)

    ratePath = Selector(text=html_str).xpath("/html/body/table/tr[2]/td[1]/table/tr[2]/td[4]/table/tr/td/text()").get()

    if ratePath is None:
      return 'n/a'

    return ratePath.strip()
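
Note that parsel's SelectorList.get() accepts a default value, which would fold the None check above into the extraction itself; a sketch with invented table markup:

from scrapy.selector import Selector

html_str = '<html><body><table><tr><td> 1.2345 </td></tr></table></body></html>'
rate = Selector(text=html_str).xpath('//table//td/text()').get(default='n/a')
print(rate.strip())  # '1.2345'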
Code Example #9
    def parse(self, response):
        rows = response.xpath(
            '//*[@id="mainlayout"]/div[2]/div[3]/div[2]/div').extract(
            )  # inspect html and you will see our elements are all <div>

        for row in rows[1:]:  # rows[0] are only the columns names
            Dat = Selector(
                text=row).xpath('//div[1]/text()').extract()[1].encode(
                    'utf-8', 'ignore')
            Dat = Dat.strip()[3:5] + '-' + Dat.strip()[0:2] + '-' + Dat.strip(
            )[6:8]
            Date = datetime.strptime(Dat, '%d-%m-%y').date()

            # the next line below doesn't work because this xpath address contains ctl01 that changes for every row:
            # Symbol = Selector(text=row).xpath('//*[@id="ctl00_CPI_rptAnnouncement_ctl01_dvSymItem"]/a/text()').extract()[0].encode('utf-8','ignore')
            # solution: use xpath "contains" to leave out reference numbers:
            Symbol = Selector(text=row).xpath(
                '//*[contains(@id,"dvSymItem")]/a/text()').extract()[0].encode(
                    'utf-8', 'ignore')
            # company has the same syntax problem so we use "contains" again:
            Company = Selector(
                text=row).xpath('//*[contains(@id,"dvCompItem")]/a/text()'
                                ).extract()[0].encode('utf-8',
                                                      'ignore').strip()
            CurrR = Selector(
                text=row).xpath('//div[4]/text()').extract()[0].encode(
                    'utf-8', 'ignore')
            NewR = Selector(
                text=row).xpath('//div[5]/text()').extract()[0].encode(
                    'utf-8', 'ignore')
            Period = Selector(
                text=row).xpath('//div[6]/text()').extract()[0].encode(
                    'utf-8', 'ignore')

            item = PwItem()

            item['Date'] = Date
            item['Symbol'] = Symbol
            item['Company'] = Company
            item['CurrR'] = CurrR
            item['NewR'] = NewR
            item['Period'] = Period

            yield item
Code Example #10
 def parse_basic_info(self, response):
     basic_info = response.xpath(
         '//div[@class="zong"]//div[@class="wai"]').extract()
     id = 2
     for info in basic_info:
         type = Selector(
             text=info).xpath('//div[@class="zi"]/text()').extract()[0]
         type = type.strip()
         name_info = Selector(text=info).xpath('//div[@id="b ' + str(id) +
                                               '"]/ul/li/ul/li').extract()
         id = id + 1
         for fake_name in name_info:
             name = None
             url = None
             has_a = Selector(text=fake_name).xpath('//li/a').extract()
             if len(has_a) == 0:
                 name = Selector(
                     text=fake_name).xpath('//li/text()').extract()[0]
                 url = None
             else:
                 name = Selector(
                     text=fake_name).xpath('//li/a/text()').extract()[0]
                 url = Selector(
                     text=fake_name).xpath('//li/a/@href').extract()[0]
             if name is not None:
                 name = name.strip()
             if url is not None:
                 url = url.strip()
                 if len(url) <= 7:
                     url = None
             item = SpiderLoaderItem(item=BankListItem(), response=response)
             item.add_value('type', type)
             item.add_value('name', name)
             item.add_value('url', url)
             item.add_value('longitude', '')
             item.add_value('latitude', '')
             item.add_value('address', '')
             item.add_value('tel', '')
             item.add_value('workday', '')
             item.add_value('table_name', 'CBRCBANK_BANK_LIST')
             yield item.load_item()
Code Example #11
    def parse(self, response):

        productList = Selector(text=response.body).xpath(
            '//li[contains(@class, "gl-item")]').extract()

        # $object = UPLOAD_PATH.$new_path.md5(time().mt_rand(100, 999999999)).
        # '.'.pathinfo($file->getInfo('name'), PATHINFO_EXTENSION);
        # $new_path = 'goods'.date('Y').'/'.date('m-d').'/';

        Class = Selector(text=response.body).xpath(
            '//div[contains(@class, "p-name p-name-type-2")]//em[not(i)]'
        ).extract()
        print(Class)

        for item in productList:
            if self.num > self.getNum:
                break
            name = Selector(text=item).xpath(
                '//div[contains(@class, "p-name")]/a/em').extract()[0]
            name = filterStr.filter_tags(name)
            skuid = Selector(text=item).xpath('//li/@data-sku').extract()[0]
            price = Selector(text=item).xpath(
                '//div[contains(@class, "p-price")]/strong/i').extract()[0]
            price = filterStr.filter_tags(price)
            imgsrc = Selector(text=item).xpath(
                '//li[contains(@class, "gl-item")]//img/@src').extract()[0]
            imgsrc = imgsrc.replace('//', '')

            # strip JD-supermarket boilerplate ("京东超市" etc.) from the name,
            # e.g. '京东超市金龙鱼 食用油 葵花籽清香型 食用植物调和油5L(新老包装随机发货)'
            name = name.replace("京东超市", "")
            name = name.replace("(京东定制)", "")
            name = name.replace("(京东定制装)", "")
            name = name.replace("京东自营", "")
            name = name.replace("(新老包装随机发货)", "")
            name = name.replace("新旧包装随机配送", "")
            name = name.replace("新老包装随机发放", "")
            name = name.replace("(新老包装随机发放,数量有限,赠完为止)", "")
            name = name.replace("中粮出品", "")
            name = name.replace("(中粮出品)", "")
            if "【沃尔玛】" in name:
                continue
            name = name.replace("【沃尔玛】", "")
            self.item['name'] = name.strip()
            self.item['price'] = price
            self.item['skuid'] = skuid
            # self.item['Class'] = Class
            self.item['imgsrc'] = imgsrc
            self.item['sourceType'] = SOURCE_TYPE_JD
            self.item['goods_id'] = self.insertGoods(self.item)
            self.num = self.num + 1

            yield self.item
Code Example #12
    def parse(self, response):
        # get all the listing blocks
        listings = response.xpath('//a[@class="col-xs-12 profitem"]').getall()

        # within each listing block get the details
        for i in listings:
            # there is more than 1 heading or suburb, just get the first one
            suburb = Selector(text=i).xpath(
                '//h4[@class="mat-header"]/text()').get().strip()
            # new or updated listing
            status = Selector(text=i).xpath(
                '//span[@class="mat-text-span text-uppercase mat-new hidden-xs"]/text()'
            ).get()

            # price
            price = Selector(
                text=i).xpath('//h4[@class="mat-header mat-price"]').get()
            # some regex to extract the price
            loc = re.search("</sup>", price)
            price = price[loc.span()[1]:]
            price = price.replace('<sup>', '')
            price = price.replace('</sup>', '')
            price = price.replace('</h4>', '')
            price = re.sub('\xa0', ' ', price)
            price = price.strip()

            # get all feature details in a list
            details = Selector(text=i).xpath(
                '//ul[@class="mat-feture"]/li/div[@class="mat-fetaure-avl"]/text()'
            ).getall()
            # listing details
            home_type = details[0].strip()
            available = details[1].strip()
            occupants = details[2].strip()

            # get description
            desc = Selector(text=i).xpath(
                '//div[@class="col-sm-4 col-md-6 hidden-xs hidden-sm mathes-list"]/p/text()'
            ).get().strip()
            desc = desc.replace('\r', '')
            desc = desc.replace('\n', '')

            listing = {
                'suburb': suburb,
                'status': status,
                'price': price,
                'home_type': home_type,
                'available': available,
                'occupants': occupants,
                'description': desc,
            }
            yield listing
Code Example #13
File: spider.py Project: shchedrovmaxim/Parser
def getObj(response, xpaz):
    """Read the price as an integer, dropping spaces and '$' signs."""
    selector = Selector(response=response).xpath(xpaz).get()
    if isinstance(selector, str):
        selector = selector.strip()
    else:
        return ''
    price = ''
    for char in selector:
        if char != ' ' and char != '$':
            price += char
    return int(price)
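
A standalone check of the digit-filtering logic above (response body and XPath invented):

from scrapy.http import HtmlResponse

page = HtmlResponse(url='https://example.com',
                    body=b'<span class="price">$1 250</span>', encoding='utf-8')
print(getObj(page, '//span[@class="price"]/text()'))  # -> 1250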
Code Example #14
    def parse_comments(self, response):

        item = CommentsItem()
        item['id'] = response.meta['id']
        item['flag'] = response.meta['flag']
        item['author'] = []
        item['author_comment'] = []
        item['time'] = []

        text = response.text
        restojson = json.loads(text)
        html = restojson['data']['html']
        html = html.split('\\n')
        html = ''.join(html)
        author_comments = Selector(
            text=html).xpath('//*[@class="WB_text"]').extract()

        for author_comment in author_comments:

            item['author'].append(
                Selector(text=author_comment).xpath('//a/text()').extract()[0])
            remove_author = Selector(
                text=author_comment).xpath('//a/text()').extract()[0]

            author_comment = dealcontent(author_comment)
            comment = Selector(text=author_comment).xpath('//text()').extract()
            comment.remove(remove_author)
            comment = ''.join(comment)

            while re.match(r'^ ', comment):
                comment = comment.strip(' ')

            item['author_comment'].append(comment)

        if item['flag'] == 'forwarded':
            item['time'] = Selector(text=html).xpath(
                '//*[@class="WB_from S_txt2"]/a/@title').extract()
        if item['flag'] == 'comment':
            item['time'] = Selector(text=html).xpath(
                '//*[@class="WB_from S_txt2"]/text()').extract()

        item['like_count'] = Selector(text=html).xpath(
            '//span[@node-type="like_status"]/em[2]/text()').extract()
        lens = len(item['like_count'])

        for i in range(0, lens):
            item['like_count'][i] = item['like_count'][i].replace('赞', '0')

        yield copy.deepcopy(item)
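
The endpoint evidently returns JSON whose data.html field carries the comment markup; a made-up payload illustrating the shape parse_comments expects:

payload = {
    "data": {
        "html": '<div class="WB_text"><a>some_user</a> nice post</div>'
    }
}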
Code Example #15
File: nydoc.py Project: pozernishku/legis_rem
    def parse(self, response):
        folders = response.xpath('//div[@class="AccordionPanel"]').extract()
        for folder in folders:
            year = Selector(text=folder).xpath('//div[@class="AccordionPanelTab"]/text()').extract_first()
            os.makedirs('./newyork/' + year, exist_ok=True)

            rows = Selector(text=folder).xpath('//table[@class="listingTable"]/tbody/tr').extract()
            for row in rows:
                name = Selector(text=row).xpath('//td[@headers="Name"]/text()').extract_first()
                name = name.strip() if name is not None else ''
                length = len(name)
                name = name if length < 235 else name[:235]
                date = Selector(text=row).xpath('//td[contains(@headers, "Date")]/text()').extract_first()
                href = Selector(text=row).xpath('//td/a[contains(text(), "Transcript") or contains(text(), "Transcript and Testimony")]/@href').extract_first()
                t_type = Selector(text=row).xpath('//td/a[contains(text(), "Transcript") or contains(text(), "Transcript and Testimony")]/text()').extract_first()
                if href is not None and 'Transcript' in t_type:
                    yield Request(href, callback=self.parsetranscript, meta={'year': year, 'name': name, 'date': date, 'download_timeout': 3500}, dont_filter=True)
                else:
                    continue
Code Example #16
def Company_Info(link):
    print("Trying to get Identification Number")
    url = link
    page = requests.get(url)

    vat = Selector(response=page).xpath('/html/body/div[4]/div/div[2]/div/div[1]/div[1]/div[contains(., "Identification")]/span[2]/text()').get()
    if vat is not None:
        vat = vat.strip()
    data = {
        "vat" : vat
    }

    # print(data)
    return data

# Company_Info("https://www.yell.ge/company.php?lan=eng&id=139568")
# print(main)
Code Example #17
File: spider.py Project: shchedrovmaxim/Parser
def getSpace(response, xpaz):
    """Return the (total, living, kitchen) floor areas of an apartment."""
    spaces = Selector(response=response).xpath(xpaz).get()
    if isinstance(spaces, str):
        spaces = spaces.strip()
    else:
        return ''
    total_split = [i for i in re.split(r'(\d+.\d+|\W+)', spaces) if i]

    total_space = 0
    live_space = 0
    kitchen_space = 0

    sqear = [None] * 3
    index = 0

    for string in total_split:
        if string.replace('.', '', 1).isdigit():
            sqear[index] = string
            index += 1

    if sqear[2] is None:
        if sqear[0]:
            total_space = float(sqear[0])
        else:
            total_space = 0
        if sqear[1]:
            kitchen_space = float(sqear[1])
        else:
            live_space = 0

    else:
        total_space = float(sqear[0])
        live_space = float(sqear[1])
        kitchen_space = float(sqear[2])

    return total_space, live_space, kitchen_space
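
A standalone check of getSpace with an invented area string of the form "total / living / kitchen":

from scrapy.http import HtmlResponse

page = HtmlResponse(url='https://example.com',
                    body=b'<div id="sq">75.5 / 40.2 / 12.0</div>',
                    encoding='utf-8')
print(getSpace(page, '//div[@id="sq"]/text()'))  # -> (75.5, 40.2, 12.0)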
Code Example #18
            position = ""

        # Company
        try:
            company = Selector(response=page).xpath(
                f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[1]/a/text()'
            ).get()
        except:
            company = ""

        # Published
        try:
            published = Selector(response=page).xpath(
                f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[1]/text()'
            ).get()
            published = published.strip().split(",")
            publish_year = int(published[1].strip())
            publish_day = int(published[0].split(" ")[1])
            publish_month = int(months[published[0].split(" ")[0]])
        except:
            publish_year = 0
            publish_day = 0
            publish_month = 0
        if yesterday_day != publish_day or yesterday_month != publish_month:
            print("Not published yesterday")
            continue

        # Ends
        try:
            ends = Selector(response=page).xpath(
                f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[2]/text()'
Code Example #19
candidates = []
for county, tid in ref.items():
    print(county)
    r = requests.get('https://kmt2018.com/candidate_json.asp?tid=%s&cid=2' % tid)
    if r.status_code == 500:
        continue
    r.encoding = 'utf-8'
    cs = r.json()
    cs = [x for x in cs if x['name'] != u'陸續更新中']
    for candidate in cs:
        print(candidate['name'])
        rd = requests.get('https://kmt2018.com/read_candidate.asp?ids=%s' % candidate['uid'])
        rd.encoding = 'utf-8'
        x = Selector(text=rd.text, type='html')
        for desc in x.css('.desc .title'):
            content = '\n'.join([x.strip() for x in desc.xpath('following-sibling::div[1]//text()').extract() if x.strip()])
            if desc.xpath('text()').extract_first() == u'競選口號':
                candidate['slogan'] = content
            elif desc.xpath('text()').extract_first() == u'經歷':
                candidate['experience'] = content
            elif desc.xpath('text()').extract_first() == u'學歷':
                candidate['education'] = content
        candidate['name'] = re.sub(r'\s', '', candidate['name'])
        candidate['county'] = county
        candidate['constituency'] = normalize_constituency(candidate['desc'])
        img_link = candidate['picture']
        f_name = '%s_%d_%s.%s' % (candidate['county'], candidate['constituency'], candidate['name'], img_link.split('.')[-1].split('?')[0])
        f = '%s/%s' % (path, f_name)
        cmd = 'wget -N --no-check-certificate "%s" -O %s' % (img_link, f)
        subprocess.call(cmd, shell=True)
        candidate['image'] = u'%s/%s/%s/%s/%s' % (common.storage_domain(), 'councilors', '2018', u'中國國民黨', f_name)
Code Example #20
File: vacancy.py Project: Caravan2/scripts
def Vacancy(link):
    print("request sent for Vacancy succesfully")
    url = link
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  #headers=headers)

    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/table/tr[contains(., "Location:")]').get()
        location = location.split("<td>")[1].split("</td>")[0].replace(
            "&amp;nbsp", " ")
        location = location.split(",")[0]
        location = [{'city': location, 'id': Geonames(location)}]
    except:
        location = [{'city': 'Yerevan', 'id': '616052'}]

    # Company url
    try:
        c_url = Selector(response=page).xpath(
            '/html/body/div[2]/table/tr[contains(., "Company:")]').get()
        c_url = c_url.split('href="')[1].split('">')[0]
    except:
        c_url = ""

    # Vacancy Description
    try:
        description = Selector(response=page).xpath('/html/body/div[4]').get()
        description = remove_tags(description)
        description = description.strip()
        description = description.replace('&amp;nbsp', " ")
    except:
        description = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        description_en = ""
        description_am = ""

    # Email
    try:
        email = Selector(response=page).xpath('//*[@id="job"]/a/@href').get()
        email = email.replace('mailto:', "")
        email = [email]
    except:
        email = []

    data = {
        "location": location,
        "c_link": c_url,
        "description_am": description_am,
        "description_en": description_en,
        "email": email
    }

    # print(data)
    return data
Code Example #21
        deadline_year = 0

    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
    except Exception as e:
        email = []

    # Publication stuff
    v_page = requests.get(v_link)

    try:
        published = Selector(response=v_page).xpath(
            '//*[@id="ContentplaceholderMain_T7553F19B005_Col00"]/div[2]/div[2]/div[1]/div[1]/text()'
        ).get()
        published = published.strip()
        published = published.split(" ")
        publish_day = published[1].replace(",", "")
        publish_day = int(publish_day)
        publish_month = int(months[f"{published[0]}"])
        publish_year = int(published[2])
    except:
        published = 0
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if publish_day != yesterday_day:
        print("Not published Yesterday")
        continue

    data = {
        "company": company,
Code Example #22
File: habit_burger.py Project: zanachka/alltheplaces
    def parse(self, response):
        regions = response.xpath('//h2').xpath('@id').extract()
        non_us_cities = ['phnom penh', 'shanghai', 'hangzhou']

        for region in regions:
            region_body = response.xpath('//h2[@id = "{region}"]/following-sibling::ul[@class = "reglist"]'.format(region=region)).get()
            pois = Selector(text=region_body).css('.loc').extract()

            for poi in pois:
                ref = Selector(text=poi).xpath('//a/@href').extract()[0].split('/')[-1]
                name = Selector(text=poi).xpath('//h4/text()').extract()
                if name == []:
                    name = Selector(text=poi).xpath('//h3/text()').extract()
                name = ''.join(name)

                map_link = Selector(text=poi).xpath('//div[@class = "locaddress"]/a').xpath('@href').extract_first()
                lat, long = None, None
                if 'daddr' in map_link:
                    coords = map_link.split('daddr=')[1].split(',')
                    lat = coords[0]
                    long = coords[1]

                addr = Selector(text=poi).xpath('//div[@class = "locaddress"]/a').extract_first()
                addr = Selector(text=addr).xpath('//a/text()').extract()
                addr = [a.strip() for a in addr]

                addr_full = ', '.join(addr)
                street = ', '.join(addr[:-1])
                city, state, postcode = None, None, None

                if region in ['cambodia', 'china']:
                    for c in non_us_cities:
                        if c in poi.lower():
                            city = c.capitalize()
                    country = region.capitalize()
                else:
                    city = addr[-1].split(', ')[0]
                    state_postcode = addr[-1].split(', ')[1].split(' ')
                    if len(state_postcode) > 1:
                        state = state_postcode[0]
                        postcode = state_postcode[1]
                    country = "US"

                phone = Selector(text=poi).xpath('//div[@class="locinfo"]/text()').get()
                phone = phone.strip() if phone else None
                opening_hours = Selector(text=poi).xpath('//div[@class="lochrs"]/text()').extract()
                opening_hours = ', '.join([hours.strip() for hours in opening_hours]) if opening_hours else None

                properties = {
                    'ref': ref,
                    'website': 'https://www.habitburger.com/locations/' + ref,
                    'name': name,
                    'addr_full': addr_full,
                    'street': street,
                    'city': city,
                    'state': state,
                    'postcode': postcode,
                    'country': country,
                    'phone': phone,
                    'opening_hours': opening_hours,
                    'lat': lat,
                    'lon': long,
                }

                yield GeojsonPointItem(**properties)
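
A standalone check of the daddr coordinate parsing above (map link invented):

map_link = 'https://maps.google.com/?daddr=34.0522,-118.2437'
lat, long = None, None
if 'daddr' in map_link:
    lat, long = map_link.split('daddr=')[1].split(',')
print(lat, long)  # 34.0522 -118.2437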
Code Example #23
    def parse(self, response):
        NATS_SERVER_HOME = os.environ['NATS_SERVER_HOME']
        WEATHER_DIR_PATH = NATS_SERVER_HOME + "/share/tg/weather"

        page = response.url.split("/")[-2]

        htmlDoc = response.body

        # We check the URL and distinguish the work flow
        # Different URL page contains different info.  The page formats are different.  We have to handle them individually.

        # If URL contains "sigmet"
        if ('sigmet' in response.url):
            filename_SIGMET = time.strftime('%Y%m%d_%H%M%S',
                                            time.localtime()) + '.sigmet'

            file_SIGMET = open(WEATHER_DIR_PATH + "/" + filename_SIGMET, 'w')

            title = Selector(text=htmlDoc).xpath(
                '//div[@id="awc_main_content"]/div[@id="title"]/text()'
            ).extract_first()
            if (len(title.strip()) > 0):
                file_SIGMET.write(title + "\n\n")

            array_children = Selector(
                text=htmlDoc).xpath('//div[@id="awc_main_content"]/*')
            for i in range(0, len(array_children)):
                node_layer_1 = array_children[i]
                if ('p' == node_layer_1.xpath('name()').extract()[0]):
                    node_layer_2 = node_layer_1.xpath('//b')
                    if not (node_layer_2 is None):
                        file_SIGMET.write(
                            node_layer_2.xpath('text()').extract()[0] + '\n')
                elif ('b' == node_layer_1.xpath('name()').extract()[0]):
                    file_SIGMET.write(
                        node_layer_1.xpath('text()').extract()[0] + '\n')
                elif ('pre' == node_layer_1.xpath('name()').extract()[0]):
                    file_SIGMET.write(
                        node_layer_1.xpath('text()').extract()[0] + '\n\n')

            file_SIGMET.close()
        elif ('metar' in response.url):  # If URL contains "metar"
            filename_METAR = time.strftime('%Y%m%d_%H%M%S',
                                           time.localtime()) + '.metar'

            file_METAR = open(WEATHER_DIR_PATH + "/" + filename_METAR, 'w')

            title = Selector(text=htmlDoc).xpath(
                '//div[@id="awc_main_content"]/div[@id="title"]/text()'
            ).extract_first()
            if (len(title.strip()) > 0):
                file_METAR.write(title + "\n\n")

            array_children = Selector(
                text=htmlDoc).xpath('//div[@id="awc_main_content"]/*')
            for i in range(0, len(array_children)):
                node_layer_1 = array_children[i]

                if ('p' == node_layer_1.xpath('name()').extract()[0]):
                    node_layer_2 = node_layer_1.xpath('//b')
                    if not (node_layer_2 is None):
                        file_METAR.write(
                            node_layer_2.xpath('text()').extract()[0] + '\n')
                elif ('b' == node_layer_1.xpath('name()').extract()[0]):
                    file_METAR.write(
                        node_layer_1.xpath('text()').extract()[0] + '\n')

            array_data = Selector(
                text=htmlDoc).xpath('//div[@id="awc_main_content"]/text()')
            for i in range(0, len(array_data)):
                if (len(array_data[i].extract().strip()) > 0):
                    file_METAR.write("\n" + array_data[i].extract())

            file_METAR.close()
        elif ('airep' in response.url):  # If URL contains "airep"
            filename_AIREP = time.strftime('%Y%m%d_%H%M%S',
                                           time.localtime()) + '.airep'

            file_AIREP = open(WEATHER_DIR_PATH + "/" + filename_AIREP, 'w')

            title = Selector(text=htmlDoc).xpath(
                '//div[@id="awc_main_content"]/div[@id="title"]/text()'
            ).extract_first()
            if (len(title.strip()) > 0):
                file_AIREP.write(title + "\n\n")

            array_children = Selector(
                text=htmlDoc).xpath('//div[@id="awc_main_content"]/div/*')
            for i in range(0, len(array_children)):
                node_layer_1 = array_children[i]

                if ('p' == node_layer_1.xpath('name()').extract()[0]):
                    node_layer_2 = node_layer_1.xpath('//b')
                    if not (node_layer_2 is None):
                        file_AIREP.write(
                            node_layer_2.xpath('text()').extract()[0] + '\n\n')
                elif ('code' == node_layer_1.xpath('name()').extract()[0]):
                    file_AIREP.write(
                        node_layer_1.xpath('text()').extract()[0] + '\n')

            file_AIREP.close()
Code Example #24
File: vacancy.py Project: Caravan2/scripts
def Vacancy(link):
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  #headers=headers)

    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[3]/a/text()').get(
            )
        location = location.strip()
        location = location.split(",")[0]
        location = [{"city": location, "id": Geonames(location)}]
    except:
        location = [{"city": "Yerevan", "id": "616052"}]

    # Website
    try:
        website = Selector(response=page).xpath(
            '/html/body/main/section/div/div[1]/div[3]/ul/li[4]/a/@href').get(
            )
        if website is None:
            website = []
        else:
            website = [website]
    except:
        website = []

    # Job Type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[3]/text()').get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[7]/text()').get()
        published = published.strip()
    except:
        published = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[2]/text()').get()
        salary = salary.strip()
        salary = salary.replace("֏", "")
        salary = salary.replace(",", "")
        salary = salary.replace(" ", "")
        salary = int(salary)
    except:
        salary = 0

    # Gender
    try:
        gender = Selector(response=page).xpath(
            '/html/body/main/section/div/div[2]/div/ul/li[4]/text()[2]').get()
        gender = gender.strip()
    except:
        gender = ""

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/main/section/div/div[2]/div/p').get()
        description = remove_tags(description).strip()
    except:
        description = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        description_en = ""
        description_am = ""

    # Email
    try:
        driver.get(link)
        email = driver.find_element_by_xpath(
            '/html/body/main/section/div/div[2]/div/p').text
        email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
    except Exception as e:
        email = []

    data = {
        "location": location,
        "website": website,
        "job_type": job_type,
        "publish_day": published,
        "salary": salary,
        "gender": gender,
        "description_am": description_am,
        "description_en": description_en,
        "email": email
    }

    # print(data)
    return data


# Vacancy("https://www.worknet.am/en/job/%D5%A2%D5%A1%D5%B6%D5%BE%D5%B8%D6%80-%D5%BA%D5%A1%D5%B0%D5%A5%D5%BD%D5%BF%D5%AB-%D5%A1%D5%B7%D5%AD%D5%A1%D5%BF%D5%A1%D5%AF%D5%AB%D6%81-4656")
Code Example #25
File: loan.py Project: wliustc/SpiderS
 def parse_detail(self, response):
     loan_name = response.xpath('//h1/text()').extract()
     if not loan_name:
         loan_name = ""
     else:
         loan_name = loan_name[0].strip().replace(" ", "")
     mortgage_info = response.xpath(
         '//span[@class="item doc-color-red"]/span/text()').extract()
     if not mortgage_info:
         mortgage_info = ""
     else:
         mortgage_info = mortgage_info[0].strip().replace(" ", "")
     identity_limit = response.xpath(
         '//span[@class="spec can-reg"]/text()').extract()
     if not identity_limit:
         identity_limit = ""
     else:
         identity_limit = identity_limit[0].strip().replace(" ", "")
     lending_time_info = response.xpath(
         '//span[@class="spec fangkuan"]/text()').extract()
     if not lending_time_info:
         lending_time_info = ""
     else:
         lending_time_info = lending_time_info[0].strip()
     prepayment_requirement = response.xpath(
         '//span[@class="doc-color-tail"]/*/@hover-tip').extract()
     if not prepayment_requirement:
         prepayment_requirement = response.xpath(
             '//span[@class="doc-color-tail"]/span/text()').extract()
         if prepayment_requirement:
             prepayment_requirement = prepayment_requirement[0].strip()
         else:
             prepayment_requirement = ""
     else:
         prepayment_requirement = Selector(
             text=prepayment_requirement[0]).xpath(
                 "//span/text()").extract()[0]
         prepayment_requirement = prepayment_requirement.strip()
     extra_info = response.xpath(
         '//meta[@name="description"]/@content').extract()
     if extra_info:
         extra_info = extra_info[0].strip()
     else:
         extra_info = ""
     detail = response.xpath(
         '//div[@class="pd_other_item_content"]/text()').extract()
     item = response.meta['item']
     item['loan_name'] = loan_name
     item['mortgage_info'] = mortgage_info
     item['identity_limit_info'] = identity_limit
     item['lending_time_info'] = lending_time_info
     item['extra_info'] = extra_info
     item['prepayment_requirement'] = prepayment_requirement
     requirement_detail = ""
     if detail:
         for dl in detail:
             requirement_detail += dl.strip()
     item['requirement_detail'] = requirement_detail
     tmp_cookie = _cookie
     tmp_cookie['cityDomain'] = item['city']
     tmp_cookie['my_city'] = item['city']
     referer = "http://www.rong360.com/p_" + item['loan_id']
     tmp_header = _headers
     tmp_header['Referer'] = referer
     if item['loan_type'] == self.household_loan:
         for loan_amt in range(self.loan_amt_min, self.loan_amt_max + 1,
                               self.loan_amt_gap):
             for loan_duration in range(self.loan_duration_min,
                                        self.loan_duration_max + 1,
                                        self.loan_duration_gap):
                 tmp_form_data = _form_d
                 tmp_form_data['loan_limit'] = str(loan_amt)
                 tmp_form_data['loan_term'] = str(loan_duration)
                 yield scrapy.FormRequest(
                     self.interest_url,
                     formdata=tmp_form_data,
                     cookies=tmp_cookie,
                     headers=tmp_header,
                     method=_method,
                     meta={'item': item},
                     callback=self.parse_interest,
                     dont_filter=True,
                 )
     elif item['loan_type'] == self.zero_payment_loan:
         tmp_form_data = _form_d
         tmp_form_data['loan_limit'] = str(item['loan_amt'])
         tmp_form_data['loan_term'] = str(item['loan_duration'])
         yield scrapy.FormRequest(
             self.interest_url,
             formdata=tmp_form_data,
             cookies=tmp_cookie,
             headers=tmp_header,
             method=_method,
             meta={'item': item},
             callback=self.parse_interest,
             dont_filter=True,
         )
     else:
         pass
     pass
Code Example #26
File: massCollect.py Project: xlybaby/var-daemon
async def get_links_from_url(definition):
    """Download the page at `url` and parse it for links.
    Returned links have had the fragment after `#` removed, and have been made
    absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    url = definition["addr"]
    selectors = definition["selectors"]
    level = definition["level"]
    parent = ""
    if "parent" in definition:
        parent = definition["parent"]
    contents = []
    urls = []
    #response = await httpclient.AsyncHTTPClient().fetch(url)
    try:
        response = await http_client.fetch(url,
                                           method='GET',
                                           headers=http_header,
                                           validate_cert=False)
    except Exception as e:
        print("Error: %s" % e)
    else:
        print("fetched %s" % url)
        #print(response.body)

        #html = response.body.decode(errors="ignore")
        s = Selector(text=response.body)
        print(s)
        for selector in selectors:
            xpath = selector["xpath"]
            wrap = definition["wrap"]
            kv = {}
            if wrap == 1:
                strings = ""
                kv["url"] = url
                kv["no"] = 1
                for text in s.xpath(xpath).xpath(".//text()").getall():
                    strings = strings + text.strip()
                kv["value"] = strings
                kv["hashcode"] = hash(strings)
                contents.append(kv)
            else:
                #content.append(s.xpath(xpath).xpath(".//text()").getall())
                for idx, item in enumerate(s.xpath(xpath)):
                    #contents.append({"parent":parent, "rownum":level+"-"+str(idx), "data": {"value":item.xpath(".//text()").getall()}})
                    kv = {}
                    kv["url"] = url
                    kv["no"] = idx + 1
                    string = item.xpath(".//text()").getall()
                    if string:
                        strings = ""
                        for s in string:
                            strings = strings + s.strip()
                        kv["value"] = strings
                        kv["hashcode"] = hash(strings)
                    else:
                        kv["value"] = ""
                        kv["hashcode"] = ""
                    if selector["extract"] == 1:
                        href = item.xpath("@href")
                        if href:
                            kv["href"] = geturl(url, href.get())
                            #urls.append({"addr":href.get(),"parent":level+"-"+str(idx)})
                            urls.append(kv["href"])
                    contents.append(kv)
        #return [urljoin(url, remove_fragment(new_url)) for new_url in get_links(html)]
        return contents, urls
Code Example #27
File: app.py Project: Caravan2/scripts
        try:
            name = Selector(response=page).xpath(
                '//*[@id="person-profile"]/div/h2/text()').get()
        except:
            name = ""

        if name is None or name == "":
            f = open("check.txt", "a")
            f.write(f"id: {i} was Not captured\n")
            continue

        # ID
        try:
            _id = Selector(response=page).xpath(
                '//*[@id="person-attributes"]/tbody/tr/td[2]/text()').get()
            _id = _id.strip()
        except:
            _id = ""

        whatsthere = people.find_one({"identification_number": _id})

        if whatsthere is None:

            # Listed Affiliations
            affiliations = []
            num_1 = len(
                Selector(response=page).xpath(
                    '//*[@id="affiliations-list"]/tbody/tr').getall()) + 1
            for tr in range(1, num_1):
                # Company
                try:
Code Example #28
File: vacancy.py Project: Caravan2/scripts
def Vacancy(link):
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  #headers=headers)

    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]'
        ).get()
        published = published.strip().split(" ")
        publish_day = int(published[0].split("/")[0])
        publish_month = int(published[0].split("/")[1])
        publish_year = int(published[0].split("/")[2])
    except Exception as e:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return

    # Location #
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()'
        ).get()
        location = location.strip()
        location_id = []
        location = {"city": f"{location}", "id": f"{Geonames(location)}"}
        location_id.append(location)
    except:
        location_id = [{'city': 'Yerevan', 'id': '616052'}]

    # Posted by
    try:
        posted_by = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()'
        ).get()
        posted_by = posted_by.strip()
    except:
        posted_by = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()'
        ).get()
        email = email.strip()
        if email == "":
            email = []
        else:
            email = [email]
    except:
        email = []

    # Workspace
    try:
        workspace = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()'
        ).get()
        workspace = workspace.strip()
    except:
        workspace = ""

    # Job_type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()'
        ).get()
        salary = salary.strip().replace("Until ", "")
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip())
            max_salary = int(salary[1].strip())
        elif "-" not in salary and salary != '':
            min_salary = int(salary)
            max_salary = int(salary)
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0

    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()'
        ).get()
        education = education.strip()
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()'
        ).get()
        experience = experience.strip()
    except:
        experience = ""

    # Gender
    try:
        gender = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class'
        ).get()
        if "female" in gender:
            gender = "female"
        elif "male" in gender:
            gender = "male"
        else:
            gender = ''
    except:
        gender = ""

    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()'
        ).get()
        age = age.strip()
    except:
        age = ""

    print(1)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()'
        ).get()
        description = description.strip()
    except:
        description = ""
    description_en = ""
    description_am = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        description_en = ""
        description_am = ""

    # Phone
    try:
        phone = Selector(response=page).css(
            '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details'
        ).extract()
        phones = []
        for phone in phone:
            phone = remove_tags(phone).strip()
            area_code = "374"
            number = phone.replace(" ", "")
            number = number.replace("-", "")
            number = number.replace("(", "")
            number = number.replace(")", "")
            phones.append({'country_code': area_code, "number": number})
    except:
        phones = []

    # Username
    try:
        username = Selector(response=page).xpath(
            '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()'
        ).get()
        username = username.strip()
    except:
        username = ""

    data = {
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "location_id": location_id,
        "posted_by": posted_by,
        "email": email,
        "workspace": workspace,
        "job_type": job_type,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "education": education,
        "experience": experience,
        "gender": gender,
        "age": age,
        "description_am": description_am,
        "description_en": description_en,
        "phone": phones,
        "username": username
    }

    print(data)
    return data


# Vacancy("https://full.am/en/job/public/view/1163")

# https://full.am/en/job/public/view/12067
# https://full.am/en/job/public/view/1163
Code Example #29
    def parse(self, response):
        item = {}

        # get company name:
        comp_name = Selector(response).xpath(
            '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/h3/text()'
        ).extract()
        if len(comp_name) > 0:
            comp_name = " ".join(comp_name[0].split())
            item['company name'] = comp_name
        else:
            comp_name2 = Selector(response).xpath(
                '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/text()'
            ).extract()
            if len(comp_name2) > 0:
                comp_name2 = " ".join(comp_name2[0].split())
                item['company name'] = comp_name2

        # get company url:
        comp_url = response.url
        item['company_url'] = comp_url

        # get company address:
        address = Selector(response).xpath(
            '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/p[1]/text()'
        ).extract()  #extract the data list address
        join_address = " ".join("".join(address).split())
        item['company address'] = join_address

        # get company country:
        item['country'] = "Singapore"  #as default

        # get company phone & fax:
        phone = Selector(response).xpath(
            '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuephone"]/a/text()'
        ).extract()[0].strip()

        if len(
                Selector(response).xpath(
                    '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuefax"]/a/text()'
                ).extract()) > 0:
            fax = Selector(response).xpath(
                '//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div/div[@class="col-md-9 company-details"]/div/div[@class="col-md-7 company-contact"]/div[@class="valuefax"]/a/text()'
            ).extract()[0].strip()
            item['company phone number'] = [
                " ".join(phone.split()), " ".join(fax.split())
            ]
        else:
            item['company phone number'] = [" ".join(phone.split())]

        # get company email (the address is embedded in an onclick handler):
        email = Selector(response).xpath(
            contact_base + '/a[@id="textemail"]/@onclick').extract()
        if email:
            item['company email'] = email[0].split("'")[1]

        # get company website:
        web = Selector(response).xpath(
            contact_base + '/div[@class="valuewebsite"]/a/@href').extract()
        if web:
            item['company website'] = web[0].strip()

        # get company description:
        desc_base = ('//*[@id="Contentplaceholder1_T5CE92B6B022_Col01"]/div'
                     '/div[@class="col-md-12"]')
        comp_description = "".join(
            Selector(response).xpath(
                desc_base + '/div[@class="company-description"]/text()'
            ).extract()).strip()
        if comp_description != "":
            item['company description'] = comp_description

        # get company products & services (titles of the carousel images):
        comp_ps = Selector(response).xpath(
            desc_base + '/div[@class="owl-carousel-container"]/div[1]'
            '/div[@class="item"]/a/img/@title').extract()
        if comp_ps:
            item['products and services'] = comp_ps

        # get company categories:
        comp_cat = Selector(response).xpath(
            desc_base + '/div[@class="company-description"]/ul/ul/li/a/text()'
        ).extract()
        if comp_cat:
            item['category'] = comp_cat

        # get company contacts (the <p> block mixes names with phone and
        # e-mail lines, so skip everything that is not a person):
        contacts_raw = Selector(response).xpath(
            contact_base + '/p[2]/text()').extract()
        contacts = []
        for elem in contacts_raw:
            elem = elem.strip()
            if (elem == "" or elem == "Contact" or "Tel" in elem
                    or "Mobile" in elem or "mail" in elem):
                continue
            parts = elem.split(",")
            if len(parts) > 1:
                contacts.append({'name': parts[0], 'job_title': parts[1]})
            else:
                contacts.append({'name': parts[0]})
        if contacts:
            item['contacts'] = contacts

        # still can't handle the contacts' e-mail addresses:
        # emails_raw = Selector(response).xpath(contact_base + '/p[2]/a/text()').extract()

        yield item
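
Each field above follows the same extract-then-check pattern; a tiny hypothetical helper could collapse it to one line per field (a sketch, not part of the original spider):

from scrapy.selector import Selector

def xp_first(response, query, default=""):
    # Return the first XPath match with whitespace collapsed, or a
    # default when the node is absent from the page.
    values = Selector(response).xpath(query).extract()
    return " ".join(values[0].split()) if values else default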
Code example #30
0
def Vacancy(link):
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # headers=headers

    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/text()'
        ).get()
    except:
        company = ""

    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/@href'
        ).get()
        website = [website] if website else []
    except:
        website = []

    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobPostTitle"]/text()'
        ).get()
    except:
        position = ""

    # logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_imgCompanyLogoLink"]/@src'
        ).get()
        logo = "http://jobfinder.am/" + logo
    except:
        logo = ''

    # Job_type
    try:
        job_type = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblPositionType"]/text()'
        ).get()
    except:
        job_type = ""

    # Category
    try:
        category = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblCategory"]/text()'
        ).get()
    except:
        category = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblExperience"]/text()'
        ).get()
    except:
        experience = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblEducation"]/text()'
        ).get()
    except:
        education = ""

    # Location
    try:
        location = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblLocation"]/text()'
        ).get()
    except:
        location = ""

    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()'
        ).get()
        published = published.split(" ")
        published = published[0].split("-")
        publish_day = int(published[0])
        publish_month = int(published[1])
        publish_year = int("20" + published[2])
    except:
        publish_day = 0
        publish_month = 0
        publish_year = 0
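    # yesterday_day / yesterday_month are assumed to be module-level
    # globals computed elsewhere in this script; the scraper only keeps
    # postings published the previous day.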
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return

    # Ends
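    # NOTE: this block reuses the same lblDate element and the same [0]
    # index as the Published block above, so the "deadline" always comes
    # out equal to the publish date; if the label reads
    # "dd-mm-yy - dd-mm-yy", the closing date is presumably the later
    # token of that string.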
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()'
        ).get()
        ends = ends.split(" ")
        ends = ends[0].split("-")
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int("20" + ends[2])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblSalary"]/text()'
        ).get()
        salary = int(salary)
    except:
        salary = 0

    # Age
    try:
        age = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAge"]/text()'
        ).get()
        if "--------" in age:
            age = ""
    except:
        age = ""

    # Gender
    try:
        gender = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblGender"]/text()'
        ).get()
        if "--------" in gender:
            gender = ""
    except:
        gender = ""

    # Job Description
    try:
        j_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobDescription"]/text()'
        ).get()
    except:
        j_description = ""

    # Job Responsibilities
    try:
        j_responsibilities = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobResponsibilities"]/text()'
        ).get()
    except:
        j_responsibilities = ""

    # Required Qualifications
    try:
        r_qualifications = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblRequiredQualifications"]'
        ).get()
        r_qualifications = remove_tags(r_qualifications)
    except:
        r_qualifications = ""

    # Application Procedure
    try:
        a_procedure = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]'
        ).get()
        a_procedure = remove_tags(a_procedure)
    except:
        a_procedure = ""

    v_description = j_description + "\n" + j_responsibilities + "\n" + r_qualifications + "\n" + a_procedure
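    # langdetect ships no Armenian model, so Armenian text tends to be
    # reported under some other code; this scraper apparently treats
    # "et" (Estonian) as its marker for Armenian-language descriptions.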
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except:
                v_description_en = ""
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except:
        v_description_en = ""
        v_description_am = ""

    # About Company
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAboutCompany"]'
        ).get()
        c_description = remove_tags(c_description)
    except:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except:
                c_description_en = ""
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except:
        c_description_en = ""
        c_description_am = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]/a/text()'
        ).get()
        email = email.strip()
        email = [email]
    except:
        email = []

    # Phone
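    # Heuristic: grab the first 9-digit run anywhere in the English
    # description and assume it is an Armenian number (country code 374).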
    try:
        phone = re.search(r"\d{9}", v_description_en).group()
        phone = [{"country_code": "374", "number": phone}]
    except:
        phone = []

    data = {
        "company": company,
        "position": position,
        "website": website,
        "logo": logo,
        "job_type": job_type,
        "category": category,
        "experience": experience,
        "education": education,
        "location": location,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "salary": salary,
        "age": age,
        "gender": gender,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
        "email": email,
        "phone": phone,
    }

    # print(data)
    return data


# Vacancy('http://jobfinder.am/ViewJob.aspx?JobPostingID=49217')
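
The jobfinder.am and rezume.am examples both lean on helpers that are never shown: detect(), Translate() and remove_tags(). A minimal sketch of plausible implementations, assuming the langdetect and googletrans packages plus scrapy's w3lib dependency (the real project may define them differently):

from langdetect import detect              # two-letter language codes ("en", "et", ...)
from w3lib.html import remove_tags         # strip HTML tags, keep the text
from googletrans import Translator         # unofficial Google Translate client

def Translate(text):
    # Translate arbitrary text to English; this can raise on network or
    # quota errors, which is why every caller wraps it in try/except.
    return Translator().translate(text, dest="en").text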
Code example #31
0
def Vacancy(link):
    url = link
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(url, headers=headers)

    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except:
        company = ""

    # position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except:
        position = ""

    # logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except:
        logo = ""

    # Job_type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Contact Person
    try:
        person = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get()
        person = person.strip()
    except:
        person = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get()
        email = email.strip()
        email = [email]
    except:
        email = []

    # Phone
    try:
        phone = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get()
        phone = phone.strip()
        # One or more comma-separated numbers; normalise each into a
        # {"country_code", "number"} dict, defaulting to Armenia ("374")
        # whenever no recognisable country code is present.
        parsed = []
        for each in phone.split(","):
            each = each.strip()
            if "+" in each and " " in each:
                # "+374 XX XXXXXX": the country code ends at the first space
                country_code = each.split(" ", 1)[0].replace("+", "")
                number = each.split(" ", 1)[1].replace("-", "").replace(" ", "")
            elif "+374" in each:
                country_code = "374"
                number = each.replace("+374", "")
            elif "+1" in each:
                country_code = "1"
                number = each.replace("+1", "")
            elif "+" in each:
                # unrecognised "+" prefix: keep the raw string, assume Armenia
                country_code = "374"
                number = each
            else:
                country_code = "374"
                number = each.replace("-", "").replace(" ", "")
            parsed.append({"country_code": country_code, "number": number})
        phone = parsed
    except Exception:
        phone = []

    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get()
        website = website.strip()
        if "not" in website:
            website = []
        else:
            website = [website]
    except:
        website = []

    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get()
        published = published.strip()
        publish_day = int(published.split("-")[2])
        publish_month = int(published.split("-")[1])
        publish_year = int(published.split("-")[0])
    except:
        publish_day = 0
        publish_month = 0
        publish_year = 0

    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get()
        ends = ends.strip()
        deadline_day = int(ends.split("-")[2])
        deadline_month = int(ends.split("-")[1])
        deadline_year = int(ends.split("-")[0])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Career Level
    try:
        career_level = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get()
        if career_level is None:
            career_level = ""
    except:
        career_level = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get()
        if education is None:
            education = ""
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get()
        if experience is None:
            experience = ""
    except:
        experience = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get()
        if "-" in salary:
            # "min - max" range; dots are thousands separators
            min_salary = int(salary.split("-")[0].strip().replace(".", ""))
            max_salary = int(salary.split("-")[1].strip().replace(".", ""))
        elif salary != "N/A":
            # single figure: use it for both bounds
            min_salary = int(salary.replace(".", ""))
            max_salary = min_salary
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0

    # Vacancy Description
    try:
        v_description = Selector(
            response=page).xpath('//*[@id="loyal"]/div[2]/div/div[1]').get()
        v_description = remove_tags(v_description).strip()
        v_description = v_description.replace('\xa0', " ")
    except:
        v_description = ""
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except:
                v_description_en = " "
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except:
        v_description_am = ""
        v_description_en = ""

    # Company Description
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get()
        c_description = c_description.strip()
    except:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except:
                c_description_en = " "
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except:
        c_description_am = ""
        c_description_en = ""

    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }

    print(data)
    return data


# Vacancy("https://rezume.am/job/2184")