def parse_review(review: bs4.element.Tag) -> dict:
    """Collect the fields of a single review into a plain dictionary.

    Args:
        review: HTML fragment that holds one complete review.

    Returns:
        A dict containing ``headline``, ``country`` and ``body``; ``rating``
        when the ``div.rating-10 span`` element is present; and one entry per
        table row, keyed by the second CSS class of the row's first cell.
        For star rows the value is the number of the last filled star,
        for every other row it is the text of the second cell.
    """
    info = {}

    overall = review.select_one("div.rating-10 span")
    if overall:
        info['rating'] = int(overall.text)

    info['headline'] = review.select_one("h2.text_header").text

    # The country is whatever sits inside the first pair of parentheses of
    # the sub-header: turning ')' into '(' lets a single split isolate it.
    sub_header = review.select_one('h3.text_sub_header').text
    pieces = sub_header.replace(')', '(').split('(')
    info['country'] = pieces[1] if len(pieces) > 1 else 'None'

    info['body'] = review.select_one("div.text_content").text.strip()

    for row in review.select('tr'):
        value_cell = row.select('td')[1]

        if value_cell.attrs['class'][0] != 'review-rating-stars':
            # Plain row: keep the text of the value cell as-is.
            info[row.td.attrs['class'][1]] = value_cell.text
            continue

        # Star row: every filled star overwrites the entry, so the highest
        # filled star ends up as the stored value.
        for star in row.select('span'):
            try:
                if star.attrs['class'] == ['star', 'fill']:
                    info[row.td.attrs['class'][1]] = int(star.text)
            except KeyError:
                # Elements without a class attribute are not stars.
                continue

    return info
def parse_event(event: bs4.element.Tag):
    """Extract the details of a single event from its list-item element.

    Args:
        event: Element wrapping one entry of the events list.

    Returns:
        An ``Event`` built from the entry. ``dt_start`` and ``dt_end`` are
        both read from the first ``<time>`` element of the entry, and the
        community name is used for ``owner`` as well. ``community`` and
        ``owner`` are ``None`` when the entry has no group link.

    Raises:
        AttributeError: If the title link, its ``span`` or the ``<time>``
            element is missing.
        TypeError: If the event URL does not end in a numeric id.
    """
    url = event.select_one('.events-list-item-title h3 a').get('href')

    group_link = event.select_one('.events-list-item-group a')
    community = group_link.text if group_link else None

    # Start and end come from the same attribute, so parse it only once.
    # Example value: 2019-10-12T13:00:00+09:00
    when = datetime.strptime(
        event.select_one('time').get('datetime'), '%Y-%m-%dT%H:%M:%S%z')

    # The venue is split over several spans; glue their text back together.
    place = ''.join(
        span.text for span in event.select('.events-list-item-venue > span'))

    # NOTE: amount and thumbnail are not populated from this markup, so they
    # are not passed to Event.
    return Event(
        id=int(re.match(r'.+/(\d+)/?', url)[1]),
        title=event.select_one('.events-list-item-title h3 a span').text,
        url=url,
        dt_start=when,
        dt_end=when,
        community=community,
        owner=community,
        place=place)
def parse_item(item: bs4.element.Tag):
    """Build a ``SongItem`` from one song entry element.

    The id is taken from the element's ``mid`` attribute, the name from
    ``.song-name-text`` (stripped) and the singer from ``.song-singer``,
    with every run of whitespace collapsed to a single space.

    :return SongItem(name, singer, id)
    """
    song_id = item.attrs['mid']
    title = item.select_one('.song-name-text').text.strip()

    raw_artist = item.select_one('.song-singer').text
    # An empty singer text is passed through untouched.
    artist = re.sub(r'\s+', ' ', raw_artist).strip() if raw_artist else raw_artist

    return SongItem(title, artist, song_id)
def __init__(self, item: bs4.element.Tag):
    """Populate the entry from its menu element.

    The display name is read from ``span.txt``. When the element has no
    ``li.listItem a`` children it is treated as a leaf and its link and id
    are resolved from its own anchor via ``solve_link``. Otherwise each
    child anchor is wrapped in a ``sub_industry`` and appended to this
    object.
    """
    self.name = item.select_one('span.txt').text

    children = item.select('li.listItem a')
    if children:
        for child in children:
            self.append(sub_industry(child))
        return

    # Leaf entry: no sub-menu, so the element's own anchor is the target.
    self.link, self.id = solve_link(item.select_one('a')['href'])
def tag2gift(tag: bs4.element.Tag):
    """
    Extract info from the tag and return the Gift object built from it.

    The first three cells supply the name, the description and the price.
    The first character of the price text is dropped (presumably a currency
    symbol) and thousands separators are removed before converting to float.

    :param tag bs4.element.Tag: html tag
    """
    def cell_text(selector: str) -> str:
        # Stripped text content of the cell matched by `selector`.
        return tag.select_one(selector).get_text().strip()

    name = cell_text("td:first-child")
    description = cell_text("td:nth-child(2)")
    digits = cell_text("td:nth-child(3)")[1:].replace(',', '')

    return Gift(name, description, float(digits))
def parse_block(self, block: bs4.element.Tag):
    """Parse one product card and append the result to ``self.result``.

    The card must provide a link (``a.ref_goods_n_p`` with an ``href``), a
    name container (``div.dtlist-inner-brand-name``) and, inside it, a brand
    (``strong.brand-name``) and a goods name (``span.goods-name``). If any
    of these is missing an error is logged and the card is skipped.

    Args:
        block: Element wrapping a single product card.

    Returns:
        None. A ``ParseResult`` is appended to ``self.result`` on success.
    """
    url_block = block.select_one('a.ref_goods_n_p')
    if url_block is None:
        logger.error('no url_block')
        return

    url = url_block.get('href')
    if not url:
        logger.error('no href')
        return

    name_block = block.select_one('div.dtlist-inner-brand-name')
    if name_block is None:
        logger.error('no name_block on %s', url)
        return

    # Check the brand tag itself; testing name_block again here would let a
    # missing brand fall through and crash on `.text` below.
    brand_tag = name_block.select_one('strong.brand-name')
    if brand_tag is None:
        logger.error('no brand_name on %s', url)
        return

    # The brand text looks like "Wrangler /": drop the separator.
    brand_name = brand_tag.text.replace("/", "").strip()

    goods_tag = name_block.select_one('span.goods-name')
    if goods_tag is None:
        logger.error('no goods_name on %s', url)
        return
    goods_name = goods_tag.text.strip()

    self.result.append(
        ParseResult(
            url=url,
            brand_name=brand_name,
            # Keyword spelling (double underscore) kept as in the existing
            # call; ParseResult is defined elsewhere.
            goods__name=goods_name,
        ))

    logger.debug('%s, %s, %s', url, brand_name, goods_name)
    logger.debug('-' * 100)
def parse_block(self, item: bs4.element.Tag):
    """Parse one listing snippet into a ``Block``.

    Args:
        item: Element wrapping a single listing.

    Returns:
        A ``Block`` with the absolute URL (``None`` without an ``href``),
        the title, the price and currency (both ``None`` when the price
        element is missing or does not split into exactly two parts) and
        the publication date (``None`` when the date element or its
        ``data-absolute-date`` attribute is missing).

    Raises:
        AttributeError: If the ``a.snippet-link`` anchor is missing or has
            no single text node to use as the title.
        ValueError: If the price part is not an integer once spaces are
            removed.
    """
    # Select the block with the link and the title.
    url_block = item.select_one('a.snippet-link')
    href = url_block.get('href')
    url = 'https://www.avito.ru' + href if href else None
    title = url_block.string.strip()

    # Select the block with the price. A missing element is handled like an
    # unparsable one: the price is optional, so log it and carry on.
    price_block = item.select_one('span.price')
    if price_block is None:
        price_parts = []
    else:
        stripped = (part.strip() for part in price_block.get_text('\n').split('\n'))
        price_parts = [part for part in stripped if part]

    if len(price_parts) == 2:
        price, currency = price_parts
        price = int(price.replace(" ", ""))
    else:
        price, currency = None, None
        logger.error("Что-то пошло не так при поиске цены: %s, %s", price_parts, url)

    # Select the block with the listing's publication date. The date is
    # optional, so a missing element simply leaves it as None.
    date = None
    date_block = item.select_one('div.item-date div.js-item-date.c-2')
    if date_block is not None:
        absolute_date = date_block.get('data-absolute-date')
        if absolute_date:
            date = self.parse_date(item=absolute_date)

    logger.info('%s, %s, %s, %s, %s', url, title, price, currency, date)

    return Block(
        url=url,
        title=title,
        price=price,
        currency=currency,
        date=date,
    )
def parse_event(event: bs4.element.Tag):
    """Extract the details of a single event from its element.

    Args:
        event: Element wrapping one event.

    Returns:
        An ``Event`` holding the id (trailing number of the event URL),
        title, URL, start and end timestamps, amount text, thumbnail URL
        (``None`` when it points at a ``/no_image_`` placeholder), community
        name (``None`` when there is no ``.series_title``), owner and place.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S%z'

    def read_timestamp(selector):
        # The timestamp is stored in the `title` attribute of the element.
        return datetime.strptime(
            event.select_one(selector).get('title'), timestamp_format)

    url = event.select_one('.event_title a').get('href')

    series = event.select_one('.series_title')
    community = series.text if series else None

    image_src = event.select_one('.event_thumbnail img').get('src')
    thumbnail = None if re.search(r'/no_image_', image_src) else image_src

    return Event(
        id=int(re.match(r'.+/(\d+)/?', url)[1]),
        title=event.select_one('.event_title a').text,
        url=url,
        dt_start=read_timestamp('.dtstart .value-title'),
        dt_end=read_timestamp('.dtend .value-title'),
        amount=event.select_one('.amount').text,
        thumbnail=thumbnail,
        community=community,
        owner=event.select_one('.event_owner img').get('title'),
        place=event.select_one('.event_place').text.strip(),
    )
def from_tag(cls, tag: bs4.element.Tag):
    """Build an instance from one listing row element.

    Reads the rent, administration fee, deposit and gratuity texts, the
    layout and area, the floor (first text of the third ``td``), the detail
    link and the new-arrival marker. Rent, deposit and gratuity are parsed
    with unit "万円", the administration fee with unit "円".

    Args:
        tag: Element wrapping a single listing row.

    Returns:
        ``cls`` called positionally with: rent, admin fee, deposit,
        gratuity, layout, area, min floor, max floor, url, jnc id and the
        new-arrival flag.
    """
    def span_text(css_class):
        # Text of the first <span> whose class attribute matches css_class.
        return tag.find("span", class_=css_class).text

    rent_text = span_text("cassetteitem_price cassetteitem_price--rent")
    admin_fee_text = span_text("cassetteitem_price cassetteitem_price--administration")
    deposit_text = span_text("cassetteitem_price cassetteitem_price--deposit")
    gratuity_text = span_text("cassetteitem_price cassetteitem_price--gratuity")
    layout = span_text("cassetteitem_madori")
    area_text = span_text("cassetteitem_menseki")

    # Only the first string of the third cell describes the floor.
    floor_cell = tag.find_all("td")[2]
    floor_text, *_ = floor_cell.stripped_strings
    min_floor, max_floor = parse_floor_range(floor_text)

    detail_link = tag.select_one("td.ui-text--midium.ui-text--bold a")
    detail_href = detail_link["href"]
    url = f"{SUUMO_URL}{detail_href}"
    jnc_id = re.search(r"jnc_([0-9]*)/", detail_href)[1]

    new_arrival_marker = tag.find(class_="cassetteitem_other-checkbox--newarrival")
    new_arrival = new_arrival_marker is not None

    return cls(
        parse_money(rent_text, unit="万円"),
        parse_money(admin_fee_text, unit="円"),
        parse_money(deposit_text, unit="万円"),
        parse_money(gratuity_text, unit="万円"),
        layout,
        parse_area(area_text),
        min_floor,
        max_floor,
        url,
        jnc_id,
        new_arrival,
    )
def parse_block(self, item: bs4.element.Tag):
    """Parse one listing snippet, store it as a ``Product`` and return it.

    An existing ``Product`` with the same URL is updated in place;
    otherwise a new one is created for ``self.task``.

    Args:
        item: Element wrapping a single listing.

    Returns:
        A ``Block`` with the URL (``None`` without an ``href``), title,
        price and currency (both ``None`` when the price text does not
        split into exactly two parts) and publication date (``None`` when
        the ``data-absolute-date`` attribute is missing).

    Raises:
        CommandError: If the link, price or date element is missing, or if
            the link has no usable title text.
        ValueError: If the price part is not an integer once spaces are
            removed.
    """
    # Select the block with the link and the title.
    url_block = item.select_one('a.snippet-link')
    if not url_block:
        raise CommandError('bad "url_block" css')

    href = url_block.get('href')
    url = 'https://www.avito.ru' + href if href else None

    # `.string` is None unless the anchor wraps a single text node; treat
    # that as an empty title so the CommandError below is raised instead of
    # an AttributeError.
    title = (url_block.string or '').strip()
    if not title:
        raise CommandError(f'no title for item: {url_block}')

    # Select the block with the price.
    price_block = item.select_one('span.price')
    if not price_block:
        raise CommandError('bad "price_block" css')

    stripped = (part.strip() for part in price_block.get_text('\n').split('\n'))
    price_parts = [part for part in stripped if part]
    if len(price_parts) == 2:
        price, currency = price_parts
        price = int(price.replace(" ", ""))
    else:
        price, currency = None, None
        logger.error("Что-то пошло не так при поиске цены: %s, %s", price_parts, url)

    # Select the block with the listing's publication date.
    date = None
    date_block = item.select_one('div.item-date div.js-item-date.c-2')
    if not date_block:
        raise CommandError('bad "date_block" css')

    absolute_date = date_block.get('data-absolute-date')
    if absolute_date:
        date = self.parse_date(item=absolute_date)

    block = Block(
        url=url,
        title=title,
        price=price,
        currency=currency,
        date=date,
    )
    logger.info(block)

    # Update the stored values if the object already exists in the
    # database, otherwise create it. Only the lookup sits inside the `try`
    # so that errors raised while saving are not mistaken for a miss.
    fields = {
        'task': self.task,
        'title': title,
        'price': price,
        'currency': currency,
        'public_date': date,
    }
    try:
        product = Product.objects.get(url=url)
    except Product.DoesNotExist:
        Product(url=url, **fields).save()
        logger.info("Except in product url: %s", url)
    else:
        for field_name, value in fields.items():
            setattr(product, field_name, value)
        product.save()

    return block