def parse_modal_infor(self, response):
    """Attach a key/value mapping parsed from the JSON ``spec`` HTML to the item.

    The response body is decoded as JSON and the HTML string stored under
    ``spec`` is processed twice: once with the ``<div>`` elements dropped to
    obtain the keys, and once with the ``<span>`` elements dropped to obtain
    the values.  In both passes the remaining tags are turned into ``|``
    separators so the text can be split into individual entries.

    Yields:
        The object stored under ``response.meta["item"]`` with its ``data``
        entry set to the ``dict`` built from the paired keys and values.
    """
    raw_body = response.text
    item = response.meta.get("item")
    spec_html = json.loads(raw_body)['spec']

    # Drop the <li><label>...</label></li> rows whose label is made only of
    # the characters listed in the class below.
    body_html = re.sub(r'<li><label>([a-zA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳýỵỷỹ\s\.&,.-])+<\/label><\/li>', "", spec_html)

    # Keys: discard the <div> elements, then split on the runs of separators
    # left behind by the replaced tags.
    key_markup = remove_tags_with_content(body_html, which_ones=('div',))
    key_text = replace_tags(key_markup, '|', 'utf-8')
    keys = [chunk.replace("||", "") for chunk in key_text.split("||||")]

    # Values: discard the <span> elements, unwrap <a>/<li>, then split.
    value_markup = remove_tags_with_content(body_html, which_ones=('span',))
    value_markup = remove_tags(value_markup, which_ones=('a', 'li',))
    value_text = replace_tags(value_markup, '|', 'utf-8')
    values = [chunk.replace("|", "") for chunk in value_text.split("||")]

    item['data'] = dict(zip(keys, values))
    yield item
def test_replace_tags(self):
    """replace_tags removes tags by default and inserts the token when given."""
    without_token = replace_tags("This text contains <a>some tag</a>")
    self.assertEqual(without_token, "This text contains some tag")

    # Bytes input combined with an explicit replacement token.
    with_token = replace_tags(b"This text is very im<b>port</b>ant", " ")
    self.assertEqual(with_token, "This text is very im port ant")
def test_replace_tags(self):
    """Check the return type and tag replacement of replace_tags."""
    # Make sure it always returns unicode.  ``type(u"")`` resolves to
    # ``unicode`` on Python 2 and ``str`` on Python 3, so the check no longer
    # depends on the Python 2-only ``unicode`` builtin.
    self.assertIsInstance(replace_tags("no entities"), type(u""))

    self.assertEqual(replace_tags(u"This text contains <a>some tag</a>"),
                     u"This text contains some tag")
    self.assertEqual(replace_tags("This text is very im<b>port</b>ant", " "),
                     u"This text is very im port ant")

    # Multiline tags: the opening tag spans a CRLF line break.
    self.assertEqual(replace_tags('Click <a class="one"\r\n href="url">here</a>'),
                     u"Click here")
def test_replace_tags(self):
    """Check that replace_tags returns text and replaces tags as expected."""
    # Make sure it always returns unicode.  The text type is obtained from a
    # literal so the test runs on Python 2 (``unicode``) and Python 3
    # (``str``) alike instead of raising NameError on the latter.
    text_type = type(u'')
    self.assertIsInstance(replace_tags('no entities'), text_type)

    self.assertEqual(
        replace_tags(u'This text contains <a>some tag</a>'),
        u'This text contains some tag')
    self.assertEqual(
        replace_tags('This text is very im<b>port</b>ant', ' '),
        u'This text is very im port ant')

    # Multiline tags: attributes continue after a CRLF inside the tag.
    self.assertEqual(
        replace_tags('Click <a class="one"\r\n href="url">here</a>'),
        u'Click here')
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'), token='____SECTION____'):
    """Normalize web content and yield its non-empty sections.

    The input goes through a fixed sequence of cleaning steps (whitespace
    stripping, comment removal, tag removal, optional tag replacement,
    entity replacement, escape-character replacement) and is then split
    with ``_rx_web_sectionize``.

    Parameters
    ----------
    x
        Content to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        Do not replace if `None` (or any other falsy value).

    Yields
    ------
    Each non-empty part produced by ``_rx_web_sectionize.split``.
    """
    steps = [
        strip_html5_whitespace,
        remove_comments,
        lambda text: remove_tags(text, keep=keep),
    ]
    if token:
        steps.append(lambda text: replace_tags(text, token=token))
    steps.extend([replace_entities, replace_escape_chars])

    try:
        # ``x`` is rebound after every step, so when a step fails the value
        # produced by the last successful step is what gets split below.
        for step in steps:
            x = step(x)
    except (TypeError, AttributeError):
        pass

    for section in _rx_web_sectionize.split(x):
        if not section:
            continue
        yield section
def parse(self, response):
    """Extract one record per ``<article>`` and follow the "next" link.

    Yields:
        A dict with the keys ``Title``, ``Published On``, ``Content`` and
        ``Link`` for every ``<article>`` element in the response, followed
        by a ``Request`` for the next page when such a link is present.
    """
    for article in response.css('article'):
        title = article.css('.entry-title > a::text').extract_first()
        date_published = article.css(
            '.entry-date.published::text').extract_first()
        link = article.xpath(
            './/*[@class="more-link"]/@href').extract_first()

        # ``.get`` keeps an <article> without a class attribute from raising
        # KeyError; such an article is handled by the non-video branch.
        if 'category-video' in article.attrib.get('class', ''):
            # ``or ''`` guards against a missing node: extract_first()
            # returns None in that case and the tag helpers expect text.
            content_dirty = article.css(
                '.entry-content p').extract_first() or ''
            content = remove_tags(content_dirty).replace('|', ' | ').replace(
                '\n', ' ')
        else:
            content_dirty = article.xpath(
                './/*[@class="entry-content"]').extract_first() or ''
            content = replace_tags(
                replace_escape_chars(content_dirty), ' ',
            ).replace('(more…)', '').replace('Read More', '').strip()

        yield {
            'Title': title,
            'Published On': date_published,
            'Content': content,
            'Link': link,
        }

    next_page = response.xpath(
        './/a[contains(@class, "next")]/@href').extract_first()
    # On the last page there is no "next" link; building a Request from None
    # would raise.  urljoin() also makes a relative href absolute.
    if next_page:
        yield Request(response.urljoin(next_page))
def process_item(self, item, spider):
    """Print a readable summary of the item and pass the item on.

    Args:
        item: Mapping-like object; the keys ``user_name``, ``user_gender``,
            ``user_age``, ``god_comment`` and ``content`` are read, each
            with a default when absent.
        spider: Unused by this method.

    Returns:
        The same ``item``, so that any later pipeline stage still receives
        it (previously the method implicitly returned ``None``).
    """
    user_name = item.get('user_name', '')
    user_gender = item.get('user_gender', '')
    user_age = item.get('user_age', 0)
    god_comment = item.get('god_comment', '')
    content = item.get('content', '')

    # Change how the gender is displayed: 'M' maps to '男', anything else
    # (including a missing value) maps to '女'.
    user_gender = '男' if user_gender == 'M' else '女'

    # Strip HTML tags, turning each tag into a line break.
    content = replace_tags(content, token='\n').strip()

    print('#' * 30)
    if user_name != '匿名用户':
        print('{} {} {}:'.format(user_name, user_gender, user_age))
    else:
        print('匿名用户:')
    print('正文: {}'.format(content))
    if god_comment != '':
        print('神评: {}'.format(god_comment))
    print('#' * 30 + '\n')

    return item
def _parse_rooms_by_js(self, rooms_in_script):
    """Convert room entries taken from script data into plain dicts.

    Args:
        rooms_in_script: Iterable of dict-like room entries.  The keys read
            are ``additionalInfo``, ``bedTypeAndOccupancy``, ``images``,
            ``maxOccupancy``, ``name`` and ``ratePlans``.

    Returns:
        A list with one dict per room holding the keys ``name``, ``size``,
        ``occupancy``, ``bed_types``, ``bed_metrics``, ``bed_types_str``,
        ``bed_extra_types``, ``description``, ``amenities``, ``images`` and
        ``room_type_code``.

    Raises:
        KeyError / IndexError: if a room has no ``name`` or lacks the
            ``ratePlans[0]['payment']['book'][...]['roomTypeCode']`` path.
    """
    rooms = []
    for room_item in rooms_in_script:
        additional_info = room_item.get('additionalInfo', {})
        # Read the description once, tolerating a missing or empty value
        # (the original indexed the key directly and could raise KeyError).
        raw_description = additional_info.get('description', '') or ''

        # When the description starts with <strong>...</strong>, the text
        # inside that first element is taken as the room size.
        size = ''
        if raw_description.startswith('<strong>'):
            size = raw_description.split('</strong>', 1)[0][len('<strong>'):]

        bed_type_and_occupancy = room_item.get('bedTypeAndOccupancy', {})
        bed_types = bed_type_and_occupancy.get('bedTypes', [])
        bed_extra_types = bed_type_and_occupancy.get('extraBeds', [])

        images = [
            {'type': image.get('caption'), 'url': image.get('fullSizeUrl')}
            for image in room_item.get('images', [])
        ]

        # Keep <br> while stripping other tags, turn the remaining tags
        # into newlines, then drop blank lines.
        description_text = replace_tags(
            remove_tags(raw_description, keep=('br',)), '\n')
        description = [
            line for line in description_text.split('\n')
            if line not in ('', ' ')
        ]

        # Tolerate a missing occupancy block or a missing message part
        # instead of failing inside ''.join().
        occupancy = room_item.get('maxOccupancy') or {}
        occupancy_text = ''.join(
            part
            for part in (occupancy.get('messageTotal'),
                         occupancy.get('messageChildren'))
            if part
        )

        # NOTE(review): remove_tags expects text; confirm every amenity
        # entry is a string rather than a dict.
        amenities = [
            remove_tags(amenity)
            for amenity in additional_info.get('details', {}).get('amenities', [])
        ]

        rooms.append({
            'name': remove_tags(room_item['name']),
            # ``size`` is a string, so ``size[0]`` would keep only its first
            # character; store the whole value (or None when empty).
            'size': size or None,
            'occupancy': occupancy_text,
            'bed_types': bed_types,
            'bed_metrics': bed_type_and_occupancy.get('bedTypesTooltipMessage'),
            'bed_types_str': bed_type_and_occupancy.get('localisedName'),
            'bed_extra_types': bed_extra_types,
            'description': description,
            'amenities': amenities,
            'images': images,
            'room_type_code': room_item['ratePlans'][0]['payment']['book']['bookingParamsMixedRatePlan']['roomTypeCode'],  # noqa
        })
    return rooms
def on_update(self, status):
    """A new status has appeared!

    'status' is the parsed JSON dictionary describing the status.

    For a status whose ``language`` is in the accepted list, the ``content``
    is stripped of tags, its entities are replaced, and the resulting text
    is printed and sent as UTF-8 (with a trailing newline) to
    ``("localhost", 9009)`` through ``tcp_connection``, a name this method
    expects to find in the enclosing scope.

    Any ``Exception`` raised while doing so is reported on stdout and
    swallowed so a single bad status does not propagate to the caller.
    """
    try:
        # NOTE(review): 'None' here is the string; a status whose language
        # is the None object does not match this list -- confirm intent.
        if status['language'] in ['en', 'fr', 'None', 'es', 'de']:
            toot_text = replace_entities(replace_tags(status['content']))
            print("Toot Text: " + toot_text)
            print("------------------------------------------")
            message = toot_text + '\n'
            tcp_connection.sendto(message.encode('utf-8'), ("localhost", 9009))
    except Exception as exc:
        # ``except Exception`` (instead of a bare ``except:``) lets
        # KeyboardInterrupt and SystemExit propagate.  The printed value is
        # still the exception class, as before.
        print("Error: %s" % type(exc))
def on_update(self, status):
    """A new status has appeared!

    'status' is the parsed JSON dictionary describing the status.

    For a status whose ``language`` is in the accepted list, the ``content``
    is stripped of tags and its entities are replaced; the text is then
    printed together with the list of its space-separated words that are
    not English, French, Spanish or German stop words.  Nothing is sent
    over the network in this variant.

    Any ``Exception`` raised while doing so is reported on stdout and
    swallowed so a single bad status does not propagate to the caller.
    """
    try:
        # NOTE(review): 'None' here is the string; a status whose language
        # is the None object does not match this list -- confirm intent.
        if status['language'] in ['en', 'fr', 'None', 'es', 'de']:
            toot_text = replace_entities(replace_tags(status['content']))

            # A set gives constant-time membership tests for every word.
            stopwords_combined = set()
            for language in ('english', 'french', 'spanish', 'german'):
                stopwords_combined.update(stopwords.words(language))

            wordsc = [
                w for w in toot_text.split(" ")
                if w.lower() not in stopwords_combined
            ]
            print("Toot Text: " + toot_text)
            print(wordsc)
            print("------------------------------------------")
    except Exception as exc:
        # ``except Exception`` (instead of a bare ``except:``) lets
        # KeyboardInterrupt and SystemExit propagate.  The printed value is
        # still the exception class, as before.
        print("Error: %s" % type(exc))
def format_bedsize(self, bedsize):
    """Return *bedsize* with every tag replaced by a newline.

    A falsy *bedsize* (``None``, empty string, ...) yields ``''`` without
    calling ``replace_tags``.
    """
    return replace_tags(bedsize, '\n') if bedsize else ''
def test_replace_tags_multiline(self):
    """A tag whose attributes span a CRLF line break is still replaced."""
    markup = b'Click <a class="one"\r\n href="url">here</a>'
    self.assertEqual(replace_tags(markup), u'Click here')
def test_replace_tags(self):
    """Tags are removed by default and replaced by the token when given."""
    plain = replace_tags(u'This text contains <a>some tag</a>')
    self.assertEqual(plain, u'This text contains some tag')

    # Bytes input combined with a single-space token.
    spaced = replace_tags(b'This text is very im<b>port</b>ant', ' ')
    self.assertEqual(spaced, u'This text is very im port ant')
def test_returns_unicode(self):
    """replace_tags returns text for both bytes and text input."""
    # Make sure it always returns unicode, whatever the input type.
    for raw in (b'no entities', 'no entities'):
        assert isinstance(replace_tags(raw), six.text_type)
def format_bedsize(self, bedsize):
    """Replace every tag in *bedsize* with a newline.

    Returns ``""`` for a falsy *bedsize* (``None``, empty string, ...);
    ``replace_tags`` is only called for a truthy value.
    """
    if bedsize:
        return replace_tags(bedsize, "\n")
    return ""
def process_value(self, value):
    """Strip tags from *value* and collapse whitespace runs to single spaces."""
    without_tags = replace_tags(value).strip()
    words = without_tags.split()
    return " ".join(words)
def test_replace_tags_multiline(self):
    """A tag broken across lines with CRLF is replaced like any other tag."""
    result = replace_tags(b'Click <a class="one"\r\n href="url">here</a>')
    self.assertEqual(result, 'Click here')
def test_returns_unicode(self):
    """replace_tags returns ``str`` for both bytes and str input."""
    # Make sure it always returns unicode, whatever the input type.
    from_bytes = replace_tags(b'no entities')
    assert isinstance(from_bytes, str)

    from_text = replace_tags('no entities')
    assert isinstance(from_text, str)
def _parse(self):  # pylint: disable=R0912,R0915
    """Fill ``self._order`` with the fields found in ``self._etree``.

    Each field is looked up through an XPath constant from ``marriott_xp``.
    A value that is found is stored in the result; a value that is missing
    is reported through ``self.logger`` together with ``self._message_id``
    and parsing continues with the next field.

    Returns:
        ``self._order`` (mutated in place) with whichever of the following
        keys could be extracted: ``hotel_name``, ``address``, ``telephone``,
        ``confirm_code``, ``guest_name``, ``check_in_time``,
        ``check_out_time``, ``check_in_date``, ``check_out_date``,
        ``check_in_date_formatted``, ``check_out_date_formatted``,
        ``hotel_link``, ``map_link``, ``cancellation_link``,
        ``related_links``, ``room_type``, ``number_of_rooms``,
        ``guarantee_policies``, ``price_tips``, ``notice``, ``price``,
        ``taxes_fee``, ``total_cost``, ``other_charges``,
        ``cancellation_policies``, ``additional_information`` and
        ``contact_information``.
    """
    result = self._order

    # --- hotel, confirmation and guest -----------------------------------
    hotel_name = strip_str(take_first(self._etree, marriott_xp.HOTEL_NAME))
    if hotel_name:
        result['hotel_name'] = hotel_name
    else:
        self.logger.error('hotel_name is empty %s', self._message_id)

    address = strip_str(take_first(self._etree, marriott_xp.ADDRESS))
    if address:
        result['address'] = address
    else:
        self.logger.error('address is empty %s', self._message_id)

    tel = take_first(self._etree, marriott_xp.PHONE)
    if tel:
        result['telephone'] = tel
    else:
        self.logger.error('telephone is empty %s', self._message_id)

    confirm_num = take_first(self._etree, marriott_xp.CONFIRM_NUM)
    if confirm_num:
        # Keep only the part after the last ': '.
        result['confirm_code'] = confirm_num.split(': ')[-1]
    else:
        self.logger.error('confirm_number is empty %s', self._message_id)

    guest = take_first(self._etree, marriott_xp.GUEST)
    if guest:
        # Keep only the part after the last 'For '.
        result['guest_name'] = guest.split('For ')[-1]
    else:
        self.logger.error('guest_name is empty %s', self._message_id)

    # --- check-in / check-out --------------------------------------------
    check_in_out_time = strip_list(
        self._etree.xpath(marriott_xp.CHECK_IN_OUT_TIME))
    if len(check_in_out_time) == 2:
        result['check_in_time'] = check_in_out_time[0]
        result['check_out_time'] = check_in_out_time[1]
    else:
        self.logger.error('check_in_date and check_out_time is empty %s',
                          self._message_id)

    check_in_out_date = strip_list(
        self._etree.xpath(marriott_xp.CHECK_IN_OUT_DATE))
    check_in_date, check_out_date = unpack(check_in_out_date)
    if check_in_date and check_out_date:
        result['check_in_date'] = check_in_date
        result['check_out_date'] = check_out_date
        # The time zone is derived from the address extracted above.
        tz = to_timezone(address)
        check_in_date_formatted = \
            DateTime(check_in_date, 'MMMM DD, YYYY').tz_to_datetime(tz)
        if check_in_date_formatted:
            result['check_in_date_formatted'] = check_in_date_formatted
        else:
            self.logger.error('check_in_date_formatted is empty %s',
                              self._message_id)
        check_out_date_formatted = \
            DateTime(check_out_date, 'MMMM DD, YYYY').tz_to_datetime(tz)
        if check_out_date_formatted:
            result['check_out_date_formatted'] = check_out_date_formatted
        else:
            self.logger.error('check_out_date_formatted is empty %s',
                              self._message_id)
    else:
        self.logger.error('check_in_date and check_out_date is empty %s',
                          self._message_id)

    # --- related links ---------------------------------------------------
    related_links = self._etree.xpath(marriott_xp.RELATED_LINK)
    related_text = self._etree.xpath(marriott_xp.RELATED_TEXT)
    related_links = to_dict(related_text, related_links)
    if related_links:
        related = []
        # Iterate over a snapshot: entries are removed from
        # ``related_links`` inside the loop, and removing from the list
        # being iterated would silently skip the following entry.
        for entry in list(related_links):
            # ``or ''`` avoids a TypeError from ``in`` on a missing name.
            link_name = entry.get('name') or ''
            if 'Hotel Website' in link_name:
                result['hotel_link'] = entry.get('value')
                related_links.remove(entry)
            elif 'Map & Directions' in link_name:
                result['map_link'] = entry.get('value')
                related_links.remove(entry)
            elif 'Cancel' in link_name:
                result['cancellation_link'] = entry.get('value')
            else:
                related.append(entry)
        if related:
            # The stored list is ``related_links`` after the removals
            # above, so it still contains any 'Cancel' entry.
            result['related_links'] = related_links
    else:
        self.logger.error('related_links is empty %s', self._message_id)

    # --- room ------------------------------------------------------------
    room_type = take_first(self._etree, marriott_xp.ROOM_TYPE)
    room_type_value = take_first(self._etree, marriott_xp.ROOM_TYPE_VALUE)
    if room_type and room_type_value:
        result['room_type'] = room_type_value
    else:
        self.logger.error('room_type is empty %s', self._message_id)

    room_num_guest = strip_list(
        self._etree.xpath(marriott_xp.ROOM_NUM_GUEST))
    room_num_guest_name, room_num_guest_value = group(room_num_guest)
    if room_num_guest_name and room_num_guest_value:
        for label, amount in zip(room_num_guest_name, room_num_guest_value):
            if 'NUMBER OF ROOMS' in label:
                result['number_of_rooms'] = amount
            elif 'GUESTS PER ROOM' in label:
                continue
            else:
                self.logger.warning('%s is %s %s', label, amount,
                                    self._message_id)
    else:
        self.logger.error('room number is empty %s', self._message_id)

    # --- guarantee, price notes and notices ------------------------------
    guarantee = strip_list(self._etree.xpath(
        marriott_xp.GUARANTEED_METHOD))
    if guarantee:
        result['guarantee_policies'] = [guarantee[-1]]
    else:
        self.logger.error('guarantee is empty %s', self._message_id)

    price_des = strip_str(
        take_first(self._etree, marriott_xp.CHARGE_DESCRIPTION))
    if price_des:
        result['price_tips'] = [price_des]
    else:
        self.logger.warning('price_description is empty %s',
                            self._message_id)

    notice = strip_list(self._etree.xpath(marriott_xp.HOTEL_ALERT))
    if notice:
        result['notice'] = notice
    else:
        self.logger.error('notice is empty %s', self._message_id)

    # --- rates, taxes and totals -----------------------------------------
    rates = strip_list(self._etree.xpath(marriott_xp.RATES))
    if rates:
        # The last entry carries the rate type; the rest is read in
        # groups of three with nights at offset 1 and price at offset 2.
        rates_type = rates.pop()
        if 'Best Available rate' in rates_type:
            nights = sum(int(i.split(' ')[0]) for i in rates[1::3])
            price = sum(float(i.split(' ')[0]) for i in rates[2::3])
            currency = ' ' + rates[2].split(' ')[-1]
            # Average price per night, rounded to two decimals.
            result['price'] = str(round(price / nights, 2)) + currency
        else:
            result['price'] = rates[-1]
            # NOTE(review): an error is logged although a price was just
            # stored -- confirm this message/level is intended.
            self.logger.error('price is empty %s', self._message_id)
    else:
        self.logger.error('rates is empty %s', self._message_id)

    taxes = strip_list(self._etree.xpath(marriott_xp.TAXES))
    name, value = unpack(taxes)
    if name and value:
        if 'TAXES & FEES' in name:
            result['taxes_fee'] = taxes[-1]
        else:
            self.logger.error('%s is %s %s', name, value, self._message_id)
    else:
        self.logger.error('taxes is empty %s', self._message_id)

    total = strip_list(self._etree.xpath(marriott_xp.TOTAL))
    name, value = unpack(total)
    if total:
        if 'Total' in name:
            result['total_cost'] = total[-1]
        else:
            self.logger.error('%s is %s %s', name, value, self._message_id)
    else:
        self.logger.error('total_price is empty %s', self._message_id)

    other_charge = strip_list(self._etree.xpath(marriott_xp.OTHER_CHARGE))
    # Drop bullet characters and the section heading itself.
    other_charge = [
        i for i in other_charge if i != '\u2022' and i != 'Other Charges'
    ]
    if other_charge:
        result['other_charges'] = other_charge
    else:
        self.logger.warning('other_charge is empty %s', self._message_id)

    # --- cancellation and rate guarantee ---------------------------------
    cancellation = take_first(self._etree,
                              marriott_xp.RATE_CANCELLATION_DETAILS)
    if cancellation is not None:
        # Serialize the node, strip its tags and split on bullet markers.
        cancellation = replace_tags(etree.tostring(cancellation))
        cancellation = cancellation.split('• \n')
        result['cancellation_policies'] = strip_list(cancellation)
    else:
        self.logger.error('cancellation_policy is empty %s',
                          self._message_id)

    rate_guarantee_title = take_first(self._etree,
                                      marriott_xp.RATE_GUARANTEE_TITLE)
    rate_guarantee = strip_list(
        self._etree.xpath(marriott_xp.RATE_GUARANTEE))
    rate_guarantee = [i for i in rate_guarantee if i != '\u2022']
    # The explicit truthiness check on the title avoids a TypeError from
    # ``in`` when the title could not be found.
    if (rate_guarantee and rate_guarantee_title
            and 'GUARANTEE' in rate_guarantee_title):
        guarantee = result.get('guarantee_policies')
        if guarantee:
            result['guarantee_policies'].extend(rate_guarantee)
        else:
            result['guarantee_policies'] = rate_guarantee
    else:
        self.logger.error('rate guarantee is empty %s', self._message_id)

    # --- additional and contact information ------------------------------
    addition_title = take_first(self._etree,
                                marriott_xp.ADDITION_INFO_TITLE)
    addition_link = strip_list(
        self._etree.xpath(marriott_xp.ADDITION_INFO_LINK))
    addition_text = strip_list(
        self._etree.xpath(marriott_xp.ADDITION_INFO_TEXT))
    # Same guard as above for a title that could not be found.
    if addition_text and addition_title and 'ADDITIONAL' in addition_title:
        result['additional_information'] = to_dict(addition_text,
                                                   addition_link)
    else:
        self.logger.error('additional information is empty %s',
                          self._message_id)

    contact_links = self._etree.xpath(marriott_xp.CONTACT_LINK)
    contact_texts = strip_list(self._etree.xpath(marriott_xp.CONTACT_TEXT))
    contact = strip_list(self._etree.xpath(marriott_xp.CONTACT_1))
    # Plain contact strings are stored with the text as both name and value.
    contact_1 = [{'name': i, 'value': i} for i in contact]
    contact = chain(to_dict(contact_texts, contact_links), contact_1)
    contact = filter_dict_value(contact)
    if contact:
        result['contact_information'] = contact

    return result
def test_replace_tags(self):
    """Tags vanish by default and become the token when one is passed."""
    no_token = replace_tags('This text contains <a>some tag</a>')
    self.assertEqual(no_token, 'This text contains some tag')

    # Bytes input combined with a single-space token.
    space_token = replace_tags(b'This text is very im<b>port</b>ant', ' ')
    self.assertEqual(space_token, 'This text is very im port ant')
def _cleanup(value):
    """Strip tags, replace entities and collapse whitespace in *value*."""
    without_tags = replace_tags(value)
    decoded = replace_entities(without_tags).strip()
    return " ".join(decoded.split())
def get_financial_blob(sel):
    """Return the financial-details wrapper markup with tags turned into newlines.

    The first ``<div id="financial-details-wrapper">`` match on *sel* is
    taken via ``.get()`` and every tag in it is replaced by ``"\\n"``.
    """
    wrapper_nodes = sel.xpath('//div[@id="financial-details-wrapper"]')
    return replace_tags(wrapper_nodes.get(), "\n")