def process_item(self, item, spider):
    i = item['summary'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['summary'][0] = i
    i = item['job_title'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['job_title'][0] = i
    return item
def process_item(self, item, spider):
    i = item['summary'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['summary'][0] = i
    i = item['job_title'][0]
    i = remove_tags(i)
    i = replace_escape_chars(i)
    item['job_title'][0] = i
    print(item)
    return item
def parse_news(self, response):
    date = response.css(
        "div.content-time-published.margin .time-modified.margin::text"
    ).extract_first()
    title = response.css(
        "span#id-blasting-tv-masthead-video-title::text").extract_first()
    subtitle = response.css("h2.title-h2::text").extract_first()
    try:
        article = remove_tags_with_content(
            response.css("div.article-body.p402_premium.template-a").extract_first(),
            which_ones=('div', 'script'))
    except Exception:
        # Fall back to the alternative article container used on some pages.
        article = remove_tags_with_content(
            response.css("div#article-body-p1").extract_first(),
            which_ones=('div', 'a', 'script'))
    article = remove_tags(article)
    article = replace_escape_chars(article, which_ones=('\n',))
    article = re.sub(r'http\S+', '', article).strip()
    yield {
        'article': article,
        'subtitle': subtitle,
        'title': title,
        'date': date,
        'link': response.url,
        'website': 'blasting'
    }
def process_item(self, item, spider):
    body_only = Selector(text=item['body']).css('body').get()
    script_removed = remove_tags_with_content(body_only,
                                              which_ones=('style', 'script'))
    tags_replaced = replace_tags(script_removed, ' ')
    # replace_by must be passed by keyword; the second positional argument
    # of replace_escape_chars is which_ones.
    item['body'] = replace_escape_chars(tags_replaced, replace_by=' ')
    logging.info(f'Item cleaned up: {item["title"]}')
    return item
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = self.base_url if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def process_item(self, item, spider):
    if item['title'] and item['author'] and item['date'] and item['text'] and item['link']:
        if item['link'] not in self.urls_seen:
            # Keep only printable characters, drop escape chars, then strip
            # unwanted tags from the article text.
            item['text'] = remove_tags(
                remove_tags_with_content(
                    replace_escape_chars(
                        filter(lambda x: x in string.printable, item['text'][25:])),
                    which_ones=('div', 'img', 'script')))
            item['title'] = filter(lambda x: x in string.printable, item['title'])
            self.urls_seen.add(item['link'])
            return item
        else:
            raise DropItem('Duplicate item %s' % item)
    else:
        raise DropItem('Missing fields %s' % item)
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(
        response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def replace_escape(value):
    return replace_escape_chars(value, replace_by=u' ')
def load_sectionitem(self, page1_selector, page2_selector, term, is_open,
                     clss, section_index, term_index, course_index):
    print("******* Begin loading section {} *******".format(section_index))
    section_loader = ItemLoader(item=SectionItem(), selector=page2_selector)
    section_loader.add_xpath('sid', '//*[@id="SSR_CLS_DTL_WRK_CLASS_NBR"]')
    section_loader.add_xpath('days', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('mon', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('tue', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('wed', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('thu', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('fri', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('start', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('ending', '//*[@id="MTG_SCHED$0"]')
    section_loader.add_xpath('professor', '//*[@id="MTG_INSTR$0"]')
    section_loader.add_xpath('room', '//*[@id="MTG_LOC$0"]')
    section_loader.add_xpath('cap', '//*[@id="SSR_CLS_DTL_WRK_ENRL_CAP"]')
    # Can have individual and combined capacities.
    section_loader.add_xpath('enrolled', '//*[@id="SSR_CLS_DTL_WRK_ENRL_TOT"]')
    section_loader.add_xpath('wcap', '//*[@id="SSR_CLS_DTL_WRK_WAIT_CAP"]')
    section_loader.add_xpath('wenrolled', '//*[@id="SSR_CLS_DTL_WRK_WAIT_TOT"]')
    section_loader.add_value('term', term)
    section_loader.selector = page1_selector
    section_loader.add_value('open', is_open)
    if page1_selector.css("[id^='DERIVED_CLSRCH_DESCR200$" + str(course_index)
                          + "']").extract_first() is not None:
        words = replace_escape_chars(
            remove_tags(
                page1_selector.css("[id^='DERIVED_CLSRCH_DESCR200$" +
                                   str(course_index) + "']").extract_first())).split()
        title = ''
        for word in words[2:]:
            title = title + word + ' '
        number = words[1]
        dept = Department.objects.get(code=words[0])
        input_str = replace_escape_chars(
            remove_tags(
                page2_selector.css(
                    "[id='PSXLATITEM_XLATLONGNAME']").extract_first()))
        session = ''
        session_dict = {
            'University': 'un',
            'University Eligible/CPE': 'uc',
            'University Non-standard Dates': 'ud',
            'CPE (Continuing Education)': 'ce',
            'CPE Non-standard Dates': 'cu',
            'CPE Summer Session 1': 'c1',
            'CPE Summer Session 2': 'c2',
            'CPE Summer Session 3': 'c3',
        }
        if '*' in input_str:
            session = session_dict[input_str[1:]]
        else:
            session = session_dict[input_str]
        section_loader.add_value(
            'clss',
            Course.objects.filter(title=title, session=session,
                                  dept=dept).get(number=number))
    section_loader.add_xpath(
        'component',
        '//*[@id="DERIVED_CLSRCH_SSR_CLASSNAME_LONG$' + str(section_index) + '"]')
    return section_loader.load_item()
def default_proc(input):
    input = remove_tags(input)
    input = replace_escape_chars(input)
    return input
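Most of the snippets above run scraped markup through the same two-step cleanup: strip tags, then strip escape characters. Below is a minimal sketch of that pattern, assuming both helpers come from w3lib.html (Scrapy re-exports them from there); the sample string is only for illustration.

from w3lib.html import remove_tags, replace_escape_chars

raw = '<p>Hello,\n\tworld</p>'
# remove_tags() strips the markup; replace_escape_chars() then removes
# '\n', '\t' and '\r' by default (pass replace_by=' ' to keep word breaks).
cleaned = replace_escape_chars(remove_tags(raw))
assert cleaned == 'Hello,world'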
def test_replace_escape_chars(self):
    # make sure it always returns unicode
    assert isinstance(replace_escape_chars('no ec'), unicode)
    assert isinstance(replace_escape_chars('no ec', replace_by='str'), unicode)
    assert isinstance(
        replace_escape_chars('no ec', which_ones=('\n', '\t')), unicode)

    # text without escape chars
    self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
    self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')

    # text with escape chars
    self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
    self.assertEqual(
        replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
    self.assertEqual(
        replace_escape_chars(u'escape\tchars\n', which_ones=('\t',)),
        u'escapechars\n')
    self.assertEqual(
        replace_escape_chars(u'escape\tchars\n', replace_by=' '),
        u'escape chars ')
    self.assertEqual(
        replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'),
        u'escape\xa3chars\xa3')
    self.assertEqual(
        replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'),
        u'escape\xa3chars\xa3')