def test_with_tags(self):
    # text with tags
    self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
    self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')
    self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                     u'<b>not will removed</b>')

def parse_modal_infor(self, response):
    res = response.text
    result = response.meta.get("item")
    res_convert = json.loads(res)
    res_convert = res_convert['spec']
    # Drop the <li><label>...</label></li> header rows (labels made of letters,
    # Vietnamese diacritics, whitespace and simple punctuation) from the spec HTML
    res_convert_del_header = re.sub(
        r'<li><label>([a-zA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳýỵỷỹ\s\.&,.-])+<\/label><\/li>',
        "", res_convert)

    # Keys: drop the value <div>s, replace the remaining tags with '|' markers
    # and split on them
    res_key = remove_tags_with_content(res_convert_del_header, which_ones=('div',))
    res_key_replace_tags = replace_tags(res_key, '|', 'utf-8')
    res_key_array = list()
    res_key_gen = (value for value in res_key_replace_tags.split("||||"))
    for val in res_key_gen:
        res_key_array.append(val.replace("||", ""))

    # Values: drop the key <span>s, strip <a>/<li> tags, then split the same way
    res_val = remove_tags_with_content(res_convert_del_header, which_ones=('span',))
    res_val_remove_tags = remove_tags(res_val, which_ones=('a', 'li',))
    res_val = replace_tags(res_val_remove_tags, '|', 'utf-8')
    res_val = res_val.split("||")
    res_val_array = list()
    res_val_gen = (val for val in res_val)
    for val in res_val_gen:
        res_val_array.append(val.replace("|", ""))

    res_modal = dict(zip(res_key_array, res_val_array))
    result['data'] = res_modal
    # spec = Selector(text=json.loads(response.text)['spec'])
    # spec_values_container = list(filter(lambda x: len(x.xpath('./@class')) != 0, spec.css('li')))
    # spec_values_dict_keys = [
    #     x.xpath('./span/text()').get() if x.xpath('./span/text()').get().split()
    #     else x.xpath('./span/div/text()').get()
    #     for x in spec_values_container
    # ]
    # spec_values_dict_values = [remove_tags(x.xpath('./div').get()) for x in spec_values_container]
    yield result

def test_without_tags(self):
    # text without tags
    self.assertEqual(remove_tags_with_content('no tags'), 'no tags')
    self.assertEqual(
        remove_tags_with_content('no tags', which_ones=('p', 'b',)),
        'no tags')

def test_without_tags(self):
    # text without tags
    self.assertEqual(remove_tags_with_content("no tags"), "no tags")
    self.assertEqual(
        remove_tags_with_content(
            "no tags",
            which_ones=(
                "p",
                "b",
            ),
        ),
        "no tags",
    )

def test_with_tags(self):
    # text with tags
    self.assertEqual(
        remove_tags_with_content("<p>one p tag</p>"), "<p>one p tag</p>")
    self.assertEqual(
        remove_tags_with_content("<p>one p tag</p>", which_ones=("p",)), "")
    self.assertEqual(
        remove_tags_with_content(
            "<b>not will removed</b><i>i will removed</i>", which_ones=("i",)),
        "<b>not will removed</b>",
    )

def process_item(self, item, spider):
    """
    Process content based on its type.
    """
    content_type = item.get('content_type', 'UNKNOWN')
    log = structlog.get_logger().bind(
        event='PROCESS_ITEM',
        content_type=content_type,
        source_url=item['source_url'])

    if content_type == 'HTML':
        plain_content = html.replace_escape_chars(
            html.remove_tags(
                html.remove_tags_with_content(
                    item['content'],
                    which_ones=('script',)
                )
            ),
            which_ones=('\n', '\t', '\r', ' '),
            replace_by='')
        item['content'] = plain_content
        log.info(message='HTML content extracted')
    # @TODO
    elif content_type in ['PDF', 'MS_WORD', 'LIBREOFFICE', 'POWERPOINT', 'CSV', 'XLSX', 'XLS']:
        log.info(
            event='QUEUE_CONTENT',
            message='Pushing content for deferred processing')
    elif content_type in [None, 'UNKNOWN']:
        log.warn(error='UNRECOGNIZED_CONTENT_TYPE')

    return item

def test_tags_with_shared_prefix(self):
    # https://github.com/scrapy/w3lib/issues/114
    self.assertEqual(
        remove_tags_with_content("<span></span><s></s>", which_ones=("s",)),
        "<span></span>",
    )

def process(self, data, url_object):
    """Process HTML data.

    Replaces entities and removes tags (except comments) before
    processing with TextProcessor.
    """
    logging.info("Process HTML %s" % url_object.url)
    try:
        encoding, data = get_codec_and_string(data)
        # Remove style tags to avoid false positives from inline styles
        data = remove_tags_with_content(data, which_ones=('style',))
    except UnicodeDecodeError as ude:
        logging.error('UnicodeDecodeError in handle_error_method: {}'.format(ude))
        logging.error('Error happened for file: {}'.format(url_object.url))
        return False

    # Convert HTML entities to their unicode representation
    entity_replaced_html = replace_entities(data)
    # Collapse whitespace (including newlines), since extra whitespace is
    # not significant in HTML (except inside comment tags)
    collapsed_html = _whitespace_re.sub(' ', entity_replaced_html)
    # Replace tags with <> character to make sure text processor
    # doesn't match across tag boundaries.
    replace_tags_text = _html_tag_re.sub('<>', collapsed_html)
    return self.text_processor.process(replace_tags_text, url_object)

def parse_item(self, response):
    sel = Selector(response)
    filename = "test/english/T" + str(self.count) + ".txt"
    tags_removed_text = remove_tags(
        remove_tags_with_content(
            sel.xpath('//*[@id="ja-content"]').extract()[0],
            which_ones=('script',)))
    tabs_removed_text = tags_removed_text.replace("\t", '').replace('\r', '')
    newLineRemovedText = re.sub(r'(\s*\n\s*)\1*', '\n', tabs_removed_text)
    # with open(filename, "w") as out_file:
    #     for item in newLineRemovedText.split('\n'):
    #         out_file.write("%s\n" % item.strip())
    #
    # followed_urls = "followedTestURLs/englishUrls.txt"
    # with open(followed_urls, "a") as out_file:
    #     out_file.write("%s\n" % response.request.url)
    # docum = OrgItem()
    # docum['text'] = newLineRemovedText
    # docum['link'] = response.request.url

    # writing data into file
    self.writer.writerow({
        'text': newLineRemovedText,
        'link': response.request.url
    })
    self.count += 1
    print(self.count)

def handle_blog(self, response):
    hxs = HtmlXPathSelector(response)
    item = BuzzCrawlerItem()
    item['url'] = response.url
    item['date'] = dateutil.parser.parse(
        hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0])
    item['title'] = hxs.xpath(
        ".//h1[@id='headline']/text()").extract()[0].strip()
    item['blurb'] = ""
    unprocessed_content = hxs.xpath(
        ".//span[@itemprop='articleBody']").extract()[0]
    sane_html = remove_tags_with_content(unprocessed_content,
                                         ("noscript", "div", "h6"))
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    processed_content = h.handle(sane_html)
    if "noscript" in unprocessed_content:
        print sane_html.encode("iso-8859-15", "replace")
        print "*" * 98
    item['content'] = markdown(processed_content)
    item['source'] = 'wired.com'
    yield item

def parse_condition_page(self, response):
    """ Verify that 1 item is returned and 3 additional requests are made
    (one for each one of the tab items). Also verify that the expected
    fields are contained in the returned item.

    @url http://www.nhs.uk/conditions/Food-poisoning/Pages/Introduction.aspx
    @returns requests 3 3
    @returns items 1
    @scrapes main_content url title
    """
    main_content = response.css('.healthaz-content').extract_first()
    main_content_text = remove_tags(
        remove_tags_with_content(main_content, ('script', 'noscript')))

    yield {
        "main_content": main_content_text,
        "url": response.url,
        "title": response.css('.healthaz-header h1::text').extract_first()
    }

    for tab_href in response.css(
            '#ctl00_PlaceHolderMain_articles a::attr(href)'):
        yield response.follow(tab_href, self.parse_condition_page)

def parse_with_term(self, response, term, newspaper):
    # clean response from scripts
    response_content = remove_tags_with_content(response.text, ('script', 'a',))
    selector = Selector(text=response_content)
    term_query = '//body//*[contains(text(), "%s")]/text()' % self.term
    term_nodes = selector.select(term_query).extract()
    if not term_nodes:
        return

    item = {
        'url': response.url,
        'newspaper': newspaper,
        'term': term,
        'response_content': response.text,
        'timestamp': time.time()
    }

    related_terms = self.get_related_terms(term_nodes)
    if term in related_terms:
        related_terms.pop(term)
    item['related_terms'] = dict(related_terms)

    # with open(self.term, 'a') as content_file:
    #     content_file.write("%s\n" % item)

    cb_client.insert(str(uuid.uuid4()), item)

    # update scraper summary
    self.update_scraper_summary(item)

    return item

def parse_details_page_right_pane_kv(self, selector):
    for item in selector.css('div > div'):
        key = item.css('span ::text').re_first(r'(.+):\Z').strip()
        value = Selector(
            text=remove_tags_with_content(item.get(), which_ones=['span']),
            type='html').css('html > body > div')
        yield (key, value)

def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
    assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
    assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
    assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)

def parse(self, response):
    """ Parse the response page """
    # Skip error URLs
    if response.status != 200:
        return

    data = json.loads(response.text)

    title = data['title']
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')

    description = data['description']
    data = data['content']

    # Remove <script>, <sup>, <math>, <style> tags with the content
    paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
    # Remove the rest of the tags without removing the content
    paragraph = remove_tags(paragraph)

    # Replace &amp; with &
    paragraph = paragraph.replace('&amp;', '&')
    # Replace &#39; and curly quotes with '
    paragraph = paragraph.replace('&#39;', "'")
    paragraph = paragraph.replace('’', "'")
    paragraph = paragraph.replace('“', "'")
    paragraph = paragraph.replace('”', "'")
    # Replace any remaining HTML entities with a space
    paragraph = re.sub("&.....;", ' ', paragraph)
    paragraph = re.sub("&....;", ' ', paragraph)

    # Replace 'U.S.' with 'US':
    paragraph = paragraph.replace('U.S.', 'US')

    # Some more replacements to improve the default tokenization
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\t', '')

    text = title + '\n\n' + description + '\n\n' + paragraph

    # Create the directory
    dirname = 'data/qplum'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)

    # Save the title and the text both
    filename = '{}/{}'.format(dirname, title)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def process_response(self, request, response, spider):
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    terms = tokenize(body.lower())
    request.meta['terms'] = terms
    request.meta['body'] = body
    return response

def clean_tags_from_affiliations(value):
    """Clean the affiliation string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value

class ScienceDailyArticleLoader(ArticleLoader):
    content_in = Compose(
        Join('\n\n'),
        lambda x: remove_tags_with_content(x, ('div',)),  # there are <div>s for advertisements
        ArticleLoader.default_input_processor,
    )
    date_out = Compose(
        TakeFirst(),
        lambda date_str: datetime.strptime(date_str, "%B %d, %Y"),
    )

def parse(self, response):
    """Parse the response.

    * Scrapes last updated, description, title and licence information from dataset links
    * Skips scraping and follows all links on page links
    * Yields dictionary containing page and response info to pipelines.py
    """
    if self.stats:
        self.output_stats(response)

    depth = response.meta['depth']
    page = {
        'url': response.url,
        'updated': '',
        'license': '',
        'title': '',
        'summary': ''
    }

    # Handle bad urls
    if response.status in self.handle_httpstatus_list:
        bad_url = self.process_bad_url(response)
        yield bad_url if bad_url is not None else {
            'page': page,
            'response': response
        }
        return

    if not response.meta['pagelink']:
        # Remove all javascript and style content from html body
        response_plain = scrapy.Selector(
            text=remove_tags_with_content(response.text, ('script', 'style',)))
        page['updated'] = self.update.search_pattern(response_plain)
        page['license'] = self.licence.search_pattern(response_plain)
        page['summary'] = self.description.search_pattern(response_plain)
        page['title'] = self.title.search_pattern(response_plain)
    elif depth < self.maxdepth:
        # Get all links on the page
        links = self.extractor.extract_links(response)
        for link in links:
            yield self.new_request(url=link.url,
                                   depth=depth + 1,
                                   retry=False,
                                   pagelink=self.is_pagelink(link.url),
                                   country=response.meta['country'],
                                   territory=response.meta['territory'],
                                   retries=0)

    # Print crawled information to file or upload
    yield {'page': page, 'response': response}

def test_remove_tags_with_content(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags_with_content('no tags'), unicode)
    assert isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode)
    assert isinstance(remove_tags_with_content('<p>one tag</p>', which_ones=('p',)), unicode)
    assert isinstance(remove_tags_with_content('<a>link</a>', which_ones=('b',)), unicode)

    # text without tags
    self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
    self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')

    # text with tags
    self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
    self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')
    self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                     u'<b>not will removed</b>')

    # text with empty tags
    self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')

def test_remove_tags_with_content(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags_with_content("no tags"), unicode)
    assert isinstance(remove_tags_with_content("no tags", which_ones=("p",)), unicode)
    assert isinstance(remove_tags_with_content("<p>one tag</p>", which_ones=("p",)), unicode)
    assert isinstance(remove_tags_with_content("<a>link</a>", which_ones=("b",)), unicode)

    # text without tags
    self.assertEqual(remove_tags_with_content(u"no tags"), u"no tags")
    self.assertEqual(remove_tags_with_content(u"no tags", which_ones=("p", "b")), u"no tags")

    # text with tags
    self.assertEqual(remove_tags_with_content(u"<p>one p tag</p>"), u"<p>one p tag</p>")
    self.assertEqual(remove_tags_with_content(u"<p>one p tag</p>", which_ones=("p",)), u"")
    self.assertEqual(
        remove_tags_with_content(u"<b>not will removed</b><i>i will removed</i>", which_ones=("i",)),
        u"<b>not will removed</b>",
    )

    # text with empty tags
    self.assertEqual(remove_tags_with_content(u"<br/>a<br />", which_ones=("br",)), u"a")

def process_item(self, item, spider):
    ps = [
        remove_tags(remove_tags_with_content(p, ('script',))).strip().replace(u'\xa0', u' ')
        for p in item['text']
    ]
    item['text'] = '\n'.join(ps)

    # additional stripping for description
    if item['description']:
        item['description'] = item['description'].strip()

    return item

def ins_one(item, verdict):
    """
    Update the detail-page staging table.
    :param source: source
    :param c_time: date
    :param content: body text
    :param md: list of attachment MD5 values
    :param title: title
    :return:
    """
    source = item[0]
    c_time = item[1]
    content = item[2]
    md = item[3]
    title = item[4]
    url = item[5]
    cid = item[6]
    a = time.time()
    try:
        soup = BeautifulSoup(content, 'html.parser')
        content = soup.prettify()
    except:
        content = item[2]
    try:
        content = html.remove_tags(
            content,
            which_ones=('font', 'frame', 'iframe', 'span', 'o:p', 'form', 'link'))
        content = html.remove_tags_with_content(content, which_ones=('script', 'form'))
    except:
        content = item[2]
    if cid in verdict:
        try:
            img_md = fir_img_ins(content)
        except:
            img_md = None
    else:
        img_md = None
    try:
        conn, cursor = create_conn()
        sql = " UPDATE tbl_article_info_temp SET release_at=%s,source=%s,content=%s,attachment_id=%s,type=0,title=%s,status=1,img_md5=%s WHERE url=%s"
        cursor.execute(
            sql, (c_time, source, content, ",".join(md), title, img_md, url))
        conn.commit()
        print(" Detail table updated successfully " + str(title) + str(c_time) + " " + str(time.time() - a))
        close_conn(conn, cursor)
    except Exception as err:
        print(str(err) + " marker 3")

def crawl():
    global visited_urls
    global visited_hashes
    global queue
    pages_crawled, words_found = 0, 0

    # Set depth limit - may want to crawl less than 3 levels
    global depth_limit
    depth = request.args.get('depth')
    if depth:
        depth_limit = int(depth)

    start_url = request.args.get('url')
    if start_url:
        queue.append((start_url, 1))

    while queue:
        url, depth = queue.popleft()
        if url not in visited_urls:
            visited_urls.add(url)
            try:
                response = requests.get(url, timeout=5)
            except Exception:
                print('Exception')
                response = None
            if (response is not None
                    and response.status_code == requests.codes.ok
                    and 'text/html' in response.headers['Content-Type']):
                # Need to create and check hash since multiple urls could have identical content
                page_content = BeautifulSoup(
                    remove_tags_with_content(response.content, ('script', 'style')),
                    'html.parser')
                hash = hashlib.md5(page_content.get_text().encode()).hexdigest()
                if hash not in visited_hashes:
                    visited_hashes.add(hash)
                    if page_content:
                        pages_crawled += 1
                        words_on_page = create_index(page_content, url)
                        words_found += words_on_page
                        add_pages_to_queue(page_content, url, depth)

    return jsonify({
        'pages_crawled': pages_crawled,
        'words_found': words_found,
    })

def clean_content(self, text):
    """
    Return the text with tags, entities, escape characters, markup
    quoting and extra whitespace removed.
    """
    temp = remove_tags_with_content(text, which_ones=('style', 'script', 'figcaption'))
    temp = remove_tags(temp)
    temp = remove_entities(temp)
    temp = replace_escape_chars(temp)
    temp = unquote_markup(temp)
    temp = " ".join(temp.split())
    return temp

def parse_news(self, response):
    input = ''.join(
        response.xpath('//*[@id="story_body_content"]/span/p').extract())
    content = remove_tags(
        remove_tags_with_content(input, ('div', 'figure')), keep=('p',))

    item = ArticleItem()
    item['a_title'] = response.css('h1.story_art_title::text').get()
    item['a_datetime'] = response.css(
        'div.shareBar__info--author span::text').get()
    item['a_source'] = response.css(
        'div.shareBar__info--author::text').get()
    item['a_content'] = content
    yield item

def _parse_topic_response(self, response):
    """
    Parses various topics
    e.g. www.investopedia.com/terms/o/oddlottheory.asp
    """
    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')

    # Get the first div with class content
    content = response.css('div.content')
    if isinstance(content, list) and len(content) > 0:
        content = content[0]
    else:
        content = response.css('div.roth__content')[0]

    text = title + '\n\n'
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove tags including <p> and <a>
        paragraph = remove_tags(
            remove_tags_with_content(paragraph, ('script',))).strip()
        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Add to the file
        text += paragraph + '\n'

    # Create the directory
    dirname = 'data/investopedia'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)

    # Save the text
    name = response.url.split('/')[-1]
    filename = '{}/{}'.format(dirname, name)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags_with_content(b"no tags"), str)
    assert isinstance(remove_tags_with_content(b"no tags", which_ones=("p",)), str)
    assert isinstance(remove_tags_with_content(b"<p>one tag</p>", which_ones=("p",)), str)
    assert isinstance(remove_tags_with_content(b"<a>link</a>", which_ones=("b",)), str)
    assert isinstance(remove_tags_with_content("no tags"), str)
    assert isinstance(remove_tags_with_content("no tags", which_ones=("p",)), str)
    assert isinstance(remove_tags_with_content("<p>one tag</p>", which_ones=("p",)), str)
    assert isinstance(remove_tags_with_content("<a>link</a>", which_ones=("b",)), str)

def parse(self, response):
    # remove <script> tags from <p> elements
    for text in response.css('p'):
        yield {
            'text': remove_tags(
                remove_tags_with_content(text.extract(), ('script',)))
        }

    # add new URLs that are descendants of the request URL (same domain)
    for next_page in response.css('div > a'):
        a_tag = next_page.extract()
        if "href=" in a_tag:
            link = (a_tag.split('href="')[1]).split('"')[0]
            if link.count("/") > 2:
                if link.split("/")[2] in response.request.url:
                    yield response.follow(next_page, self.parse)

def parse_item(self, response):
    item = ScifibotItem()
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    tokens = tokenize(body.lower())
    # decide if the page is interesting
    if not is_relevant(tokens):
        stats.inc_value('scifi/filtered_out')
        # probably not scifi page
        return
    item['keywords'] = tokens
    item['page'] = orig_body
    item['url'] = response.url
    return item

def parse_topic_response(self, response):
    """ Parse the content """

    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')

    content = response.css('div#mw-content-text')

    # Just extract all the '<p></p>' children from this
    text = title + '\n\n'
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove <script>, <sup>, <math> tags with the content
        paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)
        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Add to the file
        text += paragraph + '\n'

    # Create the directory
    dirname = 'data/wikipedia'
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    elif not os.path.isdir(dirname):
        os.remove(dirname)
        os.makedirs(dirname, exist_ok=True)

    # Save the text
    name = response.url.split('/')[-1]
    filename = '{}/{}'.format(dirname, name)
    f = open(filename, 'w')
    f.write(text)
    f.close()

def parse_topic_response(self, response):
    """ Parse the content """

    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')

    content = response.css('div#mw-content-text')

    # Just extract all the '<p></p>' children from this
    text = ''
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove <script>, <sup>, <math> tags with the content
        paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)
        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')
        # Some more replacements to improve the default tokenization
        for c in '();.,[]"\'-:/%$+@?':
            paragraph = paragraph.replace(c, ' {} '.format(c))
        # Add to the file
        text += paragraph.lower() + '\n'

    filename = 'wiki_data.txt'
    f = open(filename, 'a')
    f.write(text)
    f.close()

def download(self, url: AnyStr) -> Union[AnyStr, None]:
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    try:
        response = requests.get(url=url, headers=headers)
        response.raise_for_status()
    except Exception:
        return
    return remove_tags_with_content(text=response.text, which_ones=("script",))

def _parse_topic_response(self, response):
    """
    Parses various topics
    e.g. www.investopedia.com/terms/o/oddlottheory.asp
    """
    # Get the title first
    title = response.css('title::text').extract_first()
    # Replace / with a space - creates issues with writing to file
    title = title.replace('/', ' ')

    # Get the first div with id Content
    content = response.css('div#Content')[0]
    content = content.css('div.content-box')

    text = ''
    for child in content.xpath('//p'):
        # Get the text from this child <p></p> tag
        paragraph = child.extract()
        # Remove tags including <p> and <a>
        paragraph = remove_tags(
            remove_tags_with_content(paragraph, ('script',))).strip()
        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')
        # Some more replacements to improve the default tokenization
        for c in '();.,[]"\'-:/%$+@?':
            paragraph = paragraph.replace(c, ' {} '.format(c))
        # Add to the file
        text += paragraph.lower() + '\n'

    # Save the title and the text both
    filename = 'investopedia_data.txt'
    f = open(filename, 'a')
    f.write(text)
    f.close()

def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None

def test_tags_with_shared_prefix(self):
    # https://github.com/scrapy/w3lib/issues/114
    self.assertEqual(
        remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)),
        u'<span></span>')

def test_without_tags(self):
    # text without tags
    self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
    self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')

def test_empty_tags(self):
    # text with empty tags
    self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')