def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title>, strip clutter,
    and save the single div class="body-text" story container.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the story container is not found
    exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines and trailing copyright marks.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    content = re.sub(re.compile(r"©$", flags=re.MULTILINE), " ", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    # Tag/class pairs that are navigation chrome, not story text.
    unneededText = (('div', 'tagContainer'), ('div', 'tags'),
                    ('div', 'moreStories'), ('ul', 'links'))
    for tagName, className in unneededText:
        for result in soup.findAll(tagName, {"class": className}):
            result.extract()
    results = soup.findAll("div", {"class": "body-text"})
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of div class="body-text" in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title>, remove unwanted
    sections by id, and save the single div id="gridMainColumn" container.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the container is not found exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    # Tag/id pairs that are site chrome, not story text.
    unneededText = (('div', 'MorebyThisAuthor'), ('div', 'RelatedStories'),
                    ('div', 'Comments'), ('div', 'ToolBarHorizontal'))
    for tagName, className in unneededText:
        for result in soup.findAll(tagName, {"id": className}):
            result.extract()
    results = soup.findAll('div', {"id": 'gridMainColumn'})
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of primary-content ids in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title>, strip social/comment
    widgets, and save the single div marked data-swiftype-name="body".

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the body container is not found
    exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    # This site sometimes breaks the title across a newline; \n? tolerates it.
    match = re.search(r"<title>(.*)\n?</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    # Tag/id pairs that are site chrome, not story text.
    unneededText = (
        ('div', 'articleSocialBar'),
        ('div', 'pluck'),
    )
    for tagName, className in unneededText:
        for result in soup.findAll(tagName, {"id": className}):
            result.extract()
    results = soup.findAll('div', {"data-swiftype-name": 'body'})
    if len(results) != 1:
        raise scraper.FeedException('Number of primary-content ids in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title> and save the single
    div class="main-container" story container.

    url -- the page URL (passed through to saveStory and the error message).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the container is not found exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    results = soup.findAll('div', {'class': 'main-container'})
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of story-body ids in HTML is not 1. Count = %d URL = %s' % (len(results), url))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title>, strip comment forms
    and related chrome, and save the single div class="post" container.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the container is not found exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines and bare carriage returns.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    content = re.sub(re.compile(r"\r", flags=re.MULTILINE), " ", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    # Tag/class pairs that are comment-form chrome, not story text.
    unneededClassText = (
        ('div', 'commentsform'),
        ('section', 'clearfix'),
        ('h5', 'add-comment'),
        ('p', 'comments-disclaimer'),
        ('a', 'edit_from_site'),
        ('h2', ''),
        ('link', ''),
    )
    for tagName, className in unneededClassText:
        for result in soup.findAll(tagName, {"class": className}):
            result.extract()
    # Tag/id pairs removed for the same reason.
    unneededIdText = (('span', 'topic'), )
    for tagName, idName in unneededIdText:
        for result in soup.findAll(tagName, {"id": idName}):
            result.extract()
    results = soup.findAll("div", {"class": "post"})
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of div class="body" in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title> and save the single
    div whose id starts with "single-post".

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the container is not found exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines and bare carriage returns.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    content = re.sub(re.compile(r"\r", flags=re.MULTILINE), " ", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    # The id carries a post number suffix, so match by prefix.
    results = soup.findAll(
        "div",
        {"id": lambda val: val is not None and val.startswith("single-post")})
    if len(results) != 1:
        raise scraper.FeedException('Number of div id="single-post-*" in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title> and save the single
    div id="WNContainerStory" container.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the container is not found exactly once.
    """
    content = content.strip()
    # Blank out whitespace-only lines.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    # Name the parser explicitly (as the sibling parsers do) so results do
    # not depend on which parser bs4 happens to pick on this machine.
    soup = BeautifulSoup(content, 'html.parser')
    results = soup.findAll('div', {"id": 'WNContainerStory'})
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of primary-content ids in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: pull the <title> and save the article
    body div (class="postContent" with itemprop="articleBody"), falling back
    to div id="bw-share" when no such div exists.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when neither lookup yields exactly one div.
    """
    content = content.strip()
    # Blank out whitespace-only lines.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    results = soup.findAll('div', {'class': 'postContent', 'itemprop': 'articleBody'})
    if not results:
        # Older pages lack the articleBody markup; fall back to the share div.
        results = soup.findAll('div', {'id': 'bw-share'})
    if len(results) != 1:
        raise scraper.FeedException('Number of primary-content ids in HTML is not 1. Count = %d' % len(results))
    self.saveStory(url, title, content, results[0])
def parseResponse(self, url, content):
    """Parse a fetched article page: convert spans to divs, collect every
    div class="article-body" fragment into one ASCII-only HTML string, take
    the headline from div class="fp-newshead", and save the story.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when no article-body div is present.
    """
    content = content.strip()
    # Blank out whitespace-only lines and trailing copyright marks.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    content = re.sub(re.compile(r"©$", flags=re.MULTILINE), " ", content)
    content = self.cleanScripts(content)
    # They don't know the difference between span and div.
    # Their spans need to be converted for the soup to work.
    content = re.sub("<span ", "<div ", content)
    content = re.sub("</span>", "</div>", content)
    soup = BeautifulSoup(content, 'html.parser')
    results = soup.findAll("div", {"class": "article-body"})
    if len(results) == 0:
        raise scraper.FeedException('Number of tables in HTML is not 1. Count = %d' % len(results))
    # Wrap each fragment in its own div so multiple fragments concatenate
    # into well-formed HTML.
    resultHTML = "".join("<div>%s</div>" % str(r) for r in results)
    # Drop non-ASCII characters from the saved body.
    resultHTML = "".join(i for i in resultHTML if ord(i) < 128)
    titleResults = soup.findAll("div", {"class": "fp-newshead"})
    if len(titleResults) != 1:
        title = "Missing"
    else:
        title = str(titleResults[0].contents[0])
    # Strip any residual markup from the headline (no-op for "Missing").
    title = re.sub(r'<[^>]{1,}>', '', title)
    self.saveStory(url, title, content, resultHTML)
def parseResponse(self, url, content):
    """Parse a fetched article page: save the single <article> element as
    the story body, with the <title> text as the headline.

    url -- the page URL (passed through to saveStory).
    content -- raw HTML of the page.
    Raises scraper.FeedException when the page does not contain exactly one
    <article> element.
    """
    content = content.strip()
    # Blank out whitespace-only lines and trailing copyright marks.
    content = re.sub(re.compile(r"^\s+$", flags=re.MULTILINE), "", content)
    content = re.sub(re.compile(r"©$", flags=re.MULTILINE), " ", content)
    content = self.cleanScripts(content)
    soup = BeautifulSoup(content, 'html.parser')
    results = soup.findAll('article')
    if len(results) != 1:
        raise scraper.FeedException(
            'Number of div class="body" in HTML is not 1. Count = %d' % len(results))
    match = re.search(r"<title>(.*)</title>", content)
    title = "Missing" if match is None else match.group(1)
    self.saveStory(url, title, content, results[0])