class Article:

    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')
        title = self.article.title()
        ascii_title = unidecode(title) if type(title) == unicode else title
        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        tmppath = tempfile.mkdtemp()
        images = self.root.xpath('//img')
        for img in images:
            imgsrc = img.get('src')
            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)
            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)
            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)
            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                return
            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue
            with open(dest, 'wb') as f:
                f.write(res.content)
            img.set('src', dest)

    @property
    def title(self):
        return self.article.title()

    @property
    def html(self):
        return etree.tostring(self.root)
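# A minimal usage sketch for the Article class above (an assumption, not part of
# the original source). It presumes the snippet's own imports are in scope:
# requests, readability.Document, lxml's etree, unidecode, tempfile, os, urlparse.
if __name__ == '__main__':
    article = Article('https://example.com/some-post')   # hypothetical URL
    print(article.title)                                  # cleaned title from readability
    with open('article.html', 'wb') as f:
        f.write(article.html)                             # etree.tostring() returns bytes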
def main():
    novels = {
        'cbi': 'https://boxnovel.com/novel/castle-of-black-iron/chapter-',
        'sgg': 'https://boxnovel.com/novel/super-gene/chapter-',
        'sas': 'https://boxnovel.com/novel/strongest-abandoned-son/chapter-',
        'atg': 'https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-'
    }
    total = []
    if len(sys.argv) < 4:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[2]) + 1
    else:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[3]) + 1
    url = novels[sys.argv[1]]
    for i in range(inicio, fim):
        response = getPage(url + str(i))
        doc = Document(response.text)
        fileName = re.sub(r'[^a-zA-Z0-9]+', ' ', doc.title())
        total.append(doc.summary())
        print(i)
    f = open(fileName + str(fim - 1) + '.html', 'w')
    for i in total:
        f.write(i)
    f.close()
def run(index):
    print "Index %d" % index
    dirname = "data/%04d" % index

    # url of english article
    url = open(dirname + "/url_en.txt").read()

    # download html
    html = urllib.urlopen(url).read().decode('latin-1')

    # apply readability
    document = Document(html)
    article = document.summary()
    article = nltk.clean_html(article)

    # replace latin characters
    article = re.sub(u' ', u'\n', article)
    article = re.sub(u'\x92', u'`', article)
    article = re.sub(u'\x96', u'-', article)

    # article_en.txt
    output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
    output.write(article)
    output.close()

    # title.txt
    output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
    output.write(document.title())
    output.close()
def download_via_url(url):
    response = requests.get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    soup = BeautifulSoup(summary, "html.parser")
    return title, soup.text
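# A minimal usage sketch for download_via_url() (an assumption, not part of the
# original source); it presumes requests, readability.Document and
# bs4.BeautifulSoup are imported as in the snippet above.
if __name__ == '__main__':
    page_title, plain_text = download_via_url('https://example.com/article')  # hypothetical URL
    print(page_title)
    print(plain_text[:200])  # first 200 characters of the readability-extracted text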
def parse(self, response):
    doc = Document(response.text)
    yield {
        'full_title': doc.title(),
        # 'date': response.selector.xpath('//time/@datetime').getall()
        # 'date': response.xpath('//span[@class="post-date"]/text()').get()
        'date': '2009'
    }
def extract(self, html):
    # https://github.com/buriy/python-readability/blob/master/readability/readability.py
    doc = Document(html)
    self.__title = doc.title()
    self.__html = doc.summary()
    self.__md = html2text.html2text(self.__html)
    self.__text = self.__format_to_text(self.__html)
    return self.__text
def _getResponseText(self, response):
    '''
    (response) -> Text

    Returns text within the body of an HttpResponse object.
    '''
    readability = Document(response.body)
    content = readability.title() + readability.summary()
    return content
def process_html(html):
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
def crawl_url(url):
    html = requests.get(url)
    doc = Document(html.content)
    content = doc.summary().encode('utf-8')
    title = doc.title().encode('utf-8')
    return {
        'content': content,
        'title': title
    }
def get_article_from_item(self, item):
    url = item['link']
    logging.debug(url)
    author = 'n/a'
    if item.has_key('author'):
        author = item.author
    html = urllib.urlopen(url).read()
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
def extract_article(self):
    """Returns only readable content

    Returns:
        data - {
            'title': 'Title of the article',
            'content': 'HTML body of the article'
        }
    """
    doc = Document(self._html)
    return {'title': doc.title(), 'content': doc.summary()}
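# A minimal usage sketch for extract_article() above (an assumption, not part of
# the original source): whatever class owns the method only needs `self._html`
# to hold raw HTML, so a throwaway holder object stands in for it here.
import requests
from types import SimpleNamespace

resp = requests.get('https://example.com/article')   # hypothetical URL
holder = SimpleNamespace(_html=resp.text)             # stand-in for the real object
data = extract_article(holder)                        # call the function directly with the holder
print(data['title'])                                  # plain-text title
print(data['content'][:200])                          # start of the cleaned HTML body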
def get_article(d):
    url = d['url']
    if table.find_one(url=url):
        return
    print "fetching stuff for %s" % url
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception, e:
        print e
def handle(self, url, content):
    # Fix of issue27
    # content = re.sub('href="(.*?)"', '', content);
    doc = Document(content)
    try:
        hp = HParser(doc.summary())
        text = doc.title() + '\n' + hp.tag_list[0].rtext().replace('==+NL+==', '\n')
        text = '\n'.join(list(map(lambda l: l.strip(), text.split('\n'))))
        text = re.sub('\n{3,}', '\n\n', text).strip()
        return text
    except:
        self.logger.exception('Fail to parse the summary from readability!')
        raise
def get_main_text(self):
    doc = Document(
        self._page.content,
        positive_keywords=re.compile(
            'event-description__text|event-heading__title|event-heading__argument', re.I))
    title = doc.title()
    summary = doc.summary(html_partial=True)
    self.summary_bs = BeautifulSoup(summary, 'html.parser')
    strings = []
    for div in self.summary_bs.find_all(['div', 'span', 'body']):
        strings.extend([string for string in div.stripped_strings
                        if string != "" and re.search(r'[<>{}=\[\]\|]', string) is None])
    text = "\n".join(strings)
    preprocessed_text = TextUtils.handle(text)
    return '{}\n{}'.format(' '.join(TextUtils.handle(title)), ' '.join(preprocessed_text))
def preprocess_doc(html_text):
    """
    Preprocessing of an html text as a String is done here. Tags that are advertisement and that do
    not describe the content are removed at first. The encoding is detected and next the html is
    parsed and preprocessed using the readability-lxml Document class to clean the content (text and
    images embedded in the text). An HTML string is returned together with the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title if needed by the callee.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html), negative_keywords=negative_classes, positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8 as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    except (TypeError, IndexError) as e:
        logger("Encountered {} setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding
    head = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1' \
           '-transitional.dtd">\n' + '<head>\n' + \
           '<meta http-equiv="Content-Type" content="text/html" ' \
           'charset="' + encoding + '">\n' + '</head>\n' + '<body>\n' \
           + '<h1>' + title.split(sep='|')[0] + '</h1>'
    # Unparsable Type Error in encoding, where's the problem.
    text = head + doc.summary()[12:]
    # sometimes some tags get messed up and need to be translated back
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, Length of text is {}'.format(
            type(text), len(text)))
    return text, title
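# A minimal usage sketch for preprocess_doc() (an assumption, not part of the
# original source). The function relies on module-level names -- negative_tags,
# negative_classes, positive_classes and logger -- so plausible placeholder
# values are defined here purely for illustration.
import requests

negative_tags = ['ad', 'advert', 'banner', 'sidebar']   # hypothetical class fragments to strip
negative_classes = 'ad,advert,banner,sidebar'            # forwarded to Document(negative_keywords=...)
positive_classes = 'article,content,post'                # forwarded to Document(positive_keywords=...)
logger = print                                           # stand-in for the project's logger

resp = requests.get('https://example.com/article')       # hypothetical URL
cleaned_html, page_title = preprocess_doc(resp.text)
print(page_title)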
def retrieve_article_content(article):
    article.last_fetch = timezone.now()
    try:
        response = requests.get(article.link)
        if response.ok:
            doc = Document(response.content)
            article.content = doc.summary()
            article.title = doc.title()
            article.status = cst.READY_STATUS
        else:
            article.status = cst.ERROR_STATUS
        article.save()
    except Exception as e:
        logger.error(e)
        article.status = cst.ERROR_STATUS
        article.save()
def make_readable(url):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    document = Document(html)
    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }
    return document_dict
def get_url():
    start_url = "http://www.ediliziaeterritorio.ilsole24ore.com/"
    sess.get(start_url, headers=headers)
    page = 1
    keyword = 'Luigi Di Maio'
    url = "http://www.ricerca24.ilsole24ore.com/s24service?profilo=r24_service&search_query_id=fullquery&max_docs=1000&highlight=true&keywords_operator=AND&search_parameters=&order_by=2&page_number={}&page_size=10&v=2009&mt=text%2Fhtml%3B%20charset%3Diso-8859-1&cog_extra=true&xsl_id=html_all&keywords={}".format(
        page, keyword)
    response = sess.get(url, headers=headers)
    txt = response.text
    html = etree.HTML(txt)
    lis = html.xpath('//ul[@class="list list-results"]/li[@class="i"]')
    for li in lis:
        news_url = li.xpath('./article//h3/a/@href')[0]
        try:
            date = re.search(r'\d+-\d+-\d+', news_url).group()
        except:
            d = re.search(r'\d+/\d+/\d+', news_url).group()
            date = d.replace('/', '-')
        timeArray = time.strptime(date, "%Y-%m-%d")
        timestamp = int(time.mktime(timeArray))
        stringDate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
        response = sess.get(news_url, headers=headers)
        txt = response.text
        try:
            num = re.findall(r'\d+ Commenti', txt)
            print num
            num = num[0].split(' ')[0]
        except Exception as error:
            print error
            num = 0
        readable_article = Document(txt)
        title = readable_article.title()
        html = etree.HTML(readable_article.summary())
        context = ''.join(html.xpath('//p//text()')).replace('\r', '').replace(
            '\n', '').replace('\t', '')
        item = {}
        item['time'] = stringDate
        item['timestamp'] = timestamp
        item['title'] = title
        item['context'] = context
        item['source'] = 'ricerca24'
        item['url'] = news_url
        item['commont_num'] = num
        with open('24.json', 'a') as f:
            f.write(json.dumps(item) + '\n')
def parser_content(self, html, index_url):
    print index_url
    import pdb
    if 'charset=gb2312' in html:
        try:
            code = chardet.detect(html)['encoding']
            html = html.decode(code, 'ignore')
        except:
            pass
    html = re.sub('<select[\s\S]+?</select>', '', html)
    readable_article = Document(html)
    content = readable_article.summary()
    content = re.sub('</?div.*?>', '', content)
    title = readable_article.title()
    # search for the page's publication timestamp ('发布时间' means 'publish time')
    time_search = re.search("发布时间.{20}", html)
    # if u'发布日期' in content :   # '发布日期' means 'publish date'
    #     pdb.set_trace()
    if time_search:
        push_time = self.parser_match_time(time_search.group())
    else:
        try:
            push_time = self.parser_html_time(html)
        except:
            push_time = ''
    text = PyQuery(readable_article).text()
    print "*" * 100
    print push_time
    self.SAVECO.update(
        {"url": index_url},
        {
            "url": index_url,
            "html": content,
            "text": text,
            "time": push_time,
            "title": title,
            "createdAt": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        },
        upsert=True)
def parseURL_pr(url):
    parsed = urlparse(url)
    if "youtube" in parsed.hostname:
        print url, 'has youtube and we dont parse that'
        return None
    try:
        response = urlopen(url)
    except IOError:
        return None
    if response.getcode() > 400:
        print url, ' is not accessible any more', response.getcode()
        return None
    html = response.read()
    doc = Document(html)
    content = {}
    #content['content'] = doc.summary()
    html = doc.summary(True)
    soup = BeautifulSoup(html)
    content['content'] = soup.get_text()
    content['title'] = doc.title()
    # note: len() of the extracted string gives a character count, not a word count
    content['word_count'] = len(content['content'])
    return content
def parser_content(self, html, index_url):
    print index_url
    import pdb
    html = re.sub('<select[\s\S]+?</select>', '', html)
    readable_article = Document(html)
    content = readable_article.summary()
    title = readable_article.title()
    # search for the page's publication timestamp ('发布时间' means 'publish time')
    time_search = re.search("发布时间.{20}", html)
    # if u'发布日期' in content :   # '发布日期' means 'publish date'
    #     pdb.set_trace()
    if time_search:
        push_time = self.parser_match_time(time_search.group())
    else:
        push_time = self.parser_html_time(html)
    print "*" * 100
    print push_time
    self.SAVECO.update(
        {"url": index_url},
        {
            "url": index_url,
            "html": content,
            "time": push_time,
            "title": title
        },
        upsert=True)
def extractTitle(html):
    if html == "":
        return None
    try:
        doc = Document(html)
        short_title = doc.short_title()
        title = doc.title()
        if short_title is not None and short_title.strip() != "":
            title = short_title
        for delimiter in ['|', '-', '::', '/', '_']:
            if delimiter in title:
                parts = title.split(delimiter)
                if len(parts[0]) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1]) >= 4:
                    title = parts[-1]
                    break
        return title
    except:
        pass
    return None
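# A quick illustration of the delimiter heuristic in extractTitle() (an
# assumption, not part of the original source): the title is split on the first
# matching delimiter and the longer side is kept, which should drop site-name
# suffixes such as '| Example News' from the page title.
html_doc = """<html><head><title>Deep Learning Basics | Example News</title></head>
<body><p>Some article body long enough for readability to work with.</p></body></html>"""
print(extractTitle(html_doc))   # should print the headline without the '| Example News' suffix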
# - pip install readability-lxml
# - :bind <key sequence> spawn --userscript readerview.py

from readability.readability import Document
import os
import tempfile

# use readability-lxml (https://pypi.python.org/pypi/readability-lxml) to
# extract the article text
html = open(os.environ.get('QUTE_HTML')).read()
url = os.environ.get('QUTE_URL')

# set the url kwarg to get absolute links
document = Document(html, url=url)
article = document.summary()
title = document.title()

# add styling and whatever for better reading
head = '''<html>
<head>
  <title>''' + title + ''' [readerview]</title>
  <style>
    body {
        max-width: 800px;
        margin: 0 auto;
        background-color: #fdf6e3;
        color: #657b83;
    }
    #qute_orig_link {
        font-weight: bold;
        text-align: center;
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------
    url : str
        web address to news report

    Returns
    -------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            return done[url]
            pass
        try:
            r = requests.get(url, verify=False, timeout=1)
        except:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer

        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url

        if len(r.content) > 500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date, datetime.datetime):
                    answer['published_date'] = article.publish_date.astimezone(pytz.utc).isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                else:
                    newstext = " ".join([
                        l.text for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
        yield answer
        del r, data
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        yield answer
def update_and_send(proxy, post, url, country, is_proxy):
    user = post.author
    if is_proxy:
        try:
            r = requests.get(url, proxies={"http": proxy})
        except:
            email.send_email_normal(user.email,
                                    'Your requested web Article Blocked in ' + country,
                                    'main/block_mail',
                                    user=user,
                                    post=post,
                                    server=app.config['SERVER_URL'])
            return True
    else:
        try:
            r = requests.get(url)
        except:
            email.send_email_normal(user.email,
                                    'Your requested web Article Blocked',
                                    'main/block_mail',
                                    user=user,
                                    post=post,
                                    server=app.config['SERVER_URL'])
            return True
    if r:
        doc = Document(r.text)
        sha256, html_text = calculate_hash_for_html_doc(doc)
        if sha256 == post.hashVal:
            return True
        else:
            try:
                originStampResult = save_render_zip_submit(html_text, sha256, url, doc.title())
            except:
                app.logger.error(
                    '300 Internal System Error. Could not submit hash to originstamp')
            app.logger.error('Hash: ' + sha256 + ' submitted to originstamp')
            dateTimeGMT = originStampResult.headers['Date']
            post_new = Post(body=doc.title(),
                            urlSite=url,
                            hashVal=sha256,
                            webTitl=doc.title(),
                            origStampTime=datetime.strptime(dateTimeGMT, "%a, %d %b %Y %H:%M:%S %Z"),
                            author=user)
            db.session.add(post_new)
            db.session.commit()
            post_created = Post.query.filter(
                and_(Post.urlSite.like(url), Post.hashVal.like(sha256))).first()
            ids = str(post.id) + ':' + str(post_created.id)
            if post_created:
                email.send_email_normal(user.email,
                                        'Change in the requested Article found',
                                        'main/normal_email',
                                        user=user,
                                        post=post_created,
                                        ids=ids,
                                        server=app.config['SERVER_URL'])
            return True
    else:
        email.send_email_normal(user.email,
                                'Your requested web Article Blocked in ' + country,
                                'main/block_email',
                                user=user,
                                post=post,
                                server=app.config['SERVER_URL'])
        return True
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------
    url : str
        web address to news report

    Returns
    -------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            yield done[url]
            pass
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except:
            # if the url does not return data, set to empty values
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer

        # if url does not return successfully, set to empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None

        # test if length of url content is greater than 500, if so, fill data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # test for python version because of html different parameters
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the url
            article.parse()
            article.nlp()
            # if parse doesn't pull text fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the data to isoformat; exception for naive date
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer['published_date'] = article.publish_date.astimezone(pytz.utc).isoformat()
                    except:
                        answer['published_date'] = article.publish_date.isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            # if previous didn't work, try another library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                # as we did above, pull text if it's greater than 200 length
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # if nothing works above, use beautiful soup
                else:
                    newstext = " ".join([
                        l.text for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
        yield answer
    # the else clause to catch if invalid url passed in
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
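# A minimal usage sketch for textgetter() above (an assumption, not part of the
# original source): the function is a generator and relies on a module-level
# `done` cache plus the snippet's own imports (requests, re, platform, datetime,
# pytz, BeautifulSoup, newspaper's Article, and Paper as readability.Document),
# so a placeholder cache is defined here purely for illustration.
done = {}
for result in textgetter('https://example.com/news/some-story'):   # hypothetical URL
    print(result.get('title'))
    print((result.get('text') or '')[:200])   # first 200 characters of the extracted text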