def get_article_text(data):
    soup_tree = BeautifulSoup(data, 'html.parser')
    chek_out_link = soup_tree.find("div", {"class": "_5pcr userContentWrapper"})
    # strip the comment forms so they do not pollute the extracted link
    for match in chek_out_link("form", {"class": "commentable_item"}):
        match.decompose()
    chek_out_link = chek_out_link.find("div", {"class": "lfloat _ohe"})
    if chek_out_link:
        chek_out_link = chek_out_link.find("a")["href"]
        if chek_out_link:
            find_video = re.search("youtube", chek_out_link)
            if not find_video:
                get_link = chek_out_link
                driver.get(get_link)
                sleep(5)
                article_html = driver.page_source
                extractor = extractors.ArticleExtractor()
                what = extractor.get_content(article_html)
                sleep(5)
                content = detect_lang_and_sent_to_translate(what)
                sleep(5)
                # "Ссылка на материал:" is Russian for "Link to the source material:"
                content = '<p lang="' + 'rus' + '">' + "Ссылка на материал: " + '</p> ' + content
                return content
    content = ""
    return content
def extract_text(self, record):
    url = record.rec_headers.get('WARC-Target-URI')
    date = record.rec_headers.get('WARC-Date')
    id_ = iso_date_to_timestamp(date) + '/' + url
    if id_ not in self.pages:
        return
    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return
    if record.http_headers and record.http_headers.get_statuscode().startswith('3'):
        return
    extractor = extractors.ArticleExtractor()
    content = record.content_stream().read()
    try:
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        if doc.content:
            self.pages[id_]["text"] = doc.content
        if doc.title:
            self.pages[id_]["title"] = doc.title
    except Exception:
        # skip text extraction in case of errors
        pass
def clean_source_text(self):
    # retrieve the text for the source
    data = self.cleaned_data['source_text']
    code = self.data['source_code']
    if data == "":
        # with this, it should work for either source_add or source_import (for any empty rows)
        if code == "SCOPE_S_1":
            # currently, this method is limited to articles (media reports)
            url = self.data['source_url']
            import requests
            exist = requests.head(url).status_code
            if (exist < 400 or exist > 499):
                from boilerpy3 import extractors
                import re
                extractor = extractors.ArticleExtractor()
                doc = extractor.get_doc_from_url(url)
                # may reassess which spacing and punctuation to include in the future
                content = doc.content.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\f', ' ').replace('\v', ' ')
                content = re.sub(r'\s+', ' ', content)
                data += content
                # content_nopunc = re.sub('[^a-zA-Z]', ' ', content)
                # content_nopunc = re.sub(r'\s+', ' ', content_nopunc)
    return data
def getContent(url):
    extractor = extractors.ArticleExtractor()
    # From a URL
    content = extractor.get_content_from_url(url)
    # print(content)
    return content
def extractText(text):
    extractor = extractors.ArticleExtractor()
    try:
        text = text.replace('\n', ' ').replace('\r', '')
        document = extractor.get_doc(text)
        extractedText = document.get_text(True, False)
    except:
        extractedText = text
    return extractedText
def getContent(request):
    from boilerpy3 import extractors
    extractor = extractors.ArticleExtractor()
    # From a URL
    content = extractor.get_content_from_url(
        'https://www.bbc.com/news/world-us-canada-56501686')
    return content
def getContent():
    extractor = extractors.ArticleExtractor()
    # From a URL
    content = extractor.get_content_from_url(
        'https://www.cnbc.com/2021/03/24/container-ship-runs-aground-in-suez-canal-causing-traffic-jam.html'
    )
    return content
def parse_article(self, response):
    url = response.url
    url_h = hashlib.sha256(url.encode('utf8')).hexdigest().encode('utf8')
    if url_h not in self.visited_urls:
        body = response.text
        extractor = extractors.ArticleExtractor()
        extracted_text = extractor.get_content(body)
        yield Article(content=extracted_text, url=url, source=self.name, create_time=str(time.time()))
        self.visited_urls.add(url_h)
def extract(self, url: str, html_text: str):
    extractor = bp3_extractors.ArticleExtractor()
    bp_doc = extractor.get_doc(html_text)
    self.content = {
        'url': url,
        'text': bp_doc.content,
        'title': bp_doc.title,
        'publish_date': None,
        'top_image_url': None,
        'authors': None,
        'extraction_method': METHOD_BOILER_PIPE_3,
    }
def getContentForDf(rawDataPath, categoryName):
    filenameArr = []
    categoryArr = []
    contentArr = []
    with os.scandir(rawDataPath) as entries:
        for entry in entries:
            f = codecs.open(entry, "r", "utf_8_sig")
            data = f.read()
            # print(entry.name)
            if getContentType(rawDataPath + entry.name) == 'xml':
                arr = []
                xmlstr = data.replace('&', '')
                root = ET.fromstring(xmlstr)
                for item in list(root):
                    arr.append(item.text.encode('utf8'))
                lenArr = [len(str(el)) for el in arr]
                content = arr[lenArr.index(max(lenArr))]
            elif getContentType(rawDataPath + entry.name) == 'txt':
                content = data
            elif getContentType(rawDataPath + entry.name) == 'json':
                arr = []
                data = data.replace('@', '')
                data = re.sub(r"(?m)^\s+", "", data)
                entryContent = json.loads(data)
                if type(entryContent) is list:
                    arr.extend(entryContent)
                else:
                    for key, value in entryContent.items():
                        arr.append(value)
                lenArr = [len(str(el)) for el in arr]
                content = arr[lenArr.index(max(lenArr))]
            elif getContentType(rawDataPath + entry.name) == 'html':
                # content = html2text.html2text(data)
                extractor = extractors.ArticleExtractor()
                content = extractor.get_content(data)
            else:
                content = ''
            filenameArr.append(entry.name)
            categoryArr.append(categoryName)
            contentArr.append(content)
    return filenameArr, categoryArr, contentArr
def getLinks(uri='', html='', fromMainTextFlag=True, **kwargs):
    kwargs.setdefault('sleepSec', 0)
    kwargs.setdefault('derefFlag', True)
    kwargs.setdefault('rmFragments', True)
    uri = uri.strip()
    if( uri != '' ):
        if( uri[-1] != '/' ):
            uri = uri + '/'
    allLinks = []
    dedupSet = set()
    try:
        if( uri != '' and html == '' and kwargs['derefFlag'] is True ):
            html = derefURI(uri, sleepSec=kwargs['sleepSec'])
        if( fromMainTextFlag is True ):
            extractor = extractors.ArticleExtractor()
            html = extractor.get_content(html)
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.findAll('a')
        for i in range( len(links) ):
            if( links[i].has_attr('href') is False ):
                continue
            link = links[i]['href'].strip()
            if( link == '' ):
                continue
            linkTitle = links[i].text.strip()
            if( link[:2] == '//' ):
                link = 'http:' + link
            elif( link[0] == '/' ):
                link = uri + link[1:]
            if( link[:4] != 'http' and kwargs['rmFragments'] is True ):
                continue
            if( link + linkTitle not in dedupSet ):
                allLinks.append({ 'title': linkTitle, 'link': link })
                dedupSet.add(link + linkTitle)
    except:
        genericErrorInfo()
    return allLinks
def getContent(url):
    extractor = extractors.ArticleExtractor()
    # From a URL
    resp = requests.get(url)
    if resp.ok:
        content = extractor.get_content_from_url(url)
    else:
        content = 'error'
    # if webscraper has failed
    if content == '':
        content = 'error'
    return content
def retriever(links=[]):
    extractor = extractors.ArticleExtractor()
    articles = []
    for link in links:
        # print('getting article {}'.format(link))
        try:
            doc = extractor.get_doc_from_url(link)
            title = doc.title or ''
            body = doc.content or ''
            articles.append({'title': title, 'body': body, 'link': link})
        except Exception as e:
            errors.append(link)
    return articles
def extract(request):
    extractor = extractors.ArticleExtractor()
    httpurl = request.POST.get('Url')
    doc = extractor.get_doc_from_url(httpurl)
    content = doc.content
    print(content)
    file = open("output.txt", "w+")
    file.write(content)
    file.close()
    value = {'httpurl': httpurl, 'content': content}
    context = {
        'value': value,
    }
    return render(request, 'text.html', context)
def extract_keywords_from_tweet(text: str, filterStopwords: bool) -> set:
    extractor = extractors.ArticleExtractor()
    keywords = set()
    # print(text)
    links = get_urls_from_text(text)
    # print("Debug: number of links:" + str(len(links)))
    # From disaster dataset
    text = clean_text(text)
    keywords = text.split(" ")
    for key in keywords:
        if key in string.punctuation:
            keywords.remove(key)
    if filterStopwords == True:
        # Delete stopwords from text
        stop_words = stopwords.words('english')
        word_tokens = word_tokenize(text)
        filtered_sentence = set()
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.add(w)
        keywords = filtered_sentence
    for url in links:
        try:
            external_content = extractor.get_content_from_url(url)
            # Debug
            # print("External content:" + external_content)
            if external_content != "":
                try:
                    annotations = tagme.annotate(external_content)
                    for ann in annotations.get_annotations(annotation_score_treshold):
                        # print(ann)
                        keywords.add(ann.entity_title)
                except:
                    print("Error with tagme, skipping")
        except:
            pass
    return keywords
def extract_text(self, record):
    url = record.rec_headers.get('WARC-Target-URI')
    date = record.rec_headers.get('WARC-Date')
    ts = iso_date_to_timestamp(date)
    id_ = ts + '/' + url
    if self.main_url and url == self.main_url:
        print('Found Main Url: {0}'.format(url))
        self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}
    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return
    status = record.http_headers.get_statuscode()
    if record.http_headers and status.startswith('3'):
        return
    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}
        else:
            return
    content = self._read_record(record)
    if not content:
        return
    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        if doc.content:
            self.pages[id_]["text"] = doc.content
        if doc.title:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        print(e)
        # skip text extraction in case of errors
        pass
def cleanHtml(html, method='python-boilerpipe'):
    if( len(html) == 0 ):
        return ''
    if( method == 'python-boilerpipe' ):
        try:
            '''
            #requires: https://github.com/slaveofcode/boilerpipe3
            from boilerpipe.extract import Extractor
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return str(extractor.getText())
            '''
            extractor = extractors.ArticleExtractor()
            return extractor.get_content(html)
        except:
            genericErrorInfo()
    elif( method == 'nltk' ):
        """
        Copied from NLTK package.
        Remove HTML markup from the given string.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        # my addition to remove blank lines
        cleaned = re.sub(r"\n\s*\n*", "\n", cleaned)
        return cleaned.strip()
    return ''
def get_paragraphs_BP3(str_text, mode):
    """ using Boilerpy3 """
    if mode == "":
        BP_extractor = extractors.DefaultExtractor()
    elif mode == "Article":
        BP_extractor = extractors.ArticleExtractor()
    elif mode == "Largest":
        BP_extractor = extractors.LargestContentExtractor()
    else:
        BP_extractor = extractors.KeepEverythingExtractor()
    from contextlib import redirect_stderr
    with open(os.devnull, 'w') as devnull:
        with redirect_stderr(devnull):
            try:
                text_det = BP_extractor.get_content(str_text)
            except:
                text_det = ""
    return re.split("\n", text_det)
def clean_html(html, method='boilerpy3', reportFailure=True):
    if (html == ''):
        return ''
    if (method == 'boilerpy3'):
        try:
            extractor = extractors.ArticleExtractor(raise_on_failure=reportFailure)
            return extractor.get_content(html)
        except:
            genericErrorInfo()
    elif (method == 'nltk'):
        """
        Copied from NLTK package.
        Remove HTML markup from the given string.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        # my addition to remove blank lines
        cleaned = re.sub(r"\n\s*\n*", "\n", cleaned)
        return cleaned.strip()
    return ''
def getContent(url):
    extractor = extractors.ArticleExtractor(raise_on_failure=False)
    # From a URL
    resp = requests.get(url)
    if resp.ok:
        # content = extractor.get_content_from_url(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
        r = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            })
        html = urllib.request.urlopen(r).read().decode()
        content = extractor.get_content(html)
    else:
        content = 'error'
    if content == '':
        content = 'error'
    return content
def getCleanText(data, contentType):
    if contentType == 'xml':
        arr = []
        xmlstr = data.replace('&', '')
        root = ET.fromstring(xmlstr)
        for item in list(root):
            arr.append(item.text.encode('utf8'))
        lenArr = [len(str(el)) for el in arr]
        content = arr[lenArr.index(max(lenArr))]
    elif contentType == 'txt':
        content = data
    elif contentType == 'json':
        arr = []
        data = data.replace('@', '')
        data = re.sub(r"(?m)^\s+", "", data)
        entryContent = json.loads(data)
        if type(entryContent) is list:
            arr.extend(entryContent)
        else:
            for key, value in entryContent.items():
                arr.append(value)
        lenArr = [len(str(el)) for el in arr]
        content = arr[lenArr.index(max(lenArr))]
    elif contentType == 'html':
        extractor = extractors.ArticleExtractor()
        content = extractor.get_content(data)
    else:
        content = ''
    return content
import xml.etree.ElementTree as ET
from boilerpy3 import extractors
import concurrent.futures
from rake_nltk import Rake

rake = Rake()
RETIEVED_DOCUMENTS_DIR = PROJECT_ROOT_DIR + '/retrieved_documents/row-data/'
EXTRACTED_DOCUMENTS_DIR = PROJECT_ROOT_DIR + '/retrieved_documents/extracted-data/'
TOPICS_FILE = PROJECT_ROOT_DIR + '/topics.xml'
baseUrl = 'https://www.chatnoir.eu'
api_key = 'e47fe59e-2d2f-475e-a424-afcdb94ba17b'
extractor = extractors.ArticleExtractor()


def GetTrimmedKeyword(keyword):
    stop_words = ['better', 'difference', 'best']
    trimmed_word = ' '.join([token.lemma_ for token in nlp(keyword) if token.text not in stop_words])
    return trimmed_word


def GetKeywords(text):
    keywords = []
    rake.extract_keywords_from_text(text)
    keywords_rake = rake.get_ranked_phrases()
    for word in keywords_rake:
        trimmed_word = GetTrimmedKeyword(word)
        if trimmed_word:
            keywords.append(trimmed_word)
    return keywords
from trafilatura import extract
try:
    from trafilatura.core import baseline
except ImportError:
    baseline = None
from trafilatura.utils import sanitize
from evaldata import EVAL_PAGES

## TODO: time, best of 3
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

TEST_DIR = os.path.abspath(os.path.dirname(__file__))
boilerpipe_extractor = extractors.ArticleExtractor()  # ArticleExtractor DefaultExtractor LargestContentExtractor
g = Goose()


def trim(string):
    '''Remove unnecessary spaces within a text string'''
    if string is not None:
        # delete newlines that are not related to punctuation or markup
        # string = re.sub(r'(?<![p{P}>])\n', ' ', string)
        # proper trimming
        string = ' '.join(re.split(r'\s+', string.strip(' \t\n\r'), flags=re.UNICODE | re.MULTILINE))
        string = string.strip()
def check_pages_and_text(self, record):
    url = record.rec_headers.get("WARC-Target-URI")
    date = record.rec_headers.get("WARC-Date")
    ts = iso_date_to_timestamp(date)
    id_ = ts + "/" + url
    if (
        self.main_url
        and self.main_url == url
        and self.main_ts
        and self.main_ts == ts
    ):
        self.main_ts_flag = True
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        print("Found Main ts: {0}".format(ts))
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
    if self.main_url and self.main_url == url and self.main_ts == None:
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return
    status = record.http_headers.get_statuscode()
    if record.http_headers and status.startswith("3"):
        return
    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        else:
            return
    # if not extracting text, then finish here
    if not self.extract_text:
        return
    content = self._read_record(record)
    if not content:
        return
    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        curr_page = self.pages[id_]
        if doc.content:
            self.pages[id_]["text"] = doc.content
            self.has_text = True
        # only set title if unset, or set to url (default)
        # avoid overriding user-specified title, if any
        if doc.title and self.pages[id_].get("title", url) == url:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        # skip text extraction in case of errors
        print("Skipping, Text Extraction Failed For: " + url)
        print(e)
def extractTextFromURL(url1):
    extractor = extractors.ArticleExtractor()
    extractedText = extractor.get_content_from_url(url1)
    return extractedText
def check_pages_and_text(self, record):
    url = record.rec_headers.get("WARC-Target-URI")
    date = record.rec_headers.get("WARC-Date")
    ts = iso_date_to_timestamp(date)
    id_ = ts + "/" + url
    matched_id = ""
    # Check for both a matching url/ts and url entry
    if id_ in self.passed_pages_dict:
        matched_id = id_
    if url in self.passed_pages_dict:
        matched_id = url
    # If we find a match build a record
    if matched_id != "":
        self.pages[matched_id] = {
            "timestamp": ts,
            "url": url,
            "title": url
        }
        # Add title and text if they've been provided
        if "title" in self.passed_pages_dict[matched_id]:
            self.pages[matched_id]["title"] = self.passed_pages_dict[matched_id]["title"]
        if "text" in self.passed_pages_dict[matched_id]:
            self.pages[matched_id]["text"] = self.passed_pages_dict[matched_id]["text"]
        # Delete the entry from our pages_dict so we can't match it again
        del self.passed_pages_dict[matched_id]
    if (self.main_url and self.main_url == url and self.main_ts and self.main_ts == ts):
        self.main_ts_flag = True
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        print("Found Main ts: {0}".format(ts))
        # If we're not relying on passed-in pages we want to add all records to the self.pages object
        if self.passed_pages_dict == {}:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
    if self.main_url and self.main_url == url and self.main_ts == None:
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        if id_ not in self.pages:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return
    status = record.http_headers.get_statuscode()
    if record.http_headers and status.startswith("3"):
        return
    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        else:
            return
    # if not extracting text, then finish here
    if not self.extract_text:
        return
    content = self._read_record(record)
    if not content:
        return
    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        curr_page = self.pages[id_]
        if doc.content:
            self.pages[id_]["text"] = doc.content
            self.has_text = True
        # only set title if unset, or set to url (default)
        # avoid overriding user-specified title, if any
        if doc.title and self.pages[id_].get("title", url) == url:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        # skip text extraction in case of errors
        print("Skipping, Text Extraction Failed For: " + url)
        print(e)
def extract_raw_content(url):
    extractor = extractors.ArticleExtractor()
    content = extractor.get_content_from_url(url)
    return content
def boilerpy3_fulltext(parser, language):
    extractor = extractors.ArticleExtractor()
    content = extractor.get_content(parser.tostring())
    return content
def get_boilerplate_free_content(urim, cache_storage="", dbconn=None, session=None):
    import otmt
    from boilerpy3 import extractors

    if dbconn is None:
        dbconn = MongoClient(cache_storage)
    if session is None:
        session = get_web_session(cache_storage)

    db = dbconn.get_default_database()

    # 1. if boilerplate free content in cache, return it
    try:
        module_logger.info(
            "returning boilerplate free content from cache for {}".format(urim))
        bpfree = db.derivedvalues.find_one({"urim": urim})["boilerplate free content"]
        return bytes(bpfree, "utf8")
    except (KeyError, TypeError):
        module_logger.info(
            "generating boilerplate free content for {}".format(urim))
        r = session.get(urim)
        if len(r.history) == 0:
            raw_urim = otmt.generate_raw_urim(urim)
        else:
            raw_urim = otmt.generate_raw_urim(r.url)
        r2 = session.get(raw_urim)
        r2.raise_for_status()
        module_logger.info("content-type is {}".format(r2.headers['content-type']))
        if 'text/html' not in r2.headers['content-type']:
            module_logger.warning(
                "we can only remove boilerplate from HTML, returning zero bytes")
            return bytes()
        # paragraphs = justext(
        #     r.text, get_stoplist('English')
        # )
        # bpfree = ""
        # for paragraph in paragraphs:
        #     bpfree += "{}\n".format(paragraph.text)
        module_logger.debug(
            "attempting to extract boilerplate free content from {}".format(urim))
        extractor = extractors.ArticleExtractor()
        try:
            bpfree = extractor.get_content(r2.text)
            module_logger.info(
                "storing boilerplate free content in cache {}".format(urim))
            db.derivedvalues.update(
                {"urim": urim},
                {"$set": {"boilerplate free content": bpfree}},
                upsert=True)
        except Exception:
            module_logger.exception(
                "failed to extract boilerplate from {}, setting value to empty string".format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())
            return bytes()

    return bytes(bpfree, "utf8")
def get(self, request):
    query = request.GET['query']
    query = query.lower()
    query = re.sub(r'[^\w\s]', '', query)
    response_json = {}
    fact_check = requests.get(
        'https://factchecktools.googleapis.com/v1alpha1/claims:search',
        params={'query': query, 'key': api_key, 'languageCode': 'en-US'})
    db = client["news"]
    if len(fact_check.json()) == 0:
        response_json['Common Myths'] = [{'source': 'No Results Available for this query', 'check': 'Not Available', 'claim': 'Not Available'}]
    else:
        claims = fact_check.json()['claims']
        ratings = [claims[i]['claimReview'][0]['textualRating'] for i in range(0, len(claims))]
        factcheck = None
        for rating in ratings:
            if rating == 'False' or 'myth' in rating or 'no evidence' in rating:
                factcheck = False
        if factcheck == False:
            response_json['Common Myths'] = []
            for claim in claims:
                current_result = {}
                current_result['source'] = claim['claimReview'][0]['url']
                current_result['check'] = claim['claimReview'][0]['textualRating']
                current_result['claim'] = claim['text']
                response_json['Common Myths'].append(current_result)
        else:
            response_json['Common Myths'] = [{'source': 'No Results Available for this query', 'check': 'Not Available', 'claim': 'Not Available'}]
    stored_queries = db["news"].find({'_id': query})
    stored_result = []
    for q in stored_queries:
        stored_result.append(q)
    is_stored = None
    if len(stored_result) == 0:
        is_stored = False
    else:
        is_stored = True
    if is_stored == True:
        if request.GET['update'] == 'True':
            update_db.after_response(stored_result, db, query)
        response_json["News"] = []
        query_json = stored_result[0]
        for news in query_json["News"]:
            latest_news = news[-1]
            current_dict = {}
            current_dict["source"] = latest_news["source"]
            current_dict["content"] = latest_news["content"]
            response_json["News"].append(current_dict)
        update_faq(query)
        response_json["similar_questions"] = related_questions(query)
        response_json["summary"] = query_json["summary"]
        response_json["hit_again"] = 'True'
        return Response(response_json)
    result = resource.list(q=query, cx=search_engine_id).execute()
    if len(result) == 0 or 'items' not in result:
        response_json['News'] = [{'source': 'No Results Available for this query', 'content': 'Not Available'}]
    else:
        url = None
        extractor = extractors.ArticleExtractor()
        response_json['News'] = []
        content_summary = ''
        if is_stored == False:
            for item in result['items']:
                try:
                    url = item['link']
                    if 'pdf' in url or 'xml.gz' in url:
                        continue
                    if url == 'https://www.cdc.gov/coronavirus/2019-ncov/faq.html' or url == 'https://www.cdc.gov/coronavirus/2019-ncov/hcp/faq.html':
                        page = requests.get("https://www.cdc.gov/coronavirus/2019-ncov/faq.html")
                        soup = BeautifulSoup(page.content, 'html.parser')
                        page_results = soup.find_all('div', attrs={'class': 'card bar'})
                        for content in page_results:
                            question = content.find('span', attrs={'role': 'heading'}).contents[0]
                            question = question.lower()
                            question = re.sub(r'[^\w\s]', '', question)
                            answer = content.find('div', attrs={'class': 'card-body'}).find('p').getText()
                            if len(answer) != 0 and is_similar(query, question, 0.5):
                                current_result = {}
                                current_result['source'] = url
                                current_result['content'] = []
                                # print(question, ":", answer)
                                current_result['content'].append(answer)
                                response_json['News'].append(current_result)
                                content_summary = content_summary + answer
                    else:
                        response = requests.get(url)
                        stemmer = Stemmer(language=LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
                        summary = summarizer(parser.document, 5)
                        summary = '\n'.join([line._text for line in summary])
                        current_result = {}
                        current_result['source'] = url
                        current_result['content'] = []
                        current_result['content'].append(summary)
                        content_summary = content_summary + summary
                        if 'Last-Modified' in response.headers:
                            current_result['last_modified'] = response.headers['Last-Modified']
                        else:
                            current_result['last_modified'] = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
                        response_json['News'].append(current_result)
                except urllib.error.HTTPError as e:
                    current_result['content'] = ["No results available"]
                    continue
                except TypeError:
                    current_result['content'] = ["No results available"]
                    continue
                except AttributeError:
                    current_result['content'] = ["No results available"]
                    continue
                except requests.exceptions.SSLError as e:
                    current_result['content'] = ["No results available"]
                    continue
        response_json['summary'] = get_summary(content_summary)
        db_json = {}
        db_json['News'] = response_json['News']
        db_json['summary'] = response_json['summary']
        for i, news in enumerate(db_json['News']):
            url = news['source']
            response = requests.get(url)
            headers = response.headers
            last_modified = None
            if 'Last-Modified' in headers:
                last_modified = headers['Last-Modified']
            else:
                last_modified = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
            db_json['News'][i]['last_modified'] = last_modified
        db_json['News'] = [[json] for json in db_json['News']]
        db_json['_id'] = query
        db["news"].insert_one(db_json)
    update_faq(query)
    response_json["similar_questions"] = related_questions(query)
    response_json["hit_again"] = 'False'
    return Response(response_json)