def extract_article_information_from_html(html):
    """
    Gets the HTML of a website as a string and extracts the article information
    :param html: the HTML source as a string (e.g. the text of a requests response)
    :return: the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()

    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title
    newspaper_text = article_newspaper.text

    # run with newsplease
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text

    # run with goose
    goose_extractor = Goose()
    goose_extractor = goose_extractor.extract(raw_html=html)
    article_goose = goose_extractor.cleaned_text

    # keep whichever extractor recovered more words
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose
    return article_information
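A minimal usage sketch for the function above, assuming the `requests` package is available and the URL is only a placeholder; this is not part of the original snippet.

# Usage sketch (assumptions: requests is installed, example.com is a placeholder URL)
import requests

response = requests.get("https://example.com/some-article", timeout=30)
info = extract_article_information_from_html(response.text)
print(info["title"])
print(info["text"][:200])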
def extract(self, item):
    """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of parsing the HTML-Code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
def read_other_article(htmltext):
    """
    Processes articles other than the ones for which specific rules have been written
    :param htmltext: the htmltext of the article
    :return: a dict with the extracted title, authors, text, date and publisher
    """
    article = Article('')  # so that you can use local files with newspaper3k
    article.set_html(htmltext)
    article.parse()
    authors = article.authors  # sometimes it extracts strings like "Reporter for Fox News. Follow Her on Twitter.."
    date = article.publish_date  # TODO: date not extracted here properly
    if date is not None:
        date = article.publish_date.strftime('%d/%m/%Y')
    text = article.text
    # text = "".join(i for i in text if i != '\n')
    title = article.title
    publisher = 'other'
    if determine_publisher(htmltext) == ArticleType.CNN:
        publisher = 'cnn'
    result_dict = {
        'title': title,
        'authors': authors,
        'text': text,
        'date': date,
        'publisher': publisher
    }
    return result_dict
def check_url_get_content(url):
    """Takes a url as argument, extracts the text content and other information
    using the Article class from the newspaper library, and returns the result
    as a dictionary.
    """
    result = {}
    try:
        # async with session.get(url, timeout=600) as resp:
        with urllib.request.urlopen(url, timeout=600) as resp:
            # content = await resp.read()
            content = resp.read()
            # if content:
            article = Article(url)
            article.set_html(content)
            article.parse()
            article.nlp()
            text = article.text
            keywords = article.keywords
            status_code = resp.status
            # else:
            #     text = 'none'
            #     keywords = 'none'
            #     status_code = 'none'
    except Exception as e:
        text = 'none'
        keywords = 'none'
        status_code = 'none'
    result['Text'] = text
    result['Keywords'] = keywords
    result['status_code'] = status_code
    return result
def handle(task, progress):
    url = task.url
    progress.set_status("Requesting page...")
    resp = http_downloader.page_text(url, json=False)
    if not resp:
        return False

    config = Config()
    config.memoize_articles = False
    config.verbose = False
    article = Article(url='', config=config)

    article.download()
    article.set_html(resp)
    article.parse()

    if not article.top_image:
        return None

    src = article.top_image
    if 'http' not in src:
        if 'https' in url:
            src = 'https://' + src.lstrip('/ ').strip()
        else:
            src = 'http://' + src.lstrip('/ ').strip()

    progress.set_status("Downloading image...")

    return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
def parse_item(self, response):
    tag = ""
    for value in response.url.split("/")[3:]:
        if str(value).isdigit():
            continue
        tag = value
        break
    item = EnPItem()
    art_parser = Article(response.url, language='en', fetch_images=False)
    # a.download()
    art_parser.set_html(response.text)
    art_parser.parse()
    item["home"] = response.url
    item["title"] = art_parser.title
    item["content"] = art_parser.text
    item["authors"] = art_parser.authors
    try:
        item["publish_date"] = art_parser.publish_date.strftime('%Y-%m-%d %H:%M:%S')
    except:
        pass
    item["images"] = list(art_parser.images)
    item["keywords"] = art_parser.keywords
    item["meta_keywords"] = art_parser.meta_keywords
    item["tags"] = tag  # list(art_parser.tags)
    print(item)
    save_mess("%s.txt" % self.name, json.dumps(dict(item), ensure_ascii=False))
def process_html(self, html):
    # parse already-fetched page content using newspaper
    article = Article(url="")
    article.set_html(html)
    article.parse()
    return article
def generic(url):
    article = Article(url)
    r = requests.get(url, stream=True)
    # article.download()
    article.set_html(r.raw.read(MAX_DATA, decode_content=True))
    article.parse()
    return article
def retrieve_article(url):
    try:
        config = Configuration()
        config.fetch_images = False
        req = urllib.request.Request(
            url,
            headers={
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        print(e)
        return False
def parse(cls, from_url: str, resolved_url: str, http_status: int, html: str) -> List[model.Page]:
    a = Article(resolved_url)
    a.set_html(html)
    a.parse()
    try:
        parsed = model.Parsed(keywords=[
            s.strip() for s in a.meta_data['news_keywords'].split(",")
        ])
    except:
        parsed = None
    return [
        model.Page(
            from_url=from_url,
            resolved_url=resolved_url,
            http_status=http_status,
            article_metadata=dict(a.meta_data),
            article_published_at=a.publish_date,
            article_title=a.title,
            article_text=a.text,
            article_summary=a.meta_data['description'],
            parsed=parsed,
            fetched_at=datetime.datetime.now(),
        )
    ]
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)

            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
def analyse():
    article = Article('')
    article.set_html(html=request.data)
    article.parse()
    return jsonify(source_url=article.source_url,
                   url=article.url,
                   title=article.title,
                   top_img=article.top_img,
                   meta_img=article.meta_img,
                   imgs=list(article.imgs),
                   movies=article.movies,
                   text=article.text,
                   keywords=article.keywords,
                   meta_keywords=article.meta_keywords,
                   tags=list(article.tags),
                   authors=article.authors,
                   publish_date=article.publish_date,
                   summary=article.summary,
                   article_html=article.article_html,
                   meta_description=article.meta_description,
                   meta_lang=article.meta_lang,
                   meta_favicon=article.meta_favicon,
                   meta_data=article.meta_data,
                   canonical_link=article.canonical_link,
                   additional_data=article.additional_data)
def parse_content(self, response):
    """extract content of news by newspaper"""
    item = response.meta['item']
    is_special, content = self._handle_special_site(response)
    if not is_special:  # not a special site, let newspaper handle it
        article = Article(item['url'], language='zh')
        article.set_html(response.body)
        article.is_downloaded = True
        article.parse()
        item['pic'] = article.top_image
        item['content'] = str(article.text)
        publish_date = article.publish_date
        if publish_date:
            item['publish_date'] = publish_date.strftime("%Y-%m-%d %H:%M:%S")
        else:
            item['publish_date'] = "null"
    else:
        item['pic'] = ""
        item['content'] = content

    # extract content failed
    if item['content'] == '':
        logging.error("empty content in: " + response.url)
        yield item
        # raw_content = response.xpath("//body//p/text()").extract()
        # item['content'] = ''.join(raw_content)

    item['content'] = item['content'].strip().replace(u"\xa0", "").replace(u"\u3000", "").replace("|", "")\
        .replace("用微信扫码二维码分享至好友和朋友圈", "").strip("您当前的位置 :").strip("您所在的位置:").strip("提示:点击上方").strip(">").strip()
    yield item
def parse_item(self, response):
    tags = response.xpath('//*[@id="bread-nav"]/a[position()>=1]/text()').extract()
    item = EnPItem()
    art_parser = Article(response.url, language='en', fetch_images=False)
    # a.download()
    art_parser.set_html(response.text)
    art_parser.parse()
    item["home"] = response.url
    item["title"] = art_parser.title
    item["content"] = art_parser.text
    item["authors"] = art_parser.authors
    try:
        item["publish_date"] = art_parser.publish_date.strftime('%Y-%m-%d %H:%M:%S')
    except:
        pass
    item["images"] = list(art_parser.images)
    item["keywords"] = art_parser.keywords
    item["meta_keywords"] = art_parser.meta_keywords
    item["tags"] = tags  # list(art_parser.tags)
    save_mess("daly_people.txt", json.dumps(dict(item), ensure_ascii=False))
def get_article_from_html(article_html):
    # Returns a `newspaper` Article object from article HTML
    article = Article('', keep_article_html=True)
    article.set_html(article_html)
    article.parse()
    attach_links(article)
    return article
def parse(self, from_url: str, resp: aiohttp.ClientResponse, html: str) -> es.Page:
    article = Article(str(resp.url))
    article.set_html(html)
    article.parse()

    if article.clean_top_node is not None:
        parsed = Parsed(
            keywords=article.meta_keywords,
            tickers=_parse_tickers(article.clean_top_node))
        article_html = etree.tostring(
            article.clean_top_node, encoding='utf-8').decode('utf-8')
    else:
        parsed = Parsed(keywords=article.meta_keywords, tickers=[])
        article_html = None

    page = es.Page(
        from_url=from_url,
        resolved_url=str(resp.url),
        http_status=resp.status,
        article_metadata=json.dumps(article.meta_data),
        article_published_at=article.publish_date,
        article_title=article.title,
        article_text=article.text,
        article_html=article_html,
        parsed=json.dumps(dataclasses.asdict(parsed)),
        fetched_at=datetime.datetime.now(),
    )
    page.save()
    return page
def parse_item(self, response):
    tags = response.xpath('//*[@class="row-fluid crumbs"]//text()').extract()
    item = EnPItem()
    art_parser = Article(response.url, language='en', fetch_images=False)
    # a.download()
    art_parser.set_html(response.text)
    art_parser.parse()
    item["home"] = response.url
    item["title"] = art_parser.title
    item["content"] = art_parser.text
    item["authors"] = art_parser.authors
    try:
        item["publish_date"] = art_parser.publish_date.strftime('%Y-%m-%d %H:%M:%S')
    except:
        pass
    item["images"] = list(art_parser.images)
    item["keywords"] = art_parser.keywords
    item["meta_keywords"] = art_parser.meta_keywords
    item["tags"] = tags  # list(art_parser.tags)
    print(item)
    save_mess("%s.txt" % self.name, json.dumps(dict(item), ensure_ascii=False))
def extract(results):
    try:
        config = Configuration()
        config.fetch_images = False
        req = urllib.request.Request(
            results["url"],
            headers={
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        print("=", end='', flush=True)
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        print(e)
        return (results["url"], results["title"], None, None)
def scrap(self):
    url = self.get_url()
    print("retrieve page: {}".format(url))

    # check if we are allowed to crawl this page
    if not self.is_allowed():
        print("Retrieval is not allowed by robots.txt")
        return False

    # Get page content and headers
    try:
        response = requests.get(url, headers={'User-Agent': settings.USER_AGENT})
    except (requests.ConnectTimeout, requests.HTTPError, requests.ReadTimeout,
            requests.Timeout, requests.ConnectionError):
        return

    self.content_type = response.headers['content-type'] if 'content-type' in response.headers else ""  # usually "text/html"

    # don't store page content if it's not html
    self.raw_content = response.text
    if self.content_type.find("text/html") == -1:
        print("we don't process non-html pages yet.")
        return False

    # store article title and content
    article = Article(url)
    article.set_html(self.raw_content)
    article.parse()
    self.article_title = article.title
    self.article_content = article.text
    self.article_top_image = article.top_image
    article.nlp()
    self.article_excerpt = article.summary
    self.article_keywords = article.keywords

    # parse html page
    soup = BeautifulSoup(self.raw_content, "html5lib")

    # Images
    for img in soup.findAll("img"):
        img_url = img.get('src', '')
        img_alt = img.get('alt', '')
        img_title = img.get('title', '')
        image_site_url, image_path = get_site_path(img_url)
        if self.site.site_url == image_site_url:
            img_site = self.site
        else:
            img_site = Site.objects.get_or_create(site_url=image_site_url)[0]
        # get image object
        image = Image.objects.get_or_create(path=image_path, site=img_site)[0]
        img_detail = ImageDetail.objects.get_or_create(image=image, page=self)[0]
        img_detail.title = img_title
        img_detail.alt = img_alt
        img_detail.save()

    # HTML Title
    self.page_title = soup.title.string

    self.save()
    return soup  # for crawling
def extract_with_newspaper(self, html):
    '''Parses HTML using Newspaper.'''
    article = Article(self.url)
    article.set_html(html)
    # suppress deprecation warnings only for the duration of parse()
    with catch_warnings():
        filterwarnings('ignore', category=DeprecationWarning)
        article.parse()
    return article.__dict__
def parse(response):
    print type(response)
    # print response.body
    article = Article(url=response.url, language="es")
    article.set_html(response.body)
    article.parse()  # parse() must run before title/publish_date are populated
    # article = self.articleProcessor.extractAll(response.body)
    print article.title
    print article.publish_date
    print response.url
def parse_article_page(response):
    article = Article(url=response.request.url)
    article.set_html(response.text)
    article.parse()
    if article.title and article.text:
        item = NewsArticle()
        item['title'] = article.title
        item['text'] = article.text
        yield item
def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8

    article = Article(url, config=config)
    article.set_html(load_page_safe(url))  # safer than article.download()
    article.parse()

    return article
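The `load_page_safe` helper is not shown in this snippet; below is a minimal sketch of what such a helper might look like, assuming it should return an empty string instead of raising on network errors. The name, signature, and behavior are assumptions, not the original implementation.

# Hypothetical sketch of a load_page_safe helper (assumptions: requests is available;
# the real helper used by the snippet above may differ).
import requests

def load_page_safe(url: str) -> str:
    """Return the page HTML, or an empty string if the download fails."""
    try:
        resp = requests.get(url, timeout=30, headers={'User-Agent': 'Mozilla/5.0'})
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return ""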
def handle(url, data, log):
    try:
        log.out(0, 'Downloading article...')
        resp = requests.get(url, headers={'User-Agent': data['user_agent']})
        if resp.status_code != 200:
            return False  #!cover

        config = Config()
        config.memoize_articles = False
        config.verbose = False
        article = Article(url='', config=config)

        log.out(0, 'Parsing article...')
        article.download()
        article.set_html(resp.text)
        article.parse()
        if article.top_image:
            src = article.top_image
            if 'http' not in src:  #!cover
                if 'https' in url:
                    src = 'https://' + src.lstrip('/ ').strip()
                else:
                    src = 'http://' + src.lstrip('/ ').strip()
            log.out(0, 'Newspaper located image: %s' % src)

            r = requests.get(src, headers={'User-Agent': data['user_agent']}, stream=True)
            if r.status_code == 200:
                content_type = r.headers['content-type']
                ext = mimetypes.guess_extension(content_type)
                if not ext or ext == '':  #!cover
                    log.out(1, 'NewsPaper Error locating file MIME Type: %s' % url)
                    return False
                if '.jp' in ext:
                    ext = '.jpg'  #!cover
                path = data['single_file'] % ext
                if not os.path.isfile(path):
                    if not os.path.isdir(data['parent_dir']):  #!cover
                        log.out(1, ("+Building dir: %s" % data['parent_dir']))
                        os.makedirs(data['parent_dir'])  # Parent dir for the full filepath is supplied already.
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                return path
            else:  #!cover
                log.out(0, ('\t\tError Reading Image: %s responded with code %i!' % (url, r.status_code)))
                return False
    except Exception as e:
        log.out(0, ('"Newspaper" Generic handler failed. ' + (str(e).strip())))
    return False  #!cover
class ParserNewsPaper(Parser):
    _extractor = None

    def parse_news_text(self, page_html: str, url: str) -> dict:
        if self._extractor is None:
            self._extractor = Article("", language="en")
        self._extractor.set_html(page_html)
        self._extractor.parse()
        news_text = re.sub(r'\s+', r' ', self._extractor.text)
        return {'url': url, 'text': news_text}
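A minimal usage sketch for the parser above, assuming `Parser` can be instantiated without arguments and that the HTML has already been downloaded by other means; both are assumptions, and the HTML string here is only a placeholder.

# Usage sketch (assumptions: Parser() needs no constructor arguments; the HTML is pre-fetched)
parser = ParserNewsPaper()
page_html = "<html><body><article><p>Some article text.</p></article></body></html>"
record = parser.parse_news_text(page_html, "https://example.com/article")
print(record['url'], record['text'][:80])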
def get_body_from_html(url, html, cache=False):
    if cache:
        dc = DownloadCache(url)
        if not dc.is_cached():
            dc.cache(html)
    narticle = NArticle(url, fetch_images=False)
    narticle.set_html(html)
    narticle.parse()
    return narticle.text
def top_image_from_html(url, html):
    try:
        article = Article(url=url)
        article.set_html(html)
        article.parse()
        return article.top_image
    except Exception as e:
        logger.error("error reading article " + url)
        return {}
def clean(html_content):
    config = Configuration()
    config.fetch_images = False
    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()
    return article.text
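A usage sketch for `clean`, assuming a locally saved article page; the filename is only a placeholder and is not part of the original snippet.

# Usage sketch (assumption: 'saved_page.html' is a locally saved article page)
with open('saved_page.html', 'r', encoding='utf-8') as f:
    html_content = f.read()
print(clean(html_content)[:300])  # first 300 characters of the extracted text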
def get_article(driver, url):
    driver.get(url)
    article = Article("")
    article.set_html(driver.page_source)
    article.parse()
    text = article.text
    text = re.sub(r"[\n ]+", " ", text, flags=re.M)
    return text
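A usage sketch assuming Selenium with a Chrome driver available on PATH; the driver choice and URL are assumptions, and any Selenium WebDriver would work the same way.

# Usage sketch (assumptions: selenium is installed, a Chrome driver is available,
# and the URL is only a placeholder).
from selenium import webdriver

driver = webdriver.Chrome()
try:
    text = get_article(driver, "https://example.com/some-article")
    print(text[:200])
finally:
    driver.quit()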
def parse_article(url) -> Tuple[str, List[str]]:
    """Parse article using newspaper3k to get summary and keywords."""
    if not url:
        return "", []

    article = Article(url)
    html_content = load_page_safe(url)
    if not html_content:
        return "", []

    article.set_html(html_content)
    article.parse()
    article.nlp()

    return article.summary, list(set(article.keywords))
def main(argv):
    try:
        r = redis.StrictRedis('localhost', 6379, 0)
        article = Article(url='', fetch_images=False, language='fr')
        article.set_html(r.get('scraped'))
        article.parse()
        print(json.dumps(article.text))
    except Exception:
        print(json.dumps(None))
def read_nyt_article(htmltext):
    """
    Extracts the important information from the HTML of a New York Times article
    :param htmltext: a string which contains the html of the new york times article
    :return: returns a dict which stores the extracted result
    """
    soup = BeautifulSoup(htmltext, 'lxml')
    title = soup.html.head.title.text  # extracts the title
    ps = soup.body.find_all('p')
    i = 0
    article = Article('')  # so that you can use local files with newspaper3k
    article.set_html(htmltext)
    article.parse()
    authors = article.authors
    date = article.publish_date  # TODO: date not extracted here properly
    if date is not None:
        date = article.publish_date.strftime('%d/%m/%Y')

    # used to find where the article text starts - it always starts with an em dash ('—')
    while '—' not in ps[i].text:
        i += 1
    ps = ps[i:]

    # gets rid of useless sections
    ps = [i for i in ps if i.text != '']
    ps = [i for i in ps if i.text != 'Advertisement']
    ps = [i for i in ps if 'Now in print:' not in i.text]
    ps = [i for i in ps if 'And here\'s our email' not in i.text]
    ps = [i for i in ps if 'The Times is committed' not in i.text]
    ps = [i for i in ps if 'We\'d like to hear' not in i.text]
    ps = [i for i in ps if 'Follow The New York Times' not in i.text]
    ps = [i for i in ps if 'on Twitter: @' not in i.text]
    ps = [i for i in ps if 'on Twitter at' not in i.text]
    ps = [i for i in ps if 'contributed reporting' not in i.text]
    ps = [i for i in ps if 'contributed research' not in i.text]

    text = "\n ".join([" ".join(i.text.split()) for i in ps])

    result_dict = {
        'title': title,
        'authors': authors,
        'text': text,
        'date': date,
        'publisher': 'nytimes'
    }
    return result_dict
async def enrich(self, result):
    # none of the following lines will work if we couldn't make soup
    if not self.soup:
        return result

    sanitized = sanitize_html(self.response.body)
    if not sanitized:
        return result

    article = Article(self.url, config=FixedArticleConfig())
    article.config.fetch_images = False
    article.set_html(sanitized)
    article.parse()

    result.set('title', article.title, 2, 'textlength')

    if len(article.meta_description) > 0:
        result.set('subtitle', article.meta_description, 2, 'textlength')

    if len(article.article_html) > 0:
        sanitized = sanitize_html(article.article_html)
        result.set('content', sanitized, 0, 'textlength')
    elif article.top_node is not None:
        sanitized = sanitize_html(tostring(article.top_node))
        result.set('content', sanitized, 2)

    if article.authors:
        result.set('authors', article.authors, 2)
    if article.publish_date and len(str(article.publish_date)) > 0:
        result.set('published_at', article.publish_date, 2)

    result.add('keywords', list(article.keywords))
    result.add('keywords', list(article.tags))

    result.add('_candidate_images', list(article.imgs))
    # Primary image guess is actually pretty crappy
    if article.top_image:
        result.add('_candidate_images', [article.top_img])

    text = ""
    for paragraph in article.text.split("\n"):
        paragraph = paragraph.strip()
        # this is done to get rid of cases where a stray heading
        # like "Photographs" ends up as a paragraph
        if Summarizer.has_sentence(paragraph):
            text += " " + paragraph

    if len(text) > 0:
        result.set('_text', text, 2)
    return result
def _parse_article(self, key, url):
    a = Article('')
    html = Google().cache(url)
    a.set_html(html)
    a.parse()
    a.nlp()
    article = {"summary": a.summary,
               "publish_date": a.publish_date,
               "images": a.images,
               "top_image": a.top_image,
               "title": a.title,
               "authors": a.authors,
               "keywords": a.keywords,
               "text": a.text}
    # update
    # conn = r.connect(db="clearspark")
    conn = r.connect(**rethink_conn.conn())
def clean_source(url, source):
    """ Parse a pre-downloaded article using newspaper.

    Args:
        url (str): The url where the article was sourced (necessary for the
            newspaper API).

        source (str): Html source of the article page.

    Returns:
        Dictionary providing cleaned article and extracted content
        (see `construct_result`), or `None` if newspaper could not extract
        the article.
    """
    article = Article(url)
    article.set_html(source)
    article.parse()

    if article.top_node is None:
        return None

    return construct_result(article)
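A usage sketch for `clean_source`, assuming the page is downloaded with `requests`; the URL is only a placeholder, and `construct_result` comes from the same project as the function above.

# Usage sketch (assumptions: requests is installed; the URL is a placeholder)
import requests

url = "https://example.com/some-article"
source = requests.get(url, timeout=30).text
cleaned = clean_source(url, source)
if cleaned is None:
    print("newspaper could not extract an article body")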
import sys, json
from newspaper import Article

htmlStr = ""
for line in sys.stdin:
    htmlStr = htmlStr + line

# obj = json.loads(jsonStr)

article = Article('')
article.set_html(htmlStr)
article.parse()
article.nlp()

ret = json.dumps(article.keywords)
print ret
def extract_data(fname, loadp, savep):
    ######################
    # initialize process #
    ######################
    stream = GzipFile(loadp + fname)
    protocol = TBinaryProtocol.TBinaryProtocol(TTransport.TBufferedTransport(stream))
    data = {'data': []}
    count = 0

    ####################
    # begin extraction #
    ####################
    while True:
        page = WikiLinkItem()
        try:
            page.read(protocol)
            count += 1
        except:
            stream.close()
            break

        print '- processing FILE {0} ENTRY # {1}'.format(fname, count)
        print '\t $ URL: {0}'.format(page.url)

        #####################
        # initial filtering #
        #####################
        if page.url[:3] == 'ftp':
            print '\t\t ###### Ftp prefix detected (ignore) ###### \n'
            continue
        if page.url[len(page.url) - 4:] != 'html':
            print '\t\t ###### Non-html suffix detected (ignore) ###### \n'
            continue
        if page.content.dom == None:
            print '\t\t ###### Empty dom detected (ignore) ###### \n'
            continue

        #######################
        # secondary filtering #
        #######################
        entities = extract_entities(page.mentions)
        if len(entities) < 2:
            print '\t\t ###### Single entity found (discard) ###### \n'
            continue
        print '\t $ # Entities:', len(entities)

        #########################
        # alignment and parsing #
        #########################
        html = mark_dom(page.content.dom, entities)
        news = Article(page.url, language='en')
        try:
            news.set_html(html)
            news.parse()
        except:
            print '\t\t ###### Parsing failed (discard) ###### \n'
            continue

        ################
        # tokenization #
        ################
        text = None
        try:
            text = ftfy.fix_text(news.text)
            text = text.encode('ascii', 'ignore')
            text = seperate_delimiter(word_tokenize(text))
        except:
            print '\t\t ###### Tokenization failed (discard) ###### \n'
            continue

        #######################
        # save processed data #
        #######################
        print '\t $ Entry # {0} Saved \n'.format(count)
        data['data'].append({'text': text, 'dict': entities})

    #####################
    # save as json file #
    #####################
    print '****** {0}.json saved ******\n'.format(fname[:3])
    f = open(savep + '{0}.json'.format(fname[:3]), 'w')
    json.dump(data, f, indent=4)
    f.close()
def prepare(self, response):
    article = Article(url=response.url)
    article.set_html(response.text)
    article.parse()
    return article
def parser_nlp(fname, html):
    Ts = timeit.default_timer()
    raw_html = html

    # basic info
    fid = int(fname.split('_')[0].split('/')[1])
    pm = parse_machine()
    html = pm.fix_html(html)
    link_stats = pm.parse_links(html)
    link_factors = [t for t in list(set(" ".join(link_stats.keys()).lower().split())) if (len(t) > 3)]
    doc = db.articles(
        fid=fid,
        html=html,
        html_cnt=len(html),
        link_stats=link_stats,
        link_factors=link_factors,
        rand=random.random(),
        # extra
        lines=raw_html.count('\n'),
        spaces=raw_html.count(' '),
        tabs=raw_html.count('\t'),
        braces=raw_html.count('{'),
        brackets=raw_html.count('['),
        quesmarks=raw_html.count('?'),
        exclamarks=raw_html.count('!'),
        words=len(re.split('\s+', raw_html)),
    )

    # check empty
    if ((doc.html == None) | (len(doc.html.replace(r'\s', '')) < 10)):
        doc.empty = True
        return doc

    try:  # if True:
        pd = Article('', fetch_images=False)
        pd.set_html(doc.html)
        pd.parse()
        pd.nlp()
    except Exception as e:
        print("-" * 60)
        print("[parser_nlp %s]: %s" % (doc.fid, e))
        print(doc.html[:500])
        print("-" * 60)
        return doc  # "%s: %s" % (e, doc.id)

    # select cleaned_text
    cleaned_text = " ".join(pd.text.lower().split())
    if (len(cleaned_text) < 140):
        soup = bs(doc.html)
        if soup.body:
            cleaned_text = soup.body.text
        if (len(cleaned_text) < 140):
            cleaned_text = soup.text
    cleaned_text = sanitize_txt(cleaned_text, lower=True)
    bow = nlp.nlp().txt2words(cleaned_text or '', False)

    # save results
    try:
        opengraph = pd.meta_data.get('og', {}) if pd.meta_data else {}
        top_image = opengraph.get('image') or (pd.top_image if pd.top_image else None)
        if isinstance(top_image, dict):
            top_image = top_image.get('identifier')
        if isinstance(opengraph.get('locale'), dict):
            opengraph['locale'] = opengraph.get('locale').get('identifier')
        publish_date = pm.process_date(opengraph.get('updated_time') or pd.publish_date)

        # canonical_link & domain
        domain = canonical_link = str(opengraph.get('url') or pd.canonical_link)
        if '//' in domain:
            domain = domain.split('//')[1]
        if '?' in domain:
            domain = domain.split('?')[0]
        domain = '/'.join(domain.split('/')[0:1])

        # update
        # doc.update(
        doc = db.articles(
            fid=doc.fid,
            html=doc.html,
            link_stats=doc.link_stats,
            link_factors=doc.link_factors,
            rand=doc.rand,
            html_cnt=doc.html_cnt,
            #
            lines=doc.lines,
            spaces=doc.spaces,
            tabs=doc.tabs,
            braces=doc.braces,
            brackets=doc.brackets,
            quesmarks=doc.quesmarks,
            exclamarks=doc.exclamarks,
            words=doc.words,
            #
            title=str(opengraph.get('title') or pd.title)[:500],
            #
            cleaned_text=str(cleaned_text),
            bow=bow,
            tags=[t.lower() for t in pd.tags],
            #
            opengraph={sanitize_txt(k): sanitize_txt(v) for k, v in opengraph.items()},
            #
            summary=str(pd.summary),
            keywords=pd.keywords,
            top_image=str(top_image),
            movies=pd.movies,
            publish_date=publish_date,
            meta_site_name=str(opengraph.get('site_name')),
            meta_lang=str(opengraph.get('locale') or pd.meta_lang),
            meta_description=str(opengraph.get('description') or pd.meta_description),
            meta_keywords=pd.meta_keywords,
            canonical_link=canonical_link,
            domain=domain,
            authors=[n.lower().replace(' ', '_') for n in pd.authors],
        )
    except Exception as e:
        print("-" * 60)
        print("[Error] while [%s] in parser_nlp: %s" % (doc.id, e))
        data = {
            "title": str(opengraph.get('title') or pd.title)[:500],
            "text": cleaned_text[:140],
            "tags": [t.lower() for t in pd.tags],
            "opengraph": opengraph,
            "summary": str(pd.summary),
            "keywords": pd.keywords,
            "top_image": str(top_image),
            "movies": pd.movies,
            "date": publish_date,  # opengraph.get('updated_time') or pd.publish_date
            "site_name": str(opengraph.get('site_name')),
            "locale": str(opengraph.get('locale') or pd.meta_lang),
            "desc": str(opengraph.get('description') or pd.meta_description),
            "keywords": pd.meta_keywords,
            "url": canonical_link,
            "authors": pd.authors,
        }
        for k, v in data.items():
            print(k, v, v.__class__)
        print("-" * 60)
    return doc
# -*- coding: utf-8 -*-
from newspaper import Article
from goose import Goose
import requests
import json
import sys

article = Article(sys.argv[1])
article.download()

if not article.html:
    r = requests.get(sys.argv[1], verify=False, headers={'User-Agent': 'Mozilla/5.0'})
    article.set_html(r.text)

article.parse()
article.nlp()

published = ''
if article.publish_date:
    published = article.publish_date.strftime("%Y-%m-%d %H:%M:%S")

# Get body with goose
g = Goose()
goose_article = g.extract(raw_html=article.html)
body = goose_article.cleaned_text
summary = goose_article.meta_description

# Maybe use https://github.com/xiaoxu193/PyTeaser
if not summary:
    summary = article.summary