def get_details():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        result = {
            "url": url,
            "top_image": url,
            "text": "",
        }
        return jsonify(result)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    result = {
        "url": url,
        "top_image": top_image,
        "text": article.text,
    }
    return jsonify(result)
def get_article():
    tree_urls = ET.parse("DB_urls.xml")
    root_urls = tree_urls.getroot()

    # The problem with English and Chinese can be solved with
    for field_urls in root_urls.findall("row"):
        url_urls = field_urls.find("field").text
        # url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
        # url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'

        try:
            response = urllib2.urlopen(url_urls)
            status = response.code
            # print "detected webpage code:", status

            if status == 404:
                continue
            else:
                a_zh = Article(url_urls, language='zh')
                a_zh.download()
                a_zh.parse()
                content_urls = a_zh.text

                if content_urls == '':
                    a_en = Article(url_urls, language='en')
                    a_en.download()
                    a_en.parse()
                    content_urls = content_urls + a_en.text

                if content_urls != '':
                    compare_article(url_urls, content_urls)
        except:
            pass
def test_spanish_fulltext_extract(self):
    url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
    article = Article(url=url, language="es")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if not top_image == '':
        return redirect(top_image)
    else:
        return '', 422
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article
    """
    new_article = Article(self.article.url)
    resp = mock_response_with(new_article.url, 'cnn_article')
    new_article.download(resp)
    self.assertRaises(ArticleException, new_article.nlp)
def test_chinese_fulltext_extract(self):
    url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
    article = Article(url=url, language="zh")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def test_arabic_fulltext_extract(self):
    url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
    article = Article(url=url, language="ar")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def run(self):
    logging.debug("run() - [WAIT]")
    from newspaper import Article

    '''
    Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
    '''

    NOTES_LIST = [
        '118',
        '117',
        # '116',
        # '115',
    ]
    for note_id in NOTES_LIST:
        note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
        note.download()

        print '*' * 100
        # print 'H T M L'
        # print note.html
        # print '*' * 100
        # print 'T E X T'
        note.parse()
        print note.text

    logging.debug("run() - [DONE]")
def get_nlp_data(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return json.dumps(article.keywords)
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article
    """
    new_article = Article(self.article.url)
    html = mock_resource_with('cnn_article', 'html')
    new_article.download(html)
    self.assertRaises(ArticleException, new_article.nlp)
def main():
    try:
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()

            response = requests.post(calais_url, files={'file': article.text},
                                     headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            # print(rdf)
    except Exception as e:
        print('Error in connect ', e)
def check_url(args):
    """
    :param (basestr, basestr) url, res_filename:
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    pubdate_failed, fulltext_failed = False, False
    html = mock_resource_with(res_filename, 'html')
    try:
        a = Article(url)
        a.download(html)
        a.parse()
        if a.publish_date is None:
            pubdate_failed = True
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        pubdate_failed, fulltext_failed = True, True
    else:
        correct_text = mock_resource_with(res_filename, 'txt')
        if not (a.text == correct_text):
            # print('Diff: ', simplediff.diff(correct_text, a.text))
            # `correct_text` holds the reason of failure if failure
            print('%s -- %s -- %s' %
                  ('Fulltext failed', res_filename, correct_text.strip()))
            fulltext_failed = True
            # TODO: assert statements are commented out for full-text
            # extraction tests because we are constantly tweaking the
            # algorithm and improving
            # assert a.text == correct_text
    return pubdate_failed, fulltext_failed
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019', "\'")
            title = article.title.replace(u'\u2019', "\'")
            source = url.split('//')[1].split('/')[0].replace('www.', '')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image,
                             'link': url, 'source': source, 'status': status})
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
def f(url):
    url_urls = url.text
    try:
        response = urllib2.urlopen(url_urls)
        status = response.code
        # print "detected webpage code:", status

        if status == 404:
            pass
        else:
            a_zh = Article(url_urls, language='zh')
            a_zh.download()
            a_zh.parse()
            # content_urls = a_zh.text

            # if(content_urls == ''):
            #     a_en = Article(url_urls, language = 'en')
            #     a_en.download()
            #     a_en.parse()
            #     content_urls = content_urls + a_en.text
            #
            # if(content_urls != ''):
            #     pass
            #     # compare_article(url_urls, content_urls)
    except:
        pass
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)

        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            # collapse newlines and doubled spaces into single spaces
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
    deleteURLs(inFilPath, toDelURLs)
    return texts
def test_download_file_failure(self):
    url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
    article = Article(url=url)
    article.download()
    self.assertEqual(0, len(article.html))
    self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
    self.assertEqual(article.download_exception_msg, "No such file or directory")
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except:
        html_string = "Error converting html to string."

    try:
        article.nlp()
    except:
        log.error("Couldn't process with NLP")

    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary,
    }
    return render_template('article/index.html', article=a, url=url_to_clean)
def extract():
    url = sys.argv[1:].pop()

    a = Article(url, keep_article_html=True)
    a.download()
    a.parse()
    a.nlp()

    parsed_uri = urlparse(a.source_url)
    domain = '{uri.netloc}'.format(uri=parsed_uri)

    try:
        publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
    except AttributeError:
        publish_date = ""

    try:
        authors = ", ".join(a.authors)
    except AttributeError:
        authors = ""

    result = {}
    result['html'] = a.html
    result['body'] = a.text
    result['title'] = a.title
    result['top_image'] = a.top_image
    result['author'] = authors
    result['html_body'] = a.article_html
    result['favicon'] = a.meta_favicon
    result['description'] = a.summary
    result['publish_date'] = publish_date
    result['keywords'] = a.keywords
    result['sitename'] = re.sub(r"^www.", "", domain)

    return json.dumps(result).encode('utf-8')
def test_download_file_success(self):
    url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
    article = Article(url=url)
    article.download()
    self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
    self.assertEqual(article.download_exception_msg, None)
    self.assertEqual(75406, len(article.html))
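The two file-download tests above exercise newspaper's download-state flags. As a small illustrative sketch (not taken from the test suite; the function name and URL handling are hypothetical), those flags can also guard a parse() call in application code:

from newspaper import Article
from newspaper.article import ArticleDownloadState

def safe_article_text(url):
    # Download first; only parse when the download actually succeeded.
    article = Article(url)
    article.download()
    if article.download_state == ArticleDownloadState.SUCCESS:
        article.parse()
        return article.text
    # Otherwise surface the recorded failure reason instead of letting parse() raise.
    return article.download_exception_msg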
def extract(url=None, keep_html=True):
    """
    Attempts to extract article from URL
    """
    a = Article(url, keep_article_html=keep_html)
    try:
        a.download()
    except Exception, e:
        log.error('Error downloading %s: %s' % (url, str(e)))
def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()
    used_urls = []
    for n in news:
        used_urls.append(n[0])

    for url in urls:
        if not url in used_urls:
            used_urls.append(url)

            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()

            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()

            exists_this_news = News.query.filter_by(source_url=url).first()

            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
def get_article_by_url(url):
    article = Article(url, fetch_images=False)
    article.download()
    if url == "empty":
        return "nolist"
    article.parse()
    return article.text
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()

    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)

    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"

    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()

    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not'
    ]
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
def is_valid_article(link):
    print("Checking valid:\n" + link)

    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False

    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords

    matched = False
    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False

    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        # keywords is a list, so stringify it before concatenating into the output line
        main_file.write(article.title + "\t\t" + str(article.keywords) + "\t\t" + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True

    return False
def parse_news(self, response):
    item = ScrapyGooglenewsItem()
    # only log the warning info from request
    logging.getLogger("requests").setLevel(logging.WARNING)

    for href in response.xpath('//h2[@class="title"]/a/@href').extract():
        item['link'] = href
        # use newspaper-0.0.8 to scrape the webpage, then get clean text.
        article = Article(item['link'])
        article.download()
        article.parse()
        item['title'] = article.title
        item['text'] = article.text
        # item['authors'] = article.authors
        # item['date'] = article.publish_date

        if response.url.split('&')[-1] == 'topic=w':
            item['domain'] = 'World'
        if response.url.split('&')[-1] == 'topic=n':
            item['domain'] = 'U.S.'
        if response.url.split('&')[-1] == 'topic=b':
            item['domain'] = 'Business'
        if response.url.split('&')[-1] == 'topic=tc':
            item['domain'] = 'Technology'
        if response.url.split('&')[-1] == 'topic=e':
            item['domain'] = 'Entertainment'
        if response.url.split('&')[-1] == 'topic=s':
            item['domain'] = 'Sports'
        if response.url.split('&')[-1] == 'topic=snc':
            item['domain'] = 'Science'
        if response.url.split('&')[-1] == 'topic=m':
            item['domain'] = 'Health'

        yield item
def runTest(self):
    # The "correct" fulltext needs to be manually checked
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}

    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]

    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1

        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue

        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
def main_func(search_term): search_term = url_encode(search_term) browser = None browser = webdriver.Chrome("chromedriver 2") scrapeCNN(browser, search_term) scrapeBBC(browser, search_term) scrapeFOX(browser, search_term) export_json() # Set the limit for number of articles to download LIMIT = 30 data = {} data['newspapers'] = {} documents = { "documents":[] } count = 1 # Iterate through each news company for company, value in all_data.items(): if 'rss' in value: d = fp.parse(value['rss']) print("Downloading articles from ", company) newsPaper = { "rss": value['rss'], "link": value['link'], "articles": [] } for entry in d.entries: # Check if publish date is provided, if no the article is skipped. # This is done to keep consistency in the data and to keep the script from crashing. if hasattr(entry, 'published'): if count > LIMIT: break article = {} article['link'] = entry.link date = entry.published_parsed article['published'] = datetime.fromtimestamp(mktime(date)).isoformat() try: content = Article(entry.link) content.download() content.parse() except Exception as e: # If the download for some reason fails (ex. 404) the script will continue downloading # the next article. print(e) print("continuing...") continue article['title'] = content.title article['text'] = content.text newsPaper['articles'].append(article) print(count, "articles downloaded from", company, ", url: ", entry.link) count = count + 1 else: # This is the fallback method if a RSS-feed link is not provided. # It uses the python newspaper library to extract articles print("Building site for ", company) for link in value['link']: content = Article(link) newsPaper = { "link": link, "articles": [] } noneTypeCount = 0 if count > LIMIT: break try: content.download() content.parse() except Exception as e: print(e) print("continuing...") continue # Again, for consistency, if there is no found publish date the article will be skipped. # After 10 downloaded articles from the same newspaper without publish date, the company will be skipped. article = {} article['title'] = content.title article['text'] = content.text article['link'] = content.url if content.publish_date is not None: article['published'] = content.publish_date.isoformat() newsPaper['articles'].append(article) info = {} if len(content.text) < 5100: info["id"] = company+str(count) info["title"] = content.title info['link'] = content.url info['source'] = company info["language"] = "en" info["text"] = content.text documents["documents"].append(info) print(count, "articles downloaded from", company, " using newspaper, url: ", content.url) count = count + 1 noneTypeCount = 0 data['newspapers'][company] = newsPaper run_sample()
def article_title(articleTitle, c):
    # Article titles to txt file: '2- TitleList.txt'
    with open('2- TitleList.txt', 'a', encoding='utf-8') as file:
        file.write(c + '\n' + articleTitle + '\n')


urlfile = '1- url.txt'
with open(urlfile) as f:
    # Read urls and download news text and title
    line = f.readline()
    c = 0
    error = 1
    while line:
        line = line.rstrip('\n')
        a = Article(line, source_url="https://time.com", config=config)
        a.download()
        try:
            # Use try/except for urls with read timeout error
            a.parse()
            text_to_file(a.text.lower())
            article_title(a.title.lower(), str(c))
        except:
            error += 1
            pass
        a = 'None'
        line = f.readline()
        clear_output(wait=True)
        print(c)
        time.sleep(2)
        c += 1
def textgetter(url): """Scrapes web news and returns the content Parameters ---------- url : str web address to news report Returns ------- answer : dict Python dictionary with key/value pairs for: text (str) - Full text of article url (str) - url to article title (str) - extracted title of article author (str) - name of extracted author(s) base (str) - base url of where article was located provider (str) - string of the news provider from url published_date (str,isoformat) - extracted date of article top_image (str) - extracted url of the top image for article """ global done TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li'] # regex for url check s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)') u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$") if s.search(url): site = u.search(s.search(url).group()).group(3) else: site = None answer = {} # check that its an url if s.search(url): if url in done.keys(): yield done[url] pass try: # make a request to the url r = requests.get(url, verify=False, timeout=1) except: # if the url does not return data, set to empty values done[url] = "Unable to reach website." answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = "Unable to reach website." answer['title'] = None answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = None yield answer # if url does not return successfully, set ot empty values if r.status_code != 200: done[url] = "Unable to reach website." answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = "Unable to reach website." answer['title'] = None answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = None # test if length of url content is greater than 500, if so, fill data if len(r.content) > 500: # set article url article = Article(url) # test for python version because of html different parameters if int(platform.python_version_tuple()[0]) == 3: article.download(input_html=r.content) elif int(platform.python_version_tuple()[0]) == 2: article.download(html=r.content) # parse the url article.parse() article.nlp() # if parse doesn't pull text fill the rest of the data if len(article.text) >= 200: answer['author'] = ", ".join(article.authors) answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = article.publish_date answer['keywords'] = article.keywords answer['summary'] = article.summary # convert the data to isoformat; exception for naive date if isinstance(article.publish_date, datetime.datetime): try: answer[ 'published_date'] = article.publish_date.astimezone( pytz.utc).isoformat() except: answer[ 'published_date'] = article.publish_date.isoformat( ) answer['text'] = article.text answer['title'] = article.title answer['top_image'] = article.top_image answer['url'] = url # if previous didn't work, try another library else: doc = Paper(r.content) data = doc.summary() title = doc.title() soup = BeautifulSoup(data, 'lxml') newstext = " ".join([l.text for l in soup.find_all(TAGS)]) # as we did above, pull text if it's greater than 200 length if len(newstext) > 200: answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = newstext answer['title'] = title answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = 
None # if nothing works above, use beautiful soup else: newstext = " ".join([ l.text for l in soup.find_all('div', class_='field-item even') ]) done[url] = newstext answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = newstext answer['title'] = title answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = None # if nothing works, fill with empty values else: answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = 'No text returned' answer['title'] = None answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = None yield answer yield answer # the else clause to catch if invalid url passed in else: answer['author'] = None answer['base'] = s.search(url).group() answer['provider'] = site answer['published_date'] = None answer['text'] = 'This is not a proper url' answer['title'] = None answer['top_image'] = None answer['url'] = url answer['keywords'] = None answer['summary'] = None yield answer
except: break d = dict() c = 0 print('---------------------------------') #from Com import Com r = list() with codecs.open("withnewspaper.csv", "w", "utf-8") as file: file.write('publishedAt,title,author,url,urlToImage,description,comments' + '\r\n') for i in a: print(i) url = i article = Article(url) article.download() article.parse() # p=Com(url) #print(p.get_comment) title = article.title authors = article.authors publish_date = article.publish_date text = article.text top_image = article.top_image '''print(type(title.encode('UTF-8'))) print(type(authors)) print(type(str(publish_date))) print(type(text)) print(type(top_image))''' html = (article.html) soup = BeautifulSoup(html, 'lxml')
# How to use Article
from newspaper import Article

# print(help(Article))

url = r'https://new.qq.com/omn/20180705/20180705A0T4T4.html'
article = Article(url)                  # an online news article page
article.download()                      # download the article

html = article.html                     # raw page source
# print(html)

article.parse()                         # parse the article

authors = article.authors               # article authors
# print(authors)

publish_date = article.publish_date     # publish date
# print(publish_date)

text = article.text                     # article body text
# print(text)

top_image = article.top_image           # first (top) image
# print(top_image)

movies = article.movies                 # video links
# print(movies)

title = article.title                   # article title
print(title)
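As a follow-on to the walkthrough above, a minimal sketch of the optional NLP step (not part of the original snippet); it assumes the NLTK data that newspaper's nlp() relies on has already been installed:

article.nlp()                  # keyword extraction and summarization (needs NLTK 'punkt' data)
keywords = article.keywords    # list of extracted keywords
summary = article.summary      # auto-generated summary text
print(keywords)
print(summary)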
def goog_news(sch_word,yyyymm,pages=3,smry_words=50): quo_word = quote_plus(sch_word) mon = pd.to_datetime(yyyymm,format='%Y%m') mon_max = mon+pd.DateOffset(months=1)-pd.DateOffset(days=1) mrng = list(map(lambda x: x.strftime('%m/%d/%Y'),[mon,mon_max])) # rescnt 부분 links = [] driver = webdriver.Chrome('driver/chromedriver.exe') urs0 = f"url = f"https://www.google.com/search?q={quo_word}&safe=active&rlz=1C1SQJL_koKR831KR832&biw=763&bih=625&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{mrng[0]}%2Ccd_max%3A{mrng[1]}&tbm=nws" driver.get(url) html0 = driver.page_source try: a0 = soup(html,'lxml') rescnt = a0.find('div',id='resultStat').get_text() except: rescnt = '' driver.close() # rescnt 부분끝 for i in np.arange(0,pages*10,10): url = f"https://www.google.com/search?q={quo_word}&safe=active&rlz=1C1SQJL_koKR831KR832&biw=763&bih=625&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{mrng[0]}%2Ccd_max%3A{mrng[1]}&tbm=nws&start={i}" driver.get(url) html = driver.page_source a = soup(html,'lxml') b1 = a.find_all('a','l lLrAF') b2 = a.find_all('a','RTNUJf') links.append([h['href'] for h in b1+b2]) driver.close() links = list(itertools.chain(*links)) title,date,wd,text,smry,press = [],[],[],[],[],[] for h in links: press.append(re.split('/',h)[2]) a = Article(h) try: a.download() a.parse() except: next try: title.append(a.title) except: title.append('') try: dat = a.publish_date date.append(dat.strftime('%Y-%m-%d')) wd.append(dat.strftime('%a')) except: date.append('') wd.append('') try: text.append(a.text) smry.append(summarize(a.text,word_count=smry_words)) except: text.append('') smry.append('') news = pd.DataFrame({'mon':yyyymm,'keyword':sch_word,'rescnt':rescnt,'title':title,'date':date,'wkday':wd, 'text':text,'smry':smry,'press':press}) news = news.loc[news.text!=''] news = news.drop_duplicates() news.reset_index(drop=True,inplace=True) return news
def main(Url, pub_time, found_time, Source, Keywords, otherNames, Type): Keywords = Keywords.lower() article = Article(Url) article.download() if article.is_downloaded: article.parse() if article.is_parsed: print "parsed" article.nlp() else: print "failed download" article = urllib.urlopen(Url).read() article.download() article.parse() articleText = (article.text) articleText = articleText.encode('ascii', 'replace').replace( u"\u0029", "").replace(u"\u0028", "") Keywords = Keywords.split(",") classifier = '/usr/local/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz' jar = '/usr/local/share/stanford-ner/stanford-ner.jar' st = StanfordNERTagger(classifier, jar, encoding='utf-8') sentence = word_tokenize(articleText) output = [] realtypefind = [] keywordtotalcount = {} count = {} categories = defaultdict(list) totalcount = 0 for key in Keywords: keywordtotalcount[key] = 0 for key2 in key.split(): count[key2] = 0 itemposition = 0 totoltypecount = 0 taged = st.tag(sentence) for item in taged: firstItem = item[0].encode('utf-8').strip("\)(?.,:`") if firstItem: if item[1] not in categories: categories[item[1]].append(firstItem) else: categories[item[1]].append(firstItem) if item[1] == Type: totoltypecount = totoltypecount + 1 #Creats full name list, is checked against to make sure a article with mike newton is counting mike johnson or sam newton #as people who are mentioned in the article. if itemposition != (len(taged) - 1): if taged[itemposition + 1][1] == Type: realtypefind.append(" " + (item[0].lower() + " " + (taged[itemposition + 1][0] ).lower()).encode('utf-8')) output.append(item[0]) if item[0].lower() in count: count[item[0].lower()] = count[item[0].lower()] + 1 itemposition = itemposition + 1 #Creats full name list, is checked against to make sure a article with mike newton is counting mike johnson or sam newton #as people who are mentioned in the article. 
for key in keywordtotalcount: for T in range(0, len(key.split())): (keywordtotalcount[key] ) = (keywordtotalcount[key]) + count[(key.split())[T]] frequency = (FreqDist(output)).most_common(5) for freq in frequency: totalcount = totalcount + freq[1] keywords_database = ' '.join(article.keywords) article_people = [] for person in keywordtotalcount: if person in realtypefind: if person in otherNames and otherNames[person] in realtypefind: article_people.append(person) totalcountofperson = (keywordtotalcount[person] + keywordtotalcount[otherNames[person]]) # print person, "is in the article", (round(((keywordtotalcount[person] + keywordtotalcount[otherNames[person]])/float(totoltypecount)), 4) * 100), "%" # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText) else: article_people.append(person) totalcountofperson = keywordtotalcount[person] # print person, "is in the article", (round((keywordtotalcount[person]/float(totoltypecount)), 4) * 100), "%" # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText) else: if person in otherNames and otherNames[person] in realtypefind: article_people.append(person) totalcountofperson = keywordtotalcount[person] # print person, "is in the article", (round((keywordtotalcount[person]/float(totoltypecount)), 4) * 100), "%" # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText) if len(article_people) >= 1: print Url article_id = mysql_article_entry.main(Url, Source, pub_time, found_time, article.title, str(article.authors), str(keywords_database), article.summary, articleText) mysql_article_person_link.main( article_id, article_people, totalcountofperson, (round( (totalcountofperson / float(totoltypecount)), 4) * 100), totoltypecount) mysql_article_based_weights.main(article_id, len(articleText), "yes") mysql_social_media_entry.main(article_id, Url)
def get_article(self, url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return article
def extract_news(code, news_links, crawl_source, cursor): '''抽取新闻,并进行NLP @param code: 上市公司编码 @param news_links: 需要抽取的新闻链接 @param crawl_source @param cursor: 数据库游标 ''' in_sql = """ INSERT INTO news_extract_content(url_md5,url,code_name,newspaper_title,newspaper_text, newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article, boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent, boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,extract_count,crawl_source) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1,%s) on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s, newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s, boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s, boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s, up_time = now(),extract_count=extract_count+1,crawl_source = %s """ for link in news_links: #长度小于30的url一般都不是新闻连接,暴力,简单可依赖 if link is None or len(link) <= 30: continue #已经抓取的url就不需要抓取了 if link in bf: continue try: global NEWS_URL_EXTRACTE NEWS_URL_EXTRACTE += 1 url_md5 = hashlib.md5(link).hexdigest() #首先让使用newspaper newspaper_title = '' newspaper_text = '' newspaper_authors = '' newspaper_summary = '' newspaper_keywords = '' article = Article(link) article.download() html = article.html if html is None or len(html) == 0: continue article.parse() if article.text and len(article.text) > 0: newspaper_title = article.title newspaper_text = article.text newspaper_authors = article.authors if newspaper_authors and len(newspaper_authors) > 0: newspaper_authors = ','.join(newspaper_authors) else: newspaper_authors = '' article.nlp() newspaper_summary = article.summary newspaper_keywords = article.keywords if newspaper_keywords and len(newspaper_keywords) > 0: newspaper_keywords = ','.join(newspaper_keywords) else: newspaper_keywords = '' #然后使用boilerpipe extractor = Extractor(extractor='ArticleExtractor', html=html) boilerpipe_article = extractor.getText() extractor = Extractor(extractor='ArticleSentencesExtractor', html=html) boilerpipe_articlesentences = extractor.getText() extractor = Extractor(extractor='KeepEverythingExtractor', html=html) boilerpipe_keepeverything = extractor.getText() extractor = Extractor(extractor='LargestContentExtractor', html=html) boilerpipe_largestcontent = extractor.getText() extractor = Extractor(extractor='NumWordsRulesExtractor', html=html) boilerpipe_numwordsrules = extractor.getText() extractor = Extractor(extractor='CanolaExtractor', html=html) boilerpipe_canola = extractor.getText() #输入的参数 content = (url_md5,link,code, newspaper_title, newspaper_text, newspaper_authors,newspaper_summary,newspaper_keywords,\ boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,\ boilerpipe_numwordsrules,boilerpipe_canola,crawl_source, \ code, newspaper_title,newspaper_text, newspaper_authors,\ newspaper_summary,newspaper_keywords,boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,\ boilerpipe_largestcontent,boilerpipe_numwordsrules,boilerpipe_canola,crawl_source) cursor.execute(in_sql, content) except: logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
def get_article_text(self, url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
def parse_content(self, response):
    # Detailed parsing of a single news page
    ID = 'songtengteng'

    website_name = '商务部贸易救济调查局'

    # site section
    website_block = response.xpath(
        "//div[@class='position']/a[2]/text()").extract_first()

    news_url = response.meta['url']

    # author
    news_author_list = response.xpath('//script')
    if len(news_author_list) != 0:
        news_author = news_author_list.re(
            'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace('"', '')
    else:
        news_author = '商务部贸易救济调查局'

    # publish time, normalized to the format: YYYY MM DD HH:Mi:SS
    publish_time = response.xpath('//script').re(
        'v.{2}\stm\s=\s\".*\"')[0][9:].replace('"', '')
    year = publish_time[0:4]
    month = publish_time[5:7]
    day = publish_time[8:10]
    juti_time = publish_time[-8:]
    publish_time = year + month + day + ' ' + juti_time

    # tags provided by the news page
    news_tags = response.xpath('//script').re(
        'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

    # news title
    news_title = response.xpath('//h3/text()').extract_first()

    # news body
    a = Article(response.url, language='zh')  # Chinese
    a.download()
    a.parse()
    news_content = a.text

    # collect the article's images and their names
    image_urls = []
    image_names = []
    image_urls1 = response.xpath(
        '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
    ).extract()
    if image_urls1 != []:
        image_urls = image_urls1
    for i in range(len(image_urls)):
        if i < 10 and i >= 0:
            image_name = news_title + '_000' + str(i)
            image_names.append(image_name)
        elif i < 100 and i >= 10:
            image_name = news_title + '_00' + str(i)
            image_names.append(image_name)
        elif i < 1000 and i >= 100:
            image_name = news_title + '_0' + str(i)
            image_names.append(image_name)
        else:
            image_name = news_title + str(i)
            image_names.append(image_name)

    yield self.getItem(
        id=ID,
        news_url=news_url,
        website_name=website_name,
        website_block=website_block,
        news_title=news_title,
        publish_time=publish_time,
        news_author=news_author,
        news_tags=news_tags,
        news_content=news_content,
        image_urls=image_urls,
        image_names=image_names,
    )
def parse_artical(self, response):
    # Detailed article parsing
    ID = 'songtengteng'

    # news link
    news_url = response.meta['url']

    # news title
    news_title = response.xpath('//h1/text()').extract_first()

    # author
    a = response.xpath(
        '//div[@class="info-source"]/span/a/text()').extract_first()
    if a == None:
        news_author = ''
    else:
        news_author = a

    # publish time
    publish_time = response.xpath(
        '//div[@class="info-source"]/span[2]/text()').extract_first()
    if publish_time != None:
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-5:]
        publish_time = year + month + day + ' ' + juti_time + ':' + '00'
    else:
        publish_time = response.xpath(
            '//*[@id="bd-left"]/div[2]/div[1]/div[1]/div[1]/span[2]/text()'
        ).extract_first()
        if publish_time != None:
            year = publish_time[0:4]
            month = publish_time[5:7]
            day = publish_time[8:10]
            juti_time = publish_time[-5:]
            publish_time = year + month + day + ' ' + juti_time + ':' + '00'

    # body text
    '''A text-density algorithm could be used here to extract the article body quickly.'''
    a = Article(response.meta['url'], language='zh')  # Chinese
    a.download()
    a.parse()
    news_content = a.text

    # tags
    news_tags = ''

    # images
    image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
    image_urls = []
    image_names = []
    if image_urls1 != []:
        for i in range(len(image_urls1)):
            image_url = image_urls1[i]
            image_urls.append(image_url)
            if i >= 0 and i < 10:
                image_title = news_title + '000' + str(i)
            elif i >= 10 and i < 100:
                image_title = news_title + '00' + str(i)
            elif i >= 100 and i < 1000:
                image_title = news_title + '0' + str(i)
            else:
                image_title = news_title + str(i)
            image_names.append(image_title)

    yield self.getItem(id=ID,
                       news_url=news_url,
                       website_name='搜狐焦点',
                       website_block='市场',
                       news_title=news_title,
                       publish_time=publish_time,
                       news_author=news_author,
                       news_tags=news_tags,
                       news_content=news_content,
                       image_urls=image_urls,
                       image_names=image_names)
def get_text_date(url): try: article = Article(url) article.download() if "Noticia servida automáticamente por la Agencia EFE" in article.html: return None, None article.html = re.sub(r"\n+", " ", article.html) article.html = re.sub( r"<blockquote class=\"twitter-tweet\".+?</blockquote>", "", article.html) article.html = re.sub( r"<blockquote class=\"instagram-media\".+?</blockquote>", "", article.html) article.html = re.sub( r"<blockquote class=\"tiktok-embed\".+?</blockquote>", "", article.html) article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "", article.html) #article.html = re.sub(r"<h2 class=\"mce\">·.+?</p>", "", article.html) # subtitulares de vertele article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html) article.html = re.sub( r"<p><em>Si alguien te ha reenviado esta carta.+?</em></p>", "", article.html) # Matrioska de verne article.html = re.sub( r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?ante la enfermedad</a></p>", "", article.html) # El Pais nuevo pie coronavirus article.html = re.sub( r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?sobre la pandemia.*?</p>", "", article.html) # El Pais viejo pie coronavirus article.html = re.sub(r"<p class=\"\">.*?Suscríbase aquí.*?</p>", "", article.html) # newsletter El País article.html = re.sub(r"<a[^>]+>Apúntate a .*?</a>", "", article.html) # newsletter 20 minutos article.html = re.sub(r"<p[^>]+>Apúntate a .*?</p>", "", article.html) # newsletter 20 minutos article.html = re.sub( r"<span class=\"datos-articulo\".+?</div><p class=\"enviaremailerr captcha\">", "", article.html) article.html = re.sub(r"<aside class=\"modulo temas\".+?</aside>", "", article.html) article.html = re.sub(r"Si quieres seguir recibiendo.+?</p>", "", article.html) article.html = re.sub(r"<p class=\"siguenos_opinion\">.+?</p>", "", article.html) article.html = re.sub(r"<p><a.+?<em>playlists</em> de EL PAÍS</a></p>", "", article.html) article.html = re.sub(r"<section class=\"more_info .+?</section>", "", article.html) article.html = re.sub(r"<span class=\"EPS-000.+?eps</span>", "", article.html) article.html = re.sub( r"<span class=\"f_a | color_black uppercase light.+?</span>", "", article.html) article.html = re.sub(r"<i>Puedes seguir a .+?[nN]ewsletter.?</i>", "", article.html) # pie de Materia article.html = re.sub(r"Puedes seguir a .+?(<i>)? *[nN]ewsletter</a>", "", article.html) # pie de Materia article.html = re.sub( r"<i>Puedes seguir a .+?(<i>)? 
*[nN]ewsletter</i></a>", "", article.html) # pie de Materia article.html = re.sub( r"<i>Puedes escribirnos a .+?[Nn]ewsletter</i></a>", "", article.html) # pie de Materia nuevo article.html = re.sub(r"<p><em><strong>¿Nos ayudas?.+?</p>", "", article.html) # Kiko Llaneras article.html = re.sub( r"<p class=\"nota_pie\".+?a nuestra <em>newsletter</em>\.?(</span>)*</p>", "", article.html) # pie de Planeta Futuro article.html = re.sub( r"<i>Puedes escribirnos a.+?<i>[nN]ewsletter</i></a>", "", article.html) # pie de Materia article.html = re.sub(r"<p class=" "><i>Puedes escribirnos a.+?</p>", "", article.html) article.html = re.sub( r"<i>Lee este y otros reportajes.+?con EL PAÍS.</i>", "", article.html) # pie Buenavida EL PAIS article.html = re.sub( r"<h3 class=\"title-related\">.+?</div>", "", article.html) # noticias relacionadas en El Confi article.html = re.sub( r"<button.+?</button>", "", article.html) # botones de compartir en elpais icon article.html = re.sub(r"<p class=\"g-pstyle.+?</p>", "", article.html) article.html = re.sub(r"<p class=\"nota_pie\">.+?</p>", "", article.html) article.html = re.sub(r"<strong>Apúntate a la .+?</strong>", "", article.html) article.html = re.sub(r"<p><strong>O súmate a .+?</strong></p>", "", article.html) #article.html = re.sub(r"<h2.*?>¿En qué se basa todo esto\?</h2>.*</div>", "", article.html) article.html = re.sub( r"<strong>Más en tu mejor yo</strong>: <a.*?</a>", "", article.html) article.html = re.sub(r"<p class=\"article-text\"> +<a.*?</a>", "", article.html) article.html = re.sub( r"<span>Este sitio web utiliza cookies propias.+?</span>", "", article.html) article.html = re.sub(r"\[LEER MÁS:.+?\]", "", article.html) article.html = re.sub(r"<div id=\"post-ratings-.+?Cargando…</div>", "", article.html) # rating EFE article.html = re.sub( r"<div id=\"div_guia\" class=\"guia\" itemprop=\"alternativeHeadline\">.+?</div>", "", article.html) # subtitulo EFE article.html = re.sub( r"<div class=\"f f__v video_player.+?</div></div></div>", "", article.html) article.html = article.html.replace("<em class=\"mce\">", "<em>") article.html = re.sub("([^ ])<em>", "\g<1> <em>", article.html) article.html = article.html.replace("<em> ", "<em>") article.html = re.sub("([^ ])<i>", "\g<1> <i>", article.html) article.html = article.html.replace("<i> ", "<i>") article.html = article.html.replace(" </em>", "</em>") #article.html = re.sub("</em>([^ \W])", "</em> \g<1>", article.html) article.html = re.sub("</em>([^\s\.,;:])", "</em> \g<1>", article.html) article.html = article.html.replace(" </i>", "</i>") article.html = re.sub("</i>([^\s\.,;:])", "</i> \g<1>", article.html) article.html = article.html.replace("<em>", "'") article.html = article.html.replace("</em>", "'") article.html = article.html.replace("<i>", "'") article.html = article.html.replace("</i>", "'") article.parse() """ if article.meta_description: article.text = article.meta_description + "\n\n" + article.text """ return article.text, article.publish_date except newspaper.article.ArticleException: return None, None
class PageReaderBase:
    def __init__(self, url, lang="en"):
        self.url = url
        self.lang = lang
        self.article = None
        self.text_property = "text"
        self.title_property = "title"
        self.authors_property = "authors"
        self.publish_date_property = "publish_date"
        self.html_property = "raw_html"
        self.dom_property = "doc"

    def _read(self):
        if self.article is None:
            if self.lang is None:
                self.article = Article(self.url)
            else:
                self.article = Article(self.url, language=self.lang)
            try:
                self.article.download()
                self.article.parse()
            except:
                logger.info(
                    "failed when loading article content for {}\nError: {}".format(
                        self.url, traceback.format_exc()))
        return self.article

    def _get(self, key):
        article = self._read()
        if article is None:
            return None
        data = article.__getattribute__(key)
        return data

    def main_text(self):
        text = self._get(self.text_property)
        if len(text) == 0:
            logger.info("No content has been fetched for {}".format(self.url))
            return None
        return text

    def title(self):
        return self._get(self.title_property)

    def authors(self):
        authors = self._get(self.authors_property)
        if authors is None:
            authors = []
        site_authors = read_site_authors(self.url, self._get(self.dom_property))
        authors.extend(site_authors)
        return authors

    def publish_date(self):
        return self._get(self.publish_date_property)

    def html(self):
        return self._get(self.html_property)

    def page_title(self):
        dom_tree = self._get(self.dom_property)
        if dom_tree is not None:
            title = dom_tree.findtext(".//title")
            return title or ""
        return ""
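A short usage sketch for the PageReaderBase class above; the URL is hypothetical, and the helpers it depends on (logger, read_site_authors) are assumed to be defined elsewhere in the same module:

# Hypothetical usage of PageReaderBase; the URL is illustrative only.
reader = PageReaderBase("https://example.com/some-news-story", lang="en")
print(reader.title())         # parsed headline, or None if the fetch failed
print(reader.publish_date())  # datetime extracted by newspaper, if any
body = reader.main_text()     # full article text, or None when nothing was fetched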
def get_bot_response(): while True: userText = request.args.get('msg') msg = str(userText) entrada = msg.lower() f = csv.writer(open('inputs.csv', 'a', encoding='utf-8')) f.writerow([msg]) response = searchbot.get_response(userText) if float(response.confidence) >= 0.7: return str(searchbot.get_response(userText)) elif userText == str('NÃO'): return str('Refaça a pergunta, por favor!') elif userText == str("SIM"): return str("Agradecemos o seu contato") elif float(response.confidence) == 0.0: entrada = msg # print(entrada) p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText=' p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min' html = str(p1 + entrada + p2) stop2 = nltk.corpus.stopwords.words('portuguese') stop2.append('faço') stop2.append('um') stop2.append('gostaria') stop2.append('fazer') stop2.append('saber') stop2.append('posso') stop2.append('como') splitter = re.compile('\\W+') lista_palavras = [] lista = [p for p in splitter.split(entrada) if p != ''] for p in lista: if p not in stop2: if len(p) > 1: lista_palavras.append(p) ar = len(lista_palavras) ax = str(lista_palavras[0:ar]) e = str(ax).replace(',', ' ').strip('[]') e.strip("'") headers = {'User-Agent': 'Mozilla/5.0'} page = requests.get(html, headers=headers, verify=False, stream=False, timeout=5) soup = BeautifulSoup(page.content, 'lxml') cla = soup.find(class_='searchResults') links = cla.find_all('a') # namess = soup.find_all('a') # ra = (lista_palavras) # CRIAR A LISTA DE LINKS SITE RFB listr = [] for link in links: texto = str(link.get_text()).lower().replace('ã', 'a').replace('-', ' ').replace('ç', 'c').split() # print(len(texto)) url = str(link.get('href')) # print(len(url)) urls = str(link.get('href')).lower().replace('/', ' ').replace('-', ' ').replace('.', ' ').split() # print(len(urls)) if entrada in texto: listr.append(url) for i in range(0, ar): if lista_palavras[i] in texto: listr.append(url) elif lista_palavras[i] in urls: listr.append(url) listag = [] rec = 'site:receita.economia.gov.br intitle:' + msg + " -filetype:pdf -.pdf" for urla in search(rec, tld='com.br', lang='pt-br', stop=4, pause=5): listag.append(urla) g = int(len(listag)) # print(g) listago = [] for z in range(0, g): ur = str(listag[z]) listago.append(ur) # print(listago) # print(len(listago)) qo = int(len(listago)) # print(listr) # print(len(listr)) listaunida = listago + listr conj = list(set(listaunida)) # print(conj) # print(len(conj)) # print(type(conj)) # print(p) # print(len(p)) j = len(conj) reports2 = [] # news_pool.set(reports2)#, threads_per_source=2) # news_pool.join() for r in range(0, j): try: ia = str(conj[r]) article = Article(ia, language="pt") article.download() article.parse() article.text article.nlp() article.summary except: pass reports2.append(str(article.summary).replace('\n', ' ')) # print(len(reports2)) resposta_finalc = set(reports2) print(resposta_finalc) if resposta_finalc == set(): wikipedia.set_lang("pt") a = msg result = wikipedia.search(a, results=1) page = wikipedia.summary(result, sentences=5) content = page return str(content) else: resposta_final = ( str(resposta_finalc).replace('\n', ' ').replace('[', ' ').replace(']', ' ').replace(',', ' ').replace( "'", ' ').replace('{', ' ').replace("}", ' ')) f = csv.writer(open('chats.csv', 'a', encoding='utf-8')) f.writerow([msg + '\n' + resposta_final]) return str(resposta_final + '\n' + 'Ficou satisfeito com a resposta? SIM ou NÃO?')
def article(title): global articlePageList global articlePageListRec global firstTime neededUrl = '' neededImgUrl = '' indexOfArticleCategory = 0 flag = 0 flag2 = 0 pagesize = 5 if flag2 == 0: for articleList in articlePageList: for item in articleList: if item['title'] == title: neededUrl = item['url'] neededImgUrl = item['urlToImage'] flag = 1 flag2 = 1 break if flag == 1: break indexOfArticleCategory += 1 print(indexOfArticleCategory) #nltk.download('punkt') if flag2 == 0: indexOfArticleCategory = 0 for articleList in articlePageListRec: for item in articleList: if item['title'] == title: neededUrl = item['url'] neededImgUrl = item['urlToImage'] flag = 1 flag2 = 0 pagesize = 3 break if flag == 1: articlePageList = articlePageListRec break indexOfArticleCategory += 1 print(indexOfArticleCategory) url = neededUrl article = Article(url) article.download() try: article.parse() except: neededImgUrl = "notPresent" article.nlp() summary = article.summary movies = article.movies publishDate = article.publish_date if publishDate != None: dateStr = publishDate.strftime('%d, %B %Y') else: dateStr = '-' if movies == []: movies = '' if neededImgUrl == None: neededImgUrl = "notPresent" ### Recommendations ### listofpreff = [] articlePageListRec = [] global zipper if firstTime == 1: if session: uid = session['uid'] connection = pymysql.connect(host='localhost', user='******', password='', db='allinonenews') with connection.cursor(pymysql.cursors.DictCursor) as cur: sql = "SELECT * FROM prefferences WHERE id = %s" result = cur.execute(sql, (uid)) connection.commit() if result > 0: # Get stored hash preff = cur.fetchall() for i in preff: listofpreff = listofpreff + [i['category']] for prefference in listofpreff: url = 'https://newsapi.org/v2/everything?language=en&pageSize=3&page=1&q=' + prefference + '&apiKey=097f0f6fb89b43539cbaa31372c3f92d' r = requests.get(url) articlePageListRec.append(r.json()['articles']) cur.close() zipper = zip(articlePageListRec, listofpreff) return render_template('article.html', summary=summary, title=title, index=indexOfArticleCategory, neededImgUrl=neededImgUrl, movies=movies, date=dateStr, articleUrl=url, jso=articlePageList, zipper=zipper, pagesize=pagesize)
def scrapeAnalyse(url, isGeneral, keywords): nltk.download('punkt') if (isGeneral): all_data = [] headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' } # ------ Google News ------ response = requests.get("https://news.google.com/search?q=" + keywords, headers=headers) soup = BeautifulSoup(response.content, "html.parser") for index, link in enumerate( soup.findAll('div', attrs={'class': 'NiLAwe'})): if index >= 5: break children = link.findChildren('a', recursive=False) for child in children: news_url = child.get('href') article = Article("https://www.news.google.com" + news_url[1:]) article.download() article.parse() date = None if article.publish_date == None: date = time.time() else: date = article.publish_date.timestamp() el = { "text": article.text, "date": date, "url": "https://www.news.google.com" + news_url[1:] } all_data.append(el) # ------ Yahoo News ------ # response = requests.get("https://news.search.yahoo.com/search?p=" + keywords, headers=headers) # soup = BeautifulSoup(response.content, "html.parser") # for index, link in enumerate(soup.findAll('h4', attrs={'class':'fz-16 lh-20'})): # if index >= 0: # break # children = link.findChildren('a', recursive=False) # for child in children: # news_url = re.sub("\/RV=2.*", "", child.get('href')) # article = Article(news_url) # article.download() # article.parse() # el = {"text": article.text, "date": article.publish_date, "url": news_url} # all_data.append(el) # ------ Bing News ------ response = requests.get("https://www.bing.com/news/search?q=" + keywords, headers=headers) soup = BeautifulSoup(response.content, "html.parser") for index, link in enumerate( soup.findAll('div', attrs={'class': 'news-card newsitem cardcommon'})): if index >= 5: break news_url = link.get('url') article = Article(news_url) article.download() article.parse() date = None if article.publish_date == None: date = time.time() else: date = article.publish_date.timestamp() el = {"text": article.text, "date": date, "url": news_url} all_data.append(el) # all_text = "".join(all_text) # all_text = "".join(x for x in all_text if x in printable) return all_data else: article = Article(url) article.download() article.parse() article.nlp() keywords = article.keywords date = None if article.publish_date == None: date = time.time() else: date = article.publish_date.timestamp() return (article.text, "+".join(keywords), date)
def __get_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
def get_sentiment(self,start='2019-03-29',end=datetime.now().strftime("%Y-%m-%d")): """Gets daily sentiment using news sources from Google News for the specified time range""" start=datetime.strptime(start,"%Y-%m-%d").strftime("%m/%d/%Y") # puts start time in GoogleNews format end=datetime.strptime(end,"%Y-%m-%d").strftime("%m/%d/%Y") # puts end time in GoogleNews format googlenews=GoogleNews(lang='en',start=start,end=end,encode='utf-8') # creating object for collecting news googlenews.search(tick_map[self.ticker[0]]) # specifying the company # Getting Google Results for i in range(1,50): googlenews.getpage(i) # loops through google pages result=googlenews.result() # stores results df=pd.DataFrame(result) # appends results to DataFrame df.drop_duplicates(['link'],keep='first',inplace=True) # removes duplicate articles via links # Collecting Text From Articles L=[] # initializing empty list for ind in df.index: try: # "try" for forbidden websites D={} # initializing the dictionary article = Article(df['link'][ind]) # extracting information from articles article.download() article.parse() article.nlp() D['Date']=df['datetime'][ind] # storing information from articles D['Media']=df['media'][ind] D['Title']=article.title D['Article']=article.text D['Summary']=article.summary L.append(D) # appending results to list except: pass news_df=pd.DataFrame(L) # make DataFrame from list #Preliminary Cleaning news_df1=news_df.dropna(axis=0) # dropping old "date" column news_df2=news_df1[news_df1['Media']!=""].set_index('Date').sort_index(ascending=True) # remove articles with no media source news_df2=news_df2[news_df2['Article'].values!=""] # remove articles with no content # Making time format %Y-%m-%d and Additional Cleaning new_time_format=list(pd.Series(news_df2.index).apply(lambda DATE :DATE.strftime("%Y-%m-%d")).values) # string form of new time format new_time_format=[datetime.strptime(DATE,"%Y-%m-%d") for DATE in new_time_format] # datetime form of new time format news_df2.index=new_time_format # apply new time format news_df2.drop(columns=['Summary','Title'],inplace=True) # dropping columns news_df2=Generic_Parser_Mod.LM_sentiment(news_df2) # DataFrame of sentiment scores # Handling of Duplicated Entries duplicate_index=news_df2.index[news_df2.index.duplicated()] # identify duplicate time entries collapsed_dates=list(duplicate_index.unique()) # collapsing duplicate dates news_df3=[news_df2.loc[collapsed_dates[i]].median() for i in range(len(collapsed_dates))] # collapsing info in duplicate entries news_df3=pd.DataFrame(news_df3) # DataFrame of collapsed info news_df3.index=collapsed_dates # new collapsed info #Making new DataFrame without Duplicates news=news_df2.loc[[news_df2.index[i] not in duplicate_index for i in range(len(news_df2.index))]].append(news_df3,sort=False) # Post-Cleaning, due to unstable nature of API news=news.loc[start:end] # only articles from selected period news.sort_index(ascending=True,inplace=True) # order by date news.to_csv(f"Sentiment_Data/{self.ticker[0]}_scores.csv",index='date') # storing the sentiment data return news # return sentiment scores
def set_text(self):
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch',
            self.article.url)

    @print_test
    def test_download_html(self):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        self.setup_stage('parse')
        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        self.article.parse()
        self.article.nlp()
        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))
        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)
        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571},
                        'card': 'summary',
                        'creator': {'identifier': '@cnntravel', 'id': 174377718}},
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        self.assertDictEqual(META_DATA, meta)
        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        self.assertTrue(all([len(d) > 0 for d in dict_values]))
        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        self.assertEqual(5, len([i for i in meta.values() if is_dict(i)]))
        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        self.assertEqual(12, len([i for i in meta.values() if is_string(i)]))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = ['balloons', 'delays', 'flight', 'forecasters', 'good',
                    'sailing', 'smooth', 'storm', 'thanksgiving', 'travel',
                    'weather', 'winds', 'york']
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        self.assertCountEqual(KEYWORDS, self.article.keywords)
def get_Table_html(dataframe, recent_articles=None, titles_show=None, max_rows=10, styling=None): """ building html table with appropriate formatting and styling for both authors and content suggestions """ rows = [] for i in range(min(len(dataframe), max_rows)): row = [] for col in dataframe.columns: if (col == 'Author_wn') or (col == 'Similarity (0-10)'): continue value = dataframe.iloc[i][col] # update this depending on which # columns you want to show links for # and what you want those links to be if col == 'Suggested articles': try: if titles_show != None: title_curr = titles_show[value] if not isinstance(title_curr, str): article = Article(value) article.download() article.parse() title_curr = article.title else: article = Article(value) article.download() article.parse() title_curr = article.title cell = html.Td( html.A( href=value, children=title_curr, target='TargetArticle', )) # style={'color':'white', 'textDecoration': 'underline'})) except: cell = html.Td(children=value) print(value) elif col == 'Authors': try: path_link = recent_articles[dataframe.iloc[i] ['Author_wn']]['links'][0] # path_link = 'https://muckrack.com/' + dataframe.iloc[i]['Author_wn'] cell = html.Td( html.A( href=path_link, children=value, target='TargetArticle', )) # style={'color':'white', 'textDecoration': 'underline'})) except: cell = html.Td(children=value) print(value) else: cell = html.Td(children=value) row.append(cell) rows.append(html.Tr(row)) return html.Table( # Header # [html.Tr([html.Th(col) for col in dataframe.columns if (col == 'Author_wn') or (col == 'Similarity (0-10)')])] + rows, style=styling)
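# get_Table_html above re-downloads and re-parses an article on every table
# render just to recover its title; a minimal sketch of a cached title lookup,
# assuming the same newspaper Article API (fetch_title is an illustrative
# helper, not part of the original code):
from functools import lru_cache
from newspaper import Article

@lru_cache(maxsize=256)
def fetch_title(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.title
    except Exception:
        # Fall back to the raw URL if download or parse fails
        return url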
def Import_Data(url):
    """Using Newspaper.py, fetches body text from the given URL,
    returning said text, as well as the article object"""
    a = Article(url)
    a.download()
    a.parse()
    return a.text, a
def get_article(): url = None url = request.args.get('url', type=str) if url == None: return 'url parameter is required', 400 article = Article(url) article.download() if (article.download_state == 2): article.parse() article_dict = {} article_dict['status'] = 'ok' article_dict['article'] = {} article_dict['article']['source_url'] = article.source_url try: guess = guess_date(url=url, html=article.html) article_dict['article']['published'] = guess.date article_dict['article']['published_method_found'] = guess.method article_dict['article']['published_guess_accuracy'] = None if guess.accuracy is Accuracy.PARTIAL: article_dict['article']['published_guess_accuracy'] = 'partial' if guess.accuracy is Accuracy.DATE: article_dict['article']['published_guess_accuracy'] = 'date' if guess.accuracy is Accuracy.DATETIME: article_dict['article'][ 'published_guess_accuracy'] = 'datetime' if guess.accuracy is Accuracy.NONE: article_dict['article']['published_guess_accuracy'] = None except: article_dict['article']['published'] = article.publish_date article_dict['article']['published_method_found'] = None article_dict['article']['published_guess_accuracy'] = None article_dict['article']['title'] = article.title article_dict['article']['text'] = article.text article_dict['article']['authors'] = list(article.authors) try: title_lang = detect(article.title) except: title_lang = None try: text_lang = detect(article.text) except: text_lang = None article_dict['article']['images'] = list(article.images) article_dict['article']['top_image'] = article.top_image article_dict['article']['meta_image'] = article.meta_img article_dict['article']['movies'] = list(article.movies) article_dict['article']['meta_keywords'] = list(article.meta_keywords) article_dict['article']['tags'] = list(article.tags) article_dict['article']['meta_description'] = article.meta_description article_dict['article']['meta_lang'] = article.meta_lang article_dict['article']['title_lang'] = str(title_lang) article_dict['article']['text_lang'] = str(text_lang) article_dict['article']['meta_favicon'] = article.meta_favicon return jsonify(article_dict) else: article_dict = {} article_dict['status'] = 'error' article_dict['article'] = article.download_exception_msg return jsonify(article_dict)
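# The endpoint above compares article.download_state against the magic number 2;
# a minimal sketch of the same check written against the named constant, assuming
# newspaper3k exposes ArticleDownloadState (where SUCCESS == 2):
from newspaper import Article
from newspaper.article import ArticleDownloadState

def download_ok(url):
    article = Article(url)
    article.download()
    # True only when the HTML was fetched successfully
    return article.download_state == ArticleDownloadState.SUCCESS, article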
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_download_nlp()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all cases have been completed, intended to
        free resources and etc
        """
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_download_html(self):
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        assert len(self.article.html) == 75175

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        self.article.parse()
        self.article.nlp()
        text = mock_resource_with('cnn', 'txt')
        assert self.article.text == text
        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        assert self.article.top_img == TOP_IMG
        assert sorted(self.article.authors) == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    def test_meta_type_extraction(self):
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    def test_meta_extraction(self):
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571},
                        'card': 'summary',
                        'creator': {'identifier': '@cnntravel', 'id': 174377718}},
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        assert meta == META_DATA
        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        assert all([len(d) > 0 for d in dict_values])
        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        assert len(list(filter(is_dict, list(meta.values())))) == 5
        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        assert len(list(filter(is_string, list(meta.values())))) == 12

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        new_article = Article(self.article.url)
        html = mock_resource_with('cnn_article', 'html')
        new_article.download(html)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_nlp_body(self):
        KEYWORDS = ['balloons', 'delays', 'flight', 'forecasters', 'good',
                    'sailing', 'smooth', 'storm', 'thanksgiving', 'travel',
                    'weather', 'winds', 'york']
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        assert self.article.summary == SUMMARY
        assert sorted(self.article.keywords) == sorted(KEYWORDS)
###### Yesterday's Post Content ######
# Importing libraries
from newspaper import Article

# Receiving the article url
url = 'https://edition.cnn.com/2021/04/24/politics/inequality-biden-100-days/index.html'

# Assigning the Article class to a variable
artigo = Article(url)
artigo.download()  # Downloads the article
artigo.parse()     # Scrapes the page, extracting the article
artigo.nlp()       # Extracts the summary & keywords

# Gets the title content
titulo = artigo.title  # TITLE TEXT

# Gets the summary content
conteudo_texto = artigo.summary  # SUMMARY TEXT
###### Yesterday's Post Content ######

###### Today's Post Content ######
# installing libraries
# pip install -U textblob
# python -m textblob.download_corpora
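# The "Today's Post Content" section above stops at the install commands; a
# minimal sketch of where it appears to be heading, assuming the goal is
# TextBlob sentiment analysis on the summary extracted above (this is an
# assumption, not something stated in the original snippet):
from textblob import TextBlob

blob = TextBlob(conteudo_texto)        # reuse the summary extracted above
print(blob.sentiment.polarity)         # -1.0 (negative) .. 1.0 (positive)
print(blob.sentiment.subjectivity)     # 0.0 (objective) .. 1.0 (subjective)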
def get_news(message):
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as p
    # Cleaning the texts
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from textblob import TextBlob
    lemmatizer = WordNetLemmatizer()
    corpus = []
    bow = []
    nltk.download('wordnet')
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()  # converts all characters to lowercase
    review = review.split()  # splits the sentence into a list
    lemmatizer = WordNetLemmatizer()
    review = [lemmatizer.lemmatize(word, pos="v") for word in review
              if not word in set(stopwords.words('english'))]  # removal of stopwords
    review = ' '.join(review)  # converting the list back into a sentence
    corpus.append(review)  # creating a list of sentences
    bow.append(review.split(" "))  # creating a list of words in each sentence
    bowa = review.split()
    bowb = set(bowa)
    worddict = dict.fromkeys(bowb, 0)

    # SENTIMENT ANALYSIS
    def clean_text(inp):
        '''
        Utility function to clean text by removing links and special
        characters using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", inp).split())

    def get_text_sentiment(inp):
        '''
        Utility function to classify the sentiment of passed text
        using textblob's sentiment method.
        '''
        analysis = TextBlob(clean_text(inp))
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def get_texts(inp):
        text_sentl = []
        for t in inp:
            text_sent = {}
            text_sent['text'] = t
            text_sent['sentiment'] = get_text_sentiment(t)
            text_sentl.append(text_sent)
        return text_sentl

    # finding the frequency of words in each sentence
    for word in bowa:
        worddict[word] += 1

    # computing the term frequency
    def computeTF(wordDict, bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
            tfDict[word] = count / float(bowCount)
        return tfDict

    tfBowA = computeTF(worddict, bowa)

    from collections import Counter
    # Initial Dictionary
    k = Counter(tfBowA)
    # Finding the 10 highest-frequency words
    high = k.most_common(10)
    # print(high, "\n")
    sentence = []
    for i in high:
        sentence.append(i[0])

    def get_cosine_sim(*strs):
        vectors = [t for t in get_vectors(*strs)]
        d1 = np.array([vectors[0]])
        d2 = np.array([vectors[1]])
        return cosine_similarity(d1, d2)

    def get_vectors(*strs):
        text = [t for t in strs]
        vectorizer = CountVectorizer(text)
        vectorizer.fit(text)
        return vectorizer.transform(text).toarray()

    # SCRAPING
    from googlesearch import search
    from newspaper import Article
    links = list()
    sentence = ' '.join(sentence)
    query = sentence
    print(query)
    for j in search(query, tld="com", num=10, start=0, stop=10, pause=2.0):
        # print(j)
        links.append(j)

    global pos
    global neg
    global nu

    # GETS THE ARTICLES FROM THEIR LINKS
    flag = 0
    for k in links:
        if ((k[:20] == "https://timesofindia") | (k[:18] == "https://www.news18")
                | (k[:26] == "https://www.hindustantimes") | (k[:21] == "https://indianexpress")
                | (k[:20] == "https://www.livemint") | (k[:21] == "https://economictimes")
                | (k[:22] == "https://www.indiatoday") | (k[:20] == "https://gadgets.ndtv")
                | (k[:24] == "https://www.timesnownews") | (k[:19] == "https://edition.cnn")
                | (k[:15] == "https://www.bbc") | ("washingtonpost" in k) | ("theguardian" in k)
                | ("news.com.au" in k) | ("abc.net.au" in k) | ("www.nytimes" in k)
                | ("www.bloomberg" in k) | ("www.dailymail" in k) | ("www.newyorker" in k)
                | ("www.mirror.co" in k) | ("www.telegraph.co" in k) | ("news.sky" in k)
                | ("wikipedia.org" in k)):
            # A new article from one of the accepted sources
            url = k
            # For a newspaper in a different language, refer to the table above
            article = Article(url, language="en")  # en for English
            # To download the article
            article.download()
            # To parse the article
            article.parse()
            # To perform natural language processing, i.e. nlp
            article.nlp()
            # CHECKING SENTIMENT
            temp = (article.text).split('\n')
            file = open(r"C:\Users\Saksham\Desktop\article.txt", "a+", encoding="utf-8")
            file.writelines(temp)
            file = open(r"C:\Users\Saksham\Desktop\article.txt", "r", encoding="utf-8")
            t = file.read()
            text = [t]
            textinp = get_texts(text)
            for i in textinp:
                print(i['sentiment'])
                if (i['sentiment'] == "positive"):
                    pos = pos + 1
                elif (i['sentiment'] == "negative"):
                    neg = neg + 1
                else:
                    nu = nu + 1
            file = open(r"C:\Users\Saksham\Desktop\article.txt", "w", encoding="utf-8")
            # FINDING THE COSSIM VALUE
            message2 = article.text
            from sklearn.feature_extraction.text import CountVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            cossim = get_cosine_sim(message, message2)
            if (cossim < 0.75):
                lines = message2.split('.')
                for line in lines:
                    cossim = get_cosine_sim(message, line)
                    cossim = cossim[0][0]
                    if (cossim > 0.75 or cossim > 0.4):
                        break
    if (pos > neg and pos > nu):
        sent = "positive"
    elif (neg > pos and neg > nu):
        sent = "negative"
    else:
        sent = "neutral"
    if (cossim >= 0.6):
        label['text'] = "It is true and similarity co-efficient is:", str(cossim), "sentiment is ", sent
    elif (cossim < 0.6 and cossim > 0.4):
        label['text'] = "Data is insufficient", str(cossim), "sentiment is ", sent
    else:
        label['text'] = "It is false and similarity co-efficient is:", str(cossim), "sentiment is ", sent
def extract_headlines_news(code, headlines_links, cursor):
    '''Extract the Yahoo news links and parse them'''
    in_sql = """ INSERT INTO yahoo_comp_news(url_md5,url,code_name,newspaper_title,newspaper_text,
        newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article,
        boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,
        boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,count)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1)
        on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s,
        newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s,
        boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s,
        boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s,
        up_time = now(),count=count+1
    """
    for link in headlines_links:
        # URLs shorter than 35 characters are generally not news links
        if link is None or len(link) <= 35:
            continue
        try:
            url_md5 = hashlib.md5(link).hexdigest()
            # First extract with newspaper
            newspaper_title = ''
            newspaper_text = ''
            newspaper_authors = ''
            newspaper_summary = ''
            newspaper_keywords = ''
            article = Article(link)
            article.download()
            html = article.html
            if html is None or len(html) == 0:
                continue
            article.parse()
            if article.text and len(article.text) > 0:
                newspaper_title = article.title
                newspaper_text = article.text
                newspaper_authors = article.authors
                if newspaper_authors and len(newspaper_authors) > 0:
                    newspaper_authors = ','.join(newspaper_authors)
                else:
                    newspaper_authors = ''
                article.nlp()
                newspaper_summary = article.summary
                newspaper_keywords = article.keywords
                if newspaper_keywords and len(newspaper_keywords) > 0:
                    newspaper_keywords = ','.join(newspaper_keywords)
                else:
                    newspaper_keywords = ''
            # Then extract with boilerpipe
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            boilerpipe_article = extractor.getText()
            extractor = Extractor(extractor='ArticleSentencesExtractor', html=html)
            boilerpipe_articlesentences = extractor.getText()
            extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
            boilerpipe_keepeverything = extractor.getText()
            extractor = Extractor(extractor='LargestContentExtractor', html=html)
            boilerpipe_largestcontent = extractor.getText()
            extractor = Extractor(extractor='NumWordsRulesExtractor', html=html)
            boilerpipe_numwordsrules = extractor.getText()
            extractor = Extractor(extractor='CanolaExtractor', html=html)
            boilerpipe_canola = extractor.getText()
            # Query parameters
            content = (url_md5, link, code, newspaper_title, newspaper_text,
                       newspaper_authors, newspaper_summary, newspaper_keywords,
                       boilerpipe_article, boilerpipe_articlesentences, boilerpipe_keepeverything,
                       boilerpipe_largestcontent, boilerpipe_numwordsrules, boilerpipe_canola,
                       code, newspaper_title, newspaper_text, newspaper_authors,
                       newspaper_summary, newspaper_keywords, boilerpipe_article,
                       boilerpipe_articlesentences, boilerpipe_keepeverything,
                       boilerpipe_largestcontent, boilerpipe_numwordsrules, boilerpipe_canola)
            cursor.execute(in_sql, content)
        except:
            logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
def calculate_summary(self, list_of_events): """ Returns event_data dictionary containing information for each event. Return Format: { Event1: {article1 :{raw_text:"",url:"",summary:""}, article2:{raw_text:"",url:"",summary:""},... }, Event2: {article1 :{raw_text:"",url:"",summary:""}, article2:{raw_text:"",url:"",summary:""},... }, ..... } """ event_data = defaultdict(lambda: {}) for event in list_of_events: event = self.get_event_as_string(event) conn = httplib.HTTPSConnection('api.cognitive.microsoft.com') params = self.create_params(event) conn.request("GET", "/bing/v5.0/news/search?%s" % params, "{body}", self.headers) response = conn.getresponse() data = response.read() j = json.loads(data) count = 0 article_data = defaultdict(lambda: {}) for x in j['value']: article = Article(x['url'], language='es') article.download() article.parse() article_text = article.text temp = article_text.split('.') raw_article = "" if len(temp) > 5: count += 1 article_txt = [] article_txt.append(event) for sent in temp: new_sent = sent.replace('\n', "") raw_article = raw_article + new_sent + ". " article_txt.append(new_sent) vect = TfidfVectorizer(min_df=1) tfidf = vect.fit_transform(article_txt) mod_vect = (tfidf * tfidf.T).A input_event = mod_vect[0] cosine_vals = {} for i in range(len(mod_vect) - 1): article_sentence = mod_vect[i + 1] cosine_similarity = self.calculate_cosine( input_event, article_sentence) cosine_vals[i + 1] = 1.0 - float(cosine_similarity) sorted_list = sorted(cosine_vals.items(), key=lambda x: x[1]) summary = "" top3 = itertools.islice(sorted_list, 3) for summary_sentence in top3: summary_sentence = article_txt[int( summary_sentence[0])] summary = summary + summary_sentence + u". " data = {} data["url"] = x['url'] data["raw_text"] = raw_article data["summary"] = summary article_data["article" + str(count)] = data event_data[event] = article_data #print event_data conn.close() return event_data
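# A minimal usage sketch for the event_data structure documented in the
# docstring above, assuming an instantiated object exposing calculate_summary
# and a list_of_events variable (both names here are illustrative):
event_data = summarizer.calculate_summary(list_of_events)
for event, articles in event_data.items():
    for article_id, info in articles.items():
        print(event, article_id, info["url"])
        print(info["summary"])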
clean_string = ' '.join(clean_string.split()) return clean_string st.write("Good news! I found some news articles using Google News!") # parse articles data = {} for google_news_article in articles: google_news_article = 'http://' + google_news_article r = requests.get(google_news_article) article_url = r.url try: html = Article(article_url, config=config) html.download() html.parse() website_text = clean_text(html.text) data[article_url] = website_text except Exception as e: pass df = pd.DataFrame(data.items(), columns=['url','website_text']) df.dropna(inplace=True) df = df[df['website_text'].str.len() > 50] st.write("I just finished reading through those articles. They seem interesting!") # nlp pre-processing stop_words=stopwords.words('english')+list(string.punctuation) + ['“','”','–','—','’']