import json
import time

import feedparser
from newspaper import Article
from newspaper.article import ArticleException, ArticleDownloadState


def collect_urls():
    """Collect posts from every feed in rss_list and dump them to per-feed JSON files."""
    for feed in rss_list:
        d = feedparser.parse(feed)
        # TODO: create the file if it doesn't exist; if it does, open it:
        # try:
        #     e = load_obj(d.feed.title)
        # except (OSError, IOError):
        #     save_obj({}, d.feed.title)   # create empty file
        #     e = load_obj(d.feed.title)
        json_file = gen_json(d)
        # TODO: Capture current article ID from sort, and most recent article date.
        #       ID = list size?
        for post in d.entries:
            # TODO: If post date is before the newest article in the file, move to the next feed.
            article = Article(post.link)
            p = {
                "id": post.id,  # check this
                "title": post.title,
                "link": post.link,
                "date": format_date(post.published),  # text field?
            }
            if "https://www.google.com/url?rct" in post.link:
                # Fix Google redirect URLs to recover the true article URL.
                p["link"] = post.link.split("&url=", 1)[1]
            slept = 0
            try:
                article.download()
                while article.download_state == ArticleDownloadState.NOT_STARTED:
                    # Raise if the download state does not change within ~10 seconds.
                    if slept > 9:
                        raise ArticleException('Download never started')
                    time.sleep(1)
                    slept += 1
                article.parse()
                print(sa_everything(article.text))
                p["sa_val"] = sentiment_analysis_helper(article.text)
            except Exception:
                p["sa_val"] = "error"
            json_file.append(p)
        with open(d.feed.title, 'w+') as f:
            json.dump(json_file, f)
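# The download-and-poll pattern in collect_urls() recurs in every snippet below.
# A minimal reusable sketch; the helper name download_with_timeout and the
# timeout parameter are hypothetical and not part of the original code.
import time

from newspaper import Article
from newspaper.article import ArticleDownloadState, ArticleException


def download_with_timeout(url, timeout=10):
    """Download and parse an article, raising if the download never starts."""
    article = Article(url)
    article.download()
    slept = 0
    while article.download_state == ArticleDownloadState.NOT_STARTED:
        if slept >= timeout:
            raise ArticleException('Download never started')
        time.sleep(1)
        slept += 1
    article.parse()
    return article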
def scrape_article(id_num, headline, url):
    """
    Scrape the article for text and headline.

    Input:
        id_num (int)
        headline (string) - headline of the article
        url (string) - url of the article
    Output:
        dict {
            'id': id_num,
            'title': title of the article,
            'text': text of the article,
            'url': url of the article,
            'category': category the article falls under as specified by bbc_categorization
        }
    """
    from time import sleep

    from newspaper import Article
    from newspaper.article import ArticleException, ArticleDownloadState

    article = Article(url)

    # Download the article, polling until the download actually starts.
    slept = 0
    try:
        article.download()
        while article.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise if the download state does not change within ~10 seconds.
            if slept > 9:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        article.parse()
    except Exception:
        # If something goes wrong, return an empty record for this id.
        return {
            'id': id_num,
            'title': None,
            'text': None,
            'url': url,
            'category': None
        }

    try:
        category = bbc_categorization(article.title + ' ' + article.text, id_num)
    except Exception:
        raise AttributeError("issue with bbc_categorization")

    return {
        'id': id_num,
        'title': article.title,
        'text': article.text,
        'url': url,
        'category': category
    }
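# Usage sketch for scrape_article(); the example id, headline, and URL are
# illustrative assumptions, and bbc_categorization must be available at module
# scope as in the original code.
if __name__ == "__main__":
    result = scrape_article(
        id_num=1,
        headline="Example headline",
        url="https://www.bbc.com/news/example",
    )
    if result["text"] is None:
        print("Scrape failed for", result["url"])
    else:
        print(result["title"], "->", result["category"])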
def test_file_feed():
    import time

    import feedparser
    from newspaper import Article
    from newspaper.article import ArticleException, ArticleDownloadState

    for feed in rss_list:
        d = feedparser.parse(feed)
        for post in d.entries:
            article = Article(post.link)
            slept = 0
            try:
                article.download()
                while article.download_state == ArticleDownloadState.NOT_STARTED:
                    # Raise if the download state does not change within ~10 seconds.
                    if slept > 9:
                        raise ArticleException('Download never started')
                    time.sleep(1)
                    slept += 1
                article.parse()
                print(sa_everything(article.text))
            except Exception as err:
                print("Error:", err)
def scrape_url(category_list, url_list):
    """
    :param category_list: list of categories, parallel to url_list
    :param url_list: list of urls to scrape from the web
    :return: nothing; writes a JSON file containing the article text of each url
    """
    for i, url in enumerate(url_list):
        article_huff = Article(url)  # iterate through each article link
        slept = 0
        article_huff.download()
        while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise if the download state does not change within ~14 seconds.
            if slept > 13:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        article_huff.parse()
        article_info_huff = {
            "category": category_list[i],
            "title": article_huff.title,
            "text": article_huff.text,
        }
        file_name = "../huffpostarticles/huffpost" + str(i) + ".json"
        with io.open(file_name, "w", encoding="utf-8") as f:
            f.write(json.dumps(article_info_huff))
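# Usage sketch for scrape_url(); the categories and URLs below are placeholders,
# and the ../huffpostarticles/ output directory is assumed to exist already.
categories = ["politics", "technology"]
urls = [
    "https://www.huffpost.com/entry/example-politics-story",
    "https://www.huffpost.com/entry/example-tech-story",
]
scrape_url(categories, urls)  # writes huffpost0.json, huffpost1.json, ...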
def wf_summarize(url, summary_length):
    """
    :param url: url to scrape from the web
    :param summary_length: length of the summary as a percentage
    :return: dict with the article title and the ranked summary
    """
    article_huff = Article(url)
    slept = 0
    article_huff.download()
    while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
        # Raise if the download state does not change within ~14 seconds.
        if slept > 13:
            raise ArticleException('Download never started')
        sleep(1)
        slept += 1
    n = int(summary_length)
    article_huff.parse()
    news_text = article_huff.text
    summarizer = ExtractiveTextSummarizer()
    summary = summarizer.summary_ranking(news_text, n)
    return {"title": article_huff.title, "summary": summary}
def summarize(url, summary_length):
    """
    :param url: url to scrape from the web
    :param summary_length: length of the summary as a percentage
    :return: dict with the article title and the generated summary
    """
    # url = url.strip("https://")
    # print(url)
    article_huff = Article(url)
    slept = 0
    article_huff.download()
    while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
        # Raise if the download state does not change within ~14 seconds.
        if slept > 13:
            raise ArticleException('Download never started')
        sleep(1)
        slept += 1
    article_huff.parse()
    summarizer = ExtractiveTextSummarizer()
    summary = summarizer.create_summary(article_huff.text, summary_length)
    return {"title": article_huff.title, "summary": summary}
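# Usage sketch for summarize(); the URL and summary length are placeholders,
# and ExtractiveTextSummarizer is assumed to be importable in this module as
# in the original code.
result = summarize("https://www.huffpost.com/entry/example-story", 30)
print(result["title"])
print(result["summary"])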
"SELECT ipfsHash FROM `{!s}` where ipfsHash is not null limit 10000 offset 1" .format(domain)) data = cursor.fetchall() try: for text in data: try: ipfsHash = text[0] url = "https://gateway.ipfs.io/ipfs/" + ipfsHash print(url) article = Article(url) article.download() slept = 0 while article.download_state == ArticleDownloadState.NOT_STARTED: # Raise exception if article download state does not change after 10 seconds if slept > 9: raise ArticleException('Download never started') sleep(1) slept += 1 article.parse() img = article.top_image if img: # cursor.execute("UPDATE `{!s}` set image = {!a} , charCount='{:d}',wordCount='{:d}',entropy='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a}, where url='{!s}'".format( # domain, img, len(article.text), article.totalWords, article.entropyN, article.stopWords, len(article.title), len(article.imgs), article.title, url)) cursor.execute( "UPDATE `{!s}` set imgLink = {!a} , imgCount = '{:d}', charCount='{:d}', domainTitle={!a} where ipfsHash='{!s}'" .format(domain, img, len(article.text), len(article.imgs), article.title, ipfsHash)) else: cursor.execute(
def home_page():
    """
    Streamlit page: scrape and parse textual content from a web resource.

    Uses Article from the Newspaper3k library to download and parse the HTML of
    the URL entered by the user; Newspaper3k's heuristics extract the main body
    of visible text, which is then checked against fake-news databases and a
    machine-learning model.
    """
    user_input = st.text_input('Enter URL of an article or text')
    with open(get_data_path('fake_news_sites.json')) as json_file:
        fake_news_db_news = json.load(json_file)
    with open(get_data_path('categories.json')) as json_file:
        categories = json.load(json_file)
    with open(get_data_path('opensources/sources.json')) as json_file:
        open_source_json = json.load(json_file)

    try:
        # Get the domain name from the url, then the formatted domain.
        domain_name = get_domain(user_input)
        formated_domain = format_url(domain_name)
    except Exception:
        st.warning("Enter a URL to suppress the warning !!")

    try:
        my_article = Article(user_input, language="en", keep_article_html=True)
        my_article.download()
        slept = 0
        while my_article.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise if the download state does not change within ~10 seconds.
            if slept > 9:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        my_article.parse()
    except Exception as ec:
        print(ec)

    if st.button('Check authenticity'):
        st.header("VirusTotal - Malicious URL Scanner (virustotal.com)")
        st.markdown('''---''')
        with st.spinner(text="Fetching measures - Analysis in progress"):
            # task = asyncio.create_task(scan_url(user_input))
            # json_data = await task
            json_data = scan_url(user_input=user_input)
            if json_data is not None:
                category_key = list(json_data.keys())
                category_value = [json_data[i]['result'] for i in category_key]
                left, center, right = st.beta_columns((1, 2, 1))
                with left:
                    left.markdown('''**No.** ''', unsafe_allow_html=True)
                    for i in range(1, 21):
                        left.write(i)
                with center:
                    center.markdown('''**Detected by**''', unsafe_allow_html=True)
                    for i in category_key[:20]:
                        center.write(i)
                with right:
                    right.markdown('''**Result**''', unsafe_allow_html=True)
                    for link in category_value[:20]:
                        if link == 'clean':
                            right.markdown(
                                '<span style="color:green">clean site</span>',
                                unsafe_allow_html=True)
                        else:
                            right.markdown(
                                f'<span style="color:red">{link}</span>',
                                unsafe_allow_html=True)
            else:
                st.warning(
                    "Couldn't detect the site or an invalid URL was provided !!"
                )

        st.header("News site authenticity")
        st.markdown('''---''')
        left, right = st.beta_columns((1, 2))
        res = get_opensource_news(domain_name, formated_domain, open_source_json)
        left.markdown('''**Source** : OpenSource http://www.opensources.co/''',
                      unsafe_allow_html=True)
        right.markdown(f'**Checking Domain** : {domain_name}',
                       unsafe_allow_html=True)
        if res is None:
            right.warning("URL is not found in the OpenSource database")
        else:
            right.markdown(f'**Category** : {res["type"]}', unsafe_allow_html=True)
            try:
                right.markdown(f'**Description** : {categories[res["type"]]}',
                               unsafe_allow_html=True)
            except Exception:
                right.warning("Category description isn't available !!")
            if res["Source Notes (things to know?)"]:
                right.markdown(
                    f'**Source Notes (things to know?)** : {res["Source Notes (things to know?)"]}',
                    unsafe_allow_html=True)

        st.markdown('''---''')
        left1, right1 = st.beta_columns((1, 2))
        res1 = get_fb_news_data(domain_name, formated_domain, fake_news_db_news)
        left1.markdown('''**Source** : FakeNews Site DB''', unsafe_allow_html=True)
        right1.markdown(f'**Checking Domain** : {domain_name}',
                        unsafe_allow_html=True)
        if res1 is None:
            right1.warning("URL is not found in the fake news site database")
        else:
            try:
                right1.markdown(f'**Category** : {res1["siteCategory"]}',
                                unsafe_allow_html=True)
                right1.markdown(f'**Site name** : {res1["siteTitle"]}',
                                unsafe_allow_html=True)
                if type(res1["siteCategory"]) is list:
                    right1.markdown(
                        f'**Description** : {categories[res1["siteCategory"][0]]}',
                        unsafe_allow_html=True)
                else:
                    right1.markdown(
                        f'**Description** : {categories[res1["siteCategory"]]}',
                        unsafe_allow_html=True)
                if res1["siteNotes"]:
                    right1.markdown(
                        f'**Source Notes (things to know?)** : {res1["siteNotes"]}',
                        unsafe_allow_html=True)
            except Exception:
                st.warning("Category is not available for this site !!")
            if res1["siteCategory"] == 'reliable':
                st.success(
                    "This is a trusted news site: the claims and articles it publishes are transparent, authentic, trustworthy, complete, and free of bias, and it protects audiences and users from disinformation."
                )
            else:
                st.error(
                    "This news site is not reliable or not authentic; the information it publishes might not be true !!"
                )

        st.markdown('''### **Article Title**''')
        title = my_article.title
        if title:
            st.markdown(f'{title}')
        else:
            st.warning("Couldn't extract the title or an invalid URL was provided")

        st.markdown('''### **Article Authors**''')
        author = my_article.authors
        if len(author) != 0:
            # st.markdown(f'{author}')
            st.markdown(
                f'<span style="background-color:#00C4EB;border-radius:5px;box-shadow: 0 5px 0 rgb(0, 116, 191);color: #FFFFFF;padding: 0.5em 1em;position: relative;text-decoration: none;font-weight:bold;cursor: pointer;">{author[0]}</span>',
                unsafe_allow_html=True)
        else:
            st.warning("Couldn't extract the author name or an invalid URL was provided")

        st.markdown('''### **Publish Date**''')
        date = my_article.publish_date
        if date:
            st.info(f'{date} ')
        else:
            st.warning("Couldn't extract the publish date or an invalid URL was provided")

        st.markdown('''### **Image**''')
        image_url = my_article.top_image
        if image_url:
            st.image(image_url, caption="Article Top Image")
            st.markdown(
                f'''<p align="center"><b> Source URL : <b><a href="{image_url}">{image_url}</a></p>''',
                unsafe_allow_html=True)
        else:
            st.warning("Couldn't extract the image, no image is present, or an invalid URL was provided")

        st.markdown('''### **Article Text**''')
        article_text = my_article.text
        if article_text:
            with st.beta_expander("🧙 Click here for more info about the article 🔮"):
                st.markdown(f'{article_text}', unsafe_allow_html=True)
        else:
            st.warning("Couldn't extract the article text or an invalid URL was provided")

        st.markdown('''### **Movies / Videos**''')
        videos = my_article.movies
        if videos:
            st.video(videos[0])
        else:
            st.warning("Couldn't extract any videos, no videos were published, or an invalid URL was provided")

        try:
            my_article.nlp()
        except Exception as ec:
            st.error(ec)
        # except ArticleException:
        #     st.error("Article Exception Occured !!")

        st.markdown('''### **Keywords (NLP)**''')
        nlp_keywords = my_article.keywords
        if nlp_keywords:
            st.info(nlp_keywords)
        else:
            st.warning("Couldn't get the top keywords or an invalid URL was provided")

        st.markdown('''### **Summary (NLP)**''')
        nlp_summary = my_article.summary
        if nlp_summary:
            st.markdown(f'{nlp_summary}', unsafe_allow_html=True)
        else:
            st.warning("Couldn't get the summary of the article or an invalid URL was provided")

        st.header("News article veracity")
        st.markdown('''---''')
        if article_text is not None:
            with st.spinner(text="Inference is in Progress ⏳ ..."):
                output_label = asyncio.run(
                    model_service.predict_from_server(article_text))
                # left, right = st.beta_columns((1, 2))
                st.markdown('''**Analysis based on** : Artificial intelligence''')
                st.markdown(
                    '''**Notes:** WARNING: This result may be inaccurate! This domain wasn't categorised on any human-maintained list, so the analysis was performed by a machine learning model.'''
                )
                if output_label:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.success("Real news")
                else:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.error("Fake news")
                st.balloons()
        else:
            st.warning(
                "Article text is not found, hence the news article veracity analysis is incomplete !!"
            )
def article(text):
    """Fetch an IPFS-hosted page and update its metadata row in the database."""
    cursor = None
    mariadb_connectionT = None
    try:
        try:
            ipfsHash = text[0]
            url = "https://gateway.ipfs.io/ipfs/" + ipfsHash
            # print(url)
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise if the download state does not change within ~20 seconds.
                if slept > 20:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()

            # For a description:
            # article.nlp()
            # print(article.summary[:400])
            soup = BeautifulSoup(article.html, "lxml")
            domainDesc = article.text[:270]
            desc1 = soup.find(attrs={"property": re.compile(r"description", re.I)})
            if desc1 is not None:
                desc1 = desc1['content']
                if len(desc1) > 25:
                    domainDesc = desc1
            desc2 = soup.find(attrs={"name": re.compile(r"description", re.I)})
            if desc2 is not None:
                desc2 = desc2['content']
                if len(desc2) > 25:
                    domainDesc = desc2

            img = article.top_image
            outLinks = len(soup.find_all('a', href=True))

            mariadb_connectionT = mariadb.connect(
                host=dbData["host"], user='******',
                password=dbData["password"], database='avSearch')
            cursor = mariadb_connectionT.cursor()
            if img:
                cursor.execute(
                    "UPDATE `{!s}` set imgLink = {!a} , imgCount = '{:d}', charCount='{:d}', outLinksCount='{:d}', domainTitle={!a} , domainDesc={!a} where ipfsHash='{!s}'"
                    .format(domain, img, len(article.imgs), len(article.text),
                            outLinks, article.title, domainDesc, ipfsHash))
            else:
                cursor.execute(
                    "UPDATE `{!s}` set charCount='{:d}',outLinksCount='{:d}', domainTitle={!a} , domainDesc={!a} where ipfsHash='{!s}'"
                    .format(domain, len(article.text), outLinks, article.title,
                            domainDesc, ipfsHash))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
    except ValueError as err:
        print("Value Error", url)
        print(err)
    except TypeError as err:
        print("Type Error", url)
        print(err)
    except ArticleException:
        print("Article exception", url)
        return
    finally:
        # Guard against the connection never having been opened.
        if cursor:
            cursor.close()
        if mariadb_connectionT:
            mariadb_connectionT.close()
def article(text):
    """Download an article URL and write its metadata back to its database row."""
    cursor = None
    mariadb_connectionT = None
    try:
        try:
            url = text[0]
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise if the download state does not change within ~10 seconds.
                if slept > 9:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()
            article.nlp()

            mariadb_connectionT = mariadb.connect(
                host='127.0.0.1', user='******', password='******',
                database='condense')
            cursor = mariadb_connectionT.cursor()

            # if article.canonical_link and article.canonical_link != url:
            #     cursor.execute("SELECT fbshares,url FROM `{!s}` where url='{!s}'".format(
            #         domain, article.canonical_link))
            #     data0 = cursor.fetchone()
            #     if data0:
            #         cursor.execute(
            #             "SELECT fbshares FROM `{!s}` where url='{!s}'".format(domain, url))
            #         data1 = cursor.fetchone()
            #         if int(data1[0] or 0) < int(data0[0] or 0):
            #             cursor.execute(
            #                 "delete FROM `{!s}` where url='{!s}'".format(domain, url))
            #             mariadb_connectionT.commit()
            #             return
            #         else:
            #             cursor.execute("delete FROM `{!s}` where url='{!s}'".format(
            #                 domain, article.canonical_link))
            #             mariadb_connectionT.commit()
            #     else:
            #         cursor.execute("update `{!s}` set url='{!s}' where url='{!s}'".format(
            #             domain, article.canonical_link, url))
            #         mariadb_connectionT.commit()

            article.nlpEntropy()  # custom extension of newspaper's Article
            keywords = ' '.join(article.keywords)
            d = article.publish_date
            author = "".join(article.authors)
            if len(author) > 30 or not author:
                author = ""
            img = article.top_image
            if not d:
                d = articleDateExtractor.extractArticlePublishedDate(url, article.html)
            if not d:
                return
            cursor.execute(
                "UPDATE `{!s}` set isArticleData = '1', keywords = {!a}, image = {!a}, author={!a} , charCount='{:d}',wordCount='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a}, date='{:%Y-%m-%d}' where url='{!s}'"
                .format(domain, keywords, img, author, len(article.text),
                        article.totalWords, article.stopWords, len(article.title),
                        len(article.imgs), article.title, d, url))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
    except ValueError as err:
        print("Value Error", url)
        print(err)
    except TypeError as err:
        print("Type Error", url)
        print(err)
    except ArticleException:
        print("Article exception", url)
        return
    finally:
        # Guard against the connection never having been opened.
        if cursor:
            cursor.close()
        if mariadb_connectionT:
            mariadb_connectionT.close()
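# A hedged driver sketch for the article() workers above: fetch candidate rows
# and map them through a process pool. The connection settings, the table name
# held in `domain`, the "isArticleData IS NULL" filter, and the pool size are
# assumptions for illustration, not taken from the original code; '******'
# stands for redacted credentials as in the snippets above.
from multiprocessing import Pool

import mariadb


def run_workers(domain, pool_size=4):
    conn = mariadb.connect(host='127.0.0.1', user='******',
                           password='******', database='condense')
    cursor = conn.cursor()
    cursor.execute(
        "SELECT url FROM `{!s}` WHERE isArticleData IS NULL LIMIT 1000".format(domain))
    rows = cursor.fetchall()      # each row is a tuple, e.g. ('https://...',)
    cursor.close()
    conn.close()
    # article() reads `domain` and its DB settings at module level, so each
    # worker only needs the row tuple itself.
    with Pool(pool_size) as pool:
        pool.map(article, rows)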