Example #1
def collect_urls():
    from newspaper.article import ArticleException, ArticleDownloadState

    for feed in rss_list:
        d = feedparser.parse(feed)
        '''
        # create file if it doesn't exist, if it does, open it
        try:
            e = load_obj(d.feed.title)
        except (OSError, IOError) as e: # create empty file
            f = {}
            save_obj(f,d.feed.title)
            e = load_obj(d.feed.title)
        '''

        json_file = gen_json(d)
        '''
        TODO: Capture current article ID from sort, and most recent article date
        ID = list size?
        '''

        for post in d.entries:
            article = Article(post.link)
            '''
            TODO: If post date is before newest article in file, move to next feed
            '''
            p = {
                "id": post.id,  # check this
                "title": post.title,
                "link": post.link,
                "date": format_date(post.published)
                #text field?
            }
            if ("https://www.google.com/url?rct" in post.link
                ):  # Fixes Google post URLs to get true article URL
                p["link"] = post.link.split("&url=", 1)[1]

            slept = 0
            try:
                article.download()
                while article.download_state == ArticleDownloadState.NOT_STARTED:
                    # Raise exception if article download state does not change after 10 seconds
                    if slept > 9:
                        raise ArticleException('Download never started')
                    time.sleep(1)
                    slept += 1
                # Parse article
                article.parse()
                print(sa_everything(article.text))
                p["sa_val"] = sentiment_analysis_helper(article.text)
            except Exception:
                p["sa_val"] = "error"

            json_file.append(p)

        with open(d.feed.title, 'w+') as f:
            json.dump(json_file, f)
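
The same polling pattern recurs in every example on this page: call download(), wait while download_state is still NOT_STARTED, raise ArticleException after a timeout, then parse(). Below is a minimal sketch of that pattern factored into a reusable helper; the helper name and default timeout are assumptions, not part of the original code.

import time

from newspaper.article import ArticleException, ArticleDownloadState


def download_with_timeout(article, timeout=10):
    # Hypothetical helper (name and default timeout are assumptions): start the
    # download, poll once per second until it leaves NOT_STARTED, give up after
    # `timeout` seconds, then parse the article.
    article.download()
    slept = 0
    while article.download_state == ArticleDownloadState.NOT_STARTED:
        if slept >= timeout:
            raise ArticleException('Download never started')
        time.sleep(1)
        slept += 1
    article.parse()
    return article

Each example could then replace its inline loop with something like download_with_timeout(Article(post.link)).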
Example #2
def scrape_article(id_num, headline, url):
    """
    Scrapes the article for text and headline

    Input: id_num (int)
    headline(string) - headline of article
    url(string) - url of article

    Output: json object
    {'id': id_num,
     'title': title of the article
     'text': text of the article
     'url': url of the article
     'category': category the article falls under as specified by
     bbc_categorization
    }
    """
    from time import sleep
    from newspaper.article import ArticleException, ArticleDownloadState

    article = Article(url)

    # Download article
    slept = 0
    try:
        article.download()
        while article.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise exception if article download state does not change after 10 seconds
            if slept > 9:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        # Parse article
        article.parse()
    except Exception:
        # If download or parsing fails, return a stub record
        return {
            'id': id_num,
            'title': None,
            'text': None,
            'url': url,
            'category': None
        }
    try:
        category = bbc_categorization(article.title + ' ' + article.text,
                                      id_num)
    except Exception as err:
        raise AttributeError("issue with bbc") from err
    return {
        'id': id_num,
        'title': article.title,
        'text': article.text,
        'url': url,
        'category': category
    }
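
A brief usage sketch; the id and URL are placeholders, and bbc_categorization must be available in the caller's scope, as in the original:

result = scrape_article(1, "Example headline", "https://www.bbc.com/news/example")
if result['text'] is None:
    print("Download or parsing failed for", result['url'])
else:
    print(result['id'], result['category'], '-', result['title'])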
Example #3
def test_file_feed():
    from newspaper.article import ArticleException, ArticleDownloadState

    for feed in rss_list:
        d = feedparser.parse(feed)
        for post in d.entries:
            article = Article(post.link)
            slept = 0
            try:
                article.download()
                while article.download_state == ArticleDownloadState.NOT_STARTED:
                    # Raise exception if article download state does not change after 10 seconds
                    if slept > 9:
                        raise ArticleException('Download never started')
                    time.sleep(1)
                    slept += 1
                # Parse article
                article.parse()
                print(sa_everything(article.text))
            except Exception as err:
                print("Error:", err)
def scrape_url(category_list, url_list):
    """
    :param url_list: list of urls to scrape from the web
    :return: nothing, just create a text file containing article text of each url
    """

    for i in range(0, len(url_list)):
        article_huff = Article(url_list[i])  # iterate through each article link
        slept = 0
        article_huff.download()
        while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise exception if article download state does not change after 14 seconds
            if slept > 13:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1

        article_huff.parse()
        article_info_huff = {"category": category_list[i], "title": article_huff.title, "text": article_huff.text}
        file_name = "../huffpostarticles/huffpost" + str(i) + ".json"
        with io.open(file_name, "w", encoding="utf-8") as f:
            f.write(json.dumps(article_info_huff))
def wf_summarize(url, summary_length):
    """
        :param summary_length: length of the summary in percentage
        :param url: url to scrape from the web
        :return: call the summarize function
    """
    article_huff = Article(url)
    slept = 0
    article_huff.download()
    while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
        # Raise exception if article download state does not change after 14 seconds
        if slept > 13:
            raise ArticleException('Download never started')
        sleep(1)
        slept += 1
    n = int(summary_length)

    article_huff.parse()
    news_text = article_huff.text
    summarizer = ExtractiveTextSummarizer()
    summary = summarizer.summary_ranking(news_text, n)
    return {"title": article_huff.title, "summary": summary}
def summarize(url, summary_length):
    """
    :param summary_length: length of the summary in percentage
    :param url: url to scrape from the web
    :return: call the summarize function
    """
    # url = url.strip("https://")
    # print(url)
    article_huff = Article(url)
    slept = 0
    article_huff.download()
    while article_huff.download_state == ArticleDownloadState.NOT_STARTED:
        # Raise exception if article download state does not change after 14 seconds
        if slept > 13:
            raise ArticleException('Download never started')
        sleep(1)
        slept += 1

    article_huff.parse()
    summarizer = ExtractiveTextSummarizer()
    summary = summarizer.create_summary(article_huff.text, summary_length)
    return {"title": article_huff.title, "summary": summary}
Example #7
    "SELECT ipfsHash FROM `{!s}` where ipfsHash is not null limit 10000 offset 1"
    .format(domain))
data = cursor.fetchall()
try:
    for text in data:
        try:
            ipfsHash = text[0]
            url = "https://gateway.ipfs.io/ipfs/" + ipfsHash
            print(url)
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise exception if article download state does not change after 10 seconds
                if slept > 9:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()

            img = article.top_image

            if img:
                # cursor.execute("UPDATE `{!s}` set image = {!a} , charCount='{:d}',wordCount='{:d}',entropy='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a},  where url='{!s}'".format(
                #     domain, img, len(article.text), article.totalWords, article.entropyN, article.stopWords, len(article.title), len(article.imgs), article.title, url))
                cursor.execute(
                    "UPDATE `{!s}` set imgLink = {!a} , imgCount = '{:d}', charCount='{:d}', domainTitle={!a}  where ipfsHash='{!s}'"
                    .format(domain, img, len(article.text), len(article.imgs),
                            article.title, ipfsHash))
            else:
                cursor.execute(
def home_page():

    """
    Streamlit page: takes a URL from the user, downloads and parses the article
    with newspaper3k's Article (which uses heuristics to scrape the main body of
    visible text), then runs site-authenticity and article-veracity checks on it.
    """

    user_input = st.text_input('Enter URL of an article or text')

    with open(get_data_path('fake_news_sites.json')) as json_file:
        fake_news_db_news = json.load(json_file)

    with open(get_data_path('categories.json')) as json_file:
        categories = json.load(json_file)

    with open(get_data_path('opensources/sources.json')) as json_file:
        open_source_json = json.load(json_file)

    try:
        # Get domain name from the url
        domain_name = get_domain(user_input)

        # Get formatted domain
        formatted_domain = format_url(domain_name)

    except Exception:
        st.warning("Enter an URL to suppress the warning !!")

    try:
        my_article = Article(user_input, language="en", keep_article_html=True)
        my_article.download()
        slept = 0
        while my_article.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise exception if article download state does not change after 10 seconds
            if slept > 9:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        my_article.parse()
    except Exception as ec:
        print(ec)

    if st.button('Check authenticity'):
        st.header("VirusTotal - Malicious URL Scanner (virustotal.com)")
        st.markdown('''---''')
        with st.spinner(text="Fetching measures - Analysis in progress"):
            # task = asyncio.create_task(scan_url(user_input))
            # json_data = await task
            json_data = scan_url(user_input=user_input)
            if json_data is not None:
                category_key = list(json_data.keys())
                category_value = [json_data[i]['result'] for i in category_key]
                left, center, right = st.beta_columns((1, 2, 1))

                with left:
                    left.markdown('''**No.** ''', unsafe_allow_html=True)
                    for i in range(1, 21):
                        left.write(i)
                with center:
                    center.markdown('''**Detected by**''',
                                    unsafe_allow_html=True)
                    for i in category_key[:20]:
                        center.write(i)
                with right:
                    right.markdown('''**Result**''', unsafe_allow_html=True)
                    for link in category_value[:20]:
                        if link == 'clean':
                            right.markdown(
                                f'<span style="color:green">clean site</span>',
                                unsafe_allow_html=True)
                        else:
                            right.markdown(
                                f'<span style="color:red">{link}</span>',
                                unsafe_allow_html=True)
            else:
                st.warning(
                    "Couldn't detect the site, or an invalid URL was provided !!"
                )

        st.header("News site authencity")
        st.markdown('''---''')

        left, right = st.beta_columns((1, 2))
        res = get_opensource_news(domain_name, formatted_domain,
                                  open_source_json)
        left.markdown('''**Source** : OpenSource http://www.opensources.co/''',
                      unsafe_allow_html=True)
        right.markdown(f'**Checking Domain** : {domain_name}',
                       unsafe_allow_html=True)
        if res is None:
            right.warning("URL is not found in OpenSource Database")
        else:
            right.markdown(f'**Category** : {res["type"]}',
                           unsafe_allow_html=True)
            try:
                right.markdown(f'**Description** : {categories[res["type"]]}',
                               unsafe_allow_html=True)
            except Exception:
                right.warning("Category description isn't available !!")
            if res["Source Notes (things to know?)"]:
                right.markdown(
                    f'**Source Notes (things to know?)** : {res["Source Notes (things to know?)"]}',
                    unsafe_allow_html=True)

        st.markdown('''---''')
        left1, right1 = st.beta_columns((1, 2))
        res1 = get_fb_news_data(domain_name, formatted_domain,
                                fake_news_db_news)

        left1.markdown('''**Source** : FakeNews Site DB''',
                       unsafe_allow_html=True)
        right1.markdown(f'**Checking Domain** : {domain_name}',
                        unsafe_allow_html=True)
        if res1 is None:
            right1.warning("URL is not found in Fake news site database")
        else:
            try:
                right1.markdown(f'**Category** : {res1["siteCategory"]}',
                                unsafe_allow_html=True)
                right1.markdown(f'**Site name** : {res1["siteTitle"]}',
                                unsafe_allow_html=True)
                if type(res1["siteCategory"]) is list:
                    right1.markdown(
                        f'**Discription** : {categories[res1["siteCategory"][0]]}',
                        unsafe_allow_html=True)
                else:
                    right1.markdown(
                        f'**Discription** : {categories[res1["siteCategory"]]}',
                        unsafe_allow_html=True)

                if res1["siteNotes"]:
                    right1.markdown(
                        f'**Source Notes (things to know?)** : {res1["siteNotes"]}',
                        unsafe_allow_html=True)
            except Exception:
                st.warning("Category is not available for this site !!")

            if res1["siteCategory"] == 'reliable':
                st.success(
                    "This is a trusted news site: claims and articles published here are transparent, authentic, trustworthy, complete, and free of bias, and the site protects its audience from disinformation."
                )
            else:
                st.error(
                    "This news site is not reliable or authentic; the information it publishes might not be true !!"
                )

        st.markdown('''### **Article Title**''')
        # st.header(Article Title)
        title = my_article.title
        if title:
            st.markdown(f'{title}')
        else:
            st.warning(
                "Couldn't extract the title, or an invalid URL was provided")

        st.markdown('''### **Article Authors**''')
        author = my_article.authors
        if len(author) != 0:
            # st.markdown(f'{author}')
            st.markdown(
                f'<span style="background-color:#00C4EB;border-radius:5px;box-shadow: 0 5px 0 rgb(0, 116, 191);color: #FFFFFF;padding: 0.5em 1em;position: relative;text-decoration: none;font-weight:bold;cursor: pointer;">{author[0]}</span>',
                unsafe_allow_html=True)
        else:
            st.warning(
                "Couldn't extract the author name, or an invalid URL was provided"
            )

        st.markdown('''### **Publish Date**''')
        date = my_article.publish_date
        if date:
            st.info(f'{date} ')
        else:
            st.warning(
                "Couldn't extract the publish date, or an invalid URL was provided"
            )

        st.markdown('''### **Image**''')
        image_url = my_article.top_image
        if image_url:
            st.image(image_url, caption="Article Top Image")
            st.markdown(
                f'''<p align="center"><b> Source URL : <b><a href="{ image_url }">{ image_url }</a></p>''',
                unsafe_allow_html=True)
        else:
            st.warning(
                "Couldn't extract the image, no image is present, or an invalid URL was provided"
            )

        st.markdown('''### **Article Text**''')
        article_text = my_article.text
        if article_text:
            with st.beta_expander(
                    "🧙 Click here for more info about the article 🔮"):
                st.markdown(f'{article_text}', unsafe_allow_html=True)
        else:
            st.warning(
                "Couldn't extract the article text, or an invalid URL was provided"
            )

        st.markdown('''### **Movies / Videos**''')
        videos = my_article.movies
        if videos:
            st.video(videos[0])
        else:
            st.warning(
                "Couldn't extract videos, no videos were published, or an invalid URL was provided"
            )

        try:
            my_article.nlp()
        except Exception as ec:
            st.error(ec)
        # except ArticleException:
        #     st.error("Article Exception Occured !!")

        st.markdown('''### **Keywords (NLP)**''')
        nlp_keywords = my_article.keywords
        if nlp_keywords:
            st.info(nlp_keywords)
        else:
            st.warning(
                "Couldn't get the top keywords, or an invalid URL was provided"
            )

        st.markdown('''### **Summary (NLP)**''')
        nlp_summary = my_article.summary
        if nlp_summary:
            st.markdown(f'{nlp_summary}', unsafe_allow_html=True)
        else:
            st.warning(
                "Couldn't get a summary of the article, or an invalid URL was provided"
            )

        st.header("News article veracity")
        st.markdown('''---''')

        if article_text is not None:

            with st.spinner(text="Inference is in Progress ⏳ ..."):
                output_label = asyncio.run(
                    model_service.predict_from_server(article_text))
                # left,right = st.beta_columns((1,2))
                st.markdown(
                    '''**Analysis based on:** : Artificial intelligence''')
                st.markdown(
                    '''**Notes:** WARNING: This result may be inaccurate! This domain wasn't categorised on any human-maintained list, so the analysis was performed by a machine learning model.'''
                )
                if output_label:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.success("Real news")
                else:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.error("Fake news")
            st.balloons()
        else:
            st.warning(
                "Article text was not found, so the news article veracity analysis is incomplete !!"
            )
Example #9
def article(text):
    cursor = None
    mariadb_connectionT = None
    try:
        try:
            ipfsHash = text[0]

            url = "https://gateway.ipfs.io/ipfs/" + ipfsHash
          #  print(url)
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise exception if article download state does not change after about 20 seconds
                if slept > 20:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()

            # for description
            # article.nlp()
            # print(article.summary[:400])
            soup = BeautifulSoup(article.html, "lxml")
            domainDesc = article.text[:270]
            
            desc1 = soup.find(attrs={"property": re.compile(r"description", re.I)})
            if desc1 is not None:
                desc1 = desc1['content']
                if len(desc1) > 25:
                    domainDesc = desc1
            desc2 = soup.find(attrs={"name": re.compile(r"description", re.I)})
            if desc2 is not None:
                desc2 = desc2['content']
                if len(desc2) > 25:
                    domainDesc = desc2

            img = article.top_image
            
            outLinks = len(soup.find_all('a', href=True))

            mariadb_connectionT = mariadb.connect(
                host=dbData["host"], user='******', password=dbData["password"], database='avSearch')

            cursor = mariadb_connectionT.cursor()

            if img:
                cursor.execute("UPDATE `{!s}` set imgLink = {!a} , imgCount = '{:d}', charCount='{:d}', outLinksCount='{:d}', domainTitle={!a} , domainDesc={!a}  where ipfsHash='{!s}'".format(
                    domain, img, len(article.imgs), len(article.text), outLinks,  article.title, domainDesc, ipfsHash))
            else:
                cursor.execute("UPDATE `{!s}` set charCount='{:d}',outLinksCount='{:d}', domainTitle={!a} , domainDesc={!a} where ipfsHash='{!s}'".format(
                    domain, len(article.text), outLinks,  article.title, domainDesc, ipfsHash))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
        except ValueError as err:
            print("Value Error", url)
            print(err)
        except TypeError as err:
            print("Type Error", url)
            print(err)
        except ArticleException:
            print("Article exception", url)
            return
    finally:
        if cursor:
            cursor.close()
        if mariadb_connectionT:
            mariadb_connectionT.close()
Example #10
def article(text):
    cursor = None
    mariadb_connectionT = None
    try:
        try:
            url = text[0]
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise exception if article download state does not change after 10 seconds
                if slept > 9:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()
            article.nlp()
            mariadb_connectionT = mariadb.connect(
                host='127.0.0.1',
                user='******',
                password='******',
                database='condense')
            cursor = mariadb_connectionT.cursor()
            # if article.canonical_link and article.canonical_link != url:
            #     cursor.execute("SELECT fbshares,url FROM `{!s}` where url='{!s}'".format(
            #         domain, article.canonical_link))
            #     data0 = cursor.fetchone()
            #     if data0:
            #         cursor.execute(
            #             "SELECT fbshares  FROM `{!s}` where url='{!s}'".format(domain, url))
            #         data1 = cursor.fetchone()
            #         if int(data1[0] or 0) < int(data0[0] or 0):
            #             cursor.execute(
            #                 "delete FROM `{!s}` where url='{!s}'".format(domain, url))
            #             mariadb_connectionT.commit()
            #             return
            #         else:
            #             cursor.execute("delete FROM `{!s}` where url='{!s}'".format(
            #                 domain, article.canonical_link))
            #             mariadb_connectionT.commit()
            #     else:
            #         cursor.execute("update `{!s}` set url='{!s}' where url='{!s}'".format(
            #             domain, article.canonical_link, url))
            #         mariadb_connectionT.commit()
            article.nlpEntropy()
            keywords = article.keywords
            keywords = ' '.join(keywords)
            d = article.publish_date
            author = "".join(article.authors)
            if len(author) > 30 or not author:
                author = ""
            img = article.top_image
            if not d:
                d = articleDateExtractor.extractArticlePublishedDate(
                    url, article.html)
            if not d:
                return
            cursor.execute(
                "UPDATE `{!s}` set isArticleData = '1', keywords = {!a}, image = {!a}, author={!a} , charCount='{:d}',wordCount='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a}, date='{:%Y-%m-%d}' where url='{!s}'"
                .format(domain, keywords, img, author, len(article.text),
                        article.totalWords, article.stopWords,
                        len(article.title), len(article.imgs), article.title,
                        d, url))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
        except ValueError as err:
            print("Value Error", url)
            print(err)
        except TypeError as err:
            print("Type Error", url)
            print(err)
        except ArticleException:
            print("Article exception", url)
            return
    finally:
        if cursor:
            cursor.close()
        if mariadb_connectionT:
            mariadb_connectionT.close()