Example #1
def crawl_link_article(url):
    result_json = None

    try:
        if 'http' not in url:
            if url[0] == '/':
                url = url[1:]
            try:
                article = Article('http://' + url)
                article.download()
                time.sleep(2)
                article.parse()
                flag = True
            except Exception:
                logging.exception(
                    "Exception in getting data from url {}".format(url))
                flag = False
            if not flag:
                try:
                    article = Article('https://' + url)
                    article.download()
                    time.sleep(2)
                    article.parse()
                    flag = True
                except Exception:
                    logging.exception(
                        "Exception in getting data from url {}".format(url))
                    flag = False
            if not flag:
                return None
        else:
            try:
                article = Article(url)
                article.download()
                time.sleep(2)
                article.parse()
            except Exception:
                logging.exception(
                    "Exception in getting data from url {}".format(url))
                return None

        if not article.is_parsed:
            return None

        visible_text = article.text
        top_image = article.top_image
        images = article.images
        keywords = article.keywords
        authors = article.authors
        canonical_link = article.canonical_link
        title = article.title
        meta_data = article.meta_data
        movies = article.movies
        publish_date = article.publish_date
        source = article.source_url
        summary = article.summary

        images = create_string(images)
        authors = create_string(authors)
        movies = create_string(movies)
        keywords = create_string(keywords)

        result_json = {
            'url': url,
            'text': visible_text,
            'images': images,
            'top_img': top_image,
            'keywords': keywords,
            'authors': authors,
            'canonical_link': canonical_link,
            'title': title,
            'movies': movies,
            'publish_date': get_epoch_time(publish_date),
            'source': source,
            'summary': summary
        }
    except Exception:
        logging.exception(
            "Exception in fetching article from URL: {}".format(url))

    return result_json
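
A minimal, hedged usage sketch of crawl_link_article() (the URL is illustrative, and create_string() / get_epoch_time() are project helpers not shown here):

# Hypothetical usage; crawl_link_article() returns a dict or None.
data = crawl_link_article('www.example.com/news/some-article')
if data is not None:
    print(data['title'], data['publish_date'])
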
Example #2
    links = re.findall(r'<a href="(/article/.+?)"', rr.text)
    for j in range(10):  # to remove extraneous links (note: 10 articles per page)
        corpus_link_endings.append(links[1 + 2 * j])
corpus_links = []
for end in corpus_link_endings:
    corpus_links.append('https://uk.reuters.com' + end)
#Save links to file
f = open('reuters-corpus-10k-links.txt', 'w+')
for link in corpus_links:
    f.write(link + '\n')
f.close()

# Extract contents of article from url using newspaper3k
corpus_texts = []
from newspaper import Article
for link in corpus_links:
    a = Article(link)
    a.download()
    a.parse()
    corpus_texts.append(a.text)
# i=0
# for text in corpus_texts:
#     print(text,i)
#     i+=1

# Write corpus-texts to a file
f = open('reuters-corpus-10k.txt', 'w+')
for text in corpus_texts:
    f.write(text + '\n\n\n\n\n\n\n')
f.close()
Example #3
    doc = ' '.join(filtered_tokens)
    return doc


end_r_speeches = []
# for year in range(9):
# for speech in agg[9]['R']:
#     news = normalize_document(speech)
#     end_r_speeches.append(news)
from newspaper import Article
import newspaper

rep_sauce = newspaper.build('http://breitbart.com', memoize_articles=False)
for article in rep_sauce.articles:
    urll = article.url
    art = Article(urll)
    art.download()
    art.parse()
    text = normalize_document(art.text)
    end_r_speeches.append(text)

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in end_r_speeches]

# Set values for various parameters
feature_size = 100  # Word vector dimensionality
window_context = 30  # Context window size
min_word_count = 1  # Minimum word count
sample = 1e-3  # Downsample setting for frequent words

r_w2v_model = word2vec.Word2Vec(tokenized_corpus,
Example #4
     "link": value['link'],
     "articles": []
 }
 for entry in d.entries:
     # Check if a publish date is provided; if not, the article is skipped.
     # This is done to keep consistency in the data and to keep the script from crashing.
     if hasattr(entry, 'published'):
         if count > param.num_of_articles:
             break
         article = {}
         article['link'] = entry.link
         date = entry.published_parsed
         article['published'] = datetime.fromtimestamp(
             mktime(date)).isoformat()
         try:
             content = Article(entry.link)
             content.download()
             content.parse()
         except Exception as e:
             # If the download fails for some reason (e.g. a 404), the script will continue downloading
             # the next article.
             print(e)
             print("proceeding...")
             continue
         article['title'] = content.title
         article['text'] = content.text
         newsPaper['articles'].append(article)
         if param._comments:
             print(count, "articles downloaded from", company,
                   ", url: ", entry.link)
         count = count + 1
Example #5
#import the libraries
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#download the punkt package
nltk.download('punkt', quiet = True)

#get the article
article = Article('https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521')
article.download()
article.parse()
article.nlp()
corpus = article.text

#print the article's text
print(corpus)

#tokenisation
text = corpus
sentence_list = nltk.sent_tokenize(text)

print(sentence_list)

# a function to return a random greeting response to a user's greeting
Example #6
 def test_pre_download_nlp(self):
     """Test running NLP algos before even downloading the article
     """
     self.setup_stage('initial')
     new_article = Article(self.article.url)
     self.assertRaises(ArticleException, new_article.nlp)
Example #7
def process_article(url):
    a = Article(url, request_timeout=3)
    a.download()
    a.parse()
    a.nlp()
    return article_to_dict(a)
def parseurl(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
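
process_article() above calls an article_to_dict() helper that is not shown in this example; a hedged sketch of what such a helper might look like, using only attributes newspaper3k actually exposes:

def article_to_dict(a):
    # Hypothetical helper: not part of newspaper3k; the field names are assumptions.
    return {
        'url': a.url,
        'title': a.title,
        'authors': a.authors,
        'text': a.text,
        'summary': a.summary,     # populated by a.nlp()
        'keywords': a.keywords,   # populated by a.nlp()
        'publish_date': a.publish_date,
    }
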
Example #9
def build(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return article
def read_article(url):
    art = Article(url, language='vi')
    #print(fulltext(url, language='vi'))
    art.download()
    art.parse()
    return art.title + '.\n' + art.text + '.\n'
Example #11
from summarizer import summarize
import newspaper
from newspaper import Article
from pyteaser import SummarizeUrl
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

url = 'http://www.politifact.com/truth-o-meter/statements/2017/jun/22/antinews/its-fake-news-chinese-lunar-rover-found-no-evidenc/'

a = Article(url)
a.download()
a.parse()

TXT=a.text
TITLE=a.title


def Summarize(title, text):
 summaries = []
 sentences = split_sentences(text)
 keys = keywords(text)
 titleWords = split_words(title)
 
 if len(sentences) <= 1:
    return sentences
 
#score sentences and use the top-ranked sentence
 ranks = score(sentences, titleWords, keys).most_common(1)
 for rank in ranks:
    summaries.append(rank[0])
 
 def get_article_from_url(self, url):
     article = Article(url)
     article.download()
     article.parse()
     article.nlp()
     return self.get_article_obj_from_article(article)
Example #13
import os


def conv(s):
    try:
        return int(s)
    except ValueError:
        return s


df = pd.read_csv('enk.csv')
query = df["url"].values
#query =["https://en.wikipedia.org/wiki/Vaccine" , "https://en.wikipedia.org/wiki/Vaccine"]
for i in range(len(query)):
    try:
        article = Article(query[i])
        article.download()
        article.parse()
        text = article.text
        blob = TextBlob(text)
        s = Textatistic(text)
        afb = len(article.images)
        vals = requests.get(query[i], timeout=4,
                            allow_redirects=False).elapsed.total_seconds()
        st = "/&callback=process&key=57bf606e01a24537ac906a86dc27891f94a0f587"
        # zz = urlopen ( url )
        quez = 'http://api.mywot.com/0.4/public_link_json2?hosts=' + query[
            i] + st
        stt = urllib.request.urlopen(quez).read()
        stt = str(stt)
        wot = re.findall(r'\d+', stt)
Example #14
 def set_text(self):
     if not self.text and self.url:
         a = Article(self.url)
         a.download()
         a.parse()
         self.text = a.text
Example #15
 def setUp(self):
     """Called before the first test case of this unit begins
     """
     self.article = Article(
         url='http://www.cnn.com/2013/11/27/travel/weather-'
             'thanksgiving/index.html?iref=allsearch')
Example #16
 def getTitle(title):
     article = Article(title)
     article.download()
     article.parse()
     return article.title
Example #17
 def test_pre_download_parse(self):
     """Calling `parse()` before `download()` should yield an error
     """
     article = Article(self.article.url)
     self.assertRaises(ArticleException, article.parse)
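
These unittest snippets reference ArticleException without showing its import; a minimal sketch, consistent with the newspaper.article.ArticleException path used elsewhere on this page:

from newspaper.article import ArticleException  # exception raised when parse()/nlp() run before download()
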
def random_sentences():
    """
    This function uses the Newspaper3k package (https://newspaper.readthedocs.io/en/latest/) to download text from CNN articles.
    The first, second, and last sentence from each article are taken and stored.
    In addition, 100 other random sentences hand-picked by me from random websites are added.
    This new dataset of random sentences is saved to be used later.
    :return: nothing, but a dataset of random sentences is saved to disk
    """
    # First bit: extract the random sentences from the CNN articles

    urls = pd.read_excel("training_sentences.xlsx",
                         sheet_name="Sheet2",
                         index_col=0)
    sentences = []

    for url in urls["url"]:
        #url = "https://www.cnn.com/2021/02/12/tech/facebook-myanmar-military-intl-hnk/index.html"
        print(url)
        article = Article(url)
        article.download()
        try:
            article.parse()
            last_index = len(nltk.sent_tokenize(article.text)) - 1
            # str.replace() returns a new string, so the results must be reassigned
            first_sent = nltk.sent_tokenize(article.text)[0]
            first_sent = first_sent.replace("(CNN) -", '', 1)
            first_sent = first_sent.replace("(CNN)-", '', 1)
            first_sent = first_sent.replace("(CNN)", '', 1)
            second_sent = nltk.sent_tokenize(article.text)[1]
            second_sent = second_sent.replace("(CNN) -", '', 1)
            second_sent = second_sent.replace("(CNN)-", '', 1)
            second_sent = second_sent.replace("(CNN)", '', 1)
            last_sent = nltk.sent_tokenize(article.text)[last_index]
            last_sent = last_sent.replace("(CNN) -", '', 1)
            last_sent = last_sent.replace("(CNN)-", '', 1)
            last_sent = last_sent.replace("(CNN)", '', 1)

            sentences.append(first_sent)
            if first_sent != second_sent:
                sentences.append(second_sent)
            if second_sent != last_sent and first_sent != last_sent:
                sentences.append(last_sent)
            if first_sent == second_sent or first_sent == last_sent or second_sent == last_sent:
                print(
                    "There was the same sentence twice. It did not get duplicated."
                )

        except newspaper.article.ArticleException:
            print('didnt work for that url:', url)

    # Second bit: add the ~100 random sentences I compiled to even out the mix of random vs. climate-related sentences
    extra_sentences = pd.read_excel("training_sentences.xlsx",
                                    sheet_name="Sheet3",
                                    index_col=0)
    for sentence in extra_sentences["random_sentence"]:
        sentence = sentence.replace('\xa0', ' ')
        sentences.append(sentence)

    ##################################
    # SAVE THE SENTENCES
    ##################################
    print("Saving Sentences...")
    np.save("random.npy", np.array(sentences))
    print("Completed saving sentences!")

    print("Number of random sentences:", len(sentences))
    print(75 * '-')


# ==============================================================================#
# OLD CODE
# ==============================================================================#

# cnn_paper = newspaper.build('https://www.cnn.com')
#
# for article in cnn_paper.articles:
#     print(article.url)
#
# cnn_article_array = []
# for i in range(5):
#     cnn_article = cnn_paper.articles[i]
#     cnn_article.download()
#     cnn_article.parse()
#     cnn_article.nlp()
#     print(cnn_article.text)
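
A brief, hypothetical usage sketch for random_sentences() above, assuming training_sentences.xlsx with the expected sheets is present in the working directory:

# Hypothetical usage: build the dataset, then reload the saved sentences.
random_sentences()
saved = np.load("random.npy")
print("Reloaded", len(saved), "sentences")
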
Example #19
 def test_article_pdf_fetching(self):
     a = Article(url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf')
     a.download()
     self.assertNotEqual('%PDF-', a.html)
Example #20
    def sentiment(self, ticker, number_of_articles=50, text_boolean=False):
        '''
        classifier: default='nb'. Choose between a Naive Bayes Classifier (input='nb') or NLTK's Sentiment
        Intensity Analyzer (input='si'). The method scrapes the top "specified number of articles"
        from Google News. The classifier analyzes each article and averages the negative and positive
        scores to return a dictionary of scores: {'pos': x, 'neg': y}.
        ticker: the ticker symbol you want analyzed.
        number_of_articles: default=50. The number of articles you want scraped from Google News.
        text_boolean: default=False. If True, the summary of the article closest to the average
        positive or negative score is also returned, and the dictionary will have three keys:
        {'pos': x, 'neg': y, 'text': summary}
        '''
        classifier = 'nb'
        ticker = ticker
        articles_examined = number_of_articles
        prefix = 'https://news.google.com/'
        url = 'https://news.google.com/search?q=' + ticker + '&hl=en-US&gl=US&ceid=US%3Aen'
        r1 = requests.get(url)
        coverpage = r1.content
        soup1 = BeautifulSoup(coverpage, 'html5lib')
        coverpage_news = soup1.find_all(
            'div', class_="NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc")
        links = []
        for article in (coverpage_news):
            links.append(prefix + article.a["href"])

        titles = []
        texts = []
        summaries = []
        counter = 0
        for link in links:
            print(link)
            try:
                url = link
                article = Article(url, language="en")
                article.download()
                article.parse()
                article.nlp()
                titles.append(article.title)  # store the title of the article
                texts.append(
                    (article.text))  # store the entire text of the article
                summaries.append(
                    article.summary)  # store the summary of the article
                #print(article.keywords) #prints the keywords of the article
                counter += 1
                if counter >= articles_examined:
                    break

            except newspaper.article.ArticleException:
                continue

        if classifier == 'nb':
            import pickle
            with open("naivebayes.pickle", "rb") as classifier_f:
                classifier = pickle.load(classifier_f)

            text_counter = 0
            texts_neg_sum = []
            texts_pos_sum = []
            result_te = ''
            for text in texts:
                print('text')
                prob_dist = classifier.prob_classify(text)
                texts_pos_sum.append(round(prob_dist.prob("pos"), 2))
                texts_neg_sum.append(round(prob_dist.prob("neg"), 2))
                text_counter += 1

            if sum(texts_neg_sum) > sum(texts_pos_sum):
                result_te = 'negative'
            elif sum(texts_neg_sum) < sum(texts_pos_sum):
                result_te = 'positive'

            n_sent = ((sum(texts_neg_sum) / text_counter) * 100)
            p_sent = ((sum(texts_pos_sum) / text_counter) * 100)

            if text_boolean == True:
                sent_list = []
                avg_num = 0
                if sum(texts_neg_sum) > sum(texts_pos_sum):
                    sent_list = texts_neg_sum
                    avg_num = n_sent
                elif sum(texts_neg_sum) < sum(texts_pos_sum):
                    sent_list = texts_pos_sum
                    avg_num = p_sent

                closest_sent = min(sent_list, key=lambda x: abs(x - avg_num))
                avg_summary = summaries[sent_list.index(closest_sent)]

                return {'pos': p_sent, 'neg': n_sent, 'text': avg_summary}

            elif text_boolean == False:
                return {'pos': p_sent, 'neg': n_sent}

            else:
                raise ValueError('text_boolean must be either True or False')

        else:
            raise ValueError(
                'Argument must be nb (Naive Bayes Classifier) or si (Sentiment Intensity Analyzer)'
            )
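
A hedged usage sketch; the enclosing class is not shown, so 'SentimentScraper' below is a placeholder name, and naivebayes.pickle must exist on disk:

# Hypothetical call; 'SentimentScraper' stands in for whatever class defines sentiment().
scraper = SentimentScraper()
scores = scraper.sentiment('AAPL', number_of_articles=10)
print(scores['pos'], scores['neg'])
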
Example #21
    def newspaper_parser(self, sleep_time=5):
        logging.debug('running newspaper_parser() for secure sites...')
        results = []
        count = 0

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(executable_path=r'gecko\geckodriver.exe')
        credential_names = list(self.credentials.keys())

        browser.get(self.login_url)
        cred1 = browser.find_element_by_id(credential_names[0])
        cred2 = browser.find_element_by_id(credential_names[1])
        cred1.send_keys(self.credentials[credential_names[0]])
        cred2.send_keys(self.credentials[credential_names[1]])
        time.sleep(10)
        browser.find_element_by_class_name(self.submit_id).click()
        time.sleep(10)

        cookies = browser.get_cookies()
        browser.close()

        s = requests.Session()
        for cookie in cookies:
            s.cookies.set(cookie['name'], cookie['value'])

        for l in self.links:
            try:
                page = s.get(l)
            except Exception as e:
                logging.error("issue bundling {} for {}, {}".format(
                    l, self.searchTerm, e))
                print(e)
                time.sleep(20)
                continue

            soup = BeautifulSoup(page.content, features="lxml")
            article = Article(url=l)
            article.set_html(str(soup))

            article.parse()
            article.nlp()
            up_date = article.publish_date
            if self.newspaper == 'Wall Street Journal':
                soup = BeautifulSoup(article.html, features="lxml")
                # if no articles, stop
                pub_date = soup.find("meta", {
                    "name": "article.published"
                }).get("content", None)
                up_date = soup.find("meta", {
                    "name": "article.updated"
                }).get("content", None)
                article.publish_date = pub_date

            data = {
                'search': self.searchTerm,
                'title': article.title,
                'date_published': article.publish_date,
                'date_updated': up_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                # 'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                # 'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html,
            }
            results.append(data)
            time.sleep(sleep_time)

            count += 1
        print("done for ", self.searchTerm)
        return results
def get_twittable_sentences_from_url(url):
    article = Article(url)
    article.download()
    article.parse()
    best_sentences = nlp.summarize(title=article.title, text=article.text)
    return [s for s in best_sentences if len(s) < 140]
import webbrowser
import newspaper
from newspaper import Article

url = "https://www.hindustantimes.com/india-news/pm-modi-asks-g20-for-an-effective-global-response-to-coronavirus-reports/story-myRgcYwmAhEX077ZZdGCbP.html"

toi_article = Article(url, language="en")  # en for English

toi_article.download()
toi_article.parse()
toi_article.nlp()
f = open('art.html', 'w')
a = "https://www.hindustantimes.com/rf/image_size_960x540/HT/p2/2020/03/26/Pictures/ahmedabad-ahmedabad-unorganised-hindustan-addresses-siddharaj-minister_1342d1d6-6f7e-11ea-ad54-628e87a77846.jpg"

message = """<html>
<head></head>
<body>
<h1>{title}</h1>
<h3>Authors:{auth}</h3>
<img src="{URL}">
<article>
  <h4>Article Publish Date: {date} </h4>
  <p>Summary:{summ}</p>
  <p>Detailed News: {text}</p>
</article>
</body>
</html>"""

new_message = message.format(URL=a,
                             title=toi_article.title,
                             auth=toi_article.authors,
Example #24
def getArticle(url):
    a = Article(url, language='ko')
    a.download()
    a.parse()

    return a
Example #25
    def __init__(self, url):
        self.article = NewsPlease.from_url(url)

        self.content = Article(url)
        self.content.download()
        self.content.parse()
    def parse_content(self, response):
        # This function does the detailed parsing of a news article

        ID = 'songtengteng'

        website_name = '商务部贸易救济调查局'

        # Website section
        website_block = response.xpath(
            "//div[@class='position']/a[2]/text()").extract_first()

        news_url = response.meta['url']

        # Author
        news_author_list = response.xpath('//script')
        if len(news_author_list) != 0:
            news_author = news_author_list.re(
                'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')
            if news_author != []:
                news_author = news_author[0][13:].replace('"', '')
            else:
                news_author = '商务部贸易救济调查局'
        else:
            news_author = '商务部贸易救济调查局'

        # News publish time, normalized to the format: YYYY MM DD HH:Mi:SS
        publish_time = response.meta['publish_time']
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-8:]
        publish_time = year + month + day + ' ' + juti_time

        # Tags carried by the news article itself
        news_tags = response.xpath('//script').re(
            'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

        # News title
        news_title = response.xpath('//h3/text()').extract_first()

        # News body text
        a = Article(response.url, language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Extract the article's images and their names
        image_urls = []
        image_names = []
        image_urls1 = response.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
        ).extract()
        if image_urls1 != []:
            image_urls = image_urls1
            for i in range(len(image_urls)):
                if i < 10 and i >= 0:
                    image_name = news_title + '_000' + str(i)
                    image_names.append(image_name)
                elif i < 100 and i >= 10:
                    image_name = news_title + '_00' + str(i)
                    image_names.append(image_name)
                elif i < 1000 and i >= 100:
                    image_name = news_title + '_0' + str(i)
                    image_names.append(image_name)
                else:
                    image_name = news_title + str(i)
                    image_names.append(image_name)

        yield self.getItem(
            id=ID,
            news_url=news_url,
            website_name=website_name,
            website_block=website_block,
            news_title=news_title,
            publish_time=publish_time,
            news_author=news_author,
            news_tags=news_tags,
            news_content=news_content,
            image_urls=image_urls,
            image_names=image_names,
        )
Example #27
from urllib.parse import urlparse
import newspaper
from newspaper import Article

urlFileRead = open("newspaperURLS.txt", "r")
urlRead = urlFileRead.readlines()

urlFileAppend = open("newspaperURLS.txt", "a")

cnn_paper = newspaper.build('http://cnn.com')
for article in cnn_paper.articles:
    urlFileAppend.write(article.url)

textfile_url = open("urltextfiles.txt", "w")

for urlLine in urlRead:
    first_article = Article(url=urlLine)
    first_article.download()
    print(first_article.html)
    first_article.parse()
    print(first_article.publish_date)
    textfile_url.write(first_article.text)
'''  urlFinal = "\"" + urlLine.rstrip() + "\""
    print (urlFinal)'''
Example #28
from newspaper import Article
import time
import sys , os
import json

# the first filename will be i+1,
# so i should be the latest number already given to an article
i=0


with open("links-sz-2020-07-13.txt" , "r") as link_file :
	all_lines = link_file.readlines()
	for link in all_lines[0:3]:
		article = Article(link.strip())  # strip the trailing newline from readlines()
		try:
			print(i , ": ", link)
			article.download()
			print(article.title)
			time.sleep(2)
			article.parse()
			article.nlp()
			# article.fetch_images() # I am not working with images at the moment

			## generate a filename
			i=i+1
			filename = f'{i:05}'
			# should check, if file exists ...

			keep = article.meta_data['og']
			keep['authors'] = article.authors
			keep['text-link'] = filename
from newspaper import Article
import nltk
from gtts import gTTS
import os

# BBC News: Coronavirus: How can AI help fight the pandemic?
url = 'https://www.bbc.com/news/technology-51851292'

# Getting the article text using newspaper3k library
article = Article(url)
article.download()
article.parse()
nltk.download('punkt')

# Applying NLP
article.nlp()

# Storing the first 250 characters of the article's text in article_text
article_text = article.text[:250]

print("First 250 character of the article: \n", article_text)

# Selecting the Article's Language
article_language = 'en'  #English

# Applying Google Text-to-Speech
text_to_speech = gTTS(text=article_text, lang=article_language, slow=False)

# Saving the audio file
text_to_speech.save("text_to_speech_article.mp3")
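
The snippet imports os but never uses it; presumably the saved file is meant to be played back. A hedged sketch (the player command is platform-specific and purely illustrative):

# Illustrative only: 'afplay' exists on macOS; Linux/Windows need a different player.
os.system("afplay text_to_speech_article.mp3")
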
Example #30
def read_doc(record):
    url = record.url
    article = None
    if url:
        article = Article(url, language="en")
    return url, article