Example #1
def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''
    Fetch the article at `url` and return (title, keywords, summary,
    top_img_src); each flag argument is replaced by the extracted value.
    '''
    # configuration for Newspaper to minimize processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3

    article = Article(url, config=configure)

    try:
        article.download()
        article.parse()
    except Exception:
        print(url)

    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')

    if top_img_src:
        top_img_src = article.top_image

    return title, keywords, summary, top_img_src
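A minimal usage sketch for the function above (the URL is illustrative; it assumes Article and Config are imported from newspaper as in the snippet):

# Hypothetical call: request title and keywords only; the summary and
# top-image flags stay False and are returned unchanged.
title, keywords, summary, top_img = summarise_one(
    "https://example.com/some-article", title=True, keywords=True)
print(title)
print(keywords)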
Example #2
def download_article(url):
    config = Config()
    config.follow_meta_refresh = True
    config.keep_article_html = True
    article = Article(url, config=config)
    article.download()
    return article
def get_text(url):
    """Given URL of the article extract text from it

    Arguments:
        url {str} -- url of the article

    Returns:
        str -- Text of the article
    """
    global CNT
    global NAN
    if (CNT % 100 == 0):
        print(CNT)
    config = Config()
    config.keep_article_html = True
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(e)
        NAN += 1
        return np.nan
    finally:
        CNT += 1
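A hedged usage sketch for get_text(), assuming the module-level CNT and NAN counters used by the function and the pandas/numpy/newspaper imports from the original module (the column name and URLs are illustrative):

import numpy as np
import pandas as pd

CNT, NAN = 0, 0  # module-level counters read and updated by get_text()

df = pd.DataFrame({"url": ["https://example.com/story-1",
                           "https://example.com/story-2"]})
# Failed downloads come back as np.nan, so they can be dropped afterwards.
df["text"] = df["url"].apply(get_text)
df = df.dropna(subset=["text"])
print(f"processed={CNT}, failed={NAN}")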
Example #4
def fetch_by_newspaper(url):
    is_linkedin_url = url.startswith(linkedinUrl)
    if is_linkedin_url:
        config = Config()
        config.MAX_TITLE = 1000
        article = get_article(url, config)
        article = replace_title_text_from_title_url(article)
    else:
        article = get_article(url)

    return json.dumps({
        "authors": article.authors,
        "html": article.html,
        "images": list(article.images),
        "movies": article.movies,
        "publish_date": article.publish_date.strftime("%s") if article.publish_date else None,
        "text": article.text,
        "title": article.title,
        "topimage": article.top_image
    }), 200, {
        'Content-Type': 'application/json'
    }
    def extract(self, request: ExtractorRequest) -> ExtractorResponse:
        try:
            user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
            config = Config()
            config.browser_user_agent = user_agent
            article = Article(
                request.url, keep_article_html=True, fetch_images=False, config=config
            )
            article.download()
            article.parse()
            # article.nlp()

            text = article.text
            meta = {
                "source": "web_article",
                "source_url": article.source_url,
                "article_html": article.article_html,
                "title": article.title,
                "top_image": article.top_image,
                "images": article.images,
                "videos": article.movies,
                "meta_language": article.meta_lang,
                "meta_keywords": article.meta_keywords,
                "authors": article.authors,
                "publish_date": article.publish_date,
            }
            # construct response
            response = ExtractorResponse(meta=meta, text=text or "")
        except Exception as e:
            msg = f"Error using newspaper3k extractor: {str(e)}"
            log.error(msg)
            response = ExtractorResponse(error=msg)

        return response
def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    result = googlenews.result()

    for i in range(0, 5):
        googlenews.getpage(i)
        result = googlenews.result()
    df = pd.DataFrame(result)
    total = 0
    counter = 0
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            total += testimonial.sentiment.polarity
        except Exception:
            pass

    return total / counter if counter else 0
Example #7
def subjectivity():
    """
    Graphs the posts based on the relationship between subjectivity and popularity
    :return: a graph with the x axis being the subjectivity value, and y axis being upvotes
    """
    scores = []
    for index, row in topics_data.iterrows():
        if index in actual_list:
            scores.append(row['score'])

    subs = []
    for index, row in topics_data.iterrows():
        if index not in actual_list:
            continue
        url = row['url']
        if 'newsweek' in url or 'democracynow' in url:
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
            config = Config()
            config.browser_user_agent = user_agent
            article = Article(url, config=config)
        else:
            article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        text = article.summary
        obj = TextBlob(text)
        subjectivity = obj.sentiment.subjectivity
        subs.append(subjectivity)

    plt.figure(figsize=(50, 10))
    plt.scatter(subs, scores)
    plt.xlabel('Subjectivity')
    plt.ylabel('Score')
    plt.title('Posts in r/politics')
    plt.show()
def handle(task, progress):
	url = task.url
	progress.set_status("Requesting page...")
	resp = http_downloader.page_text(url, json=False)
	if not resp:
		return False

	config = Config()
	config.memoize_articles = False
	config.verbose = False
	article = Article(url='', config=config)

	article.download()
	article.set_html(resp)
	article.parse()
	if not article.top_image:
		return None

	src = article.top_image
	if 'http' not in src:
		if 'https' in url:
			src = 'https://' + src.lstrip('/ ').strip()
		else:
			src = 'http://' + src.lstrip('/ ').strip()

	progress.set_status("Downloading image...")

	return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
Example #9
def cache_art(url):
    headers = {
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }
    r = requests.get(url, headers=headers)

    from newspaper import Config
    from newspaper import Article

    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()
    # article = Article(url='')
    # article.set_html(str(soup))
    article.parse()
    struct = {
        'html': article.article_html,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'image': article.top_image,
        'rawhtml': str(r.content),
        'buzotext': scrape(r.content, 'html5lib')
    }
    return struct
def get_top_news(news_information, todays_date):
    load_dotenv(find_dotenv(r"path to env variable"))
    news_api_key = os.getenv("news_api")
    params = (
        ('country', 'us'),
        ('apiKey', f'{news_api_key}'),
    )
    response = requests.get(
        """https://newsapi.org/v2/top-headlines""",
        params=params)
    news_articles = response.json()['articles']
    for i in news_articles:
        user_agent = """Mozilla/5.0 (Macintosh;
        Intel Mac OS X 10.15; rv:78.0)
        Gecko/20100101 Firefox/78.0"""
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        article = Article(i['url'], config=config)
        print(i["url"])
        article.download()
        article.parse()
        article.nlp()
        summary = article.summary  # noqa
        news_information[i["source"]["name"]] = {"title": i["title"], "summary": summary, "url": i["url"], "date": f"{todays_date}"}  # noqa
    def process_link(self, link=None, nlp=False):
        """processes the linksobtain by get_links() method, extracts
            both the text and a summary of the article with newspaper package
            
        Args:
            link :: str
                URL of links stored in the dictionary returned by get_links()
            nlp :: bool
                Whether or not to perform nlp on the text of the link. This extracts
                a summary of the text, but is a somewhat expensive operation.
        
        Returns:
            article :: 'article' object
                object that contains parsed properties for the link, such as
                summary, text and date.
        """

        #parameters for the processing
        config = Config()
        config.fetch_images = False  #no need for images
        config.memoize_articles = False  #no need for article caching

        try:
            article = Article(link, language="en", config=config)
            article.download()
            article.parse()
            if nlp:
                article.nlp()  #extract summary as per the newspaper API
        except Exception:
            return None

        return article
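A hedged usage sketch for the method above; the owning class name and the shape of the get_links() output (a dict of name -> URL, per the docstring) are assumptions:

scraper = NewsScraper()  # hypothetical class that defines get_links() and process_link()
for name, link in scraper.get_links().items():
    art = scraper.process_link(link, nlp=True)
    if art is None:  # download or parse failed
        continue
    print(name, art.publish_date, art.summary[:120])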
Example #12
def get_article_text(url: str) -> str:
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54'
    }
    text = ''
    try:
        config = Config()
        config.browser_user_agent = headers['User-Agent']
        article = Article(url,
                          keep_article_html=True,
                          fetch_images=False,
                          config=config)
        article.download()
        article.parse()
        # article.nlp()
        text = article.text
        meta = {
            "source": "web_article",
            "source_url": article.source_url,
            "article_html": article.article_html,
            "title": article.title,
            "top_image": article.top_image,
            "images": article.images,
            "videos": article.movies,
            "meta_language": article.meta_lang,
            "meta_keywords": article.meta_keywords,
            "authors": article.authors,
            "publish_date": article.publish_date,
        }
        text = clean(text)
    except Exception as e:
        msg = f"Error using newspaper3k extractor: {str(e)}"
        print(msg)
    return text
Example #13
def filterNews(url, keyword):
    """
    对新闻进行筛选,通过关键词,以新闻标题为对象筛查
    通过user_agent和timeout来解决访问超时问题
    https://stackoverflow.com/questions/63061172/newspaper3k-api-article-download-failed-with-httpsconnectionpool-port-443-read
    :param url:
    :param keyword:
    :return:
    """
    user_agent = r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    r"Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47 "
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 30
    # Try-Except-Continue will skip to the next article in the For loop if there is an exception
    # https://github.com/codelucas/newspaper/issues/444  solve the bugs for not found url
    try:
        news = newspaper.Article(url, config=config)
        news.download()
        news.parse()
        for word in keyword:
            if word in news.title:
                # print(word)
                return True
        return False
    except ArticleException:
        return False
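A minimal usage sketch (URLs and keywords are illustrative; newspaper and ArticleException are assumed to be imported as in the snippet):

candidate_urls = [
    "https://example.com/markets-rally-today",
    "https://example.com/weekend-sports-roundup",
]
keywords = ["markets", "economy"]

# Keep only the articles whose downloaded title mentions at least one keyword.
matching = [u for u in candidate_urls if filterNews(u, keywords)]
print(matching)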
Example #14
def home(request):
    main_url = "http://newsapi.org/v2/top-headlines?country=in&pageSize=17&apiKey=813ecab326254348a7d1e8ef73d2838f"

    # fetching data in json format
    open_news_page = requests.get(main_url).json()
    # getting the list of articles
    articles = open_news_page["articles"]
    results = []

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    for ar in articles:
        article = Articles()

        intl = []
        intl.append(ar["title"])
        intl.append(ar["description"])
        intl.append(ar["urlToImage"])

        intl.append(ar["url"])
        toi_article = Article(intl[3], language="en", config=config)
        toi_article.download()

        #To parse the article
        toi_article.parse()

        #To perform natural language processing ie..nlp
        toi_article.nlp()
        summ = toi_article.summary
        intl.append(summ)
        results.append(intl)
    content = {'article': results}
    return render(request, 'summnews/index.html', content)
Example #15
def extract_content(url):
    content = get_static_content()
    try:
        ua = UserAgent()

        config = Config()
        config.browser_user_agent = ua.chrome
        config.language = 'es'

        article = Article(url, config=config)

        article.download()

        article.parse()

        text = article.text
        content['text'] = text

        top_image = article.top_image
        content['image'] = top_image

        movielinks = []
        for movie in article.movies:
            movielinks.append(movie)
        content['videos'] = movielinks

    except Exception:
        print_exc()

    return content
Example #16
def cache_art(url):
    headers = {
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }

    from newspaper import Config
    from newspaper import Article

    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()

    article.parse()
    struct = {
        'html': article.article_html,
        'link': url,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'title': article.title,
        'image': article.top_image,
        'description': article.meta_description
    }

    return struct
Example #17
def GetArticle(FeedLink, request):
    user_agent = request.META['HTTP_USER_AGENT']
    config = Config()
    config.browser_user_agent = user_agent
    newarticle = Article(FeedLink, config=config)
    newarticle.download()
    newarticle.parse()
    return newarticle
Example #18
    def _getArticleObj(self):
        config = Config()
        config.keep_article_html = True
        article = Article(self.url, config=config)
        article.download()
        article.parse()
        article.nlp()
        return article
Example #19
def parse_submission(url):
    '''
    Input: HTTP URL
    Output: List of keywords compiled from names found in the article title and keywords found in the article text
    '''
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent

    a = Article(url, config=config)
    try:
        a.download()
        a.parse()
    except Exception as e:
        logging.error(e)
        return

    a.nlp()
    keywords = a.keywords
    title = a.title
    content = a.text
    thumbnail_image = get_thumbnail_image(a.html)

    #clean up content
    content = to_lower(content)
    content = strip_numbers(content)

    #clean up title
    title = to_lower(title)
    title = strip_numbers(title)

    #get names and identities from content text
    content_names = get_names(''.join(content))

    top_keywords = []
    for name in content_names:
        if name in title:
            #if the name is in the title, it is significant
            top_keywords.append(name)

    #Find other keywords that are not names and are found in the title
    for keyword in keywords:
        if keyword in title and keyword not in stop_words and not keyword.isdigit():
            top_keywords.append(keyword)

    if not top_keywords:
        top_keywords = title.split(" ")
    else:
        #remove duplicates
        top_keywords = list(dict.fromkeys(top_keywords))
        top_keywords = remove_duplicates(top_keywords)

    return {
        'keywords': top_keywords,
        'title': a.title,
        'thumbnail_url': thumbnail_image
    }
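A hedged usage sketch (the URL is illustrative; helpers referenced by the snippet, such as get_thumbnail_image and get_names, are assumed to be defined in the same module):

result = parse_submission("https://example.com/senator-announces-new-bill")
if result:  # None is returned when download/parse fails
    print(result["title"])
    print(result["keywords"])
    print(result["thumbnail_url"])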
Example #20
def make_news_table(db, articles, logger):
    class News(db.Model):
        uuid = db.Column(UUIDType(), primary_key=True)
        url = db.Column(db.Text, nullable=False)
        title = db.Column(db.Text, nullable=False)
        picture = db.Column(URLType, nullable=True)
        icon = db.Column(URLType, nullable=True)
        body = db.Column(db.Text, nullable=False)
        publish_time = db.Column(db.DateTime, nullable=False)

        __tablename__ = 'News'

        def __repr__(self):
            return f'<News: {self.uuid}>'

    db.create_all()
    db.session.commit()

    urls = tuple(
        set(articles) - set(a[0]
                            for a in News.query.with_entities(News.url).all()))
    if len(urls) > 0:
        from newspaper import Article, Config
        from newspaper.mthreading import NewsPool
        from uuid import uuid4
        config = Config()
        config.memoize_articles = False
        config.fetch_images = True
        config.request_timeout = 20
        config.thread_request_timeout = 20
        config.follow_meta_refresh = True
        config.MIN_WORD_COUNT = 150
        config.MIN_SENT_COUNT = 4
        config.browser_user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        news_pool = NewsPool(config=config)
        articles = tuple(Article(url, config=config) for url in urls)
        news_pool.set(articles, override_threads=4)
        logger.info(f"Articles being downloaded: {urls}")
        news_pool.join()
        for article in articles:
            logger.debug(f"Processing: {article.url}")
            if article.download_state == 1:
                logger.warning(f'Failed to download: {article.url}')
                continue
            article.parse()
            try:
                db.session.add(
                    News(uuid=uuid4(),
                         url=article.url,
                         title=article.title,
                         picture=article.top_image,
                         icon=article.meta_favicon,
                         body=article.text,
                         publish_time=article.publish_date))
            except Exception as e:
                logger.warning(e)
        db.session.commit()
    return News
def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8

    article = Article(url, config=config)
    article.set_html(load_page_safe(url))  # safer than article.download()
    article.parse()

    return article
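A minimal usage sketch for the helper above (illustrative URL; load_page_safe is assumed to be defined elsewhere in the same module, returning the page HTML as a string):

art = load_and_parse_full_article_text_and_image("https://example.com/article")
print(art.title)
print(art.top_image)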
def handle(url, data, log):
    try:
        log.out(0, 'Downloading article...')
        resp = requests.get(url, headers={'User-Agent': data['user_agent']})
        if resp.status_code != 200:
            return False  #!cover

        config = Config()
        config.memoize_articles = False
        config.verbose = False
        article = Article(url='', config=config)
        log.out(0, 'Parsing article...')

        article.download()
        article.set_html(resp.text)
        article.parse()
        if article.top_image:
            src = article.top_image
            if 'http' not in src:  #!cover
                if 'https' in url:
                    src = 'https://' + src.lstrip('/ ').strip()
                else:
                    src = 'http://' + src.lstrip('/ ').strip()
            log.out(0, 'Newspaper located image: %s' % src)

            r = requests.get(src,
                             headers={'User-Agent': data['user_agent']},
                             stream=True)
            if r.status_code == 200:
                content_type = r.headers['content-type']
                ext = mimetypes.guess_extension(content_type)
                if not ext or ext == '':  #!cover
                    log.out(
                        1, 'NewsPaper Error locating file MIME Type: %s' % url)
                    return False
                if '.jp' in ext:
                    ext = '.jpg'  #!cover
                path = data['single_file'] % ext
                if not os.path.isfile(path):
                    if not os.path.isdir(data['parent_dir']):  #!cover
                        log.out(1, ("+Building dir: %s" % data['parent_dir']))
                        os.makedirs(
                            data['parent_dir']
                        )  # Parent dir for the full filepath is supplied already.
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                return path
            else:  #!cover
                log.out(
                    0, ('\t\tError Reading Image: %s responded with code %i!' %
                        (url, r.status_code)))
                return False
    except Exception as e:
        log.out(0, ('"Newspaper" Generic handler failed. ' + (str(e).strip())))
    return False  #!cover
def get_article(url):
    config = Config()
    config.request_timeout = 1.3
    article = Article(url, language='zh', config=config)
    try:
        article.download()
        article.parse()
        # article.nlp()
    except Exception:
        return
    return article.text
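A minimal usage sketch (illustrative URL; note the deliberately short 1.3 s request timeout, so slow pages simply return None):

text = get_article("https://example.com/zh/news-item")
if text is None:
    print("download or parse failed, or the request timed out")
else:
    print(text[:200])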
Example #24
    def extract_text(self, response):

        conf = ArticleConfig()
        conf.fetch_images = False
        conf.follow_meta_refresh = False
        conf.memoize_articles = False

        article = Article(url=response.url, config=conf)
        article.download(html=response.body)
        article.parse()

        return article.text
Example #25
    def parse(self, response):

        newsItem = CNNItem()
        newsItem['title'] = uniToAscii(
            response.selector.xpath(
                '//meta[@property="og:title"]/@content').extract_first())
        newsItem['author'] = uniToAscii(
            response.selector.xpath(
                '//meta[@name="author"]/@content').extract_first())
        newsItem['link'] = response.url

        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title'])

        text = ""
        pars = response.selector.xpath(
            '//*[@class="zn-body__paragraph"]//text()').extract()
        for par in pars:
            par_text = uniToAscii(par)
            if par_text[-1:] != '.':
                par_text += '.'
            text = text + par_text + " "
        newsItem['text'] = text

        keywords = uniToAscii(
            response.xpath(
                "//meta[@name='keywords']/@content").extract_first())
        keywords_list = keywords.split(',')
        newsItem['keywords'] = [word.strip() for word in keywords_list]

        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title'], ' ', len(pars), ' ', newsItem['keywords'])
        if newsItem['text']:
            print(newsItem['text'][:100])

        yield newsItem

        config = Config()
        config.fetch_images = False
        config.memoize_articles = False

        url_build = newspaper.build(url=response.url, config=config)

        print(response.url)

        storyLinks = [article_url for article_url in url_build.article_urls()]

        print(storyLinks)

        for link in storyLinks:
            if re.match(r'.*/2017/03/0.*\.html', str(link)):
                print('Fetching from ', str(link))
                yield scrapy.Request(str(link))
Example #26
def nlp_feed(term,limiter=None):
    fresh_cache = True
    sources = set()
    config = Config()
    config.browser_user_agent = AGENT
    urls = search_fetch(term,limiter)
    print(urls)
    for url in urls[:1+EVIDENCE_BATCH_SIZE]:

        # Exclude any links (usually Associated Press ones) which are themselves fact-checks, which would be cheating...
        #unless running on closed domain, in which case 'factcheck' is in every domain...
        if limiter is None and ('factcheck' in url or 'fact-check' in url):
            continue

        # Scrape requested URLs if they aren't currently in cache.
        if url in article_cache:
            wc = article_cache[url]
            if wc != '':
                sources.add((url, wc))
                print("Cache Hit on ", url[:max(len(url), 50)], "......")
        else:
            try:
                # newspapers can't handle bbc ergo custom approach
                if 'www.bbc' in url:
                    article = requests.get(url)
                    if article.status_code != 200:
                        raise ArticleException
                    soup = BeautifulSoup(article.content, 'html.parser')
                    body = ' '.join(z.text for z in soup.findAll('p'))
                    sources.add((url, body))
                    if body not in article_cache:
                        article_cache[url] = body
                        fresh_cache = False

                else:
                    article = Article(url, config=config, language='en')
                    article.download()
                    article.parse()
                    sources.add((url, article.text))
                    if article.text not in article_cache:
                        article_cache[url] = article.text
                        fresh_cache = False

            except ArticleException:
                print("Couldn't fetch: ", url)
                article_cache[url] = ''
                fresh_cache = False

    # Only dump to disk if cache has been modified
    if not fresh_cache:
        dump_to_disk()
    return sources
Example #27
def getData(url):
    data_url = []  
    try:
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        page = Article(url, config=config)
        page.download()
        page.parse()
        data_url.append(page.text)
    except Exception:
        print('***FAILED TO DOWNLOAD***', url)
    return data_url
    def newss(self, lienn):
        config = Config()
        config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        article = Article(lienn, language="fr", config=config)
        article.download()
        article.parse()
        title = article.title
        author = article.authors
        article.nlp()
        summary = article.summary
        keywords = article.keywords

        return title, lienn, author, summary, keywords
Example #29
    def get_keywords(self, url):
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        paper = Article(url, config=config)
        try:
            paper.download()
            paper.parse()
            paper.nlp()
        except Exception:
            return []

        return paper.keywords
Example #30
def whoHasTimeToRead(url):
	is_article = valid_url(url, verbose=True)
	config = Config()
	config.MAX_KEYWORDS = 10
	if is_article:
		sumitup = {}
		b = Article(url=url,config=config)
		b.download()
		b.parse()
		b.nlp()
		sumNews = summary(b.title, b.text, b.keywords)
		sumTitle = b.title
		movies = b.movies[0] if len(b.movies) > 0 else "None"
		return sumNews,sumTitle,movies
	return "Nope"
Example #31
    def scrape(url):
        config = Config()
        config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36'
        article = Article(url['url'], config=config)
        article.download()
        article.parse()

        soup = BeautifulSoup(article.html, 'lxml')

        ga = LongformArticle()
        ga.url = url['url']
        ga.title = url['title']
        ga.link = url['url']
        ga.text = article.text
        return ga
Example #32
    def __init__(self,
                 output_dir,
                 dataframe,
                 num_rows=None,
                 user_agent=default_user_agent) -> None:
        self.output_dir = output_dir

        temp_df = dataframe
        if num_rows:
            self.df = temp_df.iloc[:num_rows]
        else:
            self.df = temp_df

        self.user_agent = user_agent
        self.config = Config()
        self.config.browser_user_agent = user_agent

        self.download_logs_dict = {
            "link": [],
            "download_state": [],
            "download_exception_msg": []
        }

        self.number_of_downloaded = 0

        self.preprocess_df()
Example #33
def extract_article_html(url):
    config = Config()
    config.keep_article_html = True
    article = Article(url, config=config)

    article.download()
    article.parse()

    article_html = article.article_html

    html = lxml.html.fromstring(article_html)
    for tag in html.xpath('//*[@class]'):
        tag.attrib.pop('class')

    return lxml.html.tostring(html).decode('utf-8')
    """
Example #34
    def __init__(self, url):
        self.url = url
        self.config = Config()
        self.config.memoize_articles = False
        #self.config.fetch_images = False
        self.config.browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.35 Safari/537.36'
        self.config.request_timeout = 30
   
        #logger.info('url is %s', url)
        self.article = Article(url, config=self.config)
        self._import_article()

        self.results = {}
        self.results['sourceurl'] = url
        try:
            self.results['title'] = self.article.title
            self.results['authors'] = self.article.authors
            self.results['canonical_link'] = self.article.canonical_link
            self.results['url'] = self.article.url
            self.results['top_image'] = self.article.top_image
            self.results['images'] = list(self.article.images)
            self.results['movies'] = self.article.movies
            self.results['description'] = self.article.meta_description
            #self.results['meta_favicon'] = self.article.meta_favicon
            self.results['keywords'] = self.article.meta_keywords
            self.results['lang'] = self.article.meta_lang
            self.results['summary'] = self.article.summary
            self.results['tags'] = list(self.article.tags)
            self.results['text'] = self.article.text
        except Exception:
            logger.fatal('Unable to make results for: %s, %s', url, self.results)
            pass
Example #35
def main(argv):
    sourcelist = []
    if len(argv) > 1:
        sourcefile = argv[1]
        try:
            with open(sourcefile,'r') as f:
                sourcelist = f.read().strip().split('\n')
        except IOError:
            print("File does not exist")

    """
    Check for existence of memo cache
    If it doesn't exist, create memo cache and populate top sources file with the specified sources.txt file. If it is not specified return an error and terminate.
    If memo cache exists, if sources.txt is specified do a check against top sources and add any new ones. If no sources.txt is specified use top sources file.
     """
    firstrun = False
    memo_cache_path = os.path.join(os.path.dirname(__file__), '.memo_cache')
    if not os.path.exists(memo_cache_path):
        if len(sourcelist) > 0:
            firstrun = True
            os.makedirs(memo_cache_path)
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                [f.write(source + '\n') for source in sourcelist]
        else:
            print("You must specify an input file on the first run")
            print("An input file contains line-separated urls to the top-level domains you wish to crawl")
            raise SystemExit
    else:
        if len(sourcelist) > 0:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                [f.write(source + '\n') for source in sourcelist]

        else:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'r') as f:
                sourcelist = f.read().split('\n')

    # this config applies to the entire crawling process
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = True
    config.fetch_images = False

    top_sources = [IntiSource(url=source,config=config) for source in sourcelist]

    if firstrun:
        build_categories(top_sources)
Example #36
    def __init__(self, url):
        c = Config()
        c.keep_article_html = True

        article = Article(url=url, config=c)
        article.download()
        article.parse()
        
        try:
            article.nlp()
            summary = article.summary
            if summary == "":
                self.summary = "Summary not available!"
            else:
                self.summary = summary
        except Exception:
            self.summary = "Summary not available!"
Example #37
    def __init__(self):
        Config.__init__(self)
        self.memoize_articles = False
        self.fetch_images = False
        self.request_timeout = 60
        self.number_threads = 20
        self.keep_article_html = True
        self.MIN_WORD_COUNT = 20
        self.verbose = True

        self.urls = {
            'category1': [
                'a_url,articles,a_regex_for_links,a_tags',
                'b_url,article,,b.tags',
                'c_url,feed,,c.tags'
            ],
        }
        self.outputDir = 'content' + os.sep + 'category'
        self.max_number = 2000
        self.max_days = 100
Example #38
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences)
        
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
def getConf(language, parser):
    '''
    Build a newspaper Config object for the given language and parser.
    :return: configured Config instance
    '''
    conf = Config()
    conf.fetch_images = False
    conf.memoize_articles = False
    conf.set_language(language)
    conf.parser_class = parser
    return conf
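A minimal usage sketch (assuming 'lxml', newspaper's default parser name, is a valid value for parser_class, and that Article is imported alongside Config):

conf = getConf('en', 'lxml')  # parser name is an assumption
art = Article('https://example.com/article', config=conf)
art.download()
art.parse()
print(art.title)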
Example #40
def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(os.path.join(sourcefile), 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source,config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)

        # Build
        s.build()

        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source = 4)
    news_pool.join()

    article_parse(sources)
Example #41
import sys

from newspaper import Article, Config
from lxml.etree import tostring

config = Config()
config.keep_article_html = True

def extract(url):
    article = Article(url=url, config=config)
    article.download()
    article.parse()
    return dict(
            title=article.title,
            text=article.text,
            html=article.html,
            image=article.top_image,
            top=article.clean_top_node,
            authors=article.authors,
            )

if __name__ == '__main__':
    if len(sys.argv) < 2 :
        print("Usage : %s url" % (sys.argv[0]))
        sys.exit(1)

    a = extract(sys.argv[1])
    print(tostring(a['top']))
__author__ = 'James'
import newspaper
from newspaper import Config, news_pool

config = Config()
config.set_language('en')
config.memoize_articles = False


reuters = newspaper.build(url='http://www.reuters.com', config=config)
indo = newspaper.build(url='http://www.independent.ie', config=config)

papers = [reuters, indo]

news_pool.set(paper_list=papers, threads_per_source=3)
news_pool.join()

for paper in papers:
    print(paper.brand + ": " + str(paper.size()) + " article(s)")
    # for article in paper.articles:
    #     print(article.title)

# print("-----------\nCATEGORIES\n-----------")
#
# for category in a.categories:
#     print(category.url)
#     b = newspaper.Source(url=category.url)
#     b.build()
#     print("\t-----------\n\tFEEDS\t\n-----------\t")
#     for feed_url in b.feed_urls():
#         print("\t->" + feed_url)
Example #43
def getFeeds(url):
    config = Config()
    config.fetch_images = False
    paper = newspaper.build(url, config=config)
    for feed_url in paper.feed_urls():  # paper.article_urls(): #
        print(feed_url)
Example #44
def main():
	import newspaper # article download utility
	from newspaper import news_pool, Config, Article, Source
	import re # regex
	import csv # csv file-formatting
	import unicodedata # string cleaning
	from datetime import datetime # time-checking for cache-updates
	import httplib # needed below to catch httplib.BadStatusLine (Python 2)

	print("Retrieving sources and update times\n...")

	# Read active list of news/media sources
	f = open("sourcelist","r")
	sources = f.read().splitlines()
	times = []

	#
	# ONGOING: update time storage and retrieval
	#		-dependent on if caching is sufficient

	papers = {} # Empty dictionary

	print("Building papers\n....\n...\n...")

	# Store total and current number of articles for progress metrics
	total_articles = 0; current_articles = 0

	# Build diction, using url name for keys ex/ 'http://cnn.com' key will be 'cnn'
	for i in range(len(sources)):
		key = re.sub(r'(^https?:\/\/|\.com\n$|\.org\n$)','',sources[i])
		papers[key] = newspaper.build(sources[i],memoize_articles=True)
		
		# Print number of articles added from "recent" list for logging purposes
		total_articles = total_articles + papers[key].size()
		print(key,papers[key].size())

	print("Downloading articles (this may take a while)\n...\n...\n...")

	config = Config()
	config.fetch_images = False
	
	# Download all articles via multi-threading
	news_pool.set([x[1] for x in papers.items()], threads_per_source=2) # Test various thread counts
	news_pool.join()

	print("Extracting text from articles and writing to dump files \n...\n...\n...")

	# Append articles to aggregate and individual csv's
	# Format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors, col(5) = text, col(6) = keywords
	with open('papers.csv','a') as outcsv:

		# Setup aggregate csv writer
		writer = csv.writer(outcsv)
		#writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])

		# Traverse sources
		for i in papers:

			# Setup single_source csv writing
			source = i
			ind_outcsv = open(str(i+".csv"),'a')
			ind_writer = csv.writer(ind_outcsv)

			# Traverse articles in source			
			for j in range(papers[i].size()):

				# Parse articles and extract features
				current_articles += 1
				print("Processing " + str(i) + " article " + str(current_articles) + " of " + str(total_articles) + " (" + str("{0:.2f}".format((current_articles/float(total_articles)*100),2))
 + " %)")

				try:
					papers[i].articles[j].parse()

					# Grab key features
					title = unicodedata.normalize('NFKD',papers[i].articles[j].title).encode('ascii','ignore')
					authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
					text = unicodedata.normalize('NFKD',papers[i].articles[j].text).encode('ascii','ignore')
					date = papers[i].articles[j].publish_date
					keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
					
					# Add new row to both single-source and aggregate files
					ind_writer.writerow([source,date,title,authors,text,keywords])
					writer.writerow([source,date,title,authors,text,keywords])
					papers[i].articles[j].nlp()

				except httplib.BadStatusLine:
					print "httplib.BadStatusLine, no dice"