Example #1
def get_configuration() -> newspaper.Config:
    """Return configuration for news site scraping."""
    conf = newspaper.Config()
    conf.memoize_articles = False
    conf.fetch_images = False
    conf.MIN_WORD_COUNT = 1
    conf.MAX_TEXT = 6 * 5000
    return conf
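
# A minimal usage sketch for the helper above; the site URL below is a
# placeholder assumption, not part of the original example.
import newspaper

paper = newspaper.build("https://example-news-site.com", config=get_configuration())
for article in paper.articles[:5]:
    article.download()
    article.parse()
    print(article.title)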
Example #2
def calculate_article_word_count(url):
    config = newspaper.Config()
    config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                                "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                "Chrome/64.0.3282.186 Safari/537.36"
    article = newspaper.Article(url, config=config)
    article.download()
    article.parse()

    if len(article.text.split()) < 200:
        # ValidationError is assumed to come from the surrounding project
        # (e.g. a web-framework validation error class).
        raise ValidationError('Could not find article')

    return len(article.text.split()) + len(article.title.split())
Example #3
def download_article(url):
    try:
        # user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
        config = newspaper.Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        config.fetch_images = False
        config.memoize_articles = False
        article = newspaper.Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as exc:
        raise Exception("Error: Parsing failed.") from exc
Example #4
def summarizeLinksToAudio(url, summary) -> str:
    """Summarize a list of urls into audio files."""
    results = list()
    result = str()
    try:
        config = newspaper.Config()
        configNews(config)
        urls = getURLS(url, summary)
        for url in urls:
            results.append(summarizeLinkToAudio(url))
    except Exception as e:
        logging.exception(e)
    finally:
        result = "".join(results)
        return result
Example #5
    def genericScraper(self):
        config = newspaper.Config()
        config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
        #config.request_timeout = 15

        if self.source not in ["washingtonpost", "usnews"]:
            # washingtonpost and usnews get funky when you set a user agent for
            # some reason (WaPo fails if the timeout isn't long, usnews throws a 403)
            a = newspaper.Article(self.url, config=config)
        else:
            a = newspaper.Article(self.url)

        try:  # make sure page download goes smoothly
            a.download()
            a.parse()
        except Exception as e:
            print("Rejected - DOWNLOAD ERROR: ", e)
            return None

        text = cleanText(a.text)
        if len(text) < 500:
            # Not much article text - the full article was likely not picked up,
            # and worst case a short article is rejected (probably not all that
            # useful in the long run).
            print("Rejected - Article text was less than 500 characters, likely bad scraping job")
            return None

        # get title, author, date and images as necessary
        if not self.title and a.title:
            self.title = a.title

        if not self.author and a.authors:
            self.author = a.authors[0]

        if not self.date and a.publish_date:
            self.date = a.publish_date.strftime("%Y-%m-%d")

        if not self.images and a.top_image:
            self.images.append(a.top_image)

        article = Article(self.title, self.author, self.date, self.url,
                          self.source, text.strip(), self.images)
        return article
Example #6
    def process_article(self, article):
        """Scrape data from news article from url

        Args:
            article ([ArticleObj]): [Article Object to scrape - article.url must not be null]

        Returns:
            [ArticleObj]
        """

        print("processing article: " + str(article.title) + " - " +
              str(article.source))

        newspaper_article = None
        try:
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            config = newspaper.Config()
            config.browser_user_agent = user_agent

            newspaper_article = newspaper.Article(article.url,
                                                  language='en',
                                                  config=config)
            newspaper_article.download()
            newspaper_article.parse()
        except Exception as e:
            # handle any exceptions; newspaper_article stays None and the
            # function falls through without returning an article
            print("Error parsing newspaper article")
            print(e)

        if newspaper_article:
            article.text = self.get_text(newspaper_article).encode(
                'utf-8').decode("utf-8")
            #lang = langdetect.detect(article.text)
            lang = 'en'
            if lang == 'en':
                article.cleaned_text = self.clean_text(article.text)

                if len(article.text.split()) > 60 and article.title is not None:
                    #article.entities = self.get_entities(article.text)
                    article.entities = []
                    #article.category = self.get_category(article.cleaned_text)
                    article.category = ""
                    if not article.img_url:
                        article.img_url = self.get_image_url(newspaper_article)

                    article.desc = ' '.join(article.text.split()[:50]) + "..."
                    return article
Example #7
def build(newspaperURL):
    logprint(
        "Fetching articles from {} ...\nThis might take some time...".format(
            newspaperURL))
    start = time.time()
    config = newspaper.Config()
    config.MIN_WORD_COUNT = 700
    config.MIN_SENT_COUNT = 40
    paper = newspaper.build(
        url=newspaperURL,
        config=config,
        memoize_articles=False,
        fetch_images=False,
    )
    end = time.time()
    logprint("Done. Fethching took {} seconds.".format(end - start))
    return paper
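
# A hedged sketch of how the source returned by build() might be consumed;
# the URL is a placeholder and the download/parse calls follow the standard
# newspaper3k Article API.
paper = build("https://www.reuters.com")
for article in paper.articles[:10]:
    try:
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        continue
    print(len(article.text.split()), article.title)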
Example #8
    def applyConfig(self):
        """ apply configuration """

        os.environ['HTTP_PROXY'] = ''
        os.environ['HTTPS_PROXY'] = ''

        try:
            newspaper_config = newspaper.Config()
            newspaper_config.memoize_articles = True
            newspaper_config.http_success_only = True
            newspaper_config.fetch_images = False
            newspaper_config.number_threads = 2
            newspaper_config.browser_user_agent = self.configData['user_agent']
            newspaper_config.request_timeout = self.configData['fetch_timeout']

            # add this to config data
            self.configData['newspaper_config'] = newspaper_config

            # set OS environment variables for proxy server:
            if len(self.configData['proxy_url_http']) > 3 and len(
                    self.configData['proxy_url_https']) > 3:

                os.environ['HTTP_PROXY'] = self.configData['proxy_url_http']

                os.environ['HTTPS_PROXY'] = self.configData['proxy_url_https']

                self.configData['proxies'] = {
                    "http": self.configData['proxy_url_http'],
                    "https": self.configData['proxy_url_https']
                }
            else:
                # not using any proxy servers; store an empty mapping so the
                # lookups below do not raise KeyError
                self.configData['proxies'] = {}

            nltk.set_proxy(self.configData['proxies'])
            self.configData['newspaper_config'].proxies = self.configData[
                'proxies']
            # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())

        except Exception as e:
            print("ERROR: Unable to set proxy parameters:", e)
Example #9
    def __init__(self, url="", keep_html=True):
        self.config = newspaper.Config()
        self.config.keep_article_html = True

        self.url = url

        # Validate URL
        try:
            self.parse_result = urllib.parse.urlparse(self.url)
            print(self.parse_result)
        except Exception as e:
            self.parse_result = False
            self.article_data = dict()
        else:
            # It's okay to Attempt a parse
            self.article_data = self.extract()
        finally:
            # Any Future Cleanup Goes Here
            self.article_data["url"] = url
Example #10
    def __init__(self, url: str, articles_limit=100, summary_sentences=5):
        """
        Initializes the information source with summaries of the articles_limit articles. Each summary will have
        summary_sentences number of sentences.
        :param url: news website URL, e.g. "https://www.bbc.co.uk/"
        :param articles_limit: limit number of articles to fetch
        :param summary_sentences: number of sentences in each summary
        """
        config = newspaper.Config()
        config.MAX_SUMMARY_SENT = summary_sentences
        config.memoize_articles = False
        config.fetch_images = False

        self.paper = newspaper.build(url, config=config)
        summaries_list = list()

        i = 0
        while len(summaries_list) < articles_limit and i < len(
                self.paper.articles):
            article = self.paper.articles[i]
            i += 1
            try:
                article.download()
                article.parse()
                article.nlp()
            except newspaper.article.ArticleException:
                continue
            if article.summary:
                summaries_list.append(article.summary.split('\n'))

        self.summaries = list()
        for summary in summaries_list:
            sentence_to_id = dict(
                zip(
                    range(InformationSource._info_id,
                          InformationSource._info_id + len(summary)), summary))
            InformationSource._info_id += len(summary)
            self.summaries.append(sentence_to_id)
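
# A hedged instantiation sketch; it assumes the surrounding InformationSource
# class (which defines the _info_id counter used above) is available. The BBC
# URL is taken from the docstring as an example.
source = InformationSource("https://www.bbc.co.uk/", articles_limit=20, summary_sentences=3)
for sentence_map in source.summaries:
    for sentence_id, sentence in sentence_map.items():
        print(sentence_id, sentence)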
Example #11
def parse_article_url(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
    config = newspaper.Config()
    config.browser_user_agent = user_agent

    print(url)

    content = {}
    article = {}
    try:
        content = newspaper.Article(url, config=config)
        content.download()
        content.parse()
    except Exception as e:
        print("Invalid url! Please try again")
        return None

    article['title'] = content.title
    article['text'] = content.text
    article['link'] = content.url
    # publish_date can be None when newspaper cannot extract a date
    article['date'] = (content.publish_date.isoformat()
                       if content.publish_date else None)

    return article
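
# A minimal usage sketch; the article URL is a placeholder assumption.
result = parse_article_url("https://example.com/some-news-story")
if result is not None:
    print(result['title'])
    print(result['date'])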
Example #12
    def applyNetworkConfig(self):
        """ Apply configuration for networking
        """
        os.environ['HTTP_PROXY'] = ''
        os.environ['HTTPS_PROXY'] = ''
        try:
            newspaper_config = newspaper.Config()
            newspaper_config.memoize_articles = False
            newspaper_config.http_success_only = True
            newspaper_config.fetch_images = False
            newspaper_config.number_threads = 2
            newspaper_config.browser_user_agent = self.user_agent
            newspaper_config.request_timeout = self.fetch_timeout
            newspaper_config.use_cached_categories = False
            # add this to config data
            self.newspaper_config = newspaper_config
            # set OS environment variables for proxy server:
            if len(self.proxy_url_http) > 3 and len(self.proxy_url_https) > 3:
                os.environ['HTTP_PROXY'] = self.proxy_url_http
                os.environ['HTTPS_PROXY'] = self.proxy_url_https
                self.proxies = {
                    "http": self.proxy_url_http,
                    "https": self.proxy_url_https
                }
            else:
                os.environ['HTTP_PROXY'] = ''
                os.environ['HTTPS_PROXY'] = ''
                self.proxy_url_http = None
                self.proxy_url_https = None
                self.proxies = {}

            nltk.set_proxy(self.proxies)
            self.newspaper_config.proxies = self.proxies
            # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
        except Exception as e:
            print("ERROR: Unable to set proxy parameters: %s", e)
Example #13
        "link": "http://www.nbcnews.com/"
    },
    "buzzfeed": {
        "rss": "http://www.buzzfeed.com/politics.xml",
        "link": "http://www.buzzfeed.com/"
    }
}

data = {}
try:
    with open(script_path + 'scraped_articles.yaml') as data_file:
        # safe_load avoids PyYAML's unsafe default / missing-Loader error
        data = yaml.safe_load(data_file) or {}
except Exception as e:
    print(e)
    print('Unable to load previous scraped articles.')

config = newspaper.Config()
config.fetch_images = False
config.memoize_articles = False

#iterate through all sources. Stop after RSS feed exhausted, or LIMIT reached
for source, value in sources.items():
    site = {
        "rss": value['rss'],
        "link": value['link'],
        "articles": [],
        "link_set": set()
    }
    count = 1
    print("Capturing articles from", source)
    d = fp.parse(value['rss'])
    if source not in data:
Example #14
def convert_tweet(twitter_dumpfn):
    try:
        with open(twitter_dumpfn, 'r') as dumpf:
            data = json.loads(dumpf.read())
        if len(data['comments'])==0:
            return

        jsonfn = '%s/%s/%s.json' % (QASRC_DIRFN, corpus_name, data['id'])
        if os.path.exists(jsonfn):
            return

        url = data['textUrl'] if 'textUrl' in data else ''

        text = ''

        if url:

            skip = False
            for blocked_url in BLOCKLIST:
                if blocked_url in url:
                    skip = True
            if skip:
               return 

            logging.debug ('%-20s: %s ... ' % (data['user'], url))

            config = newspaper.Config()
            config.browser_user_agent = random.choice(USER_AGENTS)

            article = Article(url=url, config=config)
            article.download()
            article.parse()

            text = article.text

            # print (text)

        # text

        if text:
            ds = {'info': text, 'date': data['date'], 'dlg': [data['text']]}
        else:
            ds = {'info': data['text'], 'date': data['date'], 'dlg': []}

        fav = 0
        for c in data['comments']:
            if c['favorites'] == 0:
                continue
            ds['dlg'].append(c['text'])
            fav += 1

        if (not text) and (fav == 0):
            return

        # print(repr(ds))

        with open(jsonfn, 'w') as jsonf:
            jsonf.write(json.dumps(ds))

        logging.debug ('%-20s: %s written. %s' % (data['user'], jsonfn, url[:30]))

    except newspaper.article.ArticleException as ae:
        logging.info('%-20s: %s' % (data['user'], str(ae)))

    except Exception:
        logging.exception('exception caught %s' % repr(data))
Example #15
    def newspaper_stories(words,
                          search_type='or',
                          search_level=0,
                          urls=None,
                          display=True,
                          memorize=False,
                          language='en'):
        config = newspaper.Config()
        config.memoize_articles = memorize
        config.language = language
        config.fetch_images = False
        config.request_timeout = 20
        config.MIN_WORD_COUNT = 300
        config.MIN_SENT_COUNT = 10
        if urls is None or urls == 'top_news':
            news_urls = {
                'huffington': 'http://huffingtonpost.com',
                'reuters': 'http://www.reuters.com',
                'cbs-news': 'http://www.cbsnews.com',
                'usa-today': 'http://usatoday.com',
                'cnn': 'http://cnn.com',
                'npr': 'http://www.npr.org',
                'abc-news': 'http://abcnews.com',
                'us-news': 'http://www.usnews.com',
                'msn': 'http://msn.com',
                'pbs': 'http://www.pbs.org',
                'nbc-news': 'http://www.nbcnews.com',
                'msnbc': 'http://www.msnbc.com',
                'fox': 'http://www.foxnews.com'
            }
        elif urls == 'all_us_news':
            news_urls = {
                'abc-news': 'https://abcnews.go.com',
                'al-jazeera-english': 'http://www.aljazeera.com',
                'ars-technica': 'http://arstechnica.com',
                'associated-press': 'https://apnews.com/',
                'axios': 'https://www.axios.com',
                'bleacher-report': 'http://www.bleacherreport.com',
                'bloomberg': 'http://www.bloomberg.com',
                'breitbart-news': 'http://www.breitbart.com',
                'business-insider': 'http://www.businessinsider.com',
                'buzzfeed': 'https://www.buzzfeed.com',
                'cbs-news': 'http://www.cbsnews.com',
                'cnbc': 'http://www.cnbc.com',
                'cnn': 'http://us.cnn.com',
                'crypto-coins-news': 'https://www.ccn.com',
                'engadget': 'https://www.engadget.com',
                'entertainment-weekly': 'http://www.ew.com',
                'espn': 'http://espn.go.com',
                'espn-cric-info': 'http://www.espncricinfo.com/',
                'fortune': 'http://fortune.com',
                'fox-news': 'http://www.foxnews.com',
                'fox-sports': 'http://www.foxsports.com',
                'google-news': 'https://news.google.com',
                'hacker-news': 'https://news.ycombinator.com',
                'ign': 'http://www.ign.com',
                'mashable': 'http://mashable.com',
                'medical-news-today': 'http://www.medicalnewstoday.com',
                'msnbc': 'http://www.msnbc.com',
                'mtv-news': 'http://www.mtv.com/news',
                'national-geographic': 'http://news.nationalgeographic.com',
                'national-review': 'https://www.nationalreview.com/',
                'nbc-news': 'http://www.nbcnews.com',
                'new-scientist': 'https://www.newscientist.com/section/news',
                'newsweek': 'http://www.newsweek.com',
                'new-york-magazine': 'http://nymag.com',
                'next-big-future': 'https://www.nextbigfuture.com',
                'nfl-news': 'http://www.nfl.com/news',
                'nhl-news': 'https://www.nhl.com/news',
                'politico': 'https://www.politico.com',
                'polygon': 'http://www.polygon.com',
                'recode': 'http://www.recode.net',
                'reddit-r-all': 'https://www.reddit.com/r/all',
                'reuters': 'http://www.reuters.com',
                'techcrunch': 'https://techcrunch.com',
                'techradar': 'http://www.techradar.com',
                'american-conservative':
                'http://www.theamericanconservative.com/',
                'hill': 'http://thehill.com',
                'huffington-post': 'http://www.huffingtonpost.com',
                'next-web': 'http://thenextweb.com',
                'verge': 'http://www.theverge.com',
                'wall-street-journal': 'http://www.wsj.com',
                'washington-post': 'https://www.washingtonpost.com',
                'washington-times': 'https://www.washingtontimes.com/',
                'time': 'http://time.com',
                'usa-today': 'http://www.usatoday.com/news',
                'vice-news': 'https://news.vice.com',
                'wired': 'https://www.wired.com'
            }
        elif urls == "texas_universities":
            news_urls = {
                'A&M': 'http://www.tamu.edu',
                'A&M-Commerce': 'http://www.tamuc.edu',
                'A&M-Corpus': 'http://www.tamucc.edu',
                'A&M-Kingsville': 'http://www.tamuk.edu',
                'A&M-Galveston': 'http://www.tamug.edu',
                'A&M-PrairieView': 'http://www.pvamu.edu',
                'A&M-International': 'http://www.tamiu.edu',
                'A&M-WestTexas': 'http://www.wtamu.edu',
                'Baylor': 'http://www.baylor.edu',
                'Rice': 'http://www.rice.edu',
                'SFAustin': 'http://www.sfasu.edu',
                'SMU': 'http://www.smu.edu',
                'SulRoss': 'http://www.sulross.edu',
                'TexasState': 'http://www.txstate.edu',
                'Texas_Tech': 'http://www.ttu.edu',
                'UDallas': 'http://www.udallas.edu',
                'UHouston': 'http://www.uh.edu',
                'UTexas': 'http://www.utexas.edu',
                'UT_Dallas': 'http://www.utdallas.edu',
                'UT_ElPaso': 'http://www.utep.edu',
                'UT_Houston': 'http://www.uth.edu',
                'UT_NorthTexas': 'http://www.unt.edu',
                'UT_SanAntonio': 'http://www.utsa.edu'
            }
        elif urls == 'popular':
            news_urls = {}
            agency_urls = newspaper.popular_urls()
            for i in range(len(agency_urls)):
                val = agency_urls[i]
                url = agency_urls[i].replace("http://", "")
                url = url.replace("www.", "")
                url = url.replace("blog.", "")
                url = url.replace("blogs.", "")
                url = url.replace(".com", "")
                url = url.replace(".net", "")
                url = url.replace(".au", "")
                url = url.replace(".org", "")
                url = url.replace(".co.uk", "")
                url = url.replace("the", "")
                url = url.replace(".", "-")
                url = url.replace('usa', 'usa-')
                if url == 'berkeley-edu':
                    continue
                if url == 'beta-na-leagueoflegends':
                    continue
                if url == 'bottomline-as-ucsb-edu':
                    continue
                news_urls[url] = val
        else:
            news_urls = urls

        print("\nSearch Level {:<d}:".format(search_level), end="")
        if search_level == 0:
            print(" Screening URLs for search words")
            print("   URLs must contain one or more of:", end="")
        else:
            print(" No URL Screening")
            print("   Deep Search for Articles containing: ", end="")
        i = 0
        for word in words:
            i += 1
            if i < len(words):
                if search_type == 'or':
                    print(word + " or ", end="")
                else:
                    print(word + " & ", end="")
            else:
                print(word)

        df_articles = pd.DataFrame(columns=[
            'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
        ])
        n_articles = {}
        today = str(date.today())
        for agency, url in news_urls.items():
            paper = newspaper.build(url, config=config)
            if display:
                print("\n{:>6d} Articles available from {:<s} on {:<10s}:".
                      format(paper.size(), agency.upper(), today))
            article_collection = []
            for article in paper.articles:
                url_lower = article.url.lower()
                # Exclude articles that are in a language other than en
                # or contain mostly video or pictures.
                # search_level 0 only downloads articles with at least
                # one of the key words in their URL.
                # search_level 1 downloads all articles that appear to be
                # in English and are not mainly photos or videos.
                # With either search level, a downloaded article is scanned
                # to see if it contains the search words, and compared to
                # other articles to verify that it is not a duplicate.

                # Special Filters for some Agencies
                if agency == 'cbs-news':
                    if url_lower.find('.com') >= 0:
                        # secure-fly are duplicates of http
                        if article.url.find('secure-fly') >= 0:
                            continue
                if agency == 'usa-today':
                    if url_lower.find('tunein.com') >= 0:
                        continue
                if agency == 'huffington':
                    # Ignore huffington if it's not .com
                    if url_lower.find('.com') < 0:
                        continue

                # Filter Articles that are primarily video, film or not en
                if url_lower.find('.video/')   >=0 or \
                   url_lower.find('/video')    >=0 or \
                   url_lower.find('/picture')  >=0 or \
                   url_lower.find('.pictures/')>=0 or \
                   url_lower.find('/photo')    >=0 or \
                   url_lower.find('.photos/')  >=0 or \
                   url_lower.find('espanol')   >=0 or \
                   url_lower.find('.mx/' )     >=0 or \
                   url_lower.find('/mx.' )     >=0 or \
                   url_lower.find('.fr/' )     >=0 or \
                   url_lower.find('/fr.' )     >=0 or \
                   url_lower.find('.de/' )     >=0 or \
                   url_lower.find('/de.' )     >=0 or \
                   url_lower.find('.it/' )     >=0 or \
                   url_lower.find('/it.' )     >=0 or \
                   url_lower.find('.gr/' )     >=0 or \
                   url_lower.find('/gr.' )     >=0 or \
                   url_lower.find('.se/' )     >=0 or \
                   url_lower.find('/se.' )     >=0 or \
                   url_lower.find('.es/' )     >=0 or \
                   url_lower.find('/es.' )     >=0 or \
                   url_lower.find('?button')   >=0 or \
                   url_lower.find('calendar.') >=0 or \
                   url_lower.find('calendar/') >=0 or \
                   url_lower.find('/event/')   >=0 or \
                   url_lower.find('engr.utexas') >=0 or \
                   url_lower.find('sites.smu.')  >=0:
                    continue

                # Filter if search_level == 0, URL quick search
                if search_level == 0:
                    # Verify url contains at least one of the key words
                    found_it = False
                    for word in words:
                        j = url_lower.find(word)
                        if j >= 0:
                            found_it = True
                            break
                    if found_it:
                        # Article contains words and passes filters
                        # Save this article for full review
                        article_collection.append(article.url)
                else:
                    #  No URL screening, Save for full review
                    article_collection.append(article.url)
            n_to_review = len(article_collection)
            if display:
                print("{:>6d} Selected for download".format(n_to_review))

            for article_url in article_collection:
                article = Article(article_url, config=config)
                try:
                    article.download()
                except:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    continue
                n = 0
                # Limit download failures
                stop_sec = 1  # Initial max wait time in seconds
                while n < 2:
                    try:
                        article.parse()
                        n = 99
                    except:
                        n += 1
                        # Initiate download again before new parse attempt
                        article.download()
                        # Busy-wait stop_sec seconds before the next parse attempt
                        t0 = time()
                        tlapse = 0
                        while tlapse < stop_sec:
                            tlapse = time() - t0
                        # Increase the wait time for the next exception
                        stop_sec = stop_sec + 1
                if n != 99:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    n_to_review -= 1
                    continue
                article.nlp()
                keywords = article.keywords
                title = article.title
                summary = article.summary
                text = article.text
                text_lower_case = text.lower()
                if search_type == 'or':
                    found_it = False
                    # Verify the url contains at least one of the key words
                    for word in words:
                        j = text_lower_case.find(word)
                        if j >= 0:
                            found_it = True
                            break
                else:
                    # search type 'and'
                    found_it = True
                    for word in words:
                        j = text_lower_case.find(word)
                        if j < 0:
                            found_it = False
                            break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for later full review
                    length = len(text)
                    df_story = pd.DataFrame([[
                        agency, article_url, length, keywords, title, summary,
                        text
                    ]],
                                            columns=[
                                                'agency', 'url', 'length',
                                                'keywords', 'title', 'summary',
                                                'text'
                                            ])
                    # Check for an identical story already in the file
                    # (DataFrame.append was removed in pandas 2.0; pd.concat
                    # is the modern equivalent)
                    if df_articles.shape[0] == 0:
                        df_articles = df_articles.append(df_story)
                    else:
                        # Verify this story is not already in df_articles
                        same_story = False
                        for i in range(df_articles.shape[0]):
                            if text == df_articles['text'].iloc[i]:
                                same_story = True
                                n_to_review -= 1
                                break
                        if not same_story:
                            df_articles = df_articles.append(df_story)
                else:
                    n_to_review -= 1

                print("=", end='')
            n_articles[agency] = [n_to_review, len(article_collection)]
        if display:
            print("\n\nArticles Selected by Agency:")
            for agency in news_urls:
                ratio = str(n_articles[agency][0]) + "/" + \
                        str(n_articles[agency][1])
                print("{:>10s} Articles from {:<s}".format(
                    ratio, agency.upper()))
            print("\nArticles Collected on " + today + ":",
                  df_articles.shape[0], 'from',
                  df_articles['agency'].nunique(), "Agencies.")
            print("\nSize    Agency    Title")
            print("*{:->78s}*".format("-"))
            for i in range(df_articles.shape[0]):
                k = len(df_articles['title'].iloc[i])
                if k > 63:
                    for j in range(25):
                        k = 63 - j
                        if df_articles['title'].iloc[i][k] == " ":
                            break

                    print("{:>5d} {:<10s} {:<63s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i][0:k]))
                    if len(df_articles['title'].iloc[i]) > 63:
                        print("                {:<60s}".format(
                            df_articles['title'].iloc[i][k:120]))
                else:
                    print("{:>5d} {:<10s} {:<s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i]))
                print("")
            print("*{:->78s}*".format("-"))
        return df_articles
Example #16
newslinks = []
for line in lines:
    newslinks.append(line[:-1])
#f.close()


def getdatestring(day):
    year, month, day = str(day)[:10].split("-")
    #day = day.split(" ")[0]
    return year, month, day


#links to articles
summaries = []
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = news.Config()
config.browser_user_agent = user_agent
x = 1
for link in newslinks:
    date, link = link.split(",", 1)
    print("article number " + str(x))
    a = news.Article(link, config=config)
    data = ""
    try:
        a.download()
        a.parse()
        a.nlp()
        #a.summary()
        data = a.text
    except news.article.ArticleException:
        print("something bad happened on article " + str(x) + ", for " + date)
    x += 1  # advance the article counter used in the progress messages
Example #17
import newspaper as nws  # assumed alias; the snippet below calls nws.Config()
import bs4 as bs
from datetime import date, datetime, timedelta
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common import action_chains
from selenium.common.exceptions import TimeoutException as TimeoutError_
from selenium.common.exceptions import MoveTargetOutOfBoundsException as OutOfBoundsError
import re
import json
from random import randint

configuration = nws.Config()
configuration.fetch_images = False
configuration.follow_meta_refresh = True

league_list = ["Europa League", "FA Cup", "Championship", "Champions League", "EFL Cup", "Premier League", "La Liga", "League One", "League Two",
               "Bundesliga", "Serie A", "Ligue 1"]


def datespan(start_date, end_date, delta):
    """generates daily time stamps of the format yyyy-mm-dd.

    Takes: start and end date and a time step.
    Returns: an iterable date."""
    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += delta
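
# A hedged usage sketch for datespan, relying on the date/timedelta imports
# above; the date range is a placeholder assumption.
for day in datespan(date(2021, 8, 1), date(2021, 8, 8), timedelta(days=1)):
    print(day.isoformat())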
Example #18
def init_config():
    config = newspaper.Config()
    config.fetch_images = False
    config.verbose = True

    return config
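
# A hedged usage sketch feeding this config into newspaper.build; the URL is
# a placeholder assumption.
paper = newspaper.build("https://example-news-site.com", config=init_config())
print(paper.size(), "articles discovered")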