Example #1
def get_configuration() -> newspaper.Config:
    """Return configuration for news site scraping."""
    conf = newspaper.Config()
    conf.memoize_articles = False
    conf.fetch_images = False
    conf.MIN_WORD_COUNT = 1
    conf.MAX_TEXT = 6 * 5000
    return conf
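
# A minimal usage sketch for the helper above; the site URL below is a
# placeholder assumption, not part of the original example.
import newspaper

paper = newspaper.build("https://example-news-site.com", config=get_configuration())
for article in paper.articles[:5]:
    article.download()
    article.parse()
    print(article.title)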
Example #2
def calculate_article_word_count(url):
    config = newspaper.Config()
    config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                                "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                "Chrome/64.0.3282.186 Safari/537.36"
    article = newspaper.Article(url, config=config)
    article.download()
    article.parse()

    if len(article.text.split()) < 200:
        # ValidationError is assumed to come from the surrounding project
        # (e.g. a web-framework validation error class).
        raise ValidationError('Could not find article')

    return len(article.text.split()) + len(article.title.split())
Example #3
def download_article(url):
    try:
        # user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
        config = newspaper.Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        config.fetch_images = False
        config.memoize_articles = False
        article = newspaper.Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as exc:
        raise Exception("Error: Parsing failed.") from exc
Example #4
def summarizeLinksToAudio(url, summary) -> str:
    """Summarize a list of urls into audio files."""
    results = list()
    result = str()
    try:
        config = newspaper.Config()
        configNews(config)
        urls = getURLS(url, summary)
        for url in urls:
            results.append(summarizeLinkToAudio(url))
    except Exception as e:
        logging.exception(e)
    finally:
        result = "".join(results)
        return result
Example #5
    def genericScraper(self):
        config = newspaper.Config()
        config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
        #config.request_timeout = 15

        if self.source not in ["washingtonpost", "usnews"]:
            # washingtonpost and usnews get funky when you set a user agent for
            # some reason (WaPo fails if the timeout isn't long, usnews throws a 403)
            a = newspaper.Article(self.url, config=config)
        else:
            a = newspaper.Article(self.url)

        try:  # make sure page download goes smoothly
            a.download()
            a.parse()
        except Exception as e:
            print("Rejected - DOWNLOAD ERROR: ", e)
            return None

        text = cleanText(a.text)
        if len(text) < 500:
            # Not much article text - the full article was likely not picked up,
            # and worst case a short article is rejected (probably not all that
            # useful in the long run).
            print("Rejected - Article text was less than 500 characters, likely bad scraping job")
            return None

        # get title, author, date and images as necessary
        if not self.title and a.title:
            self.title = a.title

        if not self.author and a.authors:
            self.author = a.authors[0]

        if not self.date and a.publish_date:
            self.date = a.publish_date.strftime("%Y-%m-%d")

        if not self.images and a.top_image:
            self.images.append(a.top_image)

        article = Article(self.title, self.author, self.date, self.url,
                          self.source, text.strip(), self.images)
        return article
Example #6
    def process_article(self, article):
        """Scrape data from news article from url

        Args:
            article ([ArticleObj]): [Article Object to scrape - article.url must not be null]

        Returns:
            [ArticleObj]
        """

        print("processing article: " + str(article.title) + " - " +
              str(article.source))

        newspaper_article = None
        try:
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            config = newspaper.Config()
            config.browser_user_agent = user_agent

            newspaper_article = newspaper.Article(article.url,
                                                  language='en',
                                                  config=config)
            newspaper_article.download()
            newspaper_article.parse()
        except Exception as e:
            # handle any exceptions; newspaper_article stays None and the
            # function falls through without returning an article
            print("Error parsing newspaper article")
            print(e)

        if newspaper_article:
            article.text = self.get_text(newspaper_article).encode(
                'utf-8').decode("utf-8")
            #lang = langdetect.detect(article.text)
            lang = 'en'
            if lang == 'en':
                article.cleaned_text = self.clean_text(article.text)

                if len(article.text.split()) > 60 and article.title is not None:
                    #article.entities = self.get_entities(article.text)
                    article.entities = []
                    #article.category = self.get_category(article.cleaned_text)
                    article.category = ""
                    if not article.img_url:
                        article.img_url = self.get_image_url(newspaper_article)

                    article.desc = ' '.join(article.text.split()[:50]) + "..."
                    return article
Example #7
def build(newspaperURL):
    logprint(
        "Fetching articles from {} ...\nThis might take some time...".format(
            newspaperURL))
    start = time.time()
    config = newspaper.Config()
    config.MIN_WORD_COUNT = 700
    config.MIN_SENT_COUNT = 40
    paper = newspaper.build(
        url=newspaperURL,
        config=config,
        memoize_articles=False,
        fetch_images=False,
    )
    end = time.time()
    logprint("Done. Fethching took {} seconds.".format(end - start))
    return paper
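
# A hedged sketch of how the source returned by build() might be consumed;
# the URL is a placeholder and the download/parse calls follow the standard
# newspaper3k Article API.
paper = build("https://www.reuters.com")
for article in paper.articles[:10]:
    try:
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        continue
    print(len(article.text.split()), article.title)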
Example #8
    def applyConfig(self):
        """ apply configuration """

        os.environ['HTTP_PROXY'] = ''
        os.environ['HTTPS_PROXY'] = ''

        try:
            newspaper_config = newspaper.Config()
            newspaper_config.memoize_articles = True
            newspaper_config.http_success_only = True
            newspaper_config.fetch_images = False
            newspaper_config.number_threads = 2
            newspaper_config.browser_user_agent = self.configData['user_agent']
            newspaper_config.request_timeout = self.configData['fetch_timeout']

            # add this to config data
            self.configData['newspaper_config'] = newspaper_config

            # set OS environment variables for proxy server:
            if len(self.configData['proxy_url_http']) > 3 and len(
                    self.configData['proxy_url_https']) > 3:

                os.environ['HTTP_PROXY'] = self.configData['proxy_url_http']

                os.environ['HTTPS_PROXY'] = self.configData['proxy_url_https']

                self.configData['proxies'] = {
                    "http": self.configData['proxy_url_http'],
                    "https": self.configData['proxy_url_https']
                }
            else:
                # not using any proxy servers; store an empty mapping so the
                # lookups below do not raise KeyError
                self.configData['proxies'] = {}

            nltk.set_proxy(self.configData['proxies'])
            self.configData['newspaper_config'].proxies = self.configData[
                'proxies']
            # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())

        except Exception as e:
            print("ERROR: Unable to set proxy parameters:", e)
Example #9
    def __init__(self, url="", keep_html=True):
        self.config = newspaper.Config()
        self.config.keep_article_html = True

        self.url = url

        # Validate URL
        try:
            self.parse_result = urllib.parse.urlparse(self.url)
            print(self.parse_result)
        except Exception as e:
            self.parse_result = False
            self.article_data = dict()
        else:
            # It's okay to Attempt a parse
            self.article_data = self.extract()
        finally:
            # Any Future Cleanup Goes Here
            self.article_data["url"] = url
Example #10
    def __init__(self, url: str, articles_limit=100, summary_sentences=5):
        """
        Initializes the information source with summaries of the articles_limit articles. Each summary will have
        summary_sentences number of sentences.
        :param url: news website URL, e.g. "https://www.bbc.co.uk/"
        :param articles_limit: limit number of articles to fetch
        :param summary_sentences: number of sentences in each summary
        """
        config = newspaper.Config()
        config.MAX_SUMMARY_SENT = summary_sentences
        config.memoize_articles = False
        config.fetch_images = False

        self.paper = newspaper.build(url, config=config)
        summaries_list = list()

        i = 0
        while len(summaries_list) < articles_limit and i < len(
                self.paper.articles):
            article = self.paper.articles[i]
            i += 1
            try:
                article.download()
                article.parse()
                article.nlp()
            except newspaper.article.ArticleException:
                continue
            if article.summary:
                summaries_list.append(article.summary.split('\n'))

        self.summaries = list()
        for summary in summaries_list:
            sentence_to_id = dict(
                zip(
                    range(InformationSource._info_id,
                          InformationSource._info_id + len(summary)), summary))
            InformationSource._info_id += len(summary)
            self.summaries.append(sentence_to_id)
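
# A hedged instantiation sketch; it assumes the surrounding InformationSource
# class (which defines the _info_id counter used above) is available. The BBC
# URL is taken from the docstring as an example.
source = InformationSource("https://www.bbc.co.uk/", articles_limit=20, summary_sentences=3)
for sentence_map in source.summaries:
    for sentence_id, sentence in sentence_map.items():
        print(sentence_id, sentence)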
Example #11
def parse_article_url(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
    config = newspaper.Config()
    config.browser_user_agent = user_agent

    print(url)

    content = {}
    article = {}
    try:
        content = newspaper.Article(url, config=config)
        content.download()
        content.parse()
    except Exception as e:
        print("Invalid url! Please try again")
        return None

    article['title'] = content.title
    article['text'] = content.text
    article['link'] = content.url
    # publish_date can be None when newspaper cannot extract a date
    article['date'] = (content.publish_date.isoformat()
                       if content.publish_date else None)

    return article
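
# A minimal usage sketch; the article URL is a placeholder assumption.
result = parse_article_url("https://example.com/some-news-story")
if result is not None:
    print(result['title'])
    print(result['date'])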
Example #12
    def applyNetworkConfig(self):
        """ Apply configuration for networking
        """
        os.environ['HTTP_PROXY'] = ''
        os.environ['HTTPS_PROXY'] = ''
        try:
            newspaper_config = newspaper.Config()
            newspaper_config.memoize_articles = False
            newspaper_config.http_success_only = True
            newspaper_config.fetch_images = False
            newspaper_config.number_threads = 2
            newspaper_config.browser_user_agent = self.user_agent
            newspaper_config.request_timeout = self.fetch_timeout
            newspaper_config.use_cached_categories = False
            # add this to config data
            self.newspaper_config = newspaper_config
            # set OS environment variables for proxy server:
            if len(self.proxy_url_http) > 3 and len(self.proxy_url_https) > 3:
                os.environ['HTTP_PROXY'] = self.proxy_url_http
                os.environ['HTTPS_PROXY'] = self.proxy_url_https
                self.proxies = {
                    "http": self.proxy_url_http,
                    "https": self.proxy_url_https
                }
            else:
                os.environ['HTTP_PROXY'] = ''
                os.environ['HTTPS_PROXY'] = ''
                self.proxy_url_http = None
                self.proxy_url_https = None
                self.proxies = {}

            nltk.set_proxy(self.proxies)
            self.newspaper_config.proxies = self.proxies
            # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
        except Exception as e:
            print("ERROR: Unable to set proxy parameters: %s", e)
Example #13
        "link": "http://www.nbcnews.com/"
    },
    "buzzfeed": {
        "rss": "http://www.buzzfeed.com/politics.xml",
        "link": "http://www.buzzfeed.com/"
    }
}

data = {}
try:
    with open(script_path + 'scraped_articles.yaml') as data_file:
        # safe_load avoids PyYAML's unsafe default / missing-Loader error
        data = yaml.safe_load(data_file) or {}
except Exception as e:
    print(e)
    print('Unable to load previous scraped articles.')

config = newspaper.Config()
config.fetch_images = False
config.memoize_articles = False

#iterate through all sources. Stop after RSS feed exhausted, or LIMIT reached
for source, value in sources.items():
    site = {
        "rss": value['rss'],
        "link": value['link'],
        "articles": [],
        "link_set": set()
    }
    count = 1
    print("Capturing articles from", source)
    d = fp.parse(value['rss'])
    if source not in data:
Example #14
def convert_tweet(twitter_dumpfn):
    try:
        with open(twitter_dumpfn, 'r') as dumpf:
            data = json.loads(dumpf.read())
        if len(data['comments'])==0:
            return

        jsonfn = '%s/%s/%s.json' % (QASRC_DIRFN, corpus_name, data['id'])
        if os.path.exists(jsonfn):
            return

        url = data['textUrl'] if 'textUrl' in data else ''

        text = ''

        if url:

            skip = False
            for blocked_url in BLOCKLIST:
                if blocked_url in url:
                    skip = True
            if skip:
               return 

            logging.debug ('%-20s: %s ... ' % (data['user'], url))

            config = newspaper.Config()
            config.browser_user_agent = random.choice(USER_AGENTS)

            article = Article(url=url, config=config)
            article.download()
            article.parse()

            text = article.text

            # print (text)

        # text

        if text:
            ds = {'info': text, 'date': data['date'], 'dlg': [data['text']]}
        else:
            ds = {'info': data['text'], 'date': data['date'], 'dlg': []}

        fav = 0
        for c in data['comments']:
            if c['favorites'] == 0:
                continue
            ds['dlg'].append(c['text'])
            fav += 1

        if (not text) and (fav == 0):
            return

        # print(repr(ds))

        with open(jsonfn, 'w') as jsonf:
            jsonf.write(json.dumps(ds))

        logging.debug ('%-20s: %s written. %s' % (data['user'], jsonfn, url[:30]))

    except newspaper.article.ArticleException as ae:
        logging.info('%-20s: %s' % (data['user'], str(ae)))

    except Exception:
        logging.exception('exception caught %s' % repr(data))
Example #15
    def newspaper_stories(words,
                          search_type='or',
                          search_level=0,
                          urls=None,
                          display=True,
                          memorize=False,
                          language='en'):
        config = newspaper.Config()
        config.memoize_articles = memorize
        config.language = language
        config.fetch_images = False
        config.request_timeout = 20
        config.MIN_WORD_COUNT = 300
        config.MIN_SENT_COUNT = 10
        if urls is None or urls == 'top_news':
            news_urls = {
                'huffington': 'http://huffingtonpost.com',
                'reuters': 'http://www.reuters.com',
                'cbs-news': 'http://www.cbsnews.com',
                'usa-today': 'http://usatoday.com',
                'cnn': 'http://cnn.com',
                'npr': 'http://www.npr.org',
                'abc-news': 'http://abcnews.com',
                'us-news': 'http://www.usnews.com',
                'msn': 'http://msn.com',
                'pbs': 'http://www.pbs.org',
                'nbc-news': 'http://www.nbcnews.com',
                'msnbc': 'http://www.msnbc.com',
                'fox': 'http://www.foxnews.com'
            }
        elif urls == 'all_us_news':
            news_urls = {
                'abc-news': 'https://abcnews.go.com',
                'al-jazeera-english': 'http://www.aljazeera.com',
                'ars-technica': 'http://arstechnica.com',
                'associated-press': 'https://apnews.com/',
                'axios': 'https://www.axios.com',
                'bleacher-report': 'http://www.bleacherreport.com',
                'bloomberg': 'http://www.bloomberg.com',
                'breitbart-news': 'http://www.breitbart.com',
                'business-insider': 'http://www.businessinsider.com',
                'buzzfeed': 'https://www.buzzfeed.com',
                'cbs-news': 'http://www.cbsnews.com',
                'cnbc': 'http://www.cnbc.com',
                'cnn': 'http://us.cnn.com',
                'crypto-coins-news': 'https://www.ccn.com',
                'engadget': 'https://www.engadget.com',
                'entertainment-weekly': 'http://www.ew.com',
                'espn': 'http://espn.go.com',
                'espn-cric-info': 'http://www.espncricinfo.com/',
                'fortune': 'http://fortune.com',
                'fox-news': 'http://www.foxnews.com',
                'fox-sports': 'http://www.foxsports.com',
                'google-news': 'https://news.google.com',
                'hacker-news': 'https://news.ycombinator.com',
                'ign': 'http://www.ign.com',
                'mashable': 'http://mashable.com',
                'medical-news-today': 'http://www.medicalnewstoday.com',
                'msnbc': 'http://www.msnbc.com',
                'mtv-news': 'http://www.mtv.com/news',
                'national-geographic': 'http://news.nationalgeographic.com',
                'national-review': 'https://www.nationalreview.com/',
                'nbc-news': 'http://www.nbcnews.com',
                'new-scientist': 'https://www.newscientist.com/section/news',
                'newsweek': 'http://www.newsweek.com',
                'new-york-magazine': 'http://nymag.com',
                'next-big-future': 'https://www.nextbigfuture.com',
                'nfl-news': 'http://www.nfl.com/news',
                'nhl-news': 'https://www.nhl.com/news',
                'politico': 'https://www.politico.com',
                'polygon': 'http://www.polygon.com',
                'recode': 'http://www.recode.net',
                'reddit-r-all': 'https://www.reddit.com/r/all',
                'reuters': 'http://www.reuters.com',
                'techcrunch': 'https://techcrunch.com',
                'techradar': 'http://www.techradar.com',
                'american-conservative':
                'http://www.theamericanconservative.com/',
                'hill': 'http://thehill.com',
                'huffington-post': 'http://www.huffingtonpost.com',
                'next-web': 'http://thenextweb.com',
                'verge': 'http://www.theverge.com',
                'wall-street-journal': 'http://www.wsj.com',
                'washington-post': 'https://www.washingtonpost.com',
                'washington-times': 'https://www.washingtontimes.com/',
                'time': 'http://time.com',
                'usa-today': 'http://www.usatoday.com/news',
                'vice-news': 'https://news.vice.com',
                'wired': 'https://www.wired.com'
            }
        elif urls == "texas_universities":
            news_urls = {
                'A&M': 'http://www.tamu.edu',
                'A&M-Commerce': 'http://www.tamuc.edu',
                'A&M-Corpus': 'http://www.tamucc.edu',
                'A&M-Kingsville': 'http://www.tamuk.edu',
                'A&M-Galveston': 'http://www.tamug.edu',
                'A&M-PrairieView': 'http://www.pvamu.edu',
                'A&M-International': 'http://www.tamiu.edu',
                'A&M-WestTexas': 'http://www.wtamu.edu',
                'Baylor': 'http://www.baylor.edu',
                'Rice': 'http://www.rice.edu',
                'SFAustin': 'http://www.sfasu.edu',
                'SMU': 'http://www.smu.edu',
                'SulRoss': 'http://www.sulross.edu',
                'TexasState': 'http://www.txstate.edu',
                'Texas_Tech': 'http://www.ttu.edu',
                'UDallas': 'http://www.udallas.edu',
                'UHouston': 'http://www.uh.edu',
                'UTexas': 'http://www.utexas.edu',
                'UT_Dallas': 'http://www.utdallas.edu',
                'UT_ElPaso': 'http://www.utep.edu',
                'UT_Houston': 'http://www.uth.edu',
                'UT_NorthTexas': 'http://www.unt.edu',
                'UT_SanAntonio': 'http://www.utsa.edu'
            }
        elif urls == 'popular':
            news_urls = {}
            agency_urls = newspaper.popular_urls()
            for i in range(len(agency_urls)):
                val = agency_urls[i]
                url = agency_urls[i].replace("http://", "")
                url = url.replace("www.", "")
                url = url.replace("blog.", "")
                url = url.replace("blogs.", "")
                url = url.replace(".com", "")
                url = url.replace(".net", "")
                url = url.replace(".au", "")
                url = url.replace(".org", "")
                url = url.replace(".co.uk", "")
                url = url.replace("the", "")
                url = url.replace(".", "-")
                url = url.replace('usa', 'usa-')
                if url == 'berkeley-edu':
                    continue
                if url == 'beta-na-leagueoflegends':
                    continue
                if url == 'bottomline-as-ucsb-edu':
                    continue
                news_urls[url] = val
        else:
            news_urls = urls

        print("\nSearch Level {:<d}:".format(search_level), end="")
        if search_level == 0:
            print(" Screening URLs for search words")
            print("   URLs must contain one or more of:", end="")
        else:
            print(" No URL Screening")
            print("   Deep Search for Articles containing: ", end="")
        i = 0
        for word in words:
            i += 1
            if i < len(words):
                if search_type == 'or':
                    print(word + " or ", end="")
                else:
                    print(word + " & ", end="")
            else:
                print(word)

        df_articles = pd.DataFrame(columns=[
            'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
        ])
        n_articles = {}
        today = str(date.today())
        for agency, url in news_urls.items():
            paper = newspaper.build(url, config=config)
            if display:
                print("\n{:>6d} Articles available from {:<s} on {:<10s}:".
                      format(paper.size(), agency.upper(), today))
            article_collection = []
            for article in paper.articles:
                url_lower = article.url.lower()
                # Exclude articles that are in a language other than en
                # or contain mostly video or pictures.
                # search_level 0 only downloads articles with at least
                # one of the key words in their URL.
                # search_level 1 downloads all articles that appear to be
                # in English and are not mainly photos or videos.
                # With either search level, a downloaded article is scanned
                # to see if it contains the search words, and compared to
                # other articles to verify that it is not a duplicate.

                # Special Filters for some Agencies
                if agency == 'cbs-news':
                    if url_lower.find('.com') >= 0:
                        # secure-fly are duplicates of http
                        if article.url.find('secure-fly') >= 0:
                            continue
                if agency == 'usa-today':
                    if url_lower.find('tunein.com') >= 0:
                        continue
                if agency == 'huffington':
                    # Ignore huffington if it's not .com
                    if url_lower.find('.com') < 0:
                        continue

                # Filter Articles that are primarily video, film or not en
                if url_lower.find('.video/')   >=0 or \
                   url_lower.find('/video')    >=0 or \
                   url_lower.find('/picture')  >=0 or \
                   url_lower.find('.pictures/')>=0 or \
                   url_lower.find('/photo')    >=0 or \
                   url_lower.find('.photos/')  >=0 or \
                   url_lower.find('espanol')   >=0 or \
                   url_lower.find('.mx/' )     >=0 or \
                   url_lower.find('/mx.' )     >=0 or \
                   url_lower.find('.fr/' )     >=0 or \
                   url_lower.find('/fr.' )     >=0 or \
                   url_lower.find('.de/' )     >=0 or \
                   url_lower.find('/de.' )     >=0 or \
                   url_lower.find('.it/' )     >=0 or \
                   url_lower.find('/it.' )     >=0 or \
                   url_lower.find('.gr/' )     >=0 or \
                   url_lower.find('/gr.' )     >=0 or \
                   url_lower.find('.se/' )     >=0 or \
                   url_lower.find('/se.' )     >=0 or \
                   url_lower.find('.es/' )     >=0 or \
                   url_lower.find('/es.' )     >=0 or \
                   url_lower.find('?button')   >=0 or \
                   url_lower.find('calendar.') >=0 or \
                   url_lower.find('calendar/') >=0 or \
                   url_lower.find('/event/')   >=0 or \
                   url_lower.find('engr.utexas') >=0 or \
                   url_lower.find('sites.smu.')  >=0:
                    continue

                # Filter if search_level == 0, URL quick search
                if search_level == 0:
                    # Verify url contains at least one of the key words
                    found_it = False
                    for word in words:
                        j = url_lower.find(word)
                        if j >= 0:
                            found_it = True
                            break
                    if found_it:
                        # Article contains words and passes filters
                        # Save this article for full review
                        article_collection.append(article.url)
                else:
                    #  No URL screening, Save for full review
                    article_collection.append(article.url)
            n_to_review = len(article_collection)
            if display:
                print("{:>6d} Selected for download".format(n_to_review))

            for article_url in article_collection:
                article = Article(article_url, config=config)
                try:
                    article.download()
                except:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    continue
                n = 0
                # Limit download failures
                stop_sec = 1  # Initial max wait time in seconds
                while n < 2:
                    try:
                        article.parse()
                        n = 99
                    except:
                        n += 1
                        # Initiate download again before new parse attempt
                        article.download()
                        # Busy-wait stop_sec seconds before the next parse attempt
                        t0 = time()
                        tlapse = 0
                        while tlapse < stop_sec:
                            tlapse = time() - t0
                        # Increase the wait time for the next exception
                        stop_sec = stop_sec + 1
                if n != 99:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    n_to_review -= 1
                    continue
                article.nlp()
                keywords = article.keywords
                title = article.title
                summary = article.summary
                text = article.text
                text_lower_case = text.lower()
                if search_type == 'or':
                    found_it = False
                    # Verify the url contains at least one of the key words
                    for word in words:
                        j = text_lower_case.find(word)
                        if j >= 0:
                            found_it = True
                            break
                else:
                    # search type 'and'
                    found_it = True
                    for word in words:
                        j = text_lower_case.find(word)
                        if j < 0:
                            found_it = False
                            break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for later full review
                    length = len(text)
                    df_story = pd.DataFrame([[
                        agency, article_url, length, keywords, title, summary,
                        text
                    ]],
                                            columns=[
                                                'agency', 'url', 'length',
                                                'keywords', 'title', 'summary',
                                                'text'
                                            ])
                    # Check for an identical story already in the file
                    # (DataFrame.append was removed in pandas 2.0; pd.concat
                    # is the modern equivalent)
                    if df_articles.shape[0] == 0:
                        df_articles = df_articles.append(df_story)
                    else:
                        # Verify this story is not already in df_articles
                        same_story = False
                        for i in range(df_articles.shape[0]):
                            if text == df_articles['text'].iloc[i]:
                                same_story = True
                                n_to_review -= 1
                                break
                        if not same_story:
                            df_articles = df_articles.append(df_story)
                else:
                    n_to_review -= 1

                print("=", end='')
            n_articles[agency] = [n_to_review, len(article_collection)]
        if display:
            print("\n\nArticles Selected by Agency:")
            for agency in news_urls:
                ratio = str(n_articles[agency][0]) + "/" + \
                        str(n_articles[agency][1])
                print("{:>10s} Articles from {:<s}".format(
                    ratio, agency.upper()))
            print("\nArticles Collected on " + today + ":",
                  df_articles.shape[0], 'from',
                  df_articles['agency'].nunique(), "Agencies.")
            print("\nSize    Agency    Title")
            print("*{:->78s}*".format("-"))
            for i in range(df_articles.shape[0]):
                k = len(df_articles['title'].iloc[i])
                if k > 63:
                    for j in range(25):
                        k = 63 - j
                        if df_articles['title'].iloc[i][k] == " ":
                            break

                    print("{:>5d} {:<10s} {:<63s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i][0:k]))
                    if len(df_articles['title'].iloc[i]) > 63:
                        print("                {:<60s}".format(
                            df_articles['title'].iloc[i][k:120]))
                else:
                    print("{:>5d} {:<10s} {:<s}".format(
                        df_articles['length'].iloc[i],
                        df_articles['agency'].iloc[i],
                        df_articles['title'].iloc[i]))
                print("")
            print("*{:->78s}*".format("-"))
        return df_articles
Example #16
newslinks = []
for line in lines:
    newslinks.append(line[:-1])
#f.close()


def getdatestring(day):
    year, month, day = str(day)[:10].split("-")
    #day = day.split(" ")[0]
    return year, month, day


#links to articles
summaries = []
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = news.Config()
config.browser_user_agent = user_agent
x = 1
for link in newslinks:
    date, link = link.split(",", 1)
    print("article number " + str(x))
    a = news.Article(link, config=config)
    data = ""
    try:
        a.download()
        a.parse()
        a.nlp()
        #a.summary()
        data = a.text
    except news.article.ArticleException:
        print("something bad happened on article " + str(x) + ", for " + date)
    x += 1  # advance the article counter used in the progress messages
Example #17
import newspaper as nws  # assumed alias; the snippet below calls nws.Config()
import bs4 as bs
from datetime import date, datetime, timedelta
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common import action_chains
from selenium.common.exceptions import TimeoutException as TimeoutError_
from selenium.common.exceptions import MoveTargetOutOfBoundsException as OutOfBoundsError
import re
import json
from random import randint

configuration = nws.Config()
configuration.fetch_images = False
configuration.follow_meta_refresh = True

league_list = ["Europa League", "FA Cup", "Championship", "Champions League", "EFL Cup", "Premier League", "La Liga", "League One", "League Two",
               "Bundesliga", "Serie A", "Ligue 1"]


def datespan(start_date, end_date, delta):
    """generates daily time stamps of the format yyyy-mm-dd.

    Takes: start and end date and a time step.
    Returns: an iterable date."""
    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += delta
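
# A hedged usage sketch for datespan, relying on the date/timedelta imports
# above; the date range is a placeholder assumption.
for day in datespan(date(2021, 8, 1), date(2021, 8, 8), timedelta(days=1)):
    print(day.isoformat())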
Example #18
def init_config():
    config = newspaper.Config()
    config.fetch_images = False
    config.verbose = True

    return config
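
# A hedged usage sketch feeding this config into newspaper.build; the URL is
# a placeholder assumption.
paper = newspaper.build("https://example-news-site.com", config=init_config())
print(paper.size(), "articles discovered")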