Code Example #1
def googlenews_extract(date_range, num_pages, search_text):
    ''' Use the GoogleNews package to extract the top stories per day (10 per page) for a search string '''

    df_days = []

    # loop through date range to ensure equal sample size from each day
    #TODO: if we want to pull multiple years of data, perhaps add multi-threading...not necessary for < ~20 calls
    for date in date_range:

        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)

        # GoogleNews pages are 1-indexed, and result() accumulates across pages
        for i in range(1, num_pages + 1):

            print('Executing GoogleNews call #', i)
            googlenews.getpage(i)

        result_day = googlenews.result()
        print("Total records returned: ", len(result_day))

        df = pd.DataFrame(result_day)
        df['date_calendar'] = date
        df_days.append(df)

    # concatenate the per-day frames once, after the loop
    appended_data = pd.concat(df_days)

    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)

    return df_news
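A minimal driver for the function above, as a sketch: it assumes pandas (`pd`) and the GoogleNews package are imported as in the source file, and that dates use the mm/dd/yyyy strings the library expects; all values are illustrative.

date_range = pd.date_range('2021-01-01', '2021-01-07').strftime('%m/%d/%Y')
df_news = googlenews_extract(date_range, num_pages=3, search_text='bitcoin')
print(df_news.head())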
Code Example #2
    def scrapeTitles(self, num=0):
        '''
            Inputs: 

                num --> finds at least num titles

            Outputs: A list of raw titles

            How: Makes an API call to Google News,
                cleans the title list down to just words,
                and strips out any titles that do not contain the ticker
        '''
        found = 0
        titles = []
        end = self.end
        start = self.start

        # start can be moved back by at most 7 days
        tries = 0

        # loop until at least num unique titles have been collected
        while found < num:

            if not self.validDates() or tries > 7:
                break
        

            googlenews = GoogleNews(start=start, end=end)
            
            googlenews.search(self.ticker)

            result = googlenews.result()
            if len(result) == 0:
                break
            df = pd.DataFrame(result)

            if len(df) > 0:
                self.titleList = df['title'].tolist() + df['desc'].tolist()

            self.clean()
            self.stripTitleList()
            
            print(self.start, self.end)
            print("after stripTitleList:",
                  "Not None" if self.titleList is not None else "None")

            # guard: stripTitleList may leave titleList as None
            for t in (self.titleList or []):
                if t not in titles:
                    titles.append(t)
                    found += 1
            
            start = self.reduceDate(start, 1)
            
            tries += 1
           
        self.start = start
        self.titleList = titles[:num]
Code Example #3
def get_search_results(keyword: str):
    googlenews = GoogleNews(lang="en", period="7d", encode="utf-8")
    # call search() on its own: mixing it with get_news() would make
    # results() accumulate two different result sets in one list
    googlenews.search(keyword)
    results = googlenews.results()
    return results[:5]
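Each entry returned by results() is a dict; the keys used across the examples on this page include title, media, date, datetime, desc, link, and img. A small sketch of consuming the helper above, with an illustrative keyword:

for item in get_search_results('electric vehicles'):
    print(item['title'], '|', item['media'], '|', item['link'])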
Code Example #4
def google_new_scrape(keyword="trump", earliest_date="2000-01-01", end_date=""):
    # GoogleNews expects US-style mm/dd/yyyy date strings
    earliest_date = dt.strptime(earliest_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if end_date != "":
        end_date = dt.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        googlenews = GoogleNews(start=earliest_date, end=end_date)
    else:
        googlenews = GoogleNews(start=earliest_date)
    googlenews.search(keyword)
    # result() accumulates across pages, so fetch the pages first,
    # then read the combined list once
    for i in range(1, 11):
        googlenews.getpage(i)
    result = googlenews.result()
    print(len(result), result)
    df = pd.DataFrame(result)
    articles = []
    for ind in df.index:
        entry = {}
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        article.nlp()  # required for article.summary to be populated
        entry['Date'] = df['date'][ind]
        entry['Media'] = df['media'][ind]
        entry['Title'] = article.title
        entry['Article'] = article.text
        entry['Summary'] = article.summary
        articles.append(entry)
    news_df = pd.DataFrame(articles)
    print(news_df)
    file_name = 'googlenews.csv'
    news_df.to_csv(file_name)
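A hedged invocation sketch for the scraper above; dt is assumed to be datetime.datetime, as the strptime/strftime calls imply, and the dates are illustrative.

from datetime import datetime as dt  # the alias the function body uses

google_new_scrape(keyword='trump', earliest_date='2020-01-01', end_date='2020-03-01')
# writes googlenews.csv to the working directory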
Code Example #5
File: views.py Project: PROuserR/My-website
def index(request):
    googlenews = GoogleNews()
    googlenews.search('Shailene Woodley')
    news = googlenews.result()

    context = {'news': news}
    return render(request, 'index.html', context)
Code Example #6
def get_company_news_link(company='NaN', news_num=5, time_range='today'):

    if company == 'NaN':
        return 'please input company name'

    news_link = []
    googlenews = GoogleNews()
    googlenews.clear()

    if time_range != 'today':
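        # time_range is assumed to look like 'YYYY-MM-DD~YYYY-MM-DD';
        # the two dates are read from fixed character offsets below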
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7],
                                          time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18],
                                        time_range[19:21])
        googlenews.set_time_range(start_date, end_date)

    googlenews.search(company)
    result = googlenews.result()

    try:
        for num in range(news_num):
            news_link.append(result[num]['link'])
    except IndexError:
        if len(news_link) == 0:
            return 'No ' + company + ' news in this period, or the network is unstable'
        return news_link
    else:
        return news_link
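A usage sketch under that time_range assumption; the company name and dates are illustrative.

links = get_company_news_link('TSMC', news_num=3,
                              time_range='2021-01-01~2021-01-31')
print(links)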
Code Example #7
    def fetch_articles(self):

        # how many pages to scrape
        N_pages = 1
        links = []
        # how many days from last update
        # TODO: look for the last update datetime in the DB
        days_from_last_update = (datetime.datetime.today() -
                                 self.history_start).days
        # for each day between start date and today:
        for day in range(0, days_from_last_update + 1):
            download_date = self.history_start + datetime.timedelta(days=day)
            googlenews = GoogleNews(start=download_date.strftime("%m/%d/%Y"),
                                    end=download_date.strftime("%m/%d/%Y"))
            googlenews.search(self.ticker)
            # iterate N_pages of Google News (pages are 1-indexed,
            # and result() accumulates across pages)
            for i in range(1, N_pages + 1):
                googlenews.getpage(i)
            links = links + googlenews.result()

        # de-duplicate by URL
        links = list(set([x['link'] for x in links]))

        # for each link (without dups) get the article and its metadata
        articles = []
        for link in links:
            try:
                downloaded = self.download_and_parse_article(link)
                articles.append(downloaded)
            except Exception as e:
                print(e)

        return articles
Code Example #8
    def googleNewsCrawler(self):
        result_list = []
        googlenews = GoogleNews()

        for i in range(self.__numDays):
            startDateTime = self.__dateTime + timedelta(days=i)
            endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)

            googlenews.setTimeRange(
                start=startDateTime.strftime("%m/%d/%Y"),
                end=endDateTime.strftime("%m/%d/%Y"))
            googlenews.search(self.__keyWords)
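            # search() fetches page 1, so the loop below starts at page 2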
            for j in range(self.__pagsEveryDay - 1):
                googlenews.getpage(j + 2)
            logging.info(
                str(self.__keyWords + '__' + str(startDateTime.date()) +
                    " append " + str(int(self.__pagsEveryDay * 10)) +
                    " items"))
            result_list = result_list + googlenews.result()
            googlenews.clear()

            if (i + 1) % 10 == 0:
                self.toJson(result_list)
                result_list = []
        self.toJson(result_list)
Code Example #9
    def __init__(self, politician_name):
        """Initialize an object representing an article."""
        news = GoogleNews()
        news.setlang("uk")
        news.setencode("utf-8")
        news.setperiod("3d")
        news.search(politician_name)
        info = news.result()
        self.articles = []

        name, surname = politician_name.split()[0], politician_name.split()[1]
        self.link = f"https://www.google.com/search?q=+{name}+{surname}+новини&source=lnms&tbm=isch"

        # fetch the image-search page and pull the first few thumbnails
        html_data = requests.get(self.link).text
        soup = BeautifulSoup(html_data, "html.parser")
        image_links = []
        for item in soup.find_all("img"):
            image_links.append(item["src"])
            if len(image_links) == 6:
                break

        # image_links[0] is usually the Google logo, so offset by one;
        # guard against fewer than five search results
        for i in range(min(5, len(info))):
            text = info[i]
            info_list = [text["title"], text["link"], image_links[i + 1]]
            self.articles.append(info_list)
Code Example #10
File: news_api.py Project: CelineZhou3105/Stonk-Ponk
def get_news(ticker):
    try:
        # validate that the ticker exists before fetching news
        stock_data = stock_api.get_stock_data(ticker)
    except Exception:
        raise Exception("Stock Not Found")

    try:
        googlenews = GoogleNews(period='2d')
        googlenews.search(ticker)
        result = googlenews.result()

        news_articles = []
        for item in result:
            news_article = {}
            news_article['title'] = item['title']
            news_article['media'] = item['media']
            news_article['date'] = item['date']
            news_article['description'] = item['desc']
            news_article['link'] = item['link']
            news_article['datetime'] = item['datetime']
            news_articles.append(news_article)

        return news_articles
    except Exception:
        raise Exception("News Error")
Code Example #11
def get_admin_data(user_headline, user_img):
    admin_data = {
        'link': None,
        'headline': None,
        'content': None,
        'image': None
    }
    google_news = GoogleNews(lang='en')
    google_news.search(user_headline)
    links = google_news.get__links()
    print('No. of links found: ', len(links))
    if len(links) == 0:
        return admin_data
    elif len(links) == 1:
        link_used = links[0]
    else:
        link_used = links[1]

    admin_data['link'] = link_used
    print(link_used)
    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()
    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    # newspaper's top_image defaults to '' (not None) when no image is found
    if not article.top_image:
        admin_data['image'] = user_img
    else:
        admin_data['image'] = article.top_image

    return admin_data
Code Example #12
File: getNews.py Project: Steven-Chang1114/UniChoice
def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    # search() already fetches page 1; pull pages 2-5 (result() accumulates)
    for i in range(2, 6):
        googlenews.getpage(i)
    result = googlenews.result()
    df = pd.DataFrame(result)

    total = 0.0
    counter = 0
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            total += testimonial.sentiment.polarity
        except Exception:
            pass

    # average polarity; guard against no articles downloading successfully
    return total / counter if counter else 0.0
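A quick sketch of calling the helper above; the university name is illustrative, and the return value is an average TextBlob polarity in [-1, 1].

score = getPolarity('Stanford University')
print('average headline polarity:', round(score, 3))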
Code Example #13
 def testResultHasLink(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     result = googlenews.result()[0]
     print(result.get('link').lower())
     self.assertIn('http', result.get('link').lower())
     print('Result contains http link')
Code Example #14
 def testResultNumberAtTwoPages(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     result = googlenews.page_at(2)
     length = len(result)
     self.assertEqual(length, 10)
     print('Result length at two pages is correct')
Code Example #15
def crawling_news(company_name_list, start_date, end_date, save_file_name):
    #set logger Handler
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    #define googlenews
    googlenews = GoogleNews(lang='en',
                            start=start_date,
                            end=end_date,
                            encode='utf-8')
    #news.google.com search sample
    all_title = []
    logger.info('loop start')
    for i in range(len(company_name_list)):
        comp_name = company_name_list[i]
        googlenews.search(comp_name)
        logger.info('%s : %d%s' %
                    (comp_name,
                     ((i + 1) / len(company_name_list)) * 100, '%'))
        # cache results() instead of re-calling it for every field
        results = googlenews.results()
        for j in range(len(results)):
            temp = []
            temp.append(results[j].get('title'))
            temp.append(comp_name)
            temp.append(fixing_date(results[j].get('date')))
            all_title.append(temp)
        #clear result list
        googlenews.clear()
    all_title = pd.DataFrame(all_title)
    all_title.to_csv('%s.csv' % (save_file_name))
    logger.info('saved as %s.csv, done!!' % (save_file_name))
    return all_title
Code Example #16
File: test_search.py Project: manishhedau/GoogleNews
 def testResultHasDate(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     result = googlenews.result()[0]
     print(result.get('date').lower())
     # assertIsNot tests identity, not equality; assertNotEqual is intended
     self.assertNotEqual('', result.get('date').lower())
     print('Result date is not empty')
Code Example #17
File: testSearch.py Project: wastu01/GoogleNews
 def testResultNumberWithTwoPages(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     googlenews.getpage(2)
     length = len(googlenews.result())
     self.assertEqual(length, 20)
     print('Result length with two pages is correct')
Code Example #18
File: temp.py Project: manishmj9431/ISO-25-E1
def getNews(query):
    googleNews = GoogleNews()
    googleNews.search(query)

    news = []

    i = 0

    # cap the number of stories at six
    number = min(len(googleNews.result()), 6)

    for result in googleNews.result():
        if i >= number:
            break

        n = {}
        n["title"] = result['title']
        n["description"] = result['desc']
        n["link"] = result['link']

        if (i == 0):
            n["image"] = result['img']
        news.append(n)

        i += 1

    googleNews.clear()

    return news
Code Example #19
 def testResultHasImage(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     result = googlenews.result()[0]
     print(result.get('img').lower())
     self.assertIn('base64', result.get('img').lower())
     print('Result contains image')
Code Example #20
    def getTitles(self, ticker, start, end):
        googlenews = GoogleNews(start=start, end=end)
        googlenews.search(ticker)
        result = googlenews.result()
        df = pd.DataFrame(result)

        return df['title']
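Called on an instance of the surrounding class (a hypothetical fetcher object here), with the mm/dd/yyyy dates GoogleNews expects; the values are illustrative.

titles = fetcher.getTitles('AAPL', '01/01/2021', '01/31/2021')
print(titles.head())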
Code Example #21
 def testResultContainsKeyword(self):
     googlenews = GoogleNews()
     googlenews.search(keyword)
     result = googlenews.result()[0]
     print(result.get('desc').lower())
     self.assertIn(keyword.lower(), result.get('desc').lower())
     print('Result contains keyword')
Code Example #22
File: news.py Project: KCStinger/Covid-19_status
def get_news():
    dt_today = str(datetime.today().strftime('%m/%d/%Y'))
    dt_previous = datetime.today() - timedelta(days=5)
    dt_previous = str(dt_previous.strftime('%m/%d/%Y'))
    #print(dt_today)
    #print(dt_previous)

    googlenews = GoogleNews(start=dt_previous, end=dt_today)
    googlenews.search('Coronavirus')
    # search() already fetches page 1 and result() accumulates,
    # so fetching page 1 again would duplicate entries
    googlenews.getpage(2)
    result = googlenews.result()
    news_list = list()
    for i in result:
        if i['desc'] != '':
            dic = dict()
            dic['title'] = i['title']
            dic['source'] = i['media']
            dic['date&time'] = i['date']
            dic['desc'] = i['desc']
            dic['link'] = i['link']
            news_list.append(dic)
    return news_list
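A short sketch of consuming the returned list, using the keys built above.

for story in get_news()[:3]:
    print(story['date&time'], '-', story['title'])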
Code Example #23
def get_admin_data(user_headline, user_img, user_keywords):
    admin_data = {'link': None, 'headline': None,
                  'content': None, 'image': None}
    google_news = GoogleNews(lang='en')
    google_news.search(user_headline)
    links = google_news.get__links()
    print('No. of links found: ', len(links))
    if len(links) == 0:
        google_news = GoogleNews(lang='en')
        google_news.search(' '.join(user_keywords))
        links2 = google_news.get__links()
        if len(links2) == 0:
            return admin_data
        else:
            links = links2
    if len(links) == 1:
        link_used = links[0]
    else:
        link_used = links[1]

    admin_data['link'] = link_used
    # print(link_used)
    article = Article(link_used)
    article.download()
    article.parse()
    article.nlp()
    admin_data['headline'] = article.title
    admin_data['content'] = article.summary
    # newspaper's top_image defaults to '' (not None) when no image is found
    if article.top_image:
        admin_data['image'] = article.top_image
    print('admin link: ', admin_data['link'])
    print('admin headline: ', admin_data['headline'])
    return admin_data
Code Example #24
def news(str):
    global i
    if i == 0:
        spacek(f"of course {str}, which news do you want to listen to?")
    else:
        spacek(f"which news do you want to listen to, {str}?")

    try:
        s = takecommend().lower()
        s = s.replace('about', "")
        spacek("which page do you want to listen to?")

        s2 = int(takecommend())
        # period '2d' = results from the past two days
        googlenews = GoogleNews('en', '2d')
        googlenews.search(s)
        googlenews.getpage(s2)
        googlenews.result()
        spacek(f"{str}, here is the news about")
        spacek(s)
        print(googlenews.gettext())
        spacek(googlenews.gettext())
    except Exception:
        spacek(f"could not understand {str}, what did you say? say it again")
        i = 1
        news(str)
Code Example #25
File: autonews_api.py Project: riyakwl28/auto_news
def extract_google(query_terms, startDate, endDate):
    # defaults: start a week ago, end today, in the %Y-%m-%d form parsed below
    if len(startDate) == 0:
        startDate = (datetime.datetime.today() -
                     datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.today().strftime('%Y-%m-%d')
    # GoogleNews expects US-style mm/dd/yyyy date strings
    startDate = datetime.datetime.strptime(startDate,
                                           '%Y-%m-%d').strftime('%m/%d/%Y')
    endDate = datetime.datetime.strptime(endDate,
                                         '%Y-%m-%d').strftime('%m/%d/%Y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # here extracting news from google news
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()

        #forming the search term
        googlenews.search("India Technology " + query)

        result = googlenews.result()

        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception as e:
                print("Trouble downloading so skipping")
                continue
            content = article.text

            # summarize the content
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = (" ".join(sentences[:2]).strip())

            date = result[n]['date']
            if date.find('ago') != -1:
                # relative dates like '2 hours ago' resolve to today
                date = datetime.datetime.today().date()
            title = result[n]['title']
            #         content=result[n]['desc']
            img = result[n]['img']
            #adding the extracted info in final_articles list
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'img': img
            })
    return final_articles
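An invocation sketch, assuming the %Y-%m-%d inputs the function parses; the query terms are illustrative.

articles = extract_google(['AI', 'semiconductors'], '2021-01-01', '2021-01-08')
print(len(articles), 'articles extracted')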
Code Example #26
def get_news(text):
    googlenews = GoogleNews()
    # clear() wipes stored results, so it must run before search(), not after
    googlenews.clear()
    googlenews.search(text)
    googlenews.getpage(2)
    result = googlenews.result()

    return result
Code Example #27
File: sentiment.py Project: theoportlock/system
 def run(self):
     googlenews = GoogleNews('en', 'd')
     googlenews.search(self.term)
     headline_results = googlenews.result()
     # guard: the division by len() below would fail on an empty result set
     if not headline_results:
         return
     for i in headline_results:
         print(i["desc"])
         blob = TextBlob(i["desc"])
         self.sentiment += blob.sentiment.polarity / len(headline_results)
         self.subjectivity += blob.sentiment.subjectivity / len(
             headline_results)
Code Example #28
File: news_source_v1.py Project: dray89/news_updates
 def run_search(self):
     stocks = self.get_df()
     googlenews = GoogleNews()
     links = []
     # stocks is expected to hold a single column of search terms
     for i, j in stocks.itertuples():
         googlenews.clear()  # results otherwise accumulate across searches
         googlenews.search(j)
         results = googlenews.getlinks()
         for link in results:
             links.append(link)
     return links
Code Example #29
File: googlenews.py Project: Siddharthbadal/Python
def todaysNews(str):
    # 'd' restricts results to the past day
    googlenews = GoogleNews('en', 'd')

    # search() already fetches page 1 of results
    googlenews.search(str)

    g = googlenews.gettext()
    return g
Code Example #30
File: scrape_news.py Project: kafkoders/oclean
def scrape():

	# Instance of class GoogleNews
	googlenews = GoogleNews()
	googlenews.search("oceans+trash")

	# insert each result row into the database
	for news_item in googlenews.result():
		sql_insert(news_item)
Code Example #31
from GoogleNews import GoogleNews
from readability import Document
from TextRank import Summary
from fetch_url import fetch_url
import sys
import re

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile(r"<(.*?)>|&#13;")

article_list = []
summary_list = []

# the GoogleNews package is instance-based: search, then slice the links
googlenews = GoogleNews()
googlenews.search(query)
links = googlenews.getlinks()[:number_of_links]


if not links:
	print("No links found")

else:
	result = fetch_url.fetch_parallel(links)

	while not result.empty():
		article = Document(result.get()).summary() 
		article = re.sub(regex, "", article)
		article = article.encode('ascii','ignore')
		summary = Summary.textrank(article)
		summary = summary.encode('ascii','ignore')
		article_list.append(article)
		summary_list.append(summary)