Code example #1
from datetime import datetime as dt

import pandas as pd
from GoogleNews import GoogleNews
from newspaper import Article


def google_new_scrape(keyword="trump", earliest_date="2000-01-01", end_date=""):
    # GoogleNews expects MM/DD/YYYY date strings, so convert from ISO format
    earliest_date = dt.strptime(earliest_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if end_date != "":
        end_date = dt.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        googlenews = GoogleNews(start=earliest_date, end=end_date)
    else:
        googlenews = GoogleNews(start=earliest_date)
    googlenews.search(keyword)
    # result() accumulates across pages, so fetch all pages first and
    # build the DataFrame once afterwards
    for i in range(1, 1000):
        googlenews.getpage(i)
    result = googlenews.result()
    print(len(result), result)
    df = pd.DataFrame(result)
    rows = []
    for ind in df.index:
        article = Article(df['link'][ind])
        article.download()
        article.parse()
        article.nlp()  # required to populate article.summary
        rows.append({
            'Date': df['date'][ind],
            'Media': df['media'][ind],
            'Title': article.title,
            'Article': article.text,
            'Summary': article.summary,
        })
    news_df = pd.DataFrame(rows)
    print(news_df)
    news_df.to_csv('googlenews.csv')
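A minimal invocation sketch for the function above; the keyword and date values are hypothetical, and the ISO dates are converted internally to the MM/DD/YYYY strings the GoogleNews package expects:

# hypothetical call: scrape results between two dates and write googlenews.csv
google_new_scrape(keyword="trump", earliest_date="2020-01-01", end_date="2020-06-30")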
Code example #2
def news(topic):
    global i
    if i == 0:
        spacek(f"of course {topic}, which news do you want to listen to?")
    else:
        spacek(f"which news do you want to listen to, {topic}?")

    try:
        s = takecommend().lower()
        s = s.replace('about', "")
        spacek("which page do you want to listen to?")

        s2 = int(takecommend())
        # a single instance is enough; the original created one and then
        # immediately replaced it with GoogleNews('en', "2")
        googlenews = GoogleNews('en')
        googlenews.search(s)
        googlenews.getpage(s2)
        googlenews.result()
        spacek(f"{topic}, here is the news about")
        spacek(s)
        headlines = googlenews.gettext()
        print(headlines)
        spacek(headlines)
    except Exception:
        spacek(f"could not understand {topic}, what did you say? say it again")
        i = 1
        news(topic)
Code example #3
def googlenews_extract(date_range, num_pages, search_text):
    ''' Use googlenews package to extract top 30 stories per day based on search string '''

    df_days = []

    # loop through date range to ensure equal sample size from each day
    #TODO: if we want to pull multiple years of data, perhaps add multi-threading...not necessary for < ~20 calls
    for date in date_range:

        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)

        for i in range(0, num_pages):

            print('Executing GoogleNews call #', i + 1)

            googlenews.getpage(i)
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))

        # result() accumulates across pages, so build the day's frame once
        df = pd.DataFrame(googlenews.result())
        df['date_calendar'] = date
        df_days.append(df)

    # combine all days, then drop the library's own 'date' column
    df_news = pd.concat(df_days).reset_index(drop=True).drop(['date'], axis=1)

    return df_news
Code example #4
File: getNews.py  Project: Steven-Chang1114/UniChoice
def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)

    # result() accumulates, so one call after the page loop returns everything
    for i in range(0, 5):
        googlenews.getpage(i)
    result = googlenews.result()
    df = pd.DataFrame(result)
    polarity_sum = 0
    counter = 0
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            polarity_sum += testimonial.sentiment.polarity
        except Exception:
            pass

    # guard against the case where no article could be parsed
    return polarity_sum / counter if counter else 0
Code example #5
File: news.py  Project: KCStinger/Covid-19_status
def get_news():
    dt_today = str(datetime.today().strftime('%m/%d/%Y'))
    dt_previous = datetime.today() - timedelta(days=5)
    dt_previous = str(dt_previous.strftime('%m/%d/%Y'))

    googlenews = GoogleNews(start=dt_previous, end=dt_today)
    googlenews.search('Coronavirus')
    googlenews.getpage(1)
    googlenews.getpage(2)
    # result() accumulates, so one call after both pages avoids the
    # duplicated page-1 entries the original list concatenation produced
    result = googlenews.result()
    news_list = list()
    for i in result:
        if i['desc'] != '':
            dic = dict()
            dic['title'] = i['title']
            dic['source'] = i['media']
            dic['date&time'] = i['date']
            dic['desc'] = i['desc']
            dic['link'] = i['link']
            news_list.append(dic)
    return news_list
Code example #6
File: testSearch.py  Project: wastu01/GoogleNews
    def testResultNumberWithTwoPages(self):
        googlenews = GoogleNews()
        googlenews.search(keyword)
        googlenews.getpage(2)
        # result() accumulates pages 1 and 2, 10 items each
        length = len(googlenews.result())
        self.assertEqual(length, 20)
        print('Result length with two pages is correct')
Code example #7
    def fetch_articles(self):

        # how many pages to scrape
        N_pages = 1
        results = []
        # how many days from last update
        # TODO: look for the last update datetime in the DB
        days_from_last_update = (datetime.datetime.today() -
                                 self.history_start).days
        # for each day between start date and today:
        for day in range(0, days_from_last_update + 1):
            download_date = self.history_start + datetime.timedelta(days=day)
            googlenews = GoogleNews(start=download_date.strftime("%m/%d/%Y"),
                                    end=download_date.strftime("%m/%d/%Y"))
            googlenews.search(self.ticker)
            # iterate N_pages of Google News; result() accumulates pages,
            # so collect once after the loop to avoid duplicates
            for i in range(0, N_pages):
                googlenews.getpage(i)
            results += googlenews.result()

        # deduplicate by link
        links = list(set([x['link'] for x in results]))

        # for each link (without dups) get the article and its metadata
        articles = []
        for link in links:
            try:
                downloaded = self.download_and_parse_article(link)
                articles.append(downloaded)
            except Exception as e:
                print(e)

        return articles
Code example #8
    def googleNewsCrawler(self):
        result_list = []
        googlenews = GoogleNews()

        for i in range(self.__numDays):
            startDateTime = self.__dateTime + timedelta(days=i)
            endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)

            googlenews.setTimeRange(
                start=startDateTime.strftime('%m/%d/%Y'),
                end=endDateTime.strftime('%m/%d/%Y'))
            googlenews.search(self.__keyWords)
            # page 1 is fetched by search(); fetch the remaining pages
            for j in range(self.__pagsEveryDay - 1):
                googlenews.getpage(j + 2)
            logging.info('%s__%s append %d items', self.__keyWords,
                         startDateTime.date(), int(self.__pagsEveryDay * 10))
            result_list = result_list + googlenews.result()
            googlenews.clear()

            # flush to disk every 10 days of results
            if (i + 1) % 10 == 0:
                self.toJson(result_list)
                result_list = []
        self.toJson(result_list)
Code example #9
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)
    # clear() drops the page-1 results fetched by search(),
    # so the return value contains page 2 only
    googlenews.clear()
    googlenews.getpage(2)
    result = googlenews.result()

    return result
Code example #10
File: googlenews.py  Project: Siddharthbadal/Python
def todaysNews(topic):
    # 'en' = language, 'd' = results from the past day
    googlenews = GoogleNews('en', 'd')

    googlenews.search(topic)
    googlenews.getpage(1)

    googlenews.result()
    g = googlenews.gettext()
    return g
Code example #11
File: news.py  Project: Anwesha-dash811/hacktober-1
def news():
    topic = entry.get()
    # 'en' = language, 'd' = results from the past day
    googlenews = GoogleNews('en', 'd')
    googlenews.search(topic)
    googlenews.getpage(1)
    googlenews.result()
    a = googlenews.gettext()
    output.insert(END, a)
    # Dispatch interacts with the Microsoft Speech SDK to read the text aloud
    speak = Dispatch("SAPI.SpVoice")
    speak.Speak(a)
Code example #12
def get_stock(ticker, company, method='seeking alpha'):
    sentiments = {}
    ticker_data = yf.Ticker(ticker)
    data = ticker_data.history(start='2020-1-30', end='2020-11-17')
    data = data.drop(['Dividends', 'Stock Splits'], axis=1)
    data = data.assign(Sentiment=0)
    if (method == 'seeking alpha'):
        soup = BeautifulSoup(open('html/{}.txt'.format(ticker)), 'html.parser')
        articles = soup.find_all('article')
        for article in articles:
            article_title = article.find_all('a')[1].text
            spans = article.find_all('span')
            if (len(spans) == 1):
                article_date = spans[0].text
            else:
                article_date = spans[1].text
            article_date = article_date.split(', ')[1].replace('.',
                                                               '') + ' 2020'
            article_date = pd.to_datetime(article_date, format='%b %d %Y')
            headline_sentiment = analyzer.polarity_scores(
                article_title)['compound']
            sentiments.setdefault(article_date, []).append(headline_sentiment)
    elif (method == 'gn'):
        googlenews = GoogleNews(start='01/30/2020', end='11/17/2020')
        googlenews.search(company)
        for i in range(2, 6):
            googlenews.getpage(i)
        results = googlenews.result()
        for result in results:
            headline_sentiment = analyzer.polarity_scores(
                result['title'])['compound']
            try:
                article_date = pd.to_datetime(result['date'],
                                              format='%b %d, %Y')
            except (ValueError, TypeError):
                # skip results whose date string does not match '%b %d, %Y'
                continue
            sentiments.setdefault(article_date, []).append(headline_sentiment)
    data['Prediction'] = data[['Close']].shift(-1)
    data = data[:-1]
    for s in sentiments:
        average_sentiment = np.average(sentiments[s])
        if (s in data.index):
            data.loc[s, 'Sentiment'] = average_sentiment
    data.to_csv('data/{}.csv'.format(ticker))
Code example #13
def search_google_news(query, google_date):
    #-- Retrieve news articles
    # Init googlenews
    googlenews = GoogleNews()
    #googlenews.set_period('7d') # Cannot use set_period with set_time_range, use either or.
    #googlenews.set_time_range(str(google_date), '2020-10-12')
    googlenews.set_encode('utf-8')
    googlenews.search(query)
    # NOTE: getpage(50) fetches page 50 only, not the first 50 pages;
    # loop over getpage(i) to collect a range of pages
    googlenews.getpage(50)
    result = googlenews.result()
    # Clear before searching again
    googlenews.clear()

    return result
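The commented-out lines above note that set_period and set_time_range are mutually exclusive. A minimal sketch of the period-based variant, assuming the same GoogleNews package (the '7d' value follows the commented example; the function name is hypothetical):

def search_google_news_recent(query, period='7d'):
    # period-based alternative: use either set_period or set_time_range, not both
    googlenews = GoogleNews()
    googlenews.set_period(period)
    googlenews.set_encode('utf-8')
    googlenews.search(query)
    result = googlenews.result()
    googlenews.clear()
    return result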
Code example #14
def getNews(text):
    # parameters: language and timeframe ('d' = day, 'm' = month, 'y' = year)
    googlenews = GoogleNews("en", "m")
    googlenews.search(text)
    # first result page
    googlenews.getpage(1)
    # only the headlines (links and images are also available)
    headlines = googlenews.gettext()

    # WordCloud expects a single string, so join the headline list
    text = ' '.join([str(elem) for elem in headlines])
    generateWordCloud(text)
    googlenews.clear()
    return
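generateWordCloud is not defined in this snippet. A minimal sketch of what it might look like, assuming the wordcloud and matplotlib packages; this helper is hypothetical, not part of the original project:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def generateWordCloud(text):
    # render the joined headlines as a word cloud image
    wc = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()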
Code example #15
def news():
    speak("What kind of news would you like to hear ?")
    topic = takeCommand()
    googleNews = GoogleNews(lang='en')
    googleNews.search(topic)  # search for the requested kind of news
    googleNews.getpage(1)  # page number of news
    googleNews.result()
    headlines = googleNews.gettext()
    if len(headlines) > 0:
        speak(random.choice(headlines))
    else:
        speak("No news related to this topic.")
Code example #16
def search():
    global state, config
    if config is None:
        raise Exception('Call initiateConfig first')
    if state is None:
        state = {}
    state['url'] = {}
    googlenews = GoogleNews('en', 'd')
    for city in config['cities']:
        googlenews.search('covid in ' + city)
        state['url'][city] = []
        for i in range(config['pagesPerCity']):
            googlenews.getpage(i)
        # links accumulate across pages, so collect once per city, then
        # clear so the next city's search starts fresh
        state['url'][city].extend(googlenews.get_links())
        googlenews.clear()
Code example #17
    def news(topic: str,
             start_date: str = None,
             end_date: str = None,
             **kwargs):
        page_num = int(kwargs.get('Page', '0'))
        article_num = int(kwargs.get('Article', '0'))
        if page_num == 0 and article_num == 0:
            try:
                NewsHistory.objects.latest('search_time').delete()
            except Exception as e:
                print("No news history for this user", repr(e))
            googlenews = GoogleNews()
            googlenews.search(topic)
            googlenews.getpage(1)
            articles = googlenews.result()
            articles = [
                article for article in articles if len(article['title']) > 10
            ]
            db_entry = NewsHistory(user_id=1,
                                   search_topic=topic,
                                   last_fetched_count=0,
                                   news_articles=str(articles))
            articles = articles[0:3]
            db_entry.save()
        else:
            news_list = NewsHistory.objects.latest('search_time')
            news_items = ast.literal_eval(news_list.news_articles)
            if page_num != 0:
                article_start_num = page_num * 3
                articles = news_items[article_start_num:article_start_num + 3]
            elif article_num != 0:
                article = news_items[article_num - 1]
                article_link = '<a href="{}" target="_blank">Read full article</a>'.format(
                    article['link'])
                article = "<br>" + "<br>".join(
                    [article['title'], article['desc'], article_link])
                return {'response': article}

        article_text = []
        for i, article in enumerate(articles):
            serial_number = str(i + 1 + page_num * 3)
            article_summary = (serial_number,
                               f"{article['date']}, {article['media']}",
                               article['title'])
            article_text.append(article_summary)
        all_articles = "<br>".join([", ".join(i) for i in article_text])
        return {'response': all_articles, 'followup': True}
Code example #18
def search(keyword=None, datestart=None, dateend=None, pgs=1):
    # Global variables
    global noticias
    global cont
    global acabou

    # Search parameters
    print('Keyword: ', keyword)

    # Search configuration
    googlenews = GoogleNews(start=datestart, end=dateend)
    googlenews.setlang('pt')
    googlenews.search(keyword)
    result = googlenews.result()

    # Load the results into a DataFrame
    df = pd.DataFrame(result)

    # Print the first 5 news items
    print(df.head())

    # Fetch the requested range of result pages
    for i in range(0, pgs):
        googlenews.getpage(i)
        result = googlenews.result()
        df = pd.DataFrame(result)

    # Convert the DataFrame above into a list of dictionaries
    for ind in df.index:
        print('News item number: {}'.format(ind))
        row = {}
        article = Article(df['link'][ind], config=config)
        article.download()
        try:
            article.parse()
            article.nlp()
            row['Date'] = df['date'][ind]
            row['Media'] = df['media'][ind]
            row['Title'] = article.title
            row['Article'] = article.text
            row['Summary'] = article.summary
            row['Created'] = False
            noticias.append(row)
        except Exception:
            print('Error')
Code example #19
def crawl(coin):
    idx = search_keyword.index(coin)
    page = news_pages[idx]
    news = GoogleNews(lang='ko', encode='utf-8')
    news.search(coin)
    time.sleep(30)
    news.getpage(page)
    title = news.get_texts()
    url = news.get_links()
    desc = news.get_desc()
    for t, u, d in zip(title, url, desc):
        if t != "" and u != "" and d != "":
            dic = {
                u"title": u'{}'.format(t),
                u"desc": u'{}'.format(d),
                u"link": u'{}'.format(u)
            }
            # only keep items whose title mentions the coin (Korean or English name)
            if coin_list[idx] in t or coin_eng[idx] in t:
                collection = db.collection(u'{}'.format(coin_eng[idx]))
                if coin_index[idx] == 0:
                    # first item for this coin: no duplicates possible yet
                    collection.add(dic)
                    time.sleep(random.uniform(2, 4))
                    coin_index[idx] += 1
                else:
                    # skip items whose title already exists in the collection
                    is_new = True
                    for doc in collection.stream():
                        time.sleep(random.uniform(1, 3))
                        if dic['title'] == doc.to_dict()['title']:
                            is_new = False
                            break
                    if is_new:
                        print('[{}] ///// {} '.format(coin, dic))
                        collection.add(dic)
                        time.sleep(random.uniform(1, 5))
                        coin_index[idx] += 1
    news_pages[idx] += 1
Code example #20
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)
    # clear() drops the page-1 results fetched by search(), so result()
    # below contains page 2 only
    googlenews.clear()
    googlenews.getpage(2)
    result = googlenews.result()
    # take title, link and domain of the first result
    first = result[0]
    title = first['title']
    link = first['link']
    domain = get_domain(link)

    return title, link, domain
Code example #21
class Engine:
    def __init__(self):
        self.news = GoogleNews()
        self.news.setlang('en')
        #self.news.setTimeRange('01/01/2000','01/01/2015')
        self.news.setencode('utf-8')
        self.pageNumber = 1
        self.searchTerm = ""

    def nextPage(self):
        # comparing the bound method `self.news.result` to None is always
        # False, so track whether a search has happened via searchTerm
        if self.searchTerm == "":
            raise RuntimeError("Engine has not searched yet")
        self.pageNumber += 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def previousPage(self):
        if self.searchTerm == "":
            raise RuntimeError("Engine has not searched yet")
        if self.pageNumber <= 1:
            return False
        self.pageNumber -= 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def search(self, term):
        self.searchTerm = term
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        else:
            return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
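A brief usage sketch for the Engine class above; the search term is hypothetical:

engine = Engine()
results = engine.search("bitcoin")  # returns False when nothing is found
if results:
    print(engine.getPageNumber(), len(results))
    if engine.nextPage():  # True while the next page still returns results
        print(len(engine.getResults()))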
Code example #22
    def get_links(self, pages=1):
        """Obtain all relevant links from the search, for each company.

        Args:
            pages :: int
                number of Google pages to fetch results from

        Stores:
            links :: dict(list[dict])
                dictionary of lists, keys being search terms
                and values being relevant information (e.g. URL)
        """

        gnews = GoogleNews(start=self.date_from, end=self.date_to)
        links = {}

        # obtaining all the URLs
        for s in self.search_terms:
            gnews.search(s)
            for p in range(1, pages + 1):
                gnews.getpage(p)
            links[s] = gnews.result()  # result() accumulates until cleared
            gnews.clear()

        # removing irrelevant links
        for s in self.search_terms:
            tmp = []
            num = dd[s]  # number of relevant terms per search term (dd defined elsewhere)
            rel_str = ' '.join(s.lower().split()[:num])  # relevant string

            for d in links[s]:
                # selection criterion, e.g. if the search term is 'apple news',
                # subset on 'apple' rather than 'apple news'
                # --> filter with the first word(s) of each search term
                if rel_str in d['desc'].lower():
                    tmp.append(d)
            links[s] = tmp

        self.search_info = links

        return None
Code example #23
    def news(topic: str, start_date: str = None, end_date: str = None):
        help_text = "news: use this to fetch news<br><br>"\
            "Usage: news topic<br>"\
            "options:<br>"\
            "--help: get help (this screen)<br><br>"\
            "Followup: After fetching a set of news articles, enter<br>"\
            "n: fetch the next set of articles<br>"\
            "number: fetch the details of the article"

        googlenews = GoogleNews()
        page_num = 1
        detail = None
        if start_date is not None and end_date is not None:
            googlenews.setTimeRange(start_date, end_date)
        if topic.split()[0] == '--help':
            return {'response': help_text}
        if topic.count('~') > 0:
            followup = topic.split('~')[1]
            if followup.split()[0] == 'n':
                page_num = int(followup.split()[1]) + 1
                print(f"Page number: {page_num}")
            elif followup.split()[0].isnumeric():
                detail = int(followup.split()[0])
            topic = topic.split('~')[0]
        googlenews.search(topic)
        googlenews.getpage(1)
        news_results = googlenews.result()
        if detail is not None:
            # `detail` is the 1-based serial number shown to the user
            news_details = news_results[detail - 1]
            print(news_details)
            details = f'{news_details["title"]}<br>{news_details["desc"]}<br>'\
                f'<a href="{news_details["link"]}" target="_blank">Read full article</a>'
            return {'response': details}
        articles = []
        start_num = (page_num - 1) * 3
        end_num = page_num * 3
        for i, article in enumerate(news_results[start_num:end_num]):
            serial_number = str(i + 1 + (page_num - 1) * 3)
            article_summary = (serial_number,
                               f"{article['date']}, {article['media']}",
                               article['title'])
            articles.append(article_summary)
        all_articles = "<br>".join([", ".join(i) for i in articles])
        return {'response': all_articles, 'followup': True}
Code example #24
def extract_links(dir_c, dir_k, lang):
    for t in topics:
        print('Current topic: ', t + '\n')

        kw = get_keywords(dir_k, t)
        print('Keywords: ', kw + '\n')

        f_clean = open(dir_c + t + '.txt', 'r')
        fp = f_clean.readlines()
        f_clean.close()
        min_d, max_d, num_d = get_date_range(fp)
        print('Date range: ', min_d, max_d + '\n')

        f_out = open(lang + '/links/' + t + '_links.txt', 'w')

        key_enc = quote(kw.encode('utf8'))
        googlenews = GoogleNews()
        googlenews.setlang(lang)
        googlenews.setTimeRange(min_d, max_d)
        googlenews.search(key_enc)
        result = googlenews.result()

        page = 1
        num_art = len(result)
        curr_art = num_art

        # keep paging until we have ~10 articles per day in the range,
        # or until a new page adds nothing
        while curr_art < 10 * num_d:
            page += 1
            googlenews.getpage(page)
            result = googlenews.result()  # accumulates across pages
            num_art = len(result)
            if curr_art < num_art:
                curr_art = num_art
            else:
                break

        for i in range(curr_art):
            date = str(dateparser.parse(result[i]['date']).date())
            link = result[i]['link']
            f_out.write(date + '\n' + link)
            f_out.write('\n--------------------------------\n')

        print('--------------------------------\n')
        f_out.close()
Code example #25
File: get_corpus.py  Project: b04901140/IR_final
def get_corpus_in_time_interval(start_time, end_time, args):
    query = args.query
    page_count = args.pages

    gn = GoogleNews(start=start_time, end=end_time)
    corpus = list()

    gn.search(query)
    for i in range(1, page_count + 1):
        gn.clear()  # keep only the current page in result()
        gn.getpage(i)
        all_rel_news = gn.result()
        for raw_news in all_rel_news:
            news = News(raw_news)
            if i == 1:
                news.set_relv()
            if news.mainText != 'fail':
                corpus.append(news)

    return corpus
Code example #26
File: getNews.py  Project: b04901140/IR_final
def getNews(topic, start_time, end_time):
    googlenews = GoogleNews(start=start_time, end=end_time)
    titles = []
    texts = []
    labels = []
    for i in range(1, 2):  # fetches page 1 only; widen the range for more pages
        googlenews.clear()
        googlenews.search(topic)
        googlenews.getpage(i)
        tmp = googlenews.result()

        #result  += [x["title"]+x["desc"] for x in tmp]
        (tmp_title, tmp_text) = get_content(tmp)
        titles += tmp_title
        texts += tmp_text
        if i == 1:
            labels += [1 for _ in range(len(tmp_text))]
        else:
            labels += [0 for _ in range(len(tmp_text))]
    #labels = np.array(labels)
    return (titles, texts, labels)
Code example #27
def googlenews_extract(date_range, num_pages, search_text):

    ''' Use googlenews package to extract stories from top {num_pages} pages per day based on {search_text} '''

    df_days = []

    #TODO: add multi-threading

    for date in date_range:

        googlenews = GoogleNews(start=date, end=date)
        googlenews.search(search_text)
        print("Search Date = ", date)

        for i in range(0, num_pages):

            print('Executing GoogleNews call #', i + 1)

            googlenews.getpage(i)
            result_next = googlenews.result()
            print("Total records returned: ", len(result_next))

        # result() accumulates across pages, so build the day's frame once
        df = pd.DataFrame(googlenews.result())
        df['date_calendar'] = date
        df_days.append(df)

    appended_data = pd.concat(df_days)

    # Drop duplicate titles
    appended_data = appended_data.drop_duplicates(subset=['title'])

    # Append to master news df
    df_news = appended_data.reset_index(drop=True).drop(['date'], axis=1)

    return df_news
Code example #28
    def search_news(self, max_page=10):
        news_list = list()
        # iterate over keywords
        for keyword in self.keywords_dict[self.ticker]:
            # GoogleNews accepts a different date format from yfinance
            googlenews_client = GoogleNews(start=self.gnews_date_fmt(self.start_time),
                                           end=self.gnews_date_fmt(self.end_time))
            googlenews_client.search(keyword)
            for i in range(1, max_page + 1):  # pages 1 .. max_page inclusive
                googlenews_client.getpage(i)
            news_list = news_list + googlenews_client.result()
        # convert to pandas dataframe and remove duplicates
        temp_df = pd.DataFrame(news_list)
        temp_df.drop_duplicates(subset='link', inplace=True, keep='first')
        # get text from links
        content_list = list()
        for ind in temp_df.index:
            article_link = temp_df['link'][ind]
            if any(link_filter in article_link
                   for link_filter in self.link_filter_list):
                continue
            try:
                article = Article(article_link, config=self.config)
                article.download()
                article.parse()
                record_dict = {'Date': temp_df['date'][ind],
                               'Media': temp_df['media'][ind],
                               'Title': article.title,
                               'Article': article.text,
                               'Link': article_link}
                content_list.append(record_dict)
            except Exception:
                print('Can\'t fetch article: {:s}'.format(
                    temp_df['link'][ind]))
        self.news_df = pd.DataFrame(content_list)
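The helper gnews_date_fmt is referenced above but not shown. A plausible standalone sketch, assuming start_time/end_time are datetime objects or 'YYYY-MM-DD' strings (the format yfinance uses); in the class above it would be a method, so this version is hypothetical:

from datetime import datetime

def gnews_date_fmt(date_value):
    # hypothetical helper: convert a datetime or 'YYYY-MM-DD' string
    # into the MM/DD/YYYY string GoogleNews expects
    if isinstance(date_value, str):
        date_value = datetime.strptime(date_value, '%Y-%m-%d')
    return date_value.strftime('%m/%d/%Y')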
Code example #29

model = SentimentClassifier(len(class_names))
model.load_state_dict(
    torch.load('model/best_model_state.bin', map_location='cpu'))
model = model.to(device)

# review_text = input('Enter the review you want to check:\n')

## Google News start

# the original fragment used `googlenews` without creating it; instantiate here
googlenews = GoogleNews()
news_content = []
searchInput = input('Enter the search keyword:\n')
googlenews.search(searchInput)
for page in range(1, 1 + 1):  # a single page; raise the bound to fetch more
    googlenews.getpage(page)
    for item in googlenews.result():
        news_content.append(item['desc'])
    googlenews.clear()

## End
for i in news_content:
    encoded_review = tokenizer.encode_plus(
        i,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
Code example #30
File: gnews.py  Project: SecexSaudeTCU/noticias_ner
def __extrai_noticias_gnews(q,
                            dia_inicio,
                            dia_fim,
                            num_limite_paginas=1,
                            lang='pt-BR',
                            sleep=1,
                            tentativas=5):
    """
    Returns a data frame with the news items obtained from the Google News tab.

    Parameters
    ----------
        q : str
            Search string

        dia_inicio, dia_fim : datetime.date
            Start and end dates for the search

        num_limite_paginas : int
            Maximum number of pages to fetch.

        lang : str
            Language code for the search (default pt-BR)

        sleep : int
            Number of seconds to wait between attempts after each page-fetch error

        tentativas : int
            Number of attempts at fetching a page before the extraction is considered complete

    Returns
    -------
        resultados : DataFrame
            Dataframe with the search results
    """

    # Search string properly formatted for a URL
    # q = urllib.parse.quote(q)

    # Date strings in the format expected by the GoogleNews lib
    formato_data = '%m/%d/%Y'
    dia_inicio_formatado = dia_inicio.strftime(formato_data)
    dia_fim_formatado = dia_fim.strftime(formato_data)

    # Instantiate the Google News search interface with the language and period
    gn = GoogleNews(lang=lang,
                    start=dia_inicio_formatado,
                    end=dia_fim_formatado)

    # Initialize the list that accumulates search results
    resultados = []

    # Fetch the first page
    logger = logging.getLogger('covidata')
    logger.info('Fetching page 1')
    gn.search(q)
    resultados = resultados + gn.result()
    gn.clear()

    # From page 2 onwards
    for i in range(2, num_limite_paginas + 1):

        logger.info(f'Fetching page {i}')

        # Fetch the page
        gn.getpage(i)

        # Add the results to the list
        resultados = resultados + gn.result()

        # If the query for this page returned no results
        if gn.result() == []:
            logger.info(f'The query for page {i} returned no results')

            # Decrease the attempt counter
            tentativas = tentativas - 1
            logger.info(f'*** {tentativas} attempts remaining ***')

            # If the number of attempts has reached zero, stop
            if tentativas < 1:
                break

            # Otherwise
            else:
                # Pause the script for `sleep` seconds before fetching the next page
                logger.info(f'Execution paused for {sleep} seconds')
                time.sleep(sleep)

        # Clear the cached results
        gn.clear()

    # Build and return the dataframe
    return pd.DataFrame(resultados)
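A minimal usage sketch for the function above; the query and dates are hypothetical, and the call is shown as it would appear inside gnews.py itself (the name is module-internal):

from datetime import date

df = __extrai_noticias_gnews('covid hospital',
                             dia_inicio=date(2020, 5, 1),
                             dia_fim=date(2020, 5, 7),
                             num_limite_paginas=3)
print(df.head())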