コード例 #1
0
    def googleNewsCrawler(self):
        """Crawl Google News day by day over the configured window.

        For each of ``self.__numDays`` start offsets, searches
        ``self.__keyWords`` over a ``self.__daysSpan``-day range, pulls
        ``self.__pagsEveryDay`` result pages, and flushes accumulated
        results to JSON every 10 days and once at the end.
        """
        result_list = []
        googlenews = GoogleNews()

        for day_offset in range(self.__numDays):
            start_dt = self.__dateTime + timedelta(days=day_offset)
            end_dt = self.__dateTime + timedelta(days=day_offset + self.__daysSpan)

            # GoogleNews expects U.S.-style M/D/YYYY date strings.
            googlenews.setTimeRange(
                start=f'{start_dt.month}/{start_dt.day}/{start_dt.year}',
                end=f'{end_dt.month}/{end_dt.day}/{end_dt.year}')
            googlenews.search(self.__keyWords)
            # search() already fetched page 1; fetch the remaining pages.
            for page in range(2, self.__pagsEveryDay + 1):
                googlenews.getpage(page)
            # Lazy %-args so the message is only built when INFO is enabled.
            logging.info('%s__%s append %d items', self.__keyWords,
                         start_dt.date(), int(self.__pagsEveryDay * 10))
            result_list.extend(googlenews.result())
            googlenews.clear()

            # Flush to disk every 10 days to bound memory use.  (The
            # original also had a redundant `continue` here — a no-op as
            # the last statement of the loop body.)
            if (day_offset + 1) % 10 == 0:
                self.toJson(result_list)
                result_list = []
        self.toJson(result_list)
コード例 #2
0
ファイル: autonews_api.py プロジェクト: riyakwl28/auto_news
def extract_google(query_terms, startDate, endDate):
    """Crawl Google News for each query term and return article dicts.

    Parameters
    ----------
    query_terms : iterable of str
        Terms appended to the fixed ``"India Technology "`` search prefix.
    startDate, endDate : str
        ISO dates (``YYYY-MM-DD``); empty strings default to the trailing
        seven-day window.

    Returns
    -------
    list of dict
        Each dict has ``source``, ``url``, ``date``, ``title``,
        ``content`` and ``img`` keys.
    """
    # Default to the trailing week.  Defaults are emitted in the same ISO
    # format the strptime() calls below expect — the original produced
    # %d/%m/%Y defaults, which then crashed in strptime('%Y-%m-%d'), and
    # had the window inverted (start=today, end=a week ago).
    today = datetime.datetime.today().date()
    if len(startDate) == 0:
        startDate = (today - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = today.strftime('%Y-%m-%d')
    # GoogleNews.setTimeRange() is fed DD/MM/YY strings here.
    startDate = datetime.datetime.strptime(startDate,
                                           '%Y-%m-%d').strftime('%d/%m/%y')
    endDate = datetime.datetime.strptime(endDate,
                                         '%Y-%m-%d').strftime('%d/%m/%y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # here extracting news from google news
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()

        # forming the search term
        googlenews.search("India Technology " + query)

        for item in googlenews.result():
            source = item['media']
            url = item['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception:
                # Some links are paywalled/broken; skip rather than abort.
                print("Trouble downloading so skipping")
                continue
            content = article.text

            # Two-sentence summary after stripping a leading "(City)" /
            # "[Agency]" tag.
            # NOTE(review): `summary` is computed but never returned or
            # stored — dead unless callers are expected to add it later.
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = " ".join(sentences[:2]).strip()

            date = item['date']
            # Relative dates ("2 hours ago") are normalised to today.
            # (The original referenced an undefined name `current` here.)
            if 'ago' in date:
                date = today
            title = item['title']
            img = item['img']
            # Collect the extracted info.
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'img': img
            })
    return final_articles
コード例 #3
0
def initalize_google_news(start_date, end_date):
    """Build and return a GoogleNews client for [start_date, end_date].

    The client is configured for English, UTF-8, a daily period and the
    caller-supplied time range.
    """

    print("initalize_google_news...")

    news_client = GoogleNews(encode="utf-8")
    news_client.setlang("en")
    news_client.setperiod("d")
    news_client.setencode("utf-8")
    news_client.setTimeRange(start_date, end_date)

    return news_client
コード例 #4
0
class GoogleNewsMethods():
    """Thin wrapper around a shared GoogleNews client for stock lookups."""

    def __init__(self):
        # One shared client; English-language results.
        self.googlenews = GoogleNews(lang="en")

    def newscollection(self, stock, date):
        """Return a list of Google News results for *stock* on *date*.

        *date* is a GoogleNews-style date string (e.g. ``'05/15/2020'``).
        The constraints are applied BEFORE search() — the original set
        them after searching, so they never affected the query — and the
        previously ignored *date* argument now drives setperiod().
        """
        self.googlenews.setTimeRange('05/01/2020', '05/28/2020')
        self.googlenews.setperiod(date)
        self.googlenews.search(stock)
        self.newsList = self.googlenews.result()
        return (self.newsList)
コード例 #5
0
    def news(topic: str, start_date: str = None, end_date: str = None):
        """Fetch Google News headlines for *topic* and format them as HTML.

        Followups are embedded in *topic* after a ``~``:
          * ``n <page>`` -> show the next page (3 articles per page)
          * ``<number>`` -> show full details of that article
        Returns a dict with a ``'response'`` string (plus a ``'followup'``
        flag for list responses).
        """
        help_text = "news: use this to fetch news<br><br>"\
            "Usage: news topic<br>"\
            "options:<br>"\
            "--help: get help (this screen)<br><br>"\
            "Followup: After fetching a set of news articles, enter<br>"\
            "n: fetch the next set of articles<br>"\
            "number: fetch the details of the article"

        googlenews = GoogleNews()
        page_num = 1
        detail = None
        if start_date is not None and end_date is not None:
            googlenews.setTimeRange(start_date, end_date)
        if topic.split()[0] == '--help':
            return {'response': help_text}
        if topic.count('~') > 0:
            followup = topic.split('~')[1]
            if followup.split()[0] == 'n':
                page_num = int(followup.split()[1]) + 1
                print(f"Page number: {page_num}")
            elif followup.split()[0].isnumeric():
                detail = int(followup.split()[0])
            topic = topic.split('~')[0]
        googlenews.search(topic)
        googlenews.getpage(1)
        news_results = googlenews.result()
        if detail is not None:
            # Serial numbers shown to the user are 1-based positions in
            # news_results, so serial s lives at index s - 1.  (The
            # original indexed detail + 1, an off-by-two.)
            news_details = news_results[detail - 1]
            print(news_details)
            # Both halves must be f-strings; the second previously lacked
            # the f prefix, so the literal "{news_details[...]}" text was
            # emitted instead of the article link.
            details = f'{news_details["title"]}<br>{news_details["desc"]}<br>'\
                f'<a href="{news_details["link"]}" target="_blank">Read full article</a>'
            return {'response': details}
        articles = []
        # Each chat page shows 3 articles, numbered globally.
        start_num = (page_num - 1) * 3
        end_num = page_num * 3
        for i, article in enumerate(news_results[start_num:end_num]):
            serial_number = str(i + 1 + (page_num - 1) * 3)
            article_summary = (serial_number,
                               f"{article['date']}, {article['media']}",
                               article['title'])
            articles.append(article_summary)
        all_articles = "<br>".join([", ".join(i) for i in articles])
        return {'response': all_articles, 'followup': True}
コード例 #6
0
def extract_links(dir_c, dir_k, lang):
    """For every topic, search Google News over the topic's date range and
    write dated result links to ``<lang>/links/<topic>_links.txt``.

    Files are now opened with ``with`` blocks — the original leaked both
    handles (and never closed ``f_clean`` at all) on any exception.
    """
    for topic in topics:
        print('Current topic: ', topic + '\n')

        keywords = get_keywords(dir_k, topic)
        print('Keywords: ', keywords + '\n')

        # The date range comes from the topic's cleaned corpus file.
        with open(dir_c + topic + '.txt', 'r') as f_clean:
            corpus_lines = f_clean.readlines()
        min_d, max_d, num_d = get_date_range(corpus_lines)
        print('Date range: ', min_d, max_d + '\n')

        encoded_kw = quote(keywords.encode('utf8'))
        googlenews = GoogleNews()
        googlenews.setlang(lang)
        googlenews.setTimeRange(min_d, max_d)
        googlenews.search(encoded_kw)
        result = googlenews.result()

        # Page through until we have ~10 articles per day in the range,
        # or a page adds nothing new.
        page = 1
        curr_art = len(result)
        while curr_art < 10 * num_d:
            page += 1
            googlenews.getpage(page)
            result = googlenews.result()
            if curr_art < len(result):
                curr_art = len(result)
            else:
                break

        with open(lang + '/links/' + topic + '_links.txt', 'w') as f_out:
            for i in range(curr_art):
                date = str(dateparser.parse(result[i]['date']).date())
                f_out.write(date + '\n' + result[i]['link'])
                f_out.write('\n--------------------------------\n')

        print('--------------------------------\n')
コード例 #7
0
        # NOTE(review): the enclosing loop/function header is outside this
        # chunk; `choice` is presumably an int read and validated above.
        if 0 < choice < 5:
            break
        else:
            print("That is not between 1 and 4! Try again:")
    # NOTE(review): the "{}" placeholder is never filled in, so this prints
    # the literal braces; should be .format(choice) or an f-string.
    print ("You entered: {} ") # Good to use format instead of string formatting with %
# NOTE(review): options 3 and 4 both dispatch to import_from_camcorder —
# option 4 probably meant a different handler.
mydict = {1:go_to_stackoverflow, 2:import_from_phone, 3:import_from_camcorder, 4:import_from_camcorder}
mydict[choice]()
print(askUser())

# Interactive Google News search: term plus a start/end date range.
# NOTE(review): the prompts say "MM-DD-YYY" (typo) and the raw strings are
# passed to setTimeRange() unvalidated — confirm the expected format.
s_req = input("Enter the term you would like to search")
st_date = input("Please enter your desired start date (MM-DD-YYY): ")
en_date = input("Please enter your desired end date (MM-DD-YYY): ")

googlenews = GoogleNews()
googlenews.setlang('en')
googlenews.setTimeRange(st_date,en_date)
googlenews.search(s_req)
# NOTE(review): result() is called but its return value is discarded.
googlenews.result()




# Least-squares regression inputs: log daily returns of adjusted closes.
all_adj_close= all_data[['Adj Close']]
all_returns = np.log(all_adj_close / all_adj_close.shift(1))

#isolate the returns you want to value for the OLS
print("As a reminder, you have selected the following: " + input_string)
sample_stocks = input("Please choose 2 of the stocks you have chosen to calculate a OLS regression: ")

reg_choices = sample_stocks.split(",")
コード例 #8
0
    return dataset



if __name__ == '__main__':
    # Demo driver: search Google News for 'APPL' over Feb 2020 and scrape
    # each result page's <article> text.
    import time
    import requests
    from bs4 import BeautifulSoup
    from GoogleNews import GoogleNews

    # Demonstrates the various constructor forms; only the last instance
    # (date-ranged) is actually used below.
    googlenews = GoogleNews()
    googlenews = GoogleNews(lang='en')
    googlenews = GoogleNews(period='d')
    googlenews = GoogleNews(start='02/01/2020', end='02/28/2020')
    googlenews.setlang('en')
    googlenews.setperiod('d')
    googlenews.setTimeRange('02/01/2020', '02/28/2020')
    googlenews.search('APPL')
    googlenews.getpage(2)
    for item in googlenews.result():
        web_link = item['link']

        start = time.time()
        page_source = requests.get(web_link)
        soup = BeautifulSoup(page_source.text, "lxml")
        print('s: ', time.time() - start)
        # soup.find('article') returns None when the page has no <article>
        # tag; test for that explicitly instead of the original bare
        # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        article_tag = soup.find('article')
        if article_tag is None:
            continue
        text = article_tag.text
        #print(text)
コード例 #9
0
ファイル: webapp.py プロジェクト: manrj/twitter-streamlit
# Sidebar-driven Google News + Twitter search for the Streamlit app.
searchInput = st.sidebar.text_input('search query')
val = len(searchInput)
if val > 0:
    agree = st.sidebar.checkbox('frequency')
    if agree:
        # Relative-period search (last hour/day/week/year).
        option = st.sidebar.selectbox('How would you like to be contacted?',
                                      ('1h', '1d', '7d', '1y'))
        googlenews.setperiod(option)
    else:
        # Explicit date range; reject an inverted range.
        st.sidebar.markdown('Select the time range for the search')
        dt1 = st.sidebar.date_input('from date', datetime.date.today())
        dt2 = st.sidebar.date_input('till date', datetime.date.today())
        if dt1 > dt2:
            st.sidebar.error('SELECT A VALID "FROM" DATE')
        else:
            googlenews.setTimeRange(dt1, dt2)
    with st.spinner('Getting data...'):
        googlenews.search(searchInput)
        news_content = []

        # Google News: collect the description of every result on the
        # requested page(s), then reset the client.
        for page in range(1, 1 + 1):
            googlenews.getpage(page)
            news_content.extend(entry['desc'] for entry in googlenews.result())
            googlenews.clear()

        # Twitter handle: build the recent-tweets query (no RTs/replies).
        q = '%40' + '#' + searchInput + ' -filter:retweets -filter:replies'
        # count : no of tweets to be retrieved per one call and parameters according to twitter API
        params = {'q': q, 'count': 1000, 'lang': 'en', 'result_type': 'recent'}
コード例 #10
0
# Pre-compute every date (MM/DD/YYYY) in the crawl window.
for single_date in daterange(start_date, end_date):
    all_date.append(single_date.strftime("%m/%d/%Y"))

googlenews = GoogleNews()
googlenews.setlang('en')

webscraper = WebScraper()

# Timestamped CSV file name for this run.
now = datetime.now()
save_file_name = now.strftime("googlenews_results_%H+%M_%m-%d-%Y") + '.csv'

count = 0
for cur_date in all_date:
    print("The current date searching ",cur_date)
    # Restrict the query to this single day.
    googlenews.setTimeRange(cur_date, cur_date)

    googlenews.search('tesla')
    webscraper.save_csv(save_file_name, googlenews)

    # Fetch follow-on pages until an empty page marks the end.
    page_counter = 2
    while True:
        googlenews.getpage(page_counter)
        page_items = googlenews.result()
        if not page_items:
            print("last page is ", str(page_counter - 1))
            break
        webscraper.save_csv(save_file_name, googlenews)
        page_counter += 1

    # NOTE(review): `count` is never incremented in this chunk, so this
    # early-exit guard (probably a debugging limiter) can never fire.
    if count == 1:
        break