def crawling_news(company_name_list, start_date, end_date, save_file_name):
    # set up the logger handler
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)

    # define GoogleNews (dates in mm/dd/yyyy)
    googlenews = GoogleNews(lang='en', start=start_date, end=end_date, encode='utf-8')

    # news.google.com search sample
    all_title = []
    logger.info('loop start')
    for i, comp_name in enumerate(company_name_list):
        googlenews.search(comp_name)
        logger.info('%s : %d%%' % (comp_name, ((i + 1) / len(company_name_list)) * 100))
        # grab the result list once instead of re-reading it per item
        for item in googlenews.results():
            all_title.append([item.get('title'), comp_name, fixing_date(item.get('date'))])
        # clear the result list before the next company
        googlenews.clear()

    all_title = pd.DataFrame(all_title)
    all_title.to_csv('%s.csv' % save_file_name)
    logger.info('saved as %s.csv, done!!' % save_file_name)
    return all_title
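# A minimal usage sketch for crawling_news above (not from the original
# source): the companies, mm/dd/yyyy dates, and output name are illustrative,
# and it assumes GoogleNews/pandas/logging are imported and the project's own
# fixing_date helper is in scope.
df = crawling_news(company_name_list=['Apple', 'Samsung'],
                   start_date='01/01/2021', end_date='01/31/2021',
                   save_file_name='tech_news')
print(df.head())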
def getNews(query):
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    results = googleNews.result()
    number = min(len(results), 6)  # cap at six items
    for i, result in enumerate(results):
        if i >= number:
            break
        n = {
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
        }
        if i == 0:
            # only the top story carries an image
            n["image"] = result['img']
        news.append(n)
    googleNews.clear()
    return news
def job(self):
    # download the current database
    self.getDB()
    self.print_header(self.rawFileName)
    self.lineCounter(self.rawFileName)
    x = 0
    for tag in self.newsTags:
        self.logger.info(f"Collecting news from tag: {tag}")
        googlenews = GoogleNews()
        googlenews.clear()
        googlenews.set_lang(self.newsLang)
        googlenews.set_period('1d')  # current API name; setperiod is the deprecated spelling
        googlenews.get_news(tag)
        output = pd.DataFrame(googlenews.results(sort=True))
        x = x + len(output)  # len(output['title']) raises KeyError on empty results
        self.saveToFile(output, self.rawFileName)
    self.logger.info(f"Collected amount of news: {x}")
    self.removeDuplicates(self.rawFileName, self.finalFileName)
    # os.remove(rawFileName)  # delete the buffer file
    # logger.info(f"Removed file with duplicates: {rawFileName}")
    os.rename(self.finalFileName, self.rawFileName)  # rename the final file back to the buffer name
    self.logger.info(f"Renamed: {self.finalFileName} to: {self.rawFileName}")
    self.backupDB()
def googleNewsCrawler(self):
    result_list = []
    googlenews = GoogleNews()
    for i in range(self.__numDays):
        startDateTime = self.__dateTime + timedelta(days=i)
        endDateTime = self.__dateTime + timedelta(days=i + self.__daysSpan)
        googlenews.setTimeRange(start=startDateTime.strftime('%m/%d/%Y'),
                                end=endDateTime.strftime('%m/%d/%Y'))
        googlenews.search(self.__keyWords)
        # search() fetches page 1; append the remaining pages
        for j in range(self.__pagsEveryDay - 1):
            googlenews.getpage(j + 2)
        logging.info('%s__%s append %d items' % (self.__keyWords, startDateTime.date(),
                                                 self.__pagsEveryDay * 10))
        result_list = result_list + googlenews.result()
        googlenews.clear()
        # flush to JSON every ten days to keep memory bounded
        if (i + 1) % 10 == 0:
            self.toJson(result_list)
            result_list = []
    self.toJson(result_list)
def get_company_news_link(company='NaN', news_num=5, time_range='today'):
    if company == 'NaN':
        return 'please input company name'
    news_link = []
    googlenews = GoogleNews()
    googlenews.clear()
    if time_range != 'today':
        # time_range format: 'YYYY/MM/DD-YYYY/MM/DD'; GoogleNews wants mm/dd/yyyy
        start_date = "{1}/{2}/{0}".format(time_range[0:4], time_range[5:7], time_range[8:10])
        end_date = "{1}/{2}/{0}".format(time_range[11:15], time_range[16:18], time_range[19:21])
        googlenews.set_time_range(start_date, end_date)
    googlenews.search(company)
    result = googlenews.result()
    try:
        for num in range(news_num):
            news_link.append(result[num]['link'])
    except IndexError:
        # fewer results than requested
        if len(news_link) == 0:
            return 'No ' + company + ' news in this period, or the network is unstable'
        return news_link
    else:
        return news_link
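# Usage sketch for get_company_news_link (not from the original source). The
# slicing above implies time_range strings of the form 'YYYY/MM/DD-YYYY/MM/DD';
# the company name below is illustrative.
links_today = get_company_news_link(company='TSMC')
links_range = get_company_news_link(company='TSMC', news_num=3,
                                    time_range='2020/12/01-2020/12/28')
print(links_range)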
def extract_google(query_terms, startDate, endDate):
    # dates arrive as 'YYYY-MM-DD'; default to the last seven days
    if len(startDate) == 0:
        startDate = (datetime.datetime.today().date()
                     - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.today().strftime('%Y-%m-%d')
    # convert to the mm/dd/yyyy format GoogleNews expects
    startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').strftime('%m/%d/%Y')
    endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').strftime('%m/%d/%Y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # extract news from Google News
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()
        # form the search term
        googlenews.search("India Technology " + query)
        result = googlenews.result()
        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception:
                print("Trouble downloading so skipping")
                continue
            content = article.text
            # summarize the content: strip any leading bracketed dateline,
            # then keep the first two sentences
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = " ".join(sentences[:2]).strip()
            date = result[n]['date']
            if date.find('ago') != -1:
                # map relative dates like '2 hours ago' to today
                date = datetime.datetime.today().date()
            title = result[n]['title']
            img = result[n]['img']
            # add the extracted info to the final_articles list
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'summary': summary,
                'img': img
            })
    return final_articles
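# Usage sketch for extract_google (not from the original source): empty
# strings fall back to the last seven days, explicit dates use 'YYYY-MM-DD',
# and the newspaper Article / nltk sent_detector dependencies are assumed to
# be in scope; the query terms are illustrative.
articles = extract_google(['AI', '5G'], '2021-01-01', '2021-01-15')
print(len(articles))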
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)  # fetches page 1
    googlenews.getpage(2)    # appends page 2; clearing first would drop page 1
    result = googlenews.result()
    googlenews.clear()
    return result
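# Several snippets in this collection mix search(), getpage(), and clear().
# A small sketch of how results accumulate in the GoogleNews package, per its
# documented behavior (the query string is illustrative):
from GoogleNews import GoogleNews

gn = GoogleNews()
gn.search('python')        # runs the query and stores page 1
gn.getpage(2)              # appends page 2 to the same internal list
print(len(gn.result()))    # pages 1 and 2 together
gn.clear()                 # empties the stored results (the query is kept)
gn.getpage(3)              # now result() holds page 3 only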
def search_google_news(query, google_date):
    # -- Retrieve news articles
    # init GoogleNews
    googlenews = GoogleNews()
    # googlenews.set_period('7d')  # cannot use set_period with set_time_range; use either or
    # googlenews.set_time_range(str(google_date), '2020-10-12')
    googlenews.set_encode('utf-8')
    googlenews.search(query)
    googlenews.getpage(50)  # appends page 50 only, not pages 1-50
    result = googlenews.result()
    # clear before searching again
    googlenews.clear()
    return result
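# The comment above notes that set_period and set_time_range should not be
# combined; a sketch of choosing one or the other (query and dates
# illustrative):
from GoogleNews import GoogleNews

gn = GoogleNews()
gn.set_period('7d')                              # relative window...
# gn.set_time_range('10/01/2020', '10/12/2020') # ...or an absolute mm/dd/yyyy range
gn.search('renewable energy')
print(len(gn.result()))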
def getNews(text):
    # parameters: language and timeframe (qdr codes: 'd' day, 'm' month, 'y' year)
    googlenews = GoogleNews("en", "m")
    googlenews.search(text)  # search() itself fetches the first result page
    headlines = googlenews.gettext()  # headlines only (links and images are also available)
    # join into a single string, because WordCloud expects a string-like object
    text = ' '.join(str(elem) for elem in headlines)
    generateWordCloud(text)
    googlenews.clear()
def get_news(text):
    googlenews = GoogleNews()
    googlenews.search(text)  # fetches page 1
    googlenews.getpage(2)    # appends page 2; clearing first would drop page 1
    result = googlenews.result()
    googlenews.clear()
    # pull title, link, and domain out of the top result
    first = result[0]
    title = first['title']
    link = first['link']
    domain = get_domain(link)
    return title, link, domain
async def search(ctx, *, message):
    googlenews = GoogleNews(lang='en', period='d')
    googlenews.search(message)
    result = googlenews.gettext()
    embed = discord.Embed()
    embed.colour = discord.Colour.from_rgb(255, 225, 135)
    embed.set_author(name="Google NEWS", icon_url="https://i.imgur.com/tDLGRiT.jpg")
    embed.set_footer(text="Data from Google News | WallSt Bot made by Bruno Lazaro.")
    embed.add_field(
        name=f"{message} News",
        # headlines 1-6; slicing avoids an IndexError when fewer come back
        value='\n\n'.join(result[1:7]),
        inline=False)
    await ctx.send(embed=embed)
    googlenews.clear()
def get_links(self, pages=1):
    """Obtains all relevant links from the search, for each company.

    Args:
        pages :: int
            number of Google pages to collect results from
    Stores:
        links :: dict(list[dict])
            keys are search terms, values are lists of result
            dictionaries with the relevant information (e.g. URL)
    """
    gnews = GoogleNews(start=self.date_from, end=self.date_to)
    links = {}
    # obtain all the URLs
    for s in self.search_terms:
        gnews.search(s)  # fetches page 1
        for p in range(2, pages + 1):
            gnews.getpage(p)  # append further pages
        links[s] = gnews.result()  # results accumulate until cleared
        gnews.clear()
    # remove irrelevant links
    for s in self.search_terms:
        tmp = []
        num = dd[s]  # number of relevant words in the search term (external mapping)
        rel_str = ' '.join(s.lower().split()[:num])  # relevant substring
        for d in links[s]:
            # selection criterion: if the search term is 'apple news',
            # filter on 'apple' rather than the full 'apple news'
            if rel_str in d['desc'].lower():
                tmp.append(d)
        links[s] = tmp
    self.search_info = links
    return None
def detailedNews():
    name = request.form["companyName"]
    googlenews = GoogleNews()
    googlenews.clear()
    googlenews.search(name)
    newsresult = googlenews.result(sort=True)
    newshistory = ()
    if 'logged_in' in session:
        time1 = datetime.now()
        unm = session["username"]
        db = pymysql.connect(host="localhost", user="******", password="", database="stock")
        cursor = db.cursor()
        sqlnews = "select * from newshistory where username = %s"
        if cursor.execute(sqlnews, (unm,)) != 0:
            newshistory = cursor.fetchall()
        try:
            with db.cursor() as cursor:
                sql = "insert into newshistory(newsname, username, time) values (%s, %s, %s)"
                cursor.execute(sql, (name, unm, time1))
            db.commit()
        finally:
            db.close()
    # cap the number of headlines shown at 5
    count1 = min(len(newsresult), 5)
    return render_template('DetailedNews.html',
                           title='Display News',
                           l=newsresult,
                           year=datetime.now().year,
                           name=name,
                           newshistory=newshistory,
                           count1=count1)
def get_corpus_in_time_interval(start_time, end_time, args):
    query = args.query
    page_count = args.pages
    gn = GoogleNews(start=start_time, end=end_time)
    corpus = list()
    gn.search(query)
    for i in range(1, page_count + 1):
        gn.clear()      # keep only the current page's results
        gn.getpage(i)
        all_rel_news = gn.result()
        for raw_news in all_rel_news:
            news = News(raw_news)
            if i == 1:
                news.set_relv()  # page-1 hits are marked as relevant
            if news.mainText != 'fail':
                corpus.append(news)
    return corpus
def scrape_the_news():
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent
    topiclist = NLP_news()
    print(topiclist[0])
    googlenews = GoogleNews()
    googlenews.set_lang('en')
    googlenews.set_encode('utf-8')
    googlenews.set_period('7d')
    googlenews.get_news(topiclist[0])
    result = googlenews.results()
    googlenews.clear()
    df = pd.DataFrame(result)
    df = df.drop(['date', 'media'], axis=1)
    df.columns = ['Date', 'Summary', 'Image', 'Link', 'Site', 'Title']
    df = df[['Title', 'Summary', 'Image', 'Link', 'Date', 'Site']]
    conn = psycopg2.connect("dbname=EdTech user=postgres password=edtech123")
    curr = conn.cursor()
    for i, row in df.iterrows():
        try:
            row.Link = 'https://' + row.Link  # get_news links come back without a scheme
            columns = row.keys()
            values = [row[column] for column in columns]
            insert_statement = ("INSERT INTO scrapenews_newslist VALUES "
                                "(nextval('scrapenews_newslist_id_seq'::regclass), %s, %s, %s, %s, %s, %s)")
            curr.execute(insert_statement, tuple(values))
        except Exception:
            print('could not add row', i)
    conn.commit()
    curr.close()
    conn.close()
def getNews(topic, start_time, end_time):
    googlenews = GoogleNews(start=start_time, end=end_time)
    titles = []
    texts = []
    labels = []
    # only page 1 is crawled here; widen the range to add more pages
    for i in range(1, 2):
        googlenews.clear()
        if i == 1:
            googlenews.search(topic)  # search() itself fetches page 1
        else:
            googlenews.getpage(i)
        tmp = googlenews.result()
        (tmp_title, tmp_text) = get_content(tmp)
        titles += tmp_title
        texts += tmp_text
        # page-1 results are labelled 1 (relevant), later pages 0
        if i == 1:
            labels += [1 for _ in range(len(tmp_text))]
        else:
            labels += [0 for _ in range(len(tmp_text))]
    return (titles, texts, labels)
def googlenews_function(keyword='台積電', language='cn',
                        start_date='2020/12/01', end_date='2020/12/28'):
    '''
    - date range
    - keyword
    - language
    - (number of pages to crawl)
    '''
    googlenews = GoogleNews()
    googlenews.clear()
    googlenews.set_encode('utf-8')
    googlenews.set_lang(language)
    # convert 'YYYY/MM/DD' into the 'MM/DD/YYYY' format GoogleNews expects
    start_year, start_month, start_day = start_date.split('/')
    all_date_start = '{}/{}/{}'.format(start_month, start_day, start_year)
    end_year, end_month, end_day = end_date.split('/')
    all_date_end = '{}/{}/{}'.format(end_month, end_day, end_year)
    googlenews.set_time_range(start=all_date_start, end=all_date_end)
    googlenews.search(keyword)
    data = googlenews.result()
    print("Total records:", len(data))
    news = pd.DataFrame(data)
    # news.to_csv("GoogleNews_" + keyword + "_" + start_date.replace('/', '-')
    #             + '_to_' + end_date.replace('/', '-') + ".csv", index=False)
    return news
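# Usage sketch for googlenews_function (not from the original source); the
# keyword, language, and 'YYYY/MM/DD' dates are illustrative.
news_df = googlenews_function(keyword='Apple', language='en',
                              start_date='2021/01/01', end_date='2021/01/31')
print(news_df.head())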
def getNews(query):
    googleNews = GoogleNews()
    googleNews.search(query)
    news = []
    results = googleNews.result()
    number = min(len(results), 7)  # cap at seven items
    for i, result in enumerate(results):
        if i >= number:
            break
        news.append({
            "title": result['title'],
            "description": result['desc'],
            "link": result['link'],
            "date": result['date'],
            "image": result['img']
        })
    googleNews.clear()
    return news
class Engine:
    def __init__(self):
        self.news = GoogleNews()
        self.news.set_lang('en')
        # self.news.set_time_range('01/01/2000', '01/01/2015')
        self.news.set_encode('utf-8')
        self.pageNumber = 1
        self.searchTerm = ""

    def nextPage(self):
        # comparing the result method itself to None is always False,
        # so track whether search() has been called via searchTerm
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")
        self.pageNumber += 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def previousPage(self):
        if not self.searchTerm:
            raise RuntimeError("Engine has not searched yet")
        if self.pageNumber <= 1:
            return False
        self.pageNumber -= 1
        self.news.clear()
        self.news.getpage(self.pageNumber)
        return len(self.news.result()) != 0

    def search(self, term):
        self.searchTerm = term
        self.news.search(term)
        if len(self.news.result()) == 0:
            return False
        return self.news.result()

    def getPageNumber(self):
        return self.pageNumber

    def getResults(self):
        return self.news.result()

    def clear(self):
        self.news.clear()

    def resetPageNumber(self):
        self.pageNumber = 1
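# A short driver for the Engine class (not from the original source); the
# search term and the three-page cap are illustrative.
engine = Engine()
results = engine.search('electric vehicles')
if results:
    for r in results:
        print(r['title'])
    # keep paging until a page comes back empty or the cap is reached
    while engine.getPageNumber() < 3 and engine.nextPage():
        for r in engine.getResults():
            print(r['title'])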
model.load_state_dict(torch.load('model/best_model_state.bin', map_location='cpu'))
model = model.to(device)
# review_text = input('Enter the review you want to check:\n')

## Google News start (assumes googlenews = GoogleNews() earlier in the script)
news_content = []
searchInput = input('Enter the search keyword:\n')
googlenews.search(searchInput)  # fetches page 1
for page in range(2, 1 + 1):    # raise the upper bound to append further pages
    googlenews.getpage(page)
for item in googlenews.result():
    news_content.append(item['desc'])
googlenews.clear()
## End

for desc in news_content:
    encoded_review = tokenizer.encode_plus(
        desc,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
def __extrai_noticias_gnews(q, dia_inicio, dia_fim, num_limite_paginas=1,
                            lang='pt-BR', sleep=1, tentativas=5):
    """
    Returns a data frame with the news obtained from the Google News tab.

    Parameters
    ----------
    q : str
        Search string
    dia_inicio, dia_fim : datetime.date
        Start and end dates for the search
    num_limite_paginas : int
        Maximum number of pages to fetch
    lang : str
        Language code for the search (default pt-BR)
    sleep : int
        Number of seconds to wait between pages and after each page-fetch error
    tentativas : int
        Number of attempts at fetching a page before the extraction is
        considered finished

    Returns
    -------
    resultados : DataFrame
        Dataframe with the search results
    """
    # Search string properly escaped for a URL
    # q = urllib.parse.quote(q)

    # Date strings in the format expected by the GoogleNews lib
    formato_data = '%m/%d/%Y'
    dia_inicio_formatado = dia_inicio.strftime(formato_data)
    dia_fim_formatado = dia_fim.strftime(formato_data)

    # Instantiate the Google News search interface with the right language and period
    gn = GoogleNews(lang=lang, start=dia_inicio_formatado, end=dia_fim_formatado)

    # List that accumulates the search results
    resultados = []

    # Fetch the first page
    logger = logging.getLogger('covidata')
    logger.info('Fetching page 1')
    gn.search(q)
    resultados = resultados + gn.result()
    gn.clear()

    # From page 2 onwards
    for i in range(2, num_limite_paginas + 1):
        logger.info(f'Fetching page {i}')
        # Fetch the page
        gn.getpage(i)
        # Append the page's results to the list
        resultados = resultados + gn.result()
        # If the page query returned no results
        if gn.result() == []:
            logger.info(f'The query for page {i} returned no results')
            # Decrement the attempt counter
            tentativas = tentativas - 1
            logger.info(f'*** {tentativas} attempts remaining ***')
            # Once the counter reaches zero, stop
            if tentativas < 1:
                break
        else:
            # Pause the script for `sleep` seconds before the next page
            logger.info(f'Pausing for {sleep} seconds')
            time.sleep(sleep)
        # Clear the cached results
        gn.clear()

    # Build and return the dataframe
    return pd.DataFrame(resultados)
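# Usage sketch (not from the original source): fetch up to three pages of
# Portuguese-language results for an illustrative query and date window,
# calling the function from within its own module.
import datetime

df = __extrai_noticias_gnews('auxílio emergencial',
                             datetime.date(2020, 5, 1),
                             datetime.date(2020, 5, 7),
                             num_limite_paginas=3)
print(df.shape)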
run = True  # loop flag; assumes gnews = GoogleNews() was created earlier
main_topic = input('Choose a topic: ')
while run:
    try:
        gnews.search(main_topic)
        for n, result in enumerate(gnews.result()):
            print(n, result['title'])
        # choose which article to pick
        article = input('\nChoose an article by its index or choose [all] to list every title and link: ')
        if article == 'all':
            [print(f'{n}: {new}') for n, new in enumerate(gnews.gettext())]
            print('--')
            [print(f'{n}: {link}') for n, link in enumerate(gnews.get__links())]
        else:
            article = int(article)
            list_artcl = gnews.gettext()
            print(f'Article - Title: {list_artcl[article]}')
            list_links = gnews.get__links()
            print(f'Article - Link: {list_links[article]}')
        print('==========================================================================\n')
        go_further = input('Do you want to read other articles? (y/n) ').lower()
        if go_further in ('y', 'yes'):
            gnews.clear()  # clear the article list before looping again
        else:
            run = False  # break the loop
    except Exception:
        print('Error :S')
        run = False  # break the loop
def run(self):
    calList = self.genCalList(self.start, self.end)
    posL, negL, neuL, comL = [], [], [], []
    pageCount = 10
    t = time.time()
    for date in calList:
        print(f'\n\n{date}')
        PREV_RES = []
        RESULT = {}
        continu = True
        for page in range(1, pageCount + 1):
            t1 = time.time()
            googlenews = GoogleNews(start=date, end=date, lang='en')
            googlenews.search(self.word)
            googlenews.getpage(page)
            results = googlenews.result()
            googlenews.clear()
            if results == []:
                continu = False
                break
            # de-duplicate, then drop anything already seen on the previous page
            # (search() always re-fetches page 1, so duplicates are expected)
            results = [dict(t) for t in {tuple(d.items()) for d in results}]
            results = [i for i in results if i not in PREV_RES]
            if len(results) < 1:
                break
            for res in results:
                RESULT[res['title']] = res['link']
            PREV_RES = results
            print(f"Page: {page}. Name: {self.word}. t={round(time.time() - t1, 2)}s")
        if not os.path.exists(f'./news/data/{self.word}'):
            os.mkdir(f'./news/data/{self.word}')
        old_data = {}
        if os.path.isfile(f'./news/data/{self.word}/{self.word}.json'):
            with open(f'./news/data/{self.word}/{self.word}.json', 'r') as JSON:
                old_data = json.load(JSON)
        if continu == False:
            print('No results.')
            with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
                old_data[date] = {}
                json.dump(old_data, JSON)
            continue
        titles = [i for i in RESULT.keys()]
        links = [i for i in RESULT.values()]
        body = self.genBody(links)
        allWords = [self.word] + self.synonyms
        print('\nFetching <p> text...')
        oldText = self.genAllText(body, allWords)
        allText = [i for i in oldText if i != '']
        print(f"Texts acquired of total: {(len(allText) / len(oldText)) * 100}%")
        if allText == []:
            with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
                old_data[date] = {}
                json.dump(old_data, JSON)
            continue
        print('Sentiment analysis...')
        pos, neg, neu, com, count = self.measureSentiment(allText)
        pos_, neg_, neu_, com_ = self.avSentiment(pos, neg, neu, com, count)
        posL += [pos_]
        negL += [neg_]
        neuL += [neu_]
        comL += [com_]
        formatted = {'synonyms': self.synonyms, 'pos': pos_, 'neg': neg_,
                     'neu': neu_, 'com': com_, 'raw text': allText,
                     'sample size': len(RESULT), 'page count': pageCount}
        old_data[date] = formatted
        with open(f'./news/data/{self.word}/{self.word}.json', 'w') as JSON:
            json.dump(old_data, JSON)
    print(time.time() - t)
def parse(self):
    # loop through each configured news source
    for i in range(len(self.news_sources)):
        if self.news_sources[i] == 'NewsYCombinator':
            # fetch the front page and collect all story links
            self.markup.append(requests.get('https://news.ycombinator.com/').text)
            soup = BeautifulSoup(self.markup[-1], 'html.parser')
            links = soup.findAll("a", {"class": "storylink"})
            self.saved_links = []
            # keep only front-page stories that mention a keyword
            for link in links:
                for keyword in self.keywords:
                    if keyword in link.text:
                        self.saved_links.append(link)
            # save the link targets, then declare that articles have been found
            for a in range(len(self.saved_links)):
                self.read_links.append(str(self.saved_links[a]['href']))
            if len(self.saved_links) > 0:
                self.hasArticles = True
        elif self.news_sources[i] == 'NewYorkTimes':
            # To get your API key, create an account and an app on the NYT
            # developers site, select the Search API, and copy the key from there.
            api = articleAPI('API KEY')
            # search every keyword for articles published since yesterday;
            # begin_date is an integer of the form yyyymmdd
            yesterday = datetime.datetime.now().date() - datetime.timedelta(days=1)
            begin_date = yesterday.year * 10000 + yesterday.month * 100 + yesterday.day
            for a in range(len(self.keywords)):
                articles = api.search(q=self.keywords[a], begin_date=begin_date, page=1)
                self.list_of_articles = []
                for docs in articles['response']['docs']:
                    article_blurbs = (docs.get('headline').get('main') + '\n'
                                      + docs.get('web_url') + '\n' + docs.get('snippet'))
                    self.list_of_articles.append(str(article_blurbs))
                # if an article was found, set the flag
                if len(self.list_of_articles) > 0:
                    self.hasArticles = True
        elif self.news_sources[i] == 'GoogleNews':
            # search yesterday through today; timedelta avoids the broken
            # day/month arithmetic at month boundaries
            today = datetime.datetime.now().date()
            yesterday = today - datetime.timedelta(days=1)
            googlenews = GoogleNews(lang='en',
                                    start=yesterday.strftime('%m/%d/%Y'),
                                    end=today.strftime('%m/%d/%Y'))
            self.googleArticles = [[] for z in range(len(self.keywords))]
            for a in range(len(self.keywords)):
                googlenews.search(self.keywords[a])
                gnews = googlenews.result()
                for docs2 in gnews:
                    self.googleArticles[a].append(
                        str(docs2.get('title')) + '\n' + str(docs2.get('date')) + '\n'
                        + str(docs2.get('link')) + '\n' + str(docs2.get('desc')))
                googlenews.clear()
            # the outer list is never empty, so check the per-keyword lists
            if any(len(lst) > 0 for lst in self.googleArticles):
                self.hasArticles = True