def update():
    """Interactively ask (in Russian) whether to refresh the data.

    Prompts until the user types 1 (yes) or 2 (no).  On "yes" it runs the
    full pipeline: stock prices, news download, then news classification.
    Non-numeric input is silently re-prompted.
    """
    print('\n\nОбновить данные?')
    print('\t1 - да\n\t2 - нет')
    while True:
        print(' > ', end='')
        try:
            answer = int(input())
        except ValueError:
            # Not a number at all -- just ask again.
            continue
        if answer == 1 or answer == 2:
            break
    if answer == 2:
        return
    print("\n", end='')
    StocksParser.parse(companies_file, stocks_directory)
    print('\nОжидание новостей... Это может занять некоторое время\n')
    NewsParser.parse(driver_path, companies_file, news_directory)
    Classifier.classify(companies_file, news_directory, prnews_directory,
                        train_set_path)
def test_count_words(self, mock_generate_phrases_array):
    """count_words() must map each distinct word to its total number of
    occurrences across all (mocked) phrase arrays."""
    news_parser = NewsParser(url='fake_one')
    mock_generate_phrases_array.return_value = [[
        'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira', '27',
        'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior', 'meio',
        'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
        'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
        'monetária', 'federal', 'reserve', 'bc', 'eua'
    ], [
        'moeda', 'norte-americana', 'recuou', '0,89%', 'vendida', 'r$',
        '5,1577'
    ], ['veja', 'cotações']]
    # 'moeda' and 'norte-americana' appear in two phrases, hence count 2;
    # everything else occurs exactly once.
    dictionary_test = {
        'dólar': 1, 'fechou': 1, 'forte': 1, 'queda': 1, 'nesta': 1,
        'segunda-feira': 1, '27': 1, 'dia': 1, 'fraqueza': 1, 'moeda': 2,
        'norte-americana': 2, 'exterior': 1, 'meio': 1, 'tensões': 1,
        'diplomáticas': 1, 'estados': 1, 'unidos': 1, 'china': 1,
        'investidores': 1, 'modo': 1, 'espera': 1, 'antes': 1, 'reunião': 1,
        'política': 1, 'monetária': 1, 'federal': 1, 'reserve': 1, 'bc': 1,
        'eua': 1, 'recuou': 1, '0,89%': 1, 'vendida': 1, 'r$': 1,
        '5,1577': 1, 'veja': 1, 'cotações': 1
    }
    self.assertDictEqual(news_parser.count_words(), dictionary_test)
def test_content_raw_to_phrases(self, mock_get_news):
    """content_raw_to_phrases() should turn the mocked article paragraphs
    into lower-cased, punctuation-free phrase strings.

    Fix: the fixture file was opened without ever being closed; a context
    manager now releases the handle deterministically.
    """
    with open('html_test.html', 'r') as data:
        mock_get_news.return_value = soup(data, 'html.parser').findAll(
            'p', {'class': 'content-text__container'})
    news_parser = NewsParser(url='fake_one')
    response_test = [
        ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de fraqueza da moeda norte-americana no exterior em meio às tensões diplomáticas entre estados unidos e china com os investidores em modo de espera antes da reunião de política monetária do federal reserve bc dos eua',
        ' a moeda norte-americana recuou 0,89% vendida a r$ 5,1577',
        'veja mais cotações'
    ]
    self.assertEqual(news_parser.content_raw_to_phrases(), response_test)
def test_phrase_to_words(self):
    """phrase_to_words() should split a phrase into its content words, in
    order, dropping short connective words (o, em, de, da, ...)."""
    parser = NewsParser(url='fake_one')
    sample_phrase = ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de fraqueza da moeda norte-americana no exterior em meio às tensões diplomáticas entre estados unidos e china com os investidores em modo de espera antes da reunião de política monetária do federal reserve bc dos eua'
    expected_words = [
        'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira', '27',
        'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior', 'meio',
        'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
        'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
        'monetária', 'federal', 'reserve', 'bc', 'eua'
    ]
    self.assertEqual(parser.phrase_to_words(sample_phrase), expected_words)
def test_rank_words(self, mock_count_words):
    """rank_words() must return (word, count) tuples sorted by count,
    descending; the two count-2 words come first, the rest keep the
    mocked dictionary's insertion order."""
    news_parser = NewsParser(url='fake_one')
    mock_count_words.return_value = {
        'dólar': 1, 'fechou': 1, 'forte': 1, 'queda': 1, 'nesta': 1,
        'segunda-feira': 1, '27': 1, 'dia': 1, 'fraqueza': 1, 'moeda': 2,
        'norte-americana': 2, 'exterior': 1, 'meio': 1, 'tensões': 1,
        'diplomáticas': 1, 'estados': 1, 'unidos': 1, 'china': 1,
        'investidores': 1, 'modo': 1, 'espera': 1, 'antes': 1, 'reunião': 1,
        'política': 1, 'monetária': 1, 'federal': 1, 'reserve': 1, 'bc': 1,
        'eua': 1, 'recuou': 1, '0,89%': 1, 'vendida': 1, 'r$': 1,
        '5,1577': 1, 'veja': 1, 'cotações': 1
    }
    rank_test = [('moeda', 2), ('norte-americana', 2), ('dólar', 1),
                 ('fechou', 1), ('forte', 1), ('queda', 1), ('nesta', 1),
                 ('segunda-feira', 1), ('27', 1), ('dia', 1),
                 ('fraqueza', 1), ('exterior', 1), ('meio', 1),
                 ('tensões', 1), ('diplomáticas', 1), ('estados', 1),
                 ('unidos', 1), ('china', 1), ('investidores', 1),
                 ('modo', 1), ('espera', 1), ('antes', 1), ('reunião', 1),
                 ('política', 1), ('monetária', 1), ('federal', 1),
                 ('reserve', 1), ('bc', 1), ('eua', 1), ('recuou', 1),
                 ('0,89%', 1), ('vendida', 1), ('r$', 1), ('5,1577', 1),
                 ('veja', 1), ('cotações', 1)]
    self.assertEqual(news_parser.rank_words(), rank_test)
def main():
    """Rank the words of a sample G1 economy article and print the result.

    Fix: removed the commented-out intermediate-stage calls; rank_words()
    drives the whole pipeline (get_news -> content_raw_to_phrases ->
    generate_phrases_array -> count_words) internally, so the dead lines
    only obscured that.
    """
    news_parser = NewsParser(
        "https://g1.globo.com/economia/noticia/2020/07/27/dolar.ghtml")
    ranked_words = news_parser.rank_words()
    print(news_parser)
    print(ranked_words)
def update():
    """Refresh the News table rows with the latest RIA headlines.

    Each scraped item overwrites the row whose ID matches its 1-based
    position in the parsed list.  Returns 0 on completion.

    Fix: the original interpolated scraped titles/URLs straight into the
    SQL text with str.format -- any quote in a headline would break the
    statement (SQL injection).  Parameterized queries are used instead,
    and the connection is closed even if an execute fails.
    """
    parser = NewsParser()
    newspack = parser.riaParser()
    db = sqlite3.connect("database.db")
    try:
        cursor = db.cursor()
        for row_id, item in enumerate(newspack, start=1):
            cursor.execute('UPDATE News SET Title=? WHERE ID=?',
                           (item["title"], row_id))
            cursor.execute('UPDATE News SET URL=? WHERE ID=?',
                           (item["url"], row_id))
        db.commit()
    finally:
        db.close()
    return 0
def makeNewsURLForm(NewsURL, startYear, endYear, startMonth, endMonth):
    """Build the paginated date-listing URLs for every day in the range.

    For each day between startYear/startMonth and endYear/endMonth
    (inclusive, future days excluded) this produces
    ``NewsURL + YYYYMMDD + '&page=N'`` for every available page.

    Fixes:
    - the original zero-padded by reassigning the loop variables to
      strings, so ``datetime.now().month == month`` compared int to str
      and the "skip future days" guard never fired for months < 10;
    - ``start`` was never reset to 1 after the first year, so middle
      years of a multi-year range lost their early months.
    """
    madeURL = []
    today = datetime.now()
    for year in range(startYear, endYear + 1):
        start = startMonth if year == startYear else 1
        end = endMonth if year == endYear else 12
        for month in range(start, end + 1):
            for day in range(1, calendar.monthrange(year, month)[1] + 1):
                # Never ask for dates that have not happened yet.
                if year == today.year and month == today.month \
                        and day > today.day:
                    continue
                url = NewsURL + str(year) + str(month).zfill(2) \
                    + str(day).zfill(2)
                # Requesting a page past the end lands on the last page,
                # so &page=10000 reveals the true page count.
                totalpage = NewsParser.findNewsTotalpage(url + '&page=10000')
                for page in range(1, totalpage + 1):
                    madeURL.append(url + '&page=' + str(page))
    return madeURL
def test_generate_phrases_array(self, mock_content_raw_to_phrases):
    """generate_phrases_array() must convert each mocked phrase string
    into its own list of content words, preserving phrase order."""
    news_parser = NewsParser(url='fake_one')
    mock_content_raw_to_phrases.return_value = [
        ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de fraqueza da moeda norte-americana no exterior em meio às tensões diplomáticas entre estados unidos e china com os investidores em modo de espera antes da reunião de política monetária do federal reserve bc dos eua',
        ' a moeda norte-americana recuou 0,89% vendida a r$ 5,1577',
        'veja mais cotações'
    ]
    # One inner list per input phrase, connective words removed.
    phrases_array_test = [[
        'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira', '27',
        'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior', 'meio',
        'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
        'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
        'monetária', 'federal', 'reserve', 'bc', 'eua'
    ], [
        'moeda', 'norte-americana', 'recuou', '0,89%', 'vendida', 'r$',
        '5,1577'
    ], ['veja', 'cotações']]
    self.assertEqual(news_parser.generate_phrases_array(),
                     phrases_array_test)
def populate(self):
    """Populate the HitStore data members for query by the server code.

    Scans every stored article row, wraps it in a dict, and records the
    link of each article that mentions the configured person (as decided
    by NewsParser.is_article_hit), bumping self.total_hits.

    Fix: removed the commented-out remnants of the previous
    document-store implementation (find()/parse_doc path).
    """
    se = StorageEngine('sqlite3')
    parser = NewsParser(self.last_name, self.first_name)
    articles = se.get_collection('articles')
    for document in articles:
        # Row layout assumed: (title, author, date, body, link)
        # -- TODO confirm against the articles table schema.
        news = {
            'title': document[0],
            'author': document[1],
            'date': document[2],
            'body': document[3],
            'link': document[4]
        }
        if parser.is_article_hit(news):
            self.total_hits += 1
            self.article_list.append(news['link'])
def update_server(parameter):
    """Perform a data update on behalf of the server.

    '2' declines the update (returns a blank placeholder), anything other
    than '1' is rejected with an error string, and '1' runs the full
    pipeline (stocks, news, classification) and reports 'Updated'.
    """
    if parameter not in ('1', '2'):
        return 'Allowed answers: 1 or 2'
    if parameter == '2':
        return ' '
    StocksParser.parse(companies_file, stocks_directory)
    print('\nWaiting for news... It may take a while\n')
    NewsParser.parse(driver_path, companies_file, news_directory)
    Classifier.classify(companies_file, news_directory, prnews_directory,
                        train_set_path)
    return 'Updated'
class ArticleContentParser(): def __init__(self): self.htmlGetter = NewsParser() def getArticleContent(self, url): article_content = ['-1'] # 初始化,如果html无法打开,beautifulsoup无法读取则可直接返回空值 news_list = [] # 用来存放新闻 try: html = self.htmlGetter.getHtmlByRequests(url) soup = BeautifulSoup(html, 'html.parser') article_content = soup.select('.article-content') except Exception as e: print 'Beautiful Soup load error: ', e s = str(e) getArticleContentError = get_logger('ArticleContentParser.log') getArticleContentError.error = ( 'ArticleContentParser getArticleContentError Beautiful soup load error: ' + s) # 如果前面出现错误直接返回空值 if article_content == ['-1']: return news_list # 判断是不是另一种问答形式文本 if article_content == []: article_content = soup.select('.answer-text-full') """ 由于这种类型的网页不更改header无法直接通过toutiao+/group/id的形式打开,所以解析网页也没有什么意义 if article_content == []: article_content = soup.select('.tt-ignored-node') """ for news in article_content: newsText = news.get_text() news_list.append(newsText) return news_list
def test_get_news(self):
    """Fetching a real G1 article must produce a list of page nodes."""
    parser = NewsParser(
        url='https://g1.globo.com/economia/noticia/2020/07/27/dolar.ghtml')
    result = parser.get_news()
    self.assertIsInstance(result, list)
def crawling(self, parseURLs):
    """Crawl every listing URL in *parseURLs* page by page with headless
    Chrome, download each linked article, and save non-empty title/body
    pairs to per-process .txt files under self.DATA_DIR.

    Pagination stops when a page yields an error (bare except below);
    articles whose extraction fails are skipped silently.
    """
    print("Crawling Start!")
    for url in parseURLs:
        pageN = 1
        # Headless Chrome: the listing page is rendered by JavaScript.
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('disable-gpu')
        options.add_argument('--start-maximized')
        driver = webdriver.Chrome(self.driverPATH, chrome_options=options)
        end = True
        while end:
            articles = []
            driver.get(url + "&page=" + str(pageN))
            print(str(os.getpid()) + ":" + url + "&page=" + str(pageN))
            time.sleep(1.5)  # let the page render before scraping
            pages = driver.find_elements_by_css_selector(
                '#newsWrp > ul > li')
            try:
                # Collect the article links from this listing page.
                for page in pages:
                    articles.append(
                        page.find_element_by_css_selector(
                            'a').get_attribute('href'))
                del pages
                for contentURL in articles:
                    time.sleep(0.01)  # throttle article downloads
                    contentHtml = self.getURLdata(contentURL)
                    documentContent = BeautifulSoup(
                        contentHtml.content, 'html.parser')
                    try:
                        # Article headline; blank titles are skipped.
                        articleTitle = documentContent.find_all(
                            'h2', {'class': 'end_tit'})
                        title = ''
                        title += NewsParser.clearHeadlineE(
                            str(articleTitle[0].find_all(text=True)))
                        if not title:
                            continue
                        # Article body; NOTE(review): 'articeBody' looks
                        # misspelled -- confirm it matches the site markup.
                        articleBodyContents = documentContent.find_all(
                            "div", {"id": "articeBody"})
                        content = NewsParser.clearContent(
                            list(articleBodyContents[0].find_all(
                                text=True)))
                        if not content:
                            continue
                        # One file per saved article: <pid>_<seq>.txt
                        fileName = self.DATA_DIR + '/' + str(
                            os.getpid()) + '_' + str(self.number) + ".txt"
                        self.fileWrite(fileName, title, content)
                        self.number += 1
                        del content, title
                        del articleTitle, articleBodyContents
                        del contentHtml, documentContent
                    except Exception:
                        # Extraction failed for this article: drop it.
                        del contentHtml, documentContent
                        pass
                del articles
            except:
                # Any listing-page failure ends pagination for this URL.
                end = False
            pageN += 1
        driver.quit()
# Callback-button dispatcher for the COVID helper chat bot: routes each
# inline-keyboard answer (stats, news, pharmacies, shops, QR code, books,
# Moscow excursion quiz) to its handler and advances the user's branch state.
# NOTE(review): the source of this function is partially corrupted -- the
# segment "print('User:'******'books':" contains redacted tokens (likely a
# user log print followed by "if answer == 'books':"); preserved verbatim
# below because the missing code cannot be reconstructed with certainty.
def buttons_answer_cb(bot, event): UserID = event.data['from']['userId'] answer = event.data['callbackData'] print('User:'******'books': user.change_branch(UserID, 'book') if answer == 'QR': if qr.check_City(UserID) == 404: user.change_branch(UserID, 'geolocation') bot.send_text( chat_id=UserID, text= "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение", inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Хорошо", "callbackData": "geo_ok", "style": "attention" }, { "text": "Не хочу!", "callbackData": "geo_neok", "style": "primary" }, ]]))) else: bot.send_text( chat_id=UserID, text= "Сайт, на котором вы можете получить QR код для выхода на улицу" ) bot.send_text(chat_id=UserID, text=qr.recive_qr(UserID)) if answer == "geo_ok": bot.send_text(chat_id=UserID, text="Жду ваше расположение", inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Отмена", "callbackData": "F", "style": "attention" }, ]]))) if answer == "geo_neok": user.change_branch(UserID, 'main') if user.check_branch(UserID) == 404: user.add(UserID) if answer == "F": user.change_branch(UserID, 'main') bot.send_text(chat_id=UserID, text="Отменяюсь") if user.check_branch(UserID) == 404: user.add(UserID) if answer == "stats": bot.send_text(chat_id=UserID, text="Статистика на сегодняшний день:") parser = StatsParser() data = parser.get_data(event.data['from']['userId']) for d in data: message = f"""{d['title']} Заболевших: {d['sick']} (+ {d['sick_incr']}) Умерших: {d['died']} (+ {d['died_incr']}) Выздоровевших {d['healed']} (+ {d['healed_incr']})""" bot.send_text(chat_id=UserID, text=message) user.change_branch(UserID, 'choose') if answer == 'news': p = NewsParser() data = p.mailruParser() for news in data: message = f"""От Mail.ru {news['title']} {news['url']}""" bot.send_text(chat_id=event.data['from']['userId'], text=message) if answer == 'pharmacy': if qr.check_City(UserID) == 404: user.change_branch(UserID, 'geolocation') bot.send_text( 
chat_id=UserID, text= "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение", inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Хорошо", "callbackData": "geo_ok", "style": "attention" }, { "text": "Не хочу!", "callbackData": "geo_neok", "style": "primary" }, ]]))) else: print('pharmacy') p = PlacesParse() data = p.getPharmacy(UserID) for pharmacy in data: bot.send_text(chat_id=event.data['from']['userId'], text=pharmacy['name']) bot.send_text(chat_id=event.data['from']['userId'], text=pharmacy['url']) user.change_branch(UserID, 'main') if answer == 'shops': if qr.check_City(UserID) == 404: user.change_branch(UserID, 'geolocation') bot.send_text( chat_id=UserID, text= "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение", inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Хорошо", "callbackData": "geo_ok", "style": "attention" }, { "text": "Не хочу!", "callbackData": "geo_neok", "style": "primary" }, ]]))) else: print('shops') p = PlacesParse() data = p.getShop(UserID) for shop in data: bot.send_text(chat_id=event.data['from']['userId'], text=shop['name']) bot.send_text(chat_id=event.data['from']['userId'], text=shop['url']) user.change_branch(UserID, 'main') if answer == "da": print('u') user.change_branch(UserID, 'main') bot.send_text(chat_id=UserID, text="Вы подписались на уведомления") if answer == "net": user.change_branch(UserID, 'main') if user.check_branch(UserID) == 'main': bot.send_text(chat_id=event.data['from']['userId'], text="Главное меню", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Статистика", "callbackData": "stats", "style": "attention" }], [{ "text": "Новоcти COVID", "callbackData": "news", "style": "attention" }], [{ "text": "Ближайшие магазины", "callbackData": "shops", "style": "primary" }], [{ "text": "Ближайшие аптеки", "callbackData": "pharmacy", "style": "primary" }], [{ "text": "QR код", "callbackData": "QR" }], [{ 
"text": "Экскурсия", "callbackData": "exc" }], [{ "text": "Сказки для детей", "callbackData": "books" }]]))) if user.check_branch(UserID) == 'choose': bot.send_text(chat_id=UserID, text="Хотите подписаться на ежедневные уведомления?", inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Да", "callbackData": "da", "style": "attention" }, { "text": "Нет", "callbackData": "net", "style": "primary" }, ]]))) if answer == 'exc': bot.send_text( chat_id=event.data['from']['userId'], text= 'Мы предлагаем вам небольшую экскурсию по достопримичательностям Москвы! Надеемся вы узнаете для себя что-нибудь новое и интересное, а так же хорошо проведете время :)', inline_keyboard_markup="{}".format( json.dumps([[ { "text": "Начнем же!", "callbackData": "moscow_view_1", "style": "primary" }, ]]))) user.change_branch(UserID, 'excursion') if user.check_branch(UserID) == 'excursion': if event.data['callbackData'] == "moscow_view_1": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[0]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[0]['info']) bot.send_text(chat_id=event.data['from']['userId'], text=f"""{dataset[0]['q']} 1. {dataset[0]['a2']} 2. {dataset[0]['a3']} 3. 
{dataset[0]['a1']}""", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "1", "callbackData": "moscow_view_2ff", "style": "primary" }, { "text": "2", "callbackData": "moscow_view_2f", "style": "primary" }, { "text": "3", "callbackData": "moscow_view_2t", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_2t" or event.data[ 'callbackData'] == "moscow_view_2f" or event.data[ 'callbackData'] == "moscow_view_2ff": if event.data['callbackData'] == "moscow_view_2f" or event.data[ 'callbackData'] == "moscow_view_2ff": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[0]['ra']) else: bot.send_text(chat_id=event.data['from']['userId'], text="Правильно!") bot.send_text(chat_id=event.data['from']['userId'], text=dataset[1]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[1]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_3", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_3": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[2]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[2]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_4", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_4": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[3]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[3]['info']) bot.send_text(chat_id=event.data['from']['userId'], text=f"""{dataset[3]['q']} 1. {dataset[3]['a2']} 2. {dataset[3]['a1']} 3. 
{dataset[3]['a3']}""", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "1", "callbackData": "moscow_view_5f", "style": "primary" }, { "text": "2", "callbackData": "moscow_view_5t", "style": "primary" }, { "text": "3", "callbackData": "moscow_view_5ff", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_5f" or event.data[ 'callbackData'] == "moscow_view_5t" or event.data[ 'callbackData'] == "moscow_view_5ff": if event.data['callbackData'] == "moscow_view_5f" or event.data[ 'callbackData'] == "moscow_view_5ff": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[3]['ra']) else: bot.send_text(chat_id=event.data['from']['userId'], text="Правильно!") bot.send_text(chat_id=event.data['from']['userId'], text=dataset[4]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[4]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_6", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_6": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[5]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[5]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_7", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_7": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[6]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[6]['info']) bot.send_text(chat_id=event.data['from']['userId'], text=f"""{dataset[6]['q']} 1. {dataset[6]['a2']} 2. {dataset[6]['a1']} 3. 
{dataset[6]['a3']}""", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "1", "callbackData": "moscow_view_8t", "style": "primary" }, { "text": "2", "callbackData": "moscow_view_8f", "style": "primary" }, { "text": "3", "callbackData": "moscow_view_8ff", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_8f" or event.data[ 'callbackData'] == "moscow_view_8t" or event.data[ 'callbackData'] == "moscow_view_8ff": if event.data['callbackData'] == "moscow_view_8f" or event.data[ 'callbackData'] == "moscow_view_8ff": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[6]['ra']) else: bot.send_text(chat_id=event.data['from']['userId'], text="Правильно!") bot.send_text(chat_id=event.data['from']['userId'], text=dataset[7]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[7]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_9", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_9": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[8]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[8]['info'], inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Далее", "callbackData": "moscow_view_10", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_10": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[9]['url']) bot.send_text(chat_id=event.data['from']['userId'], text=dataset[9]['info']) bot.send_text(chat_id=event.data['from']['userId'], text=f"""{dataset[9]['q']} 1. {dataset[9]['a3']} 2. {dataset[9]['a1']} 3. 
{dataset[9]['a2']}""", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "1", "callbackData": "moscow_view_11f", "style": "primary" }, { "text": "2", "callbackData": "moscow_view_11ff", "style": "primary" }, { "text": "3", "callbackData": "moscow_view_11t", "style": "primary" }]]))) if event.data['callbackData'] == "moscow_view_11f" or event.data[ 'callbackData'] == "moscow_view_11t" or event.data[ 'callbackData'] == "moscow_view_11ff": if event.data['callbackData'] == "moscow_view_11f" or event.data[ 'callbackData'] == "moscow_view_11ff": bot.send_text(chat_id=event.data['from']['userId'], text=dataset[9]['ra']) else: bot.send_text(chat_id=event.data['from']['userId'], text="Правильно!") bot.send_text(chat_id=event.data['from']['userId'], text="Спасибо за внимание!", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Конец экскурсии!", "callbackData": "end", "style": "primary" }]]))) user.change_branch(UserID, 'main') if user.check_branch(UserID) == 'book': if event.data['callbackData'] == "1" or event.data[ 'callbackData'] == "2" or event.data[ 'callbackData'] == "3" or event.data[ 'callbackData'] == "4" or event.data[ 'callbackData'] == "5": num = int(event.data['callbackData']) - 1 bot.send_text(chat_id=event.data['from']['userId'], text=books[num]['title']) with open(books[num]['audio'], 'rb') as f: bot.send_text(chat_id=event.data['from']['userId'], text="Пожалуйста, подождите...") bot.send_file(file=f, chat_id=event.data['from']['userId']) print('audio') with open(books[num]['text'], 'rb') as f: bot.send_file(file=f, chat_id=event.data['from']['userId']) print('text') bot.send_text(chat_id=event.data['from']['userId'], text="Главное меню", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "Статистика", "callbackData": "stats", "style": "attention" }], [{ "text": "Новоcти COVID", "callbackData": "news", "style": "attention" }], [{ "text": "Ближайшие магазины", "callbackData": "shops", "style": "primary" }], [{ "text": "Ближайшие 
аптеки", "callbackData": "pharmacy", "style": "primary" }], [{ "text": "QR код", "callbackData": "QR" }], [{ "text": "Экскурсия", "callbackData": "exc" }], [{ "text": "Сказки для детей", "callbackData": "books" }]]))) user.change_branch(UserID, 'main') if event.data['callbackData'] == "books": bot.send_text(chat_id=event.data['from']['userId'], text=f"""Какую книгу хотите получить? 1. {books[0]['title']} 2. {books[1]['title']} 3. {books[2]['title']} 4. {books[3]['title']} 5. {books[4]['title']}""", inline_keyboard_markup="{}".format( json.dumps([[{ "text": "1", "callbackData": "1", "style": "attention" }, { "text": "2", "callbackData": "2", "style": "attention" }, { "text": "3", "callbackData": "3", "style": "attention" }, { "text": "4", "callbackData": "4", "style": "attention" }, { "text": "5", "callbackData": "5", "style": "primary" }]])))
def __init__(self):
    """Create the parser; NewsParser serves as the HTML fetcher here
    (its getHtmlByRequests method downloads pages)."""
    self.htmlGetter = NewsParser()
def crawling(self, categoryName):
    """Crawl Naver news for one category: build the per-day listing URLs,
    scrape each listing for article links, then download every article
    and save non-empty title/body pairs as numbered .txt files in the
    category's folder under self.DATA_DIR.

    Articles whose extraction fails are skipped silently.
    """
    print(str(os.getpid()) + " : " + categoryName + "\n")
    url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=shm&sid1=" + str(
        self.categoriesCode.get(categoryName)) + "&date="
    urls = self.makeNewsURLForm(url, self.date['startYear'],
                                self.date['endYear'],
                                self.date['startMonth'],
                                self.date['endMonth'])
    number = 0  # sequence number used in output file names
    print("Crawling Start!")
    for url in urls:
        print(str(os.getpid()) + " : " + url)
        pageHtml = self.getURLdata(url)
        document = BeautifulSoup(pageHtml.content, 'html.parser')
        # The listing is split into headline and regular sections around
        # the middle divider; collect both.
        pages = document.select('.newsflash_body .type06_headline li dl')
        pages.extend(document.select('.newsflash_body .type06 li dl'))
        articles = []
        # Collect the URLs of every article on this listing page.
        for line in pages:
            articles.append(line.a.get('href'))
        del pages
        for contentURL in tqdm(articles):  # article URLs
            # Crawl delay
            time.sleep(0.01)
            # Fetch the article HTML
            contentHtml = self.getURLdata(contentURL)
            documentContent = BeautifulSoup(contentHtml.content,
                                            'html.parser')
            try:
                # Fetch the article headline
                articleTitle = documentContent.find_all(
                    'h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                title = ''  # initialize the article title
                title += NewsParser.clearHeadline(
                    str(articleTitle[0].find_all(text=True)))
                if not title:  # skip the article when the title is blank
                    continue
                # Fetch the article body
                articleBodyContents = documentContent.find_all(
                    'div', {'id': 'articleBodyContents'})
                content = NewsParser.clearContent(
                    list(articleBodyContents[0].find_all(text=True)))
                if not len(content):  # skip when the body is blank
                    continue
                try:
                    # Create the category output folder on first use.
                    if not (os.path.isdir(
                            os.path.join(
                                self.DATA_DIR,
                                str(self.categoriesFolder.get(
                                    categoryName))))):
                        os.makedirs(
                            os.path.join(
                                self.DATA_DIR,
                                str(self.categoriesFolder.get(
                                    categoryName))))
                        print("폴더 생성")
                except OSError:
                    print("폴더 생성에 실패했습니다.")
                    print(os.getcwd())
                fileName = self.DATA_DIR + '/' + str(
                    self.categoriesFolder.get(
                        categoryName)) + '/' + categoryName + str(
                            number) + ".txt"
                print(fileName)
                self.fileWrite(fileName, title, content)
                number += 1
                del content, title
                del articleTitle, articleBodyContents
                del contentHtml, documentContent
            except Exception:
                # Extraction failed for this article: drop it and move on.
                del contentHtml, documentContent
                pass
def crawling(self, categoryName):
    """Crawl Naver *sports* news for one category: render each per-day
    listing page with headless Chrome (the list is JavaScript-built),
    collect article links, then download each article and save non-empty
    title/body pairs as numbered .txt files in self.DATA_DIR.

    Articles whose extraction fails are skipped silently.
    """
    print(str(os.getpid()) + " : " + categoryName + '\n')
    url = "http://sports.news.naver.com/" + str(
        self.categoryCode.get(
            categoryName)) + "/news/index.nhn?isphoto=N&date="
    urls = self.makeNewsURLForm(url, self.date['startYear'],
                                self.date['endYear'],
                                self.date['startMonth'],
                                self.date['endMonth'])
    number = 0  # sequence number used in output file names
    print("Crawling Start!")
    for url in tqdm(urls):
        print(str(os.getpid()) + " : " + url)
        # A fresh headless browser per listing page.
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('disable-gpu')
        driver = webdriver.Chrome(self.driverPath, chrome_options=options)
        driver.implicitly_wait(2)
        driver.get(url)
        driver.implicitly_wait(2)
        pages = driver.find_elements_by_css_selector('#_newsList > ul >li')
        articles = []
        # Collect the URLs of every article on this listing page.
        for page in pages:
            articles.append(
                page.find_element_by_css_selector('a').get_attribute(
                    'href'))
        del pages
        driver.quit()
        for contentURL in articles:
            time.sleep(0.05)  # crawl delay between article downloads
            contentHtml = self.getURLdata(contentURL)
            documentContent = BeautifulSoup(contentHtml.content,
                                            'html.parser')
            try:
                # Fetch the article headline
                articleTitle = documentContent.find_all(
                    'h4', {'class': 'title'})
                title = ''  # initialize the article title
                title += NewsParser.clearHeadline(
                    str(articleTitle[0].find_all(text=True)))
                if not title:  # skip the article when the title is blank
                    continue
                # Fetch the article body
                articleBodyContents = documentContent.find_all(
                    'div', {'id': 'newsEndContents'})
                content = NewsParser.clearContentS(
                    list(articleBodyContents[0].find_all(text=True)))
                if not len(content):  # skip when the body is blank
                    continue
                try:
                    # Create the output folder on first use.
                    if not (os.path.isdir(self.DATA_DIR)):
                        os.makedirs(self.DATA_DIR)
                        print("폴더 생성")
                except OSError:
                    print("폴더 생성에 실패했습니다.")
                fileName = self.DATA_DIR + '/' + categoryName + str(
                    number) + ".txt"
                self.fileWrite(fileName, title, content)
                number += 1
                del content, title
                del articleTitle, articleBodyContents
                del contentHtml, documentContent
            except Exception:
                # Extraction failed for this article: drop it and move on.
                del contentHtml, documentContent
                pass