Esempio n. 1
0
def update():
    """Interactively offer to refresh the data set.

    Prompts on stdin until the user enters 1 (yes) or 2 (no). On 2 nothing
    else happens; on 1 the stocks parser, the news parser and the
    classifier are run, in that order.
    """
    print('\n\nОбновить данные?')
    print('\t1 - да\n\t2 - нет')

    # Keep prompting until a valid menu number is typed; input that is not
    # an integer is ignored and the prompt is shown again.
    choice = None
    while choice not in (1, 2):
        print(' > ', end='')
        try:
            choice = int(input())
        except ValueError:
            pass

    if choice == 1:
        print("\n", end='')

        StocksParser.parse(companies_file, stocks_directory)

        print('\nОжидание новостей... Это может занять некоторое время\n')

        NewsParser.parse(driver_path, companies_file, news_directory)
        Classifier.classify(companies_file, news_directory,
                            prnews_directory, train_set_path)
Esempio n. 2
0
 def test_count_words(self, mock_generate_phrases_array):
     """count_words() should tally each word across all mocked phrases.

     'moeda' and 'norte-americana' occur in two phrases and must be
     counted twice; every other word occurs exactly once.
     """
     first_phrase = [
         'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira',
         '27', 'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior',
         'meio', 'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
         'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
         'monetária', 'federal', 'reserve', 'bc', 'eua'
     ]
     second_phrase = [
         'moeda', 'norte-americana', 'recuou', '0,89%', 'vendida', 'r$',
         '5,1577'
     ]
     third_phrase = ['veja', 'cotações']

     parser = NewsParser(url='fake_one')
     mock_generate_phrases_array.return_value = [
         first_phrase, second_phrase, third_phrase
     ]

     # Start every distinct word at one, then bump the two repeated words.
     expected_counts = dict.fromkeys(
         first_phrase + second_phrase + third_phrase, 1)
     expected_counts.update({'moeda': 2, 'norte-americana': 2})

     self.assertDictEqual(parser.count_words(), expected_counts)
Esempio n. 3
0
 def test_content_raw_to_phrases(self, mock_get_news):
     """content_raw_to_phrases() should return the expected phrase list.

     The mocked get_news() result is the set of
     <p class="content-text__container"> elements parsed from the local
     fixture file 'html_test.html'.
     """
     # Parse the fixture inside a context manager so the file handle is
     # closed even if parsing fails (it was previously left open).
     with open('html_test.html', 'r') as data:
         mock_get_news.return_value = soup(data, 'html.parser').findAll(
             'p', {'class': 'content-text__container'})
     news_parser = NewsParser(url='fake_one')
     response_test = [
         ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de fraqueza da moeda norte-americana no exterior em meio às tensões diplomáticas entre estados unidos e china com os investidores em modo de espera antes da reunião de política monetária do federal reserve bc dos eua',
         ' a moeda norte-americana recuou 0,89% vendida a r$ 5,1577',
         'veja mais cotações'
     ]
     self.assertEqual(news_parser.content_raw_to_phrases(), response_test)
Esempio n. 4
0
 def test_phrase_to_words(self):
     """phrase_to_words() should split a phrase into the expected words."""
     parser = NewsParser(url='fake_one')
     phrase = (
         ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de '
         'fraqueza da moeda norte-americana no exterior em meio às tensões '
         'diplomáticas entre estados unidos e china com os investidores em '
         'modo de espera antes da reunião de política monetária do federal '
         'reserve bc dos eua'
     )
     expected_words = [
         'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira',
         '27', 'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior',
         'meio', 'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
         'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
         'monetária', 'federal', 'reserve', 'bc', 'eua'
     ]
     actual_words = parser.phrase_to_words(phrase)
     self.assertEqual(actual_words, expected_words)
Esempio n. 5
0
 def test_rank_words(self, mock_count_words):
     """rank_words() should list the mocked counts with the highest first.

     The two words counted twice come first; the words counted once follow
     in the order in which they appear in the mocked count dictionary.
     """
     parser = NewsParser(url='fake_one')

     # Words are grouped by where they sit in the mocked dictionary so its
     # insertion order is reproduced exactly.
     singles_before = [
         'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira',
         '27', 'dia', 'fraqueza'
     ]
     doubles = ['moeda', 'norte-americana']
     singles_after = [
         'exterior', 'meio', 'tensões', 'diplomáticas', 'estados',
         'unidos', 'china', 'investidores', 'modo', 'espera', 'antes',
         'reunião', 'política', 'monetária', 'federal', 'reserve', 'bc',
         'eua', 'recuou', '0,89%', 'vendida', 'r$', '5,1577', 'veja',
         'cotações'
     ]

     word_counts = dict.fromkeys(singles_before, 1)
     word_counts.update(dict.fromkeys(doubles, 2))
     word_counts.update(dict.fromkeys(singles_after, 1))
     mock_count_words.return_value = word_counts

     expected_rank = [(word, 2) for word in doubles]
     expected_rank += [(word, 1) for word in singles_before + singles_after]

     self.assertEqual(parser.rank_words(), expected_rank)
Esempio n. 6
0
def main():
    """Rank the words of one fixed g1.globo.com article and print them.

    Builds a NewsParser for the hard-coded article URL, asks it for the
    ranked words, then prints the parser object followed by the ranking.
    """
    article_url = "https://g1.globo.com/economia/noticia/2020/07/27/dolar.ghtml"
    parser = NewsParser(article_url)

    # When debugging, the parser's other methods (get_news,
    # content_raw_to_phrases, generate_phrases_array, count_words) can be
    # called and printed here as well.
    ranking = parser.rank_words()

    for item in (parser, ranking):
        print(item)
Esempio n. 7
0
def update():
    """Overwrite the stored news rows with the freshly parsed headlines.

    Fetches the news list from NewsParser.riaParser() and, for the n-th item
    (counting from 1), sets Title and URL of the row in table News whose ID
    is n. All changes are committed together.

    Returns:
        0 once the changes have been committed.
    """
    parser = NewsParser()
    newspack = parser.riaParser()
    db = sqlite3.connect("database.db")
    try:
        cursor = db.cursor()
        # Values are passed as bound parameters rather than formatted into
        # the SQL text, so quotes inside a scraped title or URL can neither
        # break the statement nor inject SQL.
        for row_id, item in enumerate(newspack, start=1):
            cursor.execute(
                'UPDATE News SET Title=?, URL=? WHERE ID=?',
                (item["title"], item["url"], row_id))
        db.commit()
    finally:
        # Release the connection even when parsing or an UPDATE fails.
        db.close()
    return 0
Esempio n. 8
0
    def makeNewsURLForm(NewsURL, startYear, endYear, startMonth, endMonth):
        """Build the paginated listing URLs for every day in a date range.

        For each day from startMonth of startYear through endMonth of
        endYear, the date is appended to NewsURL as YYYYMMDD and one URL per
        result page ('&page=N') is generated.

        Args:
            NewsURL: URL prefix to which the YYYYMMDD date is appended.
            startYear: first year of the range (inclusive).
            endYear: last year of the range (inclusive).
            startMonth: first month, applied to startYear only.
            endMonth: last month, applied to endYear only.

        Returns:
            A list of URL strings, in chronological and then page order.
        """
        madeURL = []
        today = datetime.now()
        for year in range(startYear, endYear + 1):
            # Month bounds are recomputed for every year: only the first
            # year is clipped at startMonth and only the last at endMonth.
            # (Previously the bounds leaked from one year into the next, so
            # ranges spanning three or more years lost months.)
            first_month = startMonth if year == startYear else 1
            last_month = endMonth if year == endYear else 12

            for month in range(first_month, last_month + 1):
                days_in_month = calendar.monthrange(year, month)[1]
                for day in range(1, days_in_month + 1):
                    # Days of the current month that lie in the future are
                    # not generated. month and day stay integers here so
                    # this comparison also works for months 1-9.
                    if (today.year == year and today.month == month
                            and today.day < day):
                        break

                    url = (NewsURL + str(year) + str(month).zfill(2)
                           + str(day).zfill(2))

                    # Per the original note, requesting a page number beyond
                    # the last page lands on the last page; page=10000 is
                    # presumably how findNewsTotalpage learns the page count.
                    totalpage = NewsParser.findNewsTotalpage(
                        url + '&page=10000')
                    for page in range(1, totalpage + 1):
                        madeURL.append(url + '&page=' + str(page))
        return madeURL
Esempio n. 9
0
 def test_generate_phrases_array(self, mock_content_raw_to_phrases):
     """generate_phrases_array() should map each mocked phrase to its words."""
     parser = NewsParser(url='fake_one')

     first_phrase = (
         ' o dólar fechou em forte queda nesta segunda-feira 27 em dia de '
         'fraqueza da moeda norte-americana no exterior em meio às tensões '
         'diplomáticas entre estados unidos e china com os investidores em '
         'modo de espera antes da reunião de política monetária do federal '
         'reserve bc dos eua'
     )
     second_phrase = ' a moeda norte-americana recuou 0,89% vendida a r$ 5,1577'
     third_phrase = 'veja mais cotações'
     mock_content_raw_to_phrases.return_value = [
         first_phrase, second_phrase, third_phrase
     ]

     first_words = [
         'dólar', 'fechou', 'forte', 'queda', 'nesta', 'segunda-feira',
         '27', 'dia', 'fraqueza', 'moeda', 'norte-americana', 'exterior',
         'meio', 'tensões', 'diplomáticas', 'estados', 'unidos', 'china',
         'investidores', 'modo', 'espera', 'antes', 'reunião', 'política',
         'monetária', 'federal', 'reserve', 'bc', 'eua'
     ]
     second_words = [
         'moeda', 'norte-americana', 'recuou', '0,89%', 'vendida', 'r$',
         '5,1577'
     ]
     third_words = ['veja', 'cotações']
     expected = [first_words, second_words, third_words]

     self.assertEqual(parser.generate_phrases_array(), expected)
Esempio n. 10
0
 def populate(self):
   """
   Fill total_hits and article_list from the stored articles.

   Each row of the 'articles' collection is turned into a news dict and
   checked with NewsParser.is_article_hit; every hit increments
   self.total_hits and appends the article's link to self.article_list.
   """
   storage = StorageEngine('sqlite3')
   hit_detector = NewsParser(self.last_name, self.first_name)
   # Position of each field inside a stored row.
   field_names = ('title', 'author', 'date', 'body', 'link')
   for row in storage.get_collection('articles'):
     article = {name: row[pos] for pos, name in enumerate(field_names)}
     if not hit_detector.is_article_hit(article):
       continue
     self.total_hits += 1
     self.article_list.append(article['link'])
Esempio n. 11
0
def update_server(parameter):
    """Run the data update when requested by the caller.

    '2' means "do not update" and yields a single-space string. '1' runs
    the stocks parser, the news parser and the classifier and yields
    'Updated'. Anything else yields an explanatory error message.
    """
    if parameter == '2':
        return ' '

    if parameter == '1':
        StocksParser.parse(companies_file, stocks_directory)

        print('\nWaiting for news... It may take a while\n')

        NewsParser.parse(driver_path, companies_file, news_directory)
        Classifier.classify(companies_file, news_directory,
                            prnews_directory, train_set_path)
        return 'Updated'

    return 'Allowed answers: 1 or 2'
class ArticleContentParser():
    """Extract the text of an article page fetched through NewsParser."""

    def __init__(self):
        self.htmlGetter = NewsParser()

    def getArticleContent(self, url):
        """Return the text of each content node found at *url*.

        The page is fetched with NewsParser.getHtmlByRequests and parsed
        with BeautifulSoup. Nodes matching '.article-content' are used; if
        there are none, nodes matching '.answer-text-full' (the Q&A page
        layout) are used instead.

        Returns:
            A list with the text of each matching node, or an empty list
            when the page could not be fetched or parsed.
        """
        # Sentinel: stays ['-1'] if fetching or parsing fails below.
        article_content = ['-1']

        try:
            html = self.htmlGetter.getHtmlByRequests(url)
            soup = BeautifulSoup(html, 'html.parser')
            article_content = soup.select('.article-content')

        except Exception as e:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print('Beautiful Soup load error:  %s' % (e,))
            getArticleContentError = get_logger('ArticleContentParser.log')
            # Call the logger; the previous code assigned the message to
            # the .error attribute, so nothing was ever logged.
            getArticleContentError.error(
                'ArticleContentParser getArticleContentError Beautiful soup load error: '
                + str(e))

        # A failure above leaves the sentinel in place: nothing to return.
        if article_content == ['-1']:
            return []

        # No regular article body: check for the Q&A-style page layout.
        if not article_content:
            article_content = soup.select('.answer-text-full')
            # A further fallback to '.tt-ignored-node' was left disabled:
            # per the original note, such pages cannot be opened through
            # the toutiao + /group/id URL form without changing the request
            # header, so parsing them is pointless.

        return [news.get_text() for news in article_content]
Esempio n. 13
0
 def test_get_news(self):
     """get_news() should return a list for the real article URL."""
     article_url = (
         'https://g1.globo.com/economia/noticia/2020/07/27/dolar.ghtml')
     fetched = NewsParser(url=article_url).get_news()
     self.assertIsInstance(fetched, list)
    def crawling(self, parseURLs):
        """Crawl each listing URL page by page and save every article found.

        For every URL in parseURLs a headless Chrome driver is started and
        the listing pages '&page=1', '&page=2', ... are loaded in turn. The
        article links under '#newsWrp > ul > li' are fetched with
        self.getURLdata, and the cleaned headline and body are written with
        self.fileWrite to '<DATA_DIR>/<pid>_<number>.txt'.

        Args:
            parseURLs: iterable of listing URLs to which '&page=N' is
                appended.
        """
        print("Crawling Start!")
        for url in parseURLs:
            pageN = 1
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            options.add_argument('disable-gpu')
            options.add_argument('--start-maximized')
            driver = webdriver.Chrome(self.driverPATH, chrome_options=options)
            # try/finally guarantees the browser is shut down even when a
            # page load or lookup raises outside the inner handlers
            # (previously the driver was leaked in that case).
            try:
                end = True
                # NOTE(review): the loop only stops when something inside
                # the outer try below raises; a page with no list items
                # does not by itself end it - confirm this terminates.
                while end:
                    articles = []
                    driver.get(url + "&page=" + str(pageN))
                    print(str(os.getpid()) + ":" + url + "&page=" + str(pageN))
                    time.sleep(1.5)
                    pages = driver.find_elements_by_css_selector(
                        '#newsWrp > ul > li')
                    try:
                        for page in pages:
                            articles.append(
                                page.find_element_by_css_selector(
                                    'a').get_attribute('href'))
                        del pages

                        for contentURL in articles:
                            time.sleep(0.01)

                            contentHtml = self.getURLdata(contentURL)
                            documentContent = BeautifulSoup(
                                contentHtml.content, 'html.parser')

                            try:
                                articleTitle = documentContent.find_all(
                                    'h2', {'class': 'end_tit'})
                                title = ''
                                title += NewsParser.clearHeadlineE(
                                    str(articleTitle[0].find_all(text=True)))
                                if not title:
                                    continue

                                articleBodyContents = documentContent.find_all(
                                    "div", {"id": "articeBody"})
                                content = NewsParser.clearContent(
                                    list(articleBodyContents[0].find_all(
                                        text=True)))
                                if not content:
                                    continue

                                fileName = self.DATA_DIR + '/' + str(
                                    os.getpid()) + '_' + str(
                                        self.number) + ".txt"
                                self.fileWrite(fileName, title, content)
                                self.number += 1

                                del content, title
                                del articleTitle, articleBodyContents
                                del contentHtml, documentContent

                            except Exception:
                                # Best effort: an article that cannot be
                                # parsed or written is skipped.
                                del contentHtml, documentContent
                        del articles
                    except Exception:
                        # Any failure while collecting or fetching links
                        # ends the crawl of this URL. Catching Exception
                        # rather than everything lets KeyboardInterrupt and
                        # SystemExit propagate.
                        end = False
                    pageN += 1
            finally:
                driver.quit()
Esempio n. 15
0
def buttons_answer_cb(bot, event):
    UserID = event.data['from']['userId']
    answer = event.data['callbackData']
    print('User:'******'books':
        user.change_branch(UserID, 'book')
    if answer == 'QR':
        if qr.check_City(UserID) == 404:
            user.change_branch(UserID, 'geolocation')
            bot.send_text(
                chat_id=UserID,
                text=
                "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение",
                inline_keyboard_markup="{}".format(
                    json.dumps([[
                        {
                            "text": "Хорошо",
                            "callbackData": "geo_ok",
                            "style": "attention"
                        },
                        {
                            "text": "Не хочу!",
                            "callbackData": "geo_neok",
                            "style": "primary"
                        },
                    ]])))

        else:
            bot.send_text(
                chat_id=UserID,
                text=
                "Сайт, на котором вы можете получить QR код для выхода на улицу"
            )
            bot.send_text(chat_id=UserID, text=qr.recive_qr(UserID))
    if answer == "geo_ok":
        bot.send_text(chat_id=UserID,
                      text="Жду ваше расположение",
                      inline_keyboard_markup="{}".format(
                          json.dumps([[
                              {
                                  "text": "Отмена",
                                  "callbackData": "F",
                                  "style": "attention"
                              },
                          ]])))
    if answer == "geo_neok":
        user.change_branch(UserID, 'main')
        if user.check_branch(UserID) == 404:
            user.add(UserID)
    if answer == "F":
        user.change_branch(UserID, 'main')
        bot.send_text(chat_id=UserID, text="Отменяюсь")
        if user.check_branch(UserID) == 404:
            user.add(UserID)
    if answer == "stats":
        bot.send_text(chat_id=UserID, text="Статистика на сегодняшний день:")
        parser = StatsParser()
        data = parser.get_data(event.data['from']['userId'])
        for d in data:
            message = f"""{d['title']}
        Заболевших: {d['sick']} (+ {d['sick_incr']})
        Умерших: {d['died']} (+ {d['died_incr']})
        Выздоровевших {d['healed']} (+ {d['healed_incr']})"""
            bot.send_text(chat_id=UserID, text=message)
        user.change_branch(UserID, 'choose')
    if answer == 'news':
        p = NewsParser()
        data = p.mailruParser()
        for news in data:
            message = f"""От Mail.ru
            {news['title']}
            {news['url']}"""
            bot.send_text(chat_id=event.data['from']['userId'], text=message)
    if answer == 'pharmacy':
        if qr.check_City(UserID) == 404:
            user.change_branch(UserID, 'geolocation')
            bot.send_text(
                chat_id=UserID,
                text=
                "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение",
                inline_keyboard_markup="{}".format(
                    json.dumps([[
                        {
                            "text": "Хорошо",
                            "callbackData": "geo_ok",
                            "style": "attention"
                        },
                        {
                            "text": "Не хочу!",
                            "callbackData": "geo_neok",
                            "style": "primary"
                        },
                    ]])))
        else:
            print('pharmacy')
            p = PlacesParse()
            data = p.getPharmacy(UserID)
            for pharmacy in data:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=pharmacy['name'])
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=pharmacy['url'])
            user.change_branch(UserID, 'main')
    if answer == 'shops':
        if qr.check_City(UserID) == 404:
            user.change_branch(UserID, 'geolocation')
            bot.send_text(
                chat_id=UserID,
                text=
                "Для этой функции мне нужно знать ваш город, для этого пожалуйста отправьте свое местоположение",
                inline_keyboard_markup="{}".format(
                    json.dumps([[
                        {
                            "text": "Хорошо",
                            "callbackData": "geo_ok",
                            "style": "attention"
                        },
                        {
                            "text": "Не хочу!",
                            "callbackData": "geo_neok",
                            "style": "primary"
                        },
                    ]])))
        else:
            print('shops')
            p = PlacesParse()
            data = p.getShop(UserID)
            for shop in data:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=shop['name'])
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=shop['url'])
            user.change_branch(UserID, 'main')
    if answer == "da":
        print('u')
        user.change_branch(UserID, 'main')
        bot.send_text(chat_id=UserID, text="Вы подписались на уведомления")
    if answer == "net":
        user.change_branch(UserID, 'main')
    if user.check_branch(UserID) == 'main':
        bot.send_text(chat_id=event.data['from']['userId'],
                      text="Главное меню",
                      inline_keyboard_markup="{}".format(
                          json.dumps([[{
                              "text": "Статистика",
                              "callbackData": "stats",
                              "style": "attention"
                          }],
                                      [{
                                          "text": "Новоcти COVID",
                                          "callbackData": "news",
                                          "style": "attention"
                                      }],
                                      [{
                                          "text": "Ближайшие магазины",
                                          "callbackData": "shops",
                                          "style": "primary"
                                      }],
                                      [{
                                          "text": "Ближайшие аптеки",
                                          "callbackData": "pharmacy",
                                          "style": "primary"
                                      }],
                                      [{
                                          "text": "QR код",
                                          "callbackData": "QR"
                                      }],
                                      [{
                                          "text": "Экскурсия",
                                          "callbackData": "exc"
                                      }],
                                      [{
                                          "text": "Сказки для детей",
                                          "callbackData": "books"
                                      }]])))
    if user.check_branch(UserID) == 'choose':
        bot.send_text(chat_id=UserID,
                      text="Хотите подписаться на ежедневные уведомления?",
                      inline_keyboard_markup="{}".format(
                          json.dumps([[
                              {
                                  "text": "Да",
                                  "callbackData": "da",
                                  "style": "attention"
                              },
                              {
                                  "text": "Нет",
                                  "callbackData": "net",
                                  "style": "primary"
                              },
                          ]])))
    if answer == 'exc':
        bot.send_text(
            chat_id=event.data['from']['userId'],
            text=
            'Мы предлагаем вам небольшую экскурсию по достопримичательностям Москвы! Надеемся вы узнаете для себя что-нибудь новое и интересное, а так же хорошо проведете время :)',
            inline_keyboard_markup="{}".format(
                json.dumps([[
                    {
                        "text": "Начнем же!",
                        "callbackData": "moscow_view_1",
                        "style": "primary"
                    },
                ]])))
        user.change_branch(UserID, 'excursion')
    if user.check_branch(UserID) == 'excursion':
        if event.data['callbackData'] == "moscow_view_1":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[0]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[0]['info'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=f"""{dataset[0]['q']}
            1. {dataset[0]['a2']}
            2. {dataset[0]['a3']}
            3. {dataset[0]['a1']}""",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "1",
                                  "callbackData": "moscow_view_2ff",
                                  "style": "primary"
                              }, {
                                  "text": "2",
                                  "callbackData": "moscow_view_2f",
                                  "style": "primary"
                              }, {
                                  "text": "3",
                                  "callbackData": "moscow_view_2t",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_2t" or event.data[
                'callbackData'] == "moscow_view_2f" or event.data[
                    'callbackData'] == "moscow_view_2ff":
            if event.data['callbackData'] == "moscow_view_2f" or event.data[
                    'callbackData'] == "moscow_view_2ff":
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=dataset[0]['ra'])
            else:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text="Правильно!")
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[1]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[1]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_3",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_3":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[2]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[2]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_4",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_4":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[3]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[3]['info'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=f"""{dataset[3]['q']}
            1. {dataset[3]['a2']}
            2. {dataset[3]['a1']}
            3. {dataset[3]['a3']}""",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "1",
                                  "callbackData": "moscow_view_5f",
                                  "style": "primary"
                              }, {
                                  "text": "2",
                                  "callbackData": "moscow_view_5t",
                                  "style": "primary"
                              }, {
                                  "text": "3",
                                  "callbackData": "moscow_view_5ff",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_5f" or event.data[
                'callbackData'] == "moscow_view_5t" or event.data[
                    'callbackData'] == "moscow_view_5ff":
            if event.data['callbackData'] == "moscow_view_5f" or event.data[
                    'callbackData'] == "moscow_view_5ff":
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=dataset[3]['ra'])
            else:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text="Правильно!")
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[4]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[4]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_6",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_6":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[5]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[5]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_7",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_7":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[6]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[6]['info'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=f"""{dataset[6]['q']}
            1. {dataset[6]['a2']}
            2. {dataset[6]['a1']}
            3. {dataset[6]['a3']}""",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "1",
                                  "callbackData": "moscow_view_8t",
                                  "style": "primary"
                              }, {
                                  "text": "2",
                                  "callbackData": "moscow_view_8f",
                                  "style": "primary"
                              }, {
                                  "text": "3",
                                  "callbackData": "moscow_view_8ff",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_8f" or event.data[
                'callbackData'] == "moscow_view_8t" or event.data[
                    'callbackData'] == "moscow_view_8ff":
            if event.data['callbackData'] == "moscow_view_8f" or event.data[
                    'callbackData'] == "moscow_view_8ff":
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=dataset[6]['ra'])
            else:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text="Правильно!")
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[7]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[7]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_9",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_9":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[8]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[8]['info'],
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Далее",
                                  "callbackData": "moscow_view_10",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_10":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[9]['url'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=dataset[9]['info'])
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=f"""{dataset[9]['q']}
            1. {dataset[9]['a3']}
            2. {dataset[9]['a1']}
            3. {dataset[9]['a2']}""",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "1",
                                  "callbackData": "moscow_view_11f",
                                  "style": "primary"
                              }, {
                                  "text": "2",
                                  "callbackData": "moscow_view_11ff",
                                  "style": "primary"
                              }, {
                                  "text": "3",
                                  "callbackData": "moscow_view_11t",
                                  "style": "primary"
                              }]])))
        if event.data['callbackData'] == "moscow_view_11f" or event.data[
                'callbackData'] == "moscow_view_11t" or event.data[
                    'callbackData'] == "moscow_view_11ff":
            if event.data['callbackData'] == "moscow_view_11f" or event.data[
                    'callbackData'] == "moscow_view_11ff":
                bot.send_text(chat_id=event.data['from']['userId'],
                              text=dataset[9]['ra'])
            else:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text="Правильно!")
            bot.send_text(chat_id=event.data['from']['userId'],
                          text="Спасибо за внимание!",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Конец экскурсии!",
                                  "callbackData": "end",
                                  "style": "primary"
                              }]])))
            user.change_branch(UserID, 'main')
    if user.check_branch(UserID) == 'book':
        if event.data['callbackData'] == "1" or event.data[
                'callbackData'] == "2" or event.data[
                    'callbackData'] == "3" or event.data[
                        'callbackData'] == "4" or event.data[
                            'callbackData'] == "5":
            num = int(event.data['callbackData']) - 1
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=books[num]['title'])
            with open(books[num]['audio'], 'rb') as f:
                bot.send_text(chat_id=event.data['from']['userId'],
                              text="Пожалуйста, подождите...")
                bot.send_file(file=f, chat_id=event.data['from']['userId'])
            print('audio')
            with open(books[num]['text'], 'rb') as f:
                bot.send_file(file=f, chat_id=event.data['from']['userId'])
            print('text')
            bot.send_text(chat_id=event.data['from']['userId'],
                          text="Главное меню",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "Статистика",
                                  "callbackData": "stats",
                                  "style": "attention"
                              }],
                                          [{
                                              "text": "Новоcти COVID",
                                              "callbackData": "news",
                                              "style": "attention"
                                          }],
                                          [{
                                              "text": "Ближайшие магазины",
                                              "callbackData": "shops",
                                              "style": "primary"
                                          }],
                                          [{
                                              "text": "Ближайшие аптеки",
                                              "callbackData": "pharmacy",
                                              "style": "primary"
                                          }],
                                          [{
                                              "text": "QR код",
                                              "callbackData": "QR"
                                          }],
                                          [{
                                              "text": "Экскурсия",
                                              "callbackData": "exc"
                                          }],
                                          [{
                                              "text": "Сказки для детей",
                                              "callbackData": "books"
                                          }]])))
            user.change_branch(UserID, 'main')
        if event.data['callbackData'] == "books":
            bot.send_text(chat_id=event.data['from']['userId'],
                          text=f"""Какую книгу хотите получить?
            1. {books[0]['title']}
            2. {books[1]['title']}
            3. {books[2]['title']}
            4. {books[3]['title']}
            5. {books[4]['title']}""",
                          inline_keyboard_markup="{}".format(
                              json.dumps([[{
                                  "text": "1",
                                  "callbackData": "1",
                                  "style": "attention"
                              }, {
                                  "text": "2",
                                  "callbackData": "2",
                                  "style": "attention"
                              }, {
                                  "text": "3",
                                  "callbackData": "3",
                                  "style": "attention"
                              }, {
                                  "text": "4",
                                  "callbackData": "4",
                                  "style": "attention"
                              }, {
                                  "text": "5",
                                  "callbackData": "5",
                                  "style": "primary"
                              }]])))
 def __init__(self):
     """Create a ``NewsParser`` and keep it on the instance as ``htmlGetter``."""
     self.htmlGetter = NewsParser()
Esempio n. 17
0
    def crawling(self, categoryName):
        """Crawl the Naver news listing of one category and save each article.

        The listing URL is built from ``self.categoriesCode[categoryName]`` and
        handed to ``self.makeNewsURLForm`` together with the start/end year and
        month stored in ``self.date``.  Every URL it returns is fetched with
        ``self.getURLdata``; the article links found on that page are then
        fetched one by one.  An article is written with ``self.fileWrite`` to
        ``<DATA_DIR>/<category folder>/<categoryName><n>.txt`` where ``n`` is
        the number of articles saved so far by this call.

        Articles whose title or body element is missing, or whose cleaned
        headline/body is empty, are skipped.  Any other failure while
        processing a single article is printed and the crawl continues with
        the next article.

        Args:
            categoryName: Key looked up in ``self.categoriesCode`` and
                ``self.categoriesFolder``; also used as the file-name prefix.

        Returns:
            None.
        """
        print(str(os.getpid()) + " : " + categoryName + "\n")
        url = ("http://news.naver.com/main/list.nhn?mode=LSD&mid=shm&sid1="
               + str(self.categoriesCode.get(categoryName)) + "&date=")
        urls = self.makeNewsURLForm(url, self.date['startYear'],
                                    self.date['endYear'],
                                    self.date['startMonth'],
                                    self.date['endMonth'])

        # The output location does not depend on the article, so resolve it once.
        categoryFolder = str(self.categoriesFolder.get(categoryName))
        categoryDir = os.path.join(self.DATA_DIR, categoryFolder)
        number = 0

        print("Crawling Start!")
        for url in urls:
            print(str(os.getpid()) + " : " + url)
            pageHtml = self.getURLdata(url)
            document = BeautifulSoup(pageHtml.content, 'html.parser')

            # The listing is split by a divider into a headline group and a
            # regular group; collect the entries of both.
            pages = document.select('.newsflash_body .type06_headline li dl')
            pages.extend(document.select('.newsflash_body .type06 li dl'))

            # URLs of every article linked from this listing page.
            articles = [line.a.get('href') for line in pages]

            for contentURL in tqdm(articles):
                # Short pause between article requests.
                time.sleep(0.01)

                contentHtml = self.getURLdata(contentURL)
                documentContent = BeautifulSoup(contentHtml.content,
                                                'html.parser')

                try:
                    # Article title.  Only the id is used as a filter: a
                    # third positional argument to find_all() is ``recursive``,
                    # not a second attribute filter.
                    articleTitle = documentContent.find_all(
                        'h3', {'id': 'articleTitle'})
                    if not articleTitle:
                        continue
                    title = NewsParser.clearHeadline(
                        str(articleTitle[0].find_all(text=True)))
                    if not title:  # empty headline: exclude the article
                        continue

                    # Article body.
                    articleBodyContents = documentContent.find_all(
                        'div', {'id': 'articleBodyContents'})
                    if not articleBodyContents:
                        continue
                    content = NewsParser.clearContent(
                        list(articleBodyContents[0].find_all(text=True)))
                    if not content:  # empty body: exclude the article
                        continue

                    # Create the category folder lazily, on the first article
                    # that is actually going to be written.
                    try:
                        if not os.path.isdir(categoryDir):
                            os.makedirs(categoryDir)
                            print("폴더 생성")
                    except OSError:
                        print("폴더 생성에 실패했습니다.")

                    print(os.getcwd())

                    fileName = (self.DATA_DIR + '/' + categoryFolder + '/'
                                + categoryName + str(number) + ".txt")
                    print(fileName)
                    self.fileWrite(fileName, title, content)
                    number += 1

                except Exception as error:
                    # Best effort: one broken article must not stop the crawl,
                    # but report it instead of dropping it silently.
                    print("Skipped article " + str(contentURL) + " : "
                          + repr(error))
    def crawling(self, categoryName):
        """Crawl the Naver sports news listing of one category and save articles.

        The listing URL is built from ``self.categoryCode[categoryName]`` and
        handed to ``self.makeNewsURLForm`` together with the start/end year and
        month stored in ``self.date``.  Each returned URL is opened in a
        headless Chrome started from ``self.driverPath`` to collect the article
        links under ``#_newsList``; the articles themselves are fetched with
        ``self.getURLdata`` and written with ``self.fileWrite`` to
        ``<DATA_DIR>/<categoryName><n>.txt`` where ``n`` is the number of
        articles saved so far by this call.

        Articles whose title or body element is missing, or whose cleaned
        headline/body is empty, are skipped.  Any other failure while
        processing a single article is printed and the crawl continues with
        the next article.

        Args:
            categoryName: Key looked up in ``self.categoryCode``; also used as
                the file-name prefix.

        Returns:
            None.
        """
        print(str(os.getpid()) + " : " + categoryName + '\n')
        url = ("http://sports.news.naver.com/"
               + str(self.categoryCode.get(categoryName))
               + "/news/index.nhn?isphoto=N&date=")
        urls = self.makeNewsURLForm(url, self.date['startYear'],
                                    self.date['endYear'],
                                    self.date['startMonth'],
                                    self.date['endMonth'])
        number = 0

        # The browser options are identical for every page, so build them once.
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('disable-gpu')

        print("Crawling Start!")
        for url in tqdm(urls):
            print(str(os.getpid()) + " : " + url)
            driver = webdriver.Chrome(self.driverPath, chrome_options=options)
            try:
                # The implicit wait is a session setting; setting it once
                # before the lookups is enough.
                driver.implicitly_wait(2)
                driver.get(url)
                pages = driver.find_elements_by_css_selector(
                    '#_newsList > ul >li')
                articles = [
                    page.find_element_by_css_selector('a').get_attribute(
                        'href') for page in pages
                ]
            finally:
                # Always shut the browser down, even when loading the page or
                # locating the links fails, so no Chrome process is leaked.
                driver.quit()

            for contentURL in articles:
                # Short pause between article requests.
                time.sleep(0.05)

                contentHtml = self.getURLdata(contentURL)
                documentContent = BeautifulSoup(contentHtml.content,
                                                'html.parser')

                try:
                    # Article title.
                    articleTitle = documentContent.find_all(
                        'h4', {'class': 'title'})
                    if not articleTitle:
                        continue
                    title = NewsParser.clearHeadline(
                        str(articleTitle[0].find_all(text=True)))
                    if not title:  # empty headline: exclude the article
                        continue

                    # Article body.
                    articleBodyContents = documentContent.find_all(
                        'div', {'id': 'newsEndContents'})
                    if not articleBodyContents:
                        continue
                    content = NewsParser.clearContentS(
                        list(articleBodyContents[0].find_all(text=True)))
                    if not content:  # empty body: exclude the article
                        continue

                    # Create the output folder lazily, on the first article
                    # that is actually going to be written.
                    try:
                        if not os.path.isdir(self.DATA_DIR):
                            os.makedirs(self.DATA_DIR)
                            print("폴더 생성")
                    except OSError:
                        print("폴더 생성에 실패했습니다.")

                    fileName = (self.DATA_DIR + '/' + categoryName
                                + str(number) + ".txt")
                    self.fileWrite(fileName, title, content)
                    number += 1

                except Exception as error:
                    # Best effort: one broken article must not stop the crawl,
                    # but report it instead of dropping it silently.
                    print("Skipped article " + str(contentURL) + " : "
                          + repr(error))