Example #1
0
    def get(self):
        """Import Kazan street articles from Russian Wikipedia into the datastore.

        Walks every link on the Kazan street-list project page, loads the
        linked article and creates or updates the ``StreetInfo`` record found
        by its normalized name. A failure on a single street is reported
        together with its cause and skipped, so the remaining streets are
        still processed. ``DeadlineExceededError`` stops the import quietly;
        the ``{"success": true}`` response is written afterwards either way.
        """
        wikipedia.set_lang(u'ru')
        try:
            page = wikipedia.page(u'Проект:Города_России/Списки_улиц/Казани')
            for link in page.links:
                # Link title as a stripped unicode string.
                nlink = unicode(link).encode('utf-8').strip().decode('utf-8')
                norm_name = normalize(nlink)
                try:
                    # Reuse the stored record when one exists, else start a new one.
                    street_info = StreetInfo.get_by_norm_name(norm_name)
                    if not street_info:
                        street_info = StreetInfo()

                    street_page = wikipedia.page(nlink)
                    street_info.name = nlink
                    street_info.norm_name = norm_name
                    street_info.info = unicode(street_page.summary).encode('utf-8').strip()
                    street_info.images = [Image(url=x) for x in street_page.images]
                    street_info.city = u'Казань'.encode('utf-8').strip()
                    street_info.lang = u'ru'.encode('utf-8').strip()

                    street_info.put()

                except Exception as e:
                    # Best effort: name the failing street and the reason,
                    # then carry on with the next link.
                    print('%s: %r' % (nlink.encode('utf-8'), e))
        except DeadlineExceededError:
            # Out of request time: stop importing but still send the response.
            pass

        self.response.headers['Content-Type'] = "text/html; charset=utf-8"
        self.response.write(json.dumps({'success': True}))
Example #2
0
    def test_change_through_multiple_langs(self):
        """Each consecutive ``set_lang`` call must repoint ``API_URL``."""
        # (language code, expected endpoint), applied in this order:
        # Swedish, German, Russian, Polish, Portuguese.
        expected_endpoints = (
            ("sv", 'http://sv.wikipedia.org/w/api.php'),
            ("de", 'http://de.wikipedia.org/w/api.php'),
            ("ru", 'http://ru.wikipedia.org/w/api.php'),
            ("pl", 'http://pl.wikipedia.org/w/api.php'),
            ("pt", 'http://pt.wikipedia.org/w/api.php'),
        )
        for lang_code, endpoint in expected_endpoints:
            wikipedia.set_lang(lang_code)
            self.assertEqual(wikipedia.API_URL, endpoint)
Example #3
0
	def searchIntent(self, session: DialogSession):
		"""
		Answer a search request with a three-sentence Wikipedia summary.

		Without a search word the user is asked what to search for. Being
		offline ends the dialog with the system 'offline' talk; an ambiguous
		or unmatched term re-prompts the user; any other error is only logged.
		:param session: dialog session carrying the request
		"""
		search = self._extractSearchWord(session)
		if not search:
			self._whatToSearch(session, 'whatToSearch')
			return

		wikipedia.set_lang(self.LanguageManager.activeLanguage)

		try:
			with Online():
				result = wikipedia.summary(search, sentences=3)
		except OfflineError:
			self.endDialog(sessionId=session.sessionId, text=self.randomTalk('offline', skill='system'))
			return
		except wikipedia.DisambiguationError as e:
			self.logWarning(msg=e)
			self._whatToSearch(session, 'ambiguous')
			return
		except wikipedia.WikipediaException as e:
			self.logWarning(msg=e)
			self._whatToSearch(session, 'noMatch')
			return
		except Exception as e:
			self.logWarning(msg=e, printStack=True)
			return

		# The lookup succeeded; an empty summary still counts as "no match".
		if result:
			self.endDialog(sessionId=session.sessionId, text=result)
		else:
			self._whatToSearch(session, 'noMatch')
Example #4
0
def title(query: str) -> str:
    """Return the filtered English page for the top search hit of *query* as JSON.

    The page is looked up by the ``title`` field of the first search result.
    """
    wikipedia.set_lang('en')
    top_hit = wikipedia.search(query)[0]
    page = wikipedia.page(top_hit['title'])
    return json.dumps(filterResult(page))
Example #5
0
 def test_zh(self):
     """Print the Chinese summary of the influenza article via a local proxy."""
     local_proxies = {
         'http': 'http://localhost:1080',
         'https': 'https://localhost:1080'
     }
     accepted_languages = 'zh-CN,zh;q=0.9,en;q=0.8,da;q=0.7'

     wikipedia.set_lang('zh')
     wikipedia.set_proxy(local_proxies)
     wikipedia.set_request_lang(accepted_languages)

     summary = wikipedia.summary('流行性感冒')
     print(summary)
Example #6
0
def import_images():
    """Replace the stored 'wiki' images with those linked from the root page.

    Removes every ``source='wiki'`` entry from ``image_collection``, then
    imports images from each page linked by the Russian Wikipedia root page.
    """
    image_collection.remove(source='wiki')

    wikipedia.set_lang('ru')
    linked_titles = wikipedia.page('Экспонаты эрмитажа').links

    for linked_title in linked_titles:
        import_images_from_page(linked_title)
Example #7
0
def pageid(query: str) -> str:
    """Return the filtered English page for the top search hit of *query* as JSON.

    The query is echoed to stdout first. The page is looked up by the
    ``pageid`` field of the first search result rather than by title.
    """
    print(query)
    wikipedia.set_lang('en')
    top_hit = wikipedia.search(query)[0]
    page = wikipedia.page(None, top_hit['pageid'])
    return json.dumps(filterResult(page))
Example #8
0
 def __init__(self, parent=None):
     """Set up the thread with a media player bound to ``~/Curie.mp3``.

     Also switches the wikipedia module to Spanish.
     """
     super(SoundThread, self).__init__(parent)
     self.name = ""
     # The audio file lives directly in the user's home directory.
     self.filename = expanduser("~") + "/Curie.mp3"
     wikipedia.set_lang("es")
     self.player = QMediaPlayer(None, QMediaPlayer.StreamPlayback)
     self.player.setMedia(QMediaContent(QUrl.fromLocalFile(self.filename)))
Example #9
0
    def test_set_lang_then_back_to_eng(self):
        """Switching to Spanish and back to English restores the English URL."""
        # (language code, expected endpoint): Spanish first, then English again.
        round_trip = (
            ("es", 'http://es.wikipedia.org/w/api.php'),
            ("en", 'http://en.wikipedia.org/w/api.php'),
        )
        for lang_code, endpoint in round_trip:
            wikipedia.set_lang(lang_code)
            self.assertEqual(wikipedia.API_URL, endpoint)
Example #10
0
def main():
    """Interactively pick a Wikipedia language and print one page summary.

    Keeps asking whether the default language should be changed until the
    answer is a yes/no variant. The language lookup, the language switch and
    the summary search only run when the user wants a change and does not
    know the language abbreviation.
    """
    choice = ''
    known_language = ''
    while choice.lower() not in ['y', 'yes', 'n', 'no']:
        choice = raw_input('Default language - English. You wanna to change this? [y/n]: ')
        if choice.lower() not in ['yes', 'y']:
            continue

        known_language = raw_input('You know your language abbreviation? [y/n]: ')
        if known_language.lower() not in ['no', 'n']:
            continue

        # Help the user find the abbreviation, then let them choose it.
        native_name = raw_input('Enter a native language naming: ')
        find_lang(native_name)
        wikipedia.set_lang(raw_input('Enter your choice: '))
        page_name = raw_input('Page for search: ')
        MyPrettyPrinter().pprint(wikipedia.summary(page_name))
Example #11
0
    def searchIntent(self, session: DialogSession, **_kwargs):
        """
        Answer a search request with a three-sentence Wikipedia summary.

        Without a search word the user is asked what to search for. When
        Wikipedia reports an ambiguous term or cannot find a page, the user
        is prompted again instead of the error escaping the handler.
        :param session: dialog session carrying the request
        """
        search = self._extractSearchWord(session)
        if not search:
            self._whatToSearch(session, 'whatToSearch')
            return

        wikipedia.set_lang(self.LanguageManager.activeLanguage)

        try:
            result = wikipedia.summary(search, sentences=3)
        except wikipedia.DisambiguationError:
            # Several pages match the term: ask the user to be more specific.
            self._whatToSearch(session, 'ambiguous')
            return
        except wikipedia.WikipediaException:
            # No page found, or another wikipedia-level failure.
            self._whatToSearch(session, 'noMatch')
            return

        if not result:
            self._whatToSearch(session, 'noMatch')
        else:
            self.endDialog(sessionId=session.sessionId, text=result)
Example #12
0
 def get(self):
     """Collect name, summary, images and city for every Kazan street article.

     Walks the links of the Kazan street-list project page on Russian
     Wikipedia and loads each linked article. An article that cannot be
     loaded is reported with the cause and skipped.
     """
     wikipedia.set_lang(u"ru")
     page = wikipedia.page(u"Проект:Города_России/Списки_улиц/Казани")
     # NOTE(review): `streets` is filled but not returned or written within
     # this block - confirm where the collected data is meant to go.
     streets = []
     for link in page.links:
         nlink = unicode(link).encode("utf-8").strip()
         try:
             street_page = wikipedia.page(nlink)
             streets.append(
                 {"name": nlink, "info": street_page.summary, "images": street_page.images, "city": u"Казань"}
             )
         except Exception as e:
             # Best effort: name the failing link and the reason, then go on.
             print("%s: %r" % (nlink, e))
Example #13
0
 def get(self):
     """Write the lines of the first section of the Kazan street list.

     The page text is split on its ``== ... ==`` section headings; the text
     before the first heading is discarded and every line of the first
     remaining section is written to the response as a JSON string. Nothing
     is written when the page contains no such heading.
     """
     self.response.headers["Content-Type"] = "text/html; charset=utf-8"
     wikipedia.set_lang(u"ru")
     page = wikipedia.page(u"Проект:Города_России/Списки_улиц/Казани")
     # Raw string so "\s" reaches the regex engine as written; the engine
     # itself interprets "\n" as a newline, so the pattern is unchanged.
     sections = re.split(r"\n\n\n==\s*...\s*==\n", page.content)
     # sections[0] is whatever precedes the first heading - skip it.
     byline = [section.split("\n") for section in sections[1:]]
     if not byline:
         return
     # byline[0] - names with number
     # byline[1..]- names with corresponding letter
     for line in byline[0]:
         self.response.write(json.dumps(line))
Example #14
0
    def update_on_priv_msg(self, data, connection):
        """Answer a ``.w <query>`` message with a Wikipedia link and an excerpt.

        Messages without ``'.w '`` are ignored. The first search hit is
        loaded; on a disambiguation error the first entry of the error's
        second argument is loaded instead. The excerpt ends after the last
        ``'. '`` found between characters 50 and 350 of the summary; when
        there is none, or the cut would lie beyond 230 characters, it ends at
        the last space within the first 350 characters.

        :param data: message dict; ``message`` and ``nick`` are read here
        :param connection: object whose ``send_back`` delivers the replies
        """
        message = data['message']
        if '.w ' not in message:
            return

        i18n_server = i18n()
        wikipedia.set_lang(i18n_server.get_text('wiki_lang', lang=self.config.lang))

        # Every word except the ".w" trigger, each followed by a space.
        query = ''.join(word + ' ' for word in message.split(' ') if word.strip() != '.w')
        results = wikipedia.search(query)
        if len(results) == 0:  # TODO BUG BELOW, ERROR MESSAGE NOT SHOWN!
            connection.send_back(data['nick'] + ', ' +
                                 i18n_server.get_text('wiki_fail',
                                                      lang=self.config.lang),
                                 data)
            return

        try:
            page = wikipedia.WikipediaPage(results.pop(0))
        except wikipedia.DisambiguationError as error:
            print('disambiguation page')
            page = wikipedia.WikipediaPage(error.args[1][0])

        connection.send_back(data['nick'] + ' ' + page.url, data)

        # 50 means rfind() returned -1, i.e. no sentence end in the window.
        index = 51 + page.summary[50:350].rfind('. ')
        if index == 50 or index > 230:
            index = page.summary[0:350].rfind(' ')
        connection.send_back(page.summary[0:index], data)
Example #15
0
 def update_on_priv_msg(self, data):
     """Answer a ``.w <query>`` message with a Wikipedia link and an excerpt.

     Messages without ``'.w '`` are ignored. The first search hit is loaded;
     on a disambiguation error the first entry of the error's second
     argument is loaded instead. The excerpt ends after the first ``'. '``
     found between characters 50 and 230 of the summary, or is the first 230
     characters when that window holds no sentence end.

     :param data: message dict; ``message`` and ``nick`` are read here
     """
     if '.w ' not in data['message']:
         return
     i18n_server = i18n()
     wikipedia.set_lang(i18n_server.get_text('wiki_lang'))
     # Every word except the ".w" trigger, each followed by a space.
     query = ''.join(word + ' ' for word in data['message'].split(' ') if word.strip() != '.w')
     results = wikipedia.search(query)
     if not results:
         Connection.singleton().send_back(data['nick'] + ', ' + i18n_server.get_text('wiki_fail'), data)
         return
     try:
         page = wikipedia.WikipediaPage(results.pop(0))
     except wikipedia.DisambiguationError as error:
         print('disambiguation page')
         page = wikipedia.WikipediaPage(error.args[1][0])
     Connection.singleton().send_back(data['nick'] + ' ' + page.url, data)
     # Test find()'s own -1 sentinel *before* adding the offset; once 51 is
     # added the value can never be -1 and the fallback would be unreachable.
     offset = page.summary[50:230].find('. ')
     if offset == -1:
         Connection.singleton().send_back(page.summary[0:230], data)
     else:
         # +50 for the window start, +1 to keep the full stop itself.
         Connection.singleton().send_back(page.summary[0:51 + offset], data)
Example #16
0
def extract_actor_from_wikipedia(lastname, firstname):
    """Look up an actor on French Wikipedia and collect basic profile data.

    Searches for ``"<lastname> <firstname>"`` and uses the first result
    whose page title contains both names. Search hits that cannot be loaded
    (disambiguation or missing page) are skipped.

    :param lastname: family name, matched as a substring of the page title
    :param firstname: given name, matched as a substring of the page title
    :return: dict with ``name``, ``links``, ``summary``, ``title``, ``url``
        and, when the page lists a ``.jpg`` image, ``photo``; ``None`` when
        no search result matches
    """
    wikipedia.set_lang("fr")

    # Reference domains worth keeping, mapped to the label shown for them.
    link_labels = {
        "unifrance.org": "UniFrance",
        "www.lefilmfrancais": "Le Film Francais",
        "www.allocine.fr": "Allocine",
        "catalogue.bnf.fr": "La BNF",
        "www.allmovie.com": "All movie",
    }

    for search in wikipedia.search(lastname + " " + firstname):
        try:
            page = wikipedia.page(search)
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # One unusable search hit must not abort the whole lookup.
            continue

        if lastname not in page.title or firstname not in page.title:
            continue

        rc = {"links": [], "name": firstname + " " + lastname}

        # When several .jpg images are listed, the last one wins.
        for img in page.images:
            if img.endswith(".jpg"):
                rc["photo"] = img

        try:
            references = page.references
        except Exception:
            # References are optional: keep the profile without extra links.
            references = []

        for ref in references:
            try:
                domain = urlparse(ref).netloc
            except ValueError:
                # Malformed reference URL - ignore just this one.
                continue
            label = link_labels.get(domain)
            if label is not None:
                rc["links"].append({"title": label, "url": ref})

        rc["summary"] = page.summary
        rc["title"] = page.title
        rc["url"] = page.url

        return rc

    return None
Example #17
0
    def searchIntent(self, session: DialogSession):
        """
        Answer a search request with a three-sentence Wikipedia summary.

        Without a search word the user is asked what to search for. An
        ambiguous or unmatched term is logged and the user is prompted again;
        any other error is only logged, with its stack trace.
        :param session: dialog session carrying the request
        """
        search = self._extractSearchWord(session)
        if not search:
            self._whatToSearch(session, 'whatToSearch')
            return

        wikipedia.set_lang(self.LanguageManager.activeLanguage)

        try:
            result = wikipedia.summary(search, sentences=3)
        except wikipedia.DisambiguationError:
            self.logWarning(msg='Ambiguous result')
            self._whatToSearch(session, 'ambiguous')
            return
        except wikipedia.WikipediaException:
            self.logWarning(msg='No match')
            self._whatToSearch(session, 'noMatch')
            return
        except Exception as e:
            self.logWarning(msg=str(e), printStack=True)
            return

        # The lookup succeeded; an empty summary still counts as "no match".
        if result:
            self.endDialog(sessionId=session.sessionId, text=result)
        else:
            self._whatToSearch(session, 'noMatch')
Example #18
0
 def update_on_priv_msg(self, data):
     """Answer a ``.w <query>`` message in the channel with a Wikipedia hit.

     Messages without ``'.w '`` are ignored. For the first search hit the
     page URL and the first 230 characters of its summary are sent; when the
     search returns nothing, the localized 'wiki_fail' text is sent instead.

     :param data: message dict; ``message`` and ``nick`` are read here
     """
     message = data['message']
     if '.w ' not in message:
         return

     i18n_server = i18n()
     wikipedia.set_lang(i18n_server.get_text('wiki_lang'))

     # Every word except the ".w" trigger, each followed by a space.
     query = ''.join(word + ' ' for word in message.split(' ') if word.strip() != '.w')
     hits = wikipedia.search(query)
     if len(hits) == 0:
         Connection.singleton().send_channel(data['nick'] + ', ' + i18n_server.get_text('wiki_fail'))
         return

     page = wikipedia.WikipediaPage(hits.pop(0))
     Connection.singleton().send_channel(data['nick'] + ' ' + page.url)
     Connection.singleton().send_channel(page.summary[0:230])
Example #19
0
 def test_lang(self):
     """Switching to French must repoint the configured API URL."""
     wikipedia.set_lang("fr")
     configured_url = wikipedia.WIKIPEDIA_GLOBALS['API_URL']
     self.assertEqual(configured_url, 'http://fr.wikipedia.org/w/api.php')
Example #20
0
def geo(lat: float, lon: float) -> str:
    """Return the filtered English page for a geosearch at (*lat*, *lon*) as JSON.

    The geosearch result is passed to ``wikipedia.page`` unchanged.
    """
    wikipedia.set_lang('en')
    nearby = wikipedia.geosearch(lat, lon)
    page = wikipedia.page(nearby)
    return json.dumps(filterResult(page))
    def searchIntent(self, intent: str, session: DialogSession) -> bool:
        """
        Answer a search request with a three-sentence Wikipedia summary.

        The search term comes from ``customData['userInput']``, falling back
        to the ``what`` slot. Without a term, or when the term is ambiguous
        or unmatched, the dialog continues and waits for a user answer. Any
        other error ends the dialog with the system 'error' talk.
        :param intent: not used inside this method
        :param session: dialog session carrying slots and custom data
        :return: always True
        """
        slots = session.slots
        sessionId = session.sessionId
        customData = session.customData

        search = customData.get('userInput', slots.get('what'))

        if not search:
            self.continueDialog(sessionId=sessionId,
                                text=self.randomTalk('whatToSearch'),
                                intentFilter=[self._INTENT_USER_ANSWER],
                                previousIntent=self._INTENT_SEARCH,
                                customData={
                                    'module': self.name,
                                })
            return True

        wikipedia.set_lang(self.LanguageManager.activeLanguage)
        engine = customData.get('engine', 'wikipedia')

        def askAgain(talkKey: str):
            # Keep the dialog open and let the user answer with a new term.
            self.continueDialog(
                sessionId=sessionId,
                text=self.TalkManager.randomTalk(talkKey).format(search),
                intentFilter=[self._INTENT_USER_ANSWER],
                previousIntent=self._INTENT_SEARCH,
                customData={
                    'module': self.name,
                    'engine': engine
                })

        try:
            # Whatever the engine value, the summary is taken from wikipedia.
            result = wikipedia.summary(search, sentences=3)

            if result:
                self.endDialog(sessionId=sessionId, text=result)
            else:
                askAgain('noMatch')

        except wikipedia.DisambiguationError:
            askAgain('ambiguous')
        except wikipedia.WikipediaException:
            askAgain('noMatch')
        except Exception as e:
            self._logger.error(f'Error: {e}')
            self.endDialog(sessionId=sessionId,
                           text=self.TalkManager.randomTalk('error',
                                                            module='system'))
        return True
Example #22
0
def main():
    """Run the VK bot message loop.

    Listens on ``longpoll`` and, for every new message addressed to the bot,
    either toggles one of the modes (calculator, vehicle registration region
    lookup, text quest, weather, wikipedia search) or passes the lower-cased
    text to the handler of the currently active mode. Starting a mode
    switches the other modes off. While wikipedia mode is active, the
    keywords of the other modes are not treated as commands.

    The mode flags and the quest stage are local variables of this function,
    so they are shared by every user who writes to the bot.
    """
    kalk_mode = False
    aut_reg_mode = False
    quest_mode = False
    weather_mode = False
    wiki_mode = False
    quest_stage = 0
    # True while the bot waits for a yes/no answer about resuming the quest.
    answer_quest_continue = False
    wikipedia.set_lang('ru')

    for event in longpoll.listen():
        if event.type == VkEventType.MESSAGE_NEW:
            if event.to_me:
                request = event.text.lower()

                if request in ['stop', 'стоп']:  # stop all modes
                    kalk_mode = False
                    aut_reg_mode = False
                    quest_mode = False
                    weather_mode = False
                    wiki_mode = False
                    write_msg(event.user_id, 'Все режимы остановленны.', create_empty_keyboard())

                elif request in ["что ты можешь", 'помощь', 'help']:
                    write_msg(event.user_id, "Вот, что я могу:", create_help_keyboard())

                elif request in ['калькулятор', 'kalk'] and not wiki_mode:  # Calculator
                    kalk_mode = not kalk_mode
                    if kalk_mode:

                        aut_reg_mode = False
                        quest_mode = False
                        weather_mode = False
                        wiki_mode = False

                        write_msg(event.user_id, 'Режим калькулятор запущен.', create_empty_keyboard())
                        write_msg(event.user_id,
                                  "Введите выражение. Используйте знаки из допустимого списка: "
                                  "+, -, *, /, **, (, ), %, //. Для остановки режима введите 'stop', 'стоп' "
                                  "или слово, использованное для старта.",
                                  create_empty_keyboard())
                    else:
                        write_msg(event.user_id, 'Режим калькулятор остановлен.', create_empty_keyboard())

                elif request in ['регион', 'aut_reg', 'номер'] and not wiki_mode:  # Vehicle registration region
                    aut_reg_mode = not aut_reg_mode
                    if aut_reg_mode:

                        kalk_mode = False
                        quest_mode = False
                        weather_mode = False
                        wiki_mode = False

                        write_msg(event.user_id,
                                  "Режим поиска региона регистрации автомобиля запущен. "
                                  "Для остановки режима введите 'stop', 'стоп' или слово, использованное для старта.",
                                  create_empty_keyboard())
                        write_msg(event.user_id, "Введите номер региона. Исспользуйте только цифры.",
                                  create_empty_keyboard())
                    else:
                        write_msg(event.user_id, 'Режим поиска региона регистрации автомобиля остановлен.',
                                  create_empty_keyboard())

                elif request in ['квест', 'quest', 'текстовый квест'] and not wiki_mode:  # text quest
                    quest_mode = not quest_mode
                    if quest_mode:

                        kalk_mode = False
                        aut_reg_mode = False
                        weather_mode = False
                        wiki_mode = False

                        write_msg(event.user_id, "Режим текстового квеста запущен. Для остановки режима введите "
                                                 "'stop', 'стоп' или слово, использованное для старта.",
                                  create_empty_keyboard())

                        # A non-zero stage means an earlier quest was left
                        # unfinished: offer to resume and wait for the answer.
                        if quest_stage != 0:
                            write_msg(event.user_id, 'Хотите продолжить с момента, на котором остановились?',
                                      create_answer_keyboard('Да', 'Нет'))
                            answer_quest_continue = True
                            continue

                        quest(event.user_id, quest_stage)
                    else:
                        write_msg(event.user_id, 'Режим текстового квеста остановлен.', create_empty_keyboard())

                elif request in ['погода', 'weather'] and not wiki_mode:  # weather
                    weather_mode = not weather_mode
                    if weather_mode:

                        kalk_mode = False
                        aut_reg_mode = False
                        quest_mode = False
                        wiki_mode = False

                        write_msg(event.user_id,
                                  "Режим погода запущен. "
                                  "Для остановки режима введите 'stop', 'стоп' или слово, использованное для старта.",
                                  create_empty_keyboard())
                        write_msg(event.user_id, "Введите название города, погоду в котором вы хотите узнать.",
                                  create_empty_keyboard())
                    else:
                        write_msg(event.user_id, 'Режим погода остановлен.', create_empty_keyboard())

                elif request == 'wiki' or request == 'поиск':  # wikipedia search
                    wiki_mode = not wiki_mode
                    if wiki_mode:

                        kalk_mode = False
                        aut_reg_mode = False
                        quest_mode = False
                        weather_mode = False

                        write_msg(event.user_id,
                                  "Режим wikipedia запущен. "
                                  "Для остановки режима введите 'stop', 'стоп' или слово, использованное для старта.",
                                  create_empty_keyboard())
                        write_msg(event.user_id, "Введите ваш запрос:",
                                  create_empty_keyboard())
                    else:
                        write_msg(event.user_id, 'Режим wikipedia остановлен.', create_empty_keyboard())

                # No command keyword matched: hand the text to the active mode.
                elif kalk_mode:
                    kalk(event.user_id, request)

                elif aut_reg_mode:
                    find_aut_reg(event.user_id, request)

                elif quest_mode:
                    if answer_quest_continue is True:
                        answer_quest_continue = False
                        if request == 'да':
                            quest(event.user_id, quest_stage)
                        elif request == 'нет':
                            quest_stage = 0
                            quest(event.user_id, quest_stage)

                    # Quest transitions: each answer is only accepted at the
                    # stage it belongs to and moves the quest to a new stage.
                    elif request == 'пойти в дверь №1' and quest_stage == 0:
                        quest_stage = 1
                        quest(event.user_id, quest_stage)

                    elif request == 'пойти в дверь №2' and quest_stage == 0:
                        quest_stage = 2
                        quest(event.user_id, quest_stage)

                    elif request == 'выпить из пузырька' and quest_stage == 1:
                        quest_stage = 5
                        quest(event.user_id, quest_stage)

                    elif request == 'попробовать протиснуться в дверцу' and quest_stage == 1:
                        quest_stage = 6
                        quest(event.user_id, quest_stage)

                    elif request == 'нырнуть за ключом' and quest_stage == 2:
                        quest_stage = 7
                        quest(event.user_id, quest_stage)

                    elif request == 'попробовать открыть дверь' and quest_stage == 2:
                        quest_stage = 8
                        quest(event.user_id, quest_stage)

                    elif request == 'начать с начала':
                        quest_stage = 0
                        quest(event.user_id, quest_stage)

                    elif request == 'закончить':
                        quest_stage = 0
                        quest_mode = False
                        write_msg(event.user_id, 'Режим текстового квеста остановлен.', create_empty_keyboard())

                elif weather_mode:
                    weather(event.user_id, request)

                elif wiki_mode:
                    wiki_search(event.user_id, request)

                # small talk with the user
                elif request == "привет" or request == 'hello':
                    write_msg(event.user_id, "Здравствуйте.", create_empty_keyboard())

                elif request == "пока" or request == 'goodbye':
                    write_msg(event.user_id, "До свидания.", create_empty_keyboard())

                else:
                    write_msg(event.user_id, "Не понимаю вашего ответа... Если нужна помощь введите 'help' "
                                             "или 'помощь'.", create_quest_keyboard('Help'))
Example #23
0
class WikipediaScraping:
    """
    :Date: 2018-02-16
    :Version: 1.2
    :Author: Edwin Puertas - Pontificia Universidad Javeriana
    :Copyright: Por definir
    :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
    This class extracts wikipedia content.
    """

    wikipedia.set_lang('es')

    def __init__(self):
        """
        :Date: 2017-02-08
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Copyright: To be defined
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA

        Create the text-support helper and the Spanish ``wikipediaapi``
        client used by the other methods, then print the class name.
        """
        # Text cleaning / list normalization helper.
        self.support = Support()
        # Client for the Spanish-language Wikipedia.
        self.media_wiki = wikipediaapi.Wikipedia('es')

        print('WikipediaScraping')

    def search_articles(self, list_word):
        """
        Search Wikipedia for every word and return the merged, normalized hits.

        :param list_word: Words to search for
        :type list_word: List
        :rtype: List
        :return: Search results of all words passed through
            ``Support.normalized_list``; None when an error occurred (the
            error is written to the standard error log)
        """
        try:
            art_list = []
            for word in list_word:
                art_list.extend(wikipedia.search(word))
            return self.support.normalized_list(art_list)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def get_content(self, word):
        """
        Return the section titles and cleaned text of a Spanish article.

        The text is the article summary followed by ``title + newline + text``
        of every top-level section whose title is not one of the excluded
        boilerplate sections. For a missing article the title list is empty
        and the text is the cleaned empty string.

        :param word: Name of the wikipedia article to extract
        :type word: Text
        :rtype: Tuple
        :return: ``(list of kept section titles, cleaned text)``; None when
            an error occurred (the error is written to the standard error log)
        """
        excluded_sections = (
            'Historia', 'Véase también', 'Referencias', 'Enlaces externos'
        )
        try:
            wiki = wikipediaapi.Wikipedia(
                language='es', extract_format=wikipediaapi.ExtractFormat.WIKI)
            article = wiki.page(word)
            text = ''
            list_subtitle = []
            if article.exists():
                summary = article.summary
                kept = [s for s in article.sections
                        if s.title not in excluded_sections]
                list_subtitle = [s.title for s in kept]
                text = summary + ''.join(s.title + '\n' + s.text for s in kept)
            return list_subtitle, self.support.clean_text(text)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def get_article(self, word):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        Build a dictionary describing one wikipedia article.

        Keys: ``Id``, ``Title``, ``SubTitle`` (kept section titles),
        ``Summary``, ``Content`` (cleaned text), ``Links`` (normalized link
        titles), ``Categories``, ``URL``, ``Weight`` (number of links) and
        ``Frequency`` (a ``Counter`` over ``tb(summary + content)``).

        :param word: Name of the wikipedia article to extract
        :type word: Text
        :rtype: Dictionary
        :return: The article dictionary; None when the article does not
            exist or an error occurred (errors are written to the standard
            error log)
        """
        try:
            article = self.media_wiki.page(word)
            if not article.exists():
                return None

            title = article.title
            # get_content builds its own client and page on every call, so
            # fetch the titles and the text together in a single call.
            subtitles, content = self.get_content(title)
            summary = article.summary
            links = self.normalized_links(article.links)

            return {
                'Id': article.pageid,
                'Title': title,
                'SubTitle': subtitles,
                'Summary': summary,
                'Content': content,
                'Links': links,
                'Categories': self.get_categories(title),
                'URL': article.canonicalurl,
                'Weight': len(links),  # article weight = number of links
                'Frequency': Counter(tb(summary + content)),
            }
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def get_nearby_articles(self, list_articles):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        Map every existing article of the list to its categories.

        :param list_articles: List of the wikipedia article
        :type list_articles: List
        :rtype: Dictionary
        :return: ``{lower-cased article title: categories}`` for the
            articles that exist; None when an error occurred (the error is
            written to the standard error log)
        """
        try:
            dic_temp = {}
            for art in list_articles:
                page = self.media_wiki.page(art)
                if page.exists():
                    dic_temp[str(page.title).lower()] = self.get_categories(
                        page.title)
            return dic_temp
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def get_categories(self, word):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        Return the cleaned category names of a wikipedia article.

        Each category is lower-cased and cleaned; entries that are empty or
        contain ``'wiki'`` are dropped, and the first 10 characters of the
        remaining ones are cut off.

        :param word: Name of the wikipedia article to extract
        :type word: Text
        :rtype: List
        :return: List of categories by article, passed through
            ``Support.normalized_list``; None when an error occurred (the
            error is written to the standard error log)
        """
        try:
            new_list = []
            for item in self.media_wiki.page(word).categories:
                item = self.support.clean_text(str(item).lower())
                if 'wiki' in item or item == '':
                    continue
                # NOTE(review): the 10 dropped characters are presumably the
                # "categoría:" namespace prefix - confirm against clean_text.
                new_list.append(item[10:].strip())
            return self.support.normalized_list(new_list)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def normalized_links(self, list_links):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        Clean every link text, drop the empty ones and normalize the list.

        :param list_links: list of word
        :type list_links: List
        :rtype: List
        :return: The cleaned, non-empty link texts passed through
            ``Support.normalized_list``; None when an error occurred (the
            error is written to the standard error log)
        """
        try:
            cleaned = (self.support.clean_text(item).strip()
                       for item in list_links)
            list_tmp = [text for text in cleaned if text != '']
            return self.support.normalized_list(list_tmp)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def compare_categories(self, main_categories, second_categories):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        Tell whether the two category lists share at least one entry.

        :param main_categories: list of words by main categories
        :type main_categories: List
        :param second_categories: list of words by second categories
        :type second_categories: List
        :rtype: Boolean
        :return: True when any entry of main_categories is also in
            second_categories, otherwise False; None when an error occurred
            (the error is written to the standard error log)
        """
        try:
            return any(i in second_categories for i in main_categories)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must propagate.
            Logging.write_standard_error(sys.exc_info())
            return None

    def creted_corpus_wikipedia(self, seed_words):
        """
        :Date: 2017-05-09
        :Version: 1.2
        :Author: Edwin Puertas - Pontificia Universidad Javeriana
        :Organization: Centro de Excelencia y Apropiación de Big Data y Data Analytics - CAOBA
        This function creates a corpus of Wikipedia articles from seed words.

        The articles of the seed words are stored first and their categories
        form the "main domain region". Every article returned by
        ``search_articles`` is added when it shares a category with that
        region, and the articles reported by ``get_nearby_articles`` for its
        links are added under the same test. Progress is reported with
        ``print``.
        :param seed_words: list of seed words
        :type list: Text
        :rtype: List
        :return: List of dicts with the keys 'resource', 'title', 'subtitle'
                 and 'content'; None when an error was logged
        """

        def as_record(article):
            # Corpus entry layout shared by seed, child and grandchild articles.
            return {
                'resource': 'Wikipedia',
                'title': article['Title'],
                'subtitle': article['SubTitle'],
                'content': article['Content']
            }

        parent_categories = []
        list_article_invalid = []
        try:
            print('Searching Wikipedia with the seed {0}'.format(
                str(seed_words)))
            print('Searching Articles in Wikipedia, wait a moment please.')
            # NOTE(review): the helpers visible in this class log errors and
            # implicitly return None; the `or` fallbacks below assume the
            # helpers called here may do the same - confirm.
            list_nearby_parent_articles = self.search_articles(
                seed_words) or []
            print('Found articles:{0}'.format(list_nearby_parent_articles))
            print('Building main domain region, wait a moment please.')

            # Saved articles seed words
            list_articles = []
            seen_titles = set()  # titles already stored in list_articles
            for word in seed_words:
                art = self.get_article(word)
                if art is None:
                    continue
                list_articles.append(as_record(art))
                seen_titles.add(art['Title'])
                parent_categories += self.get_categories(str(word)) or []

            parent_categories = self.support.normalized_list(parent_categories)
            print('Main domain region {0}'.format(parent_categories))

            for art in list_nearby_parent_articles:
                if art is None:
                    continue
                child = self.get_article(art)
                if child is None:
                    # A missing article must not abort the whole corpus.
                    continue
                if not self.compare_categories(parent_categories,
                                               child['Categories']):
                    continue
                if child['Title'] in seen_titles:
                    continue

                print('Retrieving Information from [{0}], [{1}]'.format(
                    child['Title'], child['URL']))
                list_articles.append(as_record(child))
                seen_titles.add(child['Title'])
                print('Validating articles children of {0}'.format(
                    str(child['Title'])))
                print(
                    'Validating domain region for articles {0}, wait a moment please.'
                    .format(len(child['Links'])))

                # Mapping of linked article name -> its categories.
                dict_grand_child = self.get_nearby_articles(
                    child['Links']) or {}
                for name, grand_child_categories in dict_grand_child.items():
                    if not self.compare_categories(parent_categories,
                                                   grand_child_categories):
                        continue
                    grand_child = self.get_article(name)
                    if grand_child is None:
                        continue
                    if grand_child['Title'] not in seen_titles:
                        print('Retrieving Information from [{0}], [{1}]'.format(
                            grand_child['Title'], grand_child['URL']))
                        list_articles.append(as_record(grand_child))
                        seen_titles.add(grand_child['Title'])
                    else:
                        # Already in the corpus: counted as excluded.
                        print('Article [{0}] excluded!'.format(
                            grand_child['Title']))
                        if grand_child['Title'] != '':
                            list_article_invalid.append(grand_child['Title'])

            excluded_items = len(list_article_invalid)
            recovered_items = len(list_articles)
            total_items = excluded_items + recovered_items
            print(
                '\n# Article excluded: [{0}] \n# Articles recovered: [{1}] \nTotal Articles: [{2}]'
                .format(excluded_items, recovered_items, total_items))
            return list_articles
        except Exception:
            # Real errors are logged; KeyboardInterrupt/SystemExit propagate so
            # a long crawl can still be interrupted.
            Logging.write_standard_error(sys.exc_info())
            return None
Example #24
0
 def test_lang(self):
   ''' set_lang("fr") must repoint the API_URL entry of WIKIPEDIA_GLOBALS at fr.wikipedia.org '''
   expected_url = 'http://fr.wikipedia.org/w/api.php'
   wikipedia.set_lang("fr")
   current_url = wikipedia.WIKIPEDIA_GLOBALS['API_URL']
   self.assertEqual(current_url, expected_url)
Example #25
0
 def test_lang(self):
   """Selecting French must switch API_URL to the https fr.wikipedia.org endpoint."""
   french_api = 'https://fr.wikipedia.org/w/api.php'
   wikipedia.set_lang("fr")
   self.assertEqual(wikipedia.API_URL, french_api)
Example #26
0
import vk_api
from vk_api.bot_longpoll import VkBotLongPoll, VkBotEventType
import random

from wikipedia import wikipedia

# Module-level bot setup; the functions below read `longpoll` and `vk`.
# NOTE(review): TOKEN and GROUP_ID are not defined in the visible code -
# presumably supplied elsewhere; confirm.
vk_session = vk_api.VkApi(token=TOKEN)

# Long-poll listener built for GROUP_ID, and the API accessor used for sending.
longpoll = VkBotLongPoll(vk_session, GROUP_ID)
vk = vk_session.get_api()

# All later wikipedia lookups use the Russian-language site.
wikipedia.set_lang('ru')


def wiki_response(request_text):
    """Return, as a str, the first 1000 characters of the Wikipedia page for *request_text*."""
    page = wikipedia.page(request_text)
    excerpt = page.content[:1000]
    return str(excerpt)


def help():
    """Return the prompt text that ``main`` sends to a user.

    NOTE: this name shadows the builtin ``help``; it is kept because
    ``main`` calls it by this name.
    """
    # Plain literal: the original f-string had no placeholders.
    return "What do you want to ask Wikipedia?"


def main():
    """Listen for bot events and answer the first new message with the help prompt.

    Both flags flip when that first message is handled, so later events are
    ignored by this loop.
    """
    wiki_mode = False
    awaiting_help = True
    for event in longpoll.listen():
        if not (event.type == VkBotEventType.MESSAGE_NEW and awaiting_help):
            continue
        wiki_mode, awaiting_help = not wiki_mode, not awaiting_help
        sender_id = event.obj.message['from_id']
        vk.messages.send(user_id=sender_id,
                         message=help(),
                         random_id=random.randint(0, 2**64))
Example #27
0
 def test_lang(self):
     """After set_lang("fr"), api_url must be the French endpoint."""
     target = 'http://fr.wikipedia.org/w/api.php'
     wikipedia.set_lang("fr")
     self.assertEqual(wikipedia.api_url, target)
Example #28
0
 def test_set_lang_it(self):
     """Switching to "zh" must point API_URL at zh.wikipedia.org.

     NOTE(review): the method name ends in ``_it`` while the language code
     exercised is "zh" - confirm which one is intended.
     """
     expected = 'http://zh.wikipedia.org/w/api.php'
     wikipedia.set_lang("zh")
     self.assertEqual(wikipedia.API_URL, expected)