Example #1
async def moegirl_search(q):
    moegirlwiki = MediaWiki(url='http://zh.moegirl.org/api.php')
    t = moegirlwiki.search(q)
    if len(t) == 0:
        return False
    p = moegirlwiki.page(t[0])
    return p.summary
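
# A minimal driver sketch for the coroutine above. The query string is only an
# illustrative assumption, and the call performs a live request against the
# Moegirl API; `from mediawiki import MediaWiki` must also be in scope.
import asyncio

if __name__ == '__main__':
    summary = asyncio.run(moegirl_search('初音未来'))
    print(summary or 'no result found')
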
Example #2
def wikipedia_summary(topic, lang='en'):
    wikipedia = MediaWiki(lang=lang)
    search = wikipedia.search(topic)
    page = wikipedia.page(search[0])
    summary = wikipedia.summary(search[0])
    text = '**{}**\n\n{}\n**Read more at:** [{}]({})'.format(
        page.title, summary, page.title, page.url)
    return text
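
# Usage sketch for wikipedia_summary(); the topics and the language below are
# arbitrary assumptions, not values taken from the original snippet.
print(wikipedia_summary('Python (programming language)'))
print(wikipedia_summary('Paris', lang='fr'))
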
def apiWikipedia(search, language):
    print(language, search)
    if language == 'pt':
        language = 'pt-br'
    wikipedia = MediaWiki(lang=language)
    if len(wikipedia.search(search)) < 1:
        raise Exception('apiWikipedia: Content not found')
    page = wikipedia.page(search)
    return page.summary, page.url
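
# Hedged usage sketch for apiWikipedia(): the search term and language are
# assumptions, and the plain Exception raised above is caught here only to
# show the intended control flow.
try:
    summary, url = apiWikipedia('Albert Einstein', 'en')
    print(summary)
    print(url)
except Exception as err:
    print('apiWikipedia failed:', err)
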
def main(search_term):
    wikipedia = MediaWiki(lang='pap', user_agent='code-for-nl-pap-parser')
    wikidata = MediaWiki(url='https://www.wikidata.org/w/api.php',
                         user_agent='code-for-nl-pap-parser')

    search_result = wikipedia.search(search_term, results=4)

    for result_item in search_result:
        page = wikipedia.page(result_item)
        print(
            'I found page \'%s\' for term \'%s\'' % (result_item, search_term),
            'with categories', '/'.join(page.categories),
            'https://pap.wikipedia.org/wiki/' +
            urllib.parse.quote(result_item))
        # print(page.images)

        # Now search this term on Wikidata; this will return a code like Q215887
        search_data = wikidata.search(result_item, results=1)

        for data_item in search_data:
            Q_CODE = data_item
            print(result_item, 'is known on wikidata with the code', Q_CODE,
                  'https://www.wikidata.org/wiki/' + Q_CODE)
            # Now try the qwikidata interface
            entity = get_entity_dict_from_api(Q_CODE)
            q = WikidataItem(entity)
            pap_data_label = q.get_label(lang='pap')
            nl_data_label = q.get_label(lang='nl')
            if pap_data_label and nl_data_label:
                # First get the page. Read the images found
                data_page = wikidata.page(result_item)
                # print(data_page.images)

                print(pap_data_label, 'is called', nl_data_label, 'in Dutch')
            elif pap_data_label and not nl_data_label:
                print(pap_data_label, 'has no entry for Dutch!')
            elif not pap_data_label and nl_data_label:
                print(Q_CODE, 'does not match the Papiamentu entry')
            elif not pap_data_label and not nl_data_label:
                print(Q_CODE, 'has no entry for Dutch or Papiamentu!')
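
# The snippet above does not show its imports or entry point. The imports
# below match the names it uses (the qwikidata module paths are a best guess),
# and the search term is purely a placeholder assumption.
import urllib.parse

from mediawiki import MediaWiki
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api

if __name__ == '__main__':
    main('skol')
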
Example #5
def get_wikipedia_article(s_word):
    try:
        wikipedia = MediaWiki(url=wikiurl)
        wp_words = wikipedia.search(s_word, results=1)
        wp_article = wikipedia.page(wp_words[0])
        return wp_article
    except DisambiguationError as e:
        wp_article = wikipedia.page(random.choice(e.options))
        return wp_article
    except Exception as e:
        app.logger.info('Exception')
        app.logger.info(e)
        return False
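
# get_wikipedia_article() relies on module-level names that are not shown
# (wikiurl, app, random, DisambiguationError). A plausible setup, with the
# endpoint URL and the Flask app chosen purely as assumptions, could be:
import random

from flask import Flask
from mediawiki import MediaWiki, DisambiguationError

app = Flask(__name__)
wikiurl = 'https://en.wikipedia.org/w/api.php'
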
Example #6
class WikiMedia:
    """Wikipedia class."""

    def __init__(self):
        self.wikipedia = MediaWiki()
        self.wikipedia.language = "fr"

    def get_infos(self, query):
        """Method allowing to retrieve informations from wikipedia.fr."""
        try:
            titles = self.wikipedia.search(query)
            if len(titles) > 0:
                infos = self.wikipedia.page(titles[0])
                summary = self.wikipedia.summary(titles[0], sentences=3)

                # Regex to remove "== section ==" headings from the summary:
                summary = re.sub(r"={2}\s.+={2}", r"", summary)
                status = True
                url = infos.url

            # Return empty results if no titles are returned from the API
            else:
                summary = ""
                url = ""
                status = False

        # Use a single except block for disambiguation errors.
        # This allows searching for the next title if the first one leads
        # to a disambiguation error.

        except mediawiki.exceptions.DisambiguationError:
            if len(titles) > 1:
                try:
                    infos = self.wikipedia.page(titles[1])
                    summary = self.wikipedia.summary(titles[1], sentences=3)
                    summary = re.sub(r"={2}\s.+={2}", r"", summary)
                    url = infos.url
                    status = True

                except mediawiki.exceptions.DisambiguationError:
                    summary = ""
                    url = ""
                    status = False
                    logging.exception("Exception occurred")
            else:
                summary = ""
                url = ""
                status = False
                logging.exception("Exception occurred")

        return {"summary": summary, "url": url, "status": status}
Example #7
        if not is_not_blank(wikiPage.section(sections[x])):
            sections[x] = None
        if sections[x] in bannedSections:
            sections[x] = None
        # if "\u0x8211" in sections[x]:     # trying to remove the sections with the - not recognised by system (e.g illinois state senator (1997–2004))
        #     sections[x] = None
    sections = filter(None, sections)       # removing the empty sections

    actualSection = copy.copy(sections)     # make a shallow copy of the list to keep the case-sensitive section titles

    for x in range(len(sections)):
        sections[x] = sections[x].lower()       # lower-case the titles to be more user-friendly :) (actually for speech to text)
        sections[x] = unidecode(sections[x])    # remove accents if there are any

    # Suggestions
    suggestions = wikipedia_mediawiki.search(keyword, 5, False)     # get the related topics
    # Check that the suggested pages exist; if not, remove the suggestion from the list
    for x in range(len(suggestions)):
        try:
            suggestedPage = wikipedia_mediawiki.page(suggestions[x])  # look the suggestion's page up on Wikipedia
        except Exception:
            suggestions[x] = None

    suggestions = list(filter(None, suggestions))  # removing the empty suggestions


    for x in range(len(suggestions)):
        suggestions[x] = suggestions[x].lower()                     # lower-case the titles to be more user-friendly :) (actually for speech to text)
        suggestions[x] = unidecode(suggestions[x])                  # remove accents if there are any
    # Content
    content = wikiPage.summarize(sentences=3)                   # get the summary of the wikipedia page
            # If the data already contains the required tag
            if "wikipedia" in res["tags"]:
                search_page = wikipedia.page(res["tags"]["wikipedia"][3:])
            else:
                # Catch errors in case the request has no name
                try:
                    # Search by coordinates
                    page_names = wikipedia.geosearch(latitude=res["lat"],
                                                     longitude=res["lon"])
                    page_name = [
                        name for name in page_names
                        if check_levenshtein(name, res["tags"].get("name"))
                    ]

                    if not page_name and res["tags"].get("name"):
                        page_names = wikipedia.search(res["tags"].get("name"))
                        page_name = [
                            name for name in page_names
                            if check_levenshtein(name, res["tags"].get("name"))
                        ]

                    if page_name:
                        search_page = wikipedia.page(page_name[0])

                except TypeError:
                    continue

            if search_page:
                # Extend the data from the first task
                page_dict = {
                    "summary": search_page.summary,
                        p = wikipedia.page(geo_res)
                        data_found = True
                        break
            # Check by geolocation and the alt_name field
            if not data_found and "alt_name" in value['tags']:
                for geo_res in geo:
                    if Levenshtein.distance(geo_res,
                                            value['tags']['alt_name']) <= 3:
                        p = wikipedia.page(geo_res)
                        data_found = True
                        break

            # Next-to-last attempt: search by name
            if not data_found:
                if "name" in value['tags']:
                    name_search = wikipedia.search(value['tags']['name'])

                    for name_res in name_search:
                        if Levenshtein.distance(name_res,
                                                value['tags']['name']) <= 3:
                            p = wikipedia.page(name_res)
                            data_found = True
                            break

            # Last attempt: search by the alternative name
            if not data_found:
                if "alt_name" in value['tags']:
                    name_search = wikipedia.search(value['tags']['alt_name'])

                    for name_res in name_search:
                        if Levenshtein.distance(
Example #10
def get_search(search_phrase):
    wikipedia = MediaWiki(user_agent='chucha-user-agent-string')
    open_search_result = wikipedia.search(search_phrase)
    return open_search_result
Example #11
class DialogueManager:
    """ Simple Question Answering Dialogue Manager """
    def __init__(self, log_path: str, base_model: str) -> None:
        self._wiki = MediaWiki()
        self._entity_recognizer = TFLiteNLU(log_path)
        self._tokenizer = AutoTokenizer.from_pretrained(base_model)
        self._answerer = TFAutoModelForQuestionAnswering.from_pretrained(
            base_model)

    def __call__(self, utterance: str) -> str:
        result = self._entity_recognizer(utterance)
        if result.intent == "ask.question":
            return self._answer(result)
        elif result.intent == "greet":
            return self.greet()
        elif result.intent == "command.exit":
            return self.exit()
        elif result.intent == "request.help":
            return self.help()
        else:
            return self.fallback()

    def _answer(self, result: Result) -> str:
        if result.slots:
            # get the tagged entity for page search
            entity = result.slots.get("entity").get("raw_value")
            # perform the search to find the wikipedia page
            entity = self._wiki.search(entity)[0]
            # get the page content to feed as context to the qa model
            passage = self._wiki.page(entity, auto_suggest=False).content
            # prepare qa model inputs
            inputs = self._tokenizer(
                result.utterance,
                passage,
                return_tensors="tf",
                padding=True,
                truncation=True,
            )
            # compute answer span
            start_scores, end_scores = self._answerer(inputs)
            start, end = tf.argmax(start_scores,
                                   -1)[0], tf.argmax(end_scores, -1)[0]
            # prepare the passage ids for slicing
            tokens = self._tokenizer.convert_ids_to_tokens(
                (inputs["input_ids"].numpy()[0]))
            # retrieve only the answer from the passage
            answer = self._tokenizer.convert_tokens_to_string(
                tokens[start:end + 1])
            return answer
        return "I don't have an answer for that"

    @staticmethod
    def greet() -> str:
        return "Hello, Ask me anything"

    @staticmethod
    def exit() -> str:
        return "Goodbye"

    @staticmethod
    def fallback() -> str:
        return (
            "I'm having trouble understanding your request, could you please "
            "repeat it")

    @staticmethod
    def help() -> str:
        return "Ask a question like, how long is the amazon river?"
medline_clause_extract = medline.to_csv('medline_nih_extract.csv')

fifty_fundamental = pd.read_csv(
    '/Users/gurdit.chahal/Capstone_Data_Mining/w210-herbert/data_sources/fifty_fundamental_herbs_labeled.csv'
)

from mediawiki import MediaWiki, PageError
wikipediamw = MediaWiki()
#wikipedia.page(wikipedia.search('Agastache rugosa')[0])
ff_dict = defaultdict(lambda: '')
for name in fifty_fundamental.Scientific_Name:
    try:
        wikipagemw = wikipediamw.page(name)
    except PageError:
        print(name)
        search = wikipediamw.search(name)
        if search:
            wikipagemw = wikipediamw.page(search[0])
        else:
            continue  # continue forces the loop to the next iteration, whereas pass would fall through to the rest of the loop body
    print("content for: " + name)
    toc = get_toc_mw(
        wikipagemw
    )  # transform the ordered dictionary of sections and subsections into tuples of sections and subsections
    for section in toc:
        ff_dict[name,
                section[0]] += wiki_topic_text(wikipagemw, section) + ' \n '

ff_kept = defaultdict(lambda: '')
for row in fifty_fundamental.itertuples(index=True, name='Pandas'):
    name = getattr(row, 'Scientific_Name')
def getArticle(movie):
    wiki = MediaWiki()
    result = wiki.search(movie, results=3)
    page = wiki.page(result[0])
    return page
Example #14
from mediawiki import MediaWiki
from PIL import Image, ExifTags
import requests
from os.path import isfile, join
from os import makedirs
import time

show_img = False

wikipedia = MediaWiki()
pages = wikipedia.search('hd,i')
pages = [p_name for p_name in pages if not p_name.endswith('(disambiguation)')]

for p_name in pages:
    p = wikipedia.page(p_name)
    images = p.images

    for urlimg in images:
        if any(urlimg.lower().endswith(ext) for ext in ('svg', 'ogg', 'ogv')):
            continue  # cannot identify image file <_io.BytesIO object at 0x0000012E68413EB8>
        try:
            filename = urlimg.rsplit('/', 1)[1]
            filename = join(p_name, filename)
            makedirs(p_name, exist_ok=True)
            if not isfile(filename):
                response = requests.get(urlimg, stream=True)
                trys = 0
                while response.status_code != 200 and trys < 5:
                    time.sleep(2)
                    response = requests.get(urlimg, stream=True)
                    trys += 1
                    print("try again", urlimg)
Example #15
class WikiApi:

    def __init__(self):
        self.wikipedia = MediaWiki(lang='ru')
        self.wikiquote = CustomWikiEngine(url="https://{lang}.wikiquote.org/w/api.php",
                                          lang='ru')

    def quotes(self, *words):
        results = []

        for word in words:
            titles = self.wikiquote.quotes(word, results=2)
            results += titles

        return results

    def quote_page(self, title):
        response = {}
        try:
            response = self.wikiquote.page(title=title)
        except Exception as e:
            logging.exception(e)
        return response

    def get_pages_by_categories(self, category, limit=10):
        # https://en.wikipedia.org/w/api.php?action=query&
        #   generator=categorymembers&
        #   gcmlimit=100&
        #   gcmtitle=Category:American%20male%20film%20actors&
        #   prop=pageimages&
        #   pilimit=100
        S = requests.Session()

        URL = "https://ru.wikipedia.org/w/api.php"

        PARAMS = {
            'action': "query",
            'generator': "categorymembers",
            'gcmtitle': category,
            'gcmlimit': limit,
            'format': "json"
        }

        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        titles = []
        if 'query' in DATA and DATA['query'] and DATA['query']['pages']:
            titles = [value['title'] for key, value in DATA['query']['pages'].items()]
        return titles

    def movies(self):
        # https://ru.wikipedia.org/w/api.php?format=xml&action=query&list=embeddedin&einamespace=0&eilimit=500&eititle=Template:Infobox_film
        pass


    def search(self, *words):
        results = []

        for word in words:
            response = self.wikipedia.search(word, results=4)
            short_descriptions = response
            results += short_descriptions
        return results

    def opensearch(self, *words):
        results = []

        for word in words:
            response = self.wikipedia.opensearch(word)
            results += response
        return results

    def parse(self, *pages):
        results = []

        for page in pages:
            try:
                response = self.wikipedia.page(title=page)
                content = response.content
                sections = re.split(r'==.+?==', content)
                if sections:
                    summary = sections[0]
                    results.append(summary)
                    section_headers = re.findall(r'== \w+ ==', content)
                    if '== Сюжет ==' in section_headers:
                        index = section_headers.index('== Сюжет ==') + 1
                        if len(sections) > index:
                            plot = sections[index]
                            results.append(plot)
            except Exception as e:
                logging.error(e)
        return results
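
# Usage sketch for WikiApi: the search word and category title are arbitrary
# assumptions, the calls hit the live ru.wikipedia / ru.wikiquote APIs, and
# CustomWikiEngine, requests, re and logging come from code not shown here.
api = WikiApi()
print(api.search('Гагарин'))
print(api.get_pages_by_categories('Категория:Фильмы по алфавиту', limit=5))
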
class WikiCandidatesSelector:
    """
    Class responsible for model of candidates selection from Wikipedia (Model level one)
    :param logger: logger to use in model
    :param separate: if make separate queries for each found entity
    :param n: number of results return after each query
    """
    def __init__(self,
                 logger=DEFAULT_LOGGER,
                 separate: bool = True,
                 n: int = 3,
                 **kwargs):

        self.profiler = kwargs.get('profiler', DEFAULT_MEASURER)
        self.logger = logger

        self.tagger = SequenceTagger.load('ner-fast')
        self.wikipedia = MediaWiki()
        self.separate = separate
        self.n = n

        self.logger.info("Candidate selector is loaded and ready to use.")

    def get_wiki_candidates_raw(self, query: str) -> List[str]:
        """
        Query Wikipedia with given text query
        :param query: text to query in Wikipedia
        :return list of links found for query
        """

        search_results = self.wikipedia.search(query, results=self.n)
        return [t.replace(' ', '_') for t in search_results]

    def get_entities(self, text: str) -> List[str]:
        """
        Get the list of named entities for given text.
        COMMENT: We should reinitialize this method for using another NER model
        :param text: str, text used for NER extraction
        :return list of str (entities found in text)
        """

        sentence = Sentence(text)
        self.tagger.predict(sentence)
        entities = []
        for entity in sentence.get_spans('ner'):
            entities.append(entity.text)
        return entities

    def get_wiki_candidates_NER(self, query: str) -> Set[str]:
        """
        Method to get the Wikipedia articles candidates with use of NER model
        :param query: str query claim
        :return set of links found for query
        """
        self.profiler.start_measure_local('NER_model')
        entities = self.get_entities(query)
        self.profiler.finish_measure_local()

        self.profiler.start_measure_local('wiki_search')
        search_results = self.get_wiki_candidates_raw(query)
        if not self.separate:
            search_results_en = self.get_wiki_candidates_raw(
                ' '.join(entities))
        else:
            search_results_en = []
            for e in entities:
                search_results_en += self.get_wiki_candidates_raw(e)
        self.profiler.finish_measure_local()

        return set(search_results + search_results_en)

    def get_wiki_texts(self, articles_names: Set[str]) -> Dict:
        """
        Method that gets Wikipedia texts for given articles names if exist
        :param articles_names set of names for Wikipedia articles.
        :return the dict with article names as keys and list of related sentences as values
        """
        result = {}
        for name in articles_names:
            try:
                page = self.wikipedia.page(name)
                result[name] = page.summary.replace('\n', ' ').split('. ')
            except Exception as e:
                self.logger.warning(
                    f"[Candidates picker] Page for id {name} is not found.")
        return result

    def get_candidates(self, claim: str) -> Dict:
        """
        The main method of the class that get the Wikipedia texts for related articles for given query
        :param claim: str query claim
        :return the dict with article names as keys and list of related sentences as values
        """

        candidates = self.get_wiki_candidates_NER(claim)
        self.logger.info(
            f"[Candidates picker] Candidates found: {', '.join(candidates)}")

        self.profiler.start_measure_local('wiki_texts')
        texts_dict = self.get_wiki_texts(candidates)
        self.profiler.finish_measure_local()

        return texts_dict
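
# Sketch of how the selector above might be driven. It assumes the surrounding
# project provides DEFAULT_LOGGER, DEFAULT_MEASURER, flair's SequenceTagger and
# the typing imports, and that the 'ner-fast' model can be downloaded.
selector = WikiCandidatesSelector(separate=True, n=3)
candidates = selector.get_candidates("The Amazon is the longest river in South America.")
for article, sentences in candidates.items():
    print(article, len(sentences), "sentences")
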
Example #17
            opera_garnier[0]['geometry']['location']['lng'])
# search = wikipedia.page('Rue Scribe Paris')
# content = search.content
opera_garnier_content = "La rue Scribe est une voie du 9e arrondissement de Paris.\n\n\n== Situation et accès ==\nLa rue Scribe est située dans le 9e arrondissement de Paris, elle commence boulevard des Capucines, se développe vers le nord-nord-est, croise la rue Auber, longe l'opéra Garnier et la place Charles-Garnier, rejoint la rue des Mathurins au niveau de la place Diaghilev ; elle est prolongée au-delà du boulevard Haussmann par la rue de Mogador.\nElle fait partie de la re-composition de la Chaussée-d'Antin entreprise au XIXe siècle et qui culmine avec les transformations de Paris sous le Second Empire et la construction de l'opéra Garnier.\nCe site est desservi par les stations de métro Opéra et Chaussée d'Antin - La Fayette et par la station de RER Auber sur la ligne \u2009\u200d.\n\n\n== Origine du nom ==\nLa rue Scribe est nommée d'après l'auteur dramatique Eugène Scribe (1791-1861), en raison de son voisinage avec l'opéra Garnier, par décret du 2 mars 1864.\n\n\n== Histoire ==\nLa rue Scribe a été ouverte par décret du 29 septembre 1860, entre le boulevard des Capucines et la rue des Mathurins. La largeur de la partie comprise entre le boulevard des Capucines et la rue Auber a été portée à 22 m ; le décret ne prévoyait que 20 m.\nLa partie comprise entre la rue des Mathurins et le boulevard Haussmann a été incorporée à la place Diaghilev.\n\n\n== Bâtiments remarquables et lieux de mémoire ==\n\nNo 1 : anciens locaux du Jockey Club de Paris.\nNos 2 à 6 : arrière du Grand Hôtel (désormais InterContinental Paris Le Grand), dont l'entrée est située sur la place de l'Opéra ; les façades et toitures font l'objet d'une inscription au titre des monuments historiques depuis le 22 août 1975 ; l'immeuble est en outre assujetti à la servitude d'architecture prévue par le décret du 29 septembre 1860 pour les abords de l'Opéra.\nNos 6, 11, 11 bis, 15 et 17 : les façades et les toitures sur rue des immeubles aux abords de l'Opéra font l'objet d'une inscription au titre des monuments historiques depuis le 30 décembre 1977 ; les immeubles sont en outre assujettis à la servitude d'architecture prévue par le décret du 29 septembre 1860 pour les abords de l'Opéra.\nNo 31 : domicile d'Angelo Mariani.\nLa bouche de métro de la station Opéra dessinée en 1900 par l'architecte Hector Guimard pour la Compagnie du chemin de fer métropolitain de Paris et mise en place en 1904 au coin de la rue Auber, fait l'objet d'une inscription au titre des monuments historiques depuis le 29 mai 1978, ainsi que d'un label « Patrimoine du XXe siècle ».\nLe théâtre national de l'Opéra, construit par Charles Garnier. La rue Scribe longe la façade ouest, du côté du pavillon de l’Empereur (appelé après la chute du Second Empire « pavillon du chef de l'État ») et de la bibliothèque-musée de l'Opéra.\n\n\n== Notes et références ==\n\n\n== Annexes ==\n\n\n=== Lien externe ===\n(fr) Nomenclature officielle Portail de Paris   Portail de la route"

formatted_address = geocode_result[0]['formatted_address']
location = (geocode_result[0]['geometry']['location']['lat'],
            geocode_result[0]['geometry']['location']['lng'])

print(formatted_address)

from mediawiki import MediaWiki

wikipedia = MediaWiki()
wikipedia.language = "fr"

search = wikipedia.page('Citée Parad')

content = search.content

search = wikipedia.search('washington')
print(search)
# search = wikipedia.opensearch('washington')
# print(search)
# p = wikipedia.page(search[0])
# print('toto')
# p.title
# p.summary
# p.categories
# p.images
# p.links