def send_request(file_name, title):
    """Fetch the raw wikitext of the Wikipedia page *title*, strip the wiki
    markup with dewiki and write the plain text to ``file_name + ".wiki"``.

    Follows ``#REDIRECT`` pages recursively.  Returns the plain-text content,
    or an empty string after printing a message on any API/format error.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    arguments = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'titles': title
    }
    response = requests.get(url, params=arguments)

    if response.status_code != 200:
        print("Error : waiting status code 200 and "
              "received '{}' as status code".format(response.status_code))
        return ""

    res = response.json()

    if not res.get('query'):
        print("Error : could not find 'query' in response.")
        return ""

    if not res['query'].get('pages'):
        print("Error : could not find 'pages' in response['query'].")
        return ""

    content = ""
    # Pages are keyed by page id; concatenate the latest revision of each.
    for page_content in res['query']['pages'].values():
        if not page_content.get('revisions'):
            print("Error : could not find 'revisions' in page_content.")
            return ""
        if not page_content['revisions'][0].get('*'):
            print("Error : could not find '*' in revisions.")
            return ""
        content += "{}\n".format(page_content['revisions'][0]['*'])

    result = dewiki.from_string(content)

    if str(result)[:9] == "#REDIRECT":
        # BUG FIX: propagate the redirected page's content to the caller
        # (the recursive result used to be discarded).
        redir = " ".join(str(result).split(' ')[1:]).strip()
        return send_request(file_name, redir)

    with open(file_name + ".wiki", 'w') as out_file:
        # BUG FIX: `result` is already plain text; it was previously run
        # through dewiki.from_string a second time.
        out_file.write(result)
    return result
Esempio n. 2
0
def request(word):
    """Search the English Wikipedia for *word* via the opensearch API, fetch
    the wikitext of the first hit, strip the markup and save it to
    '<word>.wiki' (spaces become underscores).

    Prints a message and returns on any failure.
    """
    if len(sys.argv) != 2:
        print('Enter one string to search!')
        return
    # NOTE: the original also tested type(sys.argv[1]) != str, which can
    # never be true — argv entries are always strings — so it was dropped.

    try:
        api_search = ('https://en.wikipedia.org/w/api.php'
                      '?action=opensearch&search=' + word)
        page_search = requests.get(api_search).json()
        article_name = page_search[1][0]
        api_parse = ('https://en.wikipedia.org/w/api.php?action=parse&page=' +
                     article_name +
                     '&prop=wikitext&format=json&contentmodel=wikitext')
        article_content = requests.get(api_parse).json()
        content = dewiki.from_string(article_content['parse']['wikitext']['*'])

        # BUG FIX: the file handle was never closed; use a context manager.
        with open(word.replace(' ', '_') + '.wiki', 'w+') as article_file:
            article_file.write(content)
    except (requests.RequestException, ValueError, LookupError, OSError):
        # Narrowed from a bare 'except:' (which also caught SystemExit).
        print('Something went wrong during the search.')
        return
def req_wiki():
    """Read a page title from argv[1], fetch its wikitext from the English
    Wikipedia parse API (following redirects) and write the de-wikified,
    blank-line-stripped text to '<title>.wiki'.

    Prints a hint instead of raising when the page does not exist; silently
    does nothing when argv is malformed (original behavior).
    """
    if len(sys.argv) != 2:
        return
    query = sys.argv[1]
    session = requests.Session()

    url = 'https://en.wikipedia.org/w/api.php'
    arguments = {
        "action": "parse",
        "page": query,
        "prop": "wikitext",
        "format": "json",
        "redirects": True,
        "formatversion": 2
    }

    req = session.get(url=url, params=arguments)
    data = req.json()
    try:
        # With formatversion=2 the wikitext is returned directly as a string.
        answer = dewiki.from_string(data['parse']['wikitext'])
    except KeyError:
        # 'parse' is absent when the page was not found.  Narrowed from a
        # bare 'except:' and the typo in the message was fixed.
        print("Probably this is misspelling. Try again.")
        return

    # Keep non-empty lines only, re-terminating each with a newline.
    res = "".join(line + "\n" for line in answer.split("\n") if line)
    with open(query + ".wiki", "w") as handle:
        handle.write(res)
def request_wiki(req):
    """Fetch the wikitext of the French Wikipedia page *req* (lower-cased),
    follow '#REDIRECT' hops recursively, and write the de-wikified text to
    '<sys.argv[1]>.wiki'.

    Exits the process on HTTP errors; prints a message when the page has no
    revisions (unknown word).
    """
    req = req.lower()
    r = requests.get(
        "https://fr.wikipedia.org/w/api.php?action=query&titles={}&prop=revisions&rvprop=content&format=json"
        .format(req))
    if r.status_code != 200:
        print("Erreur HTTP code :", r.status_code)
        # BUG FIX: signal failure to the shell (was exit(0), i.e. success).
        exit(1)
    data = r.json()
    pages = data["query"]["pages"]
    # Pages are keyed by numeric id; exactly one entry is expected here.
    page = next(iter(pages.values()))
    if not page.get("revisions"):
        print("Ce mot est inconnu par Wikipédia!")
        return
    # BUG FIX: guard against a missing '*' key (get() may return None,
    # which used to crash on .find()).
    txt = page['revisions'][0].get('*') or ""
    if "#REDIRECT" in txt:
        # "#REDIRECT[[Target]]" -> extract "Target" and retry.
        start = txt.find("[")
        end = txt.find("]")
        request_wiki(txt[(start + 2):end])
        return
    txt = dewiki.from_string(txt)
    # NOTE(review): the output name intentionally comes from sys.argv[1]
    # (the original query) so redirect hops keep writing the same file.
    with open("{}.wiki".format(sys.argv[1]), 'w') as file:
        file.write(txt)
def z(q):
    """Fetch the latest-revision wikitext of Wikipedia page *q*, strip the
    markup and write it to '<q-without-spaces>.wiki'.

    Errors are reported on stderr and the function returns without writing.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Use params= so the title is URL-encoded (the manual concatenation
    # broke on spaces and special characters).
    params = {
        "action": "query",
        "titles": q,
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "formatversion": 2,
    }

    r = req.get(url, params=params)
    if r.status_code != 200:
        print("Request failed", file=sys.stderr)
        return

    payload = r.json()  # parse once instead of up to three times
    try:
        content = payload["query"]["pages"][0]["revisions"][0]["content"]
    except (KeyError, IndexError):
        # A page object without revisions is either an invalid title
        # (it then carries 'invalidreason') or simply does not exist.
        if "invalidreason" in payload["query"]["pages"][0]:
            print("Invalid query:",
                  payload["query"]["pages"][0]["invalidreason"],
                  file=sys.stderr)
        else:
            print("Nothing found ):", file=sys.stderr)
        return
    except Exception as e:
        print(repr(e), file=sys.stderr)
        return

    if not isinstance(content, str) or not content:
        print("Nothing found ):", file=sys.stderr)  # parano
        return  # BUG FIX: do not fall through and write a bogus file

    out_file = "".join(q.split()) + ".wiki"
    with open(out_file, "w") as f:
        print(dewiki.from_string(content), file=f)
Esempio n. 6
0
def search_wiki(str):
    """Fetch and parse the French Wikipedia page named *str*, print the
    de-wikified text and save it to '<title>.wiki' (spaces -> underscores).

    Raises Exception on HTTP or API errors; exits with status 1 when the
    output file cannot be written.

    NOTE(review): the parameter name shadows builtins.str; kept unchanged
    for backward compatibility with keyword callers.
    """
    url = "https://fr.wikipedia.org/w/api.php"
    payload = {
        "action": "parse",
        "format": "json",
        "errorformat": "bc",
        "page": str,
        "prop": "wikitext",
        "formatversion": "latest"
    }
    response = requests.get(url, params=payload)
    if response.status_code != 200:
        # BUG FIX: the parameter shadows builtins.str, so calling
        # str(response.status_code) raised TypeError; use format() instead.
        raise Exception("Erreur HTTP: {}".format(response.status_code))
    body = response.json()  # parse the body once instead of four times
    if body.get('error'):
        raise Exception("Erreur: " + body.get('error')['info'])
    res = dewiki.from_string(body.get('parse')['wikitext'])
    print(res)
    try:
        with open(
                body.get('parse')['title'].replace(' ', '_') + ".wiki",
                "w") as file:
            file.write(res)
    except PermissionError as e:
        print(e)
        exit(1)
Esempio n. 7
0
    def get_abstract(self):
        """Return the page abstract: cached value first, then a DBpedia
        query, then the lead section of the page's wikitext (text before the
        first section heading, de-wikified)."""
        logger.debug(self.page)

        if self.abstract:
            return self.abstract

        resdb = self.dbq.query(self.qb)

        if resdb:
            try:
                self.abstract = resdb[0][0].value
            except Exception:
                # BUG FIX: the message had no '{page}' placeholder, so the
                # page name was never interpolated.  Also narrowed the bare
                # 'except:'.
                logger.error('Error getting abstract from DBpedia: {page}'.format(
                             page=self.page))

        if self.abstract is None:
            try:
                wikitext = wtp.get_wikitext_from_api(self.page,
                                                     lang=self.lang)
            except ValueError:
                wikitext = ''

            # Keep only the text before the first section heading.
            match = SECTIONREGEX.search(wikitext)
            if match:
                wikitext = wikitext[:match.start()]
            self.abstract = dewiki.from_string(wikitext).strip()

        return self.abstract
Esempio n. 8
0
def process_request(req):
    """Fetch the revision content of the French Wikipedia page *req*,
    de-wikify it and hand it to write_in_file(req, text).

    Exits with status 1 on HTTP errors; does nothing when the response
    carries no 'query'/'pages' structure (original behavior).
    """
    url = 'https://fr.wikipedia.org/w/api.php'
    payload = {
        'action': 'query',
        'titles': req,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    if r.status_code != 200:
        print("Erreur HTTP, code ", str(r.status_code))
        exit(1)

    res = r.json()
    pages = res.get('query', {}).get('pages')
    if not pages:
        return

    txt = ""
    for page in pages.values():
        revisions = page.get('revisions')
        if revisions and revisions[0].get('*'):
            txt += revisions[0]['*'] + "\n"

    # BUG FIX: dewiki.from_string used to run inside the loop, repeatedly
    # re-processing the already-converted accumulator; convert once.
    write_in_file(req, dewiki.from_string(txt))
def read_article(article_ID,
                 root_dir="/data/ClinicalTrialsWikipediaArticles/articles/"):
    """Load the stored article JSON for *article_ID* from *root_dir*, strip
    wiki markup, references and list markup, and return the plain text."""
    # BUG FIX: use a context manager so the handle is closed (the original
    # leaked it via json.load(open(...))).
    with open(root_dir + article_ID + ".json", "r") as fh:
        article = json.load(fh)
    text = dewiki.from_string(article["text"])
    text = remove_refs(text)
    text = remove_lists(text)
    return text
Esempio n. 10
0
def open_json_file_and_return_sentences(filename):
    """Read a MediaWiki API JSON dump from *filename*, extract the first
    page's latest-revision wikitext, strip the markup and return the list of
    sentences produced by the module-level sentence_detector."""
    # Context manager replaces the manual open/close pair.
    with open(filename) as json_data:
        data = json.load(json_data)

    # BUG FIX: dict.values() is a view in Python 3 and is not subscriptable;
    # materialise it before indexing.
    pages = list(data["query"]["pages"].values())
    wikitext = pages[0]["revisions"][0]["*"]

    plain_text = dewiki.from_string(wikitext)
    return sentence_detector.tokenize(plain_text)
Esempio n. 11
0
def search(searched_text: str):
    """Query the English Wikipedia API for *searched_text* and dump the
    de-wikified revision content to '<searched_text>.wiki' (spaces become
    underscores).  Prints 'Invalid query' when the request or lookup fails."""
    endpoint = 'https://en.wikipedia.org/w/api.php'
    query_params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'titles': searched_text,
        'format': 'json'
    }
    reply = requests.get(url=endpoint, params=query_params)
    if reply.status_code != 200:
        print('Invalid query')
        return
    pages = reply.json()['query']['pages']
    page_key = ''.join(pages.keys())
    if page_key == '-1':  # the API marks missing pages with id -1
        print('Invalid query')
        return
    target = searched_text.replace(' ', '_') + '.wiki'
    with open(target, 'w') as out:
        out.write(dewiki.from_string(pages[page_key]['revisions'][0]['*']))
Esempio n. 12
0
def request_api(argv):
    """Fetch the wikitext of the Wikipedia page named in argv[1] (redirects
    followed), de-wikify it and write it to '<page>.wiki' (spaces become
    underscores).  Prints an error message when argv is malformed or the
    lookup fails."""
    if len(argv) != 2:
        print(
            "Error\nPlease provide only one argument aside from the Python file"
        )
        return

    try:
        r = requests.get(
            'https://en.wikipedia.org/w/api.php?action=parse&page={}&prop=wikitext&formatversion=2&format=json&redirects&limit=1'
            .format(argv[1]))
        # r.json() decodes with the response charset; 'parse' is absent on
        # failed lookups and raises KeyError below.
        text = dewiki.from_string(r.json()['parse']['wikitext'])
        file_name = argv[1].replace(" ", "_")
        with open(file_name + '.wiki', 'w') as w:
            w.write(text)
    except (requests.RequestException, ValueError, KeyError, OSError):
        # Narrowed from a bare 'except:' that also swallowed SystemExit.
        print("Error\nPlease try again with a valid search result")
Esempio n. 13
0
def request_wikipedia(query):
    """Fetch the revision content of the French Wikipedia page *query*
    (redirects followed), strip the markup and hand the plain text to
    put_infile(query, text).  Prints a message when nothing is found."""
    request_params = {
        'action': 'query',
        'titles': query,
        'prop': 'revisions',
        'rvprop': 'content',
        'redirects': 1,
        # BUG FIX: the MediaWiki parameter is 'utf8'; 'utf-8' was silently
        # ignored by the API.
        'utf8': 1,
        'format': 'json'
    }
    try:
        response = requests.get('https://fr.wikipedia.org/w/api.php',
                                params=request_params)
        pages_dict = response.json()['query']['pages']
        first_page_id = list(pages_dict.keys())[0]
        response_text = pages_dict[first_page_id]['revisions'][0]['*']
        put_infile(query, dewiki.from_string(response_text))
    except KeyError:
        print("There is no results for this query.")
Esempio n. 14
0
def wiki(req):
    """Query the English Wikipedia API for the page(s) titled *req* and pass
    the de-wikified revision content to write_to_file(req, text).  Prints an
    error message on HTTP failure or when the term does not exist."""
    payload = {
        'action': 'query',
        'titles': req,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    answer = requests.get("https://en.wikipedia.org/w/api.php", params=payload)
    if answer.status_code != 200:
        print("request error")
        return
    body = answer.json()
    try:
        pages = body['query']['pages']
        joined = "".join(pages[key]['revisions'][0]['*'] + '\n'
                         for key in pages)
        write_to_file(req, dewiki.from_string(joined))
    except Exception as err:
        print("request error", err)
        print("The term might not exist")
        return
Esempio n. 15
0
def request_wikipeadia(page: str):
	"""Fetch the wikitext of Wikipedia page *page* (redirects followed) and
	return it with the wiki markup stripped.

	Raises requests.HTTPError on HTTP failure, json.JSONDecodeError on a
	malformed body, and Exception carrying the API's message when the API
	reports an error.
	"""
	URL = "https://en.wikipedia.org/w/api.php"

	PARAMS = {
		"action": "parse",
		"page": page,
		"prop": "wikitext",
		"format": "json",
		"redirects": "true"
	}

	# The original wrapped each of these calls in a try/except that only
	# re-raised the same exception — a no-op; the bare calls propagate the
	# identical errors.
	res = requests.get(url=URL, params=PARAMS)
	res.raise_for_status()
	data = json.loads(res.text)
	if data.get("error") is not None:
		raise Exception(data["error"]["info"])
	return dewiki.from_string(data["parse"]["wikitext"]["*"])
Esempio n. 16
0
def request_wikipedia(title, filename=None, loop=0):
    """Fetch the main-slot wikitext of Wikipedia page *title*, strip the
    markup and write it to *filename* (default '<title>.wiki').

    Follows a single '#REDIRECT' hop (guarded by *loop*) and trims everything
    up to the first '}}\\n' (lead templates/infoboxes).  Prints a message when
    no page is found.
    """
    try:
        # Normalise the title: trim, lowercase, collapse internal whitespace.
        title = title.strip().lower()
        while '  ' in title:
            title = title.replace('  ', ' ')
        parameters = {
            'action': 'query',
            'titles': title,
            'format': 'json',
            'prop': 'revisions',
            'rvslots': '*',
            'rvprop': 'timestamp|user|comment|content'
        }
        r = requests.get('https://en.wikipedia.org/w/api.php',
                         params=parameters)
        # Renamed local from 'json' to avoid shadowing the json module.
        payload = r.json()
        page = list(payload['query']['pages'].values())[0]
        text = page['revisions'][0]['slots']['main']['*']
        text = dewiki.from_string(text)
        if filename is None:
            filename = title + ".wiki"
        if loop == 0 and text.startswith("#REDIRECT"):
            request_wikipedia(text[9:], filename, 1)
        else:
            if '}}\n' in text:
                text = text[text.index('}}\n') + 3:]
            # BUG FIX: close the output file (the handle was leaked).
            with open(filename, "w") as out:
                out.write(text)
    except (KeyError, IndexError):
        print('No page found for "' + title + '"!')
Esempio n. 17
0
def parseXML(xmlFile):
    """Parse a MediaWiki XML dump, clean each article's wikitext and back the
    collected articles up every BACKUP_FREQUENCY pages.
    """
    with open(xmlFile, encoding='utf8') as fobj:
        xml = fobj.read()
    print("Start to parse")
    root = etree.fromstring(xml)
    articles = {}
    # Iterating an element yields its children directly; the deprecated
    # getchildren() calls were removed (gone in Python 3.9 ElementTree,
    # identical iteration order in lxml).
    for i, page in enumerate(root):
        print(i)
        title = None
        for elem in page:
            if elem.tag.endswith('title') and elem.text:
                title = elem.text
                print(title)
            if elem.tag.endswith('revision'):
                for child in elem:
                    if child.tag.endswith('text') and child.text:
                        text = dewiki.from_string(child.text)
                        text = clear_tables(text)
                        text = clear_refs(text)
                        text = clear_bottom_links(text)
                        text = clear_bottom_links(text, keyword='Вуламалли')
                        text = clear_bottom_links(text, keyword='Асăрхавсем')
                        # Group cleaned texts by article title.
                        articles.setdefault(title, []).append(text)
        if i != 0 and i % BACKUP_FREQUENCY == 0:
            backup(articles)
            articles = {}
    # Flush whatever remains after the last full batch.
    backup(articles)
Esempio n. 18
0
def get_data(title):
    """Fetch the wikitext of Wikipedia page *title* (redirects followed),
    clean it with clean_data() and write it to '<title>.wiki'.

    Returns True on success; on failure prints the API error and returns
    False.
    """
    session = requests.Session()
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "prop": "wikitext",
        "page": title,
        "format": "json",
        "redirects": True,
        "disabletoc": True,
        "disableeditsection": True,
        "formatversion": 2
    }
    try:
        response = session.get(url=url, params=params)
        data = response.json()
        result = dewiki.from_string(data["parse"]["wikitext"])
        result = clean_data(result)
        # BUG FIX: the format string had no '{filename}' placeholder, so
        # every page was written to the same literal file name.
        with open("{filename}.wiki".format(filename=title), "w+") as fd:
            fd.write(result)
    except KeyError:
        print(data['error']['info'])
        return False
    return True
Esempio n. 19
0
def text_cleaner(wiki_text):
    """Strip wiki markup from *wiki_text* and drop everything up to and
    including the first '}}' (the leading template/infobox block)."""
    plain = dewiki.from_string(wiki_text)
    pieces = plain.split('}}')
    return ''.join(pieces[1:])
import requests
import dewiki
import sys, json

if __name__ == '__main__':
    # CLI: fetch the revision content of the Wikipedia page named in argv[1],
    # strip the markup and save it to '<argv[1]>.wiki'.
    if len(sys.argv) == 2:
        r = requests.get(
            'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles={0}&format=json&rvprop=content'
            .format(sys.argv[1]))
        if r.status_code != 200:
            print("Erreur HTTP, code ", str(r.status_code))
            exit(1)
        res = r.json()
        ok = ''
        if res.get('query'):
            if res['query'].get('pages'):
                pages = res['query']['pages']
                # Concatenate the latest revision of every returned page.
                for pageId in pages:
                    if pages[pageId].get('revisions'):
                        ok += pages[pageId]['revisions'][0]['*']
        else:
            print('search didn\'t found valid informations')
        if len(ok) > 0:
            # BUG FIX: use a context manager so the file is closed even if
            # dewiki raises.
            with open(sys.argv[1].replace(" ", "_") + '.wiki', "w") as f:
                f.write(dewiki.from_string(ok))
    else:
        print('you must provide a query parameter as argument')

#https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=chocolatine&format=json
#https://en.wikipedia.org/?curid=4685852
def send_http_request(url, s_to_search):
    """Query the MediaWiki API at *url* for the revision content of the page
    titled *s_to_search* (redirects followed), strip the wiki markup and
    write the result to '<s_to_search>.wiki' (spaces become underscores).

    Exits the process with status 1 on a non-200 HTTP answer or an empty
    body.
    """
    payload = {
        'action': 'query',
        'titles': s_to_search,
        'prop': 'revisions',
        'rvprop': 'content',
        'redirects': 'true',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    if r.status_code != 200:
        # BUG FIX: requests exposes 'status_code'; 'r.status' raised
        # AttributeError before the message could be printed.
        print("http request status is %s" % r.status_code)
        print("TBD analyser le code retour: ce n est pas forcement une erreur")
        sys.exit(1)

    if not r.text:
        print("verifier r.text pas le temps %s" % r.text)
        sys.exit(1)

    res = r.json()
    # BUG FIX: 'txt' was only assigned inside the query/pages branch, so a
    # response without them crashed with NameError at the write below.
    txt = ""
    pages = res.get('query', {}).get('pages', {})
    for page in pages.values():
        revisions = page.get('revisions')
        if revisions and revisions[0].get('*'):
            txt += revisions[0]['*'] + "\n"
    # BUG FIX: dewiki used to run inside the loop on the growing
    # accumulator; convert once after collecting every revision.
    txt = dewiki.from_string(txt)

    filename = s_to_search.replace(" ", "_") + ".wiki"
    with open(filename, "w") as f:
        f.write(txt)
 def get_text(self, response):
     """Return the de-wikified latest-revision content of one page from a
     MediaWiki action=query JSON *response*.

     NOTE(review): popitem() removes an arbitrary page entry, so the
     caller's response dict is mutated and only one page is used — confirm
     this is intended for multi-page responses.
     """
     pages = response["query"]["pages"]
     _, page_info = pages.popitem()
     return dewiki.from_string(page_info['revisions'][0]["*"])
Esempio n. 23
0
def send_http_request(url, s_to_search):
    """Query the MediaWiki API at *url* for the revision content of the page
    titled *s_to_search* (no redirect following in this variant), strip the
    wiki markup and write the result to '<s_to_search>.wiki' (spaces become
    underscores).

    Exits the process with status 1 on a non-200 HTTP answer or an empty
    body.
    """
    payload = {
        'action': 'query',
        'titles': s_to_search,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    if r.status_code != 200:
        # BUG FIX: requests exposes 'status_code'; 'r.status' raised
        # AttributeError before the message could be printed.
        print("http request status is %s" % r.status_code)
        print("TBD analyser le code retour: ce n est pas forcement une erreur")
        sys.exit(1)

    if not r.text:
        print("verifier r.text pas le temps %s" % r.text)
        sys.exit(1)

    res = r.json()
    # BUG FIX: 'txt' was only assigned inside the query/pages branch, so a
    # response without them crashed with NameError at the write below.
    txt = ""
    pages = res.get('query', {}).get('pages', {})
    for page in pages.values():
        revisions = page.get('revisions')
        if revisions and revisions[0].get('*'):
            txt += revisions[0]['*'] + "\n"
    # BUG FIX: dewiki used to run inside the loop on the growing
    # accumulator; convert once after collecting every revision.
    txt = dewiki.from_string(txt)

    filename = s_to_search.replace(" ", "_") + ".wiki"
    f = open(filename, "w")
    f.write(txt)
    f.close()