def send_request(file_name, title):
    url = 'https://en.wikipedia.org/w/api.php'
    arguments = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'titles': title
    }
    response = requests.get(url, params=arguments)
    if response.status_code != 200:
        print("Error: expected status code 200 but "
              "received '{}'".format(response.status_code))
        return ""
    res = response.json()
    if not res.get('query'):
        print("Error: could not find 'query' in response.")
        return ""
    if not res['query'].get('pages'):
        print("Error: could not find 'pages' in response['query'].")
        return ""
    content = ""
    for page in res['query']['pages']:
        page_content = res['query']['pages'][page]
        if not page_content.get('revisions'):
            print("Error: could not find 'revisions' in page_content.")
            return ""
        if not page_content['revisions'][0].get('*'):
            print("Error: could not find '*' in revisions.")
            return ""
        content += "{}\n".format(page_content['revisions'][0]['*'])
    result = dewiki.from_string(content)
    if str(result)[:9] == "#REDIRECT":
        # Follow the redirect target and fetch that page instead.
        redir = " ".join(str(result).split(' ')[1:]).strip()
        return send_request(file_name, redir)
    with open(file_name + ".wiki", 'w') as out_file:
        out_file.write(result)
    return result
def request(word):
    if len(sys.argv) != 2:
        print('Enter one string to search!')
        return
    try:
        # Find the best-matching article title via the opensearch endpoint.
        api_search = ('https://en.wikipedia.org/w/api.php'
                      '?action=opensearch&search=' + word)
        wiki_search = requests.get(api_search)
        page_search = wiki_search.json()
        article_name = page_search[1][0]
        # Fetch and clean the wikitext of that article.
        api_parse = ('https://en.wikipedia.org/w/api.php?action=parse&page='
                     + article_name
                     + '&prop=wikitext&format=json&contentmodel=wikitext')
        article_content = requests.get(api_parse).json()
        content = article_content['parse']['wikitext']['*']
        content = dewiki.from_string(content)
        word = word.replace(' ', '_')
        with open(word + '.wiki', 'w') as article_file:
            article_file.write(content)
    except Exception:
        print('Something went wrong during the search.')
        return
def req_wiki():
    if len(sys.argv) == 2:
        query = sys.argv[1]
        session = requests.Session()
        url = 'https://en.wikipedia.org/w/api.php'
        arguments = {
            "action": "parse",
            "page": query,
            "prop": "wikitext",
            "format": "json",
            "redirects": True,
            "formatversion": 2
        }
        req = session.get(url=url, params=arguments)
        data = req.json()
        try:
            answer = dewiki.from_string(data['parse']['wikitext'])
            # Drop empty lines before writing the result.
            res = ""
            for line in answer.split("\n"):
                if len(line) > 0:
                    res += line
                    res += "\n"
            with open(query + ".wiki", "w") as handle:
                handle.write(res)
        except KeyError:
            print("The title is probably misspelled. Try again.")
def request_wiki(req):
    req = req.lower()
    r = requests.get(
        "https://fr.wikipedia.org/w/api.php?action=query&titles={}&prop=revisions&rvprop=content&format=json"
        .format(req))
    if r.status_code != 200:
        print("HTTP error, code:", r.status_code)
        exit(0)
    data = r.json()
    pages = data["query"]["pages"]
    id = ""
    for k in pages:
        id = k
    page = pages[id]
    txt = ""
    if page.get("revisions"):
        revision = page['revisions']
        txt = revision[0].get('*')
        if txt.find("#REDIRECT") != -1:
            # Extract the redirect target between the [[ ]] and query it instead.
            start = txt.find("[")
            end = txt.find("]")
            sub = txt[(start + 2):end]
            request_wiki(sub)
        else:
            txt = dewiki.from_string(txt)
            with open("{}.wiki".format(sys.argv[1]), 'w') as file:
                file.write(txt)
    else:
        print("This word is unknown to Wikipedia!")
def z(q):
    url = "https://en.wikipedia.org/w/api.php"
    param = ("?action=query&titles=" + q +
             "&prop=revisions&rvprop=content&format=json&formatversion=2")
    r = req.get(url + param)
    if r.status_code != 200:
        print("Request failed", file=sys.stderr)
        return
    try:
        content = r.json()["query"]["pages"][0]["revisions"][0]["content"]
    except KeyError:
        if "invalidreason" in r.json()["query"]["pages"][0].keys():
            print("Invalid query:",
                  r.json()["query"]["pages"][0]["invalidreason"],
                  file=sys.stderr)
        else:
            print("Nothing found ):", file=sys.stderr)
        return
    except Exception as e:
        print(repr(e), file=sys.stderr)
        return
    if type(content) != str or len(content) == 0:
        # Paranoid check: bail out if the content is missing or not a string.
        print("Nothing found ):", file=sys.stderr)
        return
    out_file = "".join(q.split()) + ".wiki"
    with open(out_file, "w") as f:
        print(dewiki.from_string(content), file=f)
def search_wiki(title):
    # Note: the parameter was originally named 'str', which shadowed the builtin
    # and broke the str(response.status_code) call below; renamed to 'title'.
    url = "https://fr.wikipedia.org/w/api.php"
    payload = {
        "action": "parse",
        "format": "json",
        "errorformat": "bc",
        "page": title,
        "prop": "wikitext",
        "formatversion": "latest"
    }
    response = requests.get(url, params=payload)
    if response.status_code != 200:
        raise Exception("HTTP error: " + str(response.status_code))
    elif response.json().get('error'):
        raise Exception("Error: " + response.json().get('error')['info'])
    else:
        res = dewiki.from_string(response.json().get('parse')['wikitext'])
        print(res)
        try:
            with open(
                    response.json().get('parse')['title'].replace(' ', '_') +
                    ".wiki", "w") as file:
                file.write(res)
        except PermissionError as e:
            print(e)
            exit(1)
def get_abstract(self):
    logger.debug(self.page)
    if self.abstract:
        return self.abstract
    else:
        resdb = self.dbq.query(self.qb)
        if resdb:
            try:
                self.abstract = resdb[0][0].value
            except Exception:
                logger.error('Error getting abstract from DBpedia: {page}'.format(
                    page=self.page))
        if self.abstract is None:
            try:
                wikitext = wtp.get_wikitext_from_api(self.page, lang=self.lang)
            except ValueError:
                wikitext = ''
            # Keep only the text before the first section heading (the lead).
            match = SECTIONREGEX.search(wikitext)
            if match:
                start = match.start()
                wikitext = wikitext[:start]
            self.abstract = dewiki.from_string(wikitext).strip()
        return self.abstract
def process_request(req):
    url = 'https://fr.wikipedia.org/w/api.php'
    payload = {
        'action': 'query',
        'titles': req,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    if r.status_code != 200:
        print("HTTP error, code ", str(r.status_code))
        exit(1)
    res = r.json()
    if res.get('query'):
        query = res['query']
        if query.get('pages'):
            pages = query['pages']
            txt = ""
            # Concatenate the latest revision content of every returned page.
            for pageid in pages:
                page = pages[pageid]
                if page.get('revisions'):
                    revisions = page['revisions']
                    if revisions[0].get('*'):
                        txt += revisions[0]['*'] + "\n"
            txt = dewiki.from_string(txt)
            write_in_file(req, txt)
def read_article(article_ID,
                 root_dir="/data/ClinicalTrialsWikipediaArticles/articles/"):
    with open(root_dir + article_ID + ".json", "r") as fd:
        article = json.load(fd)
    wikitext = article["text"]
    text = dewiki.from_string(wikitext)
    text = remove_refs(text)
    text = remove_lists(text)
    return text
def open_json_file_and_return_sentences(filename):
    with open(filename) as json_data:
        data = json.load(json_data)
    # Grab the latest revision's wikitext of the first (and only) returned page.
    data = list(data["query"]["pages"].values())[0]["revisions"][0]["*"]
    plain_text = dewiki.from_string(data)
    sentences = sentence_detector.tokenize(plain_text)
    return sentences
def search(searched_text: str):
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'titles': searched_text,
        'format': 'json'
    }
    response = requests.get(url=url, params=params)
    if response.status_code == 200:
        response_dict = response.json()
        pages = response_dict['query']['pages']
        pageid = ''.join(pages.keys())
        if pageid != '-1':
            with open(searched_text.replace(' ', '_') + '.wiki', 'w') as f:
                f.write(dewiki.from_string(pages[pageid]['revisions'][0]['*']))
            return
    print('Invalid query')
def request_api(argv):
    if len(argv) != 2:
        print("Error\nPlease provide only one argument aside from the Python file")
        return
    try:
        r = requests.get(
            'https://en.wikipedia.org/w/api.php?action=parse&page={}&prop=wikitext&formatversion=2&format=json&redirects&limit=1'
            .format(argv[1]))
        r = json.loads(r.text)
        r = dewiki.from_string(r['parse']['wikitext'])
        file_name = argv[1].replace(" ", "_")
        with open(file_name + '.wiki', 'w') as w:
            w.write(r)
    except Exception:
        print("Error\nPlease try again with a valid search result")
def request_wikipedia(query):
    try:
        request_params = {
            'action': 'query',
            'titles': query,
            'prop': 'revisions',
            'rvprop': 'content',
            'redirects': 1,
            'utf8': 1,
            'format': 'json'
        }
        response = requests.get('https://fr.wikipedia.org/w/api.php',
                                params=request_params)
        response_json = json.loads(response.text)
        pages_dict = response_json['query']['pages']
        pages_id = list(pages_dict.keys())[0]
        response_text = response_json['query']['pages'][pages_id]['revisions'][0]['*']
        clean_text = dewiki.from_string(response_text)
        put_infile(query, clean_text)
    except KeyError:
        print("There are no results for this query.")
def wiki(req):
    url = "https://en.wikipedia.org/w/api.php"
    payload = {
        'action': 'query',
        'titles': req,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    if r.status_code != 200:
        print("request error")
        return
    res = r.json()
    try:
        txt = ""
        for pageid in res['query']['pages']:
            txt += res['query']['pages'][pageid]['revisions'][0]['*'] + '\n'
        write_to_file(req, dewiki.from_string(txt))
    except Exception as e:
        print("request error", e)
        print("The term might not exist")
        return
def request_wikipeadia(page: str):
    URL = "https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "parse",
        "page": page,
        "prop": "wikitext",
        "format": "json",
        "redirects": "true"
    }
    try:
        res = requests.get(url=URL, params=PARAMS)
        res.raise_for_status()
    except requests.HTTPError as e:
        raise e
    try:
        data = json.loads(res.text)
    except json.decoder.JSONDecodeError as e:
        raise e
    if data.get("error") is not None:
        raise Exception(data["error"]["info"])
    return dewiki.from_string(data["parse"]["wikitext"]["*"])
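# A minimal usage sketch, not part of the original snippet: unlike most of the
# functions in this file, request_wikipeadia() returns the cleaned text instead
# of writing a file, so a hypothetical command-line caller might look like this.
if __name__ == '__main__':
    import sys
    if len(sys.argv) == 2:
        cleaned = request_wikipeadia(sys.argv[1])
        with open(sys.argv[1].replace(' ', '_') + '.wiki', 'w') as out:
            out.write(cleaned)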
def request_wikipedia(title, filename=None, loop=0):
    try:
        title = title.strip().lower()
        # Collapse runs of spaces into a single space.
        while '  ' in title:
            title = title.replace('  ', ' ')
        parameters = {
            'action': 'query',
            'titles': title,
            'format': 'json',
            'prop': 'revisions',
            'rvslots': '*',
            'rvprop': 'timestamp|user|comment|content'
        }
        r = requests.get('https://en.wikipedia.org/w/api.php',
                         params=parameters)
        data = r.json()
        query = data['query']
        pages = query['pages']
        page = list(pages.values())[0]
        revisions = page['revisions']
        revision = revisions[0]
        slots = revision['slots']
        slot = slots['main']
        text = slot['*']
        text = dewiki.from_string(text)
        if filename is None:
            filename = title + ".wiki"
        if loop == 0 and text.startswith("#REDIRECT"):
            # Follow a single level of redirection.
            request_wikipedia(text[9:], filename, 1)
        else:
            if '}}\n' in text:
                # Drop everything up to the end of the leading template/infobox.
                text = text[text.index('}}\n') + 3:]
            with open(filename, "w") as out:
                out.write(text)
    except (KeyError, IndexError):
        print('No page found for "' + title + '"!')
def parseXML(xmlFile):
    """Parse the XML dump."""
    with open(xmlFile, encoding='utf8') as fobj:
        xml = fobj.read()
    print("Start to parse")
    root = etree.fromstring(xml)
    articles = {}
    for i, page in enumerate(root):
        print(i)
        title = None
        for elem in page:
            if elem.tag.endswith('title') and elem.text:
                title = elem.text
                print(title)
            if elem.tag.endswith('revision'):
                for child in elem:
                    if child.tag.endswith('text') and child.text:
                        text = dewiki.from_string(child.text)
                        text = clear_tables(text)
                        text = clear_refs(text)
                        text = clear_bottom_links(text)
                        text = clear_bottom_links(text, keyword='Вуламалли')
                        text = clear_bottom_links(text, keyword='Асăрхавсем')
                        if title in articles:
                            articles[title].append(text)
                        else:
                            articles[title] = [text]
        # Periodically flush accumulated articles to disk.
        if i != 0 and i % BACKUP_FREQUENCY == 0:
            backup(articles)
            articles = {}
    backup(articles)
def get_data(title):
    session = requests.Session()
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "prop": "wikitext",
        "page": title,
        "format": "json",
        "redirects": True,
        "disabletoc": True,
        "disableeditsection": True,
        "formatversion": 2
    }
    try:
        response = session.get(url=url, params=params)
        data = response.json()
        result = dewiki.from_string(data["parse"]["wikitext"])
        result = clean_data(result)
        with open("{filename}.wiki".format(filename=title), "w+") as fd:
            fd.write(result)
    except KeyError:
        print(data['error']['info'])
        return False
    return True
def text_cleaner(wiki_text):
    unmarked_text = dewiki.from_string(wiki_text)  # Remove wiki markup
    # Drop everything up to the first '}}', i.e. the residue of the leading
    # template/infobox that dewiki leaves at the top of the text.
    clean_text = ''.join(unmarked_text.split('}}')[1:])
    return clean_text
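# A hedged usage sketch (sample_wikitext is an assumed example, not taken from
# the original code): raw wikitext goes in, plain text with the leading
# template residue stripped comes out.
sample_wikitext = "{{Short description|Example}}\n'''Example''' is an [[article]]."
print(text_cleaner(sample_wikitext))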
import requests
import dewiki
import sys, json

if __name__ == '__main__':
    if len(sys.argv) == 2:
        r = requests.get(
            'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles={0}&format=json&rvprop=content'
            .format(sys.argv[1]))
        if r.status_code != 200:
            print("HTTP error, code ", str(r.status_code))
            exit(1)
        res = r.json()
        ok = ''
        if res.get('query'):
            if res['query'].get('pages'):
                pages = res['query']['pages']
                for pageId in pages:
                    if pages[pageId].get('revisions'):
                        ok += pages[pageId]['revisions'][0]['*']
                    else:
                        print('search didn\'t find valid information')
        if len(ok) > 0:
            f = open(sys.argv[1].replace(" ", "_") + '.wiki', "w")
            f.write(dewiki.from_string(ok))
            f.close()
    else:
        print('you must provide a query parameter as argument')

# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=chocolatine&format=json
# https://en.wikipedia.org/?curid=4685852
def send_http_request(url, s_to_search):
    payload = {
        'action': 'query',
        'titles': s_to_search,
        'prop': 'revisions',
        'rvprop': 'content',
        'redirects': 'true',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    # Equivalent to:
    # curl "https://fr.wikipedia.org/w/api.php?action=query&titles=chocolatine&prop=revisions&rvprop=content&format=json" | jq '.'
    #
    # action=query        : query module
    # titles=chocolatine  : pages titled 'chocolatine'
    # prop=revisions      : include the revisions
    # rvprop=content      : read the revision content
    #
    # Without 'redirects', the result for 'chocolatine' only contains
    #   "*": "#REDIRECT[[Pain au chocolat]]"
    # so the payload adds 'redirects' to follow the redirection:
    # curl "https://fr.wikipedia.org/w/api.php?action=query&titles=chocolatine&prop=revisions&rvprop=content&redirects&format=json" | jq '.'
    if r.status_code == 200:
        # HTTP response OK
        if not r.text:
            print("empty response body: %s" % r.text)
            sys.exit(1)
        # Walk the JSON result.
        # https://github.com/ajouanna/Piscine_Django_Python/blob/master/D03/ex02/request_wikipedia.py
        res = r.json()
        if res.get('query'):
            query = res['query']
            if query.get('pages'):
                pages = query['pages']
                txt = ""
                for pageid in pages:
                    page = pages[pageid]
                    if page.get('revisions'):
                        revisions = page['revisions']
                        if revisions[0].get('*'):
                            txt += revisions[0]['*'] + "\n"
                txt = dewiki.from_string(txt)
                filename = s_to_search.replace(" ", "_") + ".wiki"
                with open(filename, "w") as f:
                    f.write(txt)
    else:
        # Depending on the case: either an error or a redirected page, etc.
        print("http request status is %s" % r.status_code)
        print("TBD: analyze the return code, it is not necessarily an error")
        sys.exit(1)
def get_text(self, response):
    pages = response["query"]["pages"]
    _, page_info = pages.popitem()
    return dewiki.from_string(page_info['revisions'][0]["*"])
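# A minimal sketch of the response shape get_text() expects (the values are
# illustrative placeholders, not real API output): the MediaWiki 'query' result
# keyed by page id, each page carrying its latest revision content under '*',
# matching the structure the other snippets in this file traverse.
example_response = {
    "query": {
        "pages": {
            "12345": {
                "title": "Example",
                "revisions": [{"*": "'''Example''' is an [[article]]."}],
            }
        }
    }
}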
def send_http_request(url, s_to_search):
    payload = {
        'action': 'query',
        'titles': s_to_search,
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    r = requests.get(url, params=payload)
    # TBD: check whether the page is a redirect using the response headers.
    if r.status_code == 200:
        # HTTP response OK
        if not r.text:
            print("empty response body: %s" % r.text)
            sys.exit(1)
        # Walk the JSON result.
        # https://github.com/ajouanna/Piscine_Django_Python/blob/master/D03/ex02/request_wikipedia.py
        res = r.json()
        if res.get('query'):
            query = res['query']
            if query.get('pages'):
                pages = query['pages']
                txt = ""
                for pageid in pages:
                    page = pages[pageid]
                    if page.get('revisions'):
                        revisions = page['revisions']
                        if revisions[0].get('*'):
                            txt += revisions[0]['*'] + "\n"
                txt = dewiki.from_string(txt)
                filename = s_to_search.replace(" ", "_") + ".wiki"
                with open(filename, "w") as f:
                    f.write(txt)
    else:
        # Depending on the case: either an error or a redirected page, etc.
        print("http request status is %s" % r.status_code)
        print("TBD: analyze the return code, it is not necessarily an error")
        sys.exit(1)