def get_bad_answers(answers, delay=50000, cutoff=-1):
    """
    Query Wikipedia for each answer and report (print) the answers that
    _don't_ correspond directly to page names.

    Args:
        answers: list of answer strings that should correspond to Wiki pages
        delay=50000: number of microseconds to wait between pages
        cutoff=-1: only process cutoff pages (useful for development)
    Returns:
        None
    """
    # wikipedia module does its own rate limiting, so let's use that
    wiki.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, delay))

    bad_results = []
    for i, answer in enumerate(answers):
        print(i, len(answers))
        res = wiki.search(answer, results=3)
        if res[0] != answer:
            print("bad result!", answer, res)
            bad_results.append((i, answer))
        if cutoff > 0 and i >= cutoff - 1:
            break
    print(bad_results)
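In the snippet above, delay is forwarded as the third positional argument of datetime.timedelta, i.e. microseconds, so the default of 50000 is a 50 ms minimum wait between requests. A hypothetical call (the answer strings are illustrative, and `wiki` is assumed to be `import wikipedia as wiki` as in the snippet):

# Illustrative usage (assumes `import wikipedia as wiki` and the function above):
answers = ["Python (programming language)", "Association football"]
get_bad_answers(answers, delay=50000, cutoff=10)  # prints (index, answer) pairs whose top search hit differs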
def checkTenPagesForTMNT():
    """Get 10 random wiki titles, check if any of them isTMNT().

    We grab the max allowed Wikipedia page titles (10) using wikipedia.random().
    If any title is in TMNT meter, return the title. Otherwise, return False.

    Args:
        None
    Returns:
        String or False: The TMNT-compliant title, or False if none found.
    """
    wikipedia.set_rate_limiting(True)
    try:
        titles = wikipedia.random(10)
    except wikipedia.exceptions.HTTPTimeoutError as e:
        print(f"Wikipedia timeout exception: {e}")
        time.sleep(TIMEOUT_BACKOFF)
        main()
    except wikipedia.exceptions.WikipediaException as e:
        print(f"Wikipedia exception: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Exception while fetching wiki titles: {e}")
        sys.exit(1)

    for title in titles:
        if words.isTMNT(title):
            return title
    return False
def get_wiki_titles() -> List:
    """
    Get a list of random titles from wikipedia. Change number in lib/constants.py

    Parameters:
        None.
    Returns:
        List of titles.
    """
    wikipedia.set_rate_limiting(True)
    try:
        titles = wikipedia.random(FETCH)
    except wikipedia.exceptions.HTTPTimeoutError as e:
        print(f"Wikipedia timeout exception: {e}")
        time.sleep(TIMEOUT_BACKOFF)
        sys.exit(1)
    except wikipedia.exceptions.WikipediaException as e:
        print(f"Wikipedia exception: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Exception while fetching wiki titles: {e}")
        sys.exit(1)

    return titles
def main():
    # ------ User interaction
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'template',
        help='file containing newline-delimited keywords, filename pattern: *_template*')
    parser.add_argument('domain',
                        help='wikipedia domain/project (en, de, fr, ...)')
    args = parser.parse_args()

    if not os.path.exists(args.template):
        raise ValueError(f'File "{args.template}" does not exist.')
    if not args.template.split('.')[0].endswith('_template'):
        raise ValueError('The keywords file must end with `_template`')

    # ------ Setup
    wiki.set_rate_limiting(rate_limit=True)
    wiki.set_lang(args.domain)

    # ------ Load keywords from file
    keywords = read_keywords(args.template)
    print(f'Loaded {len(keywords)} keywords')
    #print(keywords)
    #print(get_output_filename(args.template, args.domain))
    #exit()

    # ------ Fetch suggestions
    print('Fetching suggestions...')
    r = search(keywords, n_results=1)
    r = {key: value[0] for key, value in r.items()}

    # ------ Review
    print(f'Whole mapping:')
    pretty_print_dic(r)

    print('\nBelow is a more readable report (exact matches are not reported):')
    case_mismatch, no_match = {}, {}
    for k, val in r.items():
        if k == val:
            continue
        if k.lower() == val.lower():
            case_mismatch[k] = val
        elif k != val:
            no_match[k] = val
    print(f'Cases mismatch:')
    pretty_print_dic(case_mismatch)
    print(f'No match:')
    pretty_print_dic(no_match)

    # Write
    final_keywords = '\n'.join(r.values())
    outfile = get_output_filename(args.template, args.domain)
    header = get_header()
    with open(outfile, 'w') as f:
        f.write(header + final_keywords)
    print(f'Written in file {outfile}')
def get_random(ntotal, outpath):
    """ Collect ntotal of sentences from random wiki articles and write to outpath """
    outfile = file(outpath, "a")
    ct = 0
    ct_timeouts = 0
    while ct < ntotal:
        print "Collected " + str(ct)
        if ct_timeouts > TIMEOUT_CT_LIMIT:
            print "Timeouts in excess of " + str(TIMEOUT_CT_LIMIT)
            outfile.close()
            sys.exit(1)
        try:
            (sentences, addct) = get_random_page(ntotal - ct)
            for sentence in sentences:
                utf8sentence = sentence.encode('UTF-8')
                outfile.write(utf8sentence + "\n")
            ct += addct
        except wiki.exceptions.HTTPTimeoutError as e:
            ct_timeouts += 1
            print "Timeout error, enabling rate limit"
            wiki.set_rate_limiting(True)
        except wiki.exceptions.WikipediaException:
            # ignore others I suppose...
            pass
    outfile.close()
def get_wiki_info(self, wiki_url):
    import wikipedia
    import wikipediaapi
    import urllib.parse as urlparse

    wiki_text = ""
    url_segments = wiki_url.rpartition('/')
    if "en.wikipedia.org" == url_segments[2]:
        return wiki_text
    try:
        wikipedia.set_lang("en")
        wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, 50000))
        title_path = url_segments[2]
        title = urlparse.unquote(title_path)
        title = title.replace("_", " ")
        wikiWiki = wikipediaapi.Wikipedia('en')
        wiki_page = wikiWiki.page(title)
        #contents += pagePy.summary
        #wiki_page = wikipedia.page(title)
        wiki_text = wiki_page.summary
    except (IndexError, wikipedia.exceptions.WikipediaException):
        pass
    finally:
        return wiki_text
def searches(self, np, lang):
    page_dict = {}
    i = 1
    while i < 9:
        try:
            wikipedia.set_lang(lang)
            wikipedia.set_rate_limiting(True)
            docs = wikipedia.search(np)
            for doc in docs[:self.k_wiki_results]:
                if doc and lang + " " + doc not in page_dict:
                    try:
                        p = wikipedia.page(doc)
                        page_dict[lang + " " + doc] = p.summary
                    except Exception as e:
                        continue
        except (
            ConnectionResetError,
            ConnectionError,
            ConnectionAbortedError,
            ConnectionRefusedError,
        ):
            print("Connection reset error received! Trial #" + str(i))
            time.sleep(600 * i)
            i += 1
        else:
            break
    return page_dict
def __init__(self, lang):
    """ Initializes the wikiScraper class, given a language. """
    # set rate limit
    wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, 500000))
    # set language
    wikipedia.set_lang(lang)
def wikirandom(sents, boole, client, message, lang="it"):
    wikipedia.set_lang(lang)
    wikipedia.set_rate_limiting(rate_limit=True)
    random = wikipedia.random()
    result = wikipedia.summary(random, sentences=sents)
    if boole:
        return result
    else:
        result += "\n" + create_link(random, lang)
        return utils.get_config.sendMessage(client, message, result)
def generateShuffledPageLinks(self, page: str) -> List[str]:
    wikipedia.set_rate_limiting(True)
    try:
        wikiPage = wikipedia.WikipediaPage(title=page)
    except wikipedia.PageError:
        return "PageError"
    allLinks = [x for x in wikiPage.links if "(identifier)" not in x]
    random.shuffle(allLinks)
    return allLinks[0:25]
def get_related_titles(pages, rate_limit=True,
                       min_wait=datetime.timedelta(0, 0, 50000),
                       titles=set()):
    if rate_limit is True:
        wikipedia.set_rate_limiting(True, min_wait)
    for page in pages:
        wikipage = wikipedia.WikipediaPage(page)
        for title in wikipage.links:
            titles.add(title)
    return titles
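One caveat worth noting for the helper above (and the similar get_related_categories helper shown a little further down): the titles=set() default is evaluated once at function definition time, so repeated calls that rely on the default share and accumulate into the same set. Passing a fresh set explicitly avoids that, as in this hypothetical call:

# Illustrative usage: supply an explicit set so results don't accumulate across calls
titles = get_related_titles(["Python (programming language)"], titles=set())
print(len(titles))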
def post_new_puzzle(postgres):
    # find three hints for the crossword
    wikipedia.set_lang('simple')
    wikipedia.set_rate_limiting(True)
    crossword_hints = []
    attempts = 0
    while len(crossword_hints) < 3:
        assert attempts < 30, "Too many attempts"
        crossword_hints = get_new_words(crossword_hints)
        attempts += 1

    # sort the words, longest first
    crossword_hints = sorted(crossword_hints, key=lambda x: len(x['crossword']))
    crossword_hints.reverse()
    for word in crossword_hints:
        print_safe(word['topic'] + " / " + word['crossword'])
        print_safe(word['hint'] + "\n")

    matrix, solved = None, None
    while not matrix:
        # try to find a valid crossword
        matrix, solved, width, height = get_puzzle_matrix(crossword_hints)
        # if not, add words and random shuffle
        if not matrix:
            print_safe("Can't make crossword, retrying...")
            if random.random() < 0.33:
                assert attempts < 30, "Too many attempts"
                crossword_hints = get_new_words(crossword_hints)
                attempts += 1
            random.shuffle(crossword_hints)
    print_safe(matrix_to_string(solved))

    # make an image out of the matrix
    image_name = make_puzzle_image(matrix, 'puzzle.gif')

    # tweet the image and hints
    to_tweet = u"1: %s\n" % (crossword_hints[0]['hint'].decode('utf8'))
    to_tweet += u"2: %s\n" % (crossword_hints[1]['hint'].decode('utf8'))
    to_tweet += u"3: %s" % (crossword_hints[2]['hint'].decode('utf8'))
    twitter = connect_twitter()
    response = post_tweet(twitter, to_tweet, image_name)
    assert response['id'], "Failed posting puzzle to Twitter"

    # store the puzzle in the database
    db_insert(postgres, response['id'], crossword_hints, solved)
def get_related_categories(pages, rate_limit=True,
                           min_wait=datetime.timedelta(0, 0, 50000),
                           categories=set()):
    if rate_limit is True:
        wikipedia.set_rate_limiting(True, min_wait)
    for page in pages:
        wikipage = wikipedia.WikipediaPage(page)
        for category in wikipage.categories:
            categories.add(category)
    return categories
def main():
    seen_articles = load_seen()
    wikipedia.set_rate_limiting(True)
    in_the_news = wikipedia.page(pageid=IN_THE_NEWS_ID)
    soup = BeautifulSoup(in_the_news.html())
    headlines = (Headline(elem) for elem in soup.ul.find_all('li'))
    for headline in headlines:
        if headline.unseen(seen_articles):
            print(SEPARATOR)
            print(headline.article_text(seen_articles))
            seen_articles.extend(headline.unseen(seen_articles))
    save_seen(seen_articles)
def __init__(self):
    """
    Initialize queryer

    :param kb_name: name of KB
    :param kb_path: path to KB file (any format readable by OntoEmma)
    :param out_path: output path of enriched KB
    :param restrict_to_ids: set restricting KB ids to query
    """
    # Initialize wikipedia queryer
    wikipedia.set_lang('en')
    wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, 2000))
    self.wiki_summary = dict()
    self.wiki_dict = dict()
    self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
    self.STOP = set(stopwords.words('english'))
def main(sc=None):
    wikipedia.set_rate_limiting(True)
    multiLimitRange = range(MULTITHREAD_LIMIT)
    LANGUAGES.sort()
    for language in LANGUAGES:
        try:
            wikipedia.set_lang(language)
            allSet = sets.Set()
            for i in xrange(RANDOM_TIMES):
                try:
                    allSet.update(wikipedia.random(pages=10))
                except wikipedia.exceptions.DisambiguationError as e:
                    allSet.update(e.options)
                except Exception as e:
                    print >> sys.stderr, e
            readySet = sets.Set()
            readySet.update(allSet)
            getPages_threads = {
                i: threading.Thread(target=getPages, args=(language, allSet, readySet))
                for i in multiLimitRange
            }
            for i in multiLimitRange:
                try:
                    getPages_threads[i].start()
                except Exception as e:
                    print >> sys.stderr, e
            for i in multiLimitRange:
                try:
                    if getPages_threads[i].isAlive():
                        getPages_threads[i].join()
                except Exception as e:
                    print >> sys.stderr, e
            print "== %s: %d Done ==" % (language, len(allSet))
        except wikipedia.exceptions.PageError as e:
            print >> sys.stderr, e
        except requests.exceptions.ConnectionError as e:
            print >> sys.stderr, e
        except wikipedia.exceptions.WikipediaException as e:
            print >> sys.stderr, e
        except Exception as e:
            print >> sys.stderr, e
            pass
def wikiDatabase(num):
    '''
    Gets random articles from Wikipedia, creates dictionaries from them, then writes them to csv files
    @param num: how many articles to get
    @return:
    '''
    wikipedia.set_rate_limiting(True)
    wikipedia.set_lang("en")
    allWords = []
    fives = num // 500
    others = num % 500
    titles = []
    for i in range(fives):
        [titles.append(j) for j in wikipedia.random(500)]
    if others > 0:
        [titles.append(j) for j in wikipedia.random(others)]
    print("startingWIkidata, len = " + str(len(titles)))
    for page in titles:
        pageList = []
        try:
            words = wikipedia.page(page).content.split(' ')
        except (KeyError, ValueError, RuntimeError, ConnectionError,
                wikipedia.exceptions.WikipediaException,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError):
            pass
        for word in words:
            for i in word.split('\n'):
                pageList.append(i)
        allWords.append(pageList)
    print("completed " + str(len(allWords)) + " articles\nnow generating dict and writing to csv")
    for k in range(1, 10):
        dictionaryChain = makeDictFromListofLists(allWords, k)
        writeToCSV(dictionaryChain, 'wikipediaDatabase' + str(k) + '.csv')
def get_wiki_info(wiki_title):
    '''
    gets a wiki page and its links using the wikipedia library

    Args:
        wiki_title (str): word specifying the title of the wiki page

    Returns:
        (wikipedia object, list): the wiki page object itself and the links on the respective page
    '''
    wikipedia.set_rate_limiting(True)
    try:
        wiki_page = wikipedia.page(title=wiki_title)
    except requests.Timeout as e:
        time.sleep(1)
        wiki_page = wikipedia.page(title=wiki_title)
    links = [link.lower() for link in wiki_page.links]
    return (wiki_page, links)
def findSentencesFromApi(self):
    global debug
    # at this point, any remaining words in the words dict don't have a match in the tatoeba file
    # so need to find a matching sentence elsewhere
    self.refineNewWordsDict()  # remove words that were found in Tatoeba file
    self.filterNonwords()  # remove most non-words
    if debug == True:
        print(str(len(self.newWords)) + ' words remain after checking Tatoeba files.')
    wikipedia.set_rate_limiting(True)
    wikipedia.set_lang(self.language[:2])
    for key in self.newWords:
        sentence = self.getOneSentenceForWord(key)
        if sentence != '':
            self.newWords[key] -= 1
            # get a translation
            translatedSentence = self.translateSentence(sentence)
            with open(self.clozefile, 'a+') as cf:
                cf.write('"' + sentence + '"' + '\t' + '"' + translatedSentence + '"' + '\t' + key + '\n')
            self.sentencesWritten += 1
def get_articles(language, no_words, max_no_articles, search, **kwargs):
    """ Retrieve articles from Wikipedia """
    wikipedia.set_rate_limiting(True)  # be polite
    wikipedia.set_lang(language)

    if search is not None:
        titles = wikipedia.search(search, results=max_no_articles)
    else:
        titles = wikipedia.random(pages=max_no_articles)

    articles = []
    current_no_words = 0
    for title in titles:
        print("INFO: loading {}".format(title))
        page = wikipedia.page(title=title)
        content = page.content
        article_no_words = len(content.split())
        current_no_words += article_no_words
        print("INFO: article contains {} words".format(article_no_words))
        articles.append((title, content))
        if current_no_words >= no_words:
            break
    return articles
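A hypothetical invocation of get_articles above; the language, word budget, and search query are illustrative values, not defaults from the source:

# Illustrative call (argument values are assumptions):
corpus = get_articles("en", no_words=5000, max_no_articles=10, search="machine learning")
for title, content in corpus:
    print(title, len(content.split()))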
                try:
                    f.write((wiki.page(title).content + "\n").encode("UTF-8"))
                except Exception as e:
                    print(e)
    else:
        print("Skipped raw wiki dataset download")


def make_dataset():
    if isdir("dataset"):
        rmtree("dataset")
    mkdir("dataset")
    for lang in language_names:
        raw_lang_file_name = path(WIKIDATASET_DIR_RAW, lang + "_raw")
        with open(raw_lang_file_name, "r") as f:
            raw_text = f.read()
        sanitized = sanitize_text(raw_text)
        with open(path("dataset", lang), "w") as wf:
            wf.write(sanitized)


if __name__ == "__main__":
    wiki.set_rate_limiting(True, timedelta(seconds=0.01))
    # download_raw_wiki_dataset()
    make_dataset()
    print("DONE")
                                     link)
    if DEBUG_TIMING:
        print(f'input to dataframe took {time()-temp_time} seconds')
        temp_time = time()
    output_dataframe_to_file(df)
    if DEBUG_TIMING:
        print(f'output to dataframe took {time()-temp_time} seconds')
        print(f'total parse from list with header time is {time()-start} seconds')


# start the main function/call parsing on all the timelines
# be nice and don't request too much
wikipedia.set_rate_limiting(True)
wikipedia.set_lang('en')

print("Starting to parse the websites")

# Put list of wikipedia timeline pages followed by the parse here:
url = "https://en.wikipedia.org/wiki/Timeline_of_ancient_history"
parse_history_from_list_url(url)

url = "https://en.wikipedia.org/wiki/Timeline_of_the_Middle_Ages"
parse_history_from_table_url(url)

url = r"https://en.wikipedia.org/wiki/16th_century#Events"
parse_history_from_list_url(url)
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fetch snippets of language-specific text from random pages "
        "of one or many wikipedias",
    )
    parser.add_argument(
        "--langs",
        type=str,
        nargs="+",
        required=False,
        help="one or more languages for which to fetch wiki snippets "
        "as ISO 639-1 language codes",
    )
    parser.add_argument(
        "--langs_file",
        type=str,
        required=False,
        help="path to text file on disk containing languages for which to fetch "
        "wiki snippets as ISO 639-1 language codes, one per line",
    )
    parser.add_argument(
        "--skip_langs",
        type=str,
        nargs="+",
        required=False,
        default=[],
        help="one or more languages for which to *skip* fetching wiki snippets "
        "as ISO 639-1 language codes, as a convenience",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        required=False,
        help="path to directory on disk to which wiki snippets will be saved",
    )
    parser.add_argument(
        "--n_snippets",
        "-n",
        type=int,
        default=25,
        help="number of text snippets to save per language",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=3600,
        help="maximum number of seconds to spend fetching snippets, per language",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["skip", "append", "overwrite"],
        default="skip",
        help="",
    )
    parser.add_argument(
        "--featured",
        action="store_true",
        default=False,
        help="if set, will attempt to get 'featured articles' for each language "
        "before falling back to random pages",
    )
    parser.add_argument(
        "--good",
        action="store_true",
        default=False,
        help="if set, will attempt to get 'good articles' for each language "
        "before falling back to random pages",
    )
    args = parser.parse_args(argv)

    if not bool(args.langs) ^ bool(args.langs_file):
        raise ValueError("either `langs` or `langs_file` must be specified")
    if args.langs_file:
        with io.open(args.langs_file, mode="rt", encoding="utf-8") as f:
            langs = [line.strip() for line in f]
    else:
        langs = args.langs
    outdir = os.path.realpath(os.path.expanduser(args.outdir)) if args.outdir else None
    featured = (
        get_all_lang_links("Category:Featured_articles", "en") if args.featured else None
    )
    good = get_all_lang_links("Category:Good_articles", "en") if args.good else None

    wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(milliseconds=10))
    wiki_langs = wikipedia.languages()

    for i, lang in enumerate(langs):
        if lang not in wiki_langs:
            raise ValueError(
                "lang='{}' is invalid; available langs are\n{}".format(
                    lang, sorted(wiki_langs.keys())
                )
            )
        wikilang = wiki_langs[lang]
        if lang in args.skip_langs:
            logging.info(
                "skipping lang %s '%s' (%s/%s) ...",
                wikilang, lang, i + 1, len(langs),
            )
            continue

        n_snippets = args.n_snippets
        mode = "wt"
        if outdir:
            fname = os.path.join(outdir, lang + ".txt")
            if os.path.isfile(fname):
                if args.mode == "skip":
                    logging.info(
                        "snippets for lang %s '%s' (%s/%s) already fetched, skipping ...",
                        wikilang, lang, i + 1, len(langs),
                    )
                    continue
                else:
                    with io.open(fname, mode="rt", encoding="utf-8") as f:
                        n_snippets_existing = sum(1 for _ in f)
                    if args.mode == "append":
                        mode = "at"
                        n_snippets = max(n_snippets - n_snippets_existing, 0)
                    if n_snippets == 0:
                        logging.info(
                            "snippets for lang %s '%s' (%s/%s) already fetched, skipping ...",
                            wikilang, lang, i + 1, len(langs),
                        )
                        continue
                    logging.info(
                        "fetching snippets for lang %s '%s' (%s/%s) "
                        "to %s %s existing snippets ...",
                        wikilang, lang, i + 1, len(langs),
                        args.mode, n_snippets_existing,
                    )
            else:
                logging.info(
                    "fetching snippets for lang %s '%s' (%s/%s) ...",
                    wiki_langs[lang], lang, i + 1, len(langs),
                )
        else:
            logging.info(
                "fetching snippets for lang %s '%s' (%s/%s) ...",
                wiki_langs[lang], lang, i + 1, len(langs),
            )

        snippets = get_snippets(lang, n_snippets, featured, good, args.timeout)

        if outdir:
            with io.open(fname, mode=mode, encoding="utf-8") as f:
                for snippet in snippets:
                    f.write(snippet + "\n")
            logging.info("saved %s %s snippets to %s", len(snippets), lang, fname)
        else:
            logging.info(
                "fetched %s %s snippets but did not save to disk:\n%s\n...",
                len(snippets), lang, snippets[:3],
            )
def __init__(self):
    wikipedia.set_rate_limiting(True)
from html import XHTML
import pycountry
import wikipedia
import gettext
import os

names = []
wikipedia.set_rate_limiting(rate_limit=1)
done_languages = ['es', 'en', 'it', 'de', 'zh', 'fr', 'nl', 'aa', 'ab', 'af', 'ak',
                  'am', 'an', 'ar', 'as', 'av', 'ae', 'ay', 'az', 'ba', 'bm', 'be',
                  'bn', 'bi', 'bo', 'bs', 'br', 'bg', 'ca', 'cs', 'ch', 'ce', 'cu',
                  'cv', 'kw', 'co', 'cr', 'cy', 'da', 'de', 'dv', 'dz', 'el', 'en',
                  'eo', 'et', 'eu', 'ee', 'fo', 'fa', 'fj', 'fi', 'fr', 'fy', 'ff',
                  'gd', 'ga', 'gl', 'gv', 'gn', 'gu', 'ht', 'ha', 'sh', 'he', 'hz',
                  'hi', 'ho', 'hr', 'hu', 'hy', 'ig', 'io', 'ii', 'iu', 'ie', 'ia',
                  'id', 'ik', 'is', 'it', 'jv', 'ja', 'kl', 'kn', 'ks', 'ka', 'kr',
                  'kk', 'km', 'ki', 'rw', 'ky', 'kv', 'kg', 'ko', 'kj', 'ku', 'lo',
                  'la', 'lv', 'li', 'ln', 'lt', 'lb', 'lu', 'lg', 'mh', 'ml', 'mr',
                  'mk', 'mg', 'mt', 'mn', 'mi', 'ms']

# lang_list = wikipedia.languages()
# for key in lang_list.iteritems():
#     name = key[0].encode('utf-8')
#     names.append(name)
# print names

lang_list = [lang.iso639_1_code.encode('utf-8') for lang in pycountry.languages
             if hasattr(lang, 'iso639_1_code')]
print lang_list

for lang in lang_list:
    if lang in done_languages:
        continue
    else:
        newpath = "C:\Users\s158079\Downloads\Study_materials\Quartile 4\Adaptive web\Project\\final_lang\countries-" + str(lang).lower() + "\\"
        print str(lang).lower()
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        wikipedia.set_lang(lang)
        try:
            lg = gettext.translation('iso3166', pycountry.LOCALES_DIR, languages=[lang])
def set_rate_limiting(self, rate):
    wiki.set_rate_limiting(rate)
import sys
import io
import re
import wikipedia
from tqdm import tqdm
import json

year_re = re.compile(r"\b[12][0-9]{3}\b")

wikipedia.set_lang("en")
wikipedia.set_rate_limiting(False)

in_fname = sys.argv[1]
out_fname = sys.argv[2]

in_f = io.open(in_fname)
out_f = open(out_fname, "wb")

#list of all PERSON NE
#nes = ["Dorfer", "Mr. J. C. Rastrick","Masaaki Shirakawa"]
#example of summary with var1 and var2 (for loop needed)

found_nes = {}


def page_years(page_title, depth=0):
    if depth > 5:
        return []
    try:
        page = wikipedia.page(page_title).content
        return year_re.findall(page)
from sys import exit, argv
from os import mkdir, listdir
from datetime import datetime
from urllib.request import unquote

try:
    import wikipedia
except ImportError:
    print('Please install the wikipedia\'s API')

# ==============
# initialization
# ==============
wikipedia.set_lang('fr')
wikipedia.set_rate_limiting(True)  # Avoid spamming the server

ADQ = set(open('all_titles_article_de_qualite.txt').read().lower().split('\n'))
BA = set(open('all_titles_bon_article.txt').read().lower().split('\n'))
downloaded = set([e.lower() for e in listdir('articles/normal')]).union(ADQ).union(BA)

# If a parameter is given to the script,
# bound equals that parameter, else 100 000
bound = int(argv[1]) if len(argv) > 1 else 100000

# ====
# core
# ====
while len(downloaded) <= bound:
InHGame = False

cities = "bath birmingham bradford brighton bristol cambridge canterbury carlisle chester chichester coventry derby durham ely exeter gloucester hereford kingston hull lancaster leeds leicester lichfield lincoln liverpool london manchester newcastle norwich nottingham oxford peterborough plymouth portsmouth preston ripon salford salisbury sheffield southampton stalbans stoke-on-trent sunderland truro wakefield westminster winchester wolverhampton worcester york"
location_city = cities.split(" ")
#location_city.lower()

conditions = {
    "Showers": "shower.png",
    "Partly Cloudy": "parlycloudy.png",
    "Sunny": "sunny.png",
    "Cloudy": "cloudy.png",
    "Breezy": "breezy.png",
    "snow": "snow.png"
}

wiki_wiki = wikipediaapi.Wikipedia('en')
wikipedia.set_rate_limiting(True, min_wait=timedelta(0, 0, 50000))

#Token for the bot to be able to login to discord
TOKEN = "DISCORD DEV TOKEN HERE"
client = discord.Client()

#Opens and wipes the text file which stores the context of the current response
f = open("context.txt", "r+")
f.write("")
f.truncate()

#stores the current date and time
CDT = datetime.now()

#A list and a dictionary, storing a list of greetings to use, and some responses to a set of preset inputs.
greetings = ["hello", "hi", "howdy", "whassup", "yo", "hey"]
if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    urllib3_logger = logging.getLogger('urllib3')
    urllib3_logger.setLevel(logging.CRITICAL)
    logging.info("running %s" % " ".join(sys.argv))
    program = os.path.basename(sys.argv[0])

    if len(sys.argv) < 3:
        print "Usage: python replace.py infile outfile mode(default = 0)"
        sys.exit(1)

    infile = sys.argv[1]
    outfile = sys.argv[2]
    if len(sys.argv) == 4:
        mode = int(sys.argv[3])
    else:
        mode = 0

    wikipedia.set_rate_limiting(1)

    from replace import replacer
    rep = replacer(infile, debug_mode)

    if not mode:
        logging.info("Usage: replacer")
        rep.replace(outfile)
    else:
        logging.info("Usage: confirm")
        rep.confirm(outfile)
def get_wiki_data(answers, pkl_file, delay=50000):
    """
    Download pages from Wikipedia and store them in a binary pickle file.

    The pickle file is a list of dicts, where each dict has the following keys:
        'title' 'summary' 'content' 'links' 'categories' 'sections'

    This should be able to recover from crashes. Just run it again with the
    same arguments, and it'll pick up where it left off.

    Args:
        answers: list of answer strings that should correspond to Wiki pages
        pkl_file: file used to store pickled output
        delay=50000: number of microseconds to wait between pages
    Returns:
        None
    """
    # function to try/except a page property (e.g. summary, links)
    def try_page_property(dict_page, page, attr):
        try:
            tmp = getattr(page, attr)
            dict_page[attr] = tmp
        except KeyError:
            # sometimes links doesn't work
            dict_page[attr] = None

    # wikipedia module does its own rate limiting, so let's use that
    wiki.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, delay))

    # try to read which pages we've already done (i.e. if we're restarting)
    pages = []
    if os.path.isfile(pkl_file):
        with open(pkl_file, 'rb') as f:
            try:
                pages = pickle.load(f)
            except Exception:
                # if nothing gets loaded/bad file descriptor
                pass

    # find pages we need to do
    pages_todo = set(answers)
    for page in pages:
        update_keys = (key in page.keys() for key in PAGE_KEYS)
        if all(update_keys):  # if we have entries for all data keys
            pages_todo.discard(page['title'])
    pages_todo = list(pages_todo)

    # download wiki pages
    for i, title in enumerate(pages_todo):
        print(u"page {} of {} (title={})".format(i + 1, len(pages_todo), title))
        try:
            #XXX even though we checked for bad pages, this sometimes raises
            # an exception. For example, we could do
            # title=u"Pascal (programming language)", but still get a PageError
            # saying Page id "pascal programming langauge" does not match any
            # pages. We can manually look up the Mediawiki pageid (using their API)
            # and look up the page using that
            if title in ANSWER_TO_PAGEID.keys():
                page = wiki.page(pageid=ANSWER_TO_PAGEID[title])
            else:
                page = wiki.page(title=title, auto_suggest=False)

            # get the page data and store it in a dict
            dict_page = dict()
            for key in PAGE_KEYS:
                try_page_property(dict_page, page, key)
            print(page.title)
            pages.append(dict_page)

            # dumping each time is safe, but slow
            # TODO: how safe is this?
            pickle.dump(pages, open(pkl_file, 'wb'), pickle.HIGHEST_PROTOCOL)
        except wiki.exceptions.WikipediaException as e:
            print(u"wiki issues with page (title={})".format(title))
            print(e)
        except Exception as e:
            print("something else bad has happened!")
            print(e)
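The function above depends on module-level names (wiki as the wikipedia import, PAGE_KEYS, ANSWER_TO_PAGEID) that are defined elsewhere in its project. A minimal sketch of how those globals might look and how the function could be called; the values below are illustrative assumptions, with PAGE_KEYS mirroring the keys listed in the docstring:

# Hypothetical globals assumed by get_wiki_data (not taken from the source project):
PAGE_KEYS = ['title', 'summary', 'content', 'links', 'categories', 'sections']
ANSWER_TO_PAGEID = {}  # optional manual overrides: answer title -> MediaWiki pageid

# Illustrative call: downloads the listed pages into wiki_pages.pkl, resuming if rerun
get_wiki_data(["Python (programming language)", "Alan Turing"], "wiki_pages.pkl", delay=50000)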
from sklearn.naive_bayes import MultinomialNB, BernoulliNB  # Naive Bayes model classes
from sklearn.feature_extraction.text import CountVectorizer
import wikipedia as wiki  # wikipedia api wrapper
wiki.set_rate_limiting(True)  # might actually speed things up.
from sklearn.cross_validation import train_test_split  # split the data you have into training and test sets


def fetch_wiki(title, lang):
    """
    Return the regular English or simple versions of an article.

    Simple versions are far shorter than the regular ones, so only pull the
    summary of regular articles. In case of an error, just return None instead
    of crashing the program.
    """
    assert lang in ("en", "simple"), "Language must be 'en' or 'simple'"
    try:
        wiki.set_lang(lang)
        page = wiki.page(title)
        # print page.title  # used for testing the function
        return (page.summary, 1) if lang == "en" else (page.content, 0)  # 1: english, 0: simple
    except:
        # NOTE: you should never have a blind `except` like this. but, hey, we're hacking.
        print " - error with " + lang + " page for: " + title
        return None


articles = [
    "General Relativity",
    "Bayes Theorem",
def __init__(self):
    wikipedia.set_rate_limiting(True)
    filename = "book/pages_used.txt"
    self.bib = open(filename, "w")
import wikipedia
import codecs
import os
import datetime

topic = "climate change"

wikipedia.set_lang("en")
wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(seconds=1))


def writepage(term, direc):
    try:
        termpage = wikipedia.page(term)
        content = termpage.content
        f = codecs.open(direc + "//" + term + ".txt", "w", "utf-8")
        f.write(content)
        f.close()
        return termpage.links
    except Exception as e:
        print(e)
        return []


topics = ['climate change']

for topic in topics:
    results = wikipedia.search(topic)
    print(results)
    links = []
    #os.makedirs(topic)
    for term in results:
        newlink = writepage(term, topic)
        links = links + newlink
    for link in set(links):
        nl = writepage(link, topic)
from __future__ import print_function
import wikipedia
import datetime, os, sys
from time import time

wikipedia.set_rate_limiting(rate_limit=True, min_wait=datetime.timedelta(0, 0, 1e6))
wikipedia.set_lang("en")


# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


cat_orig = [
    "Artificial intelligence", "Companies", "Computer architecture",
    "Computer model", "Computer engineering", "Data", "Free software",
    "Human-computer interaction", "Internet", "Mobile web", "Networks",
    "Operating systems", "Programming", "Software", "Automation",
    "Unsolved problems in computer science", "Computer security",
    "Computing and society", "Embedded systems", "Multimedia", "Languages",
    "Information technology", "Information systems", "Platforms",
    "Product lifecycle management", "Real-time computing",
    "Software engineering", "Computer science", "Classes of computers",
    "Machine Learning", "Statistics", "Mathematics", "Supercomputer",
    "Quantum physics", "Quantum computer", "compiler",
    "Central processing unit", "Graphics processing unit", "Personal Computer",
import wikipedia
from PyDictionary import PyDictionary
import codecs
import re
from datetime import datetime, date, time

#---------------------------------------------
myflash = {}
pageerror = []

wikipedia.set_rate_limiting(False)
#---------------------------------------------

start = datetime.now()

with codecs.open('insource.txt', 'r', encoding="utf-8") as f:
    for line in f:
        try:
            term = str(wikipedia.page(line, auto_suggest=False))
            term = term[16:-2]
            definition = str(wikipedia.WikipediaPage(line).content).split('\n')
            definition = definition[0]
            if definition == '':
                pageerror.append(line)
            else:
                myflash[term] = definition
        except (wikipedia.exceptions.PageError, KeyError):
            try:
                line = re.sub('^[^a-zA-z]*|[^a-zA-Z]*$', '', line)
                term = str(wikipedia.page(line, auto_suggest=False))
                term = term[16:-2]
# Use try_again_dec with the main methods of wikipedia
for name in ['geosearch', 'languages', 'page', 'search', 'suggest', 'summary']:
    setattr(
        wikipedia, name,
        try_again_dec(
            wikipedia.exceptions.HTTPTimeoutError,
            wikipedia.exceptions.RedirectError,
            requests.exceptions.RequestException,
            retry=3
        )(getattr(wikipedia, name))
    )

# wikipedia configuration
wikipedia.set_lang('en')
wikipedia.set_rate_limiting(True)
wikipedia.set_user_agent(
    'Newsparser NE comparison (http://newsparser704.pythonanywhere.com/)'
)


class WikiData:
    pages = 'pages'
    ne_mapping = 'ne_mapping'
    nlp = nlp

    def __init__(self):
        self.mongo_client = MongoClient()
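The last snippet wraps the library's top-level functions with a try_again_dec helper that is not shown here. A minimal sketch of a retry decorator compatible with the call form try_again_dec(ExcA, ExcB, ..., retry=3)(func); this is a hypothetical reconstruction under that assumption, not the project's actual implementation:

import functools


def try_again_dec(*exception_types, retry=3):
    """Retry the wrapped callable up to `retry` times on the given exception types."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(retry):
                try:
                    return func(*args, **kwargs)
                except exception_types:
                    if attempt == retry - 1:
                        raise  # out of retries: propagate the last error
        return wrapper
    return decorator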