Example #1
def get_bad_answers(answers, delay=50000, cutoff=-1):
    """
    Query Wikipedia for each answer, and print a list of answers that
    _don't_ correspond directly to page names.

    Args:
        answers: list of answer strings that should correspond to Wiki pages

        delay=50000: number of microseconds to wait between pages
        cutoff=-1: only process cutoff pages (useful for development)

    Returns:
        None
    """
    # wikipedia module does its own rate limiting, so let's use that
    wiki.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, delay))

    bad_results = []
    for i, answer in enumerate(answers):
        print(i, len(answers))
        res = wiki.search(answer, results=3)
        if res[0] != answer:
            print("bad result!", answer, res)
            bad_results.append((i, answer))

        if cutoff > 0 and i >= cutoff - 1:
            break

    print(bad_results)
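
A minimal calling sketch for get_bad_answers, assuming the module-level imports the body implies (import wikipedia as wiki, import datetime); the answer strings are illustrative only:
import datetime
import wikipedia as wiki

answers = ["Alan Turing", "Pythn (programming language)"]  # illustrative; the typo should surface as a bad result
get_bad_answers(answers, delay=50000, cutoff=10)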
Example #3
def checkTenPagesForTMNT():
    """Get 10 random wiki titles, check if any of them isTMNT().

    We grab the max allowed Wikipedia page titles (10) using wikipedia.random().
    If any title is in TMNT meter, return the title. Otherwise, return False.

    Args:
        None
    Returns:
        String or False: The TMNT compliant title, or False if none found.
    """
    wikipedia.set_rate_limiting(True)
    try:
        titles = wikipedia.random(10)
    except wikipedia.exceptions.HTTPTimeoutError as e:
        print(f"Wikipedia timout exception: {e}")
        time.sleep(TIMEOUT_BACKOFF)
        main()
        return False  # 'titles' was never assigned in this call, so don't fall through to the loop
    except wikipedia.exceptions.WikipediaException as e:
        print(f"Wikipedia exception: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Exception while fetching wiki titles: {e}")
        sys.exit(1)

    for title in titles:
        if words.isTMNT(title):
            return title
    return False
Example #4
def get_wiki_titles() -> List:
    """
	Get a list of random titles from wikipedia. Change number in lib/constants.py

	Parameters:
	  None.

	Returns:
	  List of titles.
	"""
    wikipedia.set_rate_limiting(True)

    try:
        titles = wikipedia.random(FETCH)
    except wikipedia.exceptions.HTTPTimeoutError as e:
        print(f"Wikipedia timout exception: {e}")
        time.sleep(TIMEOUT_BACKOFF)
        sys.exit(1)
    except wikipedia.exceptions.WikipediaException as e:
        print(f"Wikipedia exception: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Exception while fetching wiki titles: {e}")
        sys.exit(1)

    return titles
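
get_wiki_titles depends on FETCH and TIMEOUT_BACKOFF from lib/constants.py, which the snippet does not show; a hypothetical version with illustrative values:
# lib/constants.py (illustrative values, not the project's actual settings)
FETCH = 10            # number of random titles to request per call
TIMEOUT_BACKOFF = 30  # seconds to sleep after an HTTPTimeoutError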
Example #5
def main():
    # ------ User interaction
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'template',
        help=
        'file containing newline-delimited keywords, filename pattern: *_template*'
    )
    parser.add_argument('domain',
                        help='wikipedia domain/project (en, de, fr, ...)')
    args = parser.parse_args()

    if not os.path.exists(args.template):
        raise ValueError(f'File "{args.template}" does not exist.')
    if not args.template.split('.')[0].endswith('_template'):
        raise ValueError('The keywords file must end with `_template`')

    # ------ Setup
    wiki.set_rate_limiting(rate_limit=True)
    wiki.set_lang(args.domain)

    # ------ Load keywords from file
    keywords = read_keywords(args.template)
    print(f'Loaded {len(keywords)} keywords')
    #print(keywords)
    #print(get_output_filename(args.template, args.domain))
    #exit()

    # ------ Fetch suggestions
    print('Fetching suggestions...')
    r = search(keywords, n_results=1)
    r = {key: value[0] for key, value in r.items()}

    # ------ Review
    print(f'Whole mapping:')
    pretty_print_dic(r)

    print(
        '\nBelow is a more readable report (exact matches are not reported):')
    case_mismatch, no_match = {}, {}
    for k, val in r.items():
        if k == val:
            continue
        if k.lower() == val.lower():
            case_mismatch[k] = val
        elif k != val:
            no_match[k] = val

    print(f'Cases mismatch:')
    pretty_print_dic(case_mismatch)
    print(f'No match:')
    pretty_print_dic(no_match)

    # Write
    final_keywords = '\n'.join(r.values())
    outfile = get_output_filename(args.template, args.domain)
    header = get_header()
    with open(outfile, 'w') as f:
        f.write(header + final_keywords)
    print(f'Written in file {outfile}')
def get_random(ntotal, outpath):
    """
    Collect ntotal sentences from random wiki articles and write them to outpath.
    """
    outfile = file(outpath, "a")
    ct = 0
    ct_timeouts = 0
    while ct < ntotal:
        print "Collected " + str(ct)
        if ct_timeouts > TIMEOUT_CT_LIMIT:
            print "Timeouts in excess of " + str(TIMEOUT_CT_LIMIT)
            outfile.close()
            sys.exit(1)
        try:
            (sentences, addct) = get_random_page(ntotal - ct)
            for sentence in sentences:
                utf8sentence = sentence.encode('UTF-8')
                outfile.write(utf8sentence + "\n")
            ct += addct
        except wiki.exceptions.HTTPTimeoutError as e:
            ct_timeouts += 1
            print "Timeout error, enabling rate limit"
            wiki.set_rate_limiting(True)
        except wiki.exceptions.WikipediaException:
            # ignore others I suppose...
            pass 
    outfile.close()
Example #7
    def get_wiki_info(self, wiki_url):
        import wikipedia
        import wikipediaapi
        import urllib.parse as urlparse
        wiki_text = ""
        url_segments = wiki_url.rpartition('/')
        if "en.wikipedia.org" == url_segments[2]:
            return wiki_text
        try:
            wikipedia.set_lang("en")
            wikipedia.set_rate_limiting(True,
                                        min_wait=datetime.timedelta(
                                            0, 0, 50000))

            title_path = url_segments[2]
            title = urlparse.unquote(title_path)
            title = title.replace("_", " ")
            wikiWiki = wikipediaapi.Wikipedia('en')
            wiki_page = wikiWiki.page(title)
            #contents += pagePy.summary
            #wiki_page = wikipedia.page(title)
            wiki_text = wiki_page.summary
        except (IndexError, wikipedia.exceptions.WikipediaException):
            pass
        finally:
            return wiki_text
 def searches(self, np, lang):
     page_dict = {}
     i = 1
     while i < 9:
         try:
             wikipedia.set_lang(lang)
             wikipedia.set_rate_limiting(True)
             docs = wikipedia.search(np)
             for doc in docs[: self.k_wiki_results]:
                 if doc and lang + " " + doc not in page_dict:
                     try:
                         p = wikipedia.page(doc)
                         page_dict[lang + " " + doc] = p.summary
                     except Exception as e:
                         continue
         except (
             ConnectionResetError,
             ConnectionError,
             ConnectionAbortedError,
             ConnectionRefusedError,
         ):
             print("Connection reset error received! Trial #" + str(i))
             time.sleep(600 * i)
             i += 1
         else:
             break
     return page_dict
Example #10
 def __init__(self, lang):
     """
         Initializes the wikiScraper class, given a language.
         """
     # set rate limit
     wikipedia.set_rate_limiting(True,
                                 min_wait=datetime.timedelta(0, 0, 500000))
     # set language
     wikipedia.set_lang(lang)
Example #11
def wikirandom(sents, boole, client, message, lang="it"):
    wikipedia.set_lang(lang)
    wikipedia.set_rate_limiting(rate_limit=True)
    random = wikipedia.random()
    result = wikipedia.summary(random, sentences=sents)
    if boole:
        return result
    else:
        result += "\n" + create_link(random, lang)
        return utils.get_config.sendMessage(client, message, result)
Example #12
    def generateShuffledPageLinks(self, page: str) -> List[str]:

        wikipedia.set_rate_limiting(True)
        try:
            wikiPage = wikipedia.WikipediaPage(title=page)
        except wikipedia.PageError:
            return "PageError"

        allLinks = [x for x in wikiPage.links if "(identifier)" not in x]
        random.shuffle(allLinks)

        return allLinks[0:25]
Example #13
def get_related_titles(pages,
                       rate_limit=True,
                       min_wait=datetime.timedelta(0, 0, 50000),
                       titles=set()):
    if rate_limit is True:
        wikipedia.set_rate_limiting(True, min_wait)

    for page in pages:
        wikipage = wikipedia.WikipediaPage(page)

        for title in wikipage.links:
            titles.add(title)

    return titles
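
An illustrative call, assuming wikipedia and datetime are imported at module level. Because titles=set() is a shared default, repeated calls without an explicit titles argument keep accumulating into the same set; passing a fresh set avoids that:
seed_pages = ["Teenage Mutant Ninja Turtles"]  # illustrative input
related = get_related_titles(seed_pages, titles=set())
print(len(related), "related titles collected")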
Example #14
def post_new_puzzle(postgres):
    # find three hints for the crossword
    wikipedia.set_lang('simple')
    wikipedia.set_rate_limiting(True)
    crossword_hints = []
    attempts = 0
    while len(crossword_hints) < 3:
        assert attempts < 30, "Too many attempts"
        crossword_hints = get_new_words(crossword_hints)
        attempts += 1

    # sort the words, longest first
    crossword_hints = sorted(crossword_hints,
                             key=lambda x: len(x['crossword']))
    crossword_hints.reverse()

    for word in crossword_hints:
        print_safe(word['topic'] + " / " + word['crossword'])
        print_safe(word['hint'] + "\n")

    matrix, solved = None, None
    while not matrix:
        # try to find a valid crossword
        matrix, solved, width, height = get_puzzle_matrix(crossword_hints)

        # if not, add words and random shuffle
        if not matrix:
            print_safe("Can't make crossword, retrying...")
            if random.random() < 0.33:
                assert attempts < 30, "Too many attempts"
                crossword_hints = get_new_words(crossword_hints)
                attempts += 1
            random.shuffle(crossword_hints)

    print_safe(matrix_to_string(solved))

    # make an image out of the matrix
    image_name = make_puzzle_image(matrix, 'puzzle.gif')

    # tweet the image and hints
    to_tweet = u"1: %s\n" % (crossword_hints[0]['hint'].decode('utf8'))
    to_tweet += u"2: %s\n" % (crossword_hints[1]['hint'].decode('utf8'))
    to_tweet += u"3: %s" % (crossword_hints[2]['hint'].decode('utf8'))
    twitter = connect_twitter()
    response = post_tweet(twitter, to_tweet, image_name)
    assert response['id'], "Failed posting puzzle to Twitter"

    # store the puzzle in the database
    db_insert(postgres, response['id'], crossword_hints, solved)
Example #15
def get_related_categories(pages,
                           rate_limit=True,
                           min_wait=datetime.timedelta(0, 0, 50000),
                           categories=set()):

    if rate_limit is True:
        wikipedia.set_rate_limiting(True, min_wait)

    for page in pages:
        wikipage = wikipedia.WikipediaPage(page)

        for category in wikipage.categories:
            categories.add(category)

    return categories
Example #16
def main():
    seen_articles = load_seen()

    wikipedia.set_rate_limiting(True)
    in_the_news = wikipedia.page(pageid=IN_THE_NEWS_ID)
    soup = BeautifulSoup(in_the_news.html())

    headlines = (Headline(elem) for elem in soup.ul.find_all('li'))

    for headline in headlines:
        if headline.unseen(seen_articles):
            print(SEPARATOR)
            print(headline.article_text(seen_articles))
            seen_articles.extend(headline.unseen(seen_articles))

    save_seen(seen_articles)
Example #17
    def __init__(self):
        """
        Initialize queryer
        :param kb_name: name of KB
        :param kb_path: path to KB file (any format readable by OntoEmma)
        :param out_path: output path of enriched KB
        :param restrict_to_ids: set restricting KB ids to query
        """
        # Initialize wikipedia queryer
        wikipedia.set_lang('en')
        wikipedia.set_rate_limiting(True,
                                    min_wait=datetime.timedelta(0, 0, 2000))
        self.wiki_summary = dict()
        self.wiki_dict = dict()

        self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
        self.STOP = set(stopwords.words('english'))
Example #18
def main(sc=None):
    wikipedia.set_rate_limiting(True)
    multiLimitRange = range(MULTITHREAD_LIMIT)
    LANGUAGES.sort()
    for language in LANGUAGES:
        try:
            wikipedia.set_lang(language)
            allSet = sets.Set()
            for i in xrange(RANDOM_TIMES):
                try:
                    allSet.update(wikipedia.random(pages=10))
                except wikipedia.exceptions.DisambiguationError as e:
                    allSet.update(e.options)
                except Exception as e:
                    print >> sys.stderr, e
            readySet = sets.Set()
            readySet.update(allSet)
            getPages_threads = {
                i: threading.Thread(target=getPages,
                                    args=(language, allSet, readySet))
                for i in multiLimitRange
            }
            for i in multiLimitRange:
                try:
                    getPages_threads[i].start()
                except Exception as e:
                    print >> sys.stderr, e
            for i in multiLimitRange:
                try:
                    if getPages_threads[i].isAlive():
                        getPages_threads[i].join()
                except Exception as e:
                    print >> sys.stderr, e
            print "== %s: %d Done ==" % (language, len(allSet))
        except wikipedia.exceptions.PageError as e:
            print >> sys.stderr, e
        except requests.exceptions.ConnectionError as e:
            print >> sys.stderr, e
        except wikipedia.exceptions.WikipediaException as e:
            print >> sys.stderr, e
        except Exception as e:
            print >> sys.stderr, e
        pass
Example #19
def wikiDatabase(num):
    '''
    Gets random articles from Wikipedia, creates dictionaries from them, then writes them to csv files
    @param num: how many articles to get
    @return:
    '''
    wikipedia.set_rate_limiting(True)
    wikipedia.set_lang("en")

    allWords = []

    fives = num // 500
    others = num % 500

    titles = []

    for i in range(fives):
        titles.extend(wikipedia.random(500))
    if others > 0:
        extra = wikipedia.random(others)
        # wikipedia.random() returns a single title string when asked for one page
        titles.extend(extra if isinstance(extra, list) else [extra])

    print("startingWIkidata, len = " + str(len(titles)))
    for page in titles:
        pageList = []
        try:
            words = wikipedia.page(page).content.split(' ')
        except (KeyError, ValueError, RuntimeError, ConnectionError,
                wikipedia.exceptions.WikipediaException,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError):
            continue  # skip this page; 'words' would be undefined below
        for word in words:
            for i in word.split('\n'):
                pageList.append(i)
        allWords.append(pageList)

    print("completed " + str(len(allWords)) +
          " articles\nnow generating dict and writing to csv")

    for k in range(1, 10):
        dictionaryChain = makeDictFromListofLists(allWords, k)
        writeToCSV(dictionaryChain, 'wikipediaDatabase' + str(k) + '.csv')
Example #20
def get_wiki_info(wiki_title):
    '''
    gets a wiki page and its links using the wikipedia library
    
    Args:
        wiki_title (str): word specifying the title of the wiki page
    
    Returns:
        (wikipedia object, list): the wiki page object itself and the links on the respective page
    '''
    wikipedia.set_rate_limiting(True)

    try:
        wiki_page = wikipedia.page(title=wiki_title)
    except requests.Timeout as e:
        time.sleep(1)
        wiki_page = wikipedia.page(title=wiki_title)

    links = [link.lower() for link in wiki_page.links]
    return (wiki_page, links)
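
An illustrative call; the snippet assumes wikipedia, requests, and time are already imported at module level:
import time
import requests
import wikipedia

wiki_page, links = get_wiki_info("Climate change")  # illustrative title
print(wiki_page.title, len(links))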
Example #21
 def findSentencesFromApi(self):
     global debug
     # at this point, any remaining words in the words dict don't have a match in the tatoeba file
     # so need to find a matching sentence elsewhere
     self.refineNewWordsDict()  # remove words that were found in Tatoeba file
     self.filterNonwords()  # remove most non-words
     if debug == True:
         print(
             str(len(self.newWords)) +
             ' words remain after checking Tatoeba files.')
     wikipedia.set_rate_limiting(True)
     wikipedia.set_lang(self.language[:2])
     for key in self.newWords:
         sentence = self.getOneSentenceForWord(key)
         if sentence != '':
             self.newWords[key] -= 1
             # get a translation
             translatedSentence = self.translateSentence(sentence)
             with open(self.clozefile, 'a+') as cf:
                 cf.write('"' + sentence + '"' + '\t' + '"' +
                          translatedSentence + '"' + '\t' + key + '\n')
                 self.sentencesWritten += 1
Example #22
def get_articles(language, no_words, max_no_articles, search, **kwargs):
    """ Retrieve articles from Wikipedia """
    wikipedia.set_rate_limiting(True)  # be polite
    wikipedia.set_lang(language)

    if search is not None:
        titles = wikipedia.search(search, results=max_no_articles)
    else:
        titles = wikipedia.random(pages=max_no_articles)

    articles = []
    current_no_words = 0
    for title in titles:
        print("INFO: loading {}".format(title))
        page = wikipedia.page(title=title)
        content = page.content
        article_no_words = len(content.split())
        current_no_words += article_no_words
        print("INFO: article contains {} words".format(article_no_words))
        articles.append((title, content))
        if current_no_words >= no_words:
            break

    return articles
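
An illustrative call, assuming import wikipedia at module level; it stops once roughly 5,000 words have been collected or 20 articles have been fetched:
articles = get_articles("en", no_words=5000, max_no_articles=20, search="machine learning")
for title, content in articles:
    print(title, len(content.split()))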
Example #23
                    try:
                        f.write((wiki.page(title).content + "\n").encode("UTF-8"))
                    except Exception as e:
                        print(e)
    else:
        print("Skipped raw wiki dataset download")


def make_dataset():
    if isdir("dataset"):
        rmtree("dataset")

    mkdir("dataset")

    for lang in language_names:
        raw_lang_file_name = path(WIKIDATASET_DIR_RAW, lang + "_raw")
        with open(raw_lang_file_name, "r") as f:
            raw_text = f.read()
            sanitized = sanitize_text(raw_text)
            with open(path("dataset", lang), "w") as wf:
                wf.write(sanitized)


if __name__ == "__main__":
    wiki.set_rate_limiting(True, timedelta(seconds=0.01))

    # download_raw_wiki_dataset()

    make_dataset()
    print("DONE")
Example #24
                        return False


if __name__ == '__main__':

    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    urllib3_logger = logging.getLogger('urllib3')
    urllib3_logger.setLevel(logging.CRITICAL)
    logging.info("running %s" % " ".join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 3:
        print "Usage: python replace.py infile  outfile mode(default = 0)"
        sys.exit(1)
    infile = sys.argv[1]
    outfile = sys.argv[2]
    if len(sys.argv) == 4:
        mode = int(sys.argv[3])
    else:
        mode = 0

    wikipedia.set_rate_limiting(1)
    from replace import replacer
    rep = replacer(infile, debug_mode)
    if not mode:
        logging.info("Usage: replacer")
        rep.replace(outfile)
    else:
        logging.info("Usage: confirm")
        rep.confirm(outfile)
   
                            link)
    if DEBUG_TIMING:
        print(f'input to dataframe took {time()-temp_time} seconds')
        temp_time = time()
    output_dataframe_to_file(df)
    if DEBUG_TIMING:
        print(f'output to dataframe took {time()-temp_time} seconds')
        print(
            f'total parse from list with header time is {time()-start} seconds'
        )


#start the main function/call parsing on all the timelines

#be nice and don't request too much
wikipedia.set_rate_limiting(True)
wikipedia.set_lang('en')

print("Starting to parse the websites")
#Put list of wikipedia timeline pages followed by the parse here:
url = "https://en.wikipedia.org/wiki/Timeline_of_ancient_history"

parse_history_from_list_url(url)

url = "https://en.wikipedia.org/wiki/Timeline_of_the_Middle_Ages"

parse_history_from_table_url(url)

url = r"https://en.wikipedia.org/wiki/16th_century#Events"

parse_history_from_list_url(url)
Example #26
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fetch snippets of language-specific text from random pages "
        "of one or many wikipedias",
    )
    parser.add_argument(
        "--langs",
        type=str,
        nargs="+",
        required=False,
        help="one or more languages for which to fetch wiki snippets "
        "as ISO 639-1 language codes",
    )
    parser.add_argument(
        "--langs_file",
        type=str,
        required=False,
        help="path to text file on disk containing languages for which to fetch "
        "wiki snippets as ISO 639-1 language codes, one per line",
    )
    parser.add_argument(
        "--skip_langs",
        type=str,
        nargs="+",
        required=False,
        default=[],
        help="one or more languages for which to *skip* fetching wiki snippets "
        "as ISO 639-1 language codes, as a convenience",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        required=False,
        help="path to directory on disk to which wiki snippets will be saved",
    )
    parser.add_argument(
        "--n_snippets",
        "-n",
        type=int,
        default=25,
        help="number of text snippets to save per language",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=3600,
        help="maximum number of seconds to spend fetching snippets, per language",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["skip", "append", "overwrite"],
        default="skip",
        help="",
    )
    parser.add_argument(
        "--featured",
        action="store_true",
        default=False,
        help="if set, will attempt to get 'featured articles' for each language "
        "before falling back to random pages",
    )
    parser.add_argument(
        "--good",
        action="store_true",
        default=False,
        help="if set, will attempt to get 'good articles' for each language "
        "before falling back to random pages",
    )
    args = parser.parse_args(argv)

    if not bool(args.langs) ^ bool(args.langs_file):
        raise ValueError("either `langs` or `langs_file` must be specified")
    if args.langs_file:
        with io.open(args.langs_file, mode="rt", encoding="utf-8") as f:
            langs = [line.strip() for line in f]
    else:
        langs = args.langs

    outdir = os.path.realpath(os.path.expanduser(args.outdir)) if args.outdir else None
    featured = (
        get_all_lang_links("Category:Featured_articles", "en") if args.featured else None
    )
    good = get_all_lang_links("Category:Good_articles", "en") if args.good else None

    wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(milliseconds=10))
    wiki_langs = wikipedia.languages()

    for i, lang in enumerate(langs):
        if lang not in wiki_langs:
            raise ValueError(
                "lang='{}' is invalid; available langs are\n{}".format(
                    lang, sorted(wiki_langs.keys())
                )
            )
        wikilang = wiki_langs[lang]
        if lang in args.skip_langs:
            logging.info(
                "skipping lang %s '%s' (%s/%s) ...", wikilang, lang, i + 1, len(langs),
            )
            continue
        n_snippets = args.n_snippets
        mode = "wt"
        if outdir:
            fname = os.path.join(outdir, lang + ".txt")
            if os.path.isfile(fname):
                if args.mode == "skip":
                    logging.info(
                        "snippets for lang %s '%s' (%s/%s) already fetched, skipping ...",
                        wikilang,
                        lang,
                        i + 1,
                        len(langs),
                    )
                    continue
                else:
                    with io.open(fname, mode="rt", encoding="utf-8") as f:
                        n_snippets_existing = sum(1 for _ in f)
                    if args.mode == "append":
                        mode = "at"
                        n_snippets = max(n_snippets - n_snippets_existing, 0)
                        if n_snippets == 0:
                            logging.info(
                                "snippets for lang %s '%s' (%s/%s) already fetched, skipping ...",
                                wikilang,
                                lang,
                                i + 1,
                                len(langs),
                            )
                            continue
                    logging.info(
                        "fetching snippets for lang %s '%s' (%s/%s) "
                        "to %s %s existing snippets ...",
                        wikilang,
                        lang,
                        i + 1,
                        len(langs),
                        args.mode,
                        n_snippets_existing,
                    )
            else:
                logging.info(
                    "fetching snippets for lang %s '%s' (%s/%s) ...",
                    wiki_langs[lang],
                    lang,
                    i + 1,
                    len(langs),
                )
        else:
            logging.info(
                "fetching snippets for lang %s '%s' (%s/%s) ...",
                wiki_langs[lang],
                lang,
                i + 1,
                len(langs),
            )
        snippets = get_snippets(lang, n_snippets, featured, good, args.timeout)
        if outdir:
            with io.open(fname, mode=mode, encoding="utf-8") as f:
                for snippet in snippets:
                    f.write(snippet + "\n")
            logging.info("saved %s %s snippets to %s", len(snippets), lang, fname)
        else:
            logging.info(
                "fetched %s %s snippets but did not save to disk:\n%s\n...",
                len(snippets),
                lang,
                snippets[:3],
            )
Example #27
 def __init__(self):
     wikipedia.set_rate_limiting(True)
from html import XHTML
import pycountry
import wikipedia
import gettext
import os
names= []
wikipedia.set_rate_limiting(rate_limit=1)
done_languages = ['es', 'en', 'it', 'de', 'zh', 'fr', 'nl', 'aa', 'ab', 'af', 'ak', 'am', 'an', 'ar', 'as',  'av', 'ae', 'ay', 'az', 'ba', 'bm', 'be', 'bn', 'bi', 'bo', 'bs', 'br', 'bg', 'ca', 'cs', 'ch', 'ce', 'cu', 'cv', 'kw', 'co', 'cr', 'cy', 'da', 'de', 'dv', 'dz', 'el', 'en', 'eo', 'et', 'eu', 'ee', 'fo', 'fa', 'fj', 'fi', 'fr', 'fy', 'ff', 'gd', 'ga', 'gl', 'gv', 'gn', 'gu', 'ht', 'ha', 'sh', 'he', 'hz', 'hi', 'ho', 'hr', 'hu', 'hy', 'ig', 'io', 'ii', 'iu', 'ie', 'ia', 'id', 'ik', 'is', 'it', 'jv', 'ja', 'kl', 'kn', 'ks', 'ka', 'kr', 'kk', 'km', 'ki', 'rw', 'ky', 'kv', 'kg', 'ko', 'kj', 'ku', 'lo', 'la', 'lv', 'li', 'ln', 'lt', 'lb', 'lu', 'lg', 'mh', 'ml', 'mr', 'mk', 'mg', 'mt', 'mn', 'mi', 'ms']
# lang_list = wikipedia.languages()
# for key in lang_list.iteritems():
#     name = key[0].encode('utf-8')
#     names.append(name)
# print names
lang_list = [lang.iso639_1_code.encode('utf-8')
         for lang in pycountry.languages
         if hasattr(lang, 'iso639_1_code')]

print lang_list

for lang in lang_list:
    if lang in done_languages:
        continue
    else:
        newpath = "C:\Users\s158079\Downloads\Study_materials\Quartile 4\Adaptive web\Project\\final_lang\countries-" + str(lang).lower() + "\\"
        print str(lang).lower()
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        wikipedia.set_lang(lang)
        try:
            lg = gettext.translation('iso3166', pycountry.LOCALES_DIR,languages=[lang])
 def set_rate_limiting(self, rate):
     wiki.set_rate_limiting(rate)
Example #30
import sys
import io
import re
import wikipedia
from tqdm import tqdm
import json

year_re = re.compile(r"\b[12][0-9]{3}\b")

wikipedia.set_lang("en")
wikipedia.set_rate_limiting(False)

in_fname = sys.argv[1]
out_fname = sys.argv[2]

in_f = io.open(in_fname)
out_f = open(out_fname, "wb")

#list of all PERSON NE
#nes = ["Dorfer", "Mr. J. C. Rastrick","Masaaki Shirakawa"]

#example of summary with var1 and var2 (for loop needed)
found_nes = {}


def page_years(page_title, depth=0):
    if depth > 5:
        return []
    try:
        page = wikipedia.page(page_title).content
        return year_re.findall(page)
Example #31
from sys import exit, argv
from os import mkdir, listdir
from datetime import datetime
from urllib.request import unquote
try:
    import wikipedia
except ImportError:
    print('Please install the wikipedia API')

# ==============
# initialization
# ==============

wikipedia.set_lang('fr')
wikipedia.set_rate_limiting(True)  # Avoid spamming the server

ADQ = set(open('all_titles_article_de_qualite.txt').read().lower().split('\n'))
BA = set(open('all_titles_bon_article.txt').read().lower().split('\n'))
downloaded = set([e.lower()
                  for e in listdir('articles/normal')]).union(ADQ).union(BA)

# If a parameter is given to the script,
# bound is set to that parameter, else 100000
bound = int(argv[1]) if len(argv) > 1 else 100000

# ====
# core
# ====

while len(downloaded) <= bound:
Example #32
InHGame = False

cities = "bath birmingham bradford brighton bristol cambridge canterbury carlisle chester chichester coventry derby durham ely exeter gloucester hereford kingston hull lancaster leeds leicester lichfield lincoln liverpool london manchester newcastle norwich nottingham oxford peterborough plymouth portsmouth preston ripon salford salisbury sheffield southampton stalbans stoke-on-trent sunderland truro wakefield westminster winchester wolverhampton worcester york"
location_city = cities.split(" ")
#location_city.lower()
conditions = {
    "Showers": "shower.png",
    "Partly Cloudy": "parlycloudy.png",
    "Sunny": "sunny.png",
    "Cloudy": "cloudy.png",
    "Breezy": "breezy.png",
    "snow": "snow.png"
}

wiki_wiki = wikipediaapi.Wikipedia('en')
wikipedia.set_rate_limiting(True, min_wait=timedelta(0, 0, 50000))

#Token for the bot to be able to login to discord
TOKEN = "DISCORD DEV TOKEN HERE"
client = discord.Client()

#Opens and wipes the text file which stores the context of the current response
f = open("context.txt", "r+")
f.write("")
f.truncate()

#stores the current date and time
CDT = datetime.now()

#A list and a dictionary, storing a list of greetings to use, and some responses to a set of preset inputs.
greetings = ["hello", "hi", "howdy", "whassup", "yo", "hey"]
Example #33

if __name__ == '__main__':

    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    urllib3_logger = logging.getLogger('urllib3')
    urllib3_logger.setLevel(logging.CRITICAL)
    logging.info("running %s" % " ".join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 3:
        print "Usage: python replace.py infile  outfile mode(default = 0)"
        sys.exit(1)
    infile = sys.argv[1]
    outfile = sys.argv[2]
    if len(sys.argv) == 4:
        mode = int(sys.argv[3])
    else:
        mode = 0

    wikipedia.set_rate_limiting(1)
    from replace import replacer
    rep = replacer(infile, debug_mode)
    if not mode:
        logging.info("Usage: replacer")
        rep.replace(outfile)
    else:
        logging.info("Usage: confirm")
        rep.confirm(outfile)
Example #34
def get_wiki_data(answers, pkl_file, delay=50000):
    """
    Download pages from Wikipedia and store them in a binary pickle file.
    The pickle file is a list of dicts, where each dict has the following keys:
        'title'
        'summary'
        'content'
        'links'
        'categories'
        'sections'

    This should be able to recover from crashes.  Just run it again with the
    same arguments, and it'll pick up where it left off.

    Args:
        answers: list of answer strings that should correspond to Wiki pages
        pkl_file: file used to store pickled output

        delay=50000: number of microseconds to wait between pages

    Returns:
        None
    """
    # function to try/except a page property (e.g. summary, links)
    def try_page_property(dict_page, page, attr):
        try:
            tmp = getattr(page, attr)
            dict_page[attr] = tmp
        except KeyError: # sometimes links doesn't work
            dict_page[attr] = None


    # wikipedia module does its own rate limiting, so let's use that
    wiki.set_rate_limiting(True, min_wait=datetime.timedelta(0,0,delay))

    # try to read which pages we've already done (i.e. if we're restarting)
    pages = []
    if os.path.isfile(pkl_file):
        with open(pkl_file, 'rb') as f:
            try:
                pages = pickle.load(f)
            except Exception: # if nothing gets loaded/bad file descriptor
                pass 
    
    # find pages we need to do
    pages_todo = set(answers)
    for page in pages:
        update_keys = (key in page.keys() for key in PAGE_KEYS)
        if all(update_keys): # if we have entries for all data keys
            pages_todo.discard(page['title'])
        
    pages_todo = list(pages_todo)

    # download wiki pages
    for i, title in enumerate(pages_todo):
        print (u"page {} of {} (title={})".format(i+1, len(pages_todo), title))
        
        try:
            #XXX even though we checked for bad pages, this sometimes raises
            # an exception.  For example, we could do 
            # title=u"Pascal (programming language)", but still get a PageError
            # saying Page id "pascal programming language" does not match any
            # pages. We can manually look up the Mediawiki pageid (using their API)
            # and look up the page using that 
            if title in ANSWER_TO_PAGEID.keys():
                page = wiki.page(pageid=ANSWER_TO_PAGEID[title])
            else:
                page = wiki.page(title=title, auto_suggest=False)
            
            # get the page data and store it in a dict
            dict_page = dict()
            for key in PAGE_KEYS:
                try_page_property(dict_page, page, key) 
            
            print (page.title)
            pages.append(dict_page)
            
            # dumping each time is safe, but slow
            # TODO: how safe is this?
            pickle.dump(pages, open(pkl_file, 'wb'), pickle.HIGHEST_PROTOCOL)

        except wiki.exceptions.WikipediaException as e:
            print (u"wiki issues with page (title={})".format(title))
            print (e)

        except Exception as e:
            print ("something else bad has happened!")
            print (e)
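
get_wiki_data refers to two module-level names not shown in the snippet; a sketch consistent with its docstring and comments (the override mapping is left empty here because real pageids would have to be looked up):
# keys the docstring says each page dict should carry
PAGE_KEYS = ['title', 'summary', 'content', 'links', 'categories', 'sections']

# manual overrides for answers that wiki.page(title=...) cannot resolve,
# mapping answer string -> MediaWiki pageid (fill in via the MediaWiki API)
ANSWER_TO_PAGEID = {}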
Example #35
from sklearn.naive_bayes import MultinomialNB, BernoulliNB  # Naive Bayes model classes
from sklearn.feature_extraction.text import CountVectorizer

import wikipedia as wiki  # wikipedia api wrapper

wiki.set_rate_limiting(True)  # might actually speed things up.
from sklearn.cross_validation import train_test_split  # split the data you have into training and test sets


def fetch_wiki(title, lang):
    """
    Return the regular English or simple versions of an article.
    Simple versions are far shorter than the regular ones, so only
    pull the summary of regular articles.
    In case of an error, just return None instead of crashing the program.
    """
    assert lang in ("en", "simple"), "Language must be 'en' or 'simple'"

    try:
        wiki.set_lang(lang)
        page = wiki.page(title)
        # print page.title  # used for testing the function
        return (page.summary, 1) if lang == "en" else (page.content, 0)  # 1: english, 0: simple
    except:  # NOTE: you should never have a blind `except` like this. but, hey, we're hacking.
        print " - error with " + lang + " page for: " + title
        return None


articles = [
    "General Relativity",
    "Bayes Theorem",
Example #36
 def __init__(self):
     wikipedia.set_rate_limiting(True)
     filename = "book/pages_used.txt"
     self.bib = open(filename, "w")
Example #37
import wikipedia
import codecs
import os
import datetime
topic = "climate change"
wikipedia.set_lang("en")
wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(seconds=1))
def writepage(term,direc):
	try:
		termpage=wikipedia.page(term)
		content=termpage.content
		f=codecs.open(direc + "//" + term+".txt","w","utf-8")
		f.write(content)
		f.close()
		return termpage.links
	except Exception as e:
		print(e)
		return []

topics = ['climate change']
for topic in topics:
	results=wikipedia.search(topic)
	print(results)
	links = []
	#os.makedirs(topic)
	for term in results:
		newlink = writepage(term,topic)
		links = links + newlink

	for link in set(links):
		nl=writepage(link,topic)
from __future__ import print_function
import wikipedia
import datetime, os, sys
from time import time

wikipedia.set_rate_limiting(rate_limit=True,
                            min_wait=datetime.timedelta(0, 0, 1e6))
wikipedia.set_lang("en")


# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


cat_orig = [
    "Artificial intelligence", "Companies", "Computer architecture",
    "Computer model", "Computer engineering", "Data", "Free software",
    "Human-computer interaction", "Internet", "Mobile web", "Networks",
    "Operating systems", "Programming", "Software", "Automation",
    "Unsolved problems in computer science", "Computer security",
    "Computing and society", "Embedded systems", "Multimedia", "Languages",
    "Information technology", "Information systems", "Platforms",
    "Product lifecycle management", "Real-time computing",
    "Software engineering", "Computer science", "Classes of computers",
    "Machine Learning", "Statistics", "Mathematics", "Supercomputer",
    "Quantum physics", "Quantum computer", "compiler",
    "Central processing unit", "Graphics processing unit", "Personal Computer",
Example #39
import wikipedia
from PyDictionary import PyDictionary
import codecs
import re
from datetime import datetime, date, time

#---------------------------------------------

myflash = {}
pageerror = []
wikipedia.set_rate_limiting(False)

#---------------------------------------------
start = datetime.now()

with codecs.open('insource.txt', 'r', encoding="utf-8") as f:
    for line in f:
        try:
            term = str(wikipedia.page(line, auto_suggest=False))
            term = term[16:-2]
            definition = str(wikipedia.WikipediaPage(line).content).split('\n')
            definition = definition[0]
            if definition == '':
                pageerror.append(line)
            else:
                myflash[term] = definition
        except (wikipedia.exceptions.PageError, KeyError):
            try:
                line = re.sub('^[^a-zA-Z]*|[^a-zA-Z]*$', '', line)
                term = str(wikipedia.page(line, auto_suggest=False))
                term = term[16:-2]
Example #40
# Use try_again_dec with the main methods of wikipedia
for name in ['geosearch', 'languages', 'page', 'search', 'suggest', 'summary']:
    setattr(
        wikipedia, name,
        try_again_dec(
            wikipedia.exceptions.HTTPTimeoutError,
            wikipedia.exceptions.RedirectError,
            requests.exceptions.RequestException,
            retry=3
        )(getattr(wikipedia, name))
    )
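
try_again_dec itself is not part of this snippet; a minimal sketch of a retry decorator matching the call signature used above (an assumption, not the project's actual implementation):
import functools

def try_again_dec(*exceptions, retry=3):
    """Retry the wrapped callable up to `retry` times on the given exceptions."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exc = None
            for _ in range(retry):
                try:
                    return func(*args, **kwargs)
                except exceptions as exc:
                    last_exc = exc
            raise last_exc
        return wrapper
    return decorator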

# wikipedia configuration
wikipedia.set_lang('en')
wikipedia.set_rate_limiting(True)
wikipedia.set_user_agent(
    'Newsparser NE comparison (http://newsparser704.pythonanywhere.com/)'
)


class WikiData:
    
    pages = 'pages'
    ne_mapping = 'ne_mapping'

    nlp = nlp
    

    def __init__(self):
        self.mongo_client = MongoClient()