import os
import gzip
import zlib
from time import sleep

import wikipedia
from unidecode import unidecode
from requests.exceptions import ConnectionError

# Project-local helpers; these import paths are assumptions based on the
# qdb.py reference below and may differ in the actual repository layout.
from qdb import QuestionDatabase
from cached_wikipedia import CachedWikipedia


def text_iterator(use_wiki, wiki_location, use_qb, qb_location,
                  use_source, source_location, limit=-1,
                  min_pages=0, country_list='data/country_list.txt'):
    qdb = QuestionDatabase(qb_location)
    doc_num = 0

    cw = CachedWikipedia(wiki_location, country_list)
    pages = qdb.questions_with_pages()
    errors = {}

    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_source:
            filename = '%s/%s' % (source_location, pp)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        # Decode the gzipped bytes so unidecode below
                        # receives text rather than raw bytes
                        source_text = f.read().decode('utf-8', 'ignore')
                except zlib.error:
                    print("Error reading %s" % filename)
                    # Fall back to empty text so source_text is defined
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[pp].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += unidecode(source_text)

        yield pp, total_text
        doc_num += 1
        # >= stops after exactly `limit` documents (the original > check
        # yielded one extra)
        if limit > 0 and doc_num >= limit:
            break

    print("ERRORS")
    print("----------------------------------------")
    for ii in errors:
        print("%s\t%s" % (ii, errors[ii]))
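
# Usage sketch for the iterator above (hypothetical paths, shown as an
# illustration rather than the project's actual invocation):
#
#     for page, text in text_iterator(
#             use_wiki=True, wiki_location='data/wikipedia',
#             use_qb=True, qb_location='data/questions.db',
#             use_source=False, source_location='',
#             limit=100):
#         print('%s: %d chars' % (page, len(text)))
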
# Variant of text_iterator that also folds Wikipedia links and categories
# into the document text and guards against lookup/network failures. Note
# that it reuses the name text_iterator, so it shadows the definition
# above if both live in the same module.
def text_iterator(use_wiki, wiki_location, use_qb, qb_location,
                  limit, min_pages=0):
    qdb = QuestionDatabase(qb_location)
    doc_num = 0

    cw = CachedWikipedia(wiki_location)
    pages = qdb.questions_with_pages()
    errors = {}

    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_wiki:
            try:
                wiki_links = cw[pp].links
            except Exception:
                wiki_links = []
            try:
                wiki_categories = cw[pp].categories
            except Exception:
                wiki_categories = []

            # Default to empty text so the handlers below can fall
            # through without leaving wikipedia_text undefined
            wikipedia_text = u""
            try:
                wikipedia_text = cw[pp].content + ' ' + \
                    ' '.join(wiki_links + wiki_categories)
            except wikipedia.exceptions.PageError:
                errors[pp] = "Not found"
                continue
            except wikipedia.exceptions.DisambiguationError:
                errors[pp] = "Disambiguation"
            except KeyError:
                errors[pp] = "KeyError"
            except ValueError:
                errors[pp] = "No JSON object could be decoded"
            except ConnectionError:
                # Back off before continuing so a flaky connection or
                # rate limit has time to recover
                print("Connection error ... ")
                errors[pp] = "Connection error"
                sleep(600)
                print("done waiting")
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        # Newline separator matches the variant above and keeps the last
        # wiki token from fusing with the first question token
        total_text += "\n"
        total_text += question_text

        yield pp, total_text
        doc_num += 1
        if limit > 0 and doc_num >= limit:
            break

    print("ERRORS")
    print("----------------------------------------")
    for ii in errors:
        print("%s\t%s" % (ii, errors[ii]))
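
# A minimal driver sketch for the variant above. The data paths and the
# limit are assumptions for illustration, not values from the original
# project.
if __name__ == '__main__':
    for page, text in text_iterator(use_wiki=True,
                                    wiki_location='data/wikipedia',
                                    use_qb=True,
                                    qb_location='data/questions.db',
                                    limit=10):
        print('%s: %d characters' % (page, len(text)))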