Example #1
import gzip
import os
import zlib

from unidecode import unidecode

# QuestionDatabase and CachedWikipedia are assumed to be imported from
# elsewhere in the surrounding project.


def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1,
                  min_pages=0, country_list='data/country_list.txt'):
    qdb = QuestionDatabase(qb_location)
    doc_num = 0

    cw = CachedWikipedia(wiki_location, country_list)
    pages = qdb.questions_with_pages()

    errors = {}
    for pp in sorted(pages, key=lambda k: len(pages[k]),
                     reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_source:
            filename = '%s/%s' % (source_location, pp)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        # Decode assuming UTF-8 so unidecode below receives
                        # text rather than raw bytes.
                        source_text = f.read().decode('utf-8', 'ignore')
                except zlib.error:
                    # Record the failure and fall back to empty text so that
                    # source_text is always bound when it is used below.
                    errors[pp] = "Error reading %s" % filename
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[pp].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += unidecode(source_text)

        yield pp, total_text
        doc_num += 1

        # Stop after yielding `limit` documents (a non-positive limit means
        # unlimited).
        if limit > 0 and doc_num >= limit:
            break

    print("ERRORS")
    print("----------------------------------------")
    for ii in errors:
        print("%s\t%s" % (ii, errors[ii]))
Example #2
import gzip
import os
import zlib

from unidecode import unidecode

# QuestionDatabase and CachedWikipedia are assumed to be imported from
# elsewhere in the surrounding project.


def text_iterator(use_wiki,
                  wiki_location,
                  use_qb,
                  qb_location,
                  use_source,
                  source_location,
                  limit=-1,
                  min_pages=0,
                  country_list='data/country_list.txt'):
    qdb = QuestionDatabase(qb_location)
    doc_num = 0

    cw = CachedWikipedia(wiki_location, country_list)
    pages = qdb.questions_with_pages()

    errors = {}
    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_source:
            filename = '%s/%s' % (source_location, pp)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        # Decode assuming UTF-8 so unidecode below receives
                        # text rather than raw bytes.
                        source_text = f.read().decode('utf-8', 'ignore')
                except zlib.error:
                    # Record the failure and fall back to empty text so that
                    # source_text is always bound when it is used below.
                    errors[pp] = "Error reading %s" % filename
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[pp].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += unidecode(source_text)

        yield pp, total_text
        doc_num += 1

        # Stop after yielding `limit` documents (a non-positive limit means
        # unlimited).
        if limit > 0 and doc_num >= limit:
            break

    print("ERRORS")
    print("----------------------------------------")
    for ii in errors:
        print("%s\t%s" % (ii, errors[ii]))
Example #3
import wikipedia

from time import sleep

# QuestionDatabase and CachedWikipedia are assumed to be imported from
# elsewhere in the surrounding project.  ConnectionError is the Python 3
# builtin (under Python 2 it would come from requests.exceptions).


def text_iterator(use_wiki, wiki_location, use_qb, qb_location, limit,
                  min_pages=0):
    qdb = QuestionDatabase(qb_location)
    doc_num = 0

    cw = CachedWikipedia(wiki_location)
    pages = qdb.questions_with_pages()

    errors = {}
    for pp in sorted(pages, key=lambda k: len(pages[k]),
                     reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[pp]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[pp] if x.fold == "train"]
            question_text = u"\n".join(u" ".join(x.raw_words())
                                       for x in train_questions)
        else:
            question_text = u''

        if use_wiki:
            # Catch concrete failures rather than using a bare except, which
            # would also swallow KeyboardInterrupt.
            try:
                wiki_links = cw[pp].links
            except Exception:
                wiki_links = []

            try:
                wiki_categories = cw[pp].categories
            except Exception:
                wiki_categories = []

            # Default to empty text so wikipedia_text is always bound even
            # when a handler below records an error and falls through
            # instead of continuing.
            wikipedia_text = u""
            try:
                wikipedia_text = cw[pp].content + ' ' + \
                    ' '.join(wiki_links + wiki_categories)
            except wikipedia.exceptions.PageError:
                errors[pp] = "Not found"
                continue
            except wikipedia.exceptions.DisambiguationError:
                errors[pp] = "Disambiguation"
            except KeyError:
                errors[pp] = "KeyError"
            except ValueError:
                errors[pp] = "No JSON object could be decoded"
            except ConnectionError:
                # Back off for ten minutes in case the Wikipedia API is
                # temporarily unreachable, then continue with the next page.
                print("Connection error ... ")
                errors[pp] = "Connection error"
                sleep(600)
                print("done waiting")
        else:
            wikipedia_text = u""

        # Separate the two sources with a newline (as in the variants above)
        # so the last Wikipedia token and the first question token do not
        # run together.
        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text

        yield pp, total_text
        doc_num += 1

        # Stop after yielding `limit` documents (a non-positive limit means
        # unlimited).
        if limit > 0 and doc_num >= limit:
            break

    print("ERRORS")
    print("----------------------------------------")
    for ii in errors:
        print("%s\t%s" % (ii, errors[ii]))