def prepare_word_sets(self, corpus_dir, train_b, valid_b, test_b):
     if self.print_level > 0:
         print '-> Preparing word sets'
     word_sets_file = '%s/word_sets.pkl' % corpus_dir
     print(word_sets_file)
     # If the pickle file does not exist, it is created from the training set and stored.
     # word_sets contains all 1-grams and 2-grams that remain after removing stop words.
     self.word_sets = load_from_pkl(word_sets_file)
     if self.word_sets is None:
         # Prepare list of words (and pairs) that appear in training set
         # Note that if tuples = [1], then parser.parse('one two three') -> ['one', 'two', 'three']
         # if tuples = [2], then parser.parse('one two three') -> ['one two', 'two three']
         # if tuples = [1,2], then parser.parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three']
         parser = SimpleWordParser(tuples=[1,2])
         words = set()
         for exam in [train_b, valid_b, test_b]:
             if exam is not None:
                 words.update(np.concatenate([self._words_to_names(parser.parse(qst)) for qst in exam['question']]))
                 words.update(np.concatenate([self._words_to_names(parser.parse(ans)) for ans in exam['answer']]))
         words.difference_update(['']) # ignore empty word
         words = sorted(words)
         if self.print_level > 1:
             print '%d word sets: ...%s...' % (len(words), words[::5000])
         self.word_sets = words
         save_to_pkl(word_sets_file, self.word_sets)
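
# For reference, a minimal sketch of the tuples behaviour described in the comments above,
# assuming SimpleWordParser is not available here. Only the class name and the parse()
# call are taken from prepare_word_sets; everything else is illustrative, not the project's
# actual implementation.
class _SketchWordParser(object):
    def __init__(self, tuples=(1,)):
        self.tuples = tuples  # n-gram sizes to emit, e.g. [1, 2]

    def parse(self, text):
        words = text.lower().split()
        grams = []
        for n in self.tuples:
            # every run of n consecutive words becomes one token
            grams.extend(' '.join(words[i:i + n]) for i in range(len(words) - n + 1))
        return grams

# _SketchWordParser(tuples=[1, 2]).parse('one two three')
# -> ['one', 'two', 'three', 'one two', 'two three']
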
 def _end_action(self):
     if self._outf is not None:
         self._outf.close()
         self._outf = None
     # Write pages_in_corpus
     if self.action == 'write':
         save_to_pkl('%s.pages.pkl' % self.outfile, self.pages_in_corpus)
     gc.collect()
    def read_pages_in_categories(self, target_categories, max_cat_depth, important_categories, reread=False):
        print "=> Reading pages in target categories for %s" % self.wiki_name
        self.target_categories = target_categories
        self.max_cat_depth = max_cat_depth
        use_categories_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.USE_CATEGORIES_FILE)
        pages_in_categories_file = "%s/%s_%s" % (
            self.wiki_dir,
            self.wiki_name,
            WikiCorpusBuilder.PAGES_IN_CATEGORIES_FILE,
        )
        if reread or (not os.path.exists(use_categories_file)) or (not os.path.exists(pages_in_categories_file)):
            if self.target_categories is None:
                # No target categories given: use all categories collected by read_categories() above
                self.use_categories = self.wikir.all_categories
            else:
                # Otherwise keep only the categories from which one of the target categories (the ones we
                # consider highly relevant) is reachable within max_cat_depth levels; search_categories
                # returns the depth if found and a negative value otherwise
                self.use_categories = set(
                    [
                        cat
                        for cat in self.wikir.all_categories
                        if self.wikir.search_categories(cat, self.target_categories, max_depth=self.max_cat_depth) >= 0
                    ]
                )
            save_to_pkl(use_categories_file, self.use_categories)

            self.pages_in_categories = self.wikir.read_pages_in_categories(
                wikifile="%s/%s" % (self.wiki_dir, self.wiki_file),
                use_categories=self.use_categories,
                max_read_lines=99900000000,
            )
            save_to_pkl(pages_in_categories_file, self.pages_in_categories)
        else:
            self.use_categories = load_from_pkl(use_categories_file)
            self.pages_in_categories = load_from_pkl(pages_in_categories_file)

        print "Using %d categories related to %s target categories with depth <= %d" % (
            len(self.use_categories),
            "x" if self.target_categories is None else len(self.target_categories),
            self.max_cat_depth,
        )
        print "Missing important categories: %s" % str(
            [cat for cat in important_categories if cat not in self.use_categories]
        )
        print "There are %d pages in the %d categories" % (len(self.pages_in_categories), len(self.use_categories))
 def read_categories(self, reread=False):
     # This method creates 'all_categories.pkl' and 'parent_categories.pkl'.
     # For this dump there are 29586 categories and 27923 parent categories.
     print "=> Reading categories for %s" % self.wiki_name
     categories_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.ALL_CATEGORIES_FILE)
     parents_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.PARENT_CATEGORIES_FILE)
     gc.collect()
     if reread or (not os.path.exists(categories_file)) or (not os.path.exists(parents_file)):
         # On the first run this block is executed and the two category files are created.
         # WikiReader scans the wiki file and collects all category names by matching the category regex.
         self.wikir.read_sub_categories(
             wikifile="%s/%s" % (self.wiki_dir, self.wiki_file), max_read_lines=99900000000
         )
         save_to_pkl(categories_file, self.wikir.all_categories)
         save_to_pkl(parents_file, self.wikir.parent_categories)
     else:
         self.wikir.all_categories = load_from_pkl(categories_file)
         self.wikir.parent_categories = load_from_pkl(parents_file)
     print "There are a total of %d categories" % len(self.wikir.all_categories)
 def _save_to_cache(self, fname, data):
     filename = self._cache_filename(fname)
     print 'Saving to cache %s' % filename
     return save_to_pkl(filename, data)
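
# A plausible implementation of the save_to_pkl / load_from_pkl helpers used throughout these
# snippets. The actual helpers are defined elsewhere in the project and may differ; note that
# load_from_pkl is assumed to return None when the file is missing, as prepare_word_sets expects.
import os
import pickle

def save_to_pkl(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
    return data

def load_from_pkl(filename):
    if not os.path.exists(filename):
        return None
    with open(filename, 'rb') as f:
        return pickle.load(f)
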
    def create_corpus(
        self,
        train_b,
        valid_b,
        min_pos_words_in_page_name,
        min_pos_words_in_section,
        only_first_section_per_page=False,
        max_sections_per_page=99999999,
        use_all_pages_match_pos_word=True,
        use_all_pages_match_answer=True,
        pages_to_use=None,
        always_use_first_section=False,
        max_read_lines=99900000000,
        reread=False,
    ):
        print "=> Creating corpus"
        self.min_pos_words_in_page_name = min_pos_words_in_page_name
        self.min_pos_words_in_section = min_pos_words_in_section
        self.only_first_section_per_page = only_first_section_per_page
        self.max_sections_per_page = max_sections_per_page
        self.use_all_pages_match_pos_word = use_all_pages_match_pos_word
        self.use_all_pages_match_answer = use_all_pages_match_answer
        self.always_use_first_section = always_use_first_section
        exams_words_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.EXAMS_WORDS_FILE)
        pos_words_file = "%s/%s_%.4f_%s%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            "wsw_" if self.use_wiki_stop_words else "",
            WikiCorpusBuilder.POSITIVE_WORDS_FILE,
        )
        answers_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.ANSWERS_FILE)
        corpus_file = "%s/%s_%.4f_%s%.4f_%d_%d_%s_%s_%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            "wsw_" if self.use_wiki_stop_words else "",
            self.wiki_uncommon_words_max_frac,
            self.min_pos_words_in_page_name,
            self.min_pos_words_in_section,
            self.use_all_pages_match_pos_word,
            self.use_all_pages_match_answer,
            self.always_use_first_section,
        )
        if pages_to_use is not None:
            corpus_file = "%s_pn%d" % (corpus_file, len(pages_to_use))
        corpus_file = "%s_%s" % (corpus_file, WikiCorpusBuilder.CORPUS_FILE)
        print "Corpus file: %s" % corpus_file
        gc.collect()

        # Build the word and answer sets of the train+validation exams
        if reread or (not os.path.exists(pos_words_file)) or (not os.path.exists(answers_file)):
            # Get all the words that appear in the exams
            if valid_b is None:
                all_exams = train_b[["ID", "question", "answer"]]
            else:
                all_exams = pd.concat([train_b[["ID", "question", "answer"]], valid_b[["ID", "question", "answer"]]])
            parser = SimpleWordParser()
            exams_locdic = build_training_location_dictionary(
                all_exams,
                parser=parser,
                use_answers=True,
                min_word_docs_frac=0,
                max_word_docs_frac=1.0,
                min_word_count_frac=0,
                max_word_count_frac=1.0,
                ascii_conversion=True,
            )
            self.exams_words = exams_locdic.word_ids.keys()
            # Set the "positive_words" as all the words from the train(+validation) files that are uncommon in Wiki
            self.pos_words = set(self.exams_words).intersection(self.wiki_uncommon_words)
            # Get all the answers (each answer = a set of words)
            self.all_answers = set()
            for answer in all_exams["answer"]:
                self.all_answers.add(tuple(sorted(parser.parse(answer))))
            save_to_pkl(exams_words_file, self.exams_words)
            save_to_pkl(pos_words_file, self.pos_words)
            save_to_pkl(answers_file, self.all_answers)
        else:
            self.exams_words = load_from_pkl(exams_words_file)
            self.pos_words = load_from_pkl(pos_words_file)
            self.all_answers = load_from_pkl(answers_file)

        print "There are %d positive words (%d wiki uncommon words, %d words from exams)" % (
            len(self.pos_words),
            len(self.wiki_uncommon_words),
            len(self.exams_words),
        )
        print "There are a total of %d unique answers" % len(self.all_answers)
        print "Using %d stop words" % (len(self.stop_words))
        if pages_to_use is None:
            use_pages = self.pages_in_categories
        else:
            use_pages = pages_to_use
        print "Considering %d pages" % len(use_pages)

        if reread or (not os.path.exists(corpus_file)):
            print "Writing %s corpus to %s" % (self.wiki_name, corpus_file)
            ld = self.wikir.read(
                wikifile="%s/%s" % (self.wiki_dir, self.wiki_file),
                outfile=corpus_file,
                only_first_section_per_page=self.only_first_section_per_page,
                max_sections_per_page=self.max_sections_per_page,
                use_pages=use_pages,
                max_read_lines=max_read_lines,
                stop_words=self.stop_words,
                pos_words=self.pos_words,
                page_name_word_sets=self.all_answers,
                corpus_words=None,  ##set(exams_locdic.word_ids.keys()),
                min_pos_words_in_page_name=self.min_pos_words_in_page_name,
                min_pos_words_in_section=self.min_pos_words_in_section,
                use_all_pages_match_pos_word=self.use_all_pages_match_pos_word,
                use_all_pages_match_sets=self.use_all_pages_match_answer,
                always_use_first_section=self.always_use_first_section,
                action="write",
            )
            print "Done writing corpus"

        gc.collect()
        return corpus_file
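
# A sketch of the section-filtering rule that the min_pos_words_in_page_name /
# min_pos_words_in_section parameters suggest. The real filtering happens inside
# WikiReader.read() and may apply further rules (answer matching, first-section handling);
# this function is purely illustrative.
def _sketch_keep_section(page_name_words, section_words, pos_words,
                         min_pos_words_in_page_name, min_pos_words_in_section):
    # count how many "positive" (exam-related, wiki-uncommon) words appear in the page name
    pos_in_name = len(set(page_name_words) & pos_words)
    # and how many appear in the section text
    pos_in_section = len(set(section_words) & pos_words)
    return (pos_in_name >= min_pos_words_in_page_name and
            pos_in_section >= min_pos_words_in_section)
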
    def find_common_words(
        self,
        wiki_common_words_min_frac=0.2,
        wiki_uncommon_words_max_frac=0.01,
        use_wiki_stop_words=True,
        max_read_lines=100000000,
        reread=False,
    ):
        print "=> Finding common/uncommon words"
        self.wiki_common_words_min_frac = wiki_common_words_min_frac
        self.wiki_uncommon_words_max_frac = wiki_uncommon_words_max_frac
        self.use_wiki_stop_words = use_wiki_stop_words
        # These three files do not exist initially and only need to be created once
        common_words_file = "%s/%s_%.4f_%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            WikiCorpusBuilder.COMMON_WORDS_FILE,
        )
        uncommon_words_file = "%s/%s_%.4f_%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_uncommon_words_max_frac,
            WikiCorpusBuilder.UNCOMMON_WORDS_FILE,
        )
        stop_words_file = "%s/%s_%.4f_%s%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            "wsw_" if self.use_wiki_stop_words else "",
            WikiCorpusBuilder.STOP_WORDS_FILE,
        )
        # Read first X lines from Wiki corpus, and get the set of Wiki stop-words (words that appear in many documents),
        # as well as the "uncommon" words (words that appear in a small fraction of the documents)
        if (
            reread
            or (not os.path.exists(common_words_file))
            or (not os.path.exists(uncommon_words_file))
            or (not os.path.exists(stop_words_file))
        ):
            # This call builds a location dictionary (a Cardal_LocationDict object):
            # with action='locdic', read() goes through the wiki file and, for each page and for each
            # section within it, reads the section text and calls add_words on the Cardal_LocationDict.
            # The inputs are page_name, section_name, section_number and section_text; add_words receives
            # page_name + page_id as its 1st argument, section_name + section_id as its 2nd, and the section text as its 3rd.
            # It also accumulates the counts of all parsed words.
            wiki_locdic = self.wikir.read(
                wikifile="%s/%s" % (self.wiki_dir, self.wiki_file),
                outfile="%s/%s_locdic1.txt" % (self.wiki_dir, self.wiki_name),  # ignored...
                # only_first_section_per_page=True, max_read_lines=max_read_lines,
                only_first_section_per_page=False,
                max_sections_per_page=1,
                max_read_lines=max_read_lines,
                stop_words=SpecialWords.ignore_words,
                pos_words=set(),
                min_pos_words_in_page_name=0,
                min_pos_words_in_section=0,
                action="locdic",
            )
            # There are two document-fraction thresholds, one for common words and one for uncommon words;
            # depending on the thresholds, the two resulting sets can be of very different sizes.
            self.wiki_common_words = set(
                [
                    word
                    for dc, word in wiki_locdic.sort_words_by_num_docs()
                    if dc > (self.wiki_common_words_min_frac * wiki_locdic.get_num_docs())
                ]
            )
            self.wiki_uncommon_words = set(
                [
                    word
                    for dc, word in wiki_locdic.sort_words_by_num_docs()
                    if dc < (self.wiki_uncommon_words_max_frac * wiki_locdic.get_num_docs())
                ]
            )
            # we add common words to stopwords
            self.stop_words = set(SpecialWords.ignore_words).union(self.wiki_common_words)
            if self.use_wiki_stop_words:
                self.stop_words.update(WikiReader.WIKI_STOP_WORDS)
            wiki_locdic = None
            gc.collect()
            save_to_pkl(common_words_file, self.wiki_common_words)
            save_to_pkl(uncommon_words_file, self.wiki_uncommon_words)
            save_to_pkl(stop_words_file, self.stop_words)
        else:
            self.wiki_common_words = load_from_pkl(common_words_file)
            self.wiki_uncommon_words = load_from_pkl(uncommon_words_file)
            self.stop_words = load_from_pkl(stop_words_file)

        print "There are %d common words (>%.4f docs)" % (len(self.wiki_common_words), self.wiki_common_words_min_frac)
        print "There are %d uncommon words (<%.4f docs)" % (
            len(self.wiki_uncommon_words),
            self.wiki_uncommon_words_max_frac,
        )
        print "Using %d stop words (%s wiki stop words)" % (
            len(self.stop_words),
            "with" if self.use_wiki_stop_words else "without",
        )
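
# A toy illustration of the common/uncommon split performed above: words are ranked by the
# number of documents they occur in, and the two fraction thresholds are applied. doc_counts
# stands in for what wiki_locdic.sort_words_by_num_docs() provides; names here are illustrative.
def _sketch_split_by_doc_frac(doc_counts, num_docs, common_min_frac, uncommon_max_frac):
    # doc_counts: dict word -> number of documents containing the word
    common = set(w for w, dc in doc_counts.items() if dc > common_min_frac * num_docs)
    uncommon = set(w for w, dc in doc_counts.items() if dc < uncommon_max_frac * num_docs)
    return common, uncommon

# Example: with 1000 documents, common_min_frac=0.2 and uncommon_max_frac=0.01, a word is
# "common" if it appears in more than 200 documents and "uncommon" if it appears in fewer than 10.
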