Example #1
    def prepare_lucene_indexes(self, corpus_dir):
        self.lucene_dir1, self.lucene_parser1, self.lucene_corpus1 = None, None, None
        self.lucene_dir2, self.lucene_parser2, self.lucene_corpus2 = None, None, None
        self.lucene_dir3, self.lucene_parser3, self.lucene_corpus3 = None, None, None

        # Lucene Index 1: ck12html
        self.lucene_dir1 = '%s/lucene_idx1' % corpus_dir
        self.lucene_parser1 = SimpleWordParser(word_func=EnglishStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus1 = LuceneCorpus(index_dir=self.lucene_dir1, filenames=[self.ck12html_corpus], parser=self.lucene_parser1)
        if not os.path.exists(self.lucene_dir1):
            self.lucene_corpus1.prp_index()

        # Lucene Index 2: ck12text
        self.lucene_dir2 = '%s/lucene_idx2' % corpus_dir
        self.lucene_parser2 = SimpleWordParser(word_func=LancasterStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus2 = LuceneCorpus(index_dir=self.lucene_dir2, filenames=[self.ck12text_corpus], parser=self.lucene_parser2)
        if not os.path.exists(self.lucene_dir2):
            self.lucene_corpus2.prp_index()

        # Lucene Index 3: simplewiki
        self.lucene_dir3 = '%s/lucene_idx3' % corpus_dir
        self.lucene_parser3 = SimpleWordParser(word_func=PorterStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus3 = LuceneCorpus(index_dir=self.lucene_dir3, filenames=[self.simplewiki_corpus],
                                           parser=self.lucene_parser3, similarity=BM25Similarity())
        if not os.path.exists(self.lucene_dir3):
            self.lucene_corpus3.prp_index()
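
The three indexes above differ mainly in how words are normalized: each one gets its own SimpleWordParser built around a different stemmer (EnglishStemmer, LancasterStemmer, PorterStemmer), so the same corpus text yields a slightly different vocabulary per index, and each index is only built (prp_index) when its directory does not yet exist. The short snippet below, assuming these are NLTK's stemmers, simply prints how the three stemmers treat the same words; it is an illustration, not part of the pipeline.

from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import EnglishStemmer

# One instance of each stemmer used by the three Lucene indexes
stemmers = [('english', EnglishStemmer()),
            ('lancaster', LancasterStemmer()),
            ('porter', PorterStemmer())]
for word in ['respiration', 'organisms', 'electricity']:
    # Print how each stemmer normalizes the same word; the outputs generally
    # differ, which is why each index is paired with its own parser.
    print word, [(name, st.stem(word)) for name, st in stemmers]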
Example #2
class FeatureExtractor(object):
    '''
    This is the main class that runs the various search functions and prepares the features.
    Each feature is a score (or value) for the relevant (question, answer) pair.
    '''

    def __init__(self, base_dir, recalc=False, norm_scores_default=False, print_level=1):
        """
        :param base_dir: base directory under which the corpora and feature caches are stored
        :param recalc: if True, recompute features even when a cached copy exists
        :param norm_scores_default: default setting for whether feature scores are normalized
        :param print_level: verbosity level (0 = quiet)
        """
        self.base_dir = base_dir
        self.recalc = recalc
        self.norm_scores_default = norm_scores_default
        self.print_level = print_level

    def _words_to_names(self, words):
        # Capitalize the first letter of each word, e.g. ['dna', 'cell'] -> ['Dna', 'Cell']
        names = []
        for word in words:
            if len(word) == 0:
                continue  # skip empty words
            names.append(word[0].upper() + word[1:])
        return names

    def prepare_word_sets(self, corpus_dir, train_b, valid_b, test_b):
        if self.print_level > 0:
            print '-> Preparing word sets'
        word_sets_file = '%s/word_sets.pkl' % corpus_dir
        print word_sets_file
        # If the file does not exist, it is created from the training set and stored.
        # word_sets contains all unigrams and bigrams after stopword removal.
        self.word_sets = load_from_pkl(word_sets_file)
        if self.word_sets is None:
            # Prepare list of words (and pairs) that appear in training set
            # note that if tuples = [1], then parser.parse('one two three') -> ['one', 'two', 'three']
            # if tuples = [2], then parser.parse('one two three') -> ['one two', 'two three']
            # if tuples = [1,2], then parser.parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three']
            parser = SimpleWordParser(tuples=[1,2])
            words = set()
            for exam in [train_b, valid_b, test_b]:
                if exam is not None:
                    words.update(np.concatenate([self._words_to_names(parser.parse(qst)) for qst in exam['question']]))
                    words.update(np.concatenate([self._words_to_names(parser.parse(ans)) for ans in exam['answer']]))
            words.difference_update(['']) # ignore empty word
            words = sorted(words)
            if self.print_level > 1:
                print '%d word sets: ...%s...' % (len(words), words[::5000])
            self.word_sets = words
            save_to_pkl(word_sets_file, self.word_sets)

    def prepare_ck12html_corpus(self, corpus_dir):
        self.ck12html_corpus = '%s/CK12/OEBPS/ck12.txt' % corpus_dir
        if not os.path.exists(self.ck12html_corpus):
            # Doc per HTML section (h1-4)
            htmlr = HtmlReader(min_chars_per_line=1, min_words_per_section=20)
            locdic = htmlr.read(htmldir='%s/CK12/OEBPS' % corpus_dir,
                                outfile=self.ck12html_corpus,
                                ignore_sections=set(['explore more.*', 'review', 'practice', 'references']),
                                stop_words=None, pos_words=set([]), corpus_words=None,
                                min_pos_words_in_page_name=0, min_pos_words_in_section=0, action='write')


    def prepare_ck12text_corpus(self, corpus_dir):
        self.ck12text_corpus = '%s/CK12/ck12_text.txt' % corpus_dir
        if not os.path.exists(self.ck12text_corpus):
            textr = TextReader(min_chars_per_line=1, min_words_per_section=25)
            locdic = textr.read(dir='%s/CK12' % corpus_dir,
                                outfile=self.ck12text_corpus,
                                # see "Peoples-Physics-Book-Basic_b_v10_zgo_s1.text" for instance
                                # each chapter begins with 'CHAPTER'
                                first_line_regexp='^(CHAPTER)',
                                action='write')

    def prepare_simplewiki_corpus(self, corpus_dir, train_b, valid_b):
        # Some explanation of the parameters; note that by modifying these numbers you get a different wiki corpus.
        # For simplicity, I only show one possible combination:
        # common_words_min_frac = 1.0, meaning no words are treated as common words
        # uncommon_words_max_frac = 0.05, meaning there are 403716 uncommon words in this setting
        # min_pos_words_in_page_name = 0, meaning a page name needs at least 0 pos words (pos words are words that
        #   appear in train_b and valid_b) -- we only want wiki pages relevant to the exams
        # min_pos_words_in_section = 5, meaning an eligible section must contain at least 5 pos words
        # use_all_pages_match_pos_word = True
        # use_all_pages_match_answer = True
        # always_use_first_section = True
        self.simplewiki_corpus = '%s/simplewiki/simplewiki_1.0000_0.0500_0_5_True_True_True_corpus.txt' % corpus_dir
        if not os.path.exists(self.simplewiki_corpus):
            wkb = WikiCorpusBuilder(wiki_name='simplewiki', wiki_dir='%s/simplewiki'%corpus_dir,
                                    wiki_file='simplewiki-20151102-pages-articles.xml', debug_flag=False)
            # Creates 2 files, all_categories.pkl and parent_categories.pkl; if they already exist they are just loaded.
            # They are stored in wkb.all_categories and wkb.parent_categories.
            # We scan the wiki file, find all categories that have <title>Category:xxx</title>, and find their parent categories.
            # Details can be found in the read_categories method in WikiReader.py.
            wkb.read_categories(reread=False)
            # Creates 2 files, 'use_categories.pkl' and 'pages_in_categories.pkl'.
            # For the simplewiki corpus, target_categories = None, and important_categories is the list of
            # science-related categories passed below; if an important category is not found among the categories
            # collected above, an alert is given.
            # This calls read_pages_in_categories in Cardal_WikiReader.py.
            wkb.read_pages_in_categories(target_categories=None, max_cat_depth=9999,
                                         important_categories=['Earth', 'Cellular respiration', 'DNA', 'Units of length', 'History of science',
                                                               'Evolutionary biology', 'Nonmetals', 'Health', 'Charles Darwin'], reread=False)
            # This reads all the text from the wiki file, parses out the useful plain text, and builds a dict of words.
            # Depending on the common-word and uncommon-word fractions, the common and uncommon word sets are picked,
            # and the common words are also added to the stop words.
            # Finally common_words.pkl, uncommon_words.pkl and stop_words.pkl are saved to the corpus dir.
            wkb.find_common_words(wiki_common_words_min_frac=1.0, wiki_uncommon_words_max_frac=0.05, use_wiki_stop_words=False, reread=False)
            # Note that wkb.create_corpus returns a value: the path of the created corpus file.
            # It creates the corpus file as well as exams_words.pkl (all words that appear in train_b and valid_b),
            # positive_words.pkl (all exam words that are also uncommon in the wiki),
            # and all_answers.pkl (a set whose elements are tuples of the words within each answer).
            self.simplewiki_corpus = wkb.create_corpus(train_b, valid_b, min_pos_words_in_page_name=0, min_pos_words_in_section=5,
                                                        only_first_section_per_page=False, use_all_pages_match_pos_word=True, use_all_pages_match_answer=True,
                                                        always_use_first_section=True, max_read_lines=9990000000, reread=False)
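
    # The hard-coded corpus filename above appears to encode the build parameters listed in the
    # comments (this mapping is an assumption inferred from the numbers, not taken from the
    # original code):
    #   'simplewiki_%.4f_%.4f_%d_%d_%s_%s_%s_corpus.txt' % (
    #       1.0,    # common_words_min_frac
    #       0.05,   # uncommon_words_max_frac
    #       0,      # min_pos_words_in_page_name
    #       5,      # min_pos_words_in_section
    #       True,   # use_all_pages_match_pos_word
    #       True,   # use_all_pages_match_answer
    #       True)   # always_use_first_section
    #   -> 'simplewiki_1.0000_0.0500_0_5_True_True_True_corpus.txt'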


    def prepare_lucene_indexes(self, corpus_dir):
        self.lucene_dir1, self.lucene_parser1, self.lucene_corpus1 = None, None, None
        self.lucene_dir2, self.lucene_parser2, self.lucene_corpus2 = None, None, None
        self.lucene_dir3, self.lucene_parser3, self.lucene_corpus3 = None, None, None

        # Lucene Index 1: ck12html
        self.lucene_dir1 = '%s/lucene_idx1' % corpus_dir
        self.lucene_parser1 = SimpleWordParser(word_func=EnglishStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus1 = LuceneCorpus(index_dir=self.lucene_dir1, filenames=[self.ck12html_corpus], parser=self.lucene_parser1)
        if not os.path.exists(self.lucene_dir1):
            self.lucene_corpus1.prp_index()

        # Lucene Index 2: ck12text
        self.lucene_dir2 = '%s/lucene_idx2' % corpus_dir
        self.lucene_parser2 = SimpleWordParser(word_func=LancasterStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus2 = LuceneCorpus(index_dir=self.lucene_dir2, filenames=[self.ck12text_corpus], parser=self.lucene_parser2)
        if not os.path.exists(self.lucene_dir2):
            self.lucene_corpus2.prp_index()

        # Lucene Index 3: simplewiki
        self.lucene_dir3 = '%s/lucene_idx3' % corpus_dir
        self.lucene_parser3 = SimpleWordParser(word_func=PorterStemmer().stem, split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus3 = LuceneCorpus(index_dir=self.lucene_dir3, filenames=[self.simplewiki_corpus],
                                           parser=self.lucene_parser3, similarity=BM25Similarity())
        if not os.path.exists(self.lucene_dir3):
            self.lucene_corpus3.prp_index()


    def prepare_features(self, dataf_q, dataf_b, train_df, cache_dir):
        """
        :param dataf_q: not used in my implementation
        :param dataf_b:
        :param train_df: note that the word counters are built from the training set only, which is why this must be
            passed even when we are creating features for the validation set
        :param cache_dir:
        :return:

        Basic features:
        1. AnswersInQuestionFunc(): compute the set of all words parsed from the question; for each answer, the
        feature is (# of answer words that also appear in the question) / (# of answer words). A small variation is
        to turn on word stemming in the parser.

        2. AnswersInAnswersFunc(): for each answer, the average fraction of its words that appear in the other
        answers to the same question. This feature will not be used.

        3. AnswerCountFunc(count_type='count', parser): uses the parser to parse all the answers (and, if
           use_question=True, the questions as well); after all answers are parsed, a counter dict is built over all
           unique words, and for each answer the mean count of the words in that answer is the feature.

        4. AnswerCountFunc(count_type='correct', parser): same as above, but only words from correct answers are used
           to build the counter dict.

        5. AnswersLengthFunc(log_flag=False): the length (# of characters) of each answer relative to the mean length
        of the answers to the same question.

        Lucene feature:
        6. AnswersLuceneSearchFunc(lucene_corpus, max_docs, weight_func, score_func): for each answer, search
        question+answer in lucene_corpus and retrieve up to max_docs documents; an optional score_func transforms each
        document's score, weight_func then weights these scores, and their weighted sum is the feature for this
        answer. (An illustrative sketch of features 1 and 6 appears after the class.)
        """
        self.cache_dir = '%s/%s' % (self.base_dir, cache_dir)
        create_dirs([self.cache_dir])
        stemmer1 = PorterStemmer()
        stem1 = stemmer1.stem
        check_same_question = not set(dataf_b['ID']).isdisjoint(train_df['ID'])
        stemmed_parser  = SimpleWordParser(word_func=stem1, ignore_special_words=True , min_word_length=1)

        func_name = 'ans_in_qst_stem'
        self.add_answer_func(dataf_b, func=AnswersInQuestionFunc(parser=stemmed_parser), name=func_name)

        func_name = 'ans_in_ans_stem'
        self.add_answer_func(dataf_b, func=AnswersInAnswersFunc(parser=stemmed_parser), name=func_name)

        func_name = 'ans_words_stem_count'
        self.add_answer_func(dataf_b, func=AnswerCountFunc(train_df, check_same_question=check_same_question, count_type='count', parser=stemmed_parser, single_words=True), name=func_name)

        func_name = 'ans_length_ratio'
        self.add_answer_func(dataf_b, func=AnswersLengthFunc(log_flag=False), name=func_name)

        func_name = 'ans_num_words'
        self.add_answer_func(dataf_b, func=AnswersNumWordsFunc(), name=func_name)

        func_name = 'luc_stem_1'
        self.add_answer_func(dataf_b,
                             func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus1, parser=self.lucene_parser1,
                             max_docs=250, weight_func=lambda n: 1.0/(5.0+np.arange(n))**2, score_func=lambda s: (s/10.0)**3,
                             norm_scores= True),
                             name= func_name)

        func_name = 'luc_stem_2'
        self.add_answer_func(dataf_b,
                             func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus2, parser=self.lucene_parser2,
                             max_docs=250, weight_func=lambda n: 1.0/(5.0+np.arange(n))**2, score_func=lambda s: (s/10.0)**3,
                             norm_scores= True),
                             name= func_name)

        func_name = 'luc_stem_3'
        self.add_answer_func(dataf_b,
                             func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus3, parser=self.lucene_parser3,
                             max_docs=250, weight_func=lambda n: 1.0/(5.0+np.arange(n))**2, score_func=lambda s: (s/10.0)**3,
                             norm_scores= True),
                             name= func_name)

    def _cache_filename(self, fname):
        return '%s/%s.pkl' % (self.cache_dir, fname)

    def _read_from_cache(self, fname):
        filename = self._cache_filename(fname)
        #print 'Loading from cache %s' % filename
        return load_from_pkl(filename)

    def _save_to_cache(self, fname, data):
        filename = self._cache_filename(fname)
        print 'Saving to cache %s' % filename
        return save_to_pkl(filename, data)

    def _is_in_cache(self, name):
        if self.cache_dir is None:
            return False
        exists = True
        if np.isscalar(name):
            exists = os.path.exists(self._cache_filename(name))
        else:
            for n in name:
                exists = exists and os.path.exists(self._cache_filename(n))
        return exists

    def add_answer_func(self, train_b, func, name, question_ids=None):
        '''
        Run a score function on each set of question and answers
        '''
        if (not self.recalc) and (self.cache_dir is not None) and (self._is_in_cache(name)):
            if np.isscalar(name):
                train_b[name] = self._read_from_cache(name)
            else:
                for n in name:
                    train_b[n] = self._read_from_cache(n)
            return

        groups = train_b.groupby('ID').groups
        for i,(idx,inds) in enumerate(groups.iteritems()):
            assert len(set(train_b.iloc[inds]['question'])) == 1
            if (question_ids is not None) and (idx not in question_ids): continue
            question = train_b.iloc[inds[0]]['question']
            answers = np.array(train_b.iloc[inds]['answer'])
            if 'correct' in train_b.columns:
                print '\n-----> #%d : correct = %s' % (i, ', '.join(['%d'%c for c in np.array(train_b.iloc[inds]['correct'])]))
                sys.stdout.flush()
            vals = func(question, answers)
            if question_ids is not None:
                print 'vals = %s' % str(vals)
            for val,ind in zip(vals, inds):
                if np.isscalar(val):
                    train_b.set_value(ind, name, val)
                else:
                    assert len(val)==len(name)
                    for v,n in zip(val,name):
                        train_b.set_value(ind, n, v)

        if (self.cache_dir is not None) and (question_ids is None):
            if np.isscalar(name):
                self._save_to_cache(name, np.array(train_b[name]))
            else:
                for n in name:
                    self._save_to_cache(n, np.array(train_b[n]))
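
For reference, here is a minimal standalone sketch of two of the features described in the prepare_features docstring: the answer-in-question overlap fraction (feature 1) and the Lucene search feature (feature 6). The helper names are hypothetical, the word lists stand in for the parser output, and the way the retrieved document scores are combined (a rank-weighted sum) is an assumption about what AnswersLuceneSearchFunc does; the weight_func and score_func lambdas are copied from the calls above.

import numpy as np

def answer_in_question_fraction(question_words, answer_words):
    # Feature 1: fraction of the answer's words that also appear in the question.
    # With a stemming parser, both word lists would already be stemmed.
    if len(answer_words) == 0:
        return 0.0
    overlap = set(answer_words).intersection(question_words)
    return float(len(overlap)) / len(answer_words)

def lucene_answer_score(doc_scores,
                        weight_func=lambda n: 1.0 / (5.0 + np.arange(n)) ** 2,
                        score_func=lambda s: (s / 10.0) ** 3):
    # Feature 6 (one plausible reading): doc_scores are the raw Lucene scores of the
    # documents retrieved for "question + answer", in rank order. Each score is
    # transformed by score_func, down-weighted by its rank via weight_func, and summed.
    doc_scores = np.asarray(doc_scores, dtype=float)
    weights = weight_func(len(doc_scores))
    return float(np.sum(weights * score_func(doc_scores)))

print answer_in_question_fraction(['what', 'causes', 'rain'], ['rain', 'clouds'])   # -> 0.5
print lucene_answer_score([12.0, 9.5, 7.1])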