# NOTE: assumed imports -- os/sys/numpy/nltk/PyLucene are standard; the remaining helpers
# (SimpleWordParser, LuceneCorpus, HtmlReader, TextReader, WikiCorpusBuilder, the Answers*Func
# feature functions, load_from_pkl, save_to_pkl, create_dirs) come from this project's own
# modules, whose exact import paths depend on the repo layout.
import os
import sys
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import EnglishStemmer
from org.apache.lucene.search.similarities import BM25Similarity


class FeatureExtractor(object):
    '''
    This is the main class that runs the various search functions and prepares the features.
    Each feature is a score (or value) for the relevant (question, answer) pair.
    '''
    def __init__(self, base_dir, recalc=False, norm_scores_default=False, print_level=1):
        """
        :param base_dir: base directory; cache directories are created under it
        :param recalc: if True, recompute features even when a cached copy exists
        :param norm_scores_default: stored as the default score-normalization setting for the feature functions
        :param print_level: verbosity level
        :return:
        """
        self.base_dir = base_dir
        self.recalc = recalc
        self.norm_scores_default = norm_scores_default
        self.print_level = print_level

    def _words_to_names(self, words):
        names = []
        for word in words:
            if len(word) == 0:
                return ''
            names.append(word[0].upper() + word[1:])
        return names

    def prepare_word_sets(self, corpus_dir, train_b, valid_b, test_b):
        if self.print_level > 0:
            print '-> Preparing word sets'
        word_sets_file = '%s/word_sets.pkl' % corpus_dir
        print (word_sets_file)
        # If the file does not exist, it will be created from the training set and stored.
        # word_sets contains all 1-grams and 2-grams after removing stopwords.
        self.word_sets = load_from_pkl(word_sets_file)
        if self.word_sets is None:
            # Prepare the list of words (and pairs) that appear in the training set.
            # Note that if tuples = [1], then parser.parse('one two three') -> ['one', 'two', 'three'];
            # if tuples = [2], then parser.parse('one two three') -> ['one two', 'two three'];
            # if tuples = [1,2], then parser.parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three'].
            parser = SimpleWordParser(tuples=[1,2])
            words = set()
            for exam in [train_b, valid_b, test_b]:
                if exam is not None:
                    words.update(np.concatenate([self._words_to_names(parser.parse(qst)) for qst in exam['question']]))
                    words.update(np.concatenate([self._words_to_names(parser.parse(ans)) for ans in exam['answer']]))
            words.difference_update(['']) # ignore empty word
            words = sorted(words)
            if self.print_level > 1:
                print '%d word sets: ...%s...' % (len(words), words[::5000])
            self.word_sets = words
            save_to_pkl(word_sets_file, self.word_sets)
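    # Illustration (a hedged sketch, not part of the pipeline): with tuples=[1, 2] the parser is
    # expected to emit unigrams and bigrams, and _words_to_names then capitalizes the first letter
    # of each entry, e.g.
    #   >>> parser = SimpleWordParser(tuples=[1, 2])
    #   >>> self._words_to_names(parser.parse('water cycle diagram'))
    #   ['Water', 'Cycle', 'Diagram', 'Water cycle', 'Cycle diagram']
    # (exact output depends on SimpleWordParser's tokenization and stopword handling)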
    def prepare_ck12html_corpus(self, corpus_dir):
        self.ck12html_corpus = '%s/CK12/OEBPS/ck12.txt' % corpus_dir
        if not os.path.exists(self.ck12html_corpus):
            # One doc per HTML section (h1-h4)
            htmlr = HtmlReader(min_chars_per_line=1, min_words_per_section=20)
            locdic = htmlr.read(htmldir='%s/CK12/OEBPS' % corpus_dir, outfile=self.ck12html_corpus,
                                ignore_sections=set(['explore more.*', 'review', 'practice', 'references']),
                                stop_words=None, pos_words=set([]), corpus_words=None,
                                min_pos_words_in_page_name=0, min_pos_words_in_section=0,
                                action='write')

    def prepare_ck12text_corpus(self, corpus_dir):
        self.ck12text_corpus = '%s/CK12/ck12_text.txt' % corpus_dir
        if not os.path.exists(self.ck12text_corpus):
            textr = TextReader(min_chars_per_line=1, min_words_per_section=25)
            locdic = textr.read(dir='%s/CK12' % corpus_dir, outfile=self.ck12text_corpus,
                                # See "Peoples-Physics-Book-Basic_b_v10_zgo_s1.text" for an example:
                                # each chapter begins with 'CHAPTER'
                                first_line_regexp='^(CHAPTER)',
                                action='write')

    def prepare_simplewiki_corpus(self, corpus_dir, train_b, valid_b):
        # Some explanation of the parameters; note that by modifying these numbers you get a different
        # wiki corpus. For simplicity, only one possible combination is shown here:
        # common_words_min_frac=1.0 means no words are treated as common words;
        # uncommon_words_max_frac=0.05 means there are 403716 uncommon words in this setting;
        # min_pos_words_in_page_name=0 means an eligible page name must contain at least 0 pos words
        #   (words that appear in train_b and valid_b, used to keep only relevant wiki pages);
        # min_pos_words_in_section=5 means an eligible section must contain at least 5 pos words;
        # use_all_pages_match_pos_word=True;
        # use_all_pages_match_answer=True;
        # always_use_first_section=True.
        self.simplewiki_corpus = '%s/simplewiki/simplewiki_1.0000_0.0500_0_5_True_True_True_corpus.txt' % corpus_dir
        if not os.path.exists(self.simplewiki_corpus):
            wkb = WikiCorpusBuilder(wiki_name='simplewiki', wiki_dir='%s/simplewiki' % corpus_dir,
                                    wiki_file='simplewiki-20151102-pages-articles.xml', debug_flag=False)
            # Creates 2 files, all_categories.pkl and parent_categories.pkl; if they already exist, they are just loaded.
            # They are stored in wkb.all_categories and wkb.parent_categories.
            # We scan the wiki file, find all categories that have <title>Category:xxx</title> and their
            # parent categories; details can be found in the read_categories method in WikiReader.py.
            wkb.read_categories(reread=False)
            # Creates 2 files, 'use_categories.pkl' and 'pages_in_categories.pkl'.
            # For the simplewiki corpus, target_categories=None; important_categories are science-related
            # categories -- if one of them is not found among the categories read above, an alert is given.
            # This calls read_pages_in_categories in Cardal_WikiReader.py.
            wkb.read_pages_in_categories(target_categories=None, max_cat_depth=9999,
                                         important_categories=['Earth', 'Cellular respiration', 'DNA',
                                                               'Units of length', 'History of science',
                                                               'Evolutionary biology', 'Nonmetals', 'Health',
                                                               'Charles Darwin'],
                                         reread=False)
            # This reads all the text from the wiki file, parses the useful plain text, and builds a dict of words.
            # Depending on the common/uncommon word fractions, common and uncommon words are selected;
            # common words are also added to the stop words.
            # Finally, common_words.pkl, uncommon_words.pkl and stop_words.pkl are saved to the corpus dir.
            wkb.find_common_words(wiki_common_words_min_frac=1.0, wiki_uncommon_words_max_frac=0.05,
                                  use_wiki_stop_words=False, reread=False)
            # Note that wkb.create_corpus returns a value: the location (filename) of the corpus.
            # It creates the corpus file as well as exams_words.pkl (all words that appear in train_b and valid_b),
            # positive_words.pkl (all exam words that are also uncommon in the wiki),
            # and all_answers.pkl (a set whose elements are tuples of the words within each answer).
            self.simplewiki_corpus = wkb.create_corpus(train_b, valid_b, min_pos_words_in_page_name=0,
                                                       min_pos_words_in_section=5,
                                                       only_first_section_per_page=False,
                                                       use_all_pages_match_pos_word=True,
                                                       use_all_pages_match_answer=True,
                                                       always_use_first_section=True,
                                                       max_read_lines=9990000000, reread=False)
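    # The simplewiki corpus filename above appears to encode the builder parameters in order
    # (an assumption based on the values used here, not a documented convention):
    #   'simplewiki_%.4f_%.4f_%d_%d_%s_%s_%s_corpus.txt' % (common_words_min_frac, uncommon_words_max_frac,
    #       min_pos_words_in_page_name, min_pos_words_in_section, use_all_pages_match_pos_word,
    #       use_all_pages_match_answer, always_use_first_section)
    #   -> 'simplewiki_1.0000_0.0500_0_5_True_True_True_corpus.txt'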
    def prepare_lucene_indexes(self, corpus_dir):
        self.lucene_dir1, self.lucene_parser1, self.lucene_corpus1 = None, None, None
        self.lucene_dir2, self.lucene_parser2, self.lucene_corpus2 = None, None, None
        self.lucene_dir3, self.lucene_parser3, self.lucene_corpus3 = None, None, None
        # Lucene Index 1: ck12html
        self.lucene_dir1 = '%s/lucene_idx1' % corpus_dir
        self.lucene_parser1 = SimpleWordParser(word_func=EnglishStemmer().stem,
                                               split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus1 = LuceneCorpus(index_dir=self.lucene_dir1, filenames=[self.ck12html_corpus],
                                           parser=self.lucene_parser1)
        if not os.path.exists(self.lucene_dir1):
            self.lucene_corpus1.prp_index()
        # Lucene Index 2: ck12text
        self.lucene_dir2 = '%s/lucene_idx2' % corpus_dir
        self.lucene_parser2 = SimpleWordParser(word_func=LancasterStemmer().stem,
                                               split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus2 = LuceneCorpus(index_dir=self.lucene_dir2, filenames=[self.ck12text_corpus],
                                           parser=self.lucene_parser2)
        if not os.path.exists(self.lucene_dir2):
            self.lucene_corpus2.prp_index()
        # Lucene Index 3: simplewiki
        self.lucene_dir3 = '%s/lucene_idx3' % corpus_dir
        self.lucene_parser3 = SimpleWordParser(word_func=PorterStemmer().stem,
                                               split_words_regexp='[\-\+\*\/\,\;\:\(\)]', min_word_length=1)
        self.lucene_corpus3 = LuceneCorpus(index_dir=self.lucene_dir3, filenames=[self.simplewiki_corpus],
                                           parser=self.lucene_parser3, similarity=BM25Similarity())
        if not os.path.exists(self.lucene_dir3):
            self.lucene_corpus3.prp_index()
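    # Design note (hedged, inferred rather than stated in the source): the three indexes use different
    # stemmers, so the resulting Lucene features pick up slightly different surface forms of the same
    # query -- the Lancaster stemmer is more aggressive than the Porter/Snowball ('English') stemmers,
    # e.g. (illustrative, exact outputs depend on the NLTK version):
    #   PorterStemmer().stem('maximum')    -> 'maximum'
    #   LancasterStemmer().stem('maximum') -> 'maxim'
    # Index 3 additionally replaces Lucene's default similarity with BM25Similarity().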
    def prepare_features(self, dataf_q, dataf_b, train_df, cache_dir):
        """
        :param dataf_q: this is not used in my implementation
        :param dataf_b: the question/answer pairs to compute features for
        :param train_df: note that the word counters are built from the training set only, which is why
                         train_df must be passed even when creating features for the validation set
        :param cache_dir: directory for caching computed features
        :return:
        Basic features:
        1. AnswersInQuestionFunc(): compute the set of all words parsed from the question; for each answer,
           the feature is the fraction (# of answer words that also appear in the question) / (# of answer words).
           A small variation: the parser can be configured to apply word stemming.
        2. AnswersInAnswersFunc(): for each answer, compute the average ratio of its words that appear in the
           other answers to the same question. This feature will not be used.
        3. AnswerCountFunc(count_type='count', parser): uses the parser to parse all answers (and, if
           use_question=True, the questions as well); after parsing, a counter dict is built over all unique
           words, and for each answer the mean count of its words is used as the feature.
        4. AnswerCountFunc(count_type='correct', parser): same as above, but only words from correct answers
           are used to build the counter dict.
        5. AnswersLengthFunc(log_flag=False): gives the length (# of chars) of each answer relative to the
           mean length of the answers to the same question.
        Lucene feature:
        6. AnswersLuceneSearchFunc(lucene_corpus, max_docs, weight_func, score_func): for each answer, search
           for question+answer in lucene_corpus and retrieve up to max_docs documents; each document's score
           can optionally be transformed by score_func, and the transformed scores are then weighted by
           weight_func and summed to give the feature for this answer.
        """
        self.cache_dir = '%s/%s' % (self.base_dir, cache_dir)
        create_dirs([self.cache_dir])
        stemmer1 = PorterStemmer()
        stem1 = stemmer1.stem
        check_same_question = not set(dataf_b['ID']).isdisjoint(train_df['ID'])
        stemmed_parser = SimpleWordParser(word_func=stem1, ignore_special_words=True, min_word_length=1)

        func_name = 'ans_in_qst_stem'
        self.add_answer_func(dataf_b, func=AnswersInQuestionFunc(parser=stemmed_parser), name=func_name)
        func_name = 'ans_in_ans_stem'
        self.add_answer_func(dataf_b, func=AnswersInAnswersFunc(parser=stemmed_parser), name=func_name)
        func_name = 'ans_words_stem_count'
        self.add_answer_func(dataf_b, func=AnswerCountFunc(train_df, check_same_question=check_same_question,
                                                           count_type='count', parser=stemmed_parser,
                                                           single_words=True),
                             name=func_name)
        func_name = 'ans_length_ratio'
        self.add_answer_func(dataf_b, func=AnswersLengthFunc(log_flag=False), name=func_name)
        func_name = 'ans_num_words'
        self.add_answer_func(dataf_b, func=AnswersNumWordsFunc(), name=func_name)
        func_name = 'luc_stem_1'
        self.add_answer_func(dataf_b, func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus1,
                                                                   parser=self.lucene_parser1, max_docs=250,
                                                                   weight_func=lambda n: 1.0/(5.0+np.arange(n))**2,
                                                                   score_func=lambda s: (s/10.0)**3,
                                                                   norm_scores=True),
                             name=func_name)
        func_name = 'luc_stem_2'
        self.add_answer_func(dataf_b, func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus2,
                                                                   parser=self.lucene_parser2, max_docs=250,
                                                                   weight_func=lambda n: 1.0/(5.0+np.arange(n))**2,
                                                                   score_func=lambda s: (s/10.0)**3,
                                                                   norm_scores=True),
                             name=func_name)
        func_name = 'luc_stem_3'
        self.add_answer_func(dataf_b, func=AnswersLuceneSearchFunc(lucene_corpus=self.lucene_corpus3,
                                                                   parser=self.lucene_parser3, max_docs=250,
                                                                   weight_func=lambda n: 1.0/(5.0+np.arange(n))**2,
                                                                   score_func=lambda s: (s/10.0)**3,
                                                                   norm_scores=True),
                             name=func_name)
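    # Illustration (a hedged sketch of the lambdas above, not part of the pipeline): weight_func assigns
    # rapidly decaying weights to the retrieved documents by rank, and score_func compresses the raw
    # Lucene scores before weighting, e.g. for the first three retrieved documents:
    #   >>> (lambda n: 1.0/(5.0+np.arange(n))**2)(3)   # weights for ranks 0, 1, 2
    #   approximately [0.04, 0.0278, 0.0204]
    #   >>> (lambda s: (s/10.0)**3)(20.0)              # a raw score of 20 becomes 8
    #   8.0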
    def _cache_filename(self, fname):
        return '%s/%s.pkl' % (self.cache_dir, fname)

    def _read_from_cache(self, fname):
        filename = self._cache_filename(fname)
        #print 'Loading from cache %s' % filename
        return load_from_pkl(filename)

    def _save_to_cache(self, fname, data):
        filename = self._cache_filename(fname)
        print 'Saving to cache %s' % filename
        return save_to_pkl(filename, data)

    def _is_in_cache(self, name):
        if self.cache_dir is None:
            return False
        exists = True
        if np.isscalar(name):
            exists = os.path.exists(self._cache_filename(name))
        else:
            for n in name:
                exists = exists and os.path.exists(self._cache_filename(n))
        return exists

    def add_answer_func(self, train_b, func, name, question_ids=None):
        '''
        Run a score function on each set of question and answers
        '''
        # If the feature is already cached (and no recalculation is requested), load it and return
        if (not self.recalc) and (self.cache_dir is not None) and (self._is_in_cache(name)):
            if np.isscalar(name):
                train_b[name] = self._read_from_cache(name)
            else:
                for n in name:
                    train_b[n] = self._read_from_cache(n)
            return
        groups = train_b.groupby('ID').groups
        for i, (idx, inds) in enumerate(groups.iteritems()):
            assert len(set(train_b.irow(inds)['question'])) == 1
            if (question_ids is not None) and (idx not in question_ids):
                continue
            question = train_b.iloc[inds[0]]['question']
            answers = np.array(train_b.iloc[inds]['answer'])
            if 'correct' in train_b.columns:
                print '\n-----> #%d : correct = %s' % (i, ', '.join(['%d'%c for c in np.array(train_b.iloc[inds]['correct'])]))
                sys.stdout.flush()
            vals = func(question, answers)
            if question_ids is not None:
                print 'vals = %s' % str(vals)
            for val, ind in zip(vals, inds):
                if np.isscalar(val):
                    train_b.set_value(ind, name, val)
                else:
                    assert len(val) == len(name)
                    for v, n in zip(val, name):
                        train_b.set_value(ind, n, v)
        if (self.cache_dir is not None) and (question_ids is None):
            if np.isscalar(name):
                self._save_to_cache(name, np.array(train_b[name]))
            else:
                for n in name:
                    self._save_to_cache(n, np.array(train_b[n]))
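    # Usage note (hedged, inferred from the scalar/list handling above): `name` may be a single feature
    # name, or a list of names when `func` returns several values per answer, e.g.
    #   self.add_answer_func(dataf_b, func=AnswersLengthFunc(log_flag=False), name='ans_length_ratio')
    #   # self.add_answer_func(dataf_b, func=some_multi_valued_func, name=['feat_a', 'feat_b'])
    # where `some_multi_valued_func`, 'feat_a' and 'feat_b' are hypothetical placeholders.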