def prepare_word_sets(self, corpus_dir, train_b, valid_b, test_b):
     if self.print_level > 0:
         print '-> Preparing word sets'
     word_sets_file = '%s/word_sets.pkl' % corpus_dir
     print word_sets_file
     # if the file does not exist, it is created from the train/validation/test questions and answers, then stored
     # word_sets contains all unigrams and bigrams, after removing stopwords
     self.word_sets = load_from_pkl(word_sets_file)
     if self.word_sets is None:
         # Prepare the list of words (and word pairs) that appear in the training set
         # note that if tuples = [1], then parser.parse('one two three') -> ['one', 'two', 'three']
         # if tuples = [2], then parser.parse('one two three') -> ['one two', 'two three']
         # if tuples = [1,2], then parser.parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three']
         parser = SimpleWordParser(tuples=[1,2])
         words = set()
         for exam in [train_b, valid_b, test_b]:
             if exam is not None:
                 words.update(np.concatenate([self._words_to_names(parser.parse(qst)) for qst in exam['question']]))
                 words.update(np.concatenate([self._words_to_names(parser.parse(ans)) for ans in exam['answer']]))
         words.difference_update(['']) # ignore empty word
         words = sorted(words)
         if self.print_level > 1:
             print '%d word sets: ...%s...' % (len(words), words[::5000])
         self.word_sets = words
         save_to_pkl(word_sets_file, self.word_sets)
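
A minimal, self-contained sketch of the n-gram behaviour described in the comments above; it illustrates what SimpleWordParser(tuples=[1,2]) is assumed to produce and is not the project's actual parser:

def parse_ngrams(text, tuples=(1, 2)):
    # split on whitespace and emit every n-gram for each requested n
    tokens = text.lower().split()
    out = []
    for n in tuples:
        out.extend(' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return out

print parse_ngrams('one two three')  # ['one', 'two', 'three', 'one two', 'two three']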
def add_qa_features(train):
    '''
    Add simple features computed for each question
    These features are:
    1. Does the question contain 'which'
    2. Does the question contain '___'
    3. Does the question contain 'not', 'except', 'least'
    4. Number of words in the question
    5. Average number of words over the answers to this question
    '''
    parser = SimpleWordParser()
    train['q_which']     = np.array([('which' in qst.lower().split(' ')) for qst in train['question']])
    train['q____']       = np.array([('___' in qst) for qst in train['question']])
    not_words_weights = {'NOT':1, 'EXCEPT':1, 'LEAST':1}    # note the 'not' words can have unequal weights
    train['q_not']       = np.array([np.max([not_words_weights.get(w,0) for w in qst.split(' ')]) for qst in train['question']])
    train['q_num_words'] = np.array([len(parser.parse(qst)) for qst in train['question']])
    train['a_num_words'] = np.array([np.mean([len(parser.parse(ans)) for ans in anss]) for anss in np.array(train[['answerA','answerB','answerC','answerD']])])
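
A minimal usage sketch on toy data; the DataFrame rows are hypothetical and SimpleWordParser is assumed to be importable from this project:

import pandas as pd

toy = pd.DataFrame({'question': ['Which planet is the largest?', 'A day has ___ hours, NOT 23'],
                    'answerA': ['Mars', '24'], 'answerB': ['Jupiter', '25'],
                    'answerC': ['Venus', '22'], 'answerD': ['Earth', '12']})
add_qa_features(toy)
print toy[['q_which', 'q____', 'q_not', 'q_num_words', 'a_num_words']]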
Example #3
    def read(self, htmldir, outfile, stop_words=set(), pos_words=set(), page_name_word_sets=None, corpus_words=None,
             page_title_ignore_suffixes=['-1', '-2', '- Advanced'],
             ignore_sections=set(),
             min_pos_words_in_page_name=0, min_pos_words_in_section=0,
             use_all_pages_match_pos_word=False, use_all_pages_match_sets=False, always_use_first_section=False,
             action='write'):

        # reset the class variables every time, since these are static variables that belong to the class itself, not to a particular instance
        self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words, page_name_word_sets=page_name_word_sets, corpus_words=corpus_words,
                    min_pos_words_in_page_name=min_pos_words_in_page_name, min_pos_words_in_section=min_pos_words_in_section,
                    use_all_pages_match_pos_word=use_all_pages_match_pos_word, use_all_pages_match_sets=use_all_pages_match_sets,
                    always_use_first_section=always_use_first_section,
                    action=action)

        parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
        # with action='write', _start_action opens the output file for writing
        self._start_action()
        page_name, section_name, section_in_page = None, None, 0
        page_name_words, section_words = [], []
        start_time = time.time()
        # only include files named <number>.html, i.e. ignore the table HTML files
        filenames = ['%s/%s'%(htmldir,fname) for fname in os.listdir(htmldir) if re.match(r'\d+\.html$', fname) is not None]
        assert len(filenames)>0
        for ifname,fname in enumerate(filenames):
            print 'Reading %s' % fname
            with open(fname, 'rb') as myfile:
                # read the whole file into one (potentially very long) string
                text = myfile.read()
            soup = BeautifulSoup(text, 'lxml')
            if soup.h1 is None:
                print 'Could not find page title in file %s - skipping' % fname
                continue
            # note that the HTML file may contain several h1 tags; only the first one is the page title
            page_name = soup.h1.text.strip()
            # e.g. some page names carry a suffix such as 'Momentum-1'; such suffixes should be stripped
            for ptis in page_title_ignore_suffixes:
                if page_name.endswith(ptis):
                    page_name = page_name[:-len(ptis)]
                    break
            page_name_words = parser.parse(page_name)
            # e.g. page name = 'surface processes and landforms __0' (part_name_from_words appends the file index)
            # this is the name under which the page is written to the output file
            page_name = CorpusReader.part_name_from_words(page_name_words, ifname)
            print 'page name = %s' % page_name
            self._add_page(page_name, page_name_words)
            # split the raw HTML on the <h1>-<h4> header tags; the title parts are skipped below
            parts = re.split('(<h[1-4])', text)
            # start from 3 because the first 3 parts belong to the title <h1> tag, which should be skipped
            for ipart in range(3,len(parts),2):
                # odd-indexed parts are the header-tag markers ('<h1', '<h2', ...)
                # even-indexed parts are the contents that follow each tag
                soup = BeautifulSoup(parts[ipart] + parts[ipart+1], 'lxml')
                section_name = soup.find(parts[ipart][1:]).text.strip().lower()
                # skip sections whose name matches one of the ignore patterns, e.g. set(['review', 'practice', 'references', 'explore more.*']);
                # such review/practice sections contain no actual science content
                if np.any([(re.match(isr, section_name) is not None) for isr in ignore_sections]):
                    continue
                section_name_words = parser.parse(section_name)
                section_in_page = (ipart - 1) / 2
                # only select text from all the <p> tags within each section
                text = ''
                for p in soup.find_all('p'):
                    text += p.next.strip()
                # this replaces some HTML entities/symbols with English words, e.g. '&#916;' -> 'Delta'
                text = HtmlReader.parse_text(text)
                # word tokenizing
                words = parser.parse(text)
                section_words = words
                # add this section (with action='write', it is written to the output file)
                # note that section_name itself is not written to the file
                self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)

        end_time = time.time()
        print 'read_html total time = %.1f secs.' % (end_time-start_time)
        print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
        self._end_action()

        return self._locdic
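
A self-contained sketch of the header-splitting technique used in the loop above, applied to a tiny hypothetical HTML string (it uses p.text instead of p.next purely for brevity; requires BeautifulSoup with the lxml parser):

import re
from bs4 import BeautifulSoup

html = '<h1>Momentum</h1><p>intro text</p><h2>Review</h2><p>practice questions</p>'
parts = re.split('(<h[1-4])', html)
# parts[0:3] cover the <h1> page title; the remaining pairs are (header tag, tag contents)
for ipart in range(3, len(parts), 2):
    soup = BeautifulSoup(parts[ipart] + parts[ipart + 1], 'lxml')
    section_name = soup.find(parts[ipart][1:]).text.strip().lower()
    section_text = ' '.join(p.text for p in soup.find_all('p'))
    print '%s -> %s' % (section_name, section_text)  # review -> practice questions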
Example #4
def prp_binary_dataf(train):
    """
    :param train: pandas df
    :return:
    this function expands each question into 4 rows, one for each answer,
    e.g. 'How many hours in a day? A.22 B.23 C.24 D.25 other_features'
    now becomes
    How many hours in a day? A.22 other_features False
    How many hours in a day? B.23 other_features False
    How many hours in a day? C.24 other_features True
    How many hours in a day? D.25 other_features False

    The reason for doing this is that we want to fit a binary classifier that gives a score to each answer of a given question
    """
    stemmer = PorterStemmer()
    parser = SimpleWordParser(word_func=stemmer.stem, min_word_length=1, tolower=True, ascii_conversion=True, ignore_special_words=False)
    indices, questions, answers, correct, ans_names, more_cols_vals = [], [], [], [], [], []
    is_all, is_both, is_none, keywords = [], [], [], []
    if 'correctAnswer' in train.columns:
        correct_answer = np.array(train['correctAnswer'])
    else:
        correct_answer = np.zeros(len(train))
    more_cols = [col for col in train.columns if col not in ['question', 'answerA', 'answerB', 'answerC', 'answerD', 'correctAnswer']]
    for idx,(qst,ansA,ansB,ansC,ansD),cor,mcols in zip(train.index, np.array(train[['question', 'answerA', 'answerB', 'answerC', 'answerD']]),
                                                       correct_answer, np.array(train[more_cols])):
        for ia,(ic,ans) in enumerate(zip(['A','B','C','D'],[ansA, ansB, ansC, ansD])):
            indices.append(idx)
            questions.append(qst)
            a_ans, a_all, a_both, a_none, a_keywords = ans, 0, 0, 0, 0
            if ans.endswith(MARK_ANSWER_ALL):
                a_ans = ans[:-len(MARK_ANSWER_ALL)]
                a_all = 1
            elif ans.endswith(MARK_ANSWER_BOTH):
                a_ans = ans[:-len(MARK_ANSWER_BOTH)]
                a_both = 1
            elif ans.endswith(MARK_ANSWER_NONE):
                a_ans = ans[:-len(MARK_ANSWER_NONE)]
                a_none = 1
            else:
                words = parser.parse(ans)
                if 'both' in words:
                    a_both = 0.5
                # note: this keyword feature ends up unused (it is commented out of pdict below)
                if stemmer.stem('investigation') in words:
                    a_keywords = 1
            answers.append(a_ans)
            is_all.append(a_all)
            is_both.append(a_both)
            is_none.append(a_none)
            keywords.append(a_keywords)
            # this is for test set
            if cor==0:
                correct.append(0) # no 'correctAnswer' column -> set correct=0 for all answers
            else:
                correct.append(1 if ia==(ord(cor)-ord('A')) else 0)
            ans_names.append(ic)
            more_cols_vals.append(mcols)
    pdict = {'ID': indices, 'question': questions, 'answer': answers, 'correct': correct, 'ans_name': ans_names,
             'is_all': is_all, 'is_both': is_both, 'is_none': is_none} #, 'ans_keywords': keywords}
    for icol,mcol in enumerate(more_cols):
        pdict[mcol] = np.array([vals[icol] for vals in more_cols_vals])
    return pd.DataFrame(pdict)
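
A minimal usage sketch with a single hypothetical row; it assumes SimpleWordParser, PorterStemmer and the MARK_ANSWER_* constants from this project are importable:

import pandas as pd

toy = pd.DataFrame({'question': ['How many hours are in a day?'],
                    'answerA': ['22'], 'answerB': ['23'], 'answerC': ['24'], 'answerD': ['25'],
                    'correctAnswer': ['C']})
binary_df = prp_binary_dataf(toy)
print binary_df[['ID', 'question', 'answer', 'ans_name', 'correct']]
# expected: four rows, one per answer, with correct == 1 only where ans_name == 'C'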
Example #5
    def create_corpus(
        self,
        train_b,
        valid_b,
        min_pos_words_in_page_name,
        min_pos_words_in_section,
        only_first_section_per_page=False,
        max_sections_per_page=99999999,
        use_all_pages_match_pos_word=True,
        use_all_pages_match_answer=True,
        pages_to_use=None,
        always_use_first_section=False,
        max_read_lines=99900000000,
        reread=False,
    ):
        print "=> Creating corpus"
        self.min_pos_words_in_page_name = min_pos_words_in_page_name
        self.min_pos_words_in_section = min_pos_words_in_section
        self.only_first_section_per_page = only_first_section_per_page
        self.max_sections_per_page = max_sections_per_page
        self.use_all_pages_match_pos_word = use_all_pages_match_pos_word
        self.use_all_pages_match_answer = use_all_pages_match_answer
        self.always_use_first_section = always_use_first_section
        exams_words_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.EXAMS_WORDS_FILE)
        pos_words_file = "%s/%s_%.4f_%s%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            "wsw_" if self.use_wiki_stop_words else "",
            WikiCorpusBuilder.POSITIVE_WORDS_FILE,
        )
        answers_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.ANSWERS_FILE)
        corpus_file = "%s/%s_%.4f_%s%.4f_%d_%d_%s_%s_%s" % (
            self.wiki_dir,
            self.wiki_name,
            self.wiki_common_words_min_frac,
            "wsw_" if self.use_wiki_stop_words else "",
            self.wiki_uncommon_words_max_frac,
            self.min_pos_words_in_page_name,
            self.min_pos_words_in_section,
            self.use_all_pages_match_pos_word,
            self.use_all_pages_match_answer,
            self.always_use_first_section,
        )
        if pages_to_use is not None:
            corpus_file = "%s_pn%d" % (corpus_file, len(pages_to_use))
        corpus_file = "%s_%s" % (corpus_file, WikiCorpusBuilder.CORPUS_FILE)
        print "Corpus file: %s" % corpus_file
        gc.collect()

        # Build the exam word list, positive-word set and answer set from the train(+validation) exams
        if reread or (not os.path.exists(pos_words_file)) or (not os.path.exists(answers_file)):
            # Get all the words that appear in the exams
            if valid_b is None:
                all_exams = train_b[["ID", "question", "answer"]]
            else:
                all_exams = pd.concat([train_b[["ID", "question", "answer"]], valid_b[["ID", "question", "answer"]]])
            parser = SimpleWordParser()
            exams_locdic = build_training_location_dictionary(
                all_exams,
                parser=parser,
                use_answers=True,
                min_word_docs_frac=0,
                max_word_docs_frac=1.0,
                min_word_count_frac=0,
                max_word_count_frac=1.0,
                ascii_conversion=True,
            )
            self.exams_words = exams_locdic.word_ids.keys()
            # Set the "positive_words" as all the words from the train(+validation) files that are uncommon in Wiki
            self.pos_words = set(self.exams_words).intersection(self.wiki_uncommon_words)
            # Get all the answers (each answer = a set of words)
            self.all_answers = set()
            for answer in all_exams["answer"]:
                self.all_answers.add(tuple(sorted(parser.parse(answer))))
            save_to_pkl(exams_words_file, self.exams_words)
            save_to_pkl(pos_words_file, self.pos_words)
            save_to_pkl(answers_file, self.all_answers)
        else:
            self.exams_words = load_from_pkl(exams_words_file)
            self.pos_words = load_from_pkl(pos_words_file)
            self.all_answers = load_from_pkl(answers_file)

        print "There are %d positive words (%d wiki uncommon words, %d words from exams)" % (
            len(self.pos_words),
            len(self.wiki_uncommon_words),
            len(self.exams_words),
        )
        print "There are a total of %d unique answers" % len(self.all_answers)
        print "Using %d stop words" % (len(self.stop_words))
        if pages_to_use is None:
            use_pages = self.pages_in_categories
        else:
            use_pages = pages_to_use
        print "Considering %d pages" % len(use_pages)

        if reread or (not os.path.exists(corpus_file)):
            print "Writing %s corpus to %s" % (self.wiki_name, corpus_file)
            ld = self.wikir.read(
                wikifile="%s/%s" % (self.wiki_dir, self.wiki_file),
                outfile=corpus_file,
                only_first_section_per_page=self.only_first_section_per_page,
                max_sections_per_page=self.max_sections_per_page,
                use_pages=use_pages,
                max_read_lines=max_read_lines,
                stop_words=self.stop_words,
                pos_words=self.pos_words,
                page_name_word_sets=self.all_answers,
                corpus_words=None,  ##set(exams_locdic.word_ids.keys()),
                min_pos_words_in_page_name=self.min_pos_words_in_page_name,
                min_pos_words_in_section=self.min_pos_words_in_section,
                use_all_pages_match_pos_word=self.use_all_pages_match_pos_word,
                use_all_pages_match_sets=self.use_all_pages_match_answer,
                always_use_first_section=self.always_use_first_section,
                action="write",
            )
            print "Done writing corpus"

        gc.collect()
        return corpus_file
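
A small sketch, with hypothetical word sets, of the two set-building steps above: positive words are exam words that are uncommon in the wiki dump, and each answer is stored as a sorted tuple of its words so that answers differing only in word order map to the same key:

exam_words = set(['the', 'momentum', 'of', 'an', 'object', 'photosynthesis'])
wiki_uncommon = set(['momentum', 'photosynthesis', 'mitosis'])
pos_words = exam_words.intersection(wiki_uncommon)

all_answers = set()
for answer in ['momentum of an object', 'an object of momentum']:
    all_answers.add(tuple(sorted(answer.split())))
print sorted(pos_words)  # ['momentum', 'photosynthesis']
print len(all_answers)   # 1 -- both phrasings collapse to the same key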
Example #6
    def read(self, dir, outfile, stop_words=set(), pos_words=set(),
             first_line_regexp='^CHAPTER',
             ignore_sections=set(), section_end_regexp='^\s*$',
             action='write'):

        self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words, page_name_word_sets=set(), corpus_words=None,
                    min_pos_words_in_page_name=0, min_pos_words_in_section=0,
                    use_all_pages_match_pos_word=True, use_all_pages_match_sets=True, always_use_first_section=False,
                    action=action)

        parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)

        first_line_re = re.compile(first_line_regexp)
        section_end_re = re.compile(section_end_regexp)

        self._start_action()
        page_name, section_name, section_in_page = None, None, 0
        page_name_words, section_words = [], []
        start_time = time.time()
        # include all the .text files, which were converted from the PDF textbooks
        filenames = ['%s/%s'%(dir,fname) for fname in os.listdir(dir) if fname.endswith('.text')]
        assert len(filenames)>0
        for ifname,fname in enumerate(filenames):
            print 'Reading %s' % fname
            page_name = fname[:-5]
            page_name_words = []
            # 1 file is 1 page
            print 'page name = %s' % page_name
            self._add_page(page_name, page_name_words)
            section_in_page = 0
            section_name, section_name_words = '', []
            with open (fname, 'rb') as myfile:
                found_first_line = False
                text = ''
                # search for first_line_re in the file, e.g. 'CHAPTER' for the CK-12 textbooks
                # (which may appear as 'C HAPTER' in the raw converted text);
                # once the first line is found, whenever a line matches section_end_re (an empty or
                # whitespace-only line) the lines collected so far are written out as a new section
                # (treating every paragraph as a section yields more than 5000 sections for a single page/file)
                # _add_section in CorpusReader.py also checks that each candidate section is valid; for instance,
                # sections with too few words, or that contain only figures and formulas, are ignored
                for line in myfile:
                    line = line.strip()
                    # note that the online PDF-to-text converter used here renders some title captions as
                    # 'V IRAL S EXUALLY T RANSMITTED I NFECTIONS'; the stray spaces inside such words are removed below
                    line = re.sub('(?<=[A-Z]{1})(\s)(?=[A-Z]{2,})', '', line)
                    if found_first_line:
                        if re.match(section_end_re, line) is not None:
                            # Add previous section
                            section_words = parser.parse(text)
                            self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)
                            section_in_page += 1
                            section_name, section_name_words = '', []
                            text = ''
                        else:
                            text += ' ' + line
                    else:
                        if re.match(first_line_re, line) is not None:
                            found_first_line = True
            assert found_first_line, 'Could not find first line in file %s' % fname
            # Add last section
            section_words = parser.parse(text)
            self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)

        end_time = time.time()
        print 'read_text total time = %.1f secs.' % (end_time-start_time)
        print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
        self._end_action()

        return self._locdic
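
A self-contained sketch of the caption-fixing substitution used in read() above: the pattern removes a space that is preceded by a single capital letter and followed by a run of at least two capitals, rejoining words that the PDF-to-text conversion split apart:

import re

line = 'V IRAL S EXUALLY T RANSMITTED I NFECTIONS'
fixed = re.sub('(?<=[A-Z]{1})(\s)(?=[A-Z]{2,})', '', line)
print fixed  # VIRAL SEXUALLY TRANSMITTED INFECTIONS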