def prepare_word_sets(self, corpus_dir, train_b, valid_b, test_b):
    if self.print_level > 0:
        print '-> Preparing word sets'
    word_sets_file = '%s/word_sets.pkl' % corpus_dir
    print word_sets_file
    # If the pickle does not exist, it will be created from the training set and stored.
    # word_sets contains all 1-grams and 2-grams after removing stopwords.
    self.word_sets = load_from_pkl(word_sets_file)
    if self.word_sets is None:
        # Prepare the list of words (and word pairs) that appear in the training set.
        # Note that if tuples = [1],   then parser.parse('one two three') -> ['one', 'two', 'three']
        #           if tuples = [2],   then parser.parse('one two three') -> ['one two', 'two three']
        #           if tuples = [1,2], then parser.parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three']
        parser = SimpleWordParser(tuples=[1,2])
        words = set()
        for exam in [train_b, valid_b, test_b]:
            if exam is not None:
                words.update(np.concatenate([self._words_to_names(parser.parse(qst)) for qst in exam['question']]))
                words.update(np.concatenate([self._words_to_names(parser.parse(ans)) for ans in exam['answer']]))
        words.difference_update([''])  # ignore the empty word
        words = sorted(words)
        if self.print_level > 1:
            print '%d word sets: ...%s...' % (len(words), words[::5000])
        self.word_sets = words
        save_to_pkl(word_sets_file, self.word_sets)
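# A minimal standalone sketch of the tuples=[1,2] expansion described in the comments above,
# assuming a plain whitespace tokenizer; the real SimpleWordParser additionally lowercases,
# converts to ascii and removes stopwords, so this only illustrates the n-gram idea.
def _demo_tuples_parse(text, tuples=[1, 2]):
    tokens = text.split()
    grams = []
    for n in tuples:
        # collect all contiguous n-grams, joined by a single space
        grams.extend([' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)])
    return grams

# _demo_tuples_parse('one two three') -> ['one', 'two', 'three', 'one two', 'two three']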
def add_qa_features(train):
    '''
    Add simple features computed for each question.
    These features are:
    1. Does the question contain 'which'?
    2. Does the question contain '___'?
    3. Does the question contain 'not', 'except' or 'least'?
    4. Number of words in the question
    5. Average number of words over the question's answers
    '''
    parser = SimpleWordParser()
    train['q_which'] = np.array([('which' in qst.lower().split(' ')) for qst in train['question']])
    train['q____'] = np.array([('___' in qst) for qst in train['question']])
    not_words_weights = {'NOT': 1, 'EXCEPT': 1, 'LEAST': 1}  # note that the 'not' words could be given unequal weights
    train['q_not'] = np.array([np.max([not_words_weights.get(w, 0) for w in qst.split(' ')]) for qst in train['question']])
    train['q_num_words'] = np.array([len(parser.parse(qst)) for qst in train['question']])
    train['a_num_words'] = np.array([np.mean([len(parser.parse(ans)) for ans in anss])
                                     for anss in np.array(train[['answerA','answerB','answerC','answerD']])])
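# A minimal usage sketch with hypothetical data, assuming pandas (pd) and this module's
# SimpleWordParser are available; the demo DataFrame is not part of the original code.
def _demo_add_qa_features():
    demo = pd.DataFrame({'question': ['Which of these is NOT a mammal?'],
                         'answerA': ['whale'], 'answerB': ['bat'],
                         'answerC': ['penguin'], 'answerD': ['dolphin']})
    add_qa_features(demo)
    # demo now also has the feature columns:
    #   q_which=True (the question contains 'which'), q____=False, q_not=1 (it contains 'NOT'),
    #   plus q_num_words and a_num_words (token counts from SimpleWordParser)
    return demo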
def read(self, htmldir, outfile, stop_words=set(), pos_words=set(),
         page_name_word_sets=None, corpus_words=None,
         page_title_ignore_suffixes=['-1', '-2', '- Advanced'], ignore_sections=set(),
         min_pos_words_in_page_name=0, min_pos_words_in_section=0,
         use_all_pages_match_pos_word=False, use_all_pages_match_sets=False,
         always_use_first_section=False, action='write'):
    # Reset the class variables every time, since these are static variables that belong
    # to the class itself, not to a particular instance
    self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words,
                page_name_word_sets=page_name_word_sets, corpus_words=corpus_words,
                min_pos_words_in_page_name=min_pos_words_in_page_name,
                min_pos_words_in_section=min_pos_words_in_section,
                use_all_pages_match_pos_word=use_all_pages_match_pos_word,
                use_all_pages_match_sets=use_all_pages_match_sets,
                always_use_first_section=always_use_first_section, action=action)
    parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
    # The action is 'write', so _start_action will open the output file for writing
    self._start_action()
    page_name, section_name, section_in_page = None, None, 0
    page_name_words, section_words = [], []
    start_time = time.time()
    # Only include x.html where x is a number, i.e. ignore the table html files
    filenames = ['%s/%s' % (htmldir, fname) for fname in os.listdir(htmldir) if re.match(r'(\d+).html', fname) is not None]
    assert len(filenames) > 0
    for ifname, fname in enumerate(filenames):
        print 'Reading %s' % fname
        with open(fname, 'rb') as myfile:
            # the whole file as one (very long) string
            text = myfile.read()
        soup = BeautifulSoup(text, 'lxml')
        if soup.h1 is None:
            print 'Could not find page title in file %s - skipping' % fname
            continue
        # The html file may contain many h1 tags; only the first one is the page title
        page_name = soup.h1.text.strip()
        # Some page names carry a suffix, e.g. 'Momentum-1', where the '-1' should be removed
        for ptis in page_title_ignore_suffixes:
            if page_name.endswith(ptis):
                page_name = page_name[:-len(ptis)]
                break
        page_name_words = parser.parse(page_name)
        # e.g. page name = 'surface processes and landforms __0'; this is what gets written to file
        page_name = CorpusReader.part_name_from_words(page_name_words, ifname)
        print 'page name = %s' % page_name
        self._add_page(page_name, page_name_words)
        # Split the text (without the title) on the <h1>-<h4> tags
        parts = re.split('(<h[1-4])', text)
        # Start from 3 because the first 3 parts belong to the title <h1> tag, which should be skipped;
        # odd-numbered parts are the splitter tags, even-numbered parts are the tags' contents
        for ipart in range(3, len(parts), 2):
            soup = BeautifulSoup(parts[ipart] + parts[ipart+1], 'lxml')
            section_name = soup.find(parts[ipart][1:]).text.strip().lower()
            # Skip sections whose names match set(['review', 'practice', 'references', 'explore more.*']) -
            # these are review sections that do not contain science knowledge
            if np.any([(re.match(isr, section_name) is not None) for isr in ignore_sections]):
                continue
            section_name_words = parser.parse(section_name)
            section_in_page = (ipart - 1) / 2
            # Only take the text from the <p> tags within each section
            text = ''
            for p in soup.find_all('p'):
                text += p.next.strip()
            # Replace some symbols by their English names, e.g. 'Δ' -> 'Delta'
            text = HtmlReader.parse_text(text)
            # Word tokenization
            words = parser.parse(text)
            section_words = words
            # Add the section, i.e. write it to file; note that section_name itself is not written to file
            self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)
    end_time = time.time()
    print 'read_html total time = %.1f secs.' % (end_time - start_time)
    print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
    self._end_action()
    return self._locdic
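# A minimal standalone sketch of the header-splitting step used above, assuming `re` is
# imported as in the reader: splitting with a capturing group keeps the '<h1'..'<h4'
# delimiters, so the result alternates between header tags and their contents.
def _demo_split_on_headers():
    html = '<h1>Title</h1><p>intro</p><h2>Sect A</h2><p>text A</p><h3>Sect B</h3><p>text B</p>'
    parts = re.split('(<h[1-4])', html)
    # parts == ['', '<h1', '>Title</h1><p>intro</p>',
    #           '<h2', '>Sect A</h2><p>text A</p>',
    #           '<h3', '>Sect B</h3><p>text B</p>']
    # The first 3 entries belong to the page title; sections start at index 3, stepping by 2.
    return parts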
def prp_binary_dataf(train):
    """
    :param train: pandas DataFrame
    :return: a DataFrame in which each question is expanded into 4 rows, one per answer, e.g.

        How many hours in a day?  A.22  B.23  C.24  D.25  other_features

    becomes

        How many hours in a day?  A.22  other_features  False
        How many hours in a day?  B.23  other_features  False
        How many hours in a day?  C.24  other_features  True
        How many hours in a day?  D.25  other_features  False

    The reason for doing this is that we want to fit a binary classifier that gives a score
    to each answer of a given question.
    """
    stemmer = PorterStemmer()
    parser = SimpleWordParser(word_func=stemmer.stem, min_word_length=1, tolower=True, ascii_conversion=True, ignore_special_words=False)
    indices, questions, answers, correct, ans_names, more_cols_vals = [], [], [], [], [], []
    is_all, is_both, is_none, keywords = [], [], [], []
    if 'correctAnswer' in train.columns:
        correct_answer = np.array(train['correctAnswer'])
    else:
        correct_answer = np.zeros(len(train))
    more_cols = [col for col in train.columns if col not in ['question', 'answerA', 'answerB', 'answerC', 'answerD', 'correctAnswer']]
    for idx,(qst,ansA,ansB,ansC,ansD),cor,mcols in zip(train.index, np.array(train[['question', 'answerA', 'answerB', 'answerC', 'answerD']]), correct_answer, np.array(train[more_cols])):
        for ia,(ic,ans) in enumerate(zip(['A','B','C','D'], [ansA, ansB, ansC, ansD])):
            indices.append(idx)
            questions.append(qst)
            a_ans, a_all, a_both, a_none, a_keywords = ans, 0, 0, 0, 0
            if ans.endswith(MARK_ANSWER_ALL):
                a_ans = ans[:-len(MARK_ANSWER_ALL)]
                a_all = 1
            elif ans.endswith(MARK_ANSWER_BOTH):
                a_ans = ans[:-len(MARK_ANSWER_BOTH)]
                a_both = 1
            elif ans.endswith(MARK_ANSWER_NONE):
                a_ans = ans[:-len(MARK_ANSWER_NONE)]
                a_none = 1
            else:
                words = parser.parse(ans)
                if 'both' in words:
                    a_both = 0.5
                # note that the keyword feature below is not used ('ans_keywords' is commented out of pdict)
                if stemmer.stem('investigation') in words:
                    a_keywords = 1
            answers.append(a_ans)
            is_all.append(a_all)
            is_both.append(a_both)
            is_none.append(a_none)
            keywords.append(a_keywords)
            if cor == 0:
                # this is the test set: no 'correctAnswer' column -> set correct=0 for all answers
                correct.append(0)
            else:
                correct.append(1 if ia == (ord(cor) - ord('A')) else 0)
            ans_names.append(ic)
            more_cols_vals.append(mcols)
    pdict = {'ID': indices, 'question': questions, 'answer': answers, 'correct': correct, 'ans_name': ans_names,
             'is_all': is_all, 'is_both': is_both, 'is_none': is_none}  # , 'ans_keywords': keywords}
    for icol, mcol in enumerate(more_cols):
        pdict[mcol] = np.array([vals[icol] for vals in more_cols_vals])
    return pd.DataFrame(pdict)
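# A minimal usage sketch with hypothetical data, assuming pandas (pd) and this module's
# helpers (SimpleWordParser, PorterStemmer, the MARK_ANSWER_* constants) are available:
# each input question yields 4 binary rows, with 'correct' derived from 'correctAnswer'.
def _demo_prp_binary_dataf():
    demo = pd.DataFrame({'ID': [1],
                         'question': ['How many hours are in a day?'],
                         'answerA': ['22'], 'answerB': ['23'],
                         'answerC': ['24'], 'answerD': ['25'],
                         'correctAnswer': ['C']})
    binary = prp_binary_dataf(demo)
    # binary has 4 rows: ans_name is 'A','B','C','D' and correct is 0,0,1,0
    return binary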
def create_corpus(
    self,
    train_b,
    valid_b,
    min_pos_words_in_page_name,
    min_pos_words_in_section,
    only_first_section_per_page=False,
    max_sections_per_page=99999999,
    use_all_pages_match_pos_word=True,
    use_all_pages_match_answer=True,
    pages_to_use=None,
    always_use_first_section=False,
    max_read_lines=99900000000,
    reread=False,
):
    print "=> Creating corpus"
    self.min_pos_words_in_page_name = min_pos_words_in_page_name
    self.min_pos_words_in_section = min_pos_words_in_section
    self.only_first_section_per_page = only_first_section_per_page
    self.max_sections_per_page = max_sections_per_page
    self.use_all_pages_match_pos_word = use_all_pages_match_pos_word
    self.use_all_pages_match_answer = use_all_pages_match_answer
    self.always_use_first_section = always_use_first_section
    exams_words_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.EXAMS_WORDS_FILE)
    pos_words_file = "%s/%s_%.4f_%s%s" % (
        self.wiki_dir,
        self.wiki_name,
        self.wiki_common_words_min_frac,
        "wsw_" if self.use_wiki_stop_words else "",
        WikiCorpusBuilder.POSITIVE_WORDS_FILE,
    )
    answers_file = "%s/%s_%s" % (self.wiki_dir, self.wiki_name, WikiCorpusBuilder.ANSWERS_FILE)
    corpus_file = "%s/%s_%.4f_%s%.4f_%d_%d_%s_%s_%s" % (
        self.wiki_dir,
        self.wiki_name,
        self.wiki_common_words_min_frac,
        "wsw_" if self.use_wiki_stop_words else "",
        self.wiki_uncommon_words_max_frac,
        self.min_pos_words_in_page_name,
        self.min_pos_words_in_section,
        self.use_all_pages_match_pos_word,
        self.use_all_pages_match_answer,
        self.always_use_first_section,
    )
    if pages_to_use is not None:
        corpus_file = "%s_pn%d" % (corpus_file, len(pages_to_use))
    corpus_file = "%s_%s" % (corpus_file, WikiCorpusBuilder.CORPUS_FILE)
    print "Corpus file: %s" % corpus_file
    gc.collect()
    # Get the corpus of the train+validation sets
    if reread or (not os.path.exists(pos_words_file)) or (not os.path.exists(answers_file)):
        # Get all the words that appear in the exams
        if valid_b is None:
            all_exams = train_b[["ID", "question", "answer"]]
        else:
            all_exams = pd.concat([train_b[["ID", "question", "answer"]], valid_b[["ID", "question", "answer"]]])
        parser = SimpleWordParser()
        exams_locdic = build_training_location_dictionary(
            all_exams,
            parser=parser,
            use_answers=True,
            min_word_docs_frac=0,
            max_word_docs_frac=1.0,
            min_word_count_frac=0,
            max_word_count_frac=1.0,
            ascii_conversion=True,
        )
        self.exams_words = exams_locdic.word_ids.keys()
        # Set the "positive words" to all words from the train (+validation) files that are uncommon in Wiki
        self.pos_words = set(self.exams_words).intersection(self.wiki_uncommon_words)
        # Get all the answers (each answer = a set of words)
        self.all_answers = set()
        for answer in all_exams["answer"]:
            self.all_answers.add(tuple(sorted(parser.parse(answer))))
        save_to_pkl(exams_words_file, self.exams_words)
        save_to_pkl(pos_words_file, self.pos_words)
        save_to_pkl(answers_file, self.all_answers)
    else:
        self.exams_words = load_from_pkl(exams_words_file)
        self.pos_words = load_from_pkl(pos_words_file)
        self.all_answers = load_from_pkl(answers_file)
    print "There are %d positive words (%d wiki uncommon words, %d words from exams)" % (
        len(self.pos_words),
        len(self.wiki_uncommon_words),
        len(self.exams_words),
    )
    print "There are a total of %d unique answers" % len(self.all_answers)
    print "Using %d stop words" % (len(self.stop_words))
    if pages_to_use is None:
        use_pages = self.pages_in_categories
    else:
        use_pages = pages_to_use
    print "Considering %d pages" % len(use_pages)
    if reread or (not os.path.exists(corpus_file)):
        print "Writing %s corpus to %s" % (self.wiki_name, corpus_file)
        ld = self.wikir.read(
            wikifile="%s/%s" % (self.wiki_dir, self.wiki_file),
            outfile=corpus_file,
            only_first_section_per_page=self.only_first_section_per_page,
            max_sections_per_page=self.max_sections_per_page,
            use_pages=use_pages,
            max_read_lines=max_read_lines,
            stop_words=self.stop_words,
            pos_words=self.pos_words,
            page_name_word_sets=self.all_answers,
            corpus_words=None,  # set(exams_locdic.word_ids.keys()),
            min_pos_words_in_page_name=self.min_pos_words_in_page_name,
            min_pos_words_in_section=self.min_pos_words_in_section,
            use_all_pages_match_pos_word=self.use_all_pages_match_pos_word,
            use_all_pages_match_sets=self.use_all_pages_match_answer,
            always_use_first_section=self.always_use_first_section,
            action="write",
        )
        print "Done writing corpus"
    gc.collect()
    return corpus_file
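# A minimal standalone sketch (toy data, not from the project) of the "positive words"
# selection above: the positive words are simply the exam vocabulary intersected with the
# words that are uncommon in the wiki dump, so wiki pages and sections can later be kept
# or dropped based on how many of these exam-specific words they contain.
def _demo_pos_words():
    exams_words = ["photosynthesis", "the", "of", "mitosis", "energy"]
    wiki_uncommon_words = set(["photosynthesis", "mitosis", "meiosis"])
    pos_words = set(exams_words).intersection(wiki_uncommon_words)
    # pos_words == set(['photosynthesis', 'mitosis'])
    return pos_words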
def read(self, dir, outfile, stop_words=set(), pos_words=set(),
         first_line_regexp='^CHAPTER', ignore_sections=set(), section_end_regexp='^\s*$',
         action='write'):
    self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words,
                page_name_word_sets=set(), corpus_words=None,
                min_pos_words_in_page_name=0, min_pos_words_in_section=0,
                use_all_pages_match_pos_word=True, use_all_pages_match_sets=True,
                always_use_first_section=False, action=action)
    parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
    first_line_re = re.compile(first_line_regexp)
    section_end_re = re.compile(section_end_regexp)
    self._start_action()
    page_name, section_name, section_in_page = None, None, 0
    page_name_words, section_words = [], []
    start_time = time.time()
    # All the .text files, which were converted from the pdf books
    filenames = ['%s/%s' % (dir, fname) for fname in os.listdir(dir) if fname.endswith('.text')]
    assert len(filenames) > 0
    for ifname, fname in enumerate(filenames):
        print 'Reading %s' % fname
        page_name = fname[:-5]
        page_name_words = []
        # 1 file is 1 page
        print 'page name = %s' % page_name
        self._add_page(page_name, page_name_words)
        section_in_page = 0
        section_name, section_name_words = '', []
        with open(fname, 'rb') as myfile:
            found_first_line = False
            text = ''
            # Search for first_line_re in the file, e.g. 'CHAPTER' for the CK-12 textbooks
            # (in the raw text it may appear as 'C HAPTER'; the spaced-capitals fix below repairs this before matching).
            # Once the first line is found, whenever a line matches section_end_re (i.e. an empty line),
            # the lines seen so far are written out as a new section.
            # It turns out that treating each paragraph as a section yields more than 5000 sections for one page (file).
            # To actually add a section, _add_section in CorpusReader.py also checks that it is a valid section;
            # for instance, a section is ignored if it has too few words or merely contains figures and formulas.
            for line in myfile:
                line = line.strip()
                # The online pdf-to-text converter that was used produces some title captions as
                # 'V IRAL S EXUALLY T RANSMITTED I NFECTIONS', where the extra space between characters should be removed
                line = re.sub('(?<=[A-Z]{1})(\s)(?=[A-Z]{2,})', '', line)
                if found_first_line:
                    if re.match(section_end_re, line) is not None:
                        # Add the previous section
                        section_words = parser.parse(text)
                        self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)
                        section_in_page += 1
                        section_name, section_name_words = '', []
                        text = ''
                    else:
                        text += ' ' + line
                else:
                    if re.match(first_line_re, line) is not None:
                        found_first_line = True
        assert found_first_line, 'Could not find first line in file %s' % fname
        # Add the last section
        section_words = parser.parse(text)
        self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)
    end_time = time.time()
    print 'read_text total time = %.1f secs.' % (end_time - start_time)
    print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
    self._end_action()
    return self._locdic
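# A minimal standalone sketch of the spaced-capitals clean-up used above, assuming `re` is
# imported as in the reader: the zero-width lookbehind/lookahead remove a space that sits
# between a single capital and a run of 2+ capitals, repairing pdf-to-text artifacts
# without joining genuinely separate words.
def _demo_fix_spaced_caps():
    line = 'V IRAL S EXUALLY T RANSMITTED I NFECTIONS'
    fixed = re.sub(r'(?<=[A-Z]{1})(\s)(?=[A-Z]{2,})', '', line)
    # fixed == 'VIRAL SEXUALLY TRANSMITTED INFECTIONS'
    return fixed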