Example 1
# imports required by this snippet (SimpleWordParser, CorpusReader and HtmlReader
# are project classes defined elsewhere in the same codebase):
import os
import re
import time
import numpy as np
from bs4 import BeautifulSoup

    def read(self, htmldir, outfile, stop_words=set(), pos_words=set(), page_name_word_sets=None, corpus_words=None,
             page_title_ignore_suffixes=['-1', '-2', '- Advanced'],
             ignore_sections=set(),
             min_pos_words_in_page_name=0, min_pos_words_in_section=0,
             use_all_pages_match_pos_word=False, use_all_pages_match_sets=False, always_use_first_section=False,
             action='write'):

        # reset the class variables every time, since they are static variables that belong to the class itself, not to a particular instance
        self._reset(outfile=outfile, stop_words=stop_words, pos_words=pos_words, page_name_word_sets=page_name_word_sets, corpus_words=corpus_words,
                    min_pos_words_in_page_name=min_pos_words_in_page_name, min_pos_words_in_section=min_pos_words_in_section,
                    use_all_pages_match_pos_word=use_all_pages_match_pos_word, use_all_pages_match_sets=use_all_pages_match_sets,
                    always_use_first_section=always_use_first_section,
                    action=action)

        parser = SimpleWordParser(tolower=True, ascii_conversion=True, ignore_special_words=False)
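        # illustration (hypothetical, since SimpleWordParser is project code): with
        # tolower=True and ascii_conversion=True, a title like "Newton's Laws" would
        # come back as lower-cased ascii tokens such as ['newton', 's', 'laws']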
        # with action='write' (the default), _start_action opens the output file for writing
        self._start_action()
        page_name, section_name, section_in_page = None, None, 0
        page_name_words, section_words = [], []
        start_time = time.time()
        # only include files whose names are '<digits>.html'; this skips the table html files
        filenames = ['%s/%s'%(htmldir,fname) for fname in os.listdir(htmldir) if re.match(r'\d+\.html$', fname) is not None]
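        # e.g. '7.html' and '42.html' match, while 'table_42.html' and '42.html.bak' do not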
        assert len(filenames)>0
        for ifname,fname in enumerate(filenames):
            print 'Reading %s' % fname
            with open(fname, 'rb') as myfile:
                # read the whole file into a single (potentially very long) string
                text = myfile.read()
            soup = BeautifulSoup(text, 'lxml')
            if soup.h1 is None:
                print 'Could not find page title in file %s - skipping' % fname
                continue
            # the html file may contain several h1 tags; only the first one is the page title
            page_name = soup.h1.text.strip()
            # some page names carry a suffix that should be stripped, e.g. the '-1' in 'Momentum-1'
            for ptis in page_title_ignore_suffixes:
                if page_name.endswith(ptis):
                    page_name = page_name[:-len(ptis)]
                    break
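            # e.g. 'Momentum-1' -> 'Momentum' and 'Waves - Advanced' -> 'Waves '
            # (any leftover trailing whitespace is discarded by the word parser below)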
            page_name_words = parser.parse(page_name)
            # part_name_from_words builds the final name, e.g.
            # page name = 'surface processes and landforms __0' (the __0 comes from the file index ifname);
            # _add_page below then writes it to the output file
            page_name = CorpusReader.part_name_from_words(page_name_words, ifname)
            print 'page name = %s' % page_name
            self._add_page(page_name, page_name_words)
            # split the page html on <h1>-<h4> heading tags; the capturing group keeps the tags in the result
            parts = re.split('(<h[1-4])', text)
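            # e.g. re.split('(<h[1-4])', '<h1>T</h1>intro<h2>S</h2>body') ->
            #      ['', '<h1', '>T</h1>intro', '<h2', '>S</h2>body']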
            # start at 3: parts[0] is anything before the first heading, and parts[1:3] are the <h1> title tag and its content, already handled above
            for ipart in range(3,len(parts),2):
                # odd-indexed parts are the heading tags ('<h1' .. '<h4'),
                # even-indexed parts are the content that follows each heading
                soup = BeautifulSoup(parts[ipart] + parts[ipart+1], 'lxml')
                section_name = soup.find(parts[ipart][1:]).text.strip().lower()
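                # parts[ipart] is a tag prefix such as '<h2', so parts[ipart][1:] is the
                # tag name ('h2') to look up in the re-parsed fragment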
                # skip sections whose names match one of the ignore_sections patterns,
                # e.g. set(['review', 'practice', 'references', 'explore more.*']);
                # these review/exercise sections carry no science content
                if np.any([(re.match(isr, section_name) is not None) for isr in ignore_sections]):
                    continue
                section_name_words = parser.parse(section_name)
                section_in_page = (ipart - 1) // 2  # integer division (same result as Python 2's int '/')
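                # e.g. ipart=3 -> section 1, ipart=5 -> section 2, ...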
                # collect text from the <p> tags in this section; p.next takes only the
                # tag's first child node (typically its leading text), not nested tags
                text = ''
                for p in soup.find_all('p'):
                    text += p.next.strip()
                # replace html entities for symbols with their English names, e.g. '&#916;' -> 'Delta'
                text = HtmlReader.parse_text(text)
                # word tokenizing
                words = parser.parse(text)
                section_words = words
                # add the section; with action='write' this writes the section words to the output file
                # (note that section_name itself is not written)
                self._add_section(page_name, page_name_words, section_name, section_name_words, section_in_page, section_words)

        end_time = time.time()
        print 'read_html total time = %.1f secs.' % (end_time-start_time)
        print 'Read %d pages, %d sections; applied action on %d sections' % (self.num_pages, self.num_sections, self.num_section_action)
        self._end_action()

        return self._locdic
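
# A minimal usage sketch. Hypothetical: the HtmlReader() constructor call and the
# 'ck12_html' / 'corpus.txt' paths are illustrative assumptions; only the read()
# parameters themselves come from the method above.
reader = HtmlReader()
locdic = reader.read('ck12_html', 'corpus.txt',
                     ignore_sections=set(['review', 'practice', 'references', 'explore more.*']),
                     action='write')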