Beispiel #1
0
 def paragraphs(self):
     """Return the converted text of every non-empty ``<p>`` tag in the soup.

     Only usable when ``self.debugging`` is true; otherwise a warning is
     issued and ``None`` is returned.

     Side effect: the content of each kept ``<p>`` tag is replaced in place
     by its converted text.

     Returns:
         A list of paragraph texts, or ``None`` when not in debugging mode.
     """
     if not self.debugging:
         warnings.warn('Debugging mode has to be True when call the class')
         return None
     collected = []
     for tag in self.soup.find_all(name='p'):
         # Convert once and reuse the result for both the emptiness check
         # and the replacement text.
         cleaned = tl.convert_to_text(tag.get_text())
         if len(cleaned) == 0:
             continue
         # Replace the tag's content with the converted text.
         tag.string = cleaned
         collected.append(tag.get_text())
     return collected
Beispiel #2
0
    def get_keywords(self, rules):
        """Collect the text of every tag matching any rule, removing each tag.

        Args:
            rules: iterable of mappings; each one is unpacked as keyword
                arguments to ``self.soup.find_all``.

        Returns:
            List of converted texts, in rule order and then match order.
        """
        found = []
        for search_kwargs in rules:
            matches = self.soup.find_all(**search_kwargs)
            for tag in matches:
                found.append(tl.convert_to_text(tag.get_text()))
                # Remove the tag from the soup once its text is recorded.
                tag.extract()
        return found
Beispiel #3
0
    def get_first_title(self, rules):
        """Return the text of the first tag matched by any rule, or ``None``.

        Rules are tried in order. The first matching tag is removed from the
        soup and its converted text is returned.

        Args:
            rules: iterable of mappings; each one is unpacked as keyword
                arguments to ``self.soup.find_all``.
        """
        for search_kwargs in rules:
            matches = self.soup.find_all(**search_kwargs)
            if not matches:
                continue
            first_tag = matches[0]
            text = tl.convert_to_text(first_tag.get_text())
            first_tag.extract()
            return text
        return None
Beispiel #4
0
 def get(self, rules):
     """Return the converted text of all tags matching the given rules.

     Every matched tag is removed from the soup after its text is read.

     Args:
         rules: iterable of mappings; each one is unpacked as keyword
             arguments to ``self.soup.find_all``.

     Returns:
         List of texts, in rule order and then match order.
     """
     texts = []
     for search_kwargs in rules:
         for tag in self.soup.find_all(**search_kwargs):
             texts.append(tl.convert_to_text(tag.get_text()))
             tag.extract()
     return texts
Beispiel #5
0
 def headings(self):
     """Return the converted text of every ``h1``-``h6`` tag in the soup.

     Only usable when ``self.debugging`` is true; otherwise a warning is
     issued and ``None`` is returned.
     """
     if self.debugging:
         heading_tags = self.soup.find_all(name=re.compile('^h[1-6]$'))
         return [tl.convert_to_text(tag.get_text()) for tag in heading_tags]
     warnings.warn('Debugging mode has to be True when call the class')
     return None
Beispiel #6
0
 def _deal_default(self):
     """Store the text of ``self.content`` in the current section.

     Adds the number of ``<p>`` descendants of ``self.content`` to
     ``ParserSections.number_paragraphs``, strips newlines that do not
     directly follow a period, converts the text, and appends it to
     ``self.content_section[self.i]['content']`` when it is not empty.

     If ``self.content_section`` is ``None``, ``self._create_section()`` is
     called first and a warning is emitted.
     """
     # find_all already returns a list-like result, so len() applies directly.
     ParserSections.number_paragraphs += len(self.content.find_all('p'))
     # Raw string: '\.' inside a normal literal is an invalid escape sequence.
     raw_text = re.sub(r'(?<!\.)\n', '', self.content.get_text())
     txt_paragraph = tl.convert_to_text(raw_text)
     if self.content_section is None:
         self._create_section()
         warnings.warn(" Section with no name - _deal_default " +
                       "the name was defined as no_name_section")
     if txt_paragraph != '':
         self.content_section[self.i]['content'].append(txt_paragraph)
Beispiel #7
0
 def _deal_para(self):
     """Store the text of ``self.content`` as one paragraph of the section.

     Increments ``ParserSections.number_paragraphs`` by one, strips newlines
     that do not directly follow a period, converts the text, and appends it
     to ``self.content_section[self.i]['content']`` when it is neither
     ``None`` nor empty.

     If ``self.content_section`` is ``None``, ``self._create_section()`` is
     called first and a warning is emitted.
     """
     if self.content_section is None:
         self._create_section()
         warnings.warn(" Section with no name - deal_para " +
                       "the name was defined as no_name_section")
     ParserSections.number_paragraphs += 1
     # Raw string: '\.' inside a normal literal is an invalid escape sequence.
     raw_text = re.sub(r'(?<!\.)\n', '', self.content.get_text())
     txt_paragraph = tl.convert_to_text(raw_text)
     # The previous test (`!= '' or is None`) let None through; only real,
     # non-empty text should be stored.
     if txt_paragraph is not None and txt_paragraph != '':
         self.content_section[self.i]['content'].append(txt_paragraph)
Beispiel #8
0
 def span(self):
     """Return the non-empty texts of the tags matching ``'span|p'``.

     Only usable when ``self.debugging`` is true; otherwise a warning is
     issued and ``None`` is returned.

     Works on a copy of ``self.soup``: all ``h1``-``h6`` tags are removed
     first, then the first matching tag is repeatedly read and extracted
     until none is left.
     """
     import copy
     if not self.debugging:
         warnings.warn('Debugging mode has to be True when call the class')
         return None
     working_soup = copy.copy(self.soup)
     for heading in working_soup.find_all(name=re.compile('^h[1-6]$')):
         heading.extract()
     tag_pattern = re.compile('span|p')
     texts = []
     candidate = working_soup.find(name=tag_pattern)
     while candidate is not None:
         text = tl.convert_to_text(candidate.get_text())
         if candidate.name is not None and len(text) != 0:
             texts.append(text)
         # Extracting the tag also removes its descendants from the copy.
         candidate.extract()
         candidate = working_soup.find(name=tag_pattern)
     return texts
Beispiel #9
0
    def __init__(self,
                 soup,
                 parameters,
                 debugging=False,
                 parser_type='lxml',
                 new=False):
        """Parse ``soup`` into a list of section dictionaries (``self.data``).

        Each element selected by ``parameters`` whose tag name contains
        ``'section_h'`` opens a new section dict with keys ``'type'``,
        ``'name'`` and ``'content'``. Its named children are then dispatched
        to ``self._create_sub_division()`` (sub-sections) or ``self._deal()``
        (everything else) and removed from the soup. Any text still left in
        the soup afterwards is collected into a trailing ``'lost_content'``
        section.

        Args:
            soup: object whose ``repr`` is re-parsed with BeautifulSoup.
            parameters: mapping unpacked as keyword arguments to ``find_all``
                to select the elements to process.
            debugging: when true, the freshly parsed soup is saved to
                ``'ParseXML_initial.xml'``.
            parser_type: parser name handed to ``bs4.BeautifulSoup``.
            new: stored unchanged as ``self.new``.

        Raises:
            SystemExit: if the parsed soup does not have exactly one
                top-level child.
        """
        # Newlines that do not directly follow a period. Raw string, because
        # '\.' inside a normal literal is an invalid escape sequence.
        newline_pattern = r'(?<!\.)\n'

        # parser_types = ['xml.parser', 'lxml', 'xml5lib', 'lxml-xml']
        self.parser_type = parser_type
        self.soup = bs4.BeautifulSoup(repr(soup), parser_type)
        self.soup1 = list(self.soup.children)
        self.new = new
        if len(self.soup1) != 1:
            warnings.warn(' Something is wrong in children!=1')
            # Raises the same exception as the site-provided exit(), which
            # is meant for interactive use and may not be defined.
            raise SystemExit
        self.soup1 = self.soup1[0]
        self.parameters = parameters
        if debugging:
            self.save_soup_to_file('ParseXML_initial.xml', prettify=True)
        self.sub_section_name = 'section_h'
        self.paragraphs = list()
        self.content_section = []
        self.data = list()
        self.i = 0
        for item in self.soup1.find_all(**parameters):
            if item.name is not None and self.sub_section_name in item.name:
                self.content_section.append({
                    'type': item.name,
                    'name': re.sub(newline_pattern, '',
                                   item.section_title.get_text()),
                    'content': []
                })
            # Iterate over a snapshot: extract() removes the child from
            # item.contents, and mutating that list while looping over it
            # would skip the sibling that follows each extracted child.
            for content in list(item.contents):
                self.content = content
                if content.name is None:  # bare strings are left in place
                    continue
                if self.sub_section_name in content.name:
                    # standard sections <section_h#>
                    self._create_sub_division()
                else:
                    self._deal()
                content.extract()
            # NOTE(review): self.i advances for every selected element, even
            # when no section dict was appended above - confirm that helpers
            # indexing content_section[self.i] stay in range.
            self.i += 1
        self.data = self.content_section

        # Whatever text is still in the soup was not consumed above.
        lost_section = {
            'type': 'lost_content',
            'name': 'lost_content',
            'content': []
        }
        for tag in self.soup.find_all():
            leftover = tl.convert_to_text(
                re.sub(newline_pattern, '', tag.get_text()))
            if len(leftover) > 0:
                lost_section['content'].append(leftover)
            tag.extract()
        leftover = tl.convert_to_text(
            re.sub(newline_pattern, '', self.soup1.get_text()))
        if len(leftover) > 0:
            lost_section['content'].append(leftover)
        # Only record the section when something was actually lost.
        if lost_section['content']:
            self.data.append(lost_section)