Example #1
            def crawl_link_to_index(inp):
                idx, link = inp
                print idx, link
                try:
                    response = urllib.urlopen(link)
                    # retry while the server answers 502 Bad Gateway
                    while response.getcode() == 502:
                        time.sleep(60)
                        response = urllib.urlopen(link)
                    page_content = response.read()

                    tree = etree.HTML(page_content, parser=html_parser)
                    etree.strip_elements(tree, 'script')
                    etree.strip_tags(tree, 'script')
                    text_data = "\n".join(filter(lambda chunk: chunk != '',
                                [t.strip() for t in tree.itertext()]))

                    page_title = tree.find(".//title").text

                    es.index(index=index_name,
                             doc_type="page",
                             id=idx,
                             body={
                                 "url": link,
                                 "title": page_title,
                                 "page_text": text_data
                             })
                    print "-" * 10
                except Exception, e:
                    print e
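A note on the pair of calls in Example #1: etree.strip_elements removes the matching elements together with their text content, while etree.strip_tags removes only the markup and leaves the text and children in place. A minimal standalone sketch of the difference (not drawn from any of the projects on this page):

    from lxml import etree

    root = etree.fromstring('<div>keep <script>drop me</script> tail</div>')
    etree.strip_elements(root, 'script', with_tail=False)
    # the element and its text are gone; the tail survives
    print(etree.tostring(root))  # b'<div>keep  tail</div>'

    root = etree.fromstring('<div>keep <b>bold</b> tail</div>')
    etree.strip_tags(root, 'b')
    # only the markup is removed; "bold" merges into the surrounding text
    print(etree.tostring(root))  # b'<div>keep bold tail</div>'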
Example #2
 def transform(self, data):
     raw, xml = data
     self._tag_texts(xml)
     self._identify_extra_p_tags(xml)
     self._tag_text_in_body(xml)
     etree.strip_tags(xml, "REMOVE_P")
     return data
Example #3
    def email(self, alerts):
        node = alert_node()
        dismisseds = [
            a.message_id
            for a in mAlert.objects.filter(dismiss=True, node=node)
        ]
        msgs = []
        for alert in alerts:
            if alert.getId() not in dismisseds:
                """
                This is all to allow <a> tags in alert messages.
                We need to strip out all the tags so we can send a
                plain text email.
                """
                msg = unicode(alert).encode('utf8')
                msgnode = etree.fromstring('<msg>{}</msg>'.format(msg))
                for i in msgnode.xpath('//a'):
                    new = etree.Element('span')
                    new.text = '{} ({})'.format(i.text, i.attrib['href'])
                    msgnode.replace(i, new)

                etree.strip_tags(msgnode, '*')
                msgs.append(msgnode.text)
        if len(msgs) == 0:
            return

        hostname = socket.gethostname()
        send_mail(subject='%s: %s' % (
            hostname,
            _("Critical Alerts").encode('utf8'),
        ),
                  text='\n'.join(msgs))
Example #4
 def write_xml_file(self, xml_file, root):
     tree = root.getroottree()
     # Strip the merge tag
     etree.strip_tags(tree, 'merge')
     # tostring() with an explicit encoding returns bytes, so write binary;
     # the context manager already closes the file
     with open(xml_file, 'wb') as f:
         f.write(etree.tostring(tree, pretty_print=True, encoding='utf-8'))
Example #5
def _remove_element_or_comment(node):
    parent = node.getparent()
    if parent is not None:
        if node.tail:
            text = (node.text or "").strip() + node.tail
            previous = node.getprevious()
            if previous is not None:
                if not previous.tail:
                    previous.tail = ""
                previous.tail += text
            else:
                if not parent.text:
                    parent.text = ""
                parent.text += text

        removed = node.tag
        try:
            node.tag = "REMOVE_NODE"

        except AttributeError:
            parent.remove(node)

        else:
            if node.getchildren():
                etree.strip_tags(parent, "REMOVE_NODE")
            else:
                parent.remove(node)

        return removed
Example #6
    def parse_vote_page(self, response):
        lxs = LxmlSelector(response)
        item = response.meta["item"]
        etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
        meta = self.meta_as_dict(lxs)

        date_txt = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
        if date_txt:
            item["date"] = datetime.strptime(date_txt[0], "%d/%m/%Y").isoformat()
        else:
            page_text = "".join(lxs.xpath("//text()").extract())
            page_text = page_text.replace(u"\u00A0", " ")
            page_text = page_text.encode("utf-8")
            date_txt = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
            if date_txt:
                date_txt = " ".join(date_txt.groups())
                item["date"] = datetime.strptime(date_txt, "%d %B %Y").isoformat()
            else:
                # a bare `raise` is invalid outside an except block
                raise ValueError("unable to parse vote date")

        if lxs.css("#analyse p.nomgroupe"):
            item["votes"] = self.parse_vote_first_layout(lxs, response)
        else: # 2nd layout!
            item["votes"] = self.parse_vote_second_layout(lxs)

        if item.get("file_href"):
            yield Request(
                url=item["file_href"],
                callback=self.parse_info_page,
                meta={
                    "item": item,
                }
            )
        else:
            yield item
Example #7
    def reprocess_definition(self, definition):
        etree.strip_tags(definition, self.STRIP_TAGS_LIST)
        definition = etree.tostring(definition, encoding='unicode')

        definition = definition.replace('<div class="rakibolana-definition">',
                                        '')
        definition = definition.replace('\n Ahitsio\n  </div>\n\n', '')
        definition = ''.join(definition.split(':')[1:]).strip()

        # Segment phrases
        for char in 'ABDEFGHIJKLMNOPRSTVZ':
            definition = definition.replace(' ' + char, '. ' + char)

        for char in '.;:?':
            definition = definition.replace(char, '##')

        # fix OCR errors as much as possible
        definition = definition.replace('u', 'v')
        definition = definition.replace('-', '')
        definition = definition.replace('Y ', 'y ')

        definition = '$$'.join(definition.split('##')).strip()
        print(definition)

        return definition
Example #8
def filter_types(tree):
    for el in tree.iterfind("//EM"):
        if ('TIPO' in el.attrib and len(el.attrib['TIPO']) > 0):
            el.attrib['CATEG'] = el.attrib['CATEG'] + '_' + el.attrib['TIPO']
        else:
            el.tag = "to_strip"
    etree.strip_tags(tree, 'to_strip')
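Example #8 uses a pattern that recurs on this page (see also Examples #5, #24 and #57): because strip_tags matches by tag name, elements can be flagged during traversal by renaming them to a sentinel tag, then unwrapped in a single pass. A minimal sketch:

    from lxml import etree

    root = etree.fromstring('<doc><em>keep</em><em>drop</em></doc>')
    for el in root.iter('em'):
        if el.text == 'drop':
            el.tag = 'to_strip'  # mark the element for removal
    etree.strip_tags(root, 'to_strip')  # unwrap all marked elements at once
    print(etree.tostring(root))  # b'<doc><em>keep</em>drop</doc>'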
Example #9
 def _clean(self):
     """
     Removes some of extraneous tags to make parsing easier
     """
     etree.strip_tags(self.tree, 'strong')
     for xx in self.tree.find_class('pydocx-tab'):
         xx.drop_tag()
Example #10
 def __init__(self, xml):
     if isinstance(xml, getattr(etree, "_Element")):
         # tostring() returns bytes, so parse from a BytesIO buffer
         self._tree = etree.parse(six.BytesIO(etree.tostring(xml)))
     else:
         self._tree = etree.parse(xml)
     etree.strip_tags(self._tree, etree.Comment)
     self._container = {}
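As Example #10 shows, strip_tags also accepts etree.Comment, which removes comment nodes while preserving the surrounding text. A minimal sketch:

    from lxml import etree

    root = etree.fromstring('<doc>before<!-- noise -->after</doc>')
    etree.strip_tags(root, etree.Comment)
    print(etree.tostring(root))  # b'<doc>beforeafter</doc>'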
Example #11
    def clean_proprietary(self):
        p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)

        tree = etree.parse(self.gv.word_document_xml, p)

        omml = tree.xpath(
            '//m:oMath',
            namespaces={
                'm':
                'http://schemas.openxmlformats.org/officeDocument/2006/math'
            })

        for omml_paragraph in omml:
            omml_paragraph.tag = '{http://www.w3.org/1998/Math/MathML}math'

        etree.strip_tags(
            tree,
            '{http://schemas.openxmlformats.org/officeDocument/2006/math}oMathPara'
        )

        omml = tree.xpath(
            '//m:oMathParaPr',
            namespaces={
                'm':
                'http://schemas.openxmlformats.org/officeDocument/2006/math'
            })

        for omml_paragraph in omml:
            omml_paragraph.getparent().remove(omml_paragraph)

        tree.write(self.gv.word_document_xml)
Example #12
def due(pdfpath):
    ntree = uniform_cm(pdfpath)
    etree.strip_tags(ntree, 'textline')

    # Search for all "textbox" elements
    for textbox in ntree.xpath('//textbox'):
        new_line = etree.Element("new_line")
        previous_bb = None

        # From a given textbox element, iterate over all the "text" elements
        for x in textbox.iter("text"):
            # Get the current BBox first value
            bb = getBBoxFirstValue(x)
            # Check current and past values aren't empty
            if bb is not None and previous_bb is not None and (bb - previous_bb) > 20:
                # Insert a newline into the parent tag
                x.getparent().insert(x.getparent().index(x), new_line)

                # A new "new_line" element is created
                new_line = etree.Element("new_line")

            # Append the current element to the new_line tag
            new_line.append(x)

            # Keep latest non empty BBox 1st value
            if bb is not None:
                previous_bb = bb

        # Append the trailing new_line element
        textbox.append(new_line)
    return ntree
Example #13
	def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout, isbn):
		max_results = self.prefs[Moly_hu.KEY_MAX_BOOKS]
		results = root.xpath('//a[@class="book_selector"]')
		log.info('Found %d possible books (max: %d)'%(len(results), max_results))
		i = 0
		for result in results:
			book_urls = result.xpath('@href')
			
			etree.strip_tags(result, 'strong')
			author_n_title = result.text
			author_n_titles = author_n_title.split(':', 1)
			author = author_n_titles[0].strip(' \r\n\t')
			title = author_n_titles[1].strip(' \r\n\t')
			log.info('Orig: %s, target: %s'%(self.strip_accents(orig_title), self.strip_accents(title)))
		
			if orig_title:
				if orig_title.lower() not in title.lower() and self.strip_accents(orig_title) not in self.strip_accents(title):
					continue
			if orig_authors:
				author1 = orig_authors[0]
				authorsplit = author1.split(" ")
				author2 = author1
				if len(authorsplit) > 1:
					author2 = '%s %s'%(authorsplit[1], authorsplit[0])
				if author1.lower() not in author.lower() and self.strip_accents(author1) not in self.strip_accents(author) and author2.lower() not in author.lower() and self.strip_accents(author2) not in self.strip_accents(author):
					continue
		
			for book_url in book_urls:
				result_url = Moly_hu.BASE_URL + book_url
				
				if (result_url not in matches):
					matches.append(result_url)
					i += 1
				if (i >= max_results):
					return
Example #14
def stringify_children(node):
    from lxml.etree import tostring, strip_tags

    strip_tags(node, '*')
    text = tostring(node, method='text', encoding=unicode)

    return text
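stringify_children relies on the special tag name '*', which matches every descendant element, so the entire subtree is flattened into the node's own text. A minimal sketch:

    from lxml import etree

    root = etree.fromstring('<p>a <b>b <i>c</i></b> d</p>')
    etree.strip_tags(root, '*')
    print(root.text)  # 'a b c d'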
Example #15
        def analysis_section(notice, child):
            # Create the section element
            section_elm = Element('analysisSection')

            # Add the title element
            title_elm = SubElement(section_elm, 'title')
            title_elm.text = child['title']

            # Add paragraphs
            for paragraph in child['paragraphs']:
                paragraph_number = child['paragraphs'].index(paragraph)
                paragraph_footnotes = [
                    fn for fn in child['footnote_refs']
                    if fn['paragraph'] == paragraph_number]
                text = self.resolve_footnotes(notice, paragraph,
                                              paragraph_footnotes)
                paragraph_elm = fromstring(
                    '<analysisParagraph>'
                    + text +
                    '</analysisParagraph>')

                # Make sure to strip out elements that don't belong
                strip_tags(paragraph_elm, 'EM')

                section_elm.append(paragraph_elm)

            # Construct an analysis section for any children.
            try:
                # use a plain loop: under Python 3, map() is lazy and the
                # appends would never execute
                for c in child['children']:
                    section_elm.append(analysis_section(notice, c))
            except Exception:
                print("Failed to write analysis for", child['title'])

            return section_elm
Example #16
def ntcir_topic_read_xhtml(filename):
    with open(filename, 'rt') as f:
        xhtml_tokens = f.read()
        xml_document = unicode_to_tree(mathmlcan(xhtml_tokens))
    for topic_element in xml_document.xpath(
            '//ntcir-math:topic | //mathml:topic', namespaces=XML_NAMESPACES):
        topic_number_elements = topic_element.xpath(
            './/ntcir-math:num | .//mathml:num', namespaces=XML_NAMESPACES)
        assert len(topic_number_elements) == 1
        topic_number_element = topic_number_elements[0]
        topic_number = topic_number_element.text
        tokens = []
        for math_element in topic_element.xpath(
                './/ntcir-math:formula/mathml:math | .//mathml:formula/mathml:math',
                namespaces=XML_NAMESPACES):
            etree.strip_tags(
                math_element,
                '{{{}}}semantics'.format(XML_NAMESPACES['mathml']))
            math_element = remove_namespaces(copy(math_element))
            math_token = Math(tree_to_unicode(math_element))
            tokens.append(math_token)
        for keyword_element in topic_element.xpath(
                './/ntcir-math:keyword | .//mathml:keyword',
                namespaces=XML_NAMESPACES):
            tokens.append(Text(keyword_element.text))
        yield (topic_number, tokens)
Example #17
def handle_lists(element, dedupbool):
    '''Process lists elements'''
    processed_element = etree.Element(element.tag)
    for child in element.iter('item'):
        newchildelem = etree.Element('item')
        if len(child) == 0:
            processed_child = process_node(child)
            if processed_child is not None:
                newchildelem.text, newchildelem.tail = processed_child.text, processed_child.tail
                processed_element.append(newchildelem)
        else:
            # proceed with iteration, fix for nested elements
            for subelem in child.iter():
                processed_subchild = handle_textnode(subelem,
                                                     comments_fix=False,
                                                     deduplicate=dedupbool)
                # add child element to processed_element
                if processed_subchild is not None:
                    subchildelem = etree.SubElement(newchildelem,
                                                    processed_subchild.tag)
                    subchildelem.text, subchildelem.tail = processed_subchild.text, processed_subchild.tail
                subelem.tag = 'done'
            etree.strip_tags(newchildelem, 'item')
        if newchildelem.text or len(newchildelem) > 0:
            processed_element.append(newchildelem)
        child.tag = 'done'
    # avoid double tags??
    if len(processed_element) > 0:  # if it has children
        # test if it has text
        if text_chars_test(''.join(processed_element.itertext())) is True:
            return processed_element
    return None
Example #18
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row)."""
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     for ul in html.xpath('//table[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul'):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         if preceding_text and not preceding_text.lower().endswith('future date'):
             # Below will fail if they change up strings or date formats
             case_date = convert_date_string(preceding_text.split()[-1])
             for element in ul.xpath('./li | ./a'):
                 if element.tag == 'li':
                     text = normalize_dashes(' '.join(element.text_content().split()))
                     if not text:
                         continue
                     anchor = element.xpath('.//a')[0]
                 elif element.tag == 'a':
                     # Malformed html, see connappct_example.html
                     anchor = element
                     glued = '%s %s' % (anchor.text_content(), anchor.tail)
                     text = normalize_dashes(' '.join(glued.split()))
                 self.cases.append({
                     'date': case_date,
                     'url': anchor.xpath('./@href')[0],
                     'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                     'name': text.split('-', 1)[1],
                 })
Example #19
 def reprocess_definition(self, definition):
     etree.strip_tags(definition, self.STRIP_TAGS_LIST)
     definition = etree.tostring(definition, encoding='unicode')
     definition = definition.replace('<td class="main">', '')
     definition = definition.replace('</td>', '')
     definition = definition.replace('\n', '')
     return definition
Example #20
def cleanup_address_p(paragraph):
    """Function for dealing with the somewhat messy paragraphs inside an
    address block. This deals with the potential lack of spaces in the XML,
    extra E tags, and strange characters up front."""
    if paragraph.text:
        ended_with_space = paragraph.text.endswith(' ')
    else:
        ended_with_space = True
    #   Inside baseball -- adds spaces to tags that don't have them
    for child in paragraph.getchildren():
        if not child.text:
            continue

        if not ended_with_space:
            child.text = ' ' + child.text
        if child.tail and not child.tail.startswith(' '):
            child.text = child.text + ' '

        if child.tail:
            ended_with_space = child.tail.endswith(' ')
        else:
            ended_with_space = child.text.endswith(' ')
    etree.strip_tags(paragraph, 'E')
    txt = paragraph.text.strip()
    while txt and not (txt[0] in string.letters or txt[0] in string.digits):
        txt = txt[1:]
    return txt
Example #21
    def __call__(self, document, url):
        """Filter article content from raw html"""
        # turn the raw html into an etree
        root = html.fromstring(document)

        # so that they'll continue to work
        root.make_links_absolute(url)

        content = []

        # if we have a selector and a whitelist
        # then we can work.
        if self.css and self.whitelist:
            etree.strip_tags(root, 'img',
                             etree.Comment)  # remove img/comment tags
            try:
                # get each matching block that we want
                # the text from
                for block in self.css(root):

                    # parse out the text and whitelisted html
                    para = self.__parse(block)

                    # if we got something, keep it
                    if para: content.append(para)
            except Exception as e:
                log.exception('{} in filter at: {}'.format(type(e), url))

        # return a list of paragraphs
        return content
Example #22
def dump_article_text(file_path,
                      xpath_str,
                      filter_tags=filter_tag_list,
                      remove_stop_words=True):
    """
    This method is designed to extract all text from xml documents. 
    Every document has specific tags that are striped in order to produce
    clean text output for downstream processing
    
    Keyword arguments:
        file_path - the file path for xml document
        xpath_str - the xpath string to extract tags from the xml document
        filter_tag_list - the list of tags to strip from the xml document
        remove_stop_words - a flag to indicate if stop words should be removed
    """

    tree = (ET.parse(open(file_path, "rb"), parser=parser))

    # Process xml without specified tags
    ET.strip_tags(tree, *filter_tags)

    root = tree.getroot()
    all_tags = root.xpath(xpath_str)
    text = list(map(lambda x: list(x.itertext()), list(all_tags)))

    # Remove stop words
    if remove_stop_words:
        text = (list(
            map(lambda x: remove_stopwords(re.sub("\n", "", "".join(x))),
                text)))

    else:
        text = (list(map(lambda x: re.sub("\n", "", "".join(x)), text)))

    return text
Example #23
def stenogramma(request):
    u = url + '/video/view.php'
    if request.method == "GET" and "t" in request.GET:
        u = u + '?t=%s' % request.GET['t']

        #o, l, r = get_page(u)
        r = requests.get(u)
        o = html.fromstring(r.text.encode('UTF-8'))

        h1 = o.xpath('//h1')[0].text_content()
        text = o.xpath('//div[@class="body"]')[0]
        etree.strip_tags(text, 'font')

        for s in text.xpath('//a'):
            etree.strip_tags(s, 'b')
            #tt = s.text.rstrip()
            #s.append(tt)

        n = etree.tostring(text, encoding='unicode')

        cs = []  # get_comments(o, u)

        return render(request, "text.html", {
            'h1': h1,
            'n': n,
            'cs': cs,
            'u': u
        })

    return redirect('https://catalog.oper.ru/')
Example #24
    def enclose_and_change_self_size(self, outer_xpath, size_attribute, tag,
                                     change_tag):
        tree = self.load_dom_tree()

        # search the tree and grab the parent
        for child in tree.xpath(
                outer_xpath, namespaces={'tei':
                                         'http://www.tei-c.org/ns/1.0'}):
            self.debug.print_debug(
                self, u'Enclosing and changing size: {0} to {1}'.format(
                    child.tag, change_tag))
            new_element = etree.Element(tag)
            child.attrib[u'meTypesetSize'] = size_attribute
            if child.tag == '{http://www.tei-c.org/ns/1.0}' + change_tag:
                child.tag = 'REMOVE'
            else:
                for sub_element in child:
                    if sub_element.tag == '{http://www.tei-c.org/ns/1.0}' + change_tag:
                        child.tag = 'REMOVE'

            if child.tag != 'REMOVE':
                child.tag = change_tag

            child.addnext(new_element)
            Manipulate.append_safe(new_element, child, self)

            if child.tag == 'REMOVE':
                etree.strip_tags(child.getparent(), 'REMOVE')

            if not (child.attrib['rend'] is None):
                if u'bold' in child.attrib[u'rend']:
                    child.attrib[u'rend'] = child.attrib[u'rend'].replace(
                        u'bold', u'')

        self.save_tree(tree)
Example #25
 def transform(self, data):
     raw, xml = data
     for tag in self.TAGS:
         nodes = xml.findall(".//" + tag)
         if len(nodes) > 0:
             etree.strip_tags(xml, tag)
     return data
Example #26
def header_to_xml(header_lines, book, output_xml_path):

    header_lines = [[y for y in x] for x in group_ranges(header_lines)]

    # Map the first line of each header range to its last line
    d = {x[0]: x[-1] for x in header_lines}

    # Delete section tags
    ET.strip_tags(book, "section")

    for from_line, to_line in d.items():
        f = book.find('.//line[@num="' + str(from_line) + '"]')

        new_element = ET.Element('header')

        prev = f.getprevious()
        if prev is not None:
            for line_num in range(from_line, to_line + 1):
                e = book.find('.//line[@num="' + str(line_num) + '"]')
                new_element.append(e)
            prev.addnext(new_element)
        else:
            parent = f.getparent()
            for line_num in range(from_line, to_line + 1):
                e = book.find('.//line[@num="' + str(line_num) + '"]')
                new_element.append(e)
            parent.insert(0, new_element)

    ET.strip_tags(book, "line")

    # Write to file
    with open(output_xml_path, 'wb') as f:
        f.write(ET.tostring(book, pretty_print=True))
Example #27
 def _load_from(self, data):
     try:
         self._xmp = parse(BytesIO(data))
     except XMLSyntaxError:
         data = re_xml_illegal_bytes.sub(b'', data)
         try:
             self._xmp = parse(BytesIO(data))
         except XMLSyntaxError as e:
             if str(e).startswith(
                     "Start tag expected, '<' not found") or str(
                         e).startswith("Document is empty"):
                 # This is usually triggered by processing instructions
                 # in another otherwise empty document, or empty documents,
                 # which we consider safe to coerce to a well-formed
                 # XMP. For harder cases like truncated XMP, we want to
                 # raise the exception so that someone is alerted.
                 self._xmp = parse(BytesIO(XMP_EMPTY))
             else:
                 raise PdfError() from e
     pis = self._xmp.xpath('/processing-instruction()')
     for pi in pis:
         etree.strip_tags(self._xmp, pi.tag)
     try:
         self._get_rdf_root()
     except ValueError:
         if self._xmp.find('.', self.NS).tag == '{adobe:ns:meta/}xmpmeta':
             # Looks like: <x:xmpmeta></x:xmpmeta>, so reload with template
             # that includes <rdf:RDF>
             return self._load_from(XMP_EMPTY)
         else:
             raise  # Probably not XMP
Example #28
        def analysis_section(notice, child):
            # Create the section element
            section_elm = Element('analysisSection')

            # Add the title element
            title_elm = SubElement(section_elm, 'title')
            title_elm.text = child['title']

            # Add paragraphs
            for paragraph in child['paragraphs']:
                paragraph_number = child['paragraphs'].index(paragraph)
                paragraph_footnotes = [
                    fn for fn in child['footnote_refs']
                    if fn['paragraph'] == paragraph_number
                ]
                text = self.resolve_footnotes(notice, paragraph,
                                              paragraph_footnotes)
                paragraph_elm = fromstring('<analysisParagraph>' + text +
                                           '</analysisParagraph>')

                # Make sure to strip out elements that don't belong
                strip_tags(paragraph_elm, 'EM')

                section_elm.append(paragraph_elm)

            # Construct an analysis section for any children.
            # a plain loop rather than map(), which is lazy under Python 3
            for c in child['children']:
                section_elm.append(analysis_section(notice, c))

            return section_elm
Example #30
def xmltotxt(xmloutput):
    '''Convert to plain text format'''
    returnlist = []
    etree.strip_tags(xmloutput, 'hi')
    for element in xmloutput.iter():
        # process text
        if element.text is None and element.tail is None:
            # newlines for textless elements
            if element.tag in ('row', 'table'):
                returnlist.append('\n')
            continue
        if element.text is not None and element.tail is not None:
            textelement = ' '.join([element.text, element.tail])
        elif element.text is not None and element.tail is None:
            textelement = element.text
        else:
            textelement = element.tail
        if element.tag in ('code', 'fw', 'head', 'lb', 'p', 'quote', 'row', 'table'):
            returnlist.extend(['\n', textelement, '\n'])
        elif element.tag == 'item':
            returnlist.extend(['\n- ', textelement, '\n'])
        elif element.tag == 'cell':
            returnlist.extend(['|', textelement, '|'])
        elif element.tag == 'comments':
            returnlist.append('\n\n')
        else:
            returnlist.extend([textelement, ' '])
    return sanitize(''.join(returnlist))
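A detail worth noting in xmltotxt: when strip_tags removes an element, its text and tail are merged into the surrounding content, which is why the 'hi' (highlighting) tags are stripped up front, before the text/tail pairs are walked. A minimal sketch of the merge:

    from lxml import etree

    root = etree.fromstring('<p>one <hi>two</hi> three</p>')
    etree.strip_tags(root, 'hi')
    print(etree.tostring(root))  # b'<p>one two three</p>'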
Example #31
 def evaluate(self, pred, true):
     ''' Computes TEDS score between the prediction and the ground truth of a
         given sample
     '''
     if (not pred) or (not true):
         return 0.0
     parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
     pred = html.fromstring(pred, parser=parser)
     true = html.fromstring(true, parser=parser)
     if pred.xpath('body/table') and true.xpath('body/table'):
         pred = pred.xpath('body/table')[0]
         true = true.xpath('body/table')[0]
         if self.ignore_nodes:
             etree.strip_tags(pred, *self.ignore_nodes)
             etree.strip_tags(true, *self.ignore_nodes)
         n_nodes_pred = len(pred.xpath(".//*"))
         n_nodes_true = len(true.xpath(".//*"))
         n_nodes = max(n_nodes_pred, n_nodes_true)
         tree_pred = self.load_html_tree(pred)
         tree_true = self.load_html_tree(true)
         distance = APTED(tree_pred, tree_true,
                          CustomConfig()).compute_edit_distance()
         return 1.0 - (float(distance) / n_nodes)
     else:
         return 0.0
Example #32
def compare_xml_content(f1: str, f2: str) -> bool:
    """
    Compare the contents of two XML files and report
    if the contents are the same, minus comments
    :param f1: File 1
    :param f2: File 2
    :return: Is file content, minus comments, the same?
    """
    file1 = pathlib.Path(f1)
    file2 = pathlib.Path(f2)

    if file1.is_file() and file2.is_file():
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(str(file1), parser).getroot()
        etree.strip_tags(root, etree.Comment)
        f1_hash = hashlib.sha512(etree.tostring(root)).hexdigest()

        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(str(file2), parser).getroot()
        etree.strip_tags(root, etree.Comment)
        f2_hash = hashlib.sha512(etree.tostring(root)).hexdigest()

        return f1_hash == f2_hash

    return False
Example #33
def preprocess(tree, filename, strip_tags):
    """
    Wrapper function that helps apply the parser to different pages.
    :param tree: Document tree that contains the individual page trees as children
    :param filename: Filename of the output HTML file
    :param strip_tags: Tag names to be flattened
    :return: All words and equations from this file (with coordinate information)
    """
    etree.strip_tags(tree, *strip_tags)
    words = []
    equations = []
    for page_tree in tree:
        generate_rawtext_from_ocrx(page_tree)
        remove_ocr_img_for_non_img(page_tree)
        img_segment_clean_up(page_tree)
        split_paragraph(page_tree)
        words += [*get_all_words_with_coordinates(page_tree)]
        equations += list(get_equation(page_tree))
        remove_ocr_elements(page_tree)
        add_name(page_tree)

    with open(filename, 'wb') as out_html:
        out_html.write(etree.tostring(tree, pretty_print=True))

    return words, equations
Example #34
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False, row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?

    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]

    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)
    if not preserve_html:
        table_rows = [[value_element.text_content().strip()
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]
    else:
        table_rows = [[_get_content(value_element)
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        table_rows = filter(lambda row: len(row) == max_columns, table_rows)

    meta = {'imported_from': 'html', 'filename': filename}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #35
def sanitize_tree(tree, include_formatting=False):
    '''Convert and sanitize the output from the generic algorithm (post-processing)'''
    # delete unnecessary elements
    for elem in tree.xpath(SANITIZED_XPATH):
        elem.getparent().remove(elem)
    etree.strip_tags(tree, MANUALLY_STRIPPED + ['a', 'span'])
    tree = prune_html(tree)
    # convert
    cleaned_tree = convert_tags(tree, include_formatting)
    for elem in cleaned_tree.iter('td', 'th', 'tr'):
        # elem.text, elem.tail = trim(elem.text), trim(elem.tail)
        # finish table conversion
        if elem.tag == 'tr':
            elem.tag = 'row'
        elif elem.tag in ('td', 'th'):
            if elem.tag == 'th':
                elem.set('role', 'head')
            elem.tag = 'cell'
    # sanitize
    sanitization_list = list()
    for tagname in [element.tag for element in set(cleaned_tree.iter())]:
        if tagname not in TEI_VALID_TAGS:
            sanitization_list.append(tagname)
        #    if tagname in ('article', 'content', 'link', 'main', 'section', 'span'):
        #        for element in cleaned_tree.iter(tagname):
        #            merge_with_parent(element)
        #    else:
        #    print(tagname)
    etree.strip_tags(cleaned_tree, sanitization_list)
    text = trim(' '.join(cleaned_tree.itertext()))
    return cleaned_tree, text, len(text)
Example #36
def get_text(elements, itemize=False):
    paragraphs = []
    highlight_elements = ['varname', 'parameter']
    strip_elements = [
            'returnvalue',
            'command',
            'link',
            'footnote',
            'simpara',
            'footnoteref',
            'function'
    ] + highlight_elements
    for element in elements:
        # put "Since MPD version..." in paranthese
        etree.strip_tags(element, "application")
        for e in element.xpath("footnote/simpara"):
            e.text = "(" + e.text.strip() + ")"

        for e in element.xpath("|".join(highlight_elements)):
            e.text = "*" + e.text.strip() + "*"
        etree.strip_tags(element, *strip_elements)
        if itemize:
            initial_indent = "    * "
            subsequent_indent = "      "
        else:
            initial_indent = "    "
            subsequent_indent = "    "
        wrapper = TextWrapper(subsequent_indent=subsequent_indent,
                              initial_indent=initial_indent)
        text = element.text.replace("\n", " ").strip()
        text = re.subn(r'\s+', ' ', text)[0]
        paragraphs.append(wrapper.fill(text))
    return "\n\n".join(paragraphs)
Example #37
def xmltostring(xml):
  etree.strip_tags(xml, '*', etree.Comment)
  if not xml.text:
    return ''
  string = " ".join(xml.text.split())
  string = reencode(string)
  return string
Example #38
def cleanup(table):
    etree.strip_tags(table, 'span', 'strong', 'div', 'tbody')
    for tag in table.iter():
        for att in tag.attrib.keys():
            tag.attrib.pop(att)
        if tag.tag == "table":
            tag.set('border', '1')
    return table
Example #40
def extract_comments(tree, dedupbool):
    '''Try and extract comments out of potential sections in the HTML'''
    comments_body = etree.Element('body')
    # define iteration strategy
    potential_tags = set(TAG_CATALOG)  # 'span'
    # potential_tags.add('div') trouble with <div class="comment-author meta">
    for expr in COMMENTS_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted_comments(subtree)
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # extract content
        #for elem in subtree.xpath('.//*'):
        #    processed_elem = process_comments_node(elem, potential_tags)
        #    if processed_elem is not None:
        #        comments_body.append(processed_elem)
        processed_elems = [
            process_comments_node(elem, potential_tags, dedupbool)
            for elem in subtree.xpath('.//*')
        ]
        comments_body.extend(list(filter(None.__ne__, processed_elems)))
        # control
        if len(comments_body) > 0:  # if it has children
            LOGGER.debug(expr)
            # remove corresponding subtree
            subtree.getparent().remove(subtree)
            break
    # lengths
    temp_comments = trim(' '.join(comments_body.itertext()))
    return comments_body, temp_comments, len(temp_comments), tree
Example #41
 def parse_xml(self, filename, use_objectify=False, elements=None, tags=None):
     """
     Parse and clean the supplied file by removing any elements or tags we don't use.
     :param filename: The filename of the xml file to parse. Str
     :param use_objectify: Use the objectify parser rather than the etree parser. (Bool)
     :param elements: A tuple of element names (Str) to remove along with their content.
     :param tags: A tuple of element names (Str) to remove, preserving their content.
     :return: The root element of the xml document
     """
     try:
         with open(filename, 'rb') as import_file:
             # NOTE: We don't need to do any of the normal encoding detection here, because lxml does its own
             # encoding detection, and the two mechanisms together interfere with each other.
             if not use_objectify:
                 tree = etree.parse(import_file, parser=etree.XMLParser(recover=True))
             else:
                 tree = objectify.parse(import_file, parser=objectify.makeparser(recover=True))
             if elements or tags:
                 self.wizard.increment_progress_bar(
                     translate('BiblesPlugin.OsisImport', 'Removing unused tags (this may take a few minutes)...'))
             if elements:
                 # Strip tags we don't use - remove content
                 etree.strip_elements(tree, elements, with_tail=False)
             if tags:
                 # Strip tags we don't use - keep content
                 etree.strip_tags(tree, tags)
             return tree.getroot()
     except OSError as e:
         self.log_exception('Opening {file_name} failed.'.format(file_name=e.filename))
         critical_error_message_box(
              title='An Error Occurred When Opening A File',
             message='The following error occurred when trying to open\n{file_name}:\n\n{error}'
             .format(file_name=e.filename, error=e.strerror))
     return None
Example #42
def fix_corresp_label(root):
    global output
    for corresp in root.xpath("//corresp"):
        if corresp.xpath("label"):
            etree.strip_tags(corresp, "label")
            output += "correction: removed label tag from corresp " + corresp.attrib["id"] + "\n"
    return root
Example #43
def spaces_then_remove(el, tag_str):
    """FR's XML tends to not add spaces where needed, which leads to the
    removal of tags sometimes smashing together words."""
    for tag in el.xpath('.//' + tag_str):
        prepost_pend_spaces(tag)
    etree.strip_tags(el, tag_str)
    return el
Example #44
def parseQuestionContentToList(body,title):
	root = etree.HTML(body)
	etree.strip_elements(root,'code',with_tail=False)
	etree.strip_tags(root,'*')
	nonPunct = re.compile('.*[A-Za-z0-9].*')
	text = str(etree.tostring(root,pretty_print = True)[10:-11])[1:].lower()\
	.replace('\\n',' ')\
	.replace("\\",'')\
	.replace("?",' ')
	title = title.lower().replace("?"," ")
	text += " " + title
	tokens = nltk.word_tokenize(text)
	filtered = [w for w in tokens if nonPunct.match(w)]
	# get rid of the punctuation that got left around the words;
	# rebuild the list, since rebinding `word` in the loop would not
	# modify `filtered`
	cleaned = []
	for word in filtered:
		front = 0
		back = 0
		for letter in word:
			if letter not in string.punctuation:
				break
			front += 1
		for letter in reversed(word):
			if letter not in string.punctuation:
				break
			back -= 1
		if back == 0:
			back = None
		cleaned.append(word[front:back])

	return cleaned
Example #45
def xml_text_only(elem):
    '''Return inner text of element with tags stripped'''
    etree.strip_tags(elem, '*')
    inner_text = elem.text
    if inner_text:
        return inner_text.strip()
    return None
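One caveat with helpers like xml_text_only: strip_tags mutates the element in place, so the caller's tree loses its markup as well. Example #47 below avoids this by round-tripping through tostring/fromstring before stripping; copy.deepcopy achieves the same. A minimal non-destructive variant (a sketch, not from any project on this page):

    import copy
    from lxml import etree

    def xml_text_only_copy(elem):
        '''Strip a deep copy, leaving the caller's tree intact'''
        elem = copy.deepcopy(elem)
        etree.strip_tags(elem, '*')
        return elem.text.strip() if elem.text else None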
Example #46
    def update_oed(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.oed_in)
        for entry in tree.findall('./link'):
            oed_id = entry.get('sourceID', None)
            oed_label_text = self.oed_index.find(oed_id, field='label') or LinkUpdater.error_message
            source_label = entry.find('./sourceLabel')
            etree.strip_tags(source_label, 'i', 'sup', 'sub', 'hm')
            source_label.text = oed_label_text

            lexid = entry.get('targetID', None)
            ode_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
            target_label = entry.find('./targetLabel')
            etree.strip_tags(target_label, 'i', 'sup', 'sub', 'hm')
            target_label.text = ode_label_text

            if (valid_links_only and
                (oed_id is None or
                 lexid is None or
                 source_label.text == LinkUpdater.error_message or
                 target_label.text == LinkUpdater.error_message or
                 not check_match(source_label.text, target_label.text))):
                entry.getparent().remove(entry)

        with open(self.oed_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))
Example #47
def search_wiki(math_knowledge, math_map, mcom_map, roots, math_exp_rev, old_new_math_map):
    ws = WikiPageSearcher(solr_wiki_math, solr_wiki_doc)
    na = norm_attribute()
    for mid, vals in math_knowledge.iteritems():
        #mid = "MATH_C04-1197_15"
        mml = etree.tostring(math_map[mid])
        mml = na.normalize(mml)

        mml_comp = etree.tostring(mcom_map[mid])
        mml_comp = na.normalize(mml_comp)
        
        lst_dct_weighted_nps = []
        lst_dct_weighted_nps.append(vals["nps"])

        if "children" in vals:
            for v, vt in vals["children"]:
                if vt is Link_Types.comp or vt is Link_Types.simcomp: continue
                #text = u"%s %s" % (text, math_knowledge[v]["paragraph"])
                lst_dct_weighted_nps.append(math_knowledge[v]["nps"])

        agg_nps = nps_aggregration(lst_dct_weighted_nps)
        mathdb, docdb = ws.search_wikipedia_pages(mml_comp, agg_nps)
        
        is_root = old_new_math_map[math_exp_rev[mid]] in roots
        is_root = str(is_root)

        mml_to_print = etree.fromstring(etree.tostring(math_map[mid]))
        etree.strip_tags(mml_to_print, "*")
        print "\t".join((is_root, mid, encode(mml_to_print.text), print_docs_score(mathdb), print_docs_score(docdb)))
Example #48
    def update_odo(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.odo_in)
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid', None)
            odo_label = entry.find('./label')
            odo_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
            etree.strip_tags(odo_label, 'i', 'sup', 'sub', 'hm')
            odo_label.text = odo_label_text
            link = entry.find('./linkSet/link')

            if link is not None:
                refentry = link.get('refentry', '0')
                refid = link.get('refid', '0')
                oed_label_text = self.oed_index.find(refentry, field='label') or LinkUpdater.error_message
                etree.strip_tags(link, 'i', 'sup', 'sub', 'hm')
                link.text = oed_label_text

            if (valid_links_only and
                (link is None or
                 link.text == LinkUpdater.error_message or
                 odo_label.text == LinkUpdater.error_message or
                 not check_match(link.text, odo_label.text))):
                entry.getparent().remove(entry)

        with open(self.odo_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))
Example #50
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row)."""
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     path = '//table[@id="AutoNumber1"]//ul'
     for ul in html.xpath(path):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         # Skip sections that are marked to be published at future date
         if preceding_text and not preceding_text.lower().endswith(' date'):
             # Below will fail if they change up string format
             date_string = preceding_text.split()[-1]
             case_date = convert_date_string(date_string)
             for element in ul.xpath('./li | ./a'):
                 if element.tag == 'li':
                     text = normalize_dashes(' '.join(element.text_content().split()))
                     if not text:
                         continue
                     anchor = element.xpath('.//a')[0]
                 elif element.tag == 'a':
                     # Malformed html, see connappct_example.html
                     anchor = element
                     glued = '%s %s' % (anchor.text_content(), anchor.tail)
                     text = normalize_dashes(' '.join(glued.split()))
                 self.cases.append({
                     'date': case_date,
                     'url': anchor.xpath('./@href')[0],
                     'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                     'name': text.split('-', 1)[1],
                 })
Example #51
def fix_article_title_tags(root):
    global output
    title = root.xpath("//title-group/article-title")[0]
    if title.xpath("//named-content"):
        etree.strip_tags(title, "named-content")
        output += "correction: removed named-content tags from article title\n"
    return root
Example #52
def processFile(first_docid, filename):
  with file(filename, 'rb') as xmlfile:
    xml = "<root>%s</root>" % xmlfile.read()
  parser=etree.XMLParser(recover=True)
  tree=etree.fromstring(xml,parser=parser)
  etree.strip_tags(tree,"a")
  def inclusion_filter(doc):
    return (not doc.get('url', '').endswith(u'_(disambiguation)') 
            and not u'List_of_' in doc.get('url', '')
            and len(doc.text.strip()) >= args.mindoclen)
  (kept, dropped) = bifurcate(inclusion_filter, tree.xpath('//doc'))
  print "Dropped %s of %s documents" % (len(dropped), len(kept) + len(dropped))
  docs = list(enumerate(kept, start=first_docid))
  if len(docs) == 0:
      return first_docid

  for (docid, doc) in docs:
    text=doc.text.encode('utf-8').split("\n")
    attrs = {
      'id'    : doc.get("id"),
      'title' : text[1],
      'text'  : "\n".join(text[2:]),
      'source': 'Wikipedia',
      'date'  : TODAY,
      'url'   : doc.get("url", "").replace("http://it.wikipedia.org","http://en.wikipedia.org")
    }
    add_document(attrs, docid)
  print "Processed %s docs from %s" % (len(docs), filename)
  return docs[-1][0]
Example #53
    def email(self, alerts):
        node = alert_node()
        dismisseds = [a.message_id for a in mAlert.objects.filter(node=node)]
        msgs = []
        for alert in alerts:
            if alert.getId() not in dismisseds:
                """
                This is all to allow <a> tags in alert messages.
                We need to strip out all the tags so we can send a
                plain text email.
                """
                msg = str(alert)
                msgnode = etree.fromstring('<msg>{}</msg>'.format(msg))
                for i in msgnode.xpath('//a'):
                    new = etree.Element('span')
                    new.text = '{} ({})'.format(i.text, i.attrib['href'])
                    msgnode.replace(i, new)

                etree.strip_tags(msgnode, '*')
                msgs.append(msgnode.text)
        if len(msgs) == 0:
            return

        hostname = socket.gethostname()
        send_mail(
            subject='%s: %s' % (
                hostname,
                _("Critical Alerts"),
            ),
            text='\n'.join(msgs)
        )
Example #54
def word_lookup(word):
    request_url = api_url + word + '?key=' + api_key
    try:
        doc = etree.parse(request_url)
    except IOError:
        print 'Failed to connect to API.'
        return
    except etree.XMLSyntaxError:
        print 'Invalid XML response when looking up "' + word + '".'
        return
    entries = doc.xpath('//entry_list/entry')
    suggestions = doc.xpath('//entry_list/suggestion')
    if entries:
        print '\n\tInput: ' + word
        for entry in entries:
            print '========================='
            print entry.find('ew').text
            for definition in entry.xpath('def/dt'):
                etree.strip_tags(definition,"*")
                print '=> ' + definition.text.replace(':','', 1)
        print '========================='
    elif suggestions:
        print 'The word "' + word + '" isn\'t in the dictionary.\nSuggestions:'
        for suggestion in suggestions:
            print suggestion.text
    else:
        print 'No results found for "' + word + '".'
Example #57
def clean_image_block(block_tree):
    """ Cleans up an image block to assure that
        it has the correct structure.
    """
    image = None
    img_wrapper = None
    caption = None
    image_found = False
    caption_found = False

    ## We get all the block descendants using lxml (should be "depth-first")
    ## in order to get image and caption elements, if any.
    for des in block_tree.iterdescendants():
        ## We only take the first img element found.
        if des.tag == 'img' and not image_found:
            image_found = True
            ## We set the image element.
            image = des
            ## If the img element is wrapped by a link
            ## we set the image_wrapper too.
            if des.getparent().tag == 'a':
                img_wrapper = des.getparent()
                ## If the class has been modified we put the correct one.
                img_wrapper.attrib['class'] = 'image-link'

        ## We only take the first span element (caption) found.
        if des.tag == 'span' and not caption_found:
            caption_found = True
            ## We set the caption element.
            caption = des
            ## If the class has been modified we put the correct one.
            caption.attrib['class'] = 'image-caption'

    ## If the image block has no image inside
    ## then it's invalid and we remove it.
    if image is None:
        block_tree.tag = 'invalid_image_block'
        etree.strip_elements(block_tree, 'invalid_image_block')
        return

    ## Sanitizing the caption: we strip out every element inside the span
    ## preserving the content and thus all the texts present.
    if caption is not None:
        etree.strip_tags(caption, '*')

    ## We go through the descendants again to mark invalid elements.
    for des in block_tree.iterdescendants():
        ## Invalid elements are all those elements which are neither the image
        ## nor the caption, nor the image_wrapper.
        if des is image or des is img_wrapper or des is caption:
            continue
        ## We remove invalid tags texts.
        des.text = ''
        ## We mark invalid tags for removal.
        des.tag = 'tag_to_be_stripped_out'

    ## We finally strip out tags marked as invalid
    ## now the image block should have the correct structure.
    etree.strip_tags(block_tree, 'tag_to_be_stripped_out')
Example #58
	def _clean_article(self, article):
		if len(article.cssselect('h1')) > 0:
			article.remove(article.cssselect('h1')[0])
		for e in article.cssselect('p,br,ul,li'):
			e.tail = '\n' + (e.tail if e.tail else '')
		etree.strip_tags(article, '*')
		text = unicode(article.text_content()).strip()
		self.text = re.sub(r'\W*\n\W*', '\n\n', text)
Example #59
    def parse_info_page(self, response):
        def get_text_formatted(node):
            from lxml.html import fromstring
            etree.strip_tags(node.xmlNode, "a")
            txt = node.extract()
            txt = txt.replace("<br/>", "\n")
            txt = txt.replace(u"\u00A0", " ")
            txt = fromstring(txt).text_content()
            txt = re.sub(r"\n[ \t]+", "\n", txt)
            return txt.strip()

        def get_text(node, regexp=None, invert=False):
            etree.strip_tags(node.xmlNode, "a")
            txt = ""
            for line in node.xpath(".//text()").extract():
                line = line.replace(u"\u00A0", " ")
                line = line.strip()
                if not line:
                    continue
                match = True
                if regexp:
                    match = regexp.search(line) and True or False
                if (match and not invert) or (not match and invert):
                    if line[0] != line[0].lower():
                        txt += ". "
                    txt += " %s " % line
            txt = re.sub("(\s\.+\s)+", ".", txt)
            txt = re.sub("[\s]+", " ", txt)
            txt = re.sub("[\.]+", ".", txt)
            txt = re.sub("^. ", "", txt)
            txt  = txt.strip()
            return txt

        lxs = LxmlSelector(response)
        item = response.meta["item"]
        meta = self.meta_as_dict(lxs)
        etree.strip_tags(lxs.xmlNode, "b", "font", "i")

        info_node = lxs.xpath("//a[@name = 'PDT']/ancestor::td[1]")
        if info_node:
            item["info"] = get_text_formatted(info_node[0])
        amendments_node = lxs.xpath("//a[@name = 'PAC']/ancestor::td[1]")
        if amendments_node:
            item["amendments"] = get_text_formatted(amendments_node[0])
        summary_node = lxs.xpath("//a[@name = 'ECRCM']/ancestor::td[1]")
        if summary_node:
            item["summary"] = get_text_formatted(summary_node[0])

        file_href = meta.get("URL_DOSSIER") or None
        if file_href:
            file_href = urljoin(response.url, file_href)
        item["law"] = LawItem(
            title=meta.get("LOI_PROMULGUEE", ""),
            href=meta.get("LIEN_LOI_PROMULGUEE", ""),
            file_href=file_href,
        )

        yield item