Esempio n. 1
0
def get_progressbar_element(percentage):
    """Build a Bootstrap progress bar element for ``percentage``.

    Args:
        percentage: Value convertible with ``int()``; it is truncated to an
            integer before being rendered.

    Returns:
        An outer ``<div class="progress">`` element containing a single
        ``<div class="progress-bar ...">`` child. The child is coloured
        "success" for values >= 70, "warning" for values >= 50 and
        "danger" otherwise.
    """
    percentage = int(percentage)
    if percentage >= 70:
        level = "success"  # green
    elif percentage >= 50:
        level = "warning"
    else:
        level = "danger"

    bar = Element("div")
    bar.attrib["class"] = (
        "progress-bar progress-bar-{} progress-bar-striped".format(level))
    bar.attrib["role"] = "progressbar"
    # Report the real value to assistive technology; this used to be a
    # hard-coded "10" regardless of the percentage shown.
    bar.attrib["aria-valuenow"] = "{}".format(percentage)
    bar.attrib["aria-valuemin"] = "0"
    bar.attrib["aria-valuemax"] = "100"
    bar.attrib[
        "style"] = "width:{}%; text-align:left; padding-left: 5px;".format(
            percentage)
    bar.text = "{}%".format(percentage)

    container = Element("div")
    container.attrib["class"] = "progress"
    container.attrib["style"] = "margin-bottom:0;"
    container.append(bar)
    return container
Esempio n. 2
0
def get_user(username, rank):
    """Return an element showing ``username``.

    When ``rank`` is None the name is a plain ``<span>``; otherwise it is an
    ``<a>`` whose class is ``rank`` and whose href is the reversed
    ``user_page`` URL for ``username``.
    """
    if rank is not None:
        attributes = {
            'class': rank,
            'href': reverse('user_page', args=[username]),
        }
        node = Element('a', attributes)
    else:
        node = Element('span')
    node.text = username
    return node
Esempio n. 3
0
def get_user(username, data):
    """Return an element showing ``username``.

    With falsy ``data`` the result is a bare ``<span>`` holding the name.
    Otherwise the span gets the CSS class computed by
    ``Profile.get_user_css_class(*data)`` and wraps an ``<a>`` pointing at
    the reversed ``user_page`` URL.
    """
    if data:
        wrapper = Element('span',
                          {'class': Profile.get_user_css_class(*data)})
        anchor = Element('a', {'href': reverse('user_page', args=[username])})
        anchor.text = username
        wrapper.append(anchor)
        return wrapper

    plain = Element('span')
    plain.text = username
    return plain
Esempio n. 4
0
 def _apply_headers_anchors(html: str) -> str:
     """Insert an icon anchor into every top-level header of ``html``.

     The markup is wrapped via ``wrap_unwrap_fake_tag`` before parsing and
     unwrapped again after serialising. Only direct children of the parsed
     root whose tag is in ``HEADERS`` are touched.
     """
     tree = fromstring(wrap_unwrap_fake_tag(html))
     for node in tree:
         if node.tag not in HEADERS:
             continue
         anchor_id = make_header_id(node.text)
         anchor = Element('a', {'id': anchor_id, 'href': f'#{anchor_id}'})
         icon = Element(
             'span',
             attrib={
                 'class': 'iconify',
                 'data-icon': HTMLGen.ANCHOR_LINK_ICON_CLASS,
             },
         )
         anchor.append(icon)
         # Separate the header text from the anchor that is inserted as the
         # header's first child.
         node.text = node.text + ' '
         node.insert(0, anchor)
     serialized = tostring(tree)
     return wrap_unwrap_fake_tag(serialized, wrap=False)
Esempio n. 5
0
    def _apply_font_icons(html):
        """Prefix the text of every recognised ``<a>`` with an icon span.

        The icon class is chosen from the link's href: GitHub URL, other
        ``http`` URL, in-page ``#`` anchor, or a file whose name ends with a
        key of ``HTMLGen.EXTENSIONS_ICON_CLASSES_MAP``. Links without href or
        without text are skipped; unrecognised hrefs are reported with
        ``print`` and skipped.
        """
        tree = fromstring(wrap_unwrap_fake_tag(html))
        for link in tree.iter('a'):
            href = link.attrib.get('href')
            # .text is empty in anchors <a>
            if not href or not link.text:
                continue

            if href.startswith('https://github.com'):
                # External link (GitHub)
                icon = HTMLGen.EXTERNAL_LINK_GITHUB_ICON_CLASS
            elif href.startswith('http'):
                # External link
                icon = HTMLGen.EXTERNAL_LINK_ICON_CLASS
            elif href.startswith('#'):
                # Anchor
                icon = HTMLGen.ANCHOR_LINK_ICON_CLASS
            elif any(href.endswith(known)
                     for known in HTMLGen.EXTENSIONS_ICON_CLASSES_MAP):
                # File
                suffix = href.rsplit('.', 1)[-1]
                icon = HTMLGen.EXTENSIONS_ICON_CLASSES_MAP[suffix]
            else:
                print('Unknown icon resource ', href)
                continue

            # Move the link text into the tail of the new icon span so the
            # icon is rendered before the text.
            icon_span = Element('span',
                                attrib={'class': 'iconify', 'data-icon': icon})
            icon_span.tail = ' ' + link.text
            link.text = None
            link.insert(0, icon_span)
        serialized = tostring(tree)
        return wrap_unwrap_fake_tag(serialized, wrap=False)
def main():
    """Generate Eclipse project files for every IntelliJ module file.

    Walks the folder given as ``sys.argv[1]``, and for each ``*.iml`` file
    found writes a ``.classpath`` and a ``.project`` file next to it. The
    ``.project`` name is the ``.iml`` file name without its extension.
    Empty ``.iml`` files are skipped. Progress is shown with a progress bar.
    """
    absolute_folder = sys.argv[1]
    pattern = '*.iml'
    # Walk through directory and keep the files matching the search string
    fileList = [
        os.path.join(dName, fileName)
        for dName, _sdName, fList in os.walk(absolute_folder)
        for fileName in fList
        if fnmatch.fnmatch(fileName, pattern)
    ]
    pbar = ProgressBar(widgets=['Processing :', Percentage(), ' ', Bar(), ' ', ETA()], maxval=len(fileList)).start()
    for fcount, fileName in enumerate(fileList, 1):
        output_dict = generate_empty_dict()
        project_dir = os.path.dirname(fileName)
        eclipse_file_path = os.path.join(project_dir, '.classpath')
        with open(fileName, 'r') as f:
            intellij_data = f.read()
        if not intellij_data:
            # Nothing to convert; the previous `pass` fell through and handed
            # the empty string to the XML parser.
            pbar.update(fcount)
            continue
        intellij_dict = xmltodict.parse(intellij_data)
        output_dict = addSrcType(intellij_dict, output_dict)
        output_dict = addCombinedRules(intellij_dict, output_dict)
        output_dict = addConType(intellij_dict, output_dict)
        result = bf.etree(output_dict, root=Element('classpath'))

        with open(eclipse_file_path, 'w') as f:
            data = tostring(result, doctype='<?xml version="1.0" encoding="UTF-8"?>')
            # Strip the serialised <classpath> wrapper and re-add it directly
            # after the XML declaration / at the very end of the document.
            data = data.replace('<classpath>','')
            data = data.replace('</classpath>', '')
            data = data.replace('<?xml version="1.0" encoding="UTF-8"?>', '<?xml version="1.0" encoding="UTF-8"?><classpath>')
            data = data +'</classpath>'
            f.write(data)
        # Add .project file
        project_path = os.path.join(project_dir, '.project')
        xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>%s</name>
    <comment/>
    <projects/>
    <buildSpec>
	<buildCommand>
		<name>org.eclipse.jdt.core.javabuilder</name>
		<arguments/>
	</buildCommand>
    </buildSpec>
    <natures>
	<nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>"""
        root_name = os.path.splitext(os.path.basename(fileName))[0]
        xml_data = xml_data%(root_name)
        with open(project_path, 'w') as f:
            f.write(xml_data)
        pbar.update(fcount)
    pbar.finish()
Esempio n. 7
0
 def footer(self, node):
     """Build the standard ``<footer>`` block used in generated HTML files.

     The footer holds a horizontal rule followed by a paragraph naming
     ``node.sourcefile`` and the current local time.
     """
     block = Element('footer')
     block.append(E.HR())
     notice = (
         "Generated automatically from {source} at {time:%d %b %Y %H:%M}."
         .format(source=node.sourcefile, time=datetime.datetime.now()))
     block.append(E.P(notice))
     return block
Esempio n. 8
0
def get_user_rating(username, rating):
    """Return an ``<a class="rate-group">`` linking to the user's page.

    With a truthy ``rating`` the link contains a rating box span and a span
    with the user name, both carrying the CSS class from
    ``rating_class(rating)``. With a falsy ``rating`` the link just contains
    the user name as text.
    """
    link = Element('a', {
        'class': 'rate-group',
        'href': reverse('user_page', args=[username])
    })
    if not rating:
        link.text = username
        return link

    css = rating_class(rating)
    box = Element('span', {'class': 'rate-box ' + css})
    # NOTE(review): '%3.f' means width 3 with zero decimals; confirm that
    # '%.3f' was not intended for the fill height.
    fill = Element('span',
                   {'style': 'height: %3.fem' % rating_progress(rating)})
    box.append(fill)
    name = Element('span', {'class': 'rating ' + css})
    name.text = username
    link.append(box)
    link.append(name)
    return link
Esempio n. 9
0
 def test_link_element(self):
     # A <link> element must yield exactly one link, reported for its href.
     found = list(links(Element('link', {'href': '#'})))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 0)
     self.assertEqual(attribute, 'href')
     self.assertEqual(node.tag, 'link')
     self.assertEqual(node.attrib, {'href': '#'})
Esempio n. 10
0
 def test_get_wrapper_tag(self):
     # Each case: (allow_tags, wrap_inline_tags, expected wrapper tag or
     # None when no wrapper is expected).
     cases = [
         (None, 'div', None),
         (['p', 'div'], False, None),
         (['p', 'div'], None, 'p'),
         (['div'], None, 'div'),
         (['b'], None, None),
         (['p', 'div'], 'div', 'div'),
         (['p', 'div', 'span'], (lambda: Element('span')), 'span'),
         (['p', 'div'], (lambda: Element('span')), None),
     ]
     for allowed, wrap, expected_tag in cases:
         cleaner = html.Cleaner(allow_tags=allowed, wrap_inline_tags=wrap)
         wrapper = cleaner.get_wrapper_tag()
         if expected_tag is None:
             self.assertEqual(wrapper, None)
         else:
             self.assertEqual(wrapper.tag, expected_tag)
Esempio n. 11
0
 def test_form_element(self):
     # A <form> element must yield exactly one link, reported for its action.
     found = list(links(Element('form', {'action': '#'})))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 0)
     self.assertEqual(attribute, 'action')
     self.assertEqual(node.tag, 'form')
     self.assertEqual(node.attrib, {'action': '#'})
Esempio n. 12
0
 def test_meta_refresh_element(self):
     # A <meta http-equiv="refresh"> must yield one link from its content.
     found = list(
         links(Element('meta', {'http-equiv': 'refresh', 'content': '#'})))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 0)
     self.assertEqual(attribute, 'content')
     self.assertEqual(node.tag, 'meta')
     self.assertEqual(node.attrib,
                      {'http-equiv': 'refresh', 'content': '#'})
Esempio n. 13
0
 def test_inline_css_url(self):
     # A url() inside a style attribute is reported with its offset in the
     # attribute value.
     found = list(links(Element('div', {'style': 'background: url("#");'})))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 17)
     self.assertEqual(attribute, 'style')
     self.assertEqual(node.tag, 'div')
     self.assertEqual(node.attrib, {'style': 'background: url("#");'})
Esempio n. 14
0
 def test_script_element(self):
     # A <script> element must yield exactly one link, reported for its src.
     found = list(links(Element('script', {'src': '#'})))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 0)
     self.assertEqual(attribute, 'src')
     self.assertEqual(node.tag, 'script')
     self.assertEqual(node.attrib, {'src': '#'})
Esempio n. 15
0
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.

    ``inline_tags`` lists the tags that are kept inside the paragraph that is
    currently being built (default: ``['a']``); any other tag closes the
    current paragraph. Trees without children or without any ``<br>`` are
    returned unchanged (the same object). Otherwise a new element with the
    same tag is built and returned; attributes of the rebuilt element are
    not copied.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ['a']

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find('.//br') is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # paragraph currently being built; stays None until one has been opened
    p = None

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == 'br':
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)
        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            if p is None:
                # an inline tag came before any text or <br>: open a
                # paragraph for it instead of failing on an unbound `p`
                p = E.P()
                new_tree.append(p)
            p.append(e)
        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though. (strip() looks at the whole tail, not
        # just its first character.)
        elif e.tail and e.tail.strip():
            p = E.P()
            p.text = e.tail
            e.tail = ''
            new_tree.append(e)
            new_tree.append(p)
        else:
            # propagate the caller's inline tag list to nested trees
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
Esempio n. 16
0
 def test_html_style_tag_css_import(self):
     # An @import url() in <style> text is reported with attr None and the
     # offset of the URL inside the text.
     style = Element('style')
     style.text = '@import url(#);'
     found = list(links(style))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 12)
     self.assertEqual(attribute, None)
     self.assertEqual(node.tag, 'style')
     self.assertEqual(node.attrib, {})
Esempio n. 17
0
 def test_html_style_tag_css_url_with_altering_colons(self):
     # The url() argument opens with a double quote and closes with a single
     # quote; the link must still be found.
     style = Element('style')
     style.text = 'html {background: url("#\');}'
     found = list(links(style))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, '#')
     self.assertEqual(offset, 23)
     self.assertEqual(attribute, None)
     self.assertEqual(node.tag, 'style')
     self.assertEqual(node.attrib, {})
Esempio n. 18
0
 def test_script_element_with_url_in_the_text(self):
     # A url() inside <script> text is reported with attr None and the
     # offset of the URL inside the text.
     script = Element('script')
     script.text = 'var background = "url(\'image.jpg\')"'
     found = list(links(script))
     self.assertEqual(len(found), 1)
     node, attribute, link, offset = found[-1]
     self.assertEqual(link, 'image.jpg')
     self.assertEqual(offset, 23)
     self.assertEqual(attribute, None)
     self.assertEqual(node.tag, 'script')
     self.assertEqual(node.attrib, {})
Esempio n. 19
0
def add_bootstrap_in_html_header(input_filename, output_filename):
    """Copy an HTML file, adding a Bootstrap stylesheet link to its head.

    Reads ``input_filename``, appends a ``<link>`` element for
    ``static/css/bootstrap.min.css`` to the document's ``<head>`` and writes
    the serialised document to ``output_filename``.
    """
    with open(input_filename) as source:
        markup = source.read()

    stylesheet = Element("link")
    for name, value in (
            ("rel", "stylesheet"),
            ("href", "static/css/bootstrap.min.css"),
            ("type", "text/css"),
    ):
        stylesheet.attrib[name] = value

    document = lhtml.fromstring(markup)
    document.head.append(stylesheet)

    serialized = lhtml.tostring(document, encoding=str)
    with open(output_filename, "w") as sink:
        sink.write(serialized)
Esempio n. 20
0
    def inject_script_tag(self, html):
        """Return ``html`` with the contents of ``mask_headless.js`` injected.

        The script file is read from the current working directory and
        inserted as a ``<script>`` element at index 0 of the parsed root.
        If parsing yields ``None`` the input is returned unchanged.
        """
        document = lxml.html.fromstring(html)
        if document is None:
            # Sometimes non-html sneaks through the header check
            return html

        with open('mask_headless.js') as js_file:
            js_source = js_file.read()

        script_node = Element("script")
        script_node.text = js_source
        document.insert(0, script_node)

        serialized = lxml.html.tostring(document, method="html")
        return serialized.decode('utf-8')
Esempio n. 21
0
def wrap_set(dom, child_tag, parent_tag):
    """Wrap unbroken sets of elements in a parent container:
        - <li> in a <ul>
        - <tr> in a <table>

    Elements matching ``child_tag`` are visited in ``cssselect`` order. A new
    ``parent_tag`` container is started whenever the current element is not
    the one recorded as the expected successor of the previous element.
    """
    expected = 0  # sentinel: never equal to an element, forces a first box
    for child in dom.cssselect(child_tag):
        if expected != child:
            container = Element(parent_tag)
            insert(container, child)
        container.append(child)
        # Remember what should come next if the run continues.
        expected = parent(child).getnext()
        if expected is None:
            expected = child.getnext()
Esempio n. 22
0
def make_html_element(
    tag,
    text='',
    children=None,
    tail=None,
):
    """Make a HtmlElement object.

    Args:
        tag: Tag name of the new element.
        text: Text placed inside the element; ``None`` leaves it unset.
        children: Optional iterable of elements appended as children.
        tail: Text following the element's closing tag; ``None`` leaves it
            unset.

    Returns:
        The newly created element.
    """
    el = Element(tag)
    if text is not None:
        el.text = text
    if children is not None:
        el.extend(children)
    if tail is not None:
        # Previously assigned `text` here, so the tail argument was ignored.
        el.tail = tail
    return el
Esempio n. 23
0
 def test_img_src_set_attribute_bad_formatted(self):
     # Both candidates of a badly separated src-set value must be found.
     # Entries are listed in the order they are popped from the end of the
     # result list.
     attributes = {'src-set': 'img1 1x; img2 2x,'}
     found = list(links(Element('img', {'src-set': 'img1 1x; img2 2x,'})))
     self.assertEqual(len(found), 2)
     for expected_url, expected_pos in (('img1', 0), ('img2', 9)):
         node, attribute, link, offset = found.pop()
         self.assertEqual(link, expected_url)
         self.assertEqual(offset, expected_pos)
         self.assertEqual(attribute, 'src-set')
         self.assertEqual(node.tag, 'img')
         self.assertEqual(node.attrib, attributes)
Esempio n. 24
0
def fragment_fromstring(html,
                        create_parent=False,
                        guess_charset=None,
                        parser=None):
    """Parse a string that holds a single HTML element.

    Without 'create_parent' the input must contain exactly one element and
    nothing but whitespace around it; otherwise ``etree.ParserError`` is
    raised. The element is returned with its tail cleared.

    If 'create_parent' is true (or is a tag name) a new parent element is
    created (a ``div`` unless a tag name was given) and all parsed content,
    including leading text, is placed inside it.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises ``TypeError`` when `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    wants_parent = bool(create_parent)
    parts = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not wants_parent,
    )

    if not create_parent:
        # Strict mode: exactly one element, no significant trailing text.
        if not parts:
            raise etree.ParserError("No elements found")
        if len(parts) > 1:
            raise etree.ParserError("Multiple elements found")
        only = parts[0]
        if only.tail and only.tail.strip():
            raise etree.ParserError(
                "Element followed by text: %r" % only.tail)
        only.tail = None
        return only

    parent_tag = create_parent if isinstance(create_parent, _strings) else "div"
    wrapper = Element(parent_tag)
    if parts:
        # A leading string becomes the wrapper's text, the rest its children.
        if isinstance(parts[0], _strings):
            wrapper.text = parts[0]
            del parts[0]
        wrapper.extend(parts)
    return wrapper
Esempio n. 25
0
def reconstruct_tables(dom, data):
    """Turn positioned cells into ``<tr>``/``<td>`` markup inside ``dom``.

    Args:
        dom: Parsed document that is modified in place.
        data: Iterable of cell records; the code reads their ``page``, ``y``,
            ``x``, ``clipbox``, ``elem`` and ``text`` attributes.

    Returns:
        The same ``dom`` object, after rows with more than one non-empty
        cell were converted and consecutive ``tr`` elements were wrapped in
        ``table`` elements via ``wrap_set``.
    """
    # order data vertically into row lists by page, row and finally column
    rows = collections.OrderedDict()
    cboxes = {}
    for c in sorted(data, key=lambda c: (c.page, c.y, c.x)):
        # combine page number and row position to get a useful key
        key = '%d,%d' % (c.page, c.y)
        # create row lists(y) and clipbox groups(x)
        rows.setdefault(key, []).append(c)
        cboxes.setdefault(c.clipbox, []).append(c.elem)

    # collect cell lines with same clip boxes; the first cell seen for a
    # clip box keeps all of its lines, later cells are dropped from their row
    merged = set()
    for key, row in rows.items():
        for cell in row:
            if cell.clipbox in merged:
                rows[key] = [c for c in rows[key] if c != cell]
            else:
                cell.lines = cboxes[cell.clipbox]
                merged.add(cell.clipbox)

    for row in rows.values():
        # hardly a table row if there is only one non-empty element in it at the start of a line
        if len([c for c in row if c.text]) > 1:
            tr = parent(row[0].elem)
            tr.tag = 'tr'
            for cell in row:
                cell.elem.tag = 'td'
                cell.elem.attrib['class'] = ''
                for line in cell.lines[1:]:
                    line.attrib['class'] = ''
                    if BR: cell.elem.append(Element('br'))
                    cell.elem.append(line)
                tr.append(cell.elem)
    # drop empty span, divs -- the tag test must guard both text conditions;
    # without the grouping, any element whose text is a single space was
    # dropped regardless of its tag
    for e in dom.iter():
        if e.tag in ('span', 'div') and e.text_content() in ('', ' '):
            e.drop_tag()

    wrap_set(dom, 'tr', 'table')
    return dom
Esempio n. 26
0
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector.

    Orphan text that follows a block element inside the ``wrapper`` element
    is moved into new ``<p class="otherpara">`` paragraphs, together with the
    non-block siblings that follow it.

    Returns:
        The parsed root element.

    Raises:
        UnicodeDecodeError: If ``UnicodeDammit`` produces no unicode markup
            for ``htmlstring``.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError takes (encoding, object, start, end, reason);
        # calling it with a single message raised TypeError instead.
        raw = htmlstring if isinstance(htmlstring, bytes) else b''
        raise UnicodeDecodeError(
            'unknown', raw, 0, len(raw), 'Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # A block element, a section marker or the last child ends the
            # paragraph that is being collected.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            # Start a new paragraph from the block element's trailing text.
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
Esempio n. 27
0
    def open_tag(self, tag, attrs):
        """
        Handle a new tag. Performs autoclose checks.

        ``attrs`` is an iterable of ``(name, value)`` pairs; falsy values are
        stored as empty strings. The first tag opened becomes ``self.root``;
        later tags are appended to the element on top of ``self.stack``. The
        new element is then pushed onto the stack.
        """
        node = Element(tag)
        for name, value in attrs:
            node.set(name, value or "")

        if self.root is not None:
            if not self.stack:
                self.error("Unexpected open tag: %s" % (tag))
            self.check_autoclose_on_open(tag)
            self.stack[-1].append(node)
        else:
            self.root = node
        self.stack.append(node)

        if tag == "svg":
            self.in_svg = True
Esempio n. 28
0
 def test_html_style_tag_font_face(self):
     # All four url() entries of an @font-face rule must be found, each with
     # attr None and the offset of the URL inside the <style> text.
     source = Element('style')
     # The asserted offsets (74, 162, 232, 305) correspond to CSS lines that
     # are indented by 8 spaces. The whitespace is spelled out explicitly so
     # re-indenting this method cannot silently shift the offsets.
     source.text = (
         "\n"
         "        @font-face {\n"
         "        font-family:'fontawesome';\n"
         "        src:url('../lib/fonts/fontawesome.eot?14663396#iefix') format('embedded-opentype'),\n"
         "        url('../lib/fonts/fontawesome.woff?14663396') format('woff'),\n"
         "        url('../lib/fonts/fontawesome.ttf?14663396') format('truetype'),\n"
         "        url('../lib/fonts/fontawesome.svg?14663396#fontawesome') format('svg');\n"
         "        font-style:normal;\n"
         "        }\n"
         "        "
     )
     elements = list(links(source))
     self.assertEqual(len(elements), 4)
     # Listed in the order the entries are popped from the end of the list.
     expected = (
         ('../lib/fonts/fontawesome.eot?14663396#iefix', 74),
         ('../lib/fonts/fontawesome.woff?14663396', 162),
         ('../lib/fonts/fontawesome.ttf?14663396', 232),
         ('../lib/fonts/fontawesome.svg?14663396#fontawesome', 305),
     )
     for expected_url, expected_pos in expected:
         el, attr, url, pos = elements.pop()
         self.assertEqual(url, expected_url)
         self.assertEqual(pos, expected_pos)
         self.assertEqual(attr, None)
         self.assertEqual(el.tag, 'style')
         self.assertEqual(el.attrib, {})
Esempio n. 29
0
    def parse_item(self, response):
        """Build an item from an article page response.

        Collects the article's images (with caption and alt text), author,
        date and title into ``item['json']``, strips title, author/date
        block, gallery wrapper and caption spans from a copy of the article
        markup, and stores the serialised remainder as
        ``item['json']['content']``.

        Reads ``response.meta['source_url']`` and ``response.meta['category']``.
        Uses ``xrange`` and ``urlparse.urljoin``, i.e. Python 2 names
        (NOTE(review): confirm the target interpreter).

        Returns the populated item.
        """
        item = self.get_new_item(response)
        html = utils.decode(response.body, response.encoding)
        article = response.xpath("//div[contains(@class,'item-page')]")
        # Maps absolute image URL -> (caption markup or None, alt text or None).
        caption_images = {}
        caption_imgs = article.xpath(".//p/span[contains(@class,'wf_caption')]")
        imgs = article.xpath(".//p/img")
        # Images wrapped in a caption span: alt falls back to title, then None.
        for caption_img in caption_imgs:
            if caption_img.xpath(".//img/@alt"):
                image_alt = caption_img.xpath(".//img/@alt").extract()[0].strip()
            elif caption_img.xpath(".//img/@title"):
                image_alt = caption_img.xpath(".//img/@title").extract()[0].strip()
            else:
                image_alt = None
            caption_images[urlparse.urljoin(response.url, caption_img.xpath(".//img/@src").extract()[0].strip())] = ("".join(caption_img.xpath(".//span").extract()),image_alt)
        # Images placed directly inside a paragraph: no caption.
        for img in imgs:
            if img.xpath(".//@alt"):
                image_alt = img.xpath(".//@alt").extract()[0].strip()
            elif img.xpath(".//@title"):
                image_alt = img.xpath(".//@title").extract()[0].strip()
            else:
                image_alt = None
            caption_images[urlparse.urljoin(response.url, img.xpath(".//@src").extract()[0])] = (None,image_alt)
        # Full-size URLs of gallery thumbnails, in document order.
        slid_imgs = []
        for main_images_wrapper in response.xpath("//div[contains(@id,'main_images_wrapper')]"):
            desc = []
            image_info = []
            for main_des_container in main_images_wrapper.xpath(".//div[contains(@id,'main_des_container')]/div[contains(@class,'des_div')]/p"):
                desc.append(main_des_container.xpath(".//text()").extract()[0].strip())
            for img_tag in main_images_wrapper.xpath(".//div[contains(@id,'main_thumbs_arrow_wrapper')]/div[contains(@id,'main_thumb_container')]//img[contains(@class,'ig_thumb')]"):
                if img_tag.xpath(".//@src"):
                    # Rewrite the thumbnail size token in the URL to the larger variant.
                    path = urlparse.urljoin(response.url,img_tag.xpath(".//@src").extract()[0].strip().replace("120-90-80-c","600-450-80"))
                    slid_imgs.append(path)
                else:
                    path = ""
                alt = img_tag.xpath(".//@alt").extract()[0].strip() if img_tag.xpath(".//@alt") else None
                image_info.append((path,alt))
            # Pair descriptions with thumbnails by position.
            # NOTE(review): raises IndexError if there are more descriptions
            # than thumbnails -- confirm the page always has matching counts.
            for i in xrange(len(desc)):
                caption_images[image_info[i][0]] = (desc[i],image_info[i][1])
        item['image_urls'] = [u for u in caption_images.keys()]
        item["json"]["caption_images"] = caption_images

        # The "createdby" text is split on '|': with two or more parts the
        # first is the author (its first three characters are cut off) and
        # the second the date; otherwise the whole text is used as the date.
        createdby = response.xpath("//dd[@class='createdby']//text()").extract()[0]
        createdBySplit = createdby.split('|')
        if len(createdBySplit) >= 2:
            item['json']['author'] = createdBySplit[0].strip()[3:]
            item['json']['date'] = createdBySplit[1].strip()
        else:
            item['json']['date'] = createdby.strip()

        item['json']['title'] = article.xpath(".//h2/text()").extract()[0].strip()
        item['json']['item_url'] = response.url

        # Re-parse the article markup so it can be edited independently of
        # the response selector.
        content_document=fromstring(article.extract()[0].strip())
        # Remove the title and the author/date block from the content.
        del_title = content_document.xpath(".//h2")[0]
        del_title.getparent().remove(del_title)
        del_author_date = content_document.xpath(".//dl[contains(@class,'article-info')]")[0]
        del_author_date.getparent().remove(del_author_date)
        # Replace the gallery wrapper with plain <img> elements for the
        # full-size gallery images.
        if content_document.xpath(".//div[contains(@id,'main_images_wrapper')]"):
            del_main_images_wrapper = content_document.xpath(".//div[contains(@id,'main_images_wrapper')]")[0]
            for image_url in slid_imgs:
                img_doc = Element("img",**{"src":image_url})
                del_main_images_wrapper.addprevious(img_doc)
            del_main_images_wrapper.getparent().remove(del_main_images_wrapper)
        # NOTE(review): indexes [0] unconditionally, so this raises
        # IndexError when the page has no 'igallery_clear' div.
        del_igallery_clear_div = content_document.xpath(".//div[contains(@class,'igallery_clear')]")[0]
        del_igallery_clear_div.getparent().remove(del_igallery_clear_div)
        # Replace each caption span by the image it contains.
        captions = content_document.xpath(".//p/span[contains(@class,'wf_caption')]")
        if captions:
            for caption in captions:
                keep_img = caption.xpath(".//img")[0]
                caption.addnext(keep_img)
                caption.getparent().remove(caption)
        item["json"]["content"] = tostring(
            content_document,
            encoding="UTF-8"
        )

        sourceurl = response.meta['source_url']
        item['json']['category'] = response.meta['category']
        item['html'] = html
        # Maps the source URL to the decoded page HTML.
        htmls_path = {
            sourceurl:html
        }
        item["htmls_path"] = htmls_path
        item['source_url'] = sourceurl
        return item
Esempio n. 30
0
def semanticize(doc_path='test.html'):
    """
    Convert the positioned HTML at ``doc_path`` into semantic HTML and save
    it as a ``.htm`` file.

    P: unbroken set of lines (.t divs) of the same look make one <p>
    H1-3: Top 3 kinds of font size are turned to h1, h2 and h3.
    TABLE: use x and y position to indicate <td>, TODO: colspan support

    Returns None. When more than half of the text is detected as private-use
    characters the function prints a notice and returns before converting
    or writing anything.
    """
    print(doc_path)
    dom, dimensions = prepare(doc_path)
    # Look up the numeric value of a dimension class on an element; 0 when
    # the class is unknown.
    get_dimension = lambda el, dim_type: dimensions[dim_type].get(
        classN(dim_type, el)) or 0

    # recover text from embedded fonts with bad CMAPS if > 50% of characters are unicode PUA
    recover = pua_content(dom.text_content()) > 0.5
    if recover:
        print('Recovery needed, not now.')
        return
        # NOTE: unreachable because of the return above; recovery is
        # currently disabled.
        recover_text(dom, os.path.dirname(doc_path))

    # remove paging headers
    if REMOVE_HEADERS:
        dom = remove_headers(dom)

    # remove javascript holders
    for div in dom.cssselect('.j'):
        remove(div)

    if TABLES:
        table_data = grid_data(dom, get_dimension)
        dom = reconstruct_tables(dom, table_data)

    h_levels = heading_levels(dom, dimensions)

    # line by line analysis and conversion
    # State carried over from the previous line; `box` is the element that
    # continuation lines are appended to.
    p_look = p_height = p_space = p_tag = box = 0

    for l in dom.cssselect('.t'):
        # Gather information about this line to see if it's part of a block.
        # 1. detect change of look - different css classes from previous line
        look = ' '.join([
            c for c in l.attrib['class'].split()
            if c[0] != 'y' and c[0:2] != 'fc'
        ])  # ignore y pos and font color
        new_look = p_look != look
        # 2. detect change of margin height - larger difference in bottom position from previous line
        height = get_dimension(l, 'h')
        line_height = p_height - height
        margin = line_height > MAX_LINE_HEIGHT
        # 3. space above - preceding empty line
        space = not l.text_content().strip()

        # Based on collected info: does this line belong to previous line?
        # Chained comparison: true only when new_look, p_space and margin
        # are all false.
        append = new_look == p_space == margin == False

        txt = l.text_content()

        tag = 'p'

        # LI
        indent = 'x0' not in look  # there is some indentation
        if [1 for b in BULLETS if txt.startswith(b)]:
            # a line starting with a bullet always opens a new list item
            tag = 'li'
            append = 0
        elif indent and p_tag == 'li':
            # indented line right after a list item continues that item
            tag = 'li'
            append = 1
        # H1, H2...
        size = classN('fs', l)
        if size in h_levels.keys():
            append = 0
            tag = 'h%s' % h_levels[size]

        # merge multiline-elements
        if txt.strip():
            if append:
                if BR: box.append(Element('br'))
                box.append(l)
            else:
                box = l
                l.tag = tag
        else:
            # lines without visible text are dropped
            remove(l)

        if DEBUG:
            mark = ('<%s>' % tag).ljust(5)
            if append: mark = 5 * ' '
            print(' Aa %d    ⇪ %d    ⇕ % 3d    %s    %s    %s' %\
                (new_look, p_space, line_height, l.attrib['class'].ljust(40), mark, txt))

        # save current values for comparison in the next loop iteration
        p_space, p_height, p_look, p_tag = space, height, look, tag

    wrap_set(dom, 'li', 'ul')

    if STRIP_CSS:
        for e in dom.cssselect("style"):
            remove(e)
        # strip presentation-related attributes from every element
        for attr in 'style id class data-page-no data-data'.split():
            for e in dom.cssselect("*"):
                try:
                    del e.attrib[attr]
                except KeyError:
                    pass

    # save file
    html = tostring(dom, encoding=ENCODING, pretty_print=True).decode(ENCODING)
    s = '<!DOCTYPE html>' + html
    # textual clean-up of the serialised document
    for a, b in REPLACE_AFTER:
        s = re.sub(a, b, s)
    for rm in REMOVE_AFTER:
        s = re.sub(rm, '', s)
    for b in BULLETS:
        s = s.replace(b, '')
    # NOTE: `recover` is always false here because of the early return above.
    if recover:
        for rm in REMOVE_BEFORE:
            s = re.sub(rm, '', s)
    # Output path: directory of doc_path (with 'HTML' replaced by 'HTM')
    # plus the '.htm' suffix.
    save_path = os.path.dirname(doc_path.replace('HTML', 'HTM')) + '.htm'
    f = open(save_path, 'w', encoding=ENCODING)
    f.write(s)
    f.close()