Example #1
def main():
    args = argument_parser().parse_args()

    with pathlib.Path(args.file).open('r') as fp:
        soup = bs4.BeautifulSoup(fp, 'html.parser')
        soup.ul.contents = ([bs4.NavigableString('\n')]
                            + sort_mods(soup)
                            + [bs4.NavigableString('\n')])

    with pathlib.Path(args.output or args.file).open('w') as file:
        file.write(str(soup))
Example #2
def parse_content(parent: Union[bs4.NavigableString, bs4.Tag, bs4.Comment]) -> bs4.NavigableString:
    res = ''
    if isinstance(parent, bs4.Comment):
        pass
    elif isinstance(parent, bs4.NavigableString):
        return parent
    else:
        children = parent.contents
        if len(children) == 0:
            html_tag = str(parent)
            return bs4.NavigableString('\n') if 'br' in html_tag else bs4.NavigableString('')
        else:
            for child in children:
                res += parse_content(child)
    return bs4.NavigableString(res)
Example #3
    def new_tag(self,
                name,
                parent=None,
                string=None,
                class_=None,
                index=None,
                before=None,
                after=None,
                **kwargs):
        tag = self.__doc.new_tag(name, **kwargs)
        if string is not None:
            if tag.string is not None:
                tag.string.replace_with(string)
            else:
                tag.string = bs4.NavigableString(string)
        if class_ is not None:
            tag['class'] = class_
        if before is not None:
            before.insert_before(tag)
        elif after is not None:
            after.insert_after(tag)
        elif parent is not None:
            if index is None or index < 0:
                parent.append(tag)
            else:
                parent.insert(index, tag)

        return tag
Example #4
File: w3c.py Project: sumsung007/xspider
    def setter(self, value):
        # `parts` is defined in the enclosing scope (not shown here).
        tag = self.doc

        for part in parts:
            if part == '':
                continue
            elif part == 'text()':
                if tag.string:
                    tag.contents[0] = bs4.NavigableString(value)
                else:
                    tag.append(value)

                tag.string = tag.contents[0]

                return
            else:
                child = tag.find(part)

                if not child:
                    child = self.doc.new_tag(part)

                    tag.append(child)

                tag = child

        tag.append(value)
Example #5
File: w3c.py Project: sumsung007/xspider
    def setter(self, text):
        if self.tag.string:
            self.tag.contents[0] = bs4.NavigableString(text)
        else:
            self.tag.append(text)

        self.tag.string = self.tag.contents[0]
Example #6
def run_script(name, file_name="projects.html"):
    with open(file_name) as inf:
        txt = inf.read()
        soup = bs4.BeautifulSoup(txt, 'html.parser')

    with open(f"{file_name.split('.')[0]}_copy.html", "w+") as fc:
        fc.write(str(soup.prettify()))


    root = soup.new_tag("div", **{'class': "project-tile"})

    obj = soup.new_tag("div", **{'class': "project-object"})
    cont = soup.new_tag("div", **{'class': "project-container", 'onclick': "toggle_project_info(this)"})
    img = soup.new_tag("img", **{'alt': '_'.join(name.split()), 'class': "project-object-img"}, src=f"resources/img/projects/{'_'.join(name.split())}.png")
    ovly = soup.new_tag("div", **{'class': 'project-object-img-overlay'})
    p = soup.new_tag("p", **{'class': 'project-object-name'})
    p.insert(0, bs4.NavigableString(name.capitalize()))
    cont.append(img)
    cont.append(ovly)
    cont.append(p)
    obj.append(cont)

    info = soup.new_tag("div", **{'class': "project-info"})
    txt = soup.new_tag("div", **{'class': "text"})
    em = soup.new_tag("embed", **{'class': "readme"}, src=f"resources/readmes/{'_'.join(name.split())}.html")
    txt.append(em)
    info.append(txt)
    root.append(obj)
    root.append(info)

    soup.html.body.section.div.append(root)
    soup = soup.prettify()

    with open(file_name, "w") as outf:
        outf.write(str(soup))
Example #7
File: views.py Project: baggids/qcat
    def normalize_rotated_range(self):
        """
        Normalize 'rotated' ranges, indicated by the class 'vertical-title'
        """
        for container in self.soup.select('.vertical-title'):

            # Extract the labels from the header.
            for header_labels in container.select('.rotate'):
                labels = []
                header_labels.wrap(self.soup.new_tag('table'))
                for div in header_labels.select('div'):
                    labels.append(div.text)

                # Fill in the checked value as text, remove all ranges.
                for sibling in container.find_next_siblings('div'):
                    squares = sibling.select('.range_square')
                    if squares:
                        # Get the position of the selected element
                        for i, square in enumerate(
                                squares[0].parent.select('div')):
                            if 'range_true' in square.get('class', []):
                                # Print the text-label
                                squares[0].parent.parent.insert(
                                    0, bs4.NavigableString(labels[i]))

                        # Remove the squares.
                        squares[0].parent.decompose()

            # Remove the header row.
            container.decompose()

        # Remove the additional lines with 'hr' tags.
        for inline_comment in self.soup.select('.inline-comment'):
            for hr in inline_comment.select('hr'):
                hr.parent.decompose()
Example #8
def _add_word_with_annotation(line_num, word_num, word, stemmer, soup,
                              freq_dictionary):
    try:
        soup.find_all("p")[line_num].append(soup.new_tag("w"))
        w_tag = soup.find_all("w")[word_num]
        w_tag.append(bs4.NavigableString(word))
        analysis = stemmer.analyze(word)[0]['analysis'][0]
        w_tag['lex'] = analysis['lex']
        w_tag['gr'] = analysis['gr']
    except IndexError:
        return None
    return None
Example #9
def get_content_from_soup(soup):
    for b in soup.findAll('br'):
        b.replace_with("❡")
    for a in soup.findAll('a'):
        a.string = _LINK_FORMAT.format(text=a.text,
                                       href=a.attrs.get('href',
                                                        '#').replace(".", "."))
    for paragraph_holder in ['h1', 'h2', 'h3', 'h4', 'h5']:
        for e in soup.findAll(paragraph_holder):
            e.insert(0, bs4.NavigableString("❡ § "))
    for paragraph_holder in ['p', 'div']:
        for e in soup.findAll(paragraph_holder):
            e.insert(0, bs4.NavigableString("❡"))
    for deletable_tag in ['script', 'style', 'header', 'link', 'footer']:
        for e in soup.findAll(deletable_tag):
            e.decompose()
    for deletable in ['header', 'footer', 'wm-ipp-base']:
        for e in soup.findAll(class_=deletable):
            e.decompose()
        for e in soup.findAll(id=deletable):
            e.decompose()
    for e in soup.findAll("li"):
        e.insert(0, bs4.NavigableString("❡ • "))
    paragraphs = [[
        s.strip().replace(".", ".") for s in p.split('.') if s.strip()
    ] for p in soup.text.replace("\n", "").split('❡')]
    to_ret = []
    bullet_carry = False
    for p in paragraphs:
        if p == ['•']:
            bullet_carry = True
            continue
        if p:
            if bullet_carry:
                p[0] = ' • ' + p[0]
            to_ret.append(p)
            bullet_carry = False
    return to_ret
Example #10
def compile_latex(article: Article) -> Article:
    """Looks through the article content for embedded LaTeX and compiles it into
    PDFs, and adds the proper tags so they show up on import.
    """
    text_tag: bs4.NavigableString
    #matches LaTeX inside one or two dollar signs
    inline_regex = r'\$?\$([^\$]+)\$\$?'
    for text_tag in article.content.find_all(text=True):
        p = re.compile(inline_regex)
        for match in p.finditer(text_tag):
            latex = match.group(1)
            if not is_latex(latex):
                continue
            #just use the hash of the latex for a unique filename, this should probably never collide
            filename = article.get_pdf_location(str(hash(latex)))
            compile_latex_str(latex, filename)
            #if we can't find the parent, assume it's just the document
            parent: Tag
            if text_tag.parent is None or text_tag.parent.name == '[document]':
                parent = article.content
            else:
                parent = text_tag.parent
            tag_idx = parent.contents.index(text_tag)
            #replace the matched latex with a link tag
            begin, end = text_tag.split(match.group(0))
            #convert these strings to tags
            begin = bs4.NavigableString(begin)
            end = bs4.NavigableString(end)
            text_tag.replace_with(begin)
            #the latex compiler will automatically add a .pdf so we have to add one too
            link_tag = Tag(name='link',
                           attrs={'href': 'file://' + filename + '.pdf'})
            parent.insert(tag_idx + 1, link_tag)
            parent.insert(tag_idx + 2, end)
            #set the current tag to the new end tag
            text_tag = end
    return article
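As a quick check of the inline_regex above (a standalone sketch; the sample strings are made up):

import re

# Matches LaTeX wrapped in one or two dollar signs, as in compile_latex.
p = re.compile(r'\$?\$([^\$]+)\$\$?')
for sample in ('inline $x^2$ math', 'display $$\\int_0^1 f$$ math'):
    for m in p.finditer(sample):
        print(m.group(1))  # prints 'x^2', then '\int_0^1 f'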
Example #11
def replace_text_with_tag(sub_text: str, repl_tag: Tag,
                          text_tag: bs4.NavigableString,
                          article: Article) -> bs4.NavigableString:
    #if we can't find the parent, assume it's just the document
    parent: Tag
    if text_tag.parent is None or text_tag.parent.name == '[document]':
        parent = article.content
    else:
        parent = text_tag.parent
    tag_idx = parent.contents.index(text_tag)
    #replace the matched text with a tag
    begin, *rest = text_tag.split(sub_text, maxsplit=1)
    end: str
    if len(rest):
        end = rest[0]
    else:
        end = ""
    #convert these strings to tags
    begin = bs4.NavigableString(begin)
    end = bs4.NavigableString(end)
    text_tag.replace_with(begin)
    parent.insert(tag_idx + 1, repl_tag)
    parent.insert(tag_idx + 2, end)
    return end
Example #12
def parse_content(
    parent: Union[bs4.NavigableString, bs4.Tag, bs4.Comment]
) -> bs4.NavigableString:
    """parse_content convert a tag to a string with interpretting `<br>` and ignoring other tags.

    .. seealso::
        https://github.com/kmyk/online-judge-tools/issues/553
    """

    res = ''
    if isinstance(parent, bs4.Comment):
        pass
    elif isinstance(parent, bs4.NavigableString):
        return parent
    else:
        children = parent.contents
        if len(children) == 0:
            html_tag = str(parent)
            return (bs4.NavigableString('\n')
                    if 'br' in html_tag else bs4.NavigableString(''))
        else:
            for child in children:
                res += parse_content(child)
    return bs4.NavigableString(res)
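A minimal usage sketch for parse_content (only bs4 assumed; the markup is made up):

import bs4

soup = bs4.BeautifulSoup('<pre>1 2<br/>3 4</pre>', 'html.parser')
print(parse_content(soup.pre))  # prints '1 2' and '3 4' on separate lines: <br> became '\n'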
Example #13
File: views.py Project: baggids/qcat
    def range_to_table(self):
        """
        Cast the 'ranges' to a more basic format: wrap the parent container
        with a table, and cast the divs to tds.
        """
        for range_min in self.soup.select('.range_min'):
            range_container = range_min.parent.parent

            range_table = self.soup.new_tag('table')
            range_container.insert(0, range_table)

            for i, div in enumerate(range_container.select('div')):
                div.name = 'td'
                extracted = div.extract()
                range_table.insert(i, extracted)

        for selected in self.soup.select('.range_true'):
            selected.insert(0, bs4.NavigableString('x'))
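A minimal sketch of the in-place div-to-td cast and the 'x' fill used above (made-up markup; only bs4 assumed):

import bs4

soup = bs4.BeautifulSoup('<div class="range_true"></div>', 'html.parser')
cell = soup.div
cell.name = 'td'  # renaming the tag in place changes how it serializes
cell.insert(0, bs4.NavigableString('x'))
print(soup)  # <td class="range_true">x</td>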
Example #14
    def _parse_sample_tag(self, tag: bs4.Tag) -> Optional[Tuple[str, str]]:
        assert isinstance(tag, bs4.Tag)
        assert tag.name == 'pre'
        prv = utils.previous_sibling_tag(tag)
        pprv = tag.parent and utils.previous_sibling_tag(tag.parent)
        if (prv.name == 'h6' and tag.parent.name == 'div'
                and tag.parent['class'] == ['paragraph'] and pprv.name == 'h5'):
            log.debug('h6: %s', str(prv))
            log.debug('name.encode(): %s', prv.string.encode())

            # tag.string for the tag below returns None
            # - "<pre></pre>"
            # - "<pre>6<br />1 1<br />7 4<br />0 5<br />1 3<br />-8 9<br />5 1</pre>"
            # for more details, see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string
            if tag.string is not None:
                s = tag.string
            else:
                s = bs4.NavigableString(''.join(string + '\n'
                                                for string in tag.strings))

            return utils.textfile(s.lstrip()), pprv.string + ' ' + prv.string
        return None
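The comment above points at a real bs4 subtlety: Tag.string is None whenever a tag has more than one child. A small sketch with made-up markup:

import bs4

soup = bs4.BeautifulSoup('<pre>6<br/>1 1<br/>7 4</pre>', 'html.parser')
print(soup.pre.string)  # None: the <br/> tags split the text into several children
print(''.join(s + '\n' for s in soup.pre.strings))  # '6\n1 1\n7 4\n'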
Example #15
File: html.py Project: openzim/sotoki
    def __init__(self):
        self.domain_re = re.compile(rf"https?://{self.domain}(?P<path>/.+)")
        self.qid_slug_answer_re = re.compile(
            r"^(q|questions)/(?P<post_id>[0-9]+)/[^/]+/(?P<answer_id>[0-9]+)")
        self.qid_re = re.compile(r"^(q|questions)/(?P<post_id>[0-9]+)/?")
        self.aid_re = re.compile(r"^a/(?P<answer_id>[0-9]+)/?")
        self.uid_re = re.compile(r"^users/(?P<user_id>[0-9]+)/?")
        self.tid_re = re.compile(r"^questions/tagged/(?P<tag_id>[0-9]+)/?$")

        # supported internal paths (what we provide)
        # used to rule-out in-SE internal links we don't support
        self.supported_res = (
            re.compile(r"questions/tagged/.+"),
            re.compile(r"users/[0-9]+/.+"),
            re.compile(r"questions/[0-9]+/.+"),
            re.compile(r"a/[0-9]+/?$"),
            re.compile(r"users/profiles/[0-9]+.webp$"),
            re.compile(r"questions/?$"),
            re.compile(r"questions_page=[0-9]+$"),
            re.compile(r"users/?$"),
            re.compile(r"users_page=[0-9]+$"),
            re.compile(r"tags$"),
            re.compile(r"tags_page=[0-9]+$"),
            re.compile(r"api/tags.json$"),
            re.compile(r"about$"),
            re.compile(r"images/[0-9]+.webp$"),
        )

        self.redacted_string = bs4.NavigableString(self.redacted_text)
        # self.markdown = mistune.create_markdown(
        #     escape=False,
        #     plugins=[plugin_strikethrough, plugin_table, plugin_footnotes],
        # )
        if self.conf.censor_words_list:
            with open(self.conf.build_dir.joinpath("words.list"), "r") as fh:
                # This will replace occurrences of strings matching words in
                # the list, but those can be part of actual words or whole.
                self.words_re = re.compile(r"\b\b|\b\b".join(
                    map(re.escape, [line.strip() for line in fh.readlines()])))
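A sketch of the caveat that comment describes (hypothetical word list). Joining with r"\b\b|\b\b" puts word boundaries only between alternatives, so the first and last words can still match inside longer words:

import re

words_re = re.compile(r"\b\b|\b\b".join(map(re.escape, ["ham", "spam"])))
print(bool(words_re.search("graham")))    # True: 'ham' matches as a suffix
print(bool(words_re.search("spamming")))  # True: 'spam' matches as a prefix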
Example #16
def generate_report(results, output_path=None, force=False):
    '''
    .. versionadded:: 1.28

    .. versionchanged:: 1.29.1
        Only try to format results for tests that have data in the
        :data:`results` dictionary.

        Prior to version ``1.29.1``, this function would fail unless the
        :data:`results` dictionary contained data for **all tests** in
        :data:`ALL_TESTS`.

    .. versionchanged:: 1.54
        If output extension is ``.html``, output self-contained HTML report
        with ``<script id="results">...</script>`` tag containing JSON report
        results.


    Generate summary report of :func:`self_test` results either as Markdown or
    a Word document.

    Parameters
    ----------
    results : dict
        Results from :func:`self_test`.
    output_path : str, optional
        Report output path.

        If not specified, a text-only Markdown report is generated.

        If extension of output path is ``docx``, write output as Word document.

        If extension of output path is ``html``, write output as self-contained
        HTML report with ``<script id="results">...</script>`` tag containing
        JSON report results.

        Otherwise, output path is interpreted as a directory path and a
        Markdown file is written to the output directory, along with ``.png``
        images for test-related plots (where applicable).  Output directory
        will be created if it does not exist.
    force : bool, optional
        Overwrite output path if it exists.

    Returns
    -------
    str or None
        If :data:`output_path` is not specified, a text-only Markdown report is
        returned.

    Raises
    ------
    IOError
        If :data:`output_path` exists and :data:`force` is not ``True``.
    '''
    if output_path is not None:
        output_path = ph.path(output_path).realpath()
        if output_path.exists() and not force:
            if output_path.isdir() and output_path.listdir():
                # Output path is a directory with existing contents.
                raise IOError('Output directory already exists and is '
                              'non-empty.  Use `force` to overwrite.')
            elif output_path.ext.lower() == '.docx':
                raise IOError('Output path exists.  Use `force` to overwrite.')
            elif output_path.ext.lower() == '.html':
                raise IOError('Output path exists.  Use `force` to overwrite.')
            elif output_path.isfile():
                raise IOError('Output path exists and is a file.  Output path '
                              'must either be a directory or a filepath with '
                              'the `.docx` extension.')

    tests_with_figure = set([
        'test_channels', 'test_voltage', 'test_on_board_feedback_calibration'
    ])

    # Find starting time of earliest test (or current date and time if no
    # timestamp is available).
    min_timestamp = min([
        result_i['utc_timestamp']
        for result_i in six.itervalues(results) if 'utc_timestamp' in result_i
    ] + [dt.datetime.utcnow().isoformat()])
    header = ['# DropBot self test (*{}*)'.format(min_timestamp.split('.')[0])]

    if output_path is None:
        # Execute `format_<test name>_results` for each test to generate each
        # respective Markdown report.
        md_results_cmds = [
            'format_{test_name}_results(results["{test_name}"])'.format(
                test_name=name_i) for name_i in ALL_TESTS if name_i in results
        ]
        md_results = list(map(eval, md_results_cmds))

        # Join Markdown reports, separated by horizontal bars.
        md_report = (2 * '\n' + (72 * '-') + 2 * '\n').join(header +
                                                            md_results)

        # No output path was specified.  Return text-only Markdown report.
        return md_report

    if output_path.ext.lower() in ('.docx', '.html'):
        output_path.parent.makedirs_p()
        parent_dir = ph.path(tempfile.mkdtemp(prefix='dropbot-self-test'))
    else:
        parent_dir = output_path
        output_path.makedirs_p()

    markdown_path = parent_dir.joinpath('results-summary.markdown')

    try:
        # Execute `format_<test name>_results` for each test to generate each
        # respective Markdown report.
        md_results = [
            eval('format_{test_name}_results'.format(test_name=name_i))(
                results[name_i],
                **({
                    'figure_path': parent_dir.joinpath(name_i + '.png')
                } if name_i in tests_with_figure else {}))
            for name_i in ALL_TESTS if name_i in results
        ]

        # Join Markdown reports, separated by horizontal bars.
        md_report = (2 * '\n' + (72 * '-') + 2 * '\n').join(header +
                                                            md_results)

        with markdown_path.open('w') as output:
            output.write(md_report)

        if output_path.ext.lower() == '.docx':
            # Pass the argument list directly; with `shell=True` a POSIX shell
            # only receives the first list element.
            sp.check_call(['pandoc', markdown_path, '-o', output_path])
        elif output_path.ext.lower() == '.html':
            # Write template to file for use with `pandoc`.
            template = pkgutil.get_data(
                'dropbot',
                'static/templates/SelfTestTemplate.html5').decode('utf-8')
            template_path = parent_dir.joinpath('SelfTestTemplate.html5')
            template_path.write_text(template)
            # Use `pandoc` to create self-contained `.html` report.
            sp.check_call([
                'pandoc', markdown_path, '-o', output_path, '--standalone',
                '--self-contained', '--template', template_path
            ], stderr=sp.PIPE)
            with output_path.open('r') as input_:
                data = input_.read()

            # Inject JSON result data into HTML report.
            soup = bs4.BeautifulSoup(data, 'lxml')
            results_script = soup.select_one('script#results')
            # Format JSON with indents.  Works around [`json_tricks`
            # issue][i51].
            #
            # [i51]: https://github.com/mverleg/pyjson_tricks/issues/51
            json_data = json_tricks.dumps(results, indent=4)
            results_script.string = bs4.NavigableString(json_data)
            with output_path.open('w') as output:
                output.write(str(soup))
    finally:
        if output_path.ext.lower() in ('.docx', '.html'):
            parent_dir.rmtree()
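A hedged usage sketch following the docstring above (the results dict comes from self_test; the output filename is made up):

# Text-only Markdown report:
md_report = generate_report(results)

# Self-contained HTML report, overwriting an existing file:
generate_report(results, output_path='self-test-report.html', force=True)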
Example #17
    def createTextNode(self, data):
        return Text(self, BeautifulSoup.NavigableString(data))
Example #18
    def createTextNode(self, data):
        from .Text import Text
        return Text(self, bs4.NavigableString(data))
Example #19
    def createTextNode(self, data):
        from .Text import Text
        return Text(self, BeautifulSoup.NavigableString(data))
Example #20
File: w3c.py Project: sumsung007/xspider
    def createTextNode(self, data):
        return Text(self, bs4.NavigableString(data))
Example #21
def sort_mods(mods: bs4.Tag):
    return intersperse(
        sorted(mods.ul.find_all('li'), key=lambda x: x.text.lower()),
        bs4.NavigableString('\n'))
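intersperse is a project-local helper that is not shown; a minimal sketch consistent with how sort_mods and Example #1 use its result (a list with the separator between consecutive items):

def intersperse(items, separator):
    # Place `separator` between consecutive elements of `items`.
    result = []
    for i, item in enumerate(items):
        if i:
            result.append(separator)
        result.append(item)
    return result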
Example #22
    def postprocess_soup(self, soup):
        """
        For each instance of a glossary word, replace it with a glossary tag
        """

        # Only run this if there is indeed a glossary
        if len(self.terms.keys()) == 0: return
        term_list = "|".join(self.terms)

        # TODO: there is at least one rendering issue with two glossary
        # elements inside an <em> block that is rendering wrong.

        # Only look in the main body, skip scripts, TOC, etc
        main = soup.find("div", {"id": "MAIN"})

        term_search = re.compile(
            r"^(|.*?[\s(\"'])(%s)(|[\s.?!\"',)].*)$" % term_list, re.MULTILINE)
        if main is None:
            return

        for string in list(main.strings):
            if string.parent.name == "code": continue  # skip code fragments
            if string.parent.name == "a": continue  # skip hyperlinks
            # TODO: can I do this smarter by explicitly tagging td elements for the dictionary? The code seems to work,
            # though I want to make sure that this isn't hurting build perf.
            skipit = False
            for parent in string.parents:
                if parent is None: continue
                if parent.attrs is None: continue
                if ("id" in parent.attrs
                        and parent.attrs["id"] == "table-glossary"):
                    skipit = True
                    break
                if ("class" in parent.attrs
                        and "definition_popup" in parent.attrs["class"]):
                    # Skip hyperlinks for the popup windows
                    skipit = True
                    break
            if skipit: continue

            found_match = False
            text = str(string)
            while True:
                m = term_search.search(text)
                if not m:
                    break

                found_match = True
                string.insert_before(bs4.NavigableString(m.group(1)))
                # Replace the term search with the rest of the string so we can iterate on terms
                text = m.group(3)
                if self.visible:
                    ahref = soup.new_tag("a", href='#glossary_%s' % m.group(2))
                    ahref.string = m.group(2)
                    ahref.attrs["class"] = "glossary_link"
                    ahref.attrs["data-show"] = "define_%s" % m.group(2)
                    string.insert_before(ahref)
                else:
                    span = soup.new_tag("span")
                    span.string = m.group(2)
                    span.attrs["class"] = "glossary_link"
                    span.attrs["data-show"] = "define_%s" % m.group(2)
                    string.insert_before(span)

            if found_match:
                # Put whatever remains at the end and extract the original string
                string.insert_before(bs4.NavigableString(text))
                string.extract()
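A quick check of the term_search pattern above, with a tiny made-up term list:

import re

term_search = re.compile(
    r"^(|.*?[\s(\"'])(%s)(|[\s.?!\"',)].*)$" % "soup|tag", re.MULTILINE)
m = term_search.search("Parse the soup carefully.")
print(m.groups())  # ('Parse the ', 'soup', ' carefully.')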
Example #23
    def _createHtml(self, soup):
        child = self.getChild().createHtml(soup)
        assert assertType(child, list)
        return ([bs4.NavigableString(f"{{{{#{self.field}}}}}")] + child +
                [bs4.NavigableString(f"{{{{/{self.field}}}}}")])
Example #24
def parse(args, target, soup, workdir, out_dir):
    source_info = {"type": "script", "file": __file__, "children": []}

    tile_data = []
    for filename_raw in args[1:]:
        parts = filename_raw.split("#")
        filename_actual = parts[0]
        if len(parts) > 1:
            use_ids_raw = parts[1].split(",")
        else:
            use_ids_raw = None

        data_items = {}

        filename = os.path.join(workdir, filename_actual)
        with open(filename, "r") as f:
            data = json.load(f)
            for item in data:
                if use_ids_raw is not None:
                    data_items[item["id"]] = item
                else:
                    tile_data.append(item)

        if use_ids_raw is not None:
            for n in use_ids_raw:
                if n.count("-") == 1:
                    bits = n.split("-")
                    min_ = int(bits[0])
                    max_ = int(bits[1])

                    if max_ >= min_:
                        for i in range(min_, max_ + 1):
                            tile_data.append(data_items[i])
                    else:
                        for i in range(min_, max_ - 1, -1):
                            tile_data.append(data_items[i])
                else:
                    tile_data.append(data_items[int(n)])

        source_info["children"].append({
            "type": "source",
            "file": filename,
            "children": []
        })

    nav = soup.new_tag("div")
    nav["class"] = "pagelist-nav"
    target.append(nav)

    for item in tile_data:
        entry = soup.new_tag("div")
        entry["class"] = "pagelist-entry"

        # Clickable title
        title = soup.new_tag("div")
        title["class"] = "pagelist-entry-title"
        title_a = soup.new_tag("a", href=item["click"])
        title_a.string = item["title"]
        title.append(title_a)
        entry.append(title)

        # Tag and link bubbles
        if "tags" in item:
            tag_container = soup.new_tag("div")
            tag_container["class"] = "pagelist-tag-container"

            if "tags" in item:
                for name, text in item["tags"]:
                    tag_el = soup.new_tag("span")
                    tag_el["class"] = "tag"
                    tag_el["data-name"] = name

                    if name in tag_colors:
                        tag_el["style"] = "background-color: " + tag_colors[
                            name][0] + "; color: " + tag_colors[name][1] + ";"

                    tag_el.string = text

                    tag_container.append(tag_el)
                    tag_container.append(bs4.NavigableString(" "))

            entry.append(tag_container)

        # Body
        if "body" in item:
            body = soup.new_tag("div")
            body["class"] = "pagelist-entry-desc"
            body_content = bs4.BeautifulSoup(item["body"], "lxml")
            body.extend(body_content.body.contents)
            entry.append(body)

        target.append(entry)

    return source_info
Example #25
def fetch(url, proxies=None, timeout=20):
    fetch_start_time = time.time()
    if proxies is None:
        proxies = {}

    resp = requests.get(url, proxies=proxies, timeout=timeout)
    # print(resp.text)
    # print(resp.status_code)
    soup = BeautifulSoup(resp.content, 'html5lib')
    # print(soup.title)
    # print(soup.h1.text.strip())
    # print(soup.h2.text.strip())
    h1 = soup.h1
    if h1 is None:
        return False, 'failed to find title'
    h1_hidden_part = h1.find(class_='u-hiddenVisually')
    if h1_hidden_part:
        h1_hidden_part.decompose()
    name = h1.text.strip()
    at_raw = soup.h2.text.strip()
    at = at_raw[1:]
    # if not at_raw.startswith('@'):
    #     return False, 'failed to extract at: {}'.format(at_raw)

    item_container = soup.find(id='stream-items-id')
    # if item_container is None:
    #     return False, 'failed to get item container'
    items = item_container.find_all('li', recursive=False)
    index = None
    items_parsed = []
    for index, item in enumerate(items):

        fullname = item.find(class_='fullname').text.strip()
        username = item.find(class_='username').text.strip().replace('@', '')

        is_retweet = item.find(class_='js-retweet-text') is not None

        # link
        a_link = item.find('a', class_=['twitter-timeline-link', 'u-hidden'])
        link = a_link.get('href')
        a_link.decompose()

        # time
        time_node = item.find(lambda tag: 'data-time-ms' in tag.attrs)
        timestamp_ms = int(time_node.get('data-time-ms'))

        # hashtag
        for hashtag in item.find_all('a', class_='twitter-hashtag'):
            hash_text = hashtag.text.strip()
            hash_relative_href = hashtag.get('href')
            hash_link = urllib.parse.urljoin(
                'http://twitter.com/', hash_relative_href)

            new_tag = soup.new_tag('a', href=hash_link,
                                   target='_blank', rel='noopener')
            new_tag.string = hash_text

            hashtag.replace_with(new_tag)

        # emoji
        for emoji_img in item.find_all('img', class_=('Emoji', 'Emoji--forText')):
            alt = emoji_img.get('alt')
            if alt:
                text_tag = bs4.NavigableString(alt)
                emoji_img.replace_with(text_tag)

        # atreply
        for atreply in item.find_all('a', class_='twitter-atreply'):
            href = atreply.get('href')
            reply_link = urllib.parse.urljoin('http://twitter.com/', href)
            reply_text = atreply.text.strip()

            new_tag = soup.new_tag('a', href=reply_link,
                                   target='_blank', rel='noopener')
            new_tag.string = reply_text

            atreply.replace_with(new_tag)

        text_container = item.find(class_='js-tweet-text-container')
        content = str(text_container)
        content_md = html2text.html2text(content, bodywidth=0).rstrip()
        items_parsed.append({
            'id': link.split('/')[-1],
            'name': fullname,
            'at': username,
            'timestamp_ms': timestamp_ms,
            'content_md': content_md,
            'content': content,
            'link': link,
            'retweet': is_retweet,
        })

    fetch_end_time = time.time()
    fetch_duration = fetch_end_time - fetch_start_time
    return {
        'url': url,
        'name': name,
        'at': at,
        'items': items_parsed,
        '_fetch_start_time': fetch_start_time,
        '_fetch_end_time': fetch_end_time,
        '_fetch_duration': fetch_duration,
    }
Example #26
    def GenerateHTML(self, controller, minify=False, prettify=False):
        soup = _CreateSoupWithoutHeadOrBody(str(self._soup))

        # Remove the doctype.
        for x in soup.contents:
            if isinstance(x, bs4.Doctype):
                x.extract()

        # Remove declaration.
        for x in soup.contents:
            if isinstance(x, bs4.Declaration):
                x.extract()

        # Remove all imports.
        imports = soup.findAll('link', rel='import')
        for imp in imports:
            imp.extract()

        # Remove all script links.
        scripts_external = soup.findAll('script', src=True)
        for script in scripts_external:
            script.extract()

        # Remove all in-line scripts.
        scripts_inline = soup.findAll('script', src=None)
        for script in scripts_inline:
            script.extract()

        # Process all in-line styles.
        inline_styles = soup.findAll('style')
        for style in inline_styles:
            html = controller.GetHTMLForInlineStylesheet(str(style.string))
            if html:
                ns = soup.new_tag('style')
                ns.append(bs4.NavigableString(html))
                style.replaceWith(ns)
            else:
                style.extract()

        # Rewrite all external stylesheet hrefs or remove, as needed.
        stylesheet_links = soup.findAll('link', rel='stylesheet')
        for stylesheet_link in stylesheet_links:
            html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
            if html:
                tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
                assert len(tmp) == 1
                stylesheet_link.replaceWith(tmp[0])
            else:
                stylesheet_link.extract()

        # Remove comments if minifying.
        if minify:
            comments = soup.findAll(
                text=lambda text: isinstance(text, bs4.Comment))
            for comment in comments:
                comment.extract()
        if prettify:
            return soup.prettify('utf-8').strip()

        # We are done.
        return str(soup).strip()
Example #27
File: html.py Project: openzim/sotoki
    def redact_link(self, link):
        for attr in ("href", "title"):
            if attr in link.attrs:
                del link.attrs[attr]
        link.contents = [bs4.NavigableString("[redacted]")]
Example #28
def parse(args, target, soup, workdir, out_dir):
    source_info = {"type": "script", "file": __file__, "children": []}

    tile_data = []
    for filename_raw in args[1:]:
        parts = filename_raw.split("#")
        filename_actual = parts[0]
        if len(parts) > 1:
            use_ids_raw = parts[1].split(",")
        else:
            use_ids_raw = None

        data_items = {}

        filename = os.path.join(workdir, filename_actual)
        with open(filename, "r") as f:
            data = json.load(f)
            for item in data:
                if use_ids_raw is not None:
                    data_items[item["id"]] = item
                else:
                    tile_data.append(item)

        if use_ids_raw is not None:
            for n in use_ids_raw:
                if n.count("-") == 1:
                    bits = n.split("-")
                    min_ = int(bits[0])
                    max_ = int(bits[1])

                    if max_ >= min_:
                        for i in range(min_, max_ + 1):
                            tile_data.append(data_items[i])
                    else:
                        for i in range(min_, max_ - 1, -1):
                            tile_data.append(data_items[i])
                else:
                    tile_data.append(data_items[int(n)])

        source_info["children"].append({
            "type": "source",
            "file": filename,
            "children": []
        })

    for item in tile_data:
        entry = soup.new_tag("div", id=("post." + str(item["id"])))
        entry["class"] = "tile"

        # Clickable image
        img = soup.new_tag("img")
        img["class"] = "tile-image"
        if "image" in item:
            real_image_loc = os.path.join(workdir, item["image"])
            img["alt"] = "Depiction of '" + item["title"] + "'"
        else:
            real_image_loc = "images/projects/placeholder.png"
            img["alt"] = "Placeholder image"

        img["src"] = os.path.relpath(real_image_loc, start=out_dir)
        entry_image = Image.open(os.path.normpath(real_image_loc))
        img["width"], img["height"] = entry_image.size

        if "click" in item:
            img_a = soup.new_tag("a", href=item["click"])
            img_a.append(img)
            entry.append(img_a)
        else:
            entry.append(img)

        # Date
        if "time" in item:
            when = datetime.utcfromtimestamp(item["time"])
            when_str = when.strftime("%b %e, %Y")

            when_div = soup.new_tag("div")
            when_div["class"] = "tile-date"
            when_div.string = when_str
            entry.append(when_div)

        # Title
        title = soup.new_tag("div")
        title["class"] = "tile-title"
        title.string = item["title"]
        entry.append(title)

        # Tag and link bubbles
        if ("tags" in item) or ("links" in item):
            tag_container = soup.new_tag("div")
            tag_container["class"] = "tile-tag-container"

            if "tags" in item:
                for name, text in item["tags"]:
                    tag_el = soup.new_tag("span")
                    tag_el["class"] = "tag"

                    if name in tag_colors:
                        tag_el["style"] = ("background-color: " + tag_colors[name][0]
                                           + "; color: " + tag_colors[name][1] + ";")

                    tag_el.string = text

                    tag_container.append(tag_el)
                    tag_container.append(bs4.NavigableString(" "))

            if "links" in item:
                for props in item["links"]:
                    text = props["text"]
                    dest = props["dest"]

                    link_el = soup.new_tag("a")
                    link_el["class"] = "tag"
                    link_el["href"] = dest

                    icon = soup.new_tag("img")
                    icon[
                        "src"] = "../icons/link.png"  #material icon (https://material.io/tools/icons/) (under https://www.apache.org/licenses/LICENSE-2.0.html)
                    icon["alt"] = "link: "
                    icon["class"] = "tag-link-icon"
                    link_el.append(icon)

                    text_el = soup.new_tag("span")
                    text_el.string = text
                    link_el.append(text_el)

                    tag_container.append(link_el)
                    tag_container.append(bs4.NavigableString(" "))

            entry.append(tag_container)

        # Body
        body = soup.new_tag("div")
        body["class"] = "tile-body"
        body_content = bs4.BeautifulSoup(item["body"], "lxml")
        body.extend(body_content.body.contents)
        entry.append(body)

        target.append(entry)

    # Add some dummy spacers to make everything display as desired
    for i in range(10):
        spacer = soup.new_tag("div")
        spacer["class"] = "tile-spacer"
        target.append(spacer)

    return source_info