def get_item(item_node, test):
    serial_number = item_node.findall('serialNumber')[0].text
    vuln_id_from_tool = item_node.findall('type')[0].text
    url = item_node.get('url')
    path = item_node.findall('path')[0].text
    location = item_node.findall('location')[0].text
    rparameter = re.search(r"(?<=\[)(.*)(\])", location)
    parameter = None
    if rparameter:
        parameter = rparameter.group(1)

    unsaved_req_resp = list()
    for request_response in item_node.findall('./requestresponse'):
        request = get_clean_base64(request_response.findall('request')[0].text)
        response = get_clean_base64(request_response.findall('response')[0].text)
        unsaved_req_resp.append({"req": request, "resp": response})

    collab_text = ""
    for event in item_node.findall('./collaboratorEvent'):
        collab_details = list()
        collab_details.append(event.findall('interactionType')[0].text)
        collab_details.append(event.findall('originIp')[0].text)
        collab_details.append(event.findall('time')[0].text)
        if collab_details[0] == 'DNS':
            collab_details.append(event.findall('lookupType')[0].text)
            collab_details.append(event.findall('lookupHost')[0].text)
            collab_text += "The Collaborator server received a " + collab_details[0] + \
                " lookup of type " + collab_details[3] + \
                " for the domain name " + collab_details[4] + \
                " at " + collab_details[2] + \
                " originating from " + collab_details[1] + ". "
        for request_response in event.findall('./requestresponse'):
            request = get_clean_base64(request_response.findall('request')[0].text)
            response = get_clean_base64(request_response.findall('response')[0].text)
            unsaved_req_resp.append({"req": request, "resp": response})
        if collab_details[0] == 'HTTP':
            collab_text += "The Collaborator server received an " + \
                collab_details[0] + " request at " + collab_details[2] + \
                " originating from " + collab_details[1] + ". "

    text_maker = html2text.HTML2Text()
    text_maker.body_width = 0

    background = do_clean(item_node.findall('issueBackground'))
    if background:
        background = text_maker.handle(background)

    detail = do_clean(item_node.findall('issueDetail'))
    if detail:
        detail = text_maker.handle(detail)
    if collab_text:
        detail = text_maker.handle(detail + '<p>' + collab_text + '</p>')

    remediation = do_clean(item_node.findall('remediationBackground'))
    if remediation:
        remediation = text_maker.handle(remediation)

    remediation_detail = do_clean(item_node.findall('remediationDetail'))
    if remediation_detail:
        remediation = text_maker.handle(remediation_detail + "\n") + remediation

    references = do_clean(item_node.findall('references'))
    if references:
        references = text_maker.handle(references)

    severity = item_node.findall('severity')[0].text
    if "information" == severity.lower():
        severity = "Info"

    scanner_confidence = item_node.findall('confidence')[0].text
    if scanner_confidence:
        if scanner_confidence == "Certain":
            scanner_confidence = 1
        elif scanner_confidence == "Firm":
            scanner_confidence = 4
        elif scanner_confidence == "Tentative":
            scanner_confidence = 7

    host_node = item_node.findall('host')[0]
    url_host = host_node.text
    path = item_node.findall('path')[0].text

    # Finding and Endpoint objects returned have not been saved to the database
    finding = Finding(title=item_node.findall('name')[0].text,
                      url=url,
                      test=test,
                      severity=severity,
                      param=parameter,
                      scanner_confidence=scanner_confidence,
                      description="URL: " + url_host + path + "\n\n" + detail + "\n",
                      mitigation=remediation,
                      references=references,
                      false_p=False,
                      duplicate=False,
                      out_of_scope=False,
                      mitigated=None,
                      dynamic_finding=True,
                      impact=background,
                      unique_id_from_tool=serial_number,
                      vuln_id_from_tool=vuln_id_from_tool)
    finding.unsaved_req_resp = unsaved_req_resp

    # manage endpoint
    protocol = urlparse(url_host).scheme
    host = urlparse(url_host).netloc
    port = 80
    if protocol == 'https':
        port = 443
    if urlparse(url_host).port is not None:
        port = urlparse(url_host).port
    finding.unsaved_endpoints = [
        Endpoint(protocol=protocol, host=host, port=port,
                 path=path, query=None, fragment=None)
    ]

    # manage cwes
    cwes = do_clean_cwe(item_node.findall('vulnerabilityClassifications'))
    if len(cwes) > 1:
        # FIXME support more than one CWE
        logger.warning(
            f"more than one CWE for a finding {cwes}. NOT supported by parser API"
        )
    if len(cwes) > 0:
        finding.cwe = cwes[0]

    return finding
def get_html_from_blog(self, blog, rule):
    s = requests.session()
    r = s.get(blog.url, headers=headers)
    if rule['encoding'] is not None:
        r.encoding = 'utf-8'
    # Get the text content
    html = r.text
    soup = BeautifulSoup(html, 'lxml')

    if False:
        # Add code tags
        html = re.sub('<code.*?>', '<code>```\n', html)
        html = re.sub('</code>', '```\n</code>', html)

    with open('temp.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # Extract the title with a regex
    title_pattern = rule['title_pattern']
    titles = re.findall(title_pattern, html, re.DOTALL)
    if len(titles) == 0:
        title = 'default'
    else:
        title = pangu.spacing_text(titles[0])
    blog.title = title
    print('标题:', title)

    if rule['content_type'] == 'bs':
        content = soup.select(rule['content_pattern']).pop()
        content = str(content)
    else:
        # Extract the body content
        content_pattern = rule['content_pattern']
        contents = re.findall(content_pattern, html, re.DOTALL)
        if len(contents) == 0:
            content = ''
        else:
            content = contents[0]
    content = '<h1><a href="{}">{}</a></h1><br><br>'.format(
        blog.url, blog.title) + content
    for src, dst in rule['content_replaces']:
        content = re.sub(src, dst, content)
    blog.content = content
    # print('正文:', content)

    # Convert to Markdown
    # md_content = Tomd(content).markdown
    # content = re.sub('<a id=".*?"></a>', '', content)
    text_maker = ht.HTML2Text()
    md_content = text_maker.handle(content)

    # Remove blank lines
    md_content = md_content.replace('\r', '')
    while ' \n' in md_content:
        md_content = md_content.replace(' \n', '\n')
    # md_content = md_content.replace('\n', '\n\n')
    while '\n\n\n' in md_content:
        md_content = md_content.replace('\n\n\n', '\n\n')
    # print(' MD:', md_content)

    # Regex replacements
    for src, dst in rule['md_replaces']:
        md_content = re.sub(src, dst, md_content)

    # Add spacing between CJK and Latin characters
    md_content = pangu.spacing_text(md_content)

    # Trim whitespace inside * and ** emphasis marks
    for star_line in re.findall('\*(.*?)\*', md_content):
        md_content = md_content.replace('{}'.format(star_line),
                                        '{}'.format(star_line.strip()))
    # Fix abnormal line breaks
    md_content = re.sub('-\n', '-', md_content)
    # Normalize code tags
    # md_content = re.sub('[ ]```', '```', md_content)

    # Strip characters that are invalid in file names
    title = re.sub('[\/:*?"<>|]', '-', title)
    with open("blogs" + os.sep + title + '.md', 'w', encoding='utf-8') as f:
        f.write(md_content)
    pass
def msg_to_markdown(repo, msg):
    def absurl(url):
        if not url.startswith('http:/') and not url.startswith('https:'):
            slash = '' if settings.BASE_URL.endswith('/') or url.startswith('/') else '/'
            return settings.BASE_URL + slash + url
        return url

    # Need a map of content id -> attachment
    all_attachments = list(msg.attachment_set.all())
    attachments_map = {}
    for att in all_attachments:
        if att.content_id:
            attachments_map[att.content_id] = att

    # Attempt to update img elements pointing to an attachment
    attachments_observed = set()
    if msg.body_html:
        soup = BeautifulSoup(msg.body_html, 'html.parser')
        for img in soup.find_all('img'):
            src = img.attrs.get('src')
            if not src or not src.startswith('cid:'):
                continue
            att = attachments_map.get(src.replace('cid:', ''))
            if att:
                img['src'] = att.file.url
                attachments_observed.add(att)
        h = html2text.HTML2Text(bodywidth=0)
        msg_body = h.handle(str(soup))
    else:
        msg_body = msg.body_text

    # Look for attachments we didn't display inline
    attachments = list(att for att in all_attachments
                       if att not in attachments_observed)
    if attachments:
        attachments_text = u'\n\n\n\n---\n*Attachments:*\n\n'
        for att in attachments:
            url = att.file.url
            filename = os.path.basename(att.file.name)
            inline_img = ''
            if filename.lower().split('.')[-1] in ('png', 'gif', 'jpeg', 'jpg', 'svg'):
                inline_img = u'\n ![]({})\n'.format(url)
            attachments_text += u'1. [{}]({}){}\n'.format(filename, url, inline_img)
    else:
        attachments_text = ''

    # See if we recognize this email address
    map_entry = repo.emailmap_set.filter(email__iexact=msg.from_email).first()
    if map_entry:
        tag = '@' + map_entry.login
    else:
        tag = msg.from_name

    return u'*Sent by {} ({}). Created by [fire]({}/).*\n\n---\n{}{}'.format(
        tag,
        msg.from_email,
        settings.BASE_URL,
        msg_body,
        attachments_text,
    )
def get_output_html2text(input_data):
    h = html2text.HTML2Text()
    h.ignore_links = True
    result = h.handle(str(input_data))
    return "".join(result)
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
) -> None:
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            mattermost_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(mattermost_mention, zulip_mention)

        content = content.replace('@channel', '@**all**')
        content = content.replace('@all', '@**all**')
        # We don't have an equivalent for Mattermost's @here mention which mentions all users
        # online in the channel.
        content = content.replace('@here', '@**all**')
        return content

    mention_map: Dict[int, Set[int]] = {}
    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    pm_members = {}

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']
        sender_user_id = raw_message['sender_id']
        if "channel_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["channel_name"], Recipient.STREAM)
        elif "huddle_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["huddle_name"], Recipient.HUDDLE)
        elif "pm_members" in raw_message:
            members = raw_message["pm_members"]
            member_ids = {user_id_mapper.get(member) for member in members}
            pm_members[message_id] = member_ids
            if sender_user_id == user_id_mapper.get(members[0]):
                recipient_id = get_recipient_id_from_receiver_name(
                    members[1], Recipient.PERSONAL)
            else:
                recipient_id = get_recipient_id_from_receiver_name(
                    members[0], Recipient.PERSONAL)
        else:
            raise AssertionError(
                "raw_message without channel_name, huddle_name or pm_members key"
            )

        rendered_content = None

        topic_name = 'imported from mattermost'

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=sender_user_id,
            has_attachment=False,
        )
        zerver_message.append(message)
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, user_id_mapper, zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(message_json, output_dir, message_file)
def _process_webinspect(self):
    for session in self.webinspect.getchildren():
        hostname = session.Host.text
        port = session.Port.text
        service_data = {}
        if port:
            service_data['port'] = port
        path = session.Request.Path.text
        query = session.Request.FullQuery.text
        method = session.Request.Method.text
        request = ''
        if session.RawRequest.text:
            request = base64.b64decode(session.RawRequest.text)
        response = ''
        if session.RawResponse.text:
            response = base64.b64decode(session.RawResponse.text)
        status_code = session.Response.StatusCode.text
        for issues in session.Issues:
            for issue_data in issues.getchildren():
                params = ''
                check_type = issue_data.CheckTypeID
                if check_type.text.lower() != 'vulnerability':
                    # TODO: when plugins accept tags, we should add this as a tag.
                    pass
                name = issue_data.Name.text
                external_id = issue_data.VulnerabilityID.text
                faraday_severities = {
                    0: 'info',
                    1: 'low',
                    2: 'med',
                    3: 'high',
                    4: 'critical'
                }
                severity = faraday_severities[issue_data.Severity]
                references = []
                try:
                    classifications = issue_data.Classifications.getchildren()
                except AttributeError:
                    classifications = []
                for classification in classifications:
                    references.append(classification.text)

                # Build description
                description = u''
                for report_section in issue_data.findall('./ReportSection'):
                    description += u'{} \n'.format(report_section.Name.text)
                    description += u'{} \n'.format(report_section.SectionText.text)
                description += u'{} \n'.format(issue_data.get('id'))
                h = html2text.HTML2Text()
                description = h.handle(description)

                for repro_step in issue_data.findall('./ReproSteps'):
                    step = repro_step.ReproStep
                    if step is not None:
                        try:
                            params = step.PostParams.text
                        except AttributeError:
                            pass
                        if not hostname:
                            # This seems to be a mobile app
                            hostname = session.URL.text
                        if not port:
                            service_data['name'] = step.Url.text
                            service_data['port'] = step.sourceline

                self.sast_vulns.append({
                    "host": hostname,
                    "severity": severity,
                    "service": service_data,
                    "name": name,
                    "description": description,
                    "external_id": external_id,
                    "references": references,
                    "method": method,
                    "query": query,
                    "response": response,
                    "request": request,
                    "path": path,
                    "params": params,
                    "status_code": status_code,
                    "website": session.URL.text
                })
from email import generator, message_from_string
from email.message import Message
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.message import MIMEMessage
from email.utils import formatdate
import email.charset  # needed for the add_charset/Charset calls below
import mimetypes
import os

import html2text

from . import _compat as compat
from .utils import sanitize_address, forbid_multi_line_headers, make_msgid

textify = html2text.HTML2Text()

# Don't BASE64-encode UTF-8 messages
email.charset.add_charset('utf-8', email.charset.SHORTEST, None, 'utf-8')

utf8_charset = email.charset.Charset('utf-8')
utf8_charset.body_encoding = None  # Python defaults to BASE64

# Default MIME type to use on attachments (if it is not explicitly given
# and cannot be guessed).
DEFAULT_ATTACHMENT_MIME_TYPE = 'application/octet-stream'


class MIMEMixin(object):
    def as_string(self, unixfrom=False):
        """Return the entire formatted message as a string.

        Optional `unixfrom' when True, means include the Unix From_ envelope
#!/usr/bin/env python3

import html2text
import json
import os.path
import random
import re
import requests
import string
import sys
import time
import webbrowser

reader = html2text.HTML2Text()
reader.ignore_links = True


def create_email(username='', secure=False):
    """
    generates an email based on optional parameters

    returns username, domain
    """
    domains = [
        'esiix.com', 'wwjmp.com', '1secmail.com', '1secmail.org',
        '1secmail.net'
    ]
    domain = random.choice(domains)

    if secure:
        return ''.join(
            random.choices(string.ascii_lowercase + string.digits,
                           k=20)), domain
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str,
                              zerver_realmemoji: List[Dict[str, Any]],
                              total_reactions: List[Dict[str, Any]],
                              ) -> None:
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            mattermost_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(mattermost_mention, zulip_mention)

        content = content.replace('@channel', '@**all**')
        content = content.replace('@all', '@**all**')
        # We don't have an equivalent for Mattermost's @here mention which mentions all users
        # online in the channel.
        content = content.replace('@here', '@**all**')
        return content

    mention_map = dict()  # type: Dict[int, Set[int]]
    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    name_to_codepoint = get_name_to_codepoint_dict()
    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s' % (len(content),))
            continue

        pub_date = raw_message['pub_date']
        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug("Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        topic_name = 'imported from mattermost'
        user_id = raw_message['sender_id']

        message = build_message(
            content=content,
            message_id=message_id,
            pub_date=pub_date,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=False,
        )
        zerver_message.append(message)
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, name_to_codepoint, user_id_mapper,
                        zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = "/messages-%06d.json" % (dump_file_id,)
    create_converted_data_files(message_json, output_dir, message_file)
def begin_format1():
    sched_wb_path = 'data/restore_file.xlsx'  # path to excel workbook for import back into Sched
    paper_wb_path = 'data/CNS_2020_Paper-2020-06-11.xlsx'  # path to CNS papers workbook
    user_wb_path = 'data/CNS_2020_User-2020-06-11.xlsx'  # path to CNS users workbook

    # load excel workbooks for processing
    sched_wb = load_workbook(filename=sched_wb_path)
    paper_wb = load_workbook(filename=paper_wb_path)
    user_wb = load_workbook(filename=user_wb_path)

    # setup html2text options
    h = html2text.HTML2Text()

    # setup parameters below carefully to ensure things are inserted correctly in Sched
    sched_first_row = 13  # begin inserting sessions in this row

    # regular expressions to format author names
    p = re.compile('\s\(#[0-9]*\)')  # for formatting author names
    p_abstract = re.compile('\\n')  # for formatting abstract

    last_session_id = 4  # ID of the last session in the excel sheet
    paper_ws = paper_wb['Tablib Dataset']  # access paper worksheet
    sched_ws = sched_wb['Sessions']  # access sessions worksheet

    poster_count = 0  # keep track of count of posters to assign them to separate time slots
    poster_start_day = 19  # start day for poster sessions
    poster_start_time = 7  # start time for poster sessions

    for row in paper_ws.iter_rows(min_row=2):  # ignore the first row that contains column names
        paper_type = row[3].value
        if paper_type != 'Rejected':
            # extract information about the paper
            paper_ID = str(last_session_id + 1)
            paper_abstract = format_html_abstract(h, row[10].value, p_abstract)
            paper_title = row[9].value
            paper_author = format_author(row[1].value, p)

            # add paper details to sched workbook
            sched_ws['A%d' % sched_first_row] = paper_ID
            sched_ws['B%d' % sched_first_row] = paper_title
            sched_ws['C%d' % sched_first_row] = 'Y'
            sched_ws['C%d' % sched_first_row].alignment = Alignment(horizontal='center')
            sched_ws['D%d' % sched_first_row] = 'N'
            sched_ws['J%d' % sched_first_row] = paper_abstract
            sched_ws['K%d' % sched_first_row] = paper_author
            sched_ws['P%d' % sched_first_row] = 'TBA'

            if paper_type == 'Accepted':
                sched_ws['G%d' % sched_first_row] = 'Poster'
                # start and end times for posters
                poster_count = poster_count + 1
                if poster_count % 120 == 0:
                    poster_start_day = poster_start_day + 1
                    poster_start_time = 7
                elif poster_count % 40 == 0:
                    poster_start_time = poster_start_time + 1
                sched_ws['E%d' % sched_first_row] = '7/%d/2020 %d:00 PM' % (
                    poster_start_day, poster_start_time)  # start time for posters
                sched_ws['F%d' % sched_first_row] = '7/%d/2020 %d:00 PM' % (
                    poster_start_day, poster_start_time + 1)  # end time for posters
                sched_ws['F%d' % sched_first_row].alignment = Alignment(horizontal='right')
            elif paper_type == 'AcceptedOral':
                sched_ws['G%d' % sched_first_row] = 'Oral'
            elif paper_type == 'AcceptedFeatured':
                sched_ws['G%d' % sched_first_row] = 'Featured Talk'

            sched_first_row = sched_first_row + 1
            last_session_id = last_session_id + 1

    sched_wb.save(sched_wb_path)
def item_description(self, item):
    h = html2text.HTML2Text()
    excerpt = h.handle(str(item.body)).split('\n\n')[0]
    return excerpt + "..."
def html2text(data, method='lynx', options=None):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
                        options: see "lynx -help" output for options that work with "-dump"
     'html2text'      - Use "html2text -nobs" for conversion
                        options: https://linux.die.net/man/1/html2text
     'bs4'            - Use Beautiful Soup library to prettify the HTML
                        options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                        http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text"
                        options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options
    """
    if options is None:
        options = {}

    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines()
                       if l.strip() != ''))
        return d

    if method == 'pyhtml2text':
        import html2text
        parser = html2text.HTML2Text()
        for k, v in options.items():
            setattr(parser, k.lower(), v)
        d = parser.handle(data)
        return d

    if method == 'bs4':
        from bs4 import BeautifulSoup
        parser = options.pop('parser', 'html.parser')
        soup = BeautifulSoup(data, parser)
        d = soup.prettify()
        return d

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin',
               '-assume_charset UTF-8', '-display_charset UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    stdout_encoding = 'utf-8'

    for k, v in options.items():
        cmd.append('-%s %s' % (k, v) if v is True else '-%s' % k)

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    env = {}
    env.update(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, env=env)
    stdout, stderr = html2text.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),),
                        '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing).
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),),
                        '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout)

    return stdout.strip()
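# Illustrative usage sketch (not part of the original module). The method names
# and option keys follow the docstring above; the sample HTML is made up.
# 'ignore_links' and 'body_width' are attributes of html2text.HTML2Text, which
# the 'pyhtml2text' branch sets via setattr().
if __name__ == '__main__':
    sample = '<h1>Example</h1><p>Some <b>bold</b> text and a <a href="https://example.com">link</a>.</p>'
    # Pure-Python conversion via the "html2text" package.
    print(html2text(sample, method='pyhtml2text',
                    options={'ignore_links': True, 'body_width': 0}))
    # Dependency-free regex tag stripping.
    print(html2text(sample, method='re'))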
def html2Text(self, html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    return h.handle(html)
def run():
    article = getHtml()
    text_maker = ht.HTML2Text()
    md = text_maker.handle(article)
    createFile(md)
        'deportes': 'https://e00-elmundo.uecdn.es/elmundodeporte/rss/portada.xml',
        'tecnologia': '',
        'ciencia': 'https://e00-elmundo.uecdn.es/elmundo/rss/ciencia.xml',
        'cultura': 'https://e00-elmundo.uecdn.es/elmundo/rss/cultura.xml'
    },
}

# Current date
date = datetime.datetime.now().date()

# Files paths
news_cache = 'news_cache.txt'

# Parser object
parser = html2text.HTML2Text()
parser.images_to_alt = True  # Discard image data, only keep alt text
parser.ignore_images = True  # Don't include any formatting for images
parser.ignore_links = True  # Ignores links
parser.ignore_tables = True
parser.body_width = 1000  # Number of characters per line (long number so no '\n' character appears)

rss_feed = feeds['libertad_digital']['portada']


def main():
    # Do request using requests library and timeout
    try:
        print('Making request')
        resp = requests.get(rss_feed, timeout=5.05)
    except requests.ReadTimeout as e:
def main():
    # user_query = input()
    DOCID = 0
    INDEX = dict()
    numPartial = 1
    INDEX = dict()
    filtfiles = open("validDocs2", "r")
    uniqueTokens = list()
    for ln in filtfiles.readlines():
        fname = ln.strip()
        if DOCID % 12000 == 0 and DOCID != 0:
            for i in INDEX:
                '''
                print('partialIndex/' + str(numPartial) + '_' + str(i))
                file = open('partialIndex/' + str(numPartial) + '_' + str(i), 'w')
                file.write(str(INDEX[i]))
                file.close()
                '''
                uniqueTokens.extend(list(INDEX[i].keys()))
                json.dump(
                    INDEX[i],
                    open('partialIndex/' + str(numPartial) + '_' + str(i), 'w'))
            numPartial += 1

        # Increment the DOCID
        DOCID += 1
        # print the full file path
        print(fname)
        # parse json, tokenize the body
        h2t = html2text.HTML2Text()
        # open single webpage file
        file = open(fname)
        # JSON dict contains: 'url', 'content', 'encoding'
        pageDict = json.loads(file.read())
        # close file to get memory back
        file.close()
        # get html formatted content
        htmlContent = pageDict['content']
        soup = BeautifulSoup(htmlContent, 'html.parser')
        titles = ''
        bolds = ''
        h1 = ''
        h2 = ''
        h3 = ''
        if soup.title is not None:
            titles = soup.title.string
        for tag in soup.find_all("b"):
            if tag.string is not None and bolds is not None:
                bolds += (" " + tag.string)
        for tag in soup.find_all("h1"):
            if tag.string is not None and h1 is not None:
                h1 += (" " + tag.string)
        for tag in soup.find_all("h2"):
            if tag.string is not None and h2 is not None:
                h2 += (" " + tag.string)
        for tag in soup.find_all("h3"):
            if tag.string is not None and h3 is not None:
                h3 += (" " + tag.string)
        # get plain text content
        plainContent = h2t.handle(htmlContent)
        # get tokens in order of appearance
        tokens = tok.getTokens(plainContent)
        imp_words = dict()
        imp_words['titles_tokens'] = tok.getTokens(titles)
        imp_words['bolds_tokens'] = tok.getTokens(bolds)
        imp_words['h1_tokens'] = tok.getTokens(h1)
        imp_words['h2_tokens'] = tok.getTokens(h2)
        imp_words['h3_tokens'] = tok.getTokens(h3)
        # print('imp_words = ', imp_words)
        # Index the tokens
        ind.indexTokens(tokens, imp_words, DOCID, INDEX)
        DOCINDEX[DOCID] = (fname, pageDict['url'])

    for i in INDEX:
        '''
        file = open('partialIndex/' + str(numPartial) + '_' + str(i), 'w')
        file.write(str(INDEX[i]))
        file.close()
        '''
        json.dump(INDEX[i],
                  open('partialIndex/' + str(numPartial) + '_' + str(i), 'w'))
    json.dump(DOCINDEX, open('docindex', 'w'))

    # Print Final Statistics
    print("Number of Documents: {}".format(DOCID))
    print("Number of Unique Tokens: {}".format(len(set(uniqueTokens))))
def _html_to_markdown(value, baseurl=None, ignore_links=False):
    h = html2text.HTML2Text()
    h.ignore_links = ignore_links
    h.baseurl = baseurl
    h.body_width = 0
    return h.handle(str(value))
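# Illustrative call, not from the original source; the HTML string and base URL
# below are made up, and `import html2text` is assumed at module level (the
# helper above already requires it). baseurl is what html2text uses to resolve
# relative hrefs when links are kept (ignore_links=False).
print(_html_to_markdown('<p>See the <a href="/docs">docs</a>.</p>',
                        baseurl='https://example.com', ignore_links=False))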
# version: 1.0.0
# Appreciation: All creators of packages used in this code

# import predefined python packages
import os
import re
import string
import easygui
import html2text
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# define variables, objects and handles
stemmer = PorterStemmer()  # stemmer used to invoke methods used in stemming
h = html2text.HTML2Text()  # h used to invoke html to text processing methods
h.ignore_links = True  # conversion to text removes all html links <a></a> etc


## functions definition section

# function stemword
def stemword(w):
    # invoke stemmer.stem and return the stemmed word
    return (stemmer.stem(w))


# function create_document
# Extracted file converted so that each sentence
# is in its own paragraph separated by space
def create_document(t_file):
import html2text
import scrapy
from scrapy.spiders import SitemapSpider

from vcrawler.items import Product

converter = html2text.HTML2Text()


class GrowpowerSpider(SitemapSpider):
    name = "growpower"
    sitemap_urls = ['https://www.growpower.com.br/sitemap.xml']

    def _parse_description(self, response):
        description = ''
        rawdesc = response.css('div#descricao').extract_first()
        if rawdesc is not None:
            description = converter.handle(rawdesc)
        return description

    def _parse_category(self, response):
        category = ''
        categories = response.css('div.breadcrumbs a::text').extract()
        if len(categories) > 1:
            categories.pop(0)
            category = categories[0]
            category = category.strip()
        return category

    def parse(self, response):
        sku = response.css('span[itemprop="sku"]::text').extract_first()
        if sku is None:
def run(blog):
    info = getHtml(blog)
    text_maker = ht.HTML2Text()
    md = text_maker.handle(info['article'])
    save_file = createFile(md, info['title'])
    replace_md_url(save_file)
def convert(inputfilename):
    global filename
    try:
        xmldoc = minidom.parse(inputfilename)
        wp = xmldoc.documentElement
    except Exception as e:
        print('Fail.')
        print(e)
        print('Please repair or delete invalid tokens like "& < >" there.')
        sys.exit(1)

    # Read basic blog metadata from the XML
    blog_title = getTagData(wp, 'title')
    blog_desc = getTagData(wp, 'description')
    blog_url = getTagData(wp, 'link')

    posts = wp.getElementsByTagName('item')
    summary = '# 目录' + '\n\n'

    # Extract each post and write it out
    for log in posts:
        status = getTagData(log, 'wp:status')
        title = replace_text(getTagData(log, 'title'))
        author = getTagData(log, 'dc:creator')
        date = convertDate(getTagData(log, 'wp:post_date'))
        md_name = convertMdName(getTagData(log, 'wp:post_date'))
        content = getTagData(log, 'content:encoded').replace('\n\n', '<br/><br/>')

        category_list = []
        tag_list = []
        category = log.getElementsByTagName('category')
        for cat_tag in category:
            if cat_tag.getAttribute('domain') == 'category':
                category_list.append(replace_text(getElementData(cat_tag)))
            if cat_tag.getAttribute('domain') == 'post_tag':
                tag_list.append(replace_text(getElementData(cat_tag)))
        category_list_str = ', '.join(category_list).replace('_', '\\_')
        tag_list_str = ', '.join(tag_list).replace('_', '\\_')

        comment_list = []
        comment = log.getElementsByTagName('wp:comment')
        comment_id = 1
        for cmt in comment:
            comment_date = getTagData(cmt, 'wp:comment_date')
            comment_author = getTagData(cmt, 'wp:comment_author')
            comment_author_email = getTagData(cmt, 'wp:comment_author_email')
            if comment_author_email:
                comment_author_email += ', '
            comment_author_url = getTagData(cmt, 'wp:comment_author_url')
            if comment_author_url:
                comment_author_url += ', '
            comment_content = getTagData(cmt, 'wp:comment_content')
            comment_list.append('<p>' + str(comment_id) + '. ' + comment_author + ', ' +
                                comment_author_email + comment_author_url +
                                comment_date + '</p><p>' + comment_content + '</p>')
            comment_id += 1
        comment_list_str = ''.join(comment_list)

        h = html2text.HTML2Text()
        h.body_width = 0

        if status == 'publish':
            summary += '* [' + title + '](' + md_name + ')\n'
            md = ''
            md += '# ' + title + '\n'
            md += '作者: ' + author + '\n\n'
            md += '日期: ' + date + '\n\n'
            md += '分类: ' + category_list_str + '\n\n'
            md += '标签: ' + tag_list_str + '\n\n'
            md += '***' + '\n\n'
            md += h.handle(content) + '\n'
            if len(comment_list_str) > 0:
                md += '***\n'
                md += '## 从前的评论\n'
                md += h.handle(comment_list_str)
            output(filename, md_name, md)

    # Write the index page
    index = ''
    index += '# ' + blog_title + '\n'
    if blog_desc:
        index += '#### ' + blog_desc + '\n'
    if blog_url:
        index += '#### 原址: <' + blog_url + '>\n'
    output(filename, 'README.md', index)

    # Write the table of contents
    output(filename, 'SUMMARY.md', summary)
def _create_issuetracker_issue(assessment, issue_tracker_info):
    """Collects information and sends a request to create external issue."""
    integration_utils.normalize_issue_tracker_info(issue_tracker_info)

    person, acl, acr = (all_models.Person, all_models.AccessControlList,
                        all_models.AccessControlRole)
    reporter_email = db.session.query(
        person.email,
    ).join(
        acl,
        person.id == acl.person_id,
    ).join(
        acr,
        sa.and_(
            acl.ac_role_id == acr.id,
            acr.name == "Audit Captains",
        ),
    ).filter(
        acl.object_id == assessment.audit_id,
        acl.object_type == all_models.Audit.__name__,
    ).order_by(
        person.email,
    ).first()

    if reporter_email:
        reporter_email = reporter_email.email

    comment = [_INITIAL_COMMENT_TMPL % _get_assessment_url(assessment)]
    test_plan = assessment.test_plan
    if test_plan:
        comment.extend([
            'Following is the assessment Requirements/Assessment Procedure '
            'from GGRC:',
            html2text.HTML2Text().handle(test_plan).strip('\n'),
        ])

    hotlist_id = issue_tracker_info.get('hotlist_id')

    issue_params = {
        'component_id': issue_tracker_info['component_id'],
        'hotlist_ids': [hotlist_id] if hotlist_id else [],
        'title': issue_tracker_info['title'],
        'type': issue_tracker_info['issue_type'],
        'priority': issue_tracker_info['issue_priority'],
        'severity': issue_tracker_info['issue_severity'],
        'reporter': reporter_email,
        'assignee': '',
        'verifier': '',
        'status': issue_tracker_info['status'],
        'ccs': [],
        'comment': '\n'.join(comment),
    }

    assignee = issue_tracker_info.get('assignee')
    if assignee:
        if not issue_tracker_info['status']:
            issue_params['status'] = 'ASSIGNED'
        issue_params['assignee'] = assignee
        issue_params['verifier'] = assignee

    cc_list = issue_tracker_info.get('cc_list')
    if cc_list is not None:
        issue_params['ccs'] = cc_list

    res = issues.Client().create_issue(issue_params)
    return res['issueId']
    # Replace this with your own cookie
}

url_tmp = 'https://www.acwing.com/problem/content/'
url = url_tmp
for i in range(1, 20):
    url = url_tmp + str(i)
    r = requests.get(url=url, headers=headers, cookies=cookies)  # request the page
    soup = BeautifulSoup(str(r.text), 'html.parser')  # document object
    # print(soup.title.get_text())
    title = soup.title.get_text()
    l = len(title)
    # print();
    # print(l);
    text_maker = ht.HTML2Text()
    text_maker.bypass_tables = False
    if (len(
            soup.find_all(
                'div',
                class_='ui bottom attached tab active martor-preview')) >= 1):
        htmlpage = str(
            soup.find_all(
                'div',
                class_='ui bottom attached tab active martor-preview')[0])
        text = text_maker.handle(htmlpage)
        print(title[0:l - 10])
        s = "# " + title[0:l - 10] + "\n\n#### 题目描述\n\n"
        s += text
        open(title + ".md", "w").write(s)
def html_to_txt(self, html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)
    # self.text = Document(self.html)
    pass
"""ICA parser class.""" import re import traceback from flask import current_app import html2text from recapi.html_parsers import GeneralParser # Set html2text options text_maker = html2text.HTML2Text() text_maker.emphasis_mark = "*" class KungsornenParser(GeneralParser): """Parser for recipes at kungsornen.se.""" domain = "kungsornen.se" name = "Kungsörnen" address = "https://www.kungsornen.se/recept/" def __init__(self, url): """Init the parser.""" self.url = url self.make_soup() self.get_title() self.get_image() self.get_ingredients() self.get_contents() self.get_portions()
def ProcessJitenon(self):
    page = pickle.load(open(self._get_kanji_file_name(), "rb"))
    tree = html.fromstring(page.content)
    block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
    startIdx = 0
    endIdx = 0
    print("**** " + self._Kanji + " ****")
    for blk in block:
        if self._Kanji == "点" and len(blk.getchildren()) == 0:
            continue
        blkName, blkRow = h3_row_nb_all(self._Kanji, blk)
        # issue with kanji.jitenon for 平
        if self._Kanji == "平" and blkName == '訓読み':
            blkRow += 1
        if self._Kanji == "平" and blkName == '意味':
            blkRow -= 1
        if self._Kanji == "点" and blkName == '意味':
            blkRow += 1
        if self._Kanji == '袒' and blkName == '訓読み':
            blkRow -= 1
        startIdx = endIdx
        endIdx += blkRow
        print("Block " + blkName + ", nb row: " + str(blkRow) + " [" +
              str(startIdx) + ";" + str(endIdx) + "].")
        subblock = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
        for idx in range(startIdx, endIdx):
            if blkName in ['部首', '画数', '音読み', '訓読み', '漢字検定', '学年']:
                if self._Kanji == '禺' and blkName == '訓読み':
                    content = subblock[idx].text
                elif self._Kanji == '袤' and blkName == '訓読み' and idx == 3:
                    content = subblock[idx].text
                else:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['Unicode']:
                content = subblock[idx].text
            elif blkName in ['種別']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['異体字']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                if '新字体' in content:
                    kind = '新字体'
                elif '標準字体' in content:
                    kind = '標準字体'
                else:
                    kind = None
                    content = None
                if kind:
                    link = subblock[idx].getchildren()[0].getchildren()[0].attrib
                    if 'href' in link:
                        content = (kind, link['href'])
            elif blkName in ['意味']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                h = html2text.HTML2Text()
                h.ignore_links = True
                content = h.handle(content)
                # m = re.search("<td>(.*)</td>", content, flags=re.MULTILINE)
                # content = m[1]
            elif blkName in ['JIS水準']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
                else:
                    content = subblock[idx].text
            self._jitenonItem[blkName].append(content)
            print(self._jitenonItem[blkName])
import html2text
from cassiopeia import riotapi
from cassiopeia.type.core.staticdata import *
from fuzzywuzzy import process

DDRAGON_BASE = f"http://ddragon.leagueoflegends.com/cdn/{riotapi.get_versions()[0]}"

SPELL_SCALINGS = {'attackdamage': "AD", 'bonusattackdamage': "**bonus** AD",
                  'armor': "Armor", 'bonusarmor': "**bonus** Armor",
                  'spellblock': "Magic Resist", 'bonusspellblock': "**bonus** Magic Resist",
                  'health': "Health", 'bonushealth': "**bonus** Health",
                  'spelldamage': "AP", "@dynamic.abilitypower": "AP"}
# unhandled special cases: (i have been unable to find out what these mean, api missing too much data :/)
# @dynamic.attackdamage @cooldownchampion

SANITIZER = html2text.HTML2Text()
SANITIZER.ignore_links = True
SANITIZER.body_width = 0


def get_champion_by_name(name: str) -> Tuple[Optional[Champion], int]:
    """Get a champion by name with fuzzy search.

    Args:
        name: Name of champion

    Returns:
        Tuple[Optional[Champion], int]: Second element represents the query score
        (how close it is to the actual value).
    """
    return get_by_name(name, riotapi.get_champions())
def extract_text_from_html(html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)
cards.count()

hooks = BitbucketHook.objects.filter(full_name='arcamens/django-github')
orgs = hooks.values_list('organization')
is_ok = Q(ancestor__ancestor__organization=orgs)
cards = Card.objects.filter(is_ok)
cards.count()

for ind in []:
    pass
else:
    print('hi')

##############################################################################

import html2text

h = html2text.HTML2Text()
h.ignore_links = True
h.handle("<p>Hello, <a href='http://earth.google.com/'>world</a>!")

help(h.handle)
dir(h)

##############################################################################

from ehp import *

html = Html()
data = '''
<body>
    <em> foo </em>
</body>
'''

dom = html.feed(data)
dom.text()
def index(self, config):
    results = []
    board = config["board"]
    if board not in viable_boards:
        self.error_logger("Your chosen board does not exist on 4chan!")
        quit()

    # Create a HTML parser for parsing comments
    h = html2text.HTML2Text()
    h.ignore_links = False

    req = f"https://a.4cdn.org/{board}/threads.json"
    content = json.loads(requests.get(req).content)
    for page_index, page in enumerate(content):
        self.logger(f"Scraping page number: {page_index+1}")
        for thread_index, threads in enumerate(page["threads"]):
            self.logger(f"Extracting posts from thread number: {thread_index+1}")
            thread_id = threads["no"]
            req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
            thread_content = json.loads(requests.get(req).content)["posts"]
            # thread content is a list of posts
            for post_index, post in enumerate(thread_content):
                self.logger(
                    f"Extracting media and comments from post number: {post_index+1}"
                )
                post_row = []
                post_row.append(post["no"])
                post_row.append(thread_id)
                post_row.append(post["time"])

                try:
                    comment = post["com"]
                except KeyError:
                    comment = "..."
                else:
                    comment = h.handle(comment)
                post_row.append(comment)

                # Filename
                try:
                    filename = post["filename"]
                except KeyError:
                    filename = ""
                if filename != "":
                    time_id = post["tim"]
                    extension = post["ext"]
                    full_file = f"{filename}{extension}"
                    file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                    post_row.append(full_file)
                    post_row.append(extension)
                    post_row.append(file_url)
                elif filename == "":
                    post_row.append("")
                    post_row.append("")
                    post_row.append("")
                results.append(post_row)
    self.logger("Scraping metadata complete")
    results.insert(0, [
        "id", "thread_id", "datetime", "comment", "filename", "ext", "url"
    ])
    return LocalElementsIndex(results)