def get_item(item_node, test):
    serial_number = item_node.findall('serialNumber')[0].text
    vuln_id_from_tool = item_node.findall('type')[0].text
    url = item_node.get('url')
    path = item_node.findall('path')[0].text
    location = item_node.findall('location')[0].text
    rparameter = re.search(r"(?<=\[)(.*)(\])", location)
    parameter = None
    if rparameter:
        parameter = rparameter.group(1)

    unsaved_req_resp = list()
    for request_response in item_node.findall('./requestresponse'):
        request = get_clean_base64(request_response.findall('request')[0].text)
        response = get_clean_base64(request_response.findall('response')[0].text)
        unsaved_req_resp.append({"req": request, "resp": response})

    collab_text = ""
    for event in item_node.findall('./collaboratorEvent'):
        collab_details = list()
        collab_details.append(event.findall('interactionType')[0].text)
        collab_details.append(event.findall('originIp')[0].text)
        collab_details.append(event.findall('time')[0].text)
        if collab_details[0] == 'DNS':
            collab_details.append(event.findall('lookupType')[0].text)
            collab_details.append(event.findall('lookupHost')[0].text)
            collab_text += "The Collaborator server received a " + collab_details[0] + \
                " lookup of type " + collab_details[3] + \
                " for the domain name " + collab_details[4] + \
                " at " + collab_details[2] + \
                " originating from " + collab_details[1] + ". "
        for request_response in event.findall('./requestresponse'):
            request = get_clean_base64(request_response.findall('request')[0].text)
            response = get_clean_base64(request_response.findall('response')[0].text)
            unsaved_req_resp.append({"req": request, "resp": response})
        if collab_details[0] == 'HTTP':
            collab_text += "The Collaborator server received an " + \
                collab_details[0] + " request at " + collab_details[2] + \
                " originating from " + collab_details[1] + ". "

    text_maker = html2text.HTML2Text()
    text_maker.body_width = 0

    background = do_clean(item_node.findall('issueBackground'))
    if background:
        background = text_maker.handle(background)

    detail = do_clean(item_node.findall('issueDetail'))
    if detail:
        detail = text_maker.handle(detail)
    if collab_text:
        detail = text_maker.handle(detail + '<p>' + collab_text + '</p>')

    remediation = do_clean(item_node.findall('remediationBackground'))
    if remediation:
        remediation = text_maker.handle(remediation)

    remediation_detail = do_clean(item_node.findall('remediationDetail'))
    if remediation_detail:
        remediation = text_maker.handle(remediation_detail + "\n") + remediation

    references = do_clean(item_node.findall('references'))
    if references:
        references = text_maker.handle(references)

    severity = item_node.findall('severity')[0].text
    if "information" == severity.lower():
        severity = "Info"

    scanner_confidence = item_node.findall('confidence')[0].text
    if scanner_confidence:
        if scanner_confidence == "Certain":
            scanner_confidence = 1
        elif scanner_confidence == "Firm":
            scanner_confidence = 4
        elif scanner_confidence == "Tentative":
            scanner_confidence = 7

    host_node = item_node.findall('host')[0]
    url_host = host_node.text
    path = item_node.findall('path')[0].text

    # Finding and Endpoint objects returned have not been saved to the database
    finding = Finding(title=item_node.findall('name')[0].text,
                      url=url,
                      test=test,
                      severity=severity,
                      param=parameter,
                      scanner_confidence=scanner_confidence,
                      description="URL: " + url_host + path + "\n\n" + detail + "\n",
                      mitigation=remediation,
                      references=references,
                      false_p=False,
                      duplicate=False,
                      out_of_scope=False,
                      mitigated=None,
                      dynamic_finding=True,
                      impact=background,
                      unique_id_from_tool=serial_number,
                      vuln_id_from_tool=vuln_id_from_tool)
    finding.unsaved_req_resp = unsaved_req_resp

    # manage endpoint
    protocol = urlparse(url_host).scheme
    host = urlparse(url_host).netloc
    port = 80
    if protocol == 'https':
        port = 443
    if urlparse(url_host).port is not None:
        port = urlparse(url_host).port
    finding.unsaved_endpoints = [
        Endpoint(protocol=protocol, host=host, port=port,
                 path=path, query=None, fragment=None)
    ]

    # manage cwes
    cwes = do_clean_cwe(item_node.findall('vulnerabilityClassifications'))
    if len(cwes) > 1:
        # FIXME support more than one CWE
        logger.warning(
            f"more than one CWE for a finding {cwes}. NOT supported by parser API"
        )
    if len(cwes) > 0:
        finding.cwe = cwes[0]

    return finding
def get_html_from_blog(self, blog, rule):
    s = requests.session()
    r = s.get(blog.url, headers=headers)
    if rule['encoding'] is not None:
        r.encoding = 'utf-8'
    # Get the text content
    html = r.text
    soup = BeautifulSoup(html, 'lxml')

    if False:
        # Add code tags
        html = re.sub('<code.*?>', '<code>```\n', html)
        html = re.sub('</code>', '```\n</code>', html)

    with open('temp.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # Extract the title with a regex
    title_pattern = rule['title_pattern']
    titles = re.findall(title_pattern, html, re.DOTALL)
    if len(titles) == 0:
        title = 'default'
    else:
        title = pangu.spacing_text(titles[0])
    blog.title = title
    print('标题:', title)

    if rule['content_type'] == 'bs':
        content = soup.select(rule['content_pattern']).pop()
        content = str(content)
    else:
        # Extract the body content
        content_pattern = rule['content_pattern']
        contents = re.findall(content_pattern, html, re.DOTALL)
        if len(contents) == 0:
            content = ''
        else:
            content = contents[0]
    content = '<h1><a href="{}">{}</a></h1><br><br>'.format(
        blog.url, blog.title) + content
    for src, dst in rule['content_replaces']:
        content = re.sub(src, dst, content)
    blog.content = content
    # print('正文:', content)

    # Convert to Markdown
    # md_content = Tomd(content).markdown
    # content = re.sub('<a id=".*?"></a>', '', content)
    text_maker = ht.HTML2Text()
    md_content = text_maker.handle(content)

    # Remove blank lines
    md_content = md_content.replace('\r', '')
    while ' \n' in md_content:
        md_content = md_content.replace(' \n', '\n')
    # md_content = md_content.replace('\n', '\n\n')
    while '\n\n\n' in md_content:
        md_content = md_content.replace('\n\n\n', '\n\n')
    # print(' MD:', md_content)

    # Regex replacements
    for src, dst in rule['md_replaces']:
        md_content = re.sub(src, dst, md_content)

    # Add spacing between CJK and Latin characters
    md_content = pangu.spacing_text(md_content)

    # Trim whitespace inside * and ** emphasis marks
    for star_line in re.findall('\*(.*?)\*', md_content):
        md_content = md_content.replace('{}'.format(star_line),
                                        '{}'.format(star_line.strip()))
    # Fix abnormal line breaks
    md_content = re.sub('-\n', '-', md_content)
    # Normalize code tags
    # md_content = re.sub('[ ]```', '```', md_content)

    # Strip characters that are invalid in file names
    title = re.sub('[\/:*?"<>|]', '-', title)
    with open("blogs" + os.sep + title + '.md', 'w', encoding='utf-8') as f:
        f.write(md_content)
    pass
def msg_to_markdown(repo, msg):
    def absurl(url):
        if not url.startswith('http:/') and not url.startswith('https:'):
            slash = '' if settings.BASE_URL.endswith('/') or url.startswith('/') else '/'
            return settings.BASE_URL + slash + url
        return url

    # Need a map of content id -> attachment
    all_attachments = list(msg.attachment_set.all())
    attachments_map = {}
    for att in all_attachments:
        if att.content_id:
            attachments_map[att.content_id] = att

    # Attempt to update img elements pointing to an attachment
    attachments_observed = set()
    if msg.body_html:
        soup = BeautifulSoup(msg.body_html, 'html.parser')
        for img in soup.find_all('img'):
            src = img.attrs.get('src')
            if not src or not src.startswith('cid:'):
                continue
            att = attachments_map.get(src.replace('cid:', ''))
            if att:
                img['src'] = att.file.url
                attachments_observed.add(att)
        h = html2text.HTML2Text(bodywidth=0)
        msg_body = h.handle(str(soup))
    else:
        msg_body = msg.body_text

    # Look for attachments we didn't display inline
    attachments = list(att for att in all_attachments
                       if att not in attachments_observed)
    if attachments:
        attachments_text = u'\n\n\n\n---\n*Attachments:*\n\n'
        for att in attachments:
            url = att.file.url
            filename = os.path.basename(att.file.name)
            inline_img = ''
            if filename.lower().split('.')[-1] in ('png', 'gif', 'jpeg', 'jpg', 'svg'):
                inline_img = u'\n ![]({})\n'.format(url)
            attachments_text += u'1. [{}]({}){}\n'.format(filename, url, inline_img)
    else:
        attachments_text = ''

    # See if we recognize this email address
    map_entry = repo.emailmap_set.filter(email__iexact=msg.from_email).first()
    if map_entry:
        tag = '@' + map_entry.login
    else:
        tag = msg.from_name

    return u'*Sent by {} ({}). Created by [fire]({}/).*\n\n---\n{}{}'.format(
        tag,
        msg.from_email,
        settings.BASE_URL,
        msg_body,
        attachments_text,
    )
def get_output_html2text(input_data):
    h = html2text.HTML2Text()
    h.ignore_links = True
    result = h.handle(str(input_data))
    return "".join(result)
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
) -> None:
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            mattermost_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(mattermost_mention, zulip_mention)

        content = content.replace('@channel', '@**all**')
        content = content.replace('@all', '@**all**')
        # We don't have an equivalent for Mattermost's @here mention which mentions all users
        # online in the channel.
        content = content.replace('@here', '@**all**')
        return content

    mention_map: Dict[int, Set[int]] = {}
    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    pm_members = {}

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']
        sender_user_id = raw_message['sender_id']
        if "channel_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["channel_name"], Recipient.STREAM)
        elif "huddle_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["huddle_name"], Recipient.HUDDLE)
        elif "pm_members" in raw_message:
            members = raw_message["pm_members"]
            member_ids = {user_id_mapper.get(member) for member in members}
            pm_members[message_id] = member_ids
            if sender_user_id == user_id_mapper.get(members[0]):
                recipient_id = get_recipient_id_from_receiver_name(
                    members[1], Recipient.PERSONAL)
            else:
                recipient_id = get_recipient_id_from_receiver_name(
                    members[0], Recipient.PERSONAL)
        else:
            raise AssertionError(
                "raw_message without channel_name, huddle_name or pm_members key"
            )

        rendered_content = None

        topic_name = 'imported from mattermost'

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=sender_user_id,
            has_attachment=False,
        )
        zerver_message.append(message)
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, user_id_mapper, zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(message_json, output_dir, message_file)
def _process_webinspect(self):
    for session in self.webinspect.getchildren():
        hostname = session.Host.text
        port = session.Port.text
        service_data = {}
        if port:
            service_data['port'] = port
        path = session.Request.Path.text
        query = session.Request.FullQuery.text
        method = session.Request.Method.text
        request = ''
        if session.RawRequest.text:
            request = base64.b64decode(session.RawRequest.text)
        response = ''
        if session.RawResponse.text:
            response = base64.b64decode(session.RawResponse.text)
        status_code = session.Response.StatusCode.text
        for issues in session.Issues:
            for issue_data in issues.getchildren():
                params = ''
                check_type = issue_data.CheckTypeID
                if check_type.text.lower() != 'vulnerability':
                    # TODO: when plugins accept tags, we should add this as a tag.
                    pass
                name = issue_data.Name.text
                external_id = issue_data.VulnerabilityID.text
                faraday_severities = {
                    0: 'info',
                    1: 'low',
                    2: 'med',
                    3: 'high',
                    4: 'critical'
                }
                severity = faraday_severities[issue_data.Severity]
                references = []
                try:
                    classifications = issue_data.Classifications.getchildren()
                except AttributeError:
                    classifications = []
                for classification in classifications:
                    references.append(classification.text)

                # Build description
                description = u''
                for report_section in issue_data.findall('./ReportSection'):
                    description += u'{} \n'.format(report_section.Name.text)
                    description += u'{} \n'.format(report_section.SectionText.text)
                description += u'{} \n'.format(issue_data.get('id'))
                h = html2text.HTML2Text()
                description = h.handle(description)

                for repro_step in issue_data.findall('./ReproSteps'):
                    step = repro_step.ReproStep
                    if step is not None:
                        try:
                            params = step.PostParams.text
                        except AttributeError:
                            pass
                        if not hostname:
                            # This seems to be a mobile app
                            hostname = session.URL.text
                        if not port:
                            service_data['name'] = step.Url.text
                            service_data['port'] = step.sourceline

                self.sast_vulns.append({
                    "host": hostname,
                    "severity": severity,
                    "service": service_data,
                    "name": name,
                    "description": description,
                    "external_id": external_id,
                    "references": references,
                    "method": method,
                    "query": query,
                    "response": response,
                    "request": request,
                    "path": path,
                    "params": params,
                    "status_code": status_code,
                    "website": session.URL.text
                })
from email import generator, message_from_string
from email.message import Message
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.message import MIMEMessage
from email.utils import formatdate
import email.charset  # needed for the add_charset/Charset calls below
import mimetypes
import os

import html2text

from . import _compat as compat
from .utils import sanitize_address, forbid_multi_line_headers, make_msgid

textify = html2text.HTML2Text()

# Don't BASE64-encode UTF-8 messages
email.charset.add_charset('utf-8', email.charset.SHORTEST, None, 'utf-8')

utf8_charset = email.charset.Charset('utf-8')
utf8_charset.body_encoding = None  # Python defaults to BASE64

# Default MIME type to use on attachments (if it is not explicitly given
# and cannot be guessed).
DEFAULT_ATTACHMENT_MIME_TYPE = 'application/octet-stream'


class MIMEMixin(object):
    def as_string(self, unixfrom=False):
        """Return the entire formatted message as a string.

        Optional `unixfrom' when True, means include the Unix From_ envelope
#!/usr/bin/env python3

import html2text
import json
import os.path
import random
import re
import requests
import string
import sys
import time
import webbrowser

reader = html2text.HTML2Text()
reader.ignore_links = True


def create_email(username='', secure=False):
    """
    generates an email based on optional parameters

    returns username, domain
    """
    domains = [
        'esiix.com', 'wwjmp.com', '1secmail.com', '1secmail.org',
        '1secmail.net'
    ]
    domain = random.choice(domains)

    if secure:
        return ''.join(
            random.choices(string.ascii_lowercase + string.digits,
                           k=20)), domain
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str,
                              zerver_realmemoji: List[Dict[str, Any]],
                              total_reactions: List[Dict[str, Any]],
                              ) -> None:
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            mattermost_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(mattermost_mention, zulip_mention)

        content = content.replace('@channel', '@**all**')
        content = content.replace('@all', '@**all**')
        # We don't have an equivalent for Mattermost's @here mention which mentions all users
        # online in the channel.
        content = content.replace('@here', '@**all**')
        return content

    mention_map = dict()  # type: Dict[int, Set[int]]
    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    name_to_codepoint = get_name_to_codepoint_dict()
    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s' % (len(content),))
            continue

        pub_date = raw_message['pub_date']
        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug("Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        topic_name = 'imported from mattermost'
        user_id = raw_message['sender_id']

        message = build_message(
            content=content,
            message_id=message_id,
            pub_date=pub_date,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=False,
        )
        zerver_message.append(message)
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, name_to_codepoint, user_id_mapper,
                        zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = "/messages-%06d.json" % (dump_file_id,)
    create_converted_data_files(message_json, output_dir, message_file)
def begin_format1():
    sched_wb_path = 'data/restore_file.xlsx'  # path to excel workbook for import back into Sched
    paper_wb_path = 'data/CNS_2020_Paper-2020-06-11.xlsx'  # path to CNS papers workbook
    user_wb_path = 'data/CNS_2020_User-2020-06-11.xlsx'  # path to CNS users workbook

    # load excel workbooks for processing
    sched_wb = load_workbook(filename=sched_wb_path)
    paper_wb = load_workbook(filename=paper_wb_path)
    user_wb = load_workbook(filename=user_wb_path)

    # setup html2text options
    h = html2text.HTML2Text()

    # setup parameters below carefully to ensure things are inserted correctly in Sched
    sched_first_row = 13  # begin inserting sessions in this row

    # regular expressions to format author names
    p = re.compile('\s\(#[0-9]*\)')  # for formatting author names
    p_abstract = re.compile('\\n')  # for formatting abstract

    last_session_id = 4  # ID of the last session in the excel sheet
    paper_ws = paper_wb['Tablib Dataset']  # access paper worksheet
    sched_ws = sched_wb['Sessions']  # access sessions worksheet

    poster_count = 0  # keep track of count of posters to assign them to separate time slots
    poster_start_day = 19  # start day for poster sessions
    poster_start_time = 7  # start time for poster sessions

    for row in paper_ws.iter_rows(min_row=2):  # ignore the first row that contains column names
        paper_type = row[3].value
        if paper_type != 'Rejected':
            # extract information about the paper
            paper_ID = str(last_session_id + 1)
            paper_abstract = format_html_abstract(h, row[10].value, p_abstract)
            paper_title = row[9].value
            paper_author = format_author(row[1].value, p)

            # add paper details to sched workbook
            sched_ws['A%d' % sched_first_row] = paper_ID
            sched_ws['B%d' % sched_first_row] = paper_title
            sched_ws['C%d' % sched_first_row] = 'Y'
            sched_ws['C%d' % sched_first_row].alignment = Alignment(horizontal='center')
            sched_ws['D%d' % sched_first_row] = 'N'
            sched_ws['J%d' % sched_first_row] = paper_abstract
            sched_ws['K%d' % sched_first_row] = paper_author
            sched_ws['P%d' % sched_first_row] = 'TBA'

            if paper_type == 'Accepted':
                sched_ws['G%d' % sched_first_row] = 'Poster'
                # start and end times for posters
                poster_count = poster_count + 1
                if poster_count % 120 == 0:
                    poster_start_day = poster_start_day + 1
                    poster_start_time = 7
                elif poster_count % 40 == 0:
                    poster_start_time = poster_start_time + 1
                sched_ws['E%d' % sched_first_row] = '7/%d/2020 %d:00 PM' % (
                    poster_start_day, poster_start_time)  # start time for posters
                sched_ws['F%d' % sched_first_row] = '7/%d/2020 %d:00 PM' % (
                    poster_start_day, poster_start_time + 1)  # end time for posters
                sched_ws['F%d' % sched_first_row].alignment = Alignment(horizontal='right')
            elif paper_type == 'AcceptedOral':
                sched_ws['G%d' % sched_first_row] = 'Oral'
            elif paper_type == 'AcceptedFeatured':
                sched_ws['G%d' % sched_first_row] = 'Featured Talk'

            sched_first_row = sched_first_row + 1
            last_session_id = last_session_id + 1

    sched_wb.save(sched_wb_path)
def item_description(self, item):
    h = html2text.HTML2Text()
    excerpt = h.handle(str(item.body)).split('\n\n')[0]
    return excerpt + "..."
def html2text(data, method='lynx', options=None):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
                        options: see "lynx -help" output for options that work with "-dump"
     'html2text'      - Use "html2text -nobs" for conversion
                        options: https://linux.die.net/man/1/html2text
     'bs4'            - Use Beautiful Soup library to prettify the HTML
                        options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                        http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text"
                        options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options
    """
    if options is None:
        options = {}

    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines()
                       if l.strip() != ''))
        return d

    if method == 'pyhtml2text':
        import html2text
        parser = html2text.HTML2Text()
        for k, v in options.items():
            setattr(parser, k.lower(), v)
        d = parser.handle(data)
        return d

    if method == 'bs4':
        from bs4 import BeautifulSoup
        parser = options.pop('parser', 'html.parser')
        soup = BeautifulSoup(data, parser)
        d = soup.prettify()
        return d

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin',
               '-assume_charset UTF-8', '-display_charset UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    stdout_encoding = 'utf-8'

    for k, v in options.items():
        cmd.append('-%s %s' % (k, v) if v is True else '-%s' % k)

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    env = {}
    env.update(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, env=env)
    stdout, stderr = html2text.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),),
                        '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing).
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),),
                        '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout)

    return stdout.strip()
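# Illustrative usage sketch (not part of the original module). The method names
# and option keys follow the docstring above; the sample HTML is made up.
# 'ignore_links' and 'body_width' are attributes of html2text.HTML2Text, which
# the 'pyhtml2text' branch sets via setattr().
if __name__ == '__main__':
    sample = '<h1>Example</h1><p>Some <b>bold</b> text and a <a href="https://example.com">link</a>.</p>'
    # Pure-Python conversion via the "html2text" package.
    print(html2text(sample, method='pyhtml2text',
                    options={'ignore_links': True, 'body_width': 0}))
    # Dependency-free regex tag stripping.
    print(html2text(sample, method='re'))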
def html2Text(self, html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    return h.handle(html)
def run():
    article = getHtml()
    text_maker = ht.HTML2Text()
    md = text_maker.handle(article)
    createFile(md)
        'deportes': 'https://e00-elmundo.uecdn.es/elmundodeporte/rss/portada.xml',
        'tecnologia': '',
        'ciencia': 'https://e00-elmundo.uecdn.es/elmundo/rss/ciencia.xml',
        'cultura': 'https://e00-elmundo.uecdn.es/elmundo/rss/cultura.xml'
    },
}

# Current date
date = datetime.datetime.now().date()

# Files paths
news_cache = 'news_cache.txt'

# Parser object
parser = html2text.HTML2Text()
parser.images_to_alt = True  # Discard image data, only keep alt text
parser.ignore_images = True  # Don't include any formatting for images
parser.ignore_links = True  # Ignores links
parser.ignore_tables = True
parser.body_width = 1000  # Number of characters per line (long number so no '\n' character appears)

rss_feed = feeds['libertad_digital']['portada']


def main():
    # Do request using requests library and timeout
    try:
        print('Making request')
        resp = requests.get(rss_feed, timeout=5.05)
    except requests.ReadTimeout as e:
def main():
    # user_query = input()
    DOCID = 0
    INDEX = dict()
    numPartial = 1
    INDEX = dict()
    filtfiles = open("validDocs2", "r")
    uniqueTokens = list()
    for ln in filtfiles.readlines():
        fname = ln.strip()
        if DOCID % 12000 == 0 and DOCID != 0:
            for i in INDEX:
                '''
                print('partialIndex/' + str(numPartial) + '_' + str(i))
                file = open('partialIndex/' + str(numPartial) + '_' + str(i), 'w')
                file.write(str(INDEX[i]))
                file.close()
                '''
                uniqueTokens.extend(list(INDEX[i].keys()))
                json.dump(
                    INDEX[i],
                    open('partialIndex/' + str(numPartial) + '_' + str(i), 'w'))
            numPartial += 1

        # Increment the DOCID
        DOCID += 1
        # print the full file path
        print(fname)
        # parse json, tokenize the body
        h2t = html2text.HTML2Text()
        # open single webpage file
        file = open(fname)
        # JSON dict contains: 'url', 'content', 'encoding'
        pageDict = json.loads(file.read())
        # close file to get memory back
        file.close()
        # get html formatted content
        htmlContent = pageDict['content']
        soup = BeautifulSoup(htmlContent, 'html.parser')
        titles = ''
        bolds = ''
        h1 = ''
        h2 = ''
        h3 = ''
        if soup.title is not None:
            titles = soup.title.string
        for tag in soup.find_all("b"):
            if tag.string is not None and bolds is not None:
                bolds += (" " + tag.string)
        for tag in soup.find_all("h1"):
            if tag.string is not None and h1 is not None:
                h1 += (" " + tag.string)
        for tag in soup.find_all("h2"):
            if tag.string is not None and h2 is not None:
                h2 += (" " + tag.string)
        for tag in soup.find_all("h3"):
            if tag.string is not None and h3 is not None:
                h3 += (" " + tag.string)
        # get plain text content
        plainContent = h2t.handle(htmlContent)
        # get tokens in order of appearance
        tokens = tok.getTokens(plainContent)
        imp_words = dict()
        imp_words['titles_tokens'] = tok.getTokens(titles)
        imp_words['bolds_tokens'] = tok.getTokens(bolds)
        imp_words['h1_tokens'] = tok.getTokens(h1)
        imp_words['h2_tokens'] = tok.getTokens(h2)
        imp_words['h3_tokens'] = tok.getTokens(h3)
        # print('imp_words = ', imp_words)
        # Index the tokens
        ind.indexTokens(tokens, imp_words, DOCID, INDEX)
        DOCINDEX[DOCID] = (fname, pageDict['url'])

    for i in INDEX:
        '''
        file = open('partialIndex/' + str(numPartial) + '_' + str(i), 'w')
        file.write(str(INDEX[i]))
        file.close()
        '''
        json.dump(INDEX[i],
                  open('partialIndex/' + str(numPartial) + '_' + str(i), 'w'))
    json.dump(DOCINDEX, open('docindex', 'w'))

    # Print Final Statistics
    print("Number of Documents: {}".format(DOCID))
    print("Number of Unique Tokens: {}".format(len(set(uniqueTokens))))
def _html_to_markdown(value, baseurl=None, ignore_links=False):
    h = html2text.HTML2Text()
    h.ignore_links = ignore_links
    h.baseurl = baseurl
    h.body_width = 0
    return h.handle(str(value))
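# Illustrative call, not from the original source; the HTML string and base URL
# below are made up, and `import html2text` is assumed at module level (the
# helper above already requires it). baseurl is what html2text uses to resolve
# relative hrefs when links are kept (ignore_links=False).
print(_html_to_markdown('<p>See the <a href="/docs">docs</a>.</p>',
                        baseurl='https://example.com', ignore_links=False))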
# version: 1.0.0
# Appreciation: All creators of packages used in this code

# import predefined python packages
import os
import re
import string
import easygui
import html2text
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# define variables, objects and handles
stemmer = PorterStemmer()  # stemmer used to invoke methods used in stemming
h = html2text.HTML2Text()  # h used to invoke html to text processing methods
h.ignore_links = True  # conversion to text removes all html links <a></a> etc


## functions definition section

# function stemword
def stemword(w):
    # invoke stemmer.stem and return the stemmed word
    return (stemmer.stem(w))


# function create_document
# Extracted file converted so that each sentence
# is in its own paragraph separated by space
def create_document(t_file):
import html2text
import scrapy
from scrapy.spiders import SitemapSpider

from vcrawler.items import Product

converter = html2text.HTML2Text()


class GrowpowerSpider(SitemapSpider):
    name = "growpower"
    sitemap_urls = ['https://www.growpower.com.br/sitemap.xml']

    def _parse_description(self, response):
        description = ''
        rawdesc = response.css('div#descricao').extract_first()
        if rawdesc is not None:
            description = converter.handle(rawdesc)
        return description

    def _parse_category(self, response):
        category = ''
        categories = response.css('div.breadcrumbs a::text').extract()
        if len(categories) > 1:
            categories.pop(0)
            category = categories[0]
            category = category.strip()
        return category

    def parse(self, response):
        sku = response.css('span[itemprop="sku"]::text').extract_first()
        if sku is None:
def run(blog):
    info = getHtml(blog)
    text_maker = ht.HTML2Text()
    md = text_maker.handle(info['article'])
    save_file = createFile(md, info['title'])
    replace_md_url(save_file)
def convert(inputfilename):
    global filename
    try:
        xmldoc = minidom.parse(inputfilename)
        wp = xmldoc.documentElement
    except Exception as e:
        print('Fail.')
        print(e)
        print('Please repair or delete invalid tokens like "& < >" there.')
        sys.exit(1)

    # Read basic blog metadata from the XML
    blog_title = getTagData(wp, 'title')
    blog_desc = getTagData(wp, 'description')
    blog_url = getTagData(wp, 'link')

    posts = wp.getElementsByTagName('item')
    summary = '# 目录' + '\n\n'

    # Extract each post and write it out
    for log in posts:
        status = getTagData(log, 'wp:status')
        title = replace_text(getTagData(log, 'title'))
        author = getTagData(log, 'dc:creator')
        date = convertDate(getTagData(log, 'wp:post_date'))
        md_name = convertMdName(getTagData(log, 'wp:post_date'))
        content = getTagData(log, 'content:encoded').replace('\n\n', '<br/><br/>')

        category_list = []
        tag_list = []
        category = log.getElementsByTagName('category')
        for cat_tag in category:
            if cat_tag.getAttribute('domain') == 'category':
                category_list.append(replace_text(getElementData(cat_tag)))
            if cat_tag.getAttribute('domain') == 'post_tag':
                tag_list.append(replace_text(getElementData(cat_tag)))
        category_list_str = ', '.join(category_list).replace('_', '\\_')
        tag_list_str = ', '.join(tag_list).replace('_', '\\_')

        comment_list = []
        comment = log.getElementsByTagName('wp:comment')
        comment_id = 1
        for cmt in comment:
            comment_date = getTagData(cmt, 'wp:comment_date')
            comment_author = getTagData(cmt, 'wp:comment_author')
            comment_author_email = getTagData(cmt, 'wp:comment_author_email')
            if comment_author_email:
                comment_author_email += ', '
            comment_author_url = getTagData(cmt, 'wp:comment_author_url')
            if comment_author_url:
                comment_author_url += ', '
            comment_content = getTagData(cmt, 'wp:comment_content')
            comment_list.append('<p>' + str(comment_id) + '. ' + comment_author + ', ' +
                                comment_author_email + comment_author_url +
                                comment_date + '</p><p>' + comment_content + '</p>')
            comment_id += 1
        comment_list_str = ''.join(comment_list)

        h = html2text.HTML2Text()
        h.body_width = 0

        if status == 'publish':
            summary += '* [' + title + '](' + md_name + ')\n'
            md = ''
            md += '# ' + title + '\n'
            md += '作者: ' + author + '\n\n'
            md += '日期: ' + date + '\n\n'
            md += '分类: ' + category_list_str + '\n\n'
            md += '标签: ' + tag_list_str + '\n\n'
            md += '***' + '\n\n'
            md += h.handle(content) + '\n'
            if len(comment_list_str) > 0:
                md += '***\n'
                md += '## 从前的评论\n'
                md += h.handle(comment_list_str)
            output(filename, md_name, md)

    # Write the index page
    index = ''
    index += '# ' + blog_title + '\n'
    if blog_desc:
        index += '#### ' + blog_desc + '\n'
    if blog_url:
        index += '#### 原址: <' + blog_url + '>\n'
    output(filename, 'README.md', index)

    # Write the table of contents
    output(filename, 'SUMMARY.md', summary)
def _create_issuetracker_issue(assessment, issue_tracker_info):
    """Collects information and sends a request to create external issue."""
    integration_utils.normalize_issue_tracker_info(issue_tracker_info)

    person, acl, acr = (all_models.Person, all_models.AccessControlList,
                        all_models.AccessControlRole)
    reporter_email = db.session.query(
        person.email,
    ).join(
        acl,
        person.id == acl.person_id,
    ).join(
        acr,
        sa.and_(
            acl.ac_role_id == acr.id,
            acr.name == "Audit Captains",
        ),
    ).filter(
        acl.object_id == assessment.audit_id,
        acl.object_type == all_models.Audit.__name__,
    ).order_by(
        person.email,
    ).first()

    if reporter_email:
        reporter_email = reporter_email.email

    comment = [_INITIAL_COMMENT_TMPL % _get_assessment_url(assessment)]
    test_plan = assessment.test_plan
    if test_plan:
        comment.extend([
            'Following is the assessment Requirements/Assessment Procedure '
            'from GGRC:',
            html2text.HTML2Text().handle(test_plan).strip('\n'),
        ])

    hotlist_id = issue_tracker_info.get('hotlist_id')

    issue_params = {
        'component_id': issue_tracker_info['component_id'],
        'hotlist_ids': [hotlist_id] if hotlist_id else [],
        'title': issue_tracker_info['title'],
        'type': issue_tracker_info['issue_type'],
        'priority': issue_tracker_info['issue_priority'],
        'severity': issue_tracker_info['issue_severity'],
        'reporter': reporter_email,
        'assignee': '',
        'verifier': '',
        'status': issue_tracker_info['status'],
        'ccs': [],
        'comment': '\n'.join(comment),
    }

    assignee = issue_tracker_info.get('assignee')
    if assignee:
        if not issue_tracker_info['status']:
            issue_params['status'] = 'ASSIGNED'
        issue_params['assignee'] = assignee
        issue_params['verifier'] = assignee

    cc_list = issue_tracker_info.get('cc_list')
    if cc_list is not None:
        issue_params['ccs'] = cc_list

    res = issues.Client().create_issue(issue_params)
    return res['issueId']
    # Replace this with your own cookie
}

url_tmp = 'https://www.acwing.com/problem/content/'
url = url_tmp
for i in range(1, 20):
    url = url_tmp + str(i)
    r = requests.get(url=url, headers=headers, cookies=cookies)  # request the page
    soup = BeautifulSoup(str(r.text), 'html.parser')  # document object
    # print(soup.title.get_text())
    title = soup.title.get_text()
    l = len(title)
    # print();
    # print(l);
    text_maker = ht.HTML2Text()
    text_maker.bypass_tables = False
    if (len(
            soup.find_all(
                'div',
                class_='ui bottom attached tab active martor-preview')) >= 1):
        htmlpage = str(
            soup.find_all(
                'div',
                class_='ui bottom attached tab active martor-preview')[0])
        text = text_maker.handle(htmlpage)
        print(title[0:l - 10])
        s = "# " + title[0:l - 10] + "\n\n#### 题目描述\n\n"
        s += text
        open(title + ".md", "w").write(s)
def html_to_txt(self, html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)
    # self.text = Document(self.html)
    pass
"""ICA parser class.""" import re import traceback from flask import current_app import html2text from recapi.html_parsers import GeneralParser # Set html2text options text_maker = html2text.HTML2Text() text_maker.emphasis_mark = "*" class KungsornenParser(GeneralParser): """Parser for recipes at kungsornen.se.""" domain = "kungsornen.se" name = "Kungsörnen" address = "https://www.kungsornen.se/recept/" def __init__(self, url): """Init the parser.""" self.url = url self.make_soup() self.get_title() self.get_image() self.get_ingredients() self.get_contents() self.get_portions()
def ProcessJitenon(self):
    page = pickle.load(open(self._get_kanji_file_name(), "rb"))
    tree = html.fromstring(page.content)
    block = tree.xpath('//*[@id="kanjiright"]/table/tr/th')
    startIdx = 0
    endIdx = 0
    print("**** " + self._Kanji + " ****")
    for blk in block:
        if self._Kanji == "点" and len(blk.getchildren()) == 0:
            continue
        blkName, blkRow = h3_row_nb_all(self._Kanji, blk)
        # issue with kanji.jitenon for 平
        if self._Kanji == "平" and blkName == '訓読み':
            blkRow += 1
        if self._Kanji == "平" and blkName == '意味':
            blkRow -= 1
        if self._Kanji == "点" and blkName == '意味':
            blkRow += 1
        if self._Kanji == '袒' and blkName == '訓読み':
            blkRow -= 1
        startIdx = endIdx
        endIdx += blkRow
        print("Block " + blkName + ", nb row: " + str(blkRow) + " [" +
              str(startIdx) + ";" + str(endIdx) + "].")
        subblock = tree.xpath('//*[@id="kanjiright"]/table/tr/td')
        for idx in range(startIdx, endIdx):
            if blkName in ['部首', '画数', '音読み', '訓読み', '漢字検定', '学年']:
                if self._Kanji == '禺' and blkName == '訓読み':
                    content = subblock[idx].text
                elif self._Kanji == '袤' and blkName == '訓読み' and idx == 3:
                    content = subblock[idx].text
                else:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['Unicode']:
                content = subblock[idx].text
            elif blkName in ['種別']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
            elif blkName in ['異体字']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                if '新字体' in content:
                    kind = '新字体'
                elif '標準字体' in content:
                    kind = '標準字体'
                else:
                    kind = None
                    content = None
                if kind:
                    link = subblock[idx].getchildren()[0].getchildren()[0].attrib
                    if 'href' in link:
                        content = (kind, link['href'])
            elif blkName in ['意味']:
                content = lxml.html.tostring(subblock[idx], encoding='unicode')
                h = html2text.HTML2Text()
                h.ignore_links = True
                content = h.handle(content)
                # m = re.search("<td>(.*)</td>", content, flags=re.MULTILINE)
                # content = m[1]
            elif blkName in ['JIS水準']:
                if len(subblock[idx].getchildren()) > 0:
                    content = subblock[idx].getchildren()[0].text
                else:
                    content = subblock[idx].text
            self._jitenonItem[blkName].append(content)
            print(self._jitenonItem[blkName])
import html2text
from cassiopeia import riotapi
from cassiopeia.type.core.staticdata import *
from fuzzywuzzy import process

DDRAGON_BASE = f"http://ddragon.leagueoflegends.com/cdn/{riotapi.get_versions()[0]}"

SPELL_SCALINGS = {'attackdamage': "AD", 'bonusattackdamage': "**bonus** AD",
                  'armor': "Armor", 'bonusarmor': "**bonus** Armor",
                  'spellblock': "Magic Resist", 'bonusspellblock': "**bonus** Magic Resist",
                  'health': "Health", 'bonushealth': "**bonus** Health",
                  'spelldamage': "AP", "@dynamic.abilitypower": "AP"}
# unhandled special cases: (i have been unable to find out what these mean, api missing too much data :/)
# @dynamic.attackdamage @cooldownchampion

SANITIZER = html2text.HTML2Text()
SANITIZER.ignore_links = True
SANITIZER.body_width = 0


def get_champion_by_name(name: str) -> Tuple[Optional[Champion], int]:
    """Get a champion by name with fuzzy search.

    Args:
        name: Name of champion

    Returns:
        Tuple[Optional[Champion], int]: Second element represents the query score
        (how close it is to the actual value).
    """
    return get_by_name(name, riotapi.get_champions())
def extract_text_from_html(html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)
cards.count()

hooks = BitbucketHook.objects.filter(full_name='arcamens/django-github')
orgs = hooks.values_list('organization')
is_ok = Q(ancestor__ancestor__organization=orgs)
cards = Card.objects.filter(is_ok)
cards.count()

for ind in []:
    pass
else:
    print('hi')

##############################################################################

import html2text

h = html2text.HTML2Text()
h.ignore_links = True
h.handle("<p>Hello, <a href='http://earth.google.com/'>world</a>!")

help(h.handle)
dir(h)

##############################################################################

from ehp import *

html = Html()
data = '''
<body>
    <em> foo </em>
</body>
'''

dom = html.feed(data)
dom.text()
def index(self, config):
    results = []
    board = config["board"]
    if board not in viable_boards:
        self.error_logger("Your chosen board does not exist on 4chan!")
        quit()

    # Create a HTML parser for parsing comments
    h = html2text.HTML2Text()
    h.ignore_links = False

    req = f"https://a.4cdn.org/{board}/threads.json"
    content = json.loads(requests.get(req).content)
    for page_index, page in enumerate(content):
        self.logger(f"Scraping page number: {page_index+1}")
        for thread_index, threads in enumerate(page["threads"]):
            self.logger(f"Extracting posts from thread number: {thread_index+1}")
            thread_id = threads["no"]
            req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
            thread_content = json.loads(requests.get(req).content)["posts"]
            # thread content is a list of posts
            for post_index, post in enumerate(thread_content):
                self.logger(
                    f"Extracting media and comments from post number: {post_index+1}"
                )
                post_row = []
                post_row.append(post["no"])
                post_row.append(thread_id)
                post_row.append(post["time"])

                try:
                    comment = post["com"]
                except KeyError:
                    comment = "..."
                else:
                    comment = h.handle(comment)
                post_row.append(comment)

                # Filename
                try:
                    filename = post["filename"]
                except KeyError:
                    filename = ""
                if filename != "":
                    time_id = post["tim"]
                    extension = post["ext"]
                    full_file = f"{filename}{extension}"
                    file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                    post_row.append(full_file)
                    post_row.append(extension)
                    post_row.append(file_url)
                elif filename == "":
                    post_row.append("")
                    post_row.append("")
                    post_row.append("")
                results.append(post_row)
    self.logger("Scraping metadata complete")
    results.insert(0, [
        "id", "thread_id", "datetime", "comment", "filename", "ext", "url"
    ])
    return LocalElementsIndex(results)