def parse(filename, window_width=1000):
    tmp_fname = None  # initialized up front so the finally block never sees an unbound name
    try:
        with tempfile.NamedTemporaryFile(delete=False) as f:
            tmp_fname = f.name
            # Convert the MSOffice document to HTML with tika, capturing stdout in the temp file.
            subprocess.check_call([
                settings.TIKA_PREFIX + 'tika', '--encoding=utf-8',
                '--html', filename
            ], stdout=f)
        return html.parse(tmp_fname, window_width)
    except subprocess.CalledProcessError as err:
        logger.warning(
            'Could not convert MSOffice file "%s" using tika because of %s, '
            'trying unoconv...' % (filename, err))
        try:
            # Fallback: convert to PDF with unoconv and parse that instead.
            subprocess.check_call(
                ['unoconv', '-fpdf', '-o', tmp_fname, filename])
            return pdf.parse(tmp_fname, window_width)
        except subprocess.CalledProcessError as err:
            logger.error(
                'Could not convert MSOffice file "%s" using unoconv because of %s'
                % (filename, err))
            raise PreprocError()
    finally:
        if tmp_fname and os.path.exists(tmp_fname):
            os.remove(tmp_fname)
def parse(self):
    self.timer.start("HTML")
    self.text = lex(self.body)
    self.timer.stop()
    self.nodes = parse(self.text)

    self.timer.start("Parse CSS")
    self.rules = parse_css(DEFAULT_STYLE)
    self.timer.stop()
    self.rules.sort(key=lambda x: x[0].score())

    self.timer.start("JS")
    self.js = dukpy.JSInterpreter()
    self.js_handles = dict()

    # Registration
    self.js.export_function("log", print)
    self.js.export_function("querySelectorAll", self.js_querySelectorAll)

    # Run runtime
    self.js.evaljs(DEFAULT_JS)
    for script in find_scripts(self.nodes, []):
        lhost, lport, lpath, lfragment = parse_url(
            relative_url(script, self.history[-1]))
        header, body = request('GET', lhost, lport, lpath)
        self.js.evaljs(body)
    self.timer.stop()

    self.relayout()
def get_links(url, page, domain=False, noquery=False):
    '''If domain is True, return only links within the same domain as the
    input URL. If noquery is True, return only links that have no query
    component.'''
    try:
        parsed = html.parse(page, HTMLNode)
    except Exception as ex:
        return (None, str(ex))
    if parsed is None:
        return (None, 'HTML Parsing Error')
    linkset = set()
    for link in parsed.get_links():
        parsed = up.urlsplit(link)
        ext = os.path.splitext(parsed.path)[1].lower()
        if (parsed.scheme.lower() in ('http', 'https', 'ftp', '')
                and (parsed.netloc or parsed.path)
                and ext in ('', '.htm', '.html')):
            if noquery and parsed.query:
                continue
            if not parsed.netloc:
                # Relative link: resolve it against the page URL.
                link = up.urljoin(url, link)
                parsed = up.urlsplit(link)
            else:
                if domain:
                    continue
                if not parsed.scheme:
                    link = 'http://' + link
                    parsed = up.urlsplit(link)
            linkset.add(parsed.geturl())
    return (linkset, None)
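# A minimal usage sketch for get_links above. The page string, URL, and the
# behaviour of html.parse/HTMLNode are assumptions taken from the snippet
# itself, not from any specific library.
page_html = "<html><body><a href='/about.html'>About</a></body></html>"  # placeholder page
links, error = get_links("https://example.com/index.html", page_html,
                         domain=True, noquery=True)
if error is not None:
    print("parse failed:", error)
else:
    for link in sorted(links):
        print(link)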
def render_to_image(html_source, css_source, height, width, renderer):
    tree = html.parse(html_source)
    rules = css.parse(css_source)
    styled_tree = style.style(tree, rules)
    layout_tree = [layout.build_layout_tree(node)
                   for node in styled_tree
                   if layout.get_display(node) is not layout.Display.NONE]
    root = layout.Dimensions.default()
    root.content.width = width
    for node in layout_tree:
        node.layout(root)
    renderer = painting.Renderer(width, height, renderer)
    image = renderer.render(layout_tree)
    return image
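# A hedged sketch of calling render_to_image above. The HTML/CSS strings are
# placeholders, and the backend value passed through to painting.Renderer is an
# assumption, since the snippet does not show how that argument is constructed.
html_page = "<html><body><p>Hello</p></body></html>"
stylesheet = "p { color: #333; }"
image = render_to_image(html_page, stylesheet, height=600, width=800,
                        renderer=None)  # backend argument is a placeholder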
def get_metrics_for_project(project_name, data_dir, output):
    if not os.path.isdir(data_dir):
        raise RuntimeError(
            "Could not access understand results in directory: " + data_dir)

    # INTENTIONALLY USING FORWARD SLASH, THIS WORKS FOR WINDOWS
    # DO NOT CHANGE TO OS.PATH STUFF. UNDERSTAND WANTS FORWARD SLASHES
    # EVEN IN WINDOWS. -DJC 2018-06-08
    index_file = data_dir + '/index.html'
    if not os.path.isfile(index_file):
        raise RuntimeError("Understand results not found: " + index_file)

    with open(index_file) as htmlfile:
        logger.info("\tGathering metrics: " + index_file)
        output['Project Name'] = project_name

        # grab <head><script>
        block = html.tostring(html.parse(htmlfile).getroot()[0][0])
        myStr = block.decode()
        metrics = json.loads(myStr.split("metrics=")[1].split(';')[0])

        # Convert from name/value tags to dictionary
        metricsDict = {}
        for m in metrics:
            name = m['name']
            val = m['value']
            # If blank String, don't put it in so it's easier to catch later -djc 2018-03-19
            if name != "Project Name" and val:
                metricsDict[name] = val.replace('%', '')

        # Remove percentage information and only show core or central as appropriate
        core = True
        archType = metricsDict['Architecture Type']
        if archType == 'Hierarchical' or archType == 'Multi-Core':
            core = False
        output['Core'] = core

        for key, value in metricsDict.items():
            output[key] = value

    return output
def tokenize_body(msg, config):
    if msg.is_multipart():
        rv = []
        for m in msg.get_payload():
            rv += tokenize(m, config)
        return rv
    else:
        type = msg.get("content-type", "text/plain")
        if type.startswith("text/"):
            payload = msg.get_payload(decode=True)
            if payload:
                tokens = []
                if type.startswith("text/html"):
                    try:
                        (payload, tags) = html.parse(payload)
                        tags = [(x, y) for (x, y) in tags if x not in banned_attrs]
                        tags = [y and "%s=%s" % (x, y) or x for (x, y) in tags]
                        tokens += mangle("HTML", [x[:251] for x in tags])
                    except Exception, e:
                        # print >> sys.stderr, "crap:", e
                        tokens += ["BUGhtml"]
                    try:
                        payload = html_tag_re.sub("", payload)
                    except:
                        pass
                words = word_re.findall(payload)
                tokens += mangle("BODY", [x for x in words if 3 <= len(x) <= 20])
                if len(words) > 1 and config.double:
                    # Lowercase the whole bigram, not just the format string.
                    tokens += mangle("BODY",
                                     [("%s %s" % (x, y)).lower()
                                      for (x, y) in zip(words[:-1], words[1:])
                                      if 3 <= len(x) <= 20 and 3 <= len(y) <= 20])
                for key, body in config.bodies.iteritems():
                    tokens += body.get_tokens(payload)
                return tokens
import requests
import html

print(html.parse("<body>"))

html = requests.get("https://example.com/")  ## bad: overwrites imported package name
print(html)
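# A sketch of avoiding the shadowing shown above, assuming the same html module
# (with the parse function used in that snippet); the name "response" is arbitrary.
import requests
import html

response = requests.get("https://example.com/")  # bind the HTTP response to its own name
print(html.parse(response.text))  # html still refers to the imported module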
import os
import sys
import subprocess
import json
import eval_php
import html

project_name = sys.argv[1]
parsed_dir1 = sys.argv[2]
parsed_dir2 = sys.argv[3]

pwd = os.getcwd()
print pwd

with open(project_name + "-modified.txt") as f:
    for line in f:
        line = line.strip()  # drop the trailing newline so the paths resolve
        parsed_path1 = pwd + "/" + parsed_dir1 + line
        parsed_path2 = pwd + "/" + parsed_dir2 + line
        print parsed_path1 + " " + parsed_path2
        if os.path.isfile(parsed_path1) and os.path.isfile(parsed_path2):
            root1 = eval_php.parse_php(parsed_path1)
            content1, parser1 = html.parse(root1)
            root2 = eval_php.parse_php(parsed_path2)
            content2, parser2 = html.parse(root2)
            if content1 == content2:
                print "===> " + line
            else:
                print "HTML : " + content1