def save_attachments(self, html, document, prefix, tmpdir):
    """ Place attachments needed by the html of this document into tmpdir.
    Only attachments referenced using the given prefix are saved.
    """
    html = lxml.html.fromstring(html)
    prefix_len = len(prefix)

    # gather up the attachments that occur in the html
    imgs = [
        img for img in html.iter('img')
        if img.get('src', '').startswith(prefix)
    ]
    fnames = set(img.get('src')[prefix_len:] for img in imgs)

    # ensure the media directory exists
    media_dir = os.path.join(tmpdir, prefix)
    os.makedirs(media_dir, exist_ok=True)

    for attachment in document.attachments.all():
        # the src attribute values in fnames are URL-quoted
        if urllib.parse.quote(attachment.filename) in fnames:
            # save the attachment into tmpdir
            fname = os.path.join(media_dir, attachment.filename)
            with open(fname, "wb") as f:
                shutil.copyfileobj(attachment.file, f)

    # make img references absolute
    # see https://github.com/wkhtmltopdf/wkhtmltopdf/issues/2660
    for img in imgs:
        img.set('src', os.path.join(tmpdir, img.get('src')))

    return lxml.html.tostring(html, encoding='unicode')
def save_attachments(self, html, document, prefix, tmpdir):
    """ Place attachments needed by the html of this document into tmpdir.
    Only attachments referenced using the given prefix are saved.
    """
    html = lxml.html.fromstring(html)
    prefix_len = len(prefix)

    # gather up the attachments that occur in the html
    fnames = set(
        img.get('src')[prefix_len:]
        for img in html.iter('img')
        if img.get('src', '').startswith(prefix)
    )

    # ensure the media directory exists
    media_dir = os.path.join(tmpdir, prefix)
    os.makedirs(media_dir, exist_ok=True)

    for attachment in document.attachments.all():
        # the src attribute values in fnames are URL-quoted
        if urllib.parse.quote(attachment.filename) in fnames:
            # save the attachment into tmpdir
            fname = os.path.join(media_dir, attachment.filename)
            with open(fname, "wb") as f:
                shutil.copyfileobj(attachment.file, f)
def make_body_images_inline(body):
    """Look for images inside the body and make them inline.

    Before sending a message in HTML format, it is necessary to find
    all img tags contained in the body in order to rewrite them. For
    example, icons provided by CKeditor are stored on the server
    filesystem and not accessible from the outside. We must embed
    them as parts of the MIME message if we want recipients to
    display them correctly.

    :param body: the HTML body to parse
    """
    html = lxml.html.fromstring(body)
    parts = []
    for tag in html.iter("img"):
        src = tag.get("src")
        if src is None:
            continue
        o = urlparse(src)
        path = urllib.unquote(os.path.join(settings.BASE_DIR, o.path[1:]))
        if not os.path.exists(path):
            continue
        fname = os.path.basename(path)
        cid = "%s@modoboa" % os.path.splitext(fname)[0]
        tag.set("src", "cid:%s" % cid)
        with open(path, "rb") as fp:
            part = MIMEImage(fp.read())
        part["Content-ID"] = "<%s>" % cid
        part.replace_header(
            "Content-Type",
            '%s; name="%s"' % (part["Content-Type"], fname))
        part["Content-Disposition"] = "inline"
        parts.append(part)
    return lxml.html.tostring(html), parts
def sanitize_html(html_file):
    html = lxml.html.fromstring(html_file)
    for element in html.xpath("//script|//style|//meta|//link|//option|//iframe"):
        element.getparent().remove(element)
    return (h for h in html.iter() if h.tag != etree.Comment)
def get_html_attributes(html):
    """ Take an lxml ElementTree; return Counter of attributes; count once. """
    attributes = set()
    for element in html.iter():
        for attribute in element.keys():
            element_attribute = ' '.join([element.tag, attribute])
            attributes.add(element_attribute)
    return Counter(attributes)
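# Minimal usage sketch for get_html_attributes (illustrative only; assumes
# lxml.html and collections.Counter are imported as in the snippet above,
# and the sample markup is hypothetical).
import lxml.html
from collections import Counter

tree = lxml.html.fromstring('<div id="a"><img src="x.png" alt="x"></div>')
print(get_html_attributes(tree))
# expected: Counter({'div id': 1, 'img src': 1, 'img alt': 1})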
def _fix_html(self, value):
    html = lxml.html.fromstring(value)
    for elem in html.iter():
        if elem.text:
            elem.text = self._fix_text(elem.text)
        if elem.tail:
            elem.tail = self._fix_text(elem.tail)
    return lxml.html.tostring(html).decode()
def convert(doc: str):
    output = []
    html = lxml.html.fromstring(doc)
    for element in html.iter():
        res = fsm(element)
        if res:
            output.append(res)
    return '\n'.join(output)
def getLastPage(url):
    html = lxml.html.fromstring(openUrl(url).read())
    for element in html.iter():
        if element.tag == 'a':
            try:
                if element.attrib['class'] == 'last':
                    lastPage = 'http://wtcdata.nist.gov%s' % element.attrib['href']
                    lastPage = lastPage.split('=')[-1]
                    return lastPage
            except KeyError:
                pass
def convert_google_sheet(sid, gid, options):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/htmlembed/sheet?gid={gid}&{options}'
        .format(sid=sid, gid=gid, options=options),
        errhelp={'sid': sid, 'gid': gid}
    )
    for script in html.iter('script'):
        v = script.get('src')
        if v is None:
            #pass
            #script.getparent().remove(script)
            script.text = script.text.replace(
                "CHARTS_EXPORT_URI.push('",
                "CHARTS_EXPORT_URI.push('https://docs.google.com")
        else:
            script.set('src', "https://docs.google.com" + v)
    html.find('head/link').rewrite_links(lambda s: 'https://docs.google.com' + s)
    html.find('head').append(lxml.html.Element(
        'link',
        rel='stylesheet',
        href=url_for('static', filename='metatable.css'),
    ))
    html.find('body').append(lxml.html.Element(
        'script',
        src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"
    ))
    html.find('body').append(lxml.html.Element(
        'script',
        src=url_for('static', filename='metatable.js')
    ))
    script = lxml.html.Element('script')
    script.text = (
        "$(init); "
        "function init() { "
        "$('body').css('overflow', 'hidden'); "
        "var $table = $('#sheets-viewport table').detach(); "
        "var $metatable = create_metatable($table); "
        "$('body').empty().append($metatable); "
        "$metatable.resize(); "
        " }"
        "$('.row-header-wrapper').remove();"
        #"$('td').css('min-width', '100px');"
        "$(window).bind('load', function() {"
        "i=1;"
        "tableWidth=0;"
        "while (true) { idStr = '#0C'+i.toString(); obj = $(idStr); if (obj[0]==undefined) {break;}; wstr=obj[0].style.width.replace('px', ''); tableWidth+=parseInt(wstr); i++; }"
        "tblList = $('table.waffle');"
        "tblList[1].style.width=tableWidth.toString()+'px';"
        "tblList[3].style.width=tableWidth.toString()+'px';"
        "initCharts();"
        "});"
    )
    html.find('body').append(script)
    # with open("output.txt", "w") as text_file:
    #     text_file.write(lxml.html.tostring(html, encoding='utf-8'))
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
def parse_to_raw_body(infilename, rewritten_input, make_transclusions):
    assert not rewritten_input, "no input file rewriting for .html input"
    transclusions = make_transclusions({})
    infile = (open(infilename, 'rb')
              if isinstance(infilename, basestring)
              else infilename)
    try:
        s = infile.read()
    finally:
        infile.close()
    html = parse_html(s)
    lang = next(html.iter()).attrib.get('lang', None)  #pylint: disable=W0612
    handle_data_url = transclusions.add_data_url
    raw_body = parse_body(html.find('body'), handle_data_url=handle_data_url)
    return raw_body, transclusions, []
def convert(mode, url):
    url_translator = URLTranslator(mode, url)

    # get the page and parse it with lxml
    html_str = requests.get(url).text
    html_str = UnicodeDammit(html_str).unicode
    html = lxml.html.document_fromstring(html_str, base_url=url)

    # clean up the page
    for el in html.iter():
        # support the HTML 'base' tag
        if el.tag == 'base' and el.get('href'):
            url_translator.register_base(el.get('href'))

        # remove comments
        if isinstance(el, lxml.html.HtmlComment):
            el.getparent().remove(el)
            continue

        # completely remove bad tags
        if el.tag in ('img', 'link', 'script', 'style', 'meta', 'iframe'):
            el.getparent().remove(el)
            continue

        # remove bad/useless attributes
        # bad = causes extra downloads and/or adds bloat
        # useless = doesn't do anything useful w/o CSS/JS
        # note that we don't remove ids since they can be used as URL anchors
        for attr, value in el.attrib.iteritems():
            if attr.startswith('on') or attr in ('style', 'class'):
                del el.attrib[attr]

        # translate input[type=image] to submit
        if el.tag == 'input' and el.get('type') == 'image':
            el.attrib['type'] = 'submit'
            if el.get('src'):
                del el.attrib['src']
            if el.get('alt'):
                el.attrib['value'] = el.get('alt')
                del el.attrib['alt']

        # TODO: translate <noscript> to regular text

        # translate URL-containing attributes
        for attr in ('src', 'href'):
            if attr in el.attrib and el.attrib[attr]:
                el.attrib[attr] = url_translator(el.attrib[attr])

    return lxml.html.tostring(html, encoding='utf-8')
def google_spreadsheet_data(sid):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/pubhtml?widget=true&range=1:70'
        .format(sid=sid),
        errhelp={'sid': sid})
    title = html.find('head/title').text
    sheets = []
    for script in html.iter('script'):
        if script.text is None:
            continue
        for match in SHEET_PATTERN.finditer(script.text):
            sheets.append(match.groupdict())
        if sheets:
            break
    return title, sheets
def getFullJpg(itemHTML):
    ### Get link to image:
    links = lxml.html.iterlinks(itemHTML)
    for element, attribute, link, pos in links:
        if link.endswith('?g2_imageViewsIndex=1'):
            link = "http://wtcdata.nist.gov%s" % link
            html = lxml.html.fromstring(openUrl(link).read())
            for element in html.iter():
                try:
                    if element.tag == 'img':
                        if element.attrib['id'] == 'IFid1':
                            mediaUrl = element.attrib['src']
                            mediaUrl = link = "http://wtcdata.nist.gov%s" % mediaUrl
                            return mediaUrl
                except KeyError:
                    pass
def get_article(data, url=None, encoding_in=None, encoding_out='unicode',
                debug=False, threshold=5):
    " Input a raw html string, returns a raw html string of the article "
    html = parse(data, encoding_in)
    score_all(html)

    # rank all nodes (largest to smallest)
    ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)

    # minimum threshold
    if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
        return None

    # take common ancestor of the two highest rated nodes
    if len(ranked_nodes) > 1:
        best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
    else:
        best = ranked_nodes[0]

    # clean up
    if not debug:
        keep_threshold = get_score(ranked_nodes[0]) * 3 / 4
        clean_root(best, keep_threshold)

    # check for spammy content (links only)
    wc = count_words(best.text_content())
    wca = count_words(' '.join(
        [x.text_content() for x in best.findall('.//a')]))
    if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
        return None

    # fix urls
    if url:
        best.make_links_absolute(url)

    return lxml.etree.tostring(best if not debug else html,
                               method='html', encoding=encoding_out)
def convert_google_sheet(sid, gid):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/pubhtml/sheet?gid={gid}'
        .format(sid=sid, gid=gid),
        errhelp={'sid': sid, 'gid': gid})
    for script in html.iter('script'):
        script.getparent().remove(script)
    for link in html.find('head').iter('link'):
        link.rewrite_links(lambda s: 'https:' + s if s.startswith('//')
                           else 'https://docs.google.com' + s)
    html.find('head').append(lxml.html.Element(
        'link',
        rel='stylesheet',
        href=url_for('static', filename='metatable.css'),
    ))
    html.find('body').append(lxml.html.Element(
        'script',
        src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"))
    html.find('body').append(lxml.html.Element(
        'script',
        src=url_for('static', filename='metatable.js')))
    script = lxml.html.Element('script')
    script.text = (
        "$(init); "
        "function init() { "
        "$('body').css('overflow', 'hidden'); "
        "var $viewport = $('#sheets-viewport').detach(); "
        "var $table = $viewport.find('table').detach(); "
        "var $svgs = $viewport.find('svg'); "
        "var $metatable = create_metatable($table); "
        "$('body').empty(); "
        "$('body').append($svgs); "
        "$('body').append($metatable); "
        "$viewport.remove(); "
        "$metatable.resize(); "
        " }")
    html.find('body').append(script)
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
def html2plaintext(content):
    """HTML to plain text translation

    :param content: some HTML content
    """
    html = lxml.html.fromstring(content)
    plaintext = ""
    for ch in html.iter():
        p = None
        if ch.text is not None:
            p = ch.text.strip('\r\t\n')
        if ch.tag == "img":
            p = ch.get("alt")
        if p is None:
            continue
        plaintext += p + "\n"
    return plaintext
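# Minimal usage sketch for html2plaintext (illustrative only; assumes lxml is
# installed and html2plaintext from the snippet above is in scope; the sample
# markup is hypothetical).
sample = "<div>Hello<p>world</p><img alt='logo'></div>"
print(html2plaintext(sample))
# expected output:
# Hello
# world
# logo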
def find_images_in_body(body):
    """Look for images inside an HTML body.

    Before sending a message in HTML format, it is necessary to find
    all img tags contained in the body in order to rewrite them. For
    example, icons provided by CKeditor are stored on the server
    filesystem and not accessible from the outside. We must embed
    them as parts of the MIME message if we want recipients to
    display them correctly.

    :param body: the HTML body to parse
    """
    from email.mime.image import MIMEImage
    from urlparse import urlparse

    html = lxml.html.fromstring(body)
    parts = []
    for tag in html.iter("img"):
        src = tag.get("src")
        if src is None:
            continue
        o = urlparse(src)
        fname = os.path.basename(o.path)
        path = os.path.join(settings.MEDIA_ROOT, "webmail", fname)
        if not os.path.exists(path):
            continue
        cid = "%s@modoboa" % os.path.splitext(fname)[0]
        tag.set("src", "cid:%s" % cid)
        fp = open(path, "rb")
        p = MIMEImage(fp.read())
        fp.close()
        p["Content-ID"] = "<%s>" % cid
        ct = p["Content-Type"]
        p.replace_header("Content-Type",
                         '%s; name="%s"' % (ct, os.path.basename(fname)))
        p["Content-Disposition"] = "inline"
        parts.append(p)
    return lxml.html.tostring(html), parts
# And dump their timezones to ecfzones.json
import collections
import json
import lxml.html
import re
import urllib2

from ecftimezone import ECFTimezone

PACERLINKS = 'http://www.pacer.gov/psco/cgi-bin/links.pl'

courts = set()
html = lxml.html.parse(urllib2.urlopen(PACERLINKS))
for e in html.iter('a'):
    link = e.get('href')
    if not link:
        continue
    match = re.match(r'^https?://ecf\.([^.]+)\.uscourts.gov', link)
    if match:
        courts.add(match.group(1))

f = open('ecfdomains.txt', 'w')
for c in sorted(courts):
    f.write(c + "\n")
f.close()

e = ECFTimezone()
d = {c: e.timezone(c) for c in courts}
od = collections.OrderedDict(sorted(d.items()))
combined_name = os.path.join(os.path.dirname(directory_name), 'dealeron.json')
dealers = []
for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if not '.dlron.' in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
            for meta in html.iter('meta'):
                name = meta.get('name')
                content = meta.get('content')
                if name is not None:
                    if name == 'geo.position':
                        lat, lng = content.split(',')
                        data['geo']['latitude'] = lat
                        data['geo']['longitude'] = lng
                    elif name == 'geo.placement':
                        data['address']['addressLocality'] = content
                    elif name == 'geo.region':
                        data['address']['addressRegion'] = content
            for div in html.find_class('hours-page'):
                for span in div.iter('span'):
                    itemprop = span.get('itemprop')
                    content = span.text_content()
        full_url = urljoin(url, i['src'])
        print("image URL: " + full_url)
        y = urllib.request.urlopen(full_url)
        t = t + 1
        with open('downloaded_images/' + str(t) + '.png', "wb") as code:
            code.write(y.read())
        counter += 1
    print('finished')
    return 0


print('Введите путь')  # "Enter the path"
url = input()
parse(url)
response = requests.get(url)
html = html.fromstring(response.text)
f = open('links.txt', 'w')
i = 0
for a in html.iter("a"):
    if (i < 10):
        link = urljoin(url, a.get("href"))
        print(link)
        parse(link)
        f.write(link)
        f.write('\n')
        i += 1
f.close()
input()
                        data['address']['addressRegion'] = state
                        data['address']['postalCode'] = zip_code
                        break
                break
            for anchor in html.find_class('get-directions-event-target'):
                href = anchor.get('href')
                if href:
                    text = href[href.index('?q=')+3:]
                    if len(text) >= 3:
                        lat, lng = get_lat_lng(text)
                        if len(lat) > 0 and len(lng) > 0:
                            data['geo'] = {'latitude': lat, 'longitude': lng}
                            break
                break
            if len(data['geo']) == 0:
                for iframe in html.iter('iframe'):
                    src = iframe.get('src')
                    uri = urlparse(src)
                    if uri.hostname == 'maps.google.com':
                        src = src.replace('&amp;', '&')
                        val = parse_qs(uri.query)
                        if 'll' in val:
                            lat, lng = get_lat_lng(val['ll'][0])
                            if len(lat) > 0 and len(lng) > 0:
                                data['geo'] = {'latitude': lat, 'longitude': lng}
                                break
            dealers.append(data)
        except ValueError, e:
            pass

with open(combined_name, 'wb') as fd:
def main():
    # TODO: combine command-line and option file.
    # TODO: option to generate a default configuration file
    parser = argparse.ArgumentParser()
    # TODO: doc
    parser.add_argument("-s", "--standalone", action="store_true")
    # TODO: doc
    args = parser.parse_args()
    standalone = args.standalone

    conf = json.load((DATA / "artdoc.js").open())
    if Path("artdoc.js").exists():
        user_conf = json.load(Path("artdoc.js").open())
        conf.update(user_conf)

    info("Document:")
    doc_patterns = conf["doc"]
    if isinstance(doc_patterns, basestring):
        doc_patterns = [doc_patterns]
    docs = []
    for pattern in doc_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
            subinfo(str(match))
        docs.extend(matches)
    if not docs:
        sys.exit("error: no document found")

    # info("HTML template:")
    # template_file = HTML / "index.html"
    # subinfo(str(template_file))
    # template = template_file.open().read().encode("utf-8")

    info("Bibliography:")
    bib_patterns = conf["bib"]
    if isinstance(bib_patterns, basestring):
        bib_patterns = [bib_patterns]
    bibs = []
    for pattern in bib_patterns:
        matches = list(WORKDIR.glob(pattern))
        #subinfo("matching {!r}:".format(pattern))
        for match in matches:
            subinfo(str(match))
        bibs.extend(matches)
    if not bibs:
        print()

    info("JS:")
    cmd = coffee["-c", str(JS / "main.coffee")]
    subinfo(cmd)
    cmd()

    info("CSS:")
    cmd = stylus[str(CSS / "style.styl")]
    subinfo(str(cmd))
    cmd()

    # TODO: copy only what is required.
    shutil.copytree(str(DATA), str(ARTDOC))

    for doc in docs:
        pass

    info("PANDOC: generate JSON file")
    args = ["-t", "json", "--smart"]
    for bib in bibs:
        args.extend(["--bibliography", str(bib)])
    args.append(str(doc))
    cmd = pandoc[args]
    subinfo(cmd, "> json")
    json_str = cmd()

    info("Convert raw TeX to raw HTML")
    cmd = local[str(BIN / "rawHTML.hs")]
    subinfo(cmd, "< json > json")
    json_str = (cmd << json_str)()

    # info("Flag/Box Proofs")
    # cmd = local[str(BIN / "proof.hs")]
    # subinfo(cmd, "< json > json")
    # try:
    #     json_str = (cmd << json_str)()
    # except Exception as error:
    #     print(repr(error))

    # info("Wrap Section-Like Sequence of Blocks")
    # cmd = local[str(BIN / "div.hs")]
    # subinfo(cmd, "< json > json")
    # try:
    #     json_str = (cmd << json_str)()
    # except Exception as error:
    #     print(repr(error))

    info("Wrap Section-Like Sequence of Blocks")
    cmd = local[str(BIN / "section.hs")]
    subinfo(cmd, "< json > json")
    try:
        json_str = (cmd << json_str)()
    except Exception as error:
        print(repr(error))

    info("Flag Tombstones (end of proofs)")
    cmd = local[str(BIN / "tombstone.hs")]
    subinfo(cmd, "< json > json")
    try:
        json_str = (cmd << json_str)()
    except Exception as error:
        print(repr(error))

    info("Convert Images to SVG Images")
    cmd = local[str(BIN / "svg.hs")]
    subinfo(cmd, "< json > json")
    json_str = (cmd << json_str)()

    info("Generate HTML body from markdown")
    args = ["--email-obfuscation", "none", "-f", "json",
            "--mathjax", "-t", "html5", "--section-divs"]
    cmd = pandoc[args]
    subinfo(cmd, "< json > body")
    pandoc_body_str = (cmd << json_str)()
    pandoc_html = lxml.html.document_fromstring(pandoc_body_str)
    pandoc_body = pandoc_html.cssselect("body")[0]

    info("Generate standalone HTML doc")
    html = HTML.html(HTML.head, HTML.body)
    body = html.cssselect("body")[0]
    head = html.cssselect("head")[0]
    head.append(HTML.meta(charset="utf-8"))
    body.attrib.update(pandoc_body.attrib)
    body.extend(pandoc_body[:])

    # ----------------------------------------------------------------------
    info("Add JQuery")
    head.extend(jquery(standalone=standalone))
    # ----------------------------------------------------------------------
info("Add Velocity") head.extend(velocity(standalone=standalone)) # ---------------------------------------------------------------------- info("Add Clipboard.js") head.extend(clipboard(standalone=standalone)) # ---------------------------------------------------------------------- info("Add Highlight.js") head.extend(highlight(standalone=standalone)) # ---------------------------------------------------------------------- info("Add Google Fonts support") head.extend( google_fonts(["Alegreya", "Alegreya SC"], standalone=standalone)) # ---------------------------------------------------------------------- info("Add Mathjax support") head.extend(mathjax(standalone=standalone)) # ---------------------------------------------------------------------- info("Add Font Awesome support") head.extend(font_awesome(standalone=standalone)) # ---------------------------------------------------------------------- info("Add artdoc css & js files") head.extend(artdoc()) # ---------------------------------------------------------------------- info("Setting language to english (required for hyphens)") html.set("lang", "en") # ---------------------------------------------------------------------- info("Ensure ids uniqueness") id_count = {} for elt in html.iter(): _id = elt.get("id") if _id is not None: count = id_count.get(_id, 0) if count > 0: elt.set("id", _id + "-" + str(count)) id_count[_id] = count + 1 # ---------------------------------------------------------------------- info("Turning headers into self-links") sections = html.cssselect("section") for section in sections: id_ = section.get("id") heading = None if len(section): first = section[0] if first.tag in "h1 h2 h3 h4 h5 h6".split(): heading = first if id_ and heading is not None: contents = [heading.text or ""] + heading[:] heading.text, heading[:] = None, [] href = {"href": "#" + id_} link = HTML.a(href, *contents) heading.insert(0, link) # ---------------------------------------------------------------------- # TODO: deal with metadata & insert a document header with: # - title, # - date (format: Month Day, Year), autoformat, autogen ? # - author(s) (with mail & affiliation when available ?). # Assume custom metadata or parse the author field ? # Representation of multiple authors ? MMm eLIFEsciences use # popup for author info. Ex: http://elifesciences.org/content/4/e06356 ! # here, use hints from http://dtd.nlm.nih.gov/book/tag-library/: # # - name (don't be more precise) # - affiliation (concatenate) # - address ??? # - email --> Font Awesome Icon # - url / uri ? # - form of ID ? (like HAL ? or ZBlatt ?) # TODO: look at the rendering of # http://kieranhealy.org/blog/archives/2014/01/23/plain-text/: # - small grey date on top, bold title, bold author name, # italics affiliation, repeat. 
    metadata = get_metadata(str(doc))
    items = []

    date = parse_html(metadata.get("date"))
    if date is not None:
        items.append(HTML.p({"class": "date"}, *date))

    # def textify(item):
    #     if isinstance(item, basestring):
    #         return item
    #     elif hasattr(item, "text"):
    #         return item.text
    #     else:
    #         return "".join([textify(it) or "" for it in item])

    title = parse_html(metadata.get("title"))
    title_id = None
    if title is not None:
        #title_id = textify(title).lower().replace(" ", "-")
        items.append(
            HTML.h1({"class": "title"}, HTML.a({"href": "#"}, *title)))
        head.insert(0, HTML.title(*title))

    authors = metadata.get("author") or []
    for author in authors:
        if isinstance(author, basestring):
            name = parse_html(author)
            email = None
            affiliation = None
        else:
            name = parse_html(author.get("name"))
            email = parse_html(author.get("email"))
            affiliation = parse_html(author.get("affiliation"))
        if name is not None:
            if email is not None:
                name = [HTML.a({"href": "mailto:" + email[0]}, *name)]
            name = HTML.p({"class": "author"}, *name)
            items.append(name)
        if affiliation is not None:
            affiliation = HTML.p({"class": "affiliation"}, *affiliation)
            items.append(affiliation)

    header_attr = {"class": "main"}
    # if title_id is not None:
    #     header_attr["id"] = title_id
    header = HTML.header(header_attr, *items)
    # print("HEADER", lxml.html.tostring(header))
    body.insert(0, header)
    # print("BODY", lxml.html.tostring(body))
    # print("HTML", lxml.html.tostring(html))

    # ----------------------------------------------------------------------
    info("Generate the standalone HTML file")
    html_str = lxml.html.tostring(html, encoding="utf-8",
                                  doctype="<!DOCTYPE html>")
    doc.with_suffix(".html").open("wb").write(html_str)

    sys.exit(0)
def get_event(self):
    d = {}
    d['id'] = id_from_path(self.url)
    d['date'] = date_from_id(d['id'])
    d['datetime'] = date_from_id(d['id'])
    d['url'] = self.url
    html = lxml.html.fromstring(self.document)
    for div in html.iter('div'):
        if div.get('id') == 'bodyContent':
            break
    tags = [t for t in div
            if not callable(t.tag)
            and not t.get('id')
            and 'footer' not in t.get('class', '')]
    parts = [t.text_content().strip().replace('\n', ' ') for t in tags]
    description = '\n'.join(parts)
    summary = description.split('\n', 1)[0]
    self.div = div
    if not summary:
        return None
    d['summary'] = summary
    d['description'] = description
    for n, p in enumerate(parts):
        match = re.search(r'\b(\d\d?)h(\d\d)?\b', p)
        if match:
            d['hour'] = time(int(match.group(1)), int(match.group(2) or '0'))
            d['datetime'] = combine(d['date'], d['hour'])
            parts[n] = p[:match.start(0)] + p[match.end(0):]
            break
    for n, p in enumerate(parts):
        match = re.search(ur'\b(\d+([,.]\d+)?)\s*(euros\b|euro\b|€)', p)
        if match:
            d['price'] = float(match.group(1).replace(',', '.'))
            parts[n] = p[:match.start(0)] + p[match.end(0):]
            break
    address = []
    for n, p in enumerate(parts):
        match = re.search(r'\d+[\s,]+(rue|boulevard|avenue)\s+.+', p, re.I)
        if match:
            address.append(match.group(0))
            p = parts[n] = p[:match.start(0)] + p[match.end(0):]
        match = re.search(r'\b(75|92|93|94|78|77|95|91)\d\d\d\b.*', p)
        if match:
            address.append(match.group(0))
            p = parts[n] = p[:match.start(0)] + p[match.end(0):]
        match = re.search(r'\b(m.tro|rer)\b.*', p, re.I)
        if match:
            address.append(match.group(0))
            p = parts[n] = p[:match.start(0)] + p[match.end(0):]
        match = re.search(r'@\s+\w+(\s+[^.]+.*)?', p)  # refuse '@foo' or '@ foo . plop'
        if match:
            address.append(match.group(0))
            p = parts[n] = p[:match.start(0)] + p[match.end(0):]
    if address:
        d['address'] = ' '.join(address)
    return d
def get_html_elements(html):
    """ Take an lxml ElementTree; return Counter of elements; count once. """
    return Counter({element.tag for element in html.iter()})
                    break
            for geo in vcard.find_class('geo'):
                data['geo'] = {}
                for lat_el in geo.find_class('latitude'):
                    for value in lat_el.find_class('value-title'):
                        data['geo']['latitude'] = value.get('title')
                for lng_el in geo.find_class('longitude'):
                    for value in lng_el.find_class('value-title'):
                        data['geo']['longitude'] = value.get('title')
                break
            for maps in html.find_class('google-map'):
                latlng = maps.get('data-markers-list')
                if latlng:
                    lat, lng = get_lat_lng(latlng)
                    data['geo'] = {'latitude': lat, 'longitude': lng}
            for link in html.iter('link'):
                if link.get('rel') == 'publisher':
                    data['google_plus'] = link.get('href')
            # for meta in html.iter('meta'):
            #     name = meta.get('name')
            #     if name == 'geo.position' or name == 'ICBM':
            #         latlng = meta.get('content')
            #         if latlng:
            #             lat, lng = get_lat_lng(latlng)
            #             data['geo'] = {'latitude': lat, 'longitude': lng }
            dealers.append(data)
        except ValueError, e:
            pass

with open(combined_name, 'wb') as fd:
    json.dump(list(dealers), fd)
def parse_news_articles(php_directory, download_directory, file_name, query):
    # Note: Assumes that path is stored as <query>.php/
    inpath = php_directory + file_name + "/"
    file_list = [f for f in listdir(inpath) if isfile(join(inpath, f))]

    # For each file, get the article Titles and URLs
    for file in file_list:
        # Clear out any variables from last file
        articleURL = articleTitle = articleSource = summaryText = keywords = score = code = ""
        try:
            intext = open(inpath + file, 'r').read()
            html = etree.HTML(intext)
        except lxml.etree.XMLSyntaxError:
            print "ERROR: XMLSyntaxError when reading " + inpath + file
            break

        for element in html.iter():
            if (element.tag == "p" and element.text == "News Result"):
                # Do nothing
                pass
            elif (element.tag == "a"):
                articleURL = element.attrib["href"]
                articleTitle = element.text
            elif (element.tag == "br"):
                if (element.tail != None):
                    summaryText = element.tail
            elif (element.tag == "strong"):
                if (element.tail != "\n"):
                    articleSource = element.tail
            elif (element.tag == "p"):
                # Check to see if article already exists using URL. If it exists, don't do anything
                if (articles.find_one({"url": articleURL}) is not None):
                    print "INFO: Duplicate article found"
                else:
                    print "Processing: " + articleURL
                    # For each URL, assign its md5 as a unique identifier
                    #code = base64.urlsafe_b64encode(os.urandom(18))
                    m = hashlib.md5()
                    m.update(articleURL)
                    code = m.hexdigest()
                    first_level = code[0:2]
                    second_level = code[2:4]

                    # This code also becomes the filename for the full file path
                    #articleFileDirectory = php_directory + file + "--news/"
                    articleFileDirectory = download_directory + first_level + "/" + second_level + "/"
                    articleFilePath = articleFileDirectory + code

                    # Download full article and use full-text (if available) for keyword extraction
                    fullArticleText = download_article_file(articleURL, articleFileDirectory, code)
                    if (fullArticleText is not None):
                        keyword_set = textrank(fullArticleText)
                        #articleFeatures = get_article_features(fullArticleText, articleURL)
                        articleFeatures = None
                        guessed_date = guess_date(fullArticleText)
                    else:
                        keyword_set = textrank(summaryText)
                        #articleFeatures = get_article_features(summaryText, articleURL)
                        articleFeatures = None
                        guessed_date = guess_date(summaryText)
                    keywords = list(keyword_set)

                    processed_date = datetime.now().strftime("%Y-%m-%d")
                    if (guessed_date is not None):
                        publish_date = guessed_date
                    else:
                        publish_date = processed_date

                    article = [{
                        "q": query,
                        "c": code,
                        "f": articleFeatures,
                        "pubd": publish_date,
                        "procd": processed_date,
                        "url": articleURL,
                        "t": articleTitle,
                        "abs": summaryText,
                        "sr": articleSource,
                        "k": keywords,
                        "fp": articleFilePath,
                        "m": None
                    }]

                    # Write article to MongoDB collection
                    try:
                        article_id = articles.insert(article)
                    except MongoException.DuplicateKey:
                        print "Duplicate key: " + code
                    #print "Inserted into articles: " + articleTitle.encode('ascii', 'ignore')

                    if (fullArticleText is None):
                        fullArticleText = summaryText

                    # Insert into ElasticSearch
                    json_str = mk_es_json(code, fullArticleText, articleURL, articleTitle, summaryText, publish_date)
                    #print json_str
                    index = 'article'
                    index_type = 'text'
                    es_url = 'http://localhost:9200'
                    r = post_to_elastic_search(es_url, index, index_type, code, json_str)
                    print r
def extract_content(html):
    topnodes = {}
    for tag in ('p', 'li', 'dd', 'dt'):
        for p in html.iter(tag):
            parent, val = p.getparent(), valuate(p)
            topnodes[parent] = sumval(topnodes.get(parent, (0, 0, 0)), val)
    for p in html.iter('img'):
        l = categorise(p)
        if l > 0:
            parent = p.getparent()
            topnodes[parent] = sumval(topnodes.get(parent, (0, 0, 0)), (l, 1, 1))
    toplist = list(map(lambda x: (x[0], getval(x[1])), topnodes.items()))
    if not toplist:
        return []
    toplist.sort(key=lambda x: x[1], reverse=True)
    if toplist[0][0].tag in ('dl', 'ol', 'ul'):
        weighing = 4
    else:
        weighing = 2
    paths, article = {}, None
    for top, l in filter(lambda x: weighing*x[1] >= toplist[0][1], toplist):
        node, nesting = top.getparent(), 2
        while node is not None:
            if node.tag == 'article':
                article, artnesting = node, nesting
            info = paths.get(node, (0, 0))
            paths[node] = (info[0] + 1, max(info[1], nesting))
            node, nesting = node.getparent(), nesting + 1
    pathlist = list(paths.items())
    pathlist.sort(key=lambda x: x[1])
    maxp = pathlist[-1][1][0]
    if (article != None) and (10*valuate(top)[2] >= 8*valuate(article)[2]):
        top, nesting = article, artnesting
    else:
        if maxp > 1:
            pathlist = list(filter(lambda x: x[1][0] >= (maxp + 1) // 2, pathlist))
            top, info = pathlist[0]
            pathnr, nesting = info
            if info[0] == maxp // 2:
                for top, info in pathlist[1:]:
                    if info[0] != pathnr:
                        pathnr, nesting = info
                        break
        else:
            nesting = 1
    highesthdr, content, visited = 7, [], {}
    for p in top.iter():
        if p == top:
            if p.tag in ('dl', 'ol', 'ul'):
                p.tail = ''
                content.append(p)
                break
            else:
                continue
        if categorise(p) <= 0:
            continue
        if p.tag == 'img':
            parent = p.getparent()
            if parent != top:
                p = parent
        if p.tag.startswith('h'):
            towrite = True
            highesthdr = min(highesthdr, int(p.tag[1]))
        else:
            towrite = False
        encl, parent, i = p, p.getparent(), nesting
        while parent is not None and parent is not top:
            encl, parent = parent, parent.getparent()
            i -= 1
        if not towrite:
            towrite = i > 0
        if towrite:
            if not visited.get(encl):
                for elem in encl.iter():
                    visited[elem] = True
                encl.tail = ''
                content.append(encl)
    remove_after(top)
    if top.getparent() is not None:
        parent = top.getparent()
        parent.remove(top)
    else:
        parent = None
    lowesthdr, headers = None, []
    for i in range(1, highesthdr):
        elem = None
        for elem in html.iter('h%d' % (i,)):
            pass
        if elem is not None:
            elem.tail = ''
            headers.append(elem)
            remove_before(elem)
            elem.getparent().remove(elem)
            lowesthdr = i
            break
    if lowesthdr:
        for elem in html.iter():
            if elem.tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
                elem.tail = ''
                headers.append(elem)
                elem.getparent().remove(elem)
    if parent is not None:
        for elem in parent:
            if type(elem.tag) == type(''):
                elem.tail = ''
                headers.append(elem)
    headers.extend(content)
    return list(map(clean_imgs, map(html_cleaner.clean_html, headers)))