def html2text_all(the_url, encoding=None):
    baseurl = ''
    file_ = the_url
    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        req_headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                       'Referer': 'http://mwetoolkit.sf.net'}
        request = urllib2.Request(baseurl, None, req_headers)
        try:
            response = urllib2.urlopen(request, timeout=30)
        except urllib2.URLError, e:
            print >> sys.stderr, "HTTP Error retrieving " + baseurl
            print e
            #sys.exit( -1 )
            return None
        text = response.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            #pdb.set_trace()
            encoding = enc(response.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
        data = text.decode(encoding, 'ignore')
def main(args1):
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=78, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=36, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # handle options
    if options.ul_style_dash:
        options.ul_item_mark = '-'
    else:
        options.ul_item_mark = '*'

    BODY_WIDTH = options.body_width
    GOOGLE_LIST_INDENT = options.list_indent

    # process input
    if len(args1) > 0:
        file_ = "t1.txt"
        encoding = None
        if len(args1) == 2:
            encoding = args1[1]
        if len(args1) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, text)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
            data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    wrapwrite(html2text(data, baseurl))
def exec_main():
    URL = ''
    arg2 = "0"
    # if a URL is passed as cmdline argument
    if sys.argv[1:]:
        arg1 = sys.argv[1]
        if sys.argv[2:]:
            arg2 = sys.argv[2]
        # if URL starts with http or https
        # use urllib to fetch HTML code
        if arg1.startswith('http://') or arg1.startswith('https://'):
            URL = arg1
            obj = urllib.urlopen(URL)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = obj.read()
            encoding = enc(obj.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            html_data = text.decode(encoding)
        else:
            # use local html file
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            html_data = open(arg1, 'r').read().decode(encoding)
    else:
        # if no arg is passed
        html_data = "Usage: python textify.py (URL|file.html)"

    if arg2 == "1":  # '==' rather than 'is': identity comparison of strings is unreliable
        writeToFile(wrapText(textify_html(html_data, None, URL)))
    else:
        writeToStdout(wrapText(textify_html(html_data, None, URL)))
def getURL(url):
    # contextlib.closing is assumed here; the original wrapped urlopen() in an
    # undefined close(). The enc() fallback mirrors the other snippets.
    from contextlib import closing
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    try:
        with closing(urlopen(url)) as j:
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
    except Exception, e:
        # only works if the body was read before the failure occurred
        data = text.decode('latin-1')
def url2text(url):
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    data = html2text_file(text.decode(encoding))
    print type(data)
def convert_http_to_text_nltk(http):
    """Convert http to text using nltk"""
    import urllib2
    req = urllib2.Request(http, headers={'User-Agent': "Magic Browser"})
    j = urllib2.urlopen(req)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    data = text.decode(encoding)
    output = nltk.clean_html(data)
    return normalize_ligatures(output.decode("utf-8"))  # return unicode
def totext(baseurl):
    j = urllib2.urlopen(baseurl)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    try:
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        data = CL.clean_html(text.decode(encoding, 'replace'))
    except:
        data = CL.clean_html("".join([x for x in text if ord(x) <= 128]))
    m = ht.html2text(data, baseurl)
    return "".join([x for x in m if ord(x) < 128])
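# Illustrative sketch, not taken from any snippet above: the fetch/decode
# pattern these helpers share, distilled into one function (Python 2 and the
# legacy feedparser._getCharacterEncoding API assumed; the helper name
# fetch_as_markdown is made up for the example).
import urllib2
import html2text

def fetch_as_markdown(url):
    response = urllib2.urlopen(url)
    raw = response.read()
    try:
        # old feedparser exposed a private charset sniffer; fall back to utf-8
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda headers, body: ('utf-8', 1)
    encoding = enc(response.headers, raw)[0]
    if encoding == 'us-ascii':  # the HTTP default that usually really means utf-8
        encoding = 'utf-8'
    return html2text.html2text(raw.decode(encoding, 'replace'), url)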
def alchemyentityextraction(self, options, text=None, url=None):
    """ """
    import urllib2, simplejson, json
    from urllib import urlencode
    if text:
        BASE_URL = 'http://access.alchemyapi.com/calls/text/TextGetRankedNamedEntities'
    if url:
        BASE_URL = 'http://access.alchemyapi.com/calls/text/HTMLGetRankedNamedEntities'

    ### CHANGE THIS TO YOUR OWN ALCHEMYAPI KEY ###
    API_KEY = 'PUT YOUR API KEY HERE'

    post_parameters = {
        'apikey': API_KEY,
        'outputMode': 'json',
        'coreference': '1',
        'disambiguate': '1',
        'sentiment': '1'
    }
    if text:
        post_parameters['text'] = text
    if url:
        baseurl = url
        j = urllib2.urlopen(baseurl)
        text = j.read()
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        post_parameters['html'] = text.decode(encoding)
        post_parameters['url'] = url

    response = simplejson.load(urllib2.urlopen(urllib2.Request(BASE_URL, data=urlencode(post_parameters))))
    #print simplejson.dumps(response, sort_keys=True, indent=3)
    return response
def doConvert(url):
    # load the page
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'cp1251'
    data = text.decode(encoding)

    # convert the html document to markdown
    originalMarkdownDocument = html2text.html2text(data, url)
    markdownDocument = originalMarkdownDocument.split("\n")

    # find the upper boundary of the article
    title = lxml.html.document_fromstring(text)
    startLine = findStartMerker(title.find(".//title").text, markdownDocument)

    # drop everything above the upper boundary
    del markdownDocument[:startLine]

    # find the lower boundary of the article
    skiplist = []
    endLine = findEndMarker(markdownDocument, skiplist)

    # remove the lines collected in skiplist
    for x in range(len(skiplist) - 1, 0, -1):
        markdownDocument.pop(skiplist[x])

    # cut the article off at the lower boundary
    if endLine != -1:
        del markdownDocument[endLine - len(skiplist) + 1:]
    else:
        return

    # rewrite references as inline links
    fragment = listToString(markdownDocument)
    fragment = replaceInternalLinks(originalMarkdownDocument, fragment)

    global htmlOut
    if htmlOut == 1:
        # convert the markdown back to html
        html = markdown.markdown(fragment)
        print html.encode('utf-8')
    else:
        print fragment.encode('utf-8')
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + __version__)
    args = p.parse_args()[1]
    if len(args) > 0:
        file_ = args[0]
        encoding = None
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, text)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
            data = text.decode(encoding)
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
            data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    wrapwrite(html2text(data, baseurl))
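# Illustrative sketch, not from any snippet above: the chardet-based decoding
# of a local HTML file that several of these main() variants perform, as a
# standalone helper. read_html_file is an assumed name for the example.
def read_html_file(path, encoding=None):
    data = open(path, 'rb').read()
    if encoding is None:
        try:
            from chardet import detect
        except ImportError:
            # without chardet installed, assume utf-8 like the snippets above
            detect = lambda raw: {'encoding': 'utf-8'}
        encoding = detect(data)['encoding'] or 'utf-8'
    return data.decode(encoding, 'replace')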
def html2text(url):
    baseurl = url
    j = urllib.urlopen(baseurl)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    encoding_t = enc(j.headers, text)[0]
    if encoding_t == 'us-ascii':
        encoding_t = 'utf-8'
    data = text.decode(encoding_t)
    text = optwrap(html2text_file(data, None, baseurl))
    text = text.encode('utf-8')
    text = text.rsplit('\n')
    final_text = []
    for l in text:
        if not re.match(r'[#*!<]', l):
            if len(l) > 1:
                if re.match(r'\w+|\[\w+', l):
                    final_text.append(l)
    return '\n'.join(final_text)
def getText(self):
    outputEnc = "euc-kr"
    j = urllib.urlopen(self.url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    self.encoding = enc(j.headers, text)[0]
    print "========= ENCODE", self.encoding
    if self.encoding == 'us-ascii':
        self.encoding = 'utf-8'
    self.html = text.decode(self.encoding)
    print self.html.encode(outputEnc, 'ignore')

    self.html = self.html.replace("<br>", "")
    h2t = _html2text(None)
    h2t.feed(self.html)
    h2t.feed("")
    h2t.close()
    self.text = optwrap(h2t.outtext)
    self.text = h2t.outtext.encode(outputEnc, 'ignore')
    self.links = h2t.linkList
    return self.text
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    try:
        data = data.decode(encoding)
    except UnicodeDecodeError:
        pass

    try:
        data = data.decode('utf-8')
    except UnicodeDecodeError:
        # OS X percent-encodes any bytes that aren't valid utf-8
        s = ''
        g = ''
        l = 0
        for c in data:
            o = ord(c)
            if l and o < 128 or o >= 192:
                # we want a continuation byte, but didn't get one
                s += ''.join(["%%%02X" % ord(x) for x in g])
                g = ''
                l = 0
            if l == 0 and o < 128:
                # ascii
                s += c
            elif l == 0 and 194 <= o < 245:
                # valid leading bytes
                if o < 224:
                    l = 1
                elif o < 240:
                    l = 2
                else:
                    l = 3
                g = c
            elif l > 0 and 128 <= o < 192:
                # valid continuations
                g += c
                l -= 1
                if not l:
                    s += g
                    g = ''
            else:
                # invalid
                s += "%%%02X" % o
        # any remaining partial characters
        s += ''.join(["%%%02X" % ord(x) for x in g])
        data = s.decode('utf-8')

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
def main():
    baseurl = ''

    class bcolors:  # pragma: no cover
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + ".".join(map(str, __version__)))
    p.add_option("--no-wrap-links", dest="wrap_links", action="store_false",
                 default=config.WRAP_LINKS, help="wrap links during conversion")
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links", action="store_false",
                 default=config.INLINE_LINKS, help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB, help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false", dest="use_automatic_links",
                 default=config.USE_AUTOMATIC_LINKS, help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false", dest="skip_internal_links",
                 default=config.SKIP_INTERNAL_LINKS, help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true", dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]")
    p.add_option("--decode-errors", dest="decode_errors", action="store", type="string",
                 default=config.DECODE_ERRORS,
                 help="What to do in case of decode errors. 'ignore', 'strict' and 'replace' are acceptable values")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]
        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                          DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError as err:
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
            print(warning)
            raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links

    wrapwrite(h.handle(data))
# Fragment of an html2text main() that fetches via httplib2; the enclosing
# "if len(args) > 0:" and option-parsing context are not part of this snippet.
if len(args) == 2:
    encoding = args[1]
if len(args) > 2:
    p.error('Too many arguments')

if file_.startswith('http://') or file_.startswith('https://'):
    baseurl = file_
    resp, text = httplib2.Http().request(baseurl, headers={'User-Agent': 'textisbeautiful.net/1.0'})
    #text = j.read()
    if encoding is None:
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        print resp
        encoding = enc(resp, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
    data = text.decode(encoding)
else:
    data = open(file_, 'rb').read()
    if encoding is None:
        try:
            from chardet import detect
        except ImportError:
            detect = lambda x: {'encoding': 'utf-8'}
        encoding = detect(data)['encoding']
    data = data.decode(encoding)
# in the full main(), a trailing else branch reads from stdin:
#     data = sys.stdin.read()
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0 and args[0] != '-':
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break

    wrapwrite(h.handle(data))
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, data)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = "-"
    if options.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + ".".join(map(str, __version__)))
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links", action="store_false",
                 default=config.INLINE_LINKS, help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB, help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false", dest="use_automatic_links",
                 default=config.USE_AUTOMATIC_LINKS, help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false", dest="skip_internal_links",
                 default=config.SKIP_INTERNAL_LINKS, help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true", dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0 and args[0] != "-":  # pragma: no cover
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, data)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
    else:
        data = wrap_read()

    if hasattr(data, "decode"):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = "-"
    if options.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            import urllib2
            from gzip import GzipFile
            from urlparse import urlparse
            from StringIO import StringIO

            baseurl = file_
            urls = urlparse(baseurl)
            j = urllib2.urlopen(urllib2.Request(baseurl, headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0',
                'Accept-Encoding': 'gzip, deflate',
                'Referer': 'http://%s' % (urls.hostname)
            }), timeout=30)
            if (j.code == 200):
                content_encoding = j.headers.get('Content-Encoding')
                if content_encoding == 'gzip':
                    data = GzipFile(fileobj=StringIO(j.read()), mode='r').read()
                elif content_encoding == 'deflate':
                    # relies on a deflate() helper that is not defined in this snippet
                    data = StringIO(deflate(j.read())).getvalue()
                else:
                    data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding, 'ignore')

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    # Running the HTML through BeautifulSoup fixes things like
    # <b><i>Reversed tags</b></i> (e.g. in Down Process)
    # If BS could not be imported, this step is skipped
    if BeautifulSoup:
        data = str(BeautifulSoup(data))

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def read(url, output=None, debug=0):
    "Get run information from RunSummary data-service"
    encoding = 'utf-8'
    key = None
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    if os.path.isfile(url):
        with open(url, 'r') as stream:
            context = stream.read()
        try:
            pydoc.pager(context)
        except:
            print context
        return
    elif url.find('cmsweb.cern.ch') != -1:
        data = get_data(url, decoder=None)
        html = data
        encoding = None
    elif url.find('mcdb.cern.ch') != -1:
        data = urllib.urlopen(url)
        html = data.read().replace(' _place_holder;', '')
        encoding = enc(data.headers, html)[0]
    elif url.find('cern.ch') == -1:
        data = urllib.urlopen(url)
        html = data.read()
        encoding = enc(data.headers, html)[0]
    else:
        with working_pem(PEMMGR.pem) as key:
            data = get_data_sso(url, key, cert, debug)
            html = data.read()
            encoding = enc(data.headers, html)[0]

    if encoding == 'us-ascii':
        encoding = 'utf-8'
    pager = os.environ.get('CMSSH_PAGER', None)
    if html:
        if int(os.environ.get('HTTPDEBUG', 0)):
            print_info('read data')
            print html
        if encoding:
            text = html.decode(encoding)
            res = html2text(text, '')
            if output:
                with open(output, 'w') as stream:
                    stream.write(html)
            else:
                try:
                    if pager:
                        pydoc.pager(res.encode('utf-8'))
                    else:
                        wrapwrite(html2text(text, ''))
                except:
                    wrapwrite(html2text(text, ''))
        else:
            if output:
                with open(output, 'w') as stream:
                    stream.write(html)
            else:
                try:
                    if pager:
                        pydoc.pager(html)
                    else:
                        print html
                except:
                    print html
def html2text(html, baseurl=''):
    return optwrap(html2text_file(html, None, baseurl))

if __name__ == "__main__":
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://'):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))
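# Typical invocations of the __main__ block above (the script name, URL and
# file name are placeholders): a URL is fetched and decoded using
# feedparser's guess, a local file uses the optional second argument as its
# encoding, and with no arguments stdin is read as utf-8.
#
#   python html2text.py http://example.org/page.html
#   python html2text.py page.html iso-8859-1
#   cat page.html | python html2text.py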
encoding = "utf-8"
if len(args) > 0:
    file_ = args[0]
    if len(args) == 2:
        encoding = args[1]
    if len(args) > 2:
        p.error('Too many arguments')

    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        j = urllib.urlopen(baseurl)
        data = j.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(j.headers, data)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
    else:
        data = open(file_, 'rb').read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            encoding = detect(data)['encoding']
else:
    data = sys.stdin.read()
reveal_type(data)
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int", default=GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
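# The option handling above just copies parsed flags onto an HTML2Text
# instance, so the same conversion can be driven without optparse. A minimal
# sketch, assuming HTML2Text is importable as in this module; the base URL
# and sample markup are made up.
def _demo_handle():
    h = HTML2Text(baseurl='http://example.org/')
    h.ignore_links = True      # same effect as --ignore-links
    h.body_width = 0           # same effect as -b 0 (no wrapping)
    return h.handle(u'<h1>Title</h1><p>Some <em>text</em>.</p>')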
if len(args) > 0:
    file_ = args[0]
    encoding = None
    if len(args) == 2:
        encoding = args[1]
    if len(args) > 2:
        p.error('Too many arguments')

    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        j = urllib.urlopen(baseurl)
        text = j.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
        data = text.decode(encoding)
    else:
        data = open(file_, 'rb').read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            encoding = detect(data)['encoding']
        data = data.decode(encoding)
else:
    data = sys.stdin.read()
def strippedurl(self, options, url):
    sys.path.append(os.path.dirname(__file__))
    from sentiment.testsentiment.models.html2text import _html2text
    try:
        # Python3
        import urllib.request as urllib
    except:
        import urllib

    def optwrap(text):
        """Wrap all paragraphs in the provided text."""
        #if not BODY_WIDTH:
        if 1:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        for para in text.split("\n"):
            if len(para) > 0:
                if para[0] != ' ' and para[0] != '-' and para[0] != '*':
                    for line in wrap(para, BODY_WIDTH):
                        result += line + "\n"
                    result += "\n"
                    newlines = 2
                else:
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result

    def wrapwrite(text):
        text = text.encode('utf-8')
        try:
            # Python3
            sys.stdout.buffer.write(text)
        except AttributeError:
            sys.stdout.write(text)

    def html2text_file(html, out=wrapwrite, baseurl=''):
        h = _html2text(out, baseurl)
        h.feed(html)
        h.feed("")
        return h.close()

    def html2text(html, baseurl=''):
        return optwrap(html2text_file(html, None, baseurl))

    if url.startswith('http://') or url.startswith('https://'):
        baseurl = url
        j = urllib.urlopen(baseurl)
        text = j.read()
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        data = text.decode(encoding)
    else:
        print 'Cannot open this URL %s' % url

    values = {}
    print("%29s" % 'DENOMINATORS / NORMALIZATION')
    print("%9s %9s %9s %14s %14s %35s" %
          ("X^2", "X", "SQRT", "T_METHOD", "L_METHOD", "URL"))
    for t_method in self.getTokenizers():
        print '-' * 120
        for l_method in self.getStemmers():
            sentiment = self.sentiment(html2text(data, baseurl), t_method, l_method)
            for denom, name in self.getDenoms().items():
                if not values:
                    values = {'%s [%14s:%14s] (%s)' % (baseurl, self.TOKENIZERS[t_method],
                                                       self.STEMMERS[l_method], name): sentiment[denom]}
                else:
                    values['%s [%14s:%14s] (%s)' % (baseurl, self.TOKENIZERS[t_method],
                                                    self.STEMMERS[l_method], name)] = sentiment[denom]
            print '%9f %9f %9f %14s %14s %s' % (sentiment[0], sentiment[1], sentiment[2],
                                                self.TOKENIZERS[t_method],
                                                self.STEMMERS[l_method], baseurl)
    return values
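# Hedged usage sketch for strippedurl() above: the owning class name and the
# URL are placeholders. The method prints one score table per tokenizer and
# returns a dict keyed by "<url> [<tokenizer>:<stemmer>] (<denominator name>)"
# with the corresponding sentiment score as the value.
#
#   scores = SentimentTester().strippedurl(options=None,
#                                          url='http://example.org/article.html')
#   for key in sorted(scores):
#       print('%s -> %f' % (key, scores[key]))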