def html2text_all(the_url, encoding=None):
    baseurl = ''
    file_ = the_url
    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        req_headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                       'Referer': 'http://mwetoolkit.sf.net'}
        request = urllib2.Request(baseurl, None, req_headers)
        try:
            response = urllib2.urlopen(request, timeout=30)
        except urllib2.URLError, e:
            print >> sys.stderr, "HTTP Error retrieving " + baseurl
            print e
            #sys.exit( -1 )
            return None
        text = response.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            #pdb.set_trace()
            encoding = enc(response.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
        data = text.decode(encoding, 'ignore')
def main(args1):
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=78, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=36, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # handle options
    if options.ul_style_dash:
        options.ul_item_mark = '-'
    else:
        options.ul_item_mark = '*'

    BODY_WIDTH = options.body_width
    GOOGLE_LIST_INDENT = options.list_indent

    # process input
    if len(args1) > 0:
        file_ = "t1.txt"
        encoding = None
        if len(args1) == 2:
            encoding = args1[1]
        if len(args1) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, text)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
            data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    wrapwrite(html2text(data, baseurl))
def exec_main():
    URL = ''
    arg2 = "0"
    # if a URL is passed as cmdline argument
    if sys.argv[1:]:
        arg1 = sys.argv[1]
        if sys.argv[2:]:
            arg2 = sys.argv[2]
        # if URL starts with http or https
        # use urllib to fetch HTML code
        if arg1.startswith('http://') or arg1.startswith('https://'):
            URL = arg1
            obj = urllib.urlopen(URL)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = obj.read()
            encoding = enc(obj.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            html_data = text.decode(encoding)
        else:
            # use local html file
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            html_data = open(arg1, 'r').read().decode(encoding)
    else:
        # if no arg is passed
        html_data = "Usage: python textify.py (URL|file.html)"

    if arg2 == "1":  # '==' rather than 'is': identity comparison of strings is unreliable
        writeToFile(wrapText(textify_html(html_data, None, URL)))
    else:
        writeToStdout(wrapText(textify_html(html_data, None, URL)))
def getURL(url):
    # contextlib.closing is assumed here; the original wrapped urlopen() in an
    # undefined close(). The enc() fallback mirrors the other snippets.
    from contextlib import closing
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    try:
        with closing(urlopen(url)) as j:
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
    except Exception, e:
        # only works if the body was read before the failure occurred
        data = text.decode('latin-1')
def url2text(url):
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    data = html2text_file(text.decode(encoding))
    print type(data)
def convert_http_to_text_nltk(http):
    """Convert http to text using nltk"""
    import urllib2
    req = urllib2.Request(http, headers={'User-Agent': "Magic Browser"})
    j = urllib2.urlopen(req)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    data = text.decode(encoding)
    output = nltk.clean_html(data)
    return normalize_ligatures(output.decode("utf-8"))  # return unicode
def totext(baseurl):
    j = urllib2.urlopen(baseurl)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    try:
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        data = CL.clean_html(text.decode(encoding, 'replace'))
    except:
        data = CL.clean_html("".join([x for x in text if ord(x) <= 128]))
    m = ht.html2text(data, baseurl)
    return "".join([x for x in m if ord(x) < 128])
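# Illustrative sketch, not taken from any snippet above: the fetch/decode
# pattern these helpers share, distilled into one function (Python 2 and the
# legacy feedparser._getCharacterEncoding API assumed; the helper name
# fetch_as_markdown is made up for the example).
import urllib2
import html2text

def fetch_as_markdown(url):
    response = urllib2.urlopen(url)
    raw = response.read()
    try:
        # old feedparser exposed a private charset sniffer; fall back to utf-8
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda headers, body: ('utf-8', 1)
    encoding = enc(response.headers, raw)[0]
    if encoding == 'us-ascii':  # the HTTP default that usually really means utf-8
        encoding = 'utf-8'
    return html2text.html2text(raw.decode(encoding, 'replace'), url)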
def alchemyentityextraction(self, options, text=None, url=None):
    """ """
    import urllib2, simplejson, json
    from urllib import urlencode
    if text:
        BASE_URL = 'http://access.alchemyapi.com/calls/text/TextGetRankedNamedEntities'
    if url:
        BASE_URL = 'http://access.alchemyapi.com/calls/text/HTMLGetRankedNamedEntities'

    ### CHANGE THIS TO YOUR OWN ALCHEMYAPI KEY ###
    API_KEY = 'PUT YOUR API KEY HERE'

    post_parameters = {
        'apikey': API_KEY,
        'outputMode': 'json',
        'coreference': '1',
        'disambiguate': '1',
        'sentiment': '1'
    }
    if text:
        post_parameters['text'] = text
    if url:
        baseurl = url
        j = urllib2.urlopen(baseurl)
        text = j.read()
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        post_parameters['html'] = text.decode(encoding)
        post_parameters['url'] = url

    response = simplejson.load(urllib2.urlopen(urllib2.Request(BASE_URL, data=urlencode(post_parameters))))
    #print simplejson.dumps(response, sort_keys=True, indent=3)
    return response
def doConvert(url):
    # load the page
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'cp1251'
    data = text.decode(encoding)

    # convert the html document to markdown
    originalMarkdownDocument = html2text.html2text(data, url)
    markdownDocument = originalMarkdownDocument.split("\n")

    # find the upper boundary of the article
    title = lxml.html.document_fromstring(text)
    startLine = findStartMerker(title.find(".//title").text, markdownDocument)

    # drop everything above the upper boundary
    del markdownDocument[:startLine]

    # find the lower boundary of the article
    skiplist = []
    endLine = findEndMarker(markdownDocument, skiplist)

    # remove the lines collected in skiplist
    for x in range(len(skiplist) - 1, 0, -1):
        markdownDocument.pop(skiplist[x])

    # cut the article off at the lower boundary
    if endLine != -1:
        del markdownDocument[endLine - len(skiplist) + 1:]
    else:
        return

    # rewrite references as inline links
    fragment = listToString(markdownDocument)
    fragment = replaceInternalLinks(originalMarkdownDocument, fragment)

    global htmlOut
    if htmlOut == 1:
        # convert the markdown back to html
        html = markdown.markdown(fragment)
        print html.encode('utf-8')
    else:
        print fragment.encode('utf-8')
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + __version__)
    args = p.parse_args()[1]
    if len(args) > 0:
        file_ = args[0]
        encoding = None
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, text)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
            data = text.decode(encoding)
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
            data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    wrapwrite(html2text(data, baseurl))
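# Illustrative sketch, not from any snippet above: the chardet-based decoding
# of a local HTML file that several of these main() variants perform, as a
# standalone helper. read_html_file is an assumed name for the example.
def read_html_file(path, encoding=None):
    data = open(path, 'rb').read()
    if encoding is None:
        try:
            from chardet import detect
        except ImportError:
            # without chardet installed, assume utf-8 like the snippets above
            detect = lambda raw: {'encoding': 'utf-8'}
        encoding = detect(data)['encoding'] or 'utf-8'
    return data.decode(encoding, 'replace')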
def html2text(url):
    baseurl = url
    j = urllib.urlopen(baseurl)
    text = j.read()
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    encoding_t = enc(j.headers, text)[0]
    if encoding_t == 'us-ascii':
        encoding_t = 'utf-8'
    data = text.decode(encoding_t)
    text = optwrap(html2text_file(data, None, baseurl))
    text = text.encode('utf-8')
    text = text.rsplit('\n')
    final_text = []
    for l in text:
        if not re.match(r'[#*!<]', l):
            if len(l) > 1:
                if re.match(r'\w+|\[\w+', l):
                    final_text.append(l)
    return '\n'.join(final_text)
def getText(self):
    outputEnc = "euc-kr"
    j = urllib.urlopen(self.url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    self.encoding = enc(j.headers, text)[0]
    print "========= ENCODE", self.encoding
    if self.encoding == 'us-ascii':
        self.encoding = 'utf-8'
    self.html = text.decode(self.encoding)
    print self.html.encode(outputEnc, 'ignore')

    self.html = self.html.replace("<br>", "")
    h2t = _html2text(None)
    h2t.feed(self.html)
    h2t.feed("")
    h2t.close()
    self.text = optwrap(h2t.outtext)
    self.text = h2t.outtext.encode(outputEnc, 'ignore')
    self.links = h2t.linkList
    return self.text
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    try:
        data = data.decode(encoding)
    except UnicodeDecodeError:
        pass

    try:
        data = data.decode('utf-8')
    except UnicodeDecodeError:
        # OS X percent-encodes any bytes that aren't valid utf-8
        s = ''
        g = ''
        l = 0
        for c in data:
            o = ord(c)
            if l and o < 128 or o >= 192:
                # we want a continuation byte, but didn't get one
                s += ''.join(["%%%02X" % ord(x) for x in g])
                g = ''
                l = 0
            if l == 0 and o < 128:
                # ascii
                s += c
            elif l == 0 and 194 <= o < 245:
                # valid leading bytes
                if o < 224:
                    l = 1
                elif o < 240:
                    l = 2
                else:
                    l = 3
                g = c
            elif l > 0 and 128 <= o < 192:
                # valid continuations
                g += c
                l -= 1
                if not l:
                    s += g
                    g = ''
            else:
                # invalid
                s += "%%%02X" % o
        # any remaining partial characters
        s += ''.join(["%%%02X" % ord(x) for x in g])
        data = s.decode('utf-8')

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
def main():
    baseurl = ''

    class bcolors:  # pragma: no cover
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + ".".join(map(str, __version__)))
    p.add_option("--no-wrap-links", dest="wrap_links", action="store_false",
                 default=config.WRAP_LINKS, help="wrap links during conversion")
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links", action="store_false",
                 default=config.INLINE_LINKS, help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB, help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false", dest="use_automatic_links",
                 default=config.USE_AUTOMATIC_LINKS, help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false", dest="skip_internal_links",
                 default=config.SKIP_INTERNAL_LINKS, help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true", dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]")
    p.add_option("--decode-errors", dest="decode_errors", action="store", type="string",
                 default=config.DECODE_ERRORS,
                 help="What to do in case of decode errors. 'ignore', 'strict' and 'replace' are acceptable values")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]
        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                          DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError as err:
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
            print(warning)
            raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links

    wrapwrite(h.handle(data))
# Fragment of an html2text main() that fetches via httplib2; the enclosing
# "if len(args) > 0:" and option-parsing context are not part of this snippet.
if len(args) == 2:
    encoding = args[1]
if len(args) > 2:
    p.error('Too many arguments')

if file_.startswith('http://') or file_.startswith('https://'):
    baseurl = file_
    resp, text = httplib2.Http().request(baseurl, headers={'User-Agent': 'textisbeautiful.net/1.0'})
    #text = j.read()
    if encoding is None:
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        print resp
        encoding = enc(resp, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
    data = text.decode(encoding)
else:
    data = open(file_, 'rb').read()
    if encoding is None:
        try:
            from chardet import detect
        except ImportError:
            detect = lambda x: {'encoding': 'utf-8'}
        encoding = detect(data)['encoding']
    data = data.decode(encoding)
# in the full main(), a trailing else branch reads from stdin:
#     data = sys.stdin.read()
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0 and args[0] != '-':
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break

    wrapwrite(h.handle(data))
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, data)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = "-"
    if options.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
def main():
    baseurl = ""
    p = optparse.OptionParser("%prog [(filename|url) [encoding]]",
                              version="%prog " + ".".join(map(str, __version__)))
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links", action="store_false",
                 default=config.INLINE_LINKS, help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help="protect links from line breaks surrounding them with angle brackets")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text")
    p.add_option("--images-with-size", dest="images_with_size", action="store_true",
                 default=config.IMAGES_WITH_SIZE,
                 help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax.")
    p.add_option("--single-line-break", action="store_true", dest="single_line_break",
                 default=config.SINGLE_LINE_BREAK,
                 help="Use a single line break after a block element rather than two line breaks. "
                      "NOTE: Requires --body-width=0")
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB, help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false", dest="use_automatic_links",
                 default=config.USE_AUTOMATIC_LINKS, help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false", dest="skip_internal_links",
                 default=config.SKIP_INTERNAL_LINKS, help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true", dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0 and args[0] != "-":  # pragma: no cover
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error("Too many arguments")

        if file_.startswith("http://") or file_.startswith("https://"):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ("utf-8", 1)
                encoding = enc(j.headers, data)[0]
                if encoding == "us-ascii":
                    encoding = "utf-8"
        else:
            data = open(file_, "rb").read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {"encoding": "utf-8"}
                encoding = detect(data)["encoding"]
    else:
        data = wrap_read()

    if hasattr(data, "decode"):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = "-"
    if options.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            import urllib2
            from gzip import GzipFile
            from urlparse import urlparse
            from StringIO import StringIO

            baseurl = file_
            urls = urlparse(baseurl)
            j = urllib2.urlopen(urllib2.Request(baseurl, headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0',
                'Accept-Encoding': 'gzip, deflate',
                'Referer': 'http://%s' % (urls.hostname)
            }), timeout=30)
            if (j.code == 200):
                content_encoding = j.headers.get('Content-Encoding')
                if content_encoding == 'gzip':
                    data = GzipFile(fileobj=StringIO(j.read()), mode='r').read()
                elif content_encoding == 'deflate':
                    # relies on a deflate() helper that is not defined in this snippet
                    data = StringIO(deflate(j.read())).getvalue()
                else:
                    data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding, 'ignore')

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                 default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
                 default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
                 default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
                 default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
                 default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
                 default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob", default=False,
                 help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)

    # Running the HTML through BeautifulSoup fixes things like
    # <b><i>Reversed tags</b></i> (e.g. in Down Process)
    # If BS could not be imported, this step is skipped
    if BeautifulSoup:
        data = str(BeautifulSoup(data))

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
def read(url, output=None, debug=0):
    "Get run information from RunSummary data-service"
    encoding = 'utf-8'
    key = None
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    if os.path.isfile(url):
        with open(url, 'r') as stream:
            context = stream.read()
        try:
            pydoc.pager(context)
        except:
            print context
        return
    elif url.find('cmsweb.cern.ch') != -1:
        data = get_data(url, decoder=None)
        html = data
        encoding = None
    elif url.find('mcdb.cern.ch') != -1:
        data = urllib.urlopen(url)
        html = data.read().replace(' _place_holder;', '')
        encoding = enc(data.headers, html)[0]
    elif url.find('cern.ch') == -1:
        data = urllib.urlopen(url)
        html = data.read()
        encoding = enc(data.headers, html)[0]
    else:
        with working_pem(PEMMGR.pem) as key:
            data = get_data_sso(url, key, cert, debug)
            html = data.read()
            encoding = enc(data.headers, html)[0]

    if encoding == 'us-ascii':
        encoding = 'utf-8'
    pager = os.environ.get('CMSSH_PAGER', None)
    if html:
        if int(os.environ.get('HTTPDEBUG', 0)):
            print_info('read data')
            print html
        if encoding:
            text = html.decode(encoding)
            res = html2text(text, '')
            if output:
                with open(output, 'w') as stream:
                    stream.write(html)
            else:
                try:
                    if pager:
                        pydoc.pager(res.encode('utf-8'))
                    else:
                        wrapwrite(html2text(text, ''))
                except:
                    wrapwrite(html2text(text, ''))
        else:
            if output:
                with open(output, 'w') as stream:
                    stream.write(html)
            else:
                try:
                    if pager:
                        pydoc.pager(html)
                    else:
                        print html
                except:
                    print html
def html2text(html, baseurl=''):
    return optwrap(html2text_file(html, None, baseurl))

if __name__ == "__main__":
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://'):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))
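# Typical invocations of the __main__ block above (the script name, URL and
# file name are placeholders): a URL is fetched and decoded using
# feedparser's guess, a local file uses the optional second argument as its
# encoding, and with no arguments stdin is read as utf-8.
#
#   python html2text.py http://example.org/page.html
#   python html2text.py page.html iso-8859-1
#   cat page.html | python html2text.py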
encoding = "utf-8"
if len(args) > 0:
    file_ = args[0]
    if len(args) == 2:
        encoding = args[1]
    if len(args) > 2:
        p.error('Too many arguments')

    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        j = urllib.urlopen(baseurl)
        data = j.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(j.headers, data)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
    else:
        data = open(file_, 'rb').read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            encoding = detect(data)['encoding']
else:
    data = sys.stdin.read()
reveal_type(data)
def main():
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int", default=GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough

    wrapwrite(h.handle(data))
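# The option handling above just copies parsed flags onto an HTML2Text
# instance, so the same conversion can be driven without optparse. A minimal
# sketch, assuming HTML2Text is importable as in this module; the base URL
# and sample markup are made up.
def _demo_handle():
    h = HTML2Text(baseurl='http://example.org/')
    h.ignore_links = True      # same effect as --ignore-links
    h.body_width = 0           # same effect as -b 0 (no wrapping)
    return h.handle(u'<h1>Title</h1><p>Some <em>text</em>.</p>')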
if len(args) > 0:
    file_ = args[0]
    encoding = None
    if len(args) == 2:
        encoding = args[1]
    if len(args) > 2:
        p.error('Too many arguments')

    if file_.startswith('http://') or file_.startswith('https://'):
        baseurl = file_
        j = urllib.urlopen(baseurl)
        text = j.read()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
        data = text.decode(encoding)
    else:
        data = open(file_, 'rb').read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            encoding = detect(data)['encoding']
        data = data.decode(encoding)
else:
    data = sys.stdin.read()
def strippedurl(self, options, url):
    sys.path.append(os.path.dirname(__file__))
    from sentiment.testsentiment.models.html2text import _html2text
    try:
        # Python3
        import urllib.request as urllib
    except:
        import urllib

    def optwrap(text):
        """Wrap all paragraphs in the provided text."""
        #if not BODY_WIDTH:
        if 1:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        for para in text.split("\n"):
            if len(para) > 0:
                if para[0] != ' ' and para[0] != '-' and para[0] != '*':
                    for line in wrap(para, BODY_WIDTH):
                        result += line + "\n"
                    result += "\n"
                    newlines = 2
                else:
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result

    def wrapwrite(text):
        text = text.encode('utf-8')
        try:
            # Python3
            sys.stdout.buffer.write(text)
        except AttributeError:
            sys.stdout.write(text)

    def html2text_file(html, out=wrapwrite, baseurl=''):
        h = _html2text(out, baseurl)
        h.feed(html)
        h.feed("")
        return h.close()

    def html2text(html, baseurl=''):
        return optwrap(html2text_file(html, None, baseurl))

    if url.startswith('http://') or url.startswith('https://'):
        baseurl = url
        j = urllib.urlopen(baseurl)
        text = j.read()
        try:
            from feedparser import _getCharacterEncoding as enc
        except ImportError:
            enc = lambda x, y: ('utf-8', 1)
        encoding = enc(j.headers, text)[0]
        if encoding == 'us-ascii':
            encoding = 'utf-8'
        data = text.decode(encoding)
    else:
        print 'Cannot open this URL %s' % url

    values = {}
    print("%29s" % 'DENOMINATORS / NORMALIZATION')
    print("%9s %9s %9s %14s %14s %35s" %
          ("X^2", "X", "SQRT", "T_METHOD", "L_METHOD", "URL"))
    for t_method in self.getTokenizers():
        print '-' * 120
        for l_method in self.getStemmers():
            sentiment = self.sentiment(html2text(data, baseurl), t_method, l_method)
            for denom, name in self.getDenoms().items():
                if not values:
                    values = {'%s [%14s:%14s] (%s)' % (baseurl, self.TOKENIZERS[t_method],
                                                       self.STEMMERS[l_method], name): sentiment[denom]}
                else:
                    values['%s [%14s:%14s] (%s)' % (baseurl, self.TOKENIZERS[t_method],
                                                    self.STEMMERS[l_method], name)] = sentiment[denom]
            print '%9f %9f %9f %14s %14s %s' % (sentiment[0], sentiment[1], sentiment[2],
                                                self.TOKENIZERS[t_method],
                                                self.STEMMERS[l_method], baseurl)
    return values
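# Hedged usage sketch for strippedurl() above: the owning class name and the
# URL are placeholders. The method prints one score table per tokenizer and
# returns a dict keyed by "<url> [<tokenizer>:<stemmer>] (<denominator name>)"
# with the corresponding sentiment score as the value.
#
#   scores = SentimentTester().strippedurl(options=None,
#                                          url='http://example.org/article.html')
#   for key in sorted(scores):
#       print('%s -> %f' % (key, scores[key]))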