Python html2text 예제들

예제 #1

0

파일 보기

파일: Parser.py 프로젝트: weidler/tyrex

	def readFileAtPath(self, posix_path):
		"""
		Read a text file and return its content with HTML markup converted to text.

		The file is decoded as UTF-8 first; if that raises a decode error the
		read is retried with Latin-1.

		@parameters
		posix_path		string	path of the file to read

		@returns		string	html-free content of the file
						bool	False if the file could not be opened, read or decoded
		"""

		try:
			with open(posix_path, encoding="utf-8") as f:  # general encoding
				return html2text(f.read())
		except UnicodeDecodeError:
			try:
				with open(posix_path, encoding="latin-1") as f:  # german language encoding
					return html2text(f.read())
			except Exception:
				# Catch Exception rather than everything, so Ctrl-C / SystemExit
				# are not swallowed by the fallback.
				print("DECODE ERROR")
				return False
		except IOError:
			print("FILE NOT FOUND")
			return False
		except Exception as e:
			# str() is required here: concatenating the exception object itself
			# to a string raises TypeError inside the handler.
			print("UNKNOWN ERROR\n" + str(e))
			return False

예제 #2

0

파일 보기

파일: newspipe.py 프로젝트: Puyb/newspipe

def getPlainText(html, links=True):
    """Convert an HTML fragment to plain text and return it as unicode.

    A non-unicode ``html`` is decoded as latin1 first. When html2text is
    available the conversion result is stripped; if the conversion raises,
    the value of ``getException()`` is used instead and the error is logged.
    Without html2text an empty unicode string is returned. A non-unicode
    result is decoded as utf-8 before being returned.

    ``links`` is accepted but not used by this function.
    """
    if not isinstance(html, unicode):
        html = html.decode('latin1')

    text = u''
    if has_html2text:
        # html2text does not appear to be thread-safe, so every conversion
        # is serialised through the shared lock.
        html2text_lock.acquire()
        try:
            text = html2text(html).strip()
        except:
            text = getException ()
            mylog.exception ('Error en getPlainText')
        finally:
            html2text_lock.release()

    return text if isinstance(text, unicode) else text.decode('utf-8')

예제 #3

0

파일 보기

파일: emailer.py 프로젝트: nagyist/chaiproject

def send(recipients, subject, message, sender=None, format=MARKDOWN):
    """Send an email as TEXT, MARKDOWN or HTML.

    Args:
        recipients: a single address string, or a list/tuple of addresses
            (joined with ", " for the To header).
        subject: value of the Subject header.
        message: the message source, interpreted according to ``format``.
        sender: value of the From header; falls back to the ``sender``
            entry of ``smtpsettings`` when not given.
        format: TEXT sends ``message`` as a single plain part. MARKDOWN
            sends a multipart/alternative message with the raw text plus
            its rendered HTML. HTML sends a multipart/alternative message
            with a html2text plain rendering plus the original HTML.

    The assembled message is handed to ``smtp_send``.
    """

    if isinstance(recipients, (list, tuple)):
        recipients = ', '.join(recipients)

    from email.mime.text import MIMEText
    from email.mime.multipart import MIMEMultipart
    from conf.mailsettings import smtpsettings

    if format == TEXT:
        msg = MIMEText(message)
    else:
        msg = MIMEMultipart('alternative')

        if format == MARKDOWN:
            import markdown2
            msg.attach(MIMEText(message, 'plain'))
            msg.attach(MIMEText(markdown2.markdown(message), 'html'))
        elif format == HTML:
            # Import the converter function itself: a bare ``import html2text``
            # binds the module, and calling a module raises TypeError.
            from html2text import html2text
            msg.attach(MIMEText(html2text(message), 'plain'))
            msg.attach(MIMEText(message, 'html'))

    msg['Subject'] = subject
    msg['From'] = sender or smtpsettings.get('sender')
    msg['To'] = recipients

    smtp_send(msg)

예제 #4

0

파일 보기

def update_module_with_section(module_node, section, section_types, testbases):
    """Bring a test module's AST in line with one documentation section.

    The section is serialised to HTML, converted to text with html2text and
    passed to ``rewrite_doc`` together with the module. Test cases derived
    from the section (``cases_dict``) are then merged into the module:
    existing classes get their method docs rewritten and missing methods
    appended; classes that do not exist yet are created and appended.

    Args:
        module_node: module AST node; it is modified in place.
        section: the section element to read test cases from.
        section_types: forwarded to ``cases_dict``.
        testbases: iterable of base-class names for newly created classes.

    Returns:
        The same ``module_node`` object.
    """
    section_html = html.tostring(section).decode()
    rewrite_doc(module_node, html2text(section_html))
    testcases = cases_dict(section, section_types)

    # NOTE(review): the first three module statements are skipped here,
    # presumably a fixed header such as docstring/imports. Confirm.
    for node in module_node.body[3:]:
        if not (isinstance(node, ast.ClassDef) and node.name in testcases):
            continue

        # Rewrite docs for methods that already exist in this case.
        docs_transformer = AppendDocToMethods(testcases[node.name])
        docs_transformer.visit(node)

        # Append the methods the transformer still reports in ``testmethods``.
        for testmethod, method_doc in docs_transformer.testmethods.items():
            node.body.append(create_testmethod(testmethod, method_doc))

        # Whatever remains in ``testcases`` afterwards has no class yet.
        del testcases[node.name]

    # Create the test cases that do not exist in the module yet.
    for testcase, testmethods in testcases.items():
        if not testmethods:
            continue

        casedef = ast.ClassDef(
            name=testcase,
            # AST list fields must be real lists: a generator is rejected by
            # compile() and would be exhausted after a single traversal.
            bases=[ast.Name(base, ast.Load()) for base in testbases],
            # ``keywords`` is a ClassDef field in Python 3; leaving it unset
            # breaks compile() on versions without field defaults.
            keywords=[],
            body=[create_testmethod(testmethod, method_doc)
                  for testmethod, method_doc in testmethods.items()],
            decorator_list=[],
        )
        module_node.body.append(casedef)

    return module_node

예제 #5

0

파일 보기

파일: emailer.py 프로젝트: jlmwebhosting/chaiproject

def send(recipients, subject, message, sender=None, format=MARKDOWN):
	"""Send an email as TEXT, MARKDOWN or HTML.

	Args:
		recipients: a single address string, or a list/tuple of addresses
			(joined with ", " for the To header).
		subject: value of the Subject header.
		message: the message source, interpreted according to ``format``.
		sender: value of the From header; falls back to the ``sender``
			entry of ``smtpsettings`` when not given.
		format: TEXT sends ``message`` as a single plain part. MARKDOWN
			sends a multipart/alternative message with the raw text plus
			its rendered HTML. HTML sends a multipart/alternative message
			with a html2text plain rendering plus the original HTML.

	The assembled message is handed to ``smtp_send``.
	"""

	if isinstance(recipients, (list, tuple)):
		recipients = ', '.join(recipients)

	from email.mime.text import MIMEText
	from email.mime.multipart import MIMEMultipart
	from conf.mailsettings import smtpsettings

	if format == TEXT:
		msg = MIMEText(message)
	else:
		msg = MIMEMultipart('alternative')

		if format == MARKDOWN:
			import markdown2
			msg.attach(MIMEText(message, 'plain'))
			msg.attach(MIMEText(markdown2.markdown(message), 'html'))
		elif format == HTML:
			# Import the converter function itself: a bare ``import html2text``
			# binds the module, and calling a module raises TypeError.
			from html2text import html2text
			msg.attach(MIMEText(html2text(message), 'plain'))
			msg.attach(MIMEText(message, 'html'))

	msg['Subject'] = subject
	msg['From'] = sender or smtpsettings.get('sender')
	msg['To'] = recipients

	smtp_send(msg)

예제 #6

0

파일 보기

파일: getFiles.py 프로젝트: danielrasmuson/Math130_Automated

def getDocxStr(docxFile):
    """Return the text of a docx file, obtained via docx2html followed by html2text.

    U+2013 characters in the result are replaced with "-"; other unicode
    characters are left as they are.
    """
    as_html = docx2html(docxFile)
    return html2text(as_html).replace(u"\u2013","-")

예제 #7

0

파일 보기

파일: newspipe.py 프로젝트: Puyb/newspipe

def makeHeader(text):
    """Build an email header string from ``text``.

    A falsy ``text`` is treated as the empty string. Non-unicode input is
    decoded as latin1, and unicode text is then encoded to utf-8. When
    html2text is available the text is run through it and stripped; a
    UnicodeError from that step is ignored and the unconverted text is kept.
    The result is wrapped with ``email.Header.make_header`` using the
    'utf-8' charset and returned as ``str``.
    """
    text = text or ''
    if not isinstance(text, unicode):
        text = text.decode('latin1')
    if isinstance(text, unicode):
        text = text.encode('utf-8')

    if has_html2text:
        try:
            text = html2text(text).strip()
        except UnicodeError:
            pass

    header = email.Header.make_header([(text, 'utf-8')])
    return str(header)

예제 #8

0

파일 보기

파일: reader.py 프로젝트: drorarm/shambhala

def main():
    """Extract posts from 'a01.htm' and dump them to 'output.txt'.

    Lines of the cp1255-encoded input that start with "MArr" or "MTbl" are
    collected and consumed in pairs: the first line of a pair is passed to
    ``extract_body`` and the second to ``decode_mtbl``. For every pair the
    decoded metadata and the html2text-converted body are written to the
    output file. Posts whose body could be converted are collected, and
    those written by user 'DrorKFTC' are printed at the end.
    """
    with codecs.open('a01.htm', encoding='cp1255') as f:
        lines = [line for line in f if line.startswith(("MArr", "MTbl"))]

    posts = []

    with codecs.open("output.txt", "w", "utf") as out:
        try:
            # Pair each even-indexed line with the line after it. A trailing
            # line without a partner is ignored instead of indexing past the
            # end of the list.
            for marr, mtbl in zip(lines[0::2], lines[1::2]):
                post_dict = decode_mtbl(mtbl)
                out.write(post_dict['title'] + '\n')
                # NOTE(review): this statement was garbled in the source
                # ("u'user:'******'user']"); reconstructed to match the
                # neighbouring fields. Confirm the exact label text.
                out.write(u'user: ' + post_dict['user'] + '\n')
                out.write(u'post_time: {}\n'.format(post_dict['post_time']))
                out.write(u'header_level: {}\n'.format(post_dict['header_level']))
                out.write("####################\n")

                try:
                    content = html2text(
                        extract_body(marr).encode(encoding='cp1255')).decode(
                            encoding='cp1255')
                    post_dict['content'] = content
                    posts.append(post_dict)
                    out.write(content + '\n')
                except Exception as e:
                    # A body that cannot be converted is reported and skipped;
                    # the separator below is still written for it.
                    print("could not decode html: ")
                    print(e)
                out.write(
                    "-------------------------------------------------------------------------------------------------------------------------\n"
                )
        except Exception as e:
            # Best effort: report the failure and keep what was written so far.
            print(e)

    for post in posts:
        if post['user'] == u'DrorKFTC':
            print(post['title'])
            # NOTE(review): assumes decode_mtbl provides a 'code' key. Confirm.
            print(post['code'])
            print('---------------------------')
            print(post['content'])

예제 #9

0

파일 보기

파일: gethosts.py 프로젝트: riverzhou/githubip

def procHTML(rawhtml):
    """Extract an IP from a fetched page and record it, or schedule a retry.

    The page is converted with html2text and scanned line by line for a
    stripped line that starts with ``resultHead`` and contains ``resultTag``;
    the text after the tag, minus trailing ')' characters, is the candidate.
    The scan stops at the first candidate accepted by ``isIP``.

    A valid IP is appended to ``listResult`` together with the part of
    ``currentUrl`` after ``urlPrefix``. Otherwise, while retries remain for
    ``currentUrl``, the counter is decremented, the URL is put back at the
    front of ``listUrl`` and the converted page is printed. ``loadUrl()`` is
    called at the end in every case.
    """
    markdown = html2text(rawhtml)
    ip = ''
    for raw_line in markdown.split('\n'):
        line = raw_line.strip()
        if not (line.startswith(resultHead) and resultTag in line):
            continue
        ip = line.split(resultTag)[1].rstrip(')')
        if isIP(ip):
            print(line, currentUrl)
            break

    if isIP(ip):
        listResult.append((ip, currentUrl.split(urlPrefix)[1]))
    elif dictUrlRetry[currentUrl] > 0:
        dictUrlRetry[currentUrl] -= 1
        listUrl.insert(0, currentUrl)
        print(markdown)
    loadUrl()

예제 #10

0

파일 보기

파일: test_grab.py 프로젝트: countrymarmot/myreadchoice

def find(url):
    '''Fetch the article at ``url``, print its details and save it to disk.

    The URL, the utf-8 encoded title and the score are printed. If the
    result contains an "error" entry, that is printed instead and the
    function stops. When an article is present it is saved under
    "./output/" twice: as "<title>.html" and, converted with html2text,
    as "<title>.md". Nothing is returned.
    '''
    print(url)
    result = grab.get_article(url)
    if "error" in result:
        print(result["error"])
        return

    print(result["title"].encode("utf-8"))
    print("score: " + str(result["score"]))

    if result["article"] is None:
        print("no article found.")
        return

    print(type(result["article"]))
    article = result["article"]
    out_dir = "./%s/" % "output"
    title = result["title"]
    __save_file(out_dir, title + ".html", article)
    __save_file(out_dir, title + ".md", html2text(article))

예제 #11

0

파일 보기

파일: pubmed_oa_parser.py 프로젝트: katblankart/pubmed_parser

def parse_coi_statements(tree, filename=None):
    """
    Parse conflict of interest statements from given article tree.

    Every element matched by one of the dedicated COI paths yields its text
    (text fragments joined with newlines). The last path is a fuzzy text
    match and is handled specially:

    * text longer than 36 characters is yielded as is;
    * shorter text is treated as a bare heading. It is looked up (case
      insensitively, as a literal string) in the html2text rendering of the
      article file, and the text from the heading up to the next blank line
      is yielded with tabs and newlines replaced by spaces. If no
      ``filename`` is given or the heading is not found, nothing is yielded
      for that element.

    Args:
        tree: parsed article; must provide ``xpath(path)`` returning elements
            that provide ``itertext()``.
        filename: optional path of the utf8-encoded article file, needed only
            to expand bare headings.

    Yields:
        str: one conflict of interest statement per matching element.
    """
    coi_paths = (
        'conflict',
        'CoiStatement',
        './/*[@*="conflict"]',
        './/*[@*="conflict-interest"]',
        './/*[@*="COI-statement"]',
        './/*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"interest") and (contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"competing") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"declaring") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"conflict"))]'
    )
    fuzzy_index = len(coi_paths) - 1
    # Converted article text; loaded on first use and reused afterwards.
    article_text = None

    for pi, path in enumerate(coi_paths):
        for el in tree.xpath(path):
            coi_text = '\n'.join(el.itertext())
            if pi != fuzzy_index or len(coi_text) > 36:
                yield coi_text
                continue

            # Bare heading: the statement itself has to come from the article.
            if filename is None:
                continue
            if article_text is None:
                with codecs.open(filename, 'r', encoding='utf8') as article:
                    article_text = html2text(article.read())

            # The heading is literal text, not a pattern, so it is escaped.
            match = re.search(re.escape(coi_text), article_text, flags=re.IGNORECASE)
            if match is None:
                continue

            # Take everything up to the first blank line that starts after
            # the heading, or to the end of the article if there is none.
            start_pos = match.start()
            end_pos = article_text.find('\n\n', start_pos + len(coi_text) + 1)
            if end_pos == -1:
                end_pos = len(article_text)
            full_coi_text = article_text[start_pos:end_pos]
            yield full_coi_text.replace('\t', ' ').replace('\n', ' ')

예제 #12

0

파일 보기

파일: courrier.py 프로젝트: senufo/xbmc-courrier

    def processMails(self, text, att_file):
        """
        Parse mail for display in XBMC

        text is the raw message source. The Subject header is decoded
        fragment by fragment and re-encoded as utf8, the Date header is read
        (falling back to '--'), and every MIME part is walked to collect the
        text/plain content into ``body`` and the text/html content into
        ``html``.

        NOTE(review): no return statement or use of the collected values is
        visible in this excerpt; the method may continue beyond it.
        """
        # NOTE(review): myemail is not used anywhere in the visible code.
        myemail = email.message_from_string(text)
        p = EmailParser()
        msgobj = p.parsestr(text)
        if msgobj['Subject'] is not None:
            # decode_header yields (fragment, charset) pairs; fragments with a
            # charset are converted to utf8 byte strings, the rest kept as is.
            decodefrag = decode_header(msgobj['Subject'])
            subj_fragments = []
            for s , enc in decodefrag:
                if enc:
                    s = unicode(s , enc).encode('utf8','replace')
                subj_fragments.append(s)
            subject = ''.join(subj_fragments)
        else:
            subject = None
        if msgobj['Date'] is not None:
            date = msgobj['Date']
        else:
            date = '--'
        Sujet = subject
        # NOTE(review): if parseaddr is email.utils.parseaddr, index 1 is the
        # address part rather than the display name - confirm the intent.
        realname = parseaddr(msgobj.get('From'))[1]

        # body / html stay None until a part of the matching type is seen.
        body = None
        html = None
        for part in msgobj.walk():
            content_disposition = part.get("Content-Disposition", None)
            prog = re.compile('attachment')
            #Find the names of the attached files
            if prog.search(str(content_disposition)):
                file_att = str(content_disposition)

                # Collects the double-quoted portions of the header value.
                pattern = Pattern(r"\"(.+)\"")
                # NOTE(review): += on a str parameter only rebinds the local
                # name; the caller sees the change only if att_file is
                # returned or is a mutable object - confirm.
                att_file +=  str(pattern.findall(file_att))

            if part.get_content_type() == "text/plain":
                if body is None:
                    body = ""
                try :
                    #If no charset is defined
                    if (part.get_content_charset() is None):
                        body +=  part.get_payload(decode=True)
                    else:
                        # Decode with the declared charset, then store as utf8.
                        body += unicode(
                           part.get_payload(decode=True),
                           part.get_content_charset(),
                           'replace'
                           ).encode('utf8','replace')
                except Exception, e:
                    body += "Erreur unicode"
                    print "BODY = %s " % body
            elif part.get_content_type() == "text/html":
                if html is None:
                    html = ""
                try :
                    # NOTE(review): this parses the accumulator ``html`` (empty
                    # for the first html part), not the payload of ``part`` -
                    # looks unintended; confirm.
                    unicode_coded_entities_html = unicode(BeautifulStoneSoup(html,
                            convertEntities=BeautifulStoneSoup.HTML_ENTITIES))

                    html += unicode_coded_entities_html
                    html = html2text(html)
                except Exception, e:
                    html += "Erreur unicode html"

예제 #13

0

파일 보기

파일: scrapper.py 프로젝트: NatuMyers/pythonWebScrape

# https://www.youtube.com/watch?v=qfGthiqwaZo

import urllib2
from html2text import html2text  # formats HTML to markdown

# Fetch the page once and convert it to markdown.
page_markdown = html2text(urllib2.urlopen("http://www.moviebodycounts.com/Braveheart.htm").read())

# read each line of the md
for line in page_markdown.split("\n"):
    if "IMDb" in line:
        print(line.split("[IMDb]"))
    if "Film:" in line:
        print(line.split("[IMDb]"))