def readFileAtPath(self, posix_path):
    """
    Reads a file at a given path.
    Tries utf-8 first and falls back to latin-1 when decoding fails.
    Converts HTML Markup to Text via html2text.

    @parameters
        posix_path      string      the concerned filepath at which the method should read
    @returns
                        string      html-free content of filepath
                        bool        FALSE if the file could not be read, decoded or converted
    """
    try:
        with open(posix_path, encoding="utf-8") as f:  # general encoding
            return html2text(f.read())
    except UnicodeDecodeError:
        # Second attempt with latin-1, which can decode any byte sequence.
        try:
            with open(posix_path, encoding="latin-1") as f:  # german language encoding
                return html2text(f.read())
        except Exception:
            # Was a bare ``except:``; narrowed so Ctrl-C / SystemExit still propagate.
            print("DECODE ERROR")
            return False
    except IOError:
        print("FILE NOT FOUND")
        return False
    except Exception as e:
        # str() is required: concatenating a str with the exception object
        # raised a TypeError from inside this handler.
        print("UNKNOWN ERROR\n" + str(e))
        return False
def getPlainText(html, links=True):
    """Return a unicode plain-text rendering of ``html``.

    Byte strings are decoded as latin1 before conversion. When html2text is
    not available (``has_html2text`` is false) an empty unicode string is
    returned. If the conversion fails, the value of ``getException()`` is
    returned instead and the error is logged.

    ``links`` is accepted but not used by this function.
    """
    if not isinstance(html, unicode):
        html = html.decode('latin1')

    if not has_html2text:
        return u''

    # html2text seems to be not-thread-safe, so concurrent conversions are
    # serialised through html2text_lock.
    html2text_lock.acquire()
    try:
        converted = html2text(html).strip()
    except:
        converted = getException ()
        mylog.exception ('Error en getPlainText')
    finally:
        html2text_lock.release()

    if isinstance(converted, unicode):
        return converted
    return converted.decode('utf-8')
def send(recipients, subject, message, sender=None, format=MARKDOWN):
    """Send an email as TEXT, MARKDOWN or HTML.

    recipients -- a single address string, or a list/tuple of addresses that
                  is joined with ", " for the To header
    subject    -- value of the Subject header
    message    -- the message body, interpreted according to ``format``
    sender     -- value of the From header; falls back to
                  ``smtpsettings.get('sender')`` when falsy
    format     -- TEXT sends a single text part; MARKDOWN and HTML send a
                  multipart/alternative message with a plain and an html part

    The assembled message is handed to ``smtp_send``.
    """
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    from conf.mailsettings import smtpsettings

    if isinstance(recipients, (list, tuple)):
        recipients = ', '.join(recipients)

    if format == TEXT:
        msg = MIMEText(message)
    else:
        msg = MIMEMultipart('alternative')
        if format == MARKDOWN:
            import markdown2
            # The markdown source doubles as the plain-text alternative.
            msg.attach(MIMEText(message, 'plain'))
            msg.attach(MIMEText(markdown2.markdown(message), 'html'))
        elif format == HTML:
            # Import the converter function, not the module: a plain
            # ``import html2text`` binds the module object, and calling it
            # raises "TypeError: 'module' object is not callable".
            from html2text import html2text
            msg.attach(MIMEText(html2text(message), 'plain'))
            msg.attach(MIMEText(message, 'html'))

    msg['Subject'] = subject
    msg['From'] = sender or smtpsettings.get('sender')
    msg['To'] = recipients
    smtp_send(msg)
def update_module_with_section(module_node, section, section_types, testbases):
    """Synchronise a module AST with the test cases described by ``section``.

    The section is serialised to HTML, converted with html2text and passed to
    ``rewrite_doc`` for the module. Test cases obtained from ``cases_dict`` are
    then merged in: class definitions already present in the module get their
    methods updated/extended, and test cases without a class get a new
    ``ast.ClassDef`` appended.

    module_node   -- module AST node; modified in place
    section       -- element accepted by ``html.tostring`` and ``cases_dict``
    section_types -- forwarded to ``cases_dict``
    testbases     -- iterable of base-class names for newly created classes

    Returns ``module_node``.
    """
    section_html = html.tostring(section).decode()
    rewrite_doc(module_node, html2text(section_html))

    testcases = cases_dict(section, section_types)

    # NOTE(review): the first three body nodes are skipped; presumably they
    # are the module docstring/imports -- confirm against the module template.
    for node in module_node.body[3:]:
        if isinstance(node, ast.ClassDef) and node.name in testcases:
            # rewrite docs for existing methods in existing case
            docs_transformer = AppendDocToMethods(testcases[node.name])
            docs_transformer.visit(node)
            # append uncreated methods to existing case
            for testmethod, doc in docs_transformer.testmethods.items():
                node.body.append(create_testmethod(testmethod, doc))
            # handled: whatever remains in testcases needs a new class
            del testcases[node.name]

    # create uncreated testcases
    for testcase, testmethods in testcases.items():
        if not testmethods:
            continue
        casedef = ast.ClassDef(
            name=testcase,
            body=[],
            # Must be a real list: AST list fields are expected to be lists,
            # a generator is single-use, is skipped by AST traversal helpers
            # and is rejected when the tree is compiled.
            bases=[ast.Name(base, ast.Load()) for base in testbases],
            decorator_list=[])
        for testmethod, doc in testmethods.items():
            casedef.body.append(create_testmethod(testmethod, doc))
        module_node.body.append(casedef)

    return module_node
def send(recipients, subject, message, sender=None, format=MARKDOWN):
    """Send an email as TEXT, MARKDOWN or HTML.

    recipients -- a single address string, or a list/tuple of addresses that
                  is joined with ", " for the To header
    subject    -- value of the Subject header
    message    -- the message body, interpreted according to ``format``
    sender     -- value of the From header; falls back to
                  ``smtpsettings.get('sender')`` when falsy
    format     -- TEXT sends a single text part; MARKDOWN and HTML send a
                  multipart/alternative message with a plain and an html part

    The assembled message is handed to ``smtp_send``.
    """
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    from conf.mailsettings import smtpsettings

    if isinstance(recipients, (list, tuple)):
        recipients = ', '.join(recipients)

    if format == TEXT:
        msg = MIMEText(message)
    else:
        msg = MIMEMultipart('alternative')
        if format == MARKDOWN:
            import markdown2
            # The markdown source doubles as the plain-text alternative.
            msg.attach(MIMEText(message, 'plain'))
            msg.attach(MIMEText(markdown2.markdown(message), 'html'))
        elif format == HTML:
            # Import the converter function, not the module: a plain
            # ``import html2text`` binds the module object, and calling it
            # raises "TypeError: 'module' object is not callable".
            from html2text import html2text
            msg.attach(MIMEText(html2text(message), 'plain'))
            msg.attach(MIMEText(message, 'html'))

    msg['Subject'] = subject
    msg['From'] = sender or smtpsettings.get('sender')
    msg['To'] = recipients
    smtp_send(msg)
def getDocxStr(docxFile):
    """
    Return the text of a docx file.

    The document is first rendered to HTML with docx2html and that HTML is
    then flattened with html2text. U+2013 (en dash) characters in the result
    are replaced with a plain "-".
    """
    # Unicode is kept throughout (apart from the en dash) so that the
    # resulting text stays well formatted.
    as_html = docx2html(docxFile)
    return html2text(as_html).replace(u"\u2013", "-")
def makeHeader(text):
    """Build an encoded mail header string from ``text``.

    Falsy input is treated as an empty string. Non-unicode input is decoded
    as latin1, the value is then encoded to utf-8, stripped of markup with
    html2text when that is available (a UnicodeError during conversion is
    ignored and the unconverted text is used), and finally wrapped with
    ``email.Header.make_header`` using the utf-8 charset.
    """
    text = text or ''
    if not isinstance(text, unicode):
        text = text.decode('latin1')
    if isinstance(text, unicode):
        text = text.encode('utf-8')

    if has_html2text:
        try:
            text = html2text(text).strip()
        except UnicodeError:
            # Best effort: fall back to the text as it was before conversion.
            pass

    header = email.Header.make_header([(text, 'utf-8')])
    return str(header)
def main():
    """Extract posts from ``a01.htm`` and dump them to ``output.txt``.

    Lines of the cp1255-encoded input that start with "MArr" or "MTbl" are
    collected and consumed in pairs: the first line of a pair is passed to
    ``extract_body`` (post content), the second to ``decode_mtbl`` (post
    metadata). Metadata and the html2text-converted content of every post are
    written to the output file. Afterwards the posts of user ``DrorKFTC`` are
    printed.

    Any exception raised while processing the pairs is printed and stops the
    processing; the output file is closed in every case.
    """
    with codecs.open('a01.htm', encoding='cp1255') as f:
        lines = [line for line in f if line.startswith(("MArr", "MTbl"))]

    posts = []
    with codecs.open("output.txt", "w", "utf") as out:
        try:
            for line_index in range(0, len(lines), 2):
                marr = lines[line_index]
                # Raises IndexError (reported by the handler below) when the
                # last line has no partner.
                mtbl = lines[line_index + 1]

                post_dict = decode_mtbl(mtbl)
                out.write(post_dict['title'] + '\n')
                # NOTE(review): this statement was corrupted in the source
                # ("u'user:'******'user']"); restored as a concatenation with
                # post_dict['user'] -- confirm against the original.
                out.write(u'user:' + post_dict['user'] + '\n')
                out.write(u'post_time: {}\n'.format(post_dict['post_time']))
                out.write(u'header_level: {}\n'.format(post_dict['header_level']))
                out.write("####################\n")

                try:
                    # Round-trip through cp1255 around the html2text call.
                    content = html2text(
                        extract_body(marr).encode(encoding='cp1255')).decode(
                            encoding='cp1255')
                    post_dict['content'] = content
                    posts.append(post_dict)
                    out.write(content + '\n')
                except Exception as e:
                    # A post whose body cannot be converted is skipped, but
                    # its metadata stays in the output file.
                    print("could not decode html: ")
                    print(e)

                out.write(
                    "-------------------------------------------------------------------------------------------------------------------------\n"
                )
        except Exception as e:
            print(e)

    for post in posts:
        if post['user'] == u'DrorKFTC':
            print(post['title'])
            print(post['code'])
            print('---------------------------')
            print(post['content'])
def procHTML(rawhtml):
    """Look for an IP address in an HTML page and record it or schedule a retry.

    The page is converted with html2text and scanned line by line. The result
    is stored for ``currentUrl``, or the URL is re-queued while it still has
    retries left. ``loadUrl()`` is called at the end.

    Relies on names defined outside this block: resultHead, resultTag, isIP,
    currentUrl, dictUrlRetry, listUrl, listResult, urlPrefix and loadUrl.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the placement of ``break``,
    ``print(mdText)``, the ``else`` branch and ``loadUrl()``.
    """
    ip = ''
    mdText = html2text(rawhtml)
    for line in mdText.split('\n'):
        line = line.strip()
        # Candidate lines start with resultHead and contain resultTag.
        if line.startswith(resultHead) and resultTag in line:
            # Take the piece following the first resultTag and drop any
            # trailing ')' characters.
            ip = line.split(resultTag)[1].rstrip(')')
            if isIP(ip):
                print(line, currentUrl)
                break
    if not isIP(ip):
        # No valid IP found: put the URL back at the front of the queue while
        # its retry counter is still positive.
        if dictUrlRetry[currentUrl] > 0:
            dictUrlRetry[currentUrl] -= 1
            listUrl.insert(0, currentUrl)
            print(mdText)
    else:
        # Store the IP together with the part of the URL after urlPrefix.
        listResult.append((ip, currentUrl.split(urlPrefix)[1]))
    loadUrl()
def find(url):
    '''Fetch the article at `url` and save it as HTML and as markdown.

    The article is obtained from grab.get_article. Its title and score are
    printed; when an article body is present it is written to
    "<title>.html" and, converted with html2text, to "<title>.md" through
    __save_file. Nothing is returned.
    '''
    print(url)
    result = grab.get_article(url)

    if "error" in result:
        print(result["error"])
        return

    print(result["title"].encode("utf-8"))
    print("score: " + str(result["score"]))

    article = result["article"]
    if article is None:
        print("no article found.")
        return

    print(type(article))
    out_dir = ("./%s/" % "output")
    title = result["title"]
    __save_file(out_dir, title + ".html", article)
    __save_file(out_dir, title + ".md", html2text(article))
def _expand_short_coi_text(coi_text):
    """Return the article paragraph that starts at ``coi_text``.

    The article file is converted with html2text and searched
    (case-insensitively, as literal text) for ``coi_text``. The result runs
    from the match up to the next blank line, with tabs and newlines replaced
    by spaces. ``coi_text`` is returned unchanged when it is not found.
    """
    # TODO: get filename from somewhere
    # NOTE(review): ``filename`` is not defined in this block; it must be
    # provided at module level for this fallback to work.
    with codecs.open(filename, 'r', encoding='utf8') as article_file:
        article_text = html2text(article_file.read())

    # The element text is data, not a pattern: escape it so characters such as
    # "(" or "?" neither break the regex nor change what is matched.
    match = re.search(re.escape(coi_text), article_text, flags=re.IGNORECASE)
    if match is None:
        return coi_text

    start_pos = match.start()
    # First blank line strictly beyond the matched text; when the article ends
    # without one, take everything up to the end (the former character loop
    # raised IndexError in that case).
    end_pos = article_text.find('\n\n', start_pos + len(coi_text) + 1)
    if end_pos == -1:
        end_pos = len(article_text)
    return article_text[start_pos:end_pos].replace('\t', ' ').replace('\n', ' ')


def parse_coi_statements(tree):
    """
    Parse conflict of interest statements from given article tree.

    ``tree`` must provide ``xpath(path)`` returning elements that provide
    ``itertext()``. Each XPath expression below is evaluated in order and the
    joined text of every matching element is yielded.

    The last expression is a loose text match ("interest" together with
    "competing", "declaring" or "conflict"). When its text is at most 36
    characters long, the surrounding paragraph is looked up in the article
    file and yielded instead.
    """
    coi_paths = (
        'conflict',
        'CoiStatement',
        './/*[@*="conflict"]',
        './/*[@*="conflict-interest"]',
        './/*[@*="COI-statement"]',
        './/*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"interest") and (contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"competing") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"declaring") or contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"conflict"))]'
    )
    # Index of the loose text-match expression (previously the literal 5).
    fallback_index = len(coi_paths) - 1

    for pi, path in enumerate(coi_paths):
        for el in tree.xpath(path):
            coi_text = '\n'.join(el.itertext())
            # NOTE(review): the yield placement for the fallback path was
            # reconstructed from whitespace-collapsed source; here the text
            # is always yielded, expanded when it is short.
            if pi == fallback_index and len(coi_text) <= 36:
                coi_text = _expand_short_coi_text(coi_text)
            yield coi_text
def processMails(self, text, att_file):
    """ Parse mail for display in XBMC

    Parses the raw message ``text``, decodes the Subject header, reads the
    Date and From headers and walks the MIME parts to collect the plain-text
    body, the HTML body (converted with html2text) and attachment names.

    NOTE(review): no return statement is visible and several locals
    (myemail, date, Sujet, realname, body, html) are never used afterwards;
    the block may be truncated. The nesting was reconstructed from a
    whitespace-collapsed source -- confirm, in particular the placement of
    the ``print`` statement.
    """
    myemail = email.message_from_string(text)
    p = EmailParser()
    msgobj = p.parsestr(text)
    if msgobj['Subject'] is not None:
        # The subject may consist of several fragments, each with its own
        # charset; fragments with a charset are re-encoded as utf8.
        decodefrag = decode_header(msgobj['Subject'])
        subj_fragments = []
        for s , enc in decodefrag:
            if enc:
                s = unicode(s , enc).encode('utf8','replace')
            subj_fragments.append(s)
        subject = ''.join(subj_fragments)
    else:
        subject = None
    if msgobj['Date'] is not None:
        date = msgobj['Date']
    else:
        date = '--'
    Sujet = subject
    # parseaddr returns a pair; element [1] is kept here.
    realname = parseaddr(msgobj.get('From'))[1]
    body = None
    html = None
    for part in msgobj.walk():
        content_disposition = part.get("Content-Disposition", None)
        prog = re.compile('attachment')
        # Find the names of the attached files
        if prog.search(str(content_disposition)):
            file_att = str(content_disposition)
            pattern = Pattern(r"\"(.+)\"")
            # NOTE(review): if att_file is a str this only rebinds the local
            # name; the caller would not see the change -- confirm.
            att_file += str(pattern.findall(file_att))
        if part.get_content_type() == "text/plain":
            if body is None:
                body = ""
            try :
                # If no charset is defined
                if (part.get_content_charset() is None):
                    body += part.get_payload(decode=True)
                else:
                    body += unicode(
                        part.get_payload(decode=True),
                        part.get_content_charset(),
                        'replace'
                    ).encode('utf8','replace')
            except Exception, e:
                body += "Erreur unicode"
            print "BODY = %s " % body
        elif part.get_content_type() == "text/html":
            if html is None:
                html = ""
            try :
                # NOTE(review): BeautifulStoneSoup is given ``html`` (the
                # accumulator), not the payload of ``part`` -- looks
                # unintended; confirm.
                unicode_coded_entities_html = unicode(BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
                html += unicode_coded_entities_html
                html = html2text(html)
            except Exception, e:
                html += "Erreur unicode html"
# https://www.youtube.com/watch?v=qfGthiqwaZo
import urllib2

from html2text import html2text  # formats HTML to markdown

# Download the page and convert it to markdown.
page = urllib2.urlopen("http://www.moviebodycounts.com/Braveheart.htm").read()

# read each line of the md
for line in html2text(page).split("\n"):
    # Print the pieces around the "[IMDb]" link text for every line that
    # mentions IMDb or carries the "Film:" label (a line matching both is
    # printed twice).
    if "IMDb" in line:
        print(line.split("[IMDb]"))
    if "Film:" in line:
        print(line.split("[IMDb]"))