Ejemplos de html2text en Python, ejemplos de html2text.html2text.html2text en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: evernote-shotgun.py Proyecto: russelling/evernote-shotgun

def processNotes(noteStore, notebookName, notes, sg, sgUser):
    """Copy every not-yet-synced Evernote note in `notes` into Shotgun.

    Each note is fetched from `noteStore`; notes that already carry the
    'sgSynced' tag are skipped.  For the others the body is converted
    ENML -> HTML -> plain text, a Shotgun 'Note' entity is created in the
    project named `notebookName`, attachments are handed to
    processResources(), and the Evernote note is updated with the
    'sgSynced' tag name so it is skipped on the next run.

    Args:
        noteStore: Evernote note-store client; getNote, getTag,
            getNoteContent and updateNote are called on it.
        notebookName: Notebook name, also used as the Shotgun project name.
        notes: Iterable of objects exposing a `guid` attribute.
        sg: Shotgun connection providing find() and create().
        sgUser: Value stored in the 'user' field of the created note.

    Raises:
        IndexError: If sg.find() returns an empty result for the project.
    """
    for noteMetadata in notes:

        # Get the note
        # NOTE(review): the positional flags presumably select content /
        # resource data in the Evernote API -- confirm against getNote().
        note = noteStore.getNote(noteMetadata.guid, False, True, False, False)

        # Check if the note was already synced, meaning it has the 'sgSynced'
        # tag. any() stops at the first match, so no further tags are fetched.
        if note.tagGuids and any(
                noteStore.getTag(tagGuid).name == 'sgSynced'
                for tagGuid in note.tagGuids):
            print('\nAlready synced {0}'.format(note.title))
            continue

        # Convert the Note body to plain text
        print('\nGetting note data for {0}'.format(noteMetadata.guid))
        contentENML = noteStore.getNoteContent(noteMetadata.guid, True, False,
                                               False, False)
        contentHTML = enml.ENMLToHTML(contentENML)
        contentTEXT = html2text.html2text(contentHTML.decode('utf-8'))
        # Drop trailing spaces and use the platform's line separator
        contentTEXT = re.sub(r' *\n', os.linesep, contentTEXT)

        # Gather the required data for the Shotgun note
        sgProject = sg.find('Project', [['name', 'is', notebookName]])
        noteLinks = []
        if note.tagGuids:
            print('...Processing tags')
            noteLinks = processTags(noteStore, sgProject, note.tagGuids, sg)

        # Create a Shotgun note
        print('...Creating Shotgun note')
        sgData = {
            'subject': note.title,
            'content': contentTEXT,
            'project': sgProject[0],
            'note_links': noteLinks,
            'user': sgUser
        }
        sgNote = sg.create('Note', sgData)

        # If the note has attachments, run processResources() to attach them
        # to the Shotgun note
        if note.resources:
            print('...Processing attachments')
            processResources(notebookName, note.title, note.resources,
                             sgNote['id'], sg)

        # Tag the note with 'sgSynced' so it doesn't get synced again
        note.tagNames = ['sgSynced']
        noteStore.updateNote(note)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: rssbot.py Proyecto: stpeter/memberbot

 def sendItem(self, item, muc, feedName):
     """Send a plain-text summary of an RSS item to a specified muc.

     The message is built from the feed name, the XML-escaped item title
     and the item's first content entry, converted with html2text and sent
     as a 'groupchat' message through self.bot.sendMessage().

     Args:
         item: Feed entry; item['title'] and item['content'][0].value are read.
         muc: Destination handed to self.bot.sendMessage().
         feedName: Feed name shown on the first line of the message.
     """
     # Entries without any content yield an empty body instead of raising
     # KeyError/IndexError.
     if 'content' in item and item['content']:
         # The raw value is used on purpose: the previously computed
         # xmlesc() result was overwritten by it before ever being read.
         content = item['content'][0].value
     else:
         content = ''
     text = html2text("Update from feed %s\n%s\n%s" % (feedName, self.bot.xmlesc(item['title']), content))
     self.bot.sendMessage(muc, text, mtype='groupchat')

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_finds.py Proyecto: countrymarmot/myreadchoice

def find(url):
    '''Fetch the article at `url` and save its body as a Markdown file.

    The URL, the article title and its score are printed.  When an article
    body was extracted, it is converted with html2text and written through
    __save_file() to "./output/<title>.md".  Nothing is returned.

    Any exception raised along the way is printed rather than propagated,
    so one bad URL does not stop the caller.
    '''
    try:
        print(url)
        result = grab.get_article(url)
        # NOTE(review): presumably encoded so non-ASCII titles print on a
        # Python 2 console -- confirm before porting to Python 3.
        print(result["title"].encode("utf-8"))
        print("score: " + str(result["score"]))
        if result["article"] is not None:
            html = result["article"]
            path = "./%s/" % "output"
            name = result["title"]
            __save_file(path, name + ".md", html2text(html))
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(e)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: rssbot.py Proyecto: ebagnaschi/SleekBot

 def sendItem(self, item, muc, feedName):
     """Send a plain-text summary of an RSS item to a specified muc.

     The message is built from the feed name, the XML-escaped item title
     and the item's first content entry (empty when the item has none),
     converted with html2text and sent as a 'groupchat' message through
     self.bot.sendMessage().

     Args:
         item: Feed entry; item['title'] and item['content'][0].value are read.
         muc: Destination handed to self.bot.sendMessage().
         feedName: Feed name shown on the first line of the message.
     """
     # Also covers a present-but-empty content list, which would otherwise
     # raise IndexError.
     if 'content' in item and item['content']:
         # The raw value is used on purpose: the previously computed
         # xmlesc() result was overwritten by it before ever being read.
         content = item['content'][0].value
     else:
         content = ''
     text = html2text("Update from feed %s\n%s\n%s" % (feedName, self.bot.xmlesc(item['title']), content))
     self.bot.sendMessage(muc, text, mtype='groupchat')

Ejemplo n.º 5

0

Mostrar archivo

Archivo: Oni-RssAsText.py Proyecto: leoss/Oni-RssAsText

def main():
    """Rewrite the feed at ONI_FEED_URL with plain-text item descriptions.

    The feed is downloaded and parsed, the first child node of every
    <item>'s first <description> is replaced by its html2text() rendering,
    and the pretty-printed document is written to ONI_NEW_RSS_FILEPATH as
    UTF-8.
    """
    # download file
    xml = parse(urlopen(ONI_FEED_URL))

    # iterate through items
    for node in xml.getElementsByTagName("item"):
        descriptions = node.getElementsByTagName("description")
        # An item without a description, or with an empty one, has nothing
        # to convert.
        if not descriptions or descriptions[0].firstChild is None:
            continue
        description_text = descriptions[0].firstChild
        # replace the html of the description element with its text form
        description_text.data = html2text(description_text.data)

    # save file
    # Serialising to bytes with an explicit encoding keeps non-ASCII
    # characters from failing (or depending on the locale) at write time.
    with open(ONI_NEW_RSS_FILEPATH, "wb") as fd:
        fd.write(xml.toprettyxml(encoding="utf-8"))

Ejemplo n.º 6

0

Mostrar archivo

Archivo: evernote-shotgun.py Proyecto: benhadden/evernote-shotgun

def processNotes(noteStore, notebookName, notes, sg, sgUser):
    """Create a Shotgun note for each Evernote note not yet synced.

    A note counts as synced when one of its tags is named 'sgSynced'; such
    notes are skipped.  Otherwise the note body is converted from ENML to
    HTML to plain text, a 'Note' entity is created in the Shotgun project
    whose name is `notebookName`, any resources are passed on to
    processResources(), and the Evernote note is updated with the
    'sgSynced' tag name.

    Args:
        noteStore: Evernote note-store client; getNote, getTag,
            getNoteContent and updateNote are called on it.
        notebookName: Notebook name, also used as the Shotgun project name.
        notes: Iterable of objects exposing a `guid` attribute.
        sg: Shotgun connection providing find() and create().
        sgUser: Value stored in the 'user' field of the created note.

    Raises:
        IndexError: If sg.find() returns an empty result for the project.
    """
    for noteMetadata in notes:

        #Get the note
        note = noteStore.getNote(noteMetadata.guid, False, True, False, False)

        #Check if the note was already synced, meaning it has the 'sgSynced' tag.
        #any() stops fetching tags as soon as the marker tag is found.
        alreadySynced = bool(note.tagGuids) and any(
            noteStore.getTag(tagGuid).name == 'sgSynced'
            for tagGuid in note.tagGuids)
        if alreadySynced:
            print('\nAlready synced {0}'.format(note.title))
            continue

        #Convert the Note body to plain text
        print('\nGetting note data for {0}'.format(noteMetadata.guid))
        contentENML = noteStore.getNoteContent(noteMetadata.guid, True, False, False, False)
        contentHTML = enml.ENMLToHTML(contentENML)
        contentTEXT = html2text.html2text(contentHTML.decode('utf-8'))
        #Drop trailing spaces and use the platform's line separator
        contentTEXT = re.sub(r' *\n', os.linesep, contentTEXT)

        #Gather the required data for the Shotgun note
        sgProject = sg.find('Project', [['name', 'is', notebookName]])
        noteLinks = []
        if note.tagGuids:
            print('...Processing tags')
            noteLinks = processTags(noteStore, sgProject, note.tagGuids, sg)

        #Create a Shotgun note
        print('...Creating Shotgun note')
        sgData = {
            'subject': note.title,
            'content': contentTEXT,
            'project': sgProject[0],
            'note_links': noteLinks,
            'user': sgUser,
        }
        sgNote = sg.create('Note', sgData)

        #If the note has attachments, run processResources() to attach them to the note
        if note.resources:
            print('...Processing attachments')
            processResources(notebookName, note.title, note.resources, sgNote['id'], sg)

        #Tag the note with 'sgSynced' so it doesn't get synced again
        note.tagNames = ['sgSynced']
        noteStore.updateNote(note)

Ejemplo n.º 7

0

Mostrar archivo

def _first_text(node, tag_name):
    """Return the value of the first child of the first `tag_name` element."""
    return node.getElementsByTagName(tag_name)[0].firstChild.nodeValue


def _sanitize_note_name(title):
    """Turn a note title into a lower-case, command-line friendly name.

    Commas and colons are removed; square brackets, slashes and spaces
    become hyphens.
    """
    name = title.lower()
    name = re.sub(r'[,:]', '', name)
    return re.sub(r'[\[\]/ ]', '-', name)


def _format_enex_timestamp(stamp):
    """Reformat a timestamp by fixed positions as YYYY_MM_DD-HH;MM;SS.

    Assumes the layout YYYYMMDD?HHMMSS (the character at index 8 is
    skipped).  Semicolons rather than colons separate the time fields
    because of a quirk in a vim plugin used to read these files.
    """
    date_part = '_'.join((stamp[0:4], stamp[4:6], stamp[6:8]))
    time_part = ';'.join((stamp[9:11], stamp[11:13], stamp[13:15]))
    return date_part + '-' + time_part


def _to_ascii_punctuation(text):
    """Replace curly quotes and em dashes with ASCII and drop backslashes."""
    replacements = (
        (u'\u2019', '\''),
        (u'\u2018', '\''),
        (u'\u2014', '----'),
        (u'\u201c', '\"'),
        (u'\u201d', '\"'),
        ('\\', ''),
    )
    for fancy, plain in replacements:
        text = text.replace(fancy, plain)
    return text


def main(argv):
    """Export every note of an Evernote .enex file to 'output/<name>.md'.

    Each file starts with title, created, updated and tag metadata lines,
    followed by the note body converted with html2text.  Notes whose
    output file already exists are left untouched.  The 'output' directory
    is not created here.

    Embedded images are not exported: the raw base64 <data> and its
    surrounding resource metadata are ignored, so figures have to be
    linked manually.

    Args:
        argv: Sequence whose first element is the path of the .enex file.
    """
    # Handing minidom the path lets it open and close the file itself.
    evernote_notes = minidom.parse(argv[0])

    for note in evernote_notes.getElementsByTagName('note'):
        # Rules about titles/filenames: NO commas, colons, slashes, spaces.
        # So just mainly periods, hyphens, underscores.
        file_note_name = _sanitize_note_name(_first_text(note, 'title'))
        print(file_note_name)

        # startswith() also copes with a name that ended up empty.
        if file_note_name.startswith('-'):
            file_note_name = file_note_name[1:]

        note_path = 'output/' + file_note_name + '.md'
        if os.path.exists(note_path):
            continue

        # Gather everything before opening the file so a malformed note
        # does not leave a half-written export behind.
        created = _format_enex_timestamp(_first_text(note, 'created'))
        updated = _format_enex_timestamp(_first_text(note, 'updated'))
        tag_names = [tag.firstChild.nodeValue
                     for tag in note.getElementsByTagName('tag')]

        # Clean up non-breaking spaces before conversion, then reduce the
        # remaining typographic punctuation to plain ASCII.
        body = _first_text(note, 'content')
        body = re.sub(u'\xa0', '', body)
        body = _to_ascii_punctuation(html2text(body))

        with open(note_path, 'a') as f_note:
            # Title metadata
            f_note.write("Note Title: " + '`' + file_note_name + '`')

            # Created time metadata
            f_note.write("\n")
            f_note.write("Note Created: " + '`' + created + '`')

            # Updated time metadata
            f_note.write("\n")
            f_note.write("Note Updated: " + '`' + updated + '`')

            # Tags metadata
            f_note.write("\n")
            f_note.write("Note Tags: ")
            for tag_name in tag_names:
                print(tag_name)
                f_note.write("^" + tag_name + "^" + ", ")

            # Actual Body content
            f_note.write("\n")
            f_note.write("Note Body:")
            f_note.write("\n")
            f_note.write("\n")
            f_note.write(body)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: gui.py Proyecto: staticerror/aass

 def resultize(text):
     """Convert `text` with html2text and return the result UTF-8 encoded."""
     converted = html2text.html2text(text)
     return converted.encode('utf8')