def processNotes(noteStore, notebookName, notes, sg, sgUser):
    """Copy every not-yet-synced Evernote note in *notes* to Shotgun.

    For each entry the full note is fetched from *noteStore*. Notes that
    already carry the 'sgSynced' tag are skipped. Otherwise the note body is
    converted to plain text, a Shotgun 'Note' entity is created in the project
    whose name equals *notebookName*, attachments are handed to
    processResources(), and the Evernote note is tagged 'sgSynced'.

    Args:
        noteStore: Evernote note store client used to read and update notes.
        notebookName: Name of the notebook; also the Shotgun project name.
        notes: Iterable of note metadata objects exposing ``guid``.
        sg: Shotgun connection used for ``find`` and ``create``.
        sgUser: Value stored in the 'user' field of the created Shotgun note.

    Raises:
        IndexError: If no Shotgun project is named *notebookName*.
    """
    # The project does not depend on the note, so it is looked up only once,
    # and only when the first note that actually needs syncing is reached.
    sgProject = None

    for noteMetadata in notes:
        # Get the note
        note = noteStore.getNote(noteMetadata.guid, False, True, False, False)

        # Check if the note was already synced, meaning it has the 'sgSynced' tag
        if note.tagGuids:
            tagNames = [noteStore.getTag(tagGuid).name for tagGuid in note.tagGuids]
            if 'sgSynced' in tagNames:
                print('\nAlready synced {0}'.format(note.title))
                continue

        # Convert the Note body to plain text
        print('\nGetting note data for {0}'.format(noteMetadata.guid))
        contentENML = noteStore.getNoteContent(noteMetadata.guid, True, False, False, False)
        contentHTML = enml.ENMLToHTML(contentENML)
        contentTEXT = html2text.html2text(contentHTML.decode('utf-8'))
        # Drop trailing spaces and normalise line endings to the platform's
        contentTEXT = re.sub(r' *\n', os.linesep, contentTEXT)

        # Gather the required data for the Shotgun note
        if sgProject is None:
            sgProject = sg.find('Project', [['name', 'is', notebookName]])
        if not sgProject:
            # Fail before any tag processing is attempted with an empty
            # project list, and say what is actually wrong.
            raise IndexError(
                'No Shotgun project named {0!r} was found'.format(notebookName))

        noteLinks = []
        if note.tagGuids:
            print('...Processing tags')
            noteLinks = processTags(noteStore, sgProject, note.tagGuids, sg)

        # Create a Shotgun note
        print('...Creating Shotgun note')
        sgData = {
            'subject': note.title,
            'content': contentTEXT,
            'project': sgProject[0],
            'note_links': noteLinks,
            'user': sgUser,
        }
        sgNote = sg.create('Note', sgData)

        # If the note has attachments, run processResources() to attach them to the note
        if note.resources:
            print('...Processing attachments')
            processResources(notebookName, note.title, note.resources, sgNote['id'], sg)

        # Tag the note with 'sgSynced' so it doesn't get synced again
        note.tagNames = ['sgSynced']
        noteStore.updateNote(note)
def sendItem(self, item, muc, feedName):
    """Send a plain-text summary of an RSS item to a specified MUC.

    Args:
        item: Feed entry. ``item['title']`` is required;
            ``item['content'][0].value`` is used as the body when present.
        muc: Room the message is sent to.
        feedName: Name of the feed, shown on the first line of the message.
    """
    # Entries without a (non-empty) 'content' list are sent with an empty
    # body instead of raising KeyError/IndexError.
    entries = item['content'] if 'content' in item else None
    # The body is deliberately NOT xml-escaped: it is handed to html2text as
    # markup. (An escaped copy used to be computed here and then discarded.)
    content = entries[0].value if entries else ''

    text = html2text("Update from feed %s\n%s\n%s" % (
        feedName, self.bot.xmlesc(item['title']), content))
    self.bot.sendMessage(muc, text, mtype='groupchat')
def find(url):
    """Fetch the article behind *url* and save it as Markdown under ./output/.

    The URL, the article title and its score are printed along the way. When
    the extracted article is not None it is converted with html2text and
    written to "<title>.md". Any exception is printed instead of propagated.
    """
    try:
        print(url)
        article = grab.get_article(url)
        title = article["title"]
        print(title.encode("utf-8"))
        print("score: " + str(article["score"]))

        body = article["article"]
        if body is None:
            return

        out_dir = "./%s/" % "output"
        __save_file(out_dir, title + ".md", html2text(body))
    except Exception as err:
        print(err)
def sendItem(self, item, muc, feedName):
    """Send a plain-text summary of an RSS item to a specified MUC.

    Args:
        item: Feed entry. ``item['title']`` is required;
            ``item['content'][0].value`` is used as the body when present.
        muc: Room the message is sent to.
        feedName: Name of the feed, shown on the first line of the message.
    """
    # A missing 'content' key and an empty 'content' list both yield an
    # empty body (the latter used to raise IndexError).
    entries = item['content'] if 'content' in item else None
    # The body is deliberately NOT xml-escaped: it is handed to html2text as
    # markup. (An escaped copy used to be computed here and then discarded.)
    content = entries[0].value if entries else ''

    text = html2text("Update from feed %s\n%s\n%s" % (
        feedName, self.bot.xmlesc(item['title']), content))
    self.bot.sendMessage(muc, text, mtype='groupchat')
def main():
    """Download the feed, convert item descriptions to plain text, save it.

    Parses the document at ``ONI_FEED_URL``, replaces the data of the first
    child node of every ``<item>``'s ``<description>`` with its ``html2text``
    rendering, and writes the pretty-printed document to
    ``ONI_NEW_RSS_FILEPATH`` as UTF-8.
    """
    from contextlib import closing

    # download file; the response is closed as soon as it has been parsed
    with closing(urlopen(ONI_FEED_URL)) as response:
        xml = parse(response)

    # iterate through items
    for node in xml.getElementsByTagName("item"):
        descriptions = node.getElementsByTagName("description")
        if not descriptions or descriptions[0].firstChild is None:
            # no description, or an empty one: nothing to convert
            continue
        # get html of description element and set the new text in place
        payload = descriptions[0].firstChild
        payload.data = html2text(payload.data)

    # save file: serialise to UTF-8 bytes explicitly so non-ASCII text is
    # written the same way everywhere and the XML declaration states the
    # encoding that was really used
    with open(ONI_NEW_RSS_FILEPATH, "wb") as fd:
        fd.write(xml.toprettyxml(encoding="utf-8"))
def processNotes(noteStore, notebookName, notes, sg, sgUser):
    """Copy every not-yet-synced Evernote note in *notes* to Shotgun.

    For each entry the full note is fetched from *noteStore*. Notes that
    already carry the 'sgSynced' tag are skipped. Otherwise the note body is
    converted to plain text, a Shotgun 'Note' entity is created in the project
    whose name equals *notebookName*, attachments are handed to
    processResources(), and the Evernote note is tagged 'sgSynced'.

    Args:
        noteStore: Evernote note store client used to read and update notes.
        notebookName: Name of the notebook; also the Shotgun project name.
        notes: Iterable of note metadata objects exposing ``guid``.
        sg: Shotgun connection used for ``find`` and ``create``.
        sgUser: Value stored in the 'user' field of the created Shotgun note.

    Raises:
        IndexError: If no Shotgun project is named *notebookName*.
    """
    # The project does not depend on the note, so it is looked up only once,
    # and only when the first note that actually needs syncing is reached.
    sgProject = None

    for noteMetadata in notes:
        # Get the note
        note = noteStore.getNote(noteMetadata.guid, False, True, False, False)

        # Check if the note was already synced, meaning it has the 'sgSynced' tag
        if note.tagGuids:
            tagNames = [noteStore.getTag(tagGuid).name for tagGuid in note.tagGuids]
            if 'sgSynced' in tagNames:
                print('\nAlready synced {0}'.format(note.title))
                continue

        # Convert the Note body to plain text
        print('\nGetting note data for {0}'.format(noteMetadata.guid))
        contentENML = noteStore.getNoteContent(noteMetadata.guid, True, False, False, False)
        contentHTML = enml.ENMLToHTML(contentENML)
        contentTEXT = html2text.html2text(contentHTML.decode('utf-8'))
        # Drop trailing spaces and normalise line endings to the platform's
        contentTEXT = re.sub(r' *\n', os.linesep, contentTEXT)

        # Gather the required data for the Shotgun note
        if sgProject is None:
            sgProject = sg.find('Project', [['name', 'is', notebookName]])
        if not sgProject:
            # Fail before any tag processing is attempted with an empty
            # project list, and say what is actually wrong.
            raise IndexError(
                'No Shotgun project named {0!r} was found'.format(notebookName))

        noteLinks = []
        if note.tagGuids:
            print('...Processing tags')
            noteLinks = processTags(noteStore, sgProject, note.tagGuids, sg)

        # Create a Shotgun note
        print('...Creating Shotgun note')
        sgData = {
            'subject': note.title,
            'content': contentTEXT,
            'project': sgProject[0],
            'note_links': noteLinks,
            'user': sgUser,
        }
        sgNote = sg.create('Note', sgData)

        # If the note has attachments, run processResources() to attach them to the note
        if note.resources:
            print('...Processing attachments')
            processResources(notebookName, note.title, note.resources, sgNote['id'], sg)

        # Tag the note with 'sgSynced' so it doesn't get synced again
        note.tagNames = ['sgSynced']
        noteStore.updateNote(note)
def _enex_child_text(element, tag_name):
    """Return the text of the first *tag_name* descendant, or '' if absent/empty."""
    matches = element.getElementsByTagName(tag_name)
    if not matches or matches[0].firstChild is None:
        return ''
    return matches[0].firstChild.nodeValue


def _enex_file_stem(title):
    """Turn a note title into a command-line friendly file name stem."""
    # Rules about titles/filenames: NO commas, colons, slashes, spaces.
    # So just mainly periods, hyphens, underscores -- and lower case, because
    # capital letters on the command line are lame.
    stem = title.lower()
    stem = re.sub(r'[,:]', '', stem)
    # Square brackets, forward slashes (as used in Unix) and spaces all
    # become hyphens.
    return re.sub(r'[\[\]/ ]', '-', stem)


def _enex_timestamp(raw):
    """Reformat e.g. '20140206T153045Z' as '2014_02_06-15;30;45' ('' stays '')."""
    if not raw:
        return ''
    # Subday times are delimited with semicolons instead of colons, due to
    # some obscurities in a vim plugin used for interfacing with them.
    day = '_'.join([raw[0:4], raw[4:6], raw[6:8]])
    time_of_day = ';'.join([raw[9:11], raw[11:13], raw[13:15]])
    return day + '-' + time_of_day


def _enex_plain_body(body):
    """Convert a note's XML/HTML body to (mostly ASCII) Markdown text."""
    # Clean up the very ugly body unicode in XML before conversion
    body = re.sub(u'\xa0', '', body)
    body = html2text(body)
    # Then turn the typographic characters that survive html2text into ASCII,
    # and drop backslashes.
    replacements = (
        (u'\u2019', '\''),   # right single quote
        (u'\u2018', '\''),   # left single quote
        (u'\u2014', '----'),  # em dash, used in long separator lines
        (u'\u201c', '\"'),   # left double quote
        (u'\u201d', '\"'),   # right double quote
        (r'\\', ''),
    )
    for pattern, plain in replacements:
        body = re.sub(pattern, plain, body)
    return body


def main(argv):
    """Convert every note of an Evernote export into ``output/<title>.md``.

    Each generated file starts with title, created/updated time and tag
    metadata lines, followed by the note body converted with html2text.
    Notes whose output file already exists are skipped. Missing or empty
    title/created/updated/tag/content elements are written as empty values.

    Args:
        argv: Argument list; ``argv[0]`` is the path of the .enex file.
    """
    # Binary mode leaves the decoding to the XML parser, and the handle is
    # closed as soon as the document is built.
    with open(argv[0], 'rb') as enex_file:
        evernote_notes = minidom.parse(enex_file)

    for note in evernote_notes.getElementsByTagName('note'):
        file_note_name = _enex_file_stem(_enex_child_text(note, 'title'))
        print(file_note_name)
        # Drop a single leading hyphen (safe on an empty name, too).
        if file_note_name.startswith('-'):
            file_note_name = file_note_name[1:]

        note_path = 'output/' + file_note_name + '.md'
        if os.path.exists(note_path):
            continue

        created = _enex_timestamp(_enex_child_text(note, 'created'))
        updated = _enex_timestamp(_enex_child_text(note, 'updated'))

        tags = [tag.firstChild.nodeValue
                for tag in note.getElementsByTagName('tag')
                if tag.firstChild is not None]
        for tag in tags:
            print(tag)

        # NOTE: images are not handled. Raw image data sits in
        # <data encoding="base64"> elements, next to "file-name" and other
        # resource metadata; extracting and linking them is possible but a
        # lot of work, so figures are linked manually for the time being.
        body = _enex_plain_body(_enex_child_text(note, 'content'))

        header = [
            "Note Title: " + '`' + file_note_name + '`',
            "Note Created: " + '`' + created + '`',
            "Note Updated: " + '`' + updated + '`',
            "Note Tags: " + "".join("^" + tag + "^" + ", " for tag in tags),
            "Note Body:",
            "",
        ]

        # Everything is converted BEFORE the file is created, so a failing
        # conversion cannot leave a partial file that later runs would skip.
        if not os.path.isdir('output'):
            os.makedirs('output')
        with open(note_path, 'a') as f_note:
            f_note.write("\n".join(header) + "\n" + body)
def resultize(text):
    """Render HTML *text* with html2text and return the result UTF-8 encoded."""
    rendered = html2text.html2text(text)
    return rendered.encode('utf8')