def list(self, event, number, full, plurality, name, start):
    """List the latest entries in the named feed.

    number: how many entries to show (defaults to 1 for the singular form,
        10 for the plural form).
    full: when the word 'article' was used, include link and summary.
    start: 0-based offset into the feed's entries.
    """
    full = full == 'article'
    if number:
        number = int(number)
    elif not plurality:
        number = 1
    else:
        number = 10
    start = start and int(start) or 0

    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't find any articles in that feed")
        return

    articles = feed.entries[start:number + start]
    entries = []
    # Number entries by absolute position in the feed (start + offset + 1) so
    # the numbers always match what the 'article <n>' lookup expects, even
    # with a start offset, and so duplicate entries aren't mis-numbered the
    # way list.index() would.
    for offset, article in enumerate(articles):
        position = start + offset + 1
        if full:
            # Prefer the feed-supplied summary; otherwise fall back to the
            # first content part, converting HTML variants to plain text.
            if 'summary' in article:
                summary = html2text_file(article.summary, None)
            else:
                if article.content[0].type in \
                        ('application/xhtml+xml', 'text/html'):
                    summary = html2text_file(article.content[0].value, None)
                else:
                    summary = article.content[0].value
            entries.append(
                u'%(number)s: "%(title)s"%(link)s : %(summary)s' % {
                    'number': position,
                    'title': html2text_file(article.title, None).strip(),
                    'link': get_link(article),
                    'summary': summary,
                })
        else:
            entries.append(u'%s: "%s"' % (position,
                html2text_file(article.title, None).strip()))
    event.addresponse(u', '.join(entries))
def html2fmt(html, target_format):
    """Render *html* for *target_format*.

    HTML output passes straight through; any other format receives the
    plain-text rendering from html2text_file.
    """
    if target_format == 'html':
        return html
    return html2text_file(html, None)
def article(self, event, number, pattern, name):
    """Show a single entry from the named feed, selected either by its
    1-based number or by a case-insensitive title pattern."""
    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't access that feed")
        return

    article = None
    if number:
        position = int(number)
        if position < 1 or position > len(feed.entries):
            event.addresponse(u"That's old news dude")
            return
        article = feed.entries[position - 1]
    else:
        title_re = re.compile(pattern, re.I)
        for candidate in feed.entries:
            if title_re.search(candidate.title):
                article = candidate
                break

    if not article:
        event.addresponse(u'Are you making up news again?')
        return

    # Prefer the feed-supplied summary; otherwise fall back to the first
    # content part, converting HTML variants to plain text.
    if 'summary' in article:
        summary = html2text_file(article.summary, None)
    else:
        body = article.content[0]
        if body.type in ('application/xhtml+xml', 'text/html'):
            summary = html2text_file(body.value, None)
        else:
            summary = body.value

    event.addresponse(u'"%(title)s" %(link)s : %(summary)s', {
        'title': html2text_file(article.title, None).strip(),
        'link': article.link,
        'summary': summary,
    })
def html2fmt(html, target_format):
    """Return *html* unchanged when the target is 'html'; otherwise convert
    it to plain text via html2text_file."""
    return html if target_format == 'html' else html2text_file(html, None)
def html2fmt(html, target_format):
    """Format *html* for output: anything other than 'html' is rendered to
    plain text with html2text_file; HTML is passed through untouched."""
    if target_format != 'html':
        return html2text_file(html, None)
    return html
def article(self, event, number, pattern, name):
    """Show one entry from the named feed, chosen by 1-based number or by a
    case-insensitive regex match against entry titles."""
    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't find any articles in that feed")
        return

    article = None
    if number:
        position = int(number)
        if position < 1 or position > len(feed.entries):
            event.addresponse(u"That's old news dude")
            return
        article = feed.entries[position - 1]
    else:
        title_re = re.compile(pattern, re.I)
        for candidate in feed.entries:
            if title_re.search(candidate.title):
                article = candidate
                break

    if not article:
        event.addresponse(u'Are you making up news again?')
        return

    # Use the summary when the feed provides one, else the first content
    # part (converted from HTML to text where needed).
    if 'summary' in article:
        summary = html2text_file(article.summary, None)
    else:
        body = article.content[0]
        if body.type in ('application/xhtml+xml', 'text/html'):
            summary = html2text_file(body.value, None)
        else:
            summary = body.value

    event.addresponse(u'"%(title)s"%(link)s : %(summary)s', {
        'title': html2text_file(article.title, None).strip(),
        'link': get_link(article),
        'summary': summary,
    })
def list(self, event, number, full, plurality, name, start):
    """List the latest entries in the named feed.

    number: how many entries to show (defaults to 1 for the singular form,
        10 for the plural form).
    full: when the word 'article' was used, include link and summary.
    start: 0-based offset into the feed's entries.
    """
    full = full == 'article'
    if number:
        number = int(number)
    elif not plurality:
        number = 1
    else:
        number = 10
    start = start and int(start) or 0

    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't find any articles in that feed")
        return

    articles = feed.entries[start:number + start]
    entries = []
    # Number entries by absolute position in the feed (start + offset + 1) so
    # the numbers always match what the 'article <n>' lookup expects, even
    # with a start offset, and so duplicate entries aren't mis-numbered the
    # way list.index() would.
    for offset, article in enumerate(articles):
        position = start + offset + 1
        if full:
            # Prefer the feed-supplied summary; otherwise fall back to the
            # first content part, converting HTML variants to plain text.
            if 'summary' in article:
                summary = html2text_file(article.summary, None)
            else:
                if article.content[0].type in \
                        ('application/xhtml+xml', 'text/html'):
                    summary = html2text_file(article.content[0].value, None)
                else:
                    summary = article.content[0].value
            entries.append(u'%(number)s: "%(title)s"%(link)s : %(summary)s' % {
                'number': position,
                'title': html2text_file(article.title, None).strip(),
                'link': get_link(article),
                'summary': summary,
            })
        else:
            entries.append(u'%s: "%s"' % (position,
                html2text_file(article.title, None).strip()))
    event.addresponse(u', '.join(entries))
def list(self, event, number, name, start):
    """List the titles of the latest entries in the named feed.

    number: how many entries to show (default 10).
    start: 0-based offset into the feed's entries.
    """
    number = number and int(number) or 10
    start = start and int(start) or 0

    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't find any articles in that feed")
        return

    articles = feed.entries[start:number + start]
    # Number entries by absolute position (start + offset + 1) rather than
    # feed.entries.index(entry): index() is an O(n) scan per entry and
    # returns the first match, which mis-numbers duplicate entries.
    articles = [u'%s: "%s"' % (start + offset + 1,
                               html2text_file(entry.title, None).strip())
                for offset, entry in enumerate(articles)]
    event.addresponse(u', '.join(articles))
def do_VIEW_DETAILS_OR_NOT(self,command_name,details,*args) :
    """Print each selected feed item: a numbered banner, the item's title
    and its body; when `details` is truthy, also print the item's link and
    its categories (each wrapped in horizontal rules).
    """
    for position in self.get_selection_list(args) :
        self._print('\n')
        # Banner line, e.g. "=== [   1 ] ====...=" — positions shown 1-based.
        self._print('='*3+' [ '+('%3d'%(position+1,))+' ] '+'='*58+'\n')
        self._print(self.format_title(self._rss_reader.get_title(position))+'\n')
        self._print('-'*65+'\n')
        if details :
            link = self._rss_reader.get_link(position)
            if link and link != '' :
                self._print(link+'\n')
                self._print('-'*65+'\n')
        # Item body, converted from HTML to plain text.
        self._print(html2text_file(self._rss_reader.get_content(position),None))
        if details :
            categories = self._rss_reader.get_categories(position)
            if categories is not None :
                # Separator rules are emitted only when at least one
                # category exists (checked before and after the list).
                if len(categories)>0 :
                    self._print('-'*65+'\n')
                for categorie in categories :
                    self._print(' %s\n' % categorie)
                if len(categories)>0 :
                    self._print('-'*65+'\n')
def list(self, event, number, name, start):
    """List the titles of the latest entries in the named feed.

    number: how many entries to show (default 10).
    start: 0-based offset into the feed's entries.
    """
    number = number and int(number) or 10
    start = start and int(start) or 0

    feed = event.session.query(Feed).filter_by(name=name).first()
    if not feed:
        event.addresponse(u"I don't know about the %s feed", name)
        return

    feed.update()
    if not feed.entries:
        event.addresponse(u"I can't find any articles in that feed")
        return

    articles = feed.entries[start:number + start]
    # Number entries by absolute position (start + offset + 1) rather than
    # feed.entries.index(entry): index() is an O(n) scan per entry and
    # returns the first match, which mis-numbers duplicate entries.
    articles = [
        u'%s: "%s"' % (start + offset + 1,
                       html2text_file(entry.title, None).strip())
        for offset, entry in enumerate(articles)
    ]
    event.addresponse(u', '.join(articles))
def html2text(html, baseurl=''):
    """Convert *html* to plain text (resolving links against *baseurl*) and
    wrap the result with optwrap."""
    plain = html2text_file(html, None, baseurl)
    return optwrap(plain)
def handleMsg(mailbox, msg, is_subpart=False, strdate=""):
    """ This function handles a message object recursively, it has
        several tasks:
        - save all of the attachments in the message
        - extract all of the text information into the message body
        - if the email contains html messages they will be converted
          into text and added to the message body
        - extract all of the field information (To, Cc, From, ...)
          from the message objects
    """
    # Accumulated state is shared across the recursive calls via module
    # globals; the top-level (non-subpart) call resets them below.
    global text
    global attachments
    global fieldFrom, fieldSubject, fieldTime
    # Message/RFC822 parts are bundled this way ==============
    # Unwrap nested message/rfc822 payloads until we reach a real part.
    while isinstance(msg.get_payload(),email.Message.Message):
        msg=msg.get_payload()
    if not is_subpart:
        fieldFrom = ""
        fieldSubject = ""
        fieldTime = None  # fieldTime is a 9-item tuple
        text = ""         # the text contents of a message
        attachments = ""  # newline-separated list of saved attachment paths
    ## Set the "From" Field ==================================
    # Only the first part carrying a From header populates the address
    # fields; To/Cc/Bcc lines are prepended to the text body as well.
    if fieldFrom == "" and msg['From'] != None:
        text += "To: %s\n" % decode_field(msg['To'])
        if msg['Cc'] != None:
            text += "Cc: %s\n" % decode_field(msg['Cc'])
        if msg['Bcc'] != None:
            text += "Bcc: %s\n" % decode_field(msg['Bcc'])
        text += "From: %s\n" % decode_field(msg['From'])
        fieldFrom = decode_field(msg['From'])
    ## Set the "Subject" Field ===============================
    if fieldSubject == "" and msg['Subject'] != None:
        fieldSubject = decode_field(msg['Subject'])
        text += "Subject: %s\n" % fieldSubject
    ## Set the "Date" Field ==================================
    # strdate (YYYYMMDDHHMM) is passed down to subparts so attachment file
    # names stay tied to the message date.
    if fieldTime == None and msg['Date'] != None:
        fieldTime = string2time(msg['Date'])
        strdate = time.strftime("%Y%m%d%H%M", fieldTime)
    ## Handle multipart messages recursively =================
    if msg.is_multipart():
        for submsg in msg.get_payload():
            handleMsg(mailbox, submsg, True, strdate)
    else:
        fname = msg.get_filename()
        if fname == None:
            if msg.get_content_type() == 'text/plain':
                text += "\n%s" % msg.get_payload(decode=1)
            else:
                # Non-plain inline part: treat it as an HTML attachment so it
                # gets saved and converted to text below.
                fname = "message.htm"
        ## Save an attachment to a file ========================
        if not fname == None:
            fname = decode_field(fname)
            # NOTE(review): mailboxdir is a module-level global (not visible
            # here); the backslash separators assume a Windows path layout.
            filename = "%s\\att_%s\\%s_%s" % (mailboxdir, mailbox, strdate, fname)
            org_filename = filename
            i = 1
            # Avoid clobbering an existing file: append " (1)", " (2)", ...
            while os.path.exists(filename):
                path, ext = os.path.splitext(org_filename)
                filename = "%s (%d)%s" % (path, i, ext)
                i = i + 1
            print " Found part: %s" % filename  # for debugging purposes
            attachments += "%s\n" % filename
            fd = open (filename, "wb")
            # NOTE(review): get_payload(decode=1) can return None for some
            # undecodable parts, which would make fd.write raise — confirm
            # upstream guarantees.
            data = msg.get_payload(decode=1)
            fd.write(data)
            # convert an html message to text
            if fname == "message.htm":
                try:
                    strio = cStringIO.StringIO()
                    html2text.html2text_file(data, out=strio.write)
                    text += strio.getvalue()
                    strio.close()
                except sgmllib.SGMLParseError, e:
                    print e
            fd.close()
def get_desc(soup, url):
    """Return a plain-text description: prefer converting the parsed *soup*,
    otherwise fetch and extract the text from *url*."""
    if not soup:
        return unicode(html2content.get_text(url))
    return ht.html2text_file(str(soup), None)
def get_desc(soup, url):
    """Produce a text description from the parsed *soup* when available,
    falling back to extracting the page text directly from *url*."""
    if soup:
        description = ht.html2text_file(str(soup), None)
    else:
        description = unicode(html2content.get_text(url))
    return description
def write_hakyll(data, target_format): sys.stdout.write("writing") item_uids = {} attachments = {} def get_blog_path(data, path_infix='hakyll'): name = data['header']['link'] name = re.sub('^https?', '', name) name = re.sub('[^A-Za-z0-9_.-]', '', name) return os.path.normpath(build_dir + '/' + path_infix + '/' + name) blog_dir = get_blog_path(data) def get_full_dir(dir): full_dir = os.path.normpath(blog_dir + '/' + dir) if (not os.path.exists(full_dir)): os.makedirs(full_dir) return full_dir def open_file(file): f = codecs.open(file, 'w', encoding='utf-8') return f def get_item_uid(item, date_prefix=False, namespace=''): result = None if namespace not in item_uids: item_uids[namespace] = {} if item['wp_id'] in item_uids[namespace]: result = item_uids[namespace][item['wp_id']] else: uid = [] if (date_prefix): dt = datetime.strptime(item['date'], date_fmt) uid.append(dt.strftime('%Y-%m-%d')) uid.append('-') s_title = item['slug'] if s_title is None or s_title == '': s_title = item['title'] if s_title is None or s_title == '': s_title = 'untitled' s_title = s_title.replace(' ', '_') s_title = s_title.strip(' \t\n\r\'') s_title = re.sub('[^a-zA-Z0-9_-]', '', s_title) uid.append(s_title) fn = ''.join(uid) n = 1 while fn in item_uids[namespace]: n = n + 1 fn = ''.join(uid) + '_' + str(n) item_uids[namespace][i['wp_id']] = fn result = fn return result def get_item_path(item, dir=''): full_dir = get_full_dir(dir) filename_parts = [full_dir, '/'] if build_mode == 'tree': m = re.search('(\d+-\d+-\d+)(-)(.+)', item['uid']) if m is not None: uiddt = datetime.strptime(m.group(1),'%Y-%m-%d').strftime('%Y/%m/%d') filename_parts.append(uiddt) if (not os.path.exists(''.join(filename_parts))): os.makedirs(''.join(filename_parts)) filename_parts.append(os.path.join('/', m.group(3))) else: filename_parts.append(item['uid']) else: filename_parts.append(item['uid']) if item['type'] == 'page': if (not os.path.exists(''.join(filename_parts))): os.makedirs(''.join(filename_parts)) 
filename_parts.append('/index') filename_parts.append('.') filename_parts.append(target_format) return ''.join(filename_parts) def get_attachment_path(src, dir, dir_prefix='a'): try: files = attachments[dir] except KeyError: attachments[dir] = files = {} try: filename = files[src] except KeyError: file_root, file_ext = os.path.splitext(os.path.basename(urlparse(src)[2])) file_infix = 1 if file_root == '': file_root = '1' current_files = files.values() maybe_filename = file_root + file_ext while maybe_filename in current_files: maybe_filename = file_root + '-' + str(file_infix) + file_ext file_infix = file_infix + 1 files[src] = filename = maybe_filename target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir) target_file = os.path.normpath(target_dir + '/' + filename) if (not os.path.exists(target_dir)): os.makedirs(target_dir) #if src not in attachments[dir]: ##print target_name return target_file for i in data['items']: skip_item = False for field, value in item_field_filter.iteritems(): if(i[field] == value): skip_item = True break if(skip_item): continue sys.stdout.write(".") sys.stdout.flush() out = None i['title'] = i['title'].strip(' \t\n\r\'') yaml_header = { 'title': i['title'], 'date': datetime.strptime(i['date'], '%Y-%m-%d %H:%M:%S'), 'slug': i['slug'], 'wordpressid': int(i['wp_id']), 'comments': i['comments'], } if i['status'] != u'publish': yaml_header['published'] = False if i['type'] == 'post': i['uid'] = get_item_uid(i, date_prefix=True) fn = get_item_path(i, dir='_posts') out = open_file(fn) yaml_header['layout'] = 'post' elif i['type'] == 'page': i['uid'] = get_item_uid(i) # Chase down parent path, if any parentpath = '' item = i while item['parent'] != "0": item = next((parent for parent in data['items'] if parent['wp_id'] == item['parent']), None) if item: parentpath = get_item_uid(item) + "/" + parentpath else: break fn = get_item_path(i, parentpath) out = open_file(fn) yaml_header['layout'] = 'page' elif i['type'] in 
item_type_filter: pass else: print "Unknown item type :: " + i['type'] if download_images: for img in i['img_srcs']: try: urlretrieve(urljoin(data['header']['link'], img.decode('utf-8')), get_attachment_path(img, i['uid'])) except: print "\n unable to download " + urljoin(data['header']['link'], img.decode('utf-8')) if out is not None: def toyaml(data): return yaml.safe_dump(data, allow_unicode=True, default_flow_style=False).decode('utf-8') tax_out = {} for taxonomy in i['taxanomies']: tvalue_list = [] for tvalue in i['taxanomies'][taxonomy]: t_name = taxonomy_name_mapping.get(taxonomy, taxonomy) if t_name not in tax_out: tax_out[t_name] = [] if tvalue in tax_out[t_name]: continue tvalue_list.append(tvalue) tax_out[t_name] = ",".join(tvalue_list) out.write('---\n') if len(yaml_header) > 0: out.write(toyaml(yaml_header)) if len(tax_out) > 0: out.write(toyaml(tax_out)) out.write('---\n\n') try: out.write(html2text_file(i['body'], None)) except: print "\n Parse error on: " + title out.close() print "\n"
def format_title(self, title):
    """Flatten an item title: strip HTML, trim trailing CR/LF/spaces and
    collapse newlines into single spaces."""
    flat = html2text_file(title, None)
    return flat.strip('\r\n ').replace('\n', ' ')
def _html2text(html):
    """Render *html* to plain text via html2text's file-style interface,
    capturing the output in an in-memory buffer."""
    buffer = StringIO()
    html2text.html2text_file(html, buffer.write)
    result = buffer.getvalue()
    buffer.close()
    return result