def _import_subscriptions(self):
    """Find subscriptions.xml inside the uploaded zip and subscribe the user.

    Returns False when no subscriptions.xml is present, True otherwise.
    """
    outline = None
    for name in self.z.namelist():
        if 'subscriptions.xml' in name:
            outline = opml.from_string(self.z.open(name).read())
            break
    if outline is None:
        return False
    for entry in outline:
        if hasattr(entry, 'type'):
            # Plain feed entry: subscribe directly.
            Feed.create_and_subscribe(entry.title, entry.xmlUrl,
                                      entry.htmlUrl, self.user)
        else:
            # A 'group' (folder) of feeds: subscribe each member and
            # tag the user's subscription with the folder title.
            folder = entry
            for item in folder:
                feed = Feed.create_and_subscribe(item.title, item.xmlUrl,
                                                 item.htmlUrl, self.user)
                feed.userfeed(self.user).tags.add(folder.title)
    return True
def POST(self):
    """Handle an OPML upload (web.py, Python 2): merge each outline feed
    into the current user's feed book, updating duplicates in place.

    On parse failure, re-renders the page via self.GET with the error text.
    """
    import opml
    x = web.input(importfile={})
    # Remember this advanced page as the last-visited one for a day.
    memcache.set(MEMC_ADV_ID, self.__url__, 86400)
    if 'importfile' in x:
        user = self.getcurrentuser()
        try:
            rsslist = opml.from_string(x.importfile.file.read())
        except Exception as e:
            return self.GET(str(e))
        for o in self.walkOutline(rsslist):
            # isFulltext is a non-standard OPML attribute.
            title, url, isfulltext = o.text, urllib.unquote_plus(
                o.xmlUrl), o.isFulltext
            isfulltext = bool(isfulltext.lower() in ('true', '1'))
            if title and url:
                # Check whether this feed already exists in the user's book.
                rss = Feed.all().filter('book = ', user.ownfeeds).filter(
                    "url = ", url).get()
                if rss:
                    rss.title = title
                    rss.isfulltext = isfulltext
                    rss.put()
                else:
                    Feed(title=title, url=url, book=user.ownfeeds,
                         isfulltext=isfulltext,
                         time=datetime.datetime.utcnow()).put()
        # Invalidate the cached feed count after the import.
        # NOTE(review): placement reconstructed from collapsed source —
        # assumed to run once after the loop; confirm against original.
        memcache.delete('%d.feedscount' % user.ownfeeds.key().id())
        raise web.seeother('/my')
    else:
        raise web.seeother('')
def get_feeds(opml_text: bytes, selected_shows: Optional[list] = None
              ) -> list:
    """Parse OPML bytes into a list of ``{'feed_name', 'feed_url'}`` dicts.

    When *selected_shows* is a non-empty collection (and its first element
    is not the empty string), the outline is filtered down to those shows.
    Raises Exception on any outline entry whose type is not 'rss'.
    """
    if selected_shows:
        print('These are the selected shows: {}'.format(', '.join(selected_shows)))
    outline = opml.from_string(opml_text)
    if selected_shows:
        # Peek at one element without removing it — see
        # https://stackoverflow.com/questions/59825/how-to-retrieve-an-element-from-a-set-without-removing-it#answer-59841
        if next(iter(selected_shows)) != '':
            outline = filter_shows(outline, selected_shows)
    filtered_feeds = []
    for feed in outline:
        if feed.type != 'rss':
            raise Exception("Your file had a non rss value.")
        filtered_feeds.append({'feed_name': feed.title,
                               'feed_url': feed.htmlUrl})
    names = ', '.join(entry['feed_name'] for entry in filtered_feeds)
    print('This is the filtered show list: {}'.format(names))
    return filtered_feeds
def opml_import(opml_data, idoffset, isdup):
    """Parse an OPML document and return a list of feed dicts.

    opml_data -- raw OPML text/bytes
    idoffset  -- starting id for generated entries (ids continue from it)
    isdup     -- callable(normalized_url, tags) -> True to skip that feed

    Each feed dict has: id, tags (folder titles, de-duplicated), title,
    xmlUrl and htmlUrl ('' when absent).
    """
    outline = opml.from_string(opml_data)
    feeds = []

    def handlefeed(sub, tags):
        # Skip feeds the caller already knows about.
        if isdup(sub.xmlUrl.strip().lower(), tags):
            return
        feeds.append(dict(id=len(feeds) + idoffset,
                          tags=list(set(tags)),  # drop duplicate folder names
                          title=sub.text,
                          xmlUrl=sub.xmlUrl,
                          htmlUrl=sub.htmlUrl if sub.htmlUrl else ''))

    def handlesub(sub, tags):
        if len(sub) < 1:
            # Leaf node: a single feed.
            handlefeed(sub, tags)
        else:
            # Folder: accumulate its title as a tag for all descendants.
            newtags = list(tags)
            try:
                newtags.append(sub.text)
            except AttributeError:
                # Was a bare except; the root outline object has no .text.
                pass
            for s in sub:
                handlesub(s, newtags)

    handlesub(outline, [])
    return feeds
def opml_import(opml_data, idoffset, isdup):
    """Parse an OPML document and return a list of feed dicts.

    opml_data -- raw OPML text/bytes
    idoffset  -- starting id for generated entries (ids continue from it)
    isdup     -- callable(normalized_url, tags) -> True to skip that feed

    Each feed dict has: id, tags (folder titles, de-duplicated), title,
    xmlUrl and htmlUrl ('' when absent).
    """
    outline = opml.from_string(opml_data)
    feeds = []

    def handlefeed(sub, tags):
        # Skip feeds the caller already knows about.
        if isdup(sub.xmlUrl.strip().lower(), tags):
            return
        feeds.append(dict(id=len(feeds) + idoffset,
                          tags=list(set(tags)),  # drop duplicate folder names
                          title=sub.text,
                          xmlUrl=sub.xmlUrl,
                          htmlUrl=sub.htmlUrl if sub.htmlUrl else ''))

    def handlesub(sub, tags):
        if len(sub) < 1:
            # Leaf node: a single feed.
            handlefeed(sub, tags)
        else:
            # Folder: accumulate its title as a tag for all descendants.
            newtags = list(tags)
            try:
                newtags.append(sub.text)
            except AttributeError:
                # Was a bare except; the root outline object has no .text.
                pass
            for s in sub:
                handlesub(s, newtags)

    handlesub(outline, [])
    return feeds
def post(self):
    """Handle an OPML upload via App Engine blobstore and merge the parsed
    outline into the current user's stored blog list (JSON private_data)."""
    user = users.get_current_user()
    if not user:
        # NOTE(review): redirect() does not stop execution here, so the
        # code below still runs for anonymous users — looks like a missing
        # `return`; confirm against the framework's redirect semantics.
        self.redirect('/')
    from google.appengine.ext import blobstore
    import opml
    # get file
    upload_files = self.get_uploads('file')
    blob_info = upload_files[0]
    blob_reader = blobstore.BlobReader(blob_info.key())
    opmlFile = blob_reader.read()
    # get user
    ud = GetAppUserByEmail(user.email())
    private_data = json.loads(ud.private_data)
    self.bloglist = private_data['bloglist']
    # parse file
    outline = opml.from_string(opmlFile)
    self.processOutline(outline, 'root')
    # save new data
    private_data['bloglist'] = self.bloglist
    ud.private_data = json.dumps(private_data)
    ud.put()
    logging.debug('imported blog list: %s', json.dumps(self.bloglist))
    self.redirect('/')
def get_form_initial(self, step): if step == '1': src = None uploaddata = self.get_cleaned_data_for_step('0') if uploaddata['file']: fsrc = uploaddata['file'] str = "" for chunk in fsrc.chunks(): str += chunk ofile = opml.from_string(str) else: src = uploaddata['url'] ofile = opml.parse(src) initial = [] for entry in ofile: init_entry = { 'enabled': True, 'title': entry.title, 'feedurl': entry.xmlUrl, 'wwwurl': entry.htmlUrl, } initial.append(init_entry) return initial else: return super(OPMLImport, self).get_form_initial(step)
def POST(self):
    """Handle an OPML upload (web.py, Python 2): merge each outline feed
    into the current user's feed book, updating duplicates in place.

    On parse failure, re-renders the page via self.GET with the error text.
    """
    import opml
    x = web.input(importfile={})
    # Remember this advanced page as the last-visited one for a day.
    memcache.set(MEMC_ADV_ID, self.__url__, 86400)
    if 'importfile' in x:
        user = self.getcurrentuser()
        try:
            rsslist = opml.from_string(x.importfile.file.read())
        except Exception as e:
            return self.GET(str(e))
        for o in self.walkOutline(rsslist):
            # isFulltext is a non-standard OPML attribute.
            title, url, isfulltext = o.text, urllib.unquote_plus(o.xmlUrl), o.isFulltext
            isfulltext = bool(isfulltext.lower() in ('true', '1'))
            if title and url:
                # Check whether this feed already exists in the user's book.
                rss = Feed.all().filter('book = ', user.ownfeeds).filter("url = ", url).get()
                if rss:
                    rss.title = title
                    rss.isfulltext = isfulltext
                    rss.put()
                else:
                    Feed(title=title, url=url, book=user.ownfeeds,
                         isfulltext=isfulltext,
                         time=datetime.datetime.utcnow()).put()
        # Invalidate the cached feed count after the import.
        # NOTE(review): placement reconstructed from collapsed source —
        # assumed to run once after the loop; confirm against original.
        memcache.delete('%d.feedscount' % user.ownfeeds.key().id())
        raise web.seeother('/my')
    else:
        raise web.seeother('')
def to_python(self, data):
    """Validate that the uploaded file parses as OPML, then rewind it.

    Raises forms.ValidationError when the content is not valid OPML.
    """
    cleaned = super(OPMLField, self).to_python(data)
    if cleaned is None:
        return
    # Uploaded files expose .read(); in-memory payloads carry 'content'.
    content = data.read() if hasattr(data, "read") else data["content"]
    try:
        opml.from_string(content)
    except XMLSyntaxError:
        raise forms.ValidationError(_("This file doesn't seem to be a valid OPML file."))
    # Rewind so downstream consumers can re-read the file.
    if hasattr(cleaned, "seek") and callable(cleaned.seek):
        cleaned.seek(0)
    return cleaned
def to_python(self, data):
    """Validate that the uploaded file parses as OPML, then rewind it.

    Raises forms.ValidationError when the content is not valid OPML.
    """
    cleaned = super(OPMLField, self).to_python(data)
    if cleaned is None:
        return
    # Uploaded files expose .read(); in-memory payloads carry 'content'.
    content = data.read() if hasattr(data, 'read') else data['content']
    try:
        opml.from_string(content)
    except XMLSyntaxError:
        raise forms.ValidationError(
            _("This file doesn't seem to be a valid OPML file."))
    # Rewind so downstream consumers can re-read the file.
    if hasattr(cleaned, 'seek') and callable(cleaned.seek):
        cleaned.seek(0)
    return cleaned
def import_opml(nickname, opml_content):
    """Import new feeds from an OPML document for the user *nickname*.

    Walks categories recursively; outlines without an xmlUrl and feeds
    the user already has are skipped. Returns the number of feeds added.
    Re-raises parse errors after logging them.
    """
    user = User.query.filter(User.nickname == nickname).first()
    try:
        subscriptions = opml.from_string(opml_content)
    except Exception:
        logger.exception("Parsing OPML file failed:")
        raise

    def read(subsubscription, nb=0):
        """Parse recursively through the categories and sub-categories."""
        for subscription in subsubscription:
            if len(subscription) != 0:
                # A category: recurse into its children.
                nb = read(subscription, nb)
                continue
            # getattr with a default replaces the old bare try/excepts
            # (same access pattern as the other importers in this file).
            link = getattr(subscription, "xmlUrl", None)
            if link is None:
                # Not a feed outline at all.
                continue
            if Feed.query.filter(Feed.user_id == user.id,
                                 Feed.link == link).first() is not None:
                # Already subscribed.
                continue
            new_feed = Feed(
                title=getattr(subscription, "text", ""),
                description=getattr(subscription, "description", ""),
                link=link,
                site_link=getattr(subscription, "htmlUrl", ""),
                enabled=True,
            )
            user.feeds.append(new_feed)
            nb += 1
        return nb

    nb = read(subscriptions)
    db.session.commit()
    return nb
def import_opml(user_id, opml_url=None, data=None):
    """Subscribe *user_id* to every feed found in an OPML source.

    Accepts either a URL (*opml_url*) or raw OPML text (*data*); when
    both are supplied, *data* wins because it is parsed last.
    """
    outline = None
    if opml_url is not None:
        outline = opml.parse(opml_url)
    if data is not None:
        outline = opml.from_string(data)
    for entry in (outline or []):
        url = entry.xmlUrl
        print(url)
        subscribe_to_url(url, user_id)
def import_opml(user_id, opml_url=None, data=None):
    """Subscribe *user_id* to every feed found in an OPML source.

    Accepts either a URL (``opml_url``) or raw OPML text (``data``); when
    both are supplied, ``data`` wins because it is parsed last.
    (Python 2 code — note the ``print`` statement below.)
    """
    outline = None
    if opml_url is not None:
        outline = opml.parse(opml_url)
    if data is not None:
        outline = opml.from_string(data)
    outline = outline or []
    for entry in outline:
        url = entry.xmlUrl
        # Debug output of each feed url being subscribed.
        print url
        subscribe_to_url(url, user_id)
def post(self, request, *args, **kwargs):
    """Parse the request body as OPML and import its feeds for the user.

    Raises ParseError for invalid OPML or when another import for the
    same user is already in progress.
    """
    try:
        entries = opml.from_string(request.body)
    except XMLSyntaxError:
        raise exceptions.ParseError("This file doesn't seem to be a valid OPML file.")
    known_urls = set(request.user.feeds.values_list("url", flat=True))
    try:
        # Serialize imports per user; a concurrent one raises ValidationError.
        with user_lock("opml_import", request.user.pk, timeout=30):
            count = save_outline(request.user, None, entries, known_urls)
    except ValidationError:
        raise exceptions.ParseError("Another concurrent OPML import is happening "
                                    "for this user.")
    return Response("OK: {0}".format(count))
def add_file(f, openzip, cnt, feed_urls):
    """Harvest feed URLs from a 'subscriptions.xml' member of a zip archive.

    Hidden/system entries and files other than subscriptions.xml are
    ignored and yield (0, feed_urls) unchanged; otherwise every feed's
    xmlUrl is appended to feed_urls and the count is returned with it.
    """
    name = f.filename
    if name.startswith("__") or name.startswith("."):
        return (0, feed_urls)
    if not name.endswith("subscriptions.xml"):
        return (0, feed_urls)
    outline = opml.from_string(openzip.read(f))
    for section in outline:
        for entry in section:
            feed_urls.append(entry.xmlUrl)
            # NOTE(review): indentation in the collapsed source is ambiguous;
            # the increment is assumed per-feed — confirm against original.
            cnt += 1
    return (cnt, feed_urls)
def __init__(self, source):
    """Parse OPML *source* and extract show number, title and shownotes.

    Raises OpmlSourceError when the source cannot be parsed as OPML, and
    ValueError when no show number can be derived from the data.
    """
    self.source = source
    try:
        self.data = opml.from_string(source)
    except Exception as exc:
        raise OpmlSourceError(exc)
    try:
        self.number = self._show_number()
        # Explicit copy so we hold our own unicode string, not a view.
        self.title = self.data.title[:]
        self.shownotes = self._get_shownotes()
    except ValueError:
        raise ValueError('Bad opml data, no show number')
def opml_import():
    """Import feeds for the current user from an uploaded OPML file.

    Creates missing categories on the fly, skips feeds whose link is
    already subscribed, flashes a summary and redirects to the profile.
    """
    if request.files.get('opmlfile', None) is None:
        flash(gettext('Got no file'), 'warning')
        return redirect(url_for('user.profile'))
    data = request.files.get('opmlfile', None)
    try:
        subscriptions = opml.from_string(data.read())
    except Exception:  # narrowed from a bare except (which also caught SystemExit)
        flash(gettext("Couldn't parse file"), 'danger')
        return redirect(request.referrer)
    ccontr = CategoryController(current_user.id)
    fcontr = FeedController(current_user.id)
    created_count, existing_count, failed_count = 0, 0, 0
    categories = {cat.name: cat.id for cat in ccontr.read()}
    for line in subscriptions:
        try:
            link = line.xmlUrl
        except Exception:
            # Outline entries without an xmlUrl are not feeds.
            failed_count += 1
            continue
        # don't import twice
        if fcontr.read(link=link).count():
            existing_count += 1
            continue
        # handling categories
        cat_id = None
        category = getattr(line, 'category', None)
        if category:
            if category not in categories:
                new_category = ccontr.create(name=category)
                categories[new_category.name] = new_category.id
            cat_id = categories[category]
        fcontr.create(title=getattr(line, 'text', None),
                      category_id=cat_id,
                      description=getattr(line, 'description', None),
                      link=link,
                      site_link=getattr(line, 'htmlUrl', None))
        created_count += 1
    flash(gettext("Created %(created)d feed ! (%(failed)d import failed, "
                  "%(existing)d were already existing)",
                  created=created_count, failed=failed_count,
                  existing=existing_count), "info")
    return redirect(url_for('user.profile'))
def parse_user_opml(user_id, source):
    """Parse a two-level OPML export (categories containing feed items)
    and register each item as a feed source for *user_id*.

    Returns True on success, False when parsing fails or yields nothing.
    """
    # Strip the XML declaration first — presumably the parser rejects it;
    # confirm against the opml library's behavior.
    source = source.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    try:
        parsed = opml.from_string(source)
    except Exception as exc:
        logger.error(exc)
        return False
    if not parsed:
        return False
    for category in parsed:
        logger.debug('Category: %s' % category.text)
        for item in category:
            logger.debug('Item: %s %s' % (item.text, item.xmlUrl))
            SourceFactory().add_to_user(user_id, 'feed', item.xmlUrl,
                                        item.text, category.text)
    return True
def post(self, request, *args, **kwargs):
    """Parse the request body as OPML and import its feeds for the user.

    Raises ParseError for invalid OPML or when another import for the
    same user is already running.
    """
    try:
        entries = opml.from_string(request.body)
    except XMLSyntaxError:
        raise exceptions.ParseError(
            "This file doesn't seem to be a valid OPML file.")
    # URLs the user is already subscribed to, for duplicate skipping.
    existing_feeds = set(request.user.feeds.values_list('url', flat=True))
    try:
        with user_lock('opml_import', request.user.pk, timeout=30):
            imported = save_outline(request.user, None,
                                    entries, existing_feeds)
    except ValidationError:
        raise exceptions.ParseError(
            "Another concurrent OPML import is happening "
            "for this user.")
    return Response("OK: {0}".format(imported))
def post():
    """REST endpoint: import feeds from an uploaded OPML file.

    Returns (counts, status): 201 when anything was created, 400 when
    only failures occurred, 200 otherwise. counts carries created /
    existing / failed totals plus the failure exception messages.
    """
    opml_file = request.files['opml_file']
    try:
        subscriptions = opml.from_string(opml_file.read())
    except Exception as error:
        raise UnprocessableEntity("Couldn't parse OPML file (%r)" % error)
    ccontr = CategoryController(current_identity.id)
    fcontr = FeedController(current_identity.id)
    counts = {'created': 0, 'existing': 0, 'failed': 0, 'exceptions': []}
    categories = {cat.name: cat.id for cat in ccontr.read()}
    for line in subscriptions:
        try:
            link = line.xmlUrl
        except Exception as error:
            # Entry without a feed URL — record why it failed.
            counts['failed'] += 1
            counts['exceptions'].append(str(error))
            continue
        if fcontr.read(link=link).count():
            # don't import twice
            counts['existing'] += 1
            continue
        # handling categories
        cat_id = None
        category = getattr(line, 'category', '').lstrip('/')
        if category:
            if category not in categories:
                new_category = ccontr.create(name=category)
                categories[new_category.name] = new_category.id
            cat_id = categories[category]
        fcontr.create(title=getattr(line, 'text', None),
                      category_id=cat_id,
                      description=getattr(line, 'description', None),
                      link=link,
                      site_link=getattr(line, 'htmlUrl', None))
        counts['created'] += 1
    if counts.get('created'):
        code = 201
    elif counts.get('failed'):
        code = 400
    else:
        code = 200
    return counts, code
def import_opml(email, opml_content):
    """Import new feeds from an OPML document for the user with *email*.

    Walks categories recursively; outlines without an xmlUrl and feeds
    the user already has are skipped. Returns the number of feeds added.
    Re-raises parse errors after logging them.
    """
    user = User.query.filter(User.email == email).first()
    try:
        subscriptions = opml.from_string(opml_content)
    except Exception:
        logger.exception("Parsing OPML file failed:")
        raise

    def read(subsubscription, nb=0):
        """Parse recursively through the categories and sub-categories."""
        for subscription in subsubscription:
            if len(subscription) != 0:
                # A category: recurse into its children.
                nb = read(subscription, nb)
                continue
            # getattr with a default replaces the old bare try/excepts
            # (same access pattern as the other importers in this file).
            link = getattr(subscription, "xmlUrl", None)
            if link is None:
                # Not a feed outline at all.
                continue
            if Feed.query.filter(Feed.user_id == user.id,
                                 Feed.link == link).first() is not None:
                # Already subscribed.
                continue
            new_feed = Feed(title=getattr(subscription, "text", ""),
                            description=getattr(subscription, "description", ""),
                            link=link,
                            site_link=getattr(subscription, "htmlUrl", ""),
                            enabled=True)
            user.feeds.append(new_feed)
            nb += 1
        return nb

    nb = read(subscriptions)
    db.session.commit()
    return nb
def opml_import():
    """Import feeds for the current user from an uploaded OPML file.

    Creates missing categories on the fly, skips feeds whose link is
    already subscribed, flashes a summary and redirects to the profile.
    """
    if request.files.get('opmlfile', None) is None:
        flash(gettext('Got no file'), 'warning')
        return redirect(url_for('user.profile'))
    data = request.files.get('opmlfile', None)
    try:
        subscriptions = opml.from_string(data.read())
    except Exception:  # narrowed from a bare except (which also caught SystemExit)
        flash(gettext("Couldn't parse file"), 'danger')
        return redirect(request.referrer)
    ccontr = CategoryController(current_user.id)
    fcontr = FeedController(current_user.id)
    created_count, existing_count, failed_count = 0, 0, 0
    categories = {cat.name: cat.id for cat in ccontr.read()}
    for line in subscriptions:
        try:
            link = line.xmlUrl
        except Exception:
            # Outline entries without an xmlUrl are not feeds.
            failed_count += 1
            continue
        # don't import twice
        if fcontr.read(link=link).count():
            existing_count += 1
            continue
        # handling categories
        cat_id = None
        category = getattr(line, 'category', None)
        if category:
            if category not in categories:
                new_category = ccontr.create(name=category)
                categories[new_category.name] = new_category.id
            cat_id = categories[category]
        fcontr.create(title=getattr(line, 'text', None),
                      category_id=cat_id,
                      description=getattr(line, 'description', None),
                      link=link,
                      site_link=getattr(line, 'htmlUrl', None))
        created_count += 1
    flash(gettext("Created %(created)d feed ! (%(failed)d import failed, "
                  "%(existing)d were already existing)",
                  created=created_count, failed=failed_count,
                  existing=existing_count), "info")
    return redirect(url_for('user.profile'))
def POST(self):
    """Handle an OPML upload (web.py, Python 2): merge each outline feed
    into the user's feed book, honoring a per-upload full-text default.

    On parse failure, re-renders the page via self.GET with the error text.
    """
    import opml
    x = web.input(importfile={})
    # Whether to import as full-text RSS when the file does not say.
    defaultIsfulltext = bool(x.get('defaultIsfulltext'))
    if 'importfile' in x:
        user = self.getcurrentuser()
        try:
            rsslist = opml.from_string(x.importfile.file.read())
        except Exception as e:
            return self.GET(str(e))
        for o in self.walkOutline(rsslist):
            # isFulltext is a non-standard OPML attribute.
            title, url, isfulltext = o.text, urllib.unquote_plus(
                o.xmlUrl), o.isFulltext
            if isfulltext.lower() in ('true', '1'):
                isfulltext = True
            elif isfulltext.lower() in ('false', '0'):
                isfulltext = False
            else:
                isfulltext = defaultIsfulltext
            if title and url:
                try:
                    url = url.decode('utf-8')
                except:
                    pass
                # Check whether this feed already exists in the user's book.
                rss = Feed.all().filter('book = ', user.ownfeeds).filter(
                    "url = ", url).get()
                if rss:
                    rss.title = title
                    rss.isfulltext = isfulltext
                    rss.put()
                else:
                    Feed(title=title, url=url, book=user.ownfeeds,
                         isfulltext=isfulltext,
                         time=datetime.datetime.utcnow()).put()
        raise web.seeother('/my')
    else:
        raise web.seeother(self.__url__)
def POST(self):
    """Handle an OPML upload (web.py, Python 2): merge each outline feed
    into the user's feed book, honoring a per-upload full-text default.

    On parse failure, re-renders the page via self.GET with the error text.
    """
    import opml
    x = web.input(importfile={})
    # Whether to import as full-text RSS when the file does not say.
    defaultIsfulltext = bool(x.get('defaultIsfulltext'))
    if 'importfile' in x:
        user = self.getcurrentuser()
        try:
            rsslist = opml.from_string(x.importfile.file.read())
        except Exception as e:
            return self.GET(str(e))
        for o in self.walkOutline(rsslist):
            # isFulltext is a non-standard OPML attribute.
            title, url, isfulltext = o.text, urllib.unquote_plus(o.xmlUrl), o.isFulltext
            if isfulltext.lower() in ('true', '1'):
                isfulltext = True
            elif isfulltext.lower() in ('false', '0'):
                isfulltext = False
            else:
                isfulltext = defaultIsfulltext
            if title and url:
                try:
                    url = url.decode('utf-8')
                except:
                    pass
                # Check whether this feed already exists in the user's book.
                rss = Feed.all().filter('book = ', user.ownfeeds).filter("url = ", url).get()
                if rss:
                    rss.title = title
                    rss.isfulltext = isfulltext
                    rss.put()
                else:
                    Feed(title=title, url=url, book=user.ownfeeds,
                         isfulltext=isfulltext,
                         time=datetime.datetime.utcnow()).put()
        raise web.seeother('/my')
    else:
        raise web.seeother(self.__url__)
import opml

# Load the user's subscriptions export; opml.from_string expects bytes.
subscriptions = opml.from_string(
    open('subscriptions.opml', 'r').read().encode("UTF-8"))


def is_category(item):
    """An outline node without a 'type' attribute is a folder, not a feed."""
    return not hasattr(item, 'type')


def list_feeds(title, clctn, indent=0):
    """Recursively pretty-print the subscription tree, one indent per level."""
    pad = " " * indent
    print(pad + title)
    for row in clctn:
        if not is_category(row):
            print((" " * (indent + 1)) + row.title + " " + row.xmlUrl)
        else:
            list_feeds(row.title, row, indent + 1)


list_feeds(subscriptions.title, subscriptions)
def import_opml(opml_string, user):
    """Parse *opml_string* and process every top-level outline for *user*.

    Raises NoFeedsFound when the document contains no outlines at all.
    """
    outline = opml.from_string(opml_string)
    if not len(outline):
        raise NoFeedsFound()
    for element in outline:
        process_outline(element, user)
def parse(outline_url, my_folder, my_home_index_page): global DEBUG OPTIONS = {} TEMPLATES = {} PAGES = {} FILENAMES = [] GLOSSARY_COMPLETE = False CALENDARS = {} GLOSSARY = DEFAULT_GLOSSARY.copy() data_folder, my_folder = os.path.split(my_folder) working_dir, DATA_FOLDER = os.path.split(data_folder) os.chdir(working_dir) base_folder = "%s/%s" % (DATA_FOLDER, my_folder) mkdir_p(base_folder) outline_url = re.sub('www', 'dl', outline_url) if 'usercontent' not in outline_url: outline_url = re.sub('dropbox', 'dropboxusercontent', outline_url) outline = list(opml.from_string(requests.get(outline_url).content)) for i, node in enumerate(outline): if node.text == '#glossary': GLOSSARY.update(grabChildren(outline.pop(i))) elif node.text == '#templates': TEMPLATES.update(grabChildren(outline.pop(i))) else: first_word = node.text.split(' ')[0] if first_word in GLOSSARY_OPTIONS: try: value = OPTIONS[first_word[1:]] except: value = node.text GLOSSARY.update(GLOSSARY_FUNCTIONS[first_word](value)) for k, v in GLOSSARY.items(): if type(v[0]) == type([]): GLOSSARY[k] = ''.join(v[1]) for k, v in TEMPLATES.items(): if type(v[0]) == type([]): TEMPLATES[k] = (v[0], ''.join(v[1])) GLOSSARY_COMPLETE = True while outline: next_node = outline.pop() try: if next_node.type == 'include': real_url = re.sub('dropbox', 'dropboxusercontent', next_node.url) real_url = re.sub('https', 'http', real_url) include = list(opml.from_string( requests.get(real_url).content)) nodes = include else: nodes = [next_node] except: nodes = [next_node] for node in nodes: try: if node.icon == 'calendar': try: i_title = node.name except: i_title = node.text CALENDARS = addCalendar('Home', node, i_title, CALENDARS) continue except: pass try: if node[0].icon == 'calendar': try: i_title = node[0].name except: i_title = node.text CALENDARS = addCalendar(node.text, node[0], i_title, CALENDARS) continue except: pass if node.text[0] == '#': option = node.text[1:].rstrip().lstrip() parts = option.split(' ') if len(parts) < 
2: OPTIONS[option] = True else: key, value = parts[0], ' '.join(parts[1:]) if value[0] == '[': value = random.choice( [v.strip() for v in value[1:-1].split(',')]) elif value[0] == '"': value = value[1:-1] if value.lower() in ['true', 'false']: OPTIONS[key] = bool(value) else: try: OPTIONS[key] = int(value) except: OPTIONS[key] = value else: brandLink = '/' blogHomeTitle = OPTIONS.get('blogHomeTitle', 'Home') page = {} try: this_type = node.type except: this_type = 'outline' try: rules, template = TEMPLATES[this_type] except Exception as e: raise Usage( "#templates node required until I pull default templates from Trex. \n\n%s" % e.message) template = ''.join(template) for k, v in node._root.items(): template = re.sub('<%%%s%%>' % k, v, template) page[k] = v page_desc = page.get('pageDescription', ' ') template = re.sub('<%blogHomeTitle%>', blogHomeTitle, template) template = re.sub('<%pageTitle%>', page['text'], template) template = re.sub('<%pageDescription%>', page_desc, template) if 'name' not in page: page['name'] = makeName(page['text']) if 'url' not in page: page['url'] = "/%s" % page['name'] waste, bodytext = grabData(node, rules, this_type) bodytext.append( '</div>' ) # not sure why we need this - something's not right bodytext = ''.join(bodytext) data = re.sub('<%bodytext%>', bodytext, template) page['bodytext'] = bodytext template = subData(data, GLOSSARY) template = re.sub( '<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', template) template = re.sub('<%BRAND%>', blogHomeTitle, template) template = re.sub('<%BRANDLINK%>', brandLink, template) page['data'] = template PAGES[page['name']] = page for k, v in PAGES.items(): file_name = getFileName("%s/%s" % (base_folder, v['name']), FILENAMES) FILENAMES.append(file_name) save_file = False new_data = v['data'] if os.path.exists(file_name): fh = open(file_name) file_data = fh.read()[:-1] fh.close() save_file = (file_data != new_data) else: save_file = True if save_file: fh = 
open(file_name, "w+") print >> fh, new_data fh.close() if os.path.basename(file_name) == my_home_index_page: print file_name fh = open( os.path.join(os.path.split(file_name)[0], "index.html"), "w+") print >> fh, new_data fh.close() blogHomeTitle = OPTIONS.get('blogHomeTitle', 'Home') posts = {"Home": []} feedcount = OPTIONS.get('feedCount', 20) domain = "http://%s" % OPTIONS.get('domainName', '') feed_posts = [] for base, calendar_stuff in CALENDARS.items(): ycals, index_title = calendar_stuff page_data, path_name = None, None this_post_data, this_path_name = None, None while ycals: if this_post_data: file_name = getFileName( "%s/%s" % (base_folder, this_path_name), FILENAMES) FILENAMES.append(file_name) save_file = False new_data = re.sub('<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data) if os.path.exists(file_name): fh = open(file_name) file_data = fh.read()[:-1] fh.close() save_file = (file_data != new_data) else: save_file = True if save_file: fh = open(file_name, "w+") print >> fh, new_data fh.close() ycal = ycals.pop() try: index_desc = ycal.description except: index_desc = '' year_title = ycal.text year_name = ycal.text year_num = ycal.text if base == 'Home': brandLink = '/' sub_folder = '' root_folder = base_folder year_path = year_num else: sub_folder = makeName(base) brandLink = "/%s" % sub_folder root_folder = "%s/%s" % (base_folder, sub_folder) if not os.path.exists(root_folder): os.mkdir(root_folder) year_path = "%s/%s" % (sub_folder, year_num) if (sub_folder, index_title) not in posts: posts[(sub_folder, index_title)] = [] year_folder = "%s/%s" % (base_folder, year_path) try: if not os.path.exists(year_folder): os.mkdir(year_folder) except: mkdir_p(year_folder) for mcal in ycal: month_title = mcal.text month_name = month_title.split(' ')[0] month_num = MONTHS[month_name] month_path = "%s/%s" % (year_path, month_num) month_folder = "%s/%s" % (year_folder, month_num) if not os.path.exists(month_folder): os.mkdir(month_folder) 
for dcal in mcal: day_title = dcal.text day_name = dcal.text day_num = "%02d" % float(day_title.split(' ')[1]) day_path = "%s/%s" % (month_path, day_num) day_folder = "%s/%s" % (month_folder, day_num) if not os.path.exists(day_folder): os.mkdir(day_folder) trail = [(year_path, year_title), (month_path, month_title), (day_path, day_title)] trail_links = """ <nextprev> <div class="breadcrumbs"><a href="/%s">%s</a> / %s</div> """ % (sub_folder, base, " / ".join( ['<a href="/%s/">%s</a>' % (l, n) for l, n in trail])) # using page below because it matches above for node in dcal: prev_post_data = this_post_data prev_path_name = this_path_name this_post_data = page_data this_path_name = path_name page = {} try: this_type = node.type except: this_type = 'outline' rules, template = TEMPLATES[this_type] template = ''.join(template) for k, v in node._root.items(): template = re.sub('<%%%s%%>' % k, v, template) page[k] = v page_desc = page.get('pageDescription', index_desc) template = re.sub('<%blogHomeTitle%>', blogHomeTitle, template) template = re.sub('<%pageTitle%>', page['text'], template) template = re.sub('<%pageDescription%>', page_desc, template) if 'name' not in page: page['name'] = makeName(page['text']) if 'url' not in page: page['url'] = "/%s" % page['name'] waste, bodytext = grabData(node, rules, this_type) bodytext.append( '</div><!--FIX-->' ) # not sure why we need this - something's not right bodytext = '\n'.join(bodytext) data = re.sub('<%bodytext%>', bodytext, template) page['bodytext'] = bodytext template = re.sub('</h1>', '</h1>%s' % trail_links, subData(data, GLOSSARY)) template = re.sub( '<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', template) template = re.sub('<%BRAND%>', index_title, template) template = re.sub('<%BRANDLINK%>', brandLink, template) path_name = "%s/%s" % (day_path, page['name']) page['url'] = "/%s" % path_name listing = page['text'], page['bodytext'], page[ 'url'], page_desc if feedcount: try: if 
node.isFeedItem == 'true': feed_posts.append( PyRSS2Gen.RSSItem( title=page['text'], link=domain + page['url'], description=page['bodytext'], guid=domain + page['url'], pubDate=page['created'])) feedcount -= 1 except: pass # Do this after listing so comments don't show on index pages disqusGroupName = OPTIONS.get('disqusGroupName', False) commentsString = '' if disqusGroupName: uniq_id = outline_url + node.created commentsString = """ <script>var disqus_identifier = '%s';</script><a onclick="showHideComments ()"><span id="idShowHideComments" style="cursor: pointer;"></span></a><div class="divDisqusComments" id="idDisqusComments" style="visibility: visible;" ><div id="disqus_thread"></div></div><script type="text/javascript" src="http://disqus.com/forums/%s/embed.js"></script></div> """ % (uniq_id, disqusGroupName) page_data = re.sub('<!-- COMMENTS -->', commentsString, template) page['data'] = page_data if sub_folder: posts[(sub_folder, index_title)].append(listing) else: posts["Home"].append(listing) for path_info in [(year_path, year_title), (month_path, month_title), (day_path, day_title)]: if not path_info[0]: continue if path_info not in posts: if sub_folder: path_info = path_info[0], path_info[1] posts[path_info] = [] posts[path_info].append(listing) if this_post_data: file_name = getFileName( "%s/%s" % (base_folder, this_path_name), FILENAMES) FILENAMES.append(file_name) save_file = False new_data = re.sub( '<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data) if os.path.exists(file_name): fh = open(file_name) file_data = fh.read()[:-1] fh.close() save_file = (file_data != new_data) else: save_file = True if save_file: fh = open(file_name, "w+") print >> fh, new_data.encode('utf-16') fh.close() if not ycals: prev_post_data = this_post_data prev_path_name = this_path_name this_post_data = page_data this_path_name = path_name path_name = None if this_post_data: file_name = getFileName( "%s/%s" % (base_folder, this_path_name), FILENAMES) 
FILENAMES.append(file_name) save_file = False new_data = re.sub( '<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data) if os.path.exists(file_name): fh = open(file_name) file_data = fh.read()[:-1] fh.close() save_file = (file_data != new_data) else: save_file = True if save_file: fh = open(file_name, "w+") print >> fh, new_data fh.close() # Generate Feed date_format = "%a, %d %b %Y %H:%M:%S %Z" feed_posts.sort( key=lambda x: datetime.datetime.strptime(x.pubDate, date_format), reverse=True) buildFeed(OPTIONS['rssTitle'], domain, page_desc, feed_posts, base_folder) # iterate over posts for path_info, these_posts in posts.items(): count = OPTIONS.get('bloghomeItemCount', 20) chunks = [ these_posts[x:x + count] for x in xrange(0, len(these_posts), count) ] for i, chunk in enumerate(chunks): try: pageDescription = chunks[0][0][-1] except: pageDescription = OPTIONS.get('pageDescription', ' ') blogHomeDescription = OPTIONS.get('blogHomeDescription', pageDescription) if not i: page_name = "index" else: page_name = str(i + 1) if path_info == "Home": brandLink = '/' page_title = blogHomeTitle page_desc = blogHomeDescription file_name = getFileName( "%s/%s.html" % (base_folder, page_name), FILENAMES) FILENAMES.append(file_name) else: path, page_title = path_info brandLink = "/%s" % path.split('/')[0] page_desc = pageDescription file_name = getFileName( "%s/%s/%s.html" % (base_folder, path, page_name), FILENAMES) FILENAMES.append(file_name) rules, template = TEMPLATES['bloghome'] template = ''.join(template) # do the title and desc first to avoid overwriting with home title template = re.sub("<\%blogHomeTitle\%>", page_title, template) template = re.sub("<\%blogHomeDescription\%>", page_desc, template) template = re.sub("<\%pageTitle\%>", page_title, template) template = re.sub("<\%pageDescription\%>", page_desc, template) template = subData(template, GLOSSARY) template = re.sub( '<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', 
template) template = re.sub('<%BRAND%>', page_title, template) template = re.sub('<%BRANDLINK%>', brandLink, template) bodytext = '' for title, page_data, page_url, page_desc in chunk: bodytext += "<h2><a href=\"%s\">%s</a></h2>\n%s\n" % ( page_url, title, page_data) save_file = False new_data = re.sub('<%bodytext%>', bodytext, template) if os.path.exists(file_name): fh = open(file_name) file_data = fh.read()[:-1] fh.close() save_file = (file_data != new_data) else: save_file = True if save_file: fh = open(file_name, "w+") print >> fh, new_data.encode('utf-16') fh.close() fh.close() return base_folder
connection.test.feed.find_and_modify({'url': self.url}, {'$push': {'items': {'$each': self.items}}}) def process(self): """Common method for create od update feed in db""" if self.in_db: self.__update_feed_in_db() else: self.__add_feed_in_db() if __name__ == "__main__": monkey.patch_all() logging.basicConfig(format='%(asctime)s::%(levelname)s::%(message)s', filename="fetcher.log", level=logging.INFO) logging.info("\n" + "*" * 5 + " New run " + "*" * 5) URLS = set() def create_job(url): f = FeedHandler(url) f.process() opml_file = open("feedly.opml") outline = opml.from_string(opml_file.read()) for line in outline: for item in line: URLS.add(item.xmlUrl) gevent.joinall([gevent.spawn(create_job, url) for url in URLS])
def parse(outline_url, my_folder, my_home_index_page):
    """Fetch an OPML outline (from a Dropbox share URL) and render a static site.

    Python 2 code. The outline drives everything: special top-level nodes
    (#glossary, #templates, #<option> ...) configure the build; nodes with a
    'calendar' icon become dated blog posts laid out in year/month/day
    folders; all other nodes become standalone pages. Output HTML is written
    under <DATA_FOLDER>/<my_folder> and only rewritten when content changed.

    Args:
        outline_url: URL of the OPML document ('www'/'dropbox' are rewritten
            to the dl.dropboxusercontent direct-download host).
        my_folder: path whose last two components give the data folder and
            the site folder; the process chdir()s into the grandparent.
        my_home_index_page: basename of the page that should also be copied
            to index.html.

    Returns:
        base_folder, the relative path the site was written into.

    Side effects: os.chdir(), directory creation, file writes, network I/O.
    NOTE(review): reconstructed from a whitespace-collapsed source; the
    indentation and the newlines inside the triple-quoted HTML templates are
    inferred — confirm against the original file.
    """
    global DEBUG  # NOTE(review): declared global but never used in this function.
    OPTIONS = {}
    TEMPLATES = {}
    PAGES = {}
    FILENAMES = []  # every emitted file name; getFileName() de-dupes against it
    GLOSSARY_COMPLETE = False
    CALENDARS = {}
    GLOSSARY = DEFAULT_GLOSSARY.copy()
    # Split my_folder into <working_dir>/<DATA_FOLDER>/<my_folder> and make
    # the working dir the process cwd; all paths below are relative to it.
    data_folder, my_folder = os.path.split(my_folder)
    working_dir, DATA_FOLDER = os.path.split(data_folder)
    os.chdir(working_dir)
    base_folder = "%s/%s" % (DATA_FOLDER, my_folder)
    mkdir_p(base_folder)
    # Rewrite the share link into a direct-download dropboxusercontent URL.
    outline_url = re.sub('www','dl',outline_url)
    if 'usercontent' not in outline_url:
        outline_url = re.sub('dropbox','dropboxusercontent', outline_url)
    outline = list(opml.from_string(requests.get(outline_url).content))
    # First pass: pull the #glossary and #templates nodes out of the outline
    # (popping while enumerating — relies on each marker appearing once).
    for i, node in enumerate(outline):
        if node.text == '#glossary':
            GLOSSARY.update(grabChildren(outline.pop(i)))
        elif node.text == '#templates':
            TEMPLATES.update(grabChildren(outline.pop(i)))
        else:
            first_word = node.text.split(' ')[0]
            if first_word in GLOSSARY_OPTIONS:
                # Prefer an already-parsed option value; fall back to the raw text.
                try:
                    value = OPTIONS[first_word[1:]]
                except:
                    value = node.text
                GLOSSARY.update(GLOSSARY_FUNCTIONS[first_word](value))
    # Flatten multi-line glossary/template bodies into single strings.
    for k,v in GLOSSARY.items():
        if type(v[0]) == type([]):
            GLOSSARY[k] = ''.join(v[1])
    for k,v in TEMPLATES.items():
        if type(v[0]) == type([]):
            TEMPLATES[k] = (v[0],''.join(v[1]))
    GLOSSARY_COMPLETE = True  # NOTE(review): set but never read here.
    # Second pass: consume the outline (LIFO — pop() takes the *last* node first).
    while outline:
        next_node = outline.pop()
        # 'include' nodes splice in another remote OPML document.
        try:
            if next_node.type == 'include':
                real_url = re.sub('dropbox','dropboxusercontent',next_node.url)
                real_url = re.sub('https','http',real_url)
                include = list(opml.from_string(requests.get(real_url).content))
                nodes = include
            else:
                nodes = [next_node]
        except:
            nodes = [next_node]
        for node in nodes:
            # Calendar node at the top level -> the 'Home' blog stream.
            try:
                if node.icon == 'calendar':
                    try:
                        i_title = node.name
                    except:
                        i_title = node.text
                    CALENDARS = addCalendar('Home',node,i_title, CALENDARS)
                    continue
            except:
                pass
            # Calendar node one level down -> a named sub-blog stream.
            try:
                if node[0].icon == 'calendar':
                    try:
                        i_title = node[0].name
                    except:
                        i_title = node.text
                    CALENDARS = addCalendar(node.text,node[0],i_title, CALENDARS)
                    continue
            except:
                pass
            if node.text[0] == '#':
                # '#key value' option node. Bare '#key' means boolean True.
                option = node.text[1:].rstrip().lstrip()
                parts = option.split(' ')
                if len(parts) < 2:
                    OPTIONS[option] = True
                else:
                    key, value = parts[0], ' '.join(parts[1:])
                    if value[0] == '[':
                        # '[a, b, c]' -> pick one at random on each build.
                        value = random.choice([v.strip() for v in value[1:-1].split(',')])
                    elif value[0] == '"':
                        value = value[1:-1]
                    if value.lower() in ['true','false']:
                        # NOTE(review): bool() of a non-empty string is always
                        # True, so '#key false' also yields True — looks like a
                        # latent bug; preserved as-is.
                        OPTIONS[key] = bool(value)
                    else:
                        try:
                            OPTIONS[key] = int(value)
                        except:
                            OPTIONS[key] = value
            else:
                # Ordinary node -> a standalone page rendered through its template.
                brandLink = '/'
                blogHomeTitle = OPTIONS.get('blogHomeTitle','Home')
                page = {}
                try:
                    this_type = node.type
                except:
                    this_type = 'outline'
                try:
                    rules, template = TEMPLATES[this_type]
                except Exception as e:
                    raise Usage("#templates node required until I pull default templates from Trex. \n\n%s" % e.message)
                template = ''.join(template)
                # Substitute every raw OPML attribute into <%attr%> placeholders.
                for k,v in node._root.items():
                    template = re.sub('<%%%s%%>' % k,v,template)
                    page[k] = v
                page_desc = page.get('pageDescription', ' ')
                template = re.sub('<%blogHomeTitle%>', blogHomeTitle, template)
                template = re.sub('<%pageTitle%>', page['text'], template)
                template = re.sub('<%pageDescription%>', page_desc, template)
                if 'name' not in page:
                    page['name'] = makeName(page['text'])
                if 'url' not in page:
                    page['url'] = "/%s" % page['name']
                waste, bodytext = grabData(node, rules, this_type)
                bodytext.append('</div>') # not sure why we need this - something's not right
                bodytext = ''.join(bodytext)
                data = re.sub('<%bodytext%>',bodytext,template)
                page['bodytext'] = bodytext
                template = subData(data, GLOSSARY)
                template = re.sub('<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', template)
                template = re.sub('<%BRAND%>', blogHomeTitle, template)
                template = re.sub('<%BRANDLINK%>', brandLink, template)
                page['data'] = template
                PAGES[page['name']] = page
    # Write the standalone pages, skipping files whose content is unchanged.
    for k, v in PAGES.items():
        file_name = getFileName("%s/%s" % (base_folder,v['name']),FILENAMES)
        FILENAMES.append(file_name)
        save_file = False
        new_data = v['data']
        if os.path.exists(file_name):
            fh = open(file_name)
            # [:-1] drops the trailing newline that the py2 print below appends.
            file_data = fh.read()[:-1]
            fh.close()
            save_file = (file_data != new_data)
        else:
            save_file = True
        if save_file:
            fh = open(file_name, "w+")
            print >>fh, new_data
            fh.close()
        # The designated home page is also duplicated as index.html.
        if os.path.basename(file_name) == my_home_index_page:
            print file_name
            fh = open(os.path.join(os.path.split(file_name)[0], "index.html"), "w+")
            print >>fh, new_data
            fh.close()
    blogHomeTitle = OPTIONS.get('blogHomeTitle','Home')
    # posts maps "Home" or (sub_folder, index_title)/(path, title) tuples to
    # listings of (title, body, url, description) for the index pages.
    posts = {"Home": []}
    feedcount = OPTIONS.get('feedCount',20)  # max items in the RSS feed
    domain = "http://%s" % OPTIONS.get('domainName','')
    feed_posts = []
    # Render every calendar stream: year -> month -> day -> post nodes.
    # this_post_data/this_path_name lag one post behind so each file can get
    # prev/next navigation links once its neighbours are known.
    for base, calendar_stuff in CALENDARS.items():
        ycals, index_title = calendar_stuff
        page_data, path_name = None, None
        this_post_data, this_path_name = None, None
        while ycals:
            # Flush the post carried over from the previous year, now that
            # its prev/next neighbours are known.
            if this_post_data:
                file_name = getFileName("%s/%s" % (base_folder, this_path_name), FILENAMES)
                FILENAMES.append(file_name)
                save_file = False
                new_data = re.sub('<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data)
                if os.path.exists(file_name):
                    fh = open(file_name)
                    file_data = fh.read()[:-1]
                    fh.close()
                    save_file = (file_data != new_data)
                else:
                    save_file = True
                if save_file:
                    fh = open(file_name, "w+")
                    print >>fh, new_data
                    fh.close()
            ycal = ycals.pop()
            try:
                index_desc = ycal.description
            except:
                index_desc = ''
            year_title = ycal.text
            year_name = ycal.text
            year_num = ycal.text
            if base == 'Home':
                brandLink = '/'
                sub_folder = ''
                root_folder = base_folder
                year_path = year_num
            else:
                sub_folder = makeName(base)
                brandLink = "/%s" % sub_folder
                root_folder = "%s/%s" % (base_folder,sub_folder)
                if not os.path.exists(root_folder):
                    os.mkdir(root_folder)
                year_path = "%s/%s" % (sub_folder, year_num)
                if (sub_folder,index_title) not in posts:
                    posts[(sub_folder,index_title)] = []
            year_folder = "%s/%s" % (base_folder, year_path)
            try:
                if not os.path.exists(year_folder):
                    os.mkdir(year_folder)
            except:
                mkdir_p(year_folder)
            for mcal in ycal:
                month_title = mcal.text
                month_name = month_title.split(' ')[0]
                month_num = MONTHS[month_name]
                month_path = "%s/%s" % (year_path, month_num)
                month_folder = "%s/%s" % (year_folder, month_num)
                if not os.path.exists(month_folder):
                    os.mkdir(month_folder)
                for dcal in mcal:
                    day_title = dcal.text
                    day_name = dcal.text
                    day_num = "%02d" % float(day_title.split(' ')[1])
                    day_path = "%s/%s" % (month_path, day_num)
                    day_folder = "%s/%s" % (month_folder, day_num)
                    if not os.path.exists(day_folder):
                        os.mkdir(day_folder)
                    # Breadcrumb trail: year / month / day, plus the
                    # <nextprev> placeholder filled in at flush time.
                    # NOTE(review): internal newlines of this template were
                    # lost in the collapsed source and are reconstructed.
                    trail = [(year_path,year_title),(month_path,month_title),(day_path,day_title)]
                    trail_links = """
<nextprev>
<div class="breadcrumbs"><a href="/%s">%s</a> / %s</div>
""" % (sub_folder, base, " / ".join(['<a href="/%s/">%s</a>' % (l,n) for l,n in trail]))
                    # using page below because it matches above
                    for node in dcal:
                        # Shift the one-post pipeline: current becomes previous.
                        prev_post_data = this_post_data
                        prev_path_name = this_path_name
                        this_post_data = page_data
                        this_path_name = path_name
                        page = {}
                        try:
                            this_type = node.type
                        except:
                            this_type = 'outline'
                        rules, template = TEMPLATES[this_type]
                        template = ''.join(template)
                        for k,v in node._root.items():
                            template = re.sub('<%%%s%%>' % k,v,template)
                            page[k] = v
                        page_desc = page.get('pageDescription', index_desc)
                        template = re.sub('<%blogHomeTitle%>', blogHomeTitle, template)
                        template = re.sub('<%pageTitle%>', page['text'], template)
                        template = re.sub('<%pageDescription%>', page_desc, template)
                        if 'name' not in page:
                            page['name'] = makeName(page['text'])
                        if 'url' not in page:
                            page['url'] = "/%s" % page['name']
                        waste, bodytext = grabData(node, rules, this_type)
                        bodytext.append('</div><!--FIX-->') # not sure why we need this - something's not right
                        bodytext = '\n'.join(bodytext)
                        data = re.sub('<%bodytext%>',bodytext,template)
                        page['bodytext'] = bodytext
                        # Breadcrumbs are injected right after the page's <h1>.
                        template = re.sub('</h1>', '</h1>%s' % trail_links, subData(data, GLOSSARY))
                        template = re.sub('<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', template)
                        template = re.sub('<%BRAND%>', index_title, template)
                        template = re.sub('<%BRANDLINK%>', brandLink, template)
                        path_name = "%s/%s" % (day_path,page['name'])
                        page['url'] = "/%s" % path_name
                        listing = page['text'], page['bodytext'], page['url'], page_desc
                        # Add to the RSS feed until feedcount is exhausted;
                        # nodes without isFeedItem/created are skipped silently.
                        if feedcount:
                            try:
                                if node.isFeedItem == 'true':
                                    feed_posts.append(
                                        PyRSS2Gen.RSSItem(
                                            title = page['text'],
                                            link = domain + page['url'],
                                            description = page['bodytext'],
                                            guid = domain + page['url'],
                                            pubDate = page['created']
                                        )
                                    )
                                    feedcount -= 1
                            except:
                                pass
                        # Do this after listing so comments don't show on index pages
                        disqusGroupName = OPTIONS.get('disqusGroupName', False)
                        commentsString = ''
                        if disqusGroupName:
                            uniq_id = outline_url + node.created
                            # NOTE(review): newlines inside this template were
                            # lost in the collapsed source and are reconstructed.
                            commentsString = """
<script>var disqus_identifier = '%s';</script><a onclick="showHideComments ()"><span id="idShowHideComments" style="cursor: pointer;"></span></a><div class="divDisqusComments" id="idDisqusComments" style="visibility: visible;" ><div id="disqus_thread"></div></div><script type="text/javascript" src="http://disqus.com/forums/%s/embed.js"></script></div>
""" % (uniq_id, disqusGroupName)
                        page_data = re.sub('<!-- COMMENTS -->', commentsString, template)
                        page['data'] = page_data
                        # Register the listing on the home (or sub-blog) index
                        # plus the year, month and day archive indexes.
                        if sub_folder:
                            posts[(sub_folder,index_title)].append(listing)
                        else:
                            posts["Home"].append(listing)
                        for path_info in [(year_path, year_title), (month_path, month_title), (day_path, day_title)]:
                            if not path_info[0]:
                                continue
                            if path_info not in posts:
                                if sub_folder:
                                    path_info = path_info[0], path_info[1]
                                posts[path_info] = []
                            posts[path_info].append(listing)
                        # Flush the previous post now that this one gives it a
                        # 'next' neighbour. Note the utf-16 encode here, unlike
                        # the year-boundary flush above.
                        if this_post_data:
                            file_name = getFileName("%s/%s" % (base_folder, this_path_name), FILENAMES)
                            FILENAMES.append(file_name)
                            save_file = False
                            new_data = re.sub('<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data)
                            if os.path.exists(file_name):
                                fh = open(file_name)
                                file_data = fh.read()[:-1]
                                fh.close()
                                save_file = (file_data != new_data)
                            else:
                                save_file = True
                            if save_file:
                                fh = open(file_name, "w+")
                                print >>fh, new_data.encode('utf-16')
                                fh.close()
            # Last year consumed: flush the final post of this stream (it has
            # no 'next' neighbour, hence path_name = None).
            if not ycals:
                prev_post_data = this_post_data
                prev_path_name = this_path_name
                this_post_data = page_data
                this_path_name = path_name
                path_name = None
                if this_post_data:
                    file_name = getFileName("%s/%s" % (base_folder, this_path_name), FILENAMES)
                    FILENAMES.append(file_name)
                    save_file = False
                    new_data = re.sub('<nextprev>', getPrevNextLinks(prev_path_name, path_name), this_post_data)
                    if os.path.exists(file_name):
                        fh = open(file_name)
                        file_data = fh.read()[:-1]
                        fh.close()
                        save_file = (file_data != new_data)
                    else:
                        save_file = True
                    if save_file:
                        fh = open(file_name, "w+")
                        print >>fh, new_data
                        fh.close()
    # Generate Feed
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
    feed_posts.sort(key=lambda x: datetime.datetime.strptime(x.pubDate, date_format), reverse=True)
    buildFeed(OPTIONS['rssTitle'], domain, page_desc, feed_posts, base_folder)
    # iterate over posts
    # Render the paginated index pages: chunk each listing stream into
    # bloghomeItemCount-sized pages ('index', '2', '3', ...).
    for path_info, these_posts in posts.items():
        count = OPTIONS.get('bloghomeItemCount',20)
        chunks=[these_posts[x:x+count] for x in xrange(0, len(these_posts), count)]
        for i,chunk in enumerate(chunks):
            try:
                # Description comes from the newest post's page_desc field.
                pageDescription = chunks[0][0][-1]
            except:
                pageDescription = OPTIONS.get('pageDescription',' ')
            blogHomeDescription = OPTIONS.get('blogHomeDescription', pageDescription)
            if not i:
                page_name = "index"
            else:
                page_name = str(i+1)
            if path_info == "Home":
                brandLink = '/'
                page_title = blogHomeTitle
                page_desc = blogHomeDescription
                file_name = getFileName("%s/%s.html" % (base_folder, page_name), FILENAMES)
                FILENAMES.append(file_name)
            else:
                path, page_title = path_info
                brandLink = "/%s" % path.split('/')[0]
                page_desc = pageDescription
                file_name = getFileName("%s/%s/%s.html" % (base_folder, path, page_name), FILENAMES)
                FILENAMES.append(file_name)
            rules, template = TEMPLATES['bloghome']
            template = ''.join(template)
            # do the title and desc first to avoid overwriting with home title
            template = re.sub("<\%blogHomeTitle\%>", page_title, template)
            template = re.sub("<\%blogHomeDescription\%>", page_desc, template)
            template = re.sub("<\%pageTitle\%>", page_title, template)
            template = re.sub("<\%pageDescription\%>", page_desc, template)
            template = subData(template, GLOSSARY)
            template = re.sub('<%BRANDMENU%>', '<a class="brand" href="<%BRANDLINK%>"><%BRAND%></a>', template)
            template = re.sub('<%BRAND%>', page_title, template)
            template = re.sub('<%BRANDLINK%>', brandLink, template)
            # Concatenate the chunk's posts into the index body.
            bodytext = ''
            for title, page_data, page_url, page_desc in chunk:
                bodytext += "<h2><a href=\"%s\">%s</a></h2>\n%s\n" % (page_url, title, page_data)
            save_file = False
            new_data = re.sub('<%bodytext%>', bodytext, template)
            if os.path.exists(file_name):
                fh = open(file_name)
                file_data = fh.read()[:-1]
                fh.close()
                save_file = (file_data != new_data)
            else:
                save_file = True
            if save_file:
                fh = open(file_name, "w+")
                print >>fh, new_data.encode('utf-16')
                fh.close()
    fh.close()  # NOTE(review): redundant re-close of the last handle; raises NameError if nothing was ever written.
    return base_folder
""" $ pip install opml $ python opml_to_markdown.py some_outline.opml -> some_outline.md """ import codecs import opml import sys INPUT = sys.argv[1] OUTPUT = '.'.join(INPUT.split('.')[:-1] + ['md']) with codecs.open(INPUT, 'r', 'utf-8') as f: outline = opml.from_string(f.read()) blocks = [] def _extractBlocks(node): for child in node: blocks.append(child.text) if len(child) > 0: _extractBlocks(child) _extractBlocks(outline) output_content = '\n\n'.join(blocks) with codecs.open(OUTPUT, 'w', 'utf-8') as f: f.write(output_content)