def handle(self, args):
    """
    Export the corpus from Mongo to disk, or list the available
    categories when --list-categories is given.
    """
    # Connect to database
    db.connect()

    # Expand vars and user on the location passed
    target = os.path.expandvars(os.path.expanduser(args.location[0]))

    # Create the exporter object
    exporter = MongoExporter(
        target, categories=args.categories, scheme=args.scheme
    )

    # If list categories is true, list them and exit.
    if args.list_categories:
        return "\n".join(sorted(exporter.categories))

    # Time the export run for the report below.
    with Timer() as t:
        exporter.export()

    message = (
        "Baleen corpus export complete in {}\n"
        "Exported {} posts in {} categories\n"
        "More information is in README in {}"
    )
    return message.format(
        t, sum(exporter.counts.values()), len(exporter.categories), target
    )
def handle(self, args):
    """Load one or more OPML files of feeds into the database."""
    # Connect to the database
    db.connect()

    # Load the OPML files into the database, accumulating the feed count.
    total = 0
    for path in args.opml:
        total += load_opml(path)

    return "Ingested {} feeds from {} OPML files".format(total, len(args.opml))
def handle(self, args):
    """
    Export the Mongo corpus to the location given on the command line,
    or print the available categories when requested.
    """
    # Connect to database
    db.connect()

    # Expand vars and user on the location passed
    location = args.location[0]
    location = os.path.expanduser(location)
    location = os.path.expandvars(location)

    # Create the exporter object
    exporter = MongoExporter(
        location,
        categories=args.categories,
        scheme=args.scheme,
    )

    # If list categories is true, list them and exit.
    if args.list_categories:
        return "\n".join(sorted(exporter.categories))

    # Run the export under a timer for the summary report.
    with Timer() as t:
        exporter.export()

    return (
        "Baleen corpus export complete in {}\n"
        "Exported {} posts in {} categories\n"
        "More information is in README in {}"
    ).format(t, sum(exporter.counts.values()), len(exporter.categories), location)
def latest_job():
    """Render the job status page for the most recent harvest job."""
    # get the last job executed
    db.connect()

    version = baleen.get_version()
    counts = [
        db.Feed.objects.count(),
        db.Post.objects.count(),
        db.Job.objects.count(),
    ]

    latest_job = db.Job.objects.order_by('-started').first()
    latest_feed = db.Feed.objects.order_by('-updated').first()
    latest_post = db.Post.objects.order_by('-id').first()

    # Elapsed wall-clock time since the latest job started.
    elapsed = datetime.datetime.now() - latest_job.started
    running_time = str(elapsed)
    logitems = get_logs()

    # load all data into job_status template
    return render_template(
        'job_status.html',
        latest_job=latest_job,
        latest_feed=latest_feed,
        latest_post=latest_post,
        version=version,
        counts=counts,
        running_time=running_time,
        logitems=logitems,
    )
def main(args):
    """
    Simple script that reads JSON data from mongo from a category, then
    dumps it out.
    """
    db.connect()

    # Write each post as one JSON document per line.
    write = args.output.write
    for post in posts(args.category):
        write(post.to_json())
        write("\n")
def index():
    """
    Render the feed listing page: every feed, plus feeds grouped by
    topic (category).
    """
    # connect to the database
    db.connect()

    # get all the stuff we want
    feeds = db.Feed.objects()
    feed_count = feeds.count()

    # Use a set comprehension directly; wrapping a list comprehension in
    # set([...]) allocates a throwaway intermediate list (ruff C403).
    topics = {feed.category for feed in db.Feed.objects.only('category')}
    feeds_topics_counts = len(topics)
    feeds_topics = {topic: db.Feed.objects(category=topic) for topic in topics}

    # load all the data into the templates/feed_list.html template
    return render_template('feed_list.html',
                           feeds=feeds,
                           feeds_topics=feeds_topics,
                           feed_count=feed_count,
                           topic_count=feeds_topics_counts)
def handle(self, args):
    """
    Export the corpus to disk and report counts of what was written.
    """
    # Connect to database
    db.connect()

    # Export from the database
    exporter = MongoExporter()
    exporter.export(args.location[0], categories=args.categories)

    # Perform counts of export.
    # BUG FIX: on Python 3 `filter()` returns a lazy iterator, so the
    # original code exhausted `cats` in the sum() below and then crashed
    # with TypeError on len(cats). Materialize the directories as a list.
    root = args.location[0]
    cats = [
        path
        for path in (os.path.join(root, cat) for cat in os.listdir(root))
        if os.path.isdir(path)
    ]
    docs = sum(len(os.listdir(d)) for d in cats)

    return ("Exported {} documents in {} categories "
            "as well as a readme to {}.".format(docs, len(cats), root))
def handle(self, args):
    """
    Ingest feeds into Mongo. The OPML and feed-list ingestion paths are
    guarded off as untested utilities and raise immediately.
    """
    ingestor = MongoIngestor()

    if args.opml:
        ingestor = OPMLIngestor(args.opml)
        raise ConsoleError("opml ingestion is an untested utility!")

    if args.feeds:
        ingestor = Ingestor(args.feeds)
        raise ConsoleError("feed ingestion is an untested utility!")

    db.connect()
    ingestor.ingest()

    template = ("Processed {feeds} feeds ({timer}): "
                "{posts} posts with {errors} errors")
    return template.format(timer=ingestor.timer, **ingestor.counts)
def index():
    """
    Render the feed listing page with all feeds and per-topic groupings.
    """
    # connect to the database
    db.connect()

    # get all the stuff we want
    feeds = db.Feed.objects()
    feed_count = feeds.count()

    # Set comprehension instead of set([...]): the original allocated an
    # intermediate list only to convert it (ruff C403).
    topics = {feed.category for feed in db.Feed.objects.only('category')}
    feeds_topics_counts = len(topics)
    feeds_topics = {
        topic: db.Feed.objects(category=topic) for topic in topics
    }

    # load all the data into the templates/feed_list.html template
    return render_template('feed_list.html',
                           feeds=feeds,
                           feeds_topics=feeds_topics,
                           feed_count=feed_count,
                           topic_count=feeds_topics_counts)
def handle(self, args):
    """
    Run ingestion against Mongo; OPML and feed-list ingestion are
    untested utilities and abort with a ConsoleError.
    """
    ingestor = MongoIngestor()

    # Untested alternative ingestors raise immediately after selection.
    if args.opml:
        ingestor = OPMLIngestor(args.opml)
        raise ConsoleError("opml ingestion is an untested utility!")
    if args.feeds:
        ingestor = Ingestor(args.feeds)
        raise ConsoleError("feed ingestion is an untested utility!")

    db.connect()
    ingestor.ingest()

    return (
        "Processed {feeds} feeds ({timer}): "
        "{posts} posts with {errors} errors"
    ).format(timer=ingestor.timer, **ingestor.counts)
def latest_job():
    """Show status details for the most recently executed job."""
    # get the last job executed
    db.connect()

    version = baleen.get_version()
    counts = [
        db.Feed.objects.count(),
        db.Post.objects.count(),
        db.Job.objects.count(),
    ]

    job = db.Job.objects.order_by('-started').first()
    feed = db.Feed.objects.order_by('-updated').first()
    post = db.Post.objects.order_by('-id').first()

    # How long the latest job has been running (time since it started).
    running_time = str(datetime.datetime.now() - job.started)
    logitems = get_logs()

    # load all data into job_status template
    return render_template(
        'job_status.html',
        latest_job=job,
        latest_feed=feed,
        latest_post=post,
        version=version,
        counts=counts,
        running_time=running_time,
        logitems=logitems,
    )
def handle(self, args):
    """
    Export the corpus to disk, then count the documents and categories
    actually written and report them.
    """
    # Connect to database
    db.connect()

    # Export from the database
    exporter = MongoExporter()
    exporter.export(args.location[0], categories=args.categories)

    # Perform counts of export
    root = args.location[0]
    # BUG FIX: materialize the filter. On Python 3, filter() is a lazy
    # iterator — the sum() below would exhaust it and len(cats) would
    # raise TypeError ('filter' object has no len()).
    cats = list(filter(
        os.path.isdir,
        [os.path.join(root, cat) for cat in os.listdir(root)]
    ))
    docs = sum(len(os.listdir(d)) for d in cats)

    return (
        "Exported {} documents in {} categories "
        "as well as a readme to {}.".format(
            docs, len(cats), root
        )
    )
def ingest(path, **kwargs):
    """
    Ingests an OPML file into the Mongo database; returns the count of
    the number of documents added to the database.
    """
    db.connect(**kwargs)

    added = 0
    for record in OPML(path):
        # Strip fields the database schema does not use.
        record.pop('type')   # Unneeded for database
        record.pop('text')   # Unneeded for database

        record['link'] = record.pop('xmlurl')  # Rename the XML URL
        record['urls'] = {
            'htmlurl': record.pop('htmlurl')   # Add htmlurl to urls
        }

        document = db.Feed(**record)  # Construct without an ObjectId
        try:
            document.save()
            added += 1
        except NotUniqueError:
            # Feed already exists; skip it.
            continue

    return added
def handle(self, args):
    """
    Print a Baleen status report: an optional configuration dump,
    aggregate feed/post/job counts, and details of the most recent job,
    feed, and post.

    Returns the report as UTF-8 encoded bytes (undecodable characters
    replaced).
    """
    # Setup output and connect to database.
    output = []
    db.connect()

    # Printout configuration details as necessary.
    if args.config:
        output.append("Configuration:")
        output.append(str(settings))
        output.append("")

    output.append("Baleen v{} Status:".format(baleen.get_version()))
    output.append(
        "{} Feeds and {} Posts after {} Jobs".format(
            db.Feed.objects.count(),
            db.Post.objects.count(),
            db.Job.objects.count(),
        )
    )

    # Most recently started job.
    latest = db.Job.objects.order_by('-started').first()
    output.extend([
        "",
        "Latest Job: ",
        " Type: {} v{}".format(latest.name, latest.version),
        " Job ID: {}".format(latest.jobid),
        " Started: {}".format(latest.started.strftime(HUMAN_DATETIME))
    ])

    if latest.finished:
        if latest.failed:
            output.append(" Failed: {}".format(latest.reason))
        else:
            output.append(" Finished: {}".format(latest.finished.strftime(HUMAN_DATETIME)))
        # NOTE(review): the counts/errors appends are reconstructed at the
        # `finished` level (printed for failed jobs too); the collapsed
        # source is ambiguous here — confirm against project history.
        output.append(" Counts:")
        output.append(" " + "\n ".join(["{}: {}".format(*item) for item in list(latest.counts.items())]))
        output.append(" Errors:")
        output.append(" " + "\n ".join(["{}: {}".format(*item) for item in list(latest.errors.items())]))
    else:
        output.append(" Currently Running")

    # Most recently updated feed.
    latest = db.Feed.objects.order_by('-updated').first()
    output.extend([
        "",
        "Latest Feed: ",
        " Title: \"{}\"".format(latest.title),
        " eTag: \"{}\"".format(latest.etag),
        " Modified: {}".format(latest.modified),
        " Updated: {}".format(latest.updated.strftime(HUMAN_DATETIME)),
        # u" Posts: {}".format(latest.count_posts()),  # This is very slow need to fix.
    ])

    # Most recently fetched post.
    latest = db.Post.objects.order_by('-id').first()
    output.extend([
        "",
        "Latest Post: ",
        " Title: \"{}\"".format(latest.title),
        " Feed: \"{}\"".format(latest.feed.title),
        " Fetched: {}".format(latest.created.strftime(HUMAN_DATETIME)),
    ])

    return "\n".join(output).encode('utf-8', errors='replace')
def ingest(self, args):
    """Connect to Mongo and run a full ingestion pass."""
    db.connect()
    MongoIngestor().ingest()
def handle(self, args):
    """
    Print a Baleen status report: an optional configuration dump,
    aggregate feed/post/job counts, and details of the most recent job,
    feed, and post. Returns the report as UTF-8 encoded bytes.
    """
    # NOTE(review): the u"" literals and the `unicode` builtin below are
    # Python 2 only — under Python 3 this raises NameError on `unicode`.
    # Setup output and connect to database.
    output = []
    db.connect()

    # Printout configuration details as necessary.
    if args.config:
        output.append(u"Configuration:")
        output.append(unicode(settings))
        output.append(u"")

    output.append(u"Baleen v{} Status:".format(baleen.get_version()))
    output.append(
        u"{} Feeds and {} Posts after {} Jobs".format(
            db.Feed.objects.count(),
            db.Post.objects.count(),
            db.Job.objects.count()
        )
    )

    # Most recently started job.
    latest = db.Job.objects.order_by("-started").first()
    output.extend(
        [
            u"",
            u"Latest Job: ",
            u" Type: {} v{}".format(latest.name, latest.version),
            u" Job ID: {}".format(latest.jobid),
            u" Started: {}".format(latest.started.strftime(HUMAN_DATETIME)),
        ]
    )

    if latest.finished:
        if latest.failed:
            output.append(u" Failed: {}".format(latest.reason))
        else:
            output.append(u" Finished: {}".format(latest.finished.strftime(HUMAN_DATETIME)))
        # NOTE(review): counts/errors reconstructed at the `finished`
        # level (printed for failed jobs too); the collapsed source is
        # ambiguous here — confirm against project history.
        output.append(u" Counts:")
        output.append(u" " + u"\n ".join([u"{}: {}".format(*item) for item in latest.counts.items()]))
        output.append(u" Errors:")
        output.append(u" " + u"\n ".join([u"{}: {}".format(*item) for item in latest.errors.items()]))
    else:
        output.append(u" Currently Running")

    # Most recently updated feed.
    latest = db.Feed.objects.order_by("-updated").first()
    output.extend(
        [
            u"",
            u"Latest Feed: ",
            u' Title: "{}"'.format(latest.title),
            u' eTag: "{}"'.format(latest.etag),
            u" Modified: {}".format(latest.modified),
            u" Updated: {}".format(latest.updated.strftime(HUMAN_DATETIME)),
            # u" Posts: {}".format(latest.count_posts()),  # This is very slow need to fix.
        ]
    )

    # Most recently fetched post.
    latest = db.Post.objects.order_by("-id").first()
    output.extend(
        [
            u"",
            u"Latest Post: ",
            u' Title: "{}"'.format(latest.title),
            u' Feed: "{}"'.format(latest.feed.title),
            u" Fetched: {}".format(latest.created.strftime(HUMAN_DATETIME)),
        ]
    )

    return u"\n".join(output).encode("utf-8", errors="replace")
catdir[category] = path # Iterate through all posts, writing them to disk correctly. # Right now we will simply write them based on their object id. for post, category in self.posts(): path = os.path.join( self.root, catdir[category], "{}.{}".format(post.id, self.scheme) ) with codecs.open(path, 'w', encoding='utf-8') as f: action = { 'json': lambda: post.to_json(indent=2), 'html': post.htmlize, }[self.scheme] f.write(action()) # Mark the export as finished and write the README to the corpus. self.state = State.Finished self.readme(os.path.join(self.root, "README")) self.feedinfo(os.path.join(self.root, "feeds.json")) if __name__ == '__main__': import baleen.models as db db.connect() exporter = MongoExporter('fixtures/corpus') exporter.export()
""" In the root directory writes each file and a README """ if not os.path.exists(root): os.mkdir(root) if not os.path.isdir(root): raise Exception("%s is not a directory!" % root) for category in self.categories: dirname = os.path.join(root, category.replace(" ", "_")) if not os.path.exists(dirname): os.mkdir(dirname) for idx, post in enumerate(self.posts(category)): name = os.path.join(dirname, "%03i.html" % idx) with codecs.open(name, 'wb', encoding='utf8') as f: f.write(post.htmlize()) readme = os.path.join(root, "README") with codecs.open(readme, 'wb', encoding='utf8') as f: f.write(self.readme()) if __name__ == '__main__': from baleen.models import connect connect() exporter = MongoExporter() exporter.export('fixtures/corpus')
# NOTE(review): this chunk begins part-way through MongoExporter.export();
# the loop over the categories that builds `path` (and the directory check
# guarding this raise) starts above this view — indentation below is
# reconstructed and should be confirmed against the full file.
                raise ExportError("'{}' is not a directory!".format(path))

            # Remember the directory for this category for routing below.
            catdir[category] = path

        # Iterate through all posts, writing them to disk correctly.
        # Right now we will simply write them based on their object id.
        for post, category in self.posts():
            path = os.path.join(self.root, catdir[category], "{}.{}".format(post.id, self.scheme))

            # Serialize per the export scheme: indented JSON or HTML.
            with codecs.open(path, 'w', encoding='utf-8') as f:
                action = {
                    'json': lambda: post.to_json(indent=2),
                    'html': post.htmlize,
                }[self.scheme]
                f.write(action())

        # Mark the export as finished and write the README to the corpus.
        self.state = State.Finished
        self.readme(os.path.join(self.root, "README"))
        self.feedinfo(os.path.join(self.root, "feeds.json"))


if __name__ == '__main__':
    # Manual smoke test: export the corpus into the fixtures directory.
    import baleen.models as db
    db.connect()

    exporter = MongoExporter('fixtures/corpus')
    exporter.export()
In the root directory writes each file and a README
        """
        # NOTE(review): this chunk begins inside MongoExporter.export();
        # the def line and the docstring opener are above this view —
        # indentation is reconstructed; confirm against the full file.
        # Default to exporting every known category when none are given.
        categories = categories or self.categories

        # Create the corpus root if needed; it must be a directory.
        if not os.path.exists(root):
            os.mkdir(root)

        if not os.path.isdir(root):
            raise Exception("%s is not a directory!" % root)

        # One subdirectory per category; spaces become underscores.
        for category in categories:
            dirname = os.path.join(root, category.replace(" ", "_"))
            if not os.path.exists(dirname):
                os.mkdir(dirname)

            # Write each post as an HTML file named by its index.
            for idx, post in enumerate(self.posts(category)):
                name = os.path.join(dirname, "%03i.html" % idx)
                with codecs.open(name, 'wb', encoding='utf8') as f:
                    f.write(post.htmlize())

        # Finally, write the README describing the export.
        readme = os.path.join(root, "README")
        with codecs.open(readme, 'wb', encoding='utf8') as f:
            f.write(self.readme())


if __name__ == '__main__':
    # Manual smoke test: export the corpus into the fixtures directory.
    from baleen.models import connect
    connect()

    exporter = MongoExporter()
    exporter.export('fixtures/corpus')