Example #1
0
    def handle(self, args):
        """
        Export the corpus to the location given on the command line.

        When ``args.list_categories`` is set, no export is run; the sorted
        category names of the exporter are returned, one per line.
        Otherwise the export is executed and timed, and a summary with the
        elapsed time, the total of the exporter's counts, the number of
        categories and the export root is returned.
        """
        db.connect()

        # Resolve "~" first, then environment variables, in the target path.
        location = os.path.expandvars(os.path.expanduser(args.location[0]))

        corpus = MongoExporter(
            location, categories=args.categories, scheme=args.scheme
        )

        # Listing the categories short-circuits the export entirely.
        if args.list_categories:
            return "\n".join(sorted(corpus.categories))

        with Timer() as timer:
            corpus.export()

        summary = (
            "Baleen corpus export complete in {}\n"
            "Exported {} posts in {} categories\n"
            "More information is in README in {}"
        )
        n_posts = sum(corpus.counts.values())
        n_categories = len(corpus.categories)
        return summary.format(timer, n_posts, n_categories, location)
Example #2
0
    def handle(self, args):
        """
        Load every OPML file listed in ``args.opml`` into the database.

        Returns a one-line summary containing the sum of the values returned
        by ``load_opml`` and the number of OPML paths that were given.
        """
        db.connect()

        # Accumulate the per-file results reported by load_opml.
        total = 0
        for opml_path in args.opml:
            total += load_opml(opml_path)

        message = "Ingested {} feeds from {} OPML files"
        return message.format(total, len(args.opml))
Example #3
0
    def handle(self, args):
        """
        Ingest the OPML files named on the command line.

        Each path in ``args.opml`` is passed to ``load_opml``; the returned
        values are added up and reported together with the number of files.
        """
        db.connect()

        # One load_opml call per path; the results are summed.
        ingested = sum(map(load_opml, args.opml))
        n_files = len(args.opml)
        return "Ingested {} feeds from {} OPML files".format(ingested, n_files)
Example #4
0
    def handle(self, args):
        """
        Run a corpus export, or only list the exportable categories.

        The export location is ``args.location[0]`` with the user home and
        environment variables expanded. If ``args.list_categories`` is set,
        the sorted category names are returned (newline separated) and
        nothing is exported. Otherwise the export is run inside a ``Timer``
        and a three-line summary string is returned.
        """
        db.connect()

        # Expand "~" and then any environment variables in the target path.
        target = os.path.expanduser(args.location[0])
        target = os.path.expandvars(target)

        exporter = MongoExporter(
            target,
            categories=args.categories,
            scheme=args.scheme,
        )

        if not args.list_categories:
            with Timer() as elapsed:
                exporter.export()

            report = (
                "Baleen corpus export complete in {}\n"
                "Exported {} posts in {} categories\n"
                "More information is in README in {}"
            )
            return report.format(
                elapsed,
                sum(exporter.counts.values()),
                len(exporter.categories),
                target,
            )

        # Listing mode: report the categories and skip the export.
        return "\n".join(sorted(exporter.categories))
Example #5
0
def latest_job():
    """
    Render the job status page.

    Collects the Baleen version, the Feed/Post/Job document counts, the most
    recently started job, the most recently updated feed, the newest post
    (ordered by id) and the log items, and hands them to the
    ``job_status.html`` template.

    ``running_time`` is the time elapsed since the latest job started,
    rendered as a string. It is ``None`` when no job exists yet, so an empty
    database no longer raises ``AttributeError`` in this view.
    """
    db.connect()
    version = baleen.get_version()
    counts = [
        db.Feed.objects.count(),
        db.Post.objects.count(),
        db.Job.objects.count(),
    ]

    # Local names deliberately differ from the function name to avoid
    # shadowing ``latest_job`` inside its own body.
    job = db.Job.objects.order_by('-started').first()
    feed = db.Feed.objects.order_by('-updated').first()
    post = db.Post.objects.order_by('-id').first()

    # first() yields None when the collection is empty; only compute the
    # elapsed time when there is a job to measure against.
    running_time = None
    if job is not None:
        running_time = str(datetime.datetime.now() - job.started)

    logitems = get_logs()

    # load all data into job_status template
    return render_template('job_status.html',
                           latest_job=job,
                           latest_feed=feed,
                           latest_post=post,
                           version=version,
                           counts=counts,
                           running_time=running_time,
                           logitems=logitems)
Example #6
0
def main(args):
    """
    Dump the posts of one category as JSON.

    Connects to the database, then writes the JSON form of every post
    yielded by ``posts(args.category)`` to ``args.output``, each followed by
    a newline.
    """
    db.connect()
    for document in posts(args.category):
        sink = args.output
        sink.write(document.to_json())
        sink.write("\n")
Example #7
0
def main(args):
    """
    Read the posts of ``args.category`` from Mongo and write them out as
    JSON to ``args.output``, terminating each post with a newline.
    """
    db.connect()
    for entry in posts(args.category):
        serialized = entry.to_json()
        args.output.write(serialized)
        args.output.write("\n")
Example #8
0
def index():
    """
    Render the feed list page.

    Passes to ``feed_list.html``: the queryset of all feeds, the feed count,
    a mapping of each distinct category to the queryset of feeds in that
    category, and the number of distinct categories.
    """
    db.connect()

    feeds = db.Feed.objects()
    feed_count = feeds.count()

    # Distinct categories, read from a query restricted to that one field.
    topics = {feed.category for feed in db.Feed.objects.only('category')}

    # One queryset per category.
    by_topic = {}
    for topic in topics:
        by_topic[topic] = db.Feed.objects(category=topic)

    # load all the data into the templates/feed_list.html template
    return render_template(
        'feed_list.html',
        feeds=feeds,
        feeds_topics=by_topic,
        feed_count=feed_count,
        topic_count=len(topics)
    )
Example #9
0
    def handle(self, args):
        """
        Export the database to ``args.location[0]`` and summarise the result.

        After the export, every sub-directory of the export root is treated
        as a category and the entries inside each one are counted as
        documents.

        Returns a summary string with the document count, the category count
        and the export root.
        """
        # Connect to database
        db.connect()

        # Export from the database
        root = args.location[0]
        exporter = MongoExporter()
        exporter.export(root, categories=args.categories)

        # Perform counts of export. The categories must be a real list: they
        # are iterated once to count documents and then measured with len().
        # On Python 3, filter() returns a one-shot iterator that has no len()
        # and would already be exhausted by the sum below.
        cats = [
            path
            for path in (os.path.join(root, name) for name in os.listdir(root))
            if os.path.isdir(path)
        ]
        docs = sum(len(os.listdir(d)) for d in cats)

        return ("Exported {} documents in {} categories "
                "as well as a readme to {}.".format(docs, len(cats), root))
Example #10
0
    def handle(self, args):
        """
        Run an ingestion and return a summary of its counts.

        A ``MongoIngestor`` is used by default. When ``args.opml`` or
        ``args.feeds`` is given, the corresponding ingestor is constructed
        but a ``ConsoleError`` is raised immediately afterwards, so those
        two paths never reach the ingestion step.

        Raises:
            ConsoleError: if ``args.opml`` or ``args.feeds`` is truthy.
        """
        ingestor = MongoIngestor()

        # Both alternative sources are rejected right after construction.
        if args.opml:
            ingestor = OPMLIngestor(args.opml)
            raise ConsoleError("opml ingestion is an untested utility!")

        if args.feeds:
            ingestor = Ingestor(args.feeds)
            raise ConsoleError("feed ingestion is an untested utility!")

        db.connect()
        ingestor.ingest()

        # The {feeds}, {posts} and {errors} fields are filled from the
        # ingestor's counts mapping.
        summary = (
            "Processed {feeds} feeds ({timer}): "
            "{posts} posts with {errors} errors"
        )
        return summary.format(timer=ingestor.timer, **ingestor.counts)
Example #11
0
def index():
    """
    Build the context for ``feed_list.html`` and render it.

    The template receives all feeds, the number of feeds, the feeds grouped
    by category (one queryset per distinct category) and the number of
    distinct categories.
    """
    # connect to the database
    db.connect()

    all_feeds = db.Feed.objects()
    total = all_feeds.count()

    # Collect the distinct category values; only that field is requested.
    categories = set()
    for feed in db.Feed.objects.only('category'):
        categories.add(feed.category)

    grouped = dict(
        (category, db.Feed.objects(category=category))
        for category in categories
    )

    # load all the data into the templates/feed_list.html template
    return render_template(
        'feed_list.html',
        feeds=all_feeds,
        feeds_topics=grouped,
        feed_count=total,
        topic_count=len(categories)
    )
Example #12
0
    def handle(self, args):
        """
        Ingest feeds with the default ``MongoIngestor`` and report the result.

        The ``args.opml`` and ``args.feeds`` options each construct their own
        ingestor and then raise ``ConsoleError``; only the default path
        connects to the database and ingests.

        Raises:
            ConsoleError: if ``args.opml`` or ``args.feeds`` is truthy.
        """
        ingestor = MongoIngestor()

        if args.opml:
            ingestor = OPMLIngestor(args.opml)
            raise ConsoleError("opml ingestion is an untested utility!")

        if args.feeds:
            ingestor = Ingestor(args.feeds)
            raise ConsoleError("feed ingestion is an untested utility!")

        db.connect()
        ingestor.ingest()

        # Named fields other than {timer} come from ingestor.counts.
        template = (
            "Processed {feeds} feeds ({timer}): "
            "{posts} posts with {errors} errors"
        )
        fields = dict(timer=ingestor.timer, **ingestor.counts)
        return template.format(**fields)
Example #13
0
def latest_job():
    """
    Render ``job_status.html`` with the latest job, feed and post.

    The template context holds the Baleen version, a list with the Feed,
    Post and Job document counts, the most recently started job, the most
    recently updated feed, the newest post by id, the log items, and
    ``running_time``: the time since the latest job started as a string, or
    ``None`` if there is no job at all.
    """
    # get the last job executed
    db.connect()
    version = baleen.get_version()
    counts = [
        db.Feed.objects.count(),
        db.Post.objects.count(),
        db.Job.objects.count(),
    ]

    # Named ``newest_*`` so the local does not shadow this function's name.
    newest_job = db.Job.objects.order_by('-started').first()
    newest_feed = db.Feed.objects.order_by('-updated').first()
    newest_post = db.Post.objects.order_by('-id').first()

    # An empty Job collection makes first() return None; previously that
    # crashed here with AttributeError on ``.started``.
    if newest_job is None:
        running_time = None
    else:
        running_time = str(datetime.datetime.now() - newest_job.started)

    logitems = get_logs()

    # load all data into job_status template
    return render_template('job_status.html',
                           latest_job=newest_job,
                           latest_feed=newest_feed,
                           latest_post=newest_post,
                           version=version,
                           counts=counts,
                           running_time=running_time,
                           logitems=logitems)
Example #14
0
    def handle(self, args):
        """
        Export from the database into ``args.location[0]`` and count the
        exported files.

        Each directory directly under the export root counts as a category;
        the number of entries in those directories is the document count.

        Returns a summary string naming the document count, the category
        count and the export root.
        """
        # Connect to database
        db.connect()

        # Export from the database
        root = args.location[0]
        exporter = MongoExporter()
        exporter.export(root, categories=args.categories)

        # Perform counts of export. A list is required here: on Python 3
        # filter() is a lazy, single-use iterator, so len() on it raises
        # TypeError and it would be empty after the sum() below consumed it.
        entries = [os.path.join(root, cat) for cat in os.listdir(root)]
        cats = [entry for entry in entries if os.path.isdir(entry)]
        docs = sum(len(os.listdir(d)) for d in cats)

        return (
            "Exported {} documents in {} categories "
            "as well as a readme to {}.".format(
                docs, len(cats), root
            )
        )
Example #15
0
def ingest(path, **kwargs):
    """
    Load the feeds of an OPML file into the Mongo database.

    Keyword arguments are forwarded to ``db.connect``. Returns the number of
    feeds that were saved; feeds whose save raises ``NotUniqueError`` are
    skipped and not counted.
    """
    db.connect(**kwargs)

    saved = 0
    for entry in OPML(path):
        # Drop the fields the database does not need.
        entry.pop('type')
        entry.pop('text')

        # The XML URL becomes the link; the HTML URL moves under "urls".
        entry['link'] = entry.pop('xmlurl')
        entry['urls'] = {'htmlurl': entry.pop('htmlurl')}

        # Construct without an ObjectId
        document = db.Feed(**entry)

        try:
            document.save()
        except NotUniqueError:
            continue
        else:
            saved += 1

    return saved
Example #16
0
def ingest(path, **kwargs):
    """
    Ingest an OPML file into the Mongo database.

    Connects with ``db.connect(**kwargs)``, reshapes every OPML entry into
    the keyword arguments of ``db.Feed`` and saves it. Entries rejected with
    ``NotUniqueError`` are ignored.

    Returns the count of feeds that were saved.
    """
    db.connect(**kwargs)

    added = 0
    for item in OPML(path):
        item.pop('type')  # Unneeded for database
        item.pop('text')  # Unneeded for database

        xml_url = item.pop('xmlurl')
        item['link'] = xml_url  # Rename the XML URL

        html_url = item.pop('htmlurl')
        item['urls'] = {'htmlurl': html_url}  # Add htmlurl to urls

        record = db.Feed(**item)  # Construct without an ObjectId

        try:
            record.save()
        except NotUniqueError:
            # Already present: skip it without counting.
            continue
        added += 1

    return added
Example #17
0
    def handle(self, args):
        """
        Build a plain-text status report for the Baleen database.

        The report optionally begins with the configuration (when
        ``args.config`` is set), followed by the version and the Feed, Post
        and Job counts, then details of the most recently started job, the
        most recently updated feed and the newest post by id.

        Returns the report lines joined with newlines and encoded as UTF-8
        with ``errors='replace'``.

        NOTE(review): the job, feed and post lookups use ``first()`` and the
        results are dereferenced without a check -- confirm whether an empty
        collection needs handling here.
        """
        def listing(mapping):
            # One "key: value" entry per line at the nested indent level.
            entries = ["{}: {}".format(*pair) for pair in mapping.items()]
            return "      " + "\n      ".join(entries)

        # Connect to the database and start with an empty report.
        db.connect()
        report = []

        # Printout configuration details as necessary.
        if args.config:
            report += ["Configuration:", str(settings), ""]

        report.append("Baleen v{} Status:".format(baleen.get_version()))
        totals = (
            db.Feed.objects.count(),
            db.Post.objects.count(),
            db.Job.objects.count(),
        )
        report.append("{} Feeds and {} Posts after {} Jobs".format(*totals))

        # -- Latest job ------------------------------------------------
        job = db.Job.objects.order_by('-started').first()
        report += [
            "",
            "Latest Job: ",
            "    Type: {} v{}".format(job.name, job.version),
            "    Job ID: {}".format(job.jobid),
            "    Started: {}".format(job.started.strftime(HUMAN_DATETIME)),
        ]

        if not job.finished:
            report.append("    Currently Running")
        elif job.failed:
            report.append("    Failed: {}".format(job.reason))
        else:
            report += [
                "    Finished: {}".format(job.finished.strftime(HUMAN_DATETIME)),
                "    Counts:",
                listing(job.counts),
                "    Errors:",
                listing(job.errors),
            ]

        # -- Latest feed -----------------------------------------------
        # TODO: the per-feed post count is left out; count_posts() was noted
        # as very slow.
        feed = db.Feed.objects.order_by('-updated').first()
        report += [
            "",
            "Latest Feed: ",
            '    Title: "{}"'.format(feed.title),
            '    eTag: "{}"'.format(feed.etag),
            "    Modified: {}".format(feed.modified),
            "    Updated: {}".format(feed.updated.strftime(HUMAN_DATETIME)),
        ]

        # -- Latest post -----------------------------------------------
        post = db.Post.objects.order_by('-id').first()
        report += [
            "",
            "Latest Post: ",
            '    Title: "{}"'.format(post.title),
            '    Feed: "{}"'.format(post.feed.title),
            "    Fetched: {}".format(post.created.strftime(HUMAN_DATETIME)),
        ]

        return "\n".join(report).encode('utf-8', errors='replace')
Example #18
0
File: run.py Project: youurt/baleen
 def ingest(self, args):
     """Connect to the database, then run a ``MongoIngestor`` ingestion."""
     db.connect()
     MongoIngestor().ingest()
Example #19
0
    def handle(self, args):
        """
        Assemble the Baleen status report as unicode text.

        Sections, in order: the configuration (only when ``args.config`` is
        set), the version with the Feed/Post/Job counts, the most recently
        started job, the most recently updated feed, and the newest post by
        id.

        Returns the sections joined with newlines and encoded as UTF-8 with
        ``errors="replace"``.

        NOTE(review): the results of the ``first()`` lookups are used without
        a check -- confirm whether empty collections can occur here.
        """
        def block(mapping):
            # Render a mapping as "key: value" lines at the nested indent.
            rows = [u"{}: {}".format(*pair) for pair in mapping.items()]
            return u"      " + u"\n      ".join(rows)

        # Connect to the database and start with an empty report.
        db.connect()
        lines = []

        # Printout configuration details as necessary.
        if args.config:
            lines += [u"Configuration:", unicode(settings), u""]

        lines.append(u"Baleen v{} Status:".format(baleen.get_version()))
        n_feeds = db.Feed.objects.count()
        n_posts = db.Post.objects.count()
        n_jobs = db.Job.objects.count()
        lines.append(
            u"{} Feeds and {} Posts after {} Jobs".format(n_feeds, n_posts, n_jobs)
        )

        # Most recently started job.
        job = db.Job.objects.order_by("-started").first()
        lines += [
            u"",
            u"Latest Job: ",
            u"    Type: {} v{}".format(job.name, job.version),
            u"    Job ID: {}".format(job.jobid),
            u"    Started: {}".format(job.started.strftime(HUMAN_DATETIME)),
        ]

        if not job.finished:
            lines.append(u"    Currently Running")
        elif job.failed:
            lines.append(u"    Failed: {}".format(job.reason))
        else:
            lines += [
                u"    Finished: {}".format(job.finished.strftime(HUMAN_DATETIME)),
                u"    Counts:",
                block(job.counts),
                u"    Errors:",
                block(job.errors),
            ]

        # Most recently updated feed.
        # TODO: the per-feed post count is left out; count_posts() was noted
        # as very slow.
        feed = db.Feed.objects.order_by("-updated").first()
        lines += [
            u"",
            u"Latest Feed: ",
            u'    Title: "{}"'.format(feed.title),
            u'    eTag: "{}"'.format(feed.etag),
            u"    Modified: {}".format(feed.modified),
            u"    Updated: {}".format(feed.updated.strftime(HUMAN_DATETIME)),
        ]

        # Newest post by id.
        post = db.Post.objects.order_by("-id").first()
        lines += [
            u"",
            u"Latest Post: ",
            u'    Title: "{}"'.format(post.title),
            u'    Feed: "{}"'.format(post.feed.title),
            u"    Fetched: {}".format(post.created.strftime(HUMAN_DATETIME)),
        ]

        return u"\n".join(lines).encode("utf-8", errors="replace")
Example #20
0
            catdir[category] = path

        # Iterate through all posts, writing them to disk correctly.
        # Right now we will simply write them based on their object id.
        for post, category in self.posts():
            path = os.path.join(
                self.root, catdir[category], "{}.{}".format(post.id, self.scheme)
            )

            with codecs.open(path, 'w', encoding='utf-8') as f:
                action = {
                    'json': lambda: post.to_json(indent=2),
                    'html': post.htmlize,
                }[self.scheme]

                f.write(action())

        # Mark the export as finished and write the README to the corpus.
        self.state = State.Finished
        self.readme(os.path.join(self.root, "README"))
        self.feedinfo(os.path.join(self.root, "feeds.json"))


if __name__ == '__main__':
    # Script entry point: connect to the database and export the corpus
    # into the fixtures directory.
    import baleen.models as db

    db.connect()
    MongoExporter('fixtures/corpus').export()
Example #21
0
        """
        In the root directory writes each file and a README
        """

        if not os.path.exists(root):
            os.mkdir(root)

        if not os.path.isdir(root):
            raise Exception("%s is not a directory!" % root)

        for category in self.categories:
            dirname = os.path.join(root, category.replace(" ", "_"))
            if not os.path.exists(dirname):
                os.mkdir(dirname)

            for idx, post in enumerate(self.posts(category)):
                name = os.path.join(dirname, "%03i.html" % idx)
                with codecs.open(name, 'wb', encoding='utf8') as f:
                    f.write(post.htmlize())

        readme = os.path.join(root, "README")
        with codecs.open(readme, 'wb', encoding='utf8') as f:
            f.write(self.readme())

if __name__ == '__main__':
    # Script entry point: connect to the database, then export everything
    # into the fixtures directory.
    from baleen.models import connect

    connect()
    MongoExporter().export('fixtures/corpus')

Example #22
0
                raise ExportError("'{}' is not a directory!".format(path))

            catdir[category] = path

        # Iterate through all posts, writing them to disk correctly.
        # Right now we will simply write them based on their object id.
        for post, category in self.posts():
            path = os.path.join(self.root, catdir[category],
                                "{}.{}".format(post.id, self.scheme))

            with codecs.open(path, 'w', encoding='utf-8') as f:
                action = {
                    'json': lambda: post.to_json(indent=2),
                    'html': post.htmlize,
                }[self.scheme]

                f.write(action())

        # Mark the export as finished and write the README to the corpus.
        self.state = State.Finished
        self.readme(os.path.join(self.root, "README"))
        self.feedinfo(os.path.join(self.root, "feeds.json"))


if __name__ == '__main__':
    # When run as a script, export the corpus to fixtures/corpus after
    # connecting to the database.
    import baleen.models as db

    db.connect()
    MongoExporter('fixtures/corpus').export()
Example #23
0
        In the root directory writes each file and a README
        """
        categories = categories or self.categories

        if not os.path.exists(root):
            os.mkdir(root)

        if not os.path.isdir(root):
            raise Exception("%s is not a directory!" % root)

        for category in categories:
            dirname = os.path.join(root, category.replace(" ", "_"))
            if not os.path.exists(dirname):
                os.mkdir(dirname)

            for idx, post in enumerate(self.posts(category)):
                name = os.path.join(dirname, "%03i.html" % idx)
                with codecs.open(name, 'wb', encoding='utf8') as f:
                    f.write(post.htmlize())

        readme = os.path.join(root, "README")
        with codecs.open(readme, 'wb', encoding='utf8') as f:
            f.write(self.readme())


if __name__ == '__main__':
    # When run as a script, connect to the database and write the export
    # under fixtures/corpus.
    from baleen.models import connect

    connect()
    MongoExporter().export('fixtures/corpus')