Beispiel #1
0
def crawl_command(args):
    """Crawl every subscribed feed and store the results in the stage.

    The subscription list is read from the stage built from
    ``args.repository`` and ``args.session_id``.  Each crawled feed is
    stored in ``stage.feeds`` under the SHA-1 hex digest of its URL.

    :param args: parsed command-line arguments; this function reads
                 ``repository``, ``session_id``, ``threads`` and
                 ``verbose``
    :returns: ``None``; problems are reported on standard error

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    # Fall back to cpu_count() only when no thread count was given.
    threads_count = args.threads if args.threads is not None else cpu_count()

    generator = crawl(urllist, threads_count)
    # The try wraps the whole loop, so the first CrawlError ends crawling.
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            # hashlib only accepts bytes: hashing a text string raises
            # TypeError on Python 3 (and UnicodeEncodeError for non-ASCII
            # unicode on Python 2), so encode text explicitly.  Byte
            # strings are passed through untouched, which keeps the ids
            # of ASCII URLs identical to what was produced before.
            url_bytes = feed_url
            if not isinstance(url_bytes, bytes):
                url_bytes = url_bytes.encode('utf-8')
            feed_id = hashlib.sha1(url_bytes).hexdigest()
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
Beispiel #2
0
def crawl_command(args):
    """Crawl subscribed feeds and store each result in the stage.

    When ``args.feed_id`` is set, only subscriptions with that feed id
    are crawled; otherwise every subscription is crawled.  Crawled feed
    data is stored in ``stage.feeds`` under the subscription's feed id.

    :param args: parsed command-line arguments; this function reads
                 ``repository``, ``session_id``, ``feed_id``,
                 ``threads`` and ``verbose``
    :returns: ``None``; problems are reported on standard error

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Map feed URL -> feed id, restricted to the requested id if any.
    wanted_id = args.feed_id
    feed_map = {}
    for sub in opml.recursive_subscriptions:
        if not wanted_id or sub.feed_id == wanted_id:
            uri, ident = sub.feed_uri, sub.feed_id
            feed_map[uri] = ident
    if not feed_map:
        if wanted_id:
            print('There is no such feed:', wanted_id, file=sys.stderr)
        else:
            print('No feeds to crawl', file=sys.stderr)
        return

    threads_count = args.threads
    if threads_count is None:
        threads_count = cpu_count()

    results = iter(crawl(feed_map.keys(), threads_count))
    # next() is called by hand so that a failure on one feed is reported
    # and the loop moves on to the next result.
    # NOTE(review): this presumably relies on crawl() returning an
    # iterator that can keep yielding after it has raised -- confirm.
    while True:
        try:
            feed_url, feed_data, _hints = next(results)
            if args.verbose:
                entry_count = len(feed_data.entries)
                print('{0.title} - {1} entries'.format(feed_data,
                                                       entry_count))
            with stage:
                target_id = feed_map[feed_url]
                stage.feeds[target_id] = feed_data
        except (CrawlError, SchemaError) as error:
            if isinstance(error, CrawlError):
                print('Something went wrong with', error.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(error, file=sys.stderr)
        except StopIteration:
            break
Beispiel #3
0
def crawl_command(args):
    """Crawl the subscribed feeds and save what was fetched to the stage.

    If ``args.feed_id`` is given, crawling is limited to subscriptions
    carrying that feed id; otherwise all subscriptions are crawled.
    Every successfully crawled feed is written to ``stage.feeds`` keyed
    by its subscription's feed id.

    :param args: parsed command-line arguments; this function reads
                 ``repository``, ``session_id``, ``feed_id``,
                 ``threads`` and ``verbose``
    :returns: ``None``; problems are reported on standard error

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    requested_id = args.feed_id
    if requested_id:
        pairs = [(sub.feed_uri, sub.feed_id)
                 for sub in opml.recursive_subscriptions
                 if sub.feed_id == requested_id]
        empty_message = ('There is no such feed:', requested_id)
    else:
        pairs = [(sub.feed_uri, sub.feed_id)
                 for sub in opml.recursive_subscriptions]
        empty_message = ('No feeds to crawl',)
    # Feed URL -> feed id; used to look up the id of each crawled feed.
    feed_map = dict(pairs)
    if not feed_map:
        print(*empty_message, file=sys.stderr)
        return

    if args.threads is None:
        threads_count = cpu_count()
    else:
        threads_count = args.threads

    iterator = iter(crawl(feed_map.keys(), threads_count))
    # Pull results one at a time so an error on a single feed is
    # reported without leaving the loop.
    # NOTE(review): this presumably relies on crawl() returning an
    # iterator that can keep yielding after it has raised -- confirm.
    while True:
        try:
            feed_url, feed_data, _crawler_hints = next(iterator)
            if args.verbose:
                summary = '{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                )
                print(summary)
            with stage:
                crawled_id = feed_map[feed_url]
                stage.feeds[crawled_id] = feed_data
        except (CrawlError, SchemaError) as exc:
            if isinstance(exc, CrawlError):
                print('Something went wrong with', exc.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(exc, file=sys.stderr)
        except StopIteration:
            break
def test_cpu_count():
    """Check that cpu_count() returns a positive integral number."""
    is_integral = isinstance(cpu_count(), numbers.Integral)
    assert is_integral
    is_positive = cpu_count() > 0
    assert is_positive