"""Heroshi URL server WSGI application.""" from base64 import b64encode import eventlet, eventlet.pools, eventlet.wsgi import hashlib try: import yajl as json except ImportError: import json import webob import webob.exc from heroshi import get_logger log = get_logger("manager.server") from heroshi.conf import settings from heroshi.misc import gzip_string from heroshi.wsgi import method_dispatcher from .manager import Manager AUTH_HEADER = "X-Heroshi-Auth" MIN_COMPRESS_LENGTH = 400 manager_pool = eventlet.pools.Pool(max_size=1) manager_pool.create = Manager class Response(webob.Response): default_content_type = 'text/plain' default_conditional_response = True
# coding: utf-8 """Heroshi crawl reporter entry-point. Reads \\n-separated list of JSON crawl results on stdin.""" import json import logging, sys from optparse import OptionParser import heroshi, heroshi.api from heroshi.log import update_loggers_level log = heroshi.get_logger("cli_report") def parse_params(): usage_info = u"Usage: %prog [OPTION...]" version_info = u"Heroshi/" + heroshi.__version__ opt_parser = OptionParser(usage_info, version=version_info) opt_parser.set_defaults(verbose=False, quiet=False, forever=False) opt_parser.add_option('-q', '--quiet', action="store_true", help=u"Be quiet, don't generate any output") opt_parser.add_option('-v', '--verbose', action="store_true", help=u"Be verbose, print detailed information") (options, args) = opt_parser.parse_args() return options, args def main(): options, args = parse_params() # set up logging if options.quiet:
# coding: utf-8
"""Heroshi worker: IO-worker interaction."""

import errno
from eventlet import sleep, with_timeout
from eventlet.queue import Event
import json
import subprocess

from heroshi import error, get_logger
log = get_logger("worker.io")


class IoWorkerDead(error.Error):
    """Raised when the external io-worker subprocess dies."""
    pass


class Worker(object):
    """IO worker.

    Wraps the external "io-worker" subprocess; `results` collects
    per-request outcomes.
    """

    def __init__(self, is_closed):
        # Callable (presumably) used to detect shutdown -- confirm at caller.
        self.is_closed = is_closed
        self.results = {}
        # Subprocess handle; populated later (not in the visible chunk).
        self.worker = None

    def run_loop(self):
        """Runs io-worker until it dies.

        You SHOULD spawn this function (so it runs in separate thread).
        """
        args = ["io-worker/io-worker", "-skip-robots"]
# NOTE(review): SOURCE chunk ends here; the rest of run_loop() is not visible.
"""Heroshi URL server implementation main module.""" __all__ = ['Manager'] import datetime import dateutil.parser import eventlet, eventlet.pools, eventlet.queue from eventlet import greenthread, spawn, sleep, Queue eventlet.monkey_patch(all=False, socket=True, select=True, psycopg=True) try: import yajl as json except ImportError: import json from heroshi import TIME_FORMAT, get_logger, log_exceptions log = get_logger("manager") from heroshi.conf import settings from heroshi.data import Cache from heroshi.misc import reraise_errors from heroshi.profile import Profile from heroshi.storage.postgres import StorageConnection class Manager(object): """Class encapsulating Heroshi URL server state.""" def __init__(self): self.active = False self.prefetch_queue = Queue(settings.prefetch['queue_size']) self.prefetch_thread = spawn(self.prefetch_worker)
"""Custom profiler implementation.""" import time from heroshi import get_logger log = get_logger("profile") class Profile(object): def __init__(self, name): self.name = name self.start_time = None def __enter__(self): self.start_time = time.time() def __exit__(self, exc_type, exc_value, exc_tb): end = time.time() time_passed = end - self.start_time log.info(u"%s: %d ms", self.name, time_passed * 1000) return False def decorate(self): def wrapper(func): def wrapped(*args, **kwargs): with self: return func(*args, **kwargs) return wrapped return wrapper
sends crawl info back to queue server.""" from datetime import datetime import eventlet from eventlet import GreenPool, greenthread, sleep, spawn, with_timeout from eventlet.queue import Empty, Queue import httplib2 import json import random, time, urllib, urlparse import robotparser import sys from heroshi import TIME_FORMAT from heroshi import api, error, get_logger log = get_logger("worker.Crawler") from heroshi.conf import settings from heroshi.data import PoolMap from heroshi.error import ApiError, CrawlError, FetchError, RobotsError from heroshi.misc import reraise_errors from heroshi.worker import io eventlet.monkey_patch(all=False, os=True, socket=True, select=True) class Stop(error.Error): pass class Crawler(object): def __init__(self, max_connections, input_is_plain):
"""PostgreSQL storage backend for Heroshi.""" __all__ = ['StorageConnection'] import base64 from datetime import datetime from itertools import imap from functools import partial import hashlib import json import psycopg2 import psycopg2.extensions psycopg2.extensions.register_type(psycopg2.extensions.UNICODE) from heroshi import TIME_FORMAT, get_logger log = get_logger("storage.postgres") from heroshi.conf import settings from heroshi.error import StorageError from . import dbhelpers, sql RANDOMIZER_K = 50 RECHECK_INTERVAL = '2 days' TABLE = 'metadata' def row_factory(columns, values): row = dbhelpers.dict_factory(columns, values) row.update(json.loads(row.pop('var') or "{}")) row['headers'] = json.loads(row['headers'] or "{}") return row
Crawler uses these helpers to communicate with URL server.""" from eventlet.pools import Pool import httplib2 try: import yajl as json except ImportError: import json import socket from urllib import urlencode from heroshi import get_logger log = get_logger("api") from heroshi.conf import settings from heroshi.error import ApiError manager_connections = Pool(max_size=2) manager_connections.create = lambda: httplib2.Http(timeout=20) def request_manager(resource, method, data=None, headers=None): use_headers = { 'User-Agent': settings.identity['user_agent'], 'X-Heroshi-Auth': settings.api_key, 'Expect': '', # a try to fix result: 100 not-ok problem } if headers is not None:
"""Heroshi Postgres database helpers.""" from psycopg2 import DatabaseError, IntegrityError from heroshi import get_logger log = get_logger("storage.dbhelpers") from heroshi.error import StorageError from . import sql class DbRow(object): def __repr__(self): return u"<DbRow %s>" % ", ".join("%s=%s" % (n, v) for (n,v) in self.__dict__.iteritems()) def obj_factory(columns, row): r = DbRow() r.__dict__ = dict_factory(columns, row) return r def dict_factory(columns, row): return dict( (column, row[index]) for index, column in enumerate(columns) ) def fetch(cursor, factory=None): """Fetches all results from cursor as *list* of `factory()` items. Default factory is `dict_factory`, it makes dict with column names as keys. You can also use `obj_factory`, it makes `DbRow` objects