Esempio n. 1
0
    def __init__(self, rabbitmq_url=None, queue=None, routing_key=None,
                 exchange="message", exchange_type="direct", log=None,
                 max_tasks=5, logging=None):
        """Store the broker settings, create the task pool and pick a logger.

        The keyword arguments appear to mirror a JSON configuration section
        of the following shape (NOTE(review): the code that maps the JSON
        onto these arguments is not part of this method -- confirm against
        the caller)::

            {
                "rabbit": {
                    "url": "amqp://rabbit",
                    "queue": "test",
                    "routingKey": "example.json",
                    "exchange": "message",
                    "exchangeType": "direct"
                }
            }

        :param str rabbitmq_url: broker url; stored as given, not validated
                                 here
        :param str queue: name of the queue (required)
        :param str routing_key: routing key for the queue
        :param str exchange: name of the exchange, default ``"message"``
        :param str exchange_type: type of the exchange, default ``"direct"``
        :param logging.Logger log: ready-made logger; when given it is used
                                   as-is and ``logging`` is ignored
        :param int max_tasks: ``max_workers`` of the thread pool executor
        :param logging: passed as ``config`` to
                        ``toddler.logging.setup_logging`` when ``log`` is
                        not supplied
        :raises exceptions.NotConfigured: when ``queue`` is ``None``
        """
        # Refuse to build a half-configured instance: bail out before any
        # attribute is set.
        if queue is None:
            raise exceptions.NotConfigured("Misssing queue")

        # Broker settings, kept exactly as passed in.
        self._rabbitmq_url = rabbitmq_url
        self._queue = queue
        self._routing_key = routing_key
        self._exchange = exchange
        self._exchange_type = exchange_type

        # Connection state; nothing is opened by the constructor.
        self._connection = None
        self._channel = None
        self._consumer_tag = None
        self._closing = False

        # Task bookkeeping and the pool sized by ``max_tasks``.
        self._max_tasks = max_tasks
        self._tasks_number = 0
        self._max_tasks_warning_counter = 0
        self._executor = ThreadPoolExecutor(max_workers=self._max_tasks)

        # An explicit logger wins; otherwise build one, forwarding the
        # optional logging configuration.
        if log is not None:
            self.log = log
        else:
            # Imported lazily so the dependency is only needed on this path.
            from toddler.logging import setup_logging
            self.log = (
                setup_logging()
                if logging is None
                else setup_logging(config=logging)
            )
Esempio n. 2
0
def get_documents(search_url, params: dict, nb_rows=600, per_page=100):
    """Yield extracted hits from the paginated ``/search-api/search`` endpoint.

    Pages are requested at offsets ``0, per_page, 2 * per_page, ...`` below
    ``nb_rows``. The ``context`` attribute of each answer is sent back with
    the next request. Every ``Hit`` element of a page with a positive
    ``nhits`` is passed through ``_extract_hit`` and yielded.

    :param str search_url: base url the ``/search-api/search`` path is
                           joined onto
    :param dict params: base query parameters; copied into a ``Dict`` so the
                        caller's mapping is not modified. ``language``,
                        ``synthesis``, ``hf``, ``start`` and ``context`` are
                        overwritten.
    :param int nb_rows: upper bound (exclusive) for the ``start`` offsets
    :param int per_page: offset step, also sent as the ``hf`` parameter
                         (presumably hits per page -- confirm against the
                         search API)
    :return: generator of whatever ``_extract_hit`` returns for each hit
    """
    log = setup_logging()
    params = Dict(**params)
    params.language = "en"
    params.synthesis = "disabled"  # no synthesis
    params.hf = per_page

    # The endpoint is the same for every page, so build it once.
    url = urljoin(search_url, "/search-api/search")

    context = None
    i = 0
    for x in range(0, nb_rows, per_page):
        params.start = x
        params.context = context

        # Retry the same page until the server answers 200.
        # NOTE(review): there is no retry limit or back-off, so a
        # persistently failing server keeps this loop spinning forever.
        while True:
            response = requests.get(url, params=params.to_dict())
            if response.status_code == 200:
                break
            # Report the status that was actually received, not a fixed 500.
            log.error("Got %d: %s" % (response.status_code, url))
            log.debug(params.to_dict())
            log.debug(response.text)

        doc = BeautifulSoup(response.text.encode("utf8"), ['lxml', 'xml'])
        # Carried over to the next page request.
        context = doc.Answer['context']
        if int(doc.Answer['nhits']) > 0:
            for hit in doc.find_all("Hit"):
                log.info("Extracted %d document" % i)
                i += 1
                yield _extract_hit(hit)
Esempio n. 3
0
    def __init__(self, rabbitmq_url, exchange, routing_key, logging=None, log=None, **kwargs):
        """Keep the broker settings, build the scheduler, the delay-queue
        thread object and the logger.

        :param rabbitmq_url: broker url; also handed to
                             ``AnalysisTaskDelayQueueThread``
        :param exchange: exchange name, stored as given
        :param routing_key: routing key, stored as given
        :param logging: passed as ``config`` to
                        ``toddler.logging.setup_logging`` when ``log`` is
                        not supplied
        :param log: ready-made logger; when given it is used as-is and
                    ``logging`` is ignored
        :param kwargs: accepted and ignored
        """
        self.rabbitmq_url = rabbitmq_url
        self.exchange = exchange
        self.routing_key = routing_key

        # Scheduler driven by the wall clock and a blocking sleep
        # (presumably ``sched.scheduler`` -- confirm against the imports).
        self.scheduler = scheduler(time.time, time.sleep)

        self.delay_queue_thread = AnalysisTaskDelayQueueThread(self.rabbitmq_url)

        # An explicit logger wins; otherwise build one, forwarding the
        # optional logging configuration.
        if log is None:
            # Imported lazily so the dependency is only needed on this path.
            from toddler.logging import setup_logging

            setup_kwargs = {} if logging is None else {"config": logging}
            log = setup_logging(**setup_kwargs)
        self.log = log