Ejemplo n.º 1
0
class Crawler(object):
    """Provides utilities for retrieving website content.

    Wraps a ``requests_html.HTMLSession`` and keeps an in-memory list of
    error records (see ``_push_error``) that can be exported with
    ``write_errors``.

    Several private methods are wrapped with ``error_trap``, which is defined
    outside this class. Every call site here unpacks the wrapped call as a
    ``(value, error)`` pair, so that is the contract assumed throughout.
    """

    def __init__(self, **kwargs):
        """Configure the crawler from optional keyword arguments.

        Keyword Args:
            agent_type: forwarded to ``HeaderGenerator().header`` (default
                ``None``).
            referer: forwarded to ``HeaderGenerator().header`` (default
                ``None``).
            headers: request headers; generated from ``agent_type`` and
                ``referer`` when not supplied.
            timeout: default request timeout (default ``10``).
            resp_attributes: attribute names ``get`` accepts for an
                ``HTMLResponse``.
            elem_attributes: attribute names ``get`` accepts for an ``HTML``
                or ``Element`` object.
            logging: whether to create and use a ``Logger`` (default
                ``True``).
            log_path, log_file, log_name: forwarded to ``Logger`` when
                logging is enabled (defaults ``'.'``, ``'out.log'``,
                ``__name__``).
        """
        agent_type = kwargs.get('agent_type')
        referer = kwargs.get('referer')
        # Only build default headers when the caller did not pass any; an
        # explicitly supplied value (even None) is used untouched.
        if 'headers' in kwargs:
            self.headers = kwargs['headers']
        else:
            self.headers = HeaderGenerator().header(agent_type=agent_type,
                                                    referer=referer)
        self.timeout = kwargs.get('timeout', 10)
        self.resp_attributes = kwargs.get('resp_attributes', [
            'content', 'encoding', 'headers', 'history', 'html', 'json', 'ok',
            'reason', 'status_code', 'text', 'url'
        ])
        self.elem_attributes = kwargs.get('elem_attributes', [
            'absolute_links', 'base_url', 'encoding', 'full_text', 'html',
            'links', 'raw_html', 'text', 'url'
        ])
        self._logging = kwargs.get('logging', True)
        if self._logging:
            # self._logger only exists when logging is enabled; every use
            # below is guarded by self._logging.
            self._logger = Logger(name=kwargs.get('log_name', __name__),
                                  log_path=kwargs.get('log_path', '.'),
                                  log_file=kwargs.get('log_file', 'out.log'))
        self.session = requests_html.HTMLSession()
        self._err_recs = []

    def _push_error(self, error, url, comp_id=None, attr=None):
        """Log an error (when logging is on) and append it to the records.

        Args:
            error: exception or message describing the failure.
            url: URL the failure relates to (may be ``None``).
            comp_id: optional company identifier; stored as a string.
            attr: optional name of the attribute that was being read.
        """
        # Compare against None so a legitimate id of 0 is not dropped.
        c_id = str(comp_id) if comp_id is not None else None
        if self._logging:
            if c_id:
                msg = ('\nRequest for response from {} for company {} ' +
                       'threw exception: {}\n').format(url, c_id, error)
            elif attr:
                msg = (
                    '\nRequest for "{}" from {} threw exception: {}\n'.format(
                        attr, url, error))
            else:
                msg = ('\nRequest for response from {} threw exception: {}\n'.
                       format(url, error))
            self._logger.error(msg)
        self._err_recs.append({
            'time': strftime('%Y-%m-%d %H:%M:%S'),
            'company_profile_id': c_id,
            'attribute': attr,
            'url': url,
            'exception': error
        })

    @error_trap
    def _get_response(self, url, headers, timeout, cookies):
        """Issue a GET through the session and classify the outcome.

        Returns:
            A 3-tuple ``(response, reason, final_url)``. ``reason`` is
            ``None`` when ``response.ok`` is true and ``response.reason``
            otherwise. If the session returns ``None`` the tuple is
            ``(None, None, url)``.
        """
        r = self.session.get(url,
                             headers=headers,
                             timeout=timeout,
                             cookies=cookies)
        if r is None:
            return None, None, url

        ok = r.ok
        if self._logging:
            msg = ('\nOrig_URL: {}; Ret_URL: {}; ' +
                   'status: {}, reason: {}\n').format(url, r.url,
                                                      r.status_code, r.reason)
            # Successful responses are informational; failures are warnings.
            if ok:
                self._logger.info(msg)
            else:
                self._logger.warning(msg)
        return r, (None if ok else r.reason), r.url

    def response(self,
                 url,
                 headers=None,
                 timeout=None,
                 cookies=None,
                 c_id=None):
        """Fetch ``url``, retrying once with the http/https scheme flipped.

        Args:
            url: address to request.
            headers: request headers; falls back to ``self.headers``.
            timeout: request timeout; falls back to ``self.timeout``.
            cookies: cookies forwarded to the session.
            c_id: optional company identifier attached to error records.

        Returns:
            The response object, or ``None`` when the retry raised. A
            failure of the first attempt alone is not recorded; only the
            outcome of the retry is pushed to the error records.
        """
        headers = headers or self.headers
        timeout = timeout or self.timeout

        result, err = self._get_response(url, headers, timeout, cookies)
        # First attempt is good only if nothing raised, a response came back
        # and it carried no failure reason.
        if not err and not result[1] and result[0] is not None:
            return result[0]

        flipped = furl(url)
        flipped.scheme = 'https' if flipped.scheme == 'http' else 'http'
        flipped_url = flipped.url

        result, err = self._get_response(flipped_url, headers, timeout,
                                         cookies)
        if err:
            self._push_error(err, flipped_url, comp_id=c_id)
            return None
        if result[0] is None:
            self._push_error('Response is NULL', flipped_url, comp_id=c_id)
        if result[1]:
            self._push_error(result[1], flipped_url, comp_id=c_id)
        return result[0]

    @error_trap
    def _check_valid_get(self, obj, a):
        """Validate that attribute ``a`` may be read from ``obj``.

        Raises:
            AssertionError: ``a`` is not in the allowed list for the type of
                ``obj``. Raised explicitly (not via ``assert``) so the check
                survives ``python -O``.
            TypeError: ``obj`` is not an ``HTMLResponse``, ``HTML`` or
                ``Element``.
        """
        if isinstance(obj, requests_html.HTMLResponse):
            allowed = self.resp_attributes
        elif isinstance(obj, (requests_html.HTML, requests_html.Element)):
            allowed = self.elem_attributes
        else:
            raise TypeError('First parameter must be one of type ' +
                            'requests_html.HTMLResponse, ' +
                            'requests_html.HTML, or ' +
                            'requests_html.Element')
        if a not in allowed:
            raise AssertionError('Second parameter must be one of: {}'.format(
                ', '.join(allowed)))
        return

    @error_trap
    def _get(self, obj, a):
        """Read attribute ``a`` from ``obj`` after validating the request.

        Validation failures and ``None`` attribute values are recorded via
        ``_push_error``. Returns the attribute value, or ``None`` when
        validation failed.
        """
        _, err = self._check_valid_get(obj, a)
        if err:
            # An AssertionError means obj has a supported type, so its URL
            # can be looked up for the record. Skip the lookup when 'url'
            # itself was rejected, otherwise this would recurse forever.
            if isinstance(err, AssertionError) and a != 'url':
                u = self.get(obj, 'url')
            else:
                u = None
            self._push_error(err, u, attr=a)
            return None

        # 'json' is a method on the response; everything else is read as a
        # plain attribute.
        attr = getattr(obj, a)() if a == 'json' else getattr(obj, a)
        if attr is None:
            u = self.get(obj, 'url') if a != 'url' else None
            self._push_error('NULL attribute', u, attr=a)
        return attr

    def get(self, obj, a):
        """Return attribute ``a`` of ``obj``, recording any failure.

        Args:
            obj: an ``HTMLResponse``, ``HTML`` or ``Element`` instance.
            a: name of the attribute to read.

        Returns:
            The attribute value, or ``None`` when it could not be read.
        """
        attr, err = self._get(obj, a)
        if err:
            # Unpack explicitly: writing this as a single conditional
            # expression followed by ", None" binds the whole (value, error)
            # pair to the first name and hides the lookup error.
            if a != 'url':
                u, e = self._get(obj, 'url')
            else:
                u, e = None, None
            if e:
                self._push_error(e, u, attr='url')
            self._push_error(err, u, attr=a)
        return attr

    @error_trap
    def _write_errors(self, outfile):
        """Write the collected error records to ``outfile``.

        The format is chosen from the text after the last ``.`` in
        ``outfile``: ``pkl``, ``xlsx`` or ``csv``.

        Raises:
            AssertionError: the extension is not one of the three supported
                ones. Raised explicitly so the check survives ``python -O``.

        Returns:
            ``outfile`` on success.
        """
        ft = outfile.split('.')[-1]
        if ft not in ('pkl', 'xlsx', 'csv'):
            raise AssertionError(
                'Output filename must specify a pickle (.pkl), ' +
                'excel (.xlsx) or csv (.csv) file.')
        frame = pd.DataFrame(self._err_recs)
        if ft == 'pkl':
            frame.to_pickle(outfile)
        elif ft == 'xlsx':
            frame.to_excel(outfile, engine='xlsxwriter', index=False)
        else:
            frame.to_csv(outfile, index=False)
        return outfile

    def write_errors(self, out_fn):
        """Export the error records to ``out_fn`` and return the result.

        A failure while writing is logged (when logging is enabled) instead
        of being raised; the value returned is whatever the trapped
        ``_write_errors`` call produced.
        """
        outfile, err = self._write_errors(out_fn)
        if err and self._logging:
            msg = '\nError while writing out error log: {}\n'.format(err)
            self._logger.error(msg)
        return outfile
Ejemplo n.º 2
0
class Worker:
    """RabbitMQ consumer that extracts image features and stores them.

    For each message (a file name) the worker downloads the file through a
    ``Bucket`` handler, runs ``extract_features`` with the configured model,
    optionally asks a deduplication service whether the image already
    exists, and otherwise inserts the feature vector into MongoDB.

    Note that constructing a ``Worker`` blocks: ``__init__`` ends by calling
    ``start_consuming()`` on the channel.
    """

    def __init__(self, params):
        """Connect to RabbitMQ and MongoDB, load the model, then consume.

        Args:
            params: mapping with required keys ``queue_name``,
                ``model_url``, ``bucket_name``, ``deduplicate_model`` and
                ``deduplicate_threshold``, and optional keys
                ``rabbitmq_hostname`` (default ``"localhost"``) and
                ``mongo_address`` (default ``"localhost:27017"``).
        """
        queue_name = params["queue_name"]
        model_url = params["model_url"]
        host_name = params.get("rabbitmq_hostname", "localhost")
        mongo_address = params.get("mongo_address", "localhost:27017")
        self.bucket_name = params["bucket_name"]
        self.deduplicate_model = params["deduplicate_model"]
        self.deduplicate_threshold = params["deduplicate_threshold"]
        self.logger = Logger()

        # Retry every 3 seconds until the broker connection succeeds.
        while True:
            try:
                if self.set_up_rabbitmq_connection(host_name, queue_name):
                    break
            except Exception as e:
                self.logger.error(
                    f"Failed to connect to rabbitmq queue {queue_name} at {host_name}. Reason: {e}"
                )
                time.sleep(3)

        self.num_threads = 4
        self.model_name = queue_name

        # Use the stored attribute; there is no local named bucket_name in
        # this scope.
        self.bucket_handler = Bucket(self.bucket_name)

        self.logger.info(f"Extract worker for model: {queue_name}")
        self.model = model_picker(queue_name, model_url)

        self.logger.info(f"Connecting to mongodb at {mongo_address}")
        client = MongoClient(mongo_address)
        self.db = client.features

        # start consuming (blocks)
        try:
            self.channel.start_consuming()
        finally:
            # Release the broker connection even when a callback raised; the
            # is_open guard keeps a close failure from masking that error.
            if self.connection.is_open:
                self.connection.close()

    def set_up_rabbitmq_connection(self, host_name, queue_name):
        """Open the broker connection and subscribe ``process`` to the queue.

        Declares the durable queue, declares the durable fanout exchange
        ``"features"``, binds the queue to it, sets a prefetch count of 20
        and registers ``self.process`` as the consumer callback.

        Returns:
            ``True`` once everything is set up; any failure propagates as an
            exception.
        """
        # NOTE(review): credentials are hard-coded; consider reading them
        # from configuration.
        credentials = pika.PlainCredentials('admin', 'admin')
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host_name, credentials=credentials))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name, durable=True)
        self.channel_name = "features"
        self.channel.exchange_declare(exchange=self.channel_name,
                                      exchange_type="fanout",
                                      durable=True)
        self.channel.queue_bind(exchange=self.channel_name, queue=queue_name)
        self.channel.basic_qos(prefetch_count=20)
        # set up subscription on the queue
        self.channel.basic_consume(queue_name, self.process)
        return True

    def get_public_url(self, file_name):
        """Return the storage.googleapis.com URL of ``file_name`` in the bucket."""
        return f"https://storage.googleapis.com/{self.bucket_name}/{file_name}"

    def check_duplication(self, img_name, feature):
        """Ask the deduplication service whether ``feature`` already exists.

        Posts the feature vector to the ``serving-<deduplicate_model>``
        search endpoint. When the first result's ``distance`` is at most
        ``self.deduplicate_threshold`` the image name is published to the
        ``duplicated_files`` routing key.

        Args:
            img_name: file name, used in messages and as the published body.
            feature: feature vector; must provide ``tolist()``.

        Returns:
            ``True`` when the image is considered a duplicate, ``False``
            otherwise (including a non-200 reply or an empty result).
        """
        # NOTE(review): no timeout is set on this request, so a stalled
        # service blocks the consumer callback.
        response = requests.post(
            f"http://serving-{self.deduplicate_model}:5000/search?json=true",
            json=feature.tolist())
        if response.status_code != 200:
            print(f"Deduplicate request fails for image {img_name}")
            return False

        result = response.json()
        if not result:
            return False

        # Only the first result is inspected — presumably the closest match;
        # verify against the search service.
        best_match = result[0]["distance"]
        is_duplicated = best_match <= self.deduplicate_threshold
        if is_duplicated:
            print(f"Image {img_name} already exists")
            self.channel.basic_publish(exchange="",
                                       routing_key="duplicated_files",
                                       body=img_name)
        return is_duplicated

    @FAILURE_COUNTER.count_exceptions()
    @REQUEST_TIME.time()
    def process(self, ch, method, properties, file_name):
        """Consumer callback: extract, deduplicate and store one file.

        Args:
            ch: channel passed by the consumer (unused; ``self.channel`` is
                used instead).
            method: delivery metadata; its ``delivery_tag`` is acknowledged.
            properties: message properties (unused).
            file_name: message body as bytes, decoded to the file name.

        The message is acknowledged after a duplicate is detected or after
        the feature is inserted. If any step raises, no acknowledgement is
        sent and the exception propagates.
        """
        file_name = file_name.decode()
        print(f"Processing file {file_name}")
        downloaded_dir = "./tmp"
        local_file_path = self.bucket_handler.download(file_name,
                                                       downloaded_dir)
        feature = extract_features(local_file_path, self.model)

        # Duplicates are acknowledged and skipped without being stored.
        if self.deduplicate_model and self.check_duplication(
                file_name, feature):
            self.channel.basic_ack(delivery_tag=method.delivery_tag)
            return

        self.db[self.model_name].insert_one({
            "url": self.get_public_url(file_name),
            "feature": feature.tolist()
        })
        self.channel.basic_ack(delivery_tag=method.delivery_tag)