from time import strftime

import pandas as pd
import requests_html
from furl import furl

# Project-local helpers; the import paths below are assumptions.
# HeaderGenerator builds request headers, Logger wraps file logging, and
# error_trap is a decorator that returns a (result, exception) tuple
# instead of raising.
from headers import HeaderGenerator
from logger import Logger
from decorators import error_trap


class Crawler(object):
    """Provides utilities for retrieving website content."""

    def __init__(self, **kwargs):
        agent_type = kwargs.setdefault('agent_type', None)
        referer = kwargs.setdefault('referer', None)
        self.headers = kwargs.setdefault(
            'headers',
            HeaderGenerator().header(agent_type=agent_type, referer=referer))
        self.timeout = kwargs.setdefault('timeout', 10)
        # Attributes that may be requested from an HTMLResponse object.
        self.resp_attributes = kwargs.setdefault('resp_attributes', [
            'content', 'encoding', 'headers', 'history', 'html', 'json', 'ok',
            'reason', 'status_code', 'text', 'url'
        ])
        # Attributes that may be requested from an HTML or Element object.
        self.elem_attributes = kwargs.setdefault('elem_attributes', [
            'absolute_links', 'base_url', 'encoding', 'full_text', 'html',
            'links', 'raw_html', 'text', 'url'
        ])
        self._logging = kwargs.setdefault('logging', True)
        if self._logging:
            log_path = kwargs.setdefault('log_path', '.')
            log_file = kwargs.setdefault('log_file', 'out.log')
            log_name = kwargs.setdefault('log_name', __name__)
            self._logger = Logger(name=log_name, log_path=log_path,
                                  log_file=log_file)
        self.session = requests_html.HTMLSession()
        self._err_recs = []

    def _push_error(self, error, url, comp_id=None, attr=None):
        """Log an error and append it to the in-memory error records."""
        c_id = str(comp_id) if comp_id else comp_id
        if self._logging:
            if c_id:
                msg = ('\nRequest for response from {} for company {} '
                       'threw exception: {}\n').format(url, c_id, error)
            elif attr:
                msg = ('\nRequest for "{}" from {} threw exception: '
                       '{}\n').format(attr, url, error)
            else:
                msg = ('\nRequest for response from {} threw exception: '
                       '{}\n').format(url, error)
            self._logger.error(msg)
        self._err_recs.append({
            'time': strftime('%Y-%m-%d %H:%M:%S'),
            'company_profile_id': c_id,
            'attribute': attr,
            'url': url,
            'exception': error
        })

    @error_trap
    def _get_response(self, url, headers, timeout, cookies):
        """GET a URL; return (response, failure reason, final URL)."""
        r = self.session.get(url, headers=headers, timeout=timeout,
                             cookies=cookies)
        if r is None:
            return None, None, url
        if r.ok:
            if self._logging:
                self._logger.info(
                    '\nOrig_URL: {}; Ret_URL: {}; status: {}, '
                    'reason: {}\n'.format(url, r.url, r.status_code, r.reason))
            return r, None, r.url
        if self._logging:
            self._logger.warning(
                '\nOrig_URL: {}; Ret_URL: {}; status: {}, '
                'reason: {}\n'.format(url, r.url, r.status_code, r.reason))
        return r, r.reason, r.url

    def response(self, url, headers=None, timeout=None, cookies=None,
                 c_id=None):
        """Fetch a URL, retrying once with the opposite scheme on failure."""
        headers = headers or self.headers
        timeout = timeout or self.timeout

        def flip_scheme():
            # Swap http <-> https; some sites only answer on one scheme.
            u = furl(url)
            u.scheme = 'https' if u.scheme == 'http' else 'http'
            return u.url

        f_val, err = self._get_response(url, headers, timeout, cookies)
        if err or f_val[1] or f_val[0] is None:
            flipped_url = flip_scheme()
            f_val, err = self._get_response(flipped_url, headers, timeout,
                                            cookies)
            if err:
                self._push_error(err, flipped_url, comp_id=c_id)
                return None
            if f_val[0] is None:
                self._push_error('Response is NULL', flipped_url,
                                 comp_id=c_id)
            if f_val[1]:
                self._push_error(f_val[1], flipped_url, comp_id=c_id)
        return f_val[0]

    @error_trap
    def _check_valid_get(self, obj, a):
        """Validate that attribute `a` is legal for the type of `obj`."""
        if isinstance(obj, requests_html.HTMLResponse):
            assert a in self.resp_attributes, \
                'Second parameter must be one of: {}'.format(
                    ', '.join(self.resp_attributes))
        elif isinstance(obj, (requests_html.HTML, requests_html.Element)):
            assert a in self.elem_attributes, \
                'Second parameter must be one of: {}'.format(
                    ', '.join(self.elem_attributes))
        else:
            raise TypeError('First parameter must be of type '
                            'requests_html.HTMLResponse, '
                            'requests_html.HTML, or '
                            'requests_html.Element')

    @error_trap
    def _get(self, obj, a):
        _, err = self._check_valid_get(obj, a)
        if err:
            # An AssertionError means the object type was valid, so its url
            # can still be fetched for the error record.
            u = self.get(obj, 'url') if isinstance(err, AssertionError) \
                else None
            self._push_error(err, u, attr=a)
            return None
        # `json` is a method on the response, not a plain attribute.
        attr = getattr(obj, a) if a != 'json' else getattr(obj, a)()
        if attr is None:
            u = self.get(obj, 'url') if a != 'url' else None
            self._push_error('NULL attribute', u, attr=a)
        return attr

    def get(self, obj, a):
        """Safely fetch attribute `a` from a response/HTML/element object."""
        attr, err = self._get(obj, a)
        if err:
            u, e = self._get(obj, 'url') if a != 'url' else (None, None)
            if e:
                self._push_error(e, u, attr='url')
            self._push_error(err, u, attr=a)
        return attr

    @error_trap
    def _write_errors(self, outfile):
        ft = outfile.split('.')[-1]
        assert ft in ['pkl', 'xlsx', 'csv'], \
            ('Output filename must specify a pickle (.pkl), '
             'excel (.xlsx) or csv (.csv) file.')
        if ft == 'pkl':
            pd.DataFrame(self._err_recs).to_pickle(outfile)
        elif ft == 'xlsx':
            pd.DataFrame(self._err_recs).to_excel(outfile,
                                                  engine='xlsxwriter',
                                                  index=False)
        else:
            pd.DataFrame(self._err_recs).to_csv(outfile, index=False)
        return outfile

    def write_errors(self, out_fn):
        """Write accumulated error records to pickle, excel, or csv."""
        outfile, err = self._write_errors(out_fn)
        if err:
            if self._logging:
                msg = '\nError while writing out error log: {}\n'.format(err)
                self._logger.error(msg)
        return outfile
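
# Usage sketch (illustrative, not part of the class above): a minimal driver
# for Crawler. The URL, company id, and file names are placeholders.
def _demo_crawl():
    crawler = Crawler(timeout=15, log_file='crawl.log')
    r = crawler.response('http://example.com', c_id='12345')
    if r is not None:
        # get() validates the attribute name against the object's type and
        # records failures in the error log instead of raising.
        print(crawler.get(r, 'status_code'))
        print(len(crawler.get(r.html, 'absolute_links') or []))
    # Persist accumulated errors; the format is inferred from the extension.
    crawler.write_errors('crawl_errors.csv')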
import time

import pika
import requests
from pymongo import MongoClient

# Project-local helpers and metrics; the import paths below are assumptions.
# Bucket wraps cloud-storage access, model_picker loads the model named by
# the queue, extract_features runs inference, and FAILURE_COUNTER /
# REQUEST_TIME are prometheus_client metrics.
from bucket import Bucket
from logger import Logger
from models import model_picker, extract_features
from metrics import FAILURE_COUNTER, REQUEST_TIME


class Worker:
    def __init__(self, params):
        queue_name = params["queue_name"]
        model_url = params["model_url"]
        host_name = params.get("rabbitmq_hostname", "localhost")
        mongo_address = params.get("mongo_address", "localhost:27017")
        self.bucket_name = params["bucket_name"]
        self.deduplicate_model = params["deduplicate_model"]
        self.deduplicate_threshold = params["deduplicate_threshold"]
        self.logger = Logger()

        # Retry until the RabbitMQ broker accepts the connection.
        while True:
            try:
                if self.set_up_rabbitmq_connection(host_name, queue_name):
                    break
            except Exception as e:
                self.logger.error(
                    f"Failed to connect to rabbitmq queue {queue_name} "
                    f"at {host_name}. Reason: {e}")
                time.sleep(3)

        self.num_threads = 4
        self.model_name = queue_name
        self.bucket_handler = Bucket(self.bucket_name)
        self.logger.info(f"Extract worker for model: {queue_name}")
        self.model = model_picker(queue_name, model_url)

        self.logger.info(f"Connecting to mongodb at {mongo_address}")
        client = MongoClient(mongo_address)
        self.db = client.features

        # Start consuming; this blocks until the channel is stopped.
        self.channel.start_consuming()
        self.connection.close()

    def set_up_rabbitmq_connection(self, host_name, queue_name):
        credentials = pika.PlainCredentials('admin', 'admin')
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host_name, credentials=credentials))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name, durable=True)
        # Bind this worker's queue to a shared fanout exchange so every
        # model's queue receives a copy of each published file name.
        self.channel_name = "features"
        self.channel.exchange_declare(exchange=self.channel_name,
                                      exchange_type="fanout",
                                      durable=True)
        self.channel.queue_bind(exchange=self.channel_name, queue=queue_name)
        self.channel.basic_qos(prefetch_count=20)
        # Set up the subscription on the queue.
        self.channel.basic_consume(queue_name, self.process)
        return True

    def get_public_url(self, file_name):
        return f"https://storage.googleapis.com/{self.bucket_name}/{file_name}"

    def check_duplication(self, img_name, feature):
        """Ask the dedup serving endpoint whether this feature already exists."""
        response = requests.post(
            f"http://serving-{self.deduplicate_model}:5000/search?json=true",
            json=feature.tolist())
        if response.status_code != 200:
            print(f"Deduplicate request failed for image {img_name}")
            return False
        result = response.json()
        if len(result) == 0:
            return False
        best_match = result[0]["distance"]
        is_duplicated = best_match <= self.deduplicate_threshold
        if is_duplicated:
            print(f"Image {img_name} already exists")
            self.channel.basic_publish(exchange="",
                                       routing_key="duplicated_files",
                                       body=img_name)
        return is_duplicated

    @FAILURE_COUNTER.count_exceptions()
    @REQUEST_TIME.time()
    def process(self, ch, method, properties, file_name):
        """Download a file, extract its feature vector, and store it."""
        file_name = file_name.decode()
        print(f"Processing file {file_name}")
        downloaded_dir = "./tmp"
        local_file_path = self.bucket_handler.download(file_name,
                                                       downloaded_dir)
        feature = extract_features(local_file_path, self.model)
        if self.deduplicate_model:
            is_duplicated = self.check_duplication(file_name, feature)
            if is_duplicated:
                # Duplicate: acknowledge and skip the database insert.
                self.channel.basic_ack(delivery_tag=method.delivery_tag)
                return
        self.db[self.model_name].insert_one({
            "url": self.get_public_url(file_name),
            "feature": feature.tolist()
        })
        self.channel.basic_ack(delivery_tag=method.delivery_tag)
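
# Usage sketch (illustrative): a minimal entry point for the worker. All
# parameter values below are placeholders; in a real deployment they would
# come from the service's configuration. The constructor blocks on
# start_consuming() until the channel is stopped.
if __name__ == "__main__":
    Worker({
        "queue_name": "resnet",
        "model_url": "http://serving-resnet:5000",
        "rabbitmq_hostname": "rabbitmq",
        "mongo_address": "mongodb:27017",
        "bucket_name": "my-feature-bucket",
        "deduplicate_model": "resnet",
        "deduplicate_threshold": 0.1,
    })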