from bucket import Bucket
from dupestats import DupeStats
from authorset import AuthorSet
from rewriter import rewritechart

# Fetch the chart bucket and unpack its archives locally.
bucket = Bucket()
bucket.download()
bucket.extract()

# Collect duplicate statistics across all extracted charts.
stats = DupeStats()
stats.dupe_stats()

# Build the author data set, its proposals, and the visual outputs.
author_set = AuthorSet()
author_set.preprocess()
author_set.process()
author_set.processproposals()
author_set.heatmap()
author_set.dot()

# Rewrite each chart (in sorted order) with its list of duplicates.
for chart_name in sorted(stats.get_charts()):
    duplicates = stats.dupefinder(chart_name)
    rewritechart(chart_name, duplicates)
class BucketTest(unittest.TestCase):
    """Unit tests for the Bucket helper class."""

    def setUp(self):
        # A fresh Bucket for every test method.
        self.bucket = Bucket()

    def test_scan(self):
        """
        scan() should return the object keys and the pagination marker
        parsed from an S3-style bucket listing.
        :return:
        """
        listing = '<ListBucketResult xmlns="http://doc.s3.amazonaws.com/2006-03-01"><Name>kubernetes-charts</Name><Prefix' \
                  '/><Marker/><NextMarker>kapacitor-0.3.0.tgz</NextMarker><IsTruncated>true</IsTruncated><Contents><Key' \
                  '>acs-engine-autoscaler-0.1.0.tgz</Key><Generation>1501637633913843</Generation><MetaGeneration>1' \
                  '</MetaGeneration><LastModified>2017-08-02T01:33:53.600Z</LastModified><ETag' \
                  '>"7ba1dd9555e78f23eac07a7223cdad18"</ETag><Size>4069</Size></Contents><Contents><Key>acs-engine' \
                  '-autoscaler-1.0.0.tgz</Key><Generation>1505061247273212</Generation><MetaGeneration>1</MetaGeneration' \
                  '><LastModified>2017-09-10T16:34:07.187Z</LastModified><ETag>"fcea91b52795fb8576be7a62ceebb731"</ETag' \
                  '><Size>4229</Size></Contents></ListBucketResult>'
        parsed = xml_dom.parseString(listing)

        keys, next_marker = self.bucket.scan(parsed)

        expected_keys = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'acs-engine-autoscaler-1.0.0.tgz'
        ]
        self.assertEqual(keys, expected_keys)
        self.assertEqual(next_marker, 'kapacitor-0.3.0.tgz')

    def test_get_releases(self):
        """
        get_releases() should keep one entry per release name, preferring
        the newest version when names collide.
        :return:
        """
        distinct_names = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'artifactory-ha-0.1.7.tgz'
        ]
        colliding_names = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'acs-engine-autoscaler-1.0.0.tgz'
        ]

        expected_distinct = {
            'acs-engine-autoscaler': '0.1.0.tgz',
            'artifactory-ha': '0.1.7.tgz'
        }
        expected_colliding = {'acs-engine-autoscaler': '1.0.0.tgz'}

        self.assertEqual(self.bucket.get_releases(colliding_names),
                         expected_colliding)
        self.assertEqual(self.bucket.get_releases(distinct_names),
                         expected_distinct)

    def test_bucket_class(self):
        """
        End-to-end test for the Bucket class. At the moment, it'll only
        work if the _charts and _descriptor directories don't exist.
        :return:
        """
        self.bucket.download()
        self.bucket.extract()

        chart_count = len(os.listdir(self.bucket.path))
        template_count = len(os.listdir(self.bucket.descriptor))
        releases_count = len(self.bucket.releases)
        print(f"chart_count: {chart_count}")
        print(f"template_count: {template_count}")
        print(f"releases_count: {releases_count}")

        self.assertEqual(chart_count, template_count)
        # NOTE(review): `chart_count or template_count` falls back to
        # template_count only when chart_count is 0 — presumably intentional
        # given the assertEqual above; confirm with the original author.
        self.assertEqual(chart_count or template_count, releases_count)
class Worker:
    """Feature-extraction worker.

    Consumes file names from a RabbitMQ queue, downloads each file from a
    storage bucket, extracts a feature vector with the configured model and
    stores it in MongoDB — optionally skipping images the deduplication
    service already knows about.
    """

    def __init__(self, params):
        """
        :param params: dict with required keys "queue_name", "model_url",
            "bucket_name", "deduplicate_model", "deduplicate_threshold" and
            optional "rabbitmq_hostname" (default "localhost") and
            "mongo_address" (default "localhost:27017").
        Blocks in start_consuming() until the channel is closed.
        """
        queue_name = params["queue_name"]
        model_url = params["model_url"]
        host_name = params.get("rabbitmq_hostname", "localhost")
        mongo_address = params.get("mongo_address", "localhost:27017")
        self.bucket_name = params["bucket_name"]
        self.deduplicate_model = params["deduplicate_model"]
        self.deduplicate_threshold = params["deduplicate_threshold"]
        self.logger = Logger()

        # Retry until the broker accepts a connection (e.g. while RabbitMQ
        # is still starting up in the same deployment).
        while True:
            try:
                if self.set_up_rabbitmq_connection(host_name, queue_name):
                    break
            except Exception as e:
                self.logger.error(
                    f"Failed to connect to rabbitmq queue {queue_name} at {host_name}. Reason: {e}"
                )
                time.sleep(3)
                continue

        self.num_threads = 4
        self.model_name = queue_name
        # BUG FIX: the original passed the undefined local `bucket_name`
        # (NameError at runtime); the value is stored on self.bucket_name.
        self.bucket_handler = Bucket(self.bucket_name)
        self.logger.info(f"Extract worker for model: {queue_name}")
        self.model = model_picker(queue_name, model_url)
        self.logger.info(f"Connecting to mongodb at {mongo_address}")
        client = MongoClient(mongo_address)
        self.db = client.features

        # Start consuming — blocks until the channel is closed.
        self.channel.start_consuming()
        self.connection.close()

    def set_up_rabbitmq_connection(self, host_name, queue_name):
        """Open a blocking connection, declare and bind the durable queue,
        and register self.process as the consumer callback.

        :return: True once the subscription is set up.
        """
        credentials = pika.PlainCredentials('admin', 'admin')
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host_name, credentials=credentials))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name, durable=True)
        self.channel_name = "features"
        self.channel.exchange_declare(exchange=self.channel_name,
                                      exchange_type="fanout",
                                      durable=True)
        self.channel.queue_bind(exchange=self.channel_name, queue=queue_name)
        # Cap unacked deliveries so slow extraction doesn't hoard messages.
        self.channel.basic_qos(prefetch_count=20)
        # Set up the subscription on the queue.
        self.channel.basic_consume(queue_name, self.process)
        return True

    def get_public_url(self, file_name):
        """Return the public GCS URL for an object in this worker's bucket."""
        return f"https://storage.googleapis.com/{self.bucket_name}/{file_name}"

    def check_duplication(self, img_name, feature):
        """Ask the dedup serving model whether `feature` already exists.

        Publishes duplicate image names to the "duplicated_files" queue.
        :return: True when the best match is within the configured threshold.
        """
        response = requests.post(
            f"http://serving-{self.deduplicate_model}:5000/search?json=true",
            json=feature.tolist())
        if response.status_code != 200:
            # Best effort: a failed lookup is treated as "not a duplicate"
            # so the pipeline keeps moving.
            print(f"Deduplicate request fails for image {img_name}")
            return False
        result = response.json()
        if not result:
            return False
        best_match = result[0]["distance"]
        is_duplicated = best_match <= self.deduplicate_threshold
        if is_duplicated:
            print(f"Image {img_name} already exists")
            self.channel.basic_publish(exchange="",
                                       routing_key="duplicated_files",
                                       body=img_name)
        return is_duplicated

    @FAILURE_COUNTER.count_exceptions()
    @REQUEST_TIME.time()
    def process(self, ch, method, properties, file_name):
        """RabbitMQ callback: download the file, extract its features,
        store them in MongoDB, then ack the delivery.

        :param file_name: message body — the object name as bytes.
        """
        file_name = file_name.decode()
        print(f"Processing file {file_name}")
        downloaded_dir = "./tmp"
        local_file_path = self.bucket_handler.download(file_name, downloaded_dir)
        feature = extract_features(local_file_path, self.model)
        if self.deduplicate_model:
            if self.check_duplication(file_name, feature):
                # Known duplicate: ack without storing a second copy.
                self.channel.basic_ack(delivery_tag=method.delivery_tag)
                return
        self.db[self.model_name].insert_one({
            "url": self.get_public_url(file_name),
            "feature": feature.tolist()
        })
        self.channel.basic_ack(delivery_tag=method.delivery_tag)