Beispiel #1
0
from bucket import Bucket
from dupestats import DupeStats
from authorset import AuthorSet
from rewriter import rewritechart

# Pipeline driver: every stage below runs in order as soon as this module is
# executed. The stages are plain sequential calls, so their order matters.

# Stage 1: download and extract via the Bucket helper.
# NOTE(review): presumably this fetches chart archives and unpacks them
# locally -- confirm against bucket.py.
bucket = Bucket()
bucket.download()
bucket.extract()

# Stage 2: compute duplicate statistics; `stat` is reused below to list the
# charts and to find duplicates per chart.
stat = DupeStats()
stat.dupe_stats()

# Stage 3: author analysis; each call is one step of the AuthorSet workflow
# (preprocess -> process -> processproposals -> heatmap -> dot).
# NOTE(review): heatmap()/dot() look like output-producing steps -- confirm.
authorSet = AuthorSet()
authorSet.preprocess()
authorSet.process()
authorSet.processproposals()
authorSet.heatmap()
authorSet.dot()

# Sort the charts so they are rewritten in a stable order.
charts = sorted(stat.get_charts())

# Stage 4: for every chart, look up its duplicates and hand both to the
# rewriter.
for chart in charts:
    dupeslist = stat.dupefinder(chart)
    rewritechart(chart, dupeslist)
class BucketTest(unittest.TestCase):
    """Tests for ``Bucket``: listing parsing, release selection and download/extract."""

    def setUp(self):
        """Give every test its own ``Bucket`` instance."""
        self.bucket = Bucket()

    def test_scan(self):
        """
        Test for the scan method.

        Feeds a canned ``ListBucketResult`` document with two ``<Key>``
        entries and a ``<NextMarker>`` to ``Bucket.scan`` and expects the two
        keys (in document order) plus the marker back.
        :return:
        """
        listing_xml = (
            '<ListBucketResult xmlns="http://doc.s3.amazonaws.com/2006-03-01"><Name>kubernetes-charts</Name><Prefix'
            '/><Marker/><NextMarker>kapacitor-0.3.0.tgz</NextMarker><IsTruncated>true</IsTruncated><Contents><Key'
            '>acs-engine-autoscaler-0.1.0.tgz</Key><Generation>1501637633913843</Generation><MetaGeneration>1'
            '</MetaGeneration><LastModified>2017-08-02T01:33:53.600Z</LastModified><ETag'
            '>"7ba1dd9555e78f23eac07a7223cdad18"</ETag><Size>4069</Size></Contents><Contents><Key>acs-engine'
            '-autoscaler-1.0.0.tgz</Key><Generation>1505061247273212</Generation><MetaGeneration>1</MetaGeneration'
            '><LastModified>2017-09-10T16:34:07.187Z</LastModified><ETag>"fcea91b52795fb8576be7a62ceebb731"</ETag'
            '><Size>4229</Size></Contents></ListBucketResult>'
        )

        # Keep the raw text and the parsed document under separate names.
        dom = xml_dom.parseString(listing_xml)
        keys, next_marker = self.bucket.scan(dom)

        valid_keys = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'acs-engine-autoscaler-1.0.0.tgz',
        ]
        valid_next_marker = 'kapacitor-0.3.0.tgz'

        self.assertEqual(keys, valid_keys)
        self.assertEqual(next_marker, valid_next_marker)

    def test_get_releases(self):
        """
        Test for the get_releases method.

        Two archives of different charts must yield one entry per chart; two
        archives of the same chart must collapse to a single entry holding
        the ``1.0.0.tgz`` version.
        :return:
        """
        diff_rel = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'artifactory-ha-0.1.7.tgz',
        ]
        same_rel = [
            'acs-engine-autoscaler-0.1.0.tgz',
            'acs-engine-autoscaler-1.0.0.tgz',
        ]

        valid_diff_rel = {
            'acs-engine-autoscaler': '0.1.0.tgz',
            'artifactory-ha': '0.1.7.tgz',
        }
        valid_same_rel = {'acs-engine-autoscaler': '1.0.0.tgz'}

        diff_releases = self.bucket.get_releases(diff_rel)
        same_releases = self.bucket.get_releases(same_rel)

        self.assertEqual(same_releases, valid_same_rel)
        self.assertEqual(diff_releases, valid_diff_rel)

    def test_bucket_class(self):
        """
        Test for the Bucket class.

        Runs a real download + extract and checks that the number of entries
        in ``bucket.path``, the number of entries in ``bucket.descriptor`` and
        the number of known releases all agree.
        At the moment, it'll only work if the _charts and _descriptor directories don't exist.
        :return:
        """
        self.bucket.download()
        self.bucket.extract()

        chart_count = len(os.listdir(self.bucket.path))
        template_count = len(os.listdir(self.bucket.descriptor))
        releases_count = len(self.bucket.releases)

        # Diagnostic output to make a count mismatch easy to read in the log.
        print(f"chart_count: {chart_count}")
        print(f"template_count: {template_count}")
        print(f"releases_count: {releases_count}")

        self.assertEqual(chart_count, template_count)
        # Once the two directory counts are known to be equal, comparing one
        # of them with the release count is sufficient (the former
        # ``chart_count or template_count`` always evaluated to chart_count).
        self.assertEqual(chart_count, releases_count)
Beispiel #3
0
class Worker:
    """Queue consumer that extracts a feature vector per file and stores it in MongoDB.

    Each message body received on the RabbitMQ queue is treated as a file
    name: the file is downloaded through the bucket handler, features are
    extracted with the selected model, optionally checked against a
    deduplication service, and finally inserted into the ``features``
    database under a collection named after the queue.
    """

    def __init__(self, params):
        """Set up all connections and start consuming.

        NOTE: the constructor does not return until consuming stops, because
        it ends by calling ``start_consuming()`` on the channel.

        Args:
            params: mapping with the required keys ``queue_name``,
                ``model_url``, ``bucket_name``, ``deduplicate_model`` and
                ``deduplicate_threshold``; optional keys are
                ``rabbitmq_hostname`` (default ``"localhost"``) and
                ``mongo_address`` (default ``"localhost:27017"``).

        Raises:
            KeyError: if a required key is missing from ``params``.
        """
        queue_name = params["queue_name"]
        model_url = params["model_url"]
        host_name = params.get("rabbitmq_hostname", "localhost")
        mongo_address = params.get("mongo_address", "localhost:27017")
        self.bucket_name = params["bucket_name"]
        self.deduplicate_model = params["deduplicate_model"]
        self.deduplicate_threshold = params["deduplicate_threshold"]
        self.logger = Logger()

        # Retry forever until the broker is reachable. The broad catch is a
        # deliberate best-effort: any setup failure is logged and retried.
        while True:
            try:
                if self.set_up_rabbitmq_connection(host_name, queue_name):
                    break
            except Exception as e:
                self.logger.error(
                    f"Failed to connect to rabbitmq queue {queue_name} at {host_name}. Reason: {e}"
                )
            # Back off after every unsuccessful attempt (exception or falsy
            # result) so the loop never spins without pausing.
            time.sleep(3)

        self.num_threads = 4
        self.model_name = queue_name

        # Use the attribute: there is no local variable named ``bucket_name``.
        self.bucket_handler = Bucket(self.bucket_name)

        self.logger.info(f"Extract worker for model: {queue_name}")
        self.model = model_picker(queue_name, model_url)

        self.logger.info(f"Connecting to mongodb at {mongo_address}")
        client = MongoClient(mongo_address)
        self.db = client.features

        # start consuming (blocks)
        try:
            self.channel.start_consuming()
        finally:
            # Release the broker connection even when consuming is aborted by
            # an exception; skip it if the connection is already gone.
            if self.connection.is_open:
                self.connection.close()

    def set_up_rabbitmq_connection(self, host_name, queue_name):
        """Open the RabbitMQ connection and subscribe ``process`` to the queue.

        Declares the durable queue and the durable fanout exchange
        ``"features"``, binds them, limits prefetch to 20 messages and
        registers ``self.process`` as the consumer callback.

        Args:
            host_name: RabbitMQ host to connect to.
            queue_name: name of the queue to declare and consume from.

        Returns:
            ``True`` once the setup has completed; failures surface as
            exceptions raised by pika.
        """
        # NOTE(review): credentials are hard-coded; consider making them
        # configurable.
        credentials = pika.PlainCredentials('admin', 'admin')
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host_name, credentials=credentials))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name, durable=True)
        self.channel_name = "features"
        self.channel.exchange_declare(exchange=self.channel_name,
                                      exchange_type="fanout",
                                      durable=True)
        self.channel.queue_bind(exchange=self.channel_name, queue=queue_name)
        self.channel.basic_qos(prefetch_count=20)
        # set up subscription on the queue
        self.channel.basic_consume(queue_name, self.process)
        return True

    def get_public_url(self, file_name):
        """Return the ``storage.googleapis.com`` URL of ``file_name`` in this worker's bucket."""
        return f"https://storage.googleapis.com/{self.bucket_name}/{file_name}"

    def check_duplication(self, img_name, feature):
        """Ask the deduplication service whether ``feature`` matches a stored one.

        Posts the feature (converted with ``tolist()``) to the search
        endpoint of ``serving-<deduplicate_model>``. When the first result's
        ``distance`` is at most ``deduplicate_threshold`` the image counts as
        a duplicate and its name is published to the ``duplicated_files``
        routing key.

        Args:
            img_name: name of the image being checked (used in log messages
                and as the published body).
            feature: feature vector; must provide ``tolist()``.

        Returns:
            ``True`` if the image is considered a duplicate, ``False``
            otherwise -- including when the request does not return HTTP 200
            or the result list is empty.
        """
        response = requests.post(
            f"http://serving-{self.deduplicate_model}:5000/search?json=true",
            json=feature.tolist())
        if response.status_code != 200:
            # A failed lookup is treated as "not a duplicate" (best effort).
            self.logger.error(
                f"Deduplicate request fails for image {img_name}")
            return False

        result = response.json()
        if not result:
            return False

        # Only the first result is inspected.
        # NOTE(review): presumably results are ordered best match first --
        # confirm against the search service.
        best_match = result[0]["distance"]
        is_duplicated = best_match <= self.deduplicate_threshold
        if is_duplicated:
            self.logger.info(f"Image {img_name} already exists")
            self.channel.basic_publish(exchange="",
                                       routing_key="duplicated_files",
                                       body=img_name)
        return is_duplicated

    @FAILURE_COUNTER.count_exceptions()
    @REQUEST_TIME.time()
    def process(self, ch, method, properties, file_name):
        """Consumer callback: handle one queued file name.

        Downloads the file into ``./tmp``, extracts its features, and --
        unless the deduplication check reports a duplicate -- stores the
        public URL and the feature list in MongoDB. The message is
        acknowledged in both outcomes; if any step raises, it is left
        unacknowledged.

        Args:
            ch: channel passed by pika (unused; ``self.channel`` is used).
            method: delivery metadata; its ``delivery_tag`` is acknowledged.
            properties: message properties passed by pika (unused).
            file_name: message body as bytes, decoded to the file name.
        """
        file_name = file_name.decode()
        self.logger.info(f"Processing file {file_name}")
        downloaded_dir = "./tmp"
        local_file_path = self.bucket_handler.download(file_name,
                                                       downloaded_dir)
        feature = extract_features(local_file_path, self.model)

        # Deduplication is optional: a falsy model name disables the check.
        if self.deduplicate_model and self.check_duplication(
                file_name, feature):
            self.channel.basic_ack(delivery_tag=method.delivery_tag)
            return

        self.db[self.model_name].insert_one({
            "url": self.get_public_url(file_name),
            "feature": feature.tolist(),
        })
        self.channel.basic_ack(delivery_tag=method.delivery_tag)