    def get_multiple_results(self,
                             keys,
                             cl_engine=forge.get_classification(),
                             as_obj=False):
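        """Fetch multiple result records at once.

        Keys ending in ".e" denote empty results and are synthesized locally with
        create_empty_result_from_key instead of being fetched. If some of the
        remaining keys are missing from the datastore, a warning is logged and the
        partial output is merged into the returned dictionary.
        """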
        results = {
            k: self.create_empty_result_from_key(k, cl_engine, as_obj=as_obj)
            for k in keys if k.endswith(".e")
        }
        keys = [k for k in keys if not k.endswith(".e")]
        try:
            results.update(
                self.result.multiget(keys, as_dictionary=True, as_obj=as_obj))
        except MultiKeyError as e:
            log.warning(
                f"Trying to get multiple results but some are missing: {str(e.keys)}"
            )
            results.update(e.partial_output)
        return results
    def _ensure_collection(self):
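        """Make sure the Elasticsearch resources backing this collection exist.

        Creates the HOT index if it is missing and, when ILM is configured, the
        ILM policy, the WARM index template and the archive write alias. Creation
        races with other workers are tolerated by ignoring
        resource_already_exists_exception errors.
        """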
        # Create HOT index
        if not self.with_retries(self.datastore.client.indices.exists, self.name):
            log.debug(f"Index {self.name.upper()} does not exists. Creating it now...")
            try:
                self.with_retries(self.datastore.client.indices.create, self.name, self._get_index_definition())
            except elasticsearch.exceptions.RequestError as e:
                if "resource_already_exists_exception" not in str(e):
                    raise
                log.warning(f"Tried to create an index template that already exists: {self.name.upper()}")

        if self.ilm_config:
            # Create ILM policy
            while not self._ilm_policy_exists():
                try:
                    self.with_retries(self._create_ilm_policy)
                except ILMException:
                    time.sleep(0.1)

            # Create WARM index template
            if not self.with_retries(self.datastore.client.indices.exists_template, self.name):
                log.debug(f"Index template {self.name.upper()} does not exists. Creating it now...")

                index = self._get_index_definition()

                index["index_patterns"] = [f"{self.name}-*"]
                index["order"] = 1
                index["settings"]["index.lifecycle.name"] = f"{self.name}_policy"
                index["settings"]["index.lifecycle.rollover_alias"] = f"{self.name}-archive"

                try:
                    self.with_retries(self.datastore.client.indices.put_template, self.name, index)
                except elasticsearch.exceptions.RequestError as e:
                    if "resource_already_exists_exception" not in str(e):
                        raise
                    log.warning(f"Tried to create an index template that already exists: {self.name.upper()}")

            if not self.with_retries(self.datastore.client.indices.exists_alias, f"{self.name}-archive"):
                log.debug(f"Index alias {self.name.upper()}-archive does not exists. Creating it now...")

                index = {"aliases": {f"{self.name}-archive": {"is_write_index": True}}}

                try:
                    self.with_retries(self.datastore.client.indices.create, f"{self.name}-000001", index)
                except elasticsearch.exceptions.RequestError as e:
                    if "resource_already_exists_exception" not in str(e):
                        raise
                    log.warning(f"Tried to create an index template that already exists: {self.name.upper()}-000001")

        self._check_fields()
    def multi_index_bulk(self, bulk_plans):
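        """Execute several bulk plans against Elasticsearch in a single request.

        The plans are concatenated into one bulk body. Connection problems and
        retryable errors (503: index not ready, 429: cluster too busy) trigger a
        connection reset and a capped linear backoff before retrying; any other
        transport error is re-raised.
        """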
        max_retry_backoff = 10
        retries = 0
        while True:
            try:
                plan = "\n".join([p.get_plan_data() for p in bulk_plans])
                ret_val = self.ds.client.bulk(body=plan)
                return ret_val
            except (elasticsearch.exceptions.ConnectionError,
                    elasticsearch.exceptions.ConnectionTimeout,
                    elasticsearch.exceptions.AuthenticationException):
                log.warning(f"No connection to Elasticsearch server(s): "
                            f"{' | '.join(self.ds.get_hosts(safe=True))}"
                            f", retrying...")
                time.sleep(min(retries, max_retry_backoff))
                self.ds.connection_reset()
                retries += 1

            except elasticsearch.exceptions.TransportError as e:
                err_code, msg, cause = e.args
                if err_code == 503 or err_code == '503':
                    log.warning(
                        "Looks like index is not ready yet, retrying...")
                    time.sleep(min(retries, max_retry_backoff))
                    self.ds.connection_reset()
                    retries += 1
                elif err_code == 429 or err_code == '429':
                    log.warning(
                        "Elasticsearch is too busy to perform the requested task, "
                        "we will wait a bit and retry...")
                    time.sleep(min(retries, max_retry_backoff))
                    self.ds.connection_reset()
                    retries += 1

                else:
                    raise
    def get_summary_from_keys(self,
                              keys,
                              cl_engine=forge.get_classification(),
                              user_classification=None):
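        """Build a summary (tags, ATT&CK matrix, heuristics) from a list of result keys.

        Empty results (keys ending in ".e") are skipped. Sections the user is not
        allowed to see are dropped and the summary is flagged as filtered. Missing
        results or files are reported in the output rather than raising.
        """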
        out = {
            "tags": [],
            "attack_matrix": [],
            "heuristics": {
                "info": [],
                "suspicious": [],
                "malicious": []
            },
            "classification": cl_engine.UNRESTRICTED,
            "filtered": False
        }
        done_map = {"heuristics": set(), "attack": set(), "tags": set()}

        if len(keys) == 0:
            return out

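        # Empty results (".e" keys) carry no sections; the first 64 characters of a
        # result key are the sha256 of the file it belongs to.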
        keys = [x for x in list(keys) if not x.endswith(".e")]
        file_keys = list(set([x[:64] for x in keys]))
        try:
            items = self.result.multiget(keys, as_obj=False)
        except MultiKeyError as e:
            # Generate partial summaries even if results are missing
            log.warning(
                f"Trying to generate summary but we are missing result(s): {str(e.keys)}"
            )
            items = e.partial_output
            out['missing_results'] = e.keys
        try:
            files = self.file.multiget(file_keys, as_obj=False)
        except MultiKeyError as e:
            # Generate partial summaries even if results are missing
            log.warning(
                f"Trying to generate summary but we are missing file(s): {str(e.keys)}"
            )
            files = e.partial_output
            out['missing_files'] = e.keys

        for key, item in items.items():
            for section in item.get('result', {}).get('sections', []):
                file_classification = files.get(key[:64], {}).get(
                    'classification', section['classification'])
                if user_classification:
                    if not cl_engine.is_accessible(user_classification,
                                                   section['classification']):
                        out["filtered"] = True
                        continue
                    if not cl_engine.is_accessible(user_classification,
                                                   file_classification):
                        out["filtered"] = True
                        continue

                out["classification"] = cl_engine.max_classification(
                    out["classification"], section['classification'])
                out["classification"] = cl_engine.max_classification(
                    out["classification"], file_classification)

                h_type = "info"

                if section.get('heuristic', False):
                    # Get the heuristics data
                    if section['heuristic']['score'] < 100:
                        h_type = "info"
                    elif section['heuristic']['score'] < 1000:
                        h_type = "suspicious"
                    else:
                        h_type = "malicious"

                    cache_key = f"{section['heuristic']['heur_id']}_{key}"
                    if cache_key not in done_map['heuristics']:
                        out['heuristics'][h_type].append({
                            'heur_id': section['heuristic']['heur_id'],
                            'name': section['heuristic']['name'],
                            'key': key
                        })
                        done_map['heuristics'].add(cache_key)

                    for attack in section['heuristic'].get('attack', []):
                        # Get attack matrix data
                        attack_id = attack['attack_id']

                        cache_key = f"{attack_id}_{key}"
                        if cache_key not in done_map['attack']:
                            out['attack_matrix'].append({
                                "key": key,
                                "attack_id": attack_id,
                                "h_type": h_type,
                                "name": attack['pattern'],
                                "categories": attack['categories']
                            })
                            done_map['attack'].add(cache_key)

                # Get tagging data
                for tag_type, tags in flatten(section.get('tags', {})).items():
                    if tags is not None:
                        for tag in tags:
                            cache_key = f"{tag_type}_{tag}_{key}"

                            if cache_key not in done_map['tags']:
                                out['tags'].append({
                                    'type': tag_type,
                                    'h_type': h_type,
                                    'short_type': tag_type.rsplit(".", 1)[-1],
                                    'value': tag,
                                    'key': key
                                })
                                done_map['tags'].add(cache_key)

        return out
    def get_or_create_file_tree(self,
                                submission,
                                max_depth,
                                cl_engine=forge.get_classification(),
                                user_classification=None):
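        """Return the file tree for a submission, building and caching it if needed.

        A cached tree is reused only when it still matches the submission's file
        count and max score. Otherwise the tree is rebuilt from the file and result
        records, honouring the user's classification, and cached again unless it is
        partial (missing or forbidden files).
        """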
        if user_classification is not None:
            user_classification = cl_engine.normalize_classification(
                user_classification, long_format=False)
            cache_key = f"{submission['sid']}_{user_classification}"
            for illegal_char in [" ", ":", "/"]:
                cache_key = cache_key.replace(illegal_char, "")
        else:
            cache_key = submission['sid']

        if isinstance(submission, Model):
            submission = submission.as_primitives()

        num_files = len(list(set([x[:64] for x in submission['results']])))
        max_score = submission['max_score']

        cached_tree = self.submission_tree.get_if_exists(cache_key,
                                                         as_obj=False)
        if cached_tree:
            tree = json.loads(cached_tree['tree'])
            if self._is_valid_tree(tree, num_files, max_score):
                return {
                    "tree": tree,
                    "classification": cached_tree['classification'],
                    "filtered": cached_tree['filtered'],
                    "partial": False
                }

        partial = False
        files = {}
        scores = {}
        missing_files = []
        file_hashes = [x[:64] for x in submission['results']]
        file_hashes.extend([x[:64] for x in submission['errors']])
        file_hashes.extend([f['sha256'] for f in submission['files']])
        try:
            temp_file_data_map = self.file.multiget(list(set(file_hashes)),
                                                    as_dictionary=True,
                                                    as_obj=False)
        except MultiKeyError as e:
            log.warning(
                f"Trying to generate file tree but we are missing file(s): {str(e.keys)}"
            )
            temp_file_data_map = e.partial_output
            missing_files = e.keys
            partial = True
        forbidden_files = set()

        max_classification = cl_engine.UNRESTRICTED
        file_data_map = {}
        for key, value in temp_file_data_map.items():
            if user_classification and not cl_engine.is_accessible(
                    user_classification, value['classification']):
                partial = True
                forbidden_files.add(key)
                continue
            file_data_map[key] = value
            max_classification = cl_engine.max_classification(
                max_classification, value['classification'])

        try:
            results_data = self.result.multiget(
                [x for x in submission['results'] if not x.endswith(".e")],
                as_obj=False)
        except MultiKeyError as e:
            log.warning(
                f"Trying to generate file tree but we are missing result(s): {str(e.keys)}"
            )
            results_data = e.partial_output
            partial = True

        for key, item in results_data.items():
            sha256 = key[:64]

            # Get scores
            if sha256 not in scores:
                scores[sha256] = 0
            scores[sha256] += item["result"]["score"]

            # Get files
            extracted = item['response']['extracted']
            if len(extracted) == 0:
                continue
            if sha256 not in files:
                files[sha256] = []
            files[sha256].extend(extracted)

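        # sha256s already attached somewhere in the tree; hitting one again marks the
        # parent node as truncated instead of duplicating the subtree.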
        tree_cache = []

        def recurse_tree(child_p, placeholder, parents_p, lvl=0):
            if lvl == max_depth + 1:
                # Enforce depth protection while building the tree
                return

            c_sha256 = child_p['sha256']
            c_name = child_p['name']
            if c_sha256 in placeholder:
                placeholder[c_sha256]['name'].append(c_name)
            else:
                children_list = {}
                truncated = False
                child_list = files.get(c_sha256, [])
                for new_child in child_list:
                    if new_child['sha256'] in tree_cache:
                        truncated = True
                        continue
                    tree_cache.append(new_child['sha256'])

                    if new_child['sha256'] not in parents_p:
                        recurse_tree(new_child, children_list,
                                     parents_p + [c_sha256], lvl + 1)

                try:
                    placeholder[c_sha256] = {
                        "name": [c_name],
                        "type": file_data_map[c_sha256]['type'],
                        "sha256": file_data_map[c_sha256]['sha256'],
                        "size": file_data_map[c_sha256]['size'],
                        "children": children_list,
                        "truncated": truncated,
                        "score": scores.get(c_sha256, 0),
                    }
                except KeyError:
                    if c_sha256 not in forbidden_files and c_sha256 not in missing_files:
                        file_data_map[c_sha256] = self.file.get(c_sha256,
                                                                as_obj=False)
                        placeholder[c_sha256] = {
                            "name": [c_name],
                            "type": file_data_map[c_sha256]['type'],
                            "sha256": file_data_map[c_sha256]['sha256'],
                            "size": file_data_map[c_sha256]['size'],
                            "children": children_list,
                            "truncated": truncated,
                            "score": scores.get(c_sha256, 0),
                        }

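        # Build the tree from the submission's root files, then recurse into their
        # extracted children up to max_depth.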
        tree = {}
        for f in submission['files']:
            sha256 = f['sha256']
            name = f['name']

            if sha256 in tree:
                tree[sha256]['name'].append(name)
            else:
                parents = [sha256]
                children = {}
                c_list = files.get(sha256, [])
                for child in c_list:
                    tree_cache.append(child['sha256'])
                    recurse_tree(child, children, parents)

                try:
                    tree[sha256] = {
                        "name": [name],
                        "children": children,
                        "type": file_data_map[sha256]['type'],
                        "sha256": file_data_map[sha256]['sha256'],
                        "size": file_data_map[sha256]['size'],
                        "truncated": False,
                        "score": scores.get(sha256, 0),
                    }
                except KeyError:
                    if sha256 not in forbidden_files and sha256 not in missing_files:
                        file_data_map[sha256] = self.file.get(sha256,
                                                              as_obj=False)
                        tree[sha256] = {
                            "name": [name],
                            "children": children,
                            "type": file_data_map[sha256]['type'],
                            "sha256": file_data_map[sha256]['sha256'],
                            "size": file_data_map[sha256]['size'],
                            "truncated": False,
                            "score": scores.get(sha256, 0),
                        }

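        # Only cache complete trees; partial ones (missing or forbidden files) will be
        # rebuilt on the next request.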
        if not partial:
            cached_tree = {
                'expiry_ts': now_as_iso(days_until_archive * 24 * 60 * 60),
                'tree': json.dumps(tree),
                'classification': max_classification,
                'filtered': len(forbidden_files) > 0
            }

            self.submission_tree.save(cache_key, cached_tree)

        return {
            'tree': tree,
            'classification': max_classification,
            'filtered': len(forbidden_files) > 0,
            'partial': partial
        }
    def with_retries(self, func, *args, **kwargs):
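        """Call func(*args, **kwargs), retrying on recoverable Elasticsearch errors.

        Missing indices are recreated, connection problems trigger a connection
        reset, and conflict errors accumulate their partial 'updated'/'deleted'
        counts so they are reflected in the final return value. Backoff grows
        linearly with the retry count, capped at MAX_RETRY_BACKOFF.
        """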
        retries = 0
        updated = 0
        deleted = 0
        while True:
            try:
                ret_val = func(*args, **kwargs)

                if retries:
                    log.info('Reconnected to elasticsearch!')

                if updated:
                    ret_val['updated'] += updated

                if deleted:
                    ret_val['deleted'] += deleted

                return ret_val

            except elasticsearch.exceptions.NotFoundError as e:
                if "index_not_found_exception" in str(e):
                    time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                    log.debug("The index does not exist. Trying to recreate it...")
                    self._ensure_collection()
                    self.datastore.connection_reset()
                    retries += 1
                else:
                    raise

            except elasticsearch.exceptions.ConflictError as ce:
                updated += ce.info.get('updated', 0)
                deleted += ce.info.get('deleted', 0)

                time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                self.datastore.connection_reset()
                retries += 1

            except (SearchRetryException,
                    elasticsearch.exceptions.ConnectionError,
                    elasticsearch.exceptions.ConnectionTimeout,
                    elasticsearch.exceptions.AuthenticationException) as e:
                if not isinstance(e, SearchRetryException):
                    log.warning(f"No connection to Elasticsearch server(s): "
                                f"{' | '.join(self.datastore.get_hosts(safe=True))}"
                                f", retrying...")
                time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                self.datastore.connection_reset()
                retries += 1

            except elasticsearch.exceptions.TransportError as e:
                err_code, msg, cause = e.args
                if err_code == 503 or err_code == '503':
                    log.warning("Looks like index is not ready yet, retrying...")
                    time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                    self.datastore.connection_reset()
                    retries += 1
                elif err_code == 429 or err_code == '429':
                    log.warning("Elasticsearch is too busy to perform the requested task, "
                                "we will wait a bit and retry...")
                    time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                    self.datastore.connection_reset()
                    retries += 1

                else:
                    raise