Example #1
    def _next_record(self):
        if not self._f:
            raise StopIteration("No more records")

        while self._f:
            if self._read:
                data = self._f.read(self.block_size)
                self._read = False
                if data:
                    if self._data_block:
                        self._data_block += data
                    else:
                        self._data_block = data
                else:
                    # end of file
                    self._close()
                    if self._data_block:
                        m = self.last_record_end_re.search(self._data_block)
                        if m:
                            self._index += 1
                            json_str = "{{\n{}\n}}".format(
                                self._data_block[:m.start()])
                            self._data_block = None
                            return simdjson.loads(json_str)
                        else:
                            raise Exception(
                                "The last record is incomplete in file({}).".
                                format(self._input_file))
                    else:
                        raise StopIteration("No more records")

            if self._index is None:
                m = self.first_record_start_re.search(self._data_block)
                if m:
                    self._data_block = self._data_block[m.end():]
                    self._index = -1
                elif self._data_block.strip():
                    raise Exception(
                        "The file({}) is an invalid json file".format(
                            self._input_file))
                else:
                    self._data_block = None
                    self._read = True
            else:
                m = self.record_sep_re.search(self._data_block)
                if m:
                    self._index += 1
                    json_str = "{{\n{}\n}}".format(
                        self._data_block[:m.start()])
                    self._data_block = self._data_block[m.end():]
                    return simdjson.loads(json_str)
                else:
                    self._read = True
Example #2
def get_neighbors_attr(graph, n, pred=False):
    """Get the neighbors attr of node in graph.

    Parameters
    ----------
    graph:
        the graph to query.
    n: node
        the node to get neighbors.
    report_type:
        the report type of report graph operation,
            types_pb2.SUCC_ATTR_BY_NODE: get the successors attr of node,
            types_pb2.PRED_ATTR_BY_NODE: get the predecessors attr of node,

    Returns
    -------
    attr: tuple
    """
    if graph.graph_type == graph_def_pb2.ARROW_PROPERTY:
        n = graph._convert_to_label_id_tuple(n)
    report_t = types_pb2.PRED_ATTR_BY_NODE if pred else types_pb2.SUCC_ATTR_BY_NODE
    op = dag_utils.report_graph(graph,
                                report_t,
                                node=simdjson.dumps(n).encode("utf-8"))
    archive = op.eval()
    return simdjson.loads(archive.get_bytes())
Example #3
 def init_resumable_rest(cls, request, bucket):
     name = request.args.get("name", "")
     if len(request.data) > 0:
         if name != "":
             utils.error.invalid("name argument in non-empty payload", None)
         data = simdjson.loads(request.data)
         metadata = json_format.ParseDict(data, resources_pb2.Object())
     else:
         metadata = resources_pb2.Object()
         metadata.name = name
     if metadata.content_type == "":
         metadata.content_type = request.headers.get(
             "x-upload-content-type", "application/octet-stream")
     upload_id = hashlib.sha256(
         ("%s/o/%s" %
          (bucket.name, metadata.name)).encode("utf-8")).hexdigest()
     location = (
         request.host_url +
         "upload/storage/v1/b/%s/o?uploadType=resumable&upload_id=%s" %
         (bucket.name, upload_id))
     headers = {
         key.lower(): value
         for key, value in request.headers.items()
         if key.lower().startswith("x-")
     }
     request = utils.common.FakeRequest(args=request.args.to_dict(),
                                        headers=headers,
                                        data=b"")
     return cls.init_upload(request, metadata, bucket, location, upload_id)
Example #4
def parse_multipart(request):
    content_type = request.headers.get("content-type")
    if content_type is None or not content_type.startswith(
            "multipart/related"):
        utils.error.invalid("Content-type header in multipart upload", None)
    _, _, boundary = content_type.partition("boundary=")
    # str.partition() returns "" (never None) when the separator is missing
    if not boundary:
        utils.error.missing(
            "boundary in content-type header in multipart upload", None)

    def parse_part(part):
        result = part.split(b"\r\n")
        if result[0] != b"" and result[-1] != b"":
            utils.error.invalid("Multipart %s" % str(part), None)
        result = list(filter(None, result))
        headers = {}
        if len(result) < 2:
            result.append(b"")
        for header in result[:-1]:
            key, value = header.split(b": ")
            headers[key.decode("utf-8")] = value.decode("utf-8")
        return headers, result[-1]

    boundary = boundary.encode("utf-8")
    parts = request.data.split(b"--" + boundary)
    if parts[-1] != b"--\r\n":
        utils.error.missing("end marker (--%s--) in media body" % boundary,
                            None)
    _, resource = parse_part(parts[1])
    metadata = simdjson.loads(resource)
    media_headers, media = parse_part(parts[2])
    return metadata, media_headers, media
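A quick way to sanity-check parse_multipart is to feed it a hand-built two-part body. The FakeRequest class below is purely illustrative (parse_multipart only needs .headers and .data), and the boundary and payload values are made up:

class FakeRequest:
    def __init__(self, headers, data):
        self.headers = headers
        self.data = data

body = (b'--sep\r\n'
        b'content-type: application/json\r\n'
        b'\r\n'
        b'{"name": "object-1"}\r\n'
        b'--sep\r\n'
        b'content-type: text/plain\r\n'
        b'\r\n'
        b'hello world\r\n'
        b'--sep--\r\n')

req = FakeRequest({"content-type": "multipart/related; boundary=sep"}, body)
metadata, media_headers, media = parse_multipart(req)
assert metadata == {"name": "object-1"}   # decoded by simdjson.loads
assert media == b'hello world'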
Example #5
def _test_loads():
    """Ensure basic usage of loads is the same."""
    # We don't use a binary file here because pre-py3.6 the built-in couldn't
    # handle bytes.
    with open('jsonexamples/canada.json', 'r') as fin:
        content = fin.read()

    assert json.loads(content) == simdjson.loads(content)
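The same equivalence can be checked on the binary path as well; a hedged companion test, assuming Python 3.6+ (where the built-in json also accepts bytes) and the same imports and sample file as the test above:

def _test_loads_bytes():
    """Ensure loads also agrees with the built-in when given bytes."""
    with open('jsonexamples/canada.json', 'rb') as fin:
        content = fin.read()

    assert json.loads(content) == simdjson.loads(content)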
Example #6
def get_idx_key(filename):
    idx_fn = (filename.split('.')[-1].strip() + '.idx')
    if _file_exists(idx_fn):
        idx_key = json.loads(get_read_fn(idx_fn))
        idx = {v: key for key, v in idx_key.items()}
        return idx
    else:
        return None
Example #7
 def update_acl(self, request, entity, context):
     role = ""
     if context is not None:
         role = request.bucket_access_control.role
     else:
         payload = simdjson.loads(request.data)
         role = payload["role"]
     return self.__upsert_acl(entity, role, True, context)
Example #8
 def patch_default_object_acl(self, request, entity, context):
     role = ""
     if context is not None:
         role = request.object_access_control.role
     else:
         payload = simdjson.loads(request.data)
         role = payload["role"]
     return self.__upsert_default_object_acl(entity, role, True, context)
Example #9
 def init(cls, request, context):
     time_created = datetime.datetime.now()
     metadata = None
     if context is not None:
         metadata = request.bucket
     else:
         metadata = json_format.ParseDict(
             cls.__preprocess_rest(simdjson.loads(request.data)),
             resources_pb2.Bucket(),
         )
     cls.__validate_bucket_name(metadata.name, context)
     default_projection = 1
     if len(metadata.acl) != 0 or len(metadata.default_object_acl) != 0:
         default_projection = 2
     is_uniform = metadata.iam_configuration.uniform_bucket_level_access.enabled
     metadata.iam_configuration.uniform_bucket_level_access.enabled = False
     if len(metadata.acl) == 0:
         predefined_acl = utils.acl.extract_predefined_acl(
             request, False, context)
         if predefined_acl == 0:
             predefined_acl = 3
         elif predefined_acl == "":
             predefined_acl = "projectPrivate"
         elif is_uniform:
             utils.error.invalid(
                 "Predefined ACL with uniform bucket level access enabled",
                 context)
         cls.__insert_predefined_acl(metadata, predefined_acl, context)
     if len(metadata.default_object_acl) == 0:
         predefined_default_object_acl = utils.acl.extract_predefined_default_object_acl(
             request, context)
         if predefined_default_object_acl == 0:
             predefined_default_object_acl = 5
         elif predefined_default_object_acl == "":
             predefined_default_object_acl = "projectPrivate"
         elif is_uniform:
             utils.error.invalid(
                 "Predefined Default Object ACL with uniform bucket level access enabled",
                 context,
             )
         cls.__insert_predefined_default_object_acl(
             metadata, predefined_default_object_acl, context)
     metadata.iam_configuration.uniform_bucket_level_access.enabled = is_uniform
     metadata.id = metadata.name
     metadata.project_number = int(utils.acl.PROJECT_NUMBER)
     metadata.metageneration = 0
     metadata.etag = hashlib.md5(metadata.name.encode("utf-8")).hexdigest()
     metadata.time_created.FromDatetime(time_created)
     metadata.updated.FromDatetime(time_created)
     metadata.owner.entity = utils.acl.get_project_entity("owners", context)
     metadata.owner.entity_id = hashlib.md5(
         metadata.owner.entity.encode("utf-8")).hexdigest()
     return (
         cls(metadata, {}, cls.__init_iam_policy(metadata, context)),
         utils.common.extract_projection(request, default_projection,
                                         context),
     )
Example #10
 def jg(cls, filename, handle_errors=True):
     with gfile(filename, 'r') as f:
         for l in f:
             try:
                 yield json.loads(l)
             except Exception as e:
                 if not handle_errors:
                     logger.log(f'Error parsing File: {str(e)}')
                     raise e
Example #11
 def insert_notification(self, request, context):
     notification = None
     if context is not None:
         notification = request.notification
     else:
         notification = json_format.ParseDict(simdjson.loads(request.data),
                                              resources_pb2.Notification())
     notification.id = "notification-%d" % random.getrandbits(16)
     self.notifications.append(notification)
     return notification
Example #12
File: cli.py Project: ynuosoft/cr8
def dicts_from_lines(lines):
    """ returns a generator producing dicts from json lines

    1 JSON object per line is supported:

        {"name": "n1"}
        {"name": "n2"}

    Or 1 JSON object:

        {
            "name": "n1"
        }

    Or a list of JSON objects:

        [
            {"name": "n1"},
            {"name": "n2"},
        ]

    Or a list of JSON objects in a single line:

        [{"name": "n1"}, {"name": "n2"}]
    """
    lines = iter(lines)
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip empty lines
        try:
            data = loads(line)
            if isinstance(data, list):
                yield from data
            else:
                yield data
        except ValueError:
            content = line + ''.join(lines)
            dicts = loads(content)
            if isinstance(dicts, list):
                yield from dicts
            else:
                yield dicts
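A short, illustrative driver for dicts_from_lines (the sample records are made up) covering both the one-object-per-line case and the pretty-printed single object handled by the ValueError fallback:

# One JSON object per line:
print(list(dicts_from_lines(['{"name": "n1"}', '{"name": "n2"}'])))
# -> [{'name': 'n1'}, {'name': 'n2'}]

# A single object spread over several lines: the first line alone fails to
# parse, so the fallback joins the remaining lines and parses them as one blob.
print(list(dicts_from_lines(['{', '  "name": "n1"', '}'])))
# -> [{'name': 'n1'}]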
Example #13
 def insert_default_object_acl(self, request, context):
     entity, role = "", ""
     if context is not None:
         entity, role = (
             request.object_access_control.entity,
             request.object_access_control.role,
         )
     else:
         payload = simdjson.loads(request.data)
         entity, role = payload["entity"], payload["role"]
     return self.__upsert_default_object_acl(entity, role, False, context)
Example #14
 def set_iam_policy(self, request, context):
     policy = None
     if context is not None:
         policy = request.iam_request.policy
     else:
         data = simdjson.loads(request.data)
         data.pop("kind", None)
         policy = json_format.ParseDict(data, policy_pb2.Policy())
     self.iam_policy = policy
     self.iam_policy.etag = datetime.datetime.now().isoformat().encode(
         "utf-8")
     return self.iam_policy
Example #15
 def update(self, request, context):
     metadata = None
     if context is not None:
         metadata = request.metadata
     else:
         metadata = json_format.ParseDict(
             self.__preprocess_rest(simdjson.loads(request.data)),
             resources_pb2.Object(),
         )
     self.__update_metadata(metadata, None)
     self.__insert_predefined_acl(
         metadata,
         self.bucket,
         utils.acl.extract_predefined_acl(request, False, context),
         context,
     )
Example #16
 def init(cls, request, context):
     time_created = datetime.datetime.now()
     metadata = None
     if context is not None:
         metadata = request.bucket
     else:
         metadata = json_format.ParseDict(
             cls.__preprocess_rest(simdjson.loads(request.data)),
             resources_pb2.Bucket(),
         )
     cls.__validate_bucket_name(metadata.name, context)
     default_projection = 1
     if len(metadata.acl) != 0 or len(metadata.default_object_acl) != 0:
         default_projection = 2
     if len(metadata.acl) == 0:
         predefined_acl = utils.acl.extract_predefined_acl(request, False, context)
         if predefined_acl == 0:
             predefined_acl = 3
         elif predefined_acl == "":
             predefined_acl = "projectPrivate"
         cls.__insert_predefined_acl(metadata, predefined_acl, context)
     if len(metadata.default_object_acl) == 0:
         predefined_default_object_acl = utils.acl.extract_predefined_default_object_acl(
             request, context
         )
         if predefined_default_object_acl == 0:
             predefined_default_object_acl = 5
         elif predefined_default_object_acl == "":
             predefined_default_object_acl = "projectPrivate"
         cls.__insert_predefined_default_object_acl(
             metadata, predefined_default_object_acl, context
         )
     metadata.id = metadata.name
     metadata.project_number = int(utils.acl.PROJECT_NUMBER)
     metadata.metageneration = 0
     metadata.etag = hashlib.md5(metadata.name.encode("utf-8")).hexdigest()
     metadata.time_created.FromDatetime(time_created)
     metadata.updated.FromDatetime(time_created)
     metadata.owner.entity = utils.acl.get_project_entity("owners", context)
     metadata.owner.entity_id = hashlib.md5(
         metadata.owner.entity.encode("utf-8")
     ).hexdigest()
     return (
         cls(metadata, [], None),
         utils.common.extract_projection(request, default_projection, context),
     )
Example #17
 def patch(self, request, context):
     update_mask = field_mask_pb2.FieldMask()
     metadata = None
     if context is not None:
         metadata = request.metadata
         update_mask = request.update_mask
     else:
         data = simdjson.loads(request.data)
         if "labels" in data:
             if data["labels"] is None:
                 self.metadata.labels.clear()
             else:
                 for key, value in data["labels"].items():
                     if value is None:
                         self.metadata.labels.pop(key, None)
                     else:
                         self.metadata.labels[key] = value
         data.pop("labels", None)
         data = Bucket.__preprocess_rest(data)
         metadata = json_format.ParseDict(data, resources_pb2.Bucket())
         paths = set()
         for key in utils.common.nested_key(data):
             key = utils.common.to_snake_case(key)
             head = key
             for i, c in enumerate(key):
                 if c == "." or c == "[":
                     head = key[0:i]
                     break
             if head in Bucket.modifiable_fields:
                 if "[" in key:
                     paths.add(head)
                 else:
                     paths.add(key)
         update_mask = field_mask_pb2.FieldMask(paths=list(paths))
     self.__update_metadata(metadata, update_mask)
     self.__insert_predefined_acl(
         metadata, utils.acl.extract_predefined_acl(request, False,
                                                    context), context)
     self.__insert_predefined_default_object_acl(
         metadata,
         utils.acl.extract_predefined_default_object_acl(request, context),
         context,
     )
Example #18
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["containerstatus"]["harvester"].message="{}:Begin to process container status file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"), metadata["resource_id"])
    context["containerstatus"]["harvester"].last_heartbeat = now
    context["containerstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.CONTAINERSTATUS_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())

    records = 0

    for record in status_records:
        records += 1
        try:
            if any(not (record.get(key) or "").strip() for key in ("computer","containerid","image","name")):
                # data is incomplete, ignore it
                continue

            created = to_datetime(record["created"])
            started = to_datetime(record["started"])
            finished = to_datetime(record.get("finished"))
            containerid = record["containerid"]
            ports = record["ports"] or None
            containerstate = record["containerstate"]
            if finished:
                containerstate = "terminated"
            envs = os.linesep.join(json.loads(record["environmentvar"])) if record["environmentvar"] else None
            exitcode = str(record["exitcode"]) if finished else None
            computer = record["computer"].strip()
            workload_name = record["name"].strip()
            image_without_tag = record.get("image","").strip()
            if not image_without_tag:
                continue
            else:
                imageid = "{}:{}".format(image_without_tag,record["imagetag"].strip())
            cluster = None
            clustername = None
            if computer in context["clusters"]:
                cluster = context["clusters"][computer]
            elif record.get("resourceid"):
                resourceid = record["resourceid"].strip().rsplit("/",1)[-1]
                if resourceid in context["clusters"]:
                    cluster = context["clusters"][resourceid]
                else:
                    clustername = resourceid
            else:
                clustername = computer

            if not cluster:
                try:
                    cluster = models.Cluster.objects.get(name=clustername)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=clustername,added_by_log=True)
                        cluster.save()
                    else:
                        continue

                context["clusters"][clustername] = cluster

            workload = None
            container = None
            key = (cluster.id,containerid)
            if key in context["containerstatus"]["terminated_containers"]:
                continue
            elif key in context["containerstatus"]["containers"]:
                container = context["containerstatus"]["containers"][key]
            else:
                try:
                    container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                    context["containerstatus"]["containers"][key] = container
                except ObjectDoesNotExist as ex:
                    pass

            if container:
                workload_key = (container.workload.cluster.id,container.workload.namespace.name,container.workload.name,container.workload.kind)
                if workload_key not in context["workloads"]:
                    workload_update_fields = []
                    workload = container.workload
                    context["workloads"][workload_key] = (workload,workload_update_fields)
                else:
                    workload,workload_update_fields = context["workloads"][workload_key]

            elif settings.ENABLE_ADDED_BY_CONTAINERLOG:
                kind = "service?" if ports else "jobs?"
                new_workload_name = "{}-{}".format(image_without_tag,workload_name)
                workload_key = (cluster.id,"unknown",new_workload_name,kind)
                workload = None
                if workload_key in context["workloads"]:
                    workload,workload_update_fields = context["workloads"][workload_key]
                else:
                    #try to find the workload through cluster and workload name
                    workload_qs = models.Workload.objects.filter(cluster=cluster,name=workload_name)
                    for obj in workload_qs:
                        if obj.containerimage and obj.containerimage.imageid.startswith(image_without_tag) and ((ports and obj.listenings.all().count()) or (not ports and obj.listenings.all().count() == 0)):
                            workload = obj
                            break
                    if not workload :
                        if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                            #not found , create a workload for this log
                            namespace_key = (cluster.id,"unknown")
                            if namespace_key in context["namespaces"]:
                                namespace = context["namespaces"][namespace_key]
                            else:
                                try:
                                    namespace = models.Namespace.objects.get(cluster=cluster,name="unknown")
                                except ObjectDoesNotExist as ex:
                                    namespace = models.Namespace(cluster=cluster,name="unknown",added_by_log=True,created=created or timezone.now(),modified=created or timezone.now())
                                    namespace.save()
    
                                context["namespaces"][namespace_key] = namespace
    
                            workload = models.Workload.objects.filter(cluster=cluster,namespace=namespace,name=new_workload_name,kind=kind).first()
    
                            if not workload:
                                image = models.ContainerImage.parse_image(imageid)
                                workload = models.Workload(
                                    cluster=namespace.cluster,
                                    project=namespace.project,
                                    namespace=namespace,
                                    name=new_workload_name,
                                    image=imageid,
                                    containerimage=image,
                                    kind=kind,
                                    api_version="",
                                    added_by_log=True,
                                    modified=created or timezone.now(),
                                    created=created or timezone.now()
                                )
                                #if finished and finished.date() < timezone.now().date():
                                #    workload.deleted = finished
                                workload.save()
                        else:
                            continue

                    workload_key = (cluster.id,workload.namespace.name,workload.name,workload.kind)
                    workload_update_fields = []
                    context["workloads"][workload_key] = (workload,workload_update_fields)

                container = models.Container(
                    cluster=workload.cluster,
                    namespace=workload.namespace,
                    workload=workload,
                    poduid = "",
                    containerid = containerid
                )
                context["containerstatus"]["containers"][key] = container
            else:
                continue

            #container
            container_status = get_container_status(container,containerstate)
            update_fields = set_fields(container,[
                ("exitcode",exitcode or container.exitcode),
                ("image",imageid or container.image),
                ("ports",ports or container.ports),
                ("envs",envs or container.envs),
                ("container_created",created or container.container_created),
                ("container_started",started or container.container_started),
                ("container_terminated",finished or container.container_terminated),
                ("status",container_status),
                ("last_checked",to_datetime(record["max_timegenerated"]))
            ])

            if container.pk is None:
                container.save()
            elif update_fields:
                container.save(update_fields=update_fields)

            update_latest_containers(context,container,workload=workload,workload_update_fields=workload_update_fields)

            if container.status == "running" and container.workload.kind.lower() == "deployment" and (not container.pk or "status" in update_fields):
                context["containerstatus"]["new_deployed_workloads"].add(container.workload)

            if container_status.lower() in ("deleted","terminated"):
                del context["containerstatus"]["containers"][key]
                context["containerstatus"]["terminated_containers"].add(key)

        except Exception as ex:
            # log the invalid record and continue with the next one
            logger.error("Failed to parse container status record({}).{}".format(record,str(ex)))
            continue

    context["last_archive_time"] = metadata["archive_endtime"]
    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
Example #19
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["podstatus"]["harvester"].message="{}:Begin to process pod status file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"),metadata["resource_id"])
    context["podstatus"]["harvester"].last_heartbeat = now
    context["podstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.PODSTATUS_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())

    records = 0
    for record in status_records:
        records += 1
        try:
            if any(not (record.get(key) or "").strip() for key in ("clusterid","computer","namespace","poduid","containerid","pod_created","container_name","controllerkind")):
                # data is incomplete, ignore it
                continue

            if record["computer"].strip().lower().startswith("aks-nodepool"):
                cluster_name = record["clusterid"].strip().rsplit("/")[-1]
            else:
                cluster_name = record["computer"].strip()

            cluster_name = cluster_name.split(".",1)[0]

            if cluster_name in context["clusters"]:
                cluster = context["clusters"][cluster_name]
            else:
                #logger.debug("find cluster {}".format(cluster_name))
                try:
                    cluster = models.Cluster.objects.get(name=cluster_name)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=cluster_name,added_by_log=True)
                        cluster.save()
                    else:
                        continue
                context["clusters"][cluster_name] = cluster

            # parse pod_created up front; it is needed when creating a missing namespace below
            pod_created = to_datetime(record.get("pod_created"))
            namespace_name = record["namespace"].strip()
            key = (cluster.id,namespace_name)
            if key in context["namespaces"]:
                namespace = context["namespaces"][key]
            else:
                #logger.debug("find namespace {}".format(namespace_name))
                try:
                    namespace = models.Namespace.objects.get(cluster=cluster,name=namespace_name)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        namespace = models.Namespace(cluster=cluster,name=namespace_name,added_by_log=True,created=pod_created,modified=pod_created)
                        namespace.save()
                    else:
                        continue
                context["namespaces"][key] = namespace

            poduid = record["poduid"].strip()
            containerid = record["containerid"].strip()
            container_name = record["container_name"].split("/")
            if len(container_name) != 2:
                raise Exception("Can't parse the container_name '{}'".format(record["container_name"]))
            elif container_name[0].strip() != poduid:
                raise Exception("The first part of the container_name '{}' should be '{}'".format(record["container_name"],poduid))
            else:
                workload_name = container_name[1].strip()

            pod_started = to_datetime(record.get("pod_started"))
            podip = record.get("podip")
            max_timegenerated = to_datetime(record["max_timegenerated"])

            workload_kind = to_workload_kind(record["controllerkind"])

            key = (cluster.id,namespace.name,workload_name,workload_kind)
            if key in context["workloads"]:
                workload,workload_update_fields = context["workloads"][key]
            else:
                #logger.debug("find workload.{}/{}({})".format(namespace.name,workload_name,workload_kind))
                try:
                    #logger.debug("find workload, cluster={}, project={}, namespace={},name={},kind={}".format(cluster,namespace.project,namespace,workload_name,workload_kind))
                    workload = models.Workload.objects.get(cluster=cluster,namespace=namespace,name=workload_name,kind=workload_kind)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        workload = models.Workload(cluster=cluster,project=namespace.project,namespace=namespace,name=workload_name,kind=workload_kind,image="",api_version="",modified=pod_created,created=pod_created,added_by_log=True)
                        #if pod_created.date() < timezone.now().date():
                        #    workload.deleted = max_timegenerated
                        workload.save()
                    else:
                        continue
                workload_update_fields = []
                context["workloads"][key] = (workload,workload_update_fields)

            try:
                container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                previous_workload = container.workload
                previous_namespace = container.namespace
            except ObjectDoesNotExist as ex:
                container = models.Container(cluster=cluster,containerid=containerid)
                previous_workload = None
                previous_namespace = None

            update_fields = set_fields(container,[
                ("namespace",namespace),
                ("workload",workload),
                ("pod_created",pod_created),
                ("pod_started",pod_started),
                ("podip",podip),
                ("poduid",poduid),
                ("last_checked",to_datetime(record["max_timegenerated"]))
            ])
            """
            if workload and workload.deleted and workload.deleted < max_timegenerated:
                workload.deleted = max_timegenerated
                if "deleted" not in workload_update_fields:
                    workload_update_fields.append("deleted")
            """

            if previous_workload and previous_workload != workload and previous_workload.added_by_log and previous_workload.namespace.name == "unknown":
                context["podstatus"]["removable_workloads"].add(previous_workload)
                context["podstatus"]["orphan_namespaces"].add(previous_workload.namespace)

            if container.pk is None:
                container.save()
            elif update_fields:
                container.save(update_fields=update_fields)

        except Exception as ex:
            # log the invalid record and continue with the next one
            logger.error("Failed to parse pod status record({}).{}".format(record,str(ex)))
            continue

    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
Example #20
 def jsonloads(cls, string):
     return json.loads(string)
Example #21
 def jl(cls, line):
     return json.loads(line)
Example #22
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["logstatus"]["harvester"].message="{}:Begin to process container log file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"), metadata["resource_id"])
    context["logstatus"]["harvester"].last_heartbeat = now
    context["logstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.CONTAINERLOG_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())

    records = 0
    for record in status_records:
        try:
            if any(not (record.get(key) or "").strip() for key in ("computer","containerid","logentry","logtime")):
                # data is incomplete, ignore it
                continue

            logtime = to_datetime(record["logtime"])
            containerid = record["containerid"].strip()
            message = record["logentry"].strip()
            if not message:
                continue
 
            message = message.replace("\x00","").replace("\\n","\n")
            message = message.strip()
            """
            #try to get log time from message.
            datestr = message[0:19]
            for pattern in ["%Y-%m-%d %H:%M:%S"]:
                try:
                    logtime = timezone.make_aware(datetime.datetime.strptime(datestr,pattern))
                    break
                except:
                    continue
            """

            source = (record["logentrysource"] or "").strip() or None

            computer = record["computer"].strip()
            cluster = None
            clustername = None

            if computer in context["clusters"]:
                cluster = context["clusters"][computer]
            elif record.get("resourceid"):
                resourceid = record["resourceid"].strip().rsplit("/",1)[-1]
                if resourceid in context["clusters"]:
                    cluster = context["clusters"][resourceid]
                else:
                    clustername = resourceid
            else:
                clustername = computer

            if not cluster:
                try:
                    cluster = models.Cluster.objects.get(name=clustername)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=clustername,added_by_log=True)
                        cluster.save()
                    else:
                        continue

                context["clusters"][clustername] = cluster
            """
            if cluster.name != 'az-k3s-oim01':
                continue
            """

            key = (cluster.id,containerid)
            if key in context["logstatus"]["containers"]:
                container,container_update_fields = context["logstatus"]["containers"][key]
            else:
                try:
                    container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                except ObjectDoesNotExist as ex:
                    if settings.CONTAINERLOG_FAILED_IF_CONTAINER_NOT_FOUND:
                        raise Exception("The containerId({}) in log resource({}) Not Found".format(containerid,metadata))
                    else:
                        continue
                container_update_fields = []
                context["logstatus"]["containers"][key] = (container,container_update_fields)

            key = (cluster.id,containerid)
            if key in context["logstatus"]["containerlogs"]:
                containerlog = context["logstatus"]["containerlogs"][key]
                containerlog.archiveid = metadata["resource_id"]
            else:
                containerlog = models.ContainerLog(archiveid=metadata["resource_id"])
                context["logstatus"]["containerlogs"][key] = containerlog

            result = container.workload.containerimage.imagefamily.get_loglevel(message)
            if result:
                level,newmessage = result
            else:
                level = None
                newmessage = False
                for log_level_re,value in log_levels:
                    if log_level_re.search(message):
                        level,newmessage = value
                        break

                if level is None:
                    if source and source.lower() in ('stderr',):
                        level = models.ContainerLog.ERROR
                    else:
                        level = models.ContainerLog.INFO

            if not containerlog.logtime:
                containerlog.id = None
                containerlog.container = container
                containerlog.logtime = logtime
                containerlog.latest_logtime = logtime
                containerlog.source = source
                #containerlog.message = "{}:{}".format(logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = message
                containerlog.level = level
            elif newmessage or logtime >= (containerlog.latest_logtime + datetime.timedelta(seconds=1)) or containerlog.source != source:
                records += 1

                containerlog.save()
                _add_notify_log(context,containerlog)
                container = containerlog.container
                update_workload_latest_containers(context,containerlog)
                key = (container.cluster.id,container.containerid)
                if key in context["logstatus"]["containers"]:
                    container,container_update_fields = context["logstatus"]["containers"][key]
                else:
                    container_update_fields = []
                    context["logstatus"]["containers"][key] = (container,container_update_fields)
                container_update_fields = set_fields(container,[
                    ("log", True),
                    ("warning", True if containerlog.level == models.ContainerLog.WARNING else container.warning),
                    ("error", True if containerlog.level == models.ContainerLog.ERROR else container.error),
                ],container_update_fields)
                if newmessage and containerlog.logtime >= logtime:
                    # more than one log at the same time; add one millisecond to the logtime because of the unique index
                    logtime = containerlog.logtime + datetime.timedelta(milliseconds=1)
                containerlog.id = None
                containerlog.container = container
                containerlog.logtime = logtime
                containerlog.latest_logtime = logtime
                containerlog.source = source
                #containerlog.message = "{}:{}".format(logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = message
                containerlog.level = level
            else:
                if level > containerlog.level:
                    containerlog.level = level
                #containerlog.message = "{}\n{}:{}".format(containerlog.message,logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = "{}\n{}".format(containerlog.message,message)
                if logtime > containerlog.latest_logtime:
                    containerlog.latest_logtime = logtime
        except Exception as ex:
            # log the invalid record and continue with the next one
            logger.error("Failed to parse container log record({}).{}".format(record,str(ex)))
            continue

    #save the last message
    containerlogs = [o for o in context["logstatus"]["containerlogs"].values() if o.logtime and o.container]
    containerlogs.sort(key=lambda o:o.logtime)
    for containerlog in containerlogs:
        records += 1
        containerlog.save()
        _add_notify_log(context,containerlog)
        container = containerlog.container
        update_workload_latest_containers(context,containerlog)
        key = (container.cluster.id,container.containerid)
        if key in context["logstatus"]["containers"]:
            container,container_update_fields = context["logstatus"]["containers"][key]
        else:
            container_update_fields = []
            context["logstatus"]["containers"][key] = (container,container_update_fields)
        container_update_fields = set_fields(container,[
            ("log", True),
            ("warning", True if containerlog.level == models.ContainerLog.WARNING else container.warning),
            ("error", True if containerlog.level == models.ContainerLog.ERROR else container.error),
        ],container_update_fields)
        containerlog.id = None
        containerlog.logtime = None
        containerlog.level = None
        containerlog.message = None
        containerlog.source = None
        containerlog.container = None
        containerlog.latest_logtime = None

    #save terminated containers
    terminated_keys = []
    for key,value  in context["logstatus"]["containers"].items():
        container,container_update_fields = value
        if container.container_terminated and (container.container_terminated + datetime.timedelta(minutes=30)) < metadata["archive_endtime"]:
            terminated_keys.append(key)
            if not container.pk:
                container.save()
            elif container_update_fields:
                container.save(update_fields=container_update_fields)
                container_update_fields.clear()

    #delete terminated containers from cache
    for key in terminated_keys:
        del context["logstatus"]["containers"][key]
        if key in context["logstatus"]["containerlogs"]:
            del context["logstatus"]["containerlogs"][key]
    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
Example #23
              procedure='s.get_raw_data',
              detail=response,
              message='failed to get raw data')
        sys.exit(1)

    status, response = s.get_iteration_set(name='test')
    if status is False:
        s.log(level='ERROR',
              app='test',
              procedure='s.get_iteration_set',
              detail=response,
              message='failed to get iteration set')
        sys.exit(1)

    status, response = s.get_unreviewed_index_records(module='scraper',
                                                      name='test',
                                                      datasource='test')
    if status is False:
        s.log(level='ERROR',
              app='test',
              procedure='s.get_unreviewed_index_records',
              detail=response,
              message='failed to get unreviewed scrapeindex records')
        sys.exit(1)
    else:
        si_record = simdjson.loads(
            response['get_unreviewed_index_records'])[0]['_id']

    print('passed all tests')
    sys.exit(0)
Example #24
# orjson only outputs bytes, but often we need unicode:
print('---dumps---')
benchmark("orjson", lambda s: orjson.dumps(s).decode('utf-8'))
benchmark("Python", json.dumps)
benchmark("rapidjson", rapidjson.dumps)
benchmark("ujson", ujson.dumps)
benchmark("simplejson", simplejson.dumps)
benchmark("hyperjson", hyperjson.dumps)
print('---loads---')
benchmark_load("orjson", lambda x: orjson.loads(x.encode('utf-8')))
benchmark_load("Python", json.loads)
benchmark_load("rapidjson", rapidjson.loads)
benchmark_load("ujson", ujson.loads)
benchmark_load("simplejson", simplejson.loads)
benchmark_load("hyperjson", hyperjson.loads)
benchmark_load("pysimdjson-load", lambda x: simdjson.loads(x.encode('utf-8')))

# dumps
# orjson 1.227565050125122
# Python 5.861892938613892
# rapidjson 2.87353777885437
# ujson 1.669421911239624

# loads
# orjson 2.642509937286377
# Python 4.873814105987549
# rapidjson 3.068044900894165
# ujson 1.7971441745758057

# orjson==2.6.1
# python-rapidjson==0.9.1
Example #25
def process_log_file(context, metadata, log_file):
    if settings.NGINXLOG_STREAMING_PARSE:
        log_records = LogRecordIterator(log_file)
    else:
        with open(log_file, "r") as f:
            log_records = simdjson.loads(f.read())

    records = 0
    webserver_records = {}
    webserver = None
    key = None
    original_request_path = None
    for record in log_records:
        records += 1
        try:
            if "?" in record["request_path"]:
                request_path, path_parameters = record["request_path"].split(
                    "?", 1)
                if path_parameters:
                    path_parameters = path_parameters.replace("%00",
                                                              "").replace(
                                                                  "\x00", "")
                    path_parameters = [
                        (k, v[0] if len(v) == 1 else v)
                        for k, v in QueryDict(path_parameters).lists()
                    ]
                    path_parameters.sort(key=lambda o: o[0].lower())
                    all_path_parameters = [o[0] for o in path_parameters]
                    path_parameters = WebAppAccessLog.to_path_parameters(
                        path_parameters)
            else:
                request_path = record["request_path"]
                path_parameters = None
                all_path_parameters = None

            try:
                http_status = int(record["http_status"])
            except:
                http_status = 0

            original_request_path = request_path
            if not request_path:
                request_path = "/"
                original_request_path = request_path
            elif len(request_path) > 512:
                request_path = request_path[0:512]

            parameters_changed, path_parameters = RequestParameterFilter.filter_parameters(
                record["webserver"],
                request_path,
                path_parameters,
                parameter_filters=context["parameter_filters"],
                parameter_filter_map=context["parameter_filter_map"])

            path_changed, request_path = RequestPathNormalizer.normalize_path(
                record["webserver"],
                request_path,
                path_normalizers=context["path_normalizers"],
                path_normalizer_map=context["path_normalizer_map"],
                path_filter=context["path_filter"])
            if request_path is None:
                continue

            if webserver:
                if record["webserver"] != webserver:
                    for log_record in webserver_records.values():
                        log_record.save()
                    webserver_records.clear()
                    webserver = record["webserver"]
            else:
                webserver = record["webserver"]

            key = (request_path, http_status, path_parameters)

            accesslog = webserver_records.get(key)
            if accesslog:
                accesslog.requests += int(record["requests"])
                accesslog.total_response_time += to_float(
                    record["total_response_time"])
                if accesslog.max_response_time < to_float(
                        record["max_response_time"]):
                    accesslog.max_response_time = to_float(
                        record["max_response_time"])

                if accesslog.min_response_time > to_float(
                        record["min_response_time"]):
                    accesslog.min_response_time = to_float(
                        record["min_response_time"])

                accesslog.avg_response_time = accesslog.total_response_time / accesslog.requests
                if all_path_parameters:
                    if accesslog.all_path_parameters:
                        changed = False
                        for param in all_path_parameters:
                            if param not in accesslog.all_path_parameters:
                                accesslog.all_path_parameters.append(param)
                                changed = True
                        if changed:
                            accesslog.all_path_parameters.sort()
                    else:
                        accesslog.all_path_parameters = all_path_parameters
            else:
                accesslog = WebAppAccessLog(
                    log_starttime=metadata["archive_starttime"],
                    log_endtime=metadata["archive_endtime"],
                    webserver=record["webserver"],
                    request_path=request_path,
                    http_status=http_status,
                    path_parameters=path_parameters,
                    all_path_parameters=all_path_parameters,
                    requests=int(record["requests"]),
                    max_response_time=to_float(record["max_response_time"]),
                    min_response_time=to_float(record["min_response_time"]),
                    total_response_time=to_float(
                        record["total_response_time"]))
                accesslog.avg_response_time = accesslog.total_response_time / accesslog.requests
                if accesslog.webserver not in context.get("webapps", {}):
                    if "webapps" not in context:
                        context["webapps"] = {}
                    context["webapps"][
                        accesslog.webserver] = WebApp.objects.filter(
                            name=accesslog.webserver).first()
                accesslog.webapp = context["webapps"][accesslog.webserver]

                if accesslog.webapp and not accesslog.webapp.redirect_to and not accesslog.webapp.redirect_to_other:
                    if accesslog.webapp not in context.get(
                            "webapplocations", {}):
                        if "webapplocations" not in context:
                            context["webapplocations"] = {}
                        context["webapplocations"][accesslog.webapp] = list(
                            WebAppLocation.objects.filter(
                                app=accesslog.webapp).order_by("-score"))
                    accesslog.webapplocation = accesslog.webapp.get_matched_location(
                        original_request_path,
                        context["webapplocations"][accesslog.webapp])
                    if not accesslog.webapplocation and accesslog.http_status < 300 and accesslog.http_status >= 200:
                        logger.warning(
                            "Can't find the app location for request path({1}) in web application({0})"
                            .format(accesslog.webapp, accesslog.request_path))
                webserver_records[key] = accesslog
        except Exception as ex:
            #delete already added records from this log file
            WebAppAccessLog.objects.filter(
                log_starttime=metadata["archive_starttime"]).delete()
            logger.error(
                "Failed to parse the nginx access log record({}).{}".format(
                    record, traceback.format_exc()))
            raise Exception(
                "Failed to parse the nginx access log record({}).{}".format(
                    record, str(ex)))

    for log_record in webserver_records.values():
        log_record.save()

    logger.info("Harvest {1} records from log file '{0}'".format(
        log_file, records))
Example #26
 def json(self) -> dict:
     return simdjson.loads(self.content)
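A minimal, hypothetical host class for this property-style helper (the Response name and the sample payload are illustrative only), showing how the raw bytes held in .content round-trip through simdjson.loads:

class Response:
    def __init__(self, content: bytes):
        self.content = content

    def json(self) -> dict:
        return simdjson.loads(self.content)

resp = Response(b'{"ok": true}')
assert resp.json() == {"ok": True}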
Example #27
def readl_simdjson(filepath: str):
    # Parse a file that contains one JSON document per line; a dedicated
    # simdjson.Parser instance is not required when using simdjson.loads().
    with open(filepath) as fp:
        return [simdjson.loads(line) for line in fp]
Example #28
    def query(self, bbox, allow_missing=False):
        """
    For the specified bounding box (or equivalent representation),
    list all segment ids enclosed within it.

    If allow_missing is set, then don't raise an error if an index
    file is missing.

    Returns: set(labels)
    """
        bbox = Bbox.create(bbox, context=self.physical_bounds, autocrop=True)
        original_bbox = bbox.clone()
        bbox = bbox.expand_to_chunk_size(self.chunk_size.astype(
            self.physical_bounds.dtype),
                                         offset=self.physical_bounds.minpt)

        if bbox.subvoxel():
            return []

        labels = set()
        fast_path = bbox.contains_bbox(self.physical_bounds)

        if self.sql_db and fast_path:
            conn = connect(self.sql_db)
            cur = conn.cursor()
            cur.execute("select label from file_lookup")
            while True:
                rows = cur.fetchmany(size=2**20)
                if len(rows) == 0:
                    break
                # Sqlite only stores signed integers, so we need to coerce negative
                # integers back into unsigned with a bitwise and.
                labels.update(
                    (int(row[0]) & 0xffffffffffffffff for row in rows))
            cur.close()
            conn.close()
            return labels

        index_files = self.index_file_paths_for_bbox(bbox)

        num_blocks = int(np.ceil(len(index_files) / 10000))
        for index_files_subset in tqdm(sip(index_files, 10000),
                                       total=num_blocks,
                                       desc="Block",
                                       disable=((not self.config.progress)
                                                or (num_blocks == 1))):
            results = self.fetch_index_files(index_files_subset)

            parser = simdjson.Parser()
            for filename, content in tqdm(results.items(),
                                          desc="Decoding Labels",
                                          disable=(not self.config.progress)):
                if content is None:
                    if allow_missing:
                        continue
                    else:
                        raise SpatialIndexGapError(filename +
                                                   " was not found.")

                # The bbox test saps performance a lot
                # but we can skip it if we know 100% that
                # the labels are going to be inside. This
                # optimization is important for querying
                # entire datasets, which is contemplated
                # for shard generation.
                if fast_path:
                    res = parser.parse(content).keys()
                    labels.update(
                        (int(label) for label in res))  # fast path: 16% CPU
                else:
                    res = simdjson.loads(content)
                    for label, label_bbx in res.items():
                        label = int(label)
                        label_bbx = Bbox.from_list(label_bbx)

                        if Bbox.intersects(label_bbx, original_bbox):
                            labels.add(label)

        return labels
Example #29
import simdjson

with open('sample.json', 'rb') as f:
    document = simdjson.loads(f.read())

print(document)
print(type(document))

print(document["type"])
print(document["created_at"])
print(document["id"])

print(document["actor"])
for k, v in document["actor"].items():
    print(k, v)

print(document["repo"])
for k, v in document["repo"].items():
    print(k, v)

print(document["public"])

print(document["payload"])
for k, v in document["payload"].items():
    print(k, v)
Example #30
def test_valid_smallblock():
    assert simdjson.loads(b'{"test": "value"}') == {'test': 'value'}
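A natural companion check is that malformed input raises rather than decoding to a partial value; the sketch below only relies on some exception being raised (pysimdjson typically surfaces this as a ValueError, mirroring the built-in json module, but the check does not depend on the exact type):

def test_invalid_smallblock():
    # Truncated JSON must raise instead of decoding to a partial value.
    try:
        simdjson.loads(b'{"test": ')
    except Exception:
        pass
    else:
        raise AssertionError('expected a parse error for truncated JSON')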