Example #1
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if is_data(value):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)

        return {
            k: _deep_json_to_string(v, depth - 1)
            for k, v in value.items()
        }
    elif is_sequence(value):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, number_types):
        return value
    elif is_text(value):
        return strings.limit(value, LOG_STRING_LENGTH)
    elif is_binary(value):
        return strings.limit(bytes2base64(value), LOG_STRING_LENGTH)
    elif isinstance(value, (date, datetime)):
        return datetime2unix(value)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
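The helpers above (is_data, is_sequence, value2json, strings.limit) come from the mo-* libraries; a stdlib-only sketch of the same depth-limiting idea, with json.dumps and slicing standing in for value2json and strings.limit (the LOG_STRING_LENGTH value is an assumption):

import json

LOG_STRING_LENGTH = 2000  # assumed value; the real constant lives in the logging module

def deep_json_to_string(value, depth):
    # dicts keep their structure until depth runs out, then become JSON text
    if isinstance(value, dict):
        if depth == 0:
            return json.dumps(value, default=str)[:LOG_STRING_LENGTH]
        return {k: deep_json_to_string(v, depth - 1) for k, v in value.items()}
    # sequences and anything unrecognized are stringified immediately
    if isinstance(value, (list, tuple)):
        return json.dumps(list(value), default=str)[:LOG_STRING_LENGTH]
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        return value[:LOG_STRING_LENGTH]
    return json.dumps(value, default=str)[:LOG_STRING_LENGTH]

# deep_json_to_string({"a": {"b": {"c": 1}}}, 1) keeps "a" as a dict but
# renders its value as the JSON string '{"b": {"c": 1}}'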
Example #2
    def _annotate(cls, item, timestamp, stack_depth):
        """
        :param item:  A LogItem THE TYPE OF MESSAGE
        :param stack_depth: FOR TRACKING WHAT LINE THIS CAME FROM
        :return:
        """
        item.timestamp = timestamp
        item.machine = machine_metadata
        item.template = strings.limit(item.template, 10000)

        item.format = strings.limit(item.format, 10000)
        if item.format == None:
            format = text(item)
        else:
            format = item.format.replace("{{", "{{params.")
        if not format.startswith(CR) and format.find(CR) > -1:
            format = CR + format

        if cls.trace:
            log_format = item.format = "{{machine.name}} (pid {{machine.pid}}) - {{timestamp|datetime}} - {{thread.name}} - \"{{location.file}}:{{location.line}}\" - ({{location.method}}) - " + format
            f = sys._getframe(stack_depth + 1)
            item.location = {
                "line": f.f_lineno,
                "file": text(f.f_code.co_filename),
                "method": text(f.f_code.co_name)
            }
            thread = _Thread.current()
            item.thread = {"name": thread.name, "id": thread.id}
        else:
            log_format = item.format = "{{timestamp|datetime}} - " + format

        cls.main_log.write(log_format, item.__data__())
Example #3
    def post(self, path, **kwargs):
        url = self.settings.host + ":" + unicode(self.settings.port) + path

        try:
            wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"

            data = kwargs.get(b'data')
            if data == None:
                pass
            elif isinstance(data, Mapping):
            kwargs[b'data'] = data = convert.unicode2utf8(convert.value2json(data))
            elif not isinstance(kwargs["data"], str):
                Log.error("data must be utf8 encoded string")

            if self.debug:
                sample = kwargs.get(b'data', "")[:300]
                Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)

            if self.debug:
                Log.note("POST {{url}}", url=url)
            response = http.post(url, **kwargs)
            if response.status_code not in [200, 201]:
                Log.error(response.reason.decode("latin1") + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000))
            if self.debug:
                Log.note("response: {{response}}", response=utf82unicode(response.content)[:130])
            details = mo_json.json2value(utf82unicode(response.content))
            if details.error:
                Log.error(convert.quote2string(details.error))
            if details._shards.failed > 0:
                Log.error("Shard failures {{failures|indent}}",
                    failures="---\n".join(r.replace(";", ";\n") for r in details._shards.failures.reason)
                )
            return details
        except Exception as e:
            if url[0:4] != "http":
                suggestion = " (did you forget \"http://\" prefix on the host name?)"
            else:
                suggestion = ""

            if kwargs.get("data"):
                Log.error(
                    "Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
                    url=url,
                    body=strings.limit(kwargs["data"], 100 if self.debug else 10000),
                    cause=e
                )
            else:
                Log.error("Problem with call to {{url}}" + suggestion, url=url, cause=e)
Example #4
    def post(self, path, **kwargs):
        url = self.settings.host + ":" + unicode(self.settings.port) + path

        try:
            wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"

            data = kwargs.get(b'data')
            if data == None:
                pass
            elif isinstance(data, Mapping):
            kwargs[b'data'] = data = convert.unicode2utf8(convert.value2json(data))
            elif not isinstance(kwargs["data"], str):
                Log.error("data must be utf8 encoded string")

            if self.debug:
                sample = kwargs.get(b'data', "")[:300]
                Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)

            if self.debug:
                Log.note("POST {{url}}", url=url)
            response = http.post(url, **kwargs)
            if response.status_code not in [200, 201]:
                Log.error(response.reason.decode("latin1") + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000))
            if self.debug:
                Log.note("response: {{response}}", response=utf82unicode(response.content)[:130])
            details = mo_json.json2value(utf82unicode(response.content))
            if details.error:
                Log.error(convert.quote2string(details.error))
            if details._shards.failed > 0:
                Log.error("Shard failures {{failures|indent}}",
                    failures="---\n".join(r.replace(";", ";\n") for r in details._shards.failures.reason)
                )
            return details
        except Exception as e:
            if url[0:4] != "http":
                suggestion = " (did you forget \"http://\" prefix on the host name?)"
            else:
                suggestion = ""

            if kwargs.get("data"):
                Log.error(
                    "Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
                    url=url,
                    body=strings.limit(kwargs["data"], 100 if self.debug else 10000),
                    cause=e
                )
            else:
                Log.error("Problem with call to {{url}}" + suggestion, url=url, cause=e)
Example #5
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    json_string = text(json_string)
    if not is_text(json_string) and json_string.__class__.__name__ != "FileString":
        Log.error("only unicode json accepted")

    try:
        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        if flexible:
            value = hjson2value(json_string)
        else:
            value = to_data(json_decoder(text(json_string)))

        if leaves:
            value = leaves_to_data(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = "   " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error(CAN_NOT_DECODE_JSON + " at:\n\t{{sample}}\n\t{{pointer}}\n", sample=sample, pointer=pointer)

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + "  ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception:
            char_str = " "
        Log.error(CAN_NOT_DECODE_JSON + ":\n{{char_str}}\n{{hexx_str}}\n", char_str=char_str, hexx_str=hexx_str, cause=e)
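The sample/pointer construction above can be exercised on its own; a small self-contained sketch, assuming the line and column were already parsed out of the decoder's error message:

def point_at(json_string, line_index, column):
    line = json_string.split("\n")[line_index].replace("\t", " ")
    if column > 20:
        # show a 20-character window ending at the error column
        sample = "..." + line[column - 20:]
        pointer = "   " + (" " * 20) + "^"
    else:
        sample = line
        pointer = (" " * column) + "^"
    if len(sample) > 43:
        sample = sample[:43] + "..."
    return sample + "\n" + pointer

print(point_at('{"a": 1,}', 0, 8))  # caret lands under column 8, the closing brace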
Example #6
 def write(self, template, params):
     if params.get("template"):
         # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
         self.queue.add({"value": params})
     else:
         template = strings.limit(template, 2000)
         self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
     return self
Example #7
 def write(self, template, params):
     try:
         params.template = strings.limit(params.template, 2000)
         params.format = None
         self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
     except Exception as e:
         sys.stdout.write(text_type(Except.wrap(e)))
     return self
Example #8
 def write(self, template, params):
     try:
         params.template = strings.limit(params.template, 2000)
         params.format = None
         self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
     except Exception as e:
         sys.stdout.write(text(Except.wrap(e)))
     return self
Example #9
def minimize_repo(repo):
    """
    RETURN A MINIMAL VERSION OF THIS CHANGESET
    """
    if repo == None:
        return Null
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description, 1000)
    return output
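Nearly every example on this page calls strings.limit; a plain-Python sketch of the behavior assumed here (mo-logs replaces the tail of an over-long string with "..."):

def limit(text, max_length):
    # return text unchanged when it fits, otherwise truncate and mark the cut
    if text is None or len(text) <= max_length:
        return text
    return text[:max_length - 3] + "..."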
Example #10
def minimize_repo(repo):
    """
    RETURN A MINIMAL VERSION OF THIS CHANGESET
    """
    if repo == None:
        return Null
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description,
                                                 1000)
    return output
Example #11
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if isinstance(value, Mapping):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)

        return {k: _deep_json_to_string(v, depth - 1) for k, v in value.items()}
    elif isinstance(value, list):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, (float, int, long)):
        return value
    elif isinstance(value, basestring):
        return strings.limit(value, LOG_STRING_LENGTH)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
Example #12
    def _normalize_revision(self, r, found_revision, push, get_diff):
        new_names = set(r.keys()) - {"rev", "node", "user", "description", "desc", "date", "files", "backedoutby", "parents", "children", "branch", "tags", "pushuser", "pushdate", "pushid", "phase", "bookmarks"}
        if new_names and not r.tags:
            Log.warning("hg is returning new property names ({{names}})", names=new_names)

        changeset = Changeset(
            id=r.node,
            id12=r.node[0:12],
            author=r.user,
            description=strings.limit(coalesce(r.description, r.desc), 2000),
            date=parse_hg_date(r.date),
            files=r.files,
            backedoutby=r.backedoutby if r.backedoutby else None,
            bug=self._extract_bug_id(r.description)
        )
        rev = Revision(
            branch=found_revision.branch,
            index=r.rev,
            changeset=changeset,
            parents=unwraplist(list(set(r.parents))),
            children=unwraplist(list(set(r.children))),
            push=push,
            phase=r.phase,
            bookmarks=unwraplist(r.bookmarks),
            etl={"timestamp": Date.now().unix, "machine": machine_metadata}
        )

        r.pushuser = None
        r.pushdate = None
        r.pushid = None
        r.node = None
        r.user = None
        r.desc = None
        r.description = None
        r.date = None
        r.files = None
        r.backedoutby = None
        r.parents = None
        r.children = None
        r.bookmarks = None

        set_default(rev, r)

        # ADD THE DIFF
        if get_diff or GET_DIFF:
            rev.changeset.diff = self._get_json_diff_from_hg(rev)

        try:
            _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
            with self.es_locker:
                self.es.add({"id": _id, "value": rev})
        except Exception as e:
            Log.warning("did not save to ES", cause=e)

        return rev
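The null-out-then-merge pattern above relies on set_default (from mo-dots) to copy into rev only the properties rev does not already have; a plain-dict sketch of the idea (the real function also merges nested structures):

def set_default(dest, source):
    # fill holes in dest from source; existing (non-None) values win
    for k, v in source.items():
        if v is not None and dest.get(k) is None:
            dest[k] = v
    return dest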
Example #13
 def delete(self, path, **kwargs):
     url = self.settings.host + ":" + unicode(self.settings.port) + path
     try:
         response = http.delete(url, **kwargs)
         if response.status_code not in [200]:
             Log.error(response.reason+": "+response.all_content)
         if self.debug:
             Log.note("response: {{response}}", response=strings.limit(utf82unicode(response.all_content), 130))
         details = wrap(mo_json.json2value(utf82unicode(response.all_content)))
         if details.error:
             Log.error(details.error)
         return details
     except Exception as e:
         Log.error("Problem with call to {{url}}", url=url, cause=e)
Example #14
 def delete(self, path, **kwargs):
     url = self.settings.host + ":" + unicode(self.settings.port) + path
     try:
         response = http.delete(url, **kwargs)
         if response.status_code not in [200]:
             Log.error(response.reason+": "+response.all_content)
         if self.debug:
             Log.note("response: {{response}}", response=strings.limit(utf82unicode(response.all_content), 130))
         details = wrap(mo_json.json2value(utf82unicode(response.all_content)))
         if details.error:
             Log.error(details.error)
         return details
     except Exception as e:
         Log.error("Problem with call to {{url}}", url=url, cause=e)
Example #15
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if is_data(value):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)

        return {k: _deep_json_to_string(v, depth - 1) for k, v in value.items()}
    elif is_sequence(value):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, number_types):
        return value
    elif is_text(value):
        return strings.limit(value, LOG_STRING_LENGTH)
    elif is_binary(value):
        return strings.limit(bytes2base64(value), LOG_STRING_LENGTH)
    elif isinstance(value, (date, datetime)):
        return datetime2unix(value)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
Example #16
def _deep_json_to_string(value, depth):
    """
    :param value: SOME STRUCTURE
    :param depth: THE MAX DEPTH OF PROPERTIES, DEEPER WILL BE STRING-IFIED
    :return: FLATTER STRUCTURE
    """
    if isinstance(value, Mapping):
        if depth == 0:
            return strings.limit(value2json(value), LOG_STRING_LENGTH)

        return {
            k: _deep_json_to_string(v, depth - 1)
            for k, v in value.items()
        }
    elif isinstance(value, (list, FlatList)):
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
    elif isinstance(value, number_types):
        return value
    elif isinstance(value, text_type):
        return strings.limit(value, LOG_STRING_LENGTH)
    elif isinstance(value, binary_type):
        return strings.limit(bytes2base64(value), LOG_STRING_LENGTH)
    else:
        return strings.limit(value2json(value), LOG_STRING_LENGTH)
Example #17
def minimize_repo(repo):
    # output = set_default({}, _exclude_from_repo, repo)
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description, 1000)
    return output
Example #18
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py# L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = "   " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error("Can not decode JSON at:\n\t" + sample + "\n\t" + pointer + "\n")

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + "  ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception as e:
            char_str = " "
        Log.error("Can not decode JSON:\n" + char_str + "\n" + hexx_str + "\n", e)
Example #19
    def extend(self, records):
        """
        records - MUST HAVE FORM OF
            [{"value":value}, ... {"value":value}] OR
            [{"json":json}, ... {"json":json}]
            OPTIONAL "id" PROPERTY IS ALSO ACCEPTED
        """
        if self.settings.read_only:
            Log.error("Index opened in read only mode, no changes allowed")
        lines = []
        try:
            for r in records:
                id = r.get("id")
                r_value = r.get('value')
                if id == None and r_value:
                    id = r_value.get('_id')
                if id == None:
                    id = random_id()

                if "json" in r:
                    json_bytes = r["json"].encode("utf8")
                elif r_value or isinstance(r_value, (dict, Data)):
                    json_bytes = convert.value2json(r_value).encode("utf8")
                else:
                    json_bytes = None
                    Log.error("Expecting every record given to have \"value\" or \"json\" property")

                lines.append(b'{"index":{"_id": ' + convert.value2json(id).encode("utf8") + b'}}')
                if self.settings.tjson:
                    lines.append(json2typed(json_bytes.decode('utf8')).encode('utf8'))
                else:
                    lines.append(json_bytes)
            del records

            if not lines:
                return

            with Timer("Add {{num}} documents to {{index}}", {"num": len(lines) / 2, "index":self.settings.index}, debug=self.debug):
                try:
                    data_bytes = b"\n".join(l for l in lines) + b"\n"
                except Exception as e:
                    Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)

                response = self.cluster.post(
                    self.path + "/_bulk",
                    data=data_bytes,
                    headers={"Content-Type": "text"},
                    timeout=self.settings.timeout,
                    retry=self.settings.retry,
                    params={"consistency": self.settings.consistency}
                )
                items = response["items"]

                fails = []
                if self.cluster.version.startswith("0.90."):
                    for i, item in enumerate(items):
                        if not item.index.ok:
                            fails.append(i)
                elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
                    for i, item in enumerate(items):
                        if item.index.status not in [200, 201]:
                            fails.append(i)
                else:
                    Log.error("version not supported {{version}}", version=self.cluster.version)

                if fails:
                    if len(fails) <= 3:
                        cause = [
                            Except(
                                template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                                status=items[i].index.status,
                                error=items[i].index.error,
                                some=len(fails) - 1,
                                line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                                index=self.settings.index,
                                id=items[i].index._id
                            )
                            for i in fails
                        ]
                    else:
                        i = fails[0]
                        cause = Except(
                            template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                            status=items[i].index.status,
                            error=items[i].index.error,
                            some=len(fails) - 1,
                            line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                            index=self.settings.index,
                            id=items[i].index._id
                        )
                    Log.error("Problems with insert", cause=cause)

        except Exception as e:
            if e.message.startswith("sequence item "):
                Log.error("problem with {{data}}", data=repr(lines[int(e.message[14:16].strip())]), cause=e)
            Log.error("problem sending to ES", e)
Example #20
    def extract(self, settings, force, restart, start, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {"or": [
                        {"gt": {"last_modified": Date(last_modified)}},
                        {"and": [
                            {"eq": {"last_modified": Date(last_modified)}},
                            {"gt": {"id": job_id}},
                        ]},
                    ]},
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACED ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)
                for r in acc:
                    r.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Example #21
def diff_to_json(unified_diff):
    """
    CONVERT UNIFIED DIFF TO EASY-TO-STORE JSON FORMAT
    :param unified_diff: text
    :return: JSON details
    """
    output = []
    files = FILE_SEP.split(unified_diff)[1:]
    for file_ in files:
        changes = []
        old_file_header, new_file_header, file_diff = file_.split("\n", 2)
        old_file_path = old_file_header[1:]  # eg old_file_header == "a/testing/marionette/harness/marionette_harness/tests/unit/unit-tests.ini"
        new_file_path = new_file_header[5:]  # eg new_file_header == "+++ b/tests/resources/example_file.py"

        c = 0, 0
        hunks = HUNK_SEP.split(file_diff)[1:]
        for hunk in hunks:
            line_diffs = hunk.split("\n")
            old_start, old_length, new_start, new_length = HUNK_HEADER.match(
                line_diffs[0]).groups()
            next_c = max(0, int(new_start) - 1), max(0, int(old_start) - 1)
            if next_c[0] - next_c[1] != c[0] - c[1]:
                Log.error("expecting a skew of {{skew}}",
                          skew=next_c[0] - next_c[1])
            if c[0] > next_c[0]:
                Log.error("can not handle out-of-order diffs")
            while c[0] != next_c[0]:
                c = no_change(c)

            for line in line_diffs[1:]:
                if not line:
                    continue
                if (line.startswith("new file mode")
                        or line.startswith("deleted file mode")
                        or line.startswith("index ")
                        or line.startswith("diff --git")):
                    # HAPPENS AT THE TOP OF NEW FILES
                    # diff --git a/security/sandbox/linux/SandboxFilter.cpp b/security/sandbox/linux/SandboxFilter.cpp
                    # u'new file mode 100644'
                    # u'deleted file mode 100644'
                    # index a763e390731f5379ddf5fa77090550009a002d13..798826525491b3d762503a422b1481f140238d19
                    # GIT binary patch
                    # literal 30804
                    break
                d = line[0]
                if d == '+':
                    changes.append({
                        "new": {
                            "line": int(c[0]),
                            "content": strings.limit(line[1:],
                                                     MAX_CONTENT_LENGTH)
                        }
                    })
                elif d == '-':
                    changes.append({
                        "old": {
                            "line": int(c[1]),
                            "content": strings.limit(line[1:],
                                                     MAX_CONTENT_LENGTH)
                        }
                    })
                try:
                    c = MOVE[d](c)
                except Exception as e:
                    Log.warning("bad line {{line|quote}}", line=line, cause=e)

        output.append({
            "new": {
                "name": new_file_path
            },
            "old": {
                "name": old_file_path
            },
            "changes": changes
        })
    return wrap(output)
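FILE_SEP, HUNK_SEP, and HUNK_HEADER are not shown above; plausible definitions, assuming standard unified-diff output (these exact patterns are an assumption, not the library's):

import re

FILE_SEP = re.compile(r"^--- ", re.MULTILINE)   # one split per changed file (assumed)
HUNK_SEP = re.compile(r"^@@ ", re.MULTILINE)    # one split per hunk (assumed)
# groups: old_start, old_length, new_start, new_length
HUNK_HEADER = re.compile(r"-(\d+),(\d+) \+(\d+),(\d+) @@")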
Example #22
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py# L58
            json_string = re.sub(r"\"\"\".*?\"\"\"",
                                 r"\n",
                                 json_string,
                                 flags=re.MULTILINE)
            json_string = "\n".join(
                remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}",
                      content=json_string,
                      cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ",
                                             " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = "   " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error("Can not decode JSON at:\n\t" + sample + "\n\t" +
                      pointer + "\n")

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + "  ".join(
                (c.decode("latin1") if ord(c) >= 32 else ".")
                for c in base_str)
        except Exception as e:
            char_str = " "
        Log.error("Can not decode JSON:\n" + char_str + "\n" + hexx_str + "\n",
                  e)
Example #23
    def _normalize_revision(self, r, found_revision, push, get_diff,
                            get_moves):
        new_names = set(r.keys()) - KNOWN_TAGS
        if new_names and not r.tags:
            Log.warning(
                "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
                names=new_names,
                changeset=r.node,
                url=found_revision.branch.url,
            )

        changeset = Changeset(
            id=r.node,
            id12=r.node[0:12],
            author=coalesce(r.author, r.user),
            description=strings.limit(coalesce(r.description, r.desc), 2000),
            date=parse_hg_date(r.date),
            files=r.files,
            backedoutby=r.backedoutby,
            backsoutnodes=r.backsoutnodes,
            bug=mo_math.UNION(([int(b) for b in r.bugs.no],
                               self._extract_bug_id(r.description))),
        )
        rev = Revision(
            branch=found_revision.branch,
            index=r.rev,
            changeset=changeset,
            parents=set(r.parents),
            children=set(r.children),
            push=push,
            phase=r.phase,
            bookmarks=unwraplist(r.bookmarks),
            landingsystem=r.landingsystem,
            etl={
                "timestamp": Date.now().unix,
                "machine": machine_metadata
            },
        )
        rev = elasticsearch.scrub(rev)

        r.pushuser = None
        r.pushdate = None
        r.pushid = None
        r.node = None
        r.user = None
        r.desc = None
        r.description = None
        r.date = None
        r.files = None
        r.backedoutby = None
        r.parents = None
        r.children = None
        r.bookmarks = None
        r.landingsystem = None
        r.extra = None
        r.author = None
        r.pushhead = None
        r.reviewers = None
        r.bugs = None
        r.treeherderrepourl = None
        r.backsoutnodes = None
        r.treeherderrepo = None
        r.perfherderurl = None
        r.branch = None
        r.phase = None
        r.rev = None
        r.tags = None

        set_default(rev, r)

        # ADD THE DIFF
        if get_diff:
            rev.changeset.diff = self._get_json_diff_from_hg(rev)

        try:
            _id = (coalesce(rev.changeset.id12, "") + "-" + rev.branch.name +
                   "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE))
            with self.repo_locker:
                self.repo.add({"id": _id, "value": rev})
            if get_moves:
                rev.changeset.moves = self._get_moves_from_hg(rev)
                with self.moves_locker:
                    self.moves.add({"id": _id, "value": rev})
        except Exception as e:
            e = Except.wrap(e)
            Log.warning(
                "Did not save to ES, waiting {{duration}} seconds",
                duration=WAIT_AFTER_NODE_FAILURE,
                cause=e,
            )
            Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
            if "FORBIDDEN/12/index read-only" in e:
                pass  # KNOWN FAILURE MODE

        return rev
Example #24
    def extend(self, records):
        """
        records - MUST HAVE FORM OF
            [{"value":value}, ... {"value":value}] OR
            [{"json":json}, ... {"json":json}]
            OPTIONAL "id" PROPERTY IS ALSO ACCEPTED
        """
        if self.settings.read_only:
            Log.error("Index opened in read only mode, no changes allowed")
        lines = []
        try:
            for r in records:
                id = r.get("id")
                r_value = r.get('value')
                if id == None and r_value:
                    id = r_value.get('_id')
                if id == None:
                    id = random_id()

                if "json" in r:
                    json_bytes = r["json"].encode("utf8")
                elif r_value or isinstance(r_value, (dict, Data)):
                    json_bytes = convert.value2json(r_value).encode("utf8")
                else:
                    json_bytes = None
                    Log.error("Expecting every record given to have \"value\" or \"json\" property")

                lines.append(b'{"index":{"_id": ' + convert.value2json(id).encode("utf8") + b'}}')
                if self.settings.tjson:
                    lines.append(json2typed(json_bytes.decode('utf8')).encode('utf8'))
                else:
                    lines.append(json_bytes)
            del records

            if not lines:
                return

            with Timer("Add {{num}} documents to {{index}}", {"num": len(lines) / 2, "index":self.settings.index}, debug=self.debug):
                try:
                    data_bytes = b"\n".join(l for l in lines) + b"\n"
                except Exception as e:
                    Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)

                response = self.cluster.post(
                    self.path + "/_bulk",
                    data=data_bytes,
                    headers={"Content-Type": "text"},
                    timeout=self.settings.timeout,
                    retry=self.settings.retry,
                    params={"consistency": self.settings.consistency}
                )
                items = response["items"]

                fails = []
                if self.cluster.version.startswith("0.90."):
                    for i, item in enumerate(items):
                        if not item.index.ok:
                            fails.append(i)
                elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
                    for i, item in enumerate(items):
                        if item.index.status not in [200, 201]:
                            fails.append(i)
                else:
                    Log.error("version not supported {{version}}", version=self.cluster.version)

                if fails:
                    if len(fails) <= 3:
                        cause = [
                            Except(
                                template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                                status=items[i].index.status,
                                error=items[i].index.error,
                                some=len(fails) - 1,
                                line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                                index=self.settings.index,
                                id=items[i].index._id
                            )
                            for i in fails
                        ]
                    else:
                        i = fails[0]
                        cause = Except(
                            template="{{status}} {{error}} (and {{some}} others) while loading line id={{id}} into index {{index|quote}}:\n{{line}}",
                            status=items[i].index.status,
                            error=items[i].index.error,
                            some=len(fails) - 1,
                            line=strings.limit(lines[i * 2 + 1], 500 if not self.debug else 100000),
                            index=self.settings.index,
                            id=items[i].index._id
                        )
                    Log.error("Problems with insert", cause=cause)

        except Exception as e:
            if e.message.startswith("sequence item "):
                Log.error("problem with {{data}}", data=repr(lines[int(e.message[14:16].strip())]), cause=e)
            Log.error("problem sending to ES", e)
Example #25
    def _normalize_revision(self, r, found_revision, push, get_diff, get_moves):
        new_names = set(r.keys()) - KNOWN_TAGS
        if new_names and not r.tags:
            Log.warning(
                "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
                names=new_names,
                changeset=r.node,
                url=found_revision.branch.url
            )

        changeset = Changeset(
            id=r.node,
            id12=r.node[0:12],
            author=r.user,
            description=strings.limit(coalesce(r.description, r.desc), 2000),
            date=parse_hg_date(r.date),
            files=r.files,
            backedoutby=r.backedoutby if r.backedoutby else None,
            bug=self._extract_bug_id(r.description)
        )
        rev = Revision(
            branch=found_revision.branch,
            index=r.rev,
            changeset=changeset,
            parents=unwraplist(list(set(r.parents))),
            children=unwraplist(list(set(r.children))),
            push=push,
            phase=r.phase,
            bookmarks=unwraplist(r.bookmarks),
            landingsystem=r.landingsystem,
            etl={"timestamp": Date.now().unix, "machine": machine_metadata}
        )

        r.pushuser = None
        r.pushdate = None
        r.pushid = None
        r.node = None
        r.user = None
        r.desc = None
        r.description = None
        r.date = None
        r.files = None
        r.backedoutby = None
        r.parents = None
        r.children = None
        r.bookmarks = None
        r.landingsystem = None

        set_default(rev, r)

        # ADD THE DIFF
        if get_diff:
            rev.changeset.diff = self._get_json_diff_from_hg(rev)
        if get_moves:
            rev.changeset.moves = self._get_moves_from_hg(rev)

        try:
            _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
            with self.es_locker:
                self.es.add({"id": _id, "value": rev})
        except Exception as e:
            e = Except.wrap(e)
            Log.warning("Did not save to ES, waiting {{duration}} seconds", duration=WAIT_AFTER_NODE_FAILURE, cause=e)
            Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
            if "FORBIDDEN/12/index read-only" in e:
                pass  # KNOWN FAILURE MODE

        return rev
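The document id used for the ES writes above concatenates the short changeset id, branch name, and locale; a tiny illustration with made-up values:

id12, branch_name, locale = "0f3a9cbc2d11", "mozilla-central", "en-US"
_id = id12 + "-" + branch_name + "-" + locale
# "0f3a9cbc2d11-mozilla-central-en-US"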