Example #1
def test_extract_job(complex_job, extract_job_settings):
    """
    If this test fails, copy the JSON from the test failure into the test_extract_job.json file,
    then use the diff to review the changes.
    """
    with MySQL(extract_job_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_job_settings.source) as extractor:
            sql = extractor.get_sql(
                SQL("SELECT " + text(complex_job.id) + " as id"))

            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    doc = first(acc)
    doc.guid = first(JOB).guid  # NEW EACH TIME

    job_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for fl in jx.drill(doc, "job_log.failure_line"):
        fl.job_guid = job_guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
Example #2
def test_extract_alert(extract_alert_settings, test_perf_alert_summary,
                       test_perf_alert):
    """
    If this test fails, copy the JSON from the test failure into the test_extract_alerts.json file,
    then use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(
        SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    # TESTS ARE RUN WITH CURRENT TIMESTAMPS
    doc.created = now
    doc.last_updated = now
    for d in doc.details:
        d.created = now
        d.last_updated = now
        d.series_signature.last_updated = now

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
Example #3
    def add(self, aggs, acc, query, decoders, selects):
        self.result = format_table_from_groupby(aggs, acc, query, decoders, selects)
        # CONFIRM HEADER MATCH
        if self.header:
            assertAlmostEqual(self.header, self.result.header)
        else:
            self.header = self.result.header
Example #4
def parse_sql(sql):
    query = wrap(moz_sql_parser.parse(sql))
    # PULL OUT THE AGGREGATES
    for s in listwrap(query.select):
        val = s if s == "*" else s.value
        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                s.value = None  # MARK FOR REMOVAL
                break
            except Exception:
                pass

        if isinstance(val, Mapping):
            for a in KNOWN_SQL_AGGREGATES:
                if val[a]:
                    s.aggregate = a
                    s.value = val[a]
    query.select = [
        s for s in listwrap(query.select) if s == "*" or s.value != None
    ]
    query.format = "table"
    return query
Example #5
def compare_to_expected(query, result, expect, places):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        try:
            assertAlmostEqual(set(result.header), set(expect.header))
        except Exception as e:
            Log.error("format=table headers do not match", cause=e)

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = transpose(*transpose(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = transpose(*unwrap(result.data))
            result.data = transpose(*(columns[m] for m in mapping))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if is_list(expect.data):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception as _:
                    pass

            if is_list(result.data):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception as _:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_header = list(map(literal_field, result_header))  # LIST, NOT ITERATOR: USED TWICE BELOW
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_header = list(map(literal_field, expect_header))  # LIST, NOT ITERATOR: USED TWICE BELOW
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)
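
The mapping expression in this example is dense: it pairs headers with equal names across expect and result, then keeps the result-side indices, ordered by expected position. A standalone sketch of the same idea in plain Python (the headers are illustrative, and no mo-dots helpers are used):

# STANDALONE SKETCH OF THE HEADER-MAPPING TRICK ABOVE; HEADERS ARE ILLUSTRATIVE
expect_header = ["a", "b", "c"]
result_header = ["c", "a", "b"]
pairs = [
    (e_i, r_i)
    for e_i, e_name in enumerate(expect_header)
    for r_i, r_name in enumerate(result_header)
    if e_name == r_name
]  # PRODUCT ORDER ALREADY SORTS PAIRS BY EXPECTED POSITION
mapping = [r_i for _, r_i in pairs]              # [1, 2, 0]
reordered = [result_header[m] for m in mapping]  # ["a", "b", "c"]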
Example #6
def compare_to_expected(query, result, expect):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = list(zip(*list(zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        )))[1]))[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = list(zip(*unwrap(result.data)))
            result.data = list(zip(*[columns[m] for m in mapping]))  # MATERIALIZE; py3 zip IS A ONE-SHOT ITERATOR

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query, query.frum, query.schema)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception as _:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=6)
Example #7
def allclose(a, b):
    try:
        from mo_testing.fuzzytestcase import assertAlmostEqual

        assertAlmostEqual(a, b)
        return True
    except Exception:
        return False
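
A minimal usage sketch (values are illustrative). Because assertAlmostEqual recurses into dicts and lists, allclose compares whole structures, not just scalars:

# ILLUSTRATIVE VALUES ONLY
print(allclose({"x": 1.0, "y": [1, 2, 3]}, {"x": 1.0, "y": [1, 2, 3]}))  # True
print(allclose({"x": 1.0}, {"x": 2.0}))  # False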
Example #8
def parse_sql(sql):
    # TODO: CONVERT tuple OF LITERALS INTO LITERAL LIST
    # # IF ALL MEMBERS OF A LIST ARE LITERALS, THEN MAKE THE LIST LITERAL
    # if all(isinstance(r, number_types) for r in output):
    #     pass
    # elif all(isinstance(r, number_types) or (is_data(r) and "literal" in r.keys()) for r in output):
    #     output = {"literal": [r['literal'] if is_data(r) else r for r in output]}
    query = wrap(moz_sql_parser.parse(sql))
    redundant_select = []
    # PULL OUT THE AGGREGATES
    for s in listwrap(query.select):
        val = s if s == '*' else s.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if is_data(val):
            for a in KNOWN_SQL_AGGREGATES:
                value = val[a]
                if value != None:
                    if is_list(value):
                        # AGGREGATE WITH PARAMETERS  EG percentile(value, 0.90)
                        s.aggregate = a
                        s[a] = unwraplist(value[1::])
                        s.value = value[0]
                    else:
                        # SIMPLE AGGREGATE
                        s.aggregate = a
                        s.value = value
                    break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                redundant_select.append(s)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    if is_list(query.select):
        for r in redundant_select:
            query.select.remove(r)
    elif query.select and redundant_select:
        query.select = None

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None
    query.format = "table"
    return query
Example #9
def parse_sql(sql):
    query = wrap(moz_sql_parser.parse(sql))
    query.select = listwrap(query.select)

    redundant_select = []
    # PULL OUT THE AGGREGATES
    for s in query.select:
        val = s if s == '*' else s.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if isinstance(val, Mapping):
            for a in KNOWN_SQL_AGGREGATES:
                value = val[a]
                if value != None:
                    s.aggregate = a
                    if isinstance(value, list):
                        # AGGREGATE WITH PARAMETERS  EG percentile(value, 0.90)
                        s[a] = unwraplist(value[1::])
                        s.value = value[0]
                    elif isinstance(value, Mapping):
                        # EXPRESSION
                        if len(value.keys()) == 0:
                            s.value = None
                        else:
                            s.value = value
                    else:
                        # SIMPLE VALUE
                        s.value = value
                    break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                redundant_select.append(s)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    for r in redundant_select:
        query.select.remove(r)

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None
    query.format = "table"
    return query
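
For orientation, a hypothetical call; the exact parse tree depends on the moz_sql_parser version in use:

# HYPOTHETICAL USAGE; OUTPUT SHAPE DEPENDS ON THE moz_sql_parser VERSION
query = parse_sql("SELECT a, count(b) FROM t GROUP BY a ORDER BY a")
# EXPECT: THE DUPLICATE a DROPPED FROM select AND ITS NAME MOVED TO groupby,
# THE count AGGREGATE PULLED ONTO s.aggregate, orderby RENAMED TO sort,
# AND format SET TO "table"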
Example #10
def fix(rownum, line, source, sample_only_filter, sample_size):
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e
            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if (
            sample_only_filter
            and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0
            and jx.filter([value], sample_only_filter)
        ):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False
Example #11
def test_extract_job(complex_job, extract_job_settings, now):
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid
    doc.last_modified = complex_job.last_modified

    assertAlmostEqual(
        acc, JOB, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
Example #12
    def __eq__(self, other):
        if other == None:
            return self._json == "null"
        elif self._json == "null":
            return False

        Log.warning("expensive")

        from mo_testing.fuzzytestcase import assertAlmostEqual

        try:
            assertAlmostEqual(json2value(self._json), other)
            return True
        except Exception:
            return False
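
The first branch is a fast path: comparison against None is decided from the raw JSON string without parsing. Anything else forces json2value and a fuzzy comparison, hence the "expensive" warning. A hypothetical usage sketch (the enclosing lazy-JSON class is not shown in this example, so LazyJson is an assumed name):

# HYPOTHETICAL; LazyJson STANDS IN FOR THE ENCLOSING CLASS, WHICH IS NOT SHOWN
lazy = LazyJson('{"a": 1}')
lazy == None       # False: COMPARES self._json TO "null" WITHOUT PARSING
lazy == {"a": 1}   # True: PARSES THE JSON AND LOGS THE "expensive" WARNING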
Example #13
def test_extract_alert(extract_alert_settings, test_perf_alert_summary,
                       test_perf_alert):
    """
    If this test fails, copy the JSON from the test failure into the test_extract_alerts.json file,
    then use the diff to review the changes.
    """
    with MySQL(extract_alert_settings.source.database) as source:
        with MySqlSnowflakeExtractor(
                extract_alert_settings.source) as extractor:
            sql = extractor.get_sql(
                SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
Example #14
def test_extract_job(complex_job, extract_job_settings, now):
    """
    If this test fails, copy the JSON from the test failure into the test_extract_job.json file,
    then use the diff to review the changes.
    """
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
Example #15
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(str(key + ".json.gz"))

        if VERIFY_UPLOAD:
            lines = list(lines)

        with mo_files.TempFile() as tempfile:
            with open(tempfile.abspath, "wb") as buff:
                DEBUG and Log.note("Temp file {{filename}}",
                                   filename=tempfile.abspath)
                archive = gzip.GzipFile(filename=str(key + ".json"),
                                        fileobj=buff,
                                        mode="w")
                count = 0
                for l in lines:
                    if is_many(l):
                        for ll in l:
                            archive.write(ll.encode("utf8"))
                            archive.write(b"\n")
                            count += 1
                    else:
                        archive.write(l.encode("utf8"))
                        archive.write(b"\n")
                        count += 1
                archive.close()

            retry = 3
            while retry:
                try:
                    with Timer(
                            "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                        {
                            "key": key,
                            "file_length": tempfile.length,
                            "count": count
                        },
                            verbose=self.settings.debug,
                    ):
                        storage.set_contents_from_filename(
                            tempfile.abspath,
                            headers={"Content-Type": mimetype.GZIP})
                    break
                except Exception as e:
                    e = Except.wrap(e)
                    retry -= 1
                    if (retry == 0 or "Access Denied" in e
                            or "No space left on device" in e):
                        Log.error("could not push data to s3", cause=e)
                    else:
                        Log.warning("could not push data to s3, will retry",
                                    cause=e)

            if self.settings.public:
                storage.set_acl("public-read")

            if VERIFY_UPLOAD:
                try:
                    with open(tempfile.abspath, mode="rb") as source:
                        result = list(ibytes2ilines(
                            scompressed2ibytes(source)))
                        assertAlmostEqual(result,
                                          lines,
                                          msg="file is different")

                    # full_url = "https://"+self.name+".s3-us-west-2.amazonaws.com/"+storage.key.replace(":", "%3A")
                    # https://active-data-test-result.s3-us-west-2.amazonaws.com/tc.1524896%3A152488763.0.json.gz

                    # dest_bucket = s3.MultiBucket(bucket="self.name", kwargs=self.settings.aws)

                    result = list(self.read_lines(strip_extension(key)))
                    assertAlmostEqual(result,
                                      lines,
                                      msg="S3 is different")

                except Exception as e:
                    from activedata_etl.transforms import TRY_AGAIN_LATER

                    Log.error(TRY_AGAIN_LATER,
                              reason="did not pass verification",
                              cause=e)
        return
Example #16
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception as _:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)


def cube2list(cube):
    """
    RETURNS header SO THAT THE ORIGINAL CUBE CAN BE RECREATED
    :param cube: A dict WITH VALUES BEING A MULTIDIMENSIONAL ARRAY OF UNIFORM VALUES
    :return: (rows, header) TUPLE
    """
    header = list(unwrap(cube).keys())
    rows = []
    for r in zip(*[[(k, v) for v in a] for k, a in cube.items()]):
        row = Data()
        for k, v in r:
            row[k] = v
        rows.append(unwrap(row))
    return rows, header
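
list2cube is not shown in this excerpt; under the same assumptions (uniform rows keyed by the returned header), a minimal sketch of the inverse might look like:

def list2cube(rows, header):
    # SKETCH ONLY; THE REAL list2cube IS NOT PART OF THIS EXCERPT.
    # REBUILD THE COLUMN-ORIENTED cube FROM THE ROW-ORIENTED rows.
    return {k: [row[k] for row in rows] for k in header}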