Ejemplo n.º 1
0
def get_all_signatures(db_config, sql):
    """
    RETURN ALL SIGNATURES FROM PERFHERDER DATABASE
    """
    db = MySQL(db_config)
    with db:
        return db.query(text(sql))
Ejemplo n.º 2
0
 def setUpClass(cls):
     Log.start(settings.debug)
     with Timer("setup database"):
         try:
             with MySQL(schema=None, kwargs=settings.database) as db:
                 db.query("drop database testing")
         except Exception as e:
             if "Can't drop database " in e:
                 pass
             else:
                 Log.warning("problem removing db", cause=e)
         MySQL.execute_file("tests/resources/database.sql",
                            schema=None,
                            kwargs=settings.database)
Ejemplo n.º 3
0
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.settings.exclude = set(self.settings.exclude)
        self.settings.show_foreign_keys = coalesce(
            self.settings.show_foreign_keys, True)

        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            with self.db:
                with self.db.transaction():
                    self._scan_database()
Ejemplo n.º 4
0
    def test_lean_inline_all(self):
        config = set_default(
            {
                "extract": {
                    "ids": "select * from fact_table"
                },
                "snowflake": {
                    "show_foreign_keys": False,
                    "reference_only": ["inner1.value", "inner2.value"]
                }
            }, config_template)
        db = MySQL(**config.snowflake.database)
        data = [10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22]
        Extract(kwargs=config).extract(db=db,
                                       start_point=Null,
                                       first_value=Null,
                                       data=data,
                                       please_stop=Null)

        result = File(filename).read_json()
        for r in result:
            r.etl = None
        expected = expected_results["lean_inline_all"]
        self.assertEqual(result, expected, "expecting identical")
        self.assertEqual(expected, result, "expecting identical")
Ejemplo n.º 5
0
 def extract(please_stop):
     with MySQL(**settings.snowflake.database) as db:
         with db.transaction():
             for kwargs in extractor.queue:
                 if please_stop:
                     break
                 try:
                     extractor.extract(db=db,
                                       please_stop=please_stop,
                                       **kwargs)
                 except Exception as e:
                     Log.warning("Could not extract", cause=e)
                     extractor.queue.add(kwargs)
Ejemplo n.º 6
0
    def __init__(self,
                 host,
                 port,
                 username,
                 password,
                 debug=False,
                 schema=None,
                 preamble=None,
                 readonly=False,
                 kwargs=None):
        from pyLibrary.sql.mysql import MySQL

        self.settings = kwargs
        self._db = MySQL(kwargs)
Ejemplo n.º 7
0
    def test_complex(self):
        config = config_template
        db = MySQL(**config.snowflake.database)
        Extract(kwargs=config).extract(db=db,
                                       start_point=Null,
                                       first_value=Null,
                                       data=[10],
                                       please_stop=Null)

        result = File(filename).read_json()
        result[0].etl = None
        expected = expected_results["complex"]
        self.assertEqual(result, expected, "expecting identical")
        self.assertEqual(expected, result, "expecting identical")
Ejemplo n.º 8
0
    def test_lean(self):
        config = set_default({"snowflake": {
            "show_foreign_keys": False
        }}, config_template)
        db = MySQL(**config.snowflake.database)
        Extract(kwargs=config).extract(db=db,
                                       start_point=Null,
                                       first_value=Null,
                                       data=[10],
                                       please_stop=Null)

        result = File(filename).read_json()
        result[0].etl = None
        expected = expected_results["lean"]
        self.assertEqual(result, expected, "expecting identical")
        self.assertEqual(expected, result, "expecting identical")
Ejemplo n.º 9
0
    def test_inline(self):
        config = set_default(
            {
                "snowflake": {
                    "reference_only": ["inner1.value", "inner2.value"]
                }
            }, config_template)
        db = MySQL(**config.snowflake.database)
        Extract(kwargs=config).extract(db=db,
                                       start_point=Null,
                                       first_value=Null,
                                       data=[10],
                                       please_stop=Null)

        result = File(filename).read_json()
        result[0].etl = None
        expected = expected_results["inline"]
        self.assertEqual(result, expected, "expecting identical")
        self.assertEqual(expected, result, "expecting identical")
Ejemplo n.º 10
0
def get_dataum(db_config, signature_id):
    db = MySQL(db_config)
    with db:
        return db.query(expand_template(datum_sql, quote_list(listwrap(signature_id))))
Ejemplo n.º 11
0
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"), {
                        "and": [{
                            "neq": {
                                "Command": "Sleep"
                            }
                        }, {
                            "neq": {
                                "Info": "show processlist"
                            }
                        }]
                    })
            except Exception as e:
                Log.warning("no database", cause=e)

            if processes:
                if DEBUG:
                    Log.warning("Processes are running\n{{list|json}}",
                                list=processes)
                else:
                    Log.error("Processes are running\n{{list|json}}",
                              list=processes)

        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
                len(extract.type) != len(other)
                for other in [extract.start, extract.batch, extract.field]):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        self.queue = Queue("all batches",
                           max=2 * coalesce(extract.threads, 1),
                           silent=True)

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)
Ejemplo n.º 12
0
    def pull_all_remaining(self, please_stop):
        try:
            try:
                content = File(self.settings.extract.last).read_json()
                if len(content) == 1:
                    Log.note("Got a manually generated file {{filename}}",
                             filename=self.settings.extract.last)
                    start_point = tuple(content[0])
                    first_value = [
                        self._extract.start[0] + (start_point[0] * DAY),
                        start_point[1]
                    ]
                else:
                    Log.note("Got a machine generated file {{filename}}",
                             filename=self.settings.extract.last)
                    start_point, first_value = content
                    start_point = tuple(start_point)
                Log.note("First value is {{start1|date}}, {{start2}}",
                         start1=first_value[0],
                         start2=first_value[1])
            except Exception as _:
                Log.error(
                    "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]",
                    filename=self.settings.extract.last)
                start_point = tuple(self._extract.start)
                first_value = Null

            counter = Counter(start=0)
            for t, s, b, f, i in reversed(
                    zip(self._extract.type, self._extract.start,
                        self._extract.batch,
                        listwrap(first_value) + DUMMY_LIST,
                        range(len(self._extract.start)))):
                if t == "time":
                    counter = DurationCounter(start=s,
                                              duration=b,
                                              child=counter)
                    first_value[i] = Date(f)
                else:
                    counter = BatchCounter(start=s, size=b, child=counter)

            batch_size = self._extract.batch.last(
            ) * 2 * self.settings.extract.threads
            with MySQL(**self.settings.snowflake.database) as db:
                while not please_stop:
                    sql = self._build_list_sql(db, first_value, batch_size + 1)
                    pending = []
                    counter.reset(start_point)
                    with Timer("Grab a block of ids for processing"):
                        with closing(db.db.cursor()) as cursor:
                            acc = []
                            cursor.execute(sql)
                            count = 0
                            for row in cursor:
                                detail_key = counter.next(row)
                                key = tuple(detail_key[:-1])
                                count += 1
                                if key != start_point:
                                    if first_value:
                                        if not acc:
                                            Log.error(
                                                "not expected, {{filename}} is probably set too far in the past",
                                                filename=self.settings.extract.
                                                last)
                                        pending.append({
                                            "start_point": start_point,
                                            "first_value": first_value,
                                            "data": acc
                                        })
                                    acc = []
                                    start_point = key
                                    first_value = row
                                acc.append(
                                    row[-1]
                                )  # ASSUME LAST COLUMN IS THE FACT TABLE id
                    Log.note("adding {{num}} for processing", num=len(pending))
                    self.queue.extend(pending)

                    if count < batch_size:
                        self.queue.add(THREAD_STOP)
                        break
        except Exception as e:
            Log.warning("Problem pulling data", cause=e)
        finally:
            self.done_pulling.go()
            Log.note("pulling new data is done")
Ejemplo n.º 13
0
class SnowflakeSchema(object):
    @override
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.settings.exclude = set(self.settings.exclude)
        self.settings.show_foreign_keys = coalesce(
            self.settings.show_foreign_keys, True)

        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            with self.db:
                with self.db.transaction():
                    self._scan_database()

    def get_sql(self, get_ids):
        sql = self._compose_sql(get_ids)

        # ORDERING
        sort = []
        ordering = []
        for ci, c in enumerate(self.columns):
            if c.sort:
                sort.append(quote_column(c.column_alias) + SQL_IS_NOT_NULL)
                sort.append(quote_column(c.column_alias))
                ordering.append(ci)

        union_all_sql = SQL_UNION_ALL.join(sql)
        union_all_sql = (SQL_SELECT + SQL_STAR + SQL_FROM +
                         sql_alias(sql_iso(union_all_sql), quote_column('a')) +
                         SQL_ORDERBY + sql_list(sort))
        return union_all_sql

    def _scan_database(self):
        # GET ALL RELATIONS
        raw_relations = self.db.query("""
            SELECT
                table_schema,
                table_name,
                referenced_table_schema,
                referenced_table_name,
                referenced_column_name,
                constraint_name,
                column_name,
                ordinal_position
            FROM
                information_schema.key_column_usage
            WHERE
                referenced_column_name IS NOT NULL
        """,
                                      param=self.settings.database)

        if not raw_relations:
            Log.error("No relations in the database")

        for r in self.settings.add_relations:
            try:
                a, b = map(strings.trim, r.split("->"))
                a = a.split(".")
                b = b.split(".")
                raw_relations.append(
                    Data(table_schema=a[0],
                         table_name=a[1],
                         referenced_table_schema=b[0],
                         referenced_table_name=b[1],
                         referenced_column_name=b[2],
                         constraint_name=Random.hex(20),
                         column_name=a[2],
                         ordinal_position=1))
            except Exception as e:
                Log.error("Could not parse {{line|quote}}", line=r, cause=e)

        relations = jx.select(raw_relations,
                              [{
                                  "name": "constraint.name",
                                  "value": "constraint_name"
                              }, {
                                  "name": "table.schema",
                                  "value": "table_schema"
                              }, {
                                  "name": "table.name",
                                  "value": "table_name"
                              }, {
                                  "name": "column.name",
                                  "value": "column_name"
                              }, {
                                  "name": "referenced.table.schema",
                                  "value": "referenced_table_schema"
                              }, {
                                  "name": "referenced.table.name",
                                  "value": "referenced_table_name"
                              }, {
                                  "name": "referenced.column.name",
                                  "value": "referenced_column_name"
                              }, {
                                  "name": "ordinal_position",
                                  "value": "ordinal_position"
                              }])

        # GET ALL TABLES
        raw_tables = self.db.query("""
            SELECT
                t.table_schema,
                t.table_name,
                c.constraint_name,
                c.constraint_type,
                k.column_name,
                k.ordinal_position
            FROM
                information_schema.tables t
            LEFT JOIN
                information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
            LEFT JOIN
                information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
            ORDER BY
                t.table_schema,
                t.table_name,
                c.constraint_name,
                k.ordinal_position,
                k.column_name
        """,
                                   param=self.settings.database)

        # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
        tables = UniqueIndex(keys=["name", "schema"])
        for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
            c = wrap(list(c))
            best_index = Null
            is_referenced = False
            is_primary = False
            for g, w in jx.groupby(c, "constraint_name"):
                if not g.constraint_name:
                    continue
                w = list(w)
                ref = False
                for r in relations:
                    if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                        ref = True
                is_prime = w[0].constraint_type == "PRIMARY"

                reasons_this_one_is_better = [
                    best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                    is_prime
                    and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                    is_primary == is_prime and ref and
                    not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                    is_primary == is_prime and ref == is_referenced and len(w)
                    < len(best_index)  # THE SHORTER THE TUPLE, THE BETTER
                ]
                if any(reasons_this_one_is_better):
                    is_primary = is_prime
                    is_referenced = ref
                    best_index = w

            tables.add({
                "name": t.table_name,
                "schema": t.table_schema,
                "id": [b.column_name for b in best_index]
            })

        fact_table = tables[self.settings.fact_table,
                            self.settings.database.schema]
        ids_table = {
            "alias": "t0",
            "name": "__ids__",
            "schema": fact_table.schema,
            "id": fact_table.id
        }
        relations.extend(
            wrap({
                "constraint": {
                    "name": "__link_ids_to_fact_table__"
                },
                "table": ids_table,
                "column": {
                    "name": c
                },
                "referenced": {
                    "table": fact_table,
                    "column": {
                        "name": c
                    }
                },
                "ordinal_position": i
            }) for i, c in enumerate(fact_table.id))
        tables.add(ids_table)

        # GET ALL COLUMNS
        raw_columns = self.db.query("""
            SELECT
                column_name,
                table_schema,
                table_name,
                ordinal_position,
                data_type
            FROM
                information_schema.columns
        """,
                                    param=self.settings.database)

        reference_only_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 2
        ]
        reference_all_tables = [
            r.split(".")[0] for r in self.settings.reference_only
            if len(r.split(".")) == 1
        ]
        foreign_column_table_schema_triples = {(r.column.name, r.table.name,
                                                r.table.schema)
                                               for r in relations}
        referenced_column_table_schema_triples = {
            (r.referenced.column.name, r.referenced.table.name,
             r.referenced.table.schema)
            for r in relations
        }
        related_column_table_schema_triples = foreign_column_table_schema_triples | referenced_column_table_schema_triples

        columns = UniqueIndex(["column.name", "table.name", "table.schema"])
        for c in raw_columns:
            if c.table_name in reference_only_tables:
                if c.table_name + "." + c.column_name in self.settings.reference_only:
                    include = True
                    reference = True
                    foreign = False
                elif c.column_name in tables[(c.table_name,
                                              c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = False
                    foreign = False
                else:
                    include = False
                    reference = False
                    foreign = False
            elif c.table_name in reference_all_tables:
                # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
                if c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = True
                    foreign = False
                elif (c.column_name, c.table_name,
                      c.table_schema) in foreign_column_table_schema_triples:
                    include = False
                    reference = False
                    foreign = True
                else:
                    include = True
                    reference = False
                    foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            elif (c.column_name, c.table_name,
                  c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            elif (c.column_name, c.table_name,
                  c.table_schema) in referenced_column_table_schema_triples:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = True
                reference = False
                foreign = False

            rel = {
                "column": {
                    "name": c.column_name,
                    "type": c.data_type
                },
                "table": {
                    "name": c.table_name,
                    "schema": c.table_schema
                },
                "ordinal_position": c.ordinal_position,
                "is_id": c.column_name
                in tables[(c.table_name, c.table_schema)].id,
                "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
                "reference":
                reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
                "foreign": foreign  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
            }
            columns.add(rel)

        # ITERATE OVER ALL PATHS
        todo = FlatList()
        output_columns = FlatList()
        nested_path_to_join = {}
        all_nested_paths = [["."]]

        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS)
                self.db.query("SELECT * FROM " +
                              quote_column(position.name, position.schema) +
                              " LIMIT 1")

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return

            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            # INNER OBJECTS
            referenced_tables = list(
                jx.groupby(
                    jx.filter(
                        relations, {
                            "eq": {
                                "table.name": position.name,
                                "table.schema": position.schema
                            }
                        }), "constraint.name"))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                })

                # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
                # HANDLE THE COMMON *id SUFFIX
                name = []
                for a, b in zip(constraint_columns.column.name,
                                constraint_columns.referenced.table.name):
                    if a.startswith(b):
                        name.append(b)
                    elif a.endswith("_id"):
                        name.append(a[:-3])
                    else:
                        name.append(a)
                referenced_column_path = join_field(
                    split_field(path) + ["/".join(name)])
                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[
                            0].referenced.table.name and col.table.schema == constraint_columns[
                                0].referenced.table.schema:
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                True,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })
                        elif col.column.name == constraint_columns[
                                0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                                if self.settings.show_foreign_keys else None
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_pointer_name
                                if not self.settings.show_foreign_keys else
                                col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias":
                                alias,
                                "column_alias":
                                "c" + text_type(c_index),
                                "column":
                                col,
                                "sort":
                                False,
                                "path":
                                referenced_column_path,
                                "nested_path":
                                nested_path,
                                "put":
                                col_full_name
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(position=copy(constraint_columns[0].referenced.table),
                         path=referenced_column_path,
                         nested_path=nested_path,
                         done_relations=copy(done_relations),
                         no_nested_docs=no_nested_docs))

            # NESTED OBJECTS
            if not no_nested_docs:
                for g, constraint_columns in jx.groupby(
                        jx.filter(
                            relations, {
                                "eq": {
                                    "referenced.table.name": position.name,
                                    "referenced.table.schema": position.schema
                                }
                            }), "constraint.name"):
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    referenced_column_path = join_field(
                        split_field(path) + ["/".join(many_table)])
                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    # if new_path not in self.settings.include:
                    #     Log.note("Exclude nested path {{path}}", path=new_path)
                    #     continue
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text_type(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default({}, g, {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path
                        }))
                    # insert into nested1 VALUES (100, 10, 'aaa', -1); # id.about.time.nested1 .ref=10# id.about.time.nested1 .ref.name
                    for col in columns:
                        if col.table.name == constraint_columns[
                                0].table.name and col.table.schema == constraint_columns[
                                    0].table.schema:
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[
                                    0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if
                                    self.settings.show_foreign_keys else None
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias":
                                    alias,
                                    "column_alias":
                                    "c" + text_type(c_index),
                                    "column":
                                    col,
                                    "sort":
                                    col.is_id,
                                    "path":
                                    referenced_column_path,
                                    "nested_path":
                                    new_nested_path,
                                    "put":
                                    col_full_name if col.include else None
                                })

                    todo.append(
                        Data(position=constraint_columns[0].table,
                             path=referenced_column_path,
                             nested_path=new_nested_path,
                             done_relations=copy(done_relations),
                             no_nested_docs=no_nested_docs))

        path = "."
        nested_path = [path]
        nested_path_to_join["."] = [{
            "path":
            path,
            "join_columns": [{
                "referenced": {
                    "table": ids_table
                }
            }],
            "nested_path":
            nested_path
        }]

        todo.append(
            Data(position=ids_table,
                 path=path,
                 nested_path=nested_path,
                 done_relations=set(),
                 no_nested_docs=False))

        while todo:
            item = todo.pop(0)
            follow_paths(**item)

        self.all_nested_paths = all_nested_paths
        self.nested_path_to_join = nested_path_to_join
        self.columns = output_columns

    def _compose_sql(self, get_ids):
        """
        :param get_ids: SQL to get the ids, and used to select the documents returned
        :return:
        """

        sql = []
        for nested_path in self.all_nested_paths:
            # MAKE THE REQUIRED JOINS
            sql_joins = []

            for i, curr_join in enumerate(
                    self.nested_path_to_join[nested_path[0]]):
                curr_join = wrap(curr_join)
                rel = curr_join.join_columns[0]
                if i == 0:
                    sql_joins.append(
                        SQL_FROM +
                        sql_alias(sql_iso(get_ids),
                                  quote_column(rel.referenced.table.alias)))
                elif curr_join.children:
                    full_name = quote_column(rel.table.name, rel.table.schema)
                    sql_joins.append(SQL_JOIN + sql_alias(
                        full_name, quote_column(rel.table.alias)
                    ) + SQL_ON + sql_and(
                        quote_column(const_col.column.name, rel.table.alias) +
                        "=" + quote_column(const_col.referenced.column.name,
                                           rel.referenced.table.alias)
                        for const_col in curr_join.join_columns))
                else:
                    full_name = quote_column(rel.referenced.table.name,
                                             rel.referenced.table.schema)
                    sql_joins.append(SQL_LEFT_JOIN + sql_alias(
                        full_name, quote_column(rel.referenced.table.alias)
                    ) + SQL_ON + sql_and(
                        quote_column(const_col.referenced.column.name,
                                     rel.referenced.table.alias) + "=" +
                        quote_column(const_col.column.name, rel.table.alias)
                        for const_col in curr_join.join_columns))

            # ONLY SELECT WHAT WE NEED, NULL THE REST
            selects = []
            not_null_column_seen = False
            for ci, c in enumerate(self.columns):
                if c.column_alias[1:] != text_type(ci):
                    Log.error("expecting consistency")
                if c.nested_path[0] == nested_path[0]:
                    s = sql_alias(
                        quote_column(c.column.column.name, c.table_alias),
                        quote_column(c.column_alias))
                    if s == None:
                        Log.error("bug")
                    selects.append(s)
                    not_null_column_seen = True
                elif startswith_field(nested_path[0], c.path):
                    # PARENT ID REFERENCES
                    if c.column.is_id:
                        s = sql_alias(
                            quote_column(c.column.column.name, c.table_alias),
                            quote_column(c.column_alias))
                        selects.append(s)
                        not_null_column_seen = True
                    else:
                        selects.append(
                            sql_alias(SQL_NULL, quote_column(c.column_alias)))
                else:
                    selects.append(
                        sql_alias(SQL_NULL, quote_column(c.column_alias)))

            if not_null_column_seen:
                sql.append(SQL_SELECT + sql_list(selects) + "".join(sql_joins))
        return sql
Ejemplo n.º 14
0
def update_repo(repo, settings):
    with MySQL(settings.database) as db:
        try:
            pull_repo(repo)

            # GET LATEST DATE
            existing_range = db.query(
                """
                        SELECT
                            max(`date`) `max`,
                            min(`date`) `min`,
                            min(revision) min_rev,
                            max(revision) max_rev
                        FROM
                            changesets
                        WHERE
                            repos={{repos}}
                    """, {"repos": repo.name})[0]

            ranges = wrap([{
                "min":
                coalesce(existing_range.max, convert.milli2datetime(0)) +
                timedelta(days=1)
            }, {
                "max": existing_range.min
            }])

            for r in ranges:
                for g, docs in qb.groupby(get_changesets(date_range=r,
                                                         repo=repo),
                                          size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

            missing_revisions = find_holes(db, "changesets", "revision", {
                "min": 0,
                "max": existing_range.max_rev + 1
            }, {"term": {
                "repos": repo.name
            }})
            for _range in missing_revisions:
                for g, docs in qb.groupby(get_changesets(revision_range=_range,
                                                         repo=repo),
                                          size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]

                    db.insert_list("changesets", docs)
                    db.flush()

        except Exception, e:
            Log.warning("Failure to pull from {{repos.name}}", {"repos": repo},
                        e)