Ejemplo n.º 1
0
    def load_to_table(
        self,
        src_path: str,
        table_md: Optional[TableMD] = None,
        table_md_path: Optional[str] = None,
    ) -> None:
        """Load data to a designated table using table metadata yaml file to construct the table. Only CSV format
        is valid.

        Drops and recreates the target table from the metadata, bulk-loads the
        file with a COPY statement, and runs the id-based upsert when the
        metadata defines delta parameters.

        :param src_path: Path to the file to load into the database (singular file)
        :type src_path: str
        :param table_md: Parsed YAML file containing table metadata
        :type table_md: Optional[TableMD]
        :param table_md_path: Path to a table metadata YAML file
        :type table_md_path: Optional[str]
        :raises ValueError: if neither ``table_md`` nor ``table_md_path`` is provided
        """
        if not table_md:
            if table_md_path is None:
                # Fail fast with a clear message instead of handing None to
                # TableMD and failing further down with an obscure error.
                raise ValueError(
                    "Either table_md or table_md_path must be provided"
                )
            table_md = TableMD(table_md_path=table_md_path)
        sql_generator = SQLGenerator(table_md=table_md)
        # Recreate the table so every load starts from the metadata-defined schema.
        queries = [
            sql_generator.drop_table(),
            sql_generator.create_table_query(),
        ]
        self.execute(queries)

        # Bulk-load the file via COPY (CSV is the only supported format).
        self.copy_expert(src_path=src_path, query=sql_generator.copy_query())

        # Delta-configured tables additionally get merged by id after the load.
        if table_md.delta_params:
            self.execute(sql_generator.upsert_on_id())
Ejemplo n.º 2
0
def test_create_table_query():
    """The generated CREATE TABLE statement matches the expected SQL."""
    table_md = get_mock_table_md()
    generator = SQLGenerator(table_md)
    expected = """
CREATE TABLE IF NOT EXISTS test.test_table_delta(id varchar(300),event_type varchar(100),event_ts timestamp);
    """
    actual = generator.create_table_query()
    # Compare ignoring the surrounding whitespace of the triple-quoted literal.
    assert actual.strip() == expected.strip()
Ejemplo n.º 3
0
 def __load_to_pgres_callback(self, ch, method, properties, body):
     """Persist a single queue message into Postgres, then ack it."""
     payload = json.loads(body)
     # Extract the configured fields from the message, in declaration order.
     values = [payload[field] for field in self.fields]
     generator = SQLGenerator(self.table_md)
     statements = [
         generator.create_table_query(),  # ensure the target table exists
         generator.insert_values_into(values=values),
     ]
     PgHook().execute(statements)
     # Acknowledge only after the insert has been executed.
     ch.basic_ack(delivery_tag=method.delivery_tag)
Ejemplo n.º 4
0
    def batch_load_to_pgres(self):
        """Consume queue messages in batches of five and load them into Postgres.

        Inactivity timeout is added in cases where there are less than 5 messages left in the queue. If no further messages
        arrive in 15 seconds, the inactivity timeout kicks in and triggers the processing of the batch currently
        stored in memory.
        """
        sql_gen = SQLGenerator(self.table_md)
        hook = PgHook()
        hook.execute(sql_gen.create_table_query())

        while True:
            # Pre-bind so the KeyboardInterrupt handler can't hit a NameError
            # when __get_conn() itself is interrupted.
            connection = channel = None
            try:
                # NOTE(review): a new connection is opened every iteration and
                # only closed on interrupt — presumably __get_conn pools or
                # reuses connections; verify this doesn't leak.
                connection, channel = self.__get_conn()
                batch = []
                # Get five messages and break out.
                for method_frame, properties, body in channel.consume(
                    queue=self.queue, inactivity_timeout=15
                ):

                    # if no more messages exist in the queue, break out of the loop
                    if not method_frame:
                        break
                    data = json.loads(body)
                    row = [data[field] for field in self.fields]
                    batch.append(sql_gen.insert_values_into(values=row))
                    channel.basic_ack(method_frame.delivery_tag)

                    # BUG FIX: delivery tags increase monotonically for the
                    # lifetime of the channel, so `delivery_tag == 5` stopped
                    # only the FIRST batch; later batches never matched and
                    # relied solely on the inactivity timeout. Count the batch
                    # itself instead.
                    if len(batch) >= 5:
                        break
                # Requeing the rest of the messages after having pulled a batch
                channel.cancel()
                # Skip the DB round-trip when the timeout fired with nothing queued.
                if batch:
                    print("processing batch")
                    hook.execute(batch)

            # Close the channel and the connection safely when interrupting so we don't get hanging connections
            except KeyboardInterrupt:  # safely
                if channel is not None:
                    channel.close()
                if connection is not None:
                    connection.close()
                raise