Esempio n. 1
0
    def test_flat_schema(self):
        """Check the field types build_schema produces for a flat SCHEMA message.

        The message declares string, integer, number, date-time and date
        properties. Every returned field whose name appears in the
        expectation table must report the matching type (compared
        case-insensitively); fields with any other name are not checked.
        """
        raw_message = '{ "type": "SCHEMA", "stream": "simple_stream", "schema": { "properties": { "id": { "type": [ "null", "string" ] }, "name": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "integer" ] }, "ratio": { "type": [ "null", "number" ] }, "timestamp": { "type": "string", "format": "date-time" }, "date": { "type": "string", "format": "date" } }, "type": [ "null", "object" ] }, "key_properties": [ "id" ], "bookmark_properties": [ "date" ] }'

        parsed = singer.parse_message(raw_message)
        fields = build_schema(parsed.schema,
                              key_properties=parsed.key_properties,
                              add_metadata=True)

        # Property name -> type the corresponding field is expected to report.
        expected_types = {
            "id": "STRING",
            "name": "STRING",
            "value": "INTEGER",
            "ratio": "FLOAT",
            "timestamp": "TIMESTAMP",
            "date": "DATE",
        }

        for field in fields:
            expected = expected_types.get(field.name)
            if expected is not None:
                self.assertEqual(field.field_type.upper(), expected)
    def _load_to_bq(self, client, dataset, table_name, table_schema,
                    table_config, key_props, metadata_columns, truncate, rows):
        """Run a load job that writes newline-delimited JSON rows to a table.

        Args:
            client: object providing ``load_table_from_file`` (presumably a
                BigQuery client -- confirm against the caller).
            dataset: object providing ``table(name)`` for the destination.
            table_name: name of the destination table.
            table_schema: schema handed to ``build_schema``.
            table_config: mapping read for the optional keys
                ``partition_field``, ``cluster_fields`` and ``force_fields``.
            key_props: forwarded to ``build_schema`` as ``key_properties``.
            metadata_columns: forwarded to ``build_schema`` as
                ``add_metadata``.
            truncate: truthy selects ``WRITE_TRUNCATE``; otherwise the job
                uses ``WRITE_APPEND``.
            rows: file object holding the rows; it is uploaded with
                ``rewind=True``.

        Returns:
            The value returned by ``load_job.result()``.

        Raises:
            google_exceptions.BadRequest: re-raised after logging. When the
                job reports errors, the exception's ``message`` is replaced
                by a summary of the reason and the individual job errors.
        """
        logger = self.logger
        partition_field = table_config.get("partition_field", None)
        cluster_fields = table_config.get("cluster_fields", None)
        force_fields = table_config.get("force_fields", {})

        schema = build_schema(table_schema,
                              key_properties=key_props,
                              add_metadata=metadata_columns,
                              force_fields=force_fields)

        load_config = LoadJobConfig()
        load_config.ignore_unknown_values = True
        load_config.schema = schema
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if partition_field:
            # Daily time partitioning on the configured column.
            load_config.time_partitioning = bigquery.table.TimePartitioning(
                type_=bigquery.table.TimePartitioningType.DAY,
                field=partition_field)

        if cluster_fields:
            load_config.clustering_fields = cluster_fields

        if truncate:
            logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            logger.info(f"Appending to {table_name}")
            load_config.write_disposition = WriteDisposition.WRITE_APPEND

        logger.info("loading {} to BigQuery".format(table_name))

        # Stays None when the upload itself fails, so the handler below can
        # tell "job never created" apart from "job created but failed".
        load_job = None
        try:
            load_job = client.load_table_from_file(rows,
                                                   dataset.table(table_name),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            job = load_job.result()
            logger.info(job._properties)

            return job

        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table_name, str(err)))
            if load_job and load_job.errors:
                # The guard above only covers load_job.errors; err.errors may
                # still be empty or lack "reason". Failing here would replace
                # the BadRequest with an unrelated IndexError/KeyError.
                try:
                    reason = err.errors[0]["reason"]
                except (IndexError, KeyError, TypeError):
                    reason = "unknown"

                # Fall back to the whole entry when it carries no "message".
                messages = [
                    str(job_error.get("message", job_error))
                    for job_error in load_job.errors
                ]
                logger.error("reason: {reason}, errors:\n{e}".format(
                    reason=reason, e="\n".join(messages)))
                err.message = f"reason: {reason}, errors: {';'.join(messages)}"

            # Bare raise re-raises the same exception with its traceback.
            raise
Esempio n. 3
0
    def test_nested_schema_v3(self):
        schema = '{    "type":"SCHEMA",    "stream":"orders",    "schema": {        "properties": {          "address_id": {            "type": [              "null",              "string"            ]          },          "address_is_active": {            "type": [              "null",              "boolean"            ]          },          "billing_address": {            "properties": {              "address1": {                "type": [                  "null",                  "string"                ]              },              "address2": {                "type": [                  "null",                  "string"                ]              },              "city": {                "type": [                  "null",                  "string"                ]              },              "company": {                "type": [                  "null",                  "string"                ]              },              "country": {                "type": [                  "null",                  "string"                ]              },              "first_name": {                "type": [                  "null",                  "string"                ]              },              "last_name": {                "type": [                  "null",                  "string"                ]              },              "phone": {                "type": [                  "null",                  "string"                ]              },              "province": {                "type": [                  "null",                  "string"                ]              },              "zip": {                "type": [                  "null",                  "string"                ]              }            },            "type": [              "null",              "object"            ],            "additionalProperties": false          },          "charge_id": {            "type": [              "null",              "string"            ]          },  
        "charge_status": {            "type": [              "null",              "string"            ]          },          "created_at": {            "format": "date-time",            "type": [              "null",              "string"            ]          },          "customer_id": {            "type": [              "null",              "string"            ]          },          "discount_codes": {            "anyOf": [              {                "type": "array",                "items": {                  "type": "object",                  "additionalProperties": false,                  "properties": {                    "amount": {                      "type": [                        "null",                        "number"                      ]                    },                    "code": {                      "type": [                        "null",                        "string"                      ]                    },                    "type": {                      "type": [                        "null",                        "string"                      ]                    }                  }                }              },              {                "type": "null"              }            ]          },          "email": {            "type": [              "null",              "string"            ]          },          "first_name": {            "type": [              "null",              "string"            ]          },          "hash": {            "type": [              "null",              "string"            ]          },          "id": {            "type": [              "null",              "string"            ]          },          "is_prepaid": {            "type": [              "null",              "boolean"            ]          },          "last_name": {            "type": [              "null",              "string"            ]          },          "line_items": {            "anyOf": [              {             
   "type": "array",                "items": {                  "type": "object",                  "additionalProperties": false,                  "properties": {                    "grams": {                      "type": [                        "null",                        "integer"                      ]                    },                    "images": {                      "type": [                        "null",                        "object"                      ],                      "additionalProperties": false,                      "properties": {                        "large": {                          "type": [                            "null",                            "string"                          ]                        },                        "medium": {                          "type": [                            "null",                            "string"                          ]                        },                        "original": {                          "type": [                            "null",                            "string"                          ]                        },                        "small": {                          "type": [                            "null",                            "string"                          ]                        }                      }                    },                    "price": {                      "type": [                        "null",                        "number"                      ],                      "multipleOf": 1e-08                    },                    "properties": {                      "anyOf": [                        {                          "type": "array",                          "items": {                            "type": "object",                            "additionalProperties": false,                            "properties": {                              "name": {                                "type": [    
                              "null",                                  "string"                                ]                              },                              "value": {                                "type": [                                  "null",                                  "string"                                ]                              }                            }                          }                        },                        {                          "type": "null"                        }                      ]                    },                    "quantity": {                      "type": [                        "null",                        "integer"                      ]                    },                    "shopify_product_id": {                      "type": [                        "null",                        "string"                      ]                    },                    "shopify_variant_id": {                      "type": [                        "null",                        "string"                      ]                    },                    "sku": {                      "type": [                        "null",                        "string"                      ]                    },                    "subscription_id": {                      "type": [                        "null",                        "string"                      ]                    },                    "title": {                      "type": [                        "null",                        "string"                      ]                    },                    "variant_title": {                      "type": [                        "null",                        "string"                      ]                    },                    "vendor": {                      "type": [                        "null",                        "string"                      ]                    }             
     }                }              },              {                "type": "null"              }            ]          },          "note": {            "type": [              "null",              "string"            ]          },          "note_attributes": {            "anyOf": [              {                "type": "array",                "items": {                  "type": "object",                  "additionalProperties": false,                  "properties": {                    "name": {                      "type": [                        "null",                        "string"                      ]                    },                    "value": {                      "type": [                        "null",                        "string"                      ]                    }                  }                }              },              {                "type": "null"              }            ]          },          "payment_processor": {            "type": [              "null",              "string"            ]          },          "processed_at": {            "format": "date-time",            "type": [              "null",              "string"            ]          },          "scheduled_at": {            "format": "date-time",            "type": [              "null",              "string"            ]          },          "shipped_date": {            "format": "date-time",            "type": [              "null",              "string"            ]          },          "shipping_address": {            "properties": {              "address1": {                "type": [                  "null",                  "string"                ]              },              "address2": {                "type": [                  "null",                  "string"                ]              },              "city": {                "type": [                  "null",                  "string"                ]              },              
"company": {                "type": [                  "null",                  "string"                ]              },              "country": {                "type": [                  "null",                  "string"                ]              },              "first_name": {                "type": [                  "null",                  "string"                ]              },              "last_name": {                "type": [                  "null",                  "string"                ]              },              "phone": {                "type": [                  "null",                  "string"                ]              },              "province": {                "type": [                  "null",                  "string"                ]              },              "zip": {                "type": [                  "null",                  "string"                ]              }            },            "type": [              "null",              "object"            ],            "additionalProperties": false          },          "shipping_date": {            "format": "date-time",            "type": [              "null",              "string"            ]          },          "shipping_lines": {            "anyOf": [              {                "type": "array",                "items": {                  "type": "object",                  "additionalProperties": false,                  "properties": {                    "code": {                      "type": [                        "null",                        "string"                      ]                    },                    "price": {                      "type": [                        "null",                        "number"                      ]                    },                    "title": {                      "type": [                        "null",                        "string"                      ]                    }                  }          
      }              },              {                "type": "null"              }            ]          },          "shopify_cart_token": {            "type": [              "null",              "string"            ]          },          "shopify_customer_id": {            "type": [              "null",              "string"            ]          },          "shopify_id": {            "type": [              "null",              "string"            ]          },          "shopify_order_id": {            "type": [              "null",              "string"            ]          },          "shopify_order_number": {            "type": [              "null",              "string"            ]          },          "status": {            "type": [              "null",              "string"            ]          },          "subtotal_price": {            "type": [              "null",              "number"            ]          },          "tags": {            "type": [              "null",              "string"            ]          },          "tax_lines": {            "anyOf": [              {                "type": "array",                "items": {                  "type": "object",                  "additionalProperties": false,                  "properties": {                    "code": {                      "type": [                        "null",                        "string"                      ]                    },                    "price": {                      "type": [                        "null",                        "number"                      ]                    },                    "title": {                      "type": [                        "null",                        "string"                      ]                    }                  }                }              },              {                "type": "null"              }            ]          },          "total_discounts": {            "multipleOf": 1e-08,            
"type": [              "null",              "number"            ]          },          "total_line_items_price": {            "multipleOf": 1e-08,            "type": [              "null",              "number"            ]          },          "total_price": {            "type": [              "null",              "number"            ]          },          "total_refunds": {            "multipleOf": 1e-08,            "type": [              "null",              "number"            ]          },          "total_tax": {            "multipleOf": 1e-08,            "type": [              "null",              "number"            ]          },          "total_weight": {            "type": [              "null",              "integer"            ]          },          "transaction_id": {            "type": [              "null",              "string"            ]          },          "type": {            "type": [              "null",              "string"            ]          },          "updated_at": {            "format": "date-time",            "type": [              "null",              "string"            ]          }        },        "type": "object",        "additionalProperties": false      },    "key_properties":[       "Id"    ] }'

        msg = singer.parse_message(schema)

        schema = build_schema(msg.schema,
                              key_properties=msg.key_properties,
                              add_metadata=True)

        self.assertTrue(True)
Esempio n. 4
0
    def test_nested_schema_v2(self):
        schema = '{"type": "SCHEMA", "stream": "campaigns", "schema": {"type": ["null", "object"], "additionalProperties": false, "properties": {"AudienceAdsBidAdjustment": {"type": ["null", "integer"]}, "BiddingScheme": {"anyOf": [{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "InheritedBidStrategyType": {"type": ["null", "string"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetCpa": {"type": ["null", "number"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "TargetRoas": {"type": ["null", "number"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetRoas": {"type": ["null", "number"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, {"type": ["null", 
"object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetAdPosition": {"type": ["null", "string"]}, "TargetImpressionShare": {"type": ["null", "number"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}]}, "BudgetType": {"type": ["null", "string"]}, "DailyBudget": {"type": ["null", "number"]}, "ExperimentId": {"type": ["null", "integer"]}, "FinalUrlSuffix": {"type": ["null", "string"]}, "ForwardCompatibilityMap": {"type": ["null", "object"], "properties": {"KeyValuePairOfstringstring": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"key": {"type": ["null", "string"]}, "value": {"type": ["null", "string"]}}}}}}, "Id": {"type": ["null", "integer"]}, "Name": {"type": ["null", "string"]}, "Status": {"type": ["null", "string"]}, "SubType": {"type": ["null", "string"]}, "TimeZone": {"type": ["null", "string"]}, "TrackingUrlTemplate": {"type": ["null", "string"]}, "UrlCustomParameters": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Parameters": {"type": ["null", "object"], "properties": {"CustomParameter": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Key": {"type": ["null", "string"]}, "Value": {"type": ["null", "string"]}}}}}}}}, "CampaignType": {"type": ["null", "string"]}, "Settings": {"type": ["null", "object"], "properties": {"Setting": {"type": ["null", "array"], "items": {"anyOf": [{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "Details": {"type": ["null", "object"], "properties": 
{"TargetSettingDetail": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"CriterionTypeGroup": {"type": ["null", "string"]}, "TargetAndBid": {"type": ["boolean"]}}}}}}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "LocalInventoryAdsEnabled": {"type": ["null", "boolean"]}, "Priority": {"type": ["null", "integer"]}, "SalesCountryCode": {"type": ["null", "string"]}, "StoreId": {"type": ["null", "integer"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "BidBoostValue": {"type": ["null", "number"]}, "BidMaxValue": {"type": ["null", "number"]}, "BidOption": {"type": ["null", "string"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "DomainName": {"type": ["null", "string"]}, "Language": {"type": ["null", "string"]}, "PageFeedIds": {"type": ["null", "object"], "properties": {"long": {"type": ["null", "array"], "items": {"type": "integer"}}}}, "Source": {"type": ["null", "string"]}}}, {"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}]}}}}, "BudgetId": {"type": ["null", "integer"]}, "Languages": {"type": ["null", "object"], "properties": {"string": {"type": ["null", "array"], "items": {"type": "string"}}}}, "AdScheduleUseSearcherTimeZone": {"type": ["null", "boolean"]}}}, "key_properties": ["Id"]}'

        msg = singer.parse_message(schema)

        schema = build_schema(msg.schema,
                              key_properties=msg.key_properties,
                              add_metadata=True)

        self.assertTrue(True)
Esempio n. 5
0
    def test_nested_schema(self):
        schema = '{ "type": "SCHEMA", "stream": "nested_stream", "schema": { "properties": { "account_id": { "type": [ "null", "string" ] }, "account_name": { "type": [ "null", "string" ] }, "action_values": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "ad_id": { "type": [ "null", "string" ] }, "ad_name": { "type": [ "null", "string" ] }, "adset_id": { "type": [ "null", "string" ] }, "adset_name": { "type": [ "null", "string" ] }, "age": { "type": [ "null", "integer", "string" ] }, "campaign_id": { "type": [ "null", "string" ] }, "campaign_name": { "type": [ "null", "string" ] }, "canvas_avg_view_percent": { "type": [ "null", "number" ] }, "canvas_avg_view_time": { "type": [ "null", "number" ] }, "clicks": { "type": [ "null", "integer" ] }, "conversion_rate_ranking": { "type": [ "null", "string" ] }, "cost_per_action_type": { "items": { "properties": { "action_type": { "type": [ 
"null", "string" ] }, "value": { "type": [ "null", "string" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "cost_per_inline_link_click": { "type": [ "null", "number" ] }, "cost_per_inline_post_engagement": { "type": [ "null", "number" ] }, "cost_per_unique_action_type": { "items": { "properties": { "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "string" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "cost_per_unique_click": { "type": [ "null", "number" ] }, "cost_per_unique_inline_link_click": { "type": [ "null", "number" ] }, "cpc": { "type": [ "null", "number" ] }, "cpm": { "type": [ "null", "number" ] }, "cpp": { "type": [ "null", "number" ] }, "ctr": { "type": [ "null", "number" ] }, "date_start": { "format": "date-time", "type": [ "null", "string" ] }, "date_stop": { "format": "date-time", "type": [ "null", "string" ] }, "engagement_rate_ranking": { "type": [ "null", "string" ] }, "frequency": { "type": [ "null", "number" ] }, "gender": { "type": [ "null", "string" ] }, "impressions": { "type": [ "null", "integer" ] }, "inline_link_click_ctr": { "type": [ "null", "number" ] }, "inline_link_clicks": { "type": [ "null", "integer" ] }, "inline_post_engagement": { "type": [ "null", "integer" ] }, "objective": { "type": [ "null", "string" ] }, "outbound_clicks": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "quality_ranking": { "type": [ "null", "string" ] }, "reach": { 
"type": [ "null", "integer" ] }, "social_spend": { "type": [ "null", "number" ] }, "spend": { "type": [ "null", "number" ] }, "unique_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "unique_clicks": { "type": [ "null", "integer" ] }, "unique_ctr": { "type": [ "null", "number" ] }, "unique_inline_link_click_ctr": { "type": [ "null", "number" ] }, "unique_inline_link_clicks": { "type": [ "null", "integer" ] }, "unique_link_clicks_ctr": { "type": [ "null", "number" ] }, "video_30_sec_watched_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "video_p100_watched_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, 
"action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "video_p25_watched_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "video_p50_watched_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "video_p75_watched_actions": { "items": { "properties": { "1d_click": { "type": [ "null", "number" ] }, "1d_view": { "type": [ "null", "number" ] }, "28d_click": { "type": [ "null", "number" ] }, "28d_view": { "type": [ "null", "number" ] }, "7d_click": { "type": [ "null", "number" ] }, "7d_view": { "type": [ "null", "number" ] }, "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", 
"array" ] }, "video_play_curve_actions": { "items": { "properties": { "action_type": { "type": [ "null", "string" ] }, "value": { "items": { "type": [ "null", "integer" ] }, "type": [ "null", "array" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] }, "website_ctr": { "items": { "properties": { "action_destination": { "type": [ "null", "string" ] }, "action_target_id": { "type": [ "null", "string" ] }, "action_type": { "type": [ "null", "string" ] }, "value": { "type": [ "null", "number" ] } }, "type": [ "null", "object" ] }, "type": [ "null", "array" ] } }, "type": [ "null", "object" ] }, "key_properties": [ "campaign_id", "adset_id", "ad_id", "date_start", "age", "gender" ], "bookmark_properties": [ "date_start" ] }'

        msg = singer.parse_message(schema)

        schema = build_schema(msg.schema,
                              key_properties=msg.key_properties,
                              add_metadata=True)

        for f in schema:
            if f.name in ("date_start", "date_stop"):
                self.assertEqual(f.field_type.upper(), "TIMESTAMP")
Esempio n. 6
0
def persist_lines_job(
    client,
    dataset,
    lines=None,
    truncate=False,
    forced_fulltables=None,
    validate_records=True,
    table_suffix=None,
):
    """Buffer Singer records per table and load them with BigQuery load jobs.

    This is a generator: nothing runs until it is iterated. On the first
    ``next()`` it consumes every message in ``lines``, writes the records of
    each stream to a temporary file as newline-delimited JSON, starts one
    load job per table and then yields the state exactly once.

    Args:
        client: BigQuery client; only ``load_table_from_file`` is called.
        dataset: Object whose ``table(name)`` returns the load destination.
        lines: Iterable of raw Singer message lines. ``None`` is treated as
            empty input.
        truncate: When true, every table is loaded with ``WRITE_TRUNCATE``.
        forced_fulltables: Table names (suffix included) that are always
            loaded with ``WRITE_TRUNCATE``. ``None`` means no such tables.
        validate_records: When true, each record is validated against the
            schema of its stream before being buffered.
        table_suffix: Optional suffix appended to each stream name to build
            the table name.

    Yields:
        The value of the most recent STATE message, or ``None`` if no STATE
        message was seen or a record arrived after the last one.

    Raises:
        json.decoder.JSONDecodeError: Re-raised when a line cannot be parsed.
        Exception: When a record precedes the schema of its stream, or the
            message type is not recognized.
        google_exceptions.BadRequest: Re-raised when a load job is rejected.
    """
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    table_suffix = table_suffix or ""
    forced_fulltables = forced_fulltables or ()
    if lines is None:
        lines = ()

    try:
        for line in lines:
            try:
                msg = singer.parse_message(line)
            except json.decoder.JSONDecodeError:
                logger.error("Unable to parse:\n{}".format(line))
                raise

            if isinstance(msg, singer.RecordMessage):
                table_name = msg.stream + table_suffix

                if table_name not in schemas:
                    raise Exception(
                        "A record for stream {} was encountered before a corresponding schema"
                        .format(table_name))

                schema = schemas[table_name]

                if validate_records:
                    validate(msg.record, schema)

                new_rec = filter(schema, msg.record)

                # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
                # with a newline character splitting each row.
                data = bytes(
                    json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")

                rows[table_name].write(data)

                # NOTE(review): the state is dropped after every record,
                # presumably because the buffered record is not yet covered
                # by the previous STATE message -- confirm with the caller.
                state = None

            elif isinstance(msg, singer.StateMessage):
                logger.debug("Setting state to {}".format(msg.value))
                state = msg.value

            elif isinstance(msg, singer.SchemaMessage):
                table_name = msg.stream + table_suffix

                # Only the first schema of a stream is used; later ones must
                # not replace the buffer that already holds records.
                if table_name in rows:
                    continue

                schemas[table_name] = msg.schema
                key_properties[table_name] = msg.key_properties
                rows[table_name] = TemporaryFile(mode="w+b")

            elif isinstance(msg, singer.ActivateVersionMessage):
                # This is experimental and won't be used yet
                pass

            else:
                raise Exception("Unrecognized message {}".format(msg))

        for table, buffer in rows.items():
            load_config = LoadJobConfig()
            load_config.schema = build_schema(
                schemas[table], key_properties=key_properties[table])
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate or (table in forced_fulltables):
                logger.info(f"Load {table} by FULL_TABLE")
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            logger.info("loading {} to Bigquery.\n".format(table))

            # Reset per table so the error handler never inspects a job that
            # belongs to a previous table (or an unbound name when the very
            # first submission fails).
            load_job = None
            try:
                load_job = client.load_table_from_file(buffer,
                                                       dataset.table(table),
                                                       job_config=load_config,
                                                       rewind=True)
                logger.info("loading job {}".format(load_job.job_id))
                logger.info(load_job.result())
            except google_exceptions.BadRequest as err:
                logger.error("failed to load table {} from file: {}".format(
                    table, str(err)))
                if load_job is not None and load_job.errors:
                    messages = [
                        f"reason: {job_err['reason']}, message: {job_err['message']}"
                        for job_err in load_job.errors
                    ]
                    logger.error("errors:\n{}".format("\n".join(messages)))
                raise
    finally:
        # Release the temporary buffers whether or not loading succeeded.
        for buffer in rows.values():
            buffer.close()

    yield state
Esempio n. 7
0
def persist_lines_stream(project_id,
                         dataset_id,
                         lines=None,
                         validate_records=True):
    """Stream Singer records into BigQuery one row at a time.

    A BigQuery client is created for ``project_id`` and the dataset
    ``dataset_id`` is created if it does not exist yet. Every SCHEMA message
    creates the matching table (an existing table is kept), every RECORD
    message is inserted with ``insert_rows_json``.

    Args:
        project_id: Project passed to ``bigquery.Client``.
        dataset_id: Identifier of the destination dataset.
        lines: Iterable of raw Singer message lines. ``None`` is treated as
            empty input.
        validate_records: When true, each record is validated against the
            schema of its stream before it is inserted.

    Returns:
        The value of the most recent STATE message, or ``None`` if no STATE
        message was seen or a record arrived after the last one.

    Raises:
        json.decoder.JSONDecodeError: Re-raised when a line cannot be parsed.
        Exception: When a record precedes the schema of its stream, when the
            message type is not recognized, or re-raised from a failing
            insert call.
    """
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    if lines is None:
        lines = ()

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    # Fallback used when the dataset already exists (Conflict below).
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            try:
                err = bigquery_client.insert_rows_json(tables[msg.stream],
                                                       [msg.record])
            except Exception as exc:
                logger.error(
                    f"failed to insert rows for {tables[msg.stream]}: {str(exc)}\n{msg.record}"
                )
                raise

            # Accumulate instead of overwrite: a later successful insert must
            # not hide the errors reported for an earlier row.
            if err:
                errors[msg.stream].extend(err)
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            # A repeated SCHEMA message keeps the counters collected so far.
            rows.setdefault(table, 0)
            errors.setdefault(table, [])
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table, table_errors in errors.items():
        if not table_errors:
            logger.info("Loaded {} row(s) from {} into {}:{}".format(
                rows[table], dataset_id, table, tables[table].path))
            # The state is emitted once per table that loaded cleanly.
            emit_state(state)
        else:
            logger.error("Errors: %s", table_errors)

    return state
Esempio n. 8
0
def persist_lines_stream(  # noqa: 211
    client: Client,
    project_id,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Stream data into BigQuery.

    Arguments:
        client {Client} -- BigQuery client
        project_id -- Project passed to ``table_exists``
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream
        truncate {bool} -- Whether to truncate existing tables
        forced_fulltables {list} -- Table names (prefix and suffix included)
            that are always truncated

    Keyword Arguments:
        validate_records {bool} -- Whether to validate records
            (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message

    Yields:
        Iterator[Optional[str]] -- State, once per table that was streamed
            without insert errors
    """
    # Create the variables in which we save data in the upcoming loop
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    tables: dict = {}
    rows: dict = {}
    errors: dict = {}
    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''
    dataset_id: str = dataset.dataset_id

    # For every Singer input message
    for line in lines:
        # Parse the message
        try:
            msg: Union[SchemaMessage, StateMessage,
                       RecordMessage] = (parse_message(line))
        except json.decoder.JSONDecodeError:
            LOGGER.error(f'Unable to parse Singer Message:\n{line}')
            raise

        # There can be several kind of messages. When inserting data, the
        # schema message comes first
        if isinstance(msg, SchemaMessage):
            # Schema message, create the table
            table_name: str = table_prefix + msg.stream + table_suffix

            # Truncation may only happen the first time a stream is seen;
            # a repeated schema message must not drop the rows that were
            # already streamed during this run.
            first_schema: bool = table_name not in rows

            # Save the schema, key_properties and message to use in the
            # record messages that are following
            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties

            tables[table_name] = bigquery.Table(
                dataset.table(table_name),
                schema=build_schema(schemas[table_name]),
            )

            # Keep counters collected so far when the schema is repeated
            rows.setdefault(table_name, 0)
            errors.setdefault(table_name, [])

            if not table_exists(client, project_id, dataset_id, table_name):
                # Create the table
                client.create_table(tables[table_name])
            elif first_schema and (
                truncate or table_name in forced_fulltables
            ):
                LOGGER.info(f'Load {table_name} by FULL_TABLE')

                # When truncating is enabled and the table exists, the table
                # has to be recreated. Because of this, we have to wait
                # otherwise data can be lost, see:
                # https://stackoverflow.com/questions/36846571/
                # bigquery-table-truncation-before-streaming-not-working
                LOGGER.info(f'Deleting table {table_name} because it exists')
                client.delete_table(tables[table_name])
                LOGGER.info(f'Recreating table {table_name}')
                client.create_table(tables[table_name])
                LOGGER.info(
                    'Sleeping for 5 minutes before streaming data, '
                    f'to avoid streaming data loss in {table_name}', )
                time.sleep(FIVE_MINUTES)

        elif isinstance(msg, RecordMessage):
            # Record message
            table_name = table_prefix + msg.stream + table_suffix

            if table_name not in schemas:
                raise SchemaNotFoundException(
                    f'A record for stream {table_name} was encountered before '
                    'a corresponding schema', )

            # Retrieve schema
            schema: dict = schemas[table_name]

            # Retrieve the table object built from the schema message
            bq_table: bigquery.Table = tables[table_name]

            # Validate the record
            if validate_records:
                # Raises ValidationError if the record has invalid schema
                validate(msg.record, schema)

            # Filter the record
            record_input: Optional[Union[dict, str, list]] = filter_schema(
                schema,
                msg.record,
            )

            # Somewhere in the process, the input record can have decimal
            # values e.g. "value": Decimal('10.25'). These are not JSON
            # serializable. Therefore, we dump the JSON here with the
            # DecimalEncoder. Thereafter, we load the dumped JSON so we get
            # a dictionary again, which we can insert to BigQuery
            record_json: str = json.dumps(record_input, cls=DecimalEncoder)
            record: dict = json.loads(record_json)

            # Save the error
            err: Optional[list] = None

            try:
                # Insert record
                err = client.insert_rows(bq_table, [record])
            except Exception as exc:
                LOGGER.error(
                    f'Failed to insert rows for {table_name}: {exc}\n'
                    f'{record}\n{err}', )
                raise

            # Book-keeping is keyed by the full table name (the same key the
            # schema branch uses); the bare stream name is a missing key as
            # soon as a prefix or suffix is configured. Errors accumulate so
            # a later successful insert cannot hide an earlier failure.
            if err:
                errors[table_name].extend(err)
            rows[table_name] += 1

            state = None

        elif isinstance(msg, StateMessage):
            # State messages
            LOGGER.debug(f'Setting state to {msg.value}')
            state = msg.value

        else:
            raise InvalidSingerMessage(f'Unrecognized Singer Message:\n {msg}')

    for table, table_errors in errors.items():
        if table_errors:
            LOGGER.error(f'Errors: {table_errors}')
        else:
            LOGGER.info(
                'Loaded {rows} row(s) from {source} into {tab}:{path}'.format(
                    rows=rows[table],
                    source=dataset.dataset_id,
                    tab=table,
                    path=tables[table].path,
                ), )
            yield state
Esempio n. 9
0
def persist_lines_job(  # noqa: WPS210, WPS211, WPS213, WPS231, WPS238
    client: Client,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Perform a load job into BigQuery.

    All records are first buffered per table in temporary files as
    newline-delimited JSON; after the input is exhausted one load job per
    table is started and awaited. The temporary files are closed before the
    state is yielded, also when an error is raised.

    Arguments:
        client {Client} -- BigQuery client
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream
        truncate {bool} -- Whether to truncate the tables
        forced_fulltables {list} -- Table names (prefix and suffix included)
            that are always truncated

    Keyword Arguments:
        validate_records {bool} -- Whether to validate records
            (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message

    Yields:
        Iterator[Optional[str]] -- State, exactly once after all load jobs
            have finished
    """
    # Create the variables in which we save data in the upcoming loop
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    rows: dict = {}
    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''

    try:
        # For every Singer input message
        for line in lines:
            # Parse the message
            try:
                msg: Union[SchemaMessage, StateMessage, RecordMessage] = (
                    parse_message(line)
                )
            except json.decoder.JSONDecodeError:
                LOGGER.error(f'Unable to parse Singer Message:\n{line}')
                raise

            # There can be several kind of messages. When inserting data, the
            # schema message comes first
            if isinstance(msg, SchemaMessage):
                # Schema message, save schema
                table_name: str = table_prefix + msg.stream + table_suffix

                # Skip schema if already created
                if table_name in rows:
                    continue

                # Save schema and setup a temp file for data storage
                schemas[table_name] = msg.schema
                key_properties[table_name] = msg.key_properties
                rows[table_name] = TemporaryFile(mode='w+b')

            elif isinstance(msg, RecordMessage):
                # Record message
                table_name = table_prefix + msg.stream + table_suffix

                if table_name not in schemas:
                    raise SchemaNotFoundException(
                        f'A record for stream {table_name} was encountered '
                        'before a corresponding schema',
                    )

                # Retrieve schema
                schema: dict = schemas[table_name]

                # Validate the record
                if validate_records:
                    # Raises ValidationError if the record has invalid schema
                    validate(msg.record, schema)

                record_input: Optional[Union[dict, str, list]] = (
                    filter_schema(schema, msg.record)
                )

                # The record can contain decimal values, e.g.
                # "value": Decimal('10.25'), which are not JSON serializable
                # by default; DecimalEncoder takes care of them. The load job
                # expects newline-delimited JSON, hence the trailing newline.
                record_str: str = '{rec}\n'.format(
                    rec=json.dumps(record_input, cls=DecimalEncoder),
                )

                record: bytes = bytes(record_str, 'UTF-8')

                # Save data to load later
                rows[table_name].write(record)

                state = None

            elif isinstance(msg, StateMessage):
                # State messages
                LOGGER.debug(f'Setting state to {msg.value}')
                state = msg.value

            else:
                raise InvalidSingerMessage(
                    f'Unrecognized Singer Message:\n {msg}',
                )

        # After all records are received, setup a load job per stream
        for table, buffer in rows.items():
            # Prepare load job
            key_props: list = key_properties[table]
            load_config: LoadJobConfig = LoadJobConfig()
            load_config.schema = build_schema(
                schemas[table],
                key_properties=key_props,
            )
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            # Overwrite the table if truncate is enabled
            if truncate or table in forced_fulltables:
                LOGGER.info(f'Load {table} by FULL_TABLE')
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            LOGGER.info(f'loading {table} to Bigquery.')

            # Setup load job
            load_job: LoadJob = client.load_table_from_file(
                buffer,
                dataset.table(table),
                job_config=load_config,
                rewind=True,
            )

            LOGGER.info(f'loading job {load_job.job_id}')

            # Run load job
            try:
                load_job.result()
            except google_exceptions.GoogleAPICallError as err:
                # Parse errors
                LOGGER.error(f'failed to load table {table} from file: {err}')

                if load_job.errors:
                    messages: list = [
                        f"reason: {job_err['reason']}, "
                        f"message: {job_err['message']}"
                        for job_err in load_job.errors
                    ]
                    messages_str: str = '\n'.join(messages)
                    LOGGER.error(f'errors:\n{messages_str}')
                raise
            LOGGER.info(
                f'Loaded {load_job.output_rows} row(s) in '
                f'{load_job.destination}',
            )
    finally:
        # Release the temporary buffers whether or not loading succeeded
        for buffer in rows.values():
            buffer.close()

    yield state