def insert_source_task(self):
        """
        Insert the Sqoop task that imports the source MySQL data into S3.
        """
        # Use all columns, but strip any double quotes from the column names.
        columns = [field[0].strip('"') for field in self.table_schema]
        partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
        destination = url_path_join(
            self.warehouse_path,
            self.warehouse_subdirectory,
            self.database,
            self.table_name,
            partition_path_spec
        ) + '/'

        return SqoopImportFromMysql(
            table_name=self.table_name,
            credentials=self.db_credentials,
            database=self.database,
            destination=destination,
            mysql_delimiters=False,
            fields_terminated_by=self.field_delimiter,
            null_string=self.null_marker,
            delimiter_replacement=' ',
            direct=False,
            columns=columns,
        )
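All of these tasks build the S3 destination the same way: a HivePartition path spec (rendered as "key=value") joined onto the warehouse path with url_path_join and a trailing slash. Below is a minimal, self-contained sketch of that pattern; HivePartition and url_path_join are simplified stand-ins for the pipeline's helpers (assumed behavior only), and the bucket, subdirectory, database, and table names are hypothetical.

from datetime import date


class HivePartition(object):
    """Simplified stand-in for the pipeline's HivePartition helper."""

    def __init__(self, key, value):
        self.key = key
        self.value = value

    @property
    def path_spec(self):
        # Render the partition as "key=value", e.g. "dt=2017-01-01".
        return '{0}={1}'.format(self.key, self.value)


def url_path_join(*parts):
    """Simplified stand-in: join URL path components with single slashes."""
    return '/'.join(part.strip('/') for part in parts)


partition_path_spec = HivePartition('dt', date(2017, 1, 1).isoformat()).path_spec
destination = url_path_join(
    's3://warehouse-bucket/warehouse',   # warehouse_path (hypothetical)
    'import_mysql_to_vertica',           # warehouse_subdirectory (hypothetical)
    'edxapp',                            # database (hypothetical)
    'auth_user',                         # table_name (hypothetical)
    partition_path_spec,
) + '/'
print(destination)
# s3://warehouse-bucket/warehouse/import_mysql_to_vertica/edxapp/auth_user/dt=2017-01-01/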
Example 2
    def insert_source_task(self):
        # Make sure, once again, that the columns have been calculated.
        columns = [field.name for field in self.schema]
        partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
        destination = url_path_join(
            self.warehouse_path,
            self.warehouse_subdirectory,
            self.database,
            self.table_name,
            partition_path_spec
        ) + '/'

        additional_metadata = {
            'table_schema': self.mysql_compliant_schema(),
            'deleted_fields': self.deleted_fields,
            'database': self.database,
            'table_name': self.table_name,
            'date': self.date.isoformat(),
        }

        return SqoopImportFromMysql(
            table_name=self.table_name,
            credentials=self.db_credentials,
            database=self.database,
            destination=destination,
            overwrite=self.overwrite,
            mysql_delimiters=False,
            fields_terminated_by=self.field_delimiter,
            null_string=self.null_marker,
            delimiter_replacement=' ',
            direct=False,
            columns=columns,
            additional_metadata=additional_metadata,
        )
Example 3
 def s3_location_for_table(self):
     partition_path_spec = HivePartition('dt',
                                         self.date.isoformat()).path_spec
     destination = url_path_join(self.warehouse_path,
                                 self.warehouse_subdirectory, self.database,
                                 self.table_name, partition_path_spec) + '/'
     return destination
 def insert_source_task(self):
     # Get the columns to request from Sqoop, as a side effect of
     # getting the Vertica columns. The Vertica column names are quoted, so strip the quotes off.
     column_names = [name[1:-1] for (name, _) in self.columns]
     partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
     destination = url_path_join(
         self.warehouse_path,
         self.warehouse_subdirectory,
         self.database,
         self.table_name,
         partition_path_spec
     ) + '/'
     # The arguments to SqoopImportFromMysql here should be the same as those used for BigQuery.
     # The old format used mysql_delimiters and direct mode. Direct mode has now been removed,
     # which gives us more choices for the other settings. The null_string and field termination
     # settings have already been changed, and here we hardcode the replacement of delimiters
     # (like newlines) with spaces, using Sqoop's --hive-delims-replacement option.
     # We could also set other SqoopImportTask parameters: escaped_by, enclosed_by, optionally_enclosed_by.
     # To model 'mysql_delimiters=True', we would set escaped_by to a backslash and
     # optionally_enclosed_by to a single quote. Instead we use the defaults for them, so that
     # there is no escaping or enclosing.
     return SqoopImportFromMysql(
         table_name=self.table_name,
         credentials=self.db_credentials,
         database=self.database,
         destination=destination,
         overwrite=self.overwrite,
         mysql_delimiters=False,
         fields_terminated_by=self.field_delimiter,
         null_string=self.null_marker,
         delimiter_replacement=' ',
         direct=False,
         columns=column_names,
     )
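As the comment above notes, the legacy mysql_delimiters behavior could instead be modeled with explicit escaping and enclosing settings. A hedged sketch of what those keyword arguments might look like follows; the parameter names come from the comment, the values mirror Sqoop's documented --mysql-delimiters defaults, and this is only an illustration, not the pipeline's actual call.

# Hypothetical keyword arguments approximating Sqoop's --mysql-delimiters
# defaults (fields: ',', escaped-by: backslash, optionally-enclosed-by: single
# quote), expressed through the SqoopImportTask parameters named above.
legacy_mysql_delimiter_kwargs = {
    'mysql_delimiters': False,
    'fields_terminated_by': ',',     # MySQL dump default field delimiter
    'escaped_by': '\\',              # escape special characters with a backslash
    'optionally_enclosed_by': "'",   # optionally enclose fields in single quotes
}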
 def insert_source_task(self):
     partition_path_spec = HivePartition('dt',
                                         self.date.isoformat()).path_spec
     url_with_filename = url_path_join(self.warehouse_path,
                                       "course_catalog", "subjects",
                                       partition_path_spec, "subjects.tsv")
     return ExternalURL(url=url_with_filename)
    def output(self):
        partition_path_spec = HivePartition('dt', self.date).path_spec

        output_worksheet_name = self.worksheet_name
        output_url = url_path_join(self.warehouse_path, 'google_sheets',
                                   self.spreadsheet_key, output_worksheet_name,
                                   partition_path_spec,
                                   '{}.tsv'.format(output_worksheet_name))
        return get_target_from_url(output_url)
 def get_schema_metadata_target(self):
     """Return target for reading or writing out schema-level metadata file."""
     partition_path_spec = HivePartition('dt', self.date).path_spec
     url = url_path_join(self.intermediate_warehouse_path,
                         self.vertica_warehouse_name,
                         self.vertica_schema_name,
                         '_metadata_export_schema', partition_path_spec,
                         '_metadata')
     return get_target_from_url(url)
    def output(self):
        """
        Output is set up so it can be read in as a Hive table with partitions.

        The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv
        """
        date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        filename = "cybersource_{}.tsv".format(self.merchant_id)
        url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
        return get_target_from_url(url_with_filename)
Example 9
    def output(self):
        """
        Output is set up so it can be read in as a Hive table with partitions.

        The form is {output_root}/paypal_casereport/paypal_ftpprocessed/dt={CCYY-mm-dd}/PayPalCaseReport.tsv
        """
        date_string = self.run_date.strftime('%Y-%m-%d')
        partition_path_spec = HivePartition('dt', date_string).path_spec
        filename = "PayPalCaseReport.tsv"
        url_with_filename = url_path_join(self.warehouse_path, "paypal_casereport", "paypal_ftpprocessed", partition_path_spec, filename)
        return get_target_from_url(url_with_filename)
Example 10
    def output(self):
        """
        Output is set up so that it can be read as a Hive table with partitions.

        The form is {warehouse_path}/course_structure/dt={CCYY-mm-dd}/courses.tsv.
        """
        date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        url_with_filename = url_path_join(self.warehouse_path, "course_structure",
                                          partition_path_spec, "courses.tsv")
        return get_target_from_url(url_with_filename)
Example 11
 def database_metadata_target(self):
     partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
     metadata_destination = url_path_join(
         self.warehouse_path,
         self.warehouse_subdirectory,
         self.database,
         DUMP_METADATA_OUTPUT,
         partition_path_spec,
         METADATA_FILENAME,
     )
     return get_target_from_url(metadata_destination)
    def s3_location_for_table(self):
        """
        Returns the URL for the location of S3 data for the given table and schema.

        This logic is shared by classes that dump data to S3 and those that load data from S3, so they agree.
        """
        partition_path_spec = HivePartition('dt', self.date).path_spec
        url = url_path_join(self.intermediate_warehouse_path,
                            self.vertica_warehouse_name,
                            self.vertica_schema_name, self.table_name,
                            partition_path_spec) + '/'
        return url
Example 13
 def requires(self):
     # Define the path so that data could be loaded into Hive, without actually requiring the load to be performed.
     table_name = 'user_video_viewing'
     dummy_partition = HivePartition('dt', self.interval.date_b.isoformat())  # pylint: disable=no-member
     partition_path_spec = dummy_partition.path_spec
     input_path = url_path_join(self.warehouse_path, table_name, partition_path_spec + '/')
     return UserVideoViewingTask(
         mapreduce_engine=self.mapreduce_engine,
         n_reduce_tasks=self.n_reduce_tasks,
         source=self.source,
         interval=self.interval,
         pattern=self.pattern,
         output_root=input_path,
     )
 def insert_source_task(self):
     partition_path_spec = HivePartition('dt',
                                         self.date.isoformat()).path_spec
     destination = url_path_join(self.warehouse_path, "database_import",
                                 self.database, self.table_name,
                                 partition_path_spec) + '/'
     return SqoopImportFromMysql(
         table_name=self.table_name,
         credentials=self.db_credentials,
         database=self.database,
         destination=destination,
         overwrite=self.overwrite,
         mysql_delimiters=True,
     )
Example 15
    def insert_source_task(self):
        """
        We are already exporting Vertica tables to S3 using SqoopImportFromVertica through the
        VerticaSchemaToBigQueryTask workflow, so we specify an ExternalURL here instead. In the future
        we can change this to a SqoopImportFromVertica task.
        """
        partition_path_spec = HivePartition('dt', self.date).path_spec
        intermediate_warehouse_path = url_path_join(self.warehouse_path,
                                                    'import/vertica/sqoop/')
        url = url_path_join(intermediate_warehouse_path,
                            self.vertica_warehouse_name,
                            self.vertica_schema_name, self.table_name,
                            partition_path_spec) + '/'

        return ExternalURL(url=url)
 def partition(self):  # pragma: no cover
     return HivePartition('dt', self.interval.date_b.isoformat())  # pylint: disable=no-member
Example 17
 def partition(self):
     """The table is partitioned by date."""
     return HivePartition('dt', self.date.isoformat())  # pylint: disable=no-member
 def s3_output_path(self):
     partition_path_spec = HivePartition('dt', self.date).path_spec
     target_url = url_path_join(self.intermediate_warehouse_path,
                                self.warehouse_name, self.schema_name,
                                self.table_name, partition_path_spec) + '/'
     return target_url
 def partition(self):
     return HivePartition('dt', self.import_date.isoformat())  # pylint: disable=no-member
Example 20
 def partition(self):
     return HivePartition('date_interval', str(self.interval))
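The last example keys the partition on a date interval rather than a single date. A small sketch of what that partition value and path spec look like, assuming luigi's date_interval.Custom and the same simplified HivePartition stand-in shown earlier (the dates are hypothetical):

import datetime

from luigi import date_interval

interval = date_interval.Custom(datetime.date(2016, 1, 1), datetime.date(2016, 2, 1))
print(str(interval))
# 2016-01-01-2016-02-01
# HivePartition('date_interval', str(interval)).path_spec then renders as:
# date_interval=2016-01-01-2016-02-01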