Example #1
 def create_task(self,
                 num_mappers=None,
                 where=None,
                 verbose=False,
                 columns=None,
                 null_string=None,
                 fields_terminated_by=None,
                 delimiter_replacement=None,
                 overwrite=False,
                 direct=True,
                 mysql_delimiters=True):
     """Create a SqoopImportFromMysql with specified options."""
     task = SqoopImportFromMysql(
         credentials=sentinel.ignored,
         database='exampledata',
         destination="/fake/destination",
         table_name="example_table",
         num_mappers=num_mappers,
         where=where,
         verbose=verbose,
         columns=columns if columns is not None else [],
         null_string=null_string,
         fields_terminated_by=fields_terminated_by,
         delimiter_replacement=delimiter_replacement,
         overwrite=overwrite,
         direct=direct,
         mysql_delimiters=mysql_delimiters,
     )
     return task
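A hypothetical usage of this factory in a test; the test method below is illustrative, not from the original suite. Since SqoopImportFromMysql is a Luigi task, the keyword arguments become task parameters that can be read back directly:

 def test_where_and_mappers(self):
     # Override only the options under test; the factory supplies the rest.
     task = self.create_task(where='id < 1000', num_mappers=4)
     self.assertEqual(task.where, 'id < 1000')
     self.assertEqual(task.num_mappers, 4)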
Example #2
    def insert_source_task(self):
        """
        Insert the Sqoop task that imports the source MySQL data into S3.
        """
        # Use all columns, but strip any double quotes from the column names.
        columns = [field[0].strip('"') for field in self.table_schema]
        partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
        destination = url_path_join(
            self.warehouse_path,
            self.warehouse_subdirectory,
            self.database,
            self.table_name,
            partition_path_spec
        ) + '/'

        return SqoopImportFromMysql(
            table_name=self.table_name,
            credentials=self.db_credentials,
            database=self.database,
            destination=destination,
            mysql_delimiters=False,
            fields_terminated_by=self.field_delimiter,
            null_string=self.null_marker,
            delimiter_replacement=' ',
            direct=False,
            columns=columns,
        )
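A minimal illustration of the quote-stripping above, with a made-up schema row standing in for the real self.table_schema:

table_schema = [('"id"', 'int(11)'), ('"user_name"', 'varchar(255)')]
columns = [field[0].strip('"') for field in table_schema]
assert columns == ['id', 'user_name']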
Example #3
 def insert_source_task(self):
     # Get the columns to request from Sqoop, as a side effect of
     # getting the Vertica columns. The Vertica column names are quoted, so strip the quotes off.
     column_names = [name[1:-1] for (name, _) in self.columns]
     partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
     destination = url_path_join(
         self.warehouse_path,
         self.warehouse_subdirectory,
         self.database,
         self.table_name,
         partition_path_spec
     ) + '/'
     # The arguments here to SqoopImportFromMysql should be the same as for BigQuery.
     # The old format used mysql_delimiters and direct mode. Now that direct mode has
     # been removed, we have more freedom in the other settings. We have already
     # changed null_string and field termination, and here we hardcode the replacement
     # of delimiters (like newlines) with spaces, using Sqoop's
     # --hive-delims-replacement option. We could also set other SqoopImportTask
     # parameters: escaped_by, enclosed_by, and optionally_enclosed_by. To model
     # 'mysql_delimiters=True', we would set escaped-by: \ and optionally-enclosed-by: '.
     # Instead we use the defaults for those, so there is no escaping or enclosing.
     return SqoopImportFromMysql(
         table_name=self.table_name,
         credentials=self.db_credentials,
         database=self.database,
         destination=destination,
         overwrite=self.overwrite,
         mysql_delimiters=False,
         fields_terminated_by=self.field_delimiter,
         null_string=self.null_marker,
         delimiter_replacement=' ',
         direct=False,
         columns=column_names,
     )
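A sketch of how the delimiter-related parameters above plausibly map onto Sqoop's command line. The helper function is hypothetical, but every flag shown is a standard Sqoop import option:

def sqoop_delimiter_args(fields_terminated_by, null_string, delimiter_replacement):
    # --hive-delims-replacement substitutes any Hive delimiters (\n, \r, \x01)
    # found inside field values; delimiter_replacement=' ' above relies on it.
    return [
        '--fields-terminated-by', fields_terminated_by,
        '--null-string', null_string,
        '--null-non-string', null_string,  # assumption: nulls handled uniformly
        '--hive-delims-replacement', delimiter_replacement,
    ]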
Example #4
 def requires(self):
     return SqoopImportFromMysql(
         table_name=self.table_name,
         # TODO: We may want to make the explicit passing in of columns optional as it prevents a direct transfer.
         # Make sure delimiters and nulls etc. still work after removal.
         columns=[c[0] for c in self.columns],
         destination=self.partition_location,
         credentials=self.credentials,
         num_mappers=self.num_mappers,
         verbose=self.verbose,
         overwrite=self.overwrite,
         database=self.database,
         # Hive expects NULL to be represented by the string "\N" in the data. You have to pass in "\\N" to sqoop
         # since it uses that string directly in the generated Java code, so "\\N" actually looks like "\N" to the
         # Java code. In order to get "\\N" onto the command line we have to use another set of escapes to tell the
         # python code to pass through the "\" character.
         null_string='\\\\N',
         # It's unclear why, but this setting prevents us from correctly substituting nulls with \N.
         mysql_delimiters=False,
         # Python's '\x01' is a hex escape for the Ctrl-A character (0x01), which is
         # the default separator for fields in Hive.
         fields_terminated_by='\x01',
         # Replace delimiters with a single space if they appear in the data. This prevents the import of malformed
         # records. Hive does not support escape characters or other reasonable workarounds to this problem.
         delimiter_replacement=' ',
         where=self.where,
     )
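The escaping layers described in the null_string comment can be checked directly in Python:

python_value = '\\\\N'
assert len(python_value) == 3  # the three characters: backslash, backslash, N
assert python_value == r'\\N'  # what sqoop receives on its command line
# Sqoop pastes that value into generated Java source, where the Java literal
# "\\N" denotes the two characters \N, which is Hive's NULL marker.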
Example #5
    def sqoop_export_mysql_table_task(self):
        if self.sqoop_export_task is None:
            mysql_schema = self.mysql_compliant_schema()
            column_names = [
                field_name
                for (field_name, _field_type, _field_null) in mysql_schema
            ]
            additional_metadata = {
                'table_schema': mysql_schema,
                'deleted_fields': self.deleted_fields,
                'database': self.database,
                'table_name': self.table_name,
                'date': self.date.isoformat(),
            }
            self.sqoop_export_task = SqoopImportFromMysql(
                table_name=self.table_name,
                credentials=self.db_credentials,
                database=self.database,
                destination=self.s3_location_for_table,
                overwrite=self.overwrite,
                mysql_delimiters=False,
                fields_terminated_by=self.field_delimiter,
                null_string=self.null_marker,
                delimiter_replacement=' ',
                direct=False,
                columns=column_names,
                additional_metadata=additional_metadata,
            )

        return self.sqoop_export_task
Example #6
    def insert_source_task(self):
        # Make sure yet again that columns have been calculated.
        columns = [field.name for field in self.schema]
        partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
        destination = url_path_join(
            self.warehouse_path,
            self.warehouse_subdirectory,
            self.database,
            self.table_name,
            partition_path_spec
        ) + '/'

        additional_metadata = {
            'table_schema': self.mysql_compliant_schema(),
            'deleted_fields': self.deleted_fields,
            'database': self.database,
            'table_name': self.table_name,
            'date': self.date.isoformat(),
        }

        return SqoopImportFromMysql(
            table_name=self.table_name,
            credentials=self.db_credentials,
            database=self.database,
            destination=destination,
            overwrite=self.overwrite,
            mysql_delimiters=False,
            fields_terminated_by=self.field_delimiter,
            null_string=self.null_marker,
            delimiter_replacement=' ',
            direct=False,
            columns=columns,
            additional_metadata=additional_metadata,
        )
Example #7
 def requires(self):
     table_name = 'courseware_studentmodule'
     return SqoopImportFromMysql(credentials=self.credentials,
                                 destination=url_path_join(
                                     self.dump_root, table_name),
                                 table_name=table_name,
                                 num_mappers=self.num_mappers,
                                 where=self.where,
                                 verbose=self.verbose)
Example #8
 def requires(self):
     table_name = 'courseware_studentmodule'
     return SqoopImportFromMysql(
         credentials=self.credentials,
         destination=url_path_join(self.dest, table_name),
         table_name=table_name,
         num_mappers=self.num_mappers,
         overwrite=self.sqoop_overwrite,
     )
Example #9
 def insert_source_task(self):
     partition_path_spec = HivePartition('dt',
                                         self.date.isoformat()).path_spec
     destination = url_path_join(self.warehouse_path, "database_import",
                                 self.database, self.table_name,
                                 partition_path_spec) + '/'
     return SqoopImportFromMysql(
         table_name=self.table_name,
         credentials=self.db_credentials,
         database=self.database,
         destination=destination,
         overwrite=self.overwrite,
         mysql_delimiters=True,
     )
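For contrast with the explicit delimiter settings in the earlier examples: mysql_delimiters=True corresponds to Sqoop's --mysql-delimiters flag, which per the Sqoop documentation fixes the delimiter set as follows (the dict is a summary for reference, not part of the task's API):

MYSQL_DELIMITERS = {
    'fields-terminated-by': ',',
    'lines-terminated-by': '\n',
    'escaped-by': '\\',
    'optionally-enclosed-by': "'",
}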