def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None: nonlocal date_order_style if date_order_style not in (None, style): cant_handle_hint(fail_if_cant_handle_hint, hint_name, hints) else: date_order_style = style quiet_remove(unhandled_hints, hint_name)
def determine_date_output_style(unhandled_hints: Set[str],
                                hints: ValidatedRecordsHints,
                                fail_if_cant_handle_hint: bool) -> \
        Tuple[DateOutputStyle, Optional[DateOrderStyle]]:
    """Determine the Postgres date/time output style implied by the records hints.

    :param unhandled_hints: Hint names not yet accounted for; the four
        date/time hints are removed from this set once processed.
    :param hints: Validated records hints to interpret (see docs in the
        types module).
    :param fail_if_cant_handle_hint: If True, unsupported hint values
        raise; otherwise we log and fall back to 'ISO'.
    :return: Tuple of (date output style, optional date order style).
    """
    dateformat = hints.dateformat
    timeonlyformat = hints.timeonlyformat
    datetimeformattz = hints.datetimeformattz
    datetimeformat = hints.datetimeformat

    date_order_style: Optional[DateOrderStyle] = None
    # Default up front so a sane value is returned even when
    # cant_handle_hint() is configured not to raise.  (Previously this
    # was assigned only in the happy path, so the fallback branch hit an
    # UnboundLocalError at the return statement.)
    date_output_style: DateOutputStyle = 'ISO'

    if (dateformat == 'YYYY-MM-DD' and
       timeonlyformat in ['HH24:MI:SS', 'HH:MI:SS'] and
       datetimeformattz in ['YYYY-MM-DD HH:MI:SSOF',
                            'YYYY-MM-DD HH24:MI:SSOF'] and
       datetimeformat in ['YYYY-MM-DD HH24:MI:SS',
                          'YYYY-MM-DD HH:MI:SS']):
        # date_order_style doesn't really matter, as ISO is not ambiguous
        pass
    else:
        # 'SQL', 'Postgres' and 'German' all support only alphabetic
        # timezone indicators, which aren't yet supported in the
        # records spec
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)

    quiet_remove(unhandled_hints, 'dateformat')
    quiet_remove(unhandled_hints, 'timeonlyformat')
    quiet_remove(unhandled_hints, 'datetimeformattz')
    quiet_remove(unhandled_hints, 'datetimeformat')
    return (date_output_style, date_order_style)
def postgres_copy_options_common(unhandled_hints: Set[str],
                                 hints: ValidatedRecordsHints,
                                 fail_if_cant_handle_hint: bool,
                                 original_postgres_options: PostgresCopyOptions) ->\
        PostgresCopyOptions:
    """Apply the COPY options shared between CSV and TEXT formats.

    Starts from a copy of the format-specific options provided and adds
    the ENCODING, HEADER and DELIMITER options based on the records
    hints, marking each handled hint off of unhandled_hints.

    :param unhandled_hints: Hint names not yet accounted for.
    :param hints: Validated records hints to interpret.
    :param fail_if_cant_handle_hint: If True, raise on unsupported hints.
    :param original_postgres_options: Format-specific options to extend
        (not mutated).
    :return: New options dict with the common options added.
    """
    out = original_postgres_options.copy()

    # ENCODING -- "Specifies that the file is encoded in the
    # encoding_name.  If this option is omitted, the current client
    # encoding is used."
    encoding_hint = hints.encoding
    if encoding_hint not in postgres_encoding_names:
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
    else:
        out['encoding'] = postgres_encoding_names[encoding_hint]
        quiet_remove(unhandled_hints, 'encoding')

    # HEADER -- "Specifies that the file contains a header line with the
    # names of each column in the file.  On output, the first line
    # contains the column names from the table, and on input, the first
    # line is ignored.  This option is allowed only when using CSV
    # format."
    quiet_remove(unhandled_hints, 'header-row')
    out['header'] = hints.header_row

    # DELIMITER -- "Specifies the character that separates columns
    # within each row (line) of the file.  The default is a tab
    # character in text format, a comma in CSV format.  This must be a
    # single one-byte character."
    out['delimiter'] = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    return out
def mysql_load_options(unhandled_hints: Set[str],
                       records_format: DelimitedRecordsFormat,
                       fail_if_cant_handle_hint: bool) -> MySqlLoadOptions:
    """Translate a delimited records format into MySQL LOAD DATA options.

    Reference: https://dev.mysql.com/doc/refman/8.0/en/load-data.html

    :param unhandled_hints: Hint names not yet accounted for; each hint
        processed here is removed from this set.
    :param records_format: Delimited records format whose hints should
        be mapped onto LOAD DATA clauses.
    :param fail_if_cant_handle_hint: If True, raise on unsupported hint
        values; otherwise log and use a reasonable default.
    :return: MySqlLoadOptions capturing the LOAD DATA clauses to use.
    """
    hints = records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    # CHARACTER SET -- "The server uses the character set indicated by
    # the character_set_database system variable to interpret the
    # information in the file. ... If the contents of the input file use
    # a character set that differs from the default, it is usually
    # preferable to specify the character set of the file by using the
    # CHARACTER SET clause."
    hint_encoding: HintEncoding = hints.encoding
    character_set = MYSQL_CHARACTER_SETS_FOR_LOAD.get(hint_encoding)
    if character_set is not None:
        mysql_character_set = character_set
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', records_format.hints)
        mysql_character_set = 'utf8'
    quiet_remove(unhandled_hints, 'encoding')

    # FIELDS TERMINATED BY
    field_terminator: HintFieldDelimiter = hints.field_delimiter
    mysql_fields_terminator = field_terminator
    quiet_remove(unhandled_hints, 'field-delimiter')

    # FIELDS [OPTIONALLY] ENCLOSED BY -- per the LOAD DATA docs, a
    # typical CSV export has "fields separated by commas and enclosed
    # within double quotation marks", e.g.:
    #
    #   LOAD DATA INFILE 'data.txt' INTO TABLE tbl_name
    #     FIELDS TERMINATED BY ',' ENCLOSED BY '"'
    #     LINES TERMINATED BY '\r\n'
    #     IGNORE 1 LINES;
    mysql_fields_enclosed_by = None
    mysql_fields_optionally_enclosed_by = None
    hint_quotechar: HintQuoteChar = hints.quotechar
    hint_quoting: HintQuoting = hints.quoting
    if hint_quoting == 'all':
        mysql_fields_enclosed_by = hint_quotechar
    elif hint_quoting == 'minimal':
        # "If the input values are not necessarily enclosed within
        # quotation marks, use OPTIONALLY before the ENCLOSED BY
        # option."  This implies parsing here is permissive - otherwise
        # unambiguous strings without double quotes around them will be
        # understood as a string, not rejected.
        mysql_fields_optionally_enclosed_by = hint_quotechar
    elif hint_quoting == 'nonnumeric':
        mysql_fields_optionally_enclosed_by = hint_quotechar
    elif hint_quoting is None:
        pass
    else:
        # BUG FIX: previously passed hint_quotechar here, which defeated
        # mypy's exhaustiveness check on the quoting Literal type.
        _assert_never(hint_quoting)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'quotechar')

    # "To avoid ambiguity, occurrences of the ENCLOSED BY character
    # within a field value can be doubled and are interpreted as a
    # single instance of the character."  That matches doublequote=True;
    # MySQL offers no way to turn the doubling interpretation off while
    # quoting is in use.
    hint_doublequote: HintDoublequote = hints.doublequote
    if hint_quoting is not None:
        # We need to ignore flake8's "is vs ==" check because 'is'
        # doesn't work currently with MyPy's Literal[] case checking
        if hint_doublequote == True:  # noqa: E712
            pass
        elif hint_doublequote == False:  # noqa: E712
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote',
                             records_format.hints)
        else:
            _assert_never(hint_doublequote)
    quiet_remove(unhandled_hints, 'doublequote')

    # FIELDS ESCAPED BY -- "For input, if the FIELDS ESCAPED BY
    # character is not empty, occurrences of that character are stripped
    # and the following character is taken literally as part of a field
    # value. ... If the FIELDS ESCAPED BY character is empty,
    # escape-sequence interpretation does not occur."
    hint_escape: HintEscape = hints.escape
    if hint_escape is None:
        mysql_fields_escaped_by = None
    elif hint_escape == '\\':
        mysql_fields_escaped_by = '\\'
    else:
        # BUG FIX: previously passed hint_quoting here; the variable
        # being exhaustively narrowed is hint_escape.
        _assert_never(hint_escape)
    quiet_remove(unhandled_hints, 'escape')

    mysql_lines_starting_by = ''

    # LINES TERMINATED BY
    hint_record_terminator: HintRecordTerminator = hints.record_terminator
    mysql_lines_terminated_by = hint_record_terminator
    quiet_remove(unhandled_hints, 'record-terminator')

    # IGNORE n LINES -- skip the header row on input if there is one.
    hint_header_row: HintHeaderRow = hints.header_row
    # We need to ignore flake8's "is vs ==" check because 'is'
    # doesn't work currently with MyPy's Literal[] case checking
    if hint_header_row == True:  # noqa: E712
        mysql_ignore_n_lines = 1
    elif hint_header_row == False:  # noqa: E712
        mysql_ignore_n_lines = 0
    else:
        _assert_never(hint_header_row)
    quiet_remove(unhandled_hints, 'header-row')

    # LOAD DATA reads the file as-is; compressed input is not supported.
    hint_compression: HintCompression = hints.compression
    if hint_compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression',
                         records_format.hints)
    quiet_remove(unhandled_hints, 'compression')

    # Date/time parsing in MySQL seems to be permissive enough to
    # accept all formats we've sent at it in integration tests.  That
    # said, DD/MM support is unlikely to work if the server isn't set
    # to a compatible locale.  This is true for a number of the
    # database drivers; the backlog item to address is here:
    #
    #   https://github.com/bluelabsio/records-mover/issues/75
    #
    # To address, we'll want to look into "set trade_date" per
    # https://stackoverflow.com/questions/44171283/load-data-local-infile-with-sqlalchemy-and-pymysql
    quiet_remove(unhandled_hints, 'dateformat')
    quiet_remove(unhandled_hints, 'timeonlyformat')
    quiet_remove(unhandled_hints, 'datetimeformat')
    quiet_remove(unhandled_hints, 'datetimeformattz')

    return MySqlLoadOptions(character_set=mysql_character_set,
                            fields_terminated_by=mysql_fields_terminator,
                            fields_enclosed_by=mysql_fields_enclosed_by,
                            fields_optionally_enclosed_by=mysql_fields_optionally_enclosed_by,
                            fields_escaped_by=mysql_fields_escaped_by,
                            lines_starting_by=mysql_lines_starting_by,
                            lines_terminated_by=mysql_lines_terminated_by,
                            ignore_n_lines=mysql_ignore_n_lines)
def postgres_copy_options_csv(unhandled_hints: Set[str],
                              hints: ValidatedRecordsHints,
                              fail_if_cant_handle_hint: bool,
                              mode: CopyOptionsModeType) ->\
        PostgresCopyOptions:
    """Build COPY options for Postgres' CSV format from records hints.

    :param unhandled_hints: Hint names not yet accounted for; each hint
        processed here is removed from this set.
    :param hints: Validated records hints to interpret.
    :param fail_if_cant_handle_hint: If True, raise on unsupported hints.
    :param mode: Whether we are LOADING (COPY FROM) or UNLOADING
        (COPY TO) -- several options are only valid in one direction.
    :return: COPY options dict, including the common options.
    """
    postgres_options: PostgresCopyOptions = {}

    # FORMAT -- "Selects the data format to be read or written: text,
    # csv (Comma Separated Values), or binary.  The default is text."
    postgres_options['format'] = 'csv'

    # NULL -- "Specifies the string that represents a null value.  The
    # default is \N (backslash-N) in text format, and an unquoted empty
    # string in CSV format."  We rely on the default.

    # QUOTE -- "Specifies the quoting character to be used when a data
    # value is quoted.  The default is double-quote.  This must be a
    # single one-byte character.  This option is allowed only when
    # using CSV format."
    postgres_options['quote'] = hints.quotechar
    quiet_remove(unhandled_hints, 'quotechar')

    # ESCAPE -- "Specifies the character that should appear before a
    # data character that matches the QUOTE value.  The default is the
    # same as the QUOTE value (so that the quoting character is doubled
    # if it appears in the data)."  So doublequote=True is the only
    # behavior Postgres CSV supports, and a separate escape character
    # is not.
    if not hints.doublequote:
        cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
    else:
        quiet_remove(unhandled_hints, 'doublequote')
    if hints.escape is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)
    else:
        quiet_remove(unhandled_hints, 'escape')

    # FORCE_QUOTE -- "Forces quoting to be used for all non-NULL values
    # in each specified column.  NULL output is never quoted.  If * is
    # specified, non-NULL values will be quoted in all columns.  This
    # option is allowed only in COPY TO, and only when using CSV
    # format."
    if mode is CopyOptionsMode.LOADING:
        # On input, Postgres parses quoting permissively, which matches
        # only the 'minimal' hint.
        if hints.quoting != 'minimal':
            cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints)
        else:
            quiet_remove(unhandled_hints, 'quoting')
    elif mode is CopyOptionsMode.UNLOADING:
        # "The values in each record are separated by the DELIMITER
        # character.  If the value contains the delimiter character,
        # the QUOTE character, the NULL string, a carriage return, or
        # line feed character, then the whole value is prefixed and
        # suffixed by the QUOTE character. ... You can also use
        # FORCE_QUOTE to force quotes when outputting non-NULL values
        # in specific columns."
        if hints.quoting == 'minimal':
            pass  # default
        elif hints.quoting == 'all':
            postgres_options['force_quote'] = '*'
        else:
            cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints)
    else:
        _assert_never(mode)
    quiet_remove(unhandled_hints, 'quoting')

    # As of the 9.2 release (documentation as of 2019-03-12), there's
    # no statement in the docs on what newline formats are accepted in
    # "CSV" mode:
    #
    #   https://www.postgresql.org/docs/9.2/sql-copy.html#AEN67247
    #
    # Manual testing (loading the delimited-bigquery-with-header.csv
    # integration-test resource after unix2mac/unix2dos conversion)
    # shows that unix, mac and dos newlines all load fine.
    if mode is CopyOptionsMode.LOADING:
        if hints.record_terminator in ("\n", "\r\n", "\r", None):
            quiet_remove(unhandled_hints, 'record-terminator')
        else:
            # BUG FIX: the hint name was misspelled 'records-terminator'
            # here, so failures reported a nonexistent hint.
            cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    elif mode is CopyOptionsMode.UNLOADING:
        # No control for this is given - exports appear with unix
        # newlines.
        if hints.record_terminator == "\n":
            quiet_remove(unhandled_hints, 'record-terminator')
        else:
            # BUG FIX: same 'records-terminator' misspelling as above.
            cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    else:
        _assert_never(mode)

    # COPY reads/writes the raw file; compression isn't supported.
    if hints.compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    else:
        quiet_remove(unhandled_hints, 'compression')

    return postgres_copy_options_common(unhandled_hints,
                                        hints,
                                        fail_if_cant_handle_hint,
                                        postgres_options)
def determine_input_date_order_style(unhandled_hints: Set[str],
                                     hints: ValidatedRecordsHints,
                                     fail_if_cant_handle_hint: bool) ->\
        Optional[DateOrderStyle]:
    """Determine the DateStyle ordering Postgres needs to parse our dates.

    Per https://www.postgresql.org/docs/9.5/datatype-datetime.html#DATATYPE-DATETIME-INPUT:
    "Date and time input is accepted in almost any reasonable format
    ... For some formats, ordering of day, month, and year in date
    input is ambiguous and there is support for specifying the expected
    ordering of these fields.  Set the DateStyle parameter to MDY to
    select month-day-year interpretation, DMY to select day-month-year
    interpretation, or YMD to select year-month-day interpretation."
    (The dockerized-postgres default is "ISO, MDY".)

    :param unhandled_hints: Hint names not yet accounted for; each hint
        processed here is removed from this set.
    :param hints: Validated records hints to interpret.
    :param fail_if_cant_handle_hint: If True, raise on unsupported or
        conflicting hint values.
    :return: The DateOrderStyle to set, or None if any style will do.
    """
    date_order_style: Optional[DateOrderStyle] = None

    def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None:
        # Record the ordering hint_name requires; complain if an
        # earlier hint already pinned a conflicting ordering.
        nonlocal date_order_style
        if date_order_style not in (None, style):
            cant_handle_hint(fail_if_cant_handle_hint, hint_name, hints)
        else:
            date_order_style = style
            quiet_remove(unhandled_hints, hint_name)

    # datetimeformattz: Valid values: "YYYY-MM-DD HH:MI:SSOF",
    # "YYYY-MM-DD HH:MI:SS", "YYYY-MM-DD HH24:MI:SSOF", "MM/DD/YY
    # HH24:MI".  See Redshift docs for more information (note that HH:
    # is equivalent to HH24: and that if you don't provide an offset
    # (OF), times are assumed to be in UTC).
    # Default value is "YYYY-MM-DD HH:MI:SSOF".
    datetimeformattz = hints.datetimeformattz
    if datetimeformattz in ['YYYY-MM-DD HH:MI:SSOF',
                            'YYYY-MM-DD HH24:MI:SSOF']:
        # Verified: select timestamptz '2020-01-01 23:01:01-10' parses.
        # Any DateStyle will do as this is unambiguous.
        quiet_remove(unhandled_hints, 'datetimeformattz')
    elif datetimeformattz == 'YYYY-MM-DD HH:MI:SS':
        # Verified: select timestamptz '2020-01-01 23:01:01' parses.
        # Any DateStyle will do as this is unambiguous.
        quiet_remove(unhandled_hints, 'datetimeformattz')
    elif datetimeformattz == "MM/DD/YY HH24:MI":
        # Verified: select timestamptz '01/02/2999 23:01' parses as
        # Jan 2 under MDY.
        upgrade_date_order_style('MDY', 'datetimeformattz')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)

    # datetimeformat: Valid values: "YYYY-MM-DD HH24:MI:SS",
    # "YYYY-MM-DD HH12:MI AM", "MM/DD/YY HH24:MI".  See Redshift docs
    # for more information.
    datetimeformat = hints.datetimeformat
    if datetimeformat in ("YYYY-MM-DD HH24:MI:SS",
                          "YYYY-MM-DD HH:MI:SS"):
        # Verified: select timestamp '2020-01-02 15:13:12' parses.
        # Any DateStyle will do as this is unambiguous.
        quiet_remove(unhandled_hints, 'datetimeformat')
    elif datetimeformat == "YYYY-MM-DD HH12:MI AM":
        # Verified: select timestamp '2020-01-02 1:13 PM' parses.
        # Any DateStyle will do as this is unambiguous.
        quiet_remove(unhandled_hints, 'datetimeformat')
    elif datetimeformat == "MM/DD/YY HH24:MI":
        # Verified: select timestamp '01/02/20 15:23' parses as Jan 2
        # under MDY.
        upgrade_date_order_style('MDY', 'datetimeformat')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)

    # timeonlyformat: Valid values: "HH12:MI AM" (e.g., "1:00 PM"),
    # "HH24:MI:SS" (e.g., "13:00:00")
    timeonlyformat = hints.timeonlyformat
    if timeonlyformat == "HH12:MI AM":
        # Verified: select time '1:00 PM' yields 13:00:00.  Supported!
        quiet_remove(unhandled_hints, 'timeonlyformat')
    elif timeonlyformat == "HH24:MI:SS":
        # Verified: select time '13:00:00' parses.  Supported!
        quiet_remove(unhandled_hints, 'timeonlyformat')
    else:
        # BUG FIX: this branch previously reported 'datetimeformat'
        # (copy-paste error); the hint being rejected is
        # 'timeonlyformat'.
        cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)

    # dateformat: Valid values: null, "YYYY-MM-DD", "MM-DD-YYYY",
    # "DD-MM-YYYY", "MM/DD/YY".
    dateformat = hints.dateformat
    if dateformat == "YYYY-MM-DD":
        # Verified: select date '1999-01-02' parses.
        # Any DateStyle will do as this is unambiguous.
        quiet_remove(unhandled_hints, 'dateformat')
    elif dateformat == "MM-DD-YYYY":
        # Verified: select date '01-02-1999' parses as Jan 2 under MDY.
        upgrade_date_order_style('MDY', 'dateformat')
    elif dateformat == "DD-MM-YYYY":
        # Not supported by default -- need to switch to DMY so that
        # select date '02-01-1999' parses as Feb 1.
        upgrade_date_order_style('DMY', 'dateformat')
    elif dateformat == "MM/DD/YY":
        # Verified: select date '01/02/99' parses as Jan 2 1999 under
        # MDY.
        upgrade_date_order_style('MDY', 'dateformat')
    elif dateformat is None:
        # null implies that that date format is unknown, and that the
        # implementation SHOULD generate using their default value and
        # parse permissively.
        # ...which is what Postgres does!
        quiet_remove(unhandled_hints, 'dateformat')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)

    return date_order_style
def postgres_copy_options_text(unhandled_hints: Set[str],
                               hints: ValidatedRecordsHints,
                               fail_if_cant_handle_hint: bool,
                               mode: CopyOptionsModeType) ->\
        PostgresCopyOptions:
    """Build COPY options for Postgres' TEXT format from records hints.

    :param unhandled_hints: Hint names not yet accounted for; each hint
        processed here is removed from this set.
    :param hints: Validated records hints to interpret.
    :param fail_if_cant_handle_hint: If True, raise on unsupported hints.
    :param mode: Whether we are LOADING (COPY FROM) or UNLOADING
        (COPY TO).
    :return: COPY options dict, including the common options.
    """
    # FORMAT -- "Selects the data format to be read or written: text,
    # csv (Comma Separated Values), or binary.  The default is text."
    text_options: PostgresCopyOptions = {'format': 'text'}

    # "Backslash characters (\) can be used in the COPY data to quote
    # data characters that might otherwise be taken as row or column
    # delimiters."  Text format therefore requires backslash escaping;
    # a records format with no escape character can't be represented.
    if hints.escape is not None:
        quiet_remove(unhandled_hints, 'escape')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)

    # QUOTE is a CSV-format-only option; text format has no quoting
    # concept at all, so the quotechar hint is simply irrelevant here.
    quiet_remove(unhandled_hints, 'quotechar')

    # Likewise ESCAPE-via-doubling ("the quoting character is doubled
    # if it appears in the data") is CSV-only, so doublequote=True
    # can't be honored in text format.
    if hints.doublequote:
        cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
    else:
        quiet_remove(unhandled_hints, 'doublequote')

    # FORCE_QUOTE is "allowed only in COPY TO, and only when using CSV
    # format" -- no quoting of any kind is available in text format.
    if hints.quoting is None:
        quiet_remove(unhandled_hints, 'quoting')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints)

    # "COPY TO will terminate each row with a Unix-style newline
    # ("\n"). ... COPY FROM can handle lines ending with newlines,
    # carriage returns, or carriage return/newlines.  To reduce the
    # risk of error due to un-backslashed newlines or carriage returns
    # that were meant as data, COPY FROM will complain if the line
    # endings in the input are not all alike."
    if mode is CopyOptionsMode.LOADING:
        if hints.record_terminator in ("\n", "\r", "\r\n"):
            quiet_remove(unhandled_hints, 'record-terminator')
        else:
            cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    elif mode is CopyOptionsMode.UNLOADING:
        # No control is given over output newlines - exports always
        # appear with unix newlines.
        if hints.record_terminator == "\n":
            quiet_remove(unhandled_hints, 'record-terminator')
        else:
            cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    else:
        _assert_never(mode)

    # COPY reads/writes the raw file; compression isn't supported.
    if hints.compression is None:
        quiet_remove(unhandled_hints, 'compression')
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)

    return postgres_copy_options_common(unhandled_hints,
                                        hints,
                                        fail_if_cant_handle_hint,
                                        text_options)