def split_file(file_name, delimiter=',', row_limit=10000, parts=0, output_dir='./'):
    '''
    Split a csv file into either a number of parts or by a row limit so we
    can process smaller files. When both are supplied, parts takes
    precedence over row_limit.
    '''
    split_file_list = []
    header_path = None
    total_rows = 0
    # if file_name is None, it means no csv file
    if file_name is not None:
        if not validate_file(file_name):
            raise Exception('Unable to split invalid file')
        create_directory(output_dir)

        with open_udl_file(file_name) as csvfile:
            data = csv.reader(csvfile, delimiter=delimiter)
            header = next(data)
            # Get the total number of records
            for row in data:
                total_rows += 1
            if total_rows == 0:
                raise Exception('CSV file has no data')
            if parts == 0:
                parts = math.ceil(total_rows / row_limit)
            # Recalculate values
            row_limit = math.ceil(total_rows / parts)
            parts = math.ceil(total_rows / row_limit)
            # Split files
            for i in range(1, parts + 1):
                csvfile.seek(0)
                # Generate file names that will be loaded into fdw
                output_file = os.path.join(output_dir, 'part_' + str(uuid4()) + '.csv')
                with open_udl_file(output_file, 'w') as writerfile:
                    csvwriter = csv.writer(writerfile, delimiter=delimiter)
                    row_count = 0
                    # after seek(0) the reader restarts at the header line, so
                    # start skips the header plus the rows of earlier parts
                    start = row_limit * (i - 1) + 1
                    end = i * row_limit + 1
                    # Slice the iterator based on start and end
                    for row in itertools.islice(data, start, end):
                        csvwriter.writerow(row)
                        row_count += 1
                    split_file_list.append([output_file, row_count, start])

        # save headers to output dir
        header_path = os.path.join(output_dir, 'headers.csv')
        with open_udl_file(header_path, 'w') as csv_header_file:
            header_writer = csv.writer(csv_header_file, delimiter=delimiter)
            header_writer.writerow(header)
            csv_header_file.flush()  # make sure the header row is written to disk before downstream readers use it

    return split_file_list, header_path, total_rows, os.path.getsize(file_name) if file_name is not None else 0
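
# A minimal usage sketch for split_file (illustrative only; it assumes the
# module helpers validate_file, create_directory and open_udl_file are
# available and accept plain filesystem paths). Write a small sample csv,
# split it into two parts, and inspect the returned metadata.
def _split_file_demo(tmp_dir='/tmp/split_demo'):
    os.makedirs(tmp_dir, exist_ok=True)
    sample = os.path.join(tmp_dir, 'sample.csv')
    with open(sample, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['guid', 'score'])                    # header row
        writer.writerows([[str(i), i * 10] for i in range(100)])
    parts, header_path, total_rows, size = split_file(sample, parts=2, output_dir=tmp_dir)
    for part_path, row_count, start_row in parts:
        print(part_path, row_count, start_row)                # two parts of 50 rows each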
def get_attribute_value_from_json_keypath(json_file_path, *attribute_key_path):
    '''
    Determine the attribute value from the json file contents
    @param json_file_path: The full directory pathname of the json file
    @type string
    @param attribute_key_path: the key path to search the json for
    @type tuple
    @return: value of the key
    @rtype: string
    '''

    attribute_value = None

    with open_udl_file(json_file_path) as json_file:
        try:
            json_object = json.load(json_file, object_hook=CaseInsensitiveDict)
            attribute_value = json_object
            for key in attribute_key_path:
                attribute_value = attribute_value.get(key)
        except ValueError:
            logger.error('Malformed json file %s' % json_file_path)
        except KeyError:
            logger.error('Cannot find key %s in file %s' %
                         (str(attribute_key_path), json_file_path))
        except AttributeError:
            logger.error('The given path %s in file %s is invalid' %
                         (str(attribute_key_path), json_file_path))

    return attribute_value
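
# A short, self-contained sketch of the key-path lookup above; the file
# contents and keys here are made up for illustration. The lookup is
# case-insensitive because of the CaseInsensitiveDict object_hook.
def _keypath_demo():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump({'identification': {'Guid': 'abc-123'}}, f)
        path = f.name
    print(get_attribute_value_from_json_keypath(path, 'identification', 'guid'))  # abc-123
    print(get_attribute_value_from_json_keypath(path, 'no', 'such', 'path'))      # None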
    def execute(self, dir_path, file_name, batch_sid):
        """
        Check to make sure the file does not contain duplicate headers

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """
        full_path = abs_path_join(dir_path, file_name)

        processed_headers = []
        headers = None

        try:
            with open_udl_file(full_path, 'rU') as file_to_validate:
                reader = csv.reader(file_to_validate)
                while headers is None or len(headers) == 0:
                    headers = next(reader)
        except FileNotFoundError:
            return (ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid)
        except Exception:
            return (ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid)

        for header in headers:
            if header.lower() in processed_headers:
                return (ErrorCode.SRC_FILE_HAS_DUPLICATE_HEADERS, dir_path, file_name, batch_sid)
            elif len(header) > 0:
                processed_headers.append(header.lower())

        return (ErrorCode.STATUS_OK, dir_path, file_name, batch_sid)
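
# The duplicate-header rule in execute() above is case-insensitive and
# ignores empty header cells. The same rule as a stand-alone sketch:
def _has_duplicate_headers(header_row):
    seen = set()
    for header in header_row:
        name = header.lower()
        if name in seen:
            return True
        if name:                          # empty cells are not tracked
            seen.add(name)
    return False

# _has_duplicate_headers(['guid', 'Score', 'GUID'])  -> True
# _has_duplicate_headers(['guid', '', '', 'score'])  -> False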
    def execute(self, dir_path, file_name, batch_sid):
        """Check if file has at least one data row, and make sure the data row
        and headers contain the same number of fields

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """
        full_path = abs_path_join(dir_path, file_name)

        try:
            # read the header and the first data row; if the second next()
            # succeeds, the file has at least one data row. The with-block
            # ensures the file is closed on every path.
            with open_udl_file(full_path, 'rU') as file_to_validate:
                file_reader = csv.reader(file_to_validate)
                next(file_reader)
                next(file_reader)
        except StopIteration:
            return ErrorCode.SRC_FILE_HAS_NO_DATA, dir_path, file_name, batch_sid
        except FileNotFoundError:
            return ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid
        except Exception:
            return ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid

        return ErrorCode.STATUS_OK, dir_path, file_name, batch_sid
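
# Why the two next() calls above work: the first consumes the header, the
# second must yield a data row, and an exhausted csv reader raises
# StopIteration. A stand-alone illustration with an in-memory file:
def _has_data_row(text):
    import io
    reader = csv.reader(io.StringIO(text))
    try:
        next(reader)   # header row
        next(reader)   # first data row
    except StopIteration:
        return False
    return True

# _has_data_row('a,b\n')       -> False (header only)
# _has_data_row('a,b\n1,2\n')  -> True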
    def execute(self, dir_path, file_name, batch_sid):
        """Execute that the file is indeed comma delimited

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """

        # get full path and open file
        full_path = abs_path_join(dir_path, file_name)
        try:
            file_to_validate = open_udl_file(full_path, 'rU')
        except FileNotFoundError:
            return (ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid)
        except Exception:
            return (ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid)

        sample_data = None
        # use csv.sniffer to detect the dialect, then close the file
        try:
            # workaround for http://bugs.python.org/issue10515
            # csv sniffer doesn't like lines that end in any type of quote,
            # so detect quoted strings at end-of-line and replace them with 'FILLER'
            sample_data = file_to_validate.read(1024)
            match_results = re.findall(r'(["].*["])\s*$', sample_data, re.MULTILINE)
            for result in match_results:
                sample_data = sample_data.replace(result, 'FILLER')
            dialect = csv.Sniffer().sniff(sample_data, ',')
        except csv.Error:
            # if csv.Sniffer throws an exception we got a strange encoding.
            # In order not to interfere with encodings that DO work, only
            # apply the hack when an exception is thrown
            try:
                # workaround for an exception thrown by the sniffer for the
                # Western Europe (DOS/OS2-850 International) encoding, and
                # probably others as well
                sample_data = sample_data.replace('\n', '\r')
                dialect = csv.Sniffer().sniff(sample_data, ',')
            except csv.Error:
                # if the workaround also fails, report a wrong delimiter
                file_to_validate.close()
                return (ErrorCode.SRC_FILE_WRONG_DELIMITER, dir_path, file_name, batch_sid)

        file_to_validate.close()

        # verify the delimiting character is a comma
        if dialect.delimiter != ',':
            return (ErrorCode.SRC_FILE_WRONG_DELIMITER, dir_path, file_name, batch_sid)

        return (ErrorCode.STATUS_OK, dir_path, file_name, batch_sid)
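
# csv.Sniffer from the standard library infers a dialect from a text
# sample; the validator above accepts only a comma. A stand-alone
# illustration of the same calls:
def _sniff_demo():
    print(csv.Sniffer().sniff('name,score\nann,10\n', ',').delimiter)  # ','
    try:
        csv.Sniffer().sniff('a|b|c\n1|2|3\n', ',')
    except csv.Error:
        print('could not find a comma delimiter')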
def read_json_file(json_file):
    '''
    Read a json file into a dictionary
    @param json_file: The path to the json file to read
    @return: A dictionary containing the data from the json file
    @rtype: dict
    '''

    with open_udl_file(json_file, 'r') as jf:
        return json.load(jf)
def check_header_contains_column(csv_file, column):
    """Open the csv file and determine if the file contains the column

    :param csv_file: the name of the csv file
    :param column: the name of the column to look for (case-insensitive)
    :returns True if the file contains the column, False otherwise
    """
    with open_udl_file(csv_file, 'r') as fp:
        csv_reader = csv.reader(fp)
        header = next(csv_reader)
        header = [x.lower() for x in header]
        return column.lower() in header
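
# A short usage sketch for check_header_contains_column; the file path and
# column names are made up, and open_udl_file is assumed to handle plain
# local files.
def _contains_column_demo(path='/tmp/contains_demo.csv'):
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerow(['Guid', 'Score'])
    print(check_header_contains_column(path, 'guid'))   # True
    print(check_header_contains_column(path, 'grade'))  # False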
    def execute(self, dir_path, file_name, batch_sid):
        """Check to make sure the file contains non-empty headers

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """
        # get full path and read the first four non-empty lines from the file
        full_path = abs_path_join(dir_path, file_name)
        first_four_lines = []
        try:
            with open_udl_file(full_path, 'rU') as file_to_validate:
                for line in file_to_validate:
                    line = line.strip()
                    if line:  # str.strip() never returns None, so truthiness suffices
                        first_four_lines.append(line)
                        if len(first_four_lines) >= 4:
                            break
        except FileNotFoundError:
            return (ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid)
        except Exception:
            return (ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid)

        if len(first_four_lines) == 0:
            # No rows, so no header.
            return (ErrorCode.SRC_FILE_HAS_NO_HEADERS, dir_path, file_name, batch_sid)

        header = first_four_lines[0]
        if not any(column.strip() for column in header.split(",")):
            # Header contains no names. Just commas and spaces.
            return (ErrorCode.SRC_FILE_HAS_NO_HEADERS, dir_path, file_name, batch_sid)

        # Pass first 4 lines to CSV header sniffer
        first_four_lines_str = os.linesep.join(first_four_lines)

        has_headers = csv.Sniffer().has_header(first_four_lines_str)

        if not has_headers:
            return (ErrorCode.SRC_FILE_HAS_NO_HEADERS, dir_path, file_name, batch_sid)
        else:
            return (ErrorCode.STATUS_OK, dir_path, file_name, batch_sid)
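
# csv.Sniffer().has_header is a standard-library heuristic: it compares the
# first row against the following rows (field types and string lengths) to
# guess whether a header is present. Stand-alone examples:
def _has_header_demo():
    sniffer = csv.Sniffer()
    print(sniffer.has_header('name,score\nann,10\nbob,12\n'))  # True
    print(sniffer.has_header('1,2\n3,4\n5,6\n'))               # False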
    def execute(self, dir_path, file_name, batch_sid):
        '''
        Run json.load() on the given file; if it is invalid JSON, the exception is caught and the proper error code returned

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        '''
        complete_path = os.path.join(dir_path, file_name)
        with open_udl_file(complete_path) as f:
            try:
                json.load(f)
                return (ErrorCode.STATUS_OK, dir_path, file_name, batch_sid)
            except ValueError:
                return (ErrorCode.SRC_JSON_INVALID_STRUCTURE, dir_path, file_name, batch_sid)
    def execute(self, dir_path, file_name, batch_sid):
        """Execute to make sure that the number of headers is the same as the
        number of data-points on the first _lines_to_validate lines.

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """
        # get full path, open file and create reader
        full_path = abs_path_join(dir_path, file_name)
        try:
            with open_udl_file(full_path, 'rU') as file_to_validate:
                file_reader = csv.reader(file_to_validate)

                # get the headers
                headers = next(file_reader)
                num_headers = len(headers)

                # check the number of data points
                while (file_reader.line_num - 1) < self._lines_to_validate:
                    try:
                        line = next(file_reader)
                    # check that we haven't hit the end of the file
                    except StopIteration:
                        return ErrorCode.SRC_FILE_HAS_NO_DATA, dir_path, file_name, batch_sid

                    # validate the number of data entries
                    if len(line) != num_headers or self._empty_header_has_data(headers, line):
                        return ErrorCode.SRC_FILE_HEADERS_MISMATCH_DATA, dir_path, file_name, batch_sid
        except FileNotFoundError:
            return ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid
        except Exception:
            return ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid

        # we passed all tests
        return ErrorCode.STATUS_OK, dir_path, file_name, batch_sid
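
# The width check in execute() above is a plain length comparison between
# each data row and the header row (_empty_header_has_data is class-specific
# and not shown in this snippet). The core rule, stand-alone:
def _row_width_matches(header_row, data_row):
    return len(data_row) == len(header_row)

# _row_width_matches(['a', 'b', 'c'], ['1', '2', '3'])  -> True
# _row_width_matches(['a', 'b', 'c'], ['1', '2'])       -> False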
    def execute(self, dir_path, file_name, batch_sid):
        '''
        Iterate through all the elements of mapping, and check that we can reach all expected fields using
        the provided paths.

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid, field) or (status_code, dir_path, file_name, batch_sid)
        '''

        complete_path = os.path.join(dir_path, file_name)
        with open_udl_file(complete_path) as f:
            json_object = json.load(f)
            mapping = self.mapping
            for field, path in mapping.items():
                if not self.does_json_path_exist(json_object, path):
                    return (ErrorCode.SRC_JSON_INVALID_FORMAT, dir_path, file_name, batch_sid, field)
            return (ErrorCode.STATUS_OK, dir_path, file_name, batch_sid)
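
# does_json_path_exist is defined elsewhere on the validator class; as a
# plausible sketch (an assumption, not the project's actual implementation),
# it could walk a dot-separated key path through nested dicts:
def _does_json_path_exist_sketch(json_object, path):
    node = json_object
    for key in path.split('.'):
        if not isinstance(node, dict) or key not in node:
            return False
        node = node[key]
    return True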
    def execute(self, dir_path, file_name, batch_sid):
        """Check if file has only and all the columns expected to be present
           the validator does not care about case and order of the columns

        @param dir_path: path of the file
        @type dir_path: string
        @param file_name: name of the file
        @type file_name: string
        @param batch_sid: batch id of the file
        @type batch_sid: integer
        @return: tuple of the form: (status_code, dir_path, file_name, batch_sid)
        """
        full_path = abs_path_join(dir_path, file_name)

        try:
            # open file and get the header; the with-block closes the file
            # even if the file is empty and next() raises StopIteration
            with open_udl_file(full_path, 'rU') as file_to_validate:
                file_reader = csv.reader(file_to_validate)
                header_row = next(file_reader)
            header_row = [column.lower() for column in header_row]
            if not self.are_eq(header_row, self.expected_csv_fields):
                # the columns 'op', 'completestatus' and 'administrationcondition'
                # are optional; retry the comparison with them removed from both sides
                optional = {Constants.OP_COLUMN_NAME, Constants.COMPLETESTATUS, Constants.ADMINISTRATIONCONDITION}
                new_expected = [x for x in self.expected_csv_fields if x not in optional]
                new_header_row = [x for x in header_row if x not in optional]
                if not self.are_eq(new_header_row, new_expected):
                    return ErrorCode.SRC_FILE_HAS_HEADERS_MISMATCH_EXPECTED_FORMAT, dir_path, file_name, batch_sid

        except StopIteration:
            return ErrorCode.SRC_FILE_HAS_NO_DATA, dir_path, file_name, batch_sid
        except FileNotFoundError:
            return ErrorCode.SRC_FILE_NOT_ACCESSIBLE_SFV, dir_path, file_name, batch_sid
        except Exception:
            return ErrorCode.STATUS_UNKNOWN_ERROR, dir_path, file_name, batch_sid

        return ErrorCode.STATUS_OK, dir_path, file_name, batch_sid
def extract_csv_header(conn, staging_schema, ref_table, csv_lz_table, csv_header_file):
    '''
    Extract header names and header types from the input csv file,
    and compare the header names in csv_header_file against ref_table.
    If any header does not match, raise ValueError.
    By default, the header type for all columns is 'text'.
    '''
    # get ordered header names from input csv_header_file
    with open_udl_file(csv_header_file) as csv_obj:
        reader = csv.reader(csv_obj)
        header_names_in_header_file = next(reader)
        header_types = [DATA_TYPE_IN_FDW_TABLE] * len(header_names_in_header_file)

    # Case insensitive
    lowered_headers_in_file = [header.lower() for header in header_names_in_header_file]
    # verify headers in the csv header file also exist in ref_table
    header_names_in_ref_table = get_csv_header_names_in_ref_table(conn, staging_schema, ref_table, csv_lz_table)
    lowered_header_names_in_ref_table = [header.lower() for header in header_names_in_ref_table]
    # if any column exists in the header file but is not defined in the ref table, raise an exception
    diff_item = set(lowered_headers_in_file) - set(lowered_header_names_in_ref_table)
    if diff_item:
        raise ValueError('Column %s does not match between header file and mapping defined in ref table %s'
                         % (str(diff_item), ref_table))
    formatted_header_names = [canonicalize_header_field(name) for name in lowered_headers_in_file]
    return formatted_header_names, header_types
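
# The mismatch test above boils down to a case-insensitive set difference:
# any header present in the file but absent from the ref table is an error.
# Stand-alone illustration:
def _header_diff_demo():
    file_headers = ['Guid', 'Score']
    ref_headers = ['guid', 'score', 'date']
    diff = set(h.lower() for h in file_headers) - set(ref_headers)
    print(diff or 'headers OK')   # headers OK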
def task(incoming_msg):
    '''
    Celery task to load the .err file, if one exists in the expanded directory
    '''
    guid_batch = incoming_msg.get(mk.GUID_BATCH)
    # Outgoing message to be piped to the file expander
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    tenant_directory_paths = incoming_msg.get(mk.TENANT_DIRECTORY_PATHS)
    expanded_dir = tenant_directory_paths.get(mk.EXPANDED)
    err_file = file_util.get_file_type_from_dir('.err', expanded_dir)
    if err_file is not None:
        with file_util.open_udl_file(err_file) as f:
            json_data = f.read()
            error_json = json.loads(json_data)
            content = error_json['content']
            if content == 'error':
                tsb_error = error_json['tsb_error']
                outgoing_msg['tsb_error'] = tsb_error
                notification_data = {'tsb_error': tsb_error}
                notification_data[Constants.ERROR_DESC] = 'tsb error'
                merge_to_udl2stat_notification(guid_batch, notification_data)

    return outgoing_msg
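
# The shape of the .err payload the task expects, reconstructed from the
# reads above (only the keys actually referenced are shown; anything else
# in the file is ignored):
#
# {
#     "content": "error",
#     "tsb_error": { ... }     # forwarded verbatim on outgoing_msg
# }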