def evaluateRule(cls,data,rule,datatype,interfaces,record):
        """ Checks data against specified rule

        Args:
            data: Data to be checked
            rule: Rule object to test against
            datatype: Type to convert data into
            interfaces: InterfaceHolder object to the databases
            record: Some rule types require the entire record as a dict

        Returns:
            True if rule passed, False otherwise
        """
        if data is None:
            # Treat blank as an empty string
            data = ""
        value = rule.rule_text_1
        currentRuleType = rule.rule_type.name
        # Call specific rule function
        ruleFunction = "_".join(["rule",str(currentRuleType).lower()])
        ruleFunction = FieldCleaner.cleanString(ruleFunction)
        try:
            ruleMethod = getattr(cls, str(ruleFunction))
            return ruleMethod(data, value, rule, datatype, interfaces, record)
        except AttributeError as e:
            # Unrecognized rule type
            raise ResponseException(str(e), StatusCode.INTERNAL_ERROR, ValueError)
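
# A minimal, standalone sketch of the dispatch pattern evaluateRule relies on:
# the rule type name is lowercased, prefixed with "rule_", and resolved with
# getattr so each rule type maps to one method. The SimpleRules class and rule
# names below are hypothetical illustrations, not the broker's real rule set.
class SimpleRules:
    @classmethod
    def rule_length(cls, data, value):
        # Passes when the data is no longer than the limit carried in value
        return len(data) <= int(value)

    @classmethod
    def rule_required(cls, data, value):
        # Passes when the field is non-empty
        return len(data.strip()) > 0

    @classmethod
    def evaluate(cls, data, rule_type, value):
        method_name = "_".join(["rule", rule_type.lower()])
        try:
            rule_method = getattr(cls, method_name)
        except AttributeError:
            # Unrecognized rule type
            raise ValueError("Unrecognized rule type: " + rule_type)
        return rule_method(data, value)

# SimpleRules.evaluate("ABC", "LENGTH", "5") -> True
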
    def insert(self, record, fileType):
        """ Write single record to this table
        Args:
        record: dict with column names as keys
        fileType: Type of file record is in

        Returns:
        True if successful
        """

        # Need to translate the provided record to use column IDs instead of field names for keys
        idRecord = {}
        for key in record:
            idRecord[str(
                self.interfaces.validationDb.getColumnId(
                    key, fileType))] = record[key]

        if (self.BATCH_INSERT):
            if (self.INSERT_BY_ORM):
                raise NotImplementedError(
                    "Have not implemented ORM method for batch insert")
            else:
                self.batch.append(idRecord)
                if (len(self.batch) > self.BATCH_SIZE):
                    # Time to write the batch
                    self.interface.connection.execute(
                        self.orm.__table__.insert(), self.batch)
                    # Reset batch
                    self.batch = []
                return True
        else:
            if (self.INSERT_BY_ORM):
                try:
                    recordOrm = self.orm()
                except (AttributeError, TypeError):
                    # createTable was not called, so self.orm is missing or not callable
                    raise Exception("Must call createTable before writing")

                attributes = self.getPublicMembers(recordOrm)

                # For each field, add value to ORM object
                for key in idRecord:
                    attr = FieldCleaner.cleanString(key)  #key.replace(" ","_")
                    setattr(recordOrm, attr, idRecord[key])

                self.interface.session.add(recordOrm)
                self.interface.session.commit()
                return True
            else:
                raise ValueError(
                    "Must do either batch or use ORM, cannot set both to False"
                )
    def insert(self, record, fileType):
        """ Write single record to this table
        Args:
        record: dict with column names as keys
        fileType: Type of file record is in

        Returns:
        True if successful
        """

        # Need to translate the provided record to use column IDs instead of field names for keys
        idRecord = {}
        # Mark if header
        for key in record:
            if key == "row":
                idRecord[key] = record[key]
            else:
                idRecord[str(self.interfaces.validationDb.getColumnId(key,fileType))] = record[key]

        if(self.BATCH_INSERT):
            if(self.INSERT_BY_ORM):
                raise NotImplementedError("Have not implemented ORM method for batch insert")
            else:
                self.batch.append(idRecord)
                if(len(self.batch)>self.BATCH_SIZE):
                    # Time to write the batch
                    self.interface.connection.execute(self.orm.__table__.insert(),self.batch)
                    # Reset batch
                    self.batch = []
                return True
        else:
            if(self.INSERT_BY_ORM):
                try:
                    recordOrm = self.orm()
                except (AttributeError, TypeError):
                    # createTable was not called, so self.orm is missing or not callable
                    raise Exception("Must call createTable before writing")

                attributes = self.getPublicMembers(recordOrm)

                # For each field, add value to ORM object
                for key in idRecord:
                    attr = FieldCleaner.cleanString(key) #key.replace(" ","_")
                    setattr(recordOrm,attr,idRecord[key])

                self.interface.session.add(recordOrm)
                self.interface.session.commit()
                return True
            else:
                raise ValueError("Must do either batch or use ORM, cannot set both to False")
    def rule_exists_in_table(cls, data, value, rule, datatype, interfaces, record):
        """ Check that field value exists in specified table, rule_text_1 has table and column to check against, rule_text_2 is length to pad to """
        ruleTextOne = str(rule.rule_text_1).split(",")
        if len(ruleTextOne) != 2:
            # Bad rule definition
            raise ResponseException("exists_in_table rule incorrectly defined, must have both table and field in rule_text_one",StatusCode.INTERNAL_ERROR,ValueError)
        # Not putting model name through FieldCleaner because model names will have uppercase
        model = getattr(domainModels,str(ruleTextOne[0]).strip())
        field = FieldCleaner.cleanString(ruleTextOne[1])
        ruleTextTwo = FieldCleaner.cleanString(rule.rule_text_2)
        if len(ruleTextTwo) == 0:
            # Skip padding
            paddedData = FieldCleaner.cleanString(data)
        else:
            # Pad data to correct length
            try:
                padLength = int(ruleTextTwo)
            except ValueError as e:
                # Need an integer in rule_text_two
                raise ResponseException("Need an integer width in rule_text_two for exists_in_table rules",StatusCode.INTERNAL_ERROR,ValueError)
            paddedData = FieldCleaner.cleanString(data).zfill(padLength)

        # Build query for model and field specified
        query = interfaces.validationDb.session.query(model).filter(getattr(model,field) == paddedData)
        try:
            # Check that value exists in table, should be unique
            interfaces.validationDb.runUniqueQuery(query,"Data not found in table", "Conflicting entries found for this data")
            # If unique result found, rule passed
            return True
        except ResponseException as e:
            # If exception is no result found, rule failed
            if isinstance(e.wrappedException, NoResultFound):
                return False
            else:
                # This is an unexpected exception, so re-raise it
                raise
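
# A standalone sketch of the exists_in_table rule above: rule_text_1 carries
# "Table,field" and rule_text_2 an optional width the value is zero-padded to
# before the lookup. An in-memory set of valid codes stands in for the database
# query here; the names are illustrative, not the broker's real tables.
def exists_in_table(data, rule_text_1, rule_text_2, valid_codes):
    table_name, field = [part.strip() for part in rule_text_1.split(",")]
    # table_name and field would pick the model and column in the real query
    padded = data.strip()
    if rule_text_2.strip():
        padded = padded.zfill(int(rule_text_2))  # e.g. "482" -> "000482"
    return padded in valid_codes

# exists_in_table("482", "CGAC,cgac_code", "6", {"000482"}) -> True
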
    def getFieldsByFile(self, fileType):
        """ Returns a dict of valid field names that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        Returns:
        dict with field names as keys and values are ORM object FileColumn
        """
        returnDict = {}
        fileId = self.getFileId(fileType)
        if(fileId is None) :
            raise ValueError("File type does not exist")
        queryResult = self.session.query(FileColumn).options(subqueryload("field_type")).filter(FileColumn.file_id == fileId).all()
        for column in queryResult :
            returnDict[FieldCleaner.cleanString(column.name)]  = column
        return returnDict
    def getFieldsByFileList(self, fileType):
        """ Returns a list of FileColumn objects for the fields that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        Returns:
        list of FileColumn objects with cleaned (standardized) field names
        """
        fileId = self.getFileId(fileType)
        if(fileId is None) :
            raise ValueError("File type does not exist")
        queryResult = self.session.query(FileColumn).filter(FileColumn.file_id == fileId).all()
        for result in queryResult:
            result.name = FieldCleaner.cleanString(result.name) # Standardize field names
        return queryResult
    def rule_check_prefix(cls, data, value, rule, datatype, interfaces, record):
        """ Check that 1-digit prefix is consistent with reimbursable flag """
        dataString = FieldCleaner.cleanString(data)

        # Load target field and dict to compare with
        targetField = FieldCleaner.cleanName(rule.rule_text_1)
        prefixMap = json.loads(str(rule.rule_text_2))

        # Check that character and value are consistent with dict in rule_text_2
        if len(dataString) == 0 or dataString[0] not in prefixMap:
            # Blank value or unknown prefix, this is a failure
            return False
        source = prefixMap[dataString[0]]
        target = record[targetField]
        source = source.lower() if source is not None else source
        target = target.lower() if target is not None else target

        if source == target:
            # Matches the value in target field, rule passes
            return True
        else:
            return False
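
# A standalone sketch of the check_prefix rule above: rule_text_2 holds a JSON
# map from a leading character to the expected value of another field in the
# record, compared case-insensitively. The prefix map and field name below are
# invented for illustration, and data is assumed to be already cleaned/lowercased.
import json

def check_prefix(data, prefix_map_json, record, target_field):
    prefix_map = json.loads(prefix_map_json)
    if not data or data[0] not in prefix_map:
        # Blank value or unknown prefix fails the rule
        return False
    expected = prefix_map[data[0]]
    actual = record.get(target_field)
    expected = expected.lower() if expected is not None else expected
    actual = actual.lower() if actual is not None else actual
    return expected == actual

# check_prefix("d123", '{"d": "direct", "r": "reimbursable"}',
#              {"funding_type": "Direct"}, "funding_type") -> True
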
    def createTable(self, fileType, filename, jobId, tableName=None):
        """ Create staging table for new file
        Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

        Returns:
        tableName if created, exception otherwise
        """
        if tableName is None:
            tableName = self.interface.getTableName(jobId)
        self.name = tableName

        if(self.interface.tableExists(tableName)):
            # Old table still present, drop table and replace
            self.interface.dropTable(tableName)

        # Alternate way of naming tables
        #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")
        # Write tableName to related job in job tracker

        self.interfaces.jobDb.addStagingTable(jobId,tableName)
        fields = self.interfaces.validationDb.getFieldsByFile(fileType)

        """ Might not need sequence for ORM
        # Create sequence to be used for primary key
        sequenceName = tableName + "Serial"
        sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
        try:
            self.runStatement(sequenceStatement)
        except ProgrammingError:
            # Sequence already exists
            pass
        """
        primaryAssigned = False
        # Create empty dict for field names and values
        classFieldDict = {"__tablename__":tableName}
        # Create dict to hold record for field names
        fieldNameMap = {}
        # Add each column
        for key in fields:
            # Build column statement for this key
            # Create cleaned version of key
            newKey = str(fields[key].file_column_id)
            # Get correct type name
            fieldTypeName = FieldCleaner.cleanString(fields[key].field_type.name)
            if(fieldTypeName == "string"):
                fieldTypeName = Text
            elif(fieldTypeName == "int"):
                fieldTypeName = Integer
            elif(fieldTypeName == "decimal"):
                fieldTypeName = Numeric
            elif(fieldTypeName == "boolean"):
                fieldTypeName = Boolean
            elif(fieldTypeName == "long"):
                fieldTypeName = BigInteger
            else:
                raise ValueError("Bad field type")
            # Get extra parameters (primary key or not null)
            extraParam = ""
            if(FieldCleaner.cleanString(fields[key].field_type.description) == "primary_key"):
                classFieldDict[newKey] = Column(fieldTypeName, primary_key=True)
                primaryAssigned = True
            elif(fields[key].required):
                classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
            else:
                classFieldDict[newKey] = Column(fieldTypeName)
            # First record will hold field names
            fieldNameMap[str(newKey)] = str(key)
        # Add column for row number
        classFieldDict["row"] = Column(Integer, nullable=False)

        if(not primaryAssigned):
            # If no primary key assigned, add one based on table name
            classFieldDict["".join([tableName,"id"])] = Column(Integer, primary_key = True)


        # Create ORM class based on dict
        self.orm = type(tableName,(declarative_base(),),classFieldDict)
        self.jobId = jobId

        # Create table
        self.orm.__table__.create(self.interface.engine)

        # Add field name map to table
        self.interface.addFieldNameMap(tableName,fieldNameMap)
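
# A standalone sketch of the dynamic table creation above: column definitions are
# collected in a dict and handed to type() together with a declarative base, then
# the table is emitted with __table__.create(). The real code keys columns by
# file_column_id; readable names are used here, and the import path assumes
# SQLAlchemy 1.4+ (older releases expose declarative_base under
# sqlalchemy.ext.declarative).
from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.orm import declarative_base

field_defs = {
    "__tablename__": "staging_job_42",
    "staging_job_42id": Column(Integer, primary_key=True),  # fallback primary key
    "agency_code": Column(Text, nullable=False),
    "row_number": Column(Integer, nullable=False),          # original row number
}
StagingTable = type("staging_job_42", (declarative_base(),), field_defs)

engine = create_engine("sqlite://")
StagingTable.__table__.create(engine)
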
    def loadCsv(cls,filename,model,interface,fieldMap,fieldOptions):
        """ Loads a table based on a csv

        Args:
            filename: CSV to load
            model: ORM object for table to be loaded
            interface: interface to DB table is in
            fieldMap: dict that maps columns of the csv to attributes of the ORM object
            fieldOptions: dict with keys of attribute names, value contains a dict with options for that attribute.
                Current options are "pad_to_length", which if present will pad the field with leading zeros up to the
                specified length, and "skip_duplicates", which ignores subsequent lines that repeat values.
        """
        # Delete all records currently in table
        interface.session.query(model).delete()
        interface.session.commit()
        valuePresent = {}
        # Open csv
        with open(filename,'rU') as csvfile:
            # Read header
            header = csvfile.readline()
            # Split header into fieldnames
            rawFieldNames = header.split(",")
            fieldNames = []
            # Clean field names
            for field in rawFieldNames:
                fieldNames.append(FieldCleaner.cleanString(field))
            # Map fieldnames to attribute names
            attributeNames = []
            for field in fieldNames:
                if field in fieldMap:
                    attributeNames.append(fieldMap[field])
                    if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]:
                        # Create empty dict for this field
                        valuePresent[fieldMap[field]] = {}
                else:
                    raise KeyError("".join(["Found unexpected field ", str(field)]))
            # Check that all fields are present
            for field in fieldMap:
                if not field in fieldNames:
                    raise ValueError("".join([str(field)," is required for loading table ", str(type(model))]))
            # Open DictReader with attribute names
            reader = csv.DictReader(csvfile,fieldnames = attributeNames)
            # For each row, create instance of model and add it
            for row in reader:
                skipInsert = False
                for field in fieldOptions:
                    # For each field with options present, modify according to those options
                    options = fieldOptions[field]
                    if "pad_to_length" in options:
                        padLength = options["pad_to_length"]
                        row[field] = Validator.padToLength(row[field],padLength)
                    if "skip_duplicates" in options:
                        if len(row[field].strip()) == 0 or row[field] in valuePresent[field]:
                            # Value not provided or already exists, skip it
                            skipInsert = True
                        else:
                            # Insert new value
                            valuePresent[field][row[field]] = True
                record = model(**row)
                if not skipInsert:
                    try:
                        interface.session.merge(record)
                    except IntegrityError as e:
                        # Hit a duplicate value that violates index, skip this one
                        print("".join(["Warning: Skipping this row: ",str(row)]))
                        print("".join(["Due to error: ",str(e)]))
                        interface.session.rollback()
                        continue
            interface.session.commit()
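
# A standalone sketch of the per-field options handled by loadCsv above: the
# header row is read and cleaned separately, then csv.DictReader maps remaining
# rows onto attribute names while "pad_to_length" zero-fills values and
# "skip_duplicates" drops repeated ones. The CSV content and field names are
# invented for illustration.
import csv
import io

csv_text = "CGAC,AGENCY NAME\n20,Treasury\n020,Treasury\n"
field_map = {"cgac": "cgac_code", "agency name": "agency_name"}
field_options = {"cgac_code": {"pad_to_length": 3, "skip_duplicates": True}}

with io.StringIO(csv_text) as csvfile:
    header = csvfile.readline()
    attribute_names = [field_map[h.strip().lower()] for h in header.split(",")]
    seen = set()
    reader = csv.DictReader(csvfile, fieldnames=attribute_names)
    for row in reader:
        skip_insert = False
        for field, options in field_options.items():
            if "pad_to_length" in options:
                row[field] = row[field].zfill(options["pad_to_length"])
            if "skip_duplicates" in options and row[field] in seen:
                skip_insert = True
            seen.add(row[field])
        if not skip_insert:
            print(row)   # only the first Treasury row survives, padded to "020"
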
    def loadSql(cls, filename):
        """Load SQL-based validation rules to db."""
        with createApp().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            # Create rule severity and file type lookups
            severity = sess.query(RuleSeverity)
            severityDict = {s.name: s.rule_severity_id for s in severity.all()}
            ft = sess.query(FileTypeValidation)
            fileTypeDict = {f.name: f.file_id for f in ft.all()}

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv
            with open(filename, 'rU') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                rawFieldNames = header.split(',')
                fieldNames = []
                # clean field names
                for field in rawFieldNames:
                    fieldNames.append(FieldCleaner.cleanString(field))

                unknownFields = set(fieldNames) - set(cls.headers)
                if len(unknownFields) != 0:
                    raise KeyError("".join([
                        "Found unexpected fields: ",
                        str(list(unknownFields))
                    ]))

                missingFields = set(cls.headers) - set(fieldNames)
                if len(missingFields) != 0:
                    raise ValueError("".join([
                        "Missing required fields: ",
                        str(list(missingFields))
                    ]))

                reader = csv.DictReader(csvfile, fieldnames=fieldNames)
                for row in reader:
                    sql = cls.readSqlStr(row['query_name'])

                    rule_sql = RuleSql(
                        rule_sql=sql,
                        rule_label=row['rule_label'],
                        rule_description=row['rule_description'],
                        rule_error_message=row['rule_error_message'],
                        query_name=row['query_name'])

                    # look up file type id
                    try:
                        fileId = fileTypeDict[row["file_type"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            targetFileId = None
                        else:
                            targetFileId = fileTypeDict[row["target_file"]]
                    except Exception as e:
                        raise Exception(
                            "{}: target file={}, rule label={}. Rule not loaded."
                            .format(e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    if (FieldCleaner.cleanString(row["rule_cross_file_flag"])
                            in ['true', 't', 'y', 'yes']):
                        cross_file_flag = True
                    else:
                        cross_file_flag = False

                    rule_sql.rule_severity_id = severityDict[
                        row['severity_name']]
                    rule_sql.file_id = fileId
                    rule_sql.target_file_id = targetFileId
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
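
# A standalone sketch of the header validation loadSql performs before reading
# rows: one set difference reports unexpected columns, the other reports missing
# required ones. The header names below are illustrative.
def validate_headers(cleaned_headers, expected_headers):
    """Raise if the CSV header has unexpected or missing columns."""
    unknown = set(cleaned_headers) - set(expected_headers)
    if unknown:
        raise KeyError("Found unexpected fields: " + str(sorted(unknown)))
    missing = set(expected_headers) - set(cleaned_headers)
    if missing:
        raise ValueError("Missing required fields: " + str(sorted(missing)))

# validate_headers(["rule_label", "query_name"],
#                  ["rule_label", "rule_description", "query_name"])
# raises ValueError: Missing required fields: ['rule_description']
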
    def openFile(self,region,bucket,filename,csvSchema,bucketName,errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
            bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
            filename: The file path for the CSV file in S3
            csvSchema: list of FileColumn objects for this file type
            bucketName: bucket to send errors to
            errorFilename: filename for error report
        """


        possibleFields = {}
        currentFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if(self.isFinished) :
            # Write header error for no header row
            with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
                writer.write(["No header row"])
                writer.finishBatch()
            raise ResponseException("CSV file must have a header",StatusCode.CLIENT_ERROR,ValueError,ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header

        # check delimiters in header row
        pipeCount = line.count("|")
        commaCount = line.count(",")

        if pipeCount != 0 and commaCount != 0:
            # Write header error for mixed delimiter use
            with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
                writer.write(["Cannot use both ',' and '|' as delimiters. Please choose one."])
                writer.finishBatch()
            raise ResponseException("Error in header row: CSV file must use only '|' or ',' as the delimiter", StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError)

        self.delimiter = "|" if line.count("|") != 0 else ","
        for row in csv.reader([line],dialect='excel', delimiter=self.delimiter):
            for cell in row :
                headerValue = FieldCleaner.cleanString(cell)
                if( not headerValue in possibleFields) :
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[(current)] = None
                    current += 1
                elif(possibleFields[headerValue] == 1) :
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[(current)] = headerValue
                    possibleFields[headerValue]  = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema :
            if(possibleFields[FieldCleaner.cleanString(schema.name)] == 0) :
                missingHeaders.append(schema.name)
        if(len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception
            errorString = ""
            with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
                extraInfo = {}
                if(len(duplicatedHeaders) > 0):
                    errorString = "".join([errorString, "Duplicated: ",", ".join(duplicatedHeaders)])
                    extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if(len(missingHeaders) > 0):
                    if(len(duplicatedHeaders)):
                        # Separate missing and duplicated headers if both are present
                        errorString += "| "
                    errorString = "".join([errorString, "Missing: ",", ".join(missingHeaders)])
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row: " + str(errorString), StatusCode.CLIENT_ERROR, ValueError,ValidationError.headerError,**extraInfo)
    def createTable(self, fileType, filename, jobId, tableName=None):
        """ Create staging table for new file
        Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

        Returns:
        tableName if created, exception otherwise
        """
        if tableName is None:
            tableName = self.interface.getTableName(jobId)
        self.name = tableName

        if (self.interface.tableExists(tableName)):
            # Old table still present, drop table and replace
            self.interface.dropTable(tableName)

        # Alternate way of naming tables
        #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")
        # Write tableName to related job in job tracker

        self.interfaces.jobDb.addStagingTable(jobId, tableName)
        fields = self.interfaces.validationDb.getFieldsByFile(fileType)
        """ Might not need sequence for ORM
        # Create sequence to be used for primary key
        sequenceName = tableName + "Serial"
        sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
        try:
            self.runStatement(sequenceStatement)
        except ProgrammingError:
            # Sequence already exists
            pass
        """
        primaryAssigned = False
        # Create empty dict for field names and values
        classFieldDict = {"__tablename__": tableName}
        # Add each column
        for key in fields:
            # Build column statement for this key
            # Create cleaned version of key
            newKey = str(fields[key].file_column_id)
            # Get correct type name
            fieldTypeName = FieldCleaner.cleanString(
                fields[key].field_type.name)
            if (fieldTypeName == "string"):
                fieldTypeName = Text
            elif (fieldTypeName == "int"):
                fieldTypeName = Integer
            elif (fieldTypeName == "decimal"):
                fieldTypeName = Numeric
            elif (fieldTypeName == "boolean"):
                fieldTypeName = Boolean
            elif (fieldTypeName == "long"):
                fieldTypeName = BigInteger
            else:
                raise ValueError("Bad field type")
            # Get extra parameters (primary key or not null)
            extraParam = ""
            if (FieldCleaner.cleanString(
                    fields[key].field_type.description) == "primary_key"):
                classFieldDict[newKey] = Column(fieldTypeName,
                                                primary_key=True)
                primaryAssigned = True
            elif (fields[key].required):
                classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
            else:
                classFieldDict[newKey] = Column(fieldTypeName)

        if (not primaryAssigned):
            # If no primary key assigned, add one based on table name
            classFieldDict["".join([tableName,
                                    "id"])] = Column(Integer, primary_key=True)

        # Create ORM class based on dict
        self.orm = type(tableName, (declarative_base(), ), classFieldDict)
        self.jobId = jobId

        # Create table
        self.orm.__table__.create(self.interface.engine)
    def open_file(self, region, bucket, filename, csv_schema, bucket_name,
                  error_filename, long_to_short_dict):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
            bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
            filename: The file path for the CSV file in S3
            csv_schema: list of FileColumn objects for this file type
            bucket_name: bucket to send errors to
            error_filename: filename for error report
            long_to_short_dict: mapping of long to short schema column names
        """

        self.filename = filename
        self.unprocessed = ''
        self.extra_line = False
        self.lines = []
        self.flex_dictionary = {}
        self.header_dictionary = {}
        self.packet_counter = 0
        current = 0
        self.is_finished = False
        self.column_count = 0
        line = self._get_line()
        # make sure we have not finished reading the file

        if self.is_finished:
            # Write header error for no header row
            with self.get_writer(bucket_name, error_filename, ["Error Type"],
                                 self.is_local) as writer:
                writer.write(["No header row"])
                writer.finishBatch()
            raise ResponseException("CSV file must have a header",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.singleRow)

        duplicated_headers = []
        #create the header

        # check delimiters in header row
        pipe_count = line.count("|")
        comma_count = line.count(",")

        if pipe_count != 0 and comma_count != 0:
            # Write header error for mixed delimiter use
            with self.get_writer(bucket_name, error_filename, ["Error Type"],
                                 self.is_local) as writer:
                writer.write([
                    "Cannot use both ',' and '|' as delimiters. Please choose one."
                ])
                writer.finishBatch()
            raise ResponseException(
                "Error in header row: CSV file must use only '|' or ',' as the delimiter",
                StatusCode.CLIENT_ERROR, ValueError,
                ValidationError.headerError)

        self.delimiter = "|" if line.count("|") != 0 else ","

        # Set the list of possible_fields, using  the shorter,
        # machine-readable column names
        possible_fields = {}
        for schema in csv_schema:
            possible_fields[FieldCleaner.cleanString(schema.name_short)] = 0

        for row in csv.reader([line],
                              dialect='excel',
                              delimiter=self.delimiter):
            # check to see if header contains long or short column names
            col_matches = 0
            for value in row:
                if FieldCleaner.cleanString(value) in long_to_short_dict:
                    col_matches += 1
            # if most of column headers are in the long format,
            # we'll treat the file as having long headers
            if col_matches > .5 * len(row):
                long_headers = True
            else:
                long_headers = False

            for cell in row:
                submitted_header_value = FieldCleaner.cleanString(cell)
                if long_headers and submitted_header_value in long_to_short_dict:
                    header_value = FieldCleaner.cleanString(
                        long_to_short_dict[submitted_header_value])
                elif long_headers:
                    header_value = None
                else:
                    header_value = submitted_header_value
                if not header_value in possible_fields:
                    # Add flex headers to flex list
                    if str(submitted_header_value).startswith("flex_"):
                        self.flex_dictionary[current] = submitted_header_value
                    else:
                        self.flex_dictionary[current] = None
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.header_dictionary[current] = None
                    current += 1
                elif possible_fields[header_value] == 1:
                    # Add header value (as submitted) to duplicated header list
                    duplicated_headers.append(submitted_header_value)
                else:
                    self.header_dictionary[current] = header_value
                    possible_fields[header_value] = 1
                    current += 1

        self.column_count = current

        # Check that all required fields exist
        missing_headers = []
        for schema in csv_schema:
            if possible_fields[FieldCleaner.cleanString(
                    schema.name_short)] == 0:
                # return long colname for error reporting
                missing_headers.append(schema.name)

        if len(missing_headers) > 0 or len(duplicated_headers) > 0:
            # Write header errors if any occurred and raise a header_error exception
            error_string = ""
            with self.get_writer(bucket_name, error_filename,
                                 self.header_report_headers,
                                 self.is_local) as writer:
                extra_info = {}
                if len(duplicated_headers) > 0:
                    error_string = "".join([
                        error_string, "Duplicated: ",
                        ", ".join(duplicated_headers)
                    ])
                    extra_info["duplicated_headers"] = ", ".join(
                        duplicated_headers)
                    for header in duplicated_headers:
                        writer.write(["Duplicated header", header])
                if len(missing_headers) > 0:
                    if len(duplicated_headers):
                        # Separate missing and duplicated headers if both are present
                        error_string += "| "
                    error_string = "".join([
                        error_string, "Missing: ", ", ".join(missing_headers)
                    ])
                    extra_info["missing_headers"] = ", ".join(missing_headers)
                    for header in missing_headers:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException(
                "Errors in header row: " + str(error_string),
                StatusCode.CLIENT_ERROR, ValueError,
                ValidationError.headerError, **extra_info)

        return long_headers
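
# A standalone sketch of the long/short header detection in open_file above: if
# most of the submitted headers appear in the long-to-short mapping, the file is
# treated as using long names and each header is translated before matching the
# schema. The mapping and header row here are hypothetical.
long_to_short = {"total_obligated_amount": "tot_obl_amt",
                 "awarding_agency_code": "awd_agency"}

def normalize_headers(header_row, long_to_short):
    matches = sum(1 for h in header_row if h in long_to_short)
    long_headers = matches > 0.5 * len(header_row)
    if long_headers:
        return [long_to_short.get(h) for h in header_row], long_headers
    return list(header_row), long_headers

# normalize_headers(["total_obligated_amount", "awarding_agency_code", "flex_note"],
#                   long_to_short)
# -> (['tot_obl_amt', 'awd_agency', None], True)
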
    def openFile(self, region, bucket, filename, csvSchema, bucketName,
                 errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            bucket : the S3 Bucket
            filename: The file path for the CSV file in S3
            writer: An implementation of csvAbstractWriter to send header errors to
        Returns:
        """

        possibleFields = {}
        currentFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if (self.isFinished):
            raise ResponseException("CSV file must have a header",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header
        for row in csv.reader([line], dialect='excel'):
            for cell in row:
                headerValue = FieldCleaner.cleanString(cell)
                if (not headerValue in possibleFields):
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[(current)] = None
                    current += 1
                elif (possibleFields[headerValue] == 1):
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[(current)] = headerValue
                    possibleFields[headerValue] = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema:
            if (schema.required and possibleFields[FieldCleaner.cleanString(
                    schema.name)] == 0):
                missingHeaders.append(schema.name)
        if (len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception

            with self.getWriter(bucketName, errorFilename,
                                self.headerReportHeaders,
                                self.isLocal) as writer:
                extraInfo = {}
                if (len(duplicatedHeaders) > 0):
                    extraInfo["duplicated_headers"] = ", ".join(
                        duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if (len(missingHeaders) > 0):
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.headerError, **extraInfo)
    def openFile(self,region,bucket,filename,csvSchema,bucketName,errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            bucket : the S3 Bucket
            filename: The file path for the CSV file in S3
            writer: An implementation of csvAbstractWriter to send header errors to
        Returns:
        """


        possibleFields = {}
        currentFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if(self.isFinished) :
            raise ResponseException("CSV file must have a header",StatusCode.CLIENT_ERROR,ValueError,ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header
        for row in csv.reader([line],dialect='excel'):
            for cell in row :
                headerValue = FieldCleaner.cleanString(cell)
                if( not headerValue in possibleFields) :
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[(current)] = None
                    current += 1
                elif(possibleFields[headerValue] == 1) :
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[(current)] = headerValue
                    possibleFields[headerValue]  = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema :
            if(schema.required and  possibleFields[FieldCleaner.cleanString(schema.name)] == 0) :
                missingHeaders.append(schema.name)
        if(len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception

            with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
                extraInfo = {}
                if(len(duplicatedHeaders) > 0):
                    extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if(len(missingHeaders) > 0):
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row", StatusCode.CLIENT_ERROR, ValueError,ValidationError.headerError,**extraInfo)