def testCanNamifyEmptyText(self):
    # Empty and whitespace-only text is expected to be replaced by the
    # placeholder name "x".
    for blankText in (u"", u" ", u"\t"):
        self.assertEqual(_tools.namified(blankText), "x")
def testCanNamifyControlCharacters(self):
    # A lone control character is expected to give the placeholder name,
    # while one embedded between letters is expected to become "_".
    textsAndExpectedNames = (
        (u"\r", "x"),
        (u"a\rb", "a_b"),
    )
    for text, expectedName in textsAndExpectedNames:
        self.assertEqual(_tools.namified(text), expectedName)
def testCanNamifyNumber(self):
    # Text starting with a digit is expected to be prefixed with "x", and
    # the decimal point is expected to be replaced by "_".
    textsAndExpectedNames = (
        (u"1a", "x1a"),
        (u"3.1415", "x3_1415"),
    )
    for text, expectedName in textsAndExpectedNames:
        self.assertEqual(_tools.namified(text), expectedName)
def testCanNamifyKeyword(self):
    # The keyword "if" is expected to get a trailing underscore appended.
    actualName = _tools.namified(u"if")
    self.assertEqual(actualName, "if_")
def testCanNamifyText(self):
    # Letters and digits are expected to pass through with their case
    # preserved; blanks between words are expected to become "_".
    textsAndExpectedNames = (
        (u"hello", "hello"),
        (u"hElLo", "hElLo"),
        (u"h3LL0", "h3LL0"),
        (u"Date of birth", "Date_of_birth"),
        (u"a b", "a_b"),
    )
    for text, expectedName in textsAndExpectedNames:
        self.assertEqual(_tools.namified(text), expectedName)
def createCidRows(readable, **keywords):
    """
    Create rows for an ICD by examining the contents of ``readable``.

    The data are read twice: the first pass finds the longest run of
    consecutive rows that share the same column count (the "longest
    segment"), the second pass feeds the rows of that segment to one
    `_ColumnSniffInfo` per column. ``readable`` therefore has to support
    ``seek(0)``.

    Optional keyword parameters are:

      * ``encoding`` - the character encoding to be used in case ``readable``
        contains delimited data.
      * ``dataFormat`` - the data format to be assumed; default: `FORMAT_AUTO`.
      * ``header`` - number of header rows to ignore for data analysis;
        default: 0.
      * ``stopAfter`` - number of data rows after which to stop analyzing;
        0 means "analyze all data"; default: 0.
      * ``fieldNames`` - Python names to refer to the fields. If this is a list
        of strings, each string represents a field name. If this is a single
        string, split it using comma (,) as separator to get to the names. If
        this is ``None`` (the default), use the last row of the ``header`` as
        names. If there is no ``header`` (0 rows), use generated field names
        such as 'column_a', 'column_b' and so on.

    Return a list of rows (each a list of cells): a title row, an empty row,
    the data format rows prefixed with "d", an empty row, a heading row for
    the fields and one row prefixed with "f" for each sniffed column.

    Raise `CutplaceSniffError` if ``readable`` contains no rows, if the data
    change between the two passes or if there are fewer field names than
    columns. Raise `data.DataFormatSyntaxError` if the field names are empty
    and `fields.FieldSyntaxError` if a field name occurs more than once.
    """
    assert readable is not None
    dataFormat = keywords.get("dataFormat", FORMAT_AUTO)
    assert dataFormat is not None
    encoding = keywords.get(_ENCODING, DEFAULT_ENCODING)
    assert encoding is not None
    # NOTE(review): ``stopAfter`` is only validated here and not used any
    # further in this function; presumably it is honored by the functions
    # that receive ``**keywords`` - TODO confirm.
    dataRowsToStopAfter = keywords.get("stopAfter", 0)
    assert dataRowsToStopAfter >= 0
    headerRowsToSkip = keywords.get("header", 0)
    assert headerRowsToSkip >= 0
    fieldNames = keywords.get("fieldNames")
    if isinstance(fieldNames, basestring):
        fieldNames = [name.strip() for name in fieldNames.split(",")]
    elif fieldNames is not None:
        assert isinstance(fieldNames, list), u"field names must be a list or string but is: %s" % type(fieldNames)

    def decimalRows(hasDecimals, usesThousandsSeparator, decimalSeparator, thousandsSeparator):
        # Build the "d" rows describing the decimal and thousands separator;
        # without decimal fields there is nothing to describe.
        result = []
        if hasDecimals:
            result.append(["d", data.KEY_DECIMAL_SEPARATOR, decimalSeparator])
            if usesThousandsSeparator:
                result.append(["d", data.KEY_THOUSANDS_SEPARATOR, thousandsSeparator])
        return result

    NO_COUNT = -1

    _log.debug(u"find longest segment of rows with same column count")
    currentSegmentColumnCount = None
    longestSegmentColumnCount = NO_COUNT
    longestSegmentRowCount = NO_COUNT
    currentSegmentRowCount = 0
    rowIndex = 0
    rowIndexWhereCurrentSegmentStarted = 0
    rowIndexWhereLongestSegmentStarts = None

    # TODO: Cleanup code: calling both createDataFormat and createReader causes the data format to be analyzed twice.
    dataFormat = createDataFormat(readable, **keywords)
    readable.seek(0)
    reader = createReader(readable, **keywords)
    isReadFieldNamesFromHeader = (not fieldNames and headerRowsToSkip)
    for rowToAnalyze in reader:
        columnCount = len(rowToAnalyze)
        if rowIndex <= headerRowsToSkip:
            # Header rows never end a segment, and the first data row
            # defines the column count of the first segment. (Without a
            # header this is just the first row.) From then on the count
            # must stay untouched, otherwise a change in the number of
            # columns could never be detected.
            currentSegmentColumnCount = columnCount
        if isReadFieldNamesFromHeader and (rowIndex == headerRowsToSkip - 1):
            fieldNames = rowToAnalyze
        if columnCount != currentSegmentColumnCount:
            _log.debug(u" segment starts in row %d after %d rows", rowIndex, currentSegmentRowCount)
            if currentSegmentRowCount > longestSegmentRowCount:
                rowIndexWhereLongestSegmentStarts = rowIndexWhereCurrentSegmentStarted
                longestSegmentRowCount = currentSegmentRowCount
                longestSegmentColumnCount = currentSegmentColumnCount
            rowIndexWhereCurrentSegmentStarted = rowIndex
            # The current row already is the first row of the new segment.
            currentSegmentRowCount = 1
            currentSegmentColumnCount = columnCount
        else:
            currentSegmentRowCount += 1
        rowIndex += 1

    # Validate field names.
    if fieldNames is not None:
        if isReadFieldNamesFromHeader:
            location = tools.InputLocation(readable, hasCell=True)
            location.advanceLine(headerRowsToSkip)
        else:
            location = None
        if not fieldNames:
            raise data.DataFormatSyntaxError(u"the field names specified must contain at least 1 name", location)
        uniquefieldNames = set()
        for nameIndex, fieldNameToCheck in enumerate(fieldNames):
            if isReadFieldNamesFromHeader:
                # Header cells can contain arbitrary text, so turn them into
                # names first.
                fieldNameToCheck = _tools.namified(fieldNameToCheck)
            fieldNameToCheck = fields.validatedFieldName(fieldNameToCheck, location)
            if fieldNameToCheck in uniquefieldNames:
                raise fields.FieldSyntaxError(u"field name must be unique: %s" % fieldNameToCheck, location)
            fieldNames[nameIndex] = fieldNameToCheck
            uniquefieldNames.add(fieldNameToCheck)
            if location:
                location.advanceCell()

    # Handle the case that the whole file can be one large segment.
    _log.debug(u"last segment started in row %d and lasted for %d rows", rowIndexWhereCurrentSegmentStarted, currentSegmentRowCount)
    if currentSegmentRowCount > longestSegmentRowCount:
        rowIndexWhereLongestSegmentStarts = rowIndexWhereCurrentSegmentStarted
        longestSegmentRowCount = currentSegmentRowCount
        longestSegmentColumnCount = currentSegmentColumnCount
    if longestSegmentRowCount < 1:
        raise CutplaceSniffError(u"content must contain data for format to be sniffed")
    _log.debug(u"found longest segment starting in row %d lasting for %d rows having %d columns",
            rowIndexWhereLongestSegmentStarts, longestSegmentRowCount, longestSegmentColumnCount)
    assert rowIndexWhereLongestSegmentStarts is not None

    # Fail with a meaningful message instead of an IndexError when assigning
    # the names to the columns below.
    if fieldNames and (len(fieldNames) < longestSegmentColumnCount):
        raise CutplaceSniffError(u"number of field names must be at least %d but is: %d"
                % (longestSegmentColumnCount, len(fieldNames)))

    _log.debug(u"skip %d rows until longest segment starts", rowIndexWhereLongestSegmentStarts)
    readable.seek(0)
    reader = createReader(readable, **keywords)
    rowIndex = 0
    location = tools.InputLocation(readable)
    while rowIndex < rowIndexWhereLongestSegmentStarts:
        reader.next()
        location.advanceLine()
        rowIndex += 1

    _log.debug(u"analyze longest segment of rows with same column count")
    columnInfos = []
    for columnIndex in range(longestSegmentColumnCount):
        columnInfoToAppend = _ColumnSniffInfo(columnIndex, dataFormat)
        if fieldNames:
            columnInfoToAppend.name = fieldNames[columnIndex]
        columnInfos.append(columnInfoToAppend)

    rowIndex = 0
    while rowIndex < longestSegmentRowCount:
        rowToAnalyze = reader.next()
        # ``rowIndex`` counts from the start of the segment, but the header
        # is located at the start of ``readable``, so compare the absolute
        # row index. Only a segment starting in row 0 contains header rows.
        rowIndexInReadable = rowIndexWhereLongestSegmentStarts + rowIndex
        if rowIndexInReadable >= headerRowsToSkip:
            columnCountOfRowToAnalyze = len(rowToAnalyze)
            if columnCountOfRowToAnalyze != longestSegmentColumnCount:
                raise CutplaceSniffError(u"data must not change between sniffer passes, but row %d now has %d columns instead of %d" \
                        % (rowIndexInReadable + 1, columnCountOfRowToAnalyze, longestSegmentColumnCount), location)
            for itemIndex in range(longestSegmentColumnCount):
                value = rowToAnalyze[itemIndex]
                columnInfos[itemIndex].process(value)
        location.advanceLine()
        rowIndex += 1

    # Make sure that decimal fields either use comma or point.
    decimalCommaColumnCount = 0
    decimalPointColumnCount = 0
    usesThousandSeparator = False
    for columnInfo in columnInfos:
        if columnInfo.isNumber and not columnInfo.isInteger:
            if columnInfo.isDecimalComma:
                _log.debug(u"field is decimal with comma as separator: %s", columnInfo.name)
                decimalCommaColumnCount += 1
                if columnInfo.usesThousandsSeparator:
                    _log.debug(u" decimal field uses point as thousands separator: %s", columnInfo.name)
                    usesThousandSeparator = True
            elif columnInfo.isDecimalPoint:
                _log.debug(u"field is decimal with point as separator: %s", columnInfo.name)
                decimalPointColumnCount += 1
                if columnInfo.usesThousandsSeparator:
                    _log.debug(u" decimal field uses comma as thousands separator: %s", columnInfo.name)
                    usesThousandSeparator = True

    if (decimalCommaColumnCount > 0) and (decimalPointColumnCount > 0):
        # Both separators are in use: keep the one used by more columns (the
        # comma in case of a tie) and demote the other columns to text.
        _log.warning(u"columns use different decimal separators: %d use comma, %d use point", decimalCommaColumnCount, decimalPointColumnCount)
        hasToChangeDecimalComma = (decimalCommaColumnCount < decimalPointColumnCount)
        if hasToChangeDecimalComma:
            decimalCommaColumnCount = 0
        else:
            decimalPointColumnCount = 0
        for columnInfo in columnInfos:
            if (columnInfo.isDecimalComma and hasToChangeDecimalComma) \
                    or (columnInfo.isDecimalPoint and not hasToChangeDecimalComma):
                _log.warning(u" change '%s' to text field", columnInfo.name)
                columnInfo.changeToTextField()
    assert (decimalCommaColumnCount == 0) or (decimalPointColumnCount == 0)

    for columnInfo in columnInfos:
        _log.debug(u" %s", columnInfo.asFieldFormat())

    # Build rows for CID: data format.
    icdRows = []
    icdRows.append(["", "Interface: <Name>"])
    icdRows.append([])
    for dataFormatRow in dataFormat.asIcdRows():
        dataFormatCsvRow = ['d']
        dataFormatCsvRow.extend(dataFormatRow)
        icdRows.append(dataFormatCsvRow)
    _log.info(u"number of decimal fields: %d with comma, %d with point", decimalCommaColumnCount, decimalPointColumnCount)
    # At most one of the two counts is greater than 0 at this point, so at
    # most one of the following calls contributes rows.
    icdRows.extend(decimalRows(decimalCommaColumnCount > 0, usesThousandSeparator, ",", "."))
    icdRows.extend(decimalRows(decimalPointColumnCount > 0, usesThousandSeparator, ".", ","))
    icdRows.append([])

    # Build rows for CID: data field formats.
    icdRows.append(["", "Field", "Example", "Empty?", "Length", "Type", "Rule"])
    for columnInfo in columnInfos:
        fieldFormat = columnInfo.asFieldFormat()
        fieldRow = ["f"]
        fieldRow.extend(fieldFormat.asIcdRow())
        icdRows.append(fieldRow)
        _log.debug(u" %s", fieldRow)
    return icdRows