Example No. 1
 def testCanNamifyEmptyText(self):
     self.assertEqual(_tools.namified(u""), "x")
     self.assertEqual(_tools.namified(u" "), "x")
     self.assertEqual(_tools.namified(u"\t"), "x")
Example No. 2
 def testCanNamifyControlCharacters(self):
     self.assertEqual(_tools.namified(u"\r"), "x")
     self.assertEqual(_tools.namified(u"a\rb"), "a_b")
Example No. 3
 def testCanNamifyNumber(self):
     self.assertEqual(_tools.namified(u"1a"), "x1a")
     self.assertEqual(_tools.namified(u"3.1415"), "x3_1415")
Example No. 4
 def testCanNamifyKeyword(self):
     self.assertEqual(_tools.namified(u"if"), "if_")
Example No. 5
 def testCanNamifyText(self):
     self.assertEqual(_tools.namified(u"hello"), "hello")
     self.assertEqual(_tools.namified(u"hElLo"), "hElLo")
     self.assertEqual(_tools.namified(u"h3LL0"), "h3LL0")
     self.assertEqual(_tools.namified(u"Date of birth"), "Date_of_birth")
     self.assertEqual(_tools.namified(u"a    b"), "a_b")
Example No. 6
def createCidRows(readable, **keywords):
    """
    Create rows for an ICD by examining the contents of ``readable``.

    Optional keyword parameters are:

      * ``encoding`` - the character encoding to be used in case ``readable``
        contains delimited data.
      * ``dataFormat`` - the data format to be assumed; default: `FORMAT_AUTO`.
      * ``header`` - number of header rows to ignore for data analysis;
        default: 0.
      * ``stopAfter`` - number of data rows after which to stop analyzing;
        0 means "analyze all data"; default: 0.
      * ``fieldNames`` - Python names to refer to the fields. If this is a list
        of strings, each string represents a field name. If this is a single
        string, it is split on commas (,) to obtain the names. If this is
        ``None`` (the default), the names are taken from the last row of the
        ``header``. If ``header`` is 0, generated field names such as
        'column_a', 'column_b' and so on are used.
    """
    assert readable is not None
    dataFormat = keywords.get("dataFormat", FORMAT_AUTO)
    assert dataFormat is not None
    encoding = keywords.get(_ENCODING, DEFAULT_ENCODING)
    assert encoding is not None
    dataRowsToStopAfter = keywords.get("stopAfter", 0)
    assert dataRowsToStopAfter >= 0
    headerRowsToSkip = keywords.get("header", 0)
    assert headerRowsToSkip >= 0
    fieldNames = keywords.get("fieldNames")
    if isinstance(fieldNames, basestring):
        fieldNames = [name.strip() for name in fieldNames.split(",")]
    elif fieldNames is not None:
        assert isinstance(fieldNames, list), u"field names must be a list or string but is: %s" % type(fieldNames)

    def decimalRows(hasDecimals, usesThousandsSeparator, decimalSeparator, thousandsSeparator):
        result = []
        if hasDecimals:
            result.append(["d", data.KEY_DECIMAL_SEPARATOR, decimalSeparator])
            if usesThousandsSeparator:
                result.append(["d", data.KEY_THOUSANDS_SEPARATOR, thousandsSeparator])
        return result

    NO_COUNT = -1

    _log.debug(u"find longest segment of rows with same column count")
    currentSegmentColumnCount = None
    longestSegmentColumnCount = NO_COUNT
    longestSegmentRowCount = NO_COUNT
    currentSegmentRowCount = 0
    rowIndex = 0
    rowIndexWhereCurrentSegmentStarted = 0
    rowIndexWhereLongestSegmentStarts = None
    # TODO: Cleanup code: calling both createDataFormat and createReader causes the data format to be analyzed twice.
    dataFormat = createDataFormat(readable, **keywords)
    readable.seek(0)
    reader = createReader(readable, **keywords)
    isFirstRow = True
    isReadFieldNamesFromHeader = (not fieldNames and headerRowsToSkip)
    for rowToAnalyze in reader:
        columnCount = len(rowToAnalyze)
        if isFirstRow:
            # Remember the column count of the very first row as the initial segment width.
            currentSegmentColumnCount = columnCount
            isFirstRow = False
        if isReadFieldNamesFromHeader and (rowIndex == headerRowsToSkip - 1):
            fieldNames = rowToAnalyze
        if (rowIndex >= headerRowsToSkip) and (columnCount != currentSegmentColumnCount):
            _log.debug(u"  segment starts in row %d after %d rows", rowIndex, currentSegmentRowCount)
            if currentSegmentRowCount > longestSegmentRowCount:
                rowIndexWhereLongestSegmentStarts = rowIndexWhereCurrentSegmentStarted
                longestSegmentRowCount = currentSegmentRowCount
                longestSegmentColumnCount = currentSegmentColumnCount
            rowIndexWhereCurrentSegmentStarted = rowIndex
            currentSegmentRowCount = 0
            currentSegmentColumnCount = columnCount
        else:
            currentSegmentRowCount += 1
        rowIndex += 1

    # Validate field names.
    if fieldNames is not None:
        if isReadFieldNamesFromHeader:
            location = tools.InputLocation(readable, hasCell=True)
            location.advanceLine(headerRowsToSkip)
        else:
            location = None
        if not fieldNames:
            raise data.DataFormatSyntaxError(u"the field names specified must contain at least 1 name", location)
        uniqueFieldNames = set()
        for nameIndex in range(len(fieldNames)):
            fieldNameToCheck = fieldNames[nameIndex]
            if isReadFieldNamesFromHeader:
                fieldNameToCheck = _tools.namified(fieldNameToCheck)
            fieldNameToCheck = fields.validatedFieldName(fieldNameToCheck, location)
            if fieldNameToCheck in uniqueFieldNames:
                raise fields.FieldSyntaxError(u"field name must be unique: %s" % fieldNameToCheck, location)
            fieldNames[nameIndex] = fieldNameToCheck
            uniqueFieldNames.add(fieldNameToCheck)
            if location:
                location.advanceCell()

    # Handle the case where the whole file forms a single segment.
    _log.debug(u"last segment started in row %d and lasted for %d rows", rowIndexWhereCurrentSegmentStarted, currentSegmentRowCount)
    if currentSegmentRowCount > longestSegmentRowCount:
        rowIndexWhereLongestSegmentStarts = rowIndexWhereCurrentSegmentStarted
        longestSegmentRowCount = currentSegmentRowCount
        longestSegmentColumnCount = currentSegmentColumnCount

    if longestSegmentRowCount < 1:
        raise CutplaceSniffError(u"content must contain data for format to be sniffed")
    _log.debug(u"found longest segment starting in row %d lasting for %d rows having %d columns",
        rowIndexWhereLongestSegmentStarts, longestSegmentRowCount, longestSegmentColumnCount)

    assert rowIndexWhereLongestSegmentStarts is not None
    _log.debug(u"skip %d rows until longest segment starts", rowIndexWhereLongestSegmentStarts)
    readable.seek(0)
    reader = createReader(readable, **keywords)
    rowIndex = 0
    location = tools.InputLocation(readable)
    while rowIndex < rowIndexWhereLongestSegmentStarts:
        reader.next()
        location.advanceLine()
        rowIndex += 1

    _log.debug(u"analyze longest segment of rows with same column count")
    columnInfos = []
    for columnIndex in range(longestSegmentColumnCount):
        columnInfoToAppend = _ColumnSniffInfo(columnIndex, dataFormat)
        if fieldNames:
            columnInfoToAppend.name = fieldNames[columnIndex]
        columnInfos.append(columnInfoToAppend)
    rowIndex = 0
    while rowIndex < longestSegmentRowCount:
        rowToAnalyze = reader.next()
        if rowIndex >= headerRowsToSkip:
            columnCountOfRowToAnalyze = len(rowToAnalyze)
            if columnCountOfRowToAnalyze != longestSegmentColumnCount:
                raise CutplaceSniffError(u"data must not change between sniffer passes, but row %d now has %d columns instead of %d" \
                    % (rowIndex + 1, columnCountOfRowToAnalyze, longestSegmentColumnCount), location)
            for itemIndex in range(longestSegmentColumnCount):
                value = rowToAnalyze[itemIndex]
                columnInfos[itemIndex].process(value)
        location.advanceLine()
        rowIndex += 1

    # Make sure that decimal fields either use comma or point.
    decimalCommaColumnCount = 0
    decimalPointColumnCount = 0
    usesThousandSeparator = False
    for columnInfo in columnInfos:
        if columnInfo.isNumber and not columnInfo.isInteger:
            if columnInfo.isDecimalComma:
                _log.debug(u"field is decimal with comma as separator: %s", columnInfo.name)
                decimalCommaColumnCount += 1
                if columnInfo.usesThousandsSeparator:
                    _log.debug(u"  decimal field uses point as thousands separator: %s", columnInfo.name)
                    usesThousandSeparator = True
            elif columnInfo.isDecimalPoint:
                _log.debug(u"field is decimal with point as separator: %s", columnInfo.name)
                decimalPointColumnCount += 1
                if columnInfo.usesThousandsSeparator:
                    _log.debug(u"  decimal field uses comma as thousands separator: %s", columnInfo.name)
                    usesThousandSeparator = True

    if (decimalCommaColumnCount > 0) and (decimalPointColumnCount > 0):
        _log.warning(u"columns use different decimal separators: %d use comma, %d use point",
            decimalCommaColumnCount, decimalPointColumnCount)
        hasToChangeDecimalComma = (decimalCommaColumnCount < decimalPointColumnCount)
        if hasToChangeDecimalComma:
            decimalCommaColumnCount = 0
        else:
            decimalPointColumnCount = 0
        for columnInfo in columnInfos:
            if (columnInfo.isDecimalComma and hasToChangeDecimalComma) or (columnInfo.isDecimalPoint and not hasToChangeDecimalComma):
                _log.warning(u"  change '%s' to text field", columnInfo.name)
                columnInfo.changeToTextField()
    assert (decimalCommaColumnCount == 0) or (decimalPointColumnCount == 0)

    for columnIndex in range(longestSegmentColumnCount):
        _log.debug(u"  %s" % columnInfos[columnIndex].asFieldFormat())

    # Build rows for CID: data format.
    icdRows = []
    icdRows.append(["", "Interface: <Name>"])
    icdRows.append([])
    for dataFormatRow in dataFormat.asIcdRows():
        dataFormatCsvRow = ['d']
        dataFormatCsvRow.extend(dataFormatRow)
        icdRows.append(dataFormatCsvRow)
    _log.info(u"number of decimal fields: %d with comma, %d with point", decimalCommaColumnCount, decimalPointColumnCount)
    icdRows.extend(decimalRows(decimalCommaColumnCount > 0, usesThousandSeparator, ",", "."))
    icdRows.extend(decimalRows(decimalPointColumnCount > 0, usesThousandSeparator, ".", ","))
    icdRows.append([])

    # Build rows for CID: data field formats.
    icdRows.append(["", "Field", "Example", "Empty?", "Length", "Type", "Rule"])
    for columnInfo in columnInfos:
        fieldFormat = columnInfo.asFieldFormat()
        fieldRow = ["f"]
        fieldRow.extend(fieldFormat.asIcdRow())
        icdRows.append(fieldRow)
        _log.debug(u"  %s", fieldRow)

    return icdRows
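
A minimal usage sketch for createCidRows, assuming the module above is importable and that any seekable file-like object such as StringIO works as the readable argument (the function rewinds it with readable.seek(0)); the sample CSV content and variable names are made up for illustration:

from StringIO import StringIO

# Hypothetical sample data; the first row is a header carrying the field names.
csvData = StringIO(
    "Name,Date of birth,Size\n"
    "Alice,1987-03-11,172.5\n"
    "Bob,1992-11-02,181.0\n"
)
# Skip 1 header row so the field names are derived from it.
cidRowsForSample = createCidRows(csvData, header=1, encoding="utf-8")
for cidRow in cidRowsForSample:
    print cidRow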