Exemple #1
0
  def testFieldMetaTypeIsValid(self):
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.string), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.datetime), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.integer), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.float), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.boolean), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.list), True)
    self.assertEqual(FieldMetaType.isValid(FieldMetaType.sdr), True)

    self.assertEqual(FieldMetaType.isValid("bogus-type"), False)
Exemple #2
0
    def __init__(self,
                 streamID,
                 write=False,
                 fields=None,
                 missingValues=None,
                 bookmark=None,
                 includeMS=True,
                 firstRecord=None):
        super(FileRecordStream, self).__init__()

        # Only bookmark or firstRow can be specified, not both
        if bookmark is not None and firstRecord is not None:
            raise RuntimeError(
                "Only bookmark or firstRecord can be specified, not both")

        if fields is None:
            fields = []
        if missingValues is None:
            missingValues = ['']

        # We'll be operating on csvs with arbitrarily long fields
        size = 2**27
        csv.field_size_limit(size)

        self._filename = streamID
        # We can't guarantee what system files are coming from, use universal
        # newlines
        self._write = write
        self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
        self._file = open(self._filename, self._mode)
        self._sequences = set()
        self.rewindAtEOF = False

        if write:
            assert fields is not None
            assert isinstance(fields, (tuple, list))
            # Verify all fields are 3-tuple
            assert all(
                isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                for f in fields)
            names, types, specials = zip(*fields)
            self._writer = csv.writer(self._file)
        else:
            # Read header lines
            self._reader = csv.reader(self._file, dialect="excel")
            try:
                names = [n.strip() for n in self._reader.next()]
            except:
                raise Exception('The header line of the file %s contained a NULL byte' \
                                % self._filename)
            types = [t.strip() for t in self._reader.next()]
            specials = [s.strip() for s in self._reader.next()]

            # If there are no specials, this means there was a blank line
            if len(specials) == 0:
                specials = [""]

        if not len(names) == len(types) == len(specials):
            raise Exception('Invalid file format: different number of fields '
                            'in the header rows of file %s (%d, %d, %d)' %
                            (streamID, len(names), len(types), len(specials)))

        # Verify standard file format
        for t in types:
            if not FieldMetaType.isValid(t):
                raise Exception(
                    'Invalid file format for "%s" - field type "%s" '
                    'not a valid FieldMetaType' % (
                        self._filename,
                        t,
                    ))

        for s in specials:
            if not FieldMetaSpecial.isValid(s):
                raise Exception(
                    'Invalid file format. \'%s\' is not a valid special '
                    'flag' % s)

        self._fields = [
            FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)
        ]
        self._fieldCount = len(self._fields)

        # Keep track on how many records have been read/written
        self._recordCount = 0

        self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp) if
                              FieldMetaSpecial.timestamp in specials else None)
        self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                          if FieldMetaSpecial.reset in specials else None)
        self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence) if
                               FieldMetaSpecial.sequence in specials else None)
        self._categoryIdx = (specials.index(FieldMetaSpecial.category) if
                             FieldMetaSpecial.category in specials else None)
        self._learningIdx = (specials.index(FieldMetaSpecial.learning) if
                             FieldMetaSpecial.learning in specials else None)

        # keep track of the current sequence
        self._currSequence = None
        self._currTime = None

        if self._timeStampIdx:
            assert types[self._timeStampIdx] == FieldMetaType.datetime
        if self._sequenceIdIdx:
            assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                                  FieldMetaType.integer)
        if self._resetIdx:
            assert types[self._resetIdx] == FieldMetaType.integer
        if self._categoryIdx:
            assert types[self._categoryIdx] in (FieldMetaType.list,
                                                FieldMetaType.integer)
        if self._learningIdx:
            assert types[self._learningIdx] == FieldMetaType.integer

        # Convert the types to the actual types in order to convert the strings
        if self._mode == self._FILE_READ_MODE:
            m = {
                FieldMetaType.integer: intOrNone,
                FieldMetaType.float: floatOrNone,
                FieldMetaType.boolean: parseBool,
                FieldMetaType.string: unescape,
                FieldMetaType.datetime: parseTimestamp,
                FieldMetaType.sdr: parseSdr,
                FieldMetaType.list: parseStringList
            }
        else:
            if includeMS:
                datetimeFunc = serializeTimestamp
            else:
                datetimeFunc = serializeTimestampNoMS
            m = {
                FieldMetaType.integer: str,
                FieldMetaType.float: str,
                FieldMetaType.string: escape,
                FieldMetaType.boolean: str,
                FieldMetaType.datetime: datetimeFunc,
                FieldMetaType.sdr: serializeSdr,
                FieldMetaType.list: stripList
            }

        self._adapters = [m[t] for t in types]

        self._missingValues = missingValues

        #
        # If the bookmark is set, we need to skip over first N records
        #
        if bookmark is not None:
            rowsToSkip = self._getStartRow(bookmark)
        elif firstRecord is not None:
            rowsToSkip = firstRecord
        else:
            rowsToSkip = 0

        while rowsToSkip > 0:
            self.next()
            rowsToSkip -= 1

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None
Exemple #3
0
  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    json_helpers.validate(streamDef,
                          schemaPath=pkg_resources.resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
      self._needFieldsFiltering = False
    else:
      self._needFieldsFiltering = True

    # Types must be specified in streamdef json, or in case of the
    #  file_recod_stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes is not None:
      for dataType in streamFieldTypes:
        assert FieldMetaType.isValid(dataType)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # =======================================================================
    # Open up the underlying record store
    dataUrl = sourceDict.get('source', None)
    assert dataUrl is not None
    self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                         bookmark, firstRecordIdx)
    assert self._recordStore is not None


    # =======================================================================
    # Prepare the data structures we need for returning just the fields
    #  the caller wants from each record
    recordStoreFields = self._recordStore.getFields()
    self._recordStoreFieldNames = self._recordStore.getFieldNames()

    if not self._needFieldsFiltering:
      self._streamFieldNames = self._recordStoreFieldNames

    # Build up the field definitions for each field. This is a list of tuples
    #  of (name, type, special)
    self._streamFields = []
    for dstIdx, name in enumerate(self._streamFieldNames):
      if name not in self._recordStoreFieldNames:
        raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

      fieldIdx = self._recordStoreFieldNames.index(name)
      fieldType = recordStoreFields[fieldIdx].type
      fieldSpecial = recordStoreFields[fieldIdx].special

      # If the types or specials were defined in the stream definition,
      #   then override what was found in the record store
      if streamFieldTypes is not None:
        fieldType = streamFieldTypes[dstIdx]

      if streamResetFieldName is not None and streamResetFieldName == name:
        fieldSpecial = FieldMetaSpecial.reset
      if streamTimeFieldName is not None and streamTimeFieldName == name:
        fieldSpecial = FieldMetaSpecial.timestamp
      if (streamSequenceFieldName is not None and
          streamSequenceFieldName == name):
        fieldSpecial = FieldMetaSpecial.sequence

      self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))


    # ========================================================================
    # Create the aggregator which will handle aggregation of records before
    #  returning them.
    self._aggregator = Aggregator(
            aggregationInfo=streamDef.get('aggregation', None),
            inputFields=recordStoreFields,
            timeFieldName=streamDef.get('timeField', None),
            sequenceIdFieldName=streamDef.get('sequenceIdField', None),
            resetFieldName=streamDef.get('resetField', None))

    # We rely on the aggregator to tell us the bookmark of the last raw input
    #  that contributed to the aggregated record
    self._aggBookmark = None

    # Compute the aggregation period in terms of months and seconds
    if 'aggregation' in streamDef:
      self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                streamDef.get('aggregation'))
    else:
      self._aggMonthsAndSeconds = None


    # ========================================================================
    # Are we saving the generated output to a csv?
    if saveOutput:
      tmpDir = tempfile.mkdtemp()
      outFilename = os.path.join(tmpDir, "generated_output.csv")
      self._logger.info("StreamReader: Saving generated records to: '%s'" %
                        outFilename)
      self._writer = FileRecordStream(streamID=outFilename,
                                      write=True,
                                      fields=self._streamFields)
    else:
      self._writer = None
Exemple #4
0
  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    json_helpers.validate(streamDef,
                          schemaPath=pkg_resources.resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
      self._needFieldsFiltering = False
    else:
      self._needFieldsFiltering = True

    # Types must be specified in streamdef json, or in case of the
    #  file_recod_stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes is not None:
      for dataType in streamFieldTypes:
        assert FieldMetaType.isValid(dataType)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # =======================================================================
    # Open up the underlying record store
    dataUrl = sourceDict.get('source', None)
    assert dataUrl is not None
    self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                         bookmark, firstRecordIdx)
    assert self._recordStore is not None


    # =======================================================================
    # Prepare the data structures we need for returning just the fields
    #  the caller wants from each record
    recordStoreFields = self._recordStore.getFields()
    self._recordStoreFieldNames = self._recordStore.getFieldNames()

    if not self._needFieldsFiltering:
      self._streamFieldNames = self._recordStoreFieldNames

    # Build up the field definitions for each field. This is a list of tuples
    #  of (name, type, special)
    self._streamFields = []
    for dstIdx, name in enumerate(self._streamFieldNames):
      if name not in self._recordStoreFieldNames:
        raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

      fieldIdx = self._recordStoreFieldNames.index(name)
      fieldType = recordStoreFields[fieldIdx].type
      fieldSpecial = recordStoreFields[fieldIdx].special

      # If the types or specials were defined in the stream definition,
      #   then override what was found in the record store
      if streamFieldTypes is not None:
        fieldType = streamFieldTypes[dstIdx]

      if streamResetFieldName is not None and streamResetFieldName == name:
        fieldSpecial = FieldMetaSpecial.reset
      if streamTimeFieldName is not None and streamTimeFieldName == name:
        fieldSpecial = FieldMetaSpecial.timestamp
      if (streamSequenceFieldName is not None and
          streamSequenceFieldName == name):
        fieldSpecial = FieldMetaSpecial.sequence

      self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))


    # ========================================================================
    # Create the aggregator which will handle aggregation of records before
    #  returning them.
    self._aggregator = Aggregator(
            aggregationInfo=streamDef.get('aggregation', None),
            inputFields=recordStoreFields,
            timeFieldName=streamDef.get('timeField', None),
            sequenceIdFieldName=streamDef.get('sequenceIdField', None),
            resetFieldName=streamDef.get('resetField', None))

    # We rely on the aggregator to tell us the bookmark of the last raw input
    #  that contributed to the aggregated record
    self._aggBookmark = None

    # Compute the aggregation period in terms of months and seconds
    if 'aggregation' in streamDef:
      self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                streamDef.get('aggregation'))
    else:
      self._aggMonthsAndSeconds = None


    # ========================================================================
    # Are we saving the generated output to a csv?
    if saveOutput:
      tmpDir = tempfile.mkdtemp()
      outFilename = os.path.join(tmpDir, "generated_output.csv")
      self._logger.info("StreamReader: Saving generated records to: '%s'" %
                        outFilename)
      self._writer = FileRecordStream(streamID=outFilename,
                                      write=True,
                                      fields=self._streamFields)
    else:
      self._writer = None
Exemple #5
0
  def __init__(self, streamID, write=False, fields=None, missingValues=None,
               bookmark=None, includeMS=True, firstRecord=None):
    super(FileRecordStream, self).__init__()

    # Only bookmark or firstRow can be specified, not both
    if bookmark is not None and firstRecord is not None:
      raise RuntimeError(
          "Only bookmark or firstRecord can be specified, not both")

    if fields is None:
      fields = []
    if missingValues is None:
      missingValues = ['']

    # We'll be operating on csvs with arbitrarily long fields
    size = 2**27
    csv.field_size_limit(size)

    self._filename = streamID
    # We can't guarantee what system files are coming from, use universal
    # newlines
    self._write = write
    self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
    self._file = open(self._filename, self._mode)
    self._sequences = set()
    self.rewindAtEOF = False

    if write:
      assert fields is not None
      assert isinstance(fields, (tuple, list))
      # Verify all fields are 3-tuple
      assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                 for f in fields)
      names, types, specials = zip(*fields)
      self._writer = csv.writer(self._file)
    else:
      # Read header lines
      self._reader = csv.reader(self._file, dialect="excel")
      try:
        names = [n.strip() for n in self._reader.next()]
      except:
        raise Exception('The header line of the file %s contained a NULL byte' \
                        % self._filename)
      types = [t.strip() for t in self._reader.next()]
      specials = [s.strip() for s in self._reader.next()]

      # If there are no specials, this means there was a blank line
      if len(specials) == 0:
        specials=[""]

    if not len(names) == len(types) == len(specials):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    for t in types:
      if not FieldMetaType.isValid(t):
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not a valid FieldMetaType' % (self._filename, t,))

    for s in specials:
      if not FieldMetaSpecial.isValid(s):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

    self._fields = [FieldMetaInfo(*attrs)
                    for attrs in zip(names, types, specials)]
    self._fieldCount = len(self._fields)

    # Keep track on how many records have been read/written
    self._recordCount = 0

    self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                          if FieldMetaSpecial.timestamp in specials else None)
    self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                      if FieldMetaSpecial.reset in specials else None)
    self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                           if FieldMetaSpecial.sequence in specials else None)
    self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                         if FieldMetaSpecial.category in specials else None)
    self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                         if FieldMetaSpecial.learning in specials else None)

    # keep track of the current sequence
    self._currSequence = None
    self._currTime = None

    if self._timeStampIdx:
      assert types[self._timeStampIdx] == FieldMetaType.datetime
    if self._sequenceIdIdx:
      assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                            FieldMetaType.integer)
    if self._resetIdx:
      assert types[self._resetIdx] == FieldMetaType.integer
    if self._categoryIdx:
      assert types[self._categoryIdx] in (FieldMetaType.list,
                                          FieldMetaType.integer)
    if self._learningIdx:
      assert types[self._learningIdx] == FieldMetaType.integer

    # Convert the types to the actual types in order to convert the strings
    if self._mode == self._FILE_READ_MODE:
      m = {FieldMetaType.integer: intOrNone,
           FieldMetaType.float: floatOrNone,
           FieldMetaType.boolean: parseBool,
           FieldMetaType.string: unescape,
           FieldMetaType.datetime: parseTimestamp,
           FieldMetaType.sdr: parseSdr,
           FieldMetaType.list: parseStringList}
    else:
      if includeMS:
        datetimeFunc = serializeTimestamp
      else:
        datetimeFunc = serializeTimestampNoMS
      m = {FieldMetaType.integer: str,
           FieldMetaType.float: str,
           FieldMetaType.string: escape,
           FieldMetaType.boolean: str,
           FieldMetaType.datetime: datetimeFunc,
           FieldMetaType.sdr: serializeSdr,
           FieldMetaType.list: stripList}

    self._adapters = [m[t] for t in types]

    self._missingValues = missingValues

    #
    # If the bookmark is set, we need to skip over first N records
    #
    if bookmark is not None:
      rowsToSkip = self._getStartRow(bookmark)
    elif firstRecord is not None:
      rowsToSkip = firstRecord
    else:
      rowsToSkip = 0

    while rowsToSkip > 0:
      self.next()
      rowsToSkip -= 1


    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None