Example #1
  def testFieldMetaSpecialIsValid(self):
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.none), True)
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.reset), True)
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.sequence), True)
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.timestamp), True)
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.category), True)
    self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.learning), True)

    self.assertEqual(FieldMetaSpecial.isValid("bogus-special"), False)
Example #2
    def testFieldMetaSpecialIsValid(self):
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.none), True)
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.reset),
                         True)
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.sequence),
                         True)
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.timestamp),
                         True)
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.category),
                         True)
        self.assertEqual(FieldMetaSpecial.isValid(FieldMetaSpecial.learning),
                         True)

        self.assertEqual(FieldMetaSpecial.isValid("bogus-special"), False)
Example #3
  def __init__(self, streamID, write=False, fields=None, missingValues=None,
               bookmark=None, includeMS=True, firstRecord=None):
    super(FileRecordStream, self).__init__()

    # Only bookmark or firstRecord can be specified, not both
    if bookmark is not None and firstRecord is not None:
      raise RuntimeError(
          "Only bookmark or firstRecord can be specified, not both")

    if fields is None:
      fields = []
    if missingValues is None:
      missingValues = ['']

    # We'll be operating on csvs with arbitrarily long fields
    size = 2**27
    csv.field_size_limit(size)

    self._filename = streamID
    # We can't guarantee what system files are coming from, use universal
    # newlines
    self._write = write
    self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
    self._file = open(self._filename, self._mode)
    self._sequences = set()
    self.rewindAtEOF = False

    if write:
      assert fields is not None
      assert isinstance(fields, (tuple, list))
      # Verify all fields are 3-tuples
      assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                 for f in fields)
      names, types, specials = zip(*fields)
      self._writer = csv.writer(self._file)
    else:
      # Read header lines
      self._reader = csv.reader(self._file, dialect="excel")
      try:
        names = [n.strip() for n in self._reader.next()]
      except:
        raise Exception('The header line of the file %s contained a NULL byte' \
                        % self._filename)
      types = [t.strip() for t in self._reader.next()]
      specials = [s.strip() for s in self._reader.next()]

      # If there are no specials, this means there was a blank line
      if len(specials) == 0:
        specials=[""]

    if not len(names) == len(types) == len(specials):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    for t in types:
      if not FieldMetaType.isValid(t):
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not a valid FieldMetaType' % (self._filename, t,))

    for s in specials:
      if not FieldMetaSpecial.isValid(s):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

    self._fields = [FieldMetaInfo(*attrs)
                    for attrs in zip(names, types, specials)]
    self._fieldCount = len(self._fields)

    # Keep track of how many records have been read/written
    self._recordCount = 0

    self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                          if FieldMetaSpecial.timestamp in specials else None)
    self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                      if FieldMetaSpecial.reset in specials else None)
    self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                           if FieldMetaSpecial.sequence in specials else None)
    self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                         if FieldMetaSpecial.category in specials else None)
    self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                         if FieldMetaSpecial.learning in specials else None)

    # keep track of the current sequence
    self._currSequence = None
    self._currTime = None

    if self._timeStampIdx:
      assert types[self._timeStampIdx] == FieldMetaType.datetime
    if self._sequenceIdIdx:
      assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                            FieldMetaType.integer)
    if self._resetIdx:
      assert types[self._resetIdx] == FieldMetaType.integer
    if self._categoryIdx:
      assert types[self._categoryIdx] in (FieldMetaType.list,
                                          FieldMetaType.integer)
    if self._learningIdx:
      assert types[self._learningIdx] == FieldMetaType.integer

    # Convert the types to the actual types in order to convert the strings
    if self._mode == self._FILE_READ_MODE:
      m = {FieldMetaType.integer: intOrNone,
           FieldMetaType.float: floatOrNone,
           FieldMetaType.boolean: parseBool,
           FieldMetaType.string: unescape,
           FieldMetaType.datetime: parseTimestamp,
           FieldMetaType.sdr: parseSdr,
           FieldMetaType.list: parseStringList}
    else:
      if includeMS:
        datetimeFunc = serializeTimestamp
      else:
        datetimeFunc = serializeTimestampNoMS
      m = {FieldMetaType.integer: str,
           FieldMetaType.float: str,
           FieldMetaType.string: escape,
           FieldMetaType.boolean: str,
           FieldMetaType.datetime: datetimeFunc,
           FieldMetaType.sdr: serializeSdr,
           FieldMetaType.list: stripList}

    self._adapters = [m[t] for t in types]

    self._missingValues = missingValues

    #
    # If a bookmark or firstRecord was given, skip over the first N records
    #
    if bookmark is not None:
      rowsToSkip = self._getStartRow(bookmark)
    elif firstRecord is not None:
      rowsToSkip = firstRecord
    else:
      rowsToSkip = 0

    while rowsToSkip > 0:
      self.next()
      rowsToSkip -= 1


    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None
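
The read branch above expects three header rows before any data: field names, field types, and special flags. Below is a minimal sketch (not part of the scraped examples) of such a file and of opening it for reading; the file name, field names, and sample values are hypothetical, and the type names and single-letter special flags ("datetime"/"float"/"int", "T" for timestamp, "R" for reset) are assumed from the standard NuPIC CSV format.

from nupic.data.file_record_stream import FileRecordStream

# Hypothetical input file with the three header rows the constructor parses.
with open("hypothetical_input.csv", "w") as f:
  f.write("timestamp,consumption,reset\n")   # row 1: field names
  f.write("datetime,float,int\n")            # row 2: FieldMetaType values
  f.write("T,,R\n")                          # row 3: FieldMetaSpecial flags
  f.write("2015-01-01 00:00:00,12.5,1\n")    # first data record

stream = FileRecordStream("hypothetical_input.csv")
record = stream.next()  # values converted through the adapter table above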
Example #4
  def __init__(self, streamID, write=False, fields=None, missingValues=None,
               bookmark=None, includeMS=True, firstRecord=None):
    """
    streamID:
        CSV file name, input or output
    write:
        True or False, open for writing if True
    fields:
        a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only
        applicable when write==True
    missingValues:
        a list of strings that should be treated as missing values
    bookmark:
        a reference to the previous reader; if passed in, records will be
        returned starting from the point where the bookmark was requested.
        Either bookmark or firstRecord can be specified, not both. If bookmark
        is used, then firstRecord MUST be None.
    includeMS:
        If false, the microseconds portion is not included in the
        generated output file timestamp fields. This makes it compatible
        with reading in from Excel.
    firstRecord:
        0-based index of the first record to start reading from. Either bookmark
        or firstRecord can be specified, not both. If bookmark is used, then
        firstRecord MUST be None.

    Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none)

    The name is the name of the field. The type is one of the constants in
    `FieldMetaType`. The special is one of the `FieldMetaSpecial` values that
    designates the field as the sequenceId, reset, timestamp, or category field.
    With the exception of multiple categories, there can be at most one of each.
    There may be multiple fields of type datetime, but no more than one of them
    may be the timestamp field (FieldMetaSpecial.timestamp). The sequence id
    field must be either a string or an int. The reset field must be an int (and
    must contain 0 or 1).

    The category field must be an int or space-separated list of ints, where
    the former represents single-label classification and the latter is for
    multi-label classification (e.g. "1 3 4" designates a record for labels 1,
    3, and 4). The number of categories is allowed to vary record to record;
    sensor regions represent non-categories with -1, thus the category values
    must be >= 0.

    The FileRecordStream iterates over the field names, types and specials and
    stores the information.
    """
    super(FileRecordStream, self).__init__()

    # Only bookmark or firstRecord can be specified, not both
    if bookmark is not None and firstRecord is not None:
      raise RuntimeError(
          "Only bookmark or firstRecord can be specified, not both")

    if fields is None:
      fields = []
    if missingValues is None:
      missingValues = ['']

    # We'll be operating on csvs with arbitrarily long fields
    size = 2**27
    csv.field_size_limit(size)

    self._filename = streamID
    # We can't guarantee what system files are coming from, use universal
    # newlines
    self._write = write
    self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
    self._file = open(self._filename, self._mode)
    self._sequences = set()
    self.rewindAtEOF = False

    if write:
      assert fields is not None
      assert isinstance(fields, (tuple, list))
      # Verify all fields are 3-tuples
      assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                 for f in fields)
      names, types, specials = zip(*fields)
      self._writer = csv.writer(self._file)
    else:
      # Make sure readline() works on windows too
      os.linesep = '\n'
      # Read header lines
      self._reader = csv.reader(self._file, dialect="excel")
      try:
        names = [n.strip() for n in self._reader.next()]
      except:
        raise Exception('The header line of the file %s contained a NULL byte' \
                        % self._filename)
      types = [t.strip() for t in self._reader.next()]
      specials = [s.strip() for s in self._reader.next()]

      # If there are no specials, this means there was a blank line
      if len(specials) == 0:
        specials=[""]

    if not len(names) == len(types) == len(specials):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    for t in types:
      if not FieldMetaType.isValid(t):
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not a valid FieldMetaType' % (self._filename, t,))

    for s in specials:
      if not FieldMetaSpecial.isValid(s):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

    self._fields = [FieldMetaInfo(*attrs)
                    for attrs in zip(names, types, specials)]
    self._fieldCount = len(self._fields)

    # Keep track of how many records have been read/written
    self._recordCount = 0

    self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                          if FieldMetaSpecial.timestamp in specials else None)
    self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                      if FieldMetaSpecial.reset in specials else None)
    self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                           if FieldMetaSpecial.sequence in specials else None)
    self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                         if FieldMetaSpecial.category in specials else None)
    self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                         if FieldMetaSpecial.learning in specials else None)

    # keep track of the current sequence
    self._currSequence = None
    self._currTime = None

    if self._timeStampIdx:
      assert types[self._timeStampIdx] == FieldMetaType.datetime
    if self._sequenceIdIdx:
      assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                            FieldMetaType.integer)
    if self._resetIdx:
      assert types[self._resetIdx] == FieldMetaType.integer
    if self._categoryIdx:
      assert types[self._categoryIdx] in (FieldMetaType.list,
                                          FieldMetaType.integer)
    if self._learningIdx:
      assert types[self._learningIdx] == FieldMetaType.integer

    # Convert the types to the actual types in order to convert the strings
    if self._mode == self._FILE_READ_MODE:
      m = {FieldMetaType.integer: intOrNone,
           FieldMetaType.float: floatOrNone,
           FieldMetaType.boolean: parseBool,
           FieldMetaType.string: unescape,
           FieldMetaType.datetime: parseTimestamp,
           FieldMetaType.sdr: parseSdr,
           FieldMetaType.list: parseStringList}
    else:
      if includeMS:
        datetimeFunc = serializeTimestamp
      else:
        datetimeFunc = serializeTimestampNoMS
      m = {FieldMetaType.integer: str,
           FieldMetaType.float: str,
           FieldMetaType.string: escape,
           FieldMetaType.boolean: str,
           FieldMetaType.datetime: datetimeFunc,
           FieldMetaType.sdr: serializeSdr,
           FieldMetaType.list: stripList}

    self._adapters = [m[t] for t in types]

    self._missingValues = missingValues

    #
    # If a bookmark or firstRecord was given, skip over the first N records
    #
    if bookmark is not None:
      rowsToSkip = self._getStartRow(bookmark)
    elif firstRecord is not None:
      rowsToSkip = firstRecord
    else:
      rowsToSkip = 0

    while rowsToSkip > 0:
      self.next()
      rowsToSkip -= 1


    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None
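
A hedged usage sketch of the write path the docstring above describes: each field is a (name, type, special) 3-tuple, with at most one field per special flag. The file name, the field names and values, and the appendRecord/close calls are assumptions for illustration (appendRecord is not shown in the code above); the import path follows the nupic.data.fieldmeta module mentioned in the docstring.

import datetime

from nupic.data.fieldmeta import FieldMetaSpecial, FieldMetaType
from nupic.data.file_record_stream import FileRecordStream

# Each entry is (name, type, special or FieldMetaSpecial.none).
fields = [("timestamp", FieldMetaType.datetime, FieldMetaSpecial.timestamp),
          ("consumption", FieldMetaType.float, FieldMetaSpecial.none),
          ("reset", FieldMetaType.integer, FieldMetaSpecial.reset)]

out = FileRecordStream("hypothetical_output.csv", write=True, fields=fields)
out.appendRecord([datetime.datetime(2015, 1, 1, 0, 0, 0), 12.5, 1])  # assumed writer API
out.close()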
Example #5
    def __init__(self,
                 streamID,
                 write=False,
                 fields=None,
                 missingValues=None,
                 bookmark=None,
                 includeMS=True,
                 firstRecord=None):
        """
    streamID:
        CSV file name, input or output
    write:
        True or False, open for writing if True
    fields:
        a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only
        applicable when write==True
    missingValues:
        a list of strings that should be treated as missing values
    bookmark:
        a reference to the previous reader; if passed in, records will be
        returned starting from the point where the bookmark was requested.
        Either bookmark or firstRecord can be specified, not both. If bookmark
        is used, then firstRecord MUST be None.
    includeMS:
        If false, the microseconds portion is not included in the
        generated output file timestamp fields. This makes it compatible
        with reading in from Excel.
    firstRecord:
        0-based index of the first record to start reading from. Either bookmark
        or firstRecord can be specified, not both. If bookmark is used, then
        firstRecord MUST be None.

    Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none)

    The name is the name of the field. The type is one of the constants in
    `FieldMetaType`. The special is one of the `FieldMetaSpecial` values that
    designates the field as the sequenceId, reset, timestamp, or category field.
    With the exception of multiple categories, there can be at most one of each.
    There may be multiple fields of type datetime, but no more than one of them
    may be the timestamp field (FieldMetaSpecial.timestamp). The sequence id
    field must be either a string or an int. The reset field must be an int (and
    must contain 0 or 1).

    The category field must be an int or space-separated list of ints, where
    the former represents single-label classification and the latter is for
    multi-label classification (e.g. "1 3 4" designates a record for labels 1,
    3, and 4). The number of categories is allowed to vary record to record;
    sensor regions represent non-categories with -1, thus the category values
    must be >= 0.

    The FileRecordStream iterates over the field names, types and specials and
    stores the information.
    """
        super(FileRecordStream, self).__init__()

        # Only bookmark or firstRecord can be specified, not both
        if bookmark is not None and firstRecord is not None:
            raise RuntimeError(
                "Only bookmark or firstRecord can be specified, not both")

        if fields is None:
            fields = []
        if missingValues is None:
            missingValues = ['']

        # We'll be operating on csvs with arbitrarily long fields
        size = 2**27
        csv.field_size_limit(size)

        self._filename = streamID
        # We can't guarantee what system files are coming from, use universal
        # newlines
        self._write = write
        self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
        self._file = open(self._filename, self._mode)
        self._sequences = set()
        self.rewindAtEOF = False

        if write:
            assert fields is not None
            assert isinstance(fields, (tuple, list))
            # Verify all fields are 3-tuples
            assert all(
                isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                for f in fields)
            names, types, specials = zip(*fields)
            self._writer = csv.writer(self._file)
        else:
            # Read header lines
            self._reader = csv.reader(self._file, dialect="excel")
            try:
                names = [n.strip() for n in self._reader.next()]
            except:
                raise Exception('The header line of the file %s contained a NULL byte' \
                                % self._filename)
            types = [t.strip() for t in self._reader.next()]
            specials = [s.strip() for s in self._reader.next()]

            # If there are no specials, this means there was a blank line
            if len(specials) == 0:
                specials = [""]

        if not len(names) == len(types) == len(specials):
            raise Exception('Invalid file format: different number of fields '
                            'in the header rows of file %s (%d, %d, %d)' %
                            (streamID, len(names), len(types), len(specials)))

        # Verify standard file format
        for t in types:
            if not FieldMetaType.isValid(t):
                raise Exception(
                    'Invalid file format for "%s" - field type "%s" '
                    'not a valid FieldMetaType' % (
                        self._filename,
                        t,
                    ))

        for s in specials:
            if not FieldMetaSpecial.isValid(s):
                raise Exception(
                    'Invalid file format. \'%s\' is not a valid special '
                    'flag' % s)

        self._fields = [
            FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)
        ]
        self._fieldCount = len(self._fields)

        # Keep track of how many records have been read/written
        self._recordCount = 0

        self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp) if
                              FieldMetaSpecial.timestamp in specials else None)
        self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                          if FieldMetaSpecial.reset in specials else None)
        self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence) if
                               FieldMetaSpecial.sequence in specials else None)
        self._categoryIdx = (specials.index(FieldMetaSpecial.category) if
                             FieldMetaSpecial.category in specials else None)
        self._learningIdx = (specials.index(FieldMetaSpecial.learning) if
                             FieldMetaSpecial.learning in specials else None)

        # keep track of the current sequence
        self._currSequence = None
        self._currTime = None

        if self._timeStampIdx:
            assert types[self._timeStampIdx] == FieldMetaType.datetime
        if self._sequenceIdIdx:
            assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                                  FieldMetaType.integer)
        if self._resetIdx:
            assert types[self._resetIdx] == FieldMetaType.integer
        if self._categoryIdx:
            assert types[self._categoryIdx] in (FieldMetaType.list,
                                                FieldMetaType.integer)
        if self._learningIdx:
            assert types[self._learningIdx] == FieldMetaType.integer

        # Convert the types to the actual types in order to convert the strings
        if self._mode == self._FILE_READ_MODE:
            m = {
                FieldMetaType.integer: intOrNone,
                FieldMetaType.float: floatOrNone,
                FieldMetaType.boolean: parseBool,
                FieldMetaType.string: unescape,
                FieldMetaType.datetime: parseTimestamp,
                FieldMetaType.sdr: parseSdr,
                FieldMetaType.list: parseStringList
            }
        else:
            if includeMS:
                datetimeFunc = serializeTimestamp
            else:
                datetimeFunc = serializeTimestampNoMS
            m = {
                FieldMetaType.integer: str,
                FieldMetaType.float: str,
                FieldMetaType.string: escape,
                FieldMetaType.boolean: str,
                FieldMetaType.datetime: datetimeFunc,
                FieldMetaType.sdr: serializeSdr,
                FieldMetaType.list: stripList
            }

        self._adapters = [m[t] for t in types]

        self._missingValues = missingValues

        #
        # If a bookmark or firstRecord was given, skip over the first N records
        #
        if bookmark is not None:
            rowsToSkip = self._getStartRow(bookmark)
        elif firstRecord is not None:
            rowsToSkip = firstRecord
        else:
            rowsToSkip = 0

        while rowsToSkip > 0:
            self.next()
            rowsToSkip -= 1

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None
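
A hedged sketch of the two mutually exclusive resume options the docstring describes (bookmark and firstRecord). The file name is hypothetical and getBookmark is assumed to be the reader's bookmark accessor; it is not shown in the code above. Supplying both arguments raises the RuntimeError shown in the constructor.

from nupic.data.file_record_stream import FileRecordStream

reader = FileRecordStream("hypothetical_input.csv")
reader.next()                # consume one record
mark = reader.getBookmark()  # assumed accessor for the current read position

# Resume from the bookmarked position...
resumed = FileRecordStream("hypothetical_input.csv", bookmark=mark)

# ...or skip a 0-based number of records explicitly. Passing both bookmark
# and firstRecord here would raise RuntimeError.
skipped = FileRecordStream("hypothetical_input.csv", firstRecord=1)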