def _read(self):
    """Read one length-prefixed BSON document from ``self.fh`` and decode it.

    Returns the decoded document as a dict.  When ``self.fast_string_prematch``
    is set but not found in the raw chunk, decoding is skipped and the stub
    ``{"_id": 0}`` is returned instead.

    Raises:
        InvalidBSON: if the chunk does not end with the BSON EOO (0x00) byte.
        StopIteration: on a short/truncated read (EOF); ``self.eof`` is set.
    """
    try:
        size_bits = self.fh.read(4)
        # The BSON int32 length prefix counts its own four bytes.
        size = struct.unpack("<i", size_bits)[0] - 4
        data = size_bits + self.fh.read(size)
        if len(data) != size + 4:
            raise struct.error(
                "Unable to cleanly read expected BSON Chunk; EOF, underful buffer or invalid object size."
            )
        # Compare a one-byte slice against b"\x00": indexing bytes on py3
        # yields an int, so the old `data[i] != "\x00"` was always True there.
        if data[size + 4 - 1:size + 4] != b"\x00":
            raise InvalidBSON("Bad EOO in BSON Data")
        if self.fast_string_prematch:
            # NOTE(review): assumes fast_string_prematch matches the type of
            # `data` (bytes on py3) -- confirm against callers.
            if self.fast_string_prematch in data:
                return BSON(data).decode(tz_aware=True)
            return {"_id": 0}
        return BSON(data).decode(tz_aware=True)
    except struct.error as e:  # `except X as e` -- py2's `except X, e` is a py3 SyntaxError
        self.eof = True
        raise StopIteration(e)
 def _read(self):
     """Read the next BSON chunk from ``self.fh`` and return it.

     Validates the int32 length prefix and the trailing EOO byte, applies the
     ``fast_string_prematch`` filter, then returns either the decoded document
     (when ``self.decode`` is set) or the raw bytes.  A truncated stream sets
     ``self.eof`` and raises :class:`StopIteration`.
     """
     try:
         header = self.fh.read(4)
         # The length prefix includes its own four bytes.
         body_len = struct.unpack("<i", header)[0] - 4
         data = header + self.fh.read(body_len)
         if len(data) != body_len + 4:
             raise struct.error(
                 "Unable to read expected BSON Chunk; EOF, underful buffer or invalid object size.")
         # py3 indexing of bytes gives an int; py2 gives a 1-char str.
         eoo = 0x00 if six.PY3 else "\x00"
         if data[body_len + 4 - 1] != eoo:
             raise InvalidBSON("Bad EOO in BSON Data")
         if self.fast_string_prematch.encode("utf-8") not in data:
             raise ValueError("Unknown Error")
         if not self.decode:
             return data
         try:
             return BSON(data).decode(self.codec)
         except TypeError:
             # Older pymongo: decode() takes no codec argument.
             return BSON(data).decode()
     except struct.error as e:
         self.eof = True
         raise StopIteration(e)
# --- Ejemplo n.º 3 (example separator from the scraped source; vote count: 0) ---
def _make_bson_doc(uid: str, df: pd.DataFrame, metadata) -> SON:
    """
    Takes a DataFrame and makes a BSON document ready to be inserted
    into MongoDB. Given Conritick's focus on timeseries data, the input
    DataFrame index must be a DatetimeIndex.
    Column names are kept and saved as strings.
    Index name is explicitly discarded and not saved.

    :param uid: Unique ID for the timeseries represented by the input DataFrame
    :param df: Input DataFrame
    :param metadata: Any BSON-able objects to be attached to document as metadata
    :return: BSON document
    :raises InvalidBSON: if the serialized binary payload approaches the
        MongoDB document size limit (``MAX_BSON_SIZE``)
    """
    # Measure memory before sorting/renaming so the compression ratio reflects
    # the caller's original frame.
    mem_usage = df.memory_usage().sum()
    df = df.sort_index(ascending=True)

    if df.index.tzinfo is None:
        if not all(ix.time() == datetime.time(0, 0) for ix in df.index[:100]):
            # Issue warning only if DataFrame doesn't look like EOD based.
            warnings.warn('DatetimeIndex is timezone-naive. Assuming to be in UTC.')
        offset = None
    else:
        # NOTE(review): reaches into the private `_utcoffset` attribute of the
        # tzinfo -- works for pytz fixed offsets, confirm for other tz types.
        offset = df.index.tzinfo._utcoffset.total_seconds() / 60

    # Remove invalid MongoDB field characters ('.' is illegal in field names).
    # Raw string avoids the invalid '\.' escape warning on modern Python.
    df = df.rename(columns=lambda x: re.sub(r'\.', '', str(x)))
    index = _make_bson_column(df.index)
    columns = SON()
    for col in df.columns:
        columns[col] = _make_bson_column(df[col])

    nrows = len(df)
    binary_size = sum(columns[col]['size'] for col in df.columns)
    binary_size += index['size']
    compression_ratio = binary_size / mem_usage
    # Reject documents near the 16 MB MongoDB limit with 5% headroom for the
    # non-binary parts of the document (uid, metadata, field names).
    if binary_size > 0.95 * MAX_BSON_SIZE:
        msg = f'Binary data size is too large ({binary_size:,} / {compression_ratio:.1%})'
        raise InvalidBSON(msg, compression_ratio)
    logger.debug(f'{uid} document: {binary_size:,} bytes ({compression_ratio:.1%}), {nrows} rows')
    add_meta = {'nrows': nrows, 'binary_size': binary_size, 'utc_offset': offset}
    metadata = {**metadata, **add_meta}

    doc = SON([
        ('uid', uid),
        ('start', df.index[0]),
        ('end', df.index[-1]),
        ('metadata', metadata),
        ('index', index),
        ('columns', columns)])

    return doc
# --- Ejemplo n.º 4 (example separator from the scraped source; vote count: 0) ---
 def _read(self):
     """Read one BSON document from ``self.fh`` and decode its elements.

     Unlike the other readers, this variant strips the 4-byte length prefix
     and the trailing EOO byte before handing the element bytes to
     ``_elements_to_dict``.

     Raises:
         InvalidBSON: if the chunk does not end with the EOO (0x00) byte.
         StopIteration: on a short/truncated read (EOF); ``self.eof`` is set.
     """
     try:
         size_bits = self.fh.read(4)
         # The BSON int32 length prefix counts its own four bytes.
         size = struct.unpack("<i", size_bits)[0] - 4
         data = self.fh.read(size)
         if len(data) != size:
             raise struct.error("Unable to cleanly read expected BSON Chunk; EOF, underful buffer or invalid object size.")
         # Compare a one-byte slice against b"\x00": indexing bytes on py3
         # yields an int, so the old `data[size - 1] != "\x00"` was always
         # True there.
         if data[size - 1:size] != b"\x00":
             raise InvalidBSON("Bad EOO in BSON Data")
         chunk = data[:size - 1]
         doc = _elements_to_dict(chunk, dict, True)
         return doc
     except struct.error as e:  # `except X as e` -- py2's `except X, e` is a py3 SyntaxError
         self.eof = True
         raise StopIteration(e)
# --- Ejemplo n.º 5 (example separator from the scraped source; vote count: 0) ---
 def _read(self):
     """Read one length-prefixed BSON document from ``self.fh`` and decode it
     with ``STREAMING_CODEC_OPTIONS``.

     Raises:
         InvalidBSON: if the chunk does not end with the EOO (0x00) byte.
         StopIteration: on a short/truncated read (EOF); ``self.eof`` is set.
     """
     try:
         size_bits = self.fh.read(4)
         # The BSON int32 length prefix counts its own four bytes.
         size = struct.unpack("<i", size_bits)[0] - 4
         data = size_bits + self.fh.read(size)
         if len(data) != size + 4:
             raise struct.error(
                 "Unable to cleanly read expected BSON Chunk; EOF, underful buffer or invalid object size."
             )
         # Compare a one-byte slice against b"\x00": indexing bytes on py3
         # yields an int, so the old `!= "\x00"` was always True there.
         if data[size + 4 - 1:size + 4] != b"\x00":
             raise InvalidBSON("Bad EOO in BSON Data")
         doc = BSON(data).decode(codec_options=STREAMING_CODEC_OPTIONS)
         return doc
     except struct.error as e:  # `except X as e` -- py2's `except X, e` is a py3 SyntaxError
         self.eof = True
         raise StopIteration(e)
# --- Ejemplo n.º 6 (example separator from the scraped source; vote count: 0) ---
 def _read(self):
     """Read the next BSON chunk from ``self.fh``.

     Returns the decoded document when ``self.decode`` is set, otherwise the
     raw chunk bytes.  Chunks not containing ``self.fast_string_prematch``
     raise :class:`ValueError`.

     Raises:
         InvalidBSON: if the chunk does not end with the EOO (0x00) byte.
         ValueError: if ``fast_string_prematch`` is absent from the chunk.
         StopIteration: on a short/truncated read (EOF); ``self.eof`` is set.
     """
     try:
         size_bits = self.fh.read(4)
         # The BSON int32 length prefix counts its own four bytes.
         size = struct.unpack("<i", size_bits)[0] - 4
         data = size_bits + self.fh.read(size)
         if len(data) != size + 4:
             raise struct.error(
                 "Unable to cleanly read expected BSON Chunk; EOF, underful buffer or invalid object size."
             )
         # Compare a one-byte slice against b"\x00": indexing bytes on py3
         # yields an int, so the old `!= "\x00"` was always True there.
         if data[size + 4 - 1:size + 4] != b"\x00":
             raise InvalidBSON("Bad EOO in BSON Data")
         # NOTE(review): assumes fast_string_prematch matches the type of
         # `data` (bytes on py3; the six-based sibling encodes it) -- confirm.
         if self.fast_string_prematch in data:
             if self.decode:
                 try:
                     return BSON(data).decode(tz_aware=True)
                 except TypeError:
                     # Older pymongo: decode() takes no tz_aware argument.
                     return BSON(data).decode()
             else:
                 return data
         raise ValueError("Unknown Error")
     except struct.error as e:  # `except X as e` -- py2's `except X, e` is a py3 SyntaxError
         self.eof = True
         raise StopIteration(e)