def _read(self):
    try:
        size_bits = self.fh.read(4)
        size = struct.unpack("<i", size_bits)[0] - 4  # BSON size prefix includes itself
        data = size_bits + self.fh.read(size)
        if len(data) != size + 4:
            raise struct.error(
                "Unable to cleanly read expected BSON chunk; "
                "EOF, underfull buffer or invalid object size.")
        # Python 2: indexing a str yields a one-char str, so compare to "\x00".
        if data[size + 4 - 1] != "\x00":
            raise InvalidBSON("Bad EOO in BSON data")
        if self.fast_string_prematch:
            if self.fast_string_prematch in data:
                doc = BSON(data).decode(tz_aware=True)
            else:
                # Cheap placeholder for documents that cannot match.
                doc = {"_id": 0}
        else:
            doc = BSON(data).decode(tz_aware=True)
        return doc
    except struct.error as e:
        self.eof = True
        raise StopIteration(e)
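# Aside: a minimal, self-contained sketch of the BSON framing the reader
# above relies on. The first four bytes of a BSON document are a
# little-endian int32 holding the total document length (including the
# prefix itself), and the last byte is a 0x00 end-of-object marker;
# that is exactly what the "size - 4" arithmetic and the EOO check verify.
import struct
from bson import BSON

raw = BSON.encode({'a': 1})
size = struct.unpack('<i', raw[:4])[0]
assert size == len(raw)       # the length prefix counts itself
assert raw[-1:] == b'\x00'    # end-of-object (EOO) terminator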
def _read(self):
    try:
        size_bits = self.fh.read(4)
        size = struct.unpack("<i", size_bits)[0] - 4  # BSON size prefix includes itself
        data = size_bits + self.fh.read(size)
        if len(data) != size + 4:
            raise struct.error(
                "Unable to read expected BSON chunk; "
                "EOF, underfull buffer or invalid object size.")
        if six.PY3:
            eoo = 0x00  # indexing bytes yields an int on Python 3
        else:  # six.PY2
            eoo = "\x00"
        if data[size + 4 - 1] != eoo:
            raise InvalidBSON("Bad EOO in BSON data")
        if self.fast_string_prematch.encode("utf-8") in data:
            if self.decode:
                try:
                    return BSON(data).decode(self.codec)
                except TypeError:
                    # Fall back to default options if self.codec is rejected.
                    return BSON(data).decode()
            else:
                return data
        raise ValueError("Unknown error")
    except struct.error as e:
        self.eof = True
        raise StopIteration(e)
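# Aside: why the raw-bytes prematch above is a useful fast path. BSON
# stores string values as plain UTF-8 inside the encoded document, so a
# bytes containment test can rule a document out without paying for a
# full decode. A small sketch (the values are purely illustrative):
from bson import BSON

raw = BSON.encode({'name': 'alice'})
assert b'alice' in raw     # UTF-8 value bytes appear verbatim
assert b'bob' not in raw   # a miss lets the reader skip decoding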
def _make_bson_doc(uid: str, df: pd.DataFrame, metadata) -> SON:
    """Takes a DataFrame and makes a BSON document ready to be inserted into MongoDB.

    Given Conritick's focus on timeseries data, the input DataFrame index must be a
    DatetimeIndex. Column names are kept and saved as strings. The index name is
    explicitly discarded and not saved.

    :param uid: Unique ID for the timeseries represented by the input DataFrame
    :param df: Input DataFrame
    :param metadata: Any BSON-able objects to be attached to the document as metadata
    :return: BSON document
    """
    mem_usage = df.memory_usage().sum()
    df = df.sort_index(ascending=True)
    if df.index.tzinfo is None:
        if not all(ix.time() == datetime.time(0, 0) for ix in df.index[:100]):
            # Warn only if the DataFrame doesn't look EOD-based.
            warnings.warn('DatetimeIndex is timezone-naive. Assuming it to be in UTC.')
        offset = None
    else:
        offset = df.index.tzinfo._utcoffset.total_seconds() / 60

    # Remove invalid MongoDB field characters.
    df = df.rename(columns=lambda x: re.sub(r'\.', '', str(x)))
    index = _make_bson_column(df.index)
    columns = SON()
    for col in df.columns:
        columns[col] = _make_bson_column(df[col])

    nrows = len(df)
    binary_size = sum([columns[col]['size'] for col in df.columns])
    binary_size += index['size']
    compression_ratio = binary_size / mem_usage
    if binary_size > 0.95 * MAX_BSON_SIZE:
        msg = f'Binary data size is too large ({binary_size:,} / {compression_ratio:.1%})'
        raise InvalidBSON(msg, compression_ratio)
    logger.debug(f'{uid} document: {binary_size:,} bytes ({compression_ratio:.1%}), {nrows} rows')

    add_meta = {'nrows': nrows, 'binary_size': binary_size, 'utc_offset': offset}
    metadata = {**metadata, **add_meta}
    doc = SON([
        ('uid', uid),
        ('start', df.index[0]),
        ('end', df.index[-1]),
        ('metadata', metadata),
        ('index', index),
        ('columns', columns)])
    return doc
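# Aside: a hedged usage sketch for _make_bson_doc. It assumes this
# module's _make_bson_column, MAX_BSON_SIZE and logger are in scope
# (they are not shown in this section); the uid and values are made up.
import pandas as pd

idx = pd.date_range('2024-01-02', periods=3, freq='D', tz='UTC')
df = pd.DataFrame({'close': [1.0, 2.0, 3.0]}, index=idx)
doc = _make_bson_doc('DEMO', df, metadata={'source': 'example'})
# doc is a SON ordered as uid, start, end, metadata, index, columns,
# with metadata augmented by nrows, binary_size and utc_offset.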
def _read(self): try: size_bits = self.fh.read(4) size = struct.unpack("<i", size_bits)[0] - 4 # BSON size byte includes itself data = self.fh.read(size) if len(data) != size: raise struct.error("Unable to cleanly read expected BSON Chunk; EOF, underful buffer or invalid object size.") if data[size - 1] != "\x00": raise InvalidBSON("Bad EOO in BSON Data") chunk = data[:size - 1] doc = _elements_to_dict(chunk, dict, True) return doc except struct.error, e: #print >> sys.stderr, "Parsing Length record failed: %s" % e self.eof = True raise StopIteration(e)
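# Aside: modern pymongo ships a public equivalent of this whole pattern,
# bson.decode_file_iter, which yields one decoded document per BSON chunk
# in a file-like object. Where the private _elements_to_dict helper is
# unavailable, it is the supported route (the path is illustrative):
import bson

with open('dump.bson', 'rb') as fh:
    for doc in bson.decode_file_iter(fh):
        pass  # process each decoded document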
def _read(self):
    try:
        size_bits = self.fh.read(4)
        size = struct.unpack("<i", size_bits)[0] - 4  # BSON size prefix includes itself
        data = size_bits + self.fh.read(size)
        if len(data) != size + 4:
            raise struct.error(
                "Unable to cleanly read expected BSON chunk; "
                "EOF, underfull buffer or invalid object size.")
        if data[size + 4 - 1] != "\x00":
            raise InvalidBSON("Bad EOO in BSON data")
        doc = BSON(data).decode(codec_options=STREAMING_CODEC_OPTIONS)
        return doc
    except struct.error as e:
        self.eof = True
        raise StopIteration(e)
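# Aside: STREAMING_CODEC_OPTIONS is not defined in this section. A
# plausible definition, assuming it mirrors the tz-aware decoding used
# by the sibling variants, would be built from pymongo's CodecOptions:
from bson.codec_options import CodecOptions

STREAMING_CODEC_OPTIONS = CodecOptions(tz_aware=True)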
def _read(self):
    try:
        size_bits = self.fh.read(4)
        size = struct.unpack("<i", size_bits)[0] - 4  # BSON size prefix includes itself
        data = size_bits + self.fh.read(size)
        if len(data) != size + 4:
            raise struct.error(
                "Unable to cleanly read expected BSON chunk; "
                "EOF, underfull buffer or invalid object size.")
        if data[size + 4 - 1] != "\x00":
            raise InvalidBSON("Bad EOO in BSON data")
        if self.fast_string_prematch in data:
            if self.decode:
                try:
                    return BSON(data).decode(tz_aware=True)
                except TypeError:
                    # pymongo 3.x dropped the tz_aware kwarg; use defaults.
                    return BSON(data).decode()
            else:
                return data
        raise ValueError("Unknown error")
    except struct.error as e:
        self.eof = True
        raise StopIteration(e)
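# Aside: none of the variants above shows its enclosing class, but the
# StopIteration raised on EOF suggests _read is driven by the iterator
# protocol. A minimal, hypothetical wrapper (fh/eof match the attribute
# names used in the snippets; everything else is an assumption):
import struct
import bson

class BSONFileIterator:
    """Hypothetical driver for the _read variants above (a sketch)."""

    def __init__(self, fh):
        self.fh = fh       # binary file handle positioned at a document
        self.eof = False

    def __iter__(self):
        return self

    def __next__(self):
        if self.eof:
            raise StopIteration
        return self._read()

    def _read(self):
        # Compressed restatement of the framing logic shown above.
        size_bits = self.fh.read(4)
        if len(size_bits) < 4:  # clean EOF: no more documents
            self.eof = True
            raise StopIteration
        size = struct.unpack('<i', size_bits)[0] - 4
        data = size_bits + self.fh.read(size)
        return bson.BSON(data).decode()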