def dump_rows(outp, fd, store, compress=False, genrows_kwargs=None):
    '''
    Dump all rows from a store to a file as msgpacked core:save:add:rows events.

    Args:
        outp: Output object used for printing progress messages.
        fd: File-like object (binary write mode) the dump is written to.
        store: Storage object whose rows are produced via genStoreRows().
        compress (bool): If True, gzip-compress the msgpacked rows payload
            inside each event (max compression level 9).
        genrows_kwargs (dict): Optional kwargs for store.genStoreRows(),
            layered on top of the per-store-type preset arguments.
    '''
    outp.printf('Starting row dump')
    if not genrows_kwargs:
        genrows_kwargs = {}

    i = 0          # total rows dumped
    j = 0          # rows dumped since the last flush
    cur_bytes = 0  # bytes buffered since the last flush
    bufs = []

    kwargs = preset_args.get(store.getStoreType(), {})
    kwargs.update(genrows_kwargs)

    tick = time.time()
    for rows in store.genStoreRows(**kwargs):
        j += len(rows)
        i += len(rows)
        tufo = s_tufo.tufo('core:save:add:rows', rows=rows)
        if compress:
            tufo[1]['rows'] = gzip.compress(s_common.msgenpack(rows), 9)
        byts = s_common.msgenpack(tufo)
        bufs.append(byts)
        cur_bytes += len(byts)
        if cur_bytes > s_axon.megabyte * DUMP_MEGS:
            # Flush buffered events once we cross the size threshold.
            # (was: b''.join([byts for byts in bufs]) — redundant comprehension
            # which also shadowed the `byts` loop variable)
            fd.write(b''.join(bufs))
            outp.printf('Stored {} rows, total {} rows'.format(j, i))
            bufs = []
            cur_bytes = 0
            j = 0

    # There still may be rows we need to write out.
    if bufs:
        fd.write(b''.join(bufs))
        outp.printf('Stored {} rows, total {} rows'.format(j, i))
        bufs = []

    tock = time.time()
    outp.printf('Done dumping rows - took {} seconds.'.format(tock - tick))
    outp.printf('Dumped {} rows'.format(i))
def _addRows(self, rows):
    '''
    Adds a bunch of rows to the database

    Take care: this was written this way for performance, in particular when
    len(rows) is large.

    Args:
        rows: iterable of (iden, prop, valu, time) tuples to store.

    Raises:
        s_common.HitCoreLimit: if the primary key space or a property name
            length limit is exceeded.
        s_common.BadCoreStore: if a primary key unexpectedly already exists.
    '''
    encs = []

    with self._getTxn(write=True) as txn:
        next_pk = self.next_pk

        # First, we encode all the i, p, v, t for all rows
        for i, p, v, t in rows:
            if next_pk > MAX_PK:
                raise s_common.HitCoreLimit(name='MAX_PK', size=MAX_PK, mesg='Out of primary key values')
            if len(p) > MAX_PROP_LEN:
                raise s_common.HitCoreLimit(name='MAX_PROP_LEN', size=MAX_PROP_LEN, mesg='Property length too large')
            i_enc = _encIden(i)
            p_enc = _encProp(p)
            v_key_enc = _encValKey(v)
            t_enc = s_common.msgenpack(t)
            pk_enc = _encPk(next_pk)
            row_enc = s_common.msgenpack((i, p, v, t))

            # idx          0      1       2        3       4          5
            encs.append((i_enc, p_enc, row_enc, t_enc, v_key_enc, pk_enc))
            next_pk += 1

        # An iterator of what goes into the main table: key=pk_enc, val=encoded(i, p, v, t)
        kvs = ((x[5], x[2]) for x in encs)

        # Shove it all in at once.  NOTE(review): append=True presumably relies
        # on pk_enc keys being generated in ascending order — confirm _encPk
        # produces a monotonic byte ordering.
        consumed, added = txn.cursor(self.rows).putmulti(kvs, overwrite=False, append=True)
        if consumed != added or consumed != len(encs):
            # Will only fail if record already exists, which should never happen
            raise s_common.BadCoreStore(store='lmdb', mesg='unexpected pk in DB')

        # Update the indices for all rows: iden+prop, prop+valu+time, prop+time,
        # each mapping back to the encoded primary key.
        kvs = ((x[0] + x[1], x[5]) for x in encs)
        txn.cursor(self.index_ip).putmulti(kvs, dupdata=True)
        kvs = ((x[1] + x[4] + x[3], x[5]) for x in encs)
        txn.cursor(self.index_pvt).putmulti(kvs, dupdata=True)
        kvs = ((x[1] + x[3], x[5]) for x in encs)
        txn.cursor(self.index_pt).putmulti(kvs, dupdata=True)

        # self.next_pk should be protected from multiple writers. Luckily lmdb write lock does
        # that for us.
        self.next_pk = next_pk
def dump_blobs(outp, fd, store):
    '''
    Write every key/value pair from the store's blobstore to fd as
    msgpacked syn:core:blob:set events.
    '''
    count = 0
    outp.printf('Dumping blobstore')
    for key in store.getBlobKeys():
        valu = store.getBlobValu(key)
        event = s_tufo.tufo('syn:core:blob:set', key=key, valu=s_common.msgenpack(valu))
        fd.write(s_common.msgenpack(event))
        count += 1
    outp.printf('Done dumping {} keys from blobstore.'.format(count))
def _calcFirstLastKeys(prop, valu, mintime, maxtime):
    '''
    Returns the encoded bytes for the start and end keys to the pt or pvt
    index.  Helper function for _{get,del}RowsByProp
    '''
    p_enc = _encProp(prop)

    if valu is None:
        v_key_enc = b''
        v_is_hashed = False
    else:
        v_key_enc = _encValKey(valu)
        v_is_hashed = (v_key_enc[0] == HASH_VAL_MARKER_ENC)

    # No time bounds at all: caller can do a pure prefix scan.
    if mintime is None and maxtime is None:
        return (p_enc + v_key_enc, None, v_is_hashed, True)

    mintime_enc = b'' if mintime is None else s_common.msgenpack(mintime)
    maxtime_enc = MAX_TIME_ENC if maxtime is None else s_common.msgenpack(maxtime)

    return (p_enc + v_key_enc + mintime_enc,
            p_enc + v_key_enc + maxtime_enc,
            v_is_hashed,
            False)
def hashitem(item):
    '''
    Generate a uniq hash for the JSON compatible primitive data structure.
    '''
    # Normalize first so equivalent structures hash identically.
    packed = s_common.msgenpack(normitem(item))
    return hashlib.md5(packed).hexdigest()
def tx(self, mesg):
    '''
    Transmit a mesg tufo ( type, info ) via the socket using msgpack.
    If present this API is safe for use with a socket in a Plex().
    '''
    return self.txbytes(s_common.msgenpack(mesg))
def _delBlobValu(self, key):
    '''
    Remove a key from the blobstore, returning the stored bytes.
    '''
    packed_key = s_common.msgenpack(key.encode('utf-8'))
    with self._getTxn(write=True) as txn:  # type: lmdb.Transaction
        valu = txn.pop(packed_key, db=self.blob_store)
        if valu is None:  # pragma: no cover
            # We should never get here, but if we do, throw an exception.
            raise s_common.NoSuchName(name=key, mesg='Cannot delete key which is not present in the blobstore.')
    return valu
def _encValKey(v):
    '''
    Encode a value as used in a key.

    Non-negative numbers are msgpack encoded.  Negative numbers are encoded
    as a marker, then the encoded negative of that value, so that the
    ordering of the encodings is easily mapped to the ordering of the
    negative numbers.  Strings too long are hashed.  Note that this scheme
    prevents interleaving of value types: all string encodings compare
    larger than all negative number encodings compare larger than all
    nonnegative encodings.
    '''
    if not s_compat.isint(v):
        # String values: hash the large ones, marker-prefix the rest.
        if len(v) >= LARGE_STRING_SIZE:
            return HASH_VAL_MARKER_ENC + s_common.msgenpack(xxhash.xxh64(v).intdigest())
        return STRING_VAL_MARKER_ENC + s_common.msgenpack(v)

    if v >= 0:
        return s_common.msgenpack(v)

    return NEGATIVE_VAL_MARKER_ENC + s_common.msgenpack(-v)
def _delRowAndIndices(self, txn, pk_enc, i_enc=None, p_enc=None, v_key_enc=None, t_enc=None,
                      delete_ip=True, delete_pvt=True, delete_pt=True, only_if_val=None):
    '''
    Deletes the row corresponding to pk_enc and the indices pointing to it

    Pre-encoded key components (i_enc, p_enc, v_key_enc, t_enc) may be passed
    in by the caller to avoid re-encoding; any that are None are computed
    here from the decoded row as needed.

    Returns:
        bool: False if only_if_val was given and did not match the row's
        value (nothing deleted); True after a successful delete.

    Raises:
        s_common.BadCoreStore: if the row or one of its index entries is missing.
    '''
    with txn.cursor(self.rows) as cursor:
        if not cursor.set_key(pk_enc):
            raise s_common.BadCoreStore(store='lmdb', mesg='Missing PK')
        i, p, v, t = s_common.msgunpack(cursor.value())

        # Conditional delete: bail out before touching anything if the
        # stored value does not match.
        if only_if_val is not None and only_if_val != v:
            return False
        cursor.delete()

    # Lazily compute whichever encodings the requested index deletes need.
    if delete_ip and i_enc is None:
        i_enc = _encIden(i)

    if p_enc is None:
        p_enc = _encProp(p)

    if delete_pvt and v_key_enc is None:
        v_key_enc = _encValKey(v)

    if (delete_pvt or delete_pt) and t_enc is None:
        t_enc = s_common.msgenpack(t)

    if delete_ip:
        # Delete I-P index entry
        if not txn.delete(i_enc + p_enc, value=pk_enc, db=self.index_ip):
            raise s_common.BadCoreStore(store='lmdb', mesg='Missing I-P index')

    if delete_pvt:
        # Delete P-V-T index entry
        if not txn.delete(p_enc + v_key_enc + t_enc, value=pk_enc, db=self.index_pvt):
            raise s_common.BadCoreStore(store='lmdb', mesg='Missing P-V-T index')

    if delete_pt:
        # Delete P-T index entry
        if not txn.delete(p_enc + t_enc, value=pk_enc, db=self.index_pt):
            raise s_common.BadCoreStore(store='lmdb', mesg='Missing P-T index')

    return True
def _txSockMesg(self, sock, mesg):
    # handle the need to send on a socket in the plex
    byts = s_common.msgenpack(mesg)
    # Large payloads get wrapped in a sock:gzip envelope when the peer
    # advertised support.
    if len(byts) > 50000 and sock.get('sock:can:gzip'):
        byts = sockgzip(byts)

    with self._plex_lock:

        # we have no backlog!
        if sock.txbuf is None:

            byts = sock._tx_xform(byts)

            try:
                sent = sock.send(byts)

            except ssl.SSLError as e:
                # FIXME isolate this filth within link modules.
                sent = 0
                # errno 3 is ssl.SSL_ERROR_WANT_WRITE: retry later via the
                # txbuf path below; anything else is fatal for this sock.
                if e.errno != 3:
                    #logger.exception(e)
                    sock.fini()
                    return

            except Exception as e:
                #logger.exception(e)
                sock.fini()
                return

            blen = len(byts)
            if sent == blen:
                return

            # our send was a bit short...
            # Park the unsent remainder and register the sock with the plex
            # so the event loop finishes the send when writable.
            sock.txbuf = byts[sent:]
            sock.txsize += (blen - sent)
            sock.fire('sock:tx:size', size=sock.txsize)

            self._plex_txsocks.append(sock)
            self._plexWake()
            return

        # so... we have a backlog...
        sock.txque.append(byts)
        sock.txsize += len(byts)
        sock.fire('sock:tx:size', size=sock.txsize)
def tx(self, mesg):
    '''
    Transmit a mesg tufo ( type, info ) via the socket using msgpack.
    If present this API is safe for use with a socket in a Plex().
    '''
    # Sockets managed by a plex delegate transmission to it.
    if self.plex is not None:
        return self.plex._txSockMesg(self, mesg)

    try:
        byts = s_common.msgenpack(mesg)
        if len(byts) > 50000 and self.get('sock:can:gzip'):
            byts = sockgzip(byts)
        self.sendall(byts)
        return True
    except socket.error:
        self.fini()
        return False
def add(self, item):
    '''
    Add an item to the persistance storage.
    '''
    byts = s_common.msgenpack(item)
    size = len(byts)

    with self.fdlock:
        if self.isfini:
            raise s_common.IsFini()

        # Re-seek to the end if someone moved the file position on us.
        if self.fdoff != self.size:
            self.fd.seek(0, os.SEEK_END)

        off = self.size
        self.fd.write(byts)

        self.size += size
        self.fdoff = self.size

        return (off, size)
def main(argv, outp=None):
    '''
    CLI entry point: dump a cortex store (and optionally its blobstore)
    to a msgpack backup file.
    '''
    if outp is None:  # pragma: no cover
        outp = s_output.OutPut()

    parser = makeargpaser()
    opts = parser.parse_args(argv)

    if not opts.verbose:
        logging.disable(logging.DEBUG)

    # Refuse to clobber an existing backup unless forced.
    if os.path.isfile(opts.output) and not opts.force:
        outp.printf('Cannot overwrite a backup.')
        return 1

    genrows_kwargs = {}
    if opts.extra_args:
        with open(opts.extra_args, 'rb') as fd:
            genrows_kwargs = json.loads(fd.read().decode())

    storconf = {'rev:storage': True if opts.revstorage else False}

    backup_tufo = gen_backup_tufo(opts)

    with open(opts.output, 'wb') as fd:
        fd.write(s_common.msgenpack(backup_tufo))
        with s_cortex.openstore(opts.store, storconf=storconf) as store:
            dump_store(outp, fd, store,
                       compress=opts.compress,
                       dump_blobstore=opts.dump_blobstore,
                       genrows_kwargs=genrows_kwargs)

    outp.printf('Fin')
    return 0
def setBlobValu(self, key, valu):
    '''
    Set a value in the blob key/value (KV) store.

    This resides below the tufo storage layer and is Cortex implementation
    dependent.  In purely memory backed cortexes, this KV store may not be
    persistent, even if the tufo-layer is persistent, through something
    such as the savefile mechanism.

    Notes:
        Data which is stored in the KV store is msgpacked, so caveats with
        that apply.

    Args:
        key (str): Name of the value to store.
        valu: Value to store in the KV store.

    Returns:
        The input value, unchanged.
    '''
    byts = s_common.msgenpack(valu)
    self._setBlobValu(key, byts)
    # Mirror the change onto the savebus so save-file consumers stay in sync.
    self.savebus.fire('syn:core:blob:set', key=key, valu=byts)
    return valu
def getMeldBytes(self):
    '''
    Serialize the MindMeld dictionary with msgpack and return the bytes.
    '''
    byts = s_common.msgenpack(self.info)
    return byts
def enMsgB64(item):
    '''
    Msgpack-encode item and return it as a base64 string.
    '''
    # FIXME find a way to go directly from binary bytes to
    # base64 *string* to avoid the extra decode pass..
    packed = s_common.msgenpack(item)
    return base64.b64encode(packed).decode('utf8')
MAX_PK_BYTES = 8 if sys.maxsize > 2**32 else 4 # Prefix to indicate that a v is a nonnegative value NONNEGATIVE_VAL_MARKER = 0 # Prefix to indicate that a v is a negative value NEGATIVE_VAL_MARKER = -1 # Prefix to indicate than a v is a string STRING_VAL_MARKER = -2 # Prefix to indicate that a v is hash of a string HASH_VAL_MARKER = -3 # The negative marker encoded NEGATIVE_VAL_MARKER_ENC = s_common.msgenpack(NEGATIVE_VAL_MARKER) # The string marker encoded STRING_VAL_MARKER_ENC = s_common.msgenpack(STRING_VAL_MARKER) # The hash marker encoded HASH_VAL_MARKER_ENC = s_common.msgenpack(HASH_VAL_MARKER) # Number of bytes in a UUID UUID_SIZE = 16 # An index key can't ever be larger (lexicographically) than this MAX_INDEX_KEY = b'\xff' * 20 # String vals of this size or larger will be truncated and hashed in index. What this means is # that comparison on large string vals require retrieving the row from the main table
def _getBlobValu(self, key):
    '''
    Fetch the raw bytes stored in the blobstore for key, or None.
    '''
    packed_key = s_common.msgenpack(key.encode('utf-8'))
    with self._getTxn() as txn:  # type: lmdb.Transaction
        return txn.get(packed_key, default=None, db=self.blob_store)
def _setBlobValu(self, key, valu):
    '''
    Store raw bytes in the blobstore under key, replacing any prior value.
    '''
    packed_key = s_common.msgenpack(key.encode('utf-8'))
    with self._getTxn(write=True) as txn:  # type: lmdb.Transaction
        txn.put(packed_key, valu, overwrite=True, db=self.blob_store)
def sockgzip(byts):
    '''
    Wrap byts in a msgpacked ('sock:gzip', ...) envelope with a
    zlib-compressed payload.
    '''
    compressed = zlib.compress(byts)
    return s_common.msgenpack(('sock:gzip', {'data': compressed}))
def savemesg(mesg):
    # Append one msgpack-encoded message to the (closed-over) save fd.
    byts = s_common.msgenpack(mesg)
    fd.write(byts)