def provide_batch(self):
    if self.done:
        return 0, None

    batch = pump.Batch(self)

    if self.file_iter == None:
        self.f = self.spec.replace(JSON_SCHEME, "")
        files = list()

        if os.path.isfile(self.f) and self.f.endswith(".zip"):
            zfobj = zipfile.ZipFile(self.f)
            self.working_dir = tempfile.mkdtemp()
            ZipUtil(zfobj).extractall(self.working_dir)
            JSONSource.enumerate_files(self.working_dir, files, False)
        elif os.path.isdir(self.f):
            JSONSource.enumerate_files(self.f, files, False)
        else:
            try:
                fp = open(self.f, 'r')
                dockey = JSONSource.gen_dockey(os.path.basename(self.f))
                self.save_doc(batch, dockey, fp, True)
                fp.close()
            except IOError, error:
                return "error: could not open json: %s; exception: %s" % \
                    (self.f, error), None
            self.done = True
            return 0, batch

        if len(files) > 0:
            self.file_iter = iter(files)
def provide_batch(self): """Provides a batch of messages, with GET/SET ratios and keys controlled by a mcsoda-inspired approach, but simpler.""" if self.done: return 0, None cfg = self.source_map['cfg'] prefix = cfg['prefix'] max_items = cfg['max-items'] ratio_sets = cfg['ratio-sets'] exit_after_creates = cfg['exit-after-creates'] json = cfg['json'] if not self.body: min_value_body = "0" * cfg['min-value-size'] if json: self.body = '{"name": "%s%s", "age": %s, "index": %s,' + \ ' "body": "%s"}' % min_value_body else: self.body = min_value_body batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] vbucket_id = 0x0000ffff cas, exp, flg = 0, 0, 0 while (batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if ratio_sets >= float(self.cur_sets) / float(self.cur_ops or 1): self.cur_sets = self.cur_sets + 1 cmd = memcacheConstants.CMD_TAP_MUTATION if self.cur_items < max_items: key = self.cur_items self.cur_items = self.cur_items + 1 else: key = self.cur_sets % self.cur_items else: self.cur_gets = self.cur_gets + 1 cmd = memcacheConstants.CMD_GET key = self.cur_gets % self.cur_items self.cur_ops = self.cur_ops + 1 if json: value = self.body % (prefix, key, key % 101, key) else: value = self.body msg = (cmd, vbucket_id, prefix + str(key), flg, exp, cas, '', value) batch.append(msg, len(value)) if exit_after_creates and self.cur_items >= max_items: self.done = True return 0, batch if batch.size() <= 0: return 0, None return 0, batch
def provide_batch(self) -> Tuple[couchbaseConstants.PUMP_ERROR, Optional[pump.Batch]]:
    if self.done:
        return 0, None

    if not self.r:
        try:
            self.r = csv.reader(open(self.spec, 'r', encoding='utf-8'))
            self.fields = next(self.r)
            if not 'id' in self.fields:
                return f'error: no \'id\' field in 1st line of csv: {self.spec}', None
        except StopIteration:
            return f'error: could not read 1st line of csv: {self.spec}', None
        except IOError as e:
            return f'error: could not open csv: {self.spec}; exception: {e!s}', None

    batch = pump.Batch(self)

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    cmd = couchbaseConstants.CMD_TAP_MUTATION
    vbucket_id = 0x0000ffff

    while (self.r and
           batch.size() < batch_max_size and
           batch.bytes < batch_max_bytes):
        try:
            vals = next(self.r)
            doc = {}
            for i, field in enumerate(self.fields):
                if i >= len(vals):
                    continue
                if field == 'id':
                    doc[field] = vals[i]
                else:
                    doc[field] = number_try_parse(vals[i])
            if doc['id']:
                msg: couchbaseConstants.BATCH_MSG = (cmd, vbucket_id, doc['id'].encode(), 0, 0, 0, b'',
                                                     literal_eval(doc['value']), 0, 0, 0, 0)
                batch.append(msg, len(doc))
        except StopIteration:
            self.done = True
            self.r = None
        except Exception as e:
            logging.error(f'error: fails to read from csv file {e}')
            continue

    if batch.size() <= 0:
        return 0, None

    return 0, batch
def provide_batch(self):
    if self.done:
        return 0, None

    if not self.r:
        try:
            self.r = csv.reader(open(self.spec, 'r', encoding='utf-8'))
            self.fields = next(self.r)
            if not 'id' in self.fields:
                return ("error: no 'id' field in 1st line of csv: %s" %
                        (self.spec)), None
        except StopIteration:
            return ("error: could not read 1st line of csv: %s" %
                    (self.spec)), None
        except IOError as e:
            return ("error: could not open csv: %s; exception: %s" %
                    (self.spec, e)), None

    batch = pump.Batch(self)

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    cmd = couchbaseConstants.CMD_TAP_MUTATION
    vbucket_id = 0x0000ffff

    while (self.r and
           batch.size() < batch_max_size and
           batch.bytes < batch_max_bytes):
        try:
            vals = next(self.r)
            doc = {}
            for i, field in enumerate(self.fields):
                if i >= len(vals):
                    continue
                if field == 'id':
                    doc[field] = vals[i]
                else:
                    doc[field] = number_try_parse(vals[i])
            if doc['id']:
                msg = (cmd, vbucket_id, doc['id'], 0, 0, 0, '', doc['value'], 0, 0, 0, 0)
                batch.append(msg, len(doc))
        except StopIteration:
            self.done = True
            self.r = None
        except Exception as e:
            logging.error("error: fails to read from csv file, %s", e)
            continue

    if batch.size() <= 0:
        return 0, None

    return 0, batch
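# Illustrative sketch (not part of the original source): the CSV shape the two
# readers above expect. 'people.csv' and the literal rows are made-up examples;
# the first line supplies self.fields and must contain 'id', and every other
# column is run through number_try_parse() (presumably left as a string when it
# does not parse as a number).
import csv

with open('people.csv', 'r', encoding='utf-8') as f:  # e.g. "id,name,age\ndoc-1,alice,31\n"
    reader = csv.reader(f)
    fields = next(reader)            # ['id', 'name', 'age']
    row = next(reader)               # ['doc-1', 'alice', '31']
    doc = dict(zip(fields, row))     # {'id': 'doc-1', 'name': 'alice', 'age': '31'}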
def provide_batch(self) -> Tuple[couchbaseConstants.PUMP_ERROR, Optional[pump.Batch]]:
    if self.done:
        return 0, None

    # During the first iteration load the file names, this is only run once
    if not self.docs:
        self.prepare_docs()

    batch = pump.Batch(self)
    f = self.spec.replace(JSON_SCHEME, "")
    batch_max_size = self.opts.extra['batch_max_size']

    # Each iteration should return a batch or mark the loading as finished
    if os.path.isfile(f) and f.endswith(".zip"):
        zf = zipfile.ZipFile(f)
        while batch.size() < batch_max_size and self.docs:
            path = self.docs.pop()
            key = os.path.basename(path)
            if key.endswith('.json'):
                key = key[:-5]
            value = zf.read(path)
            self.save_doc(batch, key, value)
        zf.close()
    else:
        while batch.size() < batch_max_size and self.docs:
            path = self.docs.pop()
            key = os.path.basename(path)
            if key.endswith('.json'):
                key = key[:-5]
            try:
                fp = open(path, 'rb')
                value = fp.read()
                fp.close()
                self.save_doc(batch, key.encode(), value)
            except IOError as error:
                logging.error(f'Fail to load json file with error: {error!s}')

    if not self.docs:
        self.done = True

    return 0, batch
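# Minimal sketch (not part of the original source) of the key derivation used in
# provide_batch() above: the document key is the file's basename with a trailing
# ".json" stripped. The sample path is hypothetical.
import os

def json_doc_key(path):
    key = os.path.basename(path)                        # ".../users/42.json" -> "42.json"
    return key[:-5] if key.endswith('.json') else key   # "42.json" -> "42"

assert json_doc_key('/tmp/bucket/users/42.json') == '42'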
def loader(self): rv, d = data_dir(self.spec) if rv != 0: self.queue.put((rv, None)) return source_vbucket_state = \ getattr(self.opts, 'source_vbucket_state', 'active') source_nodes = self.source_bucket['nodes'] if len(source_nodes) != 1: self.queue.put(( f'error: expected 1 node in source_bucket: {self.source_bucket["name"]}', None)) return vbucket_states = source_nodes[0].get('vbucket_states', None) if not vbucket_states: self.queue.put(( f'error: missing vbucket_states in source_bucket: {self.source_bucket["name"]}', None)) return vbuckets = vbucket_states.get(source_vbucket_state, None) if vbuckets is None: # Empty dict is valid. self.queue.put(( f'error: missing vbuckets in source_bucket: {self.source_bucket["name"]}', None)) return batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] store = None vbucket_id = None # Level of indirection since we can't use python 3 nonlocal statement. abatch: List[pump.Batch] = [pump.Batch(self)] def change_callback(doc_info): if doc_info: # Handle the new key name spacing for collections and co cid, key = decodeCollectionID(doc_info.id.encode()) # Only support keys in the _default collection if cid != 0: logging.debug('Skipping as not default collection') return if self.skip(key, vbucket_id): return if doc_info.deleted: cmd = couchbaseConstants.CMD_DCP_DELETE val = b'' else: cmd = couchbaseConstants.CMD_DCP_MUTATION val = doc_info.getContents( options=couchstore.CouchStore.DECOMPRESS) try: rev_meta_bytes = doc_info.revMeta.get_bytes() if len(rev_meta_bytes) == 18: conf_res = 0 cas, exp, flg, flex_meta, dtype = struct.unpack( SFD_REV_META, rev_meta_bytes) elif len(rev_meta_bytes) == 19: cas, exp, flg, flex_meta, dtype, conf_res = struct.unpack( SFD_REV_META_PRE_4_6, rev_meta_bytes) else: raise ValueError( 'Does not match pre- or post-4.6 format') meta = bytes([doc_info.revSequence]) seqno = doc_info.sequence nmeta = 0 msg = (cmd, vbucket_id, key, flg, exp, cas, meta, val, seqno, dtype, nmeta, conf_res) abatch[0].append(msg, len(val)) except Exception as e: self.queue.put(( f'error: could not read couchstore file due to unsupported file format version;' f' exception: {e}', None)) return if (abatch[0].size() >= batch_max_size or abatch[0].bytes >= batch_max_bytes): self.queue.put((0, abatch[0])) abatch[0] = pump.Batch(self) for f in latest_couch_files(f'{d}/{self.source_bucket["name"]}'): vbucket_id = int(re.match(SFD_RE, os.path.basename(f)).group(1)) if not vbucket_id in vbuckets: continue try: store = couchstore.CouchStore(f, 'r') store.forEachChange(0, change_callback) store.close() except Exception as e: # MB-12270: Some files may be deleted due to compaction. We can # safely ignore them and move to next file. pass if abatch[0].size(): self.queue.put((0, abatch[0])) self.queue.put((0, None))
def provide_dcp_batch_actual( self ) -> Tuple[couchbaseConstants.PUMP_ERROR, Optional[pump.Batch]]: batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] delta_ack_size = batch_max_bytes * 10 / 4 # ack every 25% of buffer size last_processed = 0 total_bytes_read = 0 vbid: int = 0 start_seqno: int = 0 end_seqno: int = 0 vb_uuid: int = 0 ss_start_seqno: int = 0 ss_end_seqno: int = 0 no_response_count: int = 0 try: while (not self.dcp_done and batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if self.response.empty(): if len(self.stream_list) > 0: logging.debug( f'no response while there {len(self.stream_list)} active streams' ) time.sleep(.25) no_response_count = no_response_count + 1 # if not had a response after a minimum of 30 seconds then state we are done if no_response_count == 120: logging.warning( f'no response for 30 seconds while there {len(self.stream_list)}' ' active streams') self.dcp_done = True else: self.dcp_done = True continue no_response_count = 0 unprocessed_size = total_bytes_read - last_processed if unprocessed_size > delta_ack_size: rv = self.ack_buffer_size(unprocessed_size) if rv: logging.error(rv) else: last_processed = total_bytes_read cmd, errcode, opaque, cas, keylen, extlen, data, datalen, dtype, bytes_read = \ self.response.get() # type: int, int, int, int, int, int, bytes, int, int, int total_bytes_read += bytes_read rv = 0 metalen = flags = flg = exp = 0 key = val = ext = b'' need_ack = False seqno = 0 if cmd == couchbaseConstants.CMD_DCP_REQUEST_STREAM: total_bytes_read -= bytes_read if errcode == couchbaseConstants.ERR_SUCCESS: pair_index = (self.source_bucket['name'], self.source_node['hostname']) start = 0 step = DCPStreamSource.HIGH_SEQNO_BYTE + DCPStreamSource.UUID_BYTE while start + step <= datalen: uuid, seqno = struct.unpack( couchbaseConstants.DCP_VB_UUID_SEQNO_PKT_FMT, data[start:start + step]) if pair_index not in self.cur['failoverlog']: self.cur['failoverlog'][pair_index] = {} if opaque not in self.cur['failoverlog'][pair_index] or \ not self.cur['failoverlog'][pair_index][opaque]: self.cur['failoverlog'][pair_index][opaque] = [ (uuid, seqno) ] else: self.cur['failoverlog'][pair_index][ opaque].append((uuid, seqno)) start = start + step elif errcode == couchbaseConstants.ERR_KEY_ENOENT: logging.warn( "producer doesn't know about the vbucket uuid, rollback to 0" ) vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno = \ self.stream_list[opaque] del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_KEY_EEXISTS: logging.warning( f'a stream exists on the connection for vbucket: {opaque}' ) elif errcode == couchbaseConstants.ERR_NOT_MY_VBUCKET: logging.warning( f'Vbucket is not active anymore, skip it:{vbid!s}') del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_ERANGE: logging.warning( f'Start or end sequence numbers specified incorrectly,({start_seqno},' f' {end_seqno})') del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_ROLLBACK: vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_stop_seqno = \ self.stream_list[opaque] start_seqno, = struct.unpack( couchbaseConstants.DCP_VB_SEQNO_PKT_FMT, data) # find the most latest uuid, hi_seqno that fit start_seqno if self.cur['failoverlog']: pair_index = (self.source_bucket['name'], self.source_node['hostname']) if self.cur['failoverlog'][pair_index].get("vbid"): for uuid, seqno in self.cur['failoverlog'][ pair_index][vbid]: if 
start_seqno >= seqno: vb_uuid = uuid break ss_start_seqno = start_seqno ss_end_seqno = start_seqno self.request_dcp_stream(vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno) del self.stream_list[opaque] self.stream_list[opaque] = \ (vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno) else: logging.error(f'unprocessed errcode: {errcode}') del self.stream_list[opaque] elif cmd == couchbaseConstants.CMD_DCP_MUTATION: vbucket_id = errcode seqno, rev_seqno, flg, exp, locktime, metalen, nru = \ struct.unpack(couchbaseConstants.DCP_MUTATION_PKT_FMT, data[0:extlen]) key_start = extlen val_start = key_start + keylen val_len = datalen - keylen - metalen - extlen meta_start = val_start + val_len key = data[extlen:val_start] val = data[val_start:meta_start] conf_res = 0 if meta_start < datalen: # handle extra conflict resolution fields extra_meta = data[meta_start:] extra_index = 0 version = extra_meta[extra_index] extra_index += 1 while extra_index < metalen: id, extlen = struct.unpack( couchbaseConstants.DCP_EXTRA_META_PKG_FMT, extra_meta[extra_index:extra_index + 3]) extra_index += 3 if id == couchbaseConstants.DCP_EXTRA_META_CONFLICT_RESOLUTION: if extlen == 1: conf_res, = struct.unpack( ">B", extra_meta[extra_index:extra_index + 1]) elif extlen == 2: conf_res, = struct.unpack( ">H", extra_meta[extra_index:extra_index + 2]) elif extlen == 4: conf_res, = struct.unpack( ">I", extra_meta[extra_index:extra_index + 4]) elif extlen == 8: conf_res, = struct.unpack( ">Q", extra_meta[extra_index:extra_index + 8]) else: logging.error( f'unsupported extra meta data format: {extlen:d}' ) conf_res = 0 extra_index += extlen if not self.skip(key, vbucket_id): dtype, val = self.maybe_uncompress_value(dtype, val) msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno.to_bytes(8, 'big'), val, seqno, dtype, metalen, conf_res) batch.append(msg, len(val)) self.num_msg += 1 elif cmd in [ couchbaseConstants.CMD_DCP_DELETE, couchbaseConstants.CMD_DCP_EXPIRATION ]: vbucket_id = errcode seqno, rev_seqno, metalen = struct.unpack( couchbaseConstants.DCP_DELETE_PKT_FMT, data[0:extlen]) key_start = extlen val_start = key_start + keylen key = data[extlen:val_start] # If the delete has the Xattr data type get the Xattrs from the body if dtype & couchbaseConstants.DATATYPE_HAS_XATTR: val = data[val_start:] if not self.skip(key, vbucket_id): dtype, val = self.maybe_uncompress_value(dtype, val) msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno.to_bytes(8, 'big'), val, seqno, dtype, metalen, 0) batch.append(msg, len(val)) self.num_msg += 1 if cmd == couchbaseConstants.CMD_DCP_DELETE: batch.adjust_size += 1 elif cmd == couchbaseConstants.CMD_DCP_FLUSH: total_bytes_read -= bytes_read logging.warning("stopping: saw CMD_DCP_FLUSH") self.dcp_done = True break elif cmd == couchbaseConstants.CMD_DCP_END_STREAM: del self.stream_list[opaque] if not len(self.stream_list): self.dcp_done = True elif cmd == couchbaseConstants.CMD_DCP_SNAPSHOT_MARKER: ss_start_seqno, ss_end_seqno, _ = struct.unpack( couchbaseConstants.DCP_SNAPSHOT_PKT_FMT, data[0:extlen]) pair_index = (self.source_bucket['name'], self.source_node['hostname']) if not self.cur['snapshot']: self.cur['snapshot'] = {} if pair_index not in self.cur['snapshot']: self.cur['snapshot'][pair_index] = {} self.cur['snapshot'][pair_index][opaque] = (ss_start_seqno, ss_end_seqno) elif cmd == couchbaseConstants.CMD_DCP_NOOP: total_bytes_read -= bytes_read need_ack = True elif cmd == couchbaseConstants.CMD_DCP_BUFFER_ACK: total_bytes_read -= 
bytes_read if errcode != couchbaseConstants.ERR_SUCCESS: logging.warning( f'buffer ack response errcode: {errcode}') continue else: total_bytes_read -= bytes_read logging.warning(f'warning: unexpected DCP message: {cmd}') return f'unexpected DCP message: {cmd}', batch if need_ack: self.ack_last = True try: if self.dcp_conn is not None: self.dcp_conn._send_msg( cmd, b'', b'', opaque, vbucket_id=0, fmt=couchbaseConstants.RES_PKT_FMT, magic=couchbaseConstants.RES_MAGIC_BYTE) except socket.error: return f'error: socket.error on sendall(); perhaps the source server:' \ f' {self.source_node["hostname"]} was rebalancing or had' \ f' connectivity/server problems', batch except EOFError: self.dcp_done = True return f'error: EOFError on socket sendall(); perhaps the source server:' \ f' {self.source_node["hostname"]} was rebalancing or had ' \ f'connectivity/server problems', batch # Close the batch when there's an ACK handshake, so # the server can concurrently send us the next batch. # If we are slow, our slow ACK's will naturally slow # down the server. self.ack_buffer_size(total_bytes_read - last_processed) return 0, batch self.ack_last = False self.cmd_last = cmd except EOFError: if batch.size() <= 0 and self.ack_last: # A closed conn after an ACK means clean end of TAP dump. self.dcp_done = True if batch.size() <= 0: return 0, None self.ack_buffer_size(total_bytes_read - last_processed) return 0, batch
def provide_upr_batch_actual(self, upr_conn):
    batch = pump.Batch(self)

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']
    vbid = 0
    cmd = 0
    start_seqno = 0
    end_seqno = 0
    vb_uuid = 0
    hi_seqno = 0

    try:
        while (not self.upr_done and
               batch.size() < batch_max_size and
               batch.bytes < batch_max_bytes):
            #if not self.queue.empty():
            #    vbid, cmd, start_seqno, end_seqno, vb_uuid, hi_seqno = self.queue.get()
            #    self.request_upr_stream(vbid, 0, start_seqno, end_seqno, vb_uuid, hi_seqno)
            if self.response.empty():
                if len(self.stream_list) > 0:
                    time.sleep(.25)
                else:
                    self.upr_done = True
                continue

            cmd, errcode, opaque, cas, keylen, extlen, data, datalen, dtype = \
                self.response.get()
            #self.recv_upr_msg(self.upr_conn.s)
            #print cmd, errcode, opaque, cas, keylen, extlen, data
            #assert opaque == int(vbid), "expected opaque '%s', got '%s'" % (vbid, opaque)
            rv = 0
            metalen = flags = flg = exp = 0
            key = val = ext = ''
            need_ack = False
            seqno = 0
            if cmd == couchbaseConstants.CMD_UPR_REQUEST_STREAM:
                if errcode == couchbaseConstants.ERR_SUCCESS:
                    start = 0
                    step = UPRStreamSource.HIGH_SEQNO_BYTE + UPRStreamSource.UUID_BYTE
                    #while start + step <= datalen:
                    #    uuid, seqno = struct.unpack(couchbaseConstants.UPR_VB_UUID_SEQNO_PKT_FMT, \
                    #                                data[start:start + step])
                    #    #print "vbuuid: %s, seqno:%s" % (uuid, seqno)
                    #    start = start + step
                elif errcode == couchbaseConstants.ERR_KEY_ENOENT:
                    logging.warn("producer doesn't know about the vbucket uuid, rollback to 0")
                    vbid, flags, start_seqno, end_seqno, vb_uuid, hi_seqno = self.stream_list[opaque]
                    del self.stream_list[opaque]
                    print vbid, flags, start_seqno, end_seqno, vb_uuid, hi_seqno
                    #self.request_upr_stream(vbid, flags, start_seqno, end_seqno, 0, hi_seqno)
                elif errcode == couchbaseConstants.ERR_KEY_EEXISTS:
                    logging.warn("a stream exists on the connection for vbucket:%s" % opaque)
                elif errcode == couchbaseConstants.ERR_NOT_MY_VBUCKET:
                    logging.warn("Vbucket is not active anymore, skip it:%s" % vbid)
                    del self.stream_list[opaque]
                elif errcode == couchbaseConstants.ERR_ERANGE:
                    #logging.warn("Start and end sequence numbers are specified incorrectly,(%s, %s)" % \
                    #             (start_seqno, end_seqno))
                    del self.stream_list[opaque]
                elif errcode == couchbaseConstants.ERR_ROLLBACK:
                    vbid, flags, start_seqno, end_seqno, vb_uuid, hi_seqno = self.stream_list[opaque]
                    start_seqno, = struct.unpack(couchbaseConstants.UPR_VB_SEQNO_PKT_FMT, data)
                    logging.warn("rollback at %s" % start_seqno)
                    self.request_upr_stream(vbid, flags, start_seqno, end_seqno, 0, hi_seqno)
                    del self.stream_list[opaque]
                    self.stream_list[opaque] = (vbid, flags, start_seqno, end_seqno, vb_uuid, hi_seqno)
            elif cmd == couchbaseConstants.CMD_UPR_MUTATION:
                vbucket_id = errcode
                seqno, rev_seqno, flg, exp, locktime, metalen, nru = \
                    struct.unpack(couchbaseConstants.UPR_MUTATION_PKT_FMT, data[0:extlen])
                key_start = extlen
                val_start = key_start + keylen
                key = data[extlen:val_start]
                val = data[val_start:]
                if not self.skip(key, vbucket_id):
                    msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno, val, seqno, dtype, metalen)
                    #print msg
                    batch.append(msg, len(val))
                    self.num_msg += 1
            elif cmd == couchbaseConstants.CMD_UPR_DELETION or \
                 cmd == couchbaseConstants.CMD_UPR_EXPIRATION:
                vbucket_id = errcode
                seqno, rev_seqno, metalen = \
                    struct.unpack(couchbaseConstants.UPR_DELETE_PKT_FMT, data[0:extlen])
                key_start = extlen
                val_start = key_start + keylen
                key = data[extlen:val_start]
                if not self.skip(key, vbucket_id):
                    msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno, val, seqno, dtype, metalen)
                    batch.append(msg, len(val))
                    self.num_msg += 1
                if cmd == couchbaseConstants.CMD_UPR_DELETE:
                    batch.adjust_size += 1
            elif cmd == couchbaseConstants.CMD_UPR_FLUSH:
                logging.warn("stopping: saw CMD_UPR_FLUSH")
                self.upr_done = True
                break
            elif cmd == couchbaseConstants.CMD_UPR_END_STREAM:
                del self.stream_list[opaque]
                if not len(self.stream_list):
                    self.upr_done = True
            elif cmd == couchbaseConstants.CMD_UPR_SNAPSHOT_MARKER:
                logging.info("snapshot marker received, simply ignored:")
            else:
                logging.warn("warning: unexpected UPR message: %s" % cmd)
                return "unexpected UPR message: %s" % cmd, batch

            if need_ack:
                self.ack_last = True
                try:
                    upr_conn._sendMsg(cmd, '', '', opaque,
                                      vbucketId=0,
                                      fmt=couchbaseConstants.RES_PKT_FMT,
                                      magic=couchbaseConstants.RES_MAGIC_BYTE)
                except socket.error:
                    return ("error: socket.error on send();"
                            " perhaps the source server: %s was rebalancing"
                            " or had connectivity/server problems" %
                            (self.source_node['hostname'])), batch
                except EOFError:
                    self.upr_done = True
                    return ("error: EOFError on socket send();"
                            " perhaps the source server: %s was rebalancing"
                            " or had connectivity/server problems" %
                            (self.source_node['hostname'])), batch

                # Close the batch when there's an ACK handshake, so
                # the server can concurrently send us the next batch.
                # If we are slow, our slow ACK's will naturally slow
                # down the server.
                return 0, batch

            self.ack_last = False
            self.cmd_last = cmd

    except EOFError:
        if batch.size() <= 0 and self.ack_last:
            # A closed conn after an ACK means clean end of TAP dump.
            self.upr_done = True

    if batch.size() <= 0:
        return 0, None

    return 0, batch
def provide_batch(self): """Provides a batch of messages, with GET/SET ratios and keys controlled by a mcsoda-inspired approach, but simpler.""" if self.done: return 0, None cfg = self.source_map['cfg'] prefix = cfg['prefix'] max_items = cfg['max-items'] ratio_sets = cfg['ratio-sets'] exit_after_creates = cfg['exit-after-creates'] low_compression = cfg['low-compression'] json = cfg['json'] if not self.body: if low_compression: # Generate a document which snappy will struggle to compress. # Useful if your creating data-sets which utilise disk. random.seed( 0 ) # Seed to a fixed value so we always have the same document pattern. document = ''.join( random.choice(string.ascii_uppercase) for _ in range(cfg['min-value-size'])) else: # else a string of 0 is fine, but will compress very well. document = "0" * cfg['min-value-size'] if json: self.body = '{"name": "%s%s", "age": %s, "index": %s,' + \ ' "body": "%s"}' % document else: self.body = document batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] vbucket_id = 0x0000ffff cas, exp, flg = 0, 0, 0 while (batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if ratio_sets >= float(self.cur_sets) / float(self.cur_ops or 1): self.cur_sets = self.cur_sets + 1 cmd = couchbaseConstants.CMD_TAP_MUTATION if self.cur_items < max_items: key = self.cur_items self.cur_items = self.cur_items + 1 else: key = self.cur_sets % self.cur_items else: self.cur_gets = self.cur_gets + 1 cmd = couchbaseConstants.CMD_GET key = self.cur_gets % self.cur_items self.cur_ops = self.cur_ops + 1 if json: value = self.body % (prefix, key, key % 101, key) else: value = self.body msg = (cmd, vbucket_id, prefix + str(key), flg, exp, cas, '', value, 0, 0, 0, 0) batch.append(msg, len(value)) if exit_after_creates and self.cur_items >= max_items: self.done = True return 0, batch if batch.size() <= 0: return 0, None return 0, batch
def provide_batch(self) -> Tuple[couchbaseConstants.PUMP_ERROR, Optional[pump.Batch]]: """Provides a batch of messages, with GET/SET ratios and keys controlled by a mcsoda-inspired approach, but simpler.""" if self.done: return 0, None cfg: Dict[str, Any] = self.source_map['cfg'] prefix: str = cfg['prefix'] max_items: int = cfg['max-items'] ratio_sets: float = cfg['ratio-sets'] exit_after_creates: bool = cfg['exit-after-creates'] low_compression: bool = cfg['low-compression'] xattrs: bool = cfg['xattr'] itr = None collections = self.opts.collection if collections: itr = iter(collections) json_body: bool = cfg['json'] if not self.body: if low_compression: # Generate a document which snappy will struggle to compress. # Useful if your creating data-sets which utilise disk. random.seed(0) # Seed to a fixed value so we always have the same document pattern. document = ''.join(random.choice(string.ascii_uppercase) for _ in range(cfg['min-value-size'])) else: # else a string of 0 is fine, but will compress very well. document = "0" * cfg['min-value-size'] self.body = document batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] vbucket_id = 0x0000ffff cas, exp, flg = 0, 0, 0 while (batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if ratio_sets >= float(self.cur_sets) / float(self.cur_ops or 1): self.cur_sets = self.cur_sets + 1 if xattrs: cmd: int = couchbaseConstants.CMD_SUBDOC_MULTIPATH_MUTATION else: cmd = couchbaseConstants.CMD_DCP_MUTATION if self.cur_items < max_items: key = str(self.cur_items) self.cur_items = self.cur_items + 1 else: key = str(self.cur_sets % self.cur_items) else: self.cur_gets = self.cur_gets + 1 if xattrs: cmd = couchbaseConstants.CMD_SUBDOC_MULTIPATH_LOOKUP else: cmd = couchbaseConstants.CMD_GET key = str(self.cur_gets % self.cur_items) self.cur_ops = self.cur_ops + 1 if json_body: value = f'{{"name": "{prefix}{key}", "age": {int(key) % 101}, "index": "{key}", "body":"{self.body}"}}' else: value = self.body if xattrs: value = json.dumps({"obj": value, "xattr_f": "field1", "xattr_v": "\"value1\""}) value_bytes: bytes = value.encode() # generate a collection key if itr: try: cid = int(next(itr), 16) except StopIteration: itr = iter(collections) cid = int(next(itr), 16) except ValueError as e: return f'Invalid collection id, collection id must be a hexadecimal number: {e}', None encoded_cid = encode_collection_id(cid) # Generate the pack format and pack the key doc_key: bytes = struct.pack( ("!" + str(len(encoded_cid)) + "s" + str(len(prefix)) + "s" + str(len(key)) + "s").encode(), encoded_cid, prefix.encode(), key.encode()) else: doc_key = prefix.encode() + key.encode() datatype = 0x00 if json_body and cmd != couchbaseConstants.CMD_GET: datatype = 0x01 msg: couchbaseConstants.BATCH_MSG = (cmd, vbucket_id, doc_key, flg, exp, cas, b'', value_bytes, 0, datatype, 0, 0) batch.append(msg, len(value_bytes)) if exit_after_creates and self.cur_items >= max_items: self.done = True return 0, batch if batch.size() <= 0: return 0, None return 0, batch
class CSVSource(pump.Source):
    """Reads csv file, where first line is field names and one field
       should be 'id'."""

    def __init__(self, opts, spec, source_bucket, source_node,
                 source_map, sink_map, ctl, cur):
        super(CSVSource, self).__init__(opts, spec, source_bucket, source_node,
                                        source_map, sink_map, ctl, cur)
        self.done = False
        self.r = None  # An iterator of csv.reader()

    @staticmethod
    def can_handle(opts, spec):
        return spec.endswith(".csv") and os.path.isfile(spec)

    @staticmethod
    def check(opts, spec):
        return 0, {'spec': spec,
                   'buckets': [{'name': os.path.basename(spec),
                                'nodes': [{'hostname': 'N/A'}]}]}

    @staticmethod
    def provide_design(opts, source_spec, source_bucket, source_map):
        return 0, None

    def provide_batch(self):
        if self.done:
            return 0, None

        if not self.r:
            try:
                self.r = csv.reader(open(self.spec, 'rU'))
                self.fields = self.r.next()
                if not 'id' in self.fields:
                    return ("error: no 'id' field in 1st line of csv: %s" %
                            (self.spec)), None
            except StopIteration:
                return ("error: could not read 1st line of csv: %s" %
                        (self.spec)), None
            except IOError, e:
                return ("error: could not open csv: %s; exception: %s" %
                        (self.spec, e)), None

        batch = pump.Batch(self)

        batch_max_size = self.opts.extra['batch_max_size']
        batch_max_bytes = self.opts.extra['batch_max_bytes']

        cmd = couchbaseConstants.CMD_TAP_MUTATION
        vbucket_id = 0x0000ffff
        cas, exp, flg = 0, 0, 0

        while (self.r and
               batch.size() < batch_max_size and
               batch.bytes < batch_max_bytes):
            try:
                vals = self.r.next()
                doc = {}
                for i, field in enumerate(self.fields):
                    if i >= len(vals):
                        continue
                    if field == 'id':
                        doc[field] = vals[i]
                    else:
                        doc[field] = number_try_parse(vals[i])
                if doc['id']:
                    doc_json = json.dumps(doc)
                    msg = (cmd, vbucket_id, doc['id'], flg, exp, cas, '', doc_json, 0, 0, 0, 0)
                    batch.append(msg, len(doc))
            except StopIteration:
                self.done = True
                self.r = None
            except Exception, e:
                logging.error("error: fails to read from csv file, %s", e)
                continue

        if batch.size() <= 0:
            return 0, None

        return 0, batch
class BSONSource(pump.Source):
    """Reads bson file."""

    def __init__(self, opts, spec, source_bucket, source_node,
                 source_map, sink_map, ctl, cur):
        super(BSONSource, self).__init__(opts, spec, source_bucket, source_node,
                                         source_map, sink_map, ctl, cur)
        self.done = False
        self.f = None

    @staticmethod
    def can_handle(opts, spec):
        return spec.startswith(BSON_SCHEME) and \
            os.path.isfile(spec.replace(BSON_SCHEME, ""))

    @staticmethod
    def check(opts, spec):
        return 0, {'spec': spec,
                   'buckets': [{'name': os.path.basename(spec),
                                'nodes': [{'hostname': 'N/A'}]}]}

    @staticmethod
    def provide_design(opts, source_spec, source_bucket, source_map):
        return 0, None

    def provide_batch(self):
        if self.done:
            return 0, None

        if not self.f:
            try:
                self.f = open(self.spec.replace(BSON_SCHEME, ""))
            except IOError, e:
                return "error: could not open bson: %s; exception: %s" % \
                    (self.spec, e), None

        batch = pump.Batch(self)

        batch_max_size = self.opts.extra['batch_max_size']
        batch_max_bytes = self.opts.extra['batch_max_bytes']

        cmd = couchbaseConstants.CMD_TAP_MUTATION
        vbucket_id = 0x0000ffff
        cas, exp, flg = 0, 0, 0

        while (self.f and
               batch.size() < batch_max_size and
               batch.bytes < batch_max_bytes):
            doc_size_buf = self.f.read(4)
            if not doc_size_buf:
                self.done = True
                self.f.close()
                self.f = None
                break
            doc_size, = struct.unpack("<i", doc_size_buf)
            doc_buf = self.f.read(doc_size - 4)
            if not doc_buf:
                self.done = True
                self.f.close()
                self.f = None
                break
            doc = bson._elements_to_dict(doc_buf, dict, True)
            key = doc['_id']
            doc_json = json.dumps(doc)
            msg = (cmd, vbucket_id, key, flg, exp, cas, '', doc_json, 0, 0, 0)
            batch.append(msg, len(doc))

        if batch.size() <= 0:
            return 0, None

        return 0, batch
def provide_batch(self):
    if self.done:
        return 0, None

    batch = pump.Batch(self)

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    s = "SELECT cmd, vbucket_id, key, flg, exp, cas, meta, val FROM cbb_msg"

    if self.files is None:  # None != [], as self.files will shrink to [].
        g = glob.glob(BFD.db_dir(self.spec, self.bucket_name(), self.node_name()) + "/data-*.cbb")
        self.files = sorted(g)
    try:
        while (not self.done and
               batch.size() < batch_max_size and
               batch.bytes < batch_max_bytes):
            if self.cursor_db is None:
                if not self.files:
                    self.done = True
                    return 0, batch

                rv, db = connect_db(self.files[0], self.opts, CBB_VERSION)
                if rv != 0:
                    return rv, None
                self.files = self.files[1:]

                cursor = db.cursor()
                cursor.execute(s)

                self.cursor_db = (cursor, db)

            cursor, db = self.cursor_db

            row = cursor.fetchone()
            if row:
                vbucket_id = row[1]
                key = row[2]
                val = row[7]

                if self.skip(key, vbucket_id):
                    continue

                msg = (row[0], row[1], row[2], row[3], row[4],
                       int(row[5]),  # CAS as 64-bit integer not string.
                       row[6], row[7])
                batch.append(msg, len(val))
            else:
                if self.cursor_db:
                    self.cursor_db[0].close()
                    self.cursor_db[1].close()
                self.cursor_db = None

        return 0, batch

    except Exception, e:
        self.done = True
        if self.cursor_db:
            self.cursor_db[0].close()
            self.cursor_db[1].close()
        self.cursor_db = None

        return "error: exception reading backup file: " + str(e), None
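# Minimal sketch (not part of the original source), assuming each data-*.cbb
# backup file is a SQLite database as the connect_db()/cursor usage above
# suggests; 'data-0000.cbb' is a hypothetical file name. It only shows the row
# shape returned by the SELECT used in provide_batch().
import sqlite3

db = sqlite3.connect('data-0000.cbb')
for cmd, vbucket_id, key, flg, exp, cas, meta, val in db.execute(
        "SELECT cmd, vbucket_id, key, flg, exp, cas, meta, val FROM cbb_msg"):
    print(key, len(val))
db.close()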
def provide_batch_actual(self, tap_conn):
    batch = pump.Batch(self)

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    try:
        while (not self.tap_done and
               batch.size() < batch_max_size and
               batch.bytes < batch_max_bytes):
            # TODO: (1) TAPDumpSource - provide_batch timeout on inactivity.

            rv, cmd, vbucket_id, key, flg, exp, cas, meta, val, \
                opaque, need_ack = self.read_tap_conn(tap_conn)
            if rv != 0:
                self.tap_done = True
                return rv, batch

            if (cmd == memcacheConstants.CMD_TAP_MUTATION or
                cmd == memcacheConstants.CMD_TAP_DELETE):
                if not self.skip(key, vbucket_id):
                    msg = (cmd, vbucket_id, key, flg, exp, cas, meta, val)
                    batch.append(msg, len(val))
                    self.num_msg += 1
            elif cmd == memcacheConstants.CMD_TAP_OPAQUE:
                pass
            elif cmd == memcacheConstants.CMD_NOOP:
                # 1.8.x servers might not end the TAP dump on an empty bucket,
                # so we treat 2 NOOP's in a row as the end and proactively close.
                # Only do this when there've been no msgs to avoid closing
                # during a slow backfill.
                if (self.cmd_last == memcacheConstants.CMD_NOOP and
                    self.num_msg == 0 and
                    batch.size() <= 0):
                    self.tap_done = True
                    return 0, batch
            elif cmd == memcacheConstants.CMD_TAP_FLUSH:
                logging.warn("stopping: saw CMD_TAP_FLUSH")
                self.tap_done = True
                break
            else:
                s = str(pump.CMD_STR.get(cmd, cmd))
                logging.warn("warning: unexpected TAP message: " + s)
                return "unexpected TAP message: " + s, batch

            if need_ack:
                self.ack_last = True
                try:
                    tap_conn._sendMsg(cmd, '', '', opaque,
                                      vbucketId=0,
                                      fmt=memcacheConstants.RES_PKT_FMT,
                                      magic=memcacheConstants.RES_MAGIC_BYTE)
                except socket.error:
                    return ("error: socket.error on send();"
                            " perhaps the source server: %s was rebalancing"
                            " or had connectivity/server problems" %
                            (self.source_node['hostname'])), batch
                except EOFError:
                    self.tap_done = True
                    return ("error: EOFError on socket send();"
                            " perhaps the source server: %s was rebalancing"
                            " or had connectivity/server problems" %
                            (self.source_node['hostname'])), batch

                # Close the batch when there's an ACK handshake, so
                # the server can concurrently send us the next batch.
                # If we are slow, our slow ACK's will naturally slow
                # down the server.
                return 0, batch

            self.ack_last = False
            self.cmd_last = cmd

    except EOFError:
        if batch.size() <= 0 and self.ack_last:
            # A closed conn after an ACK means clean end of TAP dump.
            self.tap_done = True

    if batch.size() <= 0:
        return 0, None

    return 0, batch
def provide_dcp_batch_actual(self): batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] delta_ack_size = batch_max_bytes * 10 / 4 #ack every 25% of buffer size last_processed = 0 total_bytes_read = 0 vbid = 0 cmd = 0 start_seqno = 0 end_seqno = 0 vb_uuid = 0 hi_seqno = 0 ss_start_seqno = 0 ss_end_seqno = 0 try: while (not self.dcp_done and batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if self.response.empty(): if len(self.stream_list) > 0: logging.debug( "no response while there %s active streams" % len(self.stream_list)) time.sleep(.25) else: self.dcp_done = True continue unprocessed_size = total_bytes_read - last_processed if unprocessed_size > delta_ack_size: rv = self.ack_buffer_size(unprocessed_size) if rv: logging.error(rv) else: last_processed = total_bytes_read cmd, errcode, opaque, cas, keylen, extlen, data, datalen, dtype, bytes_read = \ self.response.get() total_bytes_read += bytes_read rv = 0 metalen = flags = flg = exp = 0 key = val = ext = '' need_ack = False seqno = 0 if cmd == couchbaseConstants.CMD_DCP_REQUEST_STREAM: if errcode == couchbaseConstants.ERR_SUCCESS: pair_index = (self.source_bucket['name'], self.source_node['hostname']) start = 0 step = DCPStreamSource.HIGH_SEQNO_BYTE + DCPStreamSource.UUID_BYTE while start + step <= datalen: uuid, seqno = struct.unpack( couchbaseConstants.DCP_VB_UUID_SEQNO_PKT_FMT, \ data[start:start + step]) if pair_index not in self.cur['failoverlog']: self.cur['failoverlog'][pair_index] = {} if opaque not in self.cur['failoverlog'][pair_index] or \ not self.cur['failoverlog'][pair_index][opaque]: self.cur['failoverlog'][pair_index][opaque] = [ (uuid, seqno) ] else: self.cur['failoverlog'][pair_index][ opaque].append((uuid, seqno)) start = start + step elif errcode == couchbaseConstants.ERR_KEY_ENOENT: logging.warn( "producer doesn't know about the vbucket uuid, rollback to 0" ) vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno = \ self.stream_list[opaque] del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_KEY_EEXISTS: logging.warn( "a stream exists on the connection for vbucket:%s" % opaque) elif errcode == couchbaseConstants.ERR_NOT_MY_VBUCKET: logging.warn( "Vbucket is not active anymore, skip it:%s" % vbid) del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_ERANGE: logging.warn("Start or end sequence numbers specified incorrectly,(%s, %s)" % \ (start_seqno, end_seqno)) del self.stream_list[opaque] elif errcode == couchbaseConstants.ERR_ROLLBACK: vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_stop_seqno = \ self.stream_list[opaque] start_seqno, = struct.unpack( couchbaseConstants.DCP_VB_SEQNO_PKT_FMT, data) #find the most latest uuid, hi_seqno that fit start_seqno if self.cur['failoverlog']: pair_index = (self.source_bucket['name'], self.source_node['hostname']) if self.cur['failoverlog'][pair_index].get("vbid"): for uuid, seqno in self.cur['failoverlog'][ pair_index][vbid]: if start_seqno >= seqno: vb_uuid = uuid break ss_start_seqno = start_seqno ss_end_seqno = start_seqno self.request_dcp_stream(vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno) del self.stream_list[opaque] self.stream_list[opaque] = \ (vbid, flags, start_seqno, end_seqno, vb_uuid, ss_start_seqno, ss_end_seqno) else: logging.error("unprocessed errcode:%s" % errcode) del self.stream_list[opaque] elif cmd == couchbaseConstants.CMD_DCP_MUTATION: vbucket_id = errcode seqno, 
rev_seqno, flg, exp, locktime, metalen, nru = \ struct.unpack(couchbaseConstants.DCP_MUTATION_PKT_FMT, data[0:extlen]) key_start = extlen val_start = key_start + keylen val_len = datalen - keylen - metalen - extlen meta_start = val_start + val_len key = data[extlen:val_start] val = data[val_start:meta_start] conf_res = 0 if meta_start < datalen: # handle extra conflict resolution fields extra_meta = data[meta_start:] extra_index = 0 version = extra_meta[extra_index] extra_index += 1 while extra_index < metalen: id, extlen = struct.unpack( couchbaseConstants.DCP_EXTRA_META_PKG_FMT, extra_meta[extra_index:extra_index + 3]) extra_index += 3 if id == couchbaseConstants.DCP_EXTRA_META_CONFLICT_RESOLUTION: if extlen == 1: conf_res, = struct.unpack( ">B", extra_meta[extra_index:extra_index + 1]) elif extlen == 2: conf_res, = struct.unpack( ">H", extra_meta[extra_index:extra_index + 2]) elif extlen == 4: conf_res, = struct.unpack( ">I", extra_meta[extra_index:extra_index + 4]) elif extlen == 8: conf_res, = struct.unpack( ">Q", extra_meta[extra_index:extra_index + 8]) else: logging.error( "unsupported extra meta data format:%d" % extlen) conf_res = 0 extra_index += extlen if not self.skip(key, vbucket_id): msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno, val, seqno, dtype, \ metalen, conf_res) batch.append(msg, len(val)) self.num_msg += 1 elif cmd == couchbaseConstants.CMD_DCP_DELETE or \ cmd == couchbaseConstants.CMD_DCP_EXPIRATION: vbucket_id = errcode seqno, rev_seqno, metalen = \ struct.unpack(couchbaseConstants.DCP_DELETE_PKT_FMT, data[0:extlen]) key_start = extlen val_start = key_start + keylen key = data[extlen:val_start] if not self.skip(key, vbucket_id): msg = (cmd, vbucket_id, key, flg, exp, cas, rev_seqno, val, seqno, dtype, \ metalen, 0) batch.append(msg, len(val)) self.num_msg += 1 if cmd == couchbaseConstants.CMD_DCP_DELETE: batch.adjust_size += 1 elif cmd == couchbaseConstants.CMD_DCP_FLUSH: logging.warn("stopping: saw CMD_DCP_FLUSH") self.dcp_done = True break elif cmd == couchbaseConstants.CMD_DCP_END_STREAM: del self.stream_list[opaque] if not len(self.stream_list): self.dcp_done = True elif cmd == couchbaseConstants.CMD_DCP_SNAPSHOT_MARKER: ss_start_seqno, ss_end_seqno, _ = \ struct.unpack(couchbaseConstants.DCP_SNAPSHOT_PKT_FMT, data[0:extlen]) pair_index = (self.source_bucket['name'], self.source_node['hostname']) if not self.cur['snapshot']: self.cur['snapshot'] = {} if pair_index not in self.cur['snapshot']: self.cur['snapshot'][pair_index] = {} self.cur['snapshot'][pair_index][opaque] = (ss_start_seqno, ss_end_seqno) elif cmd == couchbaseConstants.CMD_DCP_NOOP: need_ack = True elif cmd == couchbaseConstants.CMD_DCP_BUFFER_ACK: if errcode != couchbaseConstants.ERR_SUCCESS: logging.warning("buffer ack response errcode:%s" % errcode) continue else: logging.warn("warning: unexpected DCP message: %s" % cmd) return "unexpected DCP message: %s" % cmd, batch if need_ack: self.ack_last = True try: self.dcp_conn._sendMsg( cmd, '', '', opaque, vbucketId=0, fmt=couchbaseConstants.RES_PKT_FMT, magic=couchbaseConstants.RES_MAGIC_BYTE) except socket.error: return ( "error: socket.error on send();" " perhaps the source server: %s was rebalancing" " or had connectivity/server problems" % (self.source_node['hostname'])), batch except EOFError: self.dcp_done = True return ( "error: EOFError on socket send();" " perhaps the source server: %s was rebalancing" " or had connectivity/server problems" % (self.source_node['hostname'])), batch # Close the batch when there's an ACK handshake, so 
# the server can concurrently send us the next batch. # If we are slow, our slow ACK's will naturally slow # down the server. self.ack_buffer_size(total_bytes_read - last_processed) return 0, batch self.ack_last = False self.cmd_last = cmd except EOFError: if batch.size() <= 0 and self.ack_last: # A closed conn after an ACK means clean end of TAP dump. self.dcp_done = True if batch.size() <= 0: return 0, None self.ack_buffer_size(total_bytes_read - last_processed) return 0, batch
def loader(self):
    rv, d = data_dir(self.spec)
    if rv != 0:
        self.queue.put((rv, None))
        return

    source_vbucket_state = getattr(self.opts, 'source_vbucket_state', 'active')

    source_nodes = self.source_bucket['nodes']
    if len(source_nodes) != 1:
        self.queue.put(("error: expected 1 node in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    vbucket_states = source_nodes[0].get('vbucket_states', None)
    if not vbucket_states:
        self.queue.put(("error: missing vbucket_states in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    vbuckets = vbucket_states.get(source_vbucket_state, None)
    if vbuckets is None:  # Empty dict is valid.
        self.queue.put(("error: missing vbuckets in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    store = None
    vbucket_id = None

    # Level of indirection since we can't use python 3 nonlocal statement.
    abatch = [pump.Batch(self)]

    def change_callback(doc_info):
        if doc_info:
            key = doc_info.id
            if self.skip(key, vbucket_id):
                return

            if doc_info.deleted:
                cmd = memcacheConstants.CMD_TAP_DELETE
                val = ''
            else:
                cmd = memcacheConstants.CMD_TAP_MUTATION
                val = doc_info.getContents(options=couchstore.CouchStore.DECOMPRESS)

            cas, exp, flg = struct.unpack(SFD_REV_META, doc_info.revMeta)
            meta = struct.pack(SFD_REV_SEQ, doc_info.revSequence)

            msg = (cmd, vbucket_id, key, flg, exp, cas, meta, val)
            abatch[0].append(msg, len(val))

            if (abatch[0].size() >= batch_max_size or
                abatch[0].bytes >= batch_max_bytes):
                self.queue.put((0, abatch[0]))
                abatch[0] = pump.Batch(self)

    for f in latest_couch_files(d + '/' + self.source_bucket['name']):
        vbucket_id = int(re.match(SFD_RE, os.path.basename(f)).group(1))
        if not vbucket_id in vbuckets:
            continue

        try:
            store = couchstore.CouchStore(f, 'r')
        except Exception, e:
            self.queue.put(("error: could not open couchstore file: %s"
                            "; exception: %s" % (f, e), None))
            return

        store.forEachChange(0, change_callback)
        store.close()
def loader(self):
    rv, d = data_dir(self.spec)
    if rv != 0:
        self.queue.put((rv, None))
        return

    source_vbucket_state = getattr(self.opts, 'source_vbucket_state', 'active')

    source_nodes = self.source_bucket['nodes']
    if len(source_nodes) != 1:
        self.queue.put(("error: expected 1 node in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    vbucket_states = source_nodes[0].get('vbucket_states', None)
    if not vbucket_states:
        self.queue.put(("error: missing vbucket_states in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    vbuckets = vbucket_states.get(source_vbucket_state, None)
    if vbuckets is None:  # Empty dict is valid.
        self.queue.put(("error: missing vbuckets in source_bucket: %s" %
                        (self.source_bucket['name']), None))
        return

    batch_max_size = self.opts.extra['batch_max_size']
    batch_max_bytes = self.opts.extra['batch_max_bytes']

    store = None
    vbucket_id = None

    # Level of indirection since we can't use python 3 nonlocal statement.
    abatch = [pump.Batch(self)]

    def change_callback(doc_info):
        if doc_info:
            key = doc_info.id
            if self.skip(key, vbucket_id):
                return

            if doc_info.deleted:
                cmd = couchbaseConstants.CMD_TAP_DELETE
                val = ''
            else:
                cmd = couchbaseConstants.CMD_TAP_MUTATION
                val = doc_info.getContents(options=couchstore.CouchStore.DECOMPRESS)

            try:
                cas, exp, flg, flex_meta, dtype = struct.unpack(SFD_REV_META, doc_info.revMeta)
                meta = doc_info.revSequence
                seqno = doc_info.sequence
                nmeta = 0
                msg = (cmd, vbucket_id, key, flg, exp, cas, meta, val, seqno, dtype, nmeta, 0)
                abatch[0].append(msg, len(val))
            except Exception, e:
                self.queue.put((
                    "error: could not read couchstore file due to unsupported file format version;"
                    " exception: %s" % e, None))
                return

            if (abatch[0].size() >= batch_max_size or
                abatch[0].bytes >= batch_max_bytes):
                self.queue.put((0, abatch[0]))
                abatch[0] = pump.Batch(self)
def provide_batch(self): if self.done: return 0, None batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] s = ["SELECT cmd, vbucket_id, key, flg, exp, cas, meta, val FROM cbb_msg", "SELECT cmd, vbucket_id, key, flg, exp, cas, meta, val, seqno, dtype, meta_size FROM cbb_msg", "SELECT cmd, vbucket_id, key, flg, exp, cas, meta, val, seqno, dtype, meta_size, conf_res FROM cbb_msg"] if self.files is None: # None != [], as self.files will shrink to []. g = glob.glob(BFD.db_dir(self.spec, self.bucket_name(), self.node_name()) + "/data-*.cbb") if not g: #check 3.0 file structure rv, file_list = BFDSource.list_files(self.opts, self.spec, self.bucket_name(), self.node_name(), "data-*.cbb") if rv != 0: return rv, None from_date = getattr(self.opts, "from_date", None) if from_date: from_date = datetime.datetime.strptime(from_date, "%Y-%m-%d") to_date = getattr(self.opts, "to_date", None) if to_date: to_date = datetime.datetime.strptime(to_date, "%Y-%m-%d") g = [] for f in file_list: mtime = datetime.datetime.fromtimestamp(os.path.getmtime(f)) if (not from_date or mtime >= from_date) and (not to_date or mtime <= to_date): g.append(f) self.files = sorted(g) try: ver = 0 while (not self.done and batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if self.cursor_db is None: if not self.files: self.done = True return 0, batch rv, db, ver = connect_db(self.files[0], self.opts, CBB_VERSION) if rv != 0: return rv, None self.files = self.files[1:] cursor = db.cursor() cursor.execute(s[ver]) self.cursor_db = (cursor, db) cursor, db = self.cursor_db row = cursor.fetchone() if row: vbucket_id = row[1] key = row[2] val = row[7] if self.skip(key, vbucket_id): continue msg = (row[0], row[1], row[2], row[3], row[4], int(row[5]), # CAS as 64-bit integer not string. row[6], # revid as 64-bit integer too row[7]) if ver == 2: msg = msg + (row[8], row[9], row[10], row[11]) elif ver == 1: msg = msg + (row[8], row[9], row[10], 0) else: msg = msg + (0, 0, 0, 0) batch.append(msg, len(val)) else: if self.cursor_db: self.cursor_db[0].close() self.cursor_db[1].close() self.cursor_db = None return 0, batch except Exception, e: self.done = True if self.cursor_db: self.cursor_db[0].close() self.cursor_db[1].close() self.cursor_db = None return "error: exception reading backup file: " + str(e), None
def provide_batch(self): """Provides a batch of messages, with GET/SET ratios and keys controlled by a mcsoda-inspired approach, but simpler.""" if self.done: return 0, None cfg = self.source_map['cfg'] prefix = cfg['prefix'] max_items = cfg['max-items'] ratio_sets = cfg['ratio-sets'] exit_after_creates = cfg['exit-after-creates'] low_compression = cfg['low-compression'] xattrs = cfg['xattr'] itr = None collections = self.opts.collection if collections: itr = iter(collections) json = cfg['json'] if not self.body: if low_compression: # Generate a document which snappy will struggle to compress. # Useful if your creating data-sets which utilise disk. random.seed( 0 ) # Seed to a fixed value so we always have the same document pattern. document = ''.join( random.choice(string.ascii_uppercase) for _ in range(cfg['min-value-size'])) else: # else a string of 0 is fine, but will compress very well. document = "0" * cfg['min-value-size'] if json: self.body = '{"name": "%s%s", "age": %s, "index": %s,' + \ ' "body": "%s"}' % document else: self.body = document batch = pump.Batch(self) batch_max_size = self.opts.extra['batch_max_size'] batch_max_bytes = self.opts.extra['batch_max_bytes'] vbucket_id = 0x0000ffff cas, exp, flg = 0, 0, 0 while (batch.size() < batch_max_size and batch.bytes < batch_max_bytes): if ratio_sets >= float(self.cur_sets) / float(self.cur_ops or 1): self.cur_sets = self.cur_sets + 1 if xattrs: cmd = couchbaseConstants.CMD_SUBDOC_MULTIPATH_MUTATION else: cmd = couchbaseConstants.CMD_DCP_MUTATION if self.cur_items < max_items: key = str(self.cur_items) self.cur_items = self.cur_items + 1 else: key = str(self.cur_sets % self.cur_items) else: self.cur_gets = self.cur_gets + 1 if xattrs: cmd = couchbaseConstants.CMD_SUBDOC_MULTIPATH_LOOKUP else: cmd = couchbaseConstants.CMD_GET key = str(self.cur_gets % self.cur_items) self.cur_ops = self.cur_ops + 1 if json: value = self.body % (prefix, key, int(key) % 101, key) else: value = self.body if xattrs: value = { "obj": value, "xattr_f": "field1", "xattr_v": "\"value1\"" } # generate a collection key if itr: try: cid = int(next(itr), 16) except StopIteration: itr = iter(collections) cid = int(next(itr), 16) encodedCid = encodeCollectionId(cid) # Generate the pack format and pack the key docKey = struct.pack( "!" + str(len(encodedCid)) + "s" + str(len(prefix)) + "s" + str(len(key)) + "s", encodedCid, prefix, key) else: docKey = prefix + key msg = (cmd, vbucket_id, docKey, flg, exp, cas, '', value, 0, 0, 0, 0) batch.append(msg, len(value)) if exit_after_creates and self.cur_items >= max_items: self.done = True return 0, batch if batch.size() <= 0: return 0, None return 0, batch