def run_inserts_at_version(self, proto_ver):
    session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
    try:
        # list of sets
        p = session.prepare('insert into t (k, v) values (?, ?)')
        session.execute(p, (0, [{1, 2}, {3, 5}]))

        # set of tuples
        p = session.prepare('insert into u (k, v) values (?, ?)')
        session.execute(p, (0, {(1, 2), (3, 5)}))

        # map of tuples to lists, plus a (int, text) tuple
        p = session.prepare('insert into v (k, v, v1) values (?, ?, ?)')
        session.execute(p, (0, {(1, 2): [1, 2, 3], (3, 5): [4, 5, 6]}, (123, 'four')))

        # tuple of (map of lists, list)
        p = session.prepare('insert into w (k, v) values (?, ?)')
        session.execute(p, (0, ({1: [1, 2, 3], 2: [4, 5, 6]}, [7, 8, 9])))
    finally:
        session.cluster.shutdown()
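# Illustrative only: one schema under which the inserts above would type-check.
# The real t/u/v/w tables are created elsewhere in the test fixture; the CQL
# below is an assumption inferred from the bound Python values (nested
# collections must be frozen).
NESTED_COLLECTION_SCHEMA = """
CREATE TABLE t (k int PRIMARY KEY, v list<frozen<set<int>>>);
CREATE TABLE u (k int PRIMARY KEY, v set<frozen<tuple<int, int>>>);
CREATE TABLE v (k int PRIMARY KEY, v map<frozen<tuple<int, int>>, frozen<list<int>>>, v1 frozen<tuple<int, text>>);
CREATE TABLE w (k int PRIMARY KEY, v frozen<tuple<map<int, list<int>>, list<int>>>);
"""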
def ccrt(Tuple, Qout):
    # multiprocessing worker: concurrently inserts rows [Tuple[0], Tuple[1]),
    # then reports its completion time on the Qout queue
    session = Cluster().connect()
    statement = session.prepare(
        'INSERT INTO test.test2 (a, b) VALUES (?,?)'
    )
    values = [(x, x) for x in xrange(Tuple[0], Tuple[1])]
    execute_concurrent_with_args(
        session, statement, values, concurrency=100
    )
    Qout.put(time.time())
class CassandraService:
    def __init__(self, keyspace='data'):
        self.session = Cluster().connect(keyspace)
        self.user_insert_stmt = self.prepare_user_insert_statement()
        self.tweet_insert_stmt = self.prepare_tweet_insert_statement()
        self.exception_insert_stmt = self.prepare_exception_insert_statement()

    def save_user(self, user: User):
        # vars() binds by attribute name, so User attributes must match the column names
        self.session.execute(self.user_insert_stmt, vars(user))

    def save_tweet(self, tweet: Tweet):
        self.session.execute(self.tweet_insert_stmt, vars(tweet))

    def save_exception(self, exception_data: dict):
        self.session.execute(self.exception_insert_stmt, exception_data)

    def prepare_user_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019 (id, captured_at, created_at, description, entities,
                favourites_count, followers_count, following_count, friends_count, listed_count,
                name, pinned_tweet_id, profile_image_url, protected, tweets_count, url, user_name)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)

    def prepare_tweet_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_tweets (id, captured_at, created_at, user_id, timezone,
                content, link, retweet, mentions, urls, photos, video, lang,
                replies_count, retweets_count, likes_count, hashtags, cashtags)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)

    def prepare_exception_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_exceptions (user, exception)
            VALUES (?, ?)
        """)
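# Hypothetical usage sketch: User and Tweet are assumed to be classes whose
# attribute names match the prepared statements' column names (vars() binds by
# name); none of this is part of the snippet above.
service = CassandraService(keyspace='data')
service.save_exception({'user': 'some_user_id', 'exception': 'timeout while fetching timeline'})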
def insert_data():
    from cassandra.cluster import Cluster
    from cassandra import AlreadyExists

    # Connect to the Cassandra cluster, keyspace "test"
    session = Cluster([
        '192.168.101.140', '192.168.101.141',
        '192.168.101.142', '192.168.101.143'
    ]).connect("test")
    try:
        # Create the table first
        session.execute("CREATE TABLE test1 (id int PRIMARY KEY, col1 text)")
        # Insert data
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
    except AlreadyExists:
        # Table already exists: insert into it anyway
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
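# Equivalent sketch without the duplicated loop: CREATE TABLE IF NOT EXISTS
# makes the DDL idempotent, so the AlreadyExists handler becomes unnecessary.
def insert_data_idempotent(session):
    session.execute("CREATE TABLE IF NOT EXISTS test1 (id int PRIMARY KEY, col1 text)")
    insertar = session.prepare("INSERT INTO test1 (id, col1) VALUES (?, ?)")
    for i in range(10):
        session.execute(insertar, (i, 'hola' + str(i)))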
def insert_data(headers, data):
    ## Connect to Scylla cluster and create schema
    print("")
    print("## Connecting to Scylla cluster -> Creating schema")
    session = Cluster(SCYLLA_IP).connect()
    session.execute(create_ks)
    session.execute(create_t1)

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch -> Creating 'Catalog' index")
    es = Elasticsearch(ES_IP)

    ## Create Elasticsearch index. ignore=400 makes this a no-op IF it already EXISTs
    es.indices.create(index="catalog", ignore=400)

    ## Non-prepared CQL statement, kept for reference
    # cql = "INSERT INTO catalog.apparel(sku,brand,group,sub_group,color,size,gender) VALUES(%(sku)s,%(brand)s,%(group)s,%(sub_group)s,%(color)s,%(size)s,%(gender)s)"

    ## Prepared CQL statement; each csv row dict binds by column name
    print("")
    print("## Preparing CQL statement")
    cql = "INSERT INTO catalog.apparel (sku,brand,group,sub_group,color,size,gender) VALUES (?,?,?,?,?,?,?)"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM

    print("")
    print("## Insert csv content into Scylla and Elasticsearch")
    for d in data:
        # session.execute is synchronous, so each insert is acknowledged
        # before the next one is sent.
        session.execute(cql_prepared, d)
        res = es.index(index="catalog", doc_type="apparel", id=d["sku"], body=d)

    ## After all the inserts, refresh the index, just in case
    print("")
    print("## Inserts completed, refreshing index")
    es.indices.refresh(index="catalog")
    print("")
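# Illustrative alternative to the row-by-row loop above: the driver's
# execute_concurrent_with_args (already used elsewhere in this collection)
# pipelines the same prepared inserts; the csv row dicts still bind by column
# name. Sketch only, not part of the original script.
from cassandra.concurrent import execute_concurrent_with_args

def insert_data_concurrent(session, cql_prepared, data):
    execute_concurrent_with_args(session, cql_prepared, data, concurrency=50)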
class Searcher:
    start = 0

    def __init__(self, name):
        self.filename = mapper[name]['search']
        Searcher.start = datetime.now()
        self.log1 = getLogger('time', 'search_time.log')
        self.log2 = getLogger('result', 'search_result.log')
        self.log2.info('## connecting to cassandra cluster')
        self.session = Cluster(DSE_IP).connect()
        cql = "SELECT * FROM reddit.comment WHERE solr_query=?"
        self.solr_query = '{{"q":"body:*{0}*"}}'
        self.cql_prepared = self.session.prepare(cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.ONE

    def search(self):
        counter = 0
        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter, line in enumerate(f):
                try:
                    line = line.strip('\n')
                    if counter % 100 == 0:
                        self.log1.info(str(counter))
                    if counter % 1000 == 0:
                        self.log1.info('{}'.format(datetime.now() - Searcher.start))
                        self.log1.info('')
                    self.log2.info('({})'.format(line))
                    # format query body
                    data = self.solr_query.format(line)
                    res = self.session.execute(self.cql_prepared, [data], timeout=60000)
                    self.log2.info(res[0])
                except Exception as e:
                    self.log2.warning(e)
        self.log1.info('total time: {}'.format(datetime.now() - Searcher.start))
def load(datafile, ks_name, table_name):
    with open(datafile) as f:
        reader = csv.reader(f)
        session = Cluster().connect()
        header = next(reader)
        stmt = ("INSERT INTO {ks}.{tab} ({header_spec}) VALUES ({qs});").format(
            ks=ks_name,
            tab=table_name,
            header_spec=', '.join(header),
            qs=', '.join(['?' for _ in header]))
        print('preparing "{stmt}"'.format(stmt=stmt), file=sys.stderr)
        prepared = session.prepare(stmt)
        data = csv_handle_to_nested_list(reader)
        print('About to load {n} rows'.format(n=len(data)))
        for row in data:
            session.execute(prepared, row)
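# A minimal sketch of the csv helper assumed above (its name is taken from the
# call site; the real implementation lives elsewhere): materialize the
# remaining csv rows into a nested list so len() and iteration both work.
def csv_handle_to_nested_list(reader):
    return [row for row in reader]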
class Loader:
    start = 0

    def __init__(self, name):
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('scy+es', 'load.log')
        Loader.start = datetime.now()
        self.index = name
        self.stat = Statement(name)
        self.filename = mapper[name]['load']
        self.__init_scy()
        self.__init_es()
        self.pool_scy = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)
        self.pool_es = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)

    def load(self):
        if self.index == 'reddit':
            # main loop
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool_scy.submit(self.__insert_data, line)
                    self.pool_es.submit(self.__insert_index, line)
                except StopIteration:
                    break
                except Exception:
                    continue
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename, encoding='utf-8')
            end = df.index.max()
            df = df.fillna('missing')
            try:
                for line in zip(range(0, end + 1), df['userId'], df['productId'],
                                df['rating'], df['title'], df['comment'], df['timestamp']):
                    counter = line[0]
                    if counter % 100000 == 0:
                        self.log.info(str(counter))
                    if counter % 1000000 == 0:
                        self.log.info('{}'.format(datetime.now() - Loader.start))
                        self.log.info('')
                    self.pool_scy.submit(self.__insert_data, line)
                    self.pool_es.submit(self.__insert_index, line)
            except Exception as e:
                print(e)
        self.pool_scy.shutdown()
        self.pool_es.shutdown()
        self.es.indices.refresh(index=self.index)
        # shutdown
        self.session.shutdown()
        # print information
        self.log.info('## Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')

    def __line_generator(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter, line in enumerate(f):
                try:
                    if counter % 100000 == 0:
                        self.log.info(str(counter))
                    if counter % 1000000 == 0:
                        self.log.info('{}'.format(datetime.now() - Loader.start))
                        self.log.info('')
                    yield line
                except Exception as e:
                    print(e)
                    continue

    # get scylladb connection, create keyspace and table, keep the session
    def __init_scy(self):
        # session = Cluster(contact_points=SCYLLA_IP, execution_profiles={EXEC_PROFILE_DEFAULT: ep}).connect()
        self.session = Cluster(contact_points=SCYLLA_IP).connect()
        # create the schema
        self.session.execute(self.stat.create_ks)
        # create the table
        self.session.execute(self.stat.create_tb)
        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM

    # get es connection, create index
    def __init_es(self):
        with open('mapping.json', 'r') as f:
            mapping = json.load(f)[self.index]
        self.es = Elasticsearch(ES_IP, timeout=30)
        # create es index; ignore=400 tolerates an index that already exists
        self.es.indices.create(index=self.index, body=mapping, ignore=400, timeout=30)

    # insert data into scylladb
    def __insert_data(self, line):
        data = list()
        # TODO: format your data
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'],
                    line['subreddit_id'], line['author'], line['body'],
                    int(line['created_utc'])]
        elif self.index == 'amazon':
            # id | user_id | product_id | rating | title | body | timestamp
            data = [line[0], int(line[1]), int(line[2]), line[3],
                    line[4], line[5], line[6]]
        self.log.info(data)
        res = self.session.execute(self.cql_prepared, data, timeout=60000)
        return

    # insert data into elasticsearch
    def __insert_index(self, line):
        # TODO: format your data
        if self.index == 'reddit':
            data = {k: v for k, v in line.items()
                    if k in ['id', 'name', 'author', 'body']}
        elif self.index == 'amazon':
            data = {}
            data['id'] = line[0]
            data['title'] = line[4]
            data['body'] = line[5]
        res = self.es.index(index=self.index, doc_type="comment",
                            id=data['id'], body=data)
        return
def _contention_test(self, threads, iterations):
    """ Test threads repeatedly contending on the same row. """
    verbose = False
    session = self.prepare(nodes=3)
    session.execute("CREATE TABLE test (k int, v int static, id int, PRIMARY KEY (k, id))")
    session.execute("INSERT INTO test(k, v) VALUES (0, 0)")

    class Worker(Thread):

        def __init__(self, wid, session, iterations, query):
            Thread.__init__(self)
            self.wid = wid
            self.iterations = iterations
            self.query = query
            self.session = session
            self.errors = 0
            self.retries = 0

        def run(self):
            i = 0
            prev = 0
            while i < self.iterations:
                done = False
                while not done:
                    try:
                        res = self.session.execute(self.query, (prev + 1, prev, self.wid))
                        if verbose:
                            print("[%3d] CAS %3d -> %3d (res: %s)" % (self.wid, prev, prev + 1, str(res)))
                        if res[0][0] is True:
                            done = True
                            prev = prev + 1
                        else:
                            self.retries = self.retries + 1
                            # There are 2 conditions, so 2 reasons to fail: if we failed because the row
                            # with our worker ID already exists, it means we timed out earlier but our
                            # update did go in, so consider this a success.
                            prev = res[0][3]
                            if res[0][2] is not None:
                                if verbose:
                                    print("[%3d] Update was inserted on previous try (res = %s)" % (self.wid, str(res)))
                                done = True
                    except WriteTimeout as e:
                        if verbose:
                            print("[%3d] TIMEOUT (%s)" % (self.wid, str(e)))
                        # This means a timeout: just retry; if it happens that our update was
                        # indeed persisted, we'll figure it out on the next run.
                        self.retries = self.retries + 1
                    except Exception as e:
                        if verbose:
                            print("[%3d] ERROR: %s" % (self.wid, str(e)))
                        self.errors = self.errors + 1
                        done = True
                i = i + 1
                # Clean up for next iteration
                while True:
                    try:
                        self.session.execute("DELETE FROM test WHERE k = 0 AND id = %d IF EXISTS" % self.wid)
                        break
                    except WriteTimeout as e:
                        pass

    nodes = self.cluster.nodelist()
    workers = []
    session = Cluster([nodes[0].ip_addr],
                      connect_timeout=15,
                      idle_heartbeat_interval=0,
                      execution_profiles={
                          EXEC_PROFILE_DEFAULT: ExecutionProfile(request_timeout=60)
                      }).connect('ks')

    q = session.prepare("""
        BEGIN BATCH
            UPDATE test SET v = ? WHERE k = 0 IF v = ?;
            INSERT INTO test (k, id) VALUES (0, ?) IF NOT EXISTS;
        APPLY BATCH
    """)

    for n in range(0, threads):
        workers.append(Worker(n, session, iterations, q))

    start = time.time()

    for w in workers:
        w.start()
    for w in workers:
        w.join()

    if verbose:
        runtime = time.time() - start
        print("runtime:", runtime)

    query = SimpleStatement("SELECT v FROM test WHERE k = 0",
                            consistency_level=ConsistencyLevel.ALL)
    rows = session.execute(query)
    value = rows[0][0]

    errors = 0
    retries = 0
    for w in workers:
        errors = errors + w.errors
        retries = retries + w.retries

    assert (value == threads * iterations) and (errors == 0), \
        "value={}, errors={}, retries={}".format(value, errors, retries)
def query_es(NUM_FILTERS):
    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch")
    es = Elasticsearch(ES_IP)

    if NUM_FILTERS == 'single':
        ## Search using a single field filter (group: 'pants')
        print("")
        print("## Searching for 'pants' in Elasticsearch (filter by group)")
        res = es.search(index="catalog", doc_type="apparel",
                        body={"query": {"match": {"group": "pants"}}, "size": 1000})

    if NUM_FILTERS == 'multiple':
        ## Search using multiple field filters (color: 'white' AND sub_group: 'softshell')
        print("")
        print("## Searching for 'white softshell' in Elasticsearch (filter by color + sub_group)")
        res = es.search(index="catalog", doc_type="apparel",
                        body={
                            "query": {
                                "bool": {
                                    "must": [
                                        {"match": {"color": "white"}},
                                        {"match": {"sub_group": "softshell"}}
                                    ]
                                }
                            },
                            "size": 1000
                        })

    if NUM_FILTERS == 'none':
        ## Search with NO filters (match_all)
        print("")
        print("## Searching with NO filter = 'match_all' in Elasticsearch")
        res = es.search(index="catalog", doc_type="apparel",
                        body={"query": {"match_all": {}}, "size": "1000"})

    print("")
    print("## %d documents returned" % res['hits']['total'])
    es_results = [doc['_id'] for doc in res['hits']['hits']]

    ## Connect to Scylla
    print("")
    print("## Connecting to Scylla")
    session = Cluster(SCYLLA_IP).connect()

    ## Prepared CQL statement
    print("")
    print("## Preparing CQL statement")
    cql = "SELECT * FROM catalog.apparel WHERE sku=?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM

    ## Query Scylla
    print("")
    print("## Query Scylla using SKU/s returned from Elasticsearch")
    print("")
    print("## Final results from Scylla:")
    print("")
    for r in es_results:
        scylla_res = session.execute(cql_prepared, (r,))
        print("%s" % ([list(row) for row in scylla_res]))

    # for doc in res['hits']['hits']:
        ## Print all columns in the Elasticsearch result set
        # print("SKU: %s | Color: %s | Size: %s | Brand: %s | Gender: %s | Group: %s | Sub_Group: %s" % (doc['_id'], doc['_source']['color'], doc['_source']['size'], doc['_source']['brand'], doc['_source']['gender'], doc['_source']['group'], doc['_source']['sub_group']))
        ## Print only the id (sku) in the result set
        # print("SKU: %s" % (doc['_id']))
    print("")
class KairosdbFinder(object):
    __fetch_multi__ = "kairosdb"

    def __init__(self, config):
        cfg = config.get('kairosdb', {})
        es = cfg.get('es', {})
        cas = cfg.get('cassandra', {})
        self.config = {
            "cassandra": {
                "hosts": cas.get('hosts', ["localhost"]),
                "port": cas.get('port', 9042),
            },
            "es": {
                "url": es.get('url', 'http://localhost:9200')
            }
        }
        logger.info("initialize kairosdbFinder", config=self.config)
        self.es = Elasticsearch([self.config['es']['url']])
        self.cassandra = Cluster(
            self.config['cassandra']['hosts'],
            self.config['cassandra']['port']).connect('kairosdb')
        self.metric_lookup_stmt = self.cassandra.prepare(
            'SELECT * FROM data_points WHERE key=? AND column1 > ? AND column1 <= ?')

    def find_nodes(self, query):
        seen_branches = set()
        leaf_regex = self.compile_regex(query, False)
        # query Elasticsearch for paths
        matches = self.search_series(leaf_regex, query)
        leafs = {}
        branches = {}
        for metric in matches:
            if metric.is_leaf():
                if metric.name in leafs:
                    leafs[metric.name].append(metric)
                else:
                    leafs[metric.name] = [metric]
            else:
                if metric.name in branches:
                    branches[metric.name].append(metric)
                else:
                    branches[metric.name] = [metric]

        for name, metrics in leafs.iteritems():
            yield KairosdbLeafNode(name, KairosdbReader(self.config, metrics))
        for branchName, metrics in branches.iteritems():
            name = branchName
            while '.' in name:
                name = name.rsplit('.', 1)[0]
                if name not in seen_branches:
                    seen_branches.add(name)
                    if leaf_regex.match(name) is not None:
                        yield BranchNode(name)

    def fetch_from_cassandra(self, nodes, start_time, end_time):
        # Datapoints are stored in rows that span a 3-week (1814400s) period,
        # so we need to determine the 1 or more periods we need to query.
        periods = []
        start_period = start_time - (start_time % 1814400)
        periods.append({'key': start_period, 'start': start_time, 'end': end_time})
        end_period = end_time - (end_time % 1814400)
        if start_period != end_period:
            pos = start_period + 1814400
            count = 0
            while pos <= end_period:
                periods.append({'key': pos, 'start': pos, 'end': end_time})
                # set the end_time range boundary of the last period to the end of that period.
                periods[count]['end'] = pos - 1
                count += 1
                pos += 1814400

        # We now need to generate all of the row keys to query.
        # We store an array of tuples, where each tuple is (row_key, start_offset, end_offset).
        query_args = []
        node_index = {}
        datapoints = {}
        for node in nodes:
            for metric in node.reader.metrics:
                measurement = metric.metric
                tags = ""
                tag_list = metric.tags
                tag_list.append('org_id:%d' % g.org)
                for tag in sorted(tag_list):
                    parts = tag.split(":", 2)
                    tags += "%s=%s:" % (parts[0], parts[1])
                # keep a map between the measurement+tags and the node.path
                node_index["%s\0%s" % (measurement, tags)] = node.path
                # initialize where we will store the data.
                datapoints[node.path] = {}

                # now build our query_args. Request both double and long values,
                # as kairos makes it impossible to know which in advance.
                for data_type in ["kairos_double", "kairos_long"]:
                    data_type_size = len(data_type)
                    for p in periods:
                        row_timestamp = p['key'] * 1000
                        row_key = "%s00%s00%s%s%s" % (
                            measurement.encode('hex'),
                            "%016x" % row_timestamp,
                            "%02x" % data_type_size,
                            data_type.encode('hex'),
                            tags.encode('hex'))
                        logger.debug("cassandra query", row_key=row_key)
                        start = (p['start'] - p['key']) * 1000
                        end = (p['end'] - p['key']) * 1000
                        # The timestamps are shifted to support legacy datapoints that
                        # used the extra bit to determine if the value was long or double.
                        row_key_bytes = bytearray(row_key.decode('hex'))
                        try:
                            start_bytes = bytearray(struct.pack(">L", start << 1))
                        except Exception as e:
                            logger.error("failed to pack %d" % start)
                            raise e
                        try:
                            end_bytes = bytearray(struct.pack(">L", end << 1))
                        except Exception as e:
                            logger.error("failed to pack %d" % end)
                            raise e
                        query_args.append((row_key_bytes, start_bytes, end_bytes))

        # perform cassandra queries in parallel using async requests.
        futures = []
        for args in query_args:
            futures.append(self.cassandra.execute_async(self.metric_lookup_stmt, args))

        # wait for them to complete and use the results.
        for future in futures:
            rows = future.result()
            first = True
            for row in rows:
                if first:
                    row_key = parse_row_key(row.key)
                    path = node_index["%s\0%s" % (row_key['measurement'], row_key['tags'])]
                    if path not in datapoints:
                        datapoints[path] = {}
                    first = False
                ts = parse_row_ts(row.column1, row_key['row_timestamp'])
                try:
                    if row_key['data_type'] == "kairos_double":
                        value = struct.unpack(">d", row.value)[0]
                    else:
                        value = unpack_kairos_long(row.value)
                except Exception as e:
                    logger.error("failed to parse value", exception=e,
                                 data_type=row_key['data_type'])
                    value = None
                datapoints[path][ts] = value

        return datapoints

    def fetch_multi(self, nodes, start_time, end_time):
        step = None
        for node in nodes:
            for metric in node.reader.metrics:
                if step is None or metric.interval < step:
                    step = metric.interval

        with statsd.timer("graphite-api.fetch.kairosdb_query.query_duration"):
            data = self.fetch_from_cassandra(nodes, start_time, end_time)

        series = {}
        delta = None
        with statsd.timer("graphite-api.fetch.unmarshal_kairosdb_resp.duration"):
            for path, points in data.items():
                datapoints = []
                next_time = start_time
                timestamps = points.keys()
                timestamps.sort()
                max_pos = len(timestamps)

                if max_pos == 0:
                    for i in range(int((end_time - start_time) / step)):
                        datapoints.append(None)
                    series[path] = datapoints
                    continue

                pos = 0
                if delta is None:
                    delta = (timestamps[0] % start_time) % step
                    # ts[0] is always greater than start_time.
                    if delta == 0:
                        delta = step

                while next_time <= end_time:
                    # check if there are missing values from the end of the time window
                    if pos >= max_pos:
                        datapoints.append(None)
                        next_time += step
                        continue

                    ts = timestamps[pos]
                    # read in the metric value.
                    v = points[ts]

                    # pad missing points with null.
                    while ts > (next_time + step):
                        datapoints.append(None)
                        next_time += step

                    datapoints.append(v)
                    next_time += step
                    pos += 1
                    if (ts + step) > end_time:
                        break

                series[path] = datapoints

        if delta is None:
            delta = 1
        time_info = (start_time + delta, end_time, step)
        return time_info, series

    def compile_regex(self, query, branch=False):
        # we turn graphite's custom glob-like thing into a regex, like so:
        #   * becomes [^\.]*
        #   . becomes \.
        if branch:
            regex = '{0}.*'
        else:
            regex = '^{0}$'
        regex = regex.format(
            query.pattern.replace('.', '\\.').replace('*', '[^\\.]*').replace(
                '{', '(').replace(',', '|').replace('}', ')'))
        logger.debug("compile_regex", pattern=query.pattern, regex=regex)
        return re.compile(regex)

    def search_series(self, leaf_regex, query):
        branch_regex = self.compile_regex(query, True)
        search_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "or": [
                            {"term": {"org_id": g.org}},
                            {"term": {"org_id": -1}}
                        ]
                    },
                    "query": {
                        "regexp": {
                            "name": branch_regex.pattern
                        }
                    }
                }
            }
        }
        with statsd.timer("graphite-api.search_series.es_search.query_duration"):
            ret = self.es.search(index="metric", doc_type="metric_index",
                                 body=search_body, size=10000)
            matches = []
            if len(ret["hits"]["hits"]) > 0:
                for hit in ret["hits"]["hits"]:
                    leaf = False
                    source = hit['_source']
                    if leaf_regex.match(source['name']) is not None:
                        leaf = True
                    matches.append(RaintankMetric(source, leaf))
            logger.debug('search_series', matches=len(matches))
        return matches
# 1) SINGLE THREAD/PROCESS SERIAL INSERT
# --------------------------------------
statement = 'INSERT INTO test.test2 (a, b) VALUES (%s, %s);'
for i in xrange(100000):
    insert = statement % (i, i)
    session.execute(insert)

# 2) SINGLE THREAD/PROCESS CONCURRENT INSERT
# ------------------------------------------
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

session = Cluster().connect()
statement = session.prepare(
    'INSERT INTO test.test2 (a, b) VALUES (?,?)'
)
values = [(x, x) for x in xrange(0, 10000)]
execute_concurrent_with_args(
    session, statement, values, concurrency=100
)

# 3) MULTI-PROCESS CONCURRENT INSERT
# ----------------------------------
from multiprocessing import Process, Queue
import time
from cassandra.cluster import Cluster
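# Illustrative driver for section 3, assuming the ccrt(Tuple, Qout) worker
# defined earlier in this collection; the process count and range sizes are
# made up for the sketch.
def run_multiprocess_insert(num_procs=4, rows_per_proc=10000):
    qout = Queue()
    procs = []
    for n in range(num_procs):
        bounds = (n * rows_per_proc, (n + 1) * rows_per_proc)
        p = Process(target=ccrt, args=(bounds, qout))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
    # each worker put()s its completion timestamp when it finishes
    return [qout.get() for _ in procs]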
def connect():
    global session, prepared
    session = Cluster().connect()
    prepared = session.prepare(
        "SELECT title FROM wiki.categories WHERE category=?")
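# Illustrative call pattern for the globals prepared above; 'Physics' is a
# made-up category value.
connect()
for row in session.execute(prepared, ['Physics']):
    print(row.title)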
class Reassembler(Process):

    def initCluster(self):
        auth_provider = PlainTextAuthProvider(username=cfg.cassandraConfig["user"],
                                              password=cfg.cassandraConfig["password"])
        self.session = Cluster([cfg.cassandraConfig["host"]],
                               auth_provider=auth_provider).connect(cfg.cassandraConfig["db"])
        self.preparedQuery = self.session.prepare(
            """INSERT INTO packet (source_addr, dest_addr, time_stamp, content, text_values)
               VALUES (?,?,?,?,?)""")

    def insertIntoDatabase(self, sourceAddr, destAddr, timeStamp, content, textValues):
        args = [sourceAddr, destAddr, timeStamp, content, textValues]
        self.session.execute(self.preparedQuery, args)

    def __init__(self, ports):
        Process.__init__(self)
        port_list = ports.split(',')          # split CSV port list argument
        self.ports = map(int, port_list)      # convert ports to int (from string)
        self.initCluster()
        nids.register_tcp(self.handleTcpStream)  # set up callback

    def __call__(self):
        # make a singleton class
        return self

    def printableHex(self, buf):
        return ' '.join(x.encode('hex') for x in buf)

    def handleTcpStream(self, tcp):
        end_states = (nids.NIDS_CLOSE, nids.NIDS_TIMEOUT, nids.NIDS_RESET)
        logging.debug('tcps - {0} state: {1} timestamp: {2}'.format(
            str(tcp.addr), tcp.nids_state, nids.get_pkt_ts() * 1000))
        if tcp.nids_state == nids.NIDS_JUST_EST:
            # new to us, but do we care?
            ((src, sport), (dst, dport)) = tcp.addr
            # if dport in self.ports:
            logging.info('collecting: {}'.format(str(tcp.addr)))
            tcp.client.collect = 1
            tcp.server.collect = 1
        elif tcp.nids_state == nids.NIDS_DATA:
            # keep all of the stream's new data;
            # discard() informs nids how many bytes in the stream to discard
            tcp.discard(0)
            # ((src, sport), (dst, dport)) = tcp.addr
            # serverData = tcp.server.data[:tcp.server.count]
            # clientData = tcp.client.data[:tcp.client.count]
            # envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            # envelopeRegex2 = '</.+:[Ee]nvelope'
            # if serverData is None or clientData is None:
            #     tcp.discard(0)
            # else:
            #     if "Expect: 100-continue" not in serverData:
            #         tcp.discard(0)
            #     else:
            #         if (re.search(envelopeRegex, serverData, re.S | re.IGNORECASE) and
            #                 re.search(envelopeRegex2, serverData, re.S | re.IGNORECASE) and
            #                 re.search(envelopeRegex, clientData, re.S | re.IGNORECASE) and
            #                 re.search(envelopeRegex2, clientData, re.S | re.IGNORECASE)):
            #             tcpaddr = ((dst, dport), (src, sport))
            #             logging.debug("count_new: {}".format(tcp.server.count_new))
            #             logging.debug("offset server: {}".format(tcp.server.offset))
            #             self.process_ipframe(serverData, tcp.addr, self.timestamp)
            #
            #             logging.debug("count_new: {}".format(tcp.server.count_new))
            #             logging.debug("offset client: {}".format(tcp.client.offset))
            #             # flip it around to match our point of view (since this is the client)
            #             tcpaddr = ((dst, dport), (src, sport))
            #             self.process_ipframe(clientData, tcpaddr, self.timestamp)
            #             tcp.discard(tcp.server.count + tcp.client.count)
            #         else:
            #             tcp.discard(0)
        elif tcp.nids_state in end_states:
            ((src, sport), (dst, dport)) = tcp.addr
            serverData = tcp.server.data[:tcp.server.count]
            clientData = tcp.client.data[:tcp.client.count]
            # logging.debug("serverData: {0}".format(serverData))
            # logging.debug("clientData: {0}".format(clientData))
            self.timestamp = nids.get_pkt_ts() * 1000
            # Add the MultiSpeakMsgHeader since we observed way too many false positives
            # during the virtual field test.
            envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            logging.info("Serv Count: {0} Client Count {1} newc: {2} news: {3}".format(
                tcp.server.count, tcp.client.count, tcp.client.count_new, tcp.server.count_new))
            # Match even if there is a newline, since we've observed some payloads with the newline.
            # print("server is ", tcp.server.data[:tcp.server.count], "client is ", tcp.client.data[:tcp.client.count], "count new is ", tcp.server.count_new)
            if serverData is not None:
                serverData = serverData.replace("\n", "")
                if re.search(envelopeRegex, serverData, re.S | re.IGNORECASE):  # and tcp.server.count_new > 0:
                    logging.info('full message found in tcp server data')
                    payload = tcp.server.data[:tcp.server.count]
                    # tcpaddr = ((dst, dport), (src, sport))
                    logging.debug("count_new: {}".format(tcp.server.count_new))
                    logging.debug("offset server: {}".format(tcp.server.offset))
                    self.process_ipframe(payload, tcp.addr, self.timestamp)
                    tcp.discard(tcp.server.count)
                elif "multispeak" in serverData.lower():
                    logging.warning("multispeak serverData but envelope failed: {}".format(serverData))
            if clientData is not None:
                clientData = clientData.replace("\n", "")
                if re.search(envelopeRegex, clientData, re.S | re.IGNORECASE):
                    logging.info('full message found in tcp client data')
                    # flip it around to match our point of view (since this is the client)
                    tcpaddr = ((dst, dport), (src, sport))
                    payload = tcp.client.data[:tcp.client.count]
                    logging.debug("count_new client: {}".format(tcp.client.count_new))
                    logging.debug("offset client: {}".format(tcp.client.offset))
                    self.process_ipframe(payload, tcpaddr, self.timestamp)  # modified tcpaddr
                    tcp.discard(tcp.client.count)
                elif "multispeak" in clientData.lower():
                    logging.warning("multispeak clientData but envelope failed: {}".format(clientData))
            logging.debug("addr: {}".format(tcp.addr))
            logging.debug("To server:")
            logging.debug("bytes {}".format(str(tcp.server.count)))
            logging.debug("To client:")
            logging.debug("bytes: {}".format(str(tcp.client.count)))

    def process_ipframe(self, frame, tcpaddr, timestamp):
        # Note that we are no longer checking source IP addresses,
        # so we could be processing frames from other IPs if not filtered
        # before this point.
        cleansedFrame = frame.replace("\n", "").replace("\r", "").replace("\t", "")
        envelopeRegex = '</.+:[Ee]nvelope'
        match = re.search(envelopeRegex, cleansedFrame)
        if match:
            ((src, sport), (dst, dport)) = tcpaddr
            # try to find the endpoint type from the POST request
            endpointRegex = '(?<=POST\s).*(?=\sHTTP)'  # looks for POST and HTTP, and matches the URL
            match = re.search(endpointRegex, cleansedFrame, re.IGNORECASE)
            if match is not None:
                URLsplit = cleansedFrame[match.start():match.end()].split('/')  # [foo, QA_SERVER]
                endpointCodeSplit = URLsplit[-1].split('_')  # [QA, SERVER]
                endpointCode = endpointCodeSplit[0]  # QA
                logging.debug("parsed MS endpoint code: {}".format(endpointCode))
            else:
                endpointCode = 'NULL'
                logging.warning("Unable to parse endpoint code from header")
            # get the version from the SOAPAction http header
            versionRegex = '(?<=SOAPAction:\s"http:\/\/www.multispeak.org\/Version_).'
            match = re.search(versionRegex, cleansedFrame, re.IGNORECASE)
            mspVersion = 'NULL'
            if match is not None:
                mspVersion = cleansedFrame[match.start()]
                if mspVersion != '3' and mspVersion != '5':
                    mspVersion = 'NULL'
            else:
                # uses lookbehind to extract only the version number
                versionRegex2 = '(?<=SOAPAction:\shttp:\/\/www.multispeak.org\/Version_).'
                match2 = re.search(versionRegex2, cleansedFrame, re.IGNORECASE)
                if match2 is not None:
                    mspVersion = cleansedFrame[match2.start()]
                    if mspVersion != '3' and mspVersion != '5':
                        mspVersion = 'NULL'
                else:
                    versionRegex3 = '<MultiSpeakMsgHeader[^>]* Version="(\d)\.'
                    match3 = re.search(versionRegex3, cleansedFrame, re.IGNORECASE)
                    if match3 is not None:
                        mspVersion = match3.group(1)
                        if mspVersion != '3' and mspVersion != '5':
                            mspVersion = 'NULL'
            # messageNameRegex = '(<[\w:]*[Bb]ody>)(\s*)(<)(?P<MsgName>[\w|:]+)'
            messageNameRegex = '(<([\w-]+:)?body>)(\s*)(<)(([\w-]+:)?)(?P<MsgName>\w*)'
            match = re.search(messageNameRegex, cleansedFrame, re.IGNORECASE)
            if match is not None:
                messageName = match.group('MsgName').split(':')[-1]
                text_values = {'endpoint': endpointCode,
                               'messagetype': messageName,
                               'mspVersion': mspVersion}
                logging.debug("text values: {}".format(text_values))
                self.insertIntoDatabase(src, dst, timestamp, frame, text_values)
                print("inserted packet")
            else:
                print("MsgName not found")
                logging.debug("frame with no MsgName: {}".format(cleansedFrame))
        else:
            logging.debug("end of envelope not found: {}".format(cleansedFrame))
            print("end of envelope not found")
from cassandra.query import dict_factory
from cassandra.cluster import Cluster

import cassandra_client as client

KEYSPACE = "user_ratings"
TABLE = "ratings"

SESSION = Cluster(['127.0.0.1'], port=9042).connect()
client.create_keyspace(SESSION, KEYSPACE)
client.create_table(SESSION, KEYSPACE, TABLE)
SESSION.set_keyspace(KEYSPACE)
SESSION.row_factory = dict_factory

RATING_QUERY = SESSION.prepare(f"SELECT * FROM {TABLE} WHERE user_id=?")
DELETE_RATING_QUERY = SESSION.prepare(f"DELETE FROM {TABLE} WHERE user_id=?")


def get(user_id: str) -> dict:
    user = SESSION.execute(RATING_QUERY, [user_id])
    if user:
        return user[0]
    return {}


def push(rating: dict):
    client.push_table(SESSION, KEYSPACE, TABLE, rating)


def list() -> list:
    # note: intentionally shadows the builtin list within this module
    return client.list_table(SESSION, KEYSPACE, TABLE)
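# Hypothetical round-trip through the module above; the rating shape is assumed,
# and cassandra_client's push_table/list_table helpers are defined elsewhere.
push({'user_id': 'alice', 'movie_id': 42, 'rating': 5})
print(get('alice'))  # the row comes back as a dict thanks to dict_factory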
def connect():
    global session, prepared
    session = Cluster().connect()
    prepared = session.prepare(
        "SELECT release_version FROM system.local WHERE key=?")
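# Usage sketch: system.local holds a single row whose key is 'local', so this
# prints the connected node's release version.
connect()
print(session.execute(prepared, ['local']).one().release_version)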
class CassandraESSync:
    mappings = []
    cassandra_nodes = []
    cassandra_keyspace = None
    cassandra_cfs = []
    es_nodes = []
    es_index = None
    es_types = []
    cassandra_client = None
    es_client = None
    # stores the last time each cf and index were synced, to avoid unnecessary queries
    last_synced = {'cassandra': {}, 'es': {}}

    def __init__(self, config):
        self.mappings = config["mappings"]
        self.cassandra_nodes = config['cassandra']['nodes']
        self.cassandra_keyspace = config['cassandra']['keyspace']
        self.cassandra_cfs = config['cassandra']['column_families']
        self.es_nodes = config['elasticsearch']['nodes']
        self.es_index = config['elasticsearch']['index']
        self.es_types = config['elasticsearch']['types']
        self.cassandra_client = Cluster().connect(self.cassandra_keyspace)
        self.es_client = elasticsearch.Elasticsearch()

    def sync_databases(self):
        for mapping in self.mappings:
            cassandra_cf, es_type = mapping
            self.sync_cf_type(cassandra_cf, es_type)

    def sync_cf_type(self, cassandra_cf, es_type):
        cf_id_column = self.cassandra_cfs[cassandra_cf]['id']  # column storing the document's uid
        cf_timestamp_column = self.cassandra_cfs[cassandra_cf]['timestamp']  # column storing the document's timestamp
        index_id_column = self.es_types[es_type]['id']  # field storing the document's uid
        index_timestamp_column = self.es_types[es_type]['timestamp']  # field storing the document's timestamp
        cf_data_fields = self.cassandra_cfs[cassandra_cf]['columns']
        cf_fields = [cf_id_column, cf_timestamp_column] + cf_data_fields
        type_data_fields = self.es_types[es_type]['columns']

        if cassandra_cf in self.last_synced['cassandra']:
            cf_start_time, cf_end_time = self.last_synced['cassandra'][cassandra_cf], time.time()
        else:
            cf_start_time, cf_end_time = None, None
        if es_type in self.last_synced['es']:
            index_start_time, index_end_time = self.last_synced['es'][es_type], time.time()
        else:
            index_start_time, index_end_time = None, None

        cassandra_data_query = 'SELECT %s, %s FROM %s' % (cf_id_column, cf_timestamp_column, cassandra_cf)
        range_filter = {}
        if index_start_time and index_end_time:
            range_filter = self.get_es_range_filter(index_timestamp_column, index_start_time, index_end_time)

        self.cassandra_client.set_keyspace(self.cassandra_keyspace)
        cassandra_data = self.cassandra_client.execute(cassandra_data_query)
        self.last_synced['cassandra'][cassandra_cf] = time.time()

        es_data = []
        # self.es_client.search(index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], body=range_filter)
        es_scan = scan(self.es_client, index=self.es_index, doc_type=es_type,
                       fields=[index_id_column, index_timestamp_column], query=range_filter)
        self.last_synced['es'][es_type] = time.time()
        for data in es_scan:
            es_data.append(data)

        all_data = {}
        ids_to_insert_on_cassandra = []
        ids_to_update_on_cassandra = []
        ids_to_insert_on_es = []
        ids_to_update_on_es = []

        # Because we can't make a range query on a non-primary key in cassandra,
        # we have to retrieve everything and then check the timestamp by hand.
        for document in cassandra_data:
            doc_id, doc_timestamp = str(document[0]), int(calendar.timegm(document[1].utctimetuple()))
            if not (cf_start_time and cf_end_time):
                all_data[doc_id] = [doc_timestamp, None]
            elif cf_start_time <= doc_timestamp <= cf_end_time:
                all_data[doc_id] = [doc_timestamp, None]

        for document in es_data:
            if "fields" in document:
                if index_id_column == '_id':
                    # special case - _id is not inside fields. there must be a better way to do this ;(
                    doc_id, doc_timestamp = document[index_id_column], int(document['fields'][index_timestamp_column][0])
                else:
                    doc_id, doc_timestamp = document['fields'][index_id_column], int(document['fields'][index_timestamp_column][0])
                if doc_id in all_data:
                    all_data[doc_id][1] = doc_timestamp
                else:
                    all_data[doc_id] = [None, doc_timestamp]

        for uid in all_data:
            cassandra_ts, es_ts = all_data[uid]
            if cassandra_ts and es_ts:
                if cassandra_ts > es_ts:
                    # same id, cassandra is the most recent. update that data on es.
                    ids_to_update_on_es.append(uid)
                elif es_ts > cassandra_ts:
                    # same id, es is the most recent. update that data on cassandra.
                    ids_to_update_on_cassandra.append(uid)
            elif cassandra_ts:
                # present only on cassandra. add to es.
                ids_to_insert_on_es.append(uid)
            elif es_ts:
                # present only on es. add to cassandra.
                ids_to_insert_on_cassandra.append(uid)

        if ids_to_insert_on_es or ids_to_update_on_es:
            actions = []
            from_cassandra_to_es = self.get_cassandra_documents_by_id(
                cassandra_cf, cf_fields, cf_id_column, ids_to_insert_on_es + ids_to_update_on_es)
            for document in from_cassandra_to_es:
                data = {}
                for i in range(len(cf_data_fields)):
                    data[type_data_fields[i]] = getattr(document, cf_data_fields[i])
                actions.append(self.get_es_bulk_action(
                    es_type, index_id_column, getattr(document, cf_id_column),
                    index_timestamp_column, getattr(document, cf_timestamp_column), data))
            bulk(self.es_client, actions)  # send all inserts/updates to es at once

        if ids_to_insert_on_cassandra or ids_to_update_on_cassandra:
            batch = BatchStatement()
            type_fields = type_data_fields + [index_id_column, index_timestamp_column]
            ids_filter = self.get_es_ids_filter(es_type, ids_to_insert_on_cassandra + ids_to_update_on_cassandra)
            from_es_to_cassandra = self.es_client.search(
                index=self.es_index, doc_type=es_type,
                fields=type_data_fields + [cf_timestamp_column], body=ids_filter)
            prepared_insert_statement = self.get_prepared_cassandra_insert_statement(cassandra_cf, cf_fields)
            prepared_update_statement = self.get_prepared_cassandra_update_statement(cassandra_cf, cf_id_column, cf_fields[1:])
            for document in from_es_to_cassandra['hits']['hits']:
                # this makes me a saaaad panda
                id_value = document[index_id_column] if index_id_column == '_id' else document["fields"][index_id_column]
                es_data = [UUID(id_value),
                           datetime.datetime.utcfromtimestamp(int(document['fields'][index_timestamp_column][0]))]
                for field in type_data_fields:
                    es_data.append(document['fields'][field][0])
                if id_value in ids_to_insert_on_cassandra:
                    batch.add(prepared_insert_statement, tuple(es_data))
                else:
                    batch.add(prepared_update_statement, tuple(es_data[1:] + [UUID(id_value)]))
            self.cassandra_client.execute(batch)

    def get_cassandra_documents_by_id(self, cf, fields, id_column, ids):
        from_cassandra_to_es_query = "SELECT %s FROM %s WHERE %s IN (%s)" % (
            ",".join(fields), cf, id_column, ",".join(ids))
        return self.cassandra_client.execute(from_cassandra_to_es_query)

    def get_prepared_cassandra_insert_statement(self, cf, cols):
        base_insert_statement = "INSERT INTO %s (%s) VALUES (%s)" % (
            cf, ",".join(cols), ",".join(["?"] * len(cols)))
        return self.cassandra_client.prepare(base_insert_statement)

    def get_prepared_cassandra_update_statement(self, cf, id_column, fields):
        base_update_statement = "UPDATE %s SET %s WHERE %s = ?" % (
            cf, ",".join([field + " = ?" for field in fields]), id_column)
        return self.cassandra_client.prepare(base_update_statement)

    def get_es_range_filter(self, col, start, end):
        return {"filter": {"range": {col: {"gte": start, "lte": end}}}}

    def get_es_ids_filter(self, es_type, ids):
        return {"filter": {"ids": {"type": es_type, "values": ids}}}

    def get_es_bulk_action(self, es_type, id_column, id_value, timestamp_column, timestamp_value, data):
        if isinstance(timestamp_value, datetime.datetime):
            timestamp_value = calendar.timegm(timestamp_value.utctimetuple())
        id_value = str(id_value)
        timestamp_value = str(timestamp_value)
        data[timestamp_column] = timestamp_value
        action = {}
        action['_index'] = self.es_index
        action['_type'] = es_type
        action[id_column] = id_value
        action['_source'] = data
        return action
class Loader:
    start = 0

    def __init__(self, name):
        # set up logging
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('dse', 'dse.log')
        # start timer
        Loader.start = datetime.now()
        self.index = name
        self.stat = Statement(name)
        self.filename = mapper[name]['load']
        self.pool = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10, queue_size=30)
        self.__init_dse()

    # init dse, create keyspace and table, prepare cql and set consistency level
    def __init_dse(self):
        self.session = Cluster(DSE_IP).connect()
        self.session.execute(self.stat.create_ks)
        self.session.execute(self.stat.create_tb)
        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM

    def load(self):
        if self.index == 'reddit':
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool.submit(self.__insert_data, line)
                except StopIteration:
                    break
                except Exception:
                    continue
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename)
            end = df.index.max()
            try:
                for line in zip(range(0, end + 1), df['userId'], df['productId'],
                                df['rating'], df['title'], df['comment'], df['timestamp']):
                    self.pool.submit(self.__insert_data, line)
            except Exception as e:
                print(e)
        # shutdown
        self.pool.shutdown()
        # create search index
        end = datetime.now()
        self.log.info('Insert data cost time: {}'.format(end - Loader.start))
        self.log.info('create search index')
        self.session.execute(self.stat.create_index, timeout=60000)
        self.session.shutdown()
        self.log.info('## Session closed')
        # print information
        self.log.info('Create search index cost time: {}'.format(datetime.now() - end))
        self.log.info('Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')

    def __line_generator(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter, line in enumerate(f):
                try:
                    if counter % 100000 == 0:
                        self.log.info(str(counter))
                    if counter % 1000000 == 0:
                        self.log.info('{}'.format(datetime.now() - Loader.start))
                        self.log.info('')
                    yield line
                except Exception as e:
                    self.log.warning(e)
                    continue

    def __insert_data(self, line):
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'],
                    line['subreddit_id'], line['author'], line['body'],
                    int(line['created_utc'])]
        elif self.index == 'amazon':
            # amazon rows are not formatted yet; skip rather than execute with unbound data
            return
        res = self.session.execute(self.cql_prepared, data)
        return