class DbUtils:
    def __init__(self, connection_info):
        node_ips, namespace, user_id, password = connection_info
        auth_provider = PlainTextAuthProvider(username=user_id, password=password)
        self._session = Cluster(node_ips, auth_provider=auth_provider).connect(namespace)

    def insert_model_predictions(self, extracted_attributes):
        query = "insert into ae_batch_predictions1_tbl" \
                "(batch_id,attribute,item_id,title,long_description," \
                "short_description,model_prediction,trusted_prediction) " \
                "VALUES" \
                " (%s,%s,%s,%s,%s,%s,%s,%s)"
        for extracted_attribute in extracted_attributes:
            self._session.execute(query, extracted_attribute)
        return {"Message": "Insertion Successful"}

    def fetch_label_metrics(self, data):
        attribute, batch_id = data
        query = "select item_id,trusted_prediction,model_prediction from attributeextraction.ae_batch_predictions1_tbl " \
                "where attribute='{}' and batch_id='{}'".format(attribute, batch_id)
        logger.info('Read query : %s', query)
        result = self._session.execute(query)
        return result

def create_schema(arguments):
    session = Cluster([arguments.host]).connect()
    session.execute("USE %s;" % arguments.keyspace)
    query = '''CREATE COLUMNFAMILY {columnfamily} (
        key text PRIMARY KEY,
        color text,
        size text,
        qty int)'''.format(columnfamily=arguments.columnfamily)
    session.execute(query)

def test_custom_raw_row_results_all_types(self):
    """
    Test to validate that custom protocol handlers work with varying types of results

    Connect, create a table with all sorts of data. Query the data, make sure the
    custom results handler is used correctly.

    @since 2.7
    @jira_ticket PYTHON-313
    @expected_result custom protocol handler is invoked with various result types

    @test_category data_types:serialization
    """
    # Connect using a custom protocol handler that tracks the various types the result message is used with.
    session = Cluster(protocol_version=PROTOCOL_VERSION).connect(keyspace="custserdes")
    session.client_protocol_handler = CustomProtocolHandlerResultMessageTracked
    session.row_factory = tuple_factory

    colnames = create_table_with_all_types("alltypes", session, 1)
    columns_string = ", ".join(colnames)

    # verify data
    params = get_all_primitive_params(0)
    results = session.execute("SELECT {0} FROM alltypes WHERE primkey=0".format(columns_string))[0]
    for expected, actual in zip(params, results):
        self.assertEqual(actual, expected)

    # Ensure we have covered the various primitive types
    self.assertEqual(len(CustomResultMessageTracked.checked_rev_row_set), len(PRIMITIVE_DATATYPES) - 1)
    session.shutdown()

def insert_data(headers, data):
    ## Connect to Scylla cluster and create schema
    # session = cassandra.cluster.Cluster(SCYLLA_IP).connect()
    print("")
    print("## Connecting to Scylla cluster -> Creating schema")
    session = Cluster(SCYLLA_IP).connect()
    session.execute(create_ks)
    session.execute(create_t1)

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch -> Creating 'Catalog' index")
    es = Elasticsearch(ES_IP)

    ## Create Elasticsearch index. Ignore 400 = IF NOT EXIST
    es.indices.create(index="catalog", ignore=400)

    ## Non-prepared CQL statement
    # cql = "INSERT INTO catalog.apparel(sku,brand,group,sub_group,color,size,gender) VALUES(%(sku)s,%(brand)s,%(group)s,%(sub_group)s,%(color)s,%(size)s,%(gender)s)"

    ## Prepared CQL statement
    print("")
    print("## Preparing CQL statement")
    cql = "INSERT INTO catalog.apparel (sku,brand,group,sub_group,color,size,gender) VALUES (?,?,?,?,?,?,?) using TIMESTAMP ?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM

    print("")
    print("## Insert csv content into Scylla and Elasticsearch")
    for d in data:
        # See if we need to add code to wait for the ack. This should be synchronous.
        # Also, might need to switch to prepared statements to set the consistency level for sync requests.
        session.execute(cql_prepared, d)
        res = es.index(index="catalog", doc_type="apparel", id=d["sku"], body=d)

    ## After all the inserts, make a refresh, just in case
    print("")
    print("## Inserts completed, refreshing index")
    es.indices.refresh(index="catalog")
    print("")

def read_inserts_at_level(self, proto_ver):
    session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
    try:
        results = session.execute('select * from t')[0]
        self.assertEqual("[SortedSet([1, 2]), SortedSet([3, 5])]", str(results.v))

        results = session.execute('select * from u')[0]
        self.assertEqual("SortedSet([[1, 2], [3, 5]])", str(results.v))

        results = session.execute('select * from v')[0]
        self.assertEqual("{SortedSet([1, 2]): [1, 2, 3], SortedSet([3, 5]): [4, 5, 6]}", str(results.v))

        results = session.execute('select * from w')[0]
        self.assertEqual("typ(v0=OrderedMapSerializedKey([(1, [1, 2, 3]), (2, [4, 5, 6])]), v1=[7, 8, 9])", str(results.v))
    finally:
        session.cluster.shutdown()

def initialize_connection(self):
    session = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
    session.row_factory = panda_factory
    query = "SELECT epoch_time, post_count, byte_transfer, get_count, requests, visits FROM " + self.source
    DataFrame = session.execute(query).sort(
        columns=['epoch_time', 'post_count', 'byte_transfer', 'get_count', 'requests', 'visits'],
        ascending=[1, 0, 0, 0, 0, 0])
    # create instance of the retrieve_insert class
    process = retrieve_insert(DataFrame, session, self.dest)
    process.retrieve_variables()
    return 1

def __init__(self, nodes):
    """Establishes a connection to the ScyllaDB database, creates the "wdm"
    keyspace if it does not exist, and creates or updates the Payments table.
    """
    while True:
        try:
            session = Cluster(contact_points=nodes).connect()
            break
        except Exception:
            sleep(1)
    session.execute("""
        CREATE KEYSPACE IF NOT EXISTS wdm
        WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '3' }
    """)
    session.default_consistency_level = ConsistencyLevel.QUORUM
    connection.setup(nodes, "wdm")
    sync_table(Payments)

def main():
    logging.basicConfig(level=logging.INFO)
    logging.info("Using table: %s,%s", KEYSPACE, TABLE)
    session = Cluster(["127.0.0.1"]).connect(KEYSPACE)

    validators = {}
    for val in TYPES:
        logging.info("Creating table for '%s'", val)
        session.execute("CREATE TABLE {} (key int primary key, value {})".format(TABLE, val))
        row = session.execute("SELECT * FROM system.schema_columns WHERE "
                              "keyspace_name = '{}' AND columnfamily_name = "
                              "'{}' AND column_name = 'value'"
                              .format(KEYSPACE, TABLE))[0]
        validator = row.validator
        logging.info("validator for '%s' = '%s'", val, validator)
        logging.info("Dropping table...")
        session.execute("DROP TABLE {}".format(TABLE))
        validators[val] = validator

    print(json.dumps(validators, indent=1, sort_keys=True))

def main(keyspace, output_keyspace):
    # connect to the output keyspace
    session = Cluster(cluster_seeds).connect(output_keyspace)

    # load the orders table
    orders = load_table(keyspace, 'orders').cache()
    orders.registerTempTable('orders')

    # load the lineitem table
    line_item = load_table(keyspace, 'lineitem')
    line_item.registerTempTable('lineitem')

    # load the part table
    part = load_table(keyspace, 'part')
    part.registerTempTable('part')

    # Create the orders_parts table if it doesn't exist and truncate all rows before loading
    create_statement = 'CREATE TABLE IF NOT EXISTS orders_parts (orderkey int,custkey int,orderstatus text,totalprice decimal,orderdate date,order_priority text,clerk text,ship_priority int,comment text,part_names set<text>, PRIMARY KEY (orderkey));'
    session.execute(create_statement)
    truncate_statement = 'TRUNCATE orders_parts;'
    session.execute(truncate_statement)

    # Join all three tables to get the desired output
    join_statement = 'SELECT o.orderkey, p.name FROM orders o JOIN lineitem l ON o.orderkey==l.orderkey JOIN part p ON p.partkey == l.partkey'
    join_result = spark.sql(join_statement)

    # group by order key and get the associated part names as a set
    result = join_result.groupBy(join_result['o.orderkey']).agg(
        functions.collect_set(join_result['p.name'])).withColumnRenamed(
        "collect_set(name)", "part_names").orderBy(join_result['o.orderkey'], ascending=True).cache()
    result.registerTempTable('order_part')

    # Finally join the orders table and the order_part table
    final = spark.sql(
        "SELECT a.*, b.part_names FROM orders a JOIN order_part b ON (a.orderkey = b.orderkey) order by a.orderkey asc"
    )

    # Write the result into the Cassandra table
    final.write.format("org.apache.spark.sql.cassandra").options(
        table='orders_parts', keyspace=output_keyspace).save()

def load(datafile, ks_name, table_name):
    with open(datafile) as f:
        reader = csv.reader(f)
        session = Cluster().connect()
        header = next(reader)
        stmt = (
            "INSERT INTO {ks}.{tab} ({header_spec}) VALUES ({qs});").format(
                ks=ks_name,
                tab=table_name,
                header_spec=', '.join(header),
                qs=', '.join(['?' for _ in header]))
        print('preparing "{stmt}"'.format(stmt=stmt), file=sys.stderr)
        prepared = session.prepare(stmt)
        data = csv_handle_to_nested_list(reader)
        print('About to load {n} rows'.format(n=len(data)))
        for row in data:
            session.execute(prepared, row)

def test_custom_raw_uuid_row_results(self):
    """
    Test to validate that custom protocol handlers work with raw row results

    Connect and validate that the normal protocol handler is used.
    Re-connect and validate that the custom protocol handler is used.
    Re-connect and validate that the normal protocol handler is used.

    @since 2.7
    @jira_ticket PYTHON-313
    @expected_result custom protocol handler is invoked appropriately.

    @test_category data_types:serialization
    """
    # Ensure that we get a normal uuid back first
    session = Cluster(protocol_version=PROTOCOL_VERSION).connect(keyspace="custserdes")
    session.row_factory = tuple_factory

    result_set = session.execute("SELECT schema_version FROM system.local")
    result = result_set.pop()
    uuid_type = result[0]
    self.assertEqual(type(uuid_type), uuid.UUID)

    # use our custom protocol handler
    session.client_protocol_handler = CustomTestRawRowType
    session.row_factory = tuple_factory

    result_set = session.execute("SELECT schema_version FROM system.local")
    result = result_set.pop()
    raw_value = result.pop()
    self.assertTrue(isinstance(raw_value, binary_type))
    self.assertEqual(len(raw_value), 16)

    # Ensure that we get a normal uuid back when we re-connect
    session.client_protocol_handler = ProtocolHandler
    result_set = session.execute("SELECT schema_version FROM system.local")
    result = result_set.pop()
    uuid_type = result[0]
    self.assertEqual(type(uuid_type), uuid.UUID)
    session.shutdown()

def setUp(self):
    temp = Cluster(['127.0.0.1']).connect()
    temp.execute('''
        CREATE KEYSPACE IF NOT EXISTS %s
        WITH REPLICATION = {
            'class' : 'SimpleStrategy',
            'replication_factor' : 1
        };
    ''' % (self.keyspace))
    temp = Cluster(['127.0.0.1']).connect(self.keyspace)
    temp.execute('''
        DROP TABLE IF EXISTS superdupertable;
    ''')
    temp.execute('''
        CREATE TABLE IF NOT EXISTS superdupertable (
            uid UUID,
            name text,
            body text,
            last_change timestamp,
            PRIMARY KEY (uid, last_change)
        );
    ''')
    self.cass = CassandraSync(self.keyspace)
    temp = Elasticsearch()
    temp.indices.create(
        self.index_name,
        {"settings": {"number_of_shards": 2, "number_of_replicas": 1}},
        ignore=400)
    self.elas = ElasticSync(self.index_name)

def test_patch_unpatch(self):
    # Test patch idempotence
    patch()
    patch()

    tracer = get_dummy_tracer()
    Pin.get_from(Cluster).clone(tracer=tracer).onto(Cluster)

    session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
    session.execute(self.TEST_QUERY)

    spans = tracer.writer.pop()
    assert spans, spans
    eq_(len(spans), 1)

    # Test unpatch
    unpatch()

    session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
    session.execute(self.TEST_QUERY)

    spans = tracer.writer.pop()
    assert not spans, spans

    # Test patch again
    patch()
    Pin.get_from(Cluster).clone(tracer=tracer).onto(Cluster)

    session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
    session.execute(self.TEST_QUERY)

    spans = tracer.writer.pop()
    assert spans, spans

class CassandraConnector(object):
    """
    Handles exporting data from Cassandra to Neo4j using CSV as an intermediary
    """

    def __init__(self, keyspace_name, tables=None, schema_file=None, queries=None):
        self.KEYSPACE = keyspace_name
        self.session = Cluster().connect(self.KEYSPACE)
        self.keyspace_metadata = self.session.cluster.metadata.keyspaces[self.KEYSPACE]
        self.tables = tables
        self.schema_file = schema_file

    def getTables(self):
        if self.tables:
            return self.tables
        else:
            return self.session.cluster.metadata.keyspaces[self.KEYSPACE].tables.keys()

    def getColumnsForTable(self, table):
        return self.session.cluster.metadata.keyspaces[self.KEYSPACE].tables[table].columns.keys()

    def parse(self):
        keyspace = self.session.cluster.metadata.keyspaces[self.KEYSPACE]
        parser = SchemaParser(keyspace.export_as_string())
        parser.parse()
        sys.exit("Generated schema.yaml file.")

    def export(self):
        tableNames = self.getTables()
        fileNames = [t + "_results.csv" for t in tableNames]
        for t in tableNames:
            results_file = codecs.open(t + "_results.csv", encoding='utf-8', mode='w+')
            rows = self.session.execute('SELECT * FROM ' + t)
            writer = csv.writer(results_file)
            writer.writerow(self.getColumnsForTable(t))
            writer.writerows([(e for e in row) for row in rows])
        cypher_queries_gen = CypherQueriesGenerator(self.keyspace_metadata, self.schema_file)
        cypher_queries_gen.generate()
        cypher_queries_gen.build_queries(tableNames, fileNames)

def insert_cassandra(self, session, source, country, country_count):
    query = "SELECT id FROM main_count"
    print query
    session2 = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
    session2.default_timeout = 100
    data = session2.execute(query)
    for row in data:
        uid = row[0]
        # retrieve the id from the source table and update the country_count (list) and country (list) columns
        print len(country), len(country_count)
        session.execute("UPDATE " + source + " SET country=%s, country_count=%s WHERE id=%s",
                        parameters=[country, country_count, uid])
    return 1

def run_inserts_at_version(self, proto_ver):
    session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
    try:
        p = session.prepare('insert into t (k, v) values (?, ?)')
        session.execute(p, (0, [{1, 2}, {3, 5}]))

        p = session.prepare('insert into u (k, v) values (?, ?)')
        session.execute(p, (0, {(1, 2), (3, 5)}))

        p = session.prepare('insert into v (k, v, v1) values (?, ?, ?)')
        session.execute(p, (0, {(1, 2): [1, 2, 3], (3, 5): [4, 5, 6]}, (123, 'four')))

        p = session.prepare('insert into w (k, v) values (?, ?)')
        session.execute(p, (0, ({1: [1, 2, 3], 2: [4, 5, 6]}, [7, 8, 9])))
    finally:
        session.cluster.shutdown()

def get_todays_tweet(self):
    word_data, sorted_by_loc = {}, []
    timestamp = (datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') +
                 datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H'))
    cassandra = Cluster([
        config.get("cassandra.host1"),
        config.get("cassandra.host2"),
        config.get("cassandra.host3")
    ]).connect('insight')
    tweeted_words = cassandra.execute(self.QUERY_TODAYS_TWEET % timestamp)
    # tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % '15020221')
    # for (yymmddhh, timestamp, data, lat, lng) in tweeted_words:
    for (yymmddhh, location, string, data, timestamp) in tweeted_words:
        if location not in word_data:
            word_data[location] = {}
        for word, count in data.iteritems():
            word = word.encode("utf-8")
            if word not in word_data[location]:
                word_data[location][word] = count
            else:
                word_data[location][word] += count
    for location in word_data:
        # In-place sort by number of words in descending order
        sorted_by_loc = sorted(word_data[location].items(), key=operator.itemgetter(1))
        sorted_by_loc.reverse()
        # Re-enter sorted data
        word_data[location] = {}
        for word_pair in sorted_by_loc:
            word = word_pair[0]
            count = word_pair[1]
            word_data[location][word] = count
    return word_data

class Searcher:
    start = 0

    def __init__(self, name):
        self.filename = mapper[name]['search']
        Searcher.start = datetime.now()
        self.log1 = getLogger('time', 'search_time.log')
        self.log2 = getLogger('result', 'search_result.log')
        self.log2.info('## connecting to cassandra cluster')
        self.session = Cluster(DSE_IP).connect()
        cql = "SELECT * FROM reddit.comment WHERE solr_query=?"
        self.solr_query = '{{"q":"body:*{0}*"}}'
        self.cql_prepared = self.session.prepare(cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.ONE

    def search(self):
        counter = 0
        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter, line in enumerate(f):
                try:
                    line = line.strip('\n')
                    if counter % 100 == 0:
                        self.log1.info(str(counter))
                    if counter % 1000 == 0:
                        self.log1.info('{}'.format(datetime.now() - Searcher.start))
                        self.log1.info('')
                    self.log2.info('({})'.format(line))
                    # format query body
                    data = self.solr_query.format(line)
                    res = self.session.execute(self.cql_prepared, [data], timeout=60000)
                    self.log2.info(res[0])
                except Exception as e:
                    self.log2.warn(e)
        self.log1.info('total time:{}'.format(datetime.now() - Searcher.start))

def get_user_timeline(user: str, db_session: Cluster):
    """ Retrieve the 20 most recent photos from a user's timeline

    User timelines are stored in the table home_status_updates

    Arguments:
        user: Username to retrieve
        db_session: Cassandra instance

    Returns:
        20 most recent database records
    """
    # Note: the username literal is redacted ('******') in the source;
    # in practice the query would filter on the `user` argument.
    sql_query = f"""
        SELECT * FROM home_status_updates
        WHERE timeline_username = '******'
        LIMIT 20;"""
    timeline = db_session.execute(sql_query)
    return timeline

def initialize_connection(self):
    session = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
    session.default_timeout = 100
    query = "SELECT host, id FROM " + self.table
    statement = SimpleStatement(query)
    getdata = session.execute(statement)
    hosts, id_val, count = [], [], 0
    count_list = {}
    for data in getdata:
        value = str(data[0]).strip()
        id_val.append(data[1])
        if value.find(',') == -1:
            hosts.append(value)
        else:
            hosts.append(value.split(",")[0])
        count += 1
    print count
    # create instance of the retrieve_location class
    process = retrieve_location()
    # function calls
    count_list = process.get_location(session, self.table, hosts, id_val)
    self.insert_cassandra(session, self.source, count_list.keys(), count_list.values())
    return 1

def main(input_keyspace, output_keyspace):
    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = "
        "{ 'class': 'SimpleStrategy', 'replication_factor': 2}" % output_keyspace)
    session.set_keyspace(output_keyspace)
    session.execute("CREATE TABLE IF NOT EXISTS orders_parts ("
                    "orderkey int, "
                    "custkey int, "
                    "orderstatus text, "
                    "totalprice decimal, "
                    "orderdate date, "
                    "order_priority text, "
                    "clerk text, "
                    "ship_priority int, "
                    "comment text, "
                    "part_names set<text>, "
                    "PRIMARY KEY (orderkey))")
    session.execute('TRUNCATE orders_parts')
    session.shutdown()

    df_part = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=input_keyspace).load()
    df_orders = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=input_keyspace).load().cache()
    df_lineitem = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=input_keyspace).load()

    joined_df = df_orders.join(df_lineitem, df_orders.orderkey == df_lineitem.orderkey)
    final_joined_df = joined_df.join(df_part, df_part.partkey == joined_df.partkey)

    selected_df = final_joined_df.groupBy(df_orders['orderkey'], 'totalprice').agg(
        functions.collect_set(df_part['name']).alias('part_names')).drop('totalprice')

    output_df = selected_df.join(df_orders, "orderkey")
    output_df.write.format("org.apache.spark.sql.cassandra").options(
        table="orders_parts", keyspace=output_keyspace).save()

def insert_data():
    from cassandra.cluster import Cluster
    from cassandra import AlreadyExists

    # Connect to the Cassandra cluster on the "test" keyspace
    session = Cluster([
        '192.168.101.140', '192.168.101.141', '192.168.101.142', '192.168.101.143'
    ]).connect("test")
    try:
        # First create the table
        session.execute("CREATE TABLE test1 (id int PRIMARY KEY, col1 text)")
        # Insert data
        insertar = session.prepare("INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
    except AlreadyExists:
        # Insert data into the already existing table
        insertar = session.prepare("INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)

def main(input_dir, keyspace_name, table_name):
    rdd = sc.textFile(input_dir)
    preprocess_rdd = rdd.map(lambda line: preprocess(line))
    cleaned_rdd = preprocess_rdd.filter(lambda x: x is not None)
    schema = "id STRING, host STRING, datetime TIMESTAMP, path STRING, bytes INT"
    # sch = ['id', 'host', 'datetime', 'path', 'bytes']
    df = spark.createDataFrame(data=cleaned_rdd, schema=schema)

    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS " + keyspace_name +
        " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}")
    session.set_keyspace(keyspace_name)
    session.execute("DROP TABLE IF EXISTS " + keyspace_name + "." + table_name)
    session.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name +
        " (id UUID, host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, PRIMARY KEY (host, id))"
    )
    # session.execute("TRUNCATE " + table_name)
    session.shutdown()

    df.write.format("org.apache.spark.sql.cassandra").options(
        table=table_name, keyspace=keyspace_name).save()

class CassandraService:
    def __init__(self, keyspace='data'):
        self.session = Cluster().connect(keyspace)
        self.user_insert_stmt = self.prepare_user_insert_statement()
        self.tweet_insert_stmt = self.prepare_tweet_insert_statement()
        self.exception_insert_stmt = self.prepare_exception_insert_statement()

    def save_user(self, user: User):
        self.session.execute(self.user_insert_stmt, vars(user))

    def save_tweet(self, tweet: Tweet):
        self.session.execute(self.tweet_insert_stmt, vars(tweet))

    def save_exception(self, exception_data: dict):
        self.session.execute(self.exception_insert_stmt, exception_data)

    def prepare_user_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019 (id, captured_at, created_at, description, entities,
                favourites_count, followers_count, following_count, friends_count, listed_count,
                name, pinned_tweet_id, profile_image_url, protected, tweets_count, url, user_name)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)

    def prepare_tweet_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_tweets (id, captured_at, created_at, user_id, timezone,
                content, link, retweet, mentions, urls, photos, video, lang,
                replies_count, retweets_count, likes_count, hashtags, cashtags)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)

    def prepare_exception_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_exceptions (user, exception)
            VALUES (?, ?)
        """)

def current_time():
    return (datetime.now() - datetime(1970, 1, 1)).total_seconds()


for lines in args.lines:
    for columns in args.columns:
        initial_size = 0
        print("Deleting previous data")
        try:
            subprocess.check_output("rm " + args.queue_dir + " -R", shell=True)
        except subprocess.CalledProcessError:
            pass
        try:
            session = Cluster(['localhost']).connect('kairosdb')
            session.default_timeout = 1200
            session.execute('drop keyspace kairosdb;')
            print("Data deleted.")
        except NoHostAvailable:
            print("No previous data exists. Skipping delete.")

        print("Starting Kairosdb")
        kairos = subprocess.Popen([args.kairos_path, "run"],
                                  stderr=subprocess.DEVNULL,
                                  stdout=subprocess.DEVNULL)
        time.sleep(4)
        while True:
            try:
                subprocess.check_output("curl localhost:8080", shell=True,
                                        stderr=subprocess.DEVNULL)
            except subprocess.CalledProcessError:
                print("Kairos is not ready yet. Checking again in 2 seconds.")

    def __str__(self):
        # just extracting request count from the size stats (which are recorded on all requests)
        request_sizes = dict(self.requests)
        count = request_sizes.pop('count')
        return "%d requests (%d errors)\nRequest size statistics:\n%s" % (
            count, self.errors, pp.pformat(request_sizes))


# connect a session
session = Cluster().connect()

# attach a listener to this session
ra = RequestAnalyzer(session)

session.execute("SELECT release_version FROM system.local")
session.execute("SELECT release_version FROM system.local")

print(ra)
# 2 requests (0 errors)
# Request size statistics:
# { '75percentile': 74,
#   '95percentile': 74,
#   '98percentile': 74,
#   '999percentile': 74,
#   '99percentile': 74,
#   'max': 74,
#   'mean': 74.0,
#   'median': 74.0,
#   'min': 74,
#   'stddev': 0.0}

class CassandraESSync:
    mappings = []
    cassandra_nodes = []
    cassandra_keyspace = None
    cassandra_cfs = []
    es_nodes = []
    es_index = None
    es_types = []
    cassandra_client = None
    es_client = None
    last_synced = {'cassandra': {}, 'es': {}}  # stores the last time each cf and index were synced, to avoid unnecessary queries

    def __init__(self, config):
        self.mappings = config["mappings"]
        self.cassandra_nodes = config['cassandra']['nodes']
        self.cassandra_keyspace = config['cassandra']['keyspace']
        self.cassandra_cfs = config['cassandra']['column_families']
        self.es_nodes = config['elasticsearch']['nodes']
        self.es_index = config['elasticsearch']['index']
        self.es_types = config['elasticsearch']['types']
        self.cassandra_client = Cluster().connect(self.cassandra_keyspace)
        self.es_client = elasticsearch.Elasticsearch()

    def sync_databases(self):
        for mapping in self.mappings:
            cassandra_cf, es_type = mapping
            self.sync_cf_type(cassandra_cf, es_type)

    def sync_cf_type(self, cassandra_cf, es_type):
        cf_id_column = self.cassandra_cfs[cassandra_cf]['id']  # column storing the document's uid
        cf_timestamp_column = self.cassandra_cfs[cassandra_cf]['timestamp']  # column storing the document's timestamp
        index_id_column = self.es_types[es_type]['id']  # field storing the document's id
        index_timestamp_column = self.es_types[es_type]['timestamp']  # field storing the document's timestamp
        cf_data_fields = self.cassandra_cfs[cassandra_cf]['columns']
        cf_fields = [cf_id_column, cf_timestamp_column] + cf_data_fields
        type_data_fields = self.es_types[es_type]['columns']

        if cassandra_cf in self.last_synced['cassandra']:
            cf_start_time, cf_end_time = self.last_synced['cassandra'][cassandra_cf], time.time()
        else:
            cf_start_time, cf_end_time = None, None
        if es_type in self.last_synced['es']:
            index_start_time, index_end_time = self.last_synced['es'][es_type], time.time()
        else:
            index_start_time, index_end_time = None, None

        cassandra_data_query = 'SELECT %s, %s FROM %s' % (cf_id_column, cf_timestamp_column, cassandra_cf)
        range_filter = {}
        if index_start_time and index_end_time:
            range_filter = self.get_es_range_filter(index_timestamp_column, index_start_time, index_end_time)

        self.cassandra_client.set_keyspace(self.cassandra_keyspace)
        cassandra_data = self.cassandra_client.execute(cassandra_data_query)
        self.last_synced['cassandra'][cassandra_cf] = time.time()

        es_data = []
        # self.es_client.search(index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], body=range_filter)
        es_scan = scan(self.es_client, index=self.es_index, doc_type=es_type,
                       fields=[index_id_column, index_timestamp_column], query=range_filter)
        self.last_synced['es'][es_type] = time.time()
        for data in es_scan:
            es_data.append(data)

        all_data = {}
        ids_to_insert_on_cassandra = []
        ids_to_update_on_cassandra = []
        ids_to_insert_on_es = []
        ids_to_update_on_es = []

        # because we can't make a range query on a non-primary key on cassandra,
        # we have to retrieve it all, and then check for the timestamp by hand.
        for document in cassandra_data:
            doc_id, doc_timestamp = str(document[0]), int(calendar.timegm(document[1].utctimetuple()))
            if not (cf_start_time and cf_end_time):
                all_data[doc_id] = [doc_timestamp, None]
            elif cf_start_time and cf_end_time and doc_timestamp >= cf_start_time and doc_timestamp <= cf_end_time:
                all_data[doc_id] = [doc_timestamp, None]

        for document in es_data:
            if "fields" in document:
                if index_id_column == '_id':  # special case - is not inside fields. there must be a better way to do this ;(
                    doc_id, doc_timestamp = document[index_id_column], int(document['fields'][index_timestamp_column][0])
                else:
                    doc_id, doc_timestamp = document['fields'][index_id_column], int(document['fields'][index_timestamp_column][0])
                if doc_id in all_data:
                    all_data[doc_id][1] = doc_timestamp
                else:
                    all_data[doc_id] = [None, doc_timestamp]

        for uid in all_data:
            cassandra_ts, es_ts = all_data[uid]
            if cassandra_ts and es_ts:
                if cassandra_ts > es_ts:
                    # same id, cassandra is the most recent. update that data on es.
                    ids_to_update_on_es.append(uid)
                elif es_ts > cassandra_ts:
                    # same id, es is the most recent. update that data on cassandra.
                    ids_to_update_on_cassandra.append(uid)
            elif cassandra_ts:
                # present only on cassandra. add to es.
                ids_to_insert_on_es.append(uid)
            elif es_ts:
                # present only on es. add to cassandra.
                ids_to_insert_on_cassandra.append(uid)

        if ids_to_insert_on_es or ids_to_update_on_es:
            actions = []
            from_cassandra_to_es = self.get_cassandra_documents_by_id(
                cassandra_cf, cf_fields, cf_id_column, ids_to_insert_on_es + ids_to_update_on_es)
            for document in from_cassandra_to_es:
                data = {}
                for i in range(len(cf_data_fields)):
                    data[type_data_fields[i]] = getattr(document, cf_data_fields[i])
                actions.append(self.get_es_bulk_action(
                    es_type, index_id_column, getattr(document, cf_id_column),
                    index_timestamp_column, getattr(document, cf_timestamp_column), data))
            bulk(self.es_client, actions)  # send all inserts/updates to es at once

        if ids_to_insert_on_cassandra or ids_to_update_on_cassandra:
            batch = BatchStatement()
            type_fields = type_data_fields + [index_id_column, index_timestamp_column]
            ids_filter = self.get_es_ids_filter(es_type, ids_to_insert_on_cassandra + ids_to_update_on_cassandra)
            from_es_to_cassandra = self.es_client.search(
                index=self.es_index, doc_type=es_type,
                fields=type_data_fields + [cf_timestamp_column], body=ids_filter)
            for document in from_es_to_cassandra['hits']['hits']:
                id_value = document[index_id_column] if index_id_column == '_id' else document["fields"][index_id_column]  # this makes me a saaaad panda
                es_data = [UUID(id_value),
                           datetime.datetime.utcfromtimestamp(int(document['fields'][index_timestamp_column][0]))]
                for field in type_data_fields:
                    es_data.append(document['fields'][field][0])
                prepared_insert_statement = self.get_prepared_cassandra_insert_statement(cassandra_cf, cf_fields)
                prepared_update_statement = self.get_prepared_cassandra_update_statement(cassandra_cf, cf_id_column, cf_fields[1:])
                if id_value in ids_to_insert_on_cassandra:
                    batch.add(prepared_insert_statement, tuple(es_data))
                else:
                    batch.add(prepared_update_statement, tuple(es_data[1:] + [UUID(id_value)]))
            self.cassandra_client.execute(batch)

    def get_cassandra_documents_by_id(self, cf, fields, id_column, ids):
        from_cassandra_to_es_query = "SELECT %s FROM %s WHERE %s IN (%s)" % (
            ",".join(fields), cf, id_column, ",".join(ids))
        return self.cassandra_client.execute(from_cassandra_to_es_query)

    def get_prepared_cassandra_insert_statement(self, cf, cols):
        base_insert_statement = "INSERT INTO %s (%s) VALUES (%s)" % (
            cf, ",".join(cols), ",".join(["?"] * len(cols)))
        return self.cassandra_client.prepare(base_insert_statement)

    def get_prepared_cassandra_update_statement(self, cf, id_column, fields):
        base_update_statement = "UPDATE %s SET %s WHERE %s = ?" % (
            cf, ",".join([field + " = ?" for field in fields]), id_column)
        return self.cassandra_client.prepare(base_update_statement)

    def get_es_range_filter(self, col, start, end):
        return {"filter": {"range": {col: {"gte": start, "lte": end}}}}

    def get_es_ids_filter(self, es_type, ids):
        return {"filter": {"ids": {"type": es_type, "values": ids}}}

    def get_es_bulk_action(self, es_type, id_column, id_value, timestamp_column, timestamp_value, data):
        if isinstance(timestamp_value, datetime.datetime):
            timestamp_value = calendar.timegm(timestamp_value.utctimetuple())
        id_value = str(id_value)
        timestamp_value = str(timestamp_value)
        data[timestamp_column] = timestamp_value
        action = {}
        action['_index'] = self.es_index
        action['_type'] = es_type
        action[id_column] = id_value
        action['_source'] = data
        return action

def _get_system_column_rows(server=None):
    server = server or "127.0.0.1"
    logging.info("Getting server columns from server: '%s'", server)
    session = Cluster([server]).connect()
    return session.execute("SELECT * FROM system.schema_columns")

#!/usr/bin/python
# Copyright (C) 2014-2016, National Rural Electric Cooperative Association and Cigital, Inc
from cassandra.cluster import Cluster

session = Cluster(['127.0.0.1']).connect('demo')
for currentRow in session.execute('select content from packet'):
    print currentRow[0], '\n'

class Reassembler(Process):
    def initCluster(self):
        auth_provider = PlainTextAuthProvider(username=cfg.cassandraConfig["user"],
                                              password=cfg.cassandraConfig["password"])
        self.session = Cluster([cfg.cassandraConfig["host"]],
                               auth_provider=auth_provider).connect(cfg.cassandraConfig["db"])
        self.preparedQuery = self.session.prepare(
            """INSERT into packet (source_addr, dest_addr, time_stamp, content, text_values)
               VALUES (?,?,?,?,?)""")

    def insertIntoDatabase(self, sourceAddr, destAddr, timeStamp, content, textValues):
        args = [sourceAddr, destAddr, timeStamp, content, textValues]
        self.session.execute(self.preparedQuery, args)

    def __init__(self, ports):
        self.ports = []
        list = ports.split(',')      # split CSV port list arguments
        self.ports = map(int, list)  # convert ports to int (from string)
        self.initCluster()
        nids.register_tcp(self.handleTcpStream)  # set up call back

    def __call__(self):  # make a singleton class
        return self

    def printableHex(self, buf):
        return ' '.join(x.encode('hex') for x in buf)

    def handleTcpStream(self, tcp):
        end_states = (nids.NIDS_CLOSE, nids.NIDS_TIMEOUT, nids.NIDS_RESET)
        logging.debug('tcps - {0} state: {1} timestamp: {2}'.format(str(tcp.addr), tcp.nids_state, nids.get_pkt_ts() * 1000))
        if tcp.nids_state == nids.NIDS_JUST_EST:
            # new to us, but do we care?
            ((src, sport), (dst, dport)) = tcp.addr
            # if dport in self.ports:
            logging.info('collecting: {}'.format(str(tcp.addr)))
            tcp.client.collect = 1
            tcp.server.collect = 1
        elif tcp.nids_state == nids.NIDS_DATA:
            tcp.discard(0)  # keep all of the stream's new data
            # informs nids how many bytes in the stream to discard
            # ((src, sport), (dst, dport)) = tcp.addr
            # serverData = tcp.server.data[:tcp.server.count]
            # clientData = tcp.client.data[:tcp.client.count]
            # envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            # envelopeRegex2 = '</.+:[Ee]nvelope'
            # if serverData is None or clientData is None:
            #     tcp.discard(0)
            # else:
            #     if "Expect: 100-continue" not in serverData:
            #         tcp.discard(0)
            #     else:
            #         if (re.search(envelopeRegex, serverData, re.S | re.IGNORECASE) and re.search(envelopeRegex2, serverData, re.S | re.IGNORECASE) and re.search(envelopeRegex, clientData, re.S | re.IGNORECASE) and re.search(envelopeRegex2, clientData, re.S | re.IGNORECASE)):
            #             tcpaddr = ((dst, dport), (src, sport))
            #             logging.debug("count_new: {}".format(tcp.server.count_new))
            #             logging.debug("offset server: {}".format(tcp.server.offset))
            #             self.process_ipframe(serverData, tcp.addr, self.timestamp)
            #
            #             logging.debug("count_new: {}".format(tcp.server.count_new))
            #             logging.debug("offset client: {}".format(tcp.client.offset))
            #             tcpaddr = ((dst, dport), (src, sport))  # flip it around to match our point of view (since this is the client
            #             self.process_ipframe(clientData, tcpaddr, self.timestamp)
            #             tcp.discard(tcp.server.count + tcp.client.count)
            #         else:
            #             tcp.discard(0)
        elif tcp.nids_state in end_states:
            ((src, sport), (dst, dport)) = tcp.addr
            serverData = tcp.server.data[:tcp.server.count]
            clientData = tcp.client.data[:tcp.client.count]
            # logging.debug("serverData: {0}".format(serverData))
            # logging.debug("clientData: {0}".format(clientData))
            self.timestamp = nids.get_pkt_ts() * 1000
            # Add the MultiSpeakMsgHeader since we observed way too many false positives during
            # the virtual field test
            envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            logging.info("Serv Count: {0} Client Count {1} newc: {2} news: {3}".format(
                tcp.server.count, tcp.client.count, tcp.client.count_new, tcp.server.count_new))
            # Match even if there is a newline since we've observed some payloads with the newline
            # print("server is ", tcp.server.data[:tcp.server.count], "client is ", tcp.client.data[:tcp.client.count], "count new is ", tcp.server.count_new)
            if serverData is not None:
                serverData = serverData.replace("\n", "")
                if (re.search(envelopeRegex, serverData, re.S | re.IGNORECASE)):  # and tcp.server.count_new > 0):
                    logging.info('full message found in tcp server data')
                    payload = tcp.server.data[:tcp.server.count]
                    # tcpaddr = ((dst, dport), (src, sport))
                    logging.debug("count_new: {}".format(tcp.server.count_new))
                    logging.debug("offset server: {}".format(tcp.server.offset))
                    self.process_ipframe(payload, tcp.addr, self.timestamp)
                    tcp.discard(tcp.server.count)
                elif "multispeak" in serverData.lower():
                    logging.warning("multispeak serverData but envelope failed: {}".format(serverData))
            if clientData is not None:
                clientData = clientData.replace("\n", "")
                if (re.search(envelopeRegex, clientData, re.S | re.IGNORECASE)):
                    logging.info('full message found in tcp client data')
                    tcpaddr = ((dst, dport), (src, sport))  # flip it around to match our point of view (since this is the client)
                    payload = tcp.client.data[:tcp.client.count]
                    logging.debug("count_new client: {}".format(tcp.client.count_new))
                    logging.debug("offset client: {}".format(tcp.client.offset))
                    self.process_ipframe(payload, tcpaddr, self.timestamp)  # modified tcpaddr
                    tcp.discard(tcp.client.count)
                elif "multispeak" in clientData.lower():
                    logging.warning("multispeak clientData but envelope failed: {}".format(clientData))
            logging.debug("addr: {}".format(tcp.addr))
            logging.debug("To server:")
            logging.debug("bytes {}".format(str(tcp.server.count)))
            logging.debug("To client:")
            logging.debug("bytes: {}".format(str(tcp.client.count)))

    def process_ipframe(self, frame, tcpaddr, timestamp):
        # note that we are no longer checking source IP addresses
        # so we could be processing frames from other ips if not filtered
        # before this point
        cleansedFrame = frame.replace("\n", "").replace("\r", "").replace("\t", "")
        envelopeRegex = '</.+:[Ee]nvelope'
        match = re.search(envelopeRegex, cleansedFrame)
        if match:
            ((src, sport), (dst, dport)) = tcpaddr
            # try to find the endpoint type from the POST request
            endpointRegex = '(?<=POST\s).*(?=\sHTTP)'  # looks for POST and HTTP, and matches the URL
            match = re.search(endpointRegex, cleansedFrame, re.IGNORECASE)
            if match is not None:
                URLsplit = cleansedFrame[match.start():match.end()].split('/')  # [foo, QA_SERVER]
                endpointCodeSplit = URLsplit[-1].split('_')  # [QA, SERVER]
                endpointCode = endpointCodeSplit[0]  # QA
                logging.debug("parsed MS endpoint code: {}".format(endpointCode))
            else:
                endpointCode = 'NULL'
                logging.warning("Unable to parse endpoint code from header")
            # get the version from the SOAPAction http header
            versionRegex = '(?<=SOAPAction:\s"http:\/\/www.multispeak.org\/Version_).'
            match = re.search(versionRegex, cleansedFrame, re.IGNORECASE)
            mspVersion = 'NULL'
            if match is not None:
                mspVersion = cleansedFrame[match.start()]
                if (mspVersion != '3' and mspVersion != '5'):
                    mspVersion = 'NULL'
            else:
                versionRegex2 = '(?<=SOAPAction:\shttp:\/\/www.multispeak.org\/Version_).'  # uses lookbehind to extract only version num
                match2 = re.search(versionRegex2, cleansedFrame, re.IGNORECASE)
                if match2 is not None:
                    mspVersion = cleansedFrame[match2.start()]
                    if (mspVersion != '3' and mspVersion != '5'):
                        mspVersion = 'NULL'
                else:
                    versionRegex3 = '<MultiSpeakMsgHeader[^>]* Version="(\d)\.'
                    match3 = re.search(versionRegex3, cleansedFrame, re.IGNORECASE)
                    if match3 is not None:
                        mspVersion = match3.group(1)
                        if (mspVersion != '3' and mspVersion != '5'):
                            mspVersion = 'NULL'
            # messageNameRegex = '(<[\w:]*[Bb]ody>)(\s*)(<)(?P<MsgName>[\w|:]+)'
            messageNameRegex = '(<([\w-]+:)?body>)(\s*)(<)(([\w-]+:)?)(?P<MsgName>\w*)'
            match = re.search(messageNameRegex, cleansedFrame, re.IGNORECASE)
            if match is not None:
                messageName = match.group('MsgName').split(':')[-1]
                text_values = {'endpoint': endpointCode, 'messagetype': messageName, 'mspVersion': mspVersion}
                logging.debug("text values: {}".format(text_values))
                self.insertIntoDatabase(src, dst, timestamp, frame, text_values)
                print("inserted packet")
            else:
                print("MsgName not found")
                logging.debug("frame with no MsgName: {}".format(cleansedFrame))
        else:
            logging.debug("end of envelope not found: {}".format(cleansedFrame))
            print("end of envelope not found")

class Loader:
    start = 0

    def __init__(self, name):
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('scy+es', 'load.log')
        Loader.start = datetime.now()
        self.index = name
        self.stat = Statement(name)
        self.filename = mapper[name]['load']
        self.__init_scy()
        self.__init_es()
        self.pool_scy = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)
        self.pool_es = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)

    def load(self):
        if self.index == 'reddit':
            # main loop
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool_scy.submit(self.__insert_data, line)
                    self.pool_es.submit(self.__insert_index, line)
                except StopIteration:
                    break
                except Exception:
                    continue
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename, encoding='utf-8')
            end = df.index.max()
            df = df.fillna('missing')
            try:
                for line in zip(range(0, end + 1), df['userId'], df['productId'],
                                df['rating'], df['title'], df['comment'], df['timestamp']):
                    counter = line[0]
                    if counter % 100000 == 0:
                        self.log.info(str(counter))
                    if counter % 1000000 == 0:
                        self.log.info('{}'.format(datetime.now() - Loader.start))
                        self.log.info('')
                    self.pool_scy.submit(self.__insert_data, line)
                    self.pool_es.submit(self.__insert_index, line)
            except Exception as e:
                print(e)
        self.pool_scy.shutdown()
        self.pool_es.shutdown()
        self.es.indices.refresh(index=self.index)
        # shutdown
        self.session.shutdown()
        # print information
        self.log.info('## Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')

    def __line_generator(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter, line in enumerate(f):
                try:
                    if counter % 100000 == 0:
                        self.log.info(str(counter))
                    if counter % 1000000 == 0:
                        self.log.info('{}'.format(datetime.now() - Loader.start))
                        self.log.info('')
                    yield line
                except Exception as e:
                    print(e)
                    continue

    # get scylladb connection, create ks and tb, return session
    def __init_scy(self):
        # session = Cluster(contact_points=SCYLLA_IP, execution_profiles={EXEC_PROFILE_DEFAULT: ep}).connect()
        self.session = Cluster(contact_points=SCYLLA_IP).connect()
        # create a schema
        self.session.execute(self.stat.create_ks)
        # create a tb
        self.session.execute(self.stat.create_tb)
        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM

    # get es connection, create index
    def __init_es(self):
        with open('mapping.json', 'r') as f:
            mapping = json.load(f)[self.index]
        self.es = Elasticsearch(ES_IP, timeout=30)
        # create es index
        self.es.indices.create(index=self.index, body=mapping, ignore=400, timeout=30)

    # insert data into scylladb
    def __insert_data(self, line):
        data = list()
        # TODO: format your data
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'],
                    line['subreddit_id'], line['author'], line['body'], int(line['created_utc'])]
        elif self.index == 'amazon':
            # id | user_id | product_id | rating | title | body | timestamp
            data = [line[0], int(line[1]), int(line[2]), line[3], line[4], line[5], line[6]]
        self.log.info(data)
        res = self.session.execute(self.cql_prepared, data, timeout=60000)
        return

    # insert data into elasticsearch
    def __insert_index(self, line):
        # TODO: format your data
        if self.index == 'reddit':
            data = {k: v for k, v in line.items() if k in ['id', 'name', 'author', 'body']}
        elif self.index == 'amazon':
            data = {}
            data['id'] = line[0]
            data['title'] = line[4]
            data['body'] = line[5]
        res = self.es.index(index=self.index, doc_type="comment", id=data['id'], body=data)
        return

profiles = {'node1': node1_profile}
user_info = {'username': '******', 'password': '******'}
auth_provider = PlainTextAuthProvider(username=user_info['username'], password=user_info['password'])

logger.info(f'Logging into {ARGS["<cluster-ip>"]} with user={user_info["username"]}')
session = Cluster(
    [ARGS['<cluster-ip>']],
    port=9042,
    protocol_version=4,
    execution_profiles=profiles,
    connect_timeout=20,
    auth_provider=auth_provider
).connect()
connection.register_connection('cluster1', session=session)
logger.info(session.execute('SELECT release_version FROM system.local').one())

print('Simple Queries:')

print('\n' + 'Average temperature in NYC from 2006-2020:')
statement = SimpleStatement('SELECT AVG(tavg) as avg_temp FROM spardata.weather_dataset')
for row in session.execute(statement, timeout=30):
    print(row['avg_temp'])

print('\n' + 'Average temperature in NYC by year:')
statement = SimpleStatement('SELECT year, AVG(tavg) as avg_temp FROM spardata.weather_dataset GROUP BY year')
for row in session.execute(statement, timeout=30):
    print(f"{row['year']} : {row['avg_temp']}")

print('\n' + 'Average temperature in NYC by year and month:')
statement = SimpleStatement('SELECT year, month, AVG(tavg) as avg_temp FROM spardata.weather_dataset GROUP BY year, month')

class CassWarehouse(LWWarehouse):
    compass_cls = CassWarehouseCompass
    NO_KEYSP_EXC = InvalidRequest
    NO_TABLE_EXC = InvalidRequest

    def connect(self):
        self.client = Cluster(
            contact_points=self.compass.hosts,
            port=self.compass.port,
            protocol_version=4,
            load_balancing_policy=RoundRobinPolicy()
        ).connect()
        self.set_keyspace()

    @keysp_dependent
    def set_keyspace(self):
        self.client.set_keyspace(self.keyspace)

    def create_keyspace(self):
        stmt = """
            CREATE KEYSPACE {keysp}
            WITH REPLICATION = {{
                'class': 'SimpleStrategy',
                'replication_factor': '{repl}'
            }}
        """.format(
            keysp=self.keyspace,
            repl=self.compass.replication
        )
        self.client.execute(stmt)

    def create_table(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], *_, **__):
        stmt = """
            CREATE TABLE {keysp}.{table} (
                id_model VARCHAR,
                id_mover VARCHAR,
                id_file VARCHAR,
                file_content BLOB,
                PRIMARY KEY(id_model, id_mover, id_file)
            )
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME
        )
        self.client.execute(stmt)

    def delete(self, hierarchy: StoreHierarchy, ignore=False):
        deleted = True
        ids_stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE id_model='{_id_model}' AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )
        files = [row.id_file for row in self.client.execute(ids_stmt).current_rows]
        if len(files) == 0 and not ignore:
            self._raise_not_found(hierarchy)
        elif len(files) == 0 and ignore:
            deleted = False
        else:
            self.LOG.debug("Removing files {} from Cassandra".format(", ".join(files)))
            stmt = """
                DELETE FROM {keysp}.{table}
                WHERE id_model='{_id_model}' AND id_mover='{_id_mover}'
                AND id_file IN ('{_id_file}')
            """.format(
                keysp=self.keyspace,
                table=self.TABLE_NAME,
                _id_model=hierarchy.parent,
                _id_mover=hierarchy.child,
                _id_file="','".join(files) if len(files) > 1 else files[0]
            )
            self.client.execute(stmt)
        return deleted

    @table_dependent
    def store_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec]):
        fields = []
        values = []
        stmt = "BEGIN BATCH "
        for file_spec in file_schema:
            fields.append(file_spec.name)
            values.append(memoryview(file_spec.get_bytes()))
            stmt += """
                INSERT INTO {keysp}.{table} (id_model, id_mover, id_file, file_content)
                VALUES ('{_id_model}', '{_id_mover}', '{_id_file}', %s);
            """.format(
                keysp=self.keyspace,
                table=self.TABLE_NAME,
                _id_model=hierarchy.parent,
                _id_mover=hierarchy.child,
                _id_file=file_spec.name
            )
        stmt += " APPLY BATCH;"
        self.LOG.debug("Storing as blobs: {}".format(fields))
        self.client.execute(stmt, values)

    @table_dependent
    def deploy_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], path_to: str):
        stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE id_model='{_id_model}' AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )
        rows = self.client.execute(stmt).current_rows
        if len(rows) == 0:
            self._raise_not_found(hierarchy)
        for row in rows:
            self.LOG.debug('Deploying file: {}'.format(row.id_file))
            write_path = os.path.join(path_to, row.id_file)
            bites = row.file_content
            open(write_path, 'wb').write(bites)

def data_from_cassandra(ks_name, table_name):
    session = Cluster().connect()
    return [
        list(row) for row in session.execute(
            'SELECT * FROM {ks}.{tab};'.format(ks=ks_name, tab=table_name))
    ]

def query_es(NUM_FILTERS):
    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch")
    es = Elasticsearch(ES_IP)

    if NUM_FILTERS == 'single':
        ## Search using a single field filter (group: 'pants')
        print("")
        print("## Searching for 'pants' in Elasticsearch (filter by group)")
        res = es.search(index="catalog", doc_type="apparel", body={
            "query": {"match": {"group": "pants"}},
            "size": 1000
        })

    if NUM_FILTERS == 'multiple':
        ## Search using multiple fields filter (color: 'white' AND sub_group: 'softshell')
        print("")
        print("## Searching for 'white softshell' in Elasticsearch (filter by color + sub_group)")
        res = es.search(index="catalog", doc_type="apparel", body={
            "query": {
                "bool": {
                    "must": [
                        {"match": {"color": "white"}},
                        {"match": {"sub_group": "softshell"}}
                    ]
                }
            },
            "size": 1000
        })

    if NUM_FILTERS == 'none':
        ## Search with NO filters (match_all)
        print("")
        print("## Searching with NO filter = 'match_all' in Elasticsearch")
        res = es.search(index="catalog", doc_type="apparel", body={
            "query": {"match_all": {}},
            "size": "1000"
        })

    print("")
    print("## %d documents returned" % res['hits']['total'])
    es_results = [doc['_id'] for doc in res['hits']['hits']]

    ## Connect to Scylla
    print("")
    print("## Connecting to Scylla")
    session = Cluster(SCYLLA_IP).connect()

    ## Prepared CQL statement
    print("")
    print("## Preparing CQL statement")
    cql = "SELECT * FROM catalog.apparel WHERE sku=?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM

    ## Query Scylla
    print("")
    print("## Query Scylla using SKU/s returned from Elasticsearch")
    print("")
    print("## Final results from Scylla:")
    print("")
    for r in es_results:
        scylla_res = session.execute(cql_prepared, (r,))
        print("%s" % ([list(row) for row in scylla_res]))

        # for doc in res['hits']['hits']:
        ## Print all columns in Elasticsearch result set
        # print("SKU: %s | Color: %s | Size: %s | Brand: %s | Gender: %s | Group: %s | Sub_Group: %s" % (doc['_id'], doc['_source']['color'], doc['_source']['size'], doc['_source']['brand'], doc['_source']['gender'], doc['_source']['group'], doc['_source']['sub_group']))
        ## Print only the id (sku) in the result set
        # print("SKU: %s" % (doc['_id']))

    print("")

def _contention_test(self, threads, iterations):
    """ Test threads repeatedly contending on the same row. """
    verbose = False
    session = self.prepare(nodes=3)
    session.execute("CREATE TABLE test (k int, v int static, id int, PRIMARY KEY (k, id))")
    session.execute("INSERT INTO test(k, v) VALUES (0, 0)")

    class Worker(Thread):
        def __init__(self, wid, session, iterations, query):
            Thread.__init__(self)
            self.wid = wid
            self.iterations = iterations
            self.query = query
            self.session = session
            self.errors = 0
            self.retries = 0

        def run(self):
            i = 0
            prev = 0
            while i < self.iterations:
                done = False
                while not done:
                    try:
                        res = self.session.execute(self.query, (prev + 1, prev, self.wid))
                        if verbose:
                            print("[%3d] CAS %3d -> %3d (res: %s)" % (self.wid, prev, prev + 1, str(res)))
                        if res[0][0] is True:
                            done = True
                            prev = prev + 1
                        else:
                            self.retries = self.retries + 1
                            # There are 2 conditions, so 2 reasons to fail: if we failed because the row with our
                            # worker ID already exists, it means we timed out earlier but our update did go in,
                            # so consider this a success
                            prev = res[0][3]
                            if res[0][2] is not None:
                                if verbose:
                                    print("[%3d] Update was inserted on previous try (res = %s)" % (self.wid, str(res)))
                                done = True
                    except WriteTimeout as e:
                        if verbose:
                            print("[%3d] TIMEOUT (%s)" % (self.wid, str(e)))
                        # This means a timeout: just retry; if it happens that our update was indeed persisted,
                        # we'll figure it out on the next run.
                        self.retries = self.retries + 1
                    except Exception as e:
                        if verbose:
                            print("[%3d] ERROR: %s" % (self.wid, str(e)))
                        self.errors = self.errors + 1
                        done = True
                i = i + 1
                # Clean up for next iteration
                while True:
                    try:
                        self.session.execute("DELETE FROM test WHERE k = 0 AND id = %d IF EXISTS" % self.wid)
                        break
                    except WriteTimeout as e:
                        pass

    nodes = self.cluster.nodelist()
    workers = []

    session = Cluster([nodes[0].ip_addr],
                      connect_timeout=15,
                      idle_heartbeat_interval=0,
                      execution_profiles={
                          EXEC_PROFILE_DEFAULT: ExecutionProfile(request_timeout=60)
                      }).connect('ks')
    q = session.prepare("""
        BEGIN BATCH
            UPDATE test SET v = ? WHERE k = 0 IF v = ?;
            INSERT INTO test (k, id) VALUES (0, ?) IF NOT EXISTS;
        APPLY BATCH
    """)

    for n in range(0, threads):
        workers.append(Worker(n, session, iterations, q))

    start = time.time()
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    if verbose:
        runtime = time.time() - start
        print("runtime:", runtime)

    query = SimpleStatement("SELECT v FROM test WHERE k = 0",
                            consistency_level=ConsistencyLevel.ALL)
    rows = session.execute(query)
    value = rows[0][0]

    errors = 0
    retries = 0
    for w in workers:
        errors = errors + w.errors
        retries = retries + w.retries

    assert (value == threads * iterations) and (errors == 0), \
        "value={}, errors={}, retries={}".format(value, errors, retries)