Example #1
def main(keyspace_name, outdir, orderkeys):
    session = Cluster(cluster_seeds).connect()
    session.set_keyspace(keyspace_name)
    session.shutdown()

    df_part = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=keyspace_name).load()
    df_orders = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=keyspace_name).load()
    df_lineitem = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=keyspace_name).load()
    # Join orders with their line items, then with the part table, on the key columns.
    joined_df = df_orders.join(df_lineitem,
                               df_orders.orderkey == df_lineitem.orderkey)
    final_joined_df = joined_df.join(df_part,
                                     df_part.partkey == joined_df.partkey)
    # Keep only the requested orders, collect the distinct part names per order,
    # and write one formatted text line per order.
    filter_df = final_joined_df.filter(df_orders.orderkey.isin(orderkeys))
    selected_df = filter_df.groupBy(df_orders['orderkey'], 'totalprice').agg(
        functions.collect_set('name').alias('names')).sort('orderkey')
    out_rdd = selected_df.rdd.map(output_line)
    out_rdd.saveAsTextFile(outdir)
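
The Spark examples in this section assume a few globals defined elsewhere in the original scripts: a SparkSession wired to the spark-cassandra-connector, the list of Cassandra seed hosts, pyspark.sql.functions, and the output_line formatter. A minimal sketch of that surrounding setup (cluster_seeds and output_line below are assumed placeholders, not taken from the originals):

# Sketch only: the connector must also be on the Spark classpath, e.g. via
# --packages com.datastax.spark:spark-cassandra-connector_2.12:<version> at submit time.
from pyspark.sql import SparkSession, functions

cluster_seeds = ['127.0.0.1']  # assumed Cassandra contact points

spark = SparkSession.builder \
    .appName('cassandra examples') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)) \
    .getOrCreate()
sc = spark.sparkContext

def output_line(row):
    # Hypothetical formatter for Example #1's text output.
    names = ', '.join(sorted(row['names']))
    return 'Order #{} ${}: {}'.format(row['orderkey'], row['totalprice'], names)
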
Example #2
def main(input_keyspace, output_keyspace):
    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 2}"
        % output_keyspace)
    session.set_keyspace(output_keyspace)
    session.execute("CREATE TABLE IF NOT EXISTS orders_parts ("
                    "orderkey int, "
                    "custkey int, "
                    "orderstatus text, "
                    "totalprice decimal, "
                    "orderdate date, "
                    "order_priority text, "
                    "clerk text, "
                    "ship_priority int, "
                    "comment text, "
                    "part_names set<text>, "
                    "PRIMARY KEY (orderkey))")
    session.execute('TRUNCATE orders_parts')
    session.shutdown()

    df_part = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=input_keyspace).load()
    df_orders = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=input_keyspace).load().cache()
    df_lineitem = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=input_keyspace).load()
    joined_df = df_orders.join(df_lineitem,
                               df_orders.orderkey == df_lineitem.orderkey)
    final_joined_df = joined_df.join(df_part,
                                     df_part.partkey == joined_df.partkey)
    # Collect the set of part names per order, then join back to orders to
    # recover the remaining order columns before writing to Cassandra.
    selected_df = final_joined_df.groupBy(
        df_orders['orderkey'], 'totalprice').agg(
            functions.collect_set(
                df_part['name']).alias('part_names')).drop('totalprice')
    output_df = selected_df.join(df_orders, "orderkey")
    output_df.write.format("org.apache.spark.sql.cassandra").options(
        table="orders_parts", keyspace=output_keyspace).save()
Example #3
def main(input_dir, keyspace_name, table_name):
    rdd = sc.textFile(input_dir)
    # preprocess() (defined elsewhere) parses a raw log line; lines that do not
    # match are returned as None and filtered out here.
    preprocess_rdd = rdd.map(preprocess)
    cleaned_rdd = preprocess_rdd.filter(lambda x: x is not None)

    schema = "id STRING, host STRING, datetime TIMESTAMP, path STRING, bytes INT"
    # sch = ['id', 'host', 'datetime', 'path', 'bytes']

    df = spark.createDataFrame(data=cleaned_rdd, schema=schema)
    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS " + keyspace_name +
        " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}")
    session.set_keyspace(keyspace_name)
    session.execute("DROP TABLE IF EXISTS " + keyspace_name + "." + table_name)
    session.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name +
        " (id UUID, host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, PRIMARY KEY (host, id))"
    )
    # session.execute("TRUNCATE "+table_name)
    session.shutdown()

    df.write.format("org.apache.spark.sql.cassandra").options(
        table=table_name, keyspace=keyspace_name).save()
Example #4
class CassandraESSync:
	
	mappings = []

	cassandra_nodes = []
	cassandra_keyspace = None
	cassandra_cfs = []

	es_nodes = []
	es_index = None
	es_types = []

	cassandra_client = None 
	es_client = None



	last_synced = {'cassandra': {}, 'es': {} }  # stores the last time each cf and index were synced, to avoid unnecessary queries


	def __init__(self, config):

		self.mappings = config["mappings"]

		self.cassandra_nodes = config['cassandra']['nodes']
		self.cassandra_keyspace = config['cassandra']['keyspace']
		self.cassandra_cfs = config['cassandra']['column_families']

		self.es_nodes = config['elasticsearch']['nodes']
		self.es_index = config['elasticsearch']['index']
		self.es_types = config['elasticsearch']['types']

		self.cassandra_client = Cluster().connect(self.cassandra_keyspace)
		self.es_client = elasticsearch.Elasticsearch()


	def sync_databases(self):
		for mapping in self.mappings:
			cassandra_cf, es_type = mapping
			self.sync_cf_type(cassandra_cf, es_type)


	def sync_cf_type(self, cassandra_cf, es_type):


		cf_id_column = self.cassandra_cfs[cassandra_cf]['id'] # column storing the document's uid
		cf_timestamp_column = self.cassandra_cfs[cassandra_cf]['timestamp'] # column storing the document's timestamp

		index_id_column = self.es_types[es_type]['id'] # field storing the document's id
		index_timestamp_column = self.es_types[es_type]['timestamp'] # field storing the document's timestamp

		cf_data_fields = self.cassandra_cfs[cassandra_cf]['columns']
		cf_fields = [cf_id_column, cf_timestamp_column] + cf_data_fields

		type_data_fields = self.es_types[es_type]['columns']

		if cassandra_cf in self.last_synced['cassandra']:
			cf_start_time, cf_end_time = self.last_synced['cassandra'][cassandra_cf], time.time()  
		else:
			cf_start_time, cf_end_time = None, None

		if es_type in self.last_synced['es']:
			index_start_time, index_end_time = self.last_synced['es'][es_type], time.time()
		else:
			index_start_time, index_end_time = None, None

		cassandra_data_query = 'SELECT %s, %s FROM %s' % (cf_id_column, cf_timestamp_column, cassandra_cf)
		
		range_filter = {}
		if index_start_time and index_end_time:
			range_filter = self.get_es_range_filter(index_timestamp_column, index_start_time, index_end_time)
		

		self.cassandra_client.set_keyspace(self.cassandra_keyspace)
		cassandra_data = self.cassandra_client.execute(cassandra_data_query)
		self.last_synced['cassandra'][cassandra_cf] = time.time()

		es_data = [] #self.es_client.search(index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], body=range_filter)
		es_scan = scan(self.es_client, index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], query=range_filter)
		self.last_synced['es'][es_type] = time.time()

		for data in es_scan:
			es_data.append(data)

		all_data = {}

		ids_to_insert_on_cassandra = []
		ids_to_update_on_cassandra = []

		ids_to_insert_on_es = []
		ids_to_update_on_es = []


		# Because we can't run a range query on a non-primary-key column in Cassandra,
		# we have to retrieve everything and check the timestamps by hand.
		for document in cassandra_data:
			doc_id, doc_timestamp = str(document[0]), int(calendar.timegm(document[1].utctimetuple()))
			if not(cf_start_time and cf_end_time):
				all_data[doc_id] = [doc_timestamp, None]
			elif cf_start_time and cf_end_time and cf_start_time <= doc_timestamp <= cf_end_time:
				all_data[doc_id] = [doc_timestamp, None]

		for document in es_data:
			if "fields" in document:
				if index_id_column == '_id': # special case: '_id' lives at the top level of the hit, not inside 'fields'
					doc_id, doc_timestamp = document[index_id_column], int(document['fields'][index_timestamp_column][0])
				else:
					doc_id, doc_timestamp = document['fields'][index_id_column], int(document['fields'][index_timestamp_column][0])

				if doc_id in all_data:
					all_data[doc_id][1] = doc_timestamp
				else:
					all_data[doc_id] = [None, doc_timestamp]

		
		for uid in all_data:
			cassandra_ts, es_ts = all_data[uid]
			if cassandra_ts and es_ts:
				if cassandra_ts > es_ts: # same id, cassandra is the most recent. update that data on es. 
					ids_to_update_on_es.append(uid)
				elif es_ts > cassandra_ts: # same id, es is the most recent. update that data on cassandra.
					ids_to_update_on_cassandra.append(uid)
			elif cassandra_ts: # present only on cassandra. add to es.
				ids_to_insert_on_es.append(uid)
			elif es_ts: # present only on es. add to cassandra.
				ids_to_insert_on_cassandra.append(uid)


		if ids_to_insert_on_es or ids_to_update_on_es:
			actions = []
			from_cassandra_to_es = self.get_cassandra_documents_by_id(cassandra_cf, cf_fields, cf_id_column, ids_to_insert_on_es + ids_to_update_on_es)

			for document in from_cassandra_to_es:
				data = {}
				for i in range(len(cf_data_fields)):
					data[type_data_fields[i]] = getattr(document, cf_data_fields[i]) 	

				actions.append(self.get_es_bulk_action(es_type, index_id_column, getattr(document, cf_id_column), index_timestamp_column, getattr(document, cf_timestamp_column), data))
			
			bulk(self.es_client, actions) # send all inserts/updates to es at once

		if ids_to_insert_on_cassandra or ids_to_update_on_cassandra:
			batch = BatchStatement()

			type_fields = type_data_fields + [index_id_column, index_timestamp_column]
			ids_filter = self.get_es_ids_filter(es_type, ids_to_insert_on_cassandra + ids_to_update_on_cassandra)
			from_es_to_cassandra = self.es_client.search(index=self.es_index, doc_type=es_type, fields=type_data_fields + [cf_timestamp_column], body=ids_filter)
			
			for document in from_es_to_cassandra['hits']['hits']:
				
				id_value = document[index_id_column] if index_id_column == '_id' else document["fields"][index_id_column]
				
				es_data = [UUID(id_value), datetime.datetime.utcfromtimestamp(int(document['fields'][index_timestamp_column][0]))]
				
				for field in type_data_fields:
					es_data.append(document['fields'][field][0])

				prepared_insert_statement = self.get_prepared_cassandra_insert_statement(cassandra_cf, cf_fields)
				prepared_update_statement = self.get_prepared_cassandra_update_statement(cassandra_cf, cf_id_column, cf_fields[1:])
				
				if id_value in ids_to_insert_on_cassandra:
					batch.add(prepared_insert_statement, tuple(es_data))
				else: 
					batch.add(prepared_update_statement, tuple(es_data[1:] + [UUID(id_value)]))

			
			self.cassandra_client.execute(batch)


	def get_cassandra_documents_by_id(self, cf, fields, id_column, ids):
		from_cassandra_to_es_query = "SELECT %s FROM %s WHERE %s IN (%s)" % (",".join(fields), cf, id_column, ",".join(ids))
		return self.cassandra_client.execute(from_cassandra_to_es_query)

	def get_prepared_cassandra_insert_statement(self, cf, cols):
		base_insert_statement = "INSERT INTO %s (%s) VALUES (%s)" % (cf, ",".join(cols), ",".join(["?"] * len(cols)))	
		return self.cassandra_client.prepare(base_insert_statement)

	def get_prepared_cassandra_update_statement(self, cf, id_column, fields):
		base_update_statement = "UPDATE %s SET %s WHERE %s = ?" % (cf, ",".join([field + " = ?" for field in fields]), id_column)
		return self.cassandra_client.prepare(base_update_statement)

	def get_es_range_filter(self, col, start, end):
		return {
				"filter": {
					"range" : {
						col: {
			        		"gte" : start,
			        		"lte" : end
			        	}
			    	}
				}
			}

	def get_es_ids_filter(self, es_type, ids):
		return {
				"filter": {
					"ids" : {
			        	"type" : es_type,
			        	"values": ids 
			    	}
				}
			}

	def get_es_bulk_action(self, es_type, id_column, id_value, timestamp_column, timestamp_value, data):

		if isinstance(timestamp_value, datetime.datetime):
			timestamp_value = calendar.timegm(timestamp_value.utctimetuple())

		id_value = str(id_value)
		timestamp_value = str(timestamp_value)

		data[timestamp_column] = timestamp_value

		action = {}
		action['_index'] = self.es_index
		action['_type'] = es_type 
		action[id_column] = id_value
		action['_source'] = data

		return action
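
The constructor above expects a config dict describing both stores. A hypothetical example of that shape (every keyspace, index, column-family, and field name below is made up purely to show the structure __init__ and sync_cf_type read), followed by the usual entry point:

# Assumed configuration shape; adapt the names to your own schema.
config = {
    'mappings': [('users', 'user')],  # (cassandra column family, es doc type)
    'cassandra': {
        'nodes': ['127.0.0.1'],
        'keyspace': 'sync_demo',
        'column_families': {
            'users': {'id': 'id', 'timestamp': 'updated_at', 'columns': ['name', 'email']},
        },
    },
    'elasticsearch': {
        'nodes': ['127.0.0.1'],
        'index': 'sync_demo',
        'types': {
            'user': {'id': '_id', 'timestamp': 'updated_at', 'columns': ['name', 'email']},
        },
    },
}

syncer = CassandraESSync(config)
syncer.sync_databases()  # run periodically to reconcile both stores
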
Example #5
from cassandra.query import dict_factory
from cassandra.cluster import Cluster
import cassandra_client as client


KEYSPACE = "user_ratings"
TABLE = "ratings"
SESSION = Cluster(['127.0.0.1'], port=9042).connect()

client.create_keyspace(SESSION, KEYSPACE)
client.create_table(SESSION, KEYSPACE, TABLE)

SESSION.set_keyspace(KEYSPACE)
SESSION.row_factory = dict_factory
RATING_QUERY = SESSION.prepare(f"SELECT * FROM {TABLE} WHERE user_id=?")
DELETE_RATING_QUERY = SESSION.prepare(f"DELETE FROM {TABLE} WHERE user_id=?")


def get(user_id: str) -> dict:
    user = SESSION.execute(RATING_QUERY, [user_id])
    if user:
        return user[0]
    return {}


def push(rating: dict):
    client.push_table(SESSION, KEYSPACE, TABLE, rating)


def list() -> list:
    return client.list_table(SESSION, KEYSPACE, TABLE)
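
The cassandra_client helper module imported above is not shown. A rough, hypothetical sketch of two of its helpers, only to indicate the assumed signatures (the real ratings schema is not part of the example):

# Hypothetical cassandra_client helpers; the actual module may differ.
def create_keyspace(session, keyspace):
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = "
        "{'class': 'SimpleStrategy', 'replication_factor': 1}" % keyspace)

def list_table(session, keyspace, table):
    # Return every row; the caller sets row_factory = dict_factory, so rows come back as dicts.
    return list(session.execute("SELECT * FROM %s.%s" % (keyspace, table)))
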
Example #6
class CassWarehouse(LWWarehouse):
    
    compass_cls = CassWarehouseCompass
    
    NO_KEYSP_EXC = InvalidRequest
    NO_TABLE_EXC = InvalidRequest
    
    def connect(self):
        
        self.client = Cluster(
            contact_points=self.compass.hosts,
            port=self.compass.port,
            protocol_version=4,
            load_balancing_policy=RoundRobinPolicy()
        ).connect()
        
        self.set_keyspace()
    
    @keysp_dependent
    def set_keyspace(self):
        
        self.client.set_keyspace(self.keyspace)
    
    def create_keyspace(self):
        
        stmt = """
            CREATE KEYSPACE {keysp}
            WITH REPLICATION = {{
                'class': 'SimpleStrategy',
                'replication_factor': '{repl}' 
            }}
        """.format(
            keysp=self.keyspace,
            repl=self.compass.replication
        )
        
        self.client.execute(stmt)
    
    def create_table(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], *_, **__):

        stmt = """
            CREATE TABLE {keysp}.{table} (
                id_model VARCHAR,
                id_mover VARCHAR,
                id_file VARCHAR,
                file_content BLOB,
                PRIMARY KEY(id_model, id_mover, id_file)
            )
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME
        )
        
        self.client.execute(stmt)
    
    def delete(self, hierarchy: StoreHierarchy, ignore=False):

        deleted = True

        ids_stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE 
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )

        files = [row.id_file for row in self.client.execute(ids_stmt).current_rows]

        if len(files) == 0 and not ignore:
            self._raise_not_found(hierarchy)
        elif len(files) == 0 and ignore:
            # Nothing to delete; return early so the DELETE below does not index an empty list.
            return False
        else:
            self.LOG.debug("Removing files {} from Cassandra".format(", ".join(files)))
        
        stmt = """
            DELETE FROM {keysp}.{table}
            WHERE
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
            AND id_file IN ('{_id_file}')
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child,
            _id_file="','".join(files) if len(files) > 1 else files[0]
        )
        
        self.client.execute(stmt)

        return deleted

    @table_dependent
    def store_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec]):
        
        fields = []
        values = []
        stmt = "BEGIN BATCH "
        
        for file_spec in file_schema:

            fields.append(file_spec.name)
            values.append(memoryview(file_spec.get_bytes()))

            stmt += """
                INSERT INTO {keysp}.{table} (id_model, id_mover, id_file, file_content) 
                VALUES ('{_id_model}', '{_id_mover}', '{_id_file}', %s); 
            """.format(
                keysp=self.keyspace,
                table=self.TABLE_NAME,
                _id_model=hierarchy.parent,
                _id_mover=hierarchy.child,
                _id_file=file_spec.name
            )
        stmt += " APPLY BATCH;"
        self.LOG.debug("Storing as blobs: {}".format(fields))
        self.client.execute(stmt, values)
    
    @table_dependent
    def deploy_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], path_to: str):

        stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE 
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )

        rows = self.client.execute(stmt).current_rows

        if len(rows) == 0:
            self._raise_not_found(hierarchy)

        for row in rows:
            self.LOG.debug('Deploying file: {}'.format(row.id_file))
            write_path = os.path.join(path_to, row.id_file)
            with open(write_path, 'wb') as out_file:
                out_file.write(row.file_content)
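
store_files above interleaves literal ids with %s placeholders and passes the blob values separately; for simple (non-prepared) statements the Python driver substitutes such parameters client-side. A standalone illustration with made-up keyspace and table names:

# Assumed demo keyspace/table; only illustrates %s parameter passing for a blob column.
from cassandra.cluster import Cluster

session = Cluster(['127.0.0.1']).connect('demo_ks')
session.execute(
    "INSERT INTO files (id_model, id_mover, id_file, file_content) "
    "VALUES ('model-1', 'mover-1', 'weights.bin', %s)",
    [bytearray(b'\x00\x01\x02')])
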
Example #7
def main():
    session = Cluster(["127.0.0.1"]).connect()
    session.set_keyspace("tweets_space")
    model = None
    main_task(session, model)
Example #8
def before_request():
    global sess
    sess = Cluster(['127.0.0.1']).connect('IMG')
    sess.set_keyspace('final_pro')
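
The hook above is presumably registered on a Flask app object that the snippet does not show. A hedged sketch of that wiring, with a matching teardown so the per-request session is closed; note that opening a new Cluster on every request is expensive, and a single module-level session is the more common design:

# Assumed Flask wiring for the before_request hook defined above.
from flask import Flask

app = Flask(__name__)
app.before_request(before_request)  # register the hook defined above

@app.teardown_request
def teardown_request(exc):
    # Close the session opened in before_request for this request.
    sess.shutdown()
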