Example #1
class DbUtils:
    def __init__(self, connection_info):
        node_ips, namespace, user_id, password = connection_info
        auth_provider = PlainTextAuthProvider(username=user_id,
                                              password=password)
        self._session = Cluster(node_ips,
                                auth_provider=auth_provider).connect(namespace)

    def insert_model_predictions(self, extracted_attribute):
        query = "insert into ae_batch_predictions1_tbl" \
                "(batch_id,attribute,item_id,title,long_description," \
                "short_description,model_prediction,trusted_prediction) " \
                "VALUES" \
                " (%s,%s,%s,%s,%s,%s,%s,%s)"
        for record in extracted_attribute:
            self._session.execute(query, record)

        return {"Message": "Insertion Successful"}

    def fetch_label_metrics(self, data):
        attribute, batch_id = data
        query = "select item_id,trusted_prediction,model_prediction from attributeextraction.ae_batch_predictions1_tbl " \
                "where  attribute='{}' and batch_id='{}'".format(attribute, batch_id)
        logger.info('Read query : %s', query)
        result = self._session.execute(query)

        return result
def create_schema(arguments):
    session = Cluster([arguments.host]).connect()
    session.execute("USE %s;" % arguments.keyspace)

    query = '''CREATE COLUMNFAMILY {columnfamily} ( 
               key text PRIMARY KEY,
               color text,
               size text,
               qty int)'''.format(columnfamily=arguments.columnfamily)

    session.execute(query)
    def test_custom_raw_row_results_all_types(self):
        """
        Test to validate that custom protocol handlers work with varying types of
        results

        Connect, create a table with all sorts of data. Query the data and make sure the custom results
        handler is used correctly.

        @since 2.7
        @jira_ticket PYTHON-313
        @expected_result custom protocol handler is invoked with various result types

        @test_category data_types:serialization
        """
        # Connect using a custom protocol handler that tracks the various types the result message is used with.
        session = Cluster(protocol_version=PROTOCOL_VERSION).connect(keyspace="custserdes")
        session.client_protocol_handler = CustomProtocolHandlerResultMessageTracked
        session.row_factory = tuple_factory

        colnames = create_table_with_all_types("alltypes", session, 1)
        columns_string = ", ".join(colnames)

        # verify data
        params = get_all_primitive_params(0)
        results = session.execute("SELECT {0} FROM alltypes WHERE primkey=0".format(columns_string))[0]
        for expected, actual in zip(params, results):
            self.assertEqual(actual, expected)
        # Ensure we have covered the various primitive types
        self.assertEqual(len(CustomResultMessageTracked.checked_rev_row_set), len(PRIMITIVE_DATATYPES)-1)
        session.shutdown()
def insert_data(headers, data):
    ## Connect to Scylla cluster and create schema
    # session = cassandra.cluster.Cluster(SCYLLA_IP).connect()
    print("")
    print("## Connecting to Scylla cluster -> Creating schema")
    session = Cluster(SCYLLA_IP).connect()
    session.execute(create_ks)
    session.execute(create_t1)

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch -> Creating 'Catalog' index")
    es = Elasticsearch(ES_IP)

    ## Create Elasticsearch index. Ignore 400 = IF NOT EXIST
    es.indices.create(index="catalog", ignore=400)

    ## Non-prepared CQL statement
    #cql = "INSERT INTO catalog.apparel(sku,brand,group,sub_group,color,size,gender) VALUES(%(sku)s,%(brand)s,%(group)s,%(sub_group)s,%(color)s,%(size)s,%(gender)s)"

    ## Prepared CQL statement
    print("")
    print("## Preparing CQL statement")
    cql = "INSERT INTO catalog.apparel (sku,brand,group,sub_group,color,size,gender) VALUES (?,?,?,?,?,?,?) using TIMESTAMP ?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = (ConsistencyLevel.ONE if random.random() < 0.2
                                      else ConsistencyLevel.QUORUM)

    print("")
    print("## Insert csv content into Scylla and Elasticsearch")

    for d in data:
        # See if we need to add code to wait for the ack. This should be synchronous.
        # Also, might need to switch to prepared statements to set the consistency level for sync requests.
        session.execute(cql_prepared, d)

        res = es.index(index="catalog",
                       doc_type="apparel",
                       id=d["sku"],
                       body=d)

    ## After all the inserts, make a refresh, just in case
    print("")
    print("## Inserts completed, refreshing index")
    es.indices.refresh(index="catalog")

    print("")
    def read_inserts_at_level(self, proto_ver):
        session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
        try:
            results = session.execute('select * from t')[0]
            self.assertEqual("[SortedSet([1, 2]), SortedSet([3, 5])]", str(results.v))

            results = session.execute('select * from u')[0]
            self.assertEqual("SortedSet([[1, 2], [3, 5]])", str(results.v))

            results = session.execute('select * from v')[0]
            self.assertEqual("{SortedSet([1, 2]): [1, 2, 3], SortedSet([3, 5]): [4, 5, 6]}", str(results.v))

            results = session.execute('select * from w')[0]
            self.assertEqual("typ(v0=OrderedMapSerializedKey([(1, [1, 2, 3]), (2, [4, 5, 6])]), v1=[7, 8, 9])", str(results.v))

        finally:
            session.cluster.shutdown()
	def initialize_connection(self):

		session = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
		session.row_factory = panda_factory
		query = "SELECT epoch_time, post_count, byte_transfer, get_count, requests, visits FROM "+self.source
		DataFrame = session.execute(query).sort(columns=['epoch_time', 'post_count', 'byte_transfer', 'get_count', 'requests', 'visits'], ascending=[1,0,0,0,0,0])
		process = retrieve_insert(DataFrame, session, self.dest) # create instance for the class retrieve_insert
		process.retrieve_variables()
		return 1
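The panda_factory row factory used above is not shown in this snippet; a minimal sketch of a typical pandas row factory for the driver (an assumption, not the source's implementation):

import pandas as pd

def panda_factory(colnames, rows):
    # cassandra-driver row factories receive the column names and the raw row tuples;
    # returning a DataFrame is what lets the caller sort the result as done above.
    return pd.DataFrame(rows, columns=colnames)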
    def __init__(self, nodes):
        """Establishes a connection to the ScyllaDB database, creates the "wdm" keyspace if it does not exist
        and creates or updates the users table.
        """
        while True:
            try:
                session = Cluster(contact_points=nodes).connect()
                break
            except Exception:
                sleep(1)
        session.execute("""
            CREATE KEYSPACE IF NOT EXISTS wdm
            WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '3' }
            """)
        session.default_consistency_level = ConsistencyLevel.QUORUM

        connection.setup(nodes, "wdm")
        sync_table(Payments)
Example #9
def main():
    logging.basicConfig(level=logging.INFO)
    logging.info("Using table: %s,%s", KEYSPACE, TABLE)
    session = Cluster(["127.0.0.1"]).connect(KEYSPACE)
    validators = {}
    for val in TYPES:
        logger.info("Creating table for '%s'", val)
        session.execute("CREATE TABLE {} (key int primary key, value {})"
                        .format(TABLE, val))
        row = session.execute("SELECT * FROM system.schema_columns WHERE "
                              "keyspace_name = '{}' AND columnfamily_name = "
                              "'{}' AND column_name = 'value'"
                              .format(KEYSPACE, TABLE))[0]
        validator = row.validator
        logging.info("validator for '%s' = '%s'", val, validator)
        logger.info("Dropping table...")
        session.execute("DROP TABLE {}".format(TABLE))
        validators[val] = validator
    print json.dumps(validators, indent=1, sort_keys=True)
Example #10
def main(keyspace, output_keyspace):

    # connecting to keyspace
    session = Cluster(cluster_seeds).connect(output_keyspace)

    # loading orders table
    orders = load_table(keyspace, 'orders').cache()
    orders.registerTempTable('orders')

    # loading lineitem table
    line_item = load_table(keyspace, 'lineitem')
    line_item.registerTempTable('lineitem')

    # loading part table
    part = load_table(keyspace, 'part')
    part.registerTempTable('part')

    # Create the orders_parts table if it doesn't exist and truncate all rows before loading
    create_statement = 'CREATE TABLE IF NOT EXISTS orders_parts (orderkey int,custkey int,orderstatus text,totalprice decimal,orderdate date,order_priority text,clerk text,ship_priority int,comment text,part_names set<text>, PRIMARY KEY (orderkey));'
    session.execute(create_statement)
    truncate_statement = 'TRUNCATE orders_parts;'
    session.execute(truncate_statement)

    # Joining all three tables to get desired output
    join_statement = 'SELECT o.orderkey, p.name FROM orders o JOIN lineitem l ON o.orderkey==l.orderkey JOIN part p ON p.partkey == l.partkey'
    join_result = spark.sql(join_statement)

    # group by order key and get the associated partnames as a set
    result = join_result.groupBy(join_result['o.orderkey']).agg(
        functions.collect_set(join_result['p.name'])).withColumnRenamed(
            "collect_set(name)",
            "part_names").orderBy(join_result['o.orderkey'],
                                  ascending=True).cache()
    result.registerTempTable('order_part')

    # Finally join orders table and order_part table
    final = spark.sql(
        "SELECT a.*, b.part_names FROM orders a JOIN order_part b ON (a.orderkey = b.orderkey) order by a.orderkey asc"
    )

    # Write it into cassandra table
    final.write.format("org.apache.spark.sql.cassandra").options(
        table='orders_parts', keyspace=output_keyspace).save()
Example #11
def load(datafile, ks_name, table_name):
    with open(datafile) as f:
        reader = csv.reader(f)
        session = Cluster().connect()

        header = next(reader)

        stmt = (
            "INSERT INTO {ks}.{tab} ({header_spec}) VALUES ({qs});").format(
                ks=ks_name,
                tab=table_name,
                header_spec=', '.join(header),
                qs=', '.join(['?' for _ in header]))
        print('preparing "{stmt}"'.format(stmt=stmt), file=sys.stderr)
        prepared = session.prepare(stmt)

        data = csv_handle_to_nested_list(reader)
        print('About to load {n} rows'.format(n=len(data)))
        for row in data:
            session.execute(prepared, row)
Example #12
    def test_custom_raw_uuid_row_results(self):
        """
        Test to validate that custom protocol handlers work with raw row results

        Connect and validate that the normal protocol handler is used.
        Re-Connect and validate that the custom protocol handler is used.
        Re-Connect and validate that the normal protocol handler is used.

        @since 2.7
        @jira_ticket PYTHON-313
        @expected_result custom protocol handler is invoked appropriately.

        @test_category data_types:serialization
        """

        # Ensure that we get normal uuid back first
        session = Cluster(protocol_version=PROTOCOL_VERSION).connect(
            keyspace="custserdes")
        session.row_factory = tuple_factory
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        uuid_type = result[0]
        self.assertEqual(type(uuid_type), uuid.UUID)

        # use our custom protocol handler

        session.client_protocol_handler = CustomTestRawRowType
        session.row_factory = tuple_factory
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        raw_value = result.pop()
        self.assertTrue(isinstance(raw_value, binary_type))
        self.assertEqual(len(raw_value), 16)

        # Ensure that we get normal uuid back when we re-connect
        session.client_protocol_handler = ProtocolHandler
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        uuid_type = result[0]
        self.assertEqual(type(uuid_type), uuid.UUID)
        session.shutdown()
Example #13
    def setUp(self):
        temp = Cluster(['127.0.0.1']).connect()
        temp.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH REPLICATION = {
                'class' : 'SimpleStrategy',
                'replication_factor' : 1 };
        ''' % (self.keyspace))

        temp = Cluster(['127.0.0.1']).connect(self.keyspace)
        temp.execute('''
            DROP TABLE IF EXISTS superdupertable;
        ''')
        temp.execute('''
            CREATE TABLE IF NOT EXISTS superdupertable (
              uid UUID,
              name text,
              body text,
              last_change timestamp,
              PRIMARY KEY (uid, last_change)
            );
        ''')
        self.cass = CassandraSync(self.keyspace)

        temp = Elasticsearch()
        temp.indices.create(
            self.index_name,
            {"settings": {"number_of_shards": 2, "number_of_replicas": 1}},
            ignore=400)
        self.elas = ElasticSync(self.index_name)
Example #14
    def test_patch_unpatch(self):
        # Test patch idempotence
        patch()
        patch()

        tracer = get_dummy_tracer()
        Pin.get_from(Cluster).clone(tracer=tracer).onto(Cluster)

        session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
        session.execute(self.TEST_QUERY)

        spans = tracer.writer.pop()
        assert spans, spans
        eq_(len(spans), 1)

        # Test unpatch
        unpatch()

        session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
        session.execute(self.TEST_QUERY)

        spans = tracer.writer.pop()
        assert not spans, spans

        # Test patch again
        patch()
        Pin.get_from(Cluster).clone(tracer=tracer).onto(Cluster)

        session = Cluster(port=CASSANDRA_CONFIG['port']).connect(self.TEST_KEYSPACE)
        session.execute(self.TEST_QUERY)

        spans = tracer.writer.pop()
        assert spans, spans
    def test_custom_raw_uuid_row_results(self):
        """
        Test to validate that custom protocol handlers work with raw row results

        Connect and validate that the normal protocol handler is used.
        Re-Connect and validate that the custom protocol handler is used.
        Re-Connect and validate that the normal protocol handler is used.

        @since 2.7
        @jira_ticket PYTHON-313
        @expected_result custom protocol handler is invoked appropriately.

        @test_category data_types:serialization
        """

        # Ensure that we get normal uuid back first
        session = Cluster().connect()
        session.row_factory = tuple_factory
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        uuid_type = result[0]
        self.assertEqual(type(uuid_type), uuid.UUID)

        # use our custom protocol handler

        session.client_protocol_handler = CustomTestRawRowType
        session.row_factory = tuple_factory
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        raw_value = result.pop()
        self.assertTrue(isinstance(raw_value, binary_type))
        self.assertEqual(len(raw_value), 16)

        # Ensure that we get normal uuid back when we re-connect
        session.client_protocol_handler = ProtocolHandler
        result_set = session.execute("SELECT schema_version FROM system.local")
        result = result_set.pop()
        uuid_type = result[0]
        self.assertEqual(type(uuid_type), uuid.UUID)
        session.shutdown()
Example #17
class CassandraConnector(object):
    """
    Handles exporting data from Cassandra to Neo4j
    using CSV as intermediary

  """
    def __init__(self,
                 keyspace_name,
                 tables=None,
                 schema_file=None,
                 queries=None):
        self.KEYSPACE = keyspace_name
        self.session = Cluster().connect(self.KEYSPACE)
        self.keyspace_metadata = self.session.cluster.metadata.keyspaces[
            self.KEYSPACE]
        self.tables = tables
        self.schema_file = schema_file

    def getTables(self):
        if self.tables:
            return self.tables
        else:
            return self.session.cluster.metadata.keyspaces[
                self.KEYSPACE].tables.keys()

    def getColumnsForTable(self, table):
        return self.session.cluster.metadata.keyspaces[
            self.KEYSPACE].tables[table].columns.keys()

    def parse(self):
        keyspace = self.session.cluster.metadata.keyspaces[self.KEYSPACE]
        parser = SchemaParser(keyspace.export_as_string())
        parser.parse()
        sys.exit("Generated schema.yaml file.")

    def export(self):
        tableNames = self.getTables()
        fileNames = [t + "_results.csv" for t in tableNames]

        for t in tableNames:
            results_file = codecs.open(t + "_results.csv",
                                       encoding='utf-8',
                                       mode='w+')
            rows = self.session.execute('SELECT * FROM ' + t)
            writer = csv.writer(results_file)
            writer.writerow(self.getColumnsForTable(t))
            writer.writerows([(e for e in row) for row in rows])

        cypher_queries_gen = CypherQueriesGenerator(self.keyspace_metadata,
                                                    self.schema_file)
        cypher_queries_gen.generate()
        cypher_queries_gen.build_queries(tableNames, fileNames)
    def insert_cassandra(self, session, source, country, country_count):

        query = "SELECT id FROM main_count"
        print query
        session2 = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
        session2.default_timeout = 100
        data = session2.execute(query)
        for row in data:
            uid = row[0]
        # retrieving the id from the source table and updating the country_count (list) and country (list) columns
        print len(country), len(country_count)
        session.execute("UPDATE "+source+" SET country=%s, country_count=%s WHERE id=%s", parameters=[country, country_count, uid])
        return 1
    def run_inserts_at_version(self, proto_ver):
        session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
        try:
            p = session.prepare('insert into t (k, v) values (?, ?)')
            session.execute(p, (0, [{1, 2}, {3, 5}]))

            p = session.prepare('insert into u (k, v) values (?, ?)')
            session.execute(p, (0, {(1, 2), (3, 5)}))

            p = session.prepare('insert into v (k, v, v1) values (?, ?, ?)')
            session.execute(p, (0, {(1, 2): [1, 2, 3], (3, 5): [4, 5, 6]}, (123, 'four')))

            p = session.prepare('insert into w (k, v) values (?, ?)')
            session.execute(p, (0, ({1: [1, 2, 3], 2: [4, 5, 6]}, [7, 8, 9])))

        finally:
            session.cluster.shutdown()
Example #20
    def get_todays_tweet(self):

        word_data, sorted_by_loc = {}, []

        timestamp = (datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m')
                     + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H'))

        cassandra = Cluster([
            config.get("cassandra.host1"),
            config.get("cassandra.host2"),
            config.get("cassandra.host3")
        ]).connect('insight')

        tweeted_words = cassandra.execute(self.QUERY_TODAYS_TWEET % timestamp)
        # tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % '15020221')

        # for (yymmddhh, timestamp, data, lat, lng) in tweeted_words:
        for (yymmddhh, location, string, data, timestamp) in tweeted_words:

            if location not in word_data:
                word_data[location] = {}

            for word, count in data.iteritems():

                word = word.encode("utf-8")

                if word not in word_data[location]:
                    word_data[location][word] = count
                else:
                    word_data[location][word] += count

        for location in word_data:
            # In place sort by number of words in desc order
            sorted_by_loc = sorted(word_data[location].items(),
                                   key=operator.itemgetter(1))
            sorted_by_loc.reverse()

            # Re-enter sorted data
            word_data[location] = {}
            for word_pair in sorted_by_loc:
                word = word_pair[0]
                count = word_pair[1]
                word_data[location][word] = count

        return word_data
Example #21
class Searcher:
    start = 0
    def __init__(self,name):

        self.filename = mapper[name]['search']
        Searcher.start = datetime.now()

        self.log1 = getLogger('time', 'search_time.log')
        self.log2 = getLogger('result', 'search_result.log')

        self.log2.info('## connecting to cassandra cluster')
        self.session = Cluster(DSE_IP).connect()

        cql = "SELECT * FROM reddit.comment WHERE solr_query=?"
        self.solr_query = '{{"q":"body:*{0}*"}}'
        self.cql_prepared = self.session.prepare(cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.ONE
        
     
    def search(self):
        counter = 0

        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter,line in enumerate(f):
                try:
                    line = line.strip('\n')
                    if counter%100 == 0:
                        self.log1.info(str(counter))
                        if counter%1000 == 0:
                            self.log1.info('{}'.format(datetime.now()- Searcher.start))
                            self.log1.info('')
                    self.log2.info('({})'.format(line))
                    
                    # format query body
                    data = self.solr_query.format(line)
                    
                    res = self.session.execute(self.cql_prepared,[data],timeout=60000)
                    
                    self.log2.info(res[0])
                except Exception as e:
                    self.log2.warn(e)


        self.log1.info('total time:{}'.format(datetime.now() - Searcher.start))
def get_user_timeline(user: str, db_session: Cluster):
    """
    Retrieve the 20 most recent photos from a user's timeline

    User timelines are stored in table home_status_updates

    Arguments:
        user: Username to retrieve
        db_session: Cassandra instance

    Returns:
        20 most recent database records
    """
    sql_query = f"""
    SELECT *
    FROM home_status_updates
    WHERE timeline_username = '******' limit 20;"""
    timeline = db_session.execute(sql_query)
    return timeline
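The username literal in the query above was redacted in the source. As a hedged variant (not the original author's code), the same lookup can bind the value through the driver's %s placeholders instead of formatting it into the CQL text:

def get_user_timeline_bound(user: str, db_session):
    # Same query, but the username is passed as a bound parameter.
    query = """
    SELECT *
    FROM home_status_updates
    WHERE timeline_username = %s
    LIMIT 20;"""
    return db_session.execute(query, (user,))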
class CassandraConnector(object):
  """
    Handles exporting data from Cassandra to Neo4j
    using CSV as intermediary

  """

  def __init__(self, keyspace_name, tables=None, schema_file=None, queries=None):
    self.KEYSPACE = keyspace_name
    self.session = Cluster().connect(self.KEYSPACE)
    self.keyspace_metadata = self.session.cluster.metadata.keyspaces[self.KEYSPACE]
    self.tables = tables
    self.schema_file = schema_file

  def getTables(self):
    if self.tables:
      return self.tables
    else:
      return self.session.cluster.metadata.keyspaces[self.KEYSPACE].tables.keys()

  def getColumnsForTable(self, table):
    return self.session.cluster.metadata.keyspaces[self.KEYSPACE].tables[table].columns.keys()

  def parse(self):
    keyspace = self.session.cluster.metadata.keyspaces[self.KEYSPACE]
    parser = SchemaParser(keyspace.export_as_string())
    parser.parse()
    sys.exit("Generated schema.yaml file.")

  def export(self):
    tableNames = self.getTables()
    fileNames = [t + "_results.csv" for t in tableNames]

    for t in tableNames:
      results_file = codecs.open(t + "_results.csv", encoding='utf-8', mode='w+')
      rows = self.session.execute('SELECT * FROM ' + t)
      writer = csv.writer(results_file)
      writer.writerow(self.getColumnsForTable(t))
      writer.writerows([(e for e in row) for row in rows])

    cypher_queries_gen = CypherQueriesGenerator(self.keyspace_metadata, self.schema_file)
    cypher_queries_gen.generate()
    cypher_queries_gen.build_queries(tableNames, fileNames)
Example #24
    def get_todays_tweet(self):

        word_data, sorted_by_loc = {}, []

        timestamp = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')

        cassandra = Cluster([config.get("cassandra.host1"), config.get("cassandra.host2"), config.get("cassandra.host3")]).connect('insight')

        tweeted_words = cassandra.execute(self.QUERY_TODAYS_TWEET % timestamp)
        # tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % '15020221')

        # for (yymmddhh, timestamp, data, lat, lng) in tweeted_words:
        for (yymmddhh, location, string, data, timestamp) in tweeted_words:

            if location not in word_data:
                word_data[location] = {}

            for word, count in data.iteritems():

                word = word.encode("utf-8")

                if word not in word_data[location]:
                    word_data[location][word] = count
                else:
                    word_data[location][word] += count

        for location in word_data:
            # In place sort by number of words in desc order
            sorted_by_loc = sorted(word_data[location].items(), key=operator.itemgetter(1))
            sorted_by_loc.reverse()

            # Re-enter sorted data
            word_data[location] = {}
            for word_pair in sorted_by_loc:
                word = word_pair[0]
                count = word_pair[1]
                word_data[location][word] = count

        return word_data
    def initialize_connection(self):

        session = Cluster(contact_points=[self.host], port=self.port).connect(keyspace=self.keyspace)
        session.default_timeout = 100
        query = "SELECT host, id FROM "+self.table
        statement = SimpleStatement(query)
        getdata = session.execute(statement)
        hosts, id_val, count = [], [], 0
        count_list = {}
        for data in getdata:
            value = str(data[0]).strip()
            id_val.append(data[1])
            if value.find(',') == -1:
                hosts.append(value)
            else:
                hosts.append(value.split(",")[0])
            count += 1
        print count
        #create instance for the class retrieve_location
        process = retrieve_location()
        # function calls
        count_list = process.get_location(session, self.table, hosts, id_val)
        self.insert_cassandra(session, self.source, count_list.keys(), count_list.values())
        return 1
Example #26
def main(input_keyspace, output_keyspace):
    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 2}"
        % output_keyspace)
    session.set_keyspace(output_keyspace)
    session.execute("CREATE TABLE IF NOT EXISTS orders_parts ("
                    "orderkey int, "
                    "custkey int, "
                    "orderstatus text, "
                    "totalprice decimal, "
                    "orderdate date, "
                    "order_priority text, "
                    "clerk text, "
                    "ship_priority int, "
                    "comment text, "
                    "part_names set<text>, "
                    "PRIMARY KEY (orderkey))")
    session.execute('TRUNCATE orders_parts')
    session.shutdown()

    df_part = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=input_keyspace).load()
    df_orders = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=input_keyspace).load().cache()
    df_lineitem = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=input_keyspace).load()
    joined_df = df_orders.join(df_lineitem,
                               df_orders.orderkey == df_lineitem.orderkey)
    final_joined_df = joined_df.join(df_part,
                                     df_part.partkey == joined_df.partkey)
    selected_df = final_joined_df.groupBy(
        df_orders['orderkey'], 'totalprice').agg(
            functions.collect_set(
                df_part['name']).alias('part_names')).drop('totalprice')
    output_df = selected_df.join(df_orders, "orderkey")
    output_df.write.format("org.apache.spark.sql.cassandra").options(
        table="orders_parts", keyspace=output_keyspace).save()
Example #27
    def setUp(self):
        temp = Cluster(['127.0.0.1']).connect()
        temp.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH REPLICATION = {
                'class' : 'SimpleStrategy',
                'replication_factor' : 1 };
        ''' % (self.keyspace))

        temp = Cluster(['127.0.0.1']).connect(self.keyspace)
        temp.execute('''
            DROP TABLE IF EXISTS superdupertable;
        ''')
        temp.execute('''
            CREATE TABLE IF NOT EXISTS superdupertable (
              uid UUID,
              name text,
              body text,
              last_change timestamp,
              PRIMARY KEY (uid, last_change)
            );
        ''')
        self.cass = CassandraSync(self.keyspace)
Example #28
def insert_data():
    from cassandra.cluster import Cluster
    from cassandra import AlreadyExists
    # Connect to the Cassandra cluster using the "test" keyspace
    session = Cluster([
        '192.168.101.140', '192.168.101.141', '192.168.101.142',
        '192.168.101.143'
    ]).connect("test")
    try:
        # First, create the table
        session.execute("CREATE TABLE test1 (id int PRIMARY KEY, col1 text)")
        # Insert data
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
    except AlreadyExists:
        # Insert data into the already existing table
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
Example #29
def main(input_dir, keyspace_name, table_name):
    rdd = sc.textFile(input_dir)
    preprocess_rdd = rdd.map(lambda line: preprocess(line))
    cleaned_rdd = preprocess_rdd.filter(lambda x: x is not None)

    schema = "id STRING, host STRING, datetime TIMESTAMP, path STRING, bytes INT"
    # sch = ['id', 'host', 'datetime', 'path', 'bytes']

    df = spark.createDataFrame(data=cleaned_rdd, schema=schema)
    session = Cluster(cluster_seeds).connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS " + keyspace_name +
        " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}")
    session.set_keyspace(keyspace_name)
    session.execute("DROP TABLE IF EXISTS " + keyspace_name + "." + table_name)
    session.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name +
        " (id UUID, host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, PRIMARY KEY (host, id))"
    )
    # session.execute("TRUNCATE "+table_name)
    session.shutdown()

    df.write.format("org.apache.spark.sql.cassandra").options(
        table=table_name, keyspace=keyspace_name).save()
class CassandraService:
    def __init__(self, keyspace='data'):
        self.session = Cluster().connect(keyspace)
        self.user_insert_stmt = self.prepare_user_insert_statement()
        self.tweet_insert_stmt = self.prepare_tweet_insert_statement()
        self.exception_insert_stmt = self.prepare_exception_insert_statement()

    def save_user(self, user: User):
        self.session.execute(self.user_insert_stmt, vars(user))

    def save_tweet(self, tweet: Tweet):
        self.session.execute(self.tweet_insert_stmt, vars(tweet))

    def save_exception(self, exception_data: dict):
        self.session.execute(self.exception_insert_stmt, exception_data)

    def prepare_user_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019 (id, captured_at, created_at, description, entities, favourites_count,
                followers_count, following_count, friends_count, listed_count, name, pinned_tweet_id, profile_image_url,
                protected, tweets_count, url, user_name)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """)

    def prepare_tweet_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_tweets (id, captured_at, created_at, user_id, timezone, content, link, retweet,
                mentions, urls, photos, video, lang, replies_count, retweets_count, likes_count, hashtags, cashtags)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """)

    def prepare_exception_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_exceptions (user, exception)
            VALUES (?, ?)
            """)
Example #31
def current_time():
    return (datetime.now() - datetime(1970, 1, 1)).total_seconds()


for lines in args.lines:
    for columns in args.columns:
        initial_size = 0
        print("Deleting previous data")
        try:
            subprocess.check_output("rm " + args.queue_dir + " -R", shell=True)
        except subprocess.CalledProcessError:
            pass
        try:
            session = Cluster(['localhost']).connect('kairosdb')
            session.default_timeout = 1200
            session.execute('drop keyspace kairosdb;')
            print("Data deleted.")
        except NoHostAvailable:
            print("No previous data exists. Skipping delete.")
        print("Starting Kairosdb")
        kairos = subprocess.Popen([args.kairos_path, "run"],
                                  stderr=subprocess.DEVNULL,
                                  stdout=subprocess.DEVNULL)
        time.sleep(4)
        while True:
            try:
                subprocess.check_output("curl localhost:8080",
                                        shell=True,
                                        stderr=subprocess.DEVNULL)
            except subprocess.CalledProcessError:
                print("Kairos is not ready yet. Checking again in 2 seconds.")
    def __str__(self):
        # just extracting request count from the size stats (which are recorded on all requests)
        request_sizes = dict(self.requests)
        count = request_sizes.pop('count')
        return "%d requests (%d errors)\nRequest size statistics:\n%s" % (
            count, self.errors, pp.pformat(request_sizes))


# connect a session
session = Cluster().connect()

# attach a listener to this session
ra = RequestAnalyzer(session)

session.execute("SELECT release_version FROM system.local")
session.execute("SELECT release_version FROM system.local")

print(ra)
# 2 requests (0 errors)
# Request size statistics:
# { '75percentile': 74,
#   '95percentile': 74,
#   '98percentile': 74,
#   '999percentile': 74,
#   '99percentile': 74,
#   'max': 74,
#   'mean': 74.0,
#   'median': 74.0,
#   'min': 74,
#   'stddev': 0.0}
class CassandraESSync:
	
	mappings = []

	cassandra_nodes = []
	cassandra_keyspace = None
	cassandra_cfs = []

	es_nodes = []
	es_index = None
	es_types = []

	cassandra_client = None 
	es_client = None



	last_synced = {'cassandra': {}, 'es': {} }  # stores the last time each cf and index were synced, to avoid unnecessary queries


	def __init__(self, config):

		self.mappings = config["mappings"]

		self.cassandra_nodes = config['cassandra']['nodes']
		self.cassandra_keyspace = config['cassandra']['keyspace']
		self.cassandra_cfs = config['cassandra']['column_families']

		self.es_nodes = config['elasticsearch']['nodes']
		self.es_index = config['elasticsearch']['index']
		self.es_types = config['elasticsearch']['types']

		self.cassandra_client = Cluster().connect(self.cassandra_keyspace)
		self.es_client = elasticsearch.Elasticsearch()


	def sync_databases(self):
		for mapping in self.mappings:
			cassandra_cf, es_type = mapping
			self.sync_cf_type(cassandra_cf, es_type)


	def sync_cf_type(self, cassandra_cf, es_type):


		cf_id_column = self.cassandra_cfs[cassandra_cf]['id'] # column storing the document's uid
		cf_timestamp_column = self.cassandra_cfs[cassandra_cf]['timestamp'] # column storing the document's timestamp

		index_id_column = self.es_types[es_type]['id'] # column storing the document's uid
		index_timestamp_column = self.es_types[es_type]['timestamp'] # column storing the document's timestamp

		cf_data_fields = self.cassandra_cfs[cassandra_cf]['columns']
		cf_fields = [cf_id_column, cf_timestamp_column] + cf_data_fields

		type_data_fields = self.es_types[es_type]['columns']

		if cassandra_cf in self.last_synced['cassandra']:
			cf_start_time, cf_end_time = self.last_synced['cassandra'][cassandra_cf], time.time()  
		else:
			cf_start_time, cf_end_time = None, None

		if es_type in self.last_synced['es']:
			index_start_time, index_end_time = self.last_synced['es'][es_type], time.time()
		else:
			index_start_time, index_end_time = None, None

		cassandra_data_query = 'SELECT %s, %s FROM %s' % (cf_id_column, cf_timestamp_column, cassandra_cf)
		
		range_filter = {}
		if index_start_time and index_end_time:
			range_filter = self.get_es_range_filter(index_timestamp_column, index_start_time, index_end_time)
		

		self.cassandra_client.set_keyspace(self.cassandra_keyspace)
		cassandra_data = self.cassandra_client.execute(cassandra_data_query)
		self.last_synced['cassandra'][cassandra_cf] = time.time()

		es_data = [] #self.es_client.search(index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], body=range_filter)
		es_scan = scan(self.es_client, index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], query=range_filter)
		self.last_synced['es'][es_type] = time.time()

		for data in es_scan:
			es_data.append(data)

		all_data = {}

		ids_to_insert_on_cassandra = []
		ids_to_update_on_cassandra = []

		ids_to_insert_on_es = []
		ids_to_update_on_es = []


		# because we can't make a range query on a non-primary key in cassandra, we have to retrieve it all and then check the timestamp by hand.
		for document in cassandra_data:
			doc_id, doc_timestamp = str(document[0]), int(calendar.timegm(document[1].utctimetuple()))
			if not(cf_start_time and cf_end_time):
				all_data[doc_id] = [doc_timestamp, None]
			elif cf_start_time and cf_end_time and doc_timestamp >= cf_start_time and doc_timestamp <= cf_end_time: # 
				all_data[doc_id] = [doc_timestamp, None]

		for document in es_data:
			if "fields" in document:
				if index_id_column == '_id': # special case - is not inside fields. there must be a better way to do this ;(
					doc_id, doc_timestamp = document[index_id_column], int(document['fields'][index_timestamp_column][0])
				else:
					doc_id, doc_timestamp = document['fields'][index_id_column], int(document['fields'][index_timestamp_column][0])

				if doc_id in all_data:
					all_data[doc_id][1] = doc_timestamp
				else:
					all_data[doc_id] = [None, doc_timestamp]

		
		for uid in all_data:
			cassandra_ts, es_ts = all_data[uid]
			if cassandra_ts and es_ts:
				if cassandra_ts > es_ts: # same id, cassandra is the most recent. update that data on es. 
					ids_to_update_on_es.append(uid)
				elif es_ts > cassandra_ts: # same id, es is the most recent. update that data on cassandra.
					ids_to_update_on_cassandra.append(uid)
			elif cassandra_ts: # present only on cassandra. add to es.
				ids_to_insert_on_es.append(uid)
			elif es_ts: #present only on es. add to cassandra.

				ids_to_insert_on_cassandra.append(uid)


		if ids_to_insert_on_es or ids_to_update_on_es:
			actions = []
			from_cassandra_to_es = self.get_cassandra_documents_by_id(cassandra_cf, cf_fields, cf_id_column, ids_to_insert_on_es + ids_to_update_on_es)

			for document in from_cassandra_to_es:
				data = {}
				for i in range(len(cf_data_fields)):
					data[type_data_fields[i]] = getattr(document, cf_data_fields[i]) 	

				actions.append(self.get_es_bulk_action(es_type, index_id_column, getattr(document, cf_id_column), index_timestamp_column, getattr(document, cf_timestamp_column), data))
			
			bulk(self.es_client, actions) # send all inserts/updates to es at once

		if ids_to_insert_on_cassandra or ids_to_update_on_cassandra:
			batch = BatchStatement()

			type_fields = type_data_fields + [index_id_column, index_timestamp_column]
			ids_filter = self.get_es_ids_filter(es_type, ids_to_insert_on_cassandra + ids_to_update_on_cassandra)
			from_es_to_cassandra = self.es_client.search(index=self.es_index, doc_type=es_type, fields=type_data_fields + [cf_timestamp_column], body=ids_filter)
			
			for document in from_es_to_cassandra['hits']['hits']:
				
				id_value = document[index_id_column] if index_id_column == '_id' else document["fields"][index_id_column] # this makes me a saaaad panda
				id_value = id_value
				
				es_data = [UUID(id_value), datetime.datetime.utcfromtimestamp(int(document['fields'][index_timestamp_column][0]))]
				
				for field in type_data_fields:
					es_data.append(document['fields'][field][0])

				prepared_insert_statement = self.get_prepared_cassandra_insert_statement(cassandra_cf, cf_fields)
				prepared_update_statement = self.get_prepared_cassandra_update_statement(cassandra_cf, cf_id_column, cf_fields[1:])
				
				if id_value in ids_to_insert_on_cassandra:
					batch.add(prepared_insert_statement, tuple(es_data))
				else: 
					batch.add(prepared_update_statement, tuple(es_data[1:] + [UUID(id_value)]))

			
			self.cassandra_client.execute(batch)


	def get_cassandra_documents_by_id(self, cf, fields, id_column, ids):
		from_cassandra_to_es_query = "SELECT %s FROM %s WHERE %s IN (%s)" % (",".join(fields), cf, id_column, ",".join(ids))
		return self.cassandra_client.execute(from_cassandra_to_es_query)

	def get_prepared_cassandra_insert_statement(self, cf, cols):
		base_insert_statement = "INSERT INTO %s (%s) VALUES (%s)" % (cf, ",".join(cols), ",".join(["?"] * len(cols)))	
		return self.cassandra_client.prepare(base_insert_statement)

	def get_prepared_cassandra_update_statement(self, cf, id_column, fields):
		base_update_statement = "UPDATE %s SET %s WHERE %s = ?" % (cf, ",".join([field + " = ?" for field in fields]), id_column)
		return self.cassandra_client.prepare(base_update_statement)

	def get_es_range_filter(self, col, start, end):
		return {
				"filter": {
					"range" : {
						col: {
			        		"gte" : start,
			        		"lte" : end
			        	}
			    	}
				}
			}

	def get_es_ids_filter(self, es_type, ids):
		return {
				"filter": {
					"ids" : {
			        	"type" : es_type,
			        	"values": ids 
			    	}
				}
			}

	def get_es_bulk_action(self, es_type, id_column, id_value, timestamp_column, timestamp_value, data):

		if isinstance(timestamp_value, datetime.datetime):
			timestamp_value = calendar.timegm(timestamp_value.utctimetuple())

		id_value = str(id_value)
		timestamp_value = str(timestamp_value)

		data[timestamp_column] = timestamp_value

		action = {}
		action['_index'] = self.es_index
		action['_type'] = es_type 
		action[id_column] = id_value
		action['_source'] = data

		return action
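The constructor above reads a nested configuration; a minimal sketch of the expected shape, inferred from __init__ and sync_cf_type (every name and address below is an illustrative assumption, not a value from the source):

config = {
    "mappings": [["users_cf", "users_type"]],  # (cassandra_cf, es_type) pairs consumed by sync_databases()
    "cassandra": {
        "nodes": ["127.0.0.1"],
        "keyspace": "sync_demo",
        "column_families": {
            "users_cf": {"id": "uid", "timestamp": "last_change", "columns": ["name", "body"]},
        },
    },
    "elasticsearch": {
        "nodes": ["127.0.0.1"],
        "index": "sync_demo",
        "types": {
            "users_type": {"id": "_id", "timestamp": "last_change", "columns": ["name", "body"]},
        },
    },
}

syncer = CassandraESSync(config)
syncer.sync_databases()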
Example #34
def _get_system_column_rows(server=None):
    server = server or "127.0.0.1"
    logging.info("Getting server columns from server: '%s'", server)
    session = Cluster([server]).connect()
    return session.execute("SELECT * FROM system.schema_columns")
Example #35
#!/usr/bin/python
# Copyright (C) 2014-2016, National Rural Electric Cooperative Association and Cigital, Inc

from cassandra.cluster import Cluster

session = Cluster(['127.0.0.1']).connect('demo')
for currentRow in session.execute('select content from packet'):
    print currentRow[0], '\n'
Example #36
class Reassembler(Process):
    def initCluster(self):
        auth_provider = PlainTextAuthProvider(username=cfg.cassandraConfig["user"], password=cfg.cassandraConfig["password"])
        self.session = Cluster([cfg.cassandraConfig["host"]], auth_provider=auth_provider).connect(cfg.cassandraConfig["db"])
        self.preparedQuery = self.session.prepare("""INSERT into packet (source_addr, dest_addr, time_stamp, content, text_values)
                                                        VALUES (?,?,?,?,?)""")

    def insertIntoDatabase(self, sourceAddr, destAddr, timeStamp, content, textValues):
        args = [sourceAddr, destAddr, timeStamp, content, textValues]
        self.session.execute(self.preparedQuery, args)

    def __init__(self, ports):
        self.ports = []
        list = ports.split(',')     # split CSV port list arguments
        self.ports = map(int,list)  # convert ports to int (from string)
        self.initCluster()
        nids.register_tcp(self.handleTcpStream)                     # set up call back
        
    def __call__(self):     # make a singleton class
        return self
    
    def printableHex(self, buf):
        return ' '.join(x.encode('hex') for x in buf)
        
    def handleTcpStream(self, tcp):
        end_states = (nids.NIDS_CLOSE, nids.NIDS_TIMEOUT, nids.NIDS_RESET)
        logging.debug('tcps - {0} state: {1} timestamp: {2}'.format(str(tcp.addr),tcp.nids_state,nids.get_pkt_ts() * 1000))
        if tcp.nids_state == nids.NIDS_JUST_EST:
            # new to us, but do we care?
            ((src, sport), (dst, dport)) = tcp.addr
            #if dport in self.ports:
            logging.info('collecting: {}'.format(str(tcp.addr)))
            tcp.client.collect = 1
            tcp.server.collect = 1
        elif tcp.nids_state == nids.NIDS_DATA:
            tcp.discard(0)
            # keep all of the stream's new data
            #informs nids how many bytes in the stream to discard
            #((src, sport), (dst, dport)) = tcp.addr
            #serverData = tcp.server.data[:tcp.server.count]
            #clientData = tcp.client.data[:tcp.client.count]
            #envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            #envelopeRegex2 = '</.+:[Ee]nvelope'
            #if serverData is None or clientData is None:
            #   tcp.discard(0)
            #else:
            #   if "Expect: 100-continue" not in serverData:
            #       tcp.discard(0)
            #   else:
            #       if (re.search(envelopeRegex,serverData,re.S | re.IGNORECASE) and re.search(envelopeRegex2,serverData,re.S | re.IGNORECASE) and re.search(envelopeRegex,clientData,re.S | re.IGNORECASE) and re.search(envelopeRegex2,clientData,re.S | re.IGNORECASE)):
            #           tcpaddr = ((dst,dport),(src,sport))
            #           logging.debug( "count_new: {}".format(tcp.server.count_new))
            #           logging.debug( "offset server: {}".format(tcp.server.offset))
            #           self.process_ipframe(serverData,tcp.addr,self.timestamp)
#
            #           logging.debug( "count_new: {}".format(tcp.server.count_new))
            #           logging.debug( "offset client: {}".format(tcp.client.offset))
            #           tcpaddr = ((dst,dport),(src,sport)) #flip it around to match our point of view (since this is the client
            #           self.process_ipframe(clientData,tcpaddr,self.timestamp)
            #           tcp.discard(tcp.server.count + tcp.client.count)
            #       else:
            #           tcp.discard(0)
        elif tcp.nids_state in end_states:
            ((src,sport),(dst,dport)) = tcp.addr
            serverData = tcp.server.data[:tcp.server.count]
            clientData = tcp.client.data[:tcp.client.count]
            #logging.debug("serverData: {0}".format(    serverData))
            #logging.debug("clientData: {0}".format(clientData))
            self.timestamp = nids.get_pkt_ts() * 1000
            #Add the MultiSpeakMsgHeader since we observed way too many false positives during
            #the virtual field test
            envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            logging.info("Serv Count: {0} Client Count {1} newc: {2} news: {3}".format(tcp.server.count,tcp.client.count,tcp.client.count_new,tcp.server.count_new))
            #Match even if there is a newline since we've observed some payloads with the newline
            #print("server is ", tcp.server.data[:tcp.server.count], "client is ", tcp.client.data[:tcp.client.count], "count new is ", tcp.server.count_new)
            if serverData is not None:
                serverData = serverData.replace("\n","")
                if (re.search(envelopeRegex,serverData,re.S | re.IGNORECASE)): #and tcp.server.count_new > 0):
                    logging.info('full message found in tcp server data')
                    payload = tcp.server.data[:tcp.server.count]
                    #tcpaddr = ((dst,dport),(src,sport))
                    logging.debug( "count_new: {}".format(tcp.server.count_new))
                    logging.debug( "offset server: {}".format(tcp.server.offset))
                    self.process_ipframe(payload,tcp.addr,self.timestamp)
                    tcp.discard(tcp.server.count)
                elif "multispeak" in serverData.lower():
                    logging.warning("multispeak serverData but envelope failed: {}".format(serverData))
            if clientData is not None:
                clientData = clientData.replace("\n","")
                if (re.search(envelopeRegex,clientData, re.S | re.IGNORECASE)):
                    logging.info('full message found in tcp client data')
                    tcpaddr = ((dst,dport),(src,sport)) #flip it around to match our point of view (since this is the client)
                    payload = tcp.client.data[:tcp.client.count]
                    logging.debug("count_new client: {}".format(tcp.client.count_new))
                    logging.debug( "offset client: {}".format(tcp.client.offset))
                    self.process_ipframe(payload,tcpaddr,self.timestamp) #modified tcpaddr
                    tcp.discard(tcp.client.count)
                elif "multispeak" in clientData.lower():
                    logging.warning("multispeak clientData but envelope failed: {}".format(clientData))
            logging.debug( "addr: {}".format(tcp.addr))
            logging.debug( "To server:")
            logging.debug( "bytes {}".format(str(tcp.server.count)))
            logging.debug( "To client:")
            logging.debug( "bytes: {}".format(str(tcp.client.count)))

    def process_ipframe(self,frame,tcpaddr, timestamp):
        # note that we are no longer checking source IP addresses
        # so we could be processing frames from other ips if not filtered
        # before this point
        cleansedFrame = frame.replace("\n","").replace("\r","").replace("\t","")
        envelopeRegex = '</.+:[Ee]nvelope'
        match = re.search(envelopeRegex,cleansedFrame)
        if match:
            ((src, sport),(dst,dport)) = tcpaddr
            #try to find the endpoint type from POST Request
            endpointRegex = '(?<=POST\s).*(?=\sHTTP)'   # looks for POST and HTTP, and matches the URL
            match = re.search(endpointRegex,cleansedFrame, re.IGNORECASE)
            if match is not None:
                URLsplit = cleansedFrame[match.start():match.end()].split('/')  # [foo,QA_SERVER]
                endpointCodeSplit = URLsplit[-1].split('_') # [QA,SERVER]
                endpointCode = endpointCodeSplit[0] # QA
                logging.debug("parsed MS endpoint code: {}".format(endpointCode))
            else:
                endpointCode = 'NULL'
                logging.warning("Unable to parse endpoint code from header")
                #get the version from the SOAPAction http header
            versionRegex = '(?<=SOAPAction:\s"http:\/\/www.multispeak.org\/Version_).'
            match = re.search(versionRegex,cleansedFrame, re.IGNORECASE)
            mspVersion = 'NULL'
            if match is not None:
                mspVersion = cleansedFrame[match.start()]
                if (mspVersion != '3' and mspVersion != '5'):
                    mspVersion = 'NULL'
            else:
                versionRegex2 = '(?<=SOAPAction:\shttp:\/\/www.multispeak.org\/Version_).' # uses lookbehind to extract only version num
                match2 = re.search(versionRegex2, cleansedFrame, re.IGNORECASE)
                if match2 is not None:
                    mspVersion = cleansedFrame[match2.start()]
                    if (mspVersion != '3' and mspVersion != '5'):
                        mspVersion = 'NULL'
                else:
                    versionRegex3 = '<MultiSpeakMsgHeader[^>]* Version="(\d)\.'
                    match3 = re.search(versionRegex3, cleansedFrame, re.IGNORECASE)
                    if match3 is not None:
                        mspVersion = match3.group(1)
                        if (mspVersion != '3' and mspVersion != '5'):
                            mspVersion = 'NULL'
                    

#           messageNameRegex = '(<[\w:]*[Bb]ody>)(\s*)(<)(?P<MsgName>[\w|:]+)'
            messageNameRegex = '(<([\w-]+:)?body>)(\s*)(<)(([\w-]+:)?)(?P<MsgName>\w*)'
            match = re.search(messageNameRegex,cleansedFrame, re.IGNORECASE)
            if match is not None:
                messageName = match.group('MsgName').split(':')[-1]
                text_values = {'endpoint':endpointCode,'messagetype':messageName,'mspVersion':mspVersion}
                logging.debug("text values: {}".format(text_values))
                self.insertIntoDatabase(src, dst, timestamp, frame, text_values)
                print("inserted packet")
            else:
                print("MsgName not found")
                logging.debug("frame with no MsgName: {}".format(cleansedFrame))
        else:
            logging.debug("end of envelope not found: {}".format(cleansedFrame))
            print("end of envelope not found")
Exemple #37
0
#!/usr/bin/python
# Copyright (C) 2014-2016, National Rural Electric Cooperative Association and Cigital, Inc

from cassandra.cluster import Cluster

session = Cluster(['127.0.0.1']).connect('demo')
for currentRow in session.execute('select content from packet'):
    print(currentRow[0], '\n')
Exemple #38
0
class Loader:
    start = 0
    def __init__(self,name):
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('scy+es','load.log')
        
        Loader.start = datetime.now()

    
        self.index = name
        self.stat = Statement(name)

        self.filename = mapper[name]['load']

        self.__init_scy()
        self.__init_es()
        
        self.pool_scy = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)
        self.pool_es = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)

    def load(self):

        if self.index == 'reddit':
            # main loop
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool_scy.submit(self.__insert_data,line)
                    self.pool_es.submit(self.__insert_index,line)
                except StopIteration:
                    break
                except Exception:
                    continue 
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename,encoding='utf-8')
            end = df.index.max()
            df = df.fillna('missing')
            try:
                for line in zip(range(0,end+1),df['userId'],df['productId'],
                                    df['rating'],df['title'],df['comment'],df['timestamp']):
                    counter = line[0]
                    if counter%100000 == 0:
                        self.log.info(str(counter))
                        if counter%1000000 == 0:
                            self.log.info('{}'.format(datetime.now()-Loader.start))
                            self.log.info('')
                    self.pool_scy.submit(self.__insert_data,line)
                    self.pool_es.submit(self.__insert_index,line)
            except Exception as e:
                print(e)
            
        self.pool_scy.shutdown()
        self.pool_es.shutdown()
        
        self.es.indices.refresh(index=self.index)
        # shutdown
        self.session.shutdown()

        # print information
        self.log.info('## Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')


    def __line_generator(self):

        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter,line in enumerate(f):
                try:
                    if counter%100000 == 0:
                        self.log.info(str(counter))
                        if counter%1000000 == 0:
                            self.log.info('{}'.format(datetime.now()-Loader.start))
                            self.log.info('')
                    yield line
                except Exception as e:
                    print(e)
                    continue


    # connect to ScyllaDB, create the keyspace and table, and prepare the insert statement
    def __init_scy(self):
        # session = Cluster(contact_points=SCYLLA_IP,execution_profiles={EXEC_PROFILE_DEFAULT:ep}).connect()
        self.session = Cluster(contact_points=SCYLLA_IP).connect()
        # create the keyspace
        self.session.execute(self.stat.create_ks)
        # create the table
        self.session.execute(self.stat.create_tb)

        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM



    # connect to Elasticsearch and create the index
    def __init_es(self):

        with open('mapping.json', 'r') as f:
            mapping = json.load(f)[self.index]

        self.es = Elasticsearch(ES_IP, timeout=30)
        # create the es index (ignore=400 tolerates an index that already exists)
        self.es.indices.create(index=self.index, body=mapping, ignore=400, timeout=30)



    # insert data into scylladb
    def __insert_data(self,line):

        data = list()
        # TODO: format your data
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'], line['subreddit_id'],line['author'], line['body'], int(line['created_utc'])]
        elif self.index == 'amazon':
            #       id | user_id | product_id | rating | title | body | timestamp
            data = [line[0], int(line[1]), int(line[2]), line[3], line[4], line[5], line[6]]
        self.log.info(data)
        res = self.session.execute(self.cql_prepared,data,timeout=60000)
        return 
    


    # insert data into elasticsearch
    def __insert_index(self,line): 

        # TODO: format your data
        if self.index == 'reddit':
            data = { k:v for k,v in line.items() if k in ['id', 'name', 'author', 'body'] }
        elif self.index == 'amazon':
            data = {}
            data['id'] = line[0]
            data['title'] = line[4]
            data['body'] = line[5]

        res = self.es.index(index=self.index, doc_type="comment", id=data['id'], body=data)
        return 
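ThreadPoolExecutorWithQueueSizeLimit, used by Loader.__init__, is not defined in this snippet. A common recipe for it (an assumption here, not the original class) swaps the executor's unbounded work queue for a bounded one, so the file reader blocks instead of buffering the whole input in memory:

import queue
from concurrent.futures import ThreadPoolExecutor

class ThreadPoolExecutorWithQueueSizeLimit(ThreadPoolExecutor):
    # sketch: submit() blocks once queue_size tasks are pending, applying
    # back-pressure to the producer loop in Loader.load()
    def __init__(self, max_workers=None, queue_size=10000):
        super().__init__(max_workers=max_workers)
        self._work_queue = queue.Queue(maxsize=queue_size)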
Exemple #39
0
    profiles = {'node1': node1_profile}

    user_info = {'username':'******', 'password':'******'}
    auth_provider = PlainTextAuthProvider(username=user_info['username'], password=user_info['password'])
    logger.info(f'Logging into {ARGS["<cluster-ip>"]} with user={user_info["username"]}')
    session = Cluster(
        [ARGS['<cluster-ip>']],
        port=9042,
        protocol_version=4,
        execution_profiles=profiles,
        connect_timeout=20,
        auth_provider=auth_provider
    ).connect()

    connection.register_connection('cluster1', session=session)
    logger.info(session.execute('SELECT release_version FROM system.local').one())

    print('Simple Queries:')

    print('\n' + 'Average temperature in NYC from 2006-2020:')
    statement = SimpleStatement('SELECT AVG(tavg) as avg_temp FROM spardata.weather_dataset')
    for row in session.execute(statement,timeout=30):
        print(row['avg_temp'])

    print('\n' + 'Average temperature in NYC by year:')
    statement = SimpleStatement('SELECT year, AVG(tavg) as avg_temp FROM spardata.weather_dataset GROUP BY year')
    for row in session.execute(statement,timeout=30):
        print(f"{row['year']} : {row['avg_temp']}")

    print('\n' + 'Average temperature in NYC by year and month:')
    statement = SimpleStatement('SELECT year, month, AVG(tavg) as avg_temp FROM spardata.weather_dataset GROUP BY year, month')
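The fragment above references node1_profile without defining it, and reads result rows as dictionaries (row['avg_temp']), which points at a dict_factory row factory. One plausible definition, offered only as a sketch of what the missing profile could look like:

from cassandra.cluster import ExecutionProfile
from cassandra.policies import RoundRobinPolicy
from cassandra.query import dict_factory

# hypothetical definition: dict_factory makes rows behave like dicts (row['avg_temp']);
# for plain session.execute() calls to pick this up, the same row_factory would also
# have to be set on the default profile
node1_profile = ExecutionProfile(
    load_balancing_policy=RoundRobinPolicy(),
    row_factory=dict_factory,
    request_timeout=30,
)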
Exemple #40
0
class CassWarehouse(LWWarehouse):
    
    compass_cls = CassWarehouseCompass
    
    NO_KEYSP_EXC = InvalidRequest
    NO_TABLE_EXC = InvalidRequest
    
    def connect(self):
        
        self.client = Cluster(
            contact_points=self.compass.hosts,
            port=self.compass.port,
            protocol_version=4,
            load_balancing_policy=RoundRobinPolicy()
        ).connect()
        
        self.set_keyspace()
    
    @keysp_dependent
    def set_keyspace(self):
        
        self.client.set_keyspace(self.keyspace)
    
    def create_keyspace(self):
        
        stmt = """
            CREATE KEYSPACE {keysp}
            WITH REPLICATION = {{
                'class': 'SimpleStrategy',
                'replication_factor': '{repl}' 
            }}
        """.format(
            keysp=self.keyspace,
            repl=self.compass.replication
        )
        
        self.client.execute(stmt)
    
    def create_table(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], *_, **__):

        stmt = """
            CREATE TABLE {keysp}.{table} (
                id_model VARCHAR,
                id_mover VARCHAR,
                id_file VARCHAR,
                file_content BLOB,
                PRIMARY KEY(id_model, id_mover, id_file)
            )
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME
        )
        
        self.client.execute(stmt)
    
    def delete(self, hierarchy: StoreHierarchy, ignore=False):

        deleted = True

        ids_stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE 
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )

        files = [row.id_file for row in self.client.execute(ids_stmt).current_rows]

        if len(files) == 0 and not ignore:
            self._raise_not_found(hierarchy)
        elif len(files) == 0 and ignore:
            # nothing to remove; skip the DELETE below, which would fail on an empty file list
            return False
        else:
            self.LOG.debug("Removing files {} from Cassandra".format(", ".join(files)))
        
        stmt = """
            DELETE FROM {keysp}.{table}
            WHERE
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
            AND id_file IN ('{_id_file}')
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child,
            _id_file="','".join(files) if len(files) > 1 else files[0]
        )
        
        self.client.execute(stmt)

        return deleted

    @table_dependent
    def store_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec]):
        
        fields = []
        values = []
        stmt = "BEGIN BATCH "
        
        for file_spec in file_schema:

            fields.append(file_spec.name)
            values.append(memoryview(file_spec.get_bytes()))

            stmt += """
                INSERT INTO {keysp}.{table} (id_model, id_mover, id_file, file_content) 
                VALUES ('{_id_model}', '{_id_mover}', '{_id_file}', %s); 
            """.format(
                keysp=self.keyspace,
                table=self.TABLE_NAME,
                _id_model=hierarchy.parent,
                _id_mover=hierarchy.child,
                _id_file=file_spec.name
            )
        stmt += " APPLY BATCH;"
        self.LOG.debug("Storing as blobs: {}".format(fields))
        self.client.execute(stmt, values)
    
    @table_dependent
    def deploy_files(self, hierarchy: StoreHierarchy, file_schema: List[FileSpec], path_to: str):

        stmt = """
            SELECT * FROM {keysp}.{table}
            WHERE 
                id_model='{_id_model}'
            AND id_mover='{_id_mover}'
        """.format(
            keysp=self.keyspace,
            table=self.TABLE_NAME,
            _id_model=hierarchy.parent,
            _id_mover=hierarchy.child
        )

        rows = self.client.execute(stmt).current_rows

        if len(rows) == 0:
            self._raise_not_found(hierarchy)

        for row in rows:
            self.LOG.debug('Deploying file: {}'.format(row.id_file))
            write_path = os.path.join(path_to, row.id_file)
            bites = row.file_content
            with open(write_path, 'wb') as out_file:
                out_file.write(bites)
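The delete() method above interpolates the file ids straight into the CQL text. If parameter binding is preferred, the driver's ValueSequence can expand a Python list into an IN (...) clause; a sketch of the equivalent statement as a standalone helper (keyspace and table names still go through str.format, since bind markers only work for values):

from cassandra.query import ValueSequence

def delete_files(session, keyspace, table, id_model, id_mover, files):
    # sketch: same DELETE as CassWarehouse.delete(), with the ids bound as parameters
    stmt = (
        "DELETE FROM {keysp}.{table} "
        "WHERE id_model = %s AND id_mover = %s AND id_file IN %s"
    ).format(keysp=keyspace, table=table)
    session.execute(stmt, (id_model, id_mover, ValueSequence(files)))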
Exemple #41
0
def data_from_cassandra(ks_name, table_name):
    session = Cluster().connect()
    return [
        list(row) for row in session.execute(
            'SELECT * FROM {ks}.{tab};'.format(ks=ks_name, tab=table_name))
    ]
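A call against the demo keyspace used earlier in this collection might look like this (keyspace and table names are only illustrative):

# hypothetical usage: fetch every row of demo.packet as a list of lists
rows = data_from_cassandra('demo', 'packet')
print(len(rows), 'rows')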
        self.errors += 1

    def __str__(self):
        # just extracting request count from the size stats (which are recorded on all requests)
        request_sizes = dict(self.requests)
        count = request_sizes.pop('count')
        return "%d requests (%d errors)\nRequest size statistics:\n%s" % (count, self.errors, pp.pformat(request_sizes))


# connect a session
session = Cluster().connect()

# attach a listener to this session
ra = RequestAnalyzer(session)

session.execute("SELECT release_version FROM system.local")
session.execute("SELECT release_version FROM system.local")

print(ra)
# 2 requests (0 errors)
# Request size statistics:
# { '75percentile': 74,
#   '95percentile': 74,
#   '98percentile': 74,
#   '999percentile': 74,
#   '99percentile': 74,
#   'max': 74,
#   'mean': 74.0,
#   'median': 74.0,
#   'min': 74,
#   'stddev': 0.0}
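The RequestAnalyzer above appears only from its error counter and __str__ onward. The pattern matches the driver's request-init-listener example, so the missing upper half would look roughly like the sketch below; treat the scales-based stats and attribute names as a reconstruction rather than the original source:

import pprint
from greplin import scales  # stats library; PmfStat/IntStat back the __str__ above

pp = pprint.PrettyPrinter(indent=2)

class RequestAnalyzer(object):
    # sketch of the part of the class not shown above
    requests = scales.PmfStat('request size')
    errors = scales.IntStat('errors')

    def __init__(self, session):
        scales.init(self, '/request')
        # receive a callback for every request the session creates
        session.add_request_init_listener(self.on_request)

    def on_request(self, rf):
        # rf is the ResponseFuture; hook its success/error callbacks
        rf.add_callbacks(self.on_success, self.on_error,
                         callback_args=(rf,), errback_args=(rf,))

    def on_success(self, _, response_future):
        self.requests.addValue(response_future.request_encoded_size)

    def on_error(self, _, response_future):
        self.errors += 1   # the fragment above continues from this point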
def query_es(NUM_FILTERS):

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch")
    es = Elasticsearch(ES_IP)

    if NUM_FILTERS == 'single':
        ## Search using single field filter (group: 'pants')
        print("")
        print("## Searching for 'pants' in Elasticsearch (filter by group)")
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "match": {
                                    "group": "pants"
                                }
                            },
                            "size": 1000
                        })

    if NUM_FILTERS == 'multiple':
        ## Search using multiple fields filter (color: 'white' AND sub_group: 'softshell')
        print("")
        print(
            "## Searching for 'white softshell' in Elasticsearch (filter by color + sub_group)"
        )
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "bool": {
                                    "must": [{
                                        "match": {
                                            "color": "white"
                                        }
                                    }, {
                                        "match": {
                                            "sub_group": "softshell"
                                        }
                                    }]
                                }
                            },
                            "size": 1000
                        })

    if NUM_FILTERS == 'none':
        ## Search with NO filters (match_all)
        print("")
        print("## Searching with NO filter = 'match_all' in Elasticsearch")
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "match_all": {}
                            },
                            "size": "1000"
                        })

    print("")
    print("## %d documents returned" % res['hits']['total'])

    es_results = [doc['_id'] for doc in res['hits']['hits']]

    ## Connect to Scylla
    print("")
    print("## Connecting to Scylla")
    session = Cluster(SCYLLA_IP).connect()

    ## Prepared cql statement
    print("")
    print("## Preparing CQL statement")
    cql = "SELECT * FROM catalog.apparel WHERE sku=?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = (
        ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM
    )

    ## Query Scylla
    print("")
    print("## Query Scylla using SKU/s returned from Elasticsearch")

    print("")
    print("## Final results from Scylla:")
    print("")
    for r in es_results:
        scylla_res = session.execute(cql_prepared, (r, ))
        print("%s" % ([list(row) for row in scylla_res]))

    #for doc in res['hits']['hits']:

    ## Print all columns in Elasticsearch result set
    #print("SKU: %s | Color: %s | Size: %s | Brand: %s | Gender: %s | Group: %s | Sub_Group: %s" % (doc['_id'], doc['_source']['color'], doc['_source']['size'], doc['_source']['brand'], doc['_source']['gender'], doc['_source']['group'], doc['_source']['sub_group']))

    ## Print only the id (sku) in the result set
    #print("SKU: %s" % (doc['_id']))

    print("")
    def _contention_test(self, threads, iterations):
        """
        Test threads repeatedly contending on the same row.
        """

        verbose = False

        session = self.prepare(nodes=3)
        session.execute(
            "CREATE TABLE test (k int, v int static, id int, PRIMARY KEY (k, id))"
        )
        session.execute("INSERT INTO test(k, v) VALUES (0, 0)")

        class Worker(Thread):
            def __init__(self, wid, session, iterations, query):
                Thread.__init__(self)
                self.wid = wid
                self.iterations = iterations
                self.query = query
                self.session = session
                self.errors = 0
                self.retries = 0

            def run(self):
                i = 0
                prev = 0
                while i < self.iterations:
                    done = False
                    while not done:
                        try:
                            res = self.session.execute(
                                self.query, (prev + 1, prev, self.wid))
                            if verbose:
                                print("[%3d] CAS %3d -> %3d (res: %s)" %
                                      (self.wid, prev, prev + 1, str(res)))
                            if res[0][0] is True:
                                done = True
                                prev = prev + 1
                            else:
                                self.retries = self.retries + 1
                                # There are 2 conditions, so 2 reasons to fail: if we failed because the row with our
                                # worker ID already exists, it means we timed out earlier but our update did go in,
                                # so consider this as a success
                                prev = res[0][3]
                                if res[0][2] is not None:
                                    if verbose:
                                        print(
                                            "[%3d] Update was inserted on previous try (res = %s)"
                                            % (self.wid, str(res)))
                                    done = True
                        except WriteTimeout as e:
                            if verbose:
                                print("[%3d] TIMEOUT (%s)" %
                                      (self.wid, str(e)))
                            # This means a timeout: just retry, if it happens that our update was indeed persisted,
                            # we'll figure it out on the next run.
                            self.retries = self.retries + 1
                        except Exception as e:
                            if verbose:
                                print("[%3d] ERROR: %s" % (self.wid, str(e)))
                            self.errors = self.errors + 1
                            done = True
                    i = i + 1
                    # Clean up for next iteration
                    while True:
                        try:
                            self.session.execute(
                                "DELETE FROM test WHERE k = 0 AND id = %d IF EXISTS"
                                % self.wid)
                            break
                        except WriteTimeout as e:
                            pass

        nodes = self.cluster.nodelist()
        workers = []

        session = Cluster([nodes[0].ip_addr],
                          connect_timeout=15,
                          idle_heartbeat_interval=0,
                          execution_profiles={
                              EXEC_PROFILE_DEFAULT:
                              ExecutionProfile(request_timeout=60)
                          }).connect('ks')
        q = session.prepare("""
                BEGIN BATCH
                   UPDATE test SET v = ? WHERE k = 0 IF v = ?;
                   INSERT INTO test (k, id) VALUES (0, ?) IF NOT EXISTS;
                APPLY BATCH
            """)

        for n in range(0, threads):
            workers.append(Worker(n, session, iterations, q))

        start = time.time()

        for w in workers:
            w.start()

        for w in workers:
            w.join()

        if verbose:
            runtime = time.time() - start
            print("runtime:", runtime)

        query = SimpleStatement("SELECT v FROM test WHERE k = 0",
                                consistency_level=ConsistencyLevel.ALL)
        rows = session.execute(query)
        value = rows[0][0]

        errors = 0
        retries = 0
        for w in workers:
            errors = errors + w.errors
            retries = retries + w.retries

        assert (value == threads * iterations) and (
            errors == 0), "value={}, errors={}, retries={}".format(
                value, errors, retries)
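_contention_test is a helper on a dtest-style class (self.prepare and self.cluster come from that harness), so a concrete test only has to choose how hard to contend; for example (thread and iteration counts here are arbitrary):

    def test_contention_many_threads(self):
        # hypothetical wrapper: 8 workers CAS-updating the same row 50 times each
        self._contention_test(threads=8, iterations=50)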