    def non_default_delimiter_template(self, delimiter):
        """
        @param delimiter the delimiter to use for the CSV file.

        Test exporting to CSV files using delimiters other than ',' by:

        - populating a table with integers,
        - exporting to a CSV file, specifying a delimiter, then
        - comparing the contents of the csv file to the SELECTed contents of the table.
        """

        self.prepare()
        self.session.execute("""
            CREATE TABLE testdelimiter (
                a int primary key
            )""")
        insert_statement = self.session.prepare("INSERT INTO testdelimiter (a) VALUES (?)")
        args = [(i,) for i in range(10000)]
        execute_concurrent_with_args(self.session, insert_statement, args)

        results = list(self.session.execute("SELECT * FROM testdelimiter"))

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {name}'.format(name=self.tempfile.name))
        cmds = "COPY ks.testdelimiter TO '{name}'".format(name=self.tempfile.name)
        cmds += " WITH DELIMITER = '{d}'".format(d=delimiter)
        self.node1.run_cqlsh(cmds=cmds)

        self.assertCsvResultEqual(self.tempfile.name, results)
    def custom_null_indicator_template(self, indicator):
        """
        @param indicator the null indicator to be used in COPY

        A parametrized test that tests COPY with a given null indicator.
        """
        self.all_datatypes_prepare()
        self.session.execute("""
            CREATE TABLE testnullindicator (
                a int primary key,
                b text
            )""")
        insert_non_null = self.session.prepare("INSERT INTO testnullindicator (a, b) VALUES (?, ?)")
        execute_concurrent_with_args(self.session, insert_non_null,
                                     [(1, 'eggs'), (100, 'sausage')])
        insert_null = self.session.prepare("INSERT INTO testnullindicator (a) VALUES (?)")
        execute_concurrent_with_args(self.session, insert_null, [(2,), (200,)])

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {name}'.format(name=self.tempfile.name))
        cmds = "COPY ks.testnullindicator TO '{name}'".format(name=self.tempfile.name)
        cmds += " WITH NULL = '{d}'".format(d=indicator)
        self.node1.run_cqlsh(cmds=cmds)

        results = list(self.session.execute("SELECT a, b FROM ks.testnullindicator"))
        results = [[indicator if value is None else value for value in row]
                   for row in results]

        self.assertCsvResultEqual(self.tempfile.name, results)
    def test_tuple_data(self):
        """
        Tests the COPY TO command with the tuple datatype by:

        - populating a table with tuples of uuids,
        - exporting the table to a CSV file with COPY TO,
        - comparing the CSV file to the SELECTed contents of the table.
        """
        self.prepare()
        self.session.execute("""
            CREATE TABLE testtuple (
                a int primary key,
                b tuple<uuid, uuid, uuid>
            )""")

        insert_statement = self.session.prepare("INSERT INTO testtuple (a, b) VALUES (?, ?)")
        args = [(i, random_list(gen=uuid4, n=3)) for i in range(1000)]
        execute_concurrent_with_args(self.session, insert_statement, args)

        results = list(self.session.execute("SELECT * FROM testtuple"))

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {name}'.format(name=self.tempfile.name))
        self.node1.run_cqlsh(cmds="COPY ks.testtuple TO '{name}'".format(name=self.tempfile.name))

        self.assertCsvResultEqual(self.tempfile.name, results)
    def test_writing_use_header(self):
        """
        Test that COPY can write a CSV with a header by:

        - creating and populating a table,
        - exporting the contents of the table to a CSV file using COPY WITH
        HEADER = true
        - checking that the contents of the CSV file are the written values plus
        the header.
        """
        self.prepare()
        self.session.execute("""
            CREATE TABLE testheader (
                a int primary key,
                b int
            )""")
        insert_statement = self.session.prepare("INSERT INTO testheader (a, b) VALUES (?, ?)")
        args = [(1, 10), (2, 20), (3, 30)]
        execute_concurrent_with_args(self.session, insert_statement, args)

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {name}'.format(name=self.tempfile.name))
        cmds = "COPY ks.testheader TO '{name}'".format(name=self.tempfile.name)
        cmds += " WITH HEADER = true"
        self.node1.run_cqlsh(cmds=cmds)

        with open(self.tempfile.name, 'r') as csvfile:
            csv_values = list(csv.reader(csvfile))

        self.assertItemsEqual(csv_values,
                              [['a', 'b'], ['1', '10'], ['2', '20'], ['3', '30']])
    def test_query_indexes_with_vnodes(self):
        """
        Verifies correct query behaviour in the presence of vnodes
        @jira_ticket CASSANDRA-11104
        """
        cluster = self.cluster
        cluster.populate(2).start()
        node1, node2 = cluster.nodelist()
        session = self.patient_cql_connection(node1)
        session.execute("CREATE KEYSPACE ks WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': '1'};")
        session.execute("CREATE TABLE ks.compact_table (a int PRIMARY KEY, b int) WITH COMPACT STORAGE;")
        session.execute("CREATE INDEX keys_index ON ks.compact_table (b);")
        session.execute("CREATE TABLE ks.regular_table (a int PRIMARY KEY, b int)")
        session.execute("CREATE INDEX composites_index on ks.regular_table (b)")

        insert_args = [(i, i % 2) for i in xrange(100)]
        execute_concurrent_with_args(session,
                                     session.prepare("INSERT INTO ks.compact_table (a, b) VALUES (?, ?)"),
                                     insert_args)
        execute_concurrent_with_args(session,
                                     session.prepare("INSERT INTO ks.regular_table (a, b) VALUES (?, ?)"),
                                     insert_args)

        res = session.execute("SELECT * FROM ks.compact_table WHERE b = 0")
        self.assertEqual(len(rows_to_list(res)), 50)
        res = session.execute("SELECT * FROM ks.regular_table WHERE b = 0")
        self.assertEqual(len(rows_to_list(res)), 50)
    def test_execute_concurrent_paged_result(self):
        if PROTOCOL_VERSION < 2:
            raise unittest.SkipTest(
                "Protocol 2+ is required for Paging, currently testing against %r"
                % (PROTOCOL_VERSION,))

        num_statements = 201
        statement = SimpleStatement(
            "INSERT INTO test3rf.test (k, v) VALUES (%s, %s)",
            consistency_level=ConsistencyLevel.QUORUM)
        parameters = [(i, i) for i in range(num_statements)]

        results = execute_concurrent_with_args(self.session, statement, parameters)
        self.assertEqual(num_statements, len(results))
        self.assertEqual([(True, None)] * num_statements, results)

        # read
        statement = SimpleStatement(
            "SELECT * FROM test3rf.test LIMIT %s",
            consistency_level=ConsistencyLevel.QUORUM,
            fetch_size=int(num_statements / 2))
        parameters = [(i, ) for i in range(num_statements)]

        results = execute_concurrent_with_args(self.session, statement, [(num_statements,)])
        self.assertEqual(1, len(results))
        self.assertTrue(results[0][0])
        result = results[0][1]
        self.assertIsInstance(result, PagedResult)
        self.assertEqual(num_statements, sum(1 for _ in result))
    def _assert_cdc_data_readable_on_round_trip(self, start_with_cdc_enabled):
        """
        Parameterized test asserting that data written to a table is still
        readable after flipping the CDC flag on that table, then flipping it
        again. Starts with CDC enabled if start_with_cdc_enabled, otherwise
        starts with it disabled.
        """
        ks_name, table_name = 'ks', 'tab'
        sequence = [True, False, True] if start_with_cdc_enabled else [False, True, False]
        start_enabled, alter_path = sequence[0], list(sequence[1:])

        node, session = self.prepare(ks_name=ks_name, table_name=table_name,
                                     cdc_enabled_table=start_enabled,
                                     data_schema='(a int PRIMARY KEY, b int)')
        set_cdc = _get_set_cdc_func(session=session, ks_name=ks_name, table_name=table_name)

        insert_stmt = session.prepare('INSERT INTO ' + table_name + ' (a, b) VALUES (?, ?)')
        data = tuple(zip(list(range(1000)), list(range(1000))))
        execute_concurrent_with_args(session, insert_stmt, data)

        # We need data to be in commitlogs, not sstables.
        self.assertEqual([], list(node.get_sstables(ks_name, table_name)))

        for enable in alter_path:
            set_cdc(enable)
            self.assertItemsEqual(session.execute('SELECT * FROM ' + table_name), data)
Example #8
    def _assert_cdc_data_readable_on_round_trip(self, start_with_cdc_enabled):
        """
        Parameterized test asserting that data written to a table is still
        readable after flipping the CDC flag on that table, then flipping it
        again. Starts with CDC enabled if start_with_cdc_enabled, otherwise
        starts with it disabled.
        """
        ks_name, table_name = 'ks', 'tab'
        sequence = [True, False, True] if start_with_cdc_enabled else [False, True, False]
        start_enabled, alter_path = sequence[0], list(sequence[1:])

        node, session = self.prepare(ks_name=ks_name, table_name=table_name,
                                     cdc_enabled_table=start_enabled,
                                     column_spec='a int PRIMARY KEY, b int')
        set_cdc = _get_set_cdc_func(session=session, ks_name=ks_name, table_name=table_name)

        insert_stmt = session.prepare('INSERT INTO ' + table_name + ' (a, b) VALUES (?, ?)')
        # data = zip(list(range(1000)), list(range(1000)))
        start = 0
        stop = 1000
        step = 1
        data = [(n, min(n+step, stop)) for n in range(start, stop, step)]

        execute_concurrent_with_args(session, insert_stmt, data)

        # We need data to be in commitlogs, not sstables.
        assert [] == list(node.get_sstables(ks_name, table_name))

        for enable in alter_path:
            set_cdc(enable)
            assert_resultset_contains(session.execute('SELECT * FROM ' + table_name), data)
Example #9
 def flush(self, force_clear=False):
     query = self.session.prepare("INSERT INTO states (crawl, fingerprint, state) VALUES (?, ?, ?)")
     cql_items = []
     for fingerprint, state_val in self._cache.iteritems():
         cql_i = (self.crawl_id, fingerprint, state_val)
         cql_items.append(cql_i)
     execute_concurrent_with_args(self.session, query, cql_items, concurrency=20000)
     super(States, self).flush(force_clear)
Example #10
 def update_score(self, batch):
     query = self.session.prepare("UPDATE metadata SET score = ? WHERE crawl = ? AND fingerprint = ?")
     cql_items = []
     for fprint, score, request, schedule in batch:
         cql_i = (score, self.crawl_id, fprint)
         cql_items.append(cql_i)
     execute_concurrent_with_args(self.session, query, cql_items, concurrency=400)
     self.counter_cls.cass_count({"scored_urls": len(cql_items)})
Example #11
def batch_insert(session, table, columns, contents, queue_length=120):
    """
    Populate the given table with the given values
    """
    column_names = ','.join(columns)
    question_marks = ','.join(repeat('?', len(columns)))
    insert_query = session.prepare('INSERT INTO ' + table + ' (' + column_names + ') VALUES (' + question_marks + ')')
    
    execute_concurrent_with_args(session, insert_query, contents)
Example #12
def fetch_annotations(stream_key, time_range, session=None, prepared=None, query_consistency=None, with_mooring=True):
    #------- Query 1 -------
    # query where annotation effectivity is within the query time range
    # or straddles the end points
    select_columns = "subsite, node, sensor, time, time2, parameters, provenance, annotation, method, deployment, id "
    select_clause = "select " + select_columns + "from annotations "
    where_clause = "where subsite=? and node=? and sensor=?"
    time_constraint = " and time>=%s and time<=%s"
    query_base = select_clause + where_clause  + time_constraint
    query_string = query_base % (time_range.start, time_range.stop)

    query1 = session.prepare(query_string)
    query1.consistency_level = query_consistency

    #------- Query 2 --------
    # Where annotation effectivity straddles the entire query time range
    # -- This is necessary because of the way Cassandra uses the
    #    start-time in the primary key
    time_constraint_wide = " and time<=%s"
    query_base_wide = select_clause + where_clause + time_constraint_wide
    query_string_wide = query_base_wide % (time_range.start)

    query2 = session.prepare(query_string_wide)
    query2.consistency_level = query_consistency

    #----------------------------------------------------------------------
    # Prepare arguments for both query1 and query2
    #----------------------------------------------------------------------
    # [(subsite, node, sensor), (subsite, node, ''), (subsite, '', '')]
    tup1 = (stream_key.subsite, stream_key.node, stream_key.sensor)
    tup2 = (stream_key.subsite, stream_key.node, '')
    tup3 = (stream_key.subsite, '', '')
    args = []
    args.append(tup1)
    if with_mooring:
        args.append(tup2)
        args.append(tup3)

    result = []
    # query where annotation effectivity is within the query time range
    # or straddles the end points
    for success, rows in execute_concurrent_with_args(session, query1, args, concurrency=3):
        if success:
            result.extend(list(rows))

    temp = []
    for success, rows in execute_concurrent_with_args(session, query2, args, concurrency=3):
        if success:
            temp.extend(list(rows))

    for row in temp:
        time2 = row[4]
        if time_range.stop < time2:
            result.append(row) 

    return result
def write_to_trigger_fsync(session, ks, table):
    """
    Given a session, a keyspace name, and a table name, inserts enough values
    to trigger an fsync to the commitlog, assuming the cluster's
    commitlog_segment_size_in_mb is 1. Assumes the table's columns are
    (key int, a int, b int, c int).
    """
    execute_concurrent_with_args(session,
                                 session.prepare('INSERT INTO "{ks}"."{table}" (key, a, b, c) VALUES (?, ?, ?, ?)'.format(ks=ks, table=table)),
                                 ((x, x + 1, x + 2, x + 3) for x in range(50000)))
Example #14
 def verify_insert_select(ins_statement, sel_statement):
     execute_concurrent_with_args(s, ins_statement, ((f, f) for f in items))
     for f in items:
         row = s.execute(sel_statement, (f,))[0]
         if math.isnan(f):
             self.assertTrue(math.isnan(row.f))
             self.assertTrue(math.isnan(row.d))
         else:
             self.assertEqual(row.f, f)
             self.assertEqual(row.d, f)
    def simple_bootstrap_test(self):
        cluster = self.cluster
        tokens = cluster.balanced_tokens(2)
        cluster.set_configuration_options(values={'num_tokens': 1})

        debug("[node1, node2] tokens: %r" % (tokens,))

        keys = 10000

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        cluster.start(wait_other_notice=True)

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 1)
        self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # record the size before inserting any of our own data
        empty_size = node1.data_size()
        debug("node1 empty size : %s" % float(empty_size))

        insert_statement = session.prepare("INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement, [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        initial_size = node1.data_size()
        debug("node1 size before bootstrapping node2: %s" % float(initial_size))

        # Read the inserted data throughout the bootstrap process. We shouldn't
        # get any errors.
        reader = self.go(lambda _: query_c1c2(session, random.randint(0, keys - 1), ConsistencyLevel.ONE))

        # Bootstrapping a new node
        node2 = new_node(cluster)
        node2.set_configuration_options(values={'initial_token': tokens[1]})
        node2.start(wait_for_binary_proto=True)
        node2.compact()

        reader.check()
        node1.cleanup()
        debug("node1 size after cleanup: %s" % float(node1.data_size()))
        node1.compact()
        debug("node1 size after compacting: %s" % float(node1.data_size()))
        time.sleep(.5)
        reader.check()

        debug("node2 size after compacting: %s" % float(node2.data_size()))

        size1 = float(node1.data_size())
        size2 = float(node2.data_size())
        assert_almost_equal(size1, size2, error=0.3)
        assert_almost_equal(float(initial_size - empty_size), 2 * (size1 - float(empty_size)))
def insert_c1c2(session, keys=None, n=None, consistency=ConsistencyLevel.QUORUM):
    if (keys is None and n is None) or (keys is not None and n is not None):
        raise ValueError("Expected exactly one of 'keys' or 'n' arguments to not be None; "
                         "got keys={keys}, n={n}".format(keys=keys, n=n))
    if n:
        keys = list(range(n))

    statement = session.prepare("INSERT INTO cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
    statement.consistency_level = consistency

    execute_concurrent_with_args(session, statement, [['k{}'.format(k)] for k in keys])
Example #17
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        results = []
        try:
            dequeued_urls = 0
            cql_ditems = []
            d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? "
                                           "AND score = ? AND created_at = ?")
            for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\
                    order_by("partition_id", "score", self._order_by()).limit(max_n_requests):
                method = 'GET' if not item.method else item.method

                meta_dict2 = dict((name, getattr(item.meta, name)) for name in dir(item.meta)
                                  if not name.startswith('__'))
                # TODO: work out why the result must be a dict rather than an object -> objects raise an error while encoding for the message bus
                # Passing meta_dict2 directly to Request produces the same error message

                meta_dict = dict()
                meta_dict["fingerprint"] = meta_dict2["fingerprint"]
                meta_dict["domain"] = meta_dict2["domain"]
                meta_dict["origin_is_frontier"] = meta_dict2["origin_is_frontier"]
                meta_dict["scrapy_callback"] = meta_dict2["scrapy_callback"]
                meta_dict["scrapy_errback"] = meta_dict2["scrapy_errback"]
                meta_dict["scrapy_meta"] = meta_dict2["scrapy_meta"]
                meta_dict["score"] = meta_dict2["score"]
                meta_dict["jid"] = meta_dict2["jid"]

                r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies)
                r.meta['fingerprint'] = item.fingerprint
                r.meta['score'] = item.score
                results.append(r)

                cql_d = (item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at)
                cql_ditems.append(cql_d)
                dequeued_urls += 1

            if dequeued_urls > 0:
                execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200)

            self.counter_cls.cass_count({"dequeued_urls": dequeued_urls})

        except Exception as exc:
            self.logger.exception(exc)
Example #18
def fetch_l0_provenance(stream_key, provenance_values, deployment):
    """
    Fetch the l0_provenance entry for the passed information.
    All of the necessary information should be stored as a tuple in the
    provenance metadata store.
    """
    # provenance values are stored as strings; parse them into UUIDs and skip any that are not valid (e.g. 'None')
    if stream_key.method.startswith('streamed'):
        deployment = 0

    prov_ids = []
    for each in set(provenance_values):
        try:
            prov_ids.append(uuid.UUID(each))
        except ValueError:
            pass

    provenance_arguments = [
        (stream_key.subsite, stream_key.node, stream_key.sensor,
         stream_key.method, deployment, prov_id) for prov_id in prov_ids]

    query = SessionManager.prepare(L0_DATASET)
    results = execute_concurrent_with_args(SessionManager.session(), query, provenance_arguments)
    records = [ProvTuple(*rows[0]) for success, rows in results if success and rows]

    if len(provenance_arguments) != len(records):
        log.warn("Could not find %d provenance entries", len(provenance_arguments) - len(records))

    prov_dict = {
        str(row.id): {'file_name': row.file_name,
                      'parser_name': row.parser_name,
                      'parser_version': row.parser_version}
        for row in records}
    return prov_dict
    def test_execute_concurrent_with_args(self):
        for num_statements in (0, 1, 2, 7, 10, 99, 100, 101, 199, 200, 201):
            statement = "INSERT INTO test3rf.test (k, v) VALUES (%s, %s)"
            parameters = [(i, i) for i in range(num_statements)]

            results = execute_concurrent_with_args(self.session, statement, parameters)
            self.assertEqual(num_statements, len(results))
            self.assertEqual([(True, None)] * num_statements, results)

            # read
            statement = "SELECT v FROM test3rf.test WHERE k=%s"
            parameters = [(i,) for i in range(num_statements)]

            results = execute_concurrent_with_args(self.session, statement, parameters)
            self.assertEqual(num_statements, len(results))
            self.assertEqual([(True, [(i,)]) for i in range(num_statements)], results)
def create_rows(data, session, table_name, cl=None, format_funcs=None, prefix='', postfix=''):
    """
    Creates db rows using given session, with table name provided,
    using data formatted like:

    |colname1|colname2|
    +--------+--------+
    |value2  |value2  |

    format_funcs should be a dictionary of {columnname: function} if data needs to be formatted
    before being included in CQL.

    Returns a list of maps describing the data created.
    """
    values = []
    dicts = parse_data_into_dicts(data, format_funcs=format_funcs)

    # use the first dictionary to build a prepared statement for all
    prepared = session.prepare(
        "{prefix} INSERT INTO {table} ({cols}) values ({vals}) {postfix}".format(
            prefix=prefix, table=table_name, cols=', '.join(dicts[0].keys()),
            vals=', '.join('?' for k in dicts[0].keys()), postfix=postfix)
    )
    if cl is not None:
        prepared.consistency_level = cl

    query_results = execute_concurrent_with_args(session, prepared, [d.values() for d in dicts])

    for i, (status, result_or_exc) in enumerate(query_results):
        # should maybe check status here before appending to expected values
        values.append(dicts[i])

    return values
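
A hypothetical usage sketch for create_rows, assuming a table ks.users (name text, age int) already exists and that parse_data_into_dicts accepts the pipe-delimited layout shown in the docstring; the table, data, and format function are illustrative assumptions, not taken from this page:

user_data = """
|name |age|
+-----+---+
|alice|30 |
|bob  |25 |
"""
# format_funcs converts each raw string cell before it is bound to the
# prepared INSERT; here 'age' is cast to int to match the assumed column type.
created = create_rows(user_data, session, 'ks.users', format_funcs={'age': int})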
Example #21
 def add_seeds(self, seeds):
     cql_items = []
     for seed in seeds:
         query = self.session.prepare(
             "INSERT INTO metadata (crawl, fingerprint, url, created_at, meta, headers, cookies, method, depth) "
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)")
         meta = Meta(domain=seed.meta['domain'], fingerprint=seed.meta['fingerprint'],
                     origin_is_frontier=seed.meta['origin_is_frontier'],
                     scrapy_callback=seed.meta['scrapy_callback'], scrapy_errback=seed.meta['scrapy_errback'],
                     scrapy_meta=seed.meta['scrapy_meta'])
         cql_i = (self.crawl_id, seed.meta['fingerprint'], seed.url, datetime.utcnow(), meta,
                  seed.headers, seed.cookies, seed.method, 0)
         cql_items.append(cql_i)
     if len(seeds) > 0:
         execute_concurrent_with_args(self.session, query, cql_items, concurrency=400)
     self.counter_cls.cass_count({"seed_urls": len(seeds)})
    def test_source_copy_round_trip(self):
        """
        Like test_round_trip, but uses the SOURCE command to execute the
        COPY command.  This checks that we don't have unicode-related
        problems when sourcing COPY commands (CASSANDRA-9083).
        """
        self.prepare()
        self.session.execute("""
            CREATE TABLE testcopyto (
                a int,
                b text,
                c float,
                d uuid,
                PRIMARY KEY (a, b)
            )""")

        insert_statement = self.session.prepare("INSERT INTO testcopyto (a, b, c, d) VALUES (?, ?, ?, ?)")
        args = [(i, str(i), float(i) + 0.5, uuid4()) for i in range(1000)]
        execute_concurrent_with_args(self.session, insert_statement, args)

        results = list(self.session.execute("SELECT * FROM testcopyto"))

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {name}'.format(name=self.tempfile.name))

        commandfile = NamedTemporaryFile(delete=False)
        commandfile.file.write('USE ks;\n')
        commandfile.file.write("COPY ks.testcopyto TO '{name}' WITH HEADER=false;".format(name=self.tempfile.name))
        commandfile.close()

        self.node1.run_cqlsh(cmds="SOURCE '{name}'".format(name=commandfile.name))
        os.unlink(commandfile.name)

        # import the CSV file with COPY FROM
        self.session.execute("TRUNCATE ks.testcopyto")
        debug('Importing from csv file: {name}'.format(name=self.tempfile.name))

        commandfile = NamedTemporaryFile(delete=False)
        commandfile.file.write('USE ks;\n')
        commandfile.file.write("COPY ks.testcopyto FROM '{name}' WITH HEADER=false;".format(name=self.tempfile.name))
        commandfile.close()

        self.node1.run_cqlsh(cmds="SOURCE '{name}'".format(name=commandfile.name))
        new_results = list(self.session.execute("SELECT * FROM testcopyto"))
        self.assertEqual(results, new_results)

        os.unlink(commandfile.name)
Example #23
def fetch_concurrent(stream_key, cols, times, concurrency=50):
    query = "select %s from %s where subsite='%s' and node='%s' and sensor='%s' and bin=? and method='%s' and time>=? and time<=?" % \
            (','.join(cols), stream_key.stream.name, stream_key.subsite, stream_key.node, stream_key.sensor,
             stream_key.method)
    query = SessionManager.prepare(query)
    results = execute_concurrent_with_args(SessionManager.session(), query, times, concurrency=concurrency)
    results = [list(r[1]) if type(r[1]) == PagedResult else r[1] for r in results if r[0]]
    return results
    def _insert(self, session, keyspace, count=12,
                consistency_level=ConsistencyLevel.ONE):
        session.execute('USE %s' % keyspace)
        ss = SimpleStatement('INSERT INTO cf(k, i) VALUES (0, 0)', consistency_level=consistency_level)

        tries = 0
        while tries < 100:
            try:
                execute_concurrent_with_args(session, ss, [None] * count)
                return
            except (OperationTimedOut, WriteTimeout, WriteFailure):
                ex_type, ex, tb = sys.exc_info()
                log.warn("{0}: {1} Backtrace: {2}".format(ex_type.__name__, ex, traceback.extract_tb(tb)))
                del tb
                tries += 1

        raise RuntimeError("Failed to execute query after 100 attempts: {0}".format(ss))
Example #25
    def schedule(self, batch):
        query = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, "
                                     "created_at, meta, depth, headers, method, cookies) "
                                     "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        cql_items = []
        for fprint, score, request, schedule in batch:
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                if not hostname:
                    self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                    partition_id = self.partitions[0]
                    host_crc32 = 0
                else:
                    partition_id = self.partitioner.partition(hostname, self.partitions)
                    host_crc32 = get_crc32(hostname)
                created_at = time()*1E+6

                if "domain" not in request.meta:
                    request.meta["domain"] = {}
                if "origin_is_frontier" not in request.meta:
                    request.meta["origin_is_frontier"] = ''
                if "scrapy_callback" not in request.meta:
                    request.meta["scrapy_callback"] = None
                if "scrapy_errback" not in request.meta:
                    request.meta["scrapy_errback"] = None
                if "scrapy_meta" not in request.meta:
                    request.meta["scrapy_meta"] = {}
                if "score" not in request.meta:
                    request.meta["score"] = 0
                if "jid" not in request.meta:
                    request.meta["jid"] = 0

                meta = Meta(domain=request.meta['domain'], fingerprint=fprint,
                            origin_is_frontier=request.meta['origin_is_frontier'],
                            scrapy_callback=request.meta['scrapy_callback'],
                            scrapy_errback=request.meta['scrapy_errback'], scrapy_meta=request.meta['scrapy_meta'])

                cql_i = (self.crawl_id, fprint, score, partition_id, host_crc32, request.url, created_at, meta, 0,
                         request.headers, request.method, request.cookies)
                cql_items.append(cql_i)

                request.meta['state'] = States.QUEUED

        execute_concurrent_with_args(self.session, query, cql_items, concurrency=400)
        self.counter_cls.cass_count({"queued_urls": len(cql_items)})
def write_to_trigger_fsync(session, ks, table):
    """
    Given a session, a keyspace name, and a table name, inserts enough values
    to trigger an fsync to the commitlog, assuming the cluster's
    commitlog_segment_size_in_mb is 1. Assumes the table's columns are
    (key int, a int, b int, c int).
    """
    """
    From https://github.com/datastax/python-driver/pull/877/files
      "Note: in the case that `generators` are used, it is important to ensure the consumers do not
       block or attempt further synchronous requests, because no further IO will be processed until
       the consumer returns. This may also produce a deadlock in the IO event thread."
    """
    execute_concurrent_with_args(session,
                                 session.prepare('INSERT INTO "{ks}"."{table}" (key, a, b, c) '
                                                 'VALUES (?, ?, ?, ?)'.format(ks=ks, table=table)),
                                 ((x, x + 1, x + 2, x + 3)
                                 for x in range(50000)), concurrency=5)
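
The note quoted in the docstring above comes from the driver's execute_concurrent documentation and applies when results are consumed as a generator. A minimal sketch of that consumption pattern (the helper name and parameters are hypothetical, not taken from the snippets on this page):

from cassandra.concurrent import execute_concurrent_with_args

def count_failed_inserts(session, prepared_stmt, params):
    # results_generator=True yields (success, result_or_exc) pairs lazily as
    # requests complete, instead of materializing the whole result list.
    results = execute_concurrent_with_args(
        session, prepared_stmt, params, concurrency=100, results_generator=True)
    failures = 0
    for success, result_or_exc in results:
        # Per the quoted note: only tally results here; issuing further blocking
        # requests inside this loop would stall the driver's IO processing.
        if not success:
            failures += 1
    return failures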
    def _test_round_trip(self, nodes, partitioner, num_records=10000):
        """
        Test a simple round trip of a small CQL table to and from a CSV file via
        COPY.

        - creating and populating a table,
        - COPYing that table to a CSV file,
        - SELECTing the contents of the table,
        - TRUNCATEing the table,
        - COPYing the written CSV file back into the table, and
        - asserting that the previously-SELECTed contents of the table match the
        current contents of the table.
        """
        self.prepare(nodes=nodes, partitioner=partitioner)
        self.session.execute("""
            CREATE TABLE testcopyto (
                a text PRIMARY KEY,
                b int,
                c float,
                d uuid
            )""")

        insert_statement = self.session.prepare("INSERT INTO testcopyto (a, b, c, d) VALUES (?, ?, ?, ?)")
        args = [(str(i), i, float(i) + 0.5, uuid4()) for i in range(num_records)]
        execute_concurrent_with_args(self.session, insert_statement, args)

        results = list(self.session.execute("SELECT * FROM testcopyto"))

        self.tempfile = NamedTemporaryFile(delete=False)
        debug('Exporting to csv file: {}'.format(self.tempfile.name))
        out = self.node1.run_cqlsh(cmds="COPY ks.testcopyto TO '{}'".format(self.tempfile.name), return_output=True)
        debug(out)

        # check all records were exported
        self.assertEqual(num_records, sum(1 for line in open(self.tempfile.name)))

        # import the CSV file with COPY FROM
        self.session.execute("TRUNCATE ks.testcopyto")
        debug('Importing from csv file: {}'.format(self.tempfile.name))
        out = self.node1.run_cqlsh(cmds="COPY ks.testcopyto FROM '{}'".format(self.tempfile.name), return_output=True)
        debug(out)

        new_results = list(self.session.execute("SELECT * FROM testcopyto"))
        self.assertEqual(results, new_results)
    def test_null_types(self):
        """
        Test to validate that the numpy protocol handler can deal with null values.
        @since 3.3.0
         - updated 3.6.0: numeric types now use a masked array
        @jira_ticket PYTHON-550
        @expected_result Numpy can handle null values for non-mapped types.

        @test_category data_types:serialization
        """
        s = self.session
        s.row_factory = tuple_factory
        s.client_protocol_handler = NumpyProtocolHandler

        table = "%s.%s" % (self.keyspace_name, self.function_table_name)
        create_table_with_all_types(table, s, 10)

        begin_unset = max(s.execute('select primkey from %s' % (table,))[0]['primkey']) + 1
        keys_null = range(begin_unset, begin_unset + 10)

        # scatter some empty rows in here
        insert = "insert into %s (primkey) values (%%s)" % (table,)
        execute_concurrent_with_args(s, insert, ((k,) for k in keys_null))

        result = s.execute("select * from %s" % (table,))[0]

        from numpy.ma import masked, MaskedArray
        result_keys = result.pop('primkey')
        mapped_index = [v[1] for v in sorted(zip(result_keys, count()))]

        had_masked = had_none = False
        for col_array in result.values():
            # these have to be different branches (as opposed to comparing against an 'unset value')
            # because None and `masked` have different identity and equals semantics
            if isinstance(col_array, MaskedArray):
                had_masked = True
                [self.assertIsNot(col_array[i], masked) for i in mapped_index[:begin_unset]]
                [self.assertIs(col_array[i], masked) for i in mapped_index[begin_unset:]]
            else:
                had_none = True
                [self.assertIsNotNone(col_array[i]) for i in mapped_index[:begin_unset]]
                [self.assertIsNone(col_array[i]) for i in mapped_index[begin_unset:]]
        self.assertTrue(had_masked)
        self.assertTrue(had_none)
    def test_tombstone_overflow_read_failure(self):
        """
        Test to validate that a ReadFailure is returned from the node when a specified threshold of tombstones is
        reached.

        test_tombstone_overflow_read_failure first sets the tombstone failure threshold down to a level that allows it
        to be more easily encountered. We then create some wide rows and ensure they are deleted appropriately. This
        produces the correct number of tombstones. Upon making a simple query we expect to get a read failure back
        from the coordinator.

        @since 2.6.0
        @jira_ticket PYTHON-238
        @expected_result Appropriate read failures from the coordinator

        @test_category queries:basic
        """

        # Setup table for "wide row"
        self._perform_cql_statement(
            """
            CREATE TABLE test3rf.test2 (
                k int,
                v0 int,
                v1 int, PRIMARY KEY (k,v0))
            """, consistency_level=ConsistencyLevel.ALL, expected_exception=None)

        statement = self.session.prepare("INSERT INTO test3rf.test2 (k, v0, v1) VALUES (1, ?, 1)")
        parameters = [(x,) for x in range(3000)]
        execute_concurrent_with_args(self.session, statement, parameters, concurrency=50)

        statement = self.session.prepare("DELETE v1 FROM test3rf.test2 WHERE k = 1 AND v0 =?")
        parameters = [(x,) for x in range(2001)]
        execute_concurrent_with_args(self.session, statement, parameters, concurrency=50)

        self._perform_cql_statement(
            """
            SELECT * FROM test3rf.test2 WHERE k = 1
            """, consistency_level=ConsistencyLevel.ALL, expected_exception=ReadFailure)

        self._perform_cql_statement(
            """
            DROP TABLE test3rf.test2;
            """, consistency_level=ConsistencyLevel.ALL, expected_exception=None)
Example #30
def query_full_bin(stream_key, bins_and_limit, cols):
    query = "select %s from %s where subsite='%s' and node='%s' and sensor='%s' and bin=? and method='%s' and time >= ? and time <= ?" % \
            (', '.join(cols), stream_key.stream.name, stream_key.subsite, stream_key.node,
             stream_key.sensor, stream_key.method)
    query = SessionManager.prepare(query)
    result = []
    for success, rows in execute_concurrent_with_args(SessionManager.session(), query, bins_and_limit, concurrency=50):
        if success:
            result.extend(list(rows))
    return result
    def insert(cls, params):

        idx_start, idx_end = params

        param_list = []

        for index in range(idx_start, idx_end, cls.concurrency):

            curr_batch_size = min(cls.concurrency, idx_end - index)
            for i in range(0, curr_batch_size):
                block = cls.chain[index + i]
                block_tx = [block.height, [tx_stats(x) for x in block.txes]]
                param_list.append(block_tx)

            results = execute_concurrent_with_args(session=cls.session,
                                                   statement=cls.prepared_stmt,
                                                   parameters=param_list,
                                                   concurrency=cls.concurrency)

            for (i, (success, _)) in enumerate(results):
                if not success:
                    while True:
                        try:
                            block = cls.chain[index + i]
                            block_tx = [
                                block.height,
                                [tx_stats(x) for x in block.txes]
                            ]
                            cls.session.execute(cls.prepared_stmt, block_tx)
                        except Exception as e:
                            print(e)
                            continue
                        break

            param_list = []

            with cls.counter.get_lock():
                cls.counter.value += curr_batch_size

            if (cls.counter.value % 1e4) == 0:
                print(f'#blocks {cls.counter.value:,.0f}')
Example #32
def create_rows(data,
                cursor,
                table_name,
                cl=None,
                format_funcs=None,
                prefix='',
                postfix=''):
    """
    Creates db rows using given cursor, with table name provided,
    using data formatted like:

    |colname1|colname2|
    |value2  |value2  |

    format_funcs should be a dictionary of {columnname: function} if data needs to be formatted
    before being included in CQL.

    Returns a list of maps describing the data created.
    """
    values = []
    dicts = parse_data_into_dicts(data, format_funcs=format_funcs)

    # use the first dictionary to build a prepared statement for all
    prepared = cursor.prepare(
        "{prefix} INSERT INTO {table} ({cols}) values ({vals}) {postfix}".
        format(prefix=prefix,
               table=table_name,
               cols=', '.join(dicts[0].keys()),
               vals=', '.join('?' for k in dicts[0].keys()),
               postfix=postfix))
    if cl is not None:
        prepared.consistency_level = cl

    query_results = execute_concurrent_with_args(cursor, prepared,
                                                 [d.values() for d in dicts])

    for i, (status, result_or_exc) in enumerate(query_results):
        # should maybe check status here before appending to expected values
        values.append(dicts[i])

    return values
Example #33
    def test_recursion_limited(self):
        """
        Verify that recursion is controlled when raise_on_first_error=False and something is wrong with the query.

        PYTHON-585
        """
        max_recursion = sys.getrecursionlimit()
        s = Session(Cluster(), [Host("127.0.0.1", SimpleConvictionPolicy)])
        self.assertRaises(TypeError,
                          execute_concurrent_with_args,
                          s,
                          "doesn't matter", [('param', )] * max_recursion,
                          raise_on_first_error=True)

        results = execute_concurrent_with_args(
            s,
            "doesn't matter", [('param', )] * max_recursion,
            raise_on_first_error=False)  # previously
        self.assertEqual(len(results), max_recursion)
        for r in results:
            self.assertFalse(r[0])
            self.assertIsInstance(r[1], TypeError)
Example #34
    def execute(self, subscriber_group_index, subscriber_group_type,
                subscriber_group_name, params):
        success_count = 0
        try:
            self.session_using_lock.acquire()

            prepared_statement = self.prepared_query_store.get_prepared_statement(
                subscriber_group_index, subscriber_group_type,
                subscriber_group_name)
            if prepared_statement:
                results = execute_concurrent_with_args(self.session,
                                                       prepared_statement,
                                                       params)
                if self.debug_stats_enabled:
                    success_count = self.__get_execution_success_count(results)

        except Exception as err:
            print err
        finally:
            self.session_using_lock.release()

        return success_count
def insertAndTime():
  import time
  start = time.time()
  inserted = 0
  print "starting insert..."

  from cassandra.concurrent import execute_concurrent_with_args
  results = execute_concurrent_with_args(session, insert_stmt, rows, results_generator=True)
  # for row in rows:
    # session.execute_async(insert_stmt, row)
    # inserted = inserted + 1
    # if inserted % 1000 == 0:
      # print "inserted", inserted

  for (success, result) in results:
    if success:
      inserted = inserted + 1
    if inserted % 1000 == 0:
      print(inserted)

  end = time.time()
  print "time to insert", inserted, "rows: ", end - start
Example #36
    def execute_concurrent_args_helper(self,
                                       session,
                                       query,
                                       params,
                                       results_generator=False):
        count = 0
        while count < 100:
            try:
                return execute_concurrent_with_args(
                    session,
                    query,
                    params,
                    results_generator=results_generator)
            except (ReadTimeout, WriteTimeout, OperationTimedOut, ReadFailure,
                    WriteFailure):
                ex_type, ex, tb = sys.exc_info()
                log.warn("{0}: {1} Backtrace: {2}".format(
                    ex_type.__name__, ex, traceback.extract_tb(tb)))
                del tb
                count += 1

        raise RuntimeError(
            "Failed to execute query after 100 attempts: {0}".format(query))
Example #37
def delete_dataframe(dataframe, metadata_record):
    """
    Delete every Cassandra row whose deployment number, time, and id exactly match a row in this dataframe
    """
    log.info('delete_dataframe(<DATAFRAME>, %s)', metadata_record)
    query = 'delete from %s where subsite=? and node=? and sensor=? ' \
            'and bin=? and method=? and time=? and deployment=? and id=?' % metadata_record.stream
    query = SessionManager.prepare(query)

    def values_generator(df):
        for index, row in df.iterrows():
            args = (metadata_record.subsite, metadata_record.node,
                    metadata_record.sensor, metadata_record.bin,
                    metadata_record.method, row.time, row.deployment, row.id)
            yield args

    sess = SessionManager.session()
    results = execute_concurrent_with_args(sess,
                                           query,
                                           values_generator(dataframe),
                                           concurrency=200)
    return sum((success for success, _ in results if success))
    def functional_test(self):
        cluster = self.cluster
        cluster.populate(3)
        node1 = cluster.nodelist()[0]

        for keycache_size in (0, 10):
            for rowcache_size in (0, 10):
                debug(
                    "Testing with keycache size of %d MB, rowcache size of %d MB "
                    % (keycache_size, rowcache_size))
                keyspace_name = 'ks_%d_%d' % (keycache_size, rowcache_size)

                # make the caches save every five seconds
                cluster.set_configuration_options(
                    values={
                        'key_cache_size_in_mb': keycache_size,
                        'row_cache_size_in_mb': rowcache_size,
                        'row_cache_save_period': 5,
                        'key_cache_save_period': 5,
                    })

                cluster.start()
                session = self.patient_cql_connection(node1)

                self.create_ks(session, keyspace_name, rf=3)

                session.set_keyspace(keyspace_name)
                session.execute(
                    "CREATE TABLE test (k int PRIMARY KEY, v1 int, v2 int)")
                session.execute(
                    "CREATE TABLE test_clustering (k int, v1 int, v2 int, PRIMARY KEY (k, v1))"
                )
                session.execute(
                    "CREATE TABLE test_counter (k int PRIMARY KEY, v1 counter)"
                )
                session.execute(
                    "CREATE TABLE test_counter_clustering (k int, v1 int, v2 counter, PRIMARY KEY (k, v1))"
                )

                # insert 100 rows into each table
                for cf in ('test', 'test_clustering'):
                    execute_concurrent_with_args(
                        session,
                        session.prepare(
                            "INSERT INTO %s (k, v1, v2) VALUES (?, ?, ?)" %
                            (cf, )), [(i, i, i) for i in range(100)])

                execute_concurrent_with_args(
                    session,
                    session.prepare(
                        "UPDATE test_counter SET v1 = v1 + ? WHERE k = ?"),
                    [(i, i) for i in range(100)],
                    concurrency=2)

                execute_concurrent_with_args(
                    session,
                    session.prepare(
                        "UPDATE test_counter_clustering SET v2 = v2 + ? WHERE k = ? AND v1 = ?"
                    ), [(i, i, i) for i in range(100)],
                    concurrency=2)

                # flush everything to get it into sstables
                for node in cluster.nodelist():
                    node.flush()

                # update the first 10 rows in every table
                # on non-counter tables, delete the first (remaining) row each round
                num_updates = 10
                for validation_round in range(3):
                    session.execute("DELETE FROM test WHERE k = %s",
                                    (validation_round, ))
                    execute_concurrent_with_args(
                        session,
                        session.prepare(
                            "UPDATE test SET v1 = ?, v2 = ? WHERE k = ?"),
                        [(i, validation_round, i)
                         for i in range(validation_round + 1, num_updates)])

                    session.execute(
                        "DELETE FROM test_clustering WHERE k = %s AND v1 = %s",
                        (validation_round, validation_round))
                    execute_concurrent_with_args(
                        session,
                        session.prepare(
                            "UPDATE test_clustering SET v2 = ? WHERE k = ? AND v1 = ?"
                        ), [(validation_round, i, i)
                            for i in range(validation_round + 1, num_updates)])

                    execute_concurrent_with_args(
                        session,
                        session.prepare(
                            "UPDATE test_counter SET v1 = v1 + ? WHERE k = ?"),
                        [(1, i) for i in range(num_updates)],
                        concurrency=2)

                    execute_concurrent_with_args(
                        session,
                        session.prepare(
                            "UPDATE test_counter_clustering SET v2 = v2 + ? WHERE k = ? AND v1 = ?"
                        ), [(1, i, i) for i in range(num_updates)],
                        concurrency=2)

                    self._validate_values(session, num_updates,
                                          validation_round)

                session.shutdown()

                # let the data be written to the row/key caches.
                debug("Letting caches be saved to disk")
                time.sleep(10)
                debug("Stopping cluster")
                cluster.stop()
                time.sleep(1)
                debug("Starting cluster")
                cluster.start()
                time.sleep(5)  # read the data back from row and key caches

                session = self.patient_cql_connection(node1)
                session.set_keyspace(keyspace_name)

                # check all values again
                self._validate_values(session, num_updates, validation_round=2)
Example #39
 def _insert(self, session, keyspace, count=12,
             consistency_level=ConsistencyLevel.ONE):
     session.execute('USE %s' % keyspace)
     ss = SimpleStatement('INSERT INTO cf(k, i) VALUES (0, 0)',
                          consistency_level=consistency_level)
     execute_concurrent_with_args(session, ss, [None] * count)
    def test_upgrade_index_summary(self):
        cluster = self.cluster
        cluster.populate(1)
        node = cluster.nodelist()[0]
        original_install_dir = node.get_install_dir()

        # start out with a 2.0 version
        cluster.set_install_dir(version='2.0.12')
        node.set_install_dir(version='2.0.12')
        node.set_log_level("INFO")
        node.stop()

        remove_perf_disable_shared_mem(node)

        cluster.start()

        # Insert enough partitions to fill a full sample's worth of entries
        # in the index summary.  The default index_interval is 128, so every
        # 128th partition will get an entry in the summary.  The minimal downsampling
        # operation will remove every 128th entry in the summary.  So, we need
        # to have 128 entries in the summary, which means 128 * 128 partitions.
        session = self.patient_cql_connection(node, protocol_version=2)
        session.execute(
            "CREATE KEYSPACE testindexsummary WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}"
        )
        session.set_keyspace("testindexsummary")
        session.execute("CREATE TABLE test (k int PRIMARY KEY, v int)")

        insert_statement = session.prepare(
            "INSERT INTO test (k, v) VALUES (? , ?)")
        execute_concurrent_with_args(session, insert_statement,
                                     [(i, i) for i in range(128 * 128)])

        # upgrade to 2.1.3
        session.cluster.shutdown()
        node.drain()
        node.watch_log_for("DRAINED")
        node.stop()
        cluster.set_install_dir(
            version='2.1.3')  # 2.1.3 is affected by CASSANDRA-8993
        node.set_install_dir(version='2.1.3')
        debug("Set new cassandra dir for %s: %s" %
              (node.name, node.get_install_dir()))

        # setup log4j / logback again (necessary moving from 2.0 -> 2.1)
        node.set_log_level("INFO")

        remove_perf_disable_shared_mem(node)

        node.start()

        session = self.patient_cql_connection(node)

        mbean = make_mbean('db', 'IndexSummaries')
        with JolokiaAgent(node) as jmx:
            avg_interval = jmx.read_attribute(mbean, 'AverageIndexInterval')
            self.assertEqual(128.0, avg_interval)

            # force downsampling of the index summary (if it were allowed)
            jmx.write_attribute(mbean, 'MemoryPoolCapacityInMB', 0)
            jmx.execute_method(mbean, 'redistributeSummaries')

            avg_interval = jmx.read_attribute(mbean, 'AverageIndexInterval')

            # after downsampling, the average interval goes up
            self.assertGreater(avg_interval, 128.0)

        # upgrade to the latest 2.1+ by using the original install dir
        session.cluster.shutdown()
        node.drain()
        node.watch_log_for("DRAINED")
        node.stop()
        cluster.set_install_dir(original_install_dir)
        node.set_install_dir(original_install_dir)
        debug("Set new cassandra dir for %s: %s" %
              (node.name, node.get_install_dir()))

        node.set_log_level("INFO")

        remove_perf_disable_shared_mem(node)

        node.start()

        # on startup, it should detect that the old-format sstable had its
        # index summary downsampled (forcing it to be rebuilt)
        node.watch_log_for("Detected erroneously downsampled index summary")

        session = self.patient_cql_connection(node)

        mbean = make_mbean('db', 'IndexSummaries')
        with JolokiaAgent(node) as jmx:
            avg_interval = jmx.read_attribute(mbean, 'AverageIndexInterval')
            self.assertEqual(128.0, avg_interval)

            # force downsampling of the index summary (if it were allowed)
            jmx.write_attribute(mbean, 'MemoryPoolCapacityInMB', 0)
            jmx.execute_method(mbean, 'redistributeSummaries')

            avg_interval = jmx.read_attribute(mbean, 'AverageIndexInterval')

            # post-8993, it shouldn't allow downsampling of old-format sstables
            self.assertEqual(128.0, avg_interval)
frames = np.array_split(events, 5)
events = []

final_frames = []

for frame in frames:

    q = s.prepare(
        "select event_id, event_mention "
        "from event_mention_date where event_id = ? allow filtering;")

    args = []
    for id in frame.event_id.values:
        args.append((id, ))

    results = execute_concurrent_with_args(s, q, args, concurrency=n)

    data_dict = defaultdict(dict)

    for result in results:
        # the execute_concurrent functions return a higher-level object than session.execute
        # need to check that the query ran successfully, then access all rows in
        # result (nested ResultSet object in property .result_or_exc)
        if result.success:
            for row in result.result_or_exc:
                data_dict[row.event_id]['mention_date'] = row.event_mention
                data_dict[row.event_id]['event_id'] = row.event_id

        else:  # if the query failed, bubble up the exception that was thrown
            print('query failed!')
            raise result.result_or_exc
Example #42
def insert_dataset(stream_key, dataset):
    """
    Insert an xray dataset into CASSANDRA using the specified stream_key.
    :param stream_key: Stream that we are using for the data insertion
    :param dataset: xray dataset we are inserting
    :return: str: results of insertion
    """
    # capture the current data from the stream metadata table for later processing
    cur_stream_meta = metadata_service_api.get_stream_metadata_record(
        *(stream_key.as_tuple()))

    # capture current partition metadata for each unique bin in the dataset for later processing
    cur_part_meta = {}
    for bin_val in numpy.unique(dataset['bin'].values).tolist():
        part_meta = metadata_service_api.get_partition_metadata_record(
            *(stream_key.as_tuple() + (bin_val, CASS_LOCATION_NAME)))
        if part_meta:
            # capture the parts that are useful for later
            cur_part_meta[bin_val] = (part_meta.get('count'),
                                      part_meta.get('first'),
                                      part_meta.get('last'))

    # get the data in the correct format
    cols = SessionManager.get_query_columns(stream_key.stream.name)
    dynamic_cols = cols[1:]
    key_cols = ['subsite', 'node', 'sensor', 'bin', 'method']
    cols = key_cols + dynamic_cols
    arrays = {
        p.name
        for p in stream_key.stream.parameters
        if not p.is_function and p.parameter_type == 'array<quantity>'
    }

    # build the data lists for all the particles to be populated into cassandra
    data_lists = {}
    # id and provenance are expected to be UUIDs so convert them to uuids
    data_lists['id'] = [uuid.UUID(x) for x in dataset['id'].values]
    data_lists['provenance'] = [
        uuid.UUID(x) for x in dataset['provenance'].values
    ]
    data_lists['bin'] = [bin_val for bin_val in dataset['bin'].values]

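    # array<quantity> parameters are serialized with msgpack before insertion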
    for i in arrays:
        data_lists[i] = [msgpack.packb(x) for x in dataset[i].values.tolist()]
    for dc in dynamic_cols:
        # if it is in the dataset and not already in the datalist we need to put it in the list
        if dc in dataset and dc not in data_lists:
            if '_FillValue' in dataset[dc].attrs:
                temp_val = dataset[dc].values.astype(object)
                temp_val[temp_val == dataset[dc].attrs['_FillValue']] = None
                data_lists[dc] = temp_val
            else:
                data_lists[dc] = dataset[dc].values

    # get the query to insert information
    col_names = ', '.join(cols)
    # Take subsite, node, sensor, ?, and method: the ? is for bin
    key_str = "'{:s}', '{:s}', '{:s}', ?, '{:s}'".format(
        stream_key.subsite, stream_key.node, stream_key.sensor,
        stream_key.method)
    # and the rest as ? for the dynamic columns
    data_str = ', '.join(['?' for _ in dynamic_cols])
    full_str = '{:s}, {:s}'.format(key_str, data_str)
    upsert_query = 'INSERT INTO {:s} ({:s}) VALUES ({:s})'.format(
        stream_key.stream.name, col_names, full_str)
    upsert_query = SessionManager.prepare(upsert_query)

    count_query = "select count(1) from %s where subsite=? and node=? and sensor=? and bin=? " \
            % (stream_key.stream.name)
    count_query = SessionManager.prepare(count_query)

    prov_query = "select * from dataset_l0_provenance where subsite=? and node=? and sensor=? and method=? " \
            + "and deployment=?"
    prov_query = SessionManager.prepare(prov_query)

    prov_insert = "insert into dataset_l0_provenance (subsite, sensor, node, method, deployment, id, filename, " \
            + "parsername, parserversion) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
    prov_insert = SessionManager.prepare(prov_insert)

    # obtain the provenance data for the existing dataset
    deployments = numpy.unique(dataset['deployment'].values).tolist()
    prov_query_args = [(dataset.subsite, dataset.node, dataset.sensor, dataset.collection_method, deployment) \
                       for deployment in deployments]
    prov_query_results = execute_concurrent_with_args(SessionManager.session(),
                                                      prov_query,
                                                      prov_query_args)

    # get the number of particles from the first dataset variable
    first_data_var_name = list(dataset.data_vars)[0]
    size = dataset[first_data_var_name].size

    # create the data insertion list
    to_insert = []
    data_names = ['bin'] + dynamic_cols
    for i in range(size):
        row = [data_lists[col][i] for col in data_names]
        to_insert.append(row)

    # Build query to create rows
    cluster_key_columns = [
        'subsite', 'node', 'sensor', 'bin', 'method', 'time', 'deployment',
        'id'
    ]
    create_rows_columns = ', '.join(cluster_key_columns)
    # Fill in (subsite, node, sensor, method) leaving (bin, time, deployment, id) to be bound
    create_rows_values = "'{:s}', '{:s}', '{:s}', ?, '{:s}', ?, ?, ?".format(
        stream_key.subsite, stream_key.node, stream_key.sensor,
        stream_key.method)
    create_rows_query = 'INSERT INTO {:s} ({:s}) VALUES ({:s}) IF NOT EXISTS'.format(
        stream_key.stream.name, create_rows_columns, create_rows_values)
    create_rows_query = SessionManager.prepare(create_rows_query)

    # summarize the partition_metadata information by bin from the insertion data
    count = prev_bin = 0
    first = last = 0.0
    new_part_meta = []
    for entry in to_insert:
        cur_bin = entry[0]
        time = entry[1]
        if prev_bin < 1:  # first iteration
            prev_bin = cur_bin
            first = last = time
        elif prev_bin < cur_bin:
            new_part_meta.append([prev_bin, count, first, last])
            count = 0
            first = last = time
            prev_bin = cur_bin
        else:
            last = time
        count += 1
    if 0 < prev_bin:
        new_part_meta.append([prev_bin, count, first, last])

    # combine current and new metadata as needed
    combined_part_meta = []
    for newmeta in new_part_meta:
        bin_val = newmeta[0]
        count = newmeta[1]
        first = newmeta[2]
        last = newmeta[3]
        curmeta = cur_part_meta.get(bin_val)
        if curmeta:
            curcount = curmeta[0]
            curfirst = curmeta[1]
            curlast = curmeta[2]
            count += curcount
            if curfirst < first:
                first = curfirst
            if curlast > last:
                last = curlast
        combined_part_meta.append((bin_val, count, first, last))

    # Delete the current data from the partition metadata table for the involved bins
    for pkey, pvals in cur_part_meta.items():
        del_resp = metadata_service_api.delete_partition_metadata_record(
            *(stream_key.as_tuple() + (pkey, 'cass')))
        del_stts = del_resp.get('statusCode')
        del_msg = del_resp.get('message')
        if del_stts == "OK" and del_msg.startswith("Successfully deleted"):
            log.info(
                "Deleted partition_metadata on bin %d for %s: count %d, first %f, last %f",
                pkey, stream_key.as_refdes(), pvals[0], pvals[1], pvals[2])
        else:
            error_msg = "Failed to delete partition_metadata on bin %d for %s", pkey, stream_key.as_refdes(
            )
            log.error(error_msg)
            return error_msg

    # Insert the combined data into the partition metadata table
    for pmeta in combined_part_meta:
        bin_val = pmeta[0]
        count = pmeta[1]
        first = pmeta[2]
        last = pmeta[3]
        part_meta = metadata_service_api.build_partition_metadata_record(
            *(stream_key.as_tuple() +
              (bin_val, CASS_LOCATION_NAME, first, last, count)))
        metadata_service_api.index_partition_metadata_record(part_meta)

    # Combine the new data into the stream metadata table with any current metadata
    total = sum(row[1] for row in combined_part_meta)
    first = combined_part_meta[0][2]
    last = combined_part_meta[len(combined_part_meta) - 1][3]
    # if there's a current stream metadata involved do the following
    if cur_stream_meta:
        # adjust the count, first and last entries
        curtotal = cur_stream_meta.get('count')
        curfirst = cur_stream_meta.get('first')
        curlast = cur_stream_meta.get('last')
        total += curtotal
        if curfirst < first:
            first = curfirst
        if curlast > last:
            last = curlast
        # delete the existing stream_metadata row prior to adding the updated one
        del_resp = metadata_service_api.delete_stream_metadata_record(
            *(stream_key.as_tuple()))
        del_stts = del_resp.get('statusCode')
        del_msg = del_resp.get('message')
        if del_stts == "OK" and del_msg.startswith("Successfully deleted"):
            # log the deleted record's own values; don't clobber the combined
            # first/last/total computed above
            log.info(
                "Deleted stream_metadata that summarized the partition_metadata records for %s: "
                + "count %d, first %f, last %f", stream_key.as_refdes(), curtotal,
                curfirst, curlast)
        else:
            error_msg = "Failed to delete stream_metadata for %s", stream_key.as_refdes(
            )
            log.error(error_msg)
            return error_msg

    smeta = metadata_service_api.build_stream_metadata_record(
        *(stream_key.as_tuple() + (first, last, total)))
    metadata_service_api.create_stream_metadata_record(smeta)

    # Segregate insertion data list into chunks by bin
    insert_chunks = []
    beg = end = 0
    for pmeta in new_part_meta:
        bin_val = pmeta[0]
        count = pmeta[1]
        end += count
        # capture the create and the upsert data into separate numpy arrays for easy access later
        insert_chunks.append([
            bin_val,
            numpy.array(to_insert)[beg:end, :4],
            numpy.array(to_insert)[beg:end]
        ])
        beg = end

    # Process each chunk of the insertion data
    update_tracker = {}
    for insert_chunk in insert_chunks:
        data_bin = insert_chunk[0]
        # We only want (bin, time, deployment, id) for first insertion to create rows
        create_rows_data = insert_chunk[1]
        upsert_rows_data = insert_chunk[2]

        # captures insert results
        insert_rows = {}
        # Execute insert for each row: no individual insert is guaranteed due to
        #  potential of Coordinator node timeout waiting for replica node response, etc
        # tracking the insertion success isn't critical as the upsert step covers for failures
        idx = 0
        for success, _ in execute_concurrent_with_args(SessionManager.session(), create_rows_query, \
                create_rows_data, concurrency=50, raise_on_first_error=False):
            row = create_rows_data[idx]
            insert_rows[str(row.tolist())] = 1 if success else 0
            idx += 1

        rows_inserted = sum(r[1] for r in insert_rows.items())
        if rows_inserted < idx:
            fails = idx - rows_inserted
            log.info(
                "Failed to insert %d rows within Cassandra bin %d for %s.",
                fails, data_bin, stream_key.as_refdes())
            failed_rows = [r[0] for r in insert_rows.items() if r[1] == 0]
            log.info("Rows that failed: %s", failed_rows)

        # captures upsert results
        upsert_rows = {}
        # Update previously existing rows (if any) and newly added rows with complete set of data
        idx = 0
        for success, _ in execute_concurrent_with_args(SessionManager.session(), upsert_query, \
                upsert_rows_data, concurrency=50, raise_on_first_error=False):
            row = upsert_rows_data[idx]
            upsert_rows[str(row.tolist())] = 1 if success else 0
            idx += 1

        rows_upserted = sum(r[1] for r in upsert_rows.items())
        if rows_upserted < idx:
            fails = idx - rows_upserted
            log.warn(
                "Failed to update %d rows within Cassandra bin %d for %s!",
                fails, data_bin, stream_key.as_refdes())
            failed_rows = [r[0] for r in upsert_rows.items() if r[1] == 0]
            log.warn("Rows that failed: %s", failed_rows)

        # query the cassandra stream to capture its count for data in the stream for the stream key and bin
        rs = SessionManager.execute(
            count_query,
            (stream_key.subsite, stream_key.node, stream_key.sensor, data_bin))
        count = rs.current_rows[0][0]
        if count == idx:
            log.info("All rows were inserted into Cassandra on bin %d for %s.",
                     data_bin, stream_key.as_refdes())
        else:
            fails = idx - count
            log.warn(
                "There are %d rows missing in Cassandra on bin %d for %s!",
                fails, data_bin, stream_key.as_refdes())

        update_tracker[data_bin] = (idx, rows_inserted, rows_upserted, count)

    # Process the provenance data, updating the refdes fields as they may have changed from the dataset values
    prov_modified = []
    if prov_query_results[0].success:
        prov_query_iter = prov_query_results[0].result_or_exc
        for row in prov_query_iter:
            pv = ProvTuple(*row)
            prov_modified.append(
                pv._replace(subsite=stream_key.subsite,
                            sensor=stream_key.sensor,
                            node=stream_key.node))

    # insert the modified provenance data
    prov_ins_args = [
        (row.subsite, row.sensor, row.node, row.method, row.deployment, row.id,
         row.file_name, row.parser_name, row.parser_version)
        for row in prov_modified
    ]
    prov_ins_results = execute_concurrent_with_args(SessionManager.session(),
                                                    prov_insert, prov_ins_args)
    prov_insertions = list(filter(lambda r: r[0], prov_ins_results))
    mesg = 'Provenance row insertions: {:d} of {:d} succeeded'.format(
        len(prov_insertions), len(prov_modified))
    log.info(mesg)

    mesg = 'update_tracker length: {:d}'.format(len(update_tracker.items()))
    log.info(mesg)

    ret_val = ""
    for ut in update_tracker.items():
        bin_val = ut[0]
        counts = ut[1]
        inserted = counts[1]
        upserted = counts[2]
        in_table = counts[3]
        ret_val += '\n' if len(ret_val) > 0 else ''
        ret_val += 'Cassandra bin {:d} for {:s}: inserted {:d}, upserted {:d}, total rows in table {:d}.'. \
                format(bin_val, stream_key.as_refdes(), inserted, upserted, in_table)

    log.info(ret_val)
    return ret_val
 def insert_rows(self, session, start, end):
     insert_statement = session.prepare("INSERT INTO ks.cf (key, val) VALUES (?, 'asdf')")
     args = [(r,) for r in range(start, end)]
     execute_concurrent_with_args(session, insert_statement, args, concurrency=20)
    def _base_bootstrap_test(self,
                             bootstrap=None,
                             bootstrap_from_version=None,
                             enable_ssl=None):
        def default_bootstrap(cluster, token):
            node2 = new_node(cluster)
            node2.set_configuration_options(values={'initial_token': token})
            node2.start(wait_for_binary_proto=True)
            return node2

        if bootstrap is None:
            bootstrap = default_bootstrap

        cluster = self.cluster

        if enable_ssl:
            logger.debug("***using internode ssl***")
            generate_ssl_stores(self.fixture_dtest_setup.test_path)
            cluster.enable_internode_ssl(self.fixture_dtest_setup.test_path)

        tokens = cluster.balanced_tokens(2)
        cluster.set_configuration_options(values={'num_tokens': 1})

        logger.debug("[node1, node2] tokens: %r" % (tokens, ))

        keys = 10000

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        if bootstrap_from_version:
            logger.debug("starting source node on version {}".format(
                bootstrap_from_version))
            node1.set_install_dir(version=bootstrap_from_version)
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        cluster.start(wait_other_notice=True)

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 1)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # record the size before inserting any of our own data
        empty_size = data_size(node1, 'ks', 'cf')
        logger.debug("node1 empty size for ks.cf: %s" % float(empty_size))

        insert_statement = session.prepare(
            "INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement,
                                     [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        initial_size = data_size(node1, 'ks', 'cf')
        logger.debug("node1 size for ks.cf before bootstrapping node2: %s" %
                     float(initial_size))

        # Reads inserted data all during the bootstrap process. We shouldn't
        # get any error
        query_c1c2(session, random.randint(0, keys - 1), ConsistencyLevel.ONE)
        session.shutdown()

        # Bootstrapping a new node in the current version
        node2 = bootstrap(cluster, tokens[1])
        node2.compact()

        node1.cleanup()
        logger.debug("node1 size for ks.cf after cleanup: %s" %
                     float(data_size(node1, 'ks', 'cf')))
        node1.compact()
        logger.debug("node1 size for ks.cf after compacting: %s" %
                     float(data_size(node1, 'ks', 'cf')))

        logger.debug("node2 size for ks.cf after compacting: %s" %
                     float(data_size(node2, 'ks', 'cf')))

        size1 = float(data_size(node1, 'ks', 'cf'))
        size2 = float(data_size(node2, 'ks', 'cf'))
        assert_almost_equal(size1, size2, error=0.3)
        assert_almost_equal(float(initial_size - empty_size),
                            2 * (size1 - float(empty_size)))

        assert_bootstrap_state(self, node2, 'COMPLETED')
Example #45
    def load_many(cls, conn, keys, concurrency=25):
        """Load multiple models concurrently.

        :param conn: Cassandra connection wrapper used to execute the query.
        :type: cqlengine.ConnectionInterface subclass
        :param keys: List of PRIMARY KEY values to load models by.  For simple
            models with only a single PRIMARY KEY (single partition key and no
            clustering keys) a simple list of values may be used.  For cases with
            multiple PRIMARY KEYS, a list of dicts mapping each primary key to
            its value must be given.  The primary key names given in these dicts must
            match the column names in the table, so if you are using `db_field` on a
            column, you should use _that_ value, not the name of the Column field on
            your cqlmapper model.

            .. code-block:: python

                class SimpleModel(Model):
                    key = columns.Text(primary_key=True)
                    value = columns.Text()

                class ComplexModel(Model):
                    pk = columns.Text(primary_key=True)  # partition key
                    ck = columns.Integer(primary_key=True)  # clustering
                    value = columns.Text()

                class DBFieldModel(Model):
                    _key = columns.Text(primary_key=True, db_field="key")
                    value = columns.Text()

                valid_simple = SimpleModel.load_many(conn, ["fizz", "buzz"])
                valid_simple = SimpleModel.load_many(conn, [{"key": "fizz"}, {"key: "buzz"}])
                try:
                    invalid_simple = SimpleModel.load_many(conn, ["fizz", {"key": "buzz"}])
                except Exception:
                    pass

                valid_complex = ComplexModel.load_many(
                    conn=conn,
                    keys=[
                        {"pk": "fizz", "ck": "buzz},
                        {"pk", "foo", "ck": "bar"},
                    ],
                )
                try:
                    invalid_complex = ComplexModel.load_many(conn, [{"pk": "fizz"}])
                except Exception:
                    pass

                valid_db_field = DBFieldModel.load_many(conn, ["fizz", "buzz"])
                valid_db_field = DBFieldModel.load_many(conn, [{"key": "fizz"}, {"key": "buzz"}])
                try:
                    invalid_db_field = DBFieldModel.load_many(conn, [{"_key": "buzz"}])
                except Exception:
                    pass

        :type: List[Dict[str, Any]] or List[Any]
        :param concurrency: Maximum number of queries to run concurrently.
        :type: int
        """
        if not keys:
            return []

        if concurrency < 1:
            raise ValueError("'concurrency' in 'load_many' must be >= 1.")

        # cls._primary_keys is an OrderedDict so no need to sort the keys
        pks = [col.db_field_name for col in cls._primary_keys.values()]

        # Support the "simple" format for Models that allow it
        if len(pks) == 1 and not isinstance(keys[0], dict):
            keys = [{pks[0]: value} for value in keys]

        parameters = [
            tuple(key_values[key] for key in pks) for key_values in keys
        ]
        args_str = " AND ".join("{key} = ?".format(key=key) for key in pks)
        # cls._columns is an OrderedDict so no need to sort the keys
        cols = ",".join(col.db_field_name for col in cls._columns.values())
        statement = conn.session.prepare(
            "SELECT {columns} FROM {cf_name} WHERE {args}".format(
                columns=cols, cf_name=cls.column_family_name(), args=args_str))
        results = execute_concurrent_with_args(
            session=conn.session,
            statement=statement,
            parameters=parameters,
            concurrency=concurrency,
        )
        models = []
        for result in results:
            if not result.success:
                raise result.result_or_exc
            for values in result.result_or_exc:
                if isinstance(values, tuple) and hasattr(values, "_asdict"):
                    # Support the default 'row_factory' which returns a namedtuple
                    values = values._asdict()
                elif not isinstance(values, dict):
                    # The 'tuple' row factory is not supported
                    raise TypeError(
                        "The type returned by 'session.execute' must be a dict or a namedtuple"
                    )
                models.append(cls._construct_instance(values))
        return models
Example #46
    def test_named_table_with_mv(self):
        """
        Test NamedTable access to materialized views

        Creates some materialized views using Traditional CQL. Then ensures we can access those materialized view using
        the NamedKeyspace, and NamedTable interfaces. Tests basic filtering as well.

        @since 3.0.0
        @jira_ticket PYTHON-406
        @expected_result Named Tables should have access to materialized views

        @test_category materialized_view
        """
        ks = models.DEFAULT_KEYSPACE
        self.session.execute("DROP MATERIALIZED VIEW IF EXISTS {0}.alltimehigh".format(ks))
        self.session.execute("DROP MATERIALIZED VIEW IF EXISTS {0}.monthlyhigh".format(ks))
        self.session.execute("DROP TABLE IF EXISTS {0}.scores".format(ks))
        create_table = """CREATE TABLE {0}.scores(
                        user TEXT,
                        game TEXT,
                        year INT,
                        month INT,
                        day INT,
                        score INT,
                        PRIMARY KEY (user, game, year, month, day)
                        )""".format(ks)

        self.session.execute(create_table)
        create_mv = """CREATE MATERIALIZED VIEW {0}.monthlyhigh AS
                        SELECT game, year, month, score, user, day FROM {0}.scores
                        WHERE game IS NOT NULL AND year IS NOT NULL AND month IS NOT NULL AND score IS NOT NULL AND user IS NOT NULL AND day IS NOT NULL
                        PRIMARY KEY ((game, year, month), score, user, day)
                        WITH CLUSTERING ORDER BY (score DESC, user ASC, day ASC)""".format(ks)

        self.session.execute(create_mv)

        create_mv_alltime = """CREATE MATERIALIZED VIEW {0}.alltimehigh AS
                        SELECT * FROM {0}.scores
                        WHERE game IS NOT NULL AND score IS NOT NULL AND user IS NOT NULL AND year IS NOT NULL AND month IS NOT NULL AND day IS NOT NULL
                        PRIMARY KEY (game, score, user, year, month, day)
                        WITH CLUSTERING ORDER BY (score DESC)""".format(ks)

        self.session.execute(create_mv_alltime)

        # Populate the base table with data
        prepared_insert = self.session.prepare("""INSERT INTO {0}.scores (user, game, year, month, day, score) VALUES  (?, ?, ? ,? ,?, ?)""".format(ks))
        parameters = (('pcmanus', 'Coup', 2015, 5, 1, 4000),
                      ('jbellis', 'Coup', 2015, 5, 3, 1750),
                      ('yukim', 'Coup', 2015, 5, 3, 2250),
                      ('tjake', 'Coup', 2015, 5, 3, 500),
                      ('iamaleksey', 'Coup', 2015, 6, 1, 2500),
                      ('tjake', 'Coup', 2015, 6, 2, 1000),
                      ('pcmanus', 'Coup', 2015, 6, 2, 2000),
                      ('jmckenzie', 'Coup', 2015, 6, 9, 2700),
                      ('jbellis', 'Coup', 2015, 6, 20, 3500),
                      ('jbellis', 'Checkers', 2015, 6, 20, 1200),
                      ('jbellis', 'Chess', 2015, 6, 21, 3500),
                      ('pcmanus', 'Chess', 2015, 1, 25, 3200))
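        # the consistency level set on the prepared statement is inherited by every
        # statement bound from it in the concurrent run below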
        prepared_insert.consistency_level = ConsistencyLevel.ALL
        execute_concurrent_with_args(self.session, prepared_insert, parameters)

        # Attempt to query the data using Named Table interface
        # Also test filtering on mv's
        key_space = NamedKeyspace(ks)
        mv_monthly = key_space.table("monthlyhigh")
        mv_all_time = key_space.table("alltimehigh")
        self.assertTrue(self.check_table_size("scores", key_space, len(parameters)))
        self.assertTrue(self.check_table_size("monthlyhigh", key_space, len(parameters)))
        self.assertTrue(self.check_table_size("alltimehigh", key_space, len(parameters)))

        filtered_mv_monthly_objects = mv_monthly.objects.filter(game='Chess', year=2015, month=6)
        self.assertEqual(len(filtered_mv_monthly_objects), 1)
        self.assertEqual(filtered_mv_monthly_objects[0]['score'], 3500)
        self.assertEqual(filtered_mv_monthly_objects[0]['user'], 'jbellis')
        filtered_mv_alltime_objects = mv_all_time.objects.filter(game='Chess')
        self.assertEqual(len(filtered_mv_alltime_objects), 2)
        self.assertEqual(filtered_mv_alltime_objects[0]['score'], 3500)
Example #47
 def slurp(self, table, stream, concurrency=1000):
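     # results_generator=True makes execute_concurrent_with_args yield its (success, result)
     # pairs lazily, so large streams don't have to be materialized; only failures are yielded back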
     generic_stream = (genericize(self.columns[table], parameters) for parameters in stream)
     for success, result in execute_concurrent_with_args(self.session, self.insert_statements[table], generic_stream, concurrency=concurrency, results_generator=True):
         if not success:
             yield result
    def _base_bootstrap_test(self,
                             bootstrap=None,
                             bootstrap_from_version=None):
        def default_bootstrap(cluster, token):
            node2 = new_node(cluster)
            node2.set_configuration_options(values={'initial_token': token})
            node2.start(wait_for_binary_proto=True)
            return node2

        if bootstrap is None:
            bootstrap = default_bootstrap

        cluster = self.cluster
        tokens = cluster.balanced_tokens(2)
        cluster.set_configuration_options(values={'num_tokens': 1})

        debug("[node1, node2] tokens: %r" % (tokens, ))

        keys = 10000

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        if bootstrap_from_version:
            debug("starting source node on version {}".format(
                bootstrap_from_version))
            node1.set_install_dir(version=bootstrap_from_version)
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        cluster.start(wait_other_notice=True)

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 1)
        self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # record the size before inserting any of our own data
        empty_size = node1.data_size()
        debug("node1 empty size : %s" % float(empty_size))

        insert_statement = session.prepare(
            "INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement,
                                     [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        initial_size = node1.data_size()
        debug("node1 size before bootstrapping node2: %s" %
              float(initial_size))

        # Reads inserted data all during the bootstrap process. We shouldn't
        # get any error
        reader = self.go(lambda _: query_c1c2(
            session, random.randint(0, keys - 1), ConsistencyLevel.ONE))

        # Bootstrapping a new node in the current version
        node2 = bootstrap(cluster, tokens[1])
        node2.compact()

        reader.check()
        node1.cleanup()
        debug("node1 size after cleanup: %s" % float(node1.data_size()))
        node1.compact()
        debug("node1 size after compacting: %s" % float(node1.data_size()))
        time.sleep(.5)
        reader.check()

        debug("node2 size after compacting: %s" % float(node2.data_size()))

        size1 = float(node1.data_size())
        size2 = float(node2.data_size())
        assert_almost_equal(size1, size2, error=0.3)
        assert_almost_equal(float(initial_size - empty_size),
                            2 * (size1 - float(empty_size)))

        assert_bootstrap_state(self, node2, 'COMPLETED')
        if bootstrap_from_version:
            self.assertTrue(
                node2.grep_log('does not support keep-alive',
                               filename='debug.log'))
 def _results_from_concurrent(cls, params):
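     # keep only the result half of each (success, result_or_exc) pair; with the default
     # raise_on_first_error=True a failed query raises here rather than appearing in the list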
     return [results[1] for results in execute_concurrent_with_args(cls.session, cls.prepared, params)]
Example #50
def insert_dataset(stream_key, dataset):
    """
    Insert an xray dataset back into CASSANDRA.
    First we check whether there is already data in the bin; if there is, we either
    overwrite and update the values or fail and let the user know why.
    :param stream_key: Stream that we are updating
    :param dataset: xray dataset we are updating
    :return:
    """
    # All of the bins in the SAN data will be the same in the netcdf file, so take the first
    data_bin = dataset['bin'].values[0]
    data_lists = {}
    size = dataset['index'].size
    # get the metadata partition
    bin_meta = metadata_service_api.get_partition_metadata_record(
        *(stream_key.as_tuple() + (data_bin, CASS_LOCATION_NAME)))
    if bin_meta is not None and not engine.app.config['SAN_CASS_OVERWRITE']:
        # If there is already data and we do not want overwriting return an error
        error_message = "Data present in Cassandra bin {:d} for {:s}. " + \
                        "Aborting operation!".format(data_bin, stream_key.as_refdes())
        log.error(error_message)
        return error_message
    # get the data in the correct format
    cols = SessionManager.get_query_columns(stream_key.stream.name)
    dynamic_cols = cols[1:]
    key_cols = ['subsite', 'node', 'sensor', 'bin', 'method']
    cols = key_cols + dynamic_cols
    arrays = {
        p.name
        for p in stream_key.stream.parameters
        if not p.is_function and p.parameter_type == 'array<quantity>'
    }
    data_lists['bin'] = [data_bin] * size
    # id and provenance are expected to be UUIDs so convert them to uuids
    data_lists['id'] = [uuid.UUID(x) for x in dataset['id'].values]
    data_lists['provenance'] = [
        uuid.UUID(x) for x in dataset['provenance'].values
    ]
    for i in arrays:
        data_lists[i] = [msgpack.packb(x) for x in dataset[i].values.tolist()]
    for dc in dynamic_cols:
        # if it is in the dataset and not already in the datalist we need to put it in the list
        if dc in dataset and dc not in data_lists:
            if '_FillValue' in dataset[dc].attrs:
                temp_val = dataset[dc].values.astype(object)
                temp_val[temp_val == dataset[dc].attrs['_FillValue']] = None
                data_lists[dc] = temp_val
            else:
                data_lists[dc] = dataset[dc].values

    # at this point either there is no metadata for the bin or overwriting is enabled; warn if we are overwriting
    if bin_meta is not None:
        log.warn(
            "Data present in Cassandra bin %s for %s.  Overwriting old and adding new data.",
            data_bin, stream_key.as_refdes())

    # get the query to insert information
    col_names = ', '.join(cols)
    # Take subsite, node, sensor, ?, and method
    key_str = "'{:s}', '{:s}', '{:s}', ?, '{:s}'".format(
        stream_key.subsite, stream_key.node, stream_key.sensor,
        stream_key.method)
    # and the rest as ? for the dynamic columns
    data_str = ', '.join(['?' for _ in dynamic_cols])
    full_str = '{:s}, {:s}'.format(key_str, data_str)
    query = 'INSERT INTO {:s} ({:s}) VALUES ({:s})'.format(
        stream_key.stream.name, col_names, full_str)
    query = SessionManager.prepare(query)

    # make the data list
    to_insert = []
    data_names = ['bin'] + dynamic_cols
    for i in range(size):
        row = [data_lists[col][i] for col in data_names]
        to_insert.append(row)

    ###############################################################
    # Build & execute query to create rows and count the new rows #
    ###############################################################
    primary_key_columns = [
        'subsite', 'node', 'sensor', 'bin', 'method', 'time', 'deployment',
        'id'
    ]
    create_rows_columns = ', '.join(primary_key_columns)
    # Fill in (subsite, node, sensor, method) leaving (bin, time, deployment, id) to be bound
    create_rows_values = "'{:s}', '{:s}', '{:s}', ?, '{:s}', ?, ?, ?".format(
        stream_key.subsite, stream_key.node, stream_key.sensor,
        stream_key.method)
    create_rows_query = 'INSERT INTO {:s} ({:s}) VALUES ({:s}) IF NOT EXISTS'.format(
        stream_key.stream.name, create_rows_columns, create_rows_values)
    create_rows_query = SessionManager.prepare(create_rows_query)
    # We only want (bin, time, deployment, id)
    create_rows_data = numpy.array(to_insert)[:, :4]
    # Execute query
    insert_count = 0
    fails = 0
    for success, result in execute_concurrent_with_args(
            SessionManager.session(),
            create_rows_query,
            create_rows_data,
            concurrency=50,
            raise_on_first_error=False):
        if not success:
            fails += 1
        elif result[0][0]:
            insert_count += 1
    if fails > 0:
        log.warn("Failed to create %d rows within Cassandra bin %d for %s!",
                 fails, data_bin, stream_key.as_refdes())

    # Update previously existing rows and new mostly empty rows
    fails = 0
    for success, _ in execute_concurrent_with_args(SessionManager.session(),
                                                   query,
                                                   to_insert,
                                                   concurrency=50,
                                                   raise_on_first_error=False):
        if not success:
            fails += 1
    if fails > 0:
        log.warn("Failed to update %d rows within Cassandra bin %d for %s!",
                 fails, data_bin, stream_key.as_refdes())
    update_count = len(to_insert) - fails - insert_count

    # Index the new data into the metadata record
    first = dataset['time'].min()
    last = dataset['time'].max()
    bin_meta = metadata_service_api.build_partition_metadata_record(
        *(stream_key.as_tuple() +
          (data_bin, CASS_LOCATION_NAME, first, last, insert_count)))
    metadata_service_api.index_partition_metadata_record(bin_meta)

    ret_val = 'Inserted {:d} and updated {:d} particles within Cassandra bin {:d} for {:s}.'.format(
        insert_count, update_count, data_bin, stream_key.as_refdes())
    log.info(ret_val)
    return ret_val
Example #51
 def _results_from_concurrent(self, params):
     return [
         results[1] for results in execute_concurrent_with_args(
             self.session, self.prepared, params)
     ]
Example #52
def main():
    cluster = Cluster()
    session = cluster.connect()

    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS keyspace1 WITH replication={'class':'SimpleStrategy', 'replication_factor':1}"
    )
    session.execute("""CREATE TABLE IF NOT EXISTS keyspace1.movie_data (
        tconst text,
        titleType text,
        primaryTitle text,
        originalTitle text,
        isAdult boolean,
        startYear int,
        endYear int,
        runtimeMinutes int,
        genres list<text>,
        PRIMARY KEY (tconst, startYear)

    ) WITH CLUSTERING ORDER BY (startYear DESC);
    """)

    prepared_insert = session.prepare(
        "INSERT INTO keyspace1.movie_data (tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, genres)  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )

    data = pd.read_csv('title.basics.tsv', sep='\t', dtype=str)

    pbar = tqdm(data.itertuples(), total=len(data.index))
    rows_data = []
    for r in pbar:
        if pd.isna(r.primaryTitle):
            continue

        d = [
            r.tconst, r.titleType, r.primaryTitle, r.originalTitle,
            parse_adult(r.isAdult),
            nullable_number(r.startYear),
            nullable_number(r.endYear),
            nullable_number(r.runtimeMinutes),
            parse_genres(r.genres)
        ]
        rows_data.append(d)
        if len(rows_data) == 1000:
            try:
                execute_concurrent_with_args(session,
                                             prepared_insert,
                                             rows_data,
                                             concurrency=100)
                rows_data = []
            except Exception:
                # in case of broken data, we try each row on its own
                for row in rows_data:
                    print(row)
                    session.execute(prepared_insert, row)
                # reset the batch so already-processed rows aren't re-inserted with the next chunk
                rows_data = []

    if rows_data:
        execute_concurrent_with_args(session,
                                     prepared_insert,
                                     rows_data,
                                     concurrency=100)
# 2) SINGLE THREAD/PROCESS CONCURRENT INSERT
# ------------------------------------------
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

session = Cluster().connect()

statement = session.prepare(
    'INSERT INTO test.test2 (a, b) VALUES (?,?)'
)

values = [(x, x) for x in range(0, 10000)]

execute_concurrent_with_args(
    session, statement, values, concurrency=100
)


# 3) MULTI-PROCESS CONCURRENT INSERT
# ----------------------------------

from multiprocessing import Process, Queue
import time
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

# Define multiprocessing function
def ccrt(Tuple, Qout):
    session = Cluster().connect()
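    # (the original example is truncated here -- the rest of this worker and the
    #  process fan-out below are a minimal sketch of one plausible continuation,
    #  reusing the test.test2 table from example 2 above; chunk size and concurrency
    #  are assumptions.  Each process opens its own Cluster/session because driver
    #  sessions should not be shared across forked processes.)
    statement = session.prepare('INSERT INTO test.test2 (a, b) VALUES (?,?)')
    execute_concurrent_with_args(session, statement, Tuple, concurrency=100)
    Qout.put(len(Tuple))


# fan the work out over several processes, one chunk of rows per process
if __name__ == '__main__':
    chunks = [[(x, x) for x in range(i, i + 2500)] for i in range(0, 10000, 2500)]
    result_queue = Queue()
    workers = [Process(target=ccrt, args=(chunk, result_queue)) for chunk in chunks]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(sum(result_queue.get() for _ in workers), 'rows inserted')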
Example #54
LIWC_vars = ['c5.{}'.format(i) for i in range(1, 63)]
wordnet_affect_vars = ['c14.{}'.format(i) for i in range(1, 281)]

# prepare the CQL query w/in the active session -- this will catch any major mistakes in the query
q = s.prepare(
    'select gkg_id, url, event_ids, mft_data, gcam_data, event_locations, event_actors, named_entities, tone_avg, source, source_location,  wordcount, themes from gkg_record_by_day where gkg_day = ?;'
)

args = []

for dt in daterange:
    args.append((dt, ))

# bind the generic query to each particular argument and execute them concurrently, with at most 'concurrency' requests in flight at a time
results = execute_concurrent_with_args(
    s, q, args, concurrency=4
)  # tricky quirk of the cassandra driver -- must cast the date argument to a tuple, which NB requires the trailing comma!

# initialize a nested dictionary for output
data_dict = defaultdict(dict)

for result in results:
    # the execute_concurrent functions return a higher-level object than session.execute
    # need to check that the query ran successfully, then access all rows in result (nested ResultSet object in property .result_or_exc)
    if result.success:
        for row in result.result_or_exc:
            data_dict[row.gkg_id]['url'] = row.url
            data_dict[row.gkg_id]['entities'] = row.named_entities
            data_dict[row.gkg_id]['themes'] = row.themes
            data_dict[row.gkg_id]['tone'] = row.tone_avg
            data_dict[row.gkg_id]['wordcount'] = row.wordcount
Example #55
def load_concurrent(particles, concurrent_size=50):
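    # relies on a module-level `session` and prepared `insert` statement defined
    # earlier in the original script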
    execute_concurrent_with_args(session,
                                 insert,
                                 particles,
                                 concurrency=concurrent_size)