def run_inserts_at_version(self, proto_ver):
        session = Cluster(protocol_version=proto_ver).connect(self.keyspace_name)
        try:
            p = session.prepare('insert into t (k, v) values (?, ?)')
            session.execute(p, (0, [{1, 2}, {3, 5}]))

            p = session.prepare('insert into u (k, v) values (?, ?)')
            session.execute(p, (0, {(1, 2), (3, 5)}))

            p = session.prepare('insert into v (k, v, v1) values (?, ?, ?)')
            session.execute(p, (0, {(1, 2): [1, 2, 3], (3, 5): [4, 5, 6]}, (123, 'four')))

            p = session.prepare('insert into w (k, v) values (?, ?)')
            session.execute(p, (0, ({1: [1, 2, 3], 2: [4, 5, 6]}, [7, 8, 9])))

        finally:
            session.cluster.shutdown()
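
# The tables t, u, v and w above are assumed to hold nested collection columns that
# match the bound values, e.g. t.v as list<frozen<set<int>>> and v.v as
# map<frozen<tuple<int, int>>, frozen<list<int>>>.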
def ccrt(Tuple, Qout):
    session = Cluster().connect()

    statement = session.prepare(
        'INSERT INTO test.test2 (a, b) VALUES (?,?)'
    )

    values = [(x,x) for x in xrange(Tuple[0], Tuple[1])]

    execute_concurrent_with_args(
        session, statement, values, concurrency=100
    )

    Qout.put(time.time())
class CassandraService:
    def __init__(self, keyspace='data'):
        self.session = Cluster().connect(keyspace)
        self.user_insert_stmt = self.prepare_user_insert_statement()
        self.tweet_insert_stmt = self.prepare_tweet_insert_statement()
        self.exception_insert_stmt = self.prepare_exception_insert_statement()

    def save_user(self, user: User):
        self.session.execute(self.user_insert_stmt, vars(user))

    def save_tweet(self, tweet: Tweet):
        self.session.execute(self.tweet_insert_stmt, vars(tweet))

    def save_exception(self, exception_data: dict):
        self.session.execute(self.exception_insert_stmt, exception_data)

    def prepare_user_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019 (id, captured_at, created_at, description, entities, favourites_count,
                followers_count, following_count, friends_count, listed_count, name, pinned_tweet_id, profile_image_url,
                protected, tweets_count, url, user_name)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """)

    def prepare_tweet_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_tweets (id, captured_at, created_at, user_id, timezone, content, link, retweet,
                mentions, urls, photos, video, lang, replies_count, retweets_count, likes_count, hashtags, cashtags)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """)

    def prepare_exception_insert_statement(self):
        return self.session.prepare("""
            INSERT INTO pronbots_2019_exceptions (user, exception)
            VALUES (?, ?)
            """)
# Example 4
def insert_data():
    from cassandra.cluster import Cluster
    from cassandra import AlreadyExists
    # Connect to the Cassandra cluster using the "test" keyspace
    session = Cluster([
        '192.168.101.140', '192.168.101.141', '192.168.101.142',
        '192.168.101.143'
    ]).connect("test")
    try:
        # First, create the table
        session.execute("CREATE TABLE test1 (id int PRIMARY KEY, col1 text)")
        # Insert data
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
    except AlreadyExists:
        # Insert data into the existing table
        insertar = session.prepare(
            "INSERT INTO test1 (id, col1) VALUES (?, ?)")
        for i in range(10):
            session.execute(insertar, (i, 'hola' + str(i)))
            print(i)
def insert_data(headers, data):
    ## Connect to Scylla cluster and create schema
    # session = cassandra.cluster.Cluster(SCYLLA_IP).connect()
    print("")
    print("## Connecting to Scylla cluster -> Creating schema")
    session = Cluster(SCYLLA_IP).connect()
    session.execute(create_ks)
    session.execute(create_t1)

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch -> Creating 'Catalog' index")
    es = Elasticsearch(ES_IP)

    ## Create Elasticsearch index. Ignore 400 = IF NOT EXIST
    es.indices.create(index="catalog", ignore=400)

    ## Non-prepared CQL statement
    #cql = "INSERT INTO catalog.apparel(sku,brand,group,sub_group,color,size,gender) VALUES(%(sku)s,%(brand)s,%(group)s,%(sub_group)s,%(color)s,%(size)s,%(gender)s)"

    ## Prepared CQL statement
    print("")
    print("## Preparing CQL statement")
    cql = "INSERT INTO catalog.apparel (sku,brand,group,sub_group,color,size,gender) VALUES (?,?,?,?,?,?,?) using TIMESTAMP ?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = (
        ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM)
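    # The trailing ? in the prepared statement is a bind marker for the USING
    # TIMESTAMP clause, i.e. a client-supplied write timestamp in microseconds
    # since the epoch.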

    print("")
    print("## Insert csv content into Scylla and Elasticsearch")

    for d in data:
        # See if we need to add code to wait for the ack. This should be synchronous.
        # Also, might need to switch to prepared statements to set the consistency level for sync requests.
        session.execute(cql_prepared, d)

        res = es.index(index="catalog",
                       doc_type="apparel",
                       id=d["sku"],
                       body=d)

    ## After all the inserts, make a refresh, just in case
    print("")
    print("## Inserts completed, refreshing index")
    es.indices.refresh(index="catalog")

    print("")
# Example 6
class Searcher:
    start = 0
    def __init__(self,name):

        self.filename = mapper[name]['search']
        Searcher.start = datetime.now()

        self.log1 = getLogger('time', 'search_time.log')
        self.log2 = getLogger('result', 'search_result.log')

        self.log2.info('## connecting to cassandra cluster')
        self.session = Cluster(DSE_IP).connect()

        cql = "SELECT * FROM reddit.comment WHERE solr_query=?"
        self.solr_query = '{{"q":"body:*{0}*"}}'
        self.cql_prepared = self.session.prepare(cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.ONE
        
     
    def search(self):
        counter = 0

        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter,line in enumerate(f):
                try:
                    line = line.strip('\n')
                    if counter%100 == 0:
                        self.log1.info(str(counter))
                        if counter%1000 == 0:
                            self.log1.info('{}'.format(datetime.now()- Searcher.start))
                            self.log1.info('')
                    self.log2.info('({})'.format(line))
                    
                    # format query body
                    data = self.solr_query.format(line)
                    
                    res = self.session.execute(self.cql_prepared,[data],timeout=60000)
                    
                    self.log2.info(res[0])
                except Exception as e:
                    self.log2.warn(e)


        self.log1.info('total time:{}'.format(datetime.now() - Searcher.start))
def load(datafile, ks_name, table_name):
    with open(datafile) as f:
        reader = csv.reader(f)
        session = Cluster().connect()

        header = next(reader)

        stmt = (
            "INSERT INTO {ks}.{tab} ({header_spec}) VALUES ({qs});").format(
                ks=ks_name,
                tab=table_name,
                header_spec=', '.join(header),
                qs=', '.join(['?' for _ in header]))
        print('preparing "{stmt}"'.format(stmt=stmt), file=sys.stderr)
        prepared = session.prepare(stmt)

        data = csv_handle_to_nested_list(reader)
        print('About to load {n} rows'.format(n=len(data)))
        for row in data:
            session.execute(prepared, row)
# Example 8
class Loader:
    start = 0
    def __init__(self,name):
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('scy+es','load.log')
        
        Loader.start = datetime.now()

    
        self.index = name
        self.stat = Statement(name)

        self.filename = mapper[name]['load']

        self.__init_scy()
        self.__init_es()
        
        self.pool_scy = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)
        self.pool_es = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10)

    def load(self):

        if self.index == 'reddit':
            # main loop
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool_scy.submit(self.__insert_data,line)
                    self.pool_es.submit(self.__insert_index,line)
                except StopIteration:
                    break
                except Exception:
                    continue 
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename,encoding='utf-8')
            end = df.index.max()
            df = df.fillna('missing')
            try:
                for line in zip(range(0,end+1),df['userId'],df['productId'],
                                    df['rating'],df['title'],df['comment'],df['timestamp']):
                    counter = line[0]
                    if counter%100000 == 0:
                        self.log.info(str(counter))
                        if counter%1000000 == 0:
                            self.log.info('{}'.format(datetime.now()-Loader.start))
                            self.log.info('')
                    self.pool_scy.submit(self.__insert_data,line)
                    self.pool_es.submit(self.__insert_index,line)
            except Exception as e:
                print(e)
            
        self.pool_scy.shutdown()
        self.pool_es.shutdown()
        
        self.es.indices.refresh(index=self.index)
        # shutdown
        self.session.shutdown()

        # print information
        self.log.info('## Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')


    def __line_generator(self):

        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter,line in enumerate(f):
                try:
                    if counter%100000 == 0:
                        self.log.info(str(counter))
                        if counter%1000000 == 0:
                            self.log.info('{}'.format(datetime.now()-Loader.start))
                            self.log.info('')
                    yield line
                except Exception as e:
                    print(e)
                    continue


    # get scylladb connect, create ks and tb, return session
    def __init_scy(self):
        # session = Cluster(contact_points=SCYLLA_IP,execution_profiles={EXEC_PROFILE_DEFAULT:ep}).connect()
        self.session = Cluster(contact_points=SCYLLA_IP).connect()
        # create a schema
        self.session.execute(self.stat.create_ks)
        # create a tb
        self.session.execute(self.stat.create_tb)

        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM



    # get es connect, create index
    def __init_es(self):

        with open('mapping.json','r') as f:
           mapping = json.load(f)[self.index] 

        self.es = Elasticsearch(ES_IP, timeout=30)
        # create es index
        self.es.indices.create(index=self.index, body=mapping, ignore=400, timeout=30)



    # insert data into scylladb
    def __insert_data(self,line):

        data = list()
        # TODO: format your data
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'], line['subreddit_id'],line['author'], line['body'], int(line['created_utc'])]
        elif self.index == 'amazon':
            #       id | user_id | product_id | rating | title | body | timestamp
            data = [line[0], int(line[1]), int(line[2]), line[3], line[4], line[5], line[6]]
        self.log.info(data)
        res = self.session.execute(self.cql_prepared,data,timeout=60000)
        return 
    


    # insert data into elasticsearch
    def __insert_index(self,line): 

        # TODO: format your data
        if self.index == 'reddit':
            data = { k:v for k,v in line.items() if k in ['id', 'name', 'author', 'body'] }
        elif self.index == 'amazon':
            data = {}
            data['id'] = line[0]
            data['title'] = line[4]
            data['body'] = line[5]

        res = self.es.index(index=self.index, doc_type="comment", id=data['id'], body=data)
        return 
    def _contention_test(self, threads, iterations):
        """
        Test threads repeatedly contending on the same row.
        """

        verbose = False

        session = self.prepare(nodes=3)
        session.execute(
            "CREATE TABLE test (k int, v int static, id int, PRIMARY KEY (k, id))"
        )
        session.execute("INSERT INTO test(k, v) VALUES (0, 0)")

        class Worker(Thread):
            def __init__(self, wid, session, iterations, query):
                Thread.__init__(self)
                self.wid = wid
                self.iterations = iterations
                self.query = query
                self.session = session
                self.errors = 0
                self.retries = 0

            def run(self):
                i = 0
                prev = 0
                while i < self.iterations:
                    done = False
                    while not done:
                        try:
                            res = self.session.execute(
                                self.query, (prev + 1, prev, self.wid))
                            if verbose:
                                print("[%3d] CAS %3d -> %3d (res: %s)" %
                                      (self.wid, prev, prev + 1, str(res)))
                            if res[0][0] is True:
                                done = True
                                prev = prev + 1
                            else:
                                self.retries = self.retries + 1
                                # There are 2 conditions, so 2 reasons to fail: if we failed because the row with our
                                # worker ID already exists, it means we timed out earlier but our update did go in,
                                # so consider this a success
                                prev = res[0][3]
                                if res[0][2] is not None:
                                    if verbose:
                                        print(
                                            "[%3d] Update was inserted on previous try (res = %s)"
                                            % (self.wid, str(res)))
                                    done = True
                        except WriteTimeout as e:
                            if verbose:
                                print("[%3d] TIMEOUT (%s)" %
                                      (self.wid, str(e)))
                            # This means a timeout: just retry, if it happens that our update was indeed persisted,
                            # we'll figure it out on the next run.
                            self.retries = self.retries + 1
                        except Exception as e:
                            if verbose:
                                print("[%3d] ERROR: %s" % (self.wid, str(e)))
                            self.errors = self.errors + 1
                            done = True
                    i = i + 1
                    # Clean up for next iteration
                    while True:
                        try:
                            self.session.execute(
                                "DELETE FROM test WHERE k = 0 AND id = %d IF EXISTS"
                                % self.wid)
                            break
                        except WriteTimeout as e:
                            pass

        nodes = self.cluster.nodelist()
        workers = []

        session = Cluster([nodes[0].ip_addr],
                          connect_timeout=15,
                          idle_heartbeat_interval=0,
                          execution_profiles={
                              EXEC_PROFILE_DEFAULT:
                              ExecutionProfile(request_timeout=60)
                          }).connect('ks')
        q = session.prepare("""
                BEGIN BATCH
                   UPDATE test SET v = ? WHERE k = 0 IF v = ?;
                   INSERT INTO test (k, id) VALUES (0, ?) IF NOT EXISTS;
                APPLY BATCH
            """)

        for n in range(0, threads):
            workers.append(Worker(n, session, iterations, q))

        start = time.time()

        for w in workers:
            w.start()

        for w in workers:
            w.join()

        if verbose:
            runtime = time.time() - start
            print("runtime:", runtime)

        query = SimpleStatement("SELECT v FROM test WHERE k = 0",
                                consistency_level=ConsistencyLevel.ALL)
        rows = session.execute(query)
        value = rows[0][0]

        errors = 0
        retries = 0
        for w in workers:
            errors = errors + w.errors
            retries = retries + w.retries

        assert (value == threads * iterations) and (
            errors == 0), "value={}, errors={}, retries={}".format(
                value, errors, retries)
def query_es(NUM_FILTERS):

    ## Connect to Elasticsearch
    print("")
    print("## Connecting to Elasticsearch")
    es = Elasticsearch(ES_IP)

    if NUM_FILTERS == 'single':
        ## Search using single field filter (group: 'pants')
        print("")
        print("## Searching for 'pants' in Elasticsearch (filter by group)")
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "match": {
                                    "group": "pants"
                                }
                            },
                            "size": 1000
                        })

    if NUM_FILTERS == 'multiple':
        ## Search using multiple fields filter (color: 'white' AND sub_group: 'softshell')
        print("")
        print(
            "## Searching for 'white softshell' in Elasticsearch (filter by color + sub_group)"
        )
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "bool": {
                                    "must": [{
                                        "match": {
                                            "color": "white"
                                        }
                                    }, {
                                        "match": {
                                            "sub_group": "softshell"
                                        }
                                    }]
                                }
                            },
                            "size": 1000
                        })

    if NUM_FILTERS == 'none':
        ## Search with NO filters (match_all)
        print("")
        print("## Searching with NO filter = 'match_all' in Elasticsearch")
        res = es.search(index="catalog",
                        doc_type="apparel",
                        body={
                            "query": {
                                "match_all": {}
                            },
                            "size": "1000"
                        })

    print("")
    print("## %d documents returned" % res['hits']['total'])

    es_results = [doc['_id'] for doc in res['hits']['hits']]

    ## Connect to Scylla
    print("")
    print("## Connecting to Scylla")
    session = Cluster(SCYLLA_IP).connect()

    ## Prepared cql statement
    print("")
    print("## Preparing CQL statement")
    cql = "SELECT * FROM catalog.apparel WHERE sku=?"
    cql_prepared = session.prepare(cql)
    cql_prepared.consistency_level = (
        ConsistencyLevel.ONE if random.random() < 0.2 else ConsistencyLevel.QUORUM)

    ## Query Scylla
    print("")
    print("## Query Scylla using SKU/s returned from Elasticsearch")

    print("")
    print("## Final results from Scylla:")
    print("")
    for r in es_results:
        scylla_res = session.execute(cql_prepared, (r, ))
        print("%s" % ([list(row) for row in scylla_res]))

    #for doc in res['hits']['hits']:

    ## Print all columns in Elasticsearch result set
    #print("SKU: %s | Color: %s | Size: %s | Brand: %s | Gender: %s | Group: %s | Sub_Group: %s" % (doc['_id'], doc['_source']['color'], doc['_source']['size'], doc['_source']['brand'], doc['_source']['gender'], doc['_source']['group'], doc['_source']['sub_group']))

    ## Print only the id (sku) in the result set
    #print("SKU: %s" % (doc['_id']))

    print("")
class KairosdbFinder(object):
    __fetch_multi__ = "kairosdb"

    def __init__(self, config):
        cfg = config.get('kairosdb', {})
        es = cfg.get('es', {})
        cas = cfg.get('cassandra', {})
        self.config = {
            "cassandra": {
                "hosts": cas.get('hosts', ["localhost"]),
                "port": cas.get('port', 9042),
            },
            "es": {
                "url": es.get('url', 'http://localhost:9200')
            }
        }
        logger.info("initialize kairosdbFinder", config=self.config)
        self.es = Elasticsearch([self.config['es']['url']])
        self.cassandra = Cluster(
            self.config['cassandra']['hosts'],
            self.config['cassandra']['port']).connect('kairosdb')
        self.metric_lookup_stmt = self.cassandra.prepare(
            'SELECT * FROM data_points WHERE key=? AND column1 > ? AND column1 <= ?'
        )

    def find_nodes(self, query):
        seen_branches = set()
        leaf_regex = self.compile_regex(query, False)
        #query Elasticsearch for paths
        matches = self.search_series(leaf_regex, query)
        leafs = {}
        branches = {}
        for metric in matches:
            if metric.is_leaf():
                if metric.name in leafs:
                    leafs[metric.name].append(metric)
                else:
                    leafs[metric.name] = [metric]
            else:
                if metric.name in branches:
                    branches[metric.name].append(metric)
                else:
                    branches[metric.name] = [metric]

        for name, metrics in leafs.iteritems():
            yield KairosdbLeafNode(name, KairosdbReader(self.config, metrics))
        for branchName, metrics in branches.iteritems():
            name = branchName
            while '.' in name:
                name = name.rsplit('.', 1)[0]
                if name not in seen_branches:
                    seen_branches.add(name)
                    if leaf_regex.match(name) is not None:
                        yield BranchNode(name)

    def fetch_from_cassandra(self, nodes, start_time, end_time):
        # datapoints are stored in rows that span a 3-week period,
        # so we need to determine the 1 or more periods we need to query.
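        # 1814400 seconds = 21 days (3 weeks); start_period below is start_time
        # rounded down to the nearest 3-week row boundary.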
        periods = []
        start_period = start_time - (start_time % 1814400)
        periods.append({
            'key': start_period,
            'start': start_time,
            'end': end_time
        })
        end_period = end_time - (end_time % 1814400)
        if start_period != end_period:
            pos = start_period + 1814400
            count = 0
            while pos <= end_period:
                periods.append({'key': pos, 'start': pos, 'end': end_time})
                # set the end_time range boundary of the last period to the end of that period.
                periods[count]['end'] = pos - 1
                count += 1
                pos += 1814400

        # we now need to generate all of the row_keys that we need.
        # we store an array of tuples, where each tuple is the (row_key, start_offset, end_offset)
        query_args = []
        node_index = {}
        datapoints = {}
        for node in nodes:
            for metric in node.reader.metrics:
                measurement = metric.metric
                tags = ""
                tag_list = metric.tags
                tag_list.append('org_id:%d' % g.org)
                for tag in sorted(tag_list):
                    parts = tag.split(":", 2)
                    tags += "%s=%s:" % (parts[0], parts[1])

                #keep a map between the measurement+tags to the node.path
                node_index["%s\0%s" % (measurement, tags)] = node.path

                #initialize where we will store the data.
                datapoints[node.path] = {}

                # now build our query_args
                for data_type in [
                        "kairos_double", "kairos_long"
                ]:  #request both double and long values as kairos makes it impossible to know which in advance.
                    data_type_size = len(data_type)
                    for p in periods:
                        row_timestamp = p['key'] * 1000
                        row_key = "%s00%s00%s%s%s" % (
                            measurement.encode('hex'), "%016x" % row_timestamp,
                            "%02x" % data_type_size, data_type.encode('hex'),
                            tags.encode('hex'))
                        logger.debug("cassandra query", row_key=row_key)
                        start = (p['start'] - p['key']) * 1000
                        end = (p['end'] - p['key']) * 1000

                        #The timestamps are shifted to support legacy datapoints that
                        #used the extra bit to determine if the value was long or double
                        row_key_bytes = bytearray(row_key.decode('hex'))
                        try:
                            start_bytes = bytearray(
                                struct.pack(">L", start << 1))
                        except Exception as e:
                            logger.error("failed to pack %d" % start)
                            raise e
                        try:
                            end_bytes = bytearray(struct.pack(">L", end << 1))
                        except Exception as e:
                            logger.error("failed to pack %d" % end)
                            raise e

                        query_args.append(
                            (row_key_bytes, start_bytes, end_bytes))

        #perform cassandra queries in parallel using async requests.
        futures = []
        for args in query_args:
            futures.append(
                self.cassandra.execute_async(self.metric_lookup_stmt, args))

        # wait for them to complete and use the results
        for future in futures:
            rows = future.result()
            first = True
            for row in rows:
                if first:
                    row_key = parse_row_key(row.key)
                    path = node_index["%s\0%s" % (row_key['measurement'],
                                                  row_key['tags'])]

                    if path not in datapoints:
                        datapoints[path] = {}
                    first = False

                ts = parse_row_ts(row.column1, row_key['row_timestamp'])
                try:
                    if row_key['data_type'] == "kairos_double":
                        value = struct.unpack(">d", row.value)[0]
                    else:
                        value = unpack_kairos_long(row.value)
                except Exception as e:
                    logger.error("failed to parse value",
                                 exception=e,
                                 data_type=row_key['data_type'])
                    value = None
                datapoints[path][ts] = value

        return datapoints

    def fetch_multi(self, nodes, start_time, end_time):
        step = None
        for node in nodes:
            for metric in node.reader.metrics:
                if step is None or metric.interval < step:
                    step = metric.interval

        with statsd.timer("graphite-api.fetch.kairosdb_query.query_duration"):
            data = self.fetch_from_cassandra(nodes, start_time, end_time)
        series = {}
        delta = None
        with statsd.timer(
                "graphite-api.fetch.unmarshal_kairosdb_resp.duration"):
            for path, points in data.items():
                datapoints = []
                next_time = start_time
                timestamps = points.keys()
                timestamps.sort()
                max_pos = len(timestamps)

                if max_pos == 0:
                    for i in range(int((end_time - start_time) / step)):
                        datapoints.append(None)
                    series[path] = datapoints
                    continue

                pos = 0

                if delta is None:
                    delta = (timestamps[0] % start_time) % step
                    # ts[0] is always greater than start_time.
                    if delta == 0:
                        delta = step

                while next_time <= end_time:
                    # check if there are missing values from the end of the time window
                    if pos >= max_pos:
                        datapoints.append(None)
                        next_time += step
                        continue

                    ts = timestamps[pos]
                    # read in the metric value.
                    v = points[ts]

                    # pad missing points with null.
                    while ts > (next_time + step):
                        datapoints.append(None)
                        next_time += step

                    datapoints.append(v)
                    next_time += step
                    pos += 1
                    if (ts + step) > end_time:
                        break

                series[path] = datapoints

        if delta is None:
            delta = 1
        time_info = (start_time + delta, end_time, step)
        return time_info, series

    def compile_regex(self, query, branch=False):
        # we turn graphite's custom glob-like thing into a regex, like so:
        # * becomes [^\.]*
        # . becomes \.
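        # e.g. the leaf pattern "servers.*.cpu{user,sys}" compiles to the regex
        #      ^servers\.[^\.]*\.cpu(user|sys)$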
        if branch:
            regex = '{0}.*'
        else:
            regex = '^{0}$'

        regex = regex.format(
            query.pattern.replace('.', '\.').replace('*', '[^\.]*').replace(
                '{', '(').replace(',', '|').replace('}', ')'))
        logger.debug("compile_regex", pattern=query.pattern, regex=regex)
        return re.compile(regex)

    def search_series(self, leaf_regex, query):
        branch_regex = self.compile_regex(query, True)

        search_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "or": [{
                            "term": {
                                "org_id": g.org
                            }
                        }, {
                            "term": {
                                "org_id": -1
                            }
                        }]
                    },
                    "query": {
                        "regexp": {
                            "name": branch_regex.pattern
                        }
                    }
                }
            }
        }

        with statsd.timer(
                "graphite-api.search_series.es_search.query_duration"):
            ret = self.es.search(index="metric",
                                 doc_type="metric_index",
                                 body=search_body,
                                 size=10000)
            matches = []
            if len(ret["hits"]["hits"]) > 0:
                for hit in ret["hits"]["hits"]:
                    leaf = False
                    source = hit['_source']
                    if leaf_regex.match(source['name']) is not None:
                        leaf = True
                    matches.append(RaintankMetric(source, leaf))
            logger.debug('search_series', matches=len(matches))
        return matches
statement = 'INSERT INTO test.test2 (a, b) VALUES (%s, %s);'

for i in xrange(100000):
    insert = statement %(i,i)
    session.execute(insert)


# 2) SINGLE THREAD/PROCESS CONCURRENT INSERT
# ------------------------------------------
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

session = Cluster().connect()

statement = session.prepare(
    'INSERT INTO test.test2 (a, b) VALUES (?,?)'
)

values = [(x,x) for x in xrange(0, 10000)]

execute_concurrent_with_args(
    session, statement, values, concurrency=100
)


# 3) MULTI-PROCESS CONCURRENT INSERT
# ----------------------------------

from multiprocessing import Process, Queue
import time
from cassandra.cluster import Cluster
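
# A minimal sketch of how the multi-process section might continue, reusing a
# worker like ccrt(Tuple, Qout) shown earlier in this listing; the process count
# and per-process row range below are illustrative assumptions.
if __name__ == '__main__':
    processes = 4
    rows_per_process = 2500
    out_queue = Queue()

    start = time.time()
    workers = [
        Process(target=ccrt,
                args=((n * rows_per_process, (n + 1) * rows_per_process), out_queue))
        for n in range(processes)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    # each worker puts its completion timestamp on the queue; report the slowest
    finished = max(out_queue.get() for _ in workers)
    print('inserted %d rows in %.2f s' % (processes * rows_per_process,
                                          finished - start))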
# Example 13
def connect():
    global session, prepared
    session = Cluster().connect()
    prepared = session.prepare(
        "SELECT title FROM wiki.categories WHERE category=?")
# Example 14
class Reassembler(Process):
    def initCluster(self):
        auth_provider = PlainTextAuthProvider(username=cfg.cassandraConfig["user"], password=cfg.cassandraConfig["password"])
        self.session = Cluster([cfg.cassandraConfig["host"]], auth_provider=auth_provider).connect(cfg.cassandraConfig["db"])
        self.preparedQuery = self.session.prepare("""INSERT into packet (source_addr, dest_addr, time_stamp, content, text_values)
                                                        VALUES (?,?,?,?,?)""")

    def insertIntoDatabase(self, sourceAddr, destAddr, timeStamp, content, textValues):
        args = [sourceAddr, destAddr, timeStamp, content, textValues]
        self.session.execute(self.preparedQuery, args)

    def __init__(self, ports):
        self.ports = []
        port_list = ports.split(',')      # split CSV port list arguments
        self.ports = map(int, port_list)  # convert ports to int (from string)
        self.initCluster()
        nids.register_tcp(self.handleTcpStream)                     # set up call back
        
    def __call__(self):     # make a singleton class
        return self
    
    def printableHex(self, buf):
        return ' '.join(x.encode('hex') for x in buf)
        
    def handleTcpStream(self, tcp):
        end_states = (nids.NIDS_CLOSE, nids.NIDS_TIMEOUT, nids.NIDS_RESET)
        logging.debug('tcps - {0} state: {1} timestamp: {2}'.format(str(tcp.addr),tcp.nids_state,nids.get_pkt_ts() * 1000))
        if tcp.nids_state == nids.NIDS_JUST_EST:
            # new to us, but do we care?
            ((src, sport), (dst, dport)) = tcp.addr
            #if dport in self.ports:
            logging.info('collecting: {}'.format(str(tcp.addr)))
            tcp.client.collect = 1
            tcp.server.collect = 1
        elif tcp.nids_state == nids.NIDS_DATA:
            tcp.discard(0)
            # keep all of the stream's new data
            #informs nids how many bytes in the stream to discard
            #((src, sport), (dst, dport)) = tcp.addr
            #serverData = tcp.server.data[:tcp.server.count]
            #clientData = tcp.client.data[:tcp.client.count]
            #envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            #envelopeRegex2 = '</.+:[Ee]nvelope'
            #if serverData is None or clientData is None:
            #   tcp.discard(0)
            #else:
            #   if "Expect: 100-continue" not in serverData:
            #       tcp.discard(0)
            #   else:
            #       if (re.search(envelopeRegex,serverData,re.S | re.IGNORECASE) and re.search(envelopeRegex2,serverData,re.S | re.IGNORECASE) and re.search(envelopeRegex,clientData,re.S | re.IGNORECASE) and re.search(envelopeRegex2,clientData,re.S | re.IGNORECASE)):
            #           tcpaddr = ((dst,dport),(src,sport))
            #           logging.debug( "count_new: {}".format(tcp.server.count_new))
            #           logging.debug( "offset server: {}".format(tcp.server.offset))
            #           self.process_ipframe(serverData,tcp.addr,self.timestamp)
#
            #           logging.debug( "count_new: {}".format(tcp.server.count_new))
            #           logging.debug( "offset client: {}".format(tcp.client.offset))
            #           tcpaddr = ((dst,dport),(src,sport)) #flip it around to match our point of view (since this is the client
            #           self.process_ipframe(clientData,tcpaddr,self.timestamp)
            #           tcp.discard(tcp.server.count + tcp.client.count)
            #       else:
            #           tcp.discard(0)
        elif tcp.nids_state in end_states:
            ((src,sport),(dst,dport)) = tcp.addr
            serverData = tcp.server.data[:tcp.server.count]
            clientData = tcp.client.data[:tcp.client.count]
            #logging.debug("serverData: {0}".format(    serverData))
            #logging.debug("clientData: {0}".format(clientData))
            self.timestamp = nids.get_pkt_ts() * 1000
            #Add the MultiSpeakMsgHeader since we observed way too many false positives during
            #the virtual field test
            envelopeRegex = '<soap.*:envelope.*<.*MultiSpeakMsgHeader.*<soap.*:envelope>'
            logging.info("Serv Count: {0} Client Count {1} newc: {2} news: {3}".format(tcp.server.count,tcp.client.count,tcp.client.count_new,tcp.server.count_new))
            #Match even if there is a newline since we've observed some payloads with the newline
            #print("server is ", tcp.server.data[:tcp.server.count], "client is ", tcp.client.data[:tcp.client.count], "count new is ", tcp.server.count_new)
            if serverData is not None:
                serverData = serverData.replace("\n","")
                if (re.search(envelopeRegex,serverData,re.S | re.IGNORECASE)): #and tcp.server.count_new > 0):
                    logging.info('full message found in tcp server data')
                    payload = tcp.server.data[:tcp.server.count]
                    #tcpaddr = ((dst,dport),(src,sport))
                    logging.debug( "count_new: {}".format(tcp.server.count_new))
                    logging.debug( "offset server: {}".format(tcp.server.offset))
                    self.process_ipframe(payload,tcp.addr,self.timestamp)
                    tcp.discard(tcp.server.count)
                elif "multispeak" in serverData.lower():
                    logging.warning("multispeak serverData but envelope failed: {}".format(serverData))
            if clientData is not None:
                clientData = clientData.replace("\n","")
                if (re.search(envelopeRegex,clientData, re.S | re.IGNORECASE)):
                    logging.info('full message found in tcp client data')
                    tcpaddr = ((dst,dport),(src,sport)) #flip it around to match our point of view (since this is the client)
                    payload = tcp.client.data[:tcp.client.count]
                    logging.debug("count_new client: {}".format(tcp.client.count_new))
                    logging.debug( "offset client: {}".format(tcp.client.offset))
                    self.process_ipframe(payload,tcpaddr,self.timestamp) #modified tcpaddr
                    tcp.discard(tcp.client.count)
                elif "multispeak" in clientData.lower():
                    logging.warning("multispeak clientData but envelope failed: {}".format(clientData))
            logging.debug( "addr: {}".format(tcp.addr))
            logging.debug( "To server:")
            logging.debug( "bytes {}".format(str(tcp.server.count)))
            logging.debug( "To client:")
            logging.debug( "bytes: {}".format(str(tcp.client.count)))

    def process_ipframe(self,frame,tcpaddr, timestamp):
        # note that we are no longer checking source IP addresses
        # so we could be processing frames from other ips if not filtered
        # before this point
        cleansedFrame = frame.replace("\n","").replace("\r","").replace("\t","")
        envelopeRegex = '</.+:[Ee]nvelope'
        match = re.search(envelopeRegex,cleansedFrame)
        if match:
            ((src, sport),(dst,dport)) = tcpaddr
            #try to find the endpoint type from POST Request
            endpointRegex = '(?<=POST\s).*(?=\sHTTP)'   # looks for POST and HTTP, and matches the URL
            match = re.search(endpointRegex,cleansedFrame, re.IGNORECASE)
            if match is not None:
                URLsplit = cleansedFrame[match.start():match.end()].split('/')  # [foo,QA_SERVER]
                endpointCodeSplit = URLsplit[-1].split('_') # [QA,SERVER]
                endpointCode = endpointCodeSplit[0] # QA
                logging.debug("parsed MS endpoint code: {}".format(endpointCode))
            else:
                endpointCode = 'NULL'
                logging.warning("Unable to parse endpoint code from header")
                #get the version from the SOAPAction http header
            versionRegex = '(?<=SOAPAction:\s"http:\/\/www.multispeak.org\/Version_).'
            match = re.search(versionRegex,cleansedFrame, re.IGNORECASE)
            mspVersion = 'NULL'
            if match is not None:
                mspVersion = cleansedFrame[match.start()]
                if (mspVersion != '3' and mspVersion != '5'):
                    mspVersion = 'NULL'
            else:
                versionRegex2 = '(?<=SOAPAction:\shttp:\/\/www.multispeak.org\/Version_).' # uses lookbehind to extract only version num
                match2 = re.search(versionRegex2, cleansedFrame, re.IGNORECASE)
                if match2 is not None:
                    mspVersion = cleansedFrame[match2.start()]
                    if (mspVersion != '3' and mspVersion != '5'):
                        mspVersion = 'NULL'
                else:
                    versionRegex3 = '<MultiSpeakMsgHeader[^>]* Version="(\d)\.'
                    match3 = re.search(versionRegex3, cleansedFrame, re.IGNORECASE)
                    if match3 is not None:
                        mspVersion = match3.group(1)
                        if (mspVersion != '3' and mspVersion != '5'):
                            mspVersion = 'NULL'
                    

#           messageNameRegex = '(<[\w:]*[Bb]ody>)(\s*)(<)(?P<MsgName>[\w|:]+)'
            messageNameRegex = '(<([\w-]+:)?body>)(\s*)(<)(([\w-]+:)?)(?P<MsgName>\w*)'
            match = re.search(messageNameRegex,cleansedFrame, re.IGNORECASE)
            if match is not None:
                    messageName = match.group('MsgName').split(':')[-1]
                    text_values = {'endpoint':endpointCode,'messagetype':messageName,'mspVersion':mspVersion}
                    logging.debug( "text values: {}".format(text_values))
                    self.insertIntoDatabase(src, dst, timestamp, frame, text_values)
                    print("inserted packet")
            else:
                    print("MsgName not found")
                    logging.debug("frame with no MsgName: {}".format(cleansedFrame))
        else:
            logging.debug("end of envelope not found: {}".format(cleansedFrame))
            print("end of envelope not found")
# Example 15
from cassandra.query import dict_factory
from cassandra.cluster import Cluster
import cassandra_client as client


KEYSPACE = "user_ratings"
TABLE = "ratings"
SESSION = Cluster(['127.0.0.1'], port=9042).connect()

client.create_keyspace(SESSION, KEYSPACE)
client.create_table(SESSION, KEYSPACE, TABLE)

SESSION.set_keyspace(KEYSPACE)
SESSION.row_factory = dict_factory
RATING_QUERY = SESSION.prepare(f"SELECT * FROM {TABLE} WHERE user_id=?")
DELETE_RATING_QUERY = SESSION.prepare(f"DELETE FROM {TABLE} WHERE user_id=?")


def get(user_id: str) -> dict:
    user = SESSION.execute(RATING_QUERY, [user_id])
    if user:
        return user[0]
    return {}


def push(rating: dict):
    client.push_table(SESSION, KEYSPACE, TABLE, rating)


def list() -> list:
    return client.list_table(SESSION, KEYSPACE, TABLE)
# Example 16
def connect():
    global session, prepared
    session = Cluster().connect()
    prepared = session.prepare("SELECT release_version FROM system.local WHERE key=?")
class CassandraESSync:
	
	mappings = []

	cassandra_nodes = []
	cassandra_keyspace = None
	cassandra_cfs = []

	es_nodes = []
	es_index = None
	es_types = []

	cassandra_client = None 
	es_client = None



	last_synced = {'cassandra': {}, 'es': {} }  # stores the last time each cf and index were synced, to avoid unnecessary queries


	def __init__(self, config):

		self.mappings = config["mappings"]

		self.cassandra_nodes = config['cassandra']['nodes']
		self.cassandra_keyspace = config['cassandra']['keyspace']
		self.cassandra_cfs = config['cassandra']['column_families']

		self.es_nodes = config['elasticsearch']['nodes']
		self.es_index = config['elasticsearch']['index']
		self.es_types = config['elasticsearch']['types']

		self.cassandra_client = Cluster().connect(self.cassandra_keyspace)
		self.es_client = elasticsearch.Elasticsearch()


	def sync_databases(self):
		for mapping in self.mappings:
			cassandra_cf, es_type = mapping
			self.sync_cf_type(cassandra_cf, es_type)


	def sync_cf_type(self, cassandra_cf, es_type):


		cf_id_column = self.cassandra_cfs[cassandra_cf]['id'] # column storing the document's uid
		cf_timestamp_column = self.cassandra_cfs[cassandra_cf]['timestamp'] # column storing the document's timestamp

		index_id_column = self.es_types[es_type]['id'] # column storing the document's timestamp
		index_timestamp_column = self.es_types[es_type]['timestamp'] # column storing the document's timestamp

		cf_data_fields = self.cassandra_cfs[cassandra_cf]['columns']
		cf_fields = [cf_id_column, cf_timestamp_column] + cf_data_fields

		type_data_fields = self.es_types[es_type]['columns']

		if cassandra_cf in self.last_synced['cassandra']:
			cf_start_time, cf_end_time = self.last_synced['cassandra'][cassandra_cf], time.time()  
		else:
			cf_start_time, cf_end_time = None, None

		if es_type in self.last_synced['es']:
			index_start_time, index_end_time = self.last_synced['es'][es_type], time.time()
		else:
			index_start_time, index_end_time = None, None

		cassandra_data_query = 'SELECT %s, %s FROM %s' % (cf_id_column, cf_timestamp_column, cassandra_cf)
		
		range_filter = {}
		if index_start_time and index_end_time:
			range_filter = self.get_es_range_filter(index_timestamp_column, index_start_time, index_end_time)
		

		self.cassandra_client.set_keyspace(self.cassandra_keyspace)
		cassandra_data = self.cassandra_client.execute(cassandra_data_query)
		self.last_synced['cassandra'][cassandra_cf] = time.time()

		es_data = [] #self.es_client.search(index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], body=range_filter)
		es_scan = scan(self.es_client, index=self.es_index, doc_type=es_type, fields=[index_id_column, index_timestamp_column], query=range_filter)
		self.last_synced['es'][es_type] = time.time()

		for data in es_scan:
			es_data.append(data)

		all_data = {}

		ids_to_insert_on_cassandra = []
		ids_to_update_on_cassandra = []

		ids_to_insert_on_es = []
		ids_to_update_on_es = []


		# because we can't make a range query on a non-primary key on cassandra, we have to retrieve it all, and then check for the timestamp by hand.
		for document in cassandra_data:
			doc_id, doc_timestamp = str(document[0]), int(calendar.timegm(document[1].utctimetuple()))
			if not(cf_start_time and cf_end_time):
				all_data[doc_id] = [doc_timestamp, None]
			elif cf_start_time and cf_end_time and doc_timestamp >= cf_start_time and doc_timestamp <= cf_end_time: # 
				all_data[doc_id] = [doc_timestamp, None]

		for document in es_data:
			if "fields" in document:
				if index_id_column == '_id': # special case - is not inside fields. there must be a better way to do this ;(
					doc_id, doc_timestamp = document[index_id_column], int(document['fields'][index_timestamp_column][0])
				else:
					doc_id, doc_timestamp = document['fields'][index_id_column], int(document['fields'][index_timestamp_column][0])

				if doc_id in all_data:
					all_data[doc_id][1] = doc_timestamp
				else:
					all_data[doc_id] = [None, doc_timestamp]

		
		for uid in all_data:
			cassandra_ts, es_ts = all_data[uid]
			if cassandra_ts and es_ts:
				if cassandra_ts > es_ts: # same id, cassandra is the most recent. update that data on es. 
					ids_to_update_on_es.append(uid)
				elif es_ts > cassandra_ts: # same id, es is the most recent. update that data on cassandra.
					ids_to_update_on_cassandra.append(uid)
			elif cassandra_ts: # present only on cassandra. add to es.
				ids_to_insert_on_es.append(uid)
			elif es_ts: #present only on es. add to cassandra.

				ids_to_insert_on_cassandra.append(uid)


		if ids_to_insert_on_es or ids_to_update_on_es:
			actions = []
			from_cassandra_to_es = self.get_cassandra_documents_by_id(cassandra_cf, cf_fields, cf_id_column, ids_to_insert_on_es + ids_to_update_on_es)

			for document in from_cassandra_to_es:
				data = {}
				for i in range(len(cf_data_fields)):
					data[type_data_fields[i]] = getattr(document, cf_data_fields[i]) 	

				actions.append(self.get_es_bulk_action(es_type, index_id_column, getattr(document, cf_id_column), index_timestamp_column, getattr(document, cf_timestamp_column), data))
			
			bulk(self.es_client, actions) # send all inserts/updates to es at once

		if ids_to_insert_on_cassandra or ids_to_update_on_cassandra:
			batch = BatchStatement()

			type_fields = type_data_fields + [index_id_column, index_timestamp_column]
			ids_filter = self.get_es_ids_filter(es_type, ids_to_insert_on_cassandra + ids_to_update_on_cassandra)
			from_es_to_cassandra = self.es_client.search(index=self.es_index, doc_type=es_type, fields=type_data_fields + [cf_timestamp_column], body=ids_filter)
			
			for document in from_es_to_cassandra['hits']['hits']:
				
				id_value = document[index_id_column] if index_id_column == '_id' else document["fields"][index_id_column] # this makes me a saaaad panda
				id_value = id_value
				
				es_data = [UUID(id_value), datetime.datetime.utcfromtimestamp(int(document['fields'][index_timestamp_column][0]))]
				
				for field in type_data_fields:
					es_data.append(document['fields'][field][0])

				prepared_insert_statement = self.get_prepared_cassandra_insert_statement(cassandra_cf, cf_fields)
				prepared_update_statement = self.get_prepared_cassandra_update_statement(cassandra_cf, cf_id_column, cf_fields[1:])
				
				if id_value in ids_to_insert_on_cassandra:
					batch.add(prepared_insert_statement, tuple(es_data))
				else: 
					batch.add(prepared_update_statement, tuple(es_data[1:] + [UUID(id_value)]))

			
			self.cassandra_client.execute(batch)


	def get_cassandra_documents_by_id(self, cf, fields, id_column, ids):
		from_cassandra_to_es_query = "SELECT %s FROM %s WHERE %s IN (%s)" % (",".join(fields), cf, id_column, ",".join(ids))
		return self.cassandra_client.execute(from_cassandra_to_es_query)

	def get_prepared_cassandra_insert_statement(self, cf, cols):
		base_insert_statement = "INSERT INTO %s (%s) VALUES (%s)" % (cf, ",".join(cols), ",".join(["?"] * len(cols)))	
		return self.cassandra_client.prepare(base_insert_statement)

	def get_prepared_cassandra_update_statement(self, cf, id_column, fields):
		base_update_statement = "UPDATE %s SET %s WHERE %s = ?" % (cf, ",".join([field + " = ?" for field in fields]), id_column)
		return self.cassandra_client.prepare(base_update_statement)
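
	# For example, with cf="users", cols=["id", "ts", "name"] and id_column="id",
	# the two helpers above prepare:
	#   INSERT INTO users (id,ts,name) VALUES (?,?,?)
	#   UPDATE users SET ts = ?,name = ? WHERE id = ?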

	def get_es_range_filter(self, col, start, end):
		return {
				"filter": {
					"range" : {
						col: {
			        		"gte" : start,
			        		"lte" : end
			        	}
			    	}
				}
			}

	def get_es_ids_filter(self, es_type, ids):
		return {
				"filter": {
					"ids" : {
			        	"type" : es_type,
			        	"values": ids 
			    	}
				}
			}

	def get_es_bulk_action(self, es_type, id_column, id_value, timestamp_column, timestamp_value, data):

		if isinstance(timestamp_value, datetime.datetime):
			timestamp_value = calendar.timegm(timestamp_value.utctimetuple())

		id_value = str(id_value)
		timestamp_value = str(timestamp_value)

		data[timestamp_column] = timestamp_value

		action = {}
		action['_index'] = self.es_index
		action['_type'] = es_type 
		action[id_column] = id_value
		action['_source'] = data

		return action
# Example 18
class Loader:

    start = 0

    def __init__(self,name):
        # set log
        logging.getLogger("elasticsearch").setLevel(logging.WARNING)
        self.log = getLogger('dse', 'dse.log')
        # start timer
        Loader.start = datetime.now()

        self.index = name
        self.stat = Statement(name)
        self.filename = mapper[name]['load']
        
        self.pool = ThreadPoolExecutorWithQueueSizeLimit(max_workers=10, queue_size=30)

        self.__init_dse()
        


    # init dse, create keyspace and table, prepare cql and set consistency level
    def __init_dse(self):
        self.session = Cluster(DSE_IP).connect()

        self.session.execute(self.stat.create_ks)
        self.session.execute(self.stat.create_tb)

        self.cql_prepared = self.session.prepare(self.stat.cql)
        self.cql_prepared.consistency_level = ConsistencyLevel.LOCAL_QUORUM

    def load(self):

        if self.index == 'reddit': 
            g = self.__line_generator()
            while True:
                try:
                    line = json.loads(next(g))
                    self.pool.submit(self.__insert_data,line)
                except StopIteration:
                    break
                except Exception:
                    continue 
        elif self.index == 'amazon':
            df = pd.read_csv(self.filename)
            
            end = df.index.max()
            try:
                for line in zip(range(0,end+1),df['userId'],df['productId'],
                                    df['rating'],df['title'],df['comment'],df['timestamp']):
                    self.pool.submit(self.__insert_data,line)
            except Exception as e:
                print(e)

        # shutdown
        self.pool.shutdown() 

        # create search index
        end = datetime.now() 
        self.log.info('Insert data cost time: {}'.format(end - Loader.start))
        self.log.info('create search index')
        self.session.execute(self.stat.create_index, timeout=60000)

        self.session.shutdown()

        self.log.info('## Session closed')

        # print information
        self.log.info('Create search index cost time: {}'.format(datetime.now() - end))
        self.log.info('Total cost time: {}'.format(datetime.now() - Loader.start))
        self.log.info('## Inserts completed')

    def __line_generator(self):

        with open(self.filename, 'r', encoding='utf-8') as f:
            for counter,line in enumerate(f):
                try:
                    if counter%100000 == 0:
                        self.log.info(str(counter))
                        if counter%1000000 == 0:
                            self.log.info('{}'.format(datetime.now()-Loader.start))
                            self.log.info('')
                    yield line
                except Exception as e:
                    self.log.warn(e)
                    continue



       
    def __insert_data(self, line):
        if self.index == 'reddit':
            data = [line['id'], line['name'], line['link_id'], line['parent_id'],
                    line['subreddit_id'],line['author'], line['body'], int(line['created_utc'])]
        elif self.index == 'amazon':
            pass

        res = self.session.execute(self.cql_prepared,data)
        return