Example #1
def cassandra_connect(seed, keyspace=None):
    from cassandra.cluster import Cluster
    from cassandra.io.libevreactor import LibevConnection

    cluster = Cluster([seed])  # use the given seed node as the contact point
    cluster.connection_class = LibevConnection
    return cluster, cluster.connect(keyspace)
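Note that connection_class must be assigned before connect() is called; once the cluster has established its control connection, changing the reactor has no effect. LibevConnection is also only available when the driver was built with libev support, which is why Examples #4 and #7 below guard the import and fall back to the driver's default reactor.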
Example #2
def benchmark(run_fn):
    for conn_class in supported_reactors:
        setup()
        log.info("==== %s ====" % (conn_class.__name__,))

        cluster = Cluster(['127.0.0.1'])
        cluster.connection_class = conn_class
        session = cluster.connect(KEYSPACE)

        log.debug("Sleeping for two seconds...")
        time.sleep(2.0)

        query = SimpleStatement("""
            INSERT INTO {table} (thekey, col1, col2)
            VALUES (%(key)s, %(a)s, %(b)s)
            """.format(table=TABLE))
        values = {'key': 'key', 'a': 'a', 'b': 'b'}

        log.debug("Beginning inserts...")
        start = time.time()
        try:
            run_fn(session, query, values, NUM_QUERIES)
            end = time.time()
        finally:
            teardown()

        total = end - start
        log.info("Total time: %0.2fs" % total)
        log.info("Average throughput: %0.2f/sec" % (NUM_QUERIES / total))
Example #3
def get_session():
    '''
    Connect to a Cassandra cluster with the driver.
    :return: A Cassandra session object for cluster interactions.
    '''

    # grab the cluster information from the Docker-provided environment variables
    CASSANDRA_HOST = os.environ['CASSANDRA_HOST']
    CASSANDRA_DC = os.environ['CASSANDRA_DC']

    # create a cluster object that will only connect to a single data center
    cluster = Cluster(
        [CASSANDRA_HOST],
        load_balancing_policy=DCAwareRoundRobinPolicy(local_dc=CASSANDRA_DC),
    )

    # use the faster event loop provider
    cluster.connection_class = LibevConnection

    # create the Cassandra session for cluster interaction
    session = cluster.connect()

    # pandas row factory: materialize result rows as a DataFrame
    def pandas_factory(colnames, rows):
        return pd.DataFrame(rows, columns=colnames)

    # use pandas-friendly settings: DataFrame rows, no paging
    session.row_factory = pandas_factory
    session.default_fetch_size = None

    return session
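With the pandas row factory in place, query results can be turned straight into a DataFrame; a small usage sketch (the table name is hypothetical, and _current_rows is the same driver attribute Example #16 relies on):

session = get_session()
result = session.execute("SELECT * FROM my_keyspace.my_table")  # hypothetical table
df = result._current_rows  # a pandas DataFrame thanks to the row factory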
Example #4
def get_cluster():
    cluster = Cluster(*Connection.cluster_args, **Connection.cluster_kwargs)
    try:
        from cassandra.io.libevreactor import LibevConnection
        cluster.connection_class = LibevConnection
    except ImportError:
        pass
    return cluster
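If the driver was built without libev support, the ImportError is swallowed and the cluster simply keeps the driver's default reactor, so this helper works in both environments.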
Example #5
    def execute(self, return_type='dict', renderOnly=False):
        rendered = RenderManagers().render(self)
        if renderOnly:
            return rendered

        cluster = Cluster()
        cluster.connection_class = LibevConnection
        connection = cluster.connect()
        # the fallback must be the factory callable itself, not its name
        connection.row_factory = getattr(decoder, return_type + '_factory',
                                         decoder.dict_factory)

        return connection.execute(rendered)
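Because the factory is looked up by name, return_type can be any of the row-factory prefixes shipped with the driver: 'dict', 'ordered_dict', 'named_tuple', or 'tuple'.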
Example #6
def get_cluster():
    cluster = Cluster(*Connection.cluster_args, **Connection.cluster_kwargs)
    cluster.default_retry_policy = FeedlyRetryPolicy(
        max_read_retries=settings.FEEDLY_CASSANDRA_READ_RETRY_ATTEMPTS,
        max_write_retries=settings.FEEDLY_CASSANDRA_WRITE_RETRY_ATTEMPTS)
    try:
        from cassandra.io.libevreactor import LibevConnection
        cluster.connection_class = LibevConnection
    except ImportError:
        pass
    return cluster
Example #7
def cassandra_connect(seed=None, keyspace=None):
    from cassandra.cluster import Cluster

    if seed is None:
        args = ()
    else:
        args = ([seed],)

    cluster = Cluster(*args)
    try:
        from cassandra.io.libevreactor import LibevConnection
        cluster.connection_class = LibevConnection
    except ImportError:
        pass

    return cluster, cluster.connect(keyspace)
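Usage is a one-liner; the contact point and keyspace below are placeholders:

cluster, session = cassandra_connect(seed='127.0.0.1', keyspace='my_keyspace')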
Example #8
    def run(self):
        # Connect to Cassandra
        cluster = Cluster()
        cluster.connection_class = LibevConnection
        session = cluster.connect()
        # Connect to Kafka
        consumer = KafkaConsumer(bootstrap_servers='192.168.65.111:1026',
                                 auto_offset_reset='earliest')
        consumer.subscribe(['time'])
        for message in consumer:
            # the JSON payload is in message.value, not the ConsumerRecord itself
            record = ujson.loads(message.value)
            record_timestamp = self.add_timestamp(record)
            record_timestamp_json = ujson.dumps(record_timestamp)
            # bind the JSON document as a parameter so it is quoted correctly
            session.execute("INSERT INTO dev.time JSON %s",
                            [record_timestamp_json])
            print(message)
Example #9
def get_session():
    '''
    Connect to a Cassandra cluster with the driver.
    :return: A Cassandra session object for cluster interactions.
    '''

    # grab the cluster information from the Docker-provided environment variables
    CASSANDRA_HOST = os.environ['CASSANDRA_HOST']
    CASSANDRA_DC = os.environ['CASSANDRA_DC']

    # create a cluster object that will only connect to a single data center
    cluster = Cluster(
        [CASSANDRA_HOST],
        load_balancing_policy=DCAwareRoundRobinPolicy(local_dc=CASSANDRA_DC),
    )

    # use the faster event loop provider
    cluster.connection_class = LibevConnection

    # create the Cassandra session for cluster interaction
    session = cluster.connect()
    return session
Example #10
def connect(seeds, keyspace, datacenter=None, port=9042):
    from cassandra.io.libevreactor import LibevConnection
    from cassandra.cluster import Cluster
    from cassandra.policies import DCAwareRoundRobinPolicy, RetryPolicy, ExponentialReconnectionPolicy

    class CustomRetryPolicy(RetryPolicy):

        def on_write_timeout(self, query, consistency, write_type,
                             required_responses, received_responses, retry_num):

            # retry at most 5 times regardless of query type
            if retry_num >= 5:
                return (self.RETHROW, None)

            return (self.RETRY, consistency)


    load_balancing_policy = None
    if datacenter:
        # If you are using multiple datacenters it's important to use
        # the DCAwareRoundRobinPolicy. If not then the client will
        # make cross DC connections. This defaults to round robin
        # which means round robin across all nodes irrespective of
        # data center.
        load_balancing_policy = DCAwareRoundRobinPolicy(local_dc=datacenter)

    cluster = Cluster(contact_points=seeds,
                      port=port,
                      default_retry_policy=CustomRetryPolicy(),
                      reconnection_policy=ExponentialReconnectionPolicy(1, 60),
                      load_balancing_policy=load_balancing_policy,
                      protocol_version=3)

    cluster.connection_class = LibevConnection
    cluster.control_connection_timeout = 10.0
    cluster.compression = False
    session = cluster.connect(keyspace)
    print('Connection established with seed(s): %s at port: %s and keyspace: %s'
          % (seeds, port, keyspace))
    return session
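A usage sketch with placeholder seeds, keyspace, and data center name:

session = connect(['10.0.0.1', '10.0.0.2'], 'my_keyspace', datacenter='dc1')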
Example #11
"""

def parse(path):
    g = open(path, 'r')
    for l in g:
        yield eval(l)

if __name__ == '__main__':
    f = open('config.txt', 'r')
    contact_points = f.readline().strip()  # drop the trailing newline
    meta_path = sys.argv[1]
    geo_path = sys.argv[2]
    input_path = sys.argv[1]
    cluster = Cluster([contact_points])
    cluster.connection_class = LibevConnection
    session = cluster.connect()
    # session.execute(META_CF_DROP_STATEMENT)
    # session.execute(RANK_CF_DROP_STATEMENT)
    session.execute(KS_CREATION_STATEMENT)
    session.execute(META_CF_CREATION_STATEMENT)
    session.execute(RANK_CF_CREATION_STATEMENT)

    meta_prepared = session.prepare(META_INSERT_STATEMENT)
    rank_prepared = session.prepare(RANK_INSERT_STATEMENT)

    for data in parse(input_path):
        asin = data['asin']
        title = data.get('title', "")
        imurl = data.get('imUrl', "")
        price = data.get('price', 0.0)
        # presumably the remainder of the loop inserted the extracted fields
        # via the prepared statements; the original snippet is truncated here
        session.execute(meta_prepared, (asin, title, imurl, price))
Example #12
'''path_test = 'test/negative_Test'
obj3=open(path_test,"r")
str3 = obj3.read()
files_test = str3.split("\n\n")
obj3.close()
print files_test'''
start_time = time.time()
data_test = []
data_dict = {}
data_neg = []
result = []
cluster = Cluster()
cluster.connection_class = LibevConnection
session = cluster.connect('getfb')
rows = session.execute('SELECT * FROM getdata')

for row in rows:
    data_test.append(row.comment.encode('utf8'))
    id_comment = row.id_comment
    comment = row.comment
    tmp_dict = {id_comment.encode('utf8'): comment.encode('utf8')}
    data_dict.update(tmp_dict)

count_neg = 0
count_pos = 0
if data_test:
    for val in data_test:
        # print(val)
        pass  # the rest of this snippet is truncated in the source
Example #13
def benchmark(thread_class):
    options, args = parse_options()
    for conn_class in options.supported_reactors:
        setup(options.hosts)
        log.info("==== %s ====" % (conn_class.__name__,))

        cluster = Cluster(options.hosts, metrics_enabled=options.enable_metrics)
        cluster.connection_class = conn_class
        session = cluster.connect(KEYSPACE)

        log.debug("Sleeping for two seconds...")
        time.sleep(2.0)

        query = SimpleStatement(
            """
            INSERT INTO {table} (thekey, col1, col2)
            VALUES (%(key)s, %(a)s, %(b)s)
            """.format(
                table=TABLE
            )
        )
        values = {"key": "key", "a": "a", "b": "b"}

        per_thread = options.num_ops // options.threads  # integer ops per thread
        threads = []

        log.debug("Beginning inserts...")
        start = time.time()
        try:
            for i in range(options.threads):
                thread = thread_class(i, session, query, values, per_thread, options.profile)
                thread.daemon = True
                threads.append(thread)

            for thread in threads:
                thread.start()

            for thread in threads:
                while thread.is_alive():
                    thread.join(timeout=0.5)

            end = time.time()
        finally:
            teardown(options.hosts)

        total = end - start
        log.info("Total time: %0.2fs" % total)
        log.info("Average throughput: %0.2f/sec" % (options.num_ops / total))
        if options.enable_metrics:
            stats = scales.getStats()["cassandra"]
            log.info("Connection errors: %d", stats["connection_errors"])
            log.info("Write timeouts: %d", stats["write_timeouts"])
            log.info("Read timeouts: %d", stats["read_timeouts"])
            log.info("Unavailables: %d", stats["unavailables"])
            log.info("Other errors: %d", stats["other_errors"])
            log.info("Retries: %d", stats["retries"])

            request_timer = stats["request_timer"]
            log.info("Request latencies:")
            log.info("  min: %0.4fs", request_timer["min"])
            log.info("  max: %0.4fs", request_timer["max"])
            log.info("  mean: %0.4fs", request_timer["mean"])
            log.info("  stddev: %0.4fs", request_timer["stddev"])
            log.info("  median: %0.4fs", request_timer["median"])
            log.info("  75th: %0.4fs", request_timer["75percentile"])
            log.info("  95th: %0.4fs", request_timer["95percentile"])
            log.info("  98th: %0.4fs", request_timer["98percentile"])
            log.info("  99th: %0.4fs", request_timer["99percentile"])
            log.info("  99.9th: %0.4fs", request_timer["999percentile"])
Example #14
def main():
    KEYSPACE = "cryptos_keyspace"

    assets = [
        'bitcoin', "ethereum", "tether", "xrp", "litecoin", "cardano", "iota",
        "eos", "stellar"
    ]  # missing that bastard bitcoin-cash

    cluster = Cluster()
    cluster.connection_class = LibevConnection
    session = cluster.connect()

    session.execute("DROP KEYSPACE " + KEYSPACE)

    session.execute("""
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '2' }
            """ % KEYSPACE)

    session.set_keyspace(KEYSPACE)

    for currency in assets:

        print("Creating table " + currency + " in cassandra keyspace " +
              KEYSPACE + " . . .")

        # creating the table with the following specs
        create_table_command = """
                CREATE TABLE IF NOT EXISTS {} (
                    ts text,
                    price text,
                    date text,
                    hour text,
                    yhat text,
                    yhat_lower text,
                    yhat_upper text,
                    PRIMARY KEY (ts, date)
                )""".format(currency)

        session.execute(create_table_command)

        print("Table created. Now filling it with historical data . . .")

        command = "INSERT INTO {} (ts, price, date, hour, yhat, yhat_lower, yhat_upper) VALUES (?, ?, ?, ?, ?, ?, ?)".format(
            currency)

        prepared = session.prepare(command)

        folder_path = "../data/history/" + currency + "/only_days_" + currency + ".csv"

        df = pd.read_csv(folder_path)

        for i in range(len(df["timestamp"])):
            row = df.iloc[i]
            session.execute(prepared, ("{}".format(
                row["timestamp"]), "{}".format(row["price"]), "{}".format(
                    row["date"]), "{}".format(row["hour"]), "?", "?", "?"))

        print("Table " + currency + " filled.\n")

    print("Setup CASSANDRA completed!")
Example #15
def benchmark(thread_class):
    options, args = parse_options()
    for conn_class in options.supported_reactors:
        setup(options.hosts)
        log.info("==== %s ====" % (conn_class.__name__, ))

        cluster = Cluster(options.hosts,
                          metrics_enabled=options.enable_metrics)
        cluster.connection_class = conn_class
        session = cluster.connect(KEYSPACE)

        log.debug("Sleeping for two seconds...")
        time.sleep(2.0)

        query = session.prepare("""
            INSERT INTO {table} (thekey, col1, col2) VALUES (?, ?, ?)
            """.format(table=TABLE))
        values = ('key', 'a', 'b')

        per_thread = options.num_ops // options.threads
        threads = []

        log.debug("Beginning inserts...")
        start = time.time()
        try:
            for i in range(options.threads):
                thread = thread_class(i, session, query, values, per_thread,
                                      options.profile)
                thread.daemon = True
                threads.append(thread)

            for thread in threads:
                thread.start()

            for thread in threads:
                while thread.is_alive():
                    thread.join(timeout=0.5)

            end = time.time()
        finally:
            teardown(options.hosts)

        total = end - start
        log.info("Total time: %0.2fs" % total)
        log.info("Average throughput: %0.2f/sec" % (options.num_ops / total))
        if options.enable_metrics:
            stats = scales.getStats()['cassandra']
            log.info("Connection errors: %d", stats['connection_errors'])
            log.info("Write timeouts: %d", stats['write_timeouts'])
            log.info("Read timeouts: %d", stats['read_timeouts'])
            log.info("Unavailables: %d", stats['unavailables'])
            log.info("Other errors: %d", stats['other_errors'])
            log.info("Retries: %d", stats['retries'])

            request_timer = stats['request_timer']
            log.info("Request latencies:")
            log.info("  min: %0.4fs", request_timer['min'])
            log.info("  max: %0.4fs", request_timer['max'])
            log.info("  mean: %0.4fs", request_timer['mean'])
            log.info("  stddev: %0.4fs", request_timer['stddev'])
            log.info("  median: %0.4fs", request_timer['median'])
            log.info("  75th: %0.4fs", request_timer['75percentile'])
            log.info("  95th: %0.4fs", request_timer['95percentile'])
            log.info("  98th: %0.4fs", request_timer['98percentile'])
            log.info("  99th: %0.4fs", request_timer['99percentile'])
            log.info("  99.9th: %0.4fs", request_timer['999percentile'])
Example #16
def get_updated_df(currency):
    """
    This function gets in INPUT the name of the crypto the consumer is dealing with.
    It checks if the Cassandra DB is up to date w.r.t. to the current day and eventually updates the DB.
    Then it RETURNS the updated DB as a Pandas DataFrame to the consumer.
    NB: the Cassandra DB will be updated until the day before today (YESTERDAY), it's duty of the consumer
        reading from Kafka the current price of the crypto.
    """

    KEYSPACE = "cryptos_keyspace"

    # opening cassandra to retrieve the current df
    cluster = Cluster()
    cluster.connection_class = LibevConnection
    session = cluster.connect()
    session.set_keyspace(KEYSPACE)

    session.row_factory = pandas_factory
    session.default_fetch_size = None

    query = "SELECT * FROM {}.{};".format(KEYSPACE, currency)

    result = session.execute(query, timeout=None)
    df = result._current_rows

    # sort the df by timestamp, so that the first row is the last entry
    df = df.sort_values(by=["ts"], ascending=False)

    # getting only the last information stored in Cassandra
    last_entry_cassandra = df.iloc[0]
    last_day = last_entry_cassandra["date"]
    last_ts = last_entry_cassandra["ts"]
    last_hour = last_entry_cassandra["hour"]

    index_last_entry = df[df["ts"] == last_ts].index.values

    #print(index_last_entry)
    #print(len(df.index))
    #print(df)

    timestamp_curr = int(time.time()) * 1000  # to get milliseconds

    if int(last_ts) < timestamp_curr:
        # now get the actual price of the currency from the API
        url = "https://api.coincap.io/v2/assets/" + currency + "/history?interval=d1"
        start = "&start=" + str(last_ts)
        end = "&end=" + str(timestamp_curr)
        final_url = url + start + end

        response = req.get(final_url).json()

        # prepare the insert once instead of re-preparing it for every data point
        command = ("INSERT INTO {} (ts, price, date, hour, yhat, yhat_lower, "
                   "yhat_upper) VALUES (?, ?, ?, ?, ?, ?, ?)").format(currency)
        prepared = session.prepare(command)

        for data_point in response["data"]:
            timestamp = data_point["time"]
            date = data_point["date"].replace(".000Z", "")
            split_date = date.split("T")
            date = split_date[0]
            hour = split_date[1]
            priceUsd = data_point["priceUsd"]

            if int(timestamp) > int(last_ts):
                df.loc[len(df.index)] = [
                    timestamp, date, hour, priceUsd, "?", "?", "?"
                ]
                session.execute(
                    prepared,
                    ("{}".format(timestamp), "{}".format(priceUsd),
                     "{}".format(date), "{}".format(hour), "?", "?", "?"))

    return df.tail(len(df.index) - 8)
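A usage sketch, assuming the per-asset tables created in Example #14 exist (asset names follow the CoinCap ids used there):

df = get_updated_df('bitcoin')
print(df.tail())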