Example #1
def main(input_dir, keyspace, table):
    from cassandra.cluster import Cluster
    cluster = Cluster(['199.60.17.32', '199.60.17.65'])
    session = cluster.connect(keyspace)
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    query = "INSERT INTO %s(host,id,datetime,path,bytes) VALUES (?,?,?,?,?)" % (
        table)
    insert_data = session.prepare(query)
    counter = 0

    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                server_logs_dis = disassemble(line)
                if len(server_logs_dis) == 6:
                    datetime_object = datetime.strptime(
                        server_logs_dis[2], '%d/%b/%Y:%H:%M:%S')
                    batch.add(
                        insert_data,
                        (server_logs_dis[1], uuid.uuid1(), datetime_object,
                         server_logs_dis[3], int(server_logs_dis[4])))
                    counter = counter + 1
                if counter == 200:
                    session.execute(batch)
                    batch.clear()
                    counter = 0

    # flush the statements left in the final partial batch
    session.execute(batch)
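Nearly every example on this page follows the same cycle: add rows to a BatchStatement until it reaches a size threshold, execute it, clear() it so the same object can be reused, and flush whatever is left at the end. A minimal, self-contained sketch of that cycle; the contact point, the demo keyspace, the kv table, and the inline row source are placeholders, not taken from any example above:

from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement

cluster = Cluster(['127.0.0.1'])                      # assumed contact point
session = cluster.connect('demo')                     # hypothetical keyspace
insert = session.prepare("INSERT INTO kv (k, v) VALUES (?, ?)")  # hypothetical table

batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
count = 0
for k, v in enumerate(['a', 'b', 'c']):               # stand-in row source
    batch.add(insert, (k, v))
    count += 1
    if count == 200:                                  # execute once the batch is full
        session.execute(batch)
        batch.clear()                                 # reuse the same BatchStatement
        count = 0

if count > 0:                                         # flush the final partial batch
    session.execute(batch)
cluster.shutdown()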
Example #2
def putInCassandra(s3file, cassandraConnection):
    session = getSession(cassandraConnection)
    measures_by_date = session.prepare(
        "INSERT INTO measures_by_date (date, timestamp, measureReference, meta, value) VALUES (?, ?, ?, ?, ?)"
    )
    measures_by_measurereference = session.prepare(
        "INSERT INTO measures_by_measurereference (date, timestamp, measureReference, meta, value) VALUES (?, ?, ?, ?, ?)"
    )
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    batch_number = 10
    print("Opening File")
    count = 0
    with gzip.open(s3file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if (count >= batch_number):
                session.execute(batch)
                batch.clear()
                count = 0
            batch.add(measures_by_measurereference, createRow(row))
            batch.add(measures_by_date, createRow(row))
            count += 2
        session.execute(batch)
        batch.clear()
    print("Done")
Example #3
def main(inputs, keyspace, table):
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(keyspace)
    batch = BatchStatement()
    insertLog = session.prepare(
        "INSERT INTO " + table +
        " (id, host, datetime, path, bytes) VALUES (?, ?, ?, ?, ?)")

    count = 0
    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                fields = getFields(line)

                if fields:
                    batch.add(insertLog, (fields[0], fields[1], fields[2],
                                          fields[3], fields[4]))
                    count += 1

                if count > 300:
                    session.execute(batch)
                    batch.clear()
                    count = 0

    if count > 0:
        session.execute(batch)
Example #4
def main(inputs, output, keyspace):
    # main logic starts here
    count = 1
    add_query = session.prepare(
        "INSERT INTO " + output +
        "(host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    batch = BatchStatement(consistency_level=1)
    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                list_values = splitline(line)
                if list_values is not None:
                    hostname = list_values[0]
                    date = list_values[1]
                    path = list_values[2]
                    bytes = list_values[3]
                    id = list_values[4]
                    count = count + 1
                    batch.add(add_query, (hostname, date, path, bytes))
                    if count == 300:
                        session.execute(batch)
                        batch.clear()
                        count = 1
            session.execute(batch)
            batch.clear()
Example #5
def insert_data(session):
    """
    Insert data into the database.

    arguments:
    session - the Session object used to execute the KEYSPACE creation statement
    """
    df = get_dataframe()
    session_cols, song_cols, user_cols = get_table_col_idxs(df)

    batch = BatchStatement(batch_type=BatchType.UNLOGGED)
    batch_execute_at = 500
    stmt_count = 0

    for _, row in df.iterrows():
        # let's do the inserts
        #
        # we get the table specific data from the row by mapping the
        # __getitem__ function of the row values to the table column
        # indices.
        batch.add(queries['session_library']['insert'],
                  tuple(map(row.values.__getitem__, session_cols)))
        batch.add(queries['song_library']['insert'],
                  tuple(map(row.values.__getitem__, song_cols)))
        batch.add(queries['user_library']['insert'],
                  tuple(map(row.values.__getitem__, user_cols)))
        stmt_count += 3

        if stmt_count > batch_execute_at:
            try:
                session.execute(batch)
                batch.clear()
            except Exception as e:
                print(e)
            stmt_count = 0

    # flush the statements remaining in the final partial batch
    if stmt_count > 0:
        session.execute(batch)
        batch.clear()
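The tuple(map(row.values.__getitem__, ...)) idiom above simply picks out the row's values at the given column positions. A small offline illustration with made-up columns and indices (none of these names come from the example):

import pandas as pd

df = pd.DataFrame([[42, 'ts', 'Song A', 'User B', 3.5]],
                  columns=['sessionId', 'ts', 'song', 'user', 'length'])
row = next(df.iterrows())[1]          # one row, as yielded by df.iterrows() above

session_cols = [0, 1, 4]              # hypothetical column indices for one table
params = tuple(map(row.values.__getitem__, session_cols))
print(params)                         # (42, 'ts', 3.5) -> positional bind values for batch.add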
Example #6
    def insert(cls, params):

        idx_start, idx_end = params

        batch_size = 500
        batch_stmt = BatchStatement()

        for index in range(idx_start, idx_end, batch_size):

            curr_batch_size = min(batch_size, idx_end - index)
            for i in range(0, curr_batch_size):
                tx = blocksci.Tx(index + i, cls.chain)
                batch_stmt.add(cls.prepared_stmt, tx_summary(tx))

            try:
                cls.session.execute(batch_stmt)
            except Exception as e:
                # ingest single transactions if batch ingest fails
                # (batch too large error)
                print(e)
                for i in range(0, curr_batch_size):
                    while True:
                        try:
                            tx = blocksci.Tx(index + i, cls.chain)
                            cls.session.execute(cls.prepared_stmt,
                                                tx_summary(tx))
                        except Exception as e:
                            print(e)
                            continue
                        break
            batch_stmt.clear()

            with cls.counter.get_lock():
                cls.counter.value += curr_batch_size
            print('#tx {:,.0f}'.format(cls.counter.value), end='\r')
Example #7
def main(inputs, table):
    create_Table()
    session.execute("""TRUNCATE nasalogs;""")
    insert_log = session.prepare(
        "INSERT INTO " + table +
        " (host,datetime,path,bytes,id) VALUES (?,?,?,?,?)")
    batch = BatchStatement(consistency_level=ConsistencyLevel.ONE)
    c = 0
    for g_file in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, g_file), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                w = get_words(line)
                if len(w) > 4:
                    c += 1
                    batch.add(
                        insert_log,
                        (w[1],
                         datetime.datetime.strptime(w[2], '%d/%b/%Y:%H:%M:%S'),
                         w[3], int(w[4]), uid()))
                if (c == 400):
                    session.execute(batch)
                    batch.clear()
                    c = 0

    session.execute(batch)
    cluster.shutdown()
Example #8
def main(input_dir, keyspace, table):
    cluster = Cluster(['199.60.17.32', '199.0.17.65'])
    session = cluster.connect(keyspace)
    insert_log = session.prepare(
        'INSERT INTO ' + table +
        ' (id,host,datetime,path,bytes) VALUES (?, ?, ?, ?, ?)')
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    counter = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                m = line_re.match(line)
                if m is not None:
                    batch.add(
                        insert_log,
                        (uuid.uuid4(), m.group(1),
                         datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S %z'),
                         m.group(3), int(m.group(4))))
                    counter = counter + 1
                    if counter == 300:
                        session.execute(batch)
                        counter = 0
                        batch.clear()
            session.execute(batch)  # submit the remaining records
            batch.clear()
            counter = 0
Example #9
def main(keyspace, table):
	cluster = Cluster(['199.60.17.171', '199.60.17.188'])
	session = cluster.connect(keyspace)

	insert_statement = SimpleStatement("INSERT INTO " +table+ \
		" (author, description, publishedAt, title, url, urlToImage, source) VALUES (%s, %s, %s, %s, %s, %s, %s)")

	count = 0
	batch = BatchStatement()

	for i in sources:
		# api.get(source=i, sort_by="popular")	
		# print("News Source: " + str(i))
		fetched=api.get_by_top(source=i)
		# jsonobj = json.dumps(fetched)
		fetcheddictobj = dict(fetched)
		source = fetcheddictobj['source']
		for art in fetcheddictobj['articles']:
			author = art['author']
			description = art['description']
			publishedAt = art['publishedAt']
			title = art['title']
			url = art['url']
			urlToImage = art['urlToImage']
			batch.add(insert_statement,(author, description, publishedAt, title, url, urlToImage, source))
			count = count + 1
			if count == 50:
				count = 0
				session.execute(batch)
				print('Batch of 50 insert statements executed')
				batch.clear()
	session.execute(batch)
Example #10
    def test_clear(self):
        keyspace = 'keyspace'
        routing_key = 'routing_key'
        custom_payload = {'key': six.b('value')}

        ss = SimpleStatement('whatever',
                             keyspace=keyspace,
                             routing_key=routing_key,
                             custom_payload=custom_payload)

        batch = BatchStatement()
        batch.add(ss)

        self.assertTrue(batch._statements_and_parameters)
        self.assertEqual(batch.keyspace, keyspace)
        self.assertEqual(batch.routing_key, routing_key)
        self.assertEqual(batch.custom_payload, custom_payload)

        batch.clear()
        self.assertFalse(batch._statements_and_parameters)
        self.assertIsNone(batch.keyspace)
        self.assertIsNone(batch.routing_key)
        self.assertFalse(batch.custom_payload)

        batch.add(ss)
Example #11
def main(inputs, keyspace, table):
    session = cluster.connect(keyspace)
    insert_user = session.prepare(
        "INSERT INTO %s (uid, host, datetime, path, bytes)\
        VALUES (?, ?, ? , ?, ?)" % table)
    BATCH_SIZE = 300
    batch = BatchStatement()

    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt', \
            encoding='utf-8', errors='ignore') as logfile:

            count = 0

            for line in logfile:
                l = parseline(line)
                if l is not None:
                    (host, date, path, bys) = l
                    batch.add(insert_user,
                              (uuid.uuid1(), host, date, path, int(bys)))
                    count = count + 1
                if count == BATCH_SIZE:
                    session.execute(batch)
                    batch.clear()
                    count = 0

    session.execute(batch)
Example #12
def main(input_dir, keyspace, tab_name):
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(keyspace)
    session.execute('TRUNCATE ' + tab_name)
    batch = BatchStatement()
    count = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                line_re = re.compile(
                    r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
                )
                split_tup = line_re.split(line)
                if len(split_tup) == 6:
                    batch.add(
                        SimpleStatement(
                            "INSERT INTO " + tab_name +
                            " (id,host,datetime,path,bytes) VALUES (%s, %s, %s, %s, %s )"
                        ), (uuid.uuid4(), split_tup[1],
                            datetime.datetime.strptime(
                                split_tup[2], "%d/%b/%Y:%H:%M:%S").strftime(
                                    "%Y-%m-%d %H:%M:%S"), split_tup[3],
                            int(split_tup[4])))
                    count += 1
                    if count == 200:
                        session.execute(batch)
                        batch.clear()
                        count = 0
    session.execute(batch)
    rows = session.execute('SELECT path, bytes FROM ' + tab_name + ' WHERE host=%s',
                           ['uplherc.upl.com'])
Example #13
def log_insert(log_file, table):
    counter = 1
    insert_query = session.prepare(
        "INSERT INTO " + table +
        " (host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    batch = BatchStatement(consistency_level=1)
    for line in log_file:
        line_re = re.compile(
            r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
        )
        m = re.match(line_re, line)
        if m:  # Skip lines that do not satisfy parsing
            host = m.group(1)
            p_datetime = datetime.datetime.strptime(m.group(2),
                                                    '%d/%b/%Y:%H:%M:%S')
            path = m.group(3)
            bytes_pro = int(m.group(4))
            counter += 1
            batch.add(insert_query, (host, p_datetime, path, bytes_pro))
            if counter == 200:
                session.execute(batch)
                batch.clear()
                counter = 1
    session.execute(batch)
    batch.clear()
Example #14
    def import_from_file(self, filename):
        """ import data from csv to db """
        f = open(filename)

        # read the csv file and skip the next header
        csv_f = csv.reader(f)
        next(csv_f, None)
        # insert usersById
        insert_user_by_id = self._db_cur.prepare("INSERT INTO USER_BY_ID (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        insert_user_by_email = self._db_cur.prepare("INSERT INTO USER_BY_EMAIL (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        insert_user_by_company_city = self._db_cur.prepare("INSERT INTO USER_BY_COMPANY_CITY (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        update_user_by_domain = self._db_cur.prepare("UPDATE USER_BY_DOMAIN SET counter = counter + 1 WHERE domain = ? AND city = ?")

        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)

        for iid, first_name, last_name, email, company , city in csv_f:
            while len(batch) > self._batch * len(self._tables_to_sync):
                self._db_cur.execute(batch)
                batch.clear()
            # insert into UserById table
            batch.add(insert_user_by_id, (int(iid), first_name, last_name, email, company, city))
            batch.add(insert_user_by_email, (int(iid), first_name, last_name, email, company, city))
            batch.add(insert_user_by_company_city, (int(iid), first_name, last_name, email, company, city))

            # get the email domain
            domain = re.search('@(.*)$',email, re.IGNORECASE).group()

            self._db_cur.execute(update_user_by_domain, (domain[1:], city))

            # Save company name to redis
            self._redis_db.set(iid, company)

        if len(batch) != 0:
            self._db_cur.execute(batch)
Example #15
    def test_clear_empty(self):
        batch = BatchStatement()
        batch.clear()
        self.assertFalse(batch._statements_and_parameters)
        self.assertIsNone(batch.keyspace)
        self.assertIsNone(batch.routing_key)
        self.assertFalse(batch.custom_payload)

        batch.add('something')
Example #16
def main(input_directory, keyspace, table_name):

    # connecting to keyspace
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(keyspace)

    #initialize batch count
    batch_count = 0

    # Creating a table with the given table name if it doesn't exist
    create_statement = 'CREATE TABLE IF NOT EXISTS ' + table_name + ' (id UUID, host TEXT, date_time TIMESTAMP, path_value TEXT, bytes INT, PRIMARY KEY (host,id))'
    session.execute(create_statement)

    # Truncating all old values
    truncate_statement = 'TRUNCATE ' + table_name + ';'
    session.execute(truncate_statement)

    # Opening the input file directory and initializing the batch statement
    open_directory = os.listdir(input_directory)
    batch = BatchStatement(consistency_level=ConsistencyLevel.ONE)

    # Insert statement used to insert values into the table
    insert_log = session.prepare(
        'INSERT INTO ' + table_name +
        ' (host, id, date_time, path_value, bytes) VALUES (?,?,?,?,?)')

    for file_value in open_directory:
        if '.gz' in file_value:
            with gzip.open(os.path.join(input_directory, file_value),
                           'rt',
                           encoding='utf-8') as logfile:
                for line in logfile:

                    # split and get log values
                    log_values = log_split(line)

                    if log_values is not None:

                        # Insert values in batch
                        batch.add(insert_log,
                                  (log_values[0], uuid.uuid1(),
                                   datetime.datetime.strptime(
                                       log_values[1], "%d/%b/%Y:%H:%M:%S"),
                                   log_values[2], log_values[3]))
                        batch_count = batch_count + 1

                        #Inserts values in batches of 200
                        if batch_count % 200 == 0:
                            session.execute(batch)
                            batch.clear()
    session.execute(batch)
    print('Query Execution Completed')
    cluster.shutdown()
Example #17
def main(input_dir, keyspace, table_name):
    cluster = Cluster(['199.60.17.103', '199.60.17.105'])
    session = cluster.connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 2}"
        % keyspace)
    session.set_keyspace(keyspace)
    session.execute("DROP TABLE IF EXISTS " + keyspace + "." + table_name)
    session.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name +
        " (id UUID, host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, PRIMARY KEY (host, id))"
    )
    # session.execute('TRUNCATE table_name')

    batch = BatchStatement()
    count = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                line_re = re.compile(
                    r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
                )
                splitted = line_re.split(line)
                if len(splitted) == 6:
                    batch.add(
                        SimpleStatement(
                            "INSERT INTO table_name (id, host, datetime, path, bytes) VALUES (%s, %s, %s, %s, %s)"
                        ), (uuid.uuid4(), splitted[1],
                            datetime.strptime(
                                splitted[2], '%d/%b/%Y:%H:%M:%S').strftime(
                                    '%Y-%m-%d %H:%M:%S'), splitted[3],
                            int(splitted[4])))
                    count += 1
                    if count == 300:
                        session.execute(batch)
                        count = 0
                        batch.clear()

    session.execute(batch)
    batch.clear()
    table = session.execute(
        "SELECT path, bytes FROM " + table_name + " WHERE host='uplherc.upl.com'")
    cont = 0
    total_bytes = 0
    for row in table:
        if row.bytes:
            cont += 1
            total_bytes += row.bytes
    print("No. of times uplherc.upl.com occurred: {}".format(cont))
    print("No. of bytes: {}".format(total_bytes))
    session.shutdown()
Example #18
def main():
    #Defining input directory, keyspace and table name
    MAX_LINES = 500
    temp_line = 0
    inputs = sys.argv[1]
    keyspace = sys.argv[2]
    table_name = sys.argv[3]

    #Cluster configuration
    cluster = Cluster(['199.60.17.136', '199.60.17.173'])
    session = cluster.connect(keyspace)
    session.execute('USE %s;' % keyspace)

    #Defining the query for inserting values into table nasalogs
    insert_query = session.prepare(
        "INSERT INTO %s (host, datetime, path, bytes) VALUES (?, ?, ?, ?);" %
        table_name)
    linere = re.compile(
        "^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$"
    )

    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8') as logfile:
            batch = BatchStatement()
            for line in logfile:
                #splitting the row data as per the regular expression
                single_row = linere.split(line)

                #retrieving required values in the specific format as host,datetime,path and bytes
                if len(single_row) == 6:
                    host = single_row[1]
                    #stripping date-time to its format
                    date_time = dt.datetime.strptime(single_row[2],
                                                     '%d/%b/%Y:%H:%M:%S')
                    path = single_row[3]
                    bytes_transferred = single_row[4]

                    # packaging the insert statements for this line into one batch
                    temp_line += 1
                    batch.add(insert_query,
                              [host, date_time, path,
                               int(bytes_transferred)])

                # execute and clear the batch once it reaches the threshold
                if temp_line == MAX_LINES:
                    session.execute(batch)
                    batch.clear()
                    temp_line = 0

            # flush the statements left over for this file
            session.execute(batch)
            temp_line = 0
Example #19
    def import_from_activity(self, filename):
        """ import from csv file to db """
        f = open(filename)

        # read the csv file and skip the header row
        csv_f = csv.reader(f)
        next(csv_f, None)

        # insert ActivityByUser
        insert_count_activity_by_user = self._db_cur.prepare("UPDATE COUNT_ACTIVITY_BY_USER SET counter = counter + 1 where date = ? and company = ? ")
        insert_activity_by_user = self._db_cur.prepare("INSERT INTO ACTIVITY_BY_USER (date, event, domain, id, url, user_id, datetime) VALUES(?, ?, ?, ?, ?, ?, ? )")

        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        for row in csv_f:
            if len(row) != 4:
                continue
            (user_id, event, url, time_stamp) = row
            domain = re.findall('https?://[^/]*',url, re.IGNORECASE)

            if len(domain) != 1:
                domain = ""
            else:
                domain = domain[0][7:]
                if domain[0] == '/':
                    domain = domain[1:]

            # if not correct time format. continue
            if( not self.check_time(time_stamp)):
                continue

            try:
                dt = datetime.datetime.strptime(time_stamp, "%Y/%m/%d %H:%M:%S.%f")    
            except ValueError:
                dt = datetime.datetime.strptime(time_stamp, "%Y/%m/%d %H:%M:%S")

            while len(batch) > self._batch:
                self._db_cur.execute(batch)
                batch.clear()
            
            bucket = dt.strftime("%Y/%m/%d")
            company = self.get_company(user_id)
            batch.add(insert_activity_by_user,(bucket, event, domain, uuid.uuid1(), url, int(user_id), dt))
            self._db_cur.execute(insert_count_activity_by_user, (bucket, company))
            self.update_event_by_company(event, company, bucket)
        
        if len(batch) != 0:
            self._db_cur.execute(batch)
Example #20
def insert(cluster, keyspace, cql_stmt, generator, batch_size):
    session = cluster.connect(keyspace)
    session.default_timeout = 60
    session.default_consistency_level = ConsistencyLevel.LOCAL_ONE
    prepared_stmt = session.prepare(cql_stmt)
    batch_stmt = BatchStatement()

    values = take(batch_size, generator)
    count = 0
    while values:
        batch_stmt.add_all([prepared_stmt] * batch_size, values)
        session.execute(batch_stmt)

        values = take(batch_size, generator)
        batch_stmt.clear()
        if (count % 1e3) == 0:
            print('#blocks {:,.0f}'.format(count), end='\r')
        count += batch_size
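Example #20 relies on BatchStatement.add_all, which pairs each statement with one parameter tuple, and on clear() to reuse the batch between rounds. A quick offline check of that pairing (no cluster needed; the kv table is hypothetical and never queried, and _statements_and_parameters is the same private attribute the driver's test_clear examples on this page inspect):

from cassandra.query import BatchStatement, SimpleStatement

stmt = SimpleStatement("INSERT INTO kv (k, v) VALUES (%s, %s)")   # hypothetical table
rows = [(1, 'a'), (2, 'b'), (3, 'c')]

batch = BatchStatement()
batch.add_all([stmt] * len(rows), rows)        # equivalent to calling add() once per row

print(len(batch._statements_and_parameters))   # 3
batch.clear()
print(len(batch._statements_and_parameters))   # 0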
Example #21
def read_files(inputs, table):
    record_counter = 0
    batch_counter = 0

    batch_insert = BatchStatement()
    insert_statement = session.prepare(
        "INSERT INTO " + table +
        " (host, id, datetime, path, bytes) VALUES (?, ?, ?, ?, ?)")

    # get all files in input folder
    for file in os.listdir(inputs):

        # unzip files
        with gzip.open(os.path.join(inputs, file), 'rt',
                       encoding='utf-8') as logfile:

            # read file line by line
            for line in logfile:
                # create a tuple of required fields
                log_object = separate_columns(line)

                # if log object is valid
                if (log_object is not None):
                    record_counter += 1
                    batch_insert.add(
                        insert_statement,
                        (log_object[0], log_object[1], log_object[2],
                         log_object[3], log_object[4]))

                # insert records when reached to declared batch size
                if (record_counter >= BATCH_SIZE):
                    print("writing batch " + str(batch_counter))

                    session.execute(batch_insert)
                    batch_insert.clear()

                    record_counter = 0
                    batch_counter += 1

    # to insert the final part with number of rows less than batch size
    if (record_counter > 0):

        print("writing final batch " + str((batch_counter + 1)))
        session.execute(batch_insert)
Example #22
    async def _write_batchwise(cls, query, batch_size_limit):
        """
        Insert data into db via batch statements.

        :param query: query to execute
        :param batch_size_limit: maximum batch size

        :return: result object
        """
        cls.logger.debug("Writing data in batches of maximum size " +
                         str(batch_size_limit))
        res = []
        try:
            cls._check_write_parameters()
            batch = BatchStatement()
            prepared = cls.session.prepare(query)
            batch_size = 0
            for i in range(len(cls.result_id)):
                cls.logger.debug("Writing results for: " +
                                 str(cls.result_id[i]))
                # Send the values in batches of at most batch_size_limit rows
                for d in cls.output_data:
                    batch.add(prepared,
                              (cls.result_id[i], d[0], str(d[i + 1])))
                    batch_size += 1
                    if batch_size >= batch_size_limit:
                        cls.logger.debug("Writing batch of " +
                                         str(batch_size) + " rows")
                        res.append(cls.session.execute(batch))
                        batch.clear()
                        batch_size = 0
                # Send remaining values
                if batch_size > 0:
                    cls.logger.debug("Writing batch of " + str(batch_size) +
                                     " rows")
                    res.append(cls.session.execute(batch))
                    batch.clear()
                    batch_size = 0
        except Exception as err:
            cls.logger.error("Batch writing failed")
            raise Exception("Impossible to write in batches: " + str(err))
        return res
Example #23
def execute_batch():
    query = "INSERT INTO data (device_id, data_source_id, time_upload, value) VALUES (?, ?, ?, ?) IF NOT EXISTS"
    batch = BatchStatement()
    prepared = session.prepare(query)
    batch_size = 0
    # send the values in batches of up to 25,000 rows
    for d in dates_data:
        batch.add(prepared, (device_id, data_source_id, d[0], str(d[1])))
        batch_size += 1
        if batch_size >= 25_000:
            res = session.execute(batch)
            print('values sent', str(batch_size))
            batch.clear()
            batch_size = 0
    # send remaining values
    if batch_size > 0:
        res = session.execute(batch)
        print('values sent', str(batch_size))
        batch.clear()
        batch_size = 0
Example #24
    def test_clear(self):
        keyspace = 'keyspace'
        routing_key = 'routing_key'
        custom_payload = {'key': six.b('value')}

        ss = SimpleStatement('whatever', keyspace=keyspace, routing_key=routing_key, custom_payload=custom_payload)

        batch = BatchStatement()
        batch.add(ss)

        self.assertTrue(batch._statements_and_parameters)
        self.assertEqual(batch.keyspace, keyspace)
        self.assertEqual(batch.routing_key, routing_key)
        self.assertEqual(batch.custom_payload, custom_payload)

        batch.clear()
        self.assertFalse(batch._statements_and_parameters)
        self.assertIsNone(batch.keyspace)
        self.assertIsNone(batch.routing_key)
        self.assertFalse(batch.custom_payload)

        batch.add(ss)
Example #25
    def insert(cls, params):

        idx_start, idx_end = params

        batch_size = 25
        batch_stmt = BatchStatement()

        for index in range(idx_start, idx_end, batch_size):

            curr_batch_size = min(batch_size, idx_end - index)
            for i in range(0, curr_batch_size):
                block = cls.chain[index + i]
                block_tx = [block.height, [tx_stats(x) for x in block.txes]]
                batch_stmt.add(cls.prepared_stmt, block_tx)

            try:
                cls.session.execute(batch_stmt)
            except Exception as e:
                # ingest single blocks if batch ingest fails
                # (batch too large error)
                print(e)
                for i in range(0, curr_batch_size):
                    while True:
                        try:
                            block = cls.chain[index + i]
                            block_tx = [
                                block.height,
                                [tx_stats(x) for x in block.txes]
                            ]
                            cls.session.execute(cls.prepared_stmt, block_tx)
                        except Exception as e:
                            print(e)
                            continue
                        break
            batch_stmt.clear()

            with cls.counter.get_lock():
                cls.counter.value += curr_batch_size
            print('#blocks {:,.0f}'.format(cls.counter.value), end='\r')
Example #26
def main(inputs, key_space, table):
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(key_space)
    session.execute("""
            CREATE TABLE IF NOT EXISTS nasalogs (
                host TEXT,                
                datetime TIMESTAMP,
                path TEXT,
                bytes INT,
                recId UUID,
                PRIMARY KEY (host,recId)
            )
            """)
    session.execute("""TRUNCATE nasalogs;""")
    insert_log = session.prepare(
        "INSERT INTO " + table +
        " (host,datetime,path,bytes,recId) VALUES (?,?,?,?,?)")
    batch = BatchStatement(consistency_level=ConsistencyLevel.ONE)
    c = 0
    for g_file in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, g_file), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                w = get_words(line)
                if len(w) > 4:
                    c += 1
                    batch.add(
                        insert_log,
                        (w[1],
                         datetime.datetime.strptime(w[2], '%d/%b/%Y:%H:%M:%S'),
                         w[3], int(w[4]), uid()))
                if (c == 400):
                    session.execute(batch)
                    batch.clear()
                    c = 0

    session.execute(batch)
    cluster.shutdown()
Example #27
def insert_newlog(log_file, table_name):
    count = 1
    insert_query = session.prepare(
        "INSERT INTO " + table_name +
        " (host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    batch = BatchStatement()
    for line in log_file:
        values = log_dissemble.split(line)
        # Only consider lines that can be split into host, dtime, path, num_bytes
        if len(values) >= 4:
            host = values[1]
            dtime = datetime.datetime.strptime(values[2], '%d/%b/%Y:%H:%M:%S')
            path = values[3]
            num_bytes = int(values[4])
            count += 1
            batch.add(insert_query, (host, dtime, path, num_bytes))
            if count == 300:
                session.execute(batch)
                batch.clear()
                count = 1
    session.execute(batch)
    batch.clear()
Example #28
batch = BatchStatement(consistency_level=ConsistencyLevel.ONE)
insert_query = session.prepare(
    'Insert into ' + table_name +
    ' (host,id,bytes,datetime,path) VALUES (?,?,?,?,?)')
counter = 0
batch_size = 200
for f in os.listdir(inputs):
    with gzip.open(os.path.join(inputs, f), 'rt', encoding='utf-8') as logfile:
        for line in logfile:
            line_re = re.compile(
                r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
            )
            val = re.search(line_re, line)
            if val:
                y = line_re.split(line)
                y[2] = datetime.datetime.strptime(y[2], "%d/%b/%Y:%H:%M:%S")
                y[4] = int(y[4])
                #print(y[1],y[2],y[3],y[4])
                batch.add(insert_query, [y[1], uuid.uuid4(), y[4], y[2], y[3]])
                counter = counter + 1
                #insert_query.consistency_level = ConsistencyLevel.ONE

                #for i in range (batch_size):
                if (counter > batch_size):
                    session.execute(batch)
                    batch.clear()
                    counter = 0
session.execute(batch)
#session.execute(insert_query,[y[1],uuid.uuid4(),y[4],y[2],y[3]])
Example #29
def ingest_folder(root_folder, initial_batch_size, db_nodes):
    config = yaml.safe_load(open(path.join(root_folder, CONFIG_FILE), 'r'))
    config_baseURI = config['baseURI']
    config_tagpacks = config['targetKeyspace']
    cluster = Cluster(db_nodes)
    session = cluster.connect(config_tagpacks)
    session.default_timeout = 60

    packs_path = path.join(root_folder, PACKS_FOLDER)
    for tag_pack_file in listdir(packs_path):
        if tag_pack_file[-4:] == 'yaml':
            batch_size = initial_batch_size
            print('Ingesting', tag_pack_file)
            tag_pack_path = path.join(packs_path, tag_pack_file)
            tag_pack = yaml.safe_load(open(tag_pack_path, 'r'))
            tag_pack_uri = path.join(config_baseURI,
                                     path.join(PACKS_FOLDER, tag_pack_file))

            # Convert lastmod values from datetime to UNIX timestamp
            tag_pack = lastmod_to_timestamp(tag_pack)

            # Insert metadata into tagpack_by_uri table
            tag_pack_meta = extract_meta(tag_pack)
            tag_pack_meta['uri'] = tag_pack_uri
            tag_pack_meta_json = json.dumps(tag_pack_meta)
            cql_stmt = """INSERT INTO tagpack_by_uri JSON '{}';"""\
                .format(tag_pack_meta_json)
            session.execute(cql_stmt)

            # Insert tags into tag_by_address table
            extracted_tags = extract_tags(tag_pack)
            batch_size = min(batch_size, len(extracted_tags))
            batch_stmt = BatchStatement()

            print('Ingesting tags with batch size:', batch_size)
            success = False
            while not success and batch_size:
                try:  # batch might be too large
                    prepared_stmt = session.prepare('INSERT INTO '
                                                    'tag_by_address JSON ?')
                    idx_start, idx_end = 0, len(extracted_tags)
                    for index in range(idx_start, idx_end, batch_size):
                        curr_batch_size = min(batch_size, idx_end - index)
                        for i in range(0, curr_batch_size):
                            tag = extracted_tags[index + i]
                            tag['tagpack_uri'] = tag_pack_uri
                            tag_json = json.dumps(tag)
                            batch_stmt.add(prepared_stmt, [tag_json])
                        session.execute(batch_stmt)
                        batch_stmt.clear()
                    success = True
                except Exception as e:
                    print(e)
                    batch_size = min(int(batch_size / 2), BATCH_SIZE_LIMIT)
                    batch_stmt.clear()
                    print('Trying again with batch size:', batch_size)

            print('Ingesting tags with batch size:', batch_size)
            success = False
            while not success and batch_size:
                try:
                    # Insert tags into tag_by_category table
                    prepared_stmt = session.prepare('INSERT INTO '
                                                    'tag_by_category JSON ?')
                    idx_start, idx_end = 0, len(extracted_tags)
                    for index in range(idx_start, idx_end, batch_size):
                        curr_batch_size = min(batch_size, idx_end - index)
                        for i in range(0, curr_batch_size):
                            tag = extracted_tags[index + i]
                            tag['label_norm'] = normalize_label(tag['label'])
                            tag['tagpack_uri'] = tag_pack_uri
                            tag_json = json.dumps(tag)
                            batch_stmt.add(prepared_stmt, [tag_json])
                        session.execute(batch_stmt)
                        batch_stmt.clear()
                    success = True
                    print("Ingested TagPack {} [1/2]".format(tag_pack_file))
                except Exception as e:
                    print(e)
                    batch_size = min(int(batch_size / 2), BATCH_SIZE_LIMIT)
                    batch_stmt.clear()
                    print('Trying again with batch size:', batch_size)

            print('Ingesting tags with batch size:', batch_size)
            success = False
            while not success and batch_size:
                try:
                    # Insert tags into tag_by_label table
                    prepared_stmt = session.prepare('INSERT INTO '
                                                    'tag_by_label JSON ?')
                    idx_start, idx_end = 0, len(extracted_tags)
                    for index in range(idx_start, idx_end, batch_size):
                        curr_batch_size = min(batch_size, idx_end - index)
                        for i in range(0, curr_batch_size):
                            tag = extracted_tags[index + i]
                            tag['label_norm'] = normalize_label(tag['label'])
                            tag['label_norm_prefix'] = tag['label_norm'][:3]
                            tag['tagpack_uri'] = tag_pack_uri
                            tag_json = json.dumps(tag)
                            batch_stmt.add(prepared_stmt, [tag_json])
                        session.execute(batch_stmt)
                        batch_stmt.clear()
                    success = True
                    print("Ingested TagPack {} [2/2]".format(tag_pack_file))
                except Exception as e:
                    print(e)
                    batch_size = min(int(batch_size / 2), BATCH_SIZE_LIMIT)
                    batch_stmt.clear()
                    print('Trying again with batch size:', batch_size)

    # Insert categories
    for i, c in enumerate(schema_categories):
        category_json = json.dumps({'category': c, 'ID': i})
        cql_stmt = """INSERT INTO categories JSON '{}';"""\
            .format(category_json)
        session.execute(cql_stmt)

    cluster.shutdown()
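The three near-identical ingestion loops above share one recovery strategy: when Cassandra rejects a batch (typically a "Batch too large" error), halve the batch size, clear the batch, and re-ingest. A condensed sketch of that strategy under assumed names (session, prepared_stmt, and items are placeholders, and the BATCH_SIZE_LIMIT cap used above is omitted):

from cassandra.query import BatchStatement

def ingest_with_shrinking_batches(session, prepared_stmt, items, batch_size):
    """Ingest items in batches; on failure, halve the batch size and retry."""
    batch = BatchStatement()
    while batch_size:
        try:
            for start in range(0, len(items), batch_size):
                for item in items[start:start + batch_size]:
                    batch.add(prepared_stmt, [item])
                session.execute(batch)
                batch.clear()
            return True                      # all items ingested
        except Exception as e:               # e.g. "Batch too large"
            print(e)
            batch.clear()                    # drop the failed batch
            batch_size //= 2
            print('Trying again with batch size:', batch_size)
    return False                             # gave up once batch_size reached 0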
Example #30
class SessionEventsRepository():
    """
    class to interact with SessionEvents database
    """
    def __init__(self):
        """
        Sets up the Cassandra cluster, session, keyspace, table, and a reusable
        batch statement.
        So that data older than one year is discarded, we set a default TTL of
        365*24*60*60 = 31536000 seconds.
        There are multiple configuration values, including the default TTL,
        that may be set in a separate configuration file.
        """
        # connect to cluster
        cluster = Cluster()
        self.session = cluster.connect()
        self.session.execute("""
            CREATE KEYSPACE IF NOT EXISTS unity_assignment
            WITH REPLICATION = { 
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            };
            """)
        # TODO: POC if separating start and end event would be more efficient
        self.session.execute("""
            CREATE TABLE IF NOT EXISTS unity_assignment.session_events (
                event TEXT,
                country TEXT,
                player_id UUID,
                session_id UUID,
                ts timestamp,
                PRIMARY KEY (player_id, event, ts)
            )  WITH default_time_to_live = 31536000 AND CLUSTERING ORDER BY (event DESC, ts DESC); 
            """)

        self.batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        self.insert_event = self.session.prepare(
            """INSERT INTO unity_assignment.session_events
            (event, country, player_id, session_id, ts)
            VALUES (?, ?, ?, ?, ?)
            """)

    @staticmethod
    def _typecast(event: Dict):
        """
        casts type of passed events to be stored into Session Events table
        :param event: event whose fields' types are going to be cast
        """
        if isinstance(event, dict):
            return (event.get('event'), event.get('country'),
                    uuid.UUID(event.get('player_id')),
                    uuid.UUID(event.get('session_id')),
                    datetime.fromisoformat(event.get('ts')))
        else:
            raise TypeError(f'{type(event)} is an unsupported type for the '
                            'passed event to be typecast')

    def insert_events_batch(self, events: List[Dict[str, str]]):
        """
        inserts events batch into the SessionEvents database
        :param events: list of event dictionaries, such as:
        [{
            "event": "start",
            "country": "FI",
            "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
            "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
            "ts": "2016-12-02T12:48:05.520022"
        },
        {
            "event": "end",
            "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
            "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
            "ts": "2016-12-02T12:49:05.520022"
        }]
        :return: http body message
        """
        for event in events:
            self.batch.add(self.insert_event,
                           SessionEventsRepository._typecast(event))
        try:
            self.session.execute(self.batch)
        finally:
            self.batch.clear()

    def fetch_recent_completed_sessions(self, player_id: str) -> List[str]:
        """
        fetches up to 20 recent completed sessions associated with a player
        :param player_id: the player id of the player whose recent sessions
        we'd want to query
        :returns: list of recent sessions sorted from latest to the earliest
        """
        rows = self.session.execute(
            """
            SELECT session_id
            FROM unity_assignment.session_events
            WHERE player_id=%s AND event=%s
            LIMIT 20;
            """, (uuid.UUID(player_id), 'end'))
        # TODO: We may want to query the same rows from start events to make
        # sure they correspond to begin events as well. Alternatively, we may
        # redesign the table, say to normalize the pair of start and end
        # events into the same row, to order the results by timestamp, or
        # again work it out at the application level. However, this might come
        # for free from the downstream. Therefore, I just assume the start
        # events are guaranteed to exist for end events.
        return [str(row.session_id) for row in rows]
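A minimal usage sketch for the repository above, reusing the payload format from the insert_events_batch docstring; it assumes a Cassandra node is reachable on localhost and that the class's module-level imports (uuid, datetime, the Cassandra driver) are in place:

repo = SessionEventsRepository()
repo.insert_events_batch([
    {"event": "start", "country": "FI",
     "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
     "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
     "ts": "2016-12-02T12:48:05.520022"},
    {"event": "end", "country": "FI",
     "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
     "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
     "ts": "2016-12-02T12:49:05.520022"},
])
# prints the session ids of recently completed sessions for that player
print(repo.fetch_recent_completed_sessions("0a2d12a1a7e145de8bae44c0c6e06629"))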
Example #31
    def push(self, OHLCs, timestamp):
        # Sometimes a timestamp is missing from the queue. This happens when a
        # request to the IEX API takes too long, so a later data retriever
        # writes its data before an earlier one. When a value is missing, we
        # supplement it with a duplicate of the current value; when the
        # retriever responsible for that timestamp eventually writes its data,
        # it updates the previously supplemented value. If instead the time
        # consumed exceeds the configured interval, the retriever is forced to
        # skip one round of retrieval; in that case the supplemented data is
        # never updated and simply stays equal to its neighbour.
        #
        # Four situations can occur at the moment data is about to be pushed
        # into the queue:
        # 1. The latest timestamp is exactly one interval before the current
        #    timestamp: append the current data to the end of the queue.
        # 2. The latest timestamp is earlier than the current timestamp by
        #    more than one interval but not more than ten: duplicate the
        #    current data to fill in the missing values.
        # 3. The latest timestamp is later than the current timestamp: pop all
        #    entries newer than the current timestamp until one that is less
        #    than or equal to it is found, then handle it as situation 1 or 2.
        # 4. The gap between the current timestamp and the last one is large:
        #    this marks the beginning of a new day, so just append to the end
        #    of the queue.
        #
        # The analysis above is written for per-second retrieval; expressing
        # it in terms of self.interval generalises it to per-minute and
        # per-hour data.

        # unit of timestamp is second
        stack = []
        while self.queue and self.queue[-1]['timestamp'] > timestamp:
            stack.append(self.queue.pop())

        # Executing one batch of operations is faster than executing each operation separately in a loop
        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)

        if self.queue and timestamp == self.queue[-1]['timestamp']:
            # modify content of the element in the queue
            self.queue[-1]['OHLCs'] = OHLCs

            # update data in Cassandra table "stocks_history"
            dt = datetime.fromtimestamp(timestamp)
            update = self.__UpdateCassandraStock()

            for symbol in OHLCs.keys():
                # Convert dict to json so that it can be insert into Cassandra table
                OHLC_json = json.dumps(OHLCs[symbol])
                batch.add(update, (OHLC_json, symbol, self.interval_type, dt))
                # Update H and L
                self.H[symbol] = max(OHLCs[symbol]['high'], self.H[symbol])
                self.L[symbol] = min(OHLCs[symbol]['low'], self.L[symbol])

        elif self.queue and timestamp - self.queue[-1][
                'timestamp'] < 10 * self.interval:
            # the timestamp that is less than or equal to the current timestamp, in a certain range
            # supplement the intermediate missing data
            latest = self.queue[-1]['timestamp']
            insert = self.__InsertCassandraStock()

            for t in range((timestamp - latest) // self.interval):
                ts = latest + (t + 1) * self.interval
                self.queue.append({'OHLCs': OHLCs, 'timestamp': ts})
                dt = datetime.fromtimestamp(ts)
                for symbol in OHLCs.keys():
                    OHLC_json = json.dumps(OHLCs[symbol])
                    batch.add(insert,
                              (OHLC_json, symbol, self.interval_type, dt))
                    if t < 1:
                        # The supplemented entries all share the same values, so update H and L only once
                        self.H[symbol] = max(OHLCs[symbol]['high'],
                                             self.H[symbol])
                        self.L[symbol] = min(OHLCs[symbol]['low'],
                                             self.L[symbol])

        else:
            self.queue.append({'OHLCs': OHLCs, 'timestamp': timestamp})
            dt = datetime.fromtimestamp(timestamp)
            insert = self.__InsertCassandraStock()
            for symbol in OHLCs.keys():
                OHLC_json = json.dumps(OHLCs[symbol])
                batch.add(insert, (OHLC_json, symbol, self.interval_type, dt))
                # Update H and L
                self.H[symbol] = max(OHLCs[symbol]['high'], self.H[symbol])
                self.L[symbol] = min(OHLCs[symbol]['low'], self.L[symbol])

        while stack:
            self.queue.append(stack.pop())

        # Pop excess entries if the queue is full and delete the popped rows
        # from the Cassandra table
        delete = self.__DeleteCassandraStock()
        while len(self.queue) > self.length:
            del_data = self.queue.popleft()
            dt = datetime.fromtimestamp(del_data['timestamp'])
            for symbol in del_data['OHLCs'].keys():
                batch.add(delete, (symbol, self.interval_type, dt))

        self.c_session.execute(batch)
        batch.clear()
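To make situation 2 above concrete, here is a toy, Cassandra-free illustration of the gap-filling step, with made-up symbols and timestamps and an interval of one second:

from collections import deque

# The newest queue entry is more than one interval behind the incoming
# timestamp, so the gap is filled with duplicates of the incoming data.
interval = 1
queue = deque([{'OHLCs': {'AAPL': {'high': 1, 'low': 1}}, 'timestamp': 100}])

incoming_ts = 103
incoming = {'AAPL': {'high': 2, 'low': 2}}

latest = queue[-1]['timestamp']
for t in range((incoming_ts - latest) // interval):
    queue.append({'OHLCs': incoming, 'timestamp': latest + (t + 1) * interval})

print([entry['timestamp'] for entry in queue])   # [100, 101, 102, 103]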