def main(input_dir, keyspace, table):
    """Load gzipped server logs from ``input_dir`` into a Cassandra table.

    Each line is split by ``disassemble``; lines that yield exactly six
    fields are inserted as (host, id, datetime, path, bytes). Inserts are
    sent in batches of 200 and any remainder is sent once all files are read.

    Args:
        input_dir: directory containing gzip-compressed log files.
        keyspace: keyspace the session connects to.
        table: name of the target table (interpolated into the CQL text).
    """
    from cassandra.cluster import Cluster

    batch_size = 200
    cluster = Cluster(['199.60.17.32', '199.60.17.65'])
    session = cluster.connect(keyspace)
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    query = "INSERT INTO %s(host,id,datetime,path,bytes) VALUES (?,?,?,?,?)" % (
        table)
    insert_data = session.prepare(query)
    counter = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                server_logs_dis = disassemble(line)
                if len(server_logs_dis) != 6:
                    continue
                datetime_object = datetime.strptime(
                    server_logs_dis[2], '%d/%b/%Y:%H:%M:%S')
                batch.add(
                    insert_data,
                    (server_logs_dis[1], uuid.uuid1(), datetime_object,
                     server_logs_dis[3], int(server_logs_dis[4])))
                counter += 1
                if counter == batch_size:
                    session.execute(batch)
                    batch.clear()
                    counter = 0
    # Rows queued since the last full batch would otherwise be dropped.
    if counter:
        session.execute(batch)
        batch.clear()
def putInCassandra(s3file, cassandraConnection):
    """Copy every row of a gzipped CSV file into two Cassandra tables.

    Each CSV row is converted with ``createRow`` and inserted into both
    ``measures_by_measurereference`` and ``measures_by_date``. Statements are
    flushed once at least ``batch_number`` of them are queued.

    Args:
        s3file: path (or file object) of the gzip-compressed CSV.
        cassandraConnection: connection descriptor passed to ``getSession``.
    """
    session = getSession(cassandraConnection)
    measures_by_date = session.prepare(
        "INSERT INTO measures_by_date (date, timestamp, measureReference, meta, value) VALUES (?, ?, ?, ?, ?)"
    )
    measures_by_measurereference = session.prepare(
        "INSERT INTO measures_by_measurereference (date, timestamp, measureReference, meta, value) VALUES (?, ?, ?, ?, ?)"
    )
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    batch_number = 10
    print("Opening File")
    count = 0
    # Text mode: csv.DictReader needs str rows, and gzip's 'r' mode is binary.
    with gzip.open(s3file, 'rt', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if count >= batch_number:
                session.execute(batch)
                batch.clear()
                count = 0
            batch.add(measures_by_measurereference, createRow(row))
            batch.add(measures_by_date, createRow(row))
            count += 2
    # Send whatever is left; nothing to do when the file had no rows.
    if count:
        session.execute(batch)
        batch.clear()
    print("Done")
def main(inputs, keyspace, table):
    """Insert parsed log lines from every gzip file in ``inputs`` into ``table``.

    Lines for which ``getFields`` returns a falsy value are skipped. A batch
    is executed as soon as more than 300 rows are queued, and a non-empty
    remainder is executed at the end.
    """
    session = Cluster(['199.60.17.188', '199.60.17.216']).connect(keyspace)
    pending = BatchStatement()
    statement = session.prepare(
        "INSERT INTO " + table +
        " (id, host, datetime, path, bytes) VALUES (?, ?, ?, ?, ?)")
    queued = 0
    for name in os.listdir(inputs):
        source = os.path.join(inputs, name)
        with gzip.open(source, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                fields = getFields(line)
                if not fields:
                    continue
                row = (fields[0], fields[1], fields[2], fields[3], fields[4])
                pending.add(statement, row)
                queued += 1
                if queued <= 300:
                    continue
                session.execute(pending)
                pending.clear()
                queued = 0
    if queued > 0:
        session.execute(pending)
def main(inputs, output, keyspace):
    """Insert parsed log lines from every gzip file in ``inputs`` into ``output``.

    ``splitline`` returns ``None`` for lines that cannot be parsed; those are
    skipped. The row id is generated server-side by ``uuid()`` in the CQL.

    Args:
        inputs: directory containing gzip-compressed log files.
        output: name of the target table (interpolated into the CQL text).
        keyspace: not used inside this function; the module-level ``session``
            is used as-is.
    """
    add_query = session.prepare(
        "INSERT INTO " + output +
        "(host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    batch = BatchStatement(consistency_level=1)
    # The counter starts at 1, so a flush happens after every 299 queued rows.
    count = 1
    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                list_values = splitline(line)
                if list_values is None:
                    continue
                hostname = list_values[0]
                date = list_values[1]
                path = list_values[2]
                num_bytes = list_values[3]
                count += 1
                batch.add(add_query, (hostname, date, path, num_bytes))
                if count == 300:
                    session.execute(batch)
                    batch.clear()
                    count = 1
    # count > 1 means rows were queued since the last flush.
    if count > 1:
        session.execute(batch)
        batch.clear()
def _flush_batch(session, batch):
    """Execute ``batch`` and always empty it, printing any execution error."""
    try:
        session.execute(batch)
    except Exception as e:
        print(e)
    finally:
        # Clear even on failure so a bad batch is not re-sent (and grown)
        # on the next flush.
        batch.clear()


def insert_data(session):
    """
    Insert data into the database.

    Every dataframe row produces one insert for each of the session, song
    and user library tables. Statements are sent in unlogged batches once
    more than ``batch_execute_at`` are queued, and the remainder is sent
    after the last row.

    arguments:
    session - the Session object used to execute the batched inserts
    """
    df = get_dataframe()
    session_cols, song_cols, user_cols = get_table_col_idxs(df)
    batch = BatchStatement(batch_type=BatchType.UNLOGGED)
    batch_execute_at = 500
    stmt_count = 0
    for _, row in df.iterrows():
        # Pick the table-specific values out of the row by mapping the
        # row values' __getitem__ over each table's column indices.
        pick = row.values.__getitem__
        batch.add(queries['session_library']['insert'],
                  tuple(map(pick, session_cols)))
        batch.add(queries['song_library']['insert'],
                  tuple(map(pick, song_cols)))
        batch.add(queries['user_library']['insert'],
                  tuple(map(pick, user_cols)))
        stmt_count += 3
        if stmt_count > batch_execute_at:
            _flush_batch(session, batch)
            stmt_count = 0
    # Rows queued after the last threshold flush must still be written.
    if stmt_count:
        _flush_batch(session, batch)
def insert(cls, params):
    """Write transaction summaries for the index range ``params`` in batches.

    ``params`` is an ``(idx_start, idx_end)`` pair. Transactions are sent 500
    at a time; if a batch fails, each transaction of that batch is written on
    its own and retried until it succeeds. The shared counter is advanced
    after every chunk and printed as a progress line.
    """
    first, stop = params
    step = 500
    pending = BatchStatement()
    for base in range(first, stop, step):
        chunk_len = min(step, stop - base)
        tx_ids = range(base, base + chunk_len)
        for tx_id in tx_ids:
            pending.add(cls.prepared_stmt,
                        tx_summary(blocksci.Tx(tx_id, cls.chain)))
        try:
            cls.session.execute(pending)
        except Exception as batch_error:
            # Fall back to single-row writes when the batch is rejected
            # (e.g. batch too large).
            print(batch_error)
            for tx_id in tx_ids:
                stored = False
                while not stored:
                    try:
                        summary = tx_summary(blocksci.Tx(tx_id, cls.chain))
                        cls.session.execute(cls.prepared_stmt, summary)
                    except Exception as row_error:
                        print(row_error)
                    else:
                        stored = True
        pending.clear()
        with cls.counter.get_lock():
            cls.counter.value += chunk_len
        print('#tx {:,.0f}'.format(cls.counter.value), end='\r')
def main(inputs, table):
    """Recreate the log table and bulk-load every gzip log file in ``inputs``.

    Lines for which ``get_words`` returns more than four items are inserted
    into ``table`` in batches of 400; the remainder is sent at the end and
    the cluster is shut down.
    """
    # NOTE(review): TRUNCATE targets the literal table "nasalogs", not
    # `table` - confirm this is intended.
    create_Table()
    session.execute("""TRUNCATE nasalogs;""")
    statement = session.prepare(
        "INSERT INTO " + table +
        " (host,datetime,path,bytes,id) VALUES (?,?,?,?,?)")
    pending = BatchStatement(consistency_level=ConsistencyLevel.ONE)
    queued = 0
    for name in os.listdir(inputs):
        archive = os.path.join(inputs, name)
        with gzip.open(archive, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                words = get_words(line)
                if len(words) > 4:
                    queued += 1
                    when = datetime.datetime.strptime(words[2],
                                                      '%d/%b/%Y:%H:%M:%S')
                    size = int(words[4])
                    pending.add(statement,
                                (words[1], when, words[3], size, uid()))
                if queued == 400:
                    session.execute(pending)
                    pending.clear()
                    queued = 0
    session.execute(pending)
    cluster.shutdown()
def main(input_dir, keyspace, table):
    """Load log lines matching ``line_re`` from ``input_dir`` into ``table``.

    Matching lines are inserted with a random UUID in batches of 300; the
    rows left over after the last full batch are submitted at the end.
    """
    flush_every = 300
    time_format = '%d/%b/%Y:%H:%M:%S %z'
    # NOTE(review): the second contact point is 199.0.17.65 - verify the
    # address, it differs in its second octet from the first one.
    session = Cluster(['199.60.17.32', '199.0.17.65']).connect(keyspace)
    statement = session.prepare(
        'INSERT INTO ' + table +
        ' (id,host,datetime,path,bytes) VALUES (?, ?, ?, ?, ?)')
    pending = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
    queued = 0
    for name in os.listdir(input_dir):
        archive = os.path.join(input_dir, name)
        with gzip.open(archive, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                match = line_re.match(line)
                if match is None:
                    continue
                record = (uuid.uuid4(), match.group(1),
                          datetime.strptime(match.group(2), time_format),
                          match.group(3), int(match.group(4)))
                pending.add(statement, record)
                queued += 1
                if queued == flush_every:
                    session.execute(pending)
                    queued = 0
                    pending.clear()
    # Submit the remaining records.
    session.execute(pending)
    pending.clear()
def main(keyspace, table):
    """Fetch top articles for every entry of ``sources`` and store them.

    Each article becomes one insert into ``table``; inserts are executed in
    batches of 50 and the remainder is executed after the last source.
    """
    article_keys = ('author', 'description', 'publishedAt', 'title', 'url',
                    'urlToImage')
    session = Cluster(['199.60.17.171', '199.60.17.188']).connect(keyspace)
    statement = SimpleStatement("INSERT INTO " + table +
        " (author, description, publishedAt, title, url, urlToImage, source) VALUES (%s, %s, %s, %s, %s, %s, %s)")
    queued = 0
    pending = BatchStatement()
    for news_source in sources:
        payload = dict(api.get_by_top(source=news_source))
        source = payload['source']
        for article in payload['articles']:
            row = tuple(article[key] for key in article_keys) + (source,)
            pending.add(statement, row)
            queued += 1
            if queued != 50:
                continue
            queued = 0
            session.execute(pending)
            print('Batch of 50 insert statements executed')
            pending.clear()
    session.execute(pending)
def test_clear(self):
    """clear() empties a populated batch and the batch stays usable."""
    expected = {
        'keyspace': 'keyspace',
        'routing_key': 'routing_key',
        'custom_payload': {'key': six.b('value')},
    }
    ss = SimpleStatement('whatever', **expected)

    batch = BatchStatement()
    batch.add(ss)

    # Adding a statement populates the batch and copies its routing info.
    self.assertTrue(batch._statements_and_parameters)
    for attr in ('keyspace', 'routing_key', 'custom_payload'):
        self.assertEqual(getattr(batch, attr), expected[attr])

    batch.clear()

    # After clear() every piece of per-statement state is gone.
    self.assertFalse(batch._statements_and_parameters)
    for attr in ('keyspace', 'routing_key'):
        self.assertIsNone(getattr(batch, attr))
    self.assertFalse(batch.custom_payload)

    # A cleared batch must accept new statements.
    batch.add(ss)
def main(inputs, keyspace, table):
    """Load parsed log lines from every gzip file in ``inputs`` into ``table``.

    ``parseline`` returns ``None`` for unparseable lines, otherwise a
    ``(host, date, path, bytes)`` tuple. Rows are written in batches of
    ``BATCH_SIZE`` and the remainder is written at the end.

    Args:
        inputs: directory containing gzip-compressed log files.
        keyspace: keyspace the session connects to.
        table: name of the target table (interpolated into the CQL text).
    """
    session = cluster.connect(keyspace)
    insert_user = session.prepare(
        "INSERT INTO %s (uid, host, datetime, path, bytes) "
        "VALUES (?, ?, ? , ?, ?)" % table)
    BATCH_SIZE = 300
    batch = BatchStatement()
    # The counter must span files: the batch is not flushed at file
    # boundaries, so resetting it per file would let a batch grow past
    # BATCH_SIZE.
    count = 0
    for f in os.listdir(inputs):
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8', errors='ignore') as logfile:
            for line in logfile:
                parsed = parseline(line)
                if parsed is None:
                    continue
                (host, date, path, bys) = parsed
                batch.add(insert_user,
                          (uuid.uuid1(), host, date, path, int(bys)))
                count += 1
                if count == BATCH_SIZE:
                    session.execute(batch)
                    batch.clear()
                    count = 0
    if count:
        session.execute(batch)
def main(input_dir, keyspace, tab_name):
    """Truncate ``tab_name`` and reload it from the gzip logs in ``input_dir``.

    Lines matching the access-log pattern are inserted with a random UUID in
    batches of 200. Afterwards the rows for host ``uplherc.upl.com`` are
    queried from the same table.

    Args:
        input_dir: directory containing gzip-compressed log files.
        keyspace: keyspace the session connects to.
        tab_name: name of the target table (interpolated into the CQL text).
    """
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(keyspace)
    session.execute('TRUNCATE ' + tab_name)
    # Built once: neither the pattern nor the statement text changes per line.
    line_re = re.compile(
        r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
    )
    insert_stmt = SimpleStatement(
        "INSERT INTO " + tab_name +
        " (id,host,datetime,path,bytes) VALUES (%s, %s, %s, %s, %s )")
    batch = BatchStatement()
    count = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                # A full match splits into ['', host, time, path, bytes, ''].
                split_tup = line_re.split(line)
                if len(split_tup) != 6:
                    continue
                timestamp = datetime.datetime.strptime(
                    split_tup[2],
                    "%d/%b/%Y:%H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")
                batch.add(insert_stmt,
                          (uuid.uuid4(), split_tup[1], timestamp,
                           split_tup[3], int(split_tup[4])))
                count += 1
                if count == 200:
                    session.execute(batch)
                    batch.clear()
                    count = 0
    if count:
        session.execute(batch)
    # Query the table that was just loaded, not a table literally named
    # "tab_name".
    rows = session.execute(
        'SELECT path, bytes FROM ' + tab_name + ' WHERE host=%s',
        ['uplherc.upl.com'])
def log_insert(log_file, table):
    """Insert every access-log line of ``log_file`` that parses into ``table``.

    Lines that do not match the access-log pattern are skipped. The counter
    starts at 1, so a batch is executed each time it reaches 200; whatever is
    queued at the end is executed as well.
    """
    statement = session.prepare(
        "INSERT INTO " + table +
        " (host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    pending = BatchStatement(consistency_level=1)
    pattern = re.compile(
        r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
    )
    tally = 1
    for line in log_file:
        found = pattern.match(line)
        if not found:
            # Skip lines that do not satisfy parsing.
            continue
        host, stamp, path, size = found.group(1, 2, 3, 4)
        row = (host,
               datetime.datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S'),
               path,
               int(size))
        tally += 1
        pending.add(statement, row)
        if tally == 200:
            session.execute(pending)
            pending.clear()
            tally = 1
    session.execute(pending)
    pending.clear()
def import_from_file(self, filename):
    """Import users from a CSV file into the user tables.

    The first CSV row is treated as a header and skipped. Every data row
    must have six columns (id, first name, last name, email, company, city).
    Each row is inserted into the three user tables through a batch, the
    per-domain counter is incremented immediately, and the company name is
    stored in redis under the user id.

    Args:
        filename: path of the CSV file to import.

    Raises:
        AttributeError: if an email contains no ``@`` (no domain to extract).
    """
    with open(filename) as f:
        # Read the csv file and skip the header row.
        csv_f = csv.reader(f)
        next(csv_f, None)

        insert_user_by_id = self._db_cur.prepare("INSERT INTO USER_BY_ID (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        insert_user_by_email = self._db_cur.prepare("INSERT INTO USER_BY_EMAIL (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        insert_user_by_company_city = self._db_cur.prepare("INSERT INTO USER_BY_COMPANY_CITY (id,first_name, last_name, email, company, city) VALUES ( ?, ?, ?, ?, ?, ? )")
        update_user_by_domain = self._db_cur.prepare("UPDATE USER_BY_DOMAIN SET counter = counter + 1 WHERE domain = ? AND city = ?")

        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        for iid, first_name, last_name, email, company, city in csv_f:
            # Flush before the batch grows past the configured size.
            if len(batch) > self._batch * len(self._tables_to_sync):
                self._db_cur.execute(batch)
                batch.clear()

            user_row = (int(iid), first_name, last_name, email, company, city)
            batch.add(insert_user_by_id, user_row)
            batch.add(insert_user_by_email, user_row)
            batch.add(insert_user_by_company_city, user_row)

            # Everything after the first '@' is the email domain. Counter
            # updates are executed directly rather than through the batch.
            domain = re.search('@(.*)$', email, re.IGNORECASE).group(1)
            self._db_cur.execute(update_user_by_domain, (domain, city))

            # Save company name to redis.
            self._redis_db.set(iid, company)

        if len(batch) != 0:
            self._db_cur.execute(batch)
def test_clear_empty(self):
    """clear() on a fresh batch is a no-op and the batch stays usable."""
    batch = BatchStatement()
    batch.clear()

    self.assertFalse(batch._statements_and_parameters)
    for attr in ('keyspace', 'routing_key'):
        self.assertIsNone(getattr(batch, attr))
    self.assertFalse(batch.custom_payload)

    # A cleared batch must still accept statements.
    batch.add('something')
def main(input_directory, keyspace, table_name):
    """Create/truncate ``table_name`` and load the ``.gz`` logs into it.

    Only directory entries whose name contains ``.gz`` are read. Lines for
    which ``log_split`` returns a value are inserted with a time-based UUID;
    the batch is executed every 200 inserted rows and once more at the end.
    """
    session_cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = session_cluster.connect(keyspace)

    # Make sure the table exists, then drop any old rows.
    session.execute(
        'CREATE TABLE IF NOT EXISTS ' + table_name +
        ' (id UUID, host TEXT, date_time TIMESTAMP, path_value TEXT, bytes INT, PRIMARY KEY (host,id))')
    session.execute('TRUNCATE ' + table_name + ';')

    entries = os.listdir(input_directory)
    pending = BatchStatement(consistency_level=ConsistencyLevel.ONE)
    statement = session.prepare(
        'INSERT INTO ' + table_name +
        ' (host, id, date_time, path_value, bytes) VALUES (?,?,?,?,?)')

    inserted = 0
    for entry in entries:
        if '.gz' not in entry:
            continue
        archive = os.path.join(input_directory, entry)
        with gzip.open(archive, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                parts = log_split(line)
                if parts is None:
                    continue
                row = (parts[0],
                       uuid.uuid1(),
                       datetime.datetime.strptime(parts[1],
                                                  "%d/%b/%Y:%H:%M:%S"),
                       parts[2],
                       parts[3])
                pending.add(statement, row)
                inserted += 1
                # Send the queued rows after every 200th insert.
                if inserted % 200 == 0:
                    session.execute(pending)
                    pending.clear()
    session.execute(pending)
    print('Query Exectuion Completed')
    session_cluster.shutdown()
def main(input_dir, keyspace, table_name):
    """Rebuild ``table_name`` from the gzip logs and report one host's usage.

    The keyspace is created if missing, the table is dropped and recreated,
    and every line matching the access-log pattern is inserted in batches of
    300. Finally the rows for ``uplherc.upl.com`` are read back and the
    number of rows with a non-zero byte count and their byte total printed.

    Args:
        input_dir: directory containing gzip-compressed log files.
        keyspace: keyspace to create (if needed) and use.
        table_name: name of the table to rebuild (interpolated into CQL).
    """
    cluster = Cluster(['199.60.17.103', '199.60.17.105'])
    session = cluster.connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 2}"
        % keyspace)
    session.set_keyspace(keyspace)
    # All statements use the `table_name` argument; previously only DROP did
    # and the rest addressed a table literally named "table_name".
    session.execute("DROP TABLE IF EXISTS " + keyspace + "." + table_name)
    session.execute(
        "CREATE TABLE IF NOT EXISTS " + table_name +
        " (id UUID, host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, PRIMARY KEY (host, id))"
    )
    # Built once: neither the pattern nor the statement text changes per line.
    line_re = re.compile(
        r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
    )
    insert_stmt = SimpleStatement(
        "INSERT INTO " + table_name +
        " (id, host, datetime, path, bytes) VALUES (%s, %s, %s, %s, %s)")
    batch = BatchStatement()
    count = 0
    for f in os.listdir(input_dir):
        with gzip.open(os.path.join(input_dir, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                # A full match splits into ['', host, time, path, bytes, ''].
                splitted = line_re.split(line)
                if len(splitted) != 6:
                    continue
                timestamp = datetime.strptime(
                    splitted[2],
                    '%d/%b/%Y:%H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
                batch.add(insert_stmt,
                          (uuid.uuid4(), splitted[1], timestamp, splitted[3],
                           int(splitted[4])))
                count += 1
                if count == 300:
                    session.execute(batch)
                    count = 0
                    batch.clear()
    if count:
        session.execute(batch)
        batch.clear()
    table = session.execute(
        "SELECT path, bytes FROM " + table_name +
        " WHERE host='uplherc.upl.com'")
    # Start at zero so the printed count equals the number of rows counted.
    cont = 0
    total_bytes = 0
    for row in table:
        if row.bytes:
            cont += 1
            total_bytes += row.bytes
    print("No. of times uplherc.upl.com occurred: {}".format(cont))
    print("No. of bytes: {}".format(total_bytes))
    # Shutting down the cluster also closes the session.
    cluster.shutdown()
def main():
    """Load gzip access logs into a Cassandra table, batching the inserts.

    Command-line arguments: input directory, keyspace, table name. Lines
    matching the access-log pattern are inserted as (host, datetime, path,
    bytes). A batch is executed once ``MAX_LINES`` rows are queued and the
    remainder is executed after the last file.
    """
    MAX_LINES = 500
    inputs = sys.argv[1]
    keyspace = sys.argv[2]
    table_name = sys.argv[3]

    # Cluster configuration
    cluster = Cluster(['199.60.17.136', '199.60.17.173'])
    session = cluster.connect(keyspace)
    session.execute('USE %s;' % keyspace)

    insert_query = session.prepare(
        "INSERT INTO %s (host, datetime, path, bytes) VALUES (?, ?, ?, ?);"
        % table_name)
    linere = re.compile(
        "^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$"
    )

    batch = BatchStatement()
    pending = 0
    for f in os.listdir(inputs):
        # Text mode so the str pattern and strptime receive str, not bytes.
        with gzip.open(os.path.join(inputs, f), 'rt',
                       encoding='utf-8') as logfile:
            for line in logfile:
                # A full match splits into ['', host, time, path, bytes, ''].
                single_row = linere.split(line)
                if len(single_row) != 6:
                    continue
                host = single_row[1]
                date_time = dt.datetime.strptime(single_row[2],
                                                 '%d/%b/%Y:%H:%M:%S')
                path = single_row[3]
                bytes_transferred = single_row[4]
                batch.add(insert_query,
                          [host, date_time, path, int(bytes_transferred)])
                pending += 1
                # Execute only when the batch is full. Executing after every
                # add re-sent all previously queued rows each time.
                if pending == MAX_LINES:
                    session.execute(batch)
                    batch.clear()
                    pending = 0
    if pending:
        session.execute(batch)
        batch.clear()
def import_from_activity(self, filename):
    """Import user activity events from a CSV file into the database.

    The first CSV row is treated as a header and skipped. Rows that do not
    have exactly four columns (user id, event, url, timestamp) or whose
    timestamp fails ``self.check_time`` are ignored. Each accepted row is
    queued for ACTIVITY_BY_USER, the per-day company counter is incremented
    immediately, and ``self.update_event_by_company`` is called.

    Args:
        filename: path of the CSV file to import.
    """
    with open(filename) as f:
        # Read the csv file and skip the header row.
        csv_f = csv.reader(f)
        next(csv_f, None)

        insert_count_activity_by_user = self._db_cur.prepare("UPDATE COUNT_ACTIVITY_BY_USER SET counter = counter + 1 where date = ? and company = ? ")
        insert_activity_by_user = self._db_cur.prepare("INSERT INTO ACTIVITY_BY_USER (date, event, domain, id, url, user_id, datetime) VALUES(?, ?, ?, ?, ?, ?, ? )")

        batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        for row in csv_f:
            if len(row) != 4:
                continue
            (user_id, event, url, time_stamp) = row

            # Keep the host part of the URL only when exactly one scheme
            # prefix is found. Dropping 7 characters removes "http://"; for
            # "https://" one slash is left over and stripped below.
            matches = re.findall('https?://[^/]*', url, re.IGNORECASE)
            domain = ""
            if len(matches) == 1:
                domain = matches[0][7:]
                # startswith() is safe on an empty string, unlike domain[0].
                if domain.startswith('/'):
                    domain = domain[1:]

            # Skip rows whose timestamp is not in an accepted format.
            if not self.check_time(time_stamp):
                continue
            try:
                dt = datetime.datetime.strptime(time_stamp,
                                                "%Y/%m/%d %H:%M:%S.%f")
            except ValueError:
                dt = datetime.datetime.strptime(time_stamp,
                                                "%Y/%m/%d %H:%M:%S")

            # Flush before the batch grows past the configured size.
            if len(batch) > self._batch:
                self._db_cur.execute(batch)
                batch.clear()

            bucket = dt.strftime("%Y/%m/%d")
            company = self.get_company(user_id)
            batch.add(insert_activity_by_user,
                      (bucket, event, domain, uuid.uuid1(), url,
                       int(user_id), dt))
            self._db_cur.execute(insert_count_activity_by_user,
                                 (bucket, company))
            self.update_event_by_company(event, company, bucket)

        if len(batch) != 0:
            self._db_cur.execute(batch)
def insert(cluster, keyspace, cql_stmt, generator, batch_size):
    """Drain ``generator`` into Cassandra using batches of ``batch_size``.

    ``take(batch_size, generator)`` supplies the parameter sets for each
    batch; the loop stops when it returns something falsy. A progress line is
    printed whenever the running count is a multiple of 1000.
    """
    session = cluster.connect(keyspace)
    session.default_timeout = 60
    session.default_consistency_level = ConsistencyLevel.LOCAL_ONE
    statement = session.prepare(cql_stmt)
    pending = BatchStatement()
    written = 0
    chunk = take(batch_size, generator)
    while chunk:
        # One prepared statement per parameter set.
        pending.add_all([statement for _ in range(batch_size)], chunk)
        session.execute(pending)
        chunk = take(batch_size, generator)
        pending.clear()
        if not written % 1e3:
            print('#blocks {:,.0f}'.format(written), end='\r')
        written += batch_size
def read_files(inputs, table):
    """Insert the parsed records of every gzip file in ``inputs`` into ``table``.

    ``separate_columns`` returns ``None`` for lines that are not valid log
    records. Records are written whenever ``BATCH_SIZE`` of them are queued,
    and a final partial batch is written if anything is left.
    """
    queued = 0
    batches_written = 0
    pending = BatchStatement()
    statement = session.prepare(
        "INSERT INTO " + table +
        " (host, id, datetime, path, bytes) VALUES (?, ?, ?, ?, ?)")
    for entry in os.listdir(inputs):
        archive = os.path.join(inputs, entry)
        with gzip.open(archive, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                record = separate_columns(line)
                if record is None:
                    continue
                queued += 1
                pending.add(statement,
                            (record[0], record[1], record[2], record[3],
                             record[4]))
                if queued < BATCH_SIZE:
                    continue
                # A full batch is ready: write it and start over.
                print("writing batch " + str(batches_written))
                session.execute(pending)
                pending.clear()
                queued = 0
                batches_written += 1
    # Write the final part, which has fewer rows than the batch size.
    if queued > 0:
        print("writing final batch " + str((batches_written + 1)))
        session.execute(pending)
async def _write_batchwise(cls, query, batch_size_limit):
    """
    Insert data into db via batch statements.

    For every entry of ``cls.result_id`` one row per element of
    ``cls.output_data`` is queued; a batch is executed whenever it reaches
    ``batch_size_limit`` rows.

    :param query: query to execute
    :param batch_size_limit: maximum batch size
    :return: list with the result object of every executed batch
    """
    cls.logger.debug("Writing data in batches of maximum size " +
                     str(batch_size_limit))
    res = []
    try:
        cls._check_write_parameters()
        batch = BatchStatement()
        prepared = cls.session.prepare(query)
        queued = 0
        for column, result_id in enumerate(cls.result_id, start=1):
            cls.logger.debug("Writing results for: " + str(result_id))
            for d in cls.output_data:
                # d[0] is written as-is, the value for this result id is
                # taken from the following positions of d.
                batch.add(prepared, (result_id, d[0], str(d[column])))
                queued += 1
                if queued < batch_size_limit:
                    continue
                cls.logger.debug("Writing batch of " + str(queued) + " rows")
                res.append(cls.session.execute(batch))
                batch.clear()
                queued = 0
            # Send remaining values
            if queued > 0:
                cls.logger.debug("Writing batch of " + str(queued) + " rows")
                res.append(cls.session.execute(batch))
                batch.clear()
                queued = 0
    except Exception as err:
        cls.logger.error("Batch writing failed")
        raise Exception("Impossible to write in batches: " + str(err))
    return res
def execute_batch():
    """Insert every entry of ``dates_data`` into the ``data`` table.

    Rows are written with ``IF NOT EXISTS`` in batches of at most 25,000
    statements; the number of rows sent is printed after each batch.
    """
    limit = 25_000
    query = "INSERT INTO data (device_id, data_source_id, time_upload, value) VALUES (?, ?, ?, ?) IF NOT EXISTS"
    pending = BatchStatement()
    statement = session.prepare(query)
    queued = 0
    for entry in dates_data:
        pending.add(statement,
                    (device_id, data_source_id, entry[0], str(entry[1])))
        queued += 1
        if queued < limit:
            continue
        # The batch is full: send it and start a new one.
        session.execute(pending)
        print('values sent', str(queued))
        pending.clear()
        queued = 0
    # Send remaining values.
    if queued > 0:
        session.execute(pending)
        print('values sent', str(queued))
        pending.clear()
        queued = 0
def insert(cls, params):
    """Write per-block transaction statistics for the range ``params``.

    ``params`` is an ``(idx_start, idx_end)`` pair of block indices. Blocks
    are sent 25 at a time; if a batch fails, each block of that batch is
    written on its own and retried until it succeeds. The shared counter is
    advanced after every chunk and printed as a progress line.
    """
    first, stop = params
    step = 25
    pending = BatchStatement()
    for base in range(first, stop, step):
        chunk_len = min(step, stop - base)
        heights = range(base, base + chunk_len)
        for height in heights:
            block = cls.chain[height]
            pending.add(cls.prepared_stmt,
                        [block.height, [tx_stats(x) for x in block.txes]])
        try:
            cls.session.execute(pending)
        except Exception as batch_error:
            # Fall back to single-block writes when the batch is rejected
            # (e.g. batch too large).
            print(batch_error)
            for height in heights:
                stored = False
                while not stored:
                    try:
                        block = cls.chain[height]
                        row = [block.height,
                               [tx_stats(x) for x in block.txes]]
                        cls.session.execute(cls.prepared_stmt, row)
                    except Exception as row_error:
                        print(row_error)
                    else:
                        stored = True
        pending.clear()
        with cls.counter.get_lock():
            cls.counter.value += chunk_len
        print('#blocks {:,.0f}'.format(cls.counter.value), end='\r')
def main(inputs, key_space, table):
    """Create and empty the log table, then bulk-load the gzip logs.

    Lines for which ``get_words`` returns more than four items are inserted
    into ``table`` in batches of 400; the remainder is sent at the end and
    the cluster is shut down.
    """
    cluster = Cluster(['199.60.17.188', '199.60.17.216'])
    session = cluster.connect(key_space)
    # NOTE(review): CREATE and TRUNCATE target the literal table "nasalogs",
    # not `table` - confirm this is intended.
    session.execute(""" CREATE TABLE IF NOT EXISTS nasalogs ( host TEXT, datetime TIMESTAMP, path TEXT, bytes INT, recId UUID, PRIMARY KEY (host,recId) ) """)
    session.execute("""TRUNCATE nasalogs;""")
    statement = session.prepare(
        "INSERT INTO " + table +
        " (host,datetime,path,bytes,recId) VALUES (?,?,?,?,?)")
    pending = BatchStatement(consistency_level=ConsistencyLevel.ONE)
    queued = 0
    for name in os.listdir(inputs):
        archive = os.path.join(inputs, name)
        with gzip.open(archive, 'rt', encoding='utf-8') as logfile:
            for line in logfile:
                words = get_words(line)
                if len(words) > 4:
                    queued += 1
                    when = datetime.datetime.strptime(words[2],
                                                      '%d/%b/%Y:%H:%M:%S')
                    size = int(words[4])
                    pending.add(statement,
                                (words[1], when, words[3], size, uid()))
                if queued == 400:
                    session.execute(pending)
                    pending.clear()
                    queued = 0
    session.execute(pending)
    cluster.shutdown()
def insert_newlog(log_file, table_name):
    """Insert every parseable line of ``log_file`` into ``table_name``.

    A line is split with ``log_dissemble``; it is used only when the result
    has at least five items, because indices 1 to 4 are read as host, time,
    path and byte count. The row id is generated server-side by ``uuid()``.

    Args:
        log_file: iterable of log lines.
        table_name: name of the target table (interpolated into the CQL).
    """
    insert_query = session.prepare(
        "INSERT INTO " + table_name +
        " (host, id, datetime, path, bytes) VALUES (?, uuid(), ?, ?, ?)")
    batch = BatchStatement()
    # The counter starts at 1, so a flush happens after every 299 queued rows.
    count = 1
    for line in log_file:
        values = log_dissemble.split(line)
        # values[4] is read below, so five items are required. The former
        # ">= 4" check let a four-item split through to an IndexError.
        if len(values) < 5:
            continue
        host = values[1]
        dtime = datetime.datetime.strptime(values[2], '%d/%b/%Y:%H:%M:%S')
        path = values[3]
        num_bytes = int(values[4])
        count += 1
        batch.add(insert_query, (host, dtime, path, num_bytes))
        if count == 300:
            session.execute(batch)
            batch.clear()
            count = 1
    session.execute(batch)
    batch.clear()
# Bulk-load the gzip access logs found in `inputs` into `table_name`.
# Matching lines are queued with a random UUID; the batch is executed once
# more than `batch_size` rows are queued and once more after the last file.
batch = BatchStatement(consistency_level=ConsistencyLevel.ONE)
insert_query = session.prepare(
    'Insert into ' + table_name +
    ' (host,id,bytes,datetime,path) VALUES (?,?,?,?,?)')
counter = 0
batch_size = 200
line_re = re.compile(
    r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
)
for f in os.listdir(inputs):
    with gzip.open(os.path.join(inputs, f), 'rt',
                   encoding='utf-8') as logfile:
        for line in logfile:
            val = line_re.search(line)
            if not val:
                continue
            # A full match splits into ['', host, time, path, bytes, ''].
            y = line_re.split(line)
            y[2] = datetime.datetime.strptime(y[2], "%d/%b/%Y:%H:%M:%S")
            y[4] = int(y[4])
            batch.add(insert_query, [y[1], uuid.uuid4(), y[4], y[2], y[3]])
            counter += 1
            if counter > batch_size:
                session.execute(batch)
                batch.clear()
                counter = 0
session.execute(batch)
def _annotate_tag(tag, table, tag_pack_uri):
    """Add, in place, the derived fields that ``table`` stores for ``tag``."""
    if table in ('tag_by_category', 'tag_by_label'):
        tag['label_norm'] = normalize_label(tag['label'])
    if table == 'tag_by_label':
        tag['label_norm_prefix'] = tag['label_norm'][:3]
    tag['tagpack_uri'] = tag_pack_uri


def _ingest_tags(session, batch_stmt, table, tags, tag_pack_uri, batch_size,
                 done_msg=None):
    """Insert ``tags`` into ``table`` as JSON, halving the batch on failure.

    Returns the batch size in effect when the function stops: the size that
    worked, or 0 when every size failed.
    """
    print('Ingesting tags with batch size:', batch_size)
    success = False
    while not success and batch_size:
        try:  # batch might be too large
            prepared_stmt = session.prepare('INSERT INTO ' + table +
                                            ' JSON ?')
            for start in range(0, len(tags), batch_size):
                for tag in tags[start:start + batch_size]:
                    _annotate_tag(tag, table, tag_pack_uri)
                    batch_stmt.add(prepared_stmt, [json.dumps(tag)])
                session.execute(batch_stmt)
                batch_stmt.clear()
            success = True
            if done_msg is not None:
                print(done_msg)
        except Exception as e:
            print(e)
            batch_size = min(int(batch_size / 2), BATCH_SIZE_LIMIT)
            batch_stmt.clear()
            print('Trying again with batch size:', batch_size)
    return batch_size


def ingest_folder(root_folder, initial_batch_size, db_nodes):
    """Ingest every ``*yaml`` tag pack below ``root_folder`` into Cassandra.

    For each tag pack the metadata is written to ``tagpack_by_uri`` and the
    tags to ``tag_by_address``, ``tag_by_category`` and ``tag_by_label``.
    When a batch fails the batch size is halved and the table is retried; the
    reduced size carries over to the following tables of the same pack.
    Finally the entries of ``schema_categories`` are written to
    ``categories``.

    Args:
        root_folder: folder containing the config file and the packs folder.
        initial_batch_size: batch size each tag pack starts with.
        db_nodes: contact points passed to ``Cluster``.
    """
    with open(path.join(root_folder, CONFIG_FILE), 'r') as config_file:
        config = yaml.safe_load(config_file)
    config_baseURI = config['baseURI']
    config_tagpacks = config['targetKeyspace']

    cluster = Cluster(db_nodes)
    try:
        session = cluster.connect(config_tagpacks)
        session.default_timeout = 60

        packs_path = path.join(root_folder, PACKS_FOLDER)
        for tag_pack_file in listdir(packs_path):
            if tag_pack_file[-4:] != 'yaml':
                continue
            batch_size = initial_batch_size
            print('Ingesting', tag_pack_file)
            tag_pack_path = path.join(packs_path, tag_pack_file)
            with open(tag_pack_path, 'r') as pack_file:
                tag_pack = yaml.safe_load(pack_file)
            tag_pack_uri = path.join(config_baseURI,
                                     path.join(PACKS_FOLDER, tag_pack_file))

            # Convert lastmod values from datetime to UNIX timestamp
            tag_pack = lastmod_to_timestamp(tag_pack)

            # Insert metadata into tagpack_by_uri table. The JSON is bound as
            # a parameter so quotes inside it cannot break the statement.
            tag_pack_meta = extract_meta(tag_pack)
            tag_pack_meta['uri'] = tag_pack_uri
            session.execute("INSERT INTO tagpack_by_uri JSON %s;",
                            [json.dumps(tag_pack_meta)])

            extracted_tags = extract_tags(tag_pack)
            batch_size = min(batch_size, len(extracted_tags))
            batch_stmt = BatchStatement()
            targets = (
                ('tag_by_address', None),
                ('tag_by_category',
                 "Ingested TagPack {} [1/2]".format(tag_pack_file)),
                ('tag_by_label',
                 "Ingested TagPack {} [2/2]".format(tag_pack_file)),
            )
            for table, done_msg in targets:
                batch_size = _ingest_tags(session, batch_stmt, table,
                                          extracted_tags, tag_pack_uri,
                                          batch_size, done_msg)

        # Insert categories
        for i, c in enumerate(schema_categories):
            session.execute("INSERT INTO categories JSON %s;",
                            [json.dumps({'category': c, 'ID': i})])
    finally:
        # Release the connections even when ingestion fails half-way.
        cluster.shutdown()
class SessionEventsRepository():
    """
    class to interact with SessionEvents database
    """

    def __init__(self):
        """
        Sets up the cassandra cluster, session, keyspace, table, and a
        reusable batch statement

        in order for the data older than 1 year old to be discarded, we set a
        default ttl of 365*24*60*60 or 31536000 seconds. There are multiple
        configurations including the default ttl that may be set in a
        separate configuration file.
        """
        # connect to cluster
        cluster = Cluster()
        self.session = cluster.connect()

        self.session.execute("""
            CREATE KEYSPACE IF NOT EXISTS unity_assignment
            WITH REPLICATION = {
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            };
            """)

        # TODO: POC if separating start and end event would be more efficient
        self.session.execute("""
            CREATE TABLE IF NOT EXISTS unity_assignment.session_events (
                event TEXT,
                country TEXT,
                player_id UUID,
                session_id UUID,
                ts timestamp,
                PRIMARY KEY (player_id, event, ts)
            ) WITH default_time_to_live = 31536000
            AND CLUSTERING ORDER BY (event DESC, ts DESC);
            """)

        self.batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)
        self.insert_event = self.session.prepare(
            """INSERT INTO unity_assignment.session_events
            (event, country, player_id, session_id, ts)
            VALUES (?, ?, ?, ?, ?)
            """)

    @staticmethod
    def _typecast(event: Dict):
        """
        casts type of passed events to be stored into Session Events table

        :param event: event whose fields' types are going to be cast
        :return: tuple (event, country, player_id, session_id, ts) with the
                 ids converted to UUID and ts parsed from ISO format
        :raises TypeError: if event is not a dict
        """
        if not isinstance(event, dict):
            raise TypeError(f'{type(event)} is an unsupported type for the '
                            'passed event to be typecast')
        return (event.get('event'),
                event.get('country'),
                uuid.UUID(event.get('player_id')),
                uuid.UUID(event.get('session_id')),
                datetime.fromisoformat(event.get('ts')))

    def insert_events_batch(self, events: List[Dict[str, str]]):
        """
        inserts events batch into the SessionEvents database

        The shared batch is emptied afterwards in every case, so an event
        that fails to typecast or a failed execution never leaves statements
        behind for the next call.

        :param events: list of dictionary of events such as:
            [{
                "event": "start",
                "country": "FI",
                "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
                "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
                "ts": "2016-12-02T12:48:05.520022"
            },
            {
                "event": "end",
                "player_id": "0a2d12a1a7e145de8bae44c0c6e06629",
                "session_id": "4a0c43c9-c43a-42ff-ba55-67563dfa35d4",
                "ts": "2016-12-02T12:49:05.520022"
            }]
        :return: None
        """
        try:
            # Adding happens inside the try block: _typecast may raise after
            # earlier events were already queued on the reusable batch.
            for event in events:
                self.batch.add(self.insert_event,
                               SessionEventsRepository._typecast(event))
            self.session.execute(self.batch)
        finally:
            self.batch.clear()

    def fetch_recent_completed_sessions(self, player_id: str) -> List[str]:
        """
        fetches up to 20 recent completed sessions associated with a player

        :param player_id: the player id of the player whose recent sessions
                          we'd want to query
        :returns: list of recent sessions sorted from latest to the earliest
        """
        rows = self.session.execute(
            """
            SELECT session_id FROM unity_assignment.session_events
            WHERE player_id=%s AND event=%s LIMIT 20;
            """, (uuid.UUID(player_id), 'end'))
        # TODO: We may want to query the same rows from start events to make
        # sure the events correspond to begin events as well. Alternatively,
        # we may redesign the table, say to normalize the pair of start and
        # end events in the same row, to order the results by the timestamps,
        # or again work it out at application level. However, this might come
        # for free from the downstream. Therefore, I just assume the start
        # events are guaranteed to exist for end events.
        return [str(row.session_id) for row in rows]
def push(self, OHLCs, timestamp):
    """Merge one OHLC snapshot into the queue and mirror the change to Cassandra.

    Author's background notes, condensed: retrievers can finish out of
    order (a slow upstream fetch lets a later retriever write first), and a
    retriever that overruns its interval may skip a round entirely. The
    newest queue entry is therefore not always exactly one interval behind
    ``timestamp``.

    Entries newer than ``timestamp`` are first lifted off the right end of
    the queue. The newest remaining entry then selects one of three paths:

    1. Same timestamp -- overwrite that entry's data and add one update
       statement per symbol. This is how an earlier placeholder receives
       its real value.
    2. Older by less than ``10 * self.interval`` -- append one entry per
       whole interval in the gap, every one carrying this same ``OHLCs``
       object, and add one insert per symbol per entry. The running
       high/low is folded in only for the first of them, since the values
       are identical.
    3. Anything else (empty queue, or a larger gap such as the first sample
       of a new day) -- append a single entry and add one insert per symbol.

    The lifted entries are then put back in their original order, entries
    beyond ``self.length`` are dropped from the left with one delete per
    symbol, and the whole batch is executed in a single call.

    :param OHLCs: mapping of symbol to its OHLC record; each record is
        serialised with ``json.dumps`` and read at ``'high'`` and ``'low'``
    :param timestamp: time of the snapshot in seconds
    """

    def track_extremes(symbol, ohlc):
        # Fold this record into the per-symbol running high and low.
        self.H[symbol] = max(ohlc['high'], self.H[symbol])
        self.L[symbol] = min(ohlc['low'], self.L[symbol])

    # Set aside everything that is newer than the incoming snapshot.
    newer = []
    while self.queue and self.queue[-1]['timestamp'] > timestamp:
        newer.append(self.queue.pop())

    # One batched execution is cheaper than one execute per statement.
    batch = BatchStatement(consistency_level=ConsistencyLevel.QUORUM)

    tail = self.queue[-1] if self.queue else None

    if tail is not None and timestamp == tail['timestamp']:
        # Path 1: replace the data of the existing entry in place.
        tail['OHLCs'] = OHLCs
        when = datetime.fromtimestamp(timestamp)
        statement = self.__UpdateCassandraStock()
        for symbol, ohlc in OHLCs.items():
            # Records are stored as JSON text in the Cassandra table.
            batch.add(statement,
                      (json.dumps(ohlc), symbol, self.interval_type, when))
            track_extremes(symbol, ohlc)
    elif (tail is not None
          and timestamp - tail['timestamp'] < 10 * self.interval):
        # Path 2: fill every missing interval slot with the current data.
        base = tail['timestamp']
        statement = self.__InsertCassandraStock()
        for step in range(1, (timestamp - base) // self.interval + 1):
            slot = base + step * self.interval
            self.queue.append({'OHLCs': OHLCs, 'timestamp': slot})
            when = datetime.fromtimestamp(slot)
            for symbol, ohlc in OHLCs.items():
                batch.add(statement,
                          (json.dumps(ohlc), symbol, self.interval_type, when))
                if step == 1:
                    # Identical values in every slot: count them once.
                    track_extremes(symbol, ohlc)
    else:
        # Path 3: nothing to reconcile with, simply append.
        self.queue.append({'OHLCs': OHLCs, 'timestamp': timestamp})
        when = datetime.fromtimestamp(timestamp)
        statement = self.__InsertCassandraStock()
        for symbol, ohlc in OHLCs.items():
            batch.add(statement,
                      (json.dumps(ohlc), symbol, self.interval_type, when))
            track_extremes(symbol, ohlc)

    # Put the newer entries back; reversing restores their original order.
    self.queue.extend(reversed(newer))

    # Trim the queue to capacity and delete the trimmed rows from Cassandra.
    evict = self.__DeleteCassandraStock()
    while len(self.queue) > self.length:
        expired = self.queue.popleft()
        when = datetime.fromtimestamp(expired['timestamp'])
        for symbol in expired['OHLCs']:
            batch.add(evict, (symbol, self.interval_type, when))

    self.c_session.execute(batch)
    batch.clear()