def _delete_cassandra_records(self, sync_params, res):
    """
    Delete conflicting records from Cassandra; the correct version will be
    brought back in from elastic search.

    :param sync_params: parameters of this sync. Reads keys 'name',
        'id_col', 'version_col' and 'cassandra' (a dict with 'table').
    :param res: response from the elastic search insert operation.
        res[1] is iterated; each entry's ['index'] dict is inspected and
        only HTTP 409 (version conflict) rows are deleted.
    :return: (total, errors) counts of deleted / failed rows, or
        (None, None) when the delete statement cannot be prepared.
    """
    # helpers
    session = self.cassandra['session']
    cs_params = sync_params['cassandra']
    keyspace = self.cassandra['keyspace']
    # the conflicting version is only available inside the ES error text,
    # e.g. "... provided [123] ..." — raw string so the escapes reach re
    re_comp = re.compile(r'provided \[(.*?)\]')

    # prepare statement
    stmt = '''delete from {keyspace}.{table} where {id_col} = :{id_col} and {version_col}=:{version_col}'''.format(
        keyspace=keyspace,
        table=cs_params['table'],
        version_col=sync_params['version_col'],
        id_col=sync_params['id_col'])
    try:
        data_statement = session.prepare(stmt)
    except Exception:
        log.error('Sync: %s - Step: %s - Problem deleting data'
                  % (sync_params['name'], sys._getframe().f_code.co_name))
        log.error(getError())
        return None, None

    # batch up the rows that should be deleted
    batch = BatchStatement()
    count = 0
    total = 0
    errors = 0
    for row in res[1]:
        # for each "error" entry, keep only version-conflict (409) rows
        row = row['index']
        if row['status'] != 409:
            continue
        # TODO: create a log table for deletions
        # reg ex to find the current version
        version_col = long(re.findall(re_comp, row['error'])[0])
        data = {
            sync_params['id_col']: uuid.UUID(row['_id']),
            sync_params['version_col']: version_col
        }
        batch.add(data_statement, data)
        count += 1
        # every x records, commit. There is a limitation on the driver
        if (count % 65000) == 0:
            try:
                # execute the batch
                session.execute(batch)
                total += count
            except Exception:
                exc_info = sys.exc_info()
                log.error(exc_info[1])
                log.error(exc_info[2])
                errors += count
            count = 0
            # hack to get around the 65k limit of python driver
            batch._statements_and_parameters = []

    # flush whatever is left in the final, partial batch
    if count > 0:
        try:
            # execute the batch
            session.execute(batch)
            total += count
        except Exception:
            # this method deletes — the original message wrongly said
            # "inserting"
            log.error(
                'Sync: %s - Step: %s - Problem deleting data'
                % (sync_params['name'], sys._getframe().f_code.co_name))
            log.error(getError())
            errors += count

    return total, errors
def insert_cassandra(self, sync_params, rows):
    """
    Insert elastic search rows into Cassandra.

    :param sync_params: parameters of this sync. Reads keys 'name',
        'id_col', 'date_col' and 'cassandra' (a dict with 'table');
        'version_col' is used when present (falls back to 'version').
    :param rows: iterable of ES hits, each a dict with '_id' and '_source'.
    :return: (total, errors) counts of inserted / failed rows, or
        (None, None) when the schema cannot be fetched or the insert
        statement cannot be prepared.
    """
    # helpers
    session = self.cassandra['session']
    params = sync_params['cassandra']
    keyspace = self.cassandra['keyspace']

    # get the table schema and sort the columns so the prepared query
    # always binds them in a deterministic order
    schema = self._get_table_schema(keyspace, params['table'])
    if not schema:
        return None, None
    cols = schema.keys()
    cols.sort()

    # Prepare the statement
    stmt = "INSERT INTO {keyspace}.{table} ("
    stmt += ", ".join(cols)
    stmt += ") VALUES ("
    stmt += ", ".join([':' + k for k in cols])
    stmt += ") USING TIMESTAMP :p_timestamp "
    stmt = stmt.format(keyspace=keyspace, table=params['table'])
    try:
        data_statement = session.prepare(stmt)
    except Exception:
        log.error('Sync: %s - Step: %s - Problem inserting data'
                  % (sync_params['name'], sys._getframe().f_code.co_name))
        log.error(getError())
        return None, None

    # add the prepared statements to a batch
    count = 0
    total = 0
    errors = 0
    batch = BatchStatement()
    id_col = sync_params['id_col']
    date_col = sync_params['date_col']
    # use the configured version column, consistent with
    # _delete_cassandra_records; fall back to the legacy hard-coded name
    version_col = sync_params.get('version_col', 'version')
    cols.remove(id_col)
    for row in rows:
        # convert to the cassandra structure
        try:
            # fill the data dictionary, None for columns not present
            source = row['_source']
            data = {}
            for col in cols:
                data[col] = source.get(col, None)
            date = datetime.strptime(source[date_col],
                                     '%Y-%m-%dT%H:%M:%S.%f')
            data[id_col] = uuid.UUID(row['_id'])
            data[date_col] = unix_time_millis(date)
            # the row's version doubles as the cassandra write timestamp
            data['p_timestamp'] = data[version_col]
            batch.add(data_statement, data)
            count += 1
        except Exception:
            log.error('Problem converting data {}'.format(row['_id']))
            log.error(getError())
            continue
        # every x records, commit. There is a limitation on the driver
        if (count % 5000) == 0:
            try:
                # execute the batch
                session.execute(batch)
                total += count
            except Exception:
                exc_info = sys.exc_info()
                log.error(exc_info[1])
                log.error(exc_info[2])
                errors += count
            count = 0
            # hack to get around the 65k limit of python driver
            batch._statements_and_parameters = []

    # flush whatever is left in the final, partial batch
    if count > 0:
        try:
            # execute the batch
            session.execute(batch)
            total += count
        except Exception:
            log.error(
                'Sync: %s - Step: %s - Problem inserting data'
                % (sync_params['name'], sys._getframe().f_code.co_name))
            log.error(getError())
            errors += count

    return total, errors