예제 #1
0
    def _delete_cassandra_records(self, sync_params, res, batch_size=65000):
        """
        Delete from Cassandra the records that elastic search rejected as conflicts.

        Every entry of ``res[1]`` whose ``index`` status is 409 becomes one
        ``DELETE`` keyed on the id column and on the version number quoted in
        the error message (``provided [<version>]``). The correct version will
        be brought from elastic search.

        :param sync_params: parameters of this sync; ``name``, ``id_col``,
            ``version_col`` and ``cassandra['table']`` are read here
        :param res: response from the elastic search insert operation;
            ``res[1]`` is iterated as the list of per-document error entries
        :param batch_size: maximum number of deletes sent in one batch
            (defaults to the previously hard-coded 65000)
        :return: ``(total, errors)`` - the number of deletes that were part of
            a batch that succeeded / failed - or ``(None, None)`` when the
            delete statement cannot be prepared
        """
        # helpers
        session = self.cassandra['session']
        cs_params = sync_params['cassandra']
        keyspace = self.cassandra['keyspace']
        id_col = sync_params['id_col']
        version_col = sync_params['version_col']
        # captured here: inside the nested helper the current frame would be
        # the helper itself, not this method
        step = sys._getframe().f_code.co_name
        # raw string, because "\[" is a regex escape and not a string escape
        re_comp = re.compile(r'provided \[(.*?)\]')

        # prepare statement
        stmt = '''delete from  {keyspace}.{table}
                  where {id_col} = :{id_col} and {version_col}=:{version_col}'''.format(
            keyspace=keyspace,
            table=cs_params['table'],
            version_col=version_col,
            id_col=id_col)
        try:
            data_statement = session.prepare(stmt)
        except Exception:
            log.error('Sync: %s - Step: %s - Problem deleting data' %
                      (sync_params['name'], step))
            log.error(getError())
            return None, None

        def run_batch(pending, size):
            """Execute ``pending`` and return ``(deleted, failed)`` for its ``size`` statements."""
            try:
                session.execute(pending)
            except Exception:
                log.error('Sync: %s - Step: %s - Problem deleting data' %
                          (sync_params['name'], step))
                log.error(getError())
                return 0, size
            return size, 0

        batch = BatchStatement()
        count = 0
        total = 0
        errors = 0
        for row in res[1]:
            # for each "error" entry, check if it is the type of conflict
            row = row['index']
            if row['status'] != 409:
                continue

            # TODO: create a log table for deletions
            # reg ex to find the current version; int() also yields a long on
            # Python 2 when the value needs one
            version = int(re_comp.findall(row['error'])[0])
            batch.add(data_statement, {
                id_col: uuid.UUID(row['_id']),
                version_col: version,
            })
            count += 1

            # every batch_size records, commit. There is a limitation on the
            # driver. Checked only right after an add, so an empty batch is
            # never sent.
            if count >= batch_size:
                deleted, failed = run_batch(batch, count)
                total += deleted
                errors += failed
                count = 0
                # start over with a fresh batch instead of resetting the
                # driver's private statement list
                batch = BatchStatement()

        # send whatever is left after the last full batch
        if count > 0:
            deleted, failed = run_batch(batch, count)
            total += deleted
            errors += failed

        return total, errors
예제 #2
0
    def insert_cassandra(self, sync_params, rows, batch_size=5000):
        """
        Insert the given rows into the Cassandra table of this sync.

        The column list comes from ``self._get_table_schema``; columns that a
        row's ``_source`` does not carry are written as ``None``. Rows are
        sent in batches with ``USING TIMESTAMP`` bound to the row's
        ``version`` value. A row that cannot be converted is logged and
        skipped; it is counted in neither element of the result.

        :param sync_params: parameters of this sync; ``name``, ``id_col``,
            ``date_col`` and ``cassandra['table']`` are read here
        :param rows: iterable of documents, each with an ``_id`` (UUID
            string) and a ``_source`` mapping of column values
        :param batch_size: maximum number of inserts sent in one batch
            (defaults to the previously hard-coded 5000)
        :return: ``(total, errors)`` - the number of inserts that were part of
            a batch that succeeded / failed - or ``(None, None)`` when the
            table schema is unavailable or the statement cannot be prepared
        """
        # helpers
        session = self.cassandra['session']
        params = sync_params['cassandra']
        keyspace = self.cassandra['keyspace']
        # captured here: inside the nested helper the current frame would be
        # the helper itself, not this method
        step = sys._getframe().f_code.co_name

        # get the table schema and order so that we can insert on query in correct order
        schema = self._get_table_schema(keyspace, params['table'])
        if not schema:
            return None, None
        # sorted() instead of keys().sort(): works on Python 2 and Python 3
        cols = sorted(schema.keys())

        # Prepare the statements
        stmt = "INSERT INTO {keyspace}.{table} (".format(
            keyspace=keyspace, table=params['table'])
        stmt += ", ".join(cols)
        stmt += ") VALUES ("
        stmt += ", ".join([':' + k for k in cols])
        stmt += ") USING TIMESTAMP :p_timestamp "

        try:
            data_statement = session.prepare(stmt)
        except Exception:
            log.error('Sync: %s - Step: %s - Problem inserting data' %
                      (sync_params['name'], step))
            log.error(getError())
            return None, None

        def run_batch(pending, size):
            """Execute ``pending`` and return ``(inserted, failed)`` for its ``size`` statements."""
            try:
                session.execute(pending)
            except Exception:
                log.error('Sync: %s - Step: %s - Problem inserting data' %
                          (sync_params['name'], step))
                log.error(getError())
                return 0, size
            return size, 0

        # add the prepared statements to a batch
        count = 0
        total = 0
        errors = 0
        batch = BatchStatement()
        id_col = sync_params['id_col']
        date_col = sync_params['date_col']
        # the id is taken from the document's _id, not from _source
        cols.remove(id_col)
        for row in rows:
            # convert to the cassandra structure
            try:
                # fill the data dictionary and put none on columns that are not present
                source = row['_source']
                data = dict((col, source.get(col, None)) for col in cols)
                date = datetime.strptime(source[date_col],
                                         '%Y-%m-%dT%H:%M:%S.%f')
                data[id_col] = uuid.UUID(row['_id'])
                data[date_col] = unix_time_millis(date)
                # NOTE(review): the column name 'version' is hard-coded here
                # while other code reads sync_params['version_col'] - confirm
                # they always agree
                data['p_timestamp'] = data['version']
                batch.add(data_statement, data)
                count += 1
            except Exception:
                log.error('Problem converting data {}'.format(row['_id']))
                log.error(getError())
                continue

            # every batch_size records, commit. There is a limitation on the driver
            if count >= batch_size:
                inserted, failed = run_batch(batch, count)
                total += inserted
                errors += failed
                count = 0
                # start over with a fresh batch instead of resetting the
                # driver's private statement list
                batch = BatchStatement()

        # send whatever is left after the last full batch
        if count > 0:
            inserted, failed = run_batch(batch, count)
            total += inserted
            errors += failed

        return total, errors