Example #1
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily


class ClientCassandra():
    
    def __init__(self,keySpace):
        self.pool = ConnectionPool(keySpace, ['localhost:9160'])
        self.col_fam_page        = ColumnFamily(self.pool, 'Page')
        self.col_fam_publication = ColumnFamily(self.pool, 'Publication')
        self.col_fam_company     = ColumnFamily(self.pool, 'Company')
        self.col_fam_location    = ColumnFamily(self.pool, 'Location')
        self.col_fam_category    = ColumnFamily(self.pool, 'Category')
        
    #pycassaShell
    #SYSTEM_MANAGER.create_keyspace('BlwData', strategy_options={"replication_factor": "1"});
    #SYSTEM_MANAGER.create_column_family('BlwData', 'Page');
    #SYSTEM_MANAGER.create_column_family('BlwData', 'Publication');
    #SYSTEM_MANAGER.create_column_family('BlwData', 'Company');
    #SYSTEM_MANAGER.create_column_family('BlwData', 'Location');
    #SYSTEM_MANAGER.create_column_family('BlwData', 'Category');
        
    def insertPage(self,page):
        timestamp = self.col_fam_page.insert(page.getUrl(), page.toJson())
        print "sizeof category " + page.category.name + " is " + str(self.col_fam_category.get_count(page.category.name))
        self.col_fam_category.insert(page.category.name,{'url': page.getUrl()})
        print "sizeof category " + page.category.name + " is " + str(self.col_fam_category.get_count(page.category.name))
        return timestamp
    def getPages(self, url, column):
        readData = self.col_fam_page.get(url, columns=[column])
        return readData
    
    def getCountCategory(self, category):
        return self.col_fam_category.get_count(category)
    def _check_cassandra(self, del_network_keyname, local_hostname, cassandra_listen_port):
        from pycassa.pool import ConnectionPool
        from pycassa.columnfamily import ColumnFamily

        pool1 = ConnectionPool('config_db_uuid', [local_hostname + ":" + cassandra_listen_port])
        col_fam = ColumnFamily(pool1, 'obj_fq_name_table')
        return col_fam.get_count('virtual_network', columns=[del_network_keyname])
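# A minimal, hedged usage sketch for the client above. It assumes the 'BlwData'
# keyspace and column families were created as in the pycassaShell comments, and
# that a hypothetical Page object exposes getUrl(), toJson() and a category with
# a name attribute (load_page_somehow is a placeholder, not part of the example).
client = ClientCassandra('BlwData')
page = load_page_somehow()                          # hypothetical helper returning a Page
timestamp = client.insertPage(page)                 # stores the page and indexes it under its category
print client.getCountCategory(page.category.name)  # number of columns in the category row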
Example #3
def GetValueCount(pool, columnFamily, key, *args, **kwargs):
    d = None
    try:
        col_fam = ColumnFamily(pool, columnFamily)
        d = col_fam.get_count(key, *args, **kwargs)
    except Exception, e:
        #print('empty column '+key)
        pass
    return d
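# A hedged usage sketch for GetValueCount. The keyspace, column family and row
# key below are placeholders, and a local node on the default Thrift port is
# assumed; the helper returns None when the lookup fails.
from pycassa.pool import ConnectionPool

pool = ConnectionPool('Keyspace1', ['localhost:9160'])
print GetValueCount(pool, 'ColumnFamily1', 'row_key')                        # count all columns in the row
print GetValueCount(pool, 'ColumnFamily1', 'row_key', columns=['col_name'])  # count only the listed columns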
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high value of expiration (like 60 days) with low requirement on precision.
    The actual error of this BF will the be native error of the BF + the error related
    to the coarse aspect of the expiration, since we no longer expires information precisely.
    Also, as opposed to a classic Bloom Filter, this one will aslo have false positive (reporting membership for a non-member)
    AND false negative (reporting non-membership for a member).

    The upper bound of the temporal_error can be theoricaly quite high. However, if the
    items of the set are uniformly distributed over time, the avg error will be something like 1.0 / expiration
    """

    def __new__(cls, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        return super(DailyTemporalBloomFilter, cls).__new__(cls, capacity=capacity, error_rate=error_rate)

    def __init__(self, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        filename = ""
        super(DailyTemporalBloomFilter, self).__init__(capacity=capacity, error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.bf_name, current_period_hour), {k:'' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""

        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end datetime
        (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()

        #if rebuild_snapshot:
        #    self.delete_snapshots()

        def multi_rows_itr(rows):
            for row in rows.values():
                for k in row.keys():
                    yield k

        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        days = self._day_range(last_period, dt.datetime.now())
        rows = []
        for i,day in enumerate(days):
            rows = ["%s_%s:%s" % (self.bf_name, day.strftime('%Y-%m-%d'), hour_str) for hour_str in ["%02d" % i for i in range(24)]]
            rows_content = self.columnfamily.multiget(rows, column_count=1E6)
            update_current = day == self.current_period

            for k in multi_rows_itr(rows_content):
                self.add_rebuild(k, update_current)

            if rebuild_snapshot:
                self.save_snaphot(override_period=day)

            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period <  last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def add_rebuild(self, key, update_current=True):
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string

        self.archive_bf_key(key)
        result = super(DailyTemporalBloomFilter, self).add(key)

        return result

    def resize(self, new_capacity=None, new_error_rate=None):
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name, self.expiration, period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)
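# A hedged usage sketch for the subclass above. It assumes DailyTemporalBase
# (not shown here) supplies the underlying bitarray plumbing, including add()
# and __contains__, and that the Cassandra session is a pycassa ConnectionPool;
# the 'parsely' keyspace name comes from the class itself.
from pycassa.pool import ConnectionPool

pool = ConnectionPool('parsely', ['localhost:9160'])
bf = DailyTemporalBloomFilter(capacity=100000, error_rate=0.01, expiration=60,
                              name='events', cassandra_session=pool)
bf.add('user:123')        # archived to Cassandra and added to the filter
print 'user:123' in bf    # membership test, assuming the base class defines __contains__
bf.save_snaphot()         # writes ./events_60_<current date>.dat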
Example #5
def get_count(columnFamily, uid):
    "get number of columns in a row"
    column = ColumnFamily(pool, columnFamily)
    count = column.get_count(uid)
    print uid, count
    return count
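# A hedged usage sketch for the helper above; it relies on a module-level
# pycassa ConnectionPool named pool, and the keyspace, column family and row
# key below are placeholders.
from pycassa.pool import ConnectionPool

pool = ConnectionPool('Keyspace1', ['localhost:9160'])
print get_count('ColumnFamily1', 'row_key')   # prints the key and its column count, then returns the count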
Example #7
from pycassa.pool import ConnectionPool
from pycassa.columnfamilymap import ColumnFamilyMap
from pycassa.columnfamily import ColumnFamily


if __name__ == '__main__':
    #['10.15.62.100:9160','10.15.62.101:9160','10.15.62.102:9160'] 
    pool = ConnectionPool('Cassandra_Test',['10.107.4.187:9160'])
    print pool
#    cf_map = ColumnFamilyMap(User, pool, 'Users')
    col_fam = ColumnFamily(pool, 'Users')
    print col_fam.get('author')
    print col_fam.get_count('author')
    col_fam.insert('row_key', {'col_name': 'col_val'})
    col_fam.insert('row_key', {'col_name':'col_val', 'col_name2':'col_val2'})
    col_fam.batch_insert({'row1': {'name1': 'val1', 'name2': 'val2'},'row2': {'foo': 'bar'}})
    #col_fam.insert('super_key', {'key':{'col_name':'col_val', 'col_name2':'col_val2'}})
    print col_fam.get_count('row_key', columns=['foo', 'bar'])
    print col_fam.get_count('row_key', column_start='foo') 
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'],columns=['col1', 'col2', 'col3'])
    print col_fam.multiget_count(['fib0', 'fib1', 'fib2', 'fib3', 'fib4'],column_start='col1', column_finish='col3')
    print col_fam.get_count('row_key')
    print col_fam.get('row_key')
    print col_fam.get('author')
    print col_fam.get('row_key', columns=['col_name', 'col_name2'])
    print col_fam.get('row_key', column_reversed=True, column_count=3)
    print col_fam.multiget(['row1', 'row2'])
    for i in range(1, 10):
        col_fam.insert('row_key', {str(i): 'val'})
    print col_fam.get('row_key', column_start='5', column_finish='7')
Example #8
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.system_manager import SystemManager, SIMPLE_STRATEGY, ASCII_TYPE


class CassandraDemo(object):
    def __init__(self, database, table):
        self.database = database
        self.table = table

    def create_connections(self):
        self.pool = ConnectionPool(self.database)
        self.cf = ColumnFamily(self.pool, self.table)

    def create_database_and_table(self):
        super_cf = False # consider super columns to be deprecated
        s = SystemManager()

        # create the keyspace if it doesn't exist
        if self.database not in s.list_keyspaces():
            s.create_keyspace(self.database, SIMPLE_STRATEGY, {'replication_factor': '1'})

        # drop the column family from the keyspace if it already exists
        if self.table in s.get_keyspace_column_families(self.database):
            s.drop_column_family(self.database, self.table)

        # create the column family in the keyspace
        if self.table not in s.get_keyspace_column_families(self.database):
            print("creating table...")
            s.create_column_family(self.database, self.table, super = super_cf, comparator_type = ASCII_TYPE)
        s.close()

        return True

    def insert_data(self):
        print '\nemployee data is inserting...'
        self.cf.insert('1', {'fn':'yogesh', 'ln':'kumar', 'ct': 'Ajmer', 'em': '*****@*****.**'})
        self.cf.insert('2', {'fn':'amit', 'ln':'pandita', 'ct': 'Delhi', 'em': '*****@*****.**'})
        self.cf.insert('3', {'fn':'sandeep', 'ln':'tak', 'ct': 'Ajmer', 'em': '*****@*****.**', 'mb': '8890467032'})


    def get_data(self):
        print '\nemployee data is fetching...'
        data1 = self.cf.get('1')
        data2 = self.cf.get('2', columns = ['fn', 'ln', 'em'])
        data3 = self.cf.get('3', column_start = 'ct', column_finish = 'fn')
        data4 = self.cf.get('1', column_reversed = False, column_count = 3)
        data5 = self.cf.get('1', column_reversed = True, column_count = 3)
        print data1
        print data2
        print data3
        print data4
        print data5

    def get_multiple_data(self):
        print '\ngetting multiple employees data...'
        row_keys = ['1','2','3']
        data = self.cf.multiget(row_keys)
        print data

    def get_data_by_range(self):
        '''
        If this raises an error, don't worry: row-key range queries are only
        meaningful with an order-preserving partitioner, which is a Cassandra
        limitation rather than a problem with this code.
        '''
        print '\ngetting employees data by range...'
        start_row_key = '1'
        end_row_key = '3'
        data = self.cf.get_range(start = start_row_key, finish = end_row_key)
        for key, columns in data:
            print key, columns

    def get_count(self):
        print '\nget employee row\'s column count'
        print self.cf.get_count('1')
        print self.cf.get_count('1', columns = ['fn', 'ln'])
        print self.cf.get_count('1', column_start = 'em')

    def get_multi_count(self):
        print '\nget multiple employees row\'s column count'
        row_keys = ['1','2','3']
        columns = ['fn', 'ln', 'mb']
        column_start = 'ct'
        column_finish = 'fn'
        print self.cf.multiget_count(row_keys)
        print self.cf.multiget_count(row_keys, columns = columns)
        print self.cf.multiget_count(row_keys, column_start = column_start, column_finish = column_finish)

    def update_data(self):
        print '\nemployee data is updating...'
        self.cf.insert('1', {'pwd':'yoku@2010', 'ct':'Noida'})


    def delete_data(self):
        print '\ndelete data from employee'
        row = '2'
        self.cf.remove(row)

    def get_all_rows(self):
        print '\ngetting rows name...'
        print [v[0] for v in self.cf.get_range()]

    def get_all_columns_of_row(self):
        print '\ngetting columns name of a row'
        row = '1'
        data = self.cf.get(row)
        print data.keys()
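# A hedged driver sketch for CassandraDemo, assuming a local Cassandra node on
# the default Thrift port; 'DemoKeyspace' and 'employee' are placeholder names.
# The keyspace and column family are created before the pool is opened, since
# ConnectionPool needs an existing keyspace.
demo = CassandraDemo('DemoKeyspace', 'employee')
demo.create_database_and_table()   # create the keyspace and column family via SystemManager
demo.create_connections()          # open the ConnectionPool and bind the ColumnFamily
demo.insert_data()
demo.get_data()
demo.get_count()
demo.get_multi_count()
demo.delete_data()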
Example #9
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

# connecting to Cassandra
pool = ConnectionPool('Keyspace1')

# getting a ColumnFamily
col_fam = ColumnFamily(pool, 'ColumnFamily1')

# inserting Data
col_fam.insert('row_key', {'col_name':'col_val', 'col_name2':'col_val2'})

# getting Data
col_fam.get('row_key')
# {'col_name': 'col_val', 'col_name2': 'col_val2'}

# counting
col_fam.get_count('row_key')
#!/usr/bin/env python
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
pool = ConnectionPool('Keyspace1',server_list=['localhost:9160'])
col_fam = ColumnFamily(pool, 'ColumnFamily1')
check1 = col_fam.insert('row_key', {'col_name': 'col_val'})
print check1
print " is the added row into a column\n"
check2 = col_fam.insert('row_key', {'col_name':'col_val', 'col_name2':'col_val2'})
print check2
print " Multiple columns are added \n"
get_data1 = col_fam.get('row_key')
print get_data1
get_data2 = col_fam.get('row_key', columns=['col_name', 'col_name2'])
print get_data2
print "\n"
print "Slicing\n"
for i in range(1, 10):
	col_fam.insert('row_key', {str(i): 'val'})
print col_fam.get('row_key', column_start='5', column_finish='7')
print "\n"
print "Counting rows: "
print col_fam.get_count('row_key')
Example #11
    dinvCost = float(e1['invCost']) - float(e2['invCost'])
    if (abs(dinvCost) > epson):
        fl.write("+invCost:" + str(dinvCost) + '.' + e1['invCost'] + ',' + e2['invCost'] + ':' + str(e1) + ',' + str(e2) + '\n')
        return
    dbillingInvoice = float(e1['billingInvoice']) - float(e2['billingInvoice'])
    if (abs(dbillingInvoice) > epson):
        fl.write("+billingInvoice:" + str(dbillingInvoice) + ',' + e1['billingInvoice'] + ',' + e2['billingInvoice'] + ':' + str(e1) + ',' + str(e2) + '\n')
        return
    c.write(str(e1) + '\n')

servers = ['pb036:9160', 'pb037:9160', 'pb038:9160']
pool = ConnectionPool('RSS', server_list = servers, timeout = 1, pool_size=20)
meta = ColumnFamily(pool, 'MetaData')
counter = ColumnFamily(pool, 'Counter')

oid_count = meta.get_count('rss.All')
print 'Total oids => ', oid_count

oids_gen = meta.xget('rss.All', column_reversed=True, include_timestamp=True)
oids = dict(oids_gen)
#oids = meta.get('rss.All', column_reversed=True)
#print oids

home = os.path.expanduser("~")
fi = open(home + "/rss/diff_cas.txt", "w")
fj = open(home + "/rss/diff_rss.txt", "w")
difi = open(home + "/rss/difi.txt", "w")
difl = open(home + "/rss/difl.txt", "w")

f = open(home + "/rss/good.txt", "w")
class DailyTemporalBloomFilter(object):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high value of expiration (like 60 days) with low requirement on precision.
    The actual error of this BF will the be native error of the BF + the error related
    to the coarse aspect of the expiration, since we no longer expires information precisely.
    Also, as opposed to a classic Bloom Filter, this one will aslo have false positive (reporting membership for a non-member)
    AND false negative (reporting non-membership for a member).

    The upper bound of the temporal_error can be theoricaly quite high. However, if the
    items of the set are uniformly distributed over time, the avg error will be something like 1.0 / expiration
    """

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path, cassandra_session):
        self.error_rate = error_rate
        self.capacity = capacity
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()
        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(np.ceil((self.capacity * abs(np.log(self.error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)

    def ensure_cassandra_cf(self):
        s = SystemManager()
        if self.keyspace not in s.list_keyspaces():
            s.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        if self.cassandra_columns_family not in s.get_keyspace_column_families(self.keyspace):
            s.create_column_family(self.keyspace, self.cassandra_columns_family)
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.name, current_period_hour), {k:'' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime
        (inclusive)."""

        def total_seconds(td):
            return (td.microseconds + (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def resize(self, new_capacity):
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        for hour in hours:
            try:
                row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                nbr_keys = self.columnfamily.get_count(row)
                keys = self.columnfamily.remove(row)
            except:
                pass

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        hours = self._hour_range(last_period, dt.datetime.now())
        rows = []
        for i,hour in enumerate(hours):
            row = "%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
            rows.append(row)
        rows_content = self.columnfamily.multiget(rows, column_count=1E6)

        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarray.

        This BF contain two bit arrays instead of single one like a plain BF. bitarray
        is the main bit array where all the historical items are stored. It's the one
        used for the membership query. The second one, current_day_bitarray is the one
        used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership."""
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old element of the set.

        Initialize a new bitarray and load the previous snapshot. Execute this guy
        at the beginining of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        self.warm_period =  (60 * 60 * 24) // (self.expiration-2)

    def _should_warm(self):
        return time.time() >= self.next_snapshot_load

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshot during the day.

        Loading all the snapshots at once can take a substantial amount of time. This method, if called
        periodically during the day, will progressively load those snapshots one by one. Because many workers
        are going to use this method at the same time, we add jitter to the period between loads to avoid
        hammering the disk at the same time.
        """
        if self.snapshot_to_load == None:
            last_period = self.current_period - dt.timedelta(days=self.expiration-1)
            self.compute_refresh_period()
            self.snapshot_to_load = []
            base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
            availables_snapshots = glob.glob(base_filename)
            for filename in availables_snapshots:
                snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
                if snapshot_period >= last_period:
                    self.snapshot_to_load.append(filename)
                    self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random()-0.5) * jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + jittering
            if not self.snapshot_to_load:
                self.ready = True


    def _union_bf_from_file(self, filename, current=False):
        snapshot = cPickle.loads(zlib.decompress(open(filename, 'rb').read()))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration-1)
        for filename in availables_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period <  last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name, self.expiration, self.date)
        with open(filename, 'wb') as f:
            f.write(zlib.compress(cPickle.dumps(self.current_day_bitarray, protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
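# A hedged lifecycle sketch for the class above. It assumes the module-level
# dependencies of the original code (bitarray, numpy as np, datetime as dt,
# cPickle, zlib, glob, math, os, time and generate_hashfunctions) are available,
# and passes a pycassa ConnectionPool for the 'parsely' keyspace as the session.
from pycassa.pool import ConnectionPool

pool = ConnectionPool('parsely', ['localhost:9160'])
bf = DailyTemporalBloomFilter(capacity=10**6, error_rate=0.01, expiration=60,
                              name='events', snapshot_path='/tmp', cassandra_session=pool)
bf.add('user:123')        # archives the key to Cassandra and sets bits in both bitarrays
print 'user:123' in bf    # membership query against the main bitarray
bf.save_snaphot()         # end of day: persist the current-day bitarray to /tmp/events_60_<date>.dat
bf.maintenance()          # start of the next day: reset the bitarrays and reload recent snapshots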
Example #14
    def get_count(self, *args, **kwargs):
        col_fam = ColumnFamily(self.pool, self.__column_family__)
        return col_fam.get_count(*args, **kwargs)
name_cf = ColumnFamily(pool, 'myname')

x = ['acharya1', 'acharya2']
name_cf.insert('sacharya3', {'last_name': x})
names3 = name_cf.get('sacharya3')
print "List as a value"
print names3
attrs = dict([(attr_name, set([attr_values])) for attr_name, attr_values in
              names3.iteritems()])
# set.add() mutates in place and returns None, so update the set first and then
# insert the updated value instead of inserting the result of the mutation
attrs['last_name'].add("acharya3")
name_cf.insert("sacharya3", {'last_name': attrs['last_name']})
print name_cf.get('sacharya3')

################################# COUNT #######################################
# Count the number of columns for the row key
count=author_cf.get_count("sacharya1")
print count 

count=author_cf.multiget_count(["sacharya1","sacharya2"])
print count
################################## REMOVE #####################################
# Remove the column for the row key and column key
print "Removing the column last_name for row key sacharya1"
author_cf.remove('sacharya1', columns=['last_name'])

time.sleep(5)

authors = author_cf.get('sacharya1')
print authors

# REMOVE the entire row