def consume(self, targets):
    print(targets)
    nm = NmapProcess(targets, options='-v -sn')
    rc = nm.run()
    try:
        parsed = NmapParser.parse(nm.stdout)
    except NmapParserException as e:
        # Bail out here; otherwise `parsed` would be undefined below.
        print("Exception raised while parsing scan: %s" % e.msg)
        return

    HOST_UP = 1
    HOST_DOWN = 0
    scans = Table('host_up', connection=self.dynamo)
    with scans.batch_write() as batch:
        for host in parsed.hosts:
            # Insert into database and delete from queue
            if host.status == 'down':
                status = HOST_DOWN
            elif host.status == 'up':
                status = HOST_UP
            else:
                status = -1
            batch.put_item(data={
                'ip': host.address,
                'status': status,
                'datetime': int(time.time())
            })
def main():
    sqs_cnx = boto.sqs.connect_to_region('us-east-1')
    queue = sqs_cnx.get_queue('reviewboard-slack-logs')
    queue.set_message_class(RawMessage)
    table = Table('reviewboard-slack-logs')

    while 1:
        messages = queue.get_messages(num_messages=10, wait_time_seconds=10)
        to_delete = []

        if messages:
            with table.batch_write() as batch:
                for message in messages:
                    body = message.get_body()
                    try:
                        data = json.loads(body)
                        attrs = {
                            key: value
                            for key, value in data.iteritems()
                            if value != ''
                        }
                        print data['timestamp']
                        batch.put_item(attrs)
                        to_delete.append(message)
                    except ValueError:
                        print '??? %r' % body

            sqs_cnx.delete_message_batch(queue, messages)
        else:
            break
def dynamo_main():
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    prop_appl_table = Table(dynamodb_prop_appl_table, connection=conn)
    table = Table(dynamodb_diag_rep_table, connection=conn)

    prop_appl_res = prop_appl_table.scan()
    prop_appl_ids = []
    for prop_appl in prop_appl_res:
        prop_appl_ids.append(dict(prop_appl.get_raw_keys())['propApplId']['S'])

    global next_id
    with table.batch_write() as batch:
        for i in range(0, 20):
            diag_rep = {}
            diag_rep['diagRepId'] = str(next_id)
            next_id += 1
            diag_rep['propApplId'] = prop_appl_ids[randint(0, len(prop_appl_ids) - 1)]
            diag_rep['managerId'] = manager_ids[randint(0, len(manager_ids) - 1)]
            diag_rep['organisations'] = ["1"]
            diag_rep['timestamp'] = int(time.time())
            diag_rep['description'] = "Description"
            batch.put_item(data=diag_rep)
def data():
    from boto import dynamodb2
    from boto.dynamodb2.table import Table

    TABLE_NAME = "issdata"
    REGION = "us-west-1"
    conn = dynamodb2.connect_to_region(
        REGION,
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )
    table = Table(TABLE_NAME, connection=conn)

    absolute_junk = {
        "favorite_color": "blue",
        "quest": "seek_holy_grail",
    }

    with table.batch_write() as table_batch:
        for example_counter in xrange(10):
            required_hash_data = {
                "user_id": 11,
                "timestamp": datetime_to_timestamp_ms(datetime.datetime.utcnow())
            }
            final_dynamo_data = dict(absolute_junk.items() + required_hash_data.items())
            table_batch.put_item(data=final_dynamo_data)
def do_insert():
    conn = boto.dynamodb2.connect_to_region(region)
    table = Table(table_name, connection=conn)

    cx = sqlite3.connect(name_db)
    cu = cx.cursor()
    count = 0
    cu.execute("select * from name")

    test_count = 1000
    while True:
        rets = cu.fetchmany(batch_count)
        if len(rets) <= 0:
            break
        with table.batch_write() as batch:
            for ret in rets:
                name = ret[0]
                dates = []
                while len(dates) < count_per_user:
                    date = generate_date()
                    if date in dates:
                        continue
                    dates.append(date)
                for date in dates:
                    score = generate_score()
                    batch.put_item(data={
                        u'name': name,
                        u'date': date,
                        u'score': score
                    })
                    count += 1

    with open(u'/tmp/insert_count', u'w') as f:
        f.write(u'%s\n' % unicode(count))
    with open(u'/tmp/insert_count', u'a') as f:
        f.write(u'done\n')
def dynamo_main():
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    diag_rep_table = Table(dynamodb_diag_rep_table, connection=conn)
    table = Table(dynamodb_main_org_table, connection=conn)

    diag_rep_res = diag_rep_table.scan()
    diag_rep_ids = []
    for diag_rep in diag_rep_res:
        diag_rep_ids.append(dict(diag_rep.get_raw_keys())['diagRepId']['S'])

    global next_id
    with table.batch_write() as batch:
        for i in range(0, 10):
            main_org = {}
            main_org['orgId'] = str(next_id)
            next_id += 1

            engineers = []
            engineers.append(engineer_ids[0])
            engineers.append(engineer_ids[1])
            main_org['engineers'] = engineers

            pending = []
            pending.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            pending.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            resp = []
            resp.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            resp.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            main_org['pendingDiagnosticReports'] = pending
            main_org['respondedDiagnosticReports'] = resp

            # Without this write the batch is a no-op; mirrors the other loaders.
            batch.put_item(data=main_org)
class DDBToBeSlurped(Dynamo):

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with
        test_slurps and test_failed_slurps tables
        """
        super(DDBToBeSlurped, self).__init__(access_key, secret)
        self.table = Table('to_be_slurped', connection=self.connection)

    def save_info(self, search_terms):
        """
        search_terms can either be in the form of a list of dicts or else a single dict.
        If slurp_info is a list, batch write will be used
        """
        if isinstance(search_terms, basestring):
            search_terms = [search_terms]
        # search_terms = {'searchterm': search_terms}
        search_terms = [{'searchterm': x} for x in search_terms]
        # print search_terms
        with self.table.batch_write() as batch:
            for s in search_terms:
                batch.put_item(data=s, overwrite=True)

    def get_table(self, table_name=None):
        """
        Convenience method for client who may wish to get a specific table
        from the DynamoDB connection
        """
        table_name = table_name or self.table.table_name
        return Table(table_name, connection=self.connection)

    def truncate_table(self):
        """ Delete whole table """
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write, self.table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.table)

    def get_table_as_df(self):
        return DataFrame([dict(r) for r in self.table.scan()])
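# Hypothetical usage sketch for DDBToBeSlurped.save_info (not part of the
# original source): the search terms are made-up values; passing a list of
# strings exercises the batch_write path shown above.
store = DDBToBeSlurped()
store.save_info(['san francisco giants', 'boston red sox'])
print store.get_table_as_df()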
def truncate_slurp_table(self):
    """ WARNING! Only use for test mode table """
    assert self.slurps_table.table_name == 'test_slurps', \
        "Will only truncate test slurps table. To truncate production table, run code manually"
    test_slurps_table = Table('test_slurps', connection=self.connection)
    with test_slurps_table.batch_write() as batch:
        for item in self.slurps_table.scan():
            batch.delete_item(searchterm=item['searchterm'])
def dynamo_main(appliances):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    table = Table(dynamodb_appliance_table, connection=conn)
    with table.batch_write() as batch:
        for appliance in appliances:
            batch.put_item(data=appliance)
def save(self, items, overwrite=None):
    """
    Save models to dynamo

    Parameters
    ----------
    items : list or :class:`~flywheel.models.Model`
    overwrite : bool, optional
        If False, raise exception if item already exists (default set by
        :attr:`.default_conflict`)

    Raises
    ------
    exc : :class:`boto.dynamodb2.exceptions.ConditionalCheckFailedException`
        If overwrite is False and an item already exists in the database

    Notes
    -----
    Overwrite will replace the *entire* item with the new one, not just
    different fields. After calling save(overwrite=True) you are guaranteed
    that the item in the database is exactly the item you saved.

    Due to the structure of the AWS API, saving with overwrite=True is much
    faster because the requests can be batched.
    """
    if overwrite is None:
        overwrite = self.default_conflict in ('update', 'overwrite')
    if isinstance(items, Model):
        items = [items]
    if not items:
        return
    tables = defaultdict(list)
    for item in items:
        tables[item.meta_.ddb_tablename].append(item)
    for tablename, items in tables.iteritems():
        table = Table(tablename, connection=self.dynamo)
        if overwrite:
            with table.batch_write() as batch:
                for item in items:
                    item.pre_save_(self)
                    batch.put_item(data=item.ddb_dump_())
                    item.post_save_()
        else:
            for item in items:
                expected = {}
                for name in item.meta_.fields:
                    expected[name] = {
                        'Exists': False,
                    }
                item.pre_save_(self)
                boto_item = Item(table, data=item.ddb_dump_())
                self.dynamo.put_item(tablename, boto_item.prepare_full(),
                                     expected=expected)
                item.post_save_()
def delete(self, items, raise_on_conflict=None):
    """
    Delete items from dynamo

    Parameters
    ----------
    items : list or :class:`~flywheel.model.Model`
        List of :class:`~flywheel.models.Model` objects to delete
    raise_on_conflict : bool, optional
        If True, raise exception if the object was changed concurrently in
        the database (default set by :attr:`.default_conflict`)

    Raises
    ------
    exc : :class:`boto.dynamodb2.exceptions.ConditionalCheckFailedException`
        If raise_on_conflict is True and an item was changed concurrently in
        the database

    Notes
    -----
    Due to the structure of the AWS API, deleting with
    raise_on_conflict=False is much faster because the requests can be
    batched.
    """
    if raise_on_conflict is None:
        raise_on_conflict = self.default_conflict == 'raise'
    if isinstance(items, Model):
        items = [items]
    if not items:
        return
    tables = defaultdict(list)
    for item in items:
        tables[item.meta_.ddb_tablename].append(item)
    count = 0
    for tablename, items in tables.iteritems():
        if raise_on_conflict:
            for item in items:
                expected = item.construct_ddb_expects_()
                count += 1
                self.dynamo.delete_item(tablename, item.pk_dict_,
                                        expected=expected)
        else:
            table = Table(tablename, connection=self.dynamo)
            with table.batch_write() as batch:
                for item in items:
                    if isinstance(item, Model):
                        keys = item.pk_dict_
                    else:
                        keys = dict(item)
                    count += 1
                    batch.delete_item(**keys)
    return count
def dynamo_main(appliance_statuses, stat_icon):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    table = Table(dynamodb_appliance_status_table, connection=conn)

    global next_id
    with table.batch_write() as batch:
        for appliance_stat in appliance_statuses:
            appliance_stat['statusId'] = str(next_id)
            appliance_stat['icon'] = stat_icon
            next_id += 1
            batch.put_item(data=appliance_stat)
def add_sample_schedules():
    try:
        tb_schedules = Table('bus_n_stops', connection=cm.db)
        with tb_schedules.batch_write() as batch:
            batch.put_item(data={
                'id': '7', 's_id': '2', 'stop_id': 'alpy',
                'time': get_xhd_from_time(hour=12, minute=0), 'cnt': 5,
            })
            batch.put_item(data={
                'id': '8', 's_id': '2', 'stop_id': 'kykm',
                'time': get_xhd_from_time(hour=12, minute=15), 'cnt': 5,
            })
            batch.put_item(data={
                'id': '9', 's_id': '2', 'stop_id': 'ochr',
                'time': get_xhd_from_time(hour=12, minute=30), 'cnt': 5,
            })
            batch.put_item(data={
                'id': '10', 's_id': '2', 'stop_id': 'vlkv',
                'time': get_xhd_from_time(hour=12, minute=45), 'cnt': 5,
            })
            batch.put_item(data={
                'id': '11', 's_id': '2', 'stop_id': 'kpy',
                'time': get_xhd_from_time(hour=13, minute=0), 'cnt': 5,
            })
            batch.put_item(data={
                'id': '12', 's_id': '2', 'stop_id': 'klm',
                'time': get_xhd_from_time(hour=13, minute=15), 'cnt': 5,
            })
        return message_helper.success()
    except Exception as e:
        return message_helper.error(str(e))
class DDBKids(Dynamo):

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """ Use this for getting an instance of this class that uses test tables. """
        instance = cls(access_key, secret)
        instance.table = Table('test_kids', connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        super(DDBKids, self).__init__(access_key, secret)
        self.table = Table('kids', connection=self.connection)

    def set_max_kid(self, account_name, kid):
        """
        Set the max kid used for an account
        :param account_name:
        :param kid: int >= 0.
        """
        return self.table.put_item({'account_name': account_name, 'kid': kid},
                                   overwrite=True)

    def get_max_kid(self, account_name):
        """
        Get the max kid already used for an account.
        If the account does not exist, create it in the db with a KID of 0.
        :param account_name:
        """
        res = self.table.get_item(account_name=account_name)
        if res['kid']:
            return int(res['kid'])
        else:
            self.logger.warn("Creating a new (max) KID entry for account {} "
                             "because it did not yet exist in ddb_kids".format(account_name))
            self.set_max_kid(account_name, 0)
            return 0

    def modify_throughput(self, requested_read, requested_write, table=None):
        table = table or self.table
        return super(DDBKids, self).modify_throughput(requested_read, requested_write, table)

    def truncate_table(self):
        """ WARNING! Only use for test mode table """
        assert self.table.table_name == 'test_kids', \
            "Will only truncate test table. To truncate production table, run code manually"
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                # Items in the kids tables are keyed on account_name, not dt.
                batch.delete_item(account_name=item['account_name'])
class CorgiCache:

    def __init__(self):
        self.dynamo = boto.dynamodb2.connect_to_region(DYNAMO_REGION)
        self.feeds = Table("Feeds", connection=self.dynamo)
        self.tokens = Table("Tokens", connection=self.dynamo)

    def feed_id_exists(self, feed_id):
        items = list(self.feeds.query_2(ID__eq=feed_id))
        return len(items) > 0

    def put_feed(self, data):
        if "ID" not in data or "URL" not in data:
            logging.debug("invalid item, {0}".format(data))
            raise ValueError
        self.feeds.put_item(data=data)

    def put_feed_batch(self, data):
        with self.feeds.batch_write() as batch:
            for item in data:
                if "ID" not in item or "URL" not in item:
                    logging.debug("invalid item, {0}".format(item))
                    raise ValueError
                batch.put_item(data=item)

    def get_all_feeds(self):
        return self.feeds.scan()

    def get_token(self, use):
        items = list(self.tokens.query_2(USE__eq=use))
        if len(items) > 1:
            raise ValueError
        item = items[0]
        return item

    def update_token(self, token):
        if "USE" not in token or "TOKEN" not in token or "REFRESH_TOKEN" not in token:
            logging.debug("invalid token, {0}".format(token))
            raise ValueError
        self.tokens.put_item(data=token)
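# Hypothetical usage sketch for CorgiCache.put_feed_batch (not part of the
# original source): the feed IDs and URLs are made up; the required "ID"/"URL"
# keys come from the validation in the class above.
cache = CorgiCache()
cache.put_feed_batch([
    {"ID": "feed-1", "URL": "http://example.com/rss.xml"},
    {"ID": "feed-2", "URL": "http://example.com/atom.xml"},
])
for feed in cache.get_all_feeds():
    print feed["URL"]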
def delete_key(self, model, pkeys=None, **kwargs):
    """
    Delete one or more items from dynamo as specified by primary keys

    Parameters
    ----------
    model : :class:`~flywheel.models.Model`
    pkeys : list, optional
        List of primary key dicts
    **kwargs : dict
        If pkeys is None, delete only a single item and use kwargs as the
        primary key dict

    Returns
    -------
    count : int
        The number of deleted items

    Notes
    -----
    If the model being deleted has no range key, you may use strings instead
    of primary key dicts. ex:

    .. code-block:: python

        >>> class Item(Model):
        ...     id = Field(hash_key=True)
        ...
        >>> items = engine.delete_key(Item, ['abc', 'def', '123', '456'])

    """
    if pkeys is not None:
        keys = pkeys
    else:
        keys = [kwargs]
    count = 0
    table = Table(model.meta_.ddb_tablename, connection=self.dynamo)
    with table.batch_write() as batch:
        for key in keys:
            pkey = model.meta_.pk_dict(scope=key)
            batch.delete_item(**pkey)
            count += 1
    return count
def insertData():
    conn = boto.dynamodb.connect_to_region('us-east-1')
    try:
        tdescr = conn.describe_table('consumer_complaint')
        consumer_complaint = Table('consumer_complaint')
    except:
        consumer_complaint = createTable()

    try:
        isTableActive = 'false'
        while isTableActive == 'false':
            tdescr = conn.describe_table('consumer_complaint')
            if tdescr['Table']['TableStatus'] == 'ACTIVE':
                # consumer_complaint = Table('consumer_complaint')
                start_time = time.time()
                reader = downloadData()
                i = 0
                for row in reader:
                    if i == 0:
                        # skip the CSV header row
                        i = i + 1
                    else:
                        with consumer_complaint.batch_write() as batch:
                            batch.put_item(data={
                                'Complaint_ID': row[0],
                                'Product': row[1],
                                'Sub-product': row[2],
                                'Issue': row[3],
                                'State': row[4],
                                'ZIP_code': row[5],
                                'Company': row[6],
                                'Company_response': row[7],
                                'Timely_response?': row[8],
                                'Consumer_disputed': row[9],
                            })
                print("--- Time %s in seconds for Insert Query ---" % (time.time() - start_time))
                # Renamed from `time` so the time module is not shadowed.
                elapsed = time.time() - start_time
                isTableActive = 'true'
    except:
        consumer_complaint.delete()
        return render_template('form_submit.html', tableStatus='false')

    # consumer_complaint.delete()
    return render_template('form_submit.html', tableStatus='true')
def updateFromSensor(self, listab):
    tr = Table("APPosto_posti")  # to be corrected
    with tr.batch_write() as batch:
        for item in listab:
            batch.put_item(data={
                'idposto': int(item[0]),
                'extra': item[1],
                'latitudine': item[2],
                'longitudine': item[3],
                'stato': item[1]})
            if self.cache == True:
                dictio = {'idposto': int(item[0]),
                          'extra': item[1],
                          'latitudine': item[2],
                          'longitudine': item[3],
                          'stato': item[1]}
                self.cacheClient.setValue(str(item[0]), dictio, time=self.cexpire)
                dictio = {}
def dynamo_main(property_appliances):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    property_table = Table(dynamodb_property_table, connection=conn)
    appliance_table = Table(dynamodb_appliance_table, connection=conn)
    status_table = Table(dynamodb_appl_status_table, connection=conn)
    table = Table(dynamodb_prop_appl_table, connection=conn)

    properties_res = property_table.scan()
    appliance_res = appliance_table.scan()
    statuses_res = status_table.scan()

    properties = []
    appliances = []
    statuses = []
    for prop in properties_res:
        properties.append(dict(prop.get_raw_keys())['propId']['S'].encode('ascii'))
    for appl in appliance_res:
        appliances.append(dict(appl.get_raw_keys())['applId']['S'])
    for status in statuses_res:
        statuses.append(dict(status.get_raw_keys())['statusId']['S'])

    global next_id
    with table.batch_write() as batch:
        for property_appliance in property_appliances:
            # Insert ids here
            property_appliance['propApplId'] = str(next_id)
            next_id += 1
            property_appliance['propertyId'] = properties[randint(0, len(properties) - 1)]
            property_appliance['applianceId'] = str(appliances[randint(0, len(appliances) - 1)])
            property_appliance['statusId'] = str(statuses[randint(0, len(statuses) - 1)])
            property_appliance['statusHistory'] = [
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime': 1487611362},
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime': 1487611362},
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime': 1487611362}
            ]
            batch.put_item(data=property_appliance)
class DynamoTable(object): conn = None table = None table_name = 'test-table' hash_key = 'hash_key' range_key = 'range_key' indexes = [] read_units = 10 write_units = 10 counters = {'reads':0,'writes':0,'delete':0,'batch_w':0} def __init__(self, table_name, hash_key, range_key, indexes, read_units=10, write_units=10 ): self.table_name = table_name self.hash_key = hash_key self.range_key = range_key self.indexes = indexes self.read_units = read_units self.write_units = write_units try: self.connect() self.setup() except: logger.warn('Unable to connect or handle DynamoDB Table') traceback.print_exc() def connect(self): # create initial database tables self.conn = boto.dynamodb2.connect_to_region( settings.AWS_DYNAMODB_REGION, aws_access_key_id=settings.AWS_DYNAMODB_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_DYNAMODB_SECRET_ACCESS_KEY ) def setup(self): ''' Set's up the table schema if table does not exists yet Return the Table ''' try: self.table = Table.create(self.table_name, connection=self.conn, schema=[ HashKey(self.hash_key), RangeKey(self.range_key), ], throughput={'read':self.read_units,'write':self.write_units}) logger.warning('Created new DynamoDB Table') except: self.table = Table(self.table_name, connection=self.conn, schema=[ HashKey(self.hash_key), RangeKey(self.range_key), ], throughput={'read':self.read_units,'write':self.write_units}) return self.table def put(self, hash_key, range_key, data): ''' puts the data to the table if key/range_key exists ''' if settings.DEBUG: bench_start = time() data[self.hash_key] = hash_key data[self.range_key] = range_key item = self.table.put_item( data=data, overwrite=True ) if settings.DEBUG: if not hash_key in self.counters: self.counters[hash_key] = {'reads':0,'writes':0} self.counters[hash_key]['writes'] +=1 self.counters['writes'] +=1 elapsed_time = time() - bench_start logger.info(data) logger.info("R%sW%s - write %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], elapsed_time)) return item def get_latest(self, hash_key ): ''' retreive the last recorded data hash_key item for the hash key ''' if settings.DEBUG: bench_start = time() kwargs = {} kwargs[self.hash_key+'__eq'] = hash_key kwargs['limit'] = 1 items = self.table.query( **kwargs ) if items: data = {} for item in items: for key in item.keys(): if not key in (self.hash_key, self.range_key): data[key] = item[key] else: return None if not len(data): return None if settings.DEBUG: if not hash_key in self.counters: self.counters[hash_key] = {'reads':0,'writes':0} self.counters[hash_key]['reads'] +=1 self.counters['reads'] +=1 elapsed_time = time() - bench_start logger.info("R%sW%s - %s - read %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], hash_key, elapsed_time)) return data def get_range_obj(self, hash_key): if settings.DEBUG: bench_start = time() kwargs = {} kwargs[self.hash_key+'__eq'] = hash_key # TODO - use batch_get items = self.table.query( **kwargs ) self.counters['reads'] +=1 data = {} for item in items: rkey_data = {} rkey = item[self.range_key] if rkey == 'index': data = json.loads(item['value']) break else: for key in item.keys(): if key != None and not key in (self.hash_key, self.range_key) and key != 'index': if key == 'value': value = item[key] try: rkey_data = json.loads(str(value)) except: rkey_data = value #else: # rkey_data[key] = item[key] data[rkey] = rkey_data if settings.DEBUG: if not hash_key in self.counters: self.counters[hash_key] = {'reads':0,'writes':0} 
self.counters[hash_key]['reads'] +=1 self.counters['reads'] +=1 elapsed_time = time() - bench_start #logger.info(data) logger.info("R%sW%s - %s - read %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], hash_key, elapsed_time)) return data def set_range_obj(self, hash_key, data, range_keys=None): # avoid crashing on attempt to write None data if data == None: return if range_keys == None: range_keys = data.keys() # TODO # add better size estimate datablocks = 0 for range_key in data.keys(): try: len_size = len( data[range_key] ) except: len_size = 1 datablocks += len_size # update date in msecs since epoch update_date = time() if datablocks > 1000: #print hash_key, #print datablocks # split over multiple items by data dict key with self.table.batch_write() as batch: for range_key in range_keys: value = json.dumps( data[range_key] ) batch_data = {} batch_data[self.hash_key] = hash_key batch_data[self.range_key] = range_key batch_data['value'] = value batch_data['update_date'] = update_date batch.put_item(data=batch_data) self.counters['batch_w'] +=1 # delete index if exists self.remove_range_obj(hash_key, range_keys=['index']) else: value = json.dumps(data) batch_data = {} batch_data[self.hash_key] = hash_key batch_data[self.range_key] = 'index' batch_data['value'] = value batch_data['update_date'] = update_date self.table.put_item(data=batch_data, overwrite=True) self.counters['writes'] +=1 return True def remove_range_obj(self, hash_key, range_keys=None): ''' deletes ranged object or specific range_keys ''' # get range object if range_keys == None: data = self.get_range_obj(hash_key) range_keys = data.keys() # remove possible index try: kwargs = {} kwargs[self.hash_key] = hash_key kwargs[self.range_key] = 'index' self.table.delete_item( **kwargs ) except: pass with self.table.batch_write() as batch: for range_key in range_keys: kwargs = {} kwargs[self.hash_key] = hash_key kwargs[self.range_key] = range_key batch.delete_item( **kwargs ) self.counters['delete'] +=1 return True
def main(argv):
    # load and transform data
    keywords = load_dict(dictionary_file)
    index = load_index(date_file)

    try:
        opts, args = getopt.getopt(argv, "td")
    except getopt.GetoptError:
        sys.exit(2)

    for opt, arg in opts:
        # load tables
        if opt == '-t':
            words_date, occurrences_by_words, occurrences_by_date = \
                process_index(linear_matrix_file, index, keywords)
            table = Table(dynamoDB_table)
            with table.batch_write() as batch:
                for word in tqdm(words_date.keys(), desc='Upload to dynamoDB', leave=True):
                    output = {}
                    data = words_date[word]
                    occurrences = []
                    dates_size = []
                    dates = []
                    for key, value in data.iteritems():
                        occurrences.append(value)
                        dates.append(key)
                        dates_size.append(occurrences_by_date[key])
                    zipped = zip(dates, occurrences, dates_size)
                    zipped.sort()
                    dates, occurrences, dates_size = zip(*zipped)
                    output['word'] = word
                    output['source'] = linear_matrix_file
                    output['occurrences'] = list(occurrences)
                    output['dates_size'] = list(dates_size)
                    output['dates'] = list(dates)
                    output['occurrences_size'] = occurrences_by_words[word]
                    batch.put_item(data=output)
        # send to database
        elif opt == '-d':
            con = lite.connect(database)
            cur = con.cursor()
            # write correspondence to db
            cur.execute("drop table if exists words_stats")
            # create tables
            cur.execute("create table words_stats(stamp date, word_id int, nb int)")
            # export to db
            lst = list()
            words_stats = process_index_doc(linear_matrix_file, index)
            for date in tqdm(words_stats.keys(), leave=True):
                stats = words_stats[date]
                for word_id in stats.keys():
                    nb_doc = stats[word_id]
                    if nb_doc > 10:
                        # otherwise, too big and doesn't bring a lot of information
                        lst.append((date, int(word_id), nb_doc))
                    if len(lst) > 50000:
                        cur.executemany("insert into words_stats values (?,?,?)", lst)
                        lst = list()
            cur.executemany("insert into words_stats values (?,?,?)", lst)
            con.commit()
class DynamoDBUtils(object): # <TEST ONLY> Mapping of Customer & Pie/Cam cust_pie_dict = { "cid1" : ["cam1","cam2"], "cid2" : ["cam1","cam2"] } FACES = { 'videos/video_100_frames_1.mp4': { 'face_count': 59, 'face_count_dtl': ['0', '1', '8', '12', '12', '11', '10', '4', '1', '0'], 'face_count_uniq': 3, 'face_count_uniq_dtl': ['0', '1', '0', '0', '0', '1', '0', '0', '1', '0'], 'frame_count': 100, 'time_taken': '0:00:04.731971' }, 'videos/video_100_frames_2.mp4': {'face_count': 62, 'face_count_dtl': ['10', '10', '0', '0', '0', '9', '5', '8', '10', '10'], 'face_count_uniq': 2, 'face_count_uniq_dtl': ['1', '0', '0', '0', '0', '1', '0', '0', '0', '0'], 'frame_count': 100, 'time_taken': '0:00:04.955812' } } rasp_names = ["kitchen", "garage"] cols = ['START_TIME','LEN','PROCESSED','S3_BUCKET','S3_KEY','VERSION'] S3_BUCKET = 'smart-cam' def __init__(self): cfg = Config() aws_access_key_id = cfg.get("aws", "access_key_id") aws_secret_access_key = cfg.get("aws", "secret_access_key") self.conn = boto.dynamodb2.connect_to_region('us-west-1', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) self.sc = Table('SMARTCAM', connection=self.conn) logger.info(self.conn.list_tables()) pprint.pprint(self.conn.describe_table('SMARTCAM')) # <TEST ONLY> Creates one item in table def create_items(self, num_items=2): cnt = 0 for rasp_name in DynamoDBUtils.rasp_names: for i in xrange(num_items): cnt += 1 self.__create_item(rasp_name, cnt) time.sleep(num_items) # <TEST ONLY> Creates one item in table def __create_item(self, rasp_name, num): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = time.time() data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET data['S3_KEY'] = 'videos/video_{0}.avi'.format(num) data['PROCESSED'] = 0 data['CLASSIFIED'] = 0 data['VERSION'] = 0 logger.info("# Uploading Data for {0}: {1}".format(rasp_name, num)) self.sc.put_item(data) # <TEST ONLY> Creates multiple full items in table def create_full_items(self, num_items=10, start_time=1459555200): cnt = 0 with self.sc.batch_write() as batch: for rasp_name in DynamoDBUtils.rasp_names: st = start_time for i in xrange(num_items): cnt += 1 if cnt % 2 == 0: batch.put_item(self.__create_full_item(rasp_name, st, 'videos/video_100_frames_1.mp4')) else: batch.put_item(self.__create_full_item(rasp_name, st, 'videos/video_100_frames_2.mp4')) st += 11.25 # 10 + 1.25 secs between 2 video files # <TEST ONLY> Creates multiple full items in table # All Hard code values for purpose of testing the Backend/UI Integration def __create_full_item(self, rasp_name, start_time, s3_key): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = start_time data['UPDATE_TIME'] = start_time + 5 data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET data['S3_KEY'] = s3_key data['FRAME_COUNT'] = DynamoDBUtils.FACES[s3_key]['frame_count'] data['FACE_COUNT'] = DynamoDBUtils.FACES[s3_key]['face_count'] data['FACE_COUNT_UNIQ'] = DynamoDBUtils.FACES[s3_key]['face_count_uniq'] # Face Counts / Detail d = {} #d['data'] = ['5','5','5','5','5','5','5','5','5','6'] data['FACE_COUNT_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_dtl'] d = {} #d['data'] = ['0','0','0','0','0','0','0','0','0','1'] data['FACE_COUN_UNIQ_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_uniq_dtl'] d = {} d['data'] = ['0.1','0.1','0.1','0.05','0.05','0.15','0.001','0.05','0.01','0.01'] data['FOREGROUND'] = d data['PROCESSED'] = 1 data['VERSION'] = 1 logger.info("# Uploading Data for {0}: {1}".format(rasp_name, start_time)) # Converted to a Batch Write #self.sc.put_item(data) 
return data # Creates one item in table def create_item(self, rasp_name, s3_bucket, s3_key, s_time): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = s_time data['S3_BUCKET'] = s3_bucket data['S3_KEY'] = s3_key data['PROCESSED'] = 0 data['CLASSIFIED'] = 0 data['VERSION'] = 0 data['LEN'] = randint(5, 60) logger.info("# Uploading Data for {0}: {1}:{2}".format(rasp_name, s3_bucket, s3_key)) self.sc.put_item(data) # Fetch items def display_items(self): rows = self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=0) cnt = 0 for row in rows: logger.info('{0},{1},{2}'.format(row['RASP_NAME'],row['START_TIME'],row['PROCESSED'])) cnt += 1 logger.info('# Total unprocessed items: {0}'.format(cnt)) return rows def purge_table(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row.delete() logger.info('Deleted Row: {0}'.format(cnt)) def delete_by_id(self,id): cnt = 0 for row in self.get_items_by_id(id): cnt += 1 row.delete() logger.info('Deleted Row: {0}'.format(cnt)) def reset_processed(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['PROCESSED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def reset_classified(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['CLASSIFIED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def add_classified(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['CLASSIFIED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def get_unprocessed_items(self): return self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=0) def get_processed_items(self): return self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=1) def get_unclassified_items(self): return self.sc.query_2(index='CLASSIFIED-index',CLASSIFIED__eq=0) def get_classified_items(self): return self.sc.query_2(index='CLASSIFIED-index',CLASSIFIED__eq=1) def get_items_by_id(self, id): return self.sc.query_2(RASP_NAME__eq=id) def get_items_by_id_range(self, id, start, end): return self.sc.query_2(RASP_NAME__eq=id, START_TIME__between=(start, end)) def update(self, row): try: row.save(overwrite=True) except Exception as e: logger.error(e) logger.info('[FAILED] Processing: ', row['RASP_NAME'],row['START_TIME'],row['PROCESSED']) def stats(self,lst): quotient, remainder = divmod(len(lst), 2) if remainder: return sorted(lst)[quotient] return sum(lst) / len(lst), sum(sorted(lst)[quotient - 1:quotient + 1]) / 2 def close(self): self.conn.close()
class S3mper: """ S3mper is a metastore library used to provide a layer of consistency on top of S3 by using dynamodb to record what files should be in the S3 listing. See go/s3mper for more information. """ def __init__(self, disabled=False, fail_on_error=False, table_name="ConsistentListingMetastoreTest"): self.disabled = disabled self.disable_alerts = False self.fail_on_error = fail_on_error if self.disabled: logger.warning("S3mper Explicitly Disabled") return self.db = Table(table_name) def add(self, paths): """ Adds a list of Paths to the file metastore and returns True on success. Example: s.add([path1, path2]) -> True """ if self.disabled: return epoch = self.__time_now() paths = self.__as_paths(paths) with self.db.batch_write() as batch: for path in paths: batch.put_item(data={"path": path.parent().normalize(), "file": path.filename(), "epoch": epoch}) def list(self, path, include_delete_marked=False): """ Lists the given directory in the metastore. The passed in path must be a directory. Example: s.list(path) -> [] """ if self.disabled: return if isinstance(path, basestring): path = Path(path) listing = self.db.query(path__eq=path.normalize(), consistent=True) paths = [] for e in listing: if (not include_delete_marked) and "deleted" in e: continue paths.append(Path("s3n:" + e["path"] + "/" + e["file"])) return paths def checked_listing(self, s3_listing, path): """ Checks the s3_listing against the metastore listing. All attempts are made to use the boto generator for listing if a check isn't necessary, but if a check must be made the whole listing for both the metastore and s3 listing need to be pulled into memory. """ if self.disabled: return s3_listing expected = set([p.url for p in self.list(path)]) if not expected: return s3_listing # This isn't ideal since we are sucking in the whole listing # to perform the check, but if we check on-the-fly, processing # could be partially complete before inconsistency is detected s3_listing = list(s3_listing()) for p in s3_listing: expected.discard(p if not isinstance(p, Key) else "s3://%s/%s" % (p.bucket, p.name)) if not expected: return s3_listing else: logger.error( "Failed consistency check. Missing file count %d. Missing paths: %s" % (len(expected), expected) ) self.__send_alert(expected) if self.fail_on_error: raise S3ConsistencyException(expected) def delete(self, paths, delete_marker=False): """ Deletes the provided paths from the metastore. Completly removing files from the metastore can cause problems because the s3 listing may show the files even though the data may not be available. This will cause MR jobs to fail. The delete marker can be used to hide files from the listing. 
Example: s.delete([path1, path2]) -> True """ if self.disabled: return paths = self.__as_paths(paths) if delete_marker: for path in paths: item = self.db.get_item(path=path.parent().normalize(), file=path.filename()) item["deleted"] = "true" else: with self.db.batch_write() as batch: for path in paths: batch.delete_item(path=path.parent().normalize(), file=path.filename()) def __send_alert(self, paths, detail={}): if self.disable_alerts: return try: body = { "truncated": detail.get("truncated", False), "paths": paths if len(paths) <= 10 else paths[0:9], "recovered": detail.get("recovered", False), "missingFiles": len(paths), "stackTrace": traceback.extract_stack(), "timestamp": "%s" % datetime.utcnow(), "queryId": detail.get("", None), "taskId": detail.get("", None), "hostname": platform.node(), "username": getpass.getuser(), "queryType": "DSE Platform Lib", "jobId": detail.get("jobId", None), "attemptId": detail.get("attemptId", None), "email": detail.get("email", None), "dataovenId": detail.get("dataovenId", None), "logFile": detail.get("logFile", None), "inputFile": detail.get("inputFile", None), "genieId": detail.get("genieId", None), "epoch": self.__time_now(), } message = RawMessage() message.set_body(body) conn = sqs.connect_to_region("us-east-1") queue = conn.get_queue("s3mper-alert-queue") queue.write(message) except Exception as e: print e def __as_paths(self, paths): if isinstance(paths, basestring): return [Path(paths)] elif isinstance(paths, Path): return [paths] else: return paths def __time_now(self): """ Returns current time in milliseconds. """ return int(time.time())
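# Hypothetical usage sketch for S3mper (not part of the original source): the
# bucket and key names are made up. Per the docstrings above, add()/delete()
# take Path objects (or a single path string), while list() accepts a
# directory path.
s = S3mper()
s.add([Path('s3n://example-bucket/data/part-00000'),
       Path('s3n://example-bucket/data/part-00001')])
print s.list('s3n://example-bucket/data')
s.delete([Path('s3n://example-bucket/data/part-00000')])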
class DynamoDBAdapter(key_value_store.KeyValueStore): """ Implementation of an abstract key-value store defined in key_value_store.py. The underlying database is amazon DynamoDB. The store keeps all objects in a single table with following schema: [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string with the object type ('vector', 'set' or 'int') and 'id' is the object id. The object value is stored in the 'value' attribute of the table items. The table should be created before this code is executed. Amazon configuration is assumed to be stored in ~/.boto file as described in http://boto.readthedocs.org/en/latest/boto_config_tut.html """ def __init__(self, precision=np.dtype('float32'), table_name='test'): """ Create an instance of the dynamodb key-value store. precision - a numpy type, elements of all vectors are converted and stored in this type; table_name - the name of the DynamoDB table which keeps the objects. """ conn = boto.dynamodb2.connect_to_region('eu-west-1') if not isinstance(precision, np.dtype): raise TypeError("Precision should be a numpy.dtype subtype") self.precision = precision self.precision_name = precision.name self.table = Table(table_name, connection=conn) def _get_or_create_item(self, kind, item_id): try: item = self.table.get_item(kind=kind, id=item_id) except ItemNotFound: item = Item(self.table) item['kind'] = kind item['id'] = item_id return item def _create_vector_item(self, vec_id, vector): item = self._get_or_create_item('vector', vec_id) item['value'] = Binary(vector.astype(self.precision).tostring()) item['precision'] = self.precision_name return item def _vector_value(self, item): return np.fromstring(str(item['value']), np.dtype(item['precision'])) def get_vector_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='vector')] def get_int_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='int')] def get_set_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='set')] def store_vector(self, vec_id, vector): item = self._create_vector_item(vec_id, vector) item.save() def get_vector(self, vec_id): try: item = self.table.get_item(kind='vector', id=vec_id) except ItemNotFound: raise KeyError('Vector key %s is unknown' % (vec_id, )) return self._vector_value(item) def bulk_get_vector(self, vec_ids): keys = [{'kind': 'vector', 'id': i} for i in vec_ids] vs = self.table.batch_get(keys=keys) return [self._vector_value(i) for i in vs] def remove_vector(self, vec_id): try: item = self.table.get_item(kind='vector', id=vec_id) except ItemNotFound: raise KeyError('Vector key %s is unknown' % (vec_id, )) item.delete() def add_to_set(self, set_id, element_id): item = self._get_or_create_item('set', set_id) if 'value' not in item.keys() or not isinstance(item['value'], set): item['value'] = set() item['value'].add(element_id) item.save(overwrite=True) def remove_from_set(self, set_id, element_id): try: item = self.table.get_item(kind='set', id=set_id) except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id, )) if 'value' not in item.keys() or not isinstance(item['value'], set): raise KeyError('Incorrect value in item %s' % (set_id, )) if element_id not in item['value']: raise KeyError('Element %s not in set %s' % (element_id, set_id)) item['value'].remove(element_id) item.save() def remove_set(self, set_id): try: item = self.table.get_item(kind='set', id=set_id) item.delete() except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id, )) def get_set(self, set_id): try: the_set = 
self.table.get_item(kind='set', id=set_id)['value'] return set([str(entry) for entry in the_set]) except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id, )) def store_int(self, int_id, integer): item = self._get_or_create_item('int', int_id) item['value'] = integer item.save() def get_int(self, int_id): try: return int(self.table.get_item(kind='int', id=int_id)['value']) except ItemNotFound: raise KeyError('Int key %s is unknown' % (int_id, )) def remove_int(self, int_id): try: item = self.table.get_item(kind='int', id=int_id) except ItemNotFound: raise KeyError('Int key %s is unknown' % (int_id, )) item.delete() def _aggregate_set_id_element_pairs(self, setpairs): """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are also distinct.""" set_ids = set([entry[0] for entry in setpairs]) listlist = [[entry for entry in setpairs if entry[0] == set_id] for set_id in set_ids] result = [(pairlist[0][0], set([entry[1] for entry in pairlist])) for pairlist in listlist] return result def bulk_store_vector(self, vec_ids, vectors): if len(vec_ids) != len(vectors): raise ValueError vecpairs = zip(vec_ids, vectors) with self.table.batch_write() as batch: for vec_id, vec in vecpairs: item = self._create_vector_item(vec_id, vec) batch.put_item(item) def bulk_store_vector_old(self, vectors_df): """Argument 'vectors' is a dataframe with index vector ids.""" if len(vec_ids) != len(vectors): raise ValueError with self.table.batch_write() as batch: for ind in vectors_df.index: vec_id = str(ind) vec = vectors_df.loc[ind].values item = self._create_vector_item(vec_id, vec) batch.put_item(item) def bulk_store_int(self, int_ids, integers): """Argument 'intpairs' is a list of pairs of the form (int_id, integer).""" if len(int_ids) != len(integers): raise ValueError intpairs = zip(int_ids, integers) with self.table.batch_write() as batch: for pair in intpairs: int_id, integer = pair item = self._get_or_create_item('int', int_id) item['value'] = integer batch.put_item(item) def bulk_add_to_set(self, set_ids, element_ids): """batch_write() objects if the same item is written to more than once per batch, hence we aggregate all (set_id, element_id) pairs into a list of pairs (set_id, element_ids), where the 'set_id's are pairwise distinct, and the 'element_ids' are sets.""" if len(set_ids) != len(element_ids): raise ValueError setpairs = zip(set_ids, element_ids) setlist = self._aggregate_set_id_element_pairs(setpairs) with self.table.batch_write() as batch: for pair in setlist: set_id, element_ids = pair item = self._get_or_create_item('set', set_id) if 'value' not in item.keys() or not isinstance( item['value'], set): item['value'] = set() item['value'].update(element_ids) batch.put_item(item)
def roundDecimal(flt):
    return int(round(Decimal(flt), 7) * 10000000)


row = rows.next()
tableRowCount = 1
# round to 6 decimal points so we can save to dynamodb (known bug)
while row != None:
    # write 25 rows at a time, saves throughput and improves performance
    batchIndex = 0
    try:
        with tdidfIndexTable.batch_write() as batch:
            print "starting new batch"
            while (batchIndex != 25):
                if (row == None):
                    break
                # calculate Tf-idf
                tdIdfValue = tdIdfCalculator.Calculate(row[columnWithBody])
                articleId = row[columnWithUniqueId]
                # No need to add entry for tf-idf if keyword does not appear in the source at all.
                if tdIdfValue > 0:
                    data = {
                        'id': uniqueId,
                        'word': keyword,
class AwsDataFactory(DataFactory): def __init__(self, config): ''' Constructor. @param config Configuration settings. Expected definition: Section: database Key: data_table Type: string Desc: Name of the Data model table @paramType ConfigParser @returns n/a ''' self.global_table = Table(config.get('database', 'global_data_table')) self.set_table = Table(config.get('database', 'set_data_table')) def create_data(self, content, datum_id, location, set_id, timestamp, type): ''' {@inheritDocs} ''' assert content is not None assert datum_id is not None assert -180 <= location[0] and location[0] < 180, location[0] assert -90 <= location[1] and location[1] < 90, location[1] assert set_id is not None assert timestamp is not None assert type is not None # Normalize the values lat_norm = int(location[1] * 10000000) lon_norm = int(location[0] * 10000000) timestamp_norm = strftime('%Y-%m-%d %H:%M:%S', timestamp) # Create the database record data = { 'content' : content, 'datum_id' : datum_id, 'lat' : lat_norm, 'lat_copy' : lat_norm, 'lon' : lon_norm, 'lon_copy' : lon_norm, 'set_id' : set_id, 'timestamp' : timestamp_norm, 'timestamp_copy' : timestamp_norm, 'type' : type } result = False if set_id == 'global': # If this is a global data point result = self.global_table.put_item(data=data) else: # If this is a set data point result = self.set_table.put_item(data=data) # If we failed to create the database record if result is False: raise CreateError("Failed to create the Data(" + str(data) + ")!") def copy_data(self, set_id, datas): ''' {@inheritDocs} ''' assert set_id is not None with self.set_table.batch_write() as batch: for data in datas: batch.put_item(data = { 'content' : data.get_content(), 'datum_id' : data.get_datum_id(), 'lat' : data.record['lat'], 'lat_copy' : data.record['lat_copy'], 'lon' : data.record['lon'], 'lon_copy' : data.record['lon_copy'], 'set_id' : set_id, 'timestamp' : data.record['timestamp'], 'timestamp_copy' : data.record['timestamp_copy'], 'type' : data.record['type'] }) def filter_global_data(self, min_timestamp=None, max_timestamp=None, min_lat=None, max_lat=None, min_lon=None, max_lon=None, segment_id=0, num_segments=1, type=None ): ''' {@inheritDocs} ''' kwargs = {} if min_timestamp is not None: kwargs['timestamp__gte'] = strftime('%Y-%m-%d %H:%M:%S', min_timestamp) if max_timestamp is not None: kwargs['timestamp_copy__lte'] = strftime('%Y-%m-%d %H:%M:%S', max_timestamp) if min_lat is not None: kwargs['lat__gte'] = int(min_lat * 10000000) if max_lat is not None: kwargs['lat_copy__lte'] = int(max_lat * 10000000) if min_lon is not None: kwargs['lon__gte'] = int(min_lon * 10000000) if max_lon is not None: kwargs['lon_copy__lte'] = int(max_lon * 10000000) if type is not None: kwargs['type__eq'] = type kwargs['set_id__eq'] = 'global' kwargs['segment'] = segment_id kwargs['total_segments'] = num_segments logger.debug("Scan Args: %s", kwargs) return AwsDataIterator(self.global_table.scan(**kwargs)) def get_data_set(self, set_id): ''' {@inheritDocs} ''' return AwsDataIterator(self.set_table.query(set_id__eq=set_id))
# check if the file is present
"""
c.execute('''CREATE TABLE users (email text, token text)''')
c.execute('''CREATE TABLE result (token text, q1 text, q2 text, q3 text)''')
"""
conn = boto.dynamodb2.connect_to_region(
    'us-west-1',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
users = Table('survey2_users', connection=conn)

unique_emails = set()
with users.batch_write() as batch:
    with open(csvfile) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in csvreader:
            if line[0] not in unique_emails:
                batch.put_item(data={'email': line[0],
                                     'token': str(uuid.uuid4()),
                                     })
                unique_emails.add(line[0])
            else:
                print "DUPE: %s" % line[0]
class DDBRuns(Dynamo): @classmethod def from_test_mode(cls, access_key=None, secret=None): """ Use this for getting an instance of this class that uses test tables. """ instance = cls(access_key, secret) instance.table = Table('test_runs', connection=instance.connection) return instance def __init__(self, access_key=None, secret=None): """ When called directly (as should be done for production code), sets table to the production 'runs' table. """ super(DDBRuns, self).__init__(access_key, secret) self.table = Table('runs', connection=self.connection) def save_new_run(self, dt_str=None, start_date_str=None, end_date_str=None): """ dt_str = datetime of run. Defaults to now. start_date_str = the start date for look-back of query performance data processing. * No default end_date_str = the end date for query performance data processing. Defaults to today. """ assert start_date_str, "start_date_str is required when saving a new run to runs table." assert DAY_STR_RE.match(start_date_str) if end_date_str: assert DAY_STR_RE.match(end_date_str) if dt_str: assert SECOND_STR_RE.match(dt_str) dt_str = dt_str or datetime.now().strftime(SECOND_STR_FORMAT) end_date_str = end_date_str or datetime.now().strftime(DAY_STR_FORMAT) return self.table.put_item(data={'dt': dt_str, 'start': start_date_str, 'end': end_date_str}) def most_recent_start_date_str(self): """ :return: a string representing most recent start date from db """ df = self.get_runs_df() if df.empty: return None else: # should already be sorted, but just in case... df.sort(columns=['dt'], ascending=True, inplace=True) return df.iloc[len(df)-1]['start'] def most_recent_end_date_str(self): """ :return: a string representing most recent end date from db """ df = self.get_runs_df() if df.empty: return None else: # should already be sorted, but just in case... df.sort(columns=['dt'], ascending=True, inplace=True) return df.iloc[len(df)-1]['end'] def get_runs_df(self): """ Returns all table as dataframe, sorted with most recent entry on bottom (ascending order) """ df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()]) if df.empty: return df else: df.sort(columns=['dt'], ascending=True, inplace=True) # force df to have columns in this order return df[['dt', 'start', 'end']] def modify_throughput(self, requested_read, requested_write, table=None): table = table or self.table return super(DDBRuns, self).modify_throughput(requested_read, requested_write, table) def truncate_table(self): """ WARNING! Only use for test mode table """ assert self.table.table_name == 'test_runs', "Will only truncate test table. 
To truncate production table, run code manually" with self.table.batch_write() as batch: for item in self.table.scan(): batch.delete_item(dt=item['dt']) def thors_start_end_date_strings(self, new_run=True, days_ago_start=30): if new_run: if days_ago_start is not None: print days_ago_start start_date_str = self._days_ago_str(days_ago_start) else: start_date_str = self.most_recent_end_date_str() end_date_str = date.today().strftime(DAY_STR_FORMAT) else: start_date_str = self.most_recent_start_date_str() end_date_str = self.most_recent_end_date_str() assert start_date_str, "Start date string is None, please check the database since we are not doing a new run" assert end_date_str, "End date string is None, please check the database since we are not doing a new run" return start_date_str, end_date_str def _days_ago_str(self, num_days_ago): return (date.today() - timedelta(days=num_days_ago)).strftime(DAY_STR_FORMAT) def start_end_date_strings(self, new_run=True, days_ago_start=30): if new_run: start_date_str = self.most_recent_end_date_str() or self._days_ago_str(days_ago_start) end_date_str = date.today().strftime(DAY_STR_FORMAT) else: start_date_str = self.most_recent_start_date_str() end_date_str = self.most_recent_end_date_str() return start_date_str, end_date_str
def dump_to_dynamo(cur, dynamoDB_table, json_output_file): # get occurrences by date print "get occurrences by date" sql = """ select stamp, count(*) from dates group by stamp""" cur.execute(sql) total_occ = {} dates = cur.fetchall() for date in dates: total_occ[date[0]]=date[1] # get occurences by topic print "get occurences by topic" sql = """ select m.topic, stamp, count(*) n from ( select id, topic, cast(max(occ) as real) occu,(select cast(sum(k.occ) as real) from document k where d.id=k.id) t from document d group by id ) m join dates da on m.id=da.id where occu/t>0.5 group by m.topic, stamp order by m.topic, stamp asc""" cur.execute(sql) occurrences = cur.fetchall() topics = defaultdict(lambda: defaultdict(lambda:[])) print "iterate over result set" for row in tqdm(occurrences, leave=True): topic = row[0] stamp = row[1] occ = row[2] topics[topic]['occurrences'].append(int(occ)) topics[topic]['dates'].append(str(stamp)) topics[topic]['dates_size'].append(int(total_occ[stamp])) print "push to dynamodb table" table = Table(dynamoDB_table) with table.batch_write() as batch: for topic in tqdm(topics.keys(), leave=True): output = {} output['word'] = "key_topic_"+str(topic) output['source'] = 'load_db_topics' output['occurrences'] = topics[topic]["occurrences"] output['dates_size'] = topics[topic]["dates_size"] output['dates'] = topics[topic]["dates"] output['occurrences_size']=99 batch.put_item(data=output) # output json description file print "output json description file" # get topics --because sqlite doesn't support row numbers topics = [] sql = "select distinct topic_id from model" cur.execute(sql) rows = cur.fetchall() for row in rows: topics.append(row[0]) # iterate over sql to build json output = [] for index, topic in tqdm(enumerate(topics)): sql = "select word from model m join dictionary d on d.id=m.word_id where topic_id=? order by m.occ desc limit 10" cur.execute(sql,(topic,)) rows = cur.fetchall() words = [] for word in rows: words.append(word[0]) line = dict() line['id']=index line['name']=", ".join(words) line['key']="key_topic_"+str(topic) output.append(line) # output to file with open(json_output_file, 'w') as outfile: json.dump(output,outfile)
class ContentStore: """ DynamoDb proxy for wikidata store, which contains page ids, titles, and contents. This class is designed to be used in a "with" block, as in with ContentStore() as store: ... statements using store ... You must have a configuration file in your home directory containing the AWS access key id and secret key for the IAM identity that has access to the database. This file must be named ".boto", and must contain a section of the form: [Credentials] aws_access_key_id = <access key id> aws_secret_access_key = <secret key id> """ def add_page(self, pageId, pageTitle, pageText): """ Adds a new page to the data store. New pages are batched and sent to DynamoDB 25 at a time. Clients may call flush() to cause all pending pages to be uploaded immediately. :param str pageId: The unique identifier of the page :param str pageTitle: The title of the page :param str pageText: The text contents of the page, in wiki markup format """ self._batch_write.put_item(data={ 'pageId': pageId, 'pageTitle': pageTitle, 'pageText': pageText}) def flush(self): """ Ensure that all added pages are persisted to DynamoDB. """ self._batch_write.flush() def get_content(self, pageId): """ Retrieve the content of a page :param str pageId: The unique page identifier :return: The text content of a page in wiki markup format """ item = self._table.get_item(pageId=pageId) return item['pageText'] def get_title(self, pageId): """ Get the title of a page :param str pageId: The unique page identifier :return: The title content of a page """ item = self._table.get_item(pageId=pageId) return item['pageTitle'] def __enter__(self): self._previous_access_key_id = os.environ["AWS_ACCESS_KEY_ID"] self._previous_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"] del os.environ["AWS_ACCESS_KEY_ID"] del os.environ["AWS_SECRET_ACCESS_KEY"] connection = boto.dynamodb2.connect_to_region('us-east-1') self._table = Table('wikidata', connection=connection) self._batch_write = self._table.batch_write() return self def __exit__(self, exc_type, exc_value, traceback): os.environ["AWS_ACCESS_KEY_ID"] = self._previous_access_key_id os.environ["AWS_SECRET_ACCESS_KEY"] = self._previous_secret_key if self._batch_write.should_flush(): self._batch_write.flush() # Cause the exception to be re-raised if one has occurred return exc_value is None
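# A small usage sketch for the ContentStore proxy above (hedged: assumes the
# 'wikidata' table exists, a ~/.boto credentials file is configured as the
# docstring describes, and AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are set in
# the environment, since __enter__ stashes and removes them; the page id and
# text are illustrative only).
with ContentStore() as store:
    store.add_page('12', 'Anarchism', "'''Anarchism''' is a political philosophy ...")
    store.flush()                    # push the pending batch out immediately
    print(store.get_title('12'))     # -> 'Anarchism'
    print(store.get_content('12'))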
class DDBSlurps(Dynamo): @classmethod def from_test_mode(cls, access_key=None, secret=None): """ Use this for getting an instance of this class that uses test tables. """ instance = cls(access_key, secret) instance.slurps_table = Table('test_slurps', connection=instance.connection) instance.failed_slurps_table = Table('test_failed_slurps', connection=instance.connection) return instance def __init__(self, access_key=None, secret=None): """ ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables """ super(DDBSlurps, self).__init__(access_key, secret) self.slurps_table = Table('slurps', connection=self.connection) self.failed_slurps_table = Table('failed_slurps', connection=self.connection) def save_slurp_info(self, slurp_info_, overwrite=True): """ slurp_info_ can either be in the form of a list of dicts or else a single dict. If slurp_info is a list, batch write will be used """ if isinstance(slurp_info_, dict): self.slurps_table.put_item(slurp_info_, overwrite=overwrite) elif isinstance(slurp_info_, list): with self.slurps_table.batch_write() as batch: for s in slurp_info_: batch.put_item(data=s, overwrite=overwrite) else: raise TypeError, "slurp_info must be a dict or a list of dicts, not a {}".format(type(slurp_info_)) def save_failed_slurp(self, searchterm): self.failed_slurps_table.put_item(data={'searchterm': searchterm, 'datetime': datetime.now().isoformat()}, overwrite=True) def get_slurp_info(self, search_term_=None): """ search_term_ can be either a string or a list of strings. Each string should be a search term you are looking for in the db. Returns either a single list of key-value tuples (if search_term_ was a string) or a list of key-value tuples (if search_term_ was a list) Each list of key-value tuples can easily be converted to a dict or an OrderedDict by the client. """ # searchterm_ is a STRING if isinstance(search_term_, basestring): if search_term_: slurp_info = (self.slurps_table.get_item(searchterm=search_term_)).items() else: slurp_info = [] # searchterm is a LIST of strings elif isinstance(search_term_, list): if search_term_: # create a set of non-empty searchterms. We us a set to avoid a duplicate query error from the db set_of_sts = {st for st in search_term_ if st} # create a list of dicts from the set list_of_st_dicts = [{'searchterm': st} for st in set_of_sts] res = self.slurps_table.batch_get(list_of_st_dicts) try: slurp_info = [i.items() for i in res] except (StopIteration, IndexError): # If res is empty, we get one of these errors when trying to iterate. 
slurp_info = [] else: slurp_info = [] # searchterm is an unexpected type else: raise TypeError, "search_term_ must be a dict or a list of dicts, not a {}".format(type(search_term_)) return slurp_info def existing_and_missing_uni(self, searchterm_list): """ Takes a list of searchterm strings and returns a list of searchterm strings that were found in the db (in unicode) and a list of the searchterms that were missing from the found results """ # make sure in utf8 before we send request to the db input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list] found_sts_info = self.get_slurp_info(input_sts_utf8) found_sts_uni = [to_unicode_or_bust(dict(i)['searchterm']) for i in found_sts_info] input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8] missing_sts_uni = order_conserving.setdiff(input_sts_uni, found_sts_uni) return found_sts_uni, missing_sts_uni def get_table(self, table_name): """ Convenience method for client who may wish to get a specific table from the DynamoDB connection """ return Table(table_name, connection=self.connection) def truncate_failed_slurp_table(self): """ """ with self.failed_slurps_table.batch_write() as batch: for item in self.failed_slurps_table.scan(): batch.delete_item(searchterm=item['searchterm']) def truncate_slurp_table(self): """ WARNING! Only use for test mode table """ assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually" test_slurps_table = Table('test_slurps', connection=self.connection) with test_slurps_table.batch_write() as batch: for item in self.slurps_table.scan(): batch.delete_item(searchterm=item['searchterm']) def modify_failed_slurps_throughput(self, requested_read, requested_write): return self.modify_throughput(requested_read, requested_write, self.failed_slurps_table) def modify_slurps_throughput(self, requested_read, requested_write): return self.modify_throughput(requested_read, requested_write, self.slurps_table) def get_slurps_table_info(self): return self.get_table_info(self.slurps_table) def get_failed_slurps_table_info(self): return self.get_table_info(self.failed_slurps_table)
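# A minimal sketch of exercising the DDBSlurps wrapper above against its test
# tables (hedged: any field besides 'searchterm' is illustrative; a single dict
# goes through put_item, a list goes through batch_write).
ddb = DDBSlurps.from_test_mode()
ddb.save_slurp_info({'searchterm': 'blue widgets', 'hits': 12})
ddb.save_slurp_info([{'searchterm': 'red widgets', 'hits': 3},
                     {'searchterm': 'green widgets', 'hits': 7}])
print(dict(ddb.get_slurp_info('blue widgets')))
ddb.truncate_slurp_table()   # safe: this instance points at 'test_slurps'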
pending_acc_ids = [7, 8] property_acc_ids = [9, 10] conn = dynamodb2.connect_to_region(region_name=region, aws_access_key_id=access_key, aws_secret_access_key=secret_key) property_table = Table(dynamodb_property_table, connection=conn) pend_tnt_table = Table(dynamodb_pending_tenant_table, connection=conn) prop_tnt_table = Table(dynamodb_property_tenant_table, connection=conn) properties = [] for prop in properties_res: properties.append(dict(prop.get_raw_keys())['id']['S'].encode('ascii')) with pend_tnt_table.batch_write() as batch: for pend_acc_id in pending_acc_ids: # Insert ids here pending = {} pending['pendTntid'] = str(pend_next_id) pend_next_id += 1 pending['propertyId'] = properties[randint(0, len(properties) - 1)] pending['status'] = pending_types[randint(0, len(pending_types) - 1)] pending['accountId'] = pend_acc_id batch.put_item(data=pending) with prop_tnt_table.batch_write() as batch: for property_acc_id in property_acc_ids: prop = {} prop['propTntId'] = str(prop_next_id) prop_next_id += 1
def add_sample_stops(): # try: c = Coords() tb_stops = Table('stops', connection=cm.db) tb_stops_loc = Table('stops_loc', connection=cm.db) with tb_stops.batch_write() as batch: batch.put_item(data={ 'stop_id': 'alpy', 'name_part': 'a', 'name' : 'aleppey', 'level_2': 'alappuzha', 'level_1': 'kerala', 'country': 'india' }) batch.put_item(data={ 'stop_id': 'kykm', 'name_part': 'k', 'name' : 'kayankulam', 'level_2': 'alappuzha', 'level_1': 'kerala', 'country': 'india' }) batch.put_item(data={ 'stop_id': 'ochr', 'name_part': 'o', 'name' : 'oachira', 'level_2': 'kollam', 'level_1': 'kerala', 'country': 'india' }) batch.put_item(data={ 'stop_id': 'vlkv', 'name_part': 'v', 'name' : 'vallikavu', 'level_2': 'kollam', 'level_1': 'kerala', 'country': 'india' }) batch.put_item(data={ 'stop_id': 'kpy', 'name_part': 'k', 'name' : 'karunagapally', 'level_2': 'kollam', 'level_1': 'kerala', 'country': 'india' }) batch.put_item(data={ 'stop_id': 'klm', 'name_part': 'k', 'name' : 'kollam', 'level_2': 'kollam', 'level_1': 'kerala', 'country': "india" }) # Adding to location with tb_stops_loc.batch_write() as batch: batch.put_item(data={ 'stop_id': 'alpy', 'lat_part': "9", 'lat': c.integerify(9.5010367), 'lon': c.integerify(76.3421059), 'lon_part': '76' }) batch.put_item(data={ 'stop_id': 'kykm', 'lat_part':"9", 'lon_part': '76', 'lat': c.integerify(9.1729609), 'lon': c.integerify(76.5073299) }) batch.put_item(data={ 'stop_id': 'ochr', 'lat_part':"9", 'lon_part': '76', 'lat': c.integerify(9.1272739), 'lon': c.integerify(76.5065333) }) batch.put_item(data={ 'stop_id': 'vlkv', 'lat': c.integerify(9.0938471), 'lon': c.integerify(76.4916068), 'lat_part':"9", 'lon_part': '76' }) batch.put_item(data={ 'stop_id': 'kpy', 'lat_part':"9", 'lon_part': '76', 'lat': c.integerify(9.0609902), 'lon': c.integerify(76.5341999) }) batch.put_item(data={ 'stop_id': 'klm', 'lat_part':"8", 'lon_part': '76', 'lat': c.integerify(8.8862714), 'lon': c.integerify(76.5938379) }) return message_helper.success()
# boto dynamodb2 won't accept a Python float, so round to 7 decimal places and multiply by 10^7 to store an integer def roundDecimal(flt): return int(round(Decimal(flt), 7) * 10000000) row = rows.next() tableRowCount = 1 # round to 7 decimal places so we can save to dynamodb (known float limitation) while row is not None: # write 25 rows at a time, which saves throughput and improves performance batchIndex = 0 try: with tdidfIndexTable.batch_write() as batch: print "starting new batch" while (batchIndex != 25): if row is None: break # calculate tf-idf tdIdfValue = tdIdfCalculator.Calculate(row[columnWithBody]) articleId = row[columnWithUniqueId] # no need to add an entry if the keyword does not appear in the source at all if tdIdfValue > 0: data = { 'id': uniqueId, 'word': keyword,
connection=conn ) #Input json file name js = sys.argv[2] #Loading data in Tables with open(js) as json_file : data = json.load(json_file) size = len(data) i = 0 printProgress(i, size, prefix = 'Data', suffix = 'Complete', barLength = 50) for date in data: topics_list = [] for index,topic in enumerate(data[date]) : topics_list.append(str(data[date][topic]["topic_id"]) + '#' + topic + '#' + data[date][topic]['category'] + '#' + ''.join(reversed(date.split('-'))) + '#' + str(data[date][topic]["score"])) if len(topics_list) == 10 or index == len(data[date]) - 1 : try: with topics.batch_write() as batch: for item in topics_list: items = item.split('#') batch.put_item(data={'Name' : items[1] , 'Category' : items[2] ,'Date' : sys.argv[1] + items[3] ,'Score': Decimal(items[4])}) #print items topics_list = [] sleep(0.1) except : print sys.exc_info()[0], items printProgress(i, size, prefix = 'Data', suffix = 'Complete', barLength = 50) i += 1
class DynamoDBUtils(object): # <TEST ONLY> Mapping of Customer & Pie/Cam cust_pie_dict = {"cid1": ["cam1", "cam2"], "cid2": ["cam1", "cam2"]} FACES = { 'videos/video_100_frames_1.mp4': { 'face_count': 59, 'face_count_dtl': ['0', '1', '8', '12', '12', '11', '10', '4', '1', '0'], 'face_count_uniq': 3, 'face_count_uniq_dtl': ['0', '1', '0', '0', '0', '1', '0', '0', '1', '0'], 'frame_count': 100, 'time_taken': '0:00:04.731971' }, 'videos/video_100_frames_2.mp4': { 'face_count': 62, 'face_count_dtl': ['10', '10', '0', '0', '0', '9', '5', '8', '10', '10'], 'face_count_uniq': 2, 'face_count_uniq_dtl': ['1', '0', '0', '0', '0', '1', '0', '0', '0', '0'], 'frame_count': 100, 'time_taken': '0:00:04.955812' } } rasp_names = ["kitchen", "garage"] cols = ['START_TIME', 'LEN', 'PROCESSED', 'S3_BUCKET', 'S3_KEY', 'VERSION'] S3_BUCKET = 'smart-cam' def __init__(self): cfg = Config() aws_access_key_id = cfg.get("aws", "access_key_id") aws_secret_access_key = cfg.get("aws", "secret_access_key") self.conn = boto.dynamodb2.connect_to_region( 'us-west-1', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) self.sc = Table('SMARTCAM', connection=self.conn) logger.info(self.conn.list_tables()) pprint.pprint(self.conn.describe_table('SMARTCAM')) # <TEST ONLY> Creates one item in table def create_items(self, num_items=2): cnt = 0 for rasp_name in DynamoDBUtils.rasp_names: for i in xrange(num_items): cnt += 1 self.__create_item(rasp_name, cnt) time.sleep(num_items) # <TEST ONLY> Creates one item in table def __create_item(self, rasp_name, num): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = time.time() data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET data['S3_KEY'] = 'videos/video_{0}.avi'.format(num) data['PROCESSED'] = 0 data['CLASSIFIED'] = 0 data['VERSION'] = 0 logger.info("# Uploading Data for {0}: {1}".format(rasp_name, num)) self.sc.put_item(data) # <TEST ONLY> Creates multiple full items in table def create_full_items(self, num_items=10, start_time=1459555200): cnt = 0 with self.sc.batch_write() as batch: for rasp_name in DynamoDBUtils.rasp_names: st = start_time for i in xrange(num_items): cnt += 1 if cnt % 2 == 0: batch.put_item( self.__create_full_item( rasp_name, st, 'videos/video_100_frames_1.mp4')) else: batch.put_item( self.__create_full_item( rasp_name, st, 'videos/video_100_frames_2.mp4')) st += 11.25 # 10 + 1.25 secs between 2 video files # <TEST ONLY> Creates multiple full items in table # All Hard code values for purpose of testing the Backend/UI Integration def __create_full_item(self, rasp_name, start_time, s3_key): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = start_time data['UPDATE_TIME'] = start_time + 5 data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET data['S3_KEY'] = s3_key data['FRAME_COUNT'] = DynamoDBUtils.FACES[s3_key]['frame_count'] data['FACE_COUNT'] = DynamoDBUtils.FACES[s3_key]['face_count'] data['FACE_COUNT_UNIQ'] = DynamoDBUtils.FACES[s3_key][ 'face_count_uniq'] # Face Counts / Detail d = {} #d['data'] = ['5','5','5','5','5','5','5','5','5','6'] data['FACE_COUNT_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_dtl'] d = {} #d['data'] = ['0','0','0','0','0','0','0','0','0','1'] data['FACE_COUN_UNIQ_DTL'] = DynamoDBUtils.FACES[s3_key][ 'face_count_uniq_dtl'] d = {} d['data'] = [ '0.1', '0.1', '0.1', '0.05', '0.05', '0.15', '0.001', '0.05', '0.01', '0.01' ] data['FOREGROUND'] = d data['PROCESSED'] = 1 data['VERSION'] = 1 logger.info("# Uploading Data for {0}: {1}".format( rasp_name, start_time)) # Converted to a Batch Write 
#self.sc.put_item(data) return data # Creates one item in table def create_item(self, rasp_name, s3_bucket, s3_key, s_time): data = dict() data['RASP_NAME'] = rasp_name data['START_TIME'] = s_time data['S3_BUCKET'] = s3_bucket data['S3_KEY'] = s3_key data['PROCESSED'] = 0 data['CLASSIFIED'] = 0 data['VERSION'] = 0 data['LEN'] = randint(5, 60) logger.info("# Uploading Data for {0}: {1}:{2}".format( rasp_name, s3_bucket, s3_key)) self.sc.put_item(data) # Fetch items def display_items(self): rows = self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=0) cnt = 0 for row in rows: logger.info('{0},{1},{2}'.format(row['RASP_NAME'], row['START_TIME'], row['PROCESSED'])) cnt += 1 logger.info('# Total unprocessed items: {0}'.format(cnt)) return rows def purge_table(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row.delete() logger.info('Deleted Row: {0}'.format(cnt)) def delete_by_id(self, id): cnt = 0 for row in self.get_items_by_id(id): cnt += 1 row.delete() logger.info('Deleted Row: {0}'.format(cnt)) def reset_processed(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['PROCESSED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def reset_classified(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['CLASSIFIED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def add_classified(self): cnt = 0 for row in self.sc.scan(): cnt += 1 row['CLASSIFIED'] = 0 self.update(row) logger.info('Update Row: {0}'.format(cnt)) def get_unprocessed_items(self): return self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=0) def get_processed_items(self): return self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=1) def get_unclassified_items(self): return self.sc.query_2(index='CLASSIFIED-index', CLASSIFIED__eq=0) def get_classified_items(self): return self.sc.query_2(index='CLASSIFIED-index', CLASSIFIED__eq=1) def get_items_by_id(self, id): return self.sc.query_2(RASP_NAME__eq=id) def get_items_by_id_range(self, id, start, end): return self.sc.query_2(RASP_NAME__eq=id, START_TIME__between=(start, end)) def update(self, row): try: row.save(overwrite=True) except Exception as e: logger.error(e) logger.info('[FAILED] Processing: ', row['RASP_NAME'], row['START_TIME'], row['PROCESSED']) def stats(self, lst): quotient, remainder = divmod(len(lst), 2) if remainder: return sorted(lst)[quotient] return sum(lst) / len(lst), sum( sorted(lst)[quotient - 1:quotient + 1]) / 2 def close(self): self.conn.close()
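# A short usage sketch for the DynamoDBUtils test helper above (hedged: assumes
# the SMARTCAM table plus its PROCESSED-index and CLASSIFIED-index global
# secondary indexes already exist, and that Config() can supply AWS keys).
ddb = DynamoDBUtils()
ddb.create_full_items(num_items=4)    # seed hard-coded rows via batch_write
ddb.display_items()                   # log rows with PROCESSED == 0
for row in ddb.get_items_by_id('kitchen'):
    print('%s %s' % (row['START_TIME'], row['S3_KEY']))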
class DDBRuns(Dynamo): @classmethod def from_test_mode(cls, access_key=None, secret=None): """ Use this for getting an instance of this class that uses test tables. """ instance = cls(access_key, secret) instance.table = Table('test_runs', connection=instance.connection) return instance def __init__(self, access_key=None, secret=None): """ When called directly (as should be done for production code), sets table to the production 'runs' table. """ super(DDBRuns, self).__init__(access_key, secret) self.table = Table('runs', connection=self.connection) def save_new_run(self, dt_str=None, start_date_str=None, end_date_str=None): """ dt_str = datetime of run. Defaults to now. start_date_str = the start date for look-back of query performance data processing. * No default end_date_str = the end date for query performance data processing. Defaults to today. """ assert start_date_str, "start_date_str is required when saving a new run to runs table." assert DAY_STR_RE.match(start_date_str) if end_date_str: assert DAY_STR_RE.match(end_date_str) if dt_str: assert SECOND_STR_RE.match(dt_str) dt_str = dt_str or datetime.now().strftime(SECOND_STR_FORMAT) end_date_str = end_date_str or datetime.now().strftime(DAY_STR_FORMAT) return self.table.put_item(data={ 'dt': dt_str, 'start': start_date_str, 'end': end_date_str }) def most_recent_start_date_str(self): """ :return: a string representing most recent start date from db """ df = self.get_runs_df() if df.empty: return None else: # should already be sorted, but just in case... df.sort(columns=['dt'], ascending=True, inplace=True) return df.iloc[len(df) - 1]['start'] def most_recent_end_date_str(self): """ :return: a string representing most recent end date from db """ df = self.get_runs_df() if df.empty: return None else: # should already be sorted, but just in case... df.sort(columns=['dt'], ascending=True, inplace=True) return df.iloc[len(df) - 1]['end'] def get_runs_df(self): """ Returns all table as dataframe, sorted with most recent entry on bottom (ascending order) """ df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()]) if df.empty: return df else: df.sort(columns=['dt'], ascending=True, inplace=True) # force df to have columns in this order return df[['dt', 'start', 'end']] def modify_throughput(self, requested_read, requested_write, table=None): table = table or self.table return super(DDBRuns, self).modify_throughput(requested_read, requested_write, table) def truncate_table(self): """ WARNING! Only use for test mode table """ assert self.table.table_name == 'test_runs', "Will only truncate test table. 
To truncate production table, run code manually" with self.table.batch_write() as batch: for item in self.table.scan(): batch.delete_item(dt=item['dt']) def thors_start_end_date_strings(self, new_run=True, days_ago_start=30): if new_run: if days_ago_start is not None: print days_ago_start start_date_str = self._days_ago_str(days_ago_start) else: start_date_str = self.most_recent_end_date_str() end_date_str = date.today().strftime(DAY_STR_FORMAT) else: start_date_str = self.most_recent_start_date_str() end_date_str = self.most_recent_end_date_str() assert start_date_str, "Start date string is None, please check the database since we are not doing a new run" assert end_date_str, "End date string is None, please check the database since we are not doing a new run" return start_date_str, end_date_str def _days_ago_str(self, num_days_ago): return (date.today() - timedelta(days=num_days_ago)).strftime(DAY_STR_FORMAT) def start_end_date_strings(self, new_run=True, days_ago_start=30): if new_run: start_date_str = self.most_recent_end_date_str( ) or self._days_ago_str(days_ago_start) end_date_str = date.today().strftime(DAY_STR_FORMAT) else: start_date_str = self.most_recent_start_date_str() end_date_str = self.most_recent_end_date_str() return start_date_str, end_date_str
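# A minimal sketch of bracketing a processing run with the DDBRuns table above
# (hedged: dates come from the helpers shown; the test-mode instance keeps
# writes off the production 'runs' table).
runs = DDBRuns.from_test_mode()
start, end = runs.start_end_date_strings(new_run=True, days_ago_start=30)
# ... process query-performance data for the [start, end] window here ...
runs.save_new_run(start_date_str=start, end_date_str=end)
print(runs.most_recent_end_date_str())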
class Index(Base): '''An Index for docker-registry that uses Amazon AWS DynamoDB as the storage engine. Boto is used to do all access to DynamoDB. Configure the following dynamodb_config variables or environment variables: dynamodb_index_database - optional, if not specified will default to 'docker-registry' and the repository and version table names will be constructed using the {dynamodb_index_database}-repository and {dynamodb_index_database}-version. DynamoDB does not have a database concept, just tables in the data store. dynamodb_index_repository_table - override the default table name (above) with a new name dynamodb_index_version_table - override the default table name with a new name dynamodb_region - the AWS region for the dynamodb. This will default to the s3_region and if that is not defined, it defaults to 'us-east-1'. dynamodb_access_key - the AWS access key to use dynamodb_secret_access_key - the AWS secret part of the access key ''' _initLock = Lock() def __init__(self, database=None, dynamodb_access_key=None, dynamodb_secret_access_key=None): ''' Constructor ''' cfg = dynamodb_config.load() if database is None: database = cfg['extensions.dynamodb_index.database'] if dynamodb_access_key is None: dynamodb_access_key = cfg['extensions.dynamodb_index.access_key'] if dynamodb_secret_access_key is None: dynamodb_secret_access_key = cfg['extensions.dynamodb_index.secret_access_key'] self.repositoryTableName = cfg['extensions.dynamodb_index.repository_table'] self.versionTableName = cfg['extensions.dynamodb_index.version_table'] if dynamodb_access_key is None: self._db = dynamodb2.connect_to_region(cfg['extensions.dynamodb_index.region']) else: self._db = dynamodb2.connect_to_region(cfg['extensions.dynamodb_index.region'], aws_access_key_id=dynamodb_access_key, aws_secret_access_key=dynamodb_secret_access_key) self._repositoryTable = Table(self.repositoryTableName, schema=[HashKey('name', data_type=STRING)], global_indexes=[GlobalAllIndex('Repositories-By-Description-Index', parts=[HashKey('description', data_type=STRING)])], connection=self._db) self._versionTable = Table(self.versionTableName, schema=[HashKey('version', data_type=NUMBER)], connection=self._db) self.version = 1 Index._initLock.acquire() try: self._setup_database() finally: Index._initLock.release() super(Index, self).__init__() def _describe_or_create_tables(self): dynamodb_util.create_table_if_not_exists(self._repositoryTable) dynamodb_util.create_table_if_not_exists(self._versionTable) def _wait_for_tables(self): dynamodb_util.wait_for_table_active(self._repositoryTable) dynamodb_util.wait_for_table_active(self._versionTable) def _read_or_set_schema_version(self, default_version): def read_schema_version(): v = 0 try: results = self._versionTable.scan() row = results.next() v = row['version'] except: v = -1 return v # Read or insert the schema_version. Keep doing it until one # of them works. This is in case another thread is attempting the same # thing. Reading first will allow this thread to complete. 
schemaVersion = read_schema_version() while (schemaVersion <= 0): try: self._versionTable.put_item(data={'version': default_version}) schemaVersion = default_version except: sleep(0.5) schemaVersion = read_schema_version() return schemaVersion def _setup_database(self): needs_index = not dynamodb_util.table_exists(self._versionTable) self._describe_or_create_tables() self._wait_for_tables() version = self._read_or_set_schema_version(self.version) if (version != self.version): raise NotImplementedError('unrecognized search index version {0}'.format(version)) if needs_index: self._generate_index() def _generate_index(self): store = storage.load() with self._repositoryTable.batch_write() as batch: for repository in self._walk_storage(store=store): logger.info('Populating repository: {0}'.format(repository['name'])) batch.put_item(data=repository) def _handle_repository_created( self, sender, namespace, repository, value): name = '{0}/{1}'.format(namespace, repository) description = '' # TODO(wking): store descriptions logger.info('Creating new repository {0}'.format(name)) self._repositoryTable.put_item(data={'name': name, 'description': description}) def _handle_repository_updated( self, sender, namespace, repository, value): name = '{0}/{1}'.format(namespace, repository) description = '' # TODO(wking): store descriptions logger.info('Updating repository {0}'.format(name)) repo = self._repositoryTable.get_item(name=name) repo['description'] = description repo.save(overwrite=True) def _handle_repository_deleted(self, sender, namespace, repository): name = '{0}/{1}'.format(namespace, repository) logger.info('Deleting repository {0}'.format(name)) self._repositoryTable.delete_item(name=name) def results(self, search_term=None): """Return a list of results matching search_term The list elements should be dictionaries: {'name': name, 'description': description} """ if not search_term or len(search_term) == 0: logger.info('Index query: full table scan') repositories = self._repositoryTable.scan() else: logger.info('Index query: {0}'.format(search_term)) repositories = self._repositoryTable.scan(conditional_operator='OR', name__contains=search_term, description__contains=search_term) if repositories: return [{'name': repo['name'], 'description': repo['description'], } for repo in repositories] return []
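# A small usage sketch for the DynamoDB-backed docker-registry Index above
# (hedged: assumes the dynamodb_index extension is configured as the class
# docstring describes; the _handle_* methods are normally wired to registry
# signals and are called directly here only to seed a row).
index = Index()
index._handle_repository_created(None, 'library', 'ubuntu', None)
for repo in index.results('ubuntu'):
    print('%s: %s' % (repo['name'], repo['description']))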
########################### ## Batch writing ########################### # If you’re loading a lot of data at a time, making use of batch writing can both speed up the process & # reduce the number of write requests made to the service. # Batch writing involves wrapping the calls you want batched in a context manager. The context manager # imitates the Table.put_item & Table.delete_item APIs. Getting & using the context manager looks like: import time from boto.dynamodb2.table import Table tweets = Table('tweets') with tweets.batch_write() as batch: batch.put_item( data={ 'id': '1111', 'username': '******', 'screen_name': 'yyyy', 'tweet': 'yes yes', }) batch.put_item( data={ 'id': '2222', 'username': '******', 'screen_name': 'dddd', 'tweet': 'no no', })
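# The same context manager also batches deletes: delete_item takes the key
# fields of the row to remove (a sketch, assuming 'id' is the hash key of the
# 'tweets' table used above).
with tweets.batch_write() as batch:
    batch.delete_item(id='1111')
    batch.delete_item(id='2222')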
def main(): aud = sys.argv[1] f = open(aud) region = 'us-east-1' #region = 'ap-southeast-1' print 'Connecting to %s with IAM role' % (region) #conn = boto.dynamodb.connect_to_region(region) # table = conn.get_table('users1') table = Table('users1') skipped = 0 newcnt = 0 updatedcnt = 0 samecnt = 0 cnt = 0 batchcnt = 0 errcnt = 0 batch = None for line in f: if not batch: batch = table.batch_write() print "Got batch %s" % batch (cookie, segs) = line.split("\t") if " " in cookie: cookie = cookie.replace(" ", "+") if not cookie.endswith("=="): cookie = cookie + "==" try: # print "Decoding %s" % cookie cdec = base64.b64decode(cookie) s = struct.unpack("<IIII", cdec) uid = "%08X%08X%08X%08X" % s except: errcnt += 1 continue # print "%s -> %s" % (cookie, uid) seg_list = segs.split(",") seg_list = ['%s:tp:1' % s for s in seg_list] try: item = table.get_item(key=uid) json = item['doAttr'] e = simplejson.loads(json) if not e: newcnt += 1 e = [] except boto.dynamodb2.exceptions.ItemNotFound: newcnt += 1 item = {'dtAttr': 'java.util.Set', 'doAttr': '[]'} e = [] # e - existing e = [s.replace(':fp:', ':tp:').strip() for s in e] e = sets.Set(e) # n - new n = sets.Set(seg_list) # combine n.update(e) # if the same no need to write if n == e: samecnt += 1 skipped += 1 continue elif e: updatedcnt += 1 n = list(n) item['doAttr'] = simplejson.dumps(n) #print "Putting %s" % item batchcnt += 1 batch.put_item(data={ 'doAttr': item['doAttr'], 'dtAttr': 'java.util.Set', 'key': uid }) #item.put() cnt += 1 if cnt % BATCH_SIZE == 0: batch.flush() batch = None if cnt % 5000 == 0: print "OK" print item print "User count: %s total, updated %s, same %s, new %s, error %s" % ( cnt, updatedcnt, samecnt, newcnt, errcnt) print "Wrote %s users" % cnt item2 = table.new_item( key='LAST_WRITE', attrs={ 'dtAttr': 'java.lang.String', 'doAttr': "LOTAME: User count: %s total, updated %s, same %s, new %s, error %s at %s\nLast user: %s : %s" % (cnt, updatedcnt, samecnt, newcnt, errcnt, datetime.datetime.now(), user, str(item)) }) print item2 item2.put() batch.flush() item2 = table.new_item({ key: 'LAST_WRITE', 'dtAttr': 'java.util.String', 'doAttr': "LOTAME: User count: %s total, updated %s, same %s, new %s, error %s at %s" % (cnt, updatedcnt, samecnt, newcnt, errcnt, datetime.datetime.now()), 'item': item }) print item2 item2.put() print "Added or updated %s users, skipped %s, to %s region" % ( cnt, skipped, region)
class DBconn(object): def __init__(self): aws_access_key_id = os.environ['S3_KEY'] # I AM OPS U NO GET MY KEYS aws_secret_access_key = os.environ['S3_SECRET'] # DIS IS MY JOB self._conn = DynamoDBConnection( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) self.works_table = Table('ao3rdr-works', connection=self._conn) self.immutable_fields = ['work_id', 'user_id'] def get_user(self, user_id): res = self.works_table.query_2( user_id__eq=user_id, work_id__eq='settings', attributes=['user_id']) out = [] for entry in res: out.append(self.serialize(entry)['user_id']) return out def add_user(self, user_id): """ Adding a user adds a special "work" which is used to store a user's settings. """ return self.works_table.put_item(data={ 'user_id': user_id, 'work_id': 'settings', 'created': time.time() }) def update_work(self, user_id, work_id, data): item = self.works_table.get_item(user_id=user_id, work_id=work_id) # update the item for key, value in data.iteritems(): if key not in self.immutable_fields: item[key] = value item['updated'] = time.time() item.partial_save() def create_work(self, user_id, work_id, data): data['user_id'] = user_id data['work_id'] = work_id self.works_table.put_item(data) def batch_update(self, data_list): with self.works_table.batch_write() as batch: for data in data_list: batch.put_item(data=data) def get_work(self, user_id, work_id): try: res = self.works_table.get_item(user_id=user_id, work_id=work_id) except ItemNotFound: return {} return self.serialize(res) def get_all_works(self, user_id): res = self.works_table.query_2(user_id__eq=user_id) for entry in res: yield self.serialize(entry) def close(self): self._conn.close() def serialize(self, item): out = serialize(dict(item)) return out
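# A brief usage sketch for the DBconn wrapper above (hedged: the user and work
# ids are illustrative, and S3_KEY / S3_SECRET must be present in the
# environment for the connection to be created).
db = DBconn()
db.add_user('user-123')                               # stores the special 'settings' work
db.create_work('user-123', 'work-456', {'rating': 4})
db.batch_update([{'user_id': 'user-123', 'work_id': 'work-789', 'rating': 5}])
for work in db.get_all_works('user-123'):
    print(work)
db.close()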
filer = codecs.open(directory + "/nyt-" + dayStr + ".json", "rb", encoding="utf-8") except: logging.debug("could not open file " + filename) exit() data = json.loads(filer.read()) filer.close() for dayArticle in data: # for each day if dayArticle["response"] is not None: # articles are bundled in groups of 10 (that is how many articles per request the NYT API was returning) # batch write for better performance and throughput savings with nyt.batch_write() as batch: for doc in dayArticle["response"]['docs']: logging.debug("processing articleId " + doc['_id'] + " published at " + doc['pub_date']) # must replace all empty strings because otherwise DynamoDB complains doc = replaceEmptyString2(doc) convertedDate = dateutil.parser.parse(doc['pub_date']) # saving datetime as a timestamp timestamp = time.mktime((convertedDate.year, convertedDate.month, convertedDate.day, convertedDate.hour, convertedDate.minute, convertedDate.second, -1, -1, -1)) + convertedDate.microsecond / 1e6 data = { "id": doc['_id'],
class CloudTrailTable(): """ Class to represent the cloudtrail table in dynamodb """ def __init__(self, table_name=settings.DYNAMODB_CLOUDTRAIL_TABLE): self.conn = dynamodb_connection() self.table = Table(table_name, connection=self.conn) def save_items(self, items, project_id): """ Saves the items into db after 'dynamizing' them as a batch operation :param items: the list of items :param project_id: ProjectId hashkey for the table :return: None """ # Parse every item in the response, add keys as per dynamodb, and # do a batch update geo_conn = geo_connection() with self.table.batch_write() as batch: for item in items: ctj = json.loads(item['CloudTrailEvent']) item['ProjectId'] = int(project_id) ip = ctj['sourceIPAddress'] item['sourceIPAddress'] = ip item['countryCode'] = country_from_ip(ip, geo_conn=geo_conn) item['awsRegion'] = ctj['awsRegion'] project_content_type_id = ContentType.objects.get_for_model( ProjectAWS).pk project_object_id = int(project_id) # Signal the receiver for event names here cloudtrail_notification_signal.send( sender=item['EventId'], context_data=ctj, project_content_type_id=project_content_type_id, project_object_id=project_object_id) if settings.IS_DYNAMODB_LOCAL: # We need to dynamize to store data in form of list, map etc dy = types.Dynamizer() for k, v in item.iteritems(): item[k] = dy.encode(v) batch.put_item(data=item) def delete_items(self, project_id, before_time): """ Deletes the items before the given time :param before_time: time to query for items and delete :return: None """ for item in self.table.query_2(ProjectId__eq=project_id, EventTime__lt=before_time, index='EventTime-index'): item.delete() def query_events(self, project_id): """ Returns the items in the table for a given project id :param project_id: The hashkey project_id :return: table rows matching the argument """ return self.table.query_2(ProjectId__eq=project_id)
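# A minimal sketch of feeding events into the CloudTrailTable above (hedged:
# the payload is a trimmed illustration of a CloudTrail lookup_events record,
# and save_items also requires the Django content types and signal receivers it
# references to be importable).
events = [{
    'EventId': 'a1b2c3',
    'EventTime': 1459555200,
    'CloudTrailEvent': json.dumps({
        'sourceIPAddress': '203.0.113.10',
        'awsRegion': 'us-east-1',
    }),
}]
CloudTrailTable().save_items(events, project_id=42)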
class DynamoDBAdapter(key_value_store.KeyValueStore): """ Implementation of an abstract key-value store defined in key_value_store.py. The underlying database is amazon DynamoDB. The store keeps all objects in a single table with following schema: [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string with the object type ('vector', 'set' or 'int') and 'id' is the object id. The object value is stored in the 'value' attribute of the table items. The table should be created before this code is executed. Amazon configuration is assumed to be stored in ~/.boto file as described in http://boto.readthedocs.org/en/latest/boto_config_tut.html """ def __init__(self, precision=np.dtype('float32'), table_name='test'): """ Create an instance of the dynamodb key-value store. precision - a numpy type, elements of all vectors are converted and stored in this type; table_name - the name of the DynamoDB table which keeps the objects. """ conn = boto.dynamodb2.connect_to_region('eu-west-1') if not isinstance(precision, np.dtype): raise TypeError("Precision should be a numpy.dtype subtype") self.precision = precision self.precision_name = precision.name self.table = Table(table_name, connection=conn) def _get_or_create_item(self, kind, item_id): try: item = self.table.get_item(kind=kind, id=item_id) except ItemNotFound: item = Item(self.table) item['kind'] = kind item['id'] = item_id return item def _create_vector_item(self, vec_id, vector): item = self._get_or_create_item('vector', vec_id) item['value'] = Binary(vector.astype(self.precision).tostring()) item['precision'] = self.precision_name return item def _vector_value(self, item): return np.fromstring(str(item['value']), np.dtype(item['precision'])) def get_vector_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='vector')] def get_int_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='int')] def get_set_ids(self): return [v['id'] for v in self.table.query_2(kind__eq='set')] def store_vector(self, vec_id, vector): item = self._create_vector_item(vec_id, vector) item.save() def get_vector(self, vec_id): try: item = self.table.get_item(kind='vector', id=vec_id) except ItemNotFound: raise KeyError('Vector key %s is unknown' % (vec_id,)) return self._vector_value(item) def bulk_get_vector(self, vec_ids): keys = [{'kind': 'vector', 'id': i} for i in vec_ids] vs = self.table.batch_get(keys=keys) return [self._vector_value(i) for i in vs] def remove_vector(self, vec_id): try: item = self.table.get_item(kind='vector', id=vec_id) except ItemNotFound: raise KeyError('Vector key %s is unknown' % (vec_id,)) item.delete() def add_to_set(self, set_id, element_id): item = self._get_or_create_item('set', set_id) if 'value' not in item.keys() or not isinstance(item['value'], set): item['value'] = set() item['value'].add(element_id) item.save(overwrite=True) def remove_from_set(self, set_id, element_id): try: item = self.table.get_item(kind='set', id=set_id) except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id,)) if 'value' not in item.keys() or not isinstance(item['value'], set): raise KeyError('Incorrect value in item %s' % (set_id,)) if element_id not in item['value']: raise KeyError('Element %s not in set %s' % (element_id, set_id)) item['value'].remove(element_id) item.save() def remove_set(self, set_id): try: item = self.table.get_item(kind='set', id=set_id) item.delete() except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id,)) def get_set(self, set_id): try: the_set = 
self.table.get_item(kind='set', id=set_id)['value'] return set([str(entry) for entry in the_set]) except ItemNotFound: raise KeyError('Set key %s is unknown' % (set_id,)) def store_int(self, int_id, integer): item = self._get_or_create_item('int', int_id) item['value'] = integer item.save() def get_int(self, int_id): try: return int(self.table.get_item(kind='int', id=int_id)['value']) except ItemNotFound: raise KeyError('Int key %s is unknown' % (int_id,)) def remove_int(self, int_id): try: item = self.table.get_item(kind='int', id=int_id) except ItemNotFound: raise KeyError('Int key %s is unknown' % (int_id,)) item.delete() def _aggregate_set_id_element_pairs(self, setpairs): """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are also distinct.""" set_ids = set([entry[0] for entry in setpairs]) listlist = [[entry for entry in setpairs if entry[0] == set_id] for set_id in set_ids] result = [(pairlist[0][0], set([entry[1] for entry in pairlist])) for pairlist in listlist] return result def bulk_store_vector(self, vec_ids, vectors): if len(vec_ids) != len(vectors): raise ValueError vecpairs = zip(vec_ids, vectors) with self.table.batch_write() as batch: for vec_id, vec in vecpairs: item = self._create_vector_item(vec_id, vec) batch.put_item(item) def bulk_store_vector_old(self, vectors_df): """Argument 'vectors' is a dataframe with index vector ids.""" if len(vec_ids) != len(vectors): raise ValueError with self.table.batch_write() as batch: for ind in vectors_df.index: vec_id = str(ind) vec = vectors_df.loc[ind].values item = self._create_vector_item(vec_id, vec) batch.put_item(item) def bulk_store_int(self, int_ids, integers): """Argument 'intpairs' is a list of pairs of the form (int_id, integer).""" if len(int_ids) != len(integers): raise ValueError intpairs = zip(int_ids, integers) with self.table.batch_write() as batch: for pair in intpairs: int_id, integer = pair item = self._get_or_create_item('int', int_id) item['value'] = integer batch.put_item(item) def bulk_add_to_set(self, set_ids, element_ids): """batch_write() objects if the same item is written to more than once per batch, hence we aggregate all (set_id, element_id) pairs into a list of pairs (set_id, element_ids), where the 'set_id's are pairwise distinct, and the 'element_ids' are sets.""" if len(set_ids) != len(element_ids): raise ValueError setpairs = zip(set_ids, element_ids) setlist = self._aggregate_set_id_element_pairs(setpairs) with self.table.batch_write() as batch: for pair in setlist: set_id, element_ids = pair item = self._get_or_create_item('set', set_id) if 'value' not in item.keys() or not isinstance( item['value'], set): item['value'] = set() item['value'].update(element_ids) batch.put_item(item)
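# The adapter above expects its table to exist already with the schema named in
# the class docstring. A sketch of creating it with boto (the table name 'test'
# and region 'eu-west-1' match __init__; the throughput values are assumptions):
import boto.dynamodb2
from boto.dynamodb2.fields import HashKey, RangeKey
from boto.dynamodb2.table import Table
from boto.dynamodb2.types import STRING

conn = boto.dynamodb2.connect_to_region('eu-west-1')
Table.create('test',
             schema=[HashKey('kind', data_type=STRING), RangeKey('id')],
             throughput={'read': 5, 'write': 5},
             connection=conn)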
"rb", encoding="utf-8") except: logging.debug("could not open file" + filename) exit() data = json.loads(filer.read()) filer.close() for dayArticle in data: #for each day if (dayArticle["response"] != None): #articles are bundled in group of 10 (because that is how much article per request NYT API was returning. # will batch write for better performance and thoroughput saving. with nyt.batch_write() as batch: for doc in dayArticle["response"]['docs']: logging.debug("processing articleId " + doc['_id'] + " published at " + doc['pub_date']) #must replace all empty string b/c otherwise dynamodb complains doc = replaceEmptyString2(doc) convertedDate = dateutil.parser.parse(doc['pub_date']) #saving datetime as timestamp timestamp = time.mktime( (convertedDate.year, convertedDate.month, convertedDate.day, convertedDate.hour, convertedDate.minute, convertedDate.second, -1, -1,